/*	$NetBSD: rf_reconstruct.c,v 1.119.10.1 2014/08/10 06:54:57 tls Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/************************************************************
 *
 * rf_reconstruct.c -- code to perform on-line reconstruction
 *
 ************************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.119.10.1 2014/08/10 06:54:57 tls Exp $");

#include <sys/param.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/namei.h> /* for pathbuf */
#include <dev/raidframe/raidframevar.h>

#include <miscfs/specfs/specdev.h> /* for v_rdev */

#include "rf_raid.h"
#include "rf_reconutil.h"
#include "rf_revent.h"
#include "rf_reconbuffer.h"
#include "rf_acctrace.h"
#include "rf_etimer.h"
#include "rf_dag.h"
#include "rf_desc.h"
#include "rf_debugprint.h"
#include "rf_general.h"
#include "rf_driver.h"
#include "rf_utils.h"
#include "rf_shutdown.h"

#include "rf_kintf.h"

/* setting these to -1 causes them to be set to their default values if not set by debug options */

#if RF_DEBUG_RECON
#define Dprintf(s)         if (rf_reconDebug) rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf1(s,a)         if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b)       if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c)     if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
#define Dprintf4(s,a,b,c,d)   if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL)
#define Dprintf5(s,a,b,c,d,e) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL)
#define Dprintf6(s,a,b,c,d,e,f) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL)
#define Dprintf7(s,a,b,c,d,e,f,g) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL)

#define DDprintf1(s,a)         if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define DDprintf2(s,a,b)       if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)

#else /* RF_DEBUG_RECON */

#define Dprintf(s) {}
#define Dprintf1(s,a) {}
#define Dprintf2(s,a,b) {}
#define Dprintf3(s,a,b,c) {}
#define Dprintf4(s,a,b,c,d) {}
#define Dprintf5(s,a,b,c,d,e) {}
#define Dprintf6(s,a,b,c,d,e,f) {}
#define Dprintf7(s,a,b,c,d,e,f,g) {}

#define DDprintf1(s,a) {}
#define DDprintf2(s,a,b) {}

#endif /* RF_DEBUG_RECON */

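/*
 * Status codes returned by ProcessReconEvent().  The main loop in
 * rf_ContinueReconstructFailedDisk() uses them to count completed
 * reads and writes and to bail out when an error is reported.
 */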
#define RF_RECON_DONE_READS   1
#define RF_RECON_READ_ERROR   2
#define RF_RECON_WRITE_ERROR  3
#define RF_RECON_READ_STOPPED 4
#define RF_RECON_WRITE_DONE   5

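/*
 * Low and high watermarks for the reconbuffer pool created in
 * rf_ConfigureReconstruction() below.
 */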
#define RF_MAX_FREE_RECONBUFFER 32
#define RF_MIN_FREE_RECONBUFFER 16

static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *, RF_RowCol_t,
					      RF_RaidDisk_t *, int, RF_RowCol_t);
static void FreeReconDesc(RF_RaidReconDesc_t *);
static int ProcessReconEvent(RF_Raid_t *, RF_ReconEvent_t *);
static int IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t);
static int TryToRead(RF_Raid_t *, RF_RowCol_t);
static int ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t, RF_RowCol_t,
				RF_SectorNum_t *, RF_SectorNum_t *, RF_RowCol_t *,
				RF_SectorNum_t *);
static int IssueNextWriteRequest(RF_Raid_t *);
static int ReconReadDoneProc(void *, int);
static int ReconWriteDoneProc(void *, int);
static void CheckForNewMinHeadSep(RF_Raid_t *, RF_HeadSepLimit_t);
static int CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *,
			       RF_RowCol_t, RF_HeadSepLimit_t,
			       RF_ReconUnitNum_t);
static int CheckForcedOrBlockedReconstruction(RF_Raid_t *,
					      RF_ReconParityStripeStatus_t *,
					      RF_PerDiskReconCtrl_t *,
					      RF_RowCol_t, RF_StripeNum_t,
					      RF_ReconUnitNum_t);
static void ForceReconReadDoneProc(void *, int);
static void rf_ShutdownReconstruction(void *);

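/*
 * Node in the singly-linked list of "reconstruction done" callbacks;
 * each registered proc is invoked with its arg once a reconstruction
 * completes.
 */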
struct RF_ReconDoneProc_s {
	void    (*proc) (RF_Raid_t *, void *);
	void   *arg;
	RF_ReconDoneProc_t *next;
};

/**************************************************************************
 *
 * sets up the parameters that will be used by the reconstruction process
 * currently there are none, except for those that the layout-specific
 * configuration (e.g. rf_ConfigureDeclustered) routine sets up.
 *
 * in the kernel, we fire off the recon thread.
 *
 **************************************************************************/
static void
rf_ShutdownReconstruction(void *ignored)
{
	pool_destroy(&rf_pools.reconbuffer);
}

int
rf_ConfigureReconstruction(RF_ShutdownList_t **listp)
{

	rf_pool_init(&rf_pools.reconbuffer, sizeof(RF_ReconBuffer_t),
		     "rf_reconbuffer_pl", RF_MIN_FREE_RECONBUFFER, RF_MAX_FREE_RECONBUFFER);
	rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);

	return (0);
}
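
/*
 * The pool initialized above backs RF_ReconBuffer_t allocations made
 * while a reconstruction runs.  (Assumption: the actual allocation and
 * free sites, e.g. rf_FreeReconBuffer(), live in the reconbuffer /
 * reconutil modules; this file only creates and destroys the pool.)
 */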

static RF_RaidReconDesc_t *
AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t col,
		   RF_RaidDisk_t *spareDiskPtr, int numDisksDone,
		   RF_RowCol_t scol)
{

	RF_RaidReconDesc_t *reconDesc;

	RF_Malloc(reconDesc, sizeof(RF_RaidReconDesc_t),
		  (RF_RaidReconDesc_t *));
	reconDesc->raidPtr = raidPtr;
	reconDesc->col = col;
	reconDesc->spareDiskPtr = spareDiskPtr;
	reconDesc->numDisksDone = numDisksDone;
	reconDesc->scol = scol;
	reconDesc->next = NULL;

	return (reconDesc);
}

static void
FreeReconDesc(RF_RaidReconDesc_t *reconDesc)
{
#if RF_RECON_STATS > 0
	printf("raid%d: %lu recon event waits, %lu recon delays\n",
	       reconDesc->raidPtr->raidid,
	       (long) reconDesc->numReconEventWaits,
	       (long) reconDesc->numReconExecDelays);
#endif				/* RF_RECON_STATS > 0 */
	printf("raid%d: %lu max exec ticks\n",
	       reconDesc->raidPtr->raidid,
	       (long) reconDesc->maxReconExecTicks);
	RF_Free(reconDesc, sizeof(RF_RaidReconDesc_t));
}


/*****************************************************************************
 *
 * primary routine to reconstruct a failed disk.  This should be called from
 * within its own thread.  It won't return until reconstruction completes,
 * fails, or is aborted.
 *****************************************************************************/
int
rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	const RF_LayoutSW_t *lp;
	int     rc;

	lp = raidPtr->Layout.map;
	if (lp->SubmitReconBuffer) {
		/*
		 * The current infrastructure only supports reconstructing one
		 * disk at a time for each array.
		 */
		rf_lock_mutex2(raidPtr->mutex);
		while (raidPtr->reconInProgress) {
			rf_wait_cond2(raidPtr->waitForReconCond, raidPtr->mutex);
		}
		raidPtr->reconInProgress++;
		rf_unlock_mutex2(raidPtr->mutex);
		rc = rf_ReconstructFailedDiskBasic(raidPtr, col);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->reconInProgress--;
	} else {
		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
			     lp->parityConfig);
		rc = EIO;
		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_signal_cond2(raidPtr->waitForReconCond);
	rf_unlock_mutex2(raidPtr->mutex);
	return (rc);
}

int
rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *c_label;
	RF_RaidDisk_t *spareDiskPtr = NULL;
	RF_RaidReconDesc_t *reconDesc;
	RF_RowCol_t scol;
	int     numDisksDone = 0, rc;

	/* first look for a spare drive onto which to reconstruct the data */
	/* spare disk descriptors are stored in row 0.  This may have to
	 * change eventually */

	rf_lock_mutex2(raidPtr->mutex);
	RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed);
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		if (raidPtr->status != rf_rs_degraded) {
			RF_ERRORMSG1("Unable to reconstruct disk at col %d because status not degraded\n", col);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		scol = (-1);
	} else {
#endif
		for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
			if (raidPtr->Disks[scol].status == rf_ds_spare) {
				spareDiskPtr = &raidPtr->Disks[scol];
				spareDiskPtr->status = rf_ds_used_spare;
				break;
			}
		}
		if (!spareDiskPtr) {
			RF_ERRORMSG1("Unable to reconstruct disk at col %d because no spares are available\n", col);
			rf_unlock_mutex2(raidPtr->mutex);
			return (ENOSPC);
		}
		printf("RECON: initiating reconstruction on col %d -> spare at col %d\n", col, scol);
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	}
#endif
	rf_unlock_mutex2(raidPtr->mutex);

	reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, numDisksDone, scol);
	raidPtr->reconDesc = (void *) reconDesc;
#if RF_RECON_STATS > 0
	reconDesc->hsStallCount = 0;
	reconDesc->numReconExecDelays = 0;
	reconDesc->numReconEventWaits = 0;
#endif				/* RF_RECON_STATS > 0 */
	reconDesc->reconExecTimerRunning = 0;
	reconDesc->reconExecTicks = 0;
	reconDesc->maxReconExecTicks = 0;
	rc = rf_ContinueReconstructFailedDisk(reconDesc);

	if (!rc) {
		/* fix up the component label */
		/* Don't actually need the read here.. */
		c_label = raidget_component_label(raidPtr, scol);

		raid_init_component_label(raidPtr, c_label);
		c_label->row = 0;
		c_label->column = col;
		c_label->clean = RF_RAID_DIRTY;
		c_label->status = rf_ds_optimal;
		rf_component_label_set_partitionsize(c_label,
		    raidPtr->Disks[scol].partitionSize);

		/* We've just done a rebuild based on all the other
		   disks, so at this point the parity is known to be
		   clean, even if it wasn't before. */

		/* XXX doesn't hold for RAID 6!!*/

		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->parity_good = RF_RAID_CLEAN;
		rf_unlock_mutex2(raidPtr->mutex);

		/* XXXX MORE NEEDED HERE */

		raidflush_component_label(raidPtr, scol);
	} else {
		/* Reconstruct failed. */

		rf_lock_mutex2(raidPtr->mutex);
		/* Failed disk goes back to "failed" status */
		raidPtr->Disks[col].status = rf_ds_failed;

		/* Spare disk goes back to "spare" status. */
		spareDiskPtr->status = rf_ds_spare;
		rf_unlock_mutex2(raidPtr->mutex);

	}
	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
	return (rc);
}

/*

   Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
   and you don't get a spare until the next Monday.  With this function
   (and hot-swappable drives) you can now put your new disk containing
   /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
   rebuild the data "on the spot".

*/
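
/*
 * A plausible raidctl(8) invocation for this path (illustrative only;
 * consult raidctl(8) for the authoritative syntax) would be something
 * like "raidctl -R /dev/sd2e raid0", which rebuilds in place onto the
 * re-attached component.
 */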

int
rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_RaidDisk_t *spareDiskPtr = NULL;
	RF_RaidReconDesc_t *reconDesc;
	const RF_LayoutSW_t *lp;
	RF_ComponentLabel_t *c_label;
	int     numDisksDone = 0, rc;
	uint64_t numsec;
	unsigned int secsize;
	struct pathbuf *pb;
	struct vnode *vp;
	int     retcode;
	int     ac;

	rf_lock_mutex2(raidPtr->mutex);
	lp = raidPtr->Layout.map;
	if (!lp->SubmitReconBuffer) {
		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
			     lp->parityConfig);
		/* wakeup anyone who might be waiting to do a reconstruct */
		rf_signal_cond2(raidPtr->waitForReconCond);
		rf_unlock_mutex2(raidPtr->mutex);
		return(EIO);
	}

	/*
	 * The current infrastructure only supports reconstructing one
	 * disk at a time for each array.
	 */

	if (raidPtr->Disks[col].status != rf_ds_failed) {
		/* "It's gone..." */
		raidPtr->numFailures++;
		raidPtr->Disks[col].status = rf_ds_failed;
		raidPtr->status = rf_rs_degraded;
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
	}

	while (raidPtr->reconInProgress) {
		rf_wait_cond2(raidPtr->waitForReconCond, raidPtr->mutex);
	}

	raidPtr->reconInProgress++;

	/* first look for a spare drive onto which to reconstruct the
	   data.  spare disk descriptors are stored in row 0.  This
	   may have to change eventually */

	/* Actually, we don't care if it's failed or not...  On a RAID
	   set with correct parity, this function should be callable
	   on any component without ill effects. */
	/* RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); */

#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		RF_ERRORMSG1("Unable to reconstruct to disk at col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", col);

		raidPtr->reconInProgress--;
		rf_signal_cond2(raidPtr->waitForReconCond);
		rf_unlock_mutex2(raidPtr->mutex);
		return (EINVAL);
	}
#endif

	/* This device may have been opened successfully the
	   first time.  Close it before trying to open it again.. */

	if (raidPtr->raid_cinfo[col].ci_vp != NULL) {
#if 0
		printf("Closed the open device: %s\n",
		       raidPtr->Disks[col].devname);
#endif
		vp = raidPtr->raid_cinfo[col].ci_vp;
		ac = raidPtr->Disks[col].auto_configured;
		rf_unlock_mutex2(raidPtr->mutex);
		rf_close_component(raidPtr, vp, ac);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->raid_cinfo[col].ci_vp = NULL;
	}
	/* note that this disk was *not* auto_configured (any longer) */
	raidPtr->Disks[col].auto_configured = 0;

#if 0
	printf("About to (re-)open the device for rebuilding: %s\n",
	       raidPtr->Disks[col].devname);
#endif
	rf_unlock_mutex2(raidPtr->mutex);
	pb = pathbuf_create(raidPtr->Disks[col].devname);
	if (pb == NULL) {
		retcode = ENOMEM;
	} else {
		retcode = dk_lookup(pb, curlwp, &vp);
		pathbuf_destroy(pb);
	}

	if (retcode) {
		printf("raid%d: rebuilding: dk_lookup on device: %s failed: %d!\n", raidPtr->raidid,
		       raidPtr->Disks[col].devname, retcode);

		/* the component isn't responding properly...
		   must be still dead :-( */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->reconInProgress--;
		rf_signal_cond2(raidPtr->waitForReconCond);
		rf_unlock_mutex2(raidPtr->mutex);
		return(retcode);
	}

	/* Ok, so we can at least do a lookup...
	   How about actually getting a vp for it? */

	retcode = getdisksize(vp, &numsec, &secsize);
	if (retcode) {
		vn_close(vp, FREAD | FWRITE, kauth_cred_get());
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->reconInProgress--;
		rf_signal_cond2(raidPtr->waitForReconCond);
		rf_unlock_mutex2(raidPtr->mutex);
		return(retcode);
	}
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->Disks[col].blockSize = secsize;
	raidPtr->Disks[col].numBlocks = numsec - rf_protectedSectors;

	raidPtr->raid_cinfo[col].ci_vp = vp;
	raidPtr->raid_cinfo[col].ci_dev = vp->v_rdev;

	raidPtr->Disks[col].dev = vp->v_rdev;

	/* We allow the user to specify that only a fraction of the
	   disks should be used; this is just for debug: it speeds up
	   the parity scan. */
	raidPtr->Disks[col].numBlocks = raidPtr->Disks[col].numBlocks *
	    rf_sizePercentage / 100;
	rf_unlock_mutex2(raidPtr->mutex);

	spareDiskPtr = &raidPtr->Disks[col];
	spareDiskPtr->status = rf_ds_used_spare;

	printf("raid%d: initiating in-place reconstruction on column %d\n",
	       raidPtr->raidid, col);

	reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr,
				       numDisksDone, col);
	raidPtr->reconDesc = (void *) reconDesc;
#if RF_RECON_STATS > 0
	reconDesc->hsStallCount = 0;
	reconDesc->numReconExecDelays = 0;
	reconDesc->numReconEventWaits = 0;
#endif				/* RF_RECON_STATS > 0 */
	reconDesc->reconExecTimerRunning = 0;
	reconDesc->reconExecTicks = 0;
	reconDesc->maxReconExecTicks = 0;
	rc = rf_ContinueReconstructFailedDisk(reconDesc);

	if (!rc) {
		rf_lock_mutex2(raidPtr->mutex);
		/* Need to set these here, as at this point it'll be claiming
		   that the disk is in rf_ds_spared!  But we know better :-) */

		raidPtr->Disks[col].status = rf_ds_optimal;
		raidPtr->status = rf_rs_optimal;
		rf_unlock_mutex2(raidPtr->mutex);

		/* fix up the component label */
		/* Don't actually need the read here.. */
		c_label = raidget_component_label(raidPtr, col);

		rf_lock_mutex2(raidPtr->mutex);
		raid_init_component_label(raidPtr, c_label);

		c_label->row = 0;
		c_label->column = col;

		/* We've just done a rebuild based on all the other
		   disks, so at this point the parity is known to be
		   clean, even if it wasn't before. */

		/* XXX doesn't hold for RAID 6!!*/

		raidPtr->parity_good = RF_RAID_CLEAN;
		rf_unlock_mutex2(raidPtr->mutex);

		raidflush_component_label(raidPtr, col);
	} else {
		/* Reconstruct-in-place failed.  Disk goes back to
		   "failed" status, regardless of what it was before. */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->Disks[col].status = rf_ds_failed;
		rf_unlock_mutex2(raidPtr->mutex);
	}

	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->reconInProgress--;
	rf_signal_cond2(raidPtr->waitForReconCond);
	rf_unlock_mutex2(raidPtr->mutex);

	return (rc);
}


int
rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc)
{
	RF_Raid_t *raidPtr = reconDesc->raidPtr;
	RF_RowCol_t col = reconDesc->col;
	RF_RowCol_t scol = reconDesc->scol;
	RF_ReconMap_t *mapPtr;
	RF_ReconCtrl_t *tmp_reconctrl;
	RF_ReconEvent_t *event;
	RF_StripeCount_t incPSID,lastPSID,num_writes,pending_writes,prev;
#if RF_INCLUDE_RAID5_RS > 0
	RF_StripeCount_t startPSID,endPSID,aPSID,bPSID,offPSID;
#endif
	RF_ReconUnitCount_t RUsPerPU;
	struct timeval etime, elpsd;
	unsigned long xor_s, xor_resid_us;
	int     i, ds;
	int     status, done;
	int     recon_error, write_error;

	raidPtr->accumXorTimeUs = 0;
#if RF_ACC_TRACE > 0
	/* create one trace record per physical disk */
	RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
#endif

	/* quiesce the array prior to starting recon.  this is needed
	 * to assure no nasty interactions with pending user writes.
	 * We need to do this before we change the disk or row status. */

	Dprintf("RECON: begin request suspend\n");
	rf_SuspendNewRequestsAndWait(raidPtr);
	Dprintf("RECON: end request suspend\n");

	/* allocate our RF_ReconCTRL_t before we protect raidPtr->reconControl[row] */
	tmp_reconctrl = rf_MakeReconControl(reconDesc, col, scol);

	rf_lock_mutex2(raidPtr->mutex);

	/* create the reconstruction control pointer and install it in
	 * the right slot */
	raidPtr->reconControl = tmp_reconctrl;
	mapPtr = raidPtr->reconControl->reconMap;
	raidPtr->reconControl->numRUsTotal = mapPtr->totalRUs;
	raidPtr->reconControl->numRUsComplete = 0;
	raidPtr->status = rf_rs_reconstructing;
	raidPtr->Disks[col].status = rf_ds_reconstructing;
	raidPtr->Disks[col].spareCol = scol;

	rf_unlock_mutex2(raidPtr->mutex);

	RF_GETTIME(raidPtr->reconControl->starttime);

	Dprintf("RECON: resume requests\n");
	rf_ResumeNewRequests(raidPtr);


	mapPtr = raidPtr->reconControl->reconMap;

	incPSID = RF_RECONMAP_SIZE;
	lastPSID = raidPtr->Layout.numStripe / raidPtr->Layout.SUsPerPU;
	RUsPerPU = raidPtr->Layout.SUsPerPU / raidPtr->Layout.SUsPerRU;
	recon_error = 0;
	write_error = 0;
	pending_writes = incPSID;
	raidPtr->reconControl->lastPSID = incPSID - 1;

	/* bounds check raidPtr->reconControl->lastPSID and
	   pending_writes so that we don't attempt to wait for more IO
	   than can possibly happen */

	if (raidPtr->reconControl->lastPSID > lastPSID)
		raidPtr->reconControl->lastPSID = lastPSID;

	if (pending_writes > lastPSID)
		pending_writes = lastPSID;
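
	/* Reconstruction proceeds in windows of incPSID parity stripes:
	   each pass of the loop below issues reads for one window, waits
	   for the corresponding writes, and then advances lastPSID by
	   incPSID until the whole array has been covered. */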

	/* start the actual reconstruction */

	done = 0;
	while (!done) {

		if (raidPtr->waitShutdown) {
			/* someone is unconfiguring this array... bail on the reconstruct.. */
			recon_error = 1;
			break;
		}

		num_writes = 0;

#if RF_INCLUDE_RAID5_RS > 0
		/* For RAID5 with Rotated Spares we will be 'short'
		   some number of writes since no writes will get
		   issued for stripes where the spare is on the
		   component being rebuilt.  Account for the shortage
		   here so that we don't hang indefinitely below
		   waiting for writes to complete that were never
		   scheduled.

		   XXX: Should be fixed for PARITY_DECLUSTERING and
		   others too!

		*/

		if (raidPtr->Layout.numDataCol <
		    raidPtr->numCol - raidPtr->Layout.numParityCol) {
			/* numDataCol is at least 2 less than numCol, so
			   should be RAID 5 with Rotated Spares */

			/* XXX need to update for RAID 6 */

			startPSID = raidPtr->reconControl->lastPSID - pending_writes + 1;
			endPSID = raidPtr->reconControl->lastPSID;

			offPSID = raidPtr->numCol - col - 1;

			aPSID = startPSID - startPSID % raidPtr->numCol + offPSID;
			if (aPSID < startPSID) {
				aPSID += raidPtr->numCol;
			}

			bPSID = endPSID - ((endPSID - offPSID) % raidPtr->numCol);

			if (aPSID < endPSID) {
				num_writes = ((bPSID - aPSID) / raidPtr->numCol) + 1;
			}

			if ((aPSID == endPSID) && (bPSID == endPSID)) {
				num_writes++;
			}
		}
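		/* Worked example (illustrative numbers): with numCol = 5
		   and col = 1, offPSID = 3, so the spare lands on the
		   rebuilt column for PSIDs with psid % 5 == 3.  For the
		   window [10..19]: aPSID = 10 - 10%5 + 3 = 13, bPSID =
		   19 - (19-3)%5 = 18, and num_writes = (18-13)/5 + 1 = 2
		   (PSIDs 13 and 18). */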
#endif

		/* issue a read for each surviving disk */

		reconDesc->numDisksDone = 0;
		for (i = 0; i < raidPtr->numCol; i++) {
			if (i != col) {
				/* find and issue the next I/O on the
				 * indicated disk */
				if (IssueNextReadRequest(raidPtr, i)) {
					Dprintf1("RECON: done issuing for c%d\n", i);
					reconDesc->numDisksDone++;
				}
			}
		}

		/* process reconstruction events until all disks report that
		 * they've completed all work */

		while (reconDesc->numDisksDone < raidPtr->numCol - 1) {

			event = rf_GetNextReconEvent(reconDesc);
			status = ProcessReconEvent(raidPtr, event);

			/* the normal case is that a read completes, and all is well. */
			if (status == RF_RECON_DONE_READS) {
				reconDesc->numDisksDone++;
			} else if ((status == RF_RECON_READ_ERROR) ||
				   (status == RF_RECON_WRITE_ERROR)) {
				/* an error was encountered while reconstructing...
				   Pretend we've finished this disk.
				*/
				recon_error = 1;
				raidPtr->reconControl->error = 1;

				/* bump the numDisksDone count for reads,
				   but not for writes */
				if (status == RF_RECON_READ_ERROR)
					reconDesc->numDisksDone++;

				/* write errors are special -- when we are
				   done dealing with the reads that are
				   finished, we don't want to wait for any
				   writes */
				if (status == RF_RECON_WRITE_ERROR) {
					write_error = 1;
					num_writes++;
				}

			} else if (status == RF_RECON_READ_STOPPED) {
				/* count this component as being "done" */
				reconDesc->numDisksDone++;
			} else if (status == RF_RECON_WRITE_DONE) {
				num_writes++;
			}

			if (recon_error) {
				/* make sure any stragglers are woken up so that
				   their threads will complete, and we can get out
				   of here with all IO processed */

				rf_WakeupHeadSepCBWaiters(raidPtr);
			}

			raidPtr->reconControl->numRUsTotal =
				mapPtr->totalRUs;
			raidPtr->reconControl->numRUsComplete =
				mapPtr->totalRUs -
				rf_UnitsLeftToReconstruct(mapPtr);

#if RF_DEBUG_RECON
			raidPtr->reconControl->percentComplete =
				(raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			if (rf_prReconSched) {
				rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
			}
#endif
		}

		/* reads done, wakeup any waiters, and then wait for writes */

		rf_WakeupHeadSepCBWaiters(raidPtr);

		while (!recon_error && (num_writes < pending_writes)) {
			event = rf_GetNextReconEvent(reconDesc);
			status = ProcessReconEvent(raidPtr, event);

			if (status == RF_RECON_WRITE_ERROR) {
				num_writes++;
				recon_error = 1;
				raidPtr->reconControl->error = 1;
				/* an error was encountered at the very end... bail */
			} else if (status == RF_RECON_WRITE_DONE) {
				num_writes++;
			} /* else it's something else, and we don't care */
		}
		if (recon_error ||
		    (raidPtr->reconControl->lastPSID == lastPSID)) {
			done = 1;
			break;
		}

		prev = raidPtr->reconControl->lastPSID;
		raidPtr->reconControl->lastPSID += incPSID;

		if (raidPtr->reconControl->lastPSID > lastPSID) {
			pending_writes = lastPSID - prev;
			raidPtr->reconControl->lastPSID = lastPSID;
		}

		/* back down curPSID to get ready for the next round... */
		for (i = 0; i < raidPtr->numCol; i++) {
			if (i != col) {
				raidPtr->reconControl->perDiskInfo[i].curPSID--;
				raidPtr->reconControl->perDiskInfo[i].ru_count = RUsPerPU - 1;
			}
		}
	}

	mapPtr = raidPtr->reconControl->reconMap;
	if (rf_reconDebug) {
		printf("RECON: all reads completed\n");
	}
	/* at this point all the reads have completed.  We now wait
	 * for any pending writes to complete, and then we're done */

	while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) {

		event = rf_GetNextReconEvent(reconDesc);
		status = ProcessReconEvent(raidPtr, event);

		if (status == RF_RECON_WRITE_ERROR) {
			recon_error = 1;
			raidPtr->reconControl->error = 1;
			/* an error was encountered at the very end... bail */
		} else {
#if RF_DEBUG_RECON
			raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
			if (rf_prReconSched) {
				rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
			}
#endif
		}
	}

	if (recon_error) {
		/* we've encountered an error in reconstructing. */
		printf("raid%d: reconstruction failed.\n", raidPtr->raidid);

		/* we start by blocking IO to the RAID set. */
		rf_SuspendNewRequestsAndWait(raidPtr);

		rf_lock_mutex2(raidPtr->mutex);
		/* mark set as being degraded, rather than
		   rf_rs_reconstructing as we were before the problem.
		   After this is done we can update status of the
		   component disks without worrying about someone
		   trying to read from a failed component.
		*/
		raidPtr->status = rf_rs_degraded;
		rf_unlock_mutex2(raidPtr->mutex);

		/* resume IO */
		rf_ResumeNewRequests(raidPtr);

		/* At this point there are two cases:
		   1) If we've experienced a read error, then we've
		   already waited for all the reads we're going to get,
		   and we just need to wait for the writes.

		   2) If we've experienced a write error, we've also
		   already waited for all the reads to complete,
		   but there is little point in waiting for the writes --
		   when they do complete, they will just be ignored.

		   So we just wait for writes to complete if we didn't have a
		   write error.
		*/

		if (!write_error) {
			/* wait for writes to complete */
			while (raidPtr->reconControl->pending_writes > 0) {

				event = rf_GetNextReconEvent(reconDesc);
				status = ProcessReconEvent(raidPtr, event);

				if (status == RF_RECON_WRITE_ERROR) {
					raidPtr->reconControl->error = 1;
					/* an error was encountered at the very end... bail.
					   This will be very bad news for the user, since
					   at this point there will have been a read error
					   on one component, and a write error on another!
					*/
					break;
				}
			}
		}


		/* cleanup */

		/* drain the event queue - after waiting for the writes above,
		   there shouldn't be much (if anything!) left in the queue. */

		rf_DrainReconEventQueue(reconDesc);

		/* XXX As much as we'd like to free the recon control structure
		   and the reconDesc, we have no way of knowing if/when those will
		   be touched by IO that has yet to occur.  It is rather poor to be
		   basically causing a 'memory leak' here, but there doesn't seem to be
		   a cleaner alternative at this time.  Perhaps when the reconstruct code
		   gets a makeover this problem will go away.
		*/
#if 0
		rf_FreeReconControl(raidPtr);
#endif

#if RF_ACC_TRACE > 0
		RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
#endif
		/* XXX see comment above */
#if 0
		FreeReconDesc(reconDesc);
#endif

		return (1);
	}

	/* Success:  mark the dead disk as reconstructed.  We quiesce
	 * the array here to assure no nasty interactions with pending
	 * user accesses when we free up the psstatus structure as
	 * part of FreeReconControl() */

	rf_SuspendNewRequestsAndWait(raidPtr);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->numFailures--;
	ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
	raidPtr->Disks[col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared;
	raidPtr->status = (ds) ? rf_rs_reconfigured : rf_rs_optimal;
	rf_unlock_mutex2(raidPtr->mutex);
	RF_GETTIME(etime);
	RF_TIMEVAL_DIFF(&(raidPtr->reconControl->starttime), &etime, &elpsd);

	rf_ResumeNewRequests(raidPtr);

	printf("raid%d: Reconstruction of disk at col %d completed\n",
	       raidPtr->raidid, col);
	xor_s = raidPtr->accumXorTimeUs / 1000000;
	xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
	printf("raid%d: Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n",
	       raidPtr->raidid,
	       (int) elpsd.tv_sec, (int) elpsd.tv_usec,
	       raidPtr->accumXorTimeUs, xor_s, xor_resid_us);
	printf("raid%d:  (start time %d sec %d usec, end time %d sec %d usec)\n",
	       raidPtr->raidid,
	       (int) raidPtr->reconControl->starttime.tv_sec,
	       (int) raidPtr->reconControl->starttime.tv_usec,
	       (int) etime.tv_sec, (int) etime.tv_usec);
#if RF_RECON_STATS > 0
	printf("raid%d: Total head-sep stall count was %d\n",
	       raidPtr->raidid, (int) reconDesc->hsStallCount);
#endif				/* RF_RECON_STATS > 0 */
	rf_FreeReconControl(raidPtr);
#if RF_ACC_TRACE > 0
	RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
#endif
	FreeReconDesc(reconDesc);

	return (0);

}
/*****************************************************************************
 * do the right thing upon each reconstruction event.
 *****************************************************************************/
static int
ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event)
{
	int     retcode = 0, submitblocked;
	RF_ReconBuffer_t *rbuf;
	RF_SectorCount_t sectorsPerRU;

	retcode = RF_RECON_READ_STOPPED;

	Dprintf1("RECON: ProcessReconEvent type %d\n", event->type);

	switch (event->type) {

		/* a read I/O has completed */
	case RF_REVENT_READDONE:
		rbuf = raidPtr->reconControl->perDiskInfo[event->col].rbuf;
		Dprintf2("RECON: READDONE EVENT: col %d psid %ld\n",
			 event->col, rbuf->parityStripeID);
		Dprintf7("RECON: done read  psid %ld buf %lx  %02x %02x %02x %02x %02x\n",
			 rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
			 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
			Dprintf1("RECON: submitblocked=%d\n", submitblocked);
			if (!submitblocked)
				retcode = IssueNextReadRequest(raidPtr, event->col);
			else
				retcode = 0;
		}
		break;

		/* a write I/O has completed */
	case RF_REVENT_WRITEDONE:
#if RF_DEBUG_RECON
		if (rf_floatingRbufDebug) {
			rf_CheckFloatingRbufCount(raidPtr, 1);
		}
#endif
		sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
		rbuf = (RF_ReconBuffer_t *) event->arg;
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n",
			 rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl->percentComplete);
		rf_ReconMapUpdate(raidPtr, raidPtr->reconControl->reconMap,
				  rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
		rf_RemoveFromActiveReconTable(raidPtr, rbuf->parityStripeID, rbuf->which_ru);

		rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
		raidPtr->reconControl->pending_writes--;
		rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);

		if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
			rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
			while(raidPtr->reconControl->rb_lock) {
				rf_wait_cond2(raidPtr->reconControl->rb_cv,
					      raidPtr->reconControl->rb_mutex);
			}
			raidPtr->reconControl->rb_lock = 1;
			rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);

			raidPtr->numFullReconBuffers--;
			rf_ReleaseFloatingReconBuffer(raidPtr, rbuf);

			rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
			raidPtr->reconControl->rb_lock = 0;
			rf_broadcast_cond2(raidPtr->reconControl->rb_cv);
			rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);
		} else
			if (rbuf->type == RF_RBUF_TYPE_FORCED)
				rf_FreeReconBuffer(rbuf);
			else
				RF_ASSERT(0);
		retcode = RF_RECON_WRITE_DONE;
		break;

	case RF_REVENT_BUFCLEAR:	/* A buffer-stall condition has been
					 * cleared */
		Dprintf1("RECON: BUFCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl->perDiskInfo[event->col].rbuf,
							     0, (int) (long) event->arg);
			RF_ASSERT(!submitblocked);	/* we wouldn't have gotten the
							 * BUFCLEAR event if we
							 * couldn't submit */
			retcode = IssueNextReadRequest(raidPtr, event->col);
		}
		break;

	case RF_REVENT_BLOCKCLEAR:	/* A user-write reconstruction
					 * blockage has been cleared */
		DDprintf1("RECON: BLOCKCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = TryToRead(raidPtr, event->col);
		}
		break;

	case RF_REVENT_HEADSEPCLEAR:	/* A max-head-separation
					 * reconstruction blockage has been
					 * cleared */
		Dprintf1("RECON: HEADSEPCLEAR EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = TryToRead(raidPtr, event->col);
		}
		break;

		/* a buffer has become ready to write */
	case RF_REVENT_BUFREADY:
		Dprintf1("RECON: BUFREADY EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = IssueNextWriteRequest(raidPtr);
#if RF_DEBUG_RECON
			if (rf_floatingRbufDebug) {
				rf_CheckFloatingRbufCount(raidPtr, 1);
			}
#endif
		}
		break;

		/* we need to skip the current RU entirely because it got
		 * recon'd while we were waiting for something else to happen */
	case RF_REVENT_SKIP:
		DDprintf1("RECON: SKIP EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			retcode = IssueNextReadRequest(raidPtr, event->col);
		}
		break;

		/* a forced-reconstruction read access has completed.  Just
		 * submit the buffer */
	case RF_REVENT_FORCEDREADDONE:
		rbuf = (RF_ReconBuffer_t *) event->arg;
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
		DDprintf1("RECON: FORCEDREADDONE EVENT: col %d\n", event->col);
		if (!raidPtr->reconControl->error) {
			submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
			RF_ASSERT(!submitblocked);
			retcode = 0;
		}
		break;

		/* A read I/O failed to complete */
	case RF_REVENT_READ_FAILED:
		retcode = RF_RECON_READ_ERROR;
		break;

		/* A write I/O failed to complete */
	case RF_REVENT_WRITE_FAILED:
		retcode = RF_RECON_WRITE_ERROR;

		/* This is an error, but it was a pending write.
		   Account for it. */
		rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
		raidPtr->reconControl->pending_writes--;
		rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);

		rbuf = (RF_ReconBuffer_t *) event->arg;

		/* cleanup the disk queue data */
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);

		/* At this point we're erroring out, badly, and floatingRbufs
		   may not even be valid.  Rather than putting this back onto
		   the floatingRbufs list, just arrange for its immediate
		   destruction.
		*/
		rf_FreeReconBuffer(rbuf);
		break;

		/* a forced read I/O failed to complete */
	case RF_REVENT_FORCEDREAD_FAILED:
		retcode = RF_RECON_READ_ERROR;
		break;

	default:
		RF_PANIC();
	}
	rf_FreeReconEventDesc(event);
	return (retcode);
}
/*****************************************************************************
 *
 * find the next thing that's needed on the indicated disk, and issue
 * a read request for it.  We assume that the reconstruction buffer
 * associated with this process is free to receive the data.  If
 * reconstruction is blocked on the indicated RU, we issue a
 * blockage-release request instead of a physical disk read request.
 * If the current disk gets too far ahead of the others, we issue a
 * head-separation wait request and return.
 *
 * ctrl->{ru_count, curPSID, diskOffset} and
 * rbuf->failedDiskSectorOffset are maintained to point to the unit
 * we're currently accessing.  Note that this deviates from the
 * standard C idiom of having counters point to the next thing to be
 * accessed.  This allows us to easily retry when we're blocked by
 * head separation or reconstruction-blockage events.
 *
 *****************************************************************************/
static int
IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_ReconBuffer_t *rbuf = ctrl->rbuf;
	RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
	RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
	int     do_new_check = 0, retcode = 0, status;

	/* if we are currently the slowest disk, mark that we have to do a new
	 * check */
	if (ctrl->headSepCounter <= raidPtr->reconControl->minHeadSepCounter)
		do_new_check = 1;

	while (1) {

		ctrl->ru_count++;
		if (ctrl->ru_count < RUsPerPU) {
			ctrl->diskOffset += sectorsPerRU;
			rbuf->failedDiskSectorOffset += sectorsPerRU;
		} else {
			ctrl->curPSID++;
			ctrl->ru_count = 0;
			/* code left over from when head-sep was based on
			 * parity stripe id */
			if (ctrl->curPSID > raidPtr->reconControl->lastPSID) {
				CheckForNewMinHeadSep(raidPtr, ++(ctrl->headSepCounter));
				return (RF_RECON_DONE_READS);	/* finito! */
			}
			/* find the disk offsets of the start of the parity
			 * stripe on both the current disk and the failed
			 * disk.  skip this entire parity stripe if either disk
			 * does not appear in the indicated PS */
			status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset,
						      &rbuf->spCol, &rbuf->spOffset);
			if (status) {
				ctrl->ru_count = RUsPerPU - 1;
				continue;
			}
		}
		rbuf->which_ru = ctrl->ru_count;

		/* skip this RU if it's already been reconstructed */
		if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, rbuf->failedDiskSectorOffset)) {
			Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count);
			continue;
		}
		break;
	}
	ctrl->headSepCounter++;
	if (do_new_check)
		CheckForNewMinHeadSep(raidPtr, ctrl->headSepCounter);	/* update min if needed */


	/* at this point, we have definitely decided what to do, and we have
	 * only to see if we can actually do it now */
	rbuf->parityStripeID = ctrl->curPSID;
	rbuf->which_ru = ctrl->ru_count;
#if RF_ACC_TRACE > 0
	memset((char *) &raidPtr->recon_tracerecs[col], 0,
	       sizeof(raidPtr->recon_tracerecs[col]));
	raidPtr->recon_tracerecs[col].reconacc = 1;
	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
#endif
	retcode = TryToRead(raidPtr, col);
	return (retcode);
}

/*
 * tries to issue the next read on the indicated disk.  We may be
 * blocked by (a) the heads being too far apart, or (b) recon on the
 * indicated RU being blocked due to a write by a user thread.  In
 * this case, we issue a head-sep or blockage wait request, which will
 * cause this same routine to be invoked again later when the blockage
 * has cleared.
 */

static int
TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
	RF_StripeNum_t psid = ctrl->curPSID;
	RF_ReconUnitNum_t which_ru = ctrl->ru_count;
	RF_DiskQueueData_t *req;
	int     status;
	RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;

	/* if the current disk is too far ahead of the others, issue a
	 * head-separation wait and return */
	if (CheckHeadSeparation(raidPtr, ctrl, col, ctrl->headSepCounter, which_ru))
		return (0);

	/* allocate a new PSS in case we need it */
	newpssPtr = rf_AllocPSStatus(raidPtr);

	RF_LOCK_PSS_MUTEX(raidPtr, psid);
	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE, newpssPtr);

	if (pssPtr != newpssPtr) {
		rf_FreePSStatus(raidPtr, newpssPtr);
	}

	/* if recon is blocked on the indicated parity stripe, issue a
	 * block-wait request and return.  this also must mark the indicated RU
	 * in the stripe as under reconstruction if not blocked. */
	status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, col, psid, which_ru);
	if (status == RF_PSS_RECON_BLOCKED) {
		Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru);
		goto out;
	} else
		if (status == RF_PSS_FORCED_ON_WRITE) {
			rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
			goto out;
		}
	/* make one last check to be sure that the indicated RU didn't get
	 * reconstructed while we were waiting for something else to happen.
	 * This is unfortunate in that it causes us to make this check twice
	 * in the normal case.  Might want to make some attempt to re-work
	 * this so that we only do this check if we've definitely blocked on
	 * one of the above checks.  When this condition is detected, we may
	 * have just created a bogus status entry, which we need to delete. */
	if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, ctrl->rbuf->failedDiskSectorOffset)) {
		Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru);
		if (pssPtr == newpssPtr)
			rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
		rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
		goto out;
	}
	/* found something to read.  issue the I/O */
	Dprintf4("RECON: Read for psid %ld on col %d offset %ld buf %lx\n",
		 psid, col, ctrl->diskOffset, ctrl->rbuf->buffer);
#if RF_ACC_TRACE > 0
	RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
	raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
	    RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
#endif
	/* should be ok to use a NULL proc pointer here, all the bufs we use
	 * should be in kernel space */
	req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
				     ReconReadDoneProc, (void *) ctrl,
#if RF_ACC_TRACE > 0
				     &raidPtr->recon_tracerecs[col],
#else
				     NULL,
#endif
				     (void *) raidPtr, 0, NULL, PR_WAITOK);

	ctrl->rbuf->arg = (void *) req;
	rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_RECON_PRIORITY);
	pssPtr->issued[col] = 1;

out:
	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
	return (0);
}


/*
 * given a parity stripe ID, we want to find out whether both the
 * current disk and the failed disk exist in that parity stripe.  If
 * not, we want to skip this whole PS.  If so, we want to find the
 * disk offset of the start of the PS on both the current disk and the
 * failed disk.
 *
 * this works by getting a list of disks comprising the indicated
 * parity stripe, and searching the list for the current and failed
 * disks.  Once we've decided they both exist in the parity stripe, we
 * need to decide whether each is data or parity, so that we'll know
 * which mapping function to call to get the corresponding disk
 * offsets.
 *
 * this is kind of unpleasant, but doing it this way allows the
 * reconstruction code to use parity stripe IDs rather than physical
 * disk addresses to march through the failed disk, which greatly
 * simplifies a lot of code, as well as eliminating the need for a
 * reverse-mapping function.  I also think it will execute faster,
 * since the calls to the mapping module are kept to a minimum.
 *
 * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING
 * THE STRIPE IN THE CORRECT ORDER
 *
 * raidPtr          - raid descriptor
 * psid             - parity stripe identifier
 * col              - column of disk to find the offsets for
 * spCol            - out: col of spare unit for failed unit
 * spOffset         - out: offset into disk containing spare unit
 *
 */
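
/*
 * Worked example (illustrative numbers): suppose stripeWidth = 4 and
 * IdentifyStripe returns diskids = {3, 1, 0, 2}, with parity on
 * pcol = 0, i.e. position k = 2.  For col = 2 we get i = 3 and, since
 * k < i, i_offset = 2: column 2 holds data unit 2 of the stripe and is
 * mapped via MapSector at sosRaidAddress + 2 * sectorsPerStripeUnit.
 * For fcol = 0 we get j = 2 = k, so j_is_parity is set and the failed
 * unit is mapped via MapParity instead.
 */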


static int
ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid,
		     RF_RowCol_t col, RF_SectorNum_t *outDiskOffset,
		     RF_SectorNum_t *outFailedDiskSectorOffset,
		     RF_RowCol_t *spCol, RF_SectorNum_t *spOffset)
{
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_RowCol_t fcol = raidPtr->reconControl->fcol;
	RF_RaidAddr_t sosRaidAddress;	/* start-of-stripe */
	RF_RowCol_t *diskids;
	u_int   i, j, k, i_offset, j_offset;
	RF_RowCol_t pcol;
	int     testcol;
	RF_SectorNum_t poffset;
	char    i_is_parity = 0, j_is_parity = 0;
	RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;

	/* get a listing of the disks comprising that stripe */
	sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
	(layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids);
	RF_ASSERT(diskids);

	/* reject this entire parity stripe if it does not contain the
	 * indicated disk or it does not contain the failed disk */

	for (i = 0; i < stripeWidth; i++) {
		if (col == diskids[i])
			break;
	}
	if (i == stripeWidth)
		goto skipit;
	for (j = 0; j < stripeWidth; j++) {
		if (fcol == diskids[j])
			break;
	}
	if (j == stripeWidth) {
		goto skipit;
	}
	/* find out which disk the parity is on */
	(layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &pcol, &poffset, RF_DONT_REMAP);

	/* find out if either the current RU or the failed RU is parity */
	/* also, if the parity occurs in this stripe prior to the data and/or
	 * failed col, we need to decrement i and/or j */
	for (k = 0; k < stripeWidth; k++)
		if (diskids[k] == pcol)
			break;
	RF_ASSERT(k < stripeWidth);
	i_offset = i;
	j_offset = j;
	if (k < i)
		i_offset--;
	else
		if (k == i) {
			i_is_parity = 1;
			i_offset = 0;
		}		/* set offsets to zero to disable multiply
				 * below */
	if (k < j)
		j_offset--;
	else
		if (k == j) {
			j_is_parity = 1;
			j_offset = 0;
		}
	/* at this point, [ij]_is_parity tells us whether the [current,failed]
	 * disk is parity at the start of this RU, and, if data, "[ij]_offset"
	 * tells us how far into the stripe the [current,failed] disk is. */

	/* call the mapping routine to get the offset into the current disk,
	 * repeat for failed disk. */
	if (i_is_parity)
		layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
	else
		layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);

	RF_ASSERT(col == testcol);

	if (j_is_parity)
		layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
	else
		layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
	RF_ASSERT(fcol == testcol);

	/* now locate the spare unit for the failed unit */
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
		if (j_is_parity)
			layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
		else
			layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
	} else {
#endif
		*spCol = raidPtr->reconControl->spareCol;
		*spOffset = *outFailedDiskSectorOffset;
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	}
#endif
	return (0);

skipit:
	Dprintf2("RECON: Skipping psid %ld: nothing needed from c%d\n",
		 psid, col);
	return (1);
}
/* this is called when a buffer has become ready to write to the replacement disk */
static int
IssueNextWriteRequest(RF_Raid_t *raidPtr)
{
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
#if RF_ACC_TRACE > 0
	RF_RowCol_t fcol = raidPtr->reconControl->fcol;
#endif
	RF_ReconBuffer_t *rbuf;
	RF_DiskQueueData_t *req;

	rbuf = rf_GetFullReconBuffer(raidPtr->reconControl);
	RF_ASSERT(rbuf);	/* there must be one available, or we wouldn't
				 * have gotten the event that sent us here */
	RF_ASSERT(rbuf->pssPtr);

	rbuf->pssPtr->writeRbuf = rbuf;
	rbuf->pssPtr = NULL;

	Dprintf6("RECON: New write (c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n",
		 rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID,
		 rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer);
	Dprintf6("RECON: new write psid %ld   %02x %02x %02x %02x %02x\n",
		 rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
		 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);

	/* should be ok to use a NULL b_proc here b/c all addrs should be in
	 * kernel space */
	req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset,
				     sectorsPerRU, rbuf->buffer,
				     rbuf->parityStripeID, rbuf->which_ru,
				     ReconWriteDoneProc, (void *) rbuf,
#if RF_ACC_TRACE > 0
				     &raidPtr->recon_tracerecs[fcol],
#else
				     NULL,
#endif
				     (void *) raidPtr, 0, NULL, PR_WAITOK);

	rbuf->arg = (void *) req;
	rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
	raidPtr->reconControl->pending_writes++;
	rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);
	rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spCol], req, RF_IO_RECON_PRIORITY);

	return (0);
}

/*
 * this gets called upon the completion of a reconstruction read
 * operation.  The arg is a pointer to the per-disk reconstruction
 * control structure for the process that just finished a read.
 *
 * called at interrupt context in the kernel, so don't do anything
 * illegal here.
 */
1512 static int
1513 ReconReadDoneProc(void *arg, int status)
1514 {
1515 RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
1516 RF_Raid_t *raidPtr;
1517
1518 /* Detect that reconCtrl is no longer valid, and if that
1519 is the case, bail without calling rf_CauseReconEvent().
1520 There won't be anyone listening for this event anyway */
1521
1522 if (ctrl->reconCtrl == NULL)
1523 return(0);
1524
1525 raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;
1526
1527 if (status) {
1528 printf("raid%d: Recon read failed: %d\n", raidPtr->raidid, status);
1529 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READ_FAILED);
1530 return(0);
1531 }
1532 #if RF_ACC_TRACE > 0
1533 RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1534 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1535 raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us =
1536 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1537 RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1538 #endif
1539 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READDONE);
1540 return (0);
1541 }
1542 /* this gets called upon the completion of a reconstruction write operation.
1543 * the arg is a pointer to the rbuf that was just written
1544 *
1545 * called at interrupt context in the kernel, so don't do anything illegal here.
1546 */
1547 static int
1548 ReconWriteDoneProc(void *arg, int status)
1549 {
1550 RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;
1551
1552 /* Detect that reconControl is no longer valid, and if that
1553 is the case, bail without calling rf_CauseReconEvent().
1554 There won't be anyone listening for this event anyway */
1555
1556 if (rbuf->raidPtr->reconControl == NULL)
1557 return(0);
1558
1559 Dprintf2("Reconstruction completed on psid %ld ru %d\n", rbuf->parityStripeID, rbuf->which_ru);
1560 if (status) {
1561 printf("raid%d: Recon write failed (status %d(0x%x))!\n", rbuf->raidPtr->raidid,status,status);
1562 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITE_FAILED);
1563 return(0);
1564 }
1565 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITEDONE);
1566 return (0);
1567 }


/*
 * computes a new minimum head sep, and wakes up anyone who needs to
 * be woken as a result
 */
static void
CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_HeadSepLimit_t hsCtr)
{
	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
	RF_HeadSepLimit_t new_min;
	RF_RowCol_t i;
	RF_CallbackDesc_t *p;
	RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter);	/* from the definition
								 * of a minimum */

	/* grab the recon buffer lock: wait for rb_lock to clear, then
	 * claim it ourselves */
	rf_lock_mutex2(reconCtrlPtr->rb_mutex);
	while(reconCtrlPtr->rb_lock) {
		rf_wait_cond2(reconCtrlPtr->rb_cv, reconCtrlPtr->rb_mutex);
	}
	reconCtrlPtr->rb_lock = 1;
	rf_unlock_mutex2(reconCtrlPtr->rb_mutex);

	new_min = ~(1L << (8 * sizeof(long) - 1));	/* largest positive long,
							 * i.e. 0x7FFF...FFF */
	for (i = 0; i < raidPtr->numCol; i++)
		if (i != reconCtrlPtr->fcol) {
			if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min)
				new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter;
		}
	/* set the new minimum and wake up anyone who can now run again */
	if (new_min != reconCtrlPtr->minHeadSepCounter) {
		reconCtrlPtr->minHeadSepCounter = new_min;
		Dprintf1("RECON: new min head pos counter val is %ld\n", new_min);
		while (reconCtrlPtr->headSepCBList) {
			if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min)
				break;
			p = reconCtrlPtr->headSepCBList;
			reconCtrlPtr->headSepCBList = p->next;
			p->next = NULL;
			rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
			rf_FreeCallbackDesc(p);
		}
	}
	rf_lock_mutex2(reconCtrlPtr->rb_mutex);
	reconCtrlPtr->rb_lock = 0;
	rf_broadcast_cond2(reconCtrlPtr->rb_cv);
	rf_unlock_mutex2(reconCtrlPtr->rb_mutex);
}
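
/*
 * Editor's note (worked example, assumed values): with three surviving
 * columns whose headSepCounter values are 120, 45 and 130, the scan
 * above yields new_min = 45.  Any entry parked on headSepCBList with
 * callbackArg.v <= 45 is popped and sent RF_REVENT_HEADSEPCLEAR; the
 * list is kept sorted ascending by wakeup target (see
 * CheckHeadSeparation below), so the loop can stop at the first entry
 * whose target exceeds new_min.
 */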

/*
 * checks to see that the maximum head separation will not be violated
 * if we initiate a reconstruction I/O on the indicated disk.
 * Limiting the maximum head separation between two disks eliminates
 * the nasty buffer-stall conditions that occur when one disk races
 * ahead of the others and consumes all of the floating recon buffers.
 * This code is complex and unpleasant but it's necessary to avoid
 * some very nasty, albeit fairly rare, reconstruction behavior.
 *
 * returns non-zero if and only if we have to stop working on the
 * indicated disk due to a head-separation delay.
 */
static int
CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl,
		    RF_RowCol_t col, RF_HeadSepLimit_t hsCtr,
		    RF_ReconUnitNum_t which_ru)
{
	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
	RF_CallbackDesc_t *cb, *p, *pt;
	int retval = 0;

	/* if we're too far ahead of the slowest disk, stop working on this
	 * disk until the slower ones catch up.  We do this by scheduling a
	 * wakeup callback for the time when the slowest disk has caught up.
	 * We define "caught up" with 20% hysteresis, i.e. the head separation
	 * must have fallen to at most 80% of the max allowable head
	 * separation before we'll wake up. */
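	/*
	 * Editor's note (worked example, assumed values): with
	 * headSepLimit = 100, ctrl->headSepCounter = 250 and
	 * minHeadSepCounter = 140, the separation is 110 > 100, so we
	 * stall.  The wakeup target computed below is
	 * 250 - 100 + 100/5 = 170, i.e. we resume once the slowest disk
	 * has closed the gap to 80 (80% of the limit).
	 */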
	rf_lock_mutex2(reconCtrlPtr->rb_mutex);
	while(reconCtrlPtr->rb_lock) {
		rf_wait_cond2(reconCtrlPtr->rb_cv, reconCtrlPtr->rb_mutex);
	}
	reconCtrlPtr->rb_lock = 1;
	rf_unlock_mutex2(reconCtrlPtr->rb_mutex);
	if ((raidPtr->headSepLimit >= 0) &&
	    ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) {
		Dprintf5("raid%d: RECON: head sep stall: col %d hsCtr %ld minHSCtr %ld limit %ld\n",
			 raidPtr->raidid, col, ctrl->headSepCounter,
			 reconCtrlPtr->minHeadSepCounter,
			 raidPtr->headSepLimit);
		cb = rf_AllocCallbackDesc();
		/* the minHeadSepCounter value we have to get to before we'll
		 * wake up.  build in 20% hysteresis. */
		cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5);
		cb->col = col;
		cb->next = NULL;

		/* insert this callback descriptor into the sorted list of
		 * pending head-sep callbacks */
		p = reconCtrlPtr->headSepCBList;
		if (!p)
			reconCtrlPtr->headSepCBList = cb;
		else
			if (cb->callbackArg.v < p->callbackArg.v) {
				cb->next = reconCtrlPtr->headSepCBList;
				reconCtrlPtr->headSepCBList = cb;
			} else {
				/* walk to the insertion point that keeps the
				 * list sorted by wakeup target */
				for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next);
				cb->next = p;
				pt->next = cb;
			}
		retval = 1;
#if RF_RECON_STATS > 0
		ctrl->reconCtrl->reconDesc->hsStallCount++;
#endif				/* RF_RECON_STATS > 0 */
	}
	rf_lock_mutex2(reconCtrlPtr->rb_mutex);
	reconCtrlPtr->rb_lock = 0;
	rf_broadcast_cond2(reconCtrlPtr->rb_cv);
	rf_unlock_mutex2(reconCtrlPtr->rb_mutex);

	return (retval);
}
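
/*
 * Editor's note (illustrative): a non-zero return from
 * CheckHeadSeparation() means the caller should stop issuing recon
 * reads on this column; work resumes when CheckForNewMinHeadSep()
 * above (or rf_WakeupHeadSepCBWaiters() below) delivers
 * RF_REVENT_HEADSEPCLEAR for the column.
 */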
/*
 * checks to see if reconstruction has been either forced or blocked
 * by a user operation.  if forced, we skip this RU entirely.  else if
 * blocked, put ourselves on the wait list.  else return 0.
 *
 * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY
 */
static int
CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr,
				   RF_ReconParityStripeStatus_t *pssPtr,
				   RF_PerDiskReconCtrl_t *ctrl,
				   RF_RowCol_t col,
				   RF_StripeNum_t psid,
				   RF_ReconUnitNum_t which_ru)
{
	RF_CallbackDesc_t *cb;
	int retcode = 0;

	/* note that a read-forced RU is reported with the write code too */
	if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE))
		retcode = RF_PSS_FORCED_ON_WRITE;
	else
		if (pssPtr->flags & RF_PSS_RECON_BLOCKED) {
			Dprintf3("RECON: col %d blocked at psid %ld ru %d\n", col, psid, which_ru);
			cb = rf_AllocCallbackDesc();	/* append ourselves to
							 * the blockage-wait
							 * list */
			cb->col = col;
			cb->next = pssPtr->blockWaitList;
			pssPtr->blockWaitList = cb;
			retcode = RF_PSS_RECON_BLOCKED;
		}
	if (!retcode)
		pssPtr->flags |= RF_PSS_UNDER_RECON;	/* mark this RU as under
							 * reconstruction */

	return (retcode);
}
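
/*
 * Editor's note (summary of the cases above):
 *
 *	FORCED_ON_READ or FORCED_ON_WRITE set -> return RF_PSS_FORCED_ON_WRITE
 *	    (caller skips this RU; a forced recon is already in flight)
 *	RECON_BLOCKED set                     -> return RF_PSS_RECON_BLOCKED
 *	    (caller is parked on blockWaitList until RF_REVENT_BLOCKCLEAR)
 *	neither                               -> mark RF_PSS_UNDER_RECON,
 *	    return 0 (caller proceeds to reconstruct this RU)
 */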
/*
 * If reconstruction is currently ongoing on the indicated stripe, it is
 * forced to completion at normal priority and we return non-zero to
 * indicate that the caller must wait.  If not, reconstruction is
 * blocked on the indicated stripe and the routine returns zero.  If
 * and only if we return non-zero, we'll cause the cbFunc to get
 * invoked with the cbArg when the reconstruction has completed.
 */
int
rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
		     void (*cbFunc)(RF_Raid_t *, void *), void *cbArg)
{
	RF_StripeNum_t stripeID = asmap->stripeID;	/* the stripe ID we're
							 * forcing recon on */
	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;	/* num sects in one RU */
	RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;	/* a pointer to the parity
								 * stripe status structure */
	RF_StripeNum_t psid;	/* parity stripe id */
	RF_SectorNum_t offset, fd_offset;	/* disk offset, failed-disk
						 * offset */
	RF_RowCol_t *diskids;
	RF_ReconUnitNum_t which_ru;	/* RU within parity stripe */
	RF_RowCol_t fcol, diskno, i;
	RF_ReconBuffer_t *new_rbuf;	/* ptr to newly allocated rbufs */
	RF_DiskQueueData_t *req;	/* disk I/O req to be enqueued */
	RF_CallbackDesc_t *cb;
	int nPromoted;

	psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);

	/* allocate a new PSS in case we need it */
	newpssPtr = rf_AllocPSStatus(raidPtr);

	RF_LOCK_PSS_MUTEX(raidPtr, psid);

	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, newpssPtr);

	if (pssPtr != newpssPtr) {
		rf_FreePSStatus(raidPtr, newpssPtr);
	}

	/* if recon is not ongoing on this PS, just return */
	if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
		RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
		return (0);
	}
	/* otherwise, we have to wait for reconstruction to complete on this
	 * RU. */
	/* In order to avoid waiting for a potentially large number of
	 * low-priority accesses to complete, we force a normal-priority (i.e.
	 * not low-priority) reconstruction on this RU. */
	if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
		DDprintf1("Forcing recon on psid %ld\n", psid);
		pssPtr->flags |= RF_PSS_FORCED_ON_WRITE;	/* mark this RU as under
								 * forced recon */
		pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;	/* clear the blockage
							 * that we just set */
		fcol = raidPtr->reconControl->fcol;

		/* get a listing of the disks comprising the indicated stripe */
		(raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids);

		/* For previously issued reads, elevate them to normal
		 * priority.  If the I/O has already completed, it won't be
		 * found in the queue, and hence this will be a no-op.  For
		 * unissued reads, allocate buffers and issue new reads.  The
		 * fact that we've set the FORCED bit means that the regular
		 * recon procs will not re-issue these reqs */
		for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++)
			if ((diskno = diskids[i]) != fcol) {
				if (pssPtr->issued[diskno]) {
					nPromoted = rf_DiskIOPromote(&raidPtr->Queues[diskno], psid, which_ru);
					if (rf_reconDebug && nPromoted)
						printf("raid%d: promoted read from col %d\n", raidPtr->raidid, diskno);
				} else {
					new_rbuf = rf_MakeReconBuffer(raidPtr, diskno, RF_RBUF_TYPE_FORCED);	/* create new buf */
					ComputePSDiskOffsets(raidPtr, psid, diskno, &offset, &fd_offset,
					    &new_rbuf->spCol, &new_rbuf->spOffset);	/* find offsets & spare
											 * location */
					new_rbuf->parityStripeID = psid;	/* fill in the buffer */
					new_rbuf->which_ru = which_ru;
					new_rbuf->failedDiskSectorOffset = fd_offset;
					new_rbuf->priority = RF_IO_NORMAL_PRIORITY;

					/* use NULL b_proc b/c all addrs
					 * should be in kernel space */
					/* XXX the cast below papers over the
					 * fact that ForceReconReadDoneProc
					 * returns void */
					req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer,
					    psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf,
					    NULL, (void *) raidPtr, 0, NULL, PR_WAITOK);

					new_rbuf->arg = req;
					rf_DiskIOEnqueue(&raidPtr->Queues[diskno], req, RF_IO_NORMAL_PRIORITY);	/* enqueue the I/O */
					Dprintf2("raid%d: Issued new read req on col %d\n", raidPtr->raidid, diskno);
				}
			}
		/* if the write is sitting in the disk queue, elevate its
		 * priority */
		if (rf_DiskIOPromote(&raidPtr->Queues[fcol], psid, which_ru))
			if (rf_reconDebug)
				printf("raid%d: promoted write to col %d\n",
				       raidPtr->raidid, fcol);
	}
	/* install a callback descriptor to be invoked when recon completes on
	 * this parity stripe. */
	cb = rf_AllocCallbackDesc();
	/* XXX the following is bogus.. These functions don't really match!!
	 * GO */
	cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc;
	cb->callbackArg.p = (void *) cbArg;
	cb->next = pssPtr->procWaitList;
	pssPtr->procWaitList = cb;
	DDprintf2("raid%d: Waiting for forced recon on psid %ld\n",
		  raidPtr->raidid, psid);

	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
	return (1);
}
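
/*
 * Editor's note (usage sketch; 'recon_done_cb' and 'ctx' are
 * hypothetical names, not part of the driver):
 *
 *	static void recon_done_cb(RF_Raid_t *r, void *arg) {
 *		... wake the access that was waiting on forced recon ...
 *	}
 *
 *	if (rf_ForceOrBlockRecon(raidPtr, asmap, recon_done_cb, ctx)) {
 *		... recon was in progress on this stripe; wait until
 *		    recon_done_cb fires ...
 *	} else {
 *		... stripe is now blocked against recon; perform the
 *		    access, then release the blockage ...
 *		rf_UnblockRecon(raidPtr, asmap);
 *	}
 */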
/* Called upon the completion of a forced reconstruction read.
 * All we do is schedule the FORCEDREADDONE event.
 * Called at interrupt context in the kernel, so don't do anything illegal here.
 */
static void
ForceReconReadDoneProc(void *arg, int status)
{
	RF_ReconBuffer_t *rbuf = arg;

	/* Detect that reconControl is no longer valid, and if that
	   is the case, bail without calling rf_CauseReconEvent().
	   There won't be anyone listening for this event anyway */

	if (rbuf->raidPtr->reconControl == NULL)
		return;

	if (status) {
		printf("raid%d: Forced recon read failed!\n", rbuf->raidPtr->raidid);
		rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREAD_FAILED);
		return;
	}
	rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE);
}
/* releases a block on the reconstruction of the indicated stripe */
int
rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
{
	RF_StripeNum_t stripeID = asmap->stripeID;
	RF_ReconParityStripeStatus_t *pssPtr;
	RF_ReconUnitNum_t which_ru;
	RF_StripeNum_t psid;
	RF_CallbackDesc_t *cb;

	psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
	RF_LOCK_PSS_MUTEX(raidPtr, psid);
	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_NONE, NULL);

	/* When recon is forced, the pss desc can get deleted before we get
	 * back to unblock recon.  But, this can _only_ happen when recon is
	 * forced.  It would be good to put some kind of sanity check here,
	 * but how to decide if recon was just forced or not? */
	if (!pssPtr) {
#if (RF_DEBUG_RECON > 0) || (RF_DEBUG_PSS > 0)
		if (rf_reconDebug || rf_pssDebug)
			printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n", (long) psid, which_ru);
#endif
		goto out;
	}
	pssPtr->blockCount--;
	Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d\n",
		 raidPtr->raidid, psid, pssPtr->blockCount);
	if (pssPtr->blockCount == 0) {	/* if recon blockage has been released */

		/* unblock recon before calling CauseReconEvent in case
		 * CauseReconEvent causes us to try to issue a new read before
		 * returning here. */
		pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;

		while (pssPtr->blockWaitList) {
			/* spin through the block-wait list and
			   release all the waiters */
			cb = pssPtr->blockWaitList;
			pssPtr->blockWaitList = cb->next;
			cb->next = NULL;
			rf_CauseReconEvent(raidPtr, cb->col, NULL, RF_REVENT_BLOCKCLEAR);
			rf_FreeCallbackDesc(cb);
		}
		if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
			/* if no recon was requested while recon was blocked */
			rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
		}
	}
out:
	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
	return (0);
}
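
/*
 * Editor's note (assumption, pairing not shown in this section):
 * blockCount presumably pairs one-for-one with the blockage set up in
 * rf_ForceOrBlockRecon() via the RF_PSS_RECON_BLOCKED lookup flag; if
 * several accesses have blocked the same stripe, the waiters on
 * blockWaitList are only released when the last of them calls
 * rf_UnblockRecon() and the count reaches zero.
 */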
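
/*
 * Unconditionally release everyone parked on the head-separation
 * callback list.  (Editor's note: the callers are outside this section;
 * presumably this is used when reconstruction stops and the head-sep
 * limit no longer applies, so that no column is left stalled.)
 */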
void
rf_WakeupHeadSepCBWaiters(RF_Raid_t *raidPtr)
{
	RF_CallbackDesc_t *p;

	rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
	while(raidPtr->reconControl->rb_lock) {
		rf_wait_cond2(raidPtr->reconControl->rb_cv,
			      raidPtr->reconControl->rb_mutex);
	}
	raidPtr->reconControl->rb_lock = 1;
	rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);

	while (raidPtr->reconControl->headSepCBList) {
		p = raidPtr->reconControl->headSepCBList;
		raidPtr->reconControl->headSepCBList = p->next;
		p->next = NULL;
		rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
		rf_FreeCallbackDesc(p);
	}
	rf_lock_mutex2(raidPtr->reconControl->rb_mutex);
	raidPtr->reconControl->rb_lock = 0;
	rf_broadcast_cond2(raidPtr->reconControl->rb_cv);
	rf_unlock_mutex2(raidPtr->reconControl->rb_mutex);
}