rf_reconstruct.c revision 1.105.4.4 1 /* $NetBSD: rf_reconstruct.c,v 1.105.4.4 2010/11/21 22:06:53 riz Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /************************************************************
30 *
31 * rf_reconstruct.c -- code to perform on-line reconstruction
32 *
33 ************************************************************/
34
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.105.4.4 2010/11/21 22:06:53 riz Exp $");
37
38 #include <sys/param.h>
39 #include <sys/time.h>
40 #include <sys/buf.h>
41 #include <sys/errno.h>
42 #include <sys/systm.h>
43 #include <sys/proc.h>
44 #include <sys/ioctl.h>
45 #include <sys/fcntl.h>
46 #include <sys/vnode.h>
47 #include <dev/raidframe/raidframevar.h>
48
49 #include "rf_raid.h"
50 #include "rf_reconutil.h"
51 #include "rf_revent.h"
52 #include "rf_reconbuffer.h"
53 #include "rf_acctrace.h"
54 #include "rf_etimer.h"
55 #include "rf_dag.h"
56 #include "rf_desc.h"
57 #include "rf_debugprint.h"
58 #include "rf_general.h"
59 #include "rf_driver.h"
60 #include "rf_utils.h"
61 #include "rf_shutdown.h"
62
63 #include "rf_kintf.h"
64
65 /* setting these to -1 causes them to be set to their default values if not set by debug options */
66
/*
 * Debug tracing helpers for the reconstruction code.
 *
 * When RF_DEBUG_RECON is enabled, each macro forwards its format string and
 * up to seven arguments to rf_debug_printf() if rf_reconDebug is set;
 * unused argument slots are passed as NULL.  Otherwise the macros expand to
 * nothing and their arguments are not evaluated.
 *
 * Every variant is wrapped in do { ... } while (0) so that an invocation
 * followed by a semicolon is exactly one statement.  This keeps the macros
 * safe inside un-braced if/else bodies (a bare "if (...) call" would capture
 * a following else, and a bare "{}" followed by ';' would orphan it).
 */
#if RF_DEBUG_RECON
#define Dprintf(s) do { if (rf_reconDebug) rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL); } while (0)
#define Dprintf1(s,a) do { if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL); } while (0)
#define Dprintf2(s,a,b) do { if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL); } while (0)
#define Dprintf3(s,a,b,c) do { if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL); } while (0)
#define Dprintf4(s,a,b,c,d) do { if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL); } while (0)
#define Dprintf5(s,a,b,c,d,e) do { if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL); } while (0)
#define Dprintf6(s,a,b,c,d,e,f) do { if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL); } while (0)
#define Dprintf7(s,a,b,c,d,e,f,g) do { if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL); } while (0)

#define DDprintf1(s,a) do { if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL); } while (0)
#define DDprintf2(s,a,b) do { if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL); } while (0)

#else /* RF_DEBUG_RECON */

#define Dprintf(s) do { } while (0)
#define Dprintf1(s,a) do { } while (0)
#define Dprintf2(s,a,b) do { } while (0)
#define Dprintf3(s,a,b,c) do { } while (0)
#define Dprintf4(s,a,b,c,d) do { } while (0)
#define Dprintf5(s,a,b,c,d,e) do { } while (0)
#define Dprintf6(s,a,b,c,d,e,f) do { } while (0)
#define Dprintf7(s,a,b,c,d,e,f,g) do { } while (0)

#define DDprintf1(s,a) do { } while (0)
#define DDprintf2(s,a,b) do { } while (0)

#endif /* RF_DEBUG_RECON */
95
/*
 * Status codes produced by ProcessReconEvent() and interpreted by the event
 * loops in rf_ContinueReconstructFailedDisk().
 */
96 #define RF_RECON_DONE_READS 1 /* a disk has no more reads to issue; counted as done */
97 #define RF_RECON_READ_ERROR 2 /* read failure: recon is flagged as failed, disk counted as done */
98 #define RF_RECON_WRITE_ERROR 3 /* write failure: recon is flagged as failed, counted as a write */
99 #define RF_RECON_READ_STOPPED 4 /* default status; the component is counted as done */
100 #define RF_RECON_WRITE_DONE 5 /* a reconstruction write completed */
101
/*
 * Sizing arguments handed to rf_pool_init() for the reconstruction buffer
 * pool in rf_ConfigureReconstruction().
 */
102 #define RF_MAX_FREE_RECONBUFFER 32
103 #define RF_MIN_FREE_RECONBUFFER 16
104
/*
 * Forward declarations for the file-local (static) helpers used by the
 * reconstruction code in this file.
 */
105 static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *, RF_RowCol_t,
106 RF_RaidDisk_t *, int, RF_RowCol_t);
107 static void FreeReconDesc(RF_RaidReconDesc_t *);
108 static int ProcessReconEvent(RF_Raid_t *, RF_ReconEvent_t *);
109 static int IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t);
110 static int TryToRead(RF_Raid_t *, RF_RowCol_t);
111 static int ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t, RF_RowCol_t,
112 RF_SectorNum_t *, RF_SectorNum_t *, RF_RowCol_t *,
113 RF_SectorNum_t *);
114 static int IssueNextWriteRequest(RF_Raid_t *);
115 static int ReconReadDoneProc(void *, int);
116 static int ReconWriteDoneProc(void *, int);
117 static void CheckForNewMinHeadSep(RF_Raid_t *, RF_HeadSepLimit_t);
118 static int CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *,
119 RF_RowCol_t, RF_HeadSepLimit_t,
120 RF_ReconUnitNum_t);
121 static int CheckForcedOrBlockedReconstruction(RF_Raid_t *,
122 RF_ReconParityStripeStatus_t *,
123 RF_PerDiskReconCtrl_t *,
124 RF_RowCol_t, RF_StripeNum_t,
125 RF_ReconUnitNum_t);
126 static void ForceReconReadDoneProc(void *, int);
/* Shutdown-list callback registered by rf_ConfigureReconstruction(). */
127 static void rf_ShutdownReconstruction(void *);
128
/*
 * Node of a singly linked list of callbacks.
 * NOTE(review): the name suggests these run when a reconstruction
 * finishes, but no user of this structure is visible in this part of
 * the file -- confirm against the code that walks the list.
 */
129 struct RF_ReconDoneProc_s {
130 void (*proc) (RF_Raid_t *, void *); /* callback taking a RAID descriptor and an opaque pointer */
131 void *arg; /* presumably the opaque pointer passed to proc -- confirm at the call site */
132 RF_ReconDoneProc_t *next; /* next node in the list */
133 };
134
135 /**************************************************************************
136 *
137 * sets up the parameters that will be used by the reconstruction process
138 * currently there are none, except for those that the layout-specific
139 * configuration (e.g. rf_ConfigureDeclustered) routine sets up.
140 *
141 * in the kernel, we fire off the recon thread.
142 *
143 **************************************************************************/
144 static void
145 rf_ShutdownReconstruction(void *ignored)
146 {
147 pool_destroy(&rf_pools.reconbuffer);
148 }
149
150 int
151 rf_ConfigureReconstruction(RF_ShutdownList_t **listp)
152 {
153
154 rf_pool_init(&rf_pools.reconbuffer, sizeof(RF_ReconBuffer_t),
155 "rf_reconbuffer_pl", RF_MIN_FREE_RECONBUFFER, RF_MAX_FREE_RECONBUFFER);
156 rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);
157
158 return (0);
159 }
160
161 static RF_RaidReconDesc_t *
162 AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t col,
163 RF_RaidDisk_t *spareDiskPtr, int numDisksDone,
164 RF_RowCol_t scol)
165 {
166
167 RF_RaidReconDesc_t *reconDesc;
168
169 RF_Malloc(reconDesc, sizeof(RF_RaidReconDesc_t),
170 (RF_RaidReconDesc_t *));
171 reconDesc->raidPtr = raidPtr;
172 reconDesc->col = col;
173 reconDesc->spareDiskPtr = spareDiskPtr;
174 reconDesc->numDisksDone = numDisksDone;
175 reconDesc->scol = scol;
176 reconDesc->next = NULL;
177
178 return (reconDesc);
179 }
180
181 static void
182 FreeReconDesc(RF_RaidReconDesc_t *reconDesc)
183 {
184 #if RF_RECON_STATS > 0
185 printf("raid%d: %lu recon event waits, %lu recon delays\n",
186 reconDesc->raidPtr->raidid,
187 (long) reconDesc->numReconEventWaits,
188 (long) reconDesc->numReconExecDelays);
189 #endif /* RF_RECON_STATS > 0 */
190 printf("raid%d: %lu max exec ticks\n",
191 reconDesc->raidPtr->raidid,
192 (long) reconDesc->maxReconExecTicks);
193 RF_Free(reconDesc, sizeof(RF_RaidReconDesc_t));
194 }
195
196
197 /*****************************************************************************
198 *
199 * primary routine to reconstruct a failed disk. This should be called from
200 * within its own thread. It won't return until reconstruction completes,
201 * fails, or is aborted.
202 *****************************************************************************/
203 int
204 rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t col)
205 {
206 const RF_LayoutSW_t *lp;
207 int rc;
208
209 lp = raidPtr->Layout.map;
210 if (lp->SubmitReconBuffer) {
211 /*
212 * The current infrastructure only supports reconstructing one
213 * disk at a time for each array.
214 */
215 RF_LOCK_MUTEX(raidPtr->mutex);
216 while (raidPtr->reconInProgress) {
217 RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
218 }
219 raidPtr->reconInProgress++;
220 RF_UNLOCK_MUTEX(raidPtr->mutex);
221 rc = rf_ReconstructFailedDiskBasic(raidPtr, col);
222 RF_LOCK_MUTEX(raidPtr->mutex);
223 raidPtr->reconInProgress--;
224 RF_UNLOCK_MUTEX(raidPtr->mutex);
225 } else {
226 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
227 lp->parityConfig);
228 rc = EIO;
229 }
230 RF_SIGNAL_COND(raidPtr->waitForReconCond);
231 return (rc);
232 }
233
234 int
235 rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t col)
236 {
237 RF_ComponentLabel_t *c_label;
238 RF_RaidDisk_t *spareDiskPtr = NULL;
239 RF_RaidReconDesc_t *reconDesc;
240 RF_RowCol_t scol;
241 int numDisksDone = 0, rc;
242
243 /* first look for a spare drive onto which to reconstruct the data */
244 /* spare disk descriptors are stored in row 0. This may have to
245 * change eventually */
246
247 RF_LOCK_MUTEX(raidPtr->mutex);
248 RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed);
249 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
250 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
251 if (raidPtr->status != rf_rs_degraded) {
252 RF_ERRORMSG1("Unable to reconstruct disk at col %d because status not degraded\n", col);
253 RF_UNLOCK_MUTEX(raidPtr->mutex);
254 return (EINVAL);
255 }
256 scol = (-1);
257 } else {
258 #endif
259 for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
260 if (raidPtr->Disks[scol].status == rf_ds_spare) {
261 spareDiskPtr = &raidPtr->Disks[scol];
262 spareDiskPtr->status = rf_ds_used_spare;
263 break;
264 }
265 }
266 if (!spareDiskPtr) {
267 RF_ERRORMSG1("Unable to reconstruct disk at col %d because no spares are available\n", col);
268 RF_UNLOCK_MUTEX(raidPtr->mutex);
269 return (ENOSPC);
270 }
271 printf("RECON: initiating reconstruction on col %d -> spare at col %d\n", col, scol);
272 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
273 }
274 #endif
275 RF_UNLOCK_MUTEX(raidPtr->mutex);
276
277 reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, numDisksDone, scol);
278 raidPtr->reconDesc = (void *) reconDesc;
279 #if RF_RECON_STATS > 0
280 reconDesc->hsStallCount = 0;
281 reconDesc->numReconExecDelays = 0;
282 reconDesc->numReconEventWaits = 0;
283 #endif /* RF_RECON_STATS > 0 */
284 reconDesc->reconExecTimerRunning = 0;
285 reconDesc->reconExecTicks = 0;
286 reconDesc->maxReconExecTicks = 0;
287 rc = rf_ContinueReconstructFailedDisk(reconDesc);
288
289 if (!rc) {
290 /* fix up the component label */
291 /* Don't actually need the read here.. */
292 c_label = raidget_component_label(raidPtr, scol);
293
294 raid_init_component_label(raidPtr, c_label);
295 c_label->row = 0;
296 c_label->column = col;
297 c_label->clean = RF_RAID_DIRTY;
298 c_label->status = rf_ds_optimal;
299 c_label->partitionSize = raidPtr->Disks[scol].partitionSize;
300 c_label->partitionSizeHi =
301 raidPtr->Disks[scol].partitionSize >> 32;
302
303 /* We've just done a rebuild based on all the other
304 disks, so at this point the parity is known to be
305 clean, even if it wasn't before. */
306
307 /* XXX doesn't hold for RAID 6!!*/
308
309 RF_LOCK_MUTEX(raidPtr->mutex);
310 raidPtr->parity_good = RF_RAID_CLEAN;
311 RF_UNLOCK_MUTEX(raidPtr->mutex);
312
313 /* XXXX MORE NEEDED HERE */
314
315 raidflush_component_label(raidPtr, scol);
316 } else {
317 /* Reconstruct failed. */
318
319 RF_LOCK_MUTEX(raidPtr->mutex);
320 /* Failed disk goes back to "failed" status */
321 raidPtr->Disks[col].status = rf_ds_failed;
322
323 /* Spare disk goes back to "spare" status. */
324 spareDiskPtr->status = rf_ds_spare;
325 RF_UNLOCK_MUTEX(raidPtr->mutex);
326
327 }
328 rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
329 return (rc);
330 }
331
332 /*
333
334 Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
335 and you don't get a spare until the next Monday. With this function
336 (and hot-swappable drives) you can now put your new disk containing
337 /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
338 rebuild the data "on the spot".
339
340 */
341
342 int
343 rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t col)
344 {
345 RF_RaidDisk_t *spareDiskPtr = NULL;
346 RF_RaidReconDesc_t *reconDesc;
347 const RF_LayoutSW_t *lp;
348 RF_ComponentLabel_t *c_label;
349 int numDisksDone = 0, rc;
350 struct partinfo dpart;
351 struct vnode *vp;
352 struct vattr va;
353 int retcode;
354 int ac;
355
356 lp = raidPtr->Layout.map;
357 if (!lp->SubmitReconBuffer) {
358 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
359 lp->parityConfig);
360 /* wakeup anyone who might be waiting to do a reconstruct */
361 RF_SIGNAL_COND(raidPtr->waitForReconCond);
362 return(EIO);
363 }
364
365 /*
366 * The current infrastructure only supports reconstructing one
367 * disk at a time for each array.
368 */
369 RF_LOCK_MUTEX(raidPtr->mutex);
370
371 if (raidPtr->Disks[col].status != rf_ds_failed) {
372 /* "It's gone..." */
373 raidPtr->numFailures++;
374 raidPtr->Disks[col].status = rf_ds_failed;
375 raidPtr->status = rf_rs_degraded;
376 RF_UNLOCK_MUTEX(raidPtr->mutex);
377 rf_update_component_labels(raidPtr,
378 RF_NORMAL_COMPONENT_UPDATE);
379 RF_LOCK_MUTEX(raidPtr->mutex);
380 }
381
382 while (raidPtr->reconInProgress) {
383 RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
384 }
385
386 raidPtr->reconInProgress++;
387
388 /* first look for a spare drive onto which to reconstruct the
389 data. spare disk descriptors are stored in row 0. This
390 may have to change eventually */
391
392 /* Actually, we don't care if it's failed or not... On a RAID
393 set with correct parity, this function should be callable
394 on any component without ill effects. */
395 /* RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); */
396
397 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
398 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
399 RF_ERRORMSG1("Unable to reconstruct to disk at col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", col);
400
401 raidPtr->reconInProgress--;
402 RF_UNLOCK_MUTEX(raidPtr->mutex);
403 RF_SIGNAL_COND(raidPtr->waitForReconCond);
404 return (EINVAL);
405 }
406 #endif
407
408 /* This device may have been opened successfully the
409 first time. Close it before trying to open it again.. */
410
411 if (raidPtr->raid_cinfo[col].ci_vp != NULL) {
412 #if 0
413 printf("Closed the open device: %s\n",
414 raidPtr->Disks[col].devname);
415 #endif
416 vp = raidPtr->raid_cinfo[col].ci_vp;
417 ac = raidPtr->Disks[col].auto_configured;
418 RF_UNLOCK_MUTEX(raidPtr->mutex);
419 rf_close_component(raidPtr, vp, ac);
420 RF_LOCK_MUTEX(raidPtr->mutex);
421 raidPtr->raid_cinfo[col].ci_vp = NULL;
422 }
423 /* note that this disk was *not* auto_configured (any longer)*/
424 raidPtr->Disks[col].auto_configured = 0;
425
426 #if 0
427 printf("About to (re-)open the device for rebuilding: %s\n",
428 raidPtr->Disks[col].devname);
429 #endif
430 RF_UNLOCK_MUTEX(raidPtr->mutex);
431 retcode = dk_lookup(raidPtr->Disks[col].devname, curlwp, &vp, UIO_SYSSPACE);
432
433 if (retcode) {
434 printf("raid%d: rebuilding: dk_lookup on device: %s failed: %d!\n",raidPtr->raidid,
435 raidPtr->Disks[col].devname, retcode);
436
437 /* the component isn't responding properly...
438 must be still dead :-( */
439 RF_LOCK_MUTEX(raidPtr->mutex);
440 raidPtr->reconInProgress--;
441 RF_UNLOCK_MUTEX(raidPtr->mutex);
442 RF_SIGNAL_COND(raidPtr->waitForReconCond);
443 return(retcode);
444 }
445
446 /* Ok, so we can at least do a lookup...
447 How about actually getting a vp for it? */
448
449 if ((retcode = VOP_GETATTR(vp, &va, curlwp->l_cred)) != 0) {
450 RF_LOCK_MUTEX(raidPtr->mutex);
451 raidPtr->reconInProgress--;
452 RF_UNLOCK_MUTEX(raidPtr->mutex);
453 RF_SIGNAL_COND(raidPtr->waitForReconCond);
454 return(retcode);
455 }
456
457 retcode = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, curlwp->l_cred);
458 if (retcode) {
459 RF_LOCK_MUTEX(raidPtr->mutex);
460 raidPtr->reconInProgress--;
461 RF_UNLOCK_MUTEX(raidPtr->mutex);
462 RF_SIGNAL_COND(raidPtr->waitForReconCond);
463 return(retcode);
464 }
465 RF_LOCK_MUTEX(raidPtr->mutex);
466 raidPtr->Disks[col].blockSize = dpart.disklab->d_secsize;
467
468 raidPtr->Disks[col].numBlocks = dpart.part->p_size -
469 rf_protectedSectors;
470
471 raidPtr->raid_cinfo[col].ci_vp = vp;
472 raidPtr->raid_cinfo[col].ci_dev = va.va_rdev;
473
474 raidPtr->Disks[col].dev = va.va_rdev;
475
476 /* we allow the user to specify that only a fraction
477 of the disks should be used this is just for debug:
478 it speeds up * the parity scan */
479 raidPtr->Disks[col].numBlocks = raidPtr->Disks[col].numBlocks *
480 rf_sizePercentage / 100;
481 RF_UNLOCK_MUTEX(raidPtr->mutex);
482
483 spareDiskPtr = &raidPtr->Disks[col];
484 spareDiskPtr->status = rf_ds_used_spare;
485
486 printf("raid%d: initiating in-place reconstruction on column %d\n",
487 raidPtr->raidid, col);
488
489 reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr,
490 numDisksDone, col);
491 raidPtr->reconDesc = (void *) reconDesc;
492 #if RF_RECON_STATS > 0
493 reconDesc->hsStallCount = 0;
494 reconDesc->numReconExecDelays = 0;
495 reconDesc->numReconEventWaits = 0;
496 #endif /* RF_RECON_STATS > 0 */
497 reconDesc->reconExecTimerRunning = 0;
498 reconDesc->reconExecTicks = 0;
499 reconDesc->maxReconExecTicks = 0;
500 rc = rf_ContinueReconstructFailedDisk(reconDesc);
501
502 if (!rc) {
503 RF_LOCK_MUTEX(raidPtr->mutex);
504 /* Need to set these here, as at this point it'll be claiming
505 that the disk is in rf_ds_spared! But we know better :-) */
506
507 raidPtr->Disks[col].status = rf_ds_optimal;
508 raidPtr->status = rf_rs_optimal;
509 RF_UNLOCK_MUTEX(raidPtr->mutex);
510
511 /* fix up the component label */
512 /* Don't actually need the read here.. */
513 c_label = raidget_component_label(raidPtr, col);
514
515 RF_LOCK_MUTEX(raidPtr->mutex);
516 raid_init_component_label(raidPtr, c_label);
517
518 c_label->row = 0;
519 c_label->column = col;
520
521 /* We've just done a rebuild based on all the other
522 disks, so at this point the parity is known to be
523 clean, even if it wasn't before. */
524
525 /* XXX doesn't hold for RAID 6!!*/
526
527 raidPtr->parity_good = RF_RAID_CLEAN;
528 RF_UNLOCK_MUTEX(raidPtr->mutex);
529
530 raidflush_component_label(raidPtr, col);
531 } else {
532 /* Reconstruct-in-place failed. Disk goes back to
533 "failed" status, regardless of what it was before. */
534 RF_LOCK_MUTEX(raidPtr->mutex);
535 raidPtr->Disks[col].status = rf_ds_failed;
536 RF_UNLOCK_MUTEX(raidPtr->mutex);
537 }
538
539 rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
540
541 RF_LOCK_MUTEX(raidPtr->mutex);
542 raidPtr->reconInProgress--;
543 RF_UNLOCK_MUTEX(raidPtr->mutex);
544
545 RF_SIGNAL_COND(raidPtr->waitForReconCond);
546 return (rc);
547 }
548
549
550 int
551 rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc)
552 {
553 RF_Raid_t *raidPtr = reconDesc->raidPtr;
554 RF_RowCol_t col = reconDesc->col;
555 RF_RowCol_t scol = reconDesc->scol;
556 RF_ReconMap_t *mapPtr;
557 RF_ReconCtrl_t *tmp_reconctrl;
558 RF_ReconEvent_t *event;
559 RF_StripeCount_t incPSID,lastPSID,num_writes,pending_writes,prev;
560 RF_ReconUnitCount_t RUsPerPU;
561 struct timeval etime, elpsd;
562 unsigned long xor_s, xor_resid_us;
563 int i, ds;
564 int status, done;
565 int recon_error, write_error;
566
567 raidPtr->accumXorTimeUs = 0;
568 #if RF_ACC_TRACE > 0
569 /* create one trace record per physical disk */
570 RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
571 #endif
572
573 /* quiesce the array prior to starting recon. this is needed
574 * to assure no nasty interactions with pending user writes.
575 * We need to do this before we change the disk or row status. */
576
577 Dprintf("RECON: begin request suspend\n");
578 rf_SuspendNewRequestsAndWait(raidPtr);
579 Dprintf("RECON: end request suspend\n");
580
581 /* allocate our RF_ReconCTRL_t before we protect raidPtr->reconControl[row] */
582 tmp_reconctrl = rf_MakeReconControl(reconDesc, col, scol);
583
584 RF_LOCK_MUTEX(raidPtr->mutex);
585
586 /* create the reconstruction control pointer and install it in
587 * the right slot */
588 raidPtr->reconControl = tmp_reconctrl;
589 mapPtr = raidPtr->reconControl->reconMap;
590 raidPtr->reconControl->numRUsTotal = mapPtr->totalRUs;
591 raidPtr->reconControl->numRUsComplete = 0;
592 raidPtr->status = rf_rs_reconstructing;
593 raidPtr->Disks[col].status = rf_ds_reconstructing;
594 raidPtr->Disks[col].spareCol = scol;
595
596 RF_UNLOCK_MUTEX(raidPtr->mutex);
597
598 RF_GETTIME(raidPtr->reconControl->starttime);
599
600 Dprintf("RECON: resume requests\n");
601 rf_ResumeNewRequests(raidPtr);
602
603
604 mapPtr = raidPtr->reconControl->reconMap;
605
606 incPSID = RF_RECONMAP_SIZE;
607 lastPSID = raidPtr->Layout.numStripe / raidPtr->Layout.SUsPerPU;
608 RUsPerPU = raidPtr->Layout.SUsPerPU / raidPtr->Layout.SUsPerRU;
609 recon_error = 0;
610 write_error = 0;
611 pending_writes = incPSID;
612 raidPtr->reconControl->lastPSID = incPSID;
613
614 /* start the actual reconstruction */
615
616 done = 0;
617 while (!done) {
618
619 if (raidPtr->waitShutdown) {
620 /* someone is unconfiguring this array... bail on the reconstruct.. */
621 recon_error = 1;
622 break;
623 }
624
625 num_writes = 0;
626
627 /* issue a read for each surviving disk */
628
629 reconDesc->numDisksDone = 0;
630 for (i = 0; i < raidPtr->numCol; i++) {
631 if (i != col) {
632 /* find and issue the next I/O on the
633 * indicated disk */
634 if (IssueNextReadRequest(raidPtr, i)) {
635 Dprintf1("RECON: done issuing for c%d\n", i);
636 reconDesc->numDisksDone++;
637 }
638 }
639 }
640
641 /* process reconstruction events until all disks report that
642 * they've completed all work */
643
644 while (reconDesc->numDisksDone < raidPtr->numCol - 1) {
645
646 event = rf_GetNextReconEvent(reconDesc);
647 status = ProcessReconEvent(raidPtr, event);
648
649 /* the normal case is that a read completes, and all is well. */
650 if (status == RF_RECON_DONE_READS) {
651 reconDesc->numDisksDone++;
652 } else if ((status == RF_RECON_READ_ERROR) ||
653 (status == RF_RECON_WRITE_ERROR)) {
654 /* an error was encountered while reconstructing...
655 Pretend we've finished this disk.
656 */
657 recon_error = 1;
658 raidPtr->reconControl->error = 1;
659
660 /* bump the numDisksDone count for reads,
661 but not for writes */
662 if (status == RF_RECON_READ_ERROR)
663 reconDesc->numDisksDone++;
664
665 /* write errors are special -- when we are
666 done dealing with the reads that are
667 finished, we don't want to wait for any
668 writes */
669 if (status == RF_RECON_WRITE_ERROR) {
670 write_error = 1;
671 num_writes++;
672 }
673
674 } else if (status == RF_RECON_READ_STOPPED) {
675 /* count this component as being "done" */
676 reconDesc->numDisksDone++;
677 } else if (status == RF_RECON_WRITE_DONE) {
678 num_writes++;
679 }
680
681 if (recon_error) {
682 /* make sure any stragglers are woken up so that
683 their theads will complete, and we can get out
684 of here with all IO processed */
685
686 rf_WakeupHeadSepCBWaiters(raidPtr);
687 }
688
689 raidPtr->reconControl->numRUsTotal =
690 mapPtr->totalRUs;
691 raidPtr->reconControl->numRUsComplete =
692 mapPtr->totalRUs -
693 rf_UnitsLeftToReconstruct(mapPtr);
694
695 #if RF_DEBUG_RECON
696 raidPtr->reconControl->percentComplete =
697 (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
698 if (rf_prReconSched) {
699 rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
700 }
701 #endif
702 }
703
704 /* reads done, wakup any waiters, and then wait for writes */
705
706 rf_WakeupHeadSepCBWaiters(raidPtr);
707
708 while (!recon_error && (num_writes < pending_writes)) {
709 event = rf_GetNextReconEvent(reconDesc);
710 status = ProcessReconEvent(raidPtr, event);
711
712 if (status == RF_RECON_WRITE_ERROR) {
713 num_writes++;
714 recon_error = 1;
715 raidPtr->reconControl->error = 1;
716 /* an error was encountered at the very end... bail */
717 } else if (status == RF_RECON_WRITE_DONE) {
718 num_writes++;
719 } /* else it's something else, and we don't care */
720 }
721 if (recon_error ||
722 (raidPtr->reconControl->lastPSID == lastPSID)) {
723 done = 1;
724 break;
725 }
726
727 prev = raidPtr->reconControl->lastPSID;
728 raidPtr->reconControl->lastPSID += incPSID;
729
730 if (raidPtr->reconControl->lastPSID > lastPSID) {
731 pending_writes = lastPSID - prev;
732 raidPtr->reconControl->lastPSID = lastPSID;
733 }
734
735 /* back down curPSID to get ready for the next round... */
736 for (i = 0; i < raidPtr->numCol; i++) {
737 if (i != col) {
738 raidPtr->reconControl->perDiskInfo[i].curPSID--;
739 raidPtr->reconControl->perDiskInfo[i].ru_count = RUsPerPU - 1;
740 }
741 }
742 }
743
744 mapPtr = raidPtr->reconControl->reconMap;
745 if (rf_reconDebug) {
746 printf("RECON: all reads completed\n");
747 }
748 /* at this point all the reads have completed. We now wait
749 * for any pending writes to complete, and then we're done */
750
751 while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) {
752
753 event = rf_GetNextReconEvent(reconDesc);
754 status = ProcessReconEvent(raidPtr, event);
755
756 if (status == RF_RECON_WRITE_ERROR) {
757 recon_error = 1;
758 raidPtr->reconControl->error = 1;
759 /* an error was encountered at the very end... bail */
760 } else {
761 #if RF_DEBUG_RECON
762 raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
763 if (rf_prReconSched) {
764 rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
765 }
766 #endif
767 }
768 }
769
770 if (recon_error) {
771 /* we've encountered an error in reconstructing. */
772 printf("raid%d: reconstruction failed.\n", raidPtr->raidid);
773
774 /* we start by blocking IO to the RAID set. */
775 rf_SuspendNewRequestsAndWait(raidPtr);
776
777 RF_LOCK_MUTEX(raidPtr->mutex);
778 /* mark set as being degraded, rather than
779 rf_rs_reconstructing as we were before the problem.
780 After this is done we can update status of the
781 component disks without worrying about someone
782 trying to read from a failed component.
783 */
784 raidPtr->status = rf_rs_degraded;
785 RF_UNLOCK_MUTEX(raidPtr->mutex);
786
787 /* resume IO */
788 rf_ResumeNewRequests(raidPtr);
789
790 /* At this point there are two cases:
791 1) If we've experienced a read error, then we've
792 already waited for all the reads we're going to get,
793 and we just need to wait for the writes.
794
795 2) If we've experienced a write error, we've also
796 already waited for all the reads to complete,
797 but there is little point in waiting for the writes --
798 when they do complete, they will just be ignored.
799
800 So we just wait for writes to complete if we didn't have a
801 write error.
802 */
803
804 if (!write_error) {
805 /* wait for writes to complete */
806 while (raidPtr->reconControl->pending_writes > 0) {
807
808 event = rf_GetNextReconEvent(reconDesc);
809 status = ProcessReconEvent(raidPtr, event);
810
811 if (status == RF_RECON_WRITE_ERROR) {
812 raidPtr->reconControl->error = 1;
813 /* an error was encountered at the very end... bail.
814 This will be very bad news for the user, since
815 at this point there will have been a read error
816 on one component, and a write error on another!
817 */
818 break;
819 }
820 }
821 }
822
823
824 /* cleanup */
825
826 /* drain the event queue - after waiting for the writes above,
827 there shouldn't be much (if anything!) left in the queue. */
828
829 rf_DrainReconEventQueue(reconDesc);
830
831 /* XXX As much as we'd like to free the recon control structure
832 and the reconDesc, we have no way of knowing if/when those will
833 be touched by IO that has yet to occur. It is rather poor to be
834 basically causing a 'memory leak' here, but there doesn't seem to be
835 a cleaner alternative at this time. Perhaps when the reconstruct code
836 gets a makeover this problem will go away.
837 */
838 #if 0
839 rf_FreeReconControl(raidPtr);
840 #endif
841
842 #if RF_ACC_TRACE > 0
843 RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
844 #endif
845 /* XXX see comment above */
846 #if 0
847 FreeReconDesc(reconDesc);
848 #endif
849
850 return (1);
851 }
852
853 /* Success: mark the dead disk as reconstructed. We quiesce
854 * the array here to assure no nasty interactions with pending
855 * user accesses when we free up the psstatus structure as
856 * part of FreeReconControl() */
857
858 rf_SuspendNewRequestsAndWait(raidPtr);
859
860 RF_LOCK_MUTEX(raidPtr->mutex);
861 raidPtr->numFailures--;
862 ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
863 raidPtr->Disks[col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared;
864 raidPtr->status = (ds) ? rf_rs_reconfigured : rf_rs_optimal;
865 RF_UNLOCK_MUTEX(raidPtr->mutex);
866 RF_GETTIME(etime);
867 RF_TIMEVAL_DIFF(&(raidPtr->reconControl->starttime), &etime, &elpsd);
868
869 rf_ResumeNewRequests(raidPtr);
870
871 printf("raid%d: Reconstruction of disk at col %d completed\n",
872 raidPtr->raidid, col);
873 xor_s = raidPtr->accumXorTimeUs / 1000000;
874 xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
875 printf("raid%d: Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n",
876 raidPtr->raidid,
877 (int) elpsd.tv_sec, (int) elpsd.tv_usec,
878 raidPtr->accumXorTimeUs, xor_s, xor_resid_us);
879 printf("raid%d: (start time %d sec %d usec, end time %d sec %d usec)\n",
880 raidPtr->raidid,
881 (int) raidPtr->reconControl->starttime.tv_sec,
882 (int) raidPtr->reconControl->starttime.tv_usec,
883 (int) etime.tv_sec, (int) etime.tv_usec);
884 #if RF_RECON_STATS > 0
885 printf("raid%d: Total head-sep stall count was %d\n",
886 raidPtr->raidid, (int) reconDesc->hsStallCount);
887 #endif /* RF_RECON_STATS > 0 */
888 rf_FreeReconControl(raidPtr);
889 #if RF_ACC_TRACE > 0
890 RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
891 #endif
892 FreeReconDesc(reconDesc);
893
894 return (0);
895
896 }
897 /*****************************************************************************
898 * do the right thing upon each reconstruction event.
899 *****************************************************************************/
900 static int
901 ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event)
902 {
903 int retcode = 0, submitblocked;
904 RF_ReconBuffer_t *rbuf;
905 RF_SectorCount_t sectorsPerRU;
906
907 retcode = RF_RECON_READ_STOPPED;
908
909 Dprintf1("RECON: ProcessReconEvent type %d\n", event->type);
910
911 switch (event->type) {
912
913 /* a read I/O has completed */
914 case RF_REVENT_READDONE:
915 rbuf = raidPtr->reconControl->perDiskInfo[event->col].rbuf;
916 Dprintf2("RECON: READDONE EVENT: col %d psid %ld\n",
917 event->col, rbuf->parityStripeID);
918 Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x %02x %02x\n",
919 rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
920 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
921 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
922 if (!raidPtr->reconControl->error) {
923 submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
924 Dprintf1("RECON: submitblocked=%d\n", submitblocked);
925 if (!submitblocked)
926 retcode = IssueNextReadRequest(raidPtr, event->col);
927 else
928 retcode = 0;
929 }
930 break;
931
932 /* a write I/O has completed */
933 case RF_REVENT_WRITEDONE:
934 #if RF_DEBUG_RECON
935 if (rf_floatingRbufDebug) {
936 rf_CheckFloatingRbufCount(raidPtr, 1);
937 }
938 #endif
939 sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
940 rbuf = (RF_ReconBuffer_t *) event->arg;
941 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
942 Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n",
943 rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl->percentComplete);
944 rf_ReconMapUpdate(raidPtr, raidPtr->reconControl->reconMap,
945 rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
946 rf_RemoveFromActiveReconTable(raidPtr, rbuf->parityStripeID, rbuf->which_ru);
947
948 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
949 raidPtr->reconControl->pending_writes--;
950 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
951
952 if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
953 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
954 while(raidPtr->reconControl->rb_lock) {
955 ltsleep(&raidPtr->reconControl->rb_lock, PRIBIO, "reconctrlpre1", 0,
956 &raidPtr->reconControl->rb_mutex);
957 }
958 raidPtr->reconControl->rb_lock = 1;
959 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
960
961 raidPtr->numFullReconBuffers--;
962 rf_ReleaseFloatingReconBuffer(raidPtr, rbuf);
963
964 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
965 raidPtr->reconControl->rb_lock = 0;
966 wakeup(&raidPtr->reconControl->rb_lock);
967 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
968 } else
969 if (rbuf->type == RF_RBUF_TYPE_FORCED)
970 rf_FreeReconBuffer(rbuf);
971 else
972 RF_ASSERT(0);
973 retcode = RF_RECON_WRITE_DONE;
974 break;
975
976 case RF_REVENT_BUFCLEAR: /* A buffer-stall condition has been
977 * cleared */
978 Dprintf1("RECON: BUFCLEAR EVENT: col %d\n", event->col);
979 if (!raidPtr->reconControl->error) {
980 submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl->perDiskInfo[event->col].rbuf,
981 0, (int) (long) event->arg);
982 RF_ASSERT(!submitblocked); /* we wouldn't have gotten the
983 * BUFCLEAR event if we
984 * couldn't submit */
985 retcode = IssueNextReadRequest(raidPtr, event->col);
986 }
987 break;
988
989 case RF_REVENT_BLOCKCLEAR: /* A user-write reconstruction
990 * blockage has been cleared */
991 DDprintf1("RECON: BLOCKCLEAR EVENT: col %d\n", event->col);
992 if (!raidPtr->reconControl->error) {
993 retcode = TryToRead(raidPtr, event->col);
994 }
995 break;
996
997 case RF_REVENT_HEADSEPCLEAR: /* A max-head-separation
998 * reconstruction blockage has been
999 * cleared */
1000 Dprintf1("RECON: HEADSEPCLEAR EVENT: col %d\n", event->col);
1001 if (!raidPtr->reconControl->error) {
1002 retcode = TryToRead(raidPtr, event->col);
1003 }
1004 break;
1005
1006 /* a buffer has become ready to write */
1007 case RF_REVENT_BUFREADY:
1008 Dprintf1("RECON: BUFREADY EVENT: col %d\n", event->col);
1009 if (!raidPtr->reconControl->error) {
1010 retcode = IssueNextWriteRequest(raidPtr);
1011 #if RF_DEBUG_RECON
1012 if (rf_floatingRbufDebug) {
1013 rf_CheckFloatingRbufCount(raidPtr, 1);
1014 }
1015 #endif
1016 }
1017 break;
1018
1019 /* we need to skip the current RU entirely because it got
1020 * recon'd while we were waiting for something else to happen */
1021 case RF_REVENT_SKIP:
1022 DDprintf1("RECON: SKIP EVENT: col %d\n", event->col);
1023 if (!raidPtr->reconControl->error) {
1024 retcode = IssueNextReadRequest(raidPtr, event->col);
1025 }
1026 break;
1027
1028 /* a forced-reconstruction read access has completed. Just
1029 * submit the buffer */
1030 case RF_REVENT_FORCEDREADDONE:
1031 rbuf = (RF_ReconBuffer_t *) event->arg;
1032 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
1033 DDprintf1("RECON: FORCEDREADDONE EVENT: col %d\n", event->col);
1034 if (!raidPtr->reconControl->error) {
1035 submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
1036 RF_ASSERT(!submitblocked);
1037 retcode = 0;
1038 }
1039 break;
1040
1041 /* A read I/O failed to complete */
1042 case RF_REVENT_READ_FAILED:
1043 retcode = RF_RECON_READ_ERROR;
1044 break;
1045
1046 /* A write I/O failed to complete */
1047 case RF_REVENT_WRITE_FAILED:
1048 retcode = RF_RECON_WRITE_ERROR;
1049
1050 /* This is an error, but it was a pending write.
1051 Account for it. */
1052 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1053 raidPtr->reconControl->pending_writes--;
1054 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1055
1056 rbuf = (RF_ReconBuffer_t *) event->arg;
1057
1058 /* cleanup the disk queue data */
1059 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
1060
1061 /* At this point we're erroring out, badly, and floatingRbufs
1062 may not even be valid. Rather than putting this back onto
1063 the floatingRbufs list, just arrange for its immediate
1064 destruction.
1065 */
1066 rf_FreeReconBuffer(rbuf);
1067 break;
1068
1069 /* a forced read I/O failed to complete */
1070 case RF_REVENT_FORCEDREAD_FAILED:
1071 retcode = RF_RECON_READ_ERROR;
1072 break;
1073
1074 default:
1075 RF_PANIC();
1076 }
1077 rf_FreeReconEventDesc(event);
1078 return (retcode);
1079 }
1080 /*****************************************************************************
1081 *
1082 * find the next thing that's needed on the indicated disk, and issue
1083 * a read request for it. We assume that the reconstruction buffer
1084 * associated with this process is free to receive the data. If
1085 * reconstruction is blocked on the indicated RU, we issue a
1086 * blockage-release request instead of a physical disk read request.
1087 * If the current disk gets too far ahead of the others, we issue a
1088 * head-separation wait request and return.
1089 *
1090 * ctrl->{ru_count, curPSID, diskOffset} and
1091 * rbuf->failedDiskSectorOffset are maintained to point to the unit
1092 * we're currently accessing. Note that this deviates from the
1093 * standard C idiom of having counters point to the next thing to be
1094 * accessed. This allows us to easily retry when we're blocked by
1095 * head separation or reconstruction-blockage events.
1096 *
1097 *****************************************************************************/
1098 static int
1099 IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t col)
1100 {
1101 RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
1102 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1103 RF_ReconBuffer_t *rbuf = ctrl->rbuf;
1104 RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
1105 RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
1106 int do_new_check = 0, retcode = 0, status;
1107
1108 /* if we are currently the slowest disk, mark that we have to do a new
1109 * check */
1110 if (ctrl->headSepCounter <= raidPtr->reconControl->minHeadSepCounter)
1111 do_new_check = 1;
1112
1113 while (1) {
1114
1115 ctrl->ru_count++;
1116 if (ctrl->ru_count < RUsPerPU) {
1117 ctrl->diskOffset += sectorsPerRU;
1118 rbuf->failedDiskSectorOffset += sectorsPerRU;
1119 } else {
1120 ctrl->curPSID++;
1121 ctrl->ru_count = 0;
1122 /* code left over from when head-sep was based on
1123 * parity stripe id */
1124 if (ctrl->curPSID >= raidPtr->reconControl->lastPSID) {
1125 CheckForNewMinHeadSep(raidPtr, ++(ctrl->headSepCounter));
1126 return (RF_RECON_DONE_READS); /* finito! */
1127 }
1128 /* find the disk offsets of the start of the parity
1129 * stripe on both the current disk and the failed
1130 * disk. skip this entire parity stripe if either disk
1131 * does not appear in the indicated PS */
1132 status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset,
1133 &rbuf->spCol, &rbuf->spOffset);
1134 if (status) {
1135 ctrl->ru_count = RUsPerPU - 1;
1136 continue;
1137 }
1138 }
1139 rbuf->which_ru = ctrl->ru_count;
1140
1141 /* skip this RU if it's already been reconstructed */
1142 if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, rbuf->failedDiskSectorOffset)) {
1143 Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count);
1144 continue;
1145 }
1146 break;
1147 }
1148 ctrl->headSepCounter++;
1149 if (do_new_check)
1150 CheckForNewMinHeadSep(raidPtr, ctrl->headSepCounter); /* update min if needed */
1151
1152
1153 /* at this point, we have definitely decided what to do, and we have
1154 * only to see if we can actually do it now */
1155 rbuf->parityStripeID = ctrl->curPSID;
1156 rbuf->which_ru = ctrl->ru_count;
1157 #if RF_ACC_TRACE > 0
1158 memset((char *) &raidPtr->recon_tracerecs[col], 0,
1159 sizeof(raidPtr->recon_tracerecs[col]));
1160 raidPtr->recon_tracerecs[col].reconacc = 1;
1161 RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
1162 #endif
1163 retcode = TryToRead(raidPtr, col);
1164 return (retcode);
1165 }
1166
1167 /*
1168 * tries to issue the next read on the indicated disk. We may be
1169 * blocked by (a) the heads being too far apart, or (b) recon on the
1170 * indicated RU being blocked due to a write by a user thread. In
1171 * this case, we issue a head-sep or blockage wait request, which will
1172 * cause this same routine to be invoked again later when the blockage
1173 * has cleared.
1174 */
1175
1176 static int
1177 TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t col)
1178 {
1179 RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
1180 RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
1181 RF_StripeNum_t psid = ctrl->curPSID;
1182 RF_ReconUnitNum_t which_ru = ctrl->ru_count;
1183 RF_DiskQueueData_t *req;
1184 int status;
1185 RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;
1186
1187 /* if the current disk is too far ahead of the others, issue a
1188 * head-separation wait and return */
1189 if (CheckHeadSeparation(raidPtr, ctrl, col, ctrl->headSepCounter, which_ru))
1190 return (0);
1191
1192 /* allocate a new PSS in case we need it */
1193 newpssPtr = rf_AllocPSStatus(raidPtr);
1194
1195 RF_LOCK_PSS_MUTEX(raidPtr, psid);
1196 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE, newpssPtr);
1197
1198 if (pssPtr != newpssPtr) {
1199 rf_FreePSStatus(raidPtr, newpssPtr);
1200 }
1201
1202 /* if recon is blocked on the indicated parity stripe, issue a
1203 * block-wait request and return. this also must mark the indicated RU
1204 * in the stripe as under reconstruction if not blocked. */
1205 status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, col, psid, which_ru);
1206 if (status == RF_PSS_RECON_BLOCKED) {
1207 Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru);
1208 goto out;
1209 } else
1210 if (status == RF_PSS_FORCED_ON_WRITE) {
1211 rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
1212 goto out;
1213 }
1214 /* make one last check to be sure that the indicated RU didn't get
1215 * reconstructed while we were waiting for something else to happen.
1216 * This is unfortunate in that it causes us to make this check twice
1217 * in the normal case. Might want to make some attempt to re-work
1218 * this so that we only do this check if we've definitely blocked on
1219 * one of the above checks. When this condition is detected, we may
1220 * have just created a bogus status entry, which we need to delete. */
1221 if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, ctrl->rbuf->failedDiskSectorOffset)) {
1222 Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru);
1223 if (pssPtr == newpssPtr)
1224 rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
1225 rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
1226 goto out;
1227 }
1228 /* found something to read. issue the I/O */
1229 Dprintf4("RECON: Read for psid %ld on col %d offset %ld buf %lx\n",
1230 psid, col, ctrl->diskOffset, ctrl->rbuf->buffer);
1231 #if RF_ACC_TRACE > 0
1232 RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
1233 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
1234 raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
1235 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
1236 RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
1237 #endif
1238 /* should be ok to use a NULL proc pointer here, all the bufs we use
1239 * should be in kernel space */
1240 req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
1241 ReconReadDoneProc, (void *) ctrl,
1242 #if RF_ACC_TRACE > 0
1243 &raidPtr->recon_tracerecs[col],
1244 #else
1245 NULL,
1246 #endif
1247 (void *) raidPtr, 0, NULL, PR_WAITOK);
1248
1249 ctrl->rbuf->arg = (void *) req;
1250 rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_RECON_PRIORITY);
1251 pssPtr->issued[col] = 1;
1252
1253 out:
1254 RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1255 return (0);
1256 }
1257
1258
1259 /*
1260 * given a parity stripe ID, we want to find out whether both the
1261 * current disk and the failed disk exist in that parity stripe. If
1262 * not, we want to skip this whole PS. If so, we want to find the
1263 * disk offset of the start of the PS on both the current disk and the
1264 * failed disk.
1265 *
1266 * this works by getting a list of disks comprising the indicated
1267 * parity stripe, and searching the list for the current and failed
1268 * disks. Once we've decided they both exist in the parity stripe, we
1269 * need to decide whether each is data or parity, so that we'll know
1270 * which mapping function to call to get the corresponding disk
1271 * offsets.
1272 *
1273 * this is kind of unpleasant, but doing it this way allows the
1274 * reconstruction code to use parity stripe IDs rather than physical
1275 * disks address to march through the failed disk, which greatly
1276 * simplifies a lot of code, as well as eliminating the need for a
1277 * reverse-mapping function. I also think it will execute faster,
1278 * since the calls to the mapping module are kept to a minimum.
1279 *
1280 * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING
1281 * THE STRIPE IN THE CORRECT ORDER
1282 *
1283 * raidPtr - raid descriptor
1284 * psid - parity stripe identifier
1285 * col - column of disk to find the offsets for
1286 * spCol - out: col of spare unit for failed unit
1287 * spOffset - out: offset into disk containing spare unit
1288 *
1289 */
1290
1291
1292 static int
1293 ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid,
1294 RF_RowCol_t col, RF_SectorNum_t *outDiskOffset,
1295 RF_SectorNum_t *outFailedDiskSectorOffset,
1296 RF_RowCol_t *spCol, RF_SectorNum_t *spOffset)
1297 {
1298 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1299 RF_RowCol_t fcol = raidPtr->reconControl->fcol;
1300 RF_RaidAddr_t sosRaidAddress; /* start-of-stripe */
1301 RF_RowCol_t *diskids;
1302 u_int i, j, k, i_offset, j_offset;
1303 RF_RowCol_t pcol;
1304 int testcol;
1305 RF_SectorNum_t poffset;
1306 char i_is_parity = 0, j_is_parity = 0;
1307 RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
1308
1309 /* get a listing of the disks comprising that stripe */
1310 sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
1311 (layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids);
1312 RF_ASSERT(diskids);
1313
1314 /* reject this entire parity stripe if it does not contain the
1315 * indicated disk or it does not contain the failed disk */
1316
1317 for (i = 0; i < stripeWidth; i++) {
1318 if (col == diskids[i])
1319 break;
1320 }
1321 if (i == stripeWidth)
1322 goto skipit;
1323 for (j = 0; j < stripeWidth; j++) {
1324 if (fcol == diskids[j])
1325 break;
1326 }
1327 if (j == stripeWidth) {
1328 goto skipit;
1329 }
1330 /* find out which disk the parity is on */
1331 (layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &pcol, &poffset, RF_DONT_REMAP);
1332
1333 /* find out if either the current RU or the failed RU is parity */
1334 /* also, if the parity occurs in this stripe prior to the data and/or
1335 * failed col, we need to decrement i and/or j */
1336 for (k = 0; k < stripeWidth; k++)
1337 if (diskids[k] == pcol)
1338 break;
1339 RF_ASSERT(k < stripeWidth);
1340 i_offset = i;
1341 j_offset = j;
1342 if (k < i)
1343 i_offset--;
1344 else
1345 if (k == i) {
1346 i_is_parity = 1;
1347 i_offset = 0;
1348 } /* set offsets to zero to disable multiply
1349 * below */
1350 if (k < j)
1351 j_offset--;
1352 else
1353 if (k == j) {
1354 j_is_parity = 1;
1355 j_offset = 0;
1356 }
1357 /* at this point, [ij]_is_parity tells us whether the [current,failed]
1358 * disk is parity at the start of this RU, and, if data, "[ij]_offset"
1359 * tells us how far into the stripe the [current,failed] disk is. */
1360
1361 /* call the mapping routine to get the offset into the current disk,
1362 * repeat for failed disk. */
1363 if (i_is_parity)
1364 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
1365 else
1366 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
1367
1368 RF_ASSERT(col == testcol);
1369
1370 if (j_is_parity)
1371 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
1372 else
1373 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
1374 RF_ASSERT(fcol == testcol);
1375
1376 /* now locate the spare unit for the failed unit */
1377 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
1378 if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
1379 if (j_is_parity)
1380 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
1381 else
1382 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
1383 } else {
1384 #endif
1385 *spCol = raidPtr->reconControl->spareCol;
1386 *spOffset = *outFailedDiskSectorOffset;
1387 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
1388 }
1389 #endif
1390 return (0);
1391
1392 skipit:
1393 Dprintf2("RECON: Skipping psid %ld: nothing needed from c%d\n",
1394 psid, col);
1395 return (1);
1396 }
1397 /* this is called when a buffer has become ready to write to the replacement disk */
1398 static int
1399 IssueNextWriteRequest(RF_Raid_t *raidPtr)
1400 {
1401 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1402 RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
1403 #if RF_ACC_TRACE > 0
1404 RF_RowCol_t fcol = raidPtr->reconControl->fcol;
1405 #endif
1406 RF_ReconBuffer_t *rbuf;
1407 RF_DiskQueueData_t *req;
1408
1409 rbuf = rf_GetFullReconBuffer(raidPtr->reconControl);
1410 RF_ASSERT(rbuf); /* there must be one available, or we wouldn't
1411 * have gotten the event that sent us here */
1412 RF_ASSERT(rbuf->pssPtr);
1413
1414 rbuf->pssPtr->writeRbuf = rbuf;
1415 rbuf->pssPtr = NULL;
1416
1417 Dprintf6("RECON: New write (c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n",
1418 rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID,
1419 rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer);
1420 Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x\n",
1421 rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
1422 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
1423
1424 /* should be ok to use a NULL b_proc here b/c all addrs should be in
1425 * kernel space */
1426 req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset,
1427 sectorsPerRU, rbuf->buffer,
1428 rbuf->parityStripeID, rbuf->which_ru,
1429 ReconWriteDoneProc, (void *) rbuf,
1430 #if RF_ACC_TRACE > 0
1431 &raidPtr->recon_tracerecs[fcol],
1432 #else
1433 NULL,
1434 #endif
1435 (void *) raidPtr, 0, NULL, PR_WAITOK);
1436
1437 rbuf->arg = (void *) req;
1438 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1439 raidPtr->reconControl->pending_writes++;
1440 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1441 rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spCol], req, RF_IO_RECON_PRIORITY);
1442
1443 return (0);
1444 }
1445
1446 /*
1447 * this gets called upon the completion of a reconstruction read
1448 * operation the arg is a pointer to the per-disk reconstruction
1449 * control structure for the process that just finished a read.
1450 *
1451 * called at interrupt context in the kernel, so don't do anything
1452 * illegal here.
1453 */
1454 static int
1455 ReconReadDoneProc(void *arg, int status)
1456 {
1457 RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
1458 RF_Raid_t *raidPtr;
1459
1460 /* Detect that reconCtrl is no longer valid, and if that
1461 is the case, bail without calling rf_CauseReconEvent().
1462 There won't be anyone listening for this event anyway */
1463
1464 if (ctrl->reconCtrl == NULL)
1465 return(0);
1466
1467 raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;
1468
1469 if (status) {
1470 printf("raid%d: Recon read failed: %d\n", raidPtr->raidid, status);
1471 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READ_FAILED);
1472 return(0);
1473 }
1474 #if RF_ACC_TRACE > 0
1475 RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1476 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1477 raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us =
1478 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1479 RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1480 #endif
1481 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READDONE);
1482 return (0);
1483 }
1484 /* this gets called upon the completion of a reconstruction write operation.
1485 * the arg is a pointer to the rbuf that was just written
1486 *
1487 * called at interrupt context in the kernel, so don't do anything illegal here.
1488 */
1489 static int
1490 ReconWriteDoneProc(void *arg, int status)
1491 {
1492 RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;
1493
1494 /* Detect that reconControl is no longer valid, and if that
1495 is the case, bail without calling rf_CauseReconEvent().
1496 There won't be anyone listening for this event anyway */
1497
1498 if (rbuf->raidPtr->reconControl == NULL)
1499 return(0);
1500
1501 Dprintf2("Reconstruction completed on psid %ld ru %d\n", rbuf->parityStripeID, rbuf->which_ru);
1502 if (status) {
1503 printf("raid%d: Recon write failed!\n", rbuf->raidPtr->raidid);
1504 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITE_FAILED);
1505 return(0);
1506 }
1507 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITEDONE);
1508 return (0);
1509 }
1510
1511
1512 /*
1513 * computes a new minimum head sep, and wakes up anyone who needs to
1514 * be woken as a result
1515 */
1516 static void
1517 CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_HeadSepLimit_t hsCtr)
1518 {
1519 RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
1520 RF_HeadSepLimit_t new_min;
1521 RF_RowCol_t i;
1522 RF_CallbackDesc_t *p;
1523 RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter); /* from the definition
1524 * of a minimum */
1525
1526
1527 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1528 while(reconCtrlPtr->rb_lock) {
1529 ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlcnmhs", 0, &reconCtrlPtr->rb_mutex);
1530 }
1531 reconCtrlPtr->rb_lock = 1;
1532 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1533
1534 new_min = ~(1L << (8 * sizeof(long) - 1)); /* 0x7FFF....FFF */
1535 for (i = 0; i < raidPtr->numCol; i++)
1536 if (i != reconCtrlPtr->fcol) {
1537 if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min)
1538 new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter;
1539 }
1540 /* set the new minimum and wake up anyone who can now run again */
1541 if (new_min != reconCtrlPtr->minHeadSepCounter) {
1542 reconCtrlPtr->minHeadSepCounter = new_min;
1543 Dprintf1("RECON: new min head pos counter val is %ld\n", new_min);
1544 while (reconCtrlPtr->headSepCBList) {
1545 if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min)
1546 break;
1547 p = reconCtrlPtr->headSepCBList;
1548 reconCtrlPtr->headSepCBList = p->next;
1549 p->next = NULL;
1550 rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
1551 rf_FreeCallbackDesc(p);
1552 }
1553
1554 }
1555 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1556 reconCtrlPtr->rb_lock = 0;
1557 wakeup(&reconCtrlPtr->rb_lock);
1558 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1559 }
1560
1561 /*
1562 * checks to see that the maximum head separation will not be violated
1563 * if we initiate a reconstruction I/O on the indicated disk.
1564 * Limiting the maximum head separation between two disks eliminates
1565 * the nasty buffer-stall conditions that occur when one disk races
1566 * ahead of the others and consumes all of the floating recon buffers.
1567 * This code is complex and unpleasant but it's necessary to avoid
1568 * some very nasty, albeit fairly rare, reconstruction behavior.
1569 *
1570 * returns non-zero if and only if we have to stop working on the
1571 * indicated disk due to a head-separation delay.
1572 */
1573 static int
1574 CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl,
1575 RF_RowCol_t col, RF_HeadSepLimit_t hsCtr,
1576 RF_ReconUnitNum_t which_ru)
1577 {
1578 RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
1579 RF_CallbackDesc_t *cb, *p, *pt;
1580 int retval = 0;
1581
1582 /* if we're too far ahead of the slowest disk, stop working on this
1583 * disk until the slower ones catch up. We do this by scheduling a
1584 * wakeup callback for the time when the slowest disk has caught up.
1585 * We define "caught up" with 20% hysteresis, i.e. the head separation
1586 * must have fallen to at most 80% of the max allowable head
1587 * separation before we'll wake up.
1588 *
1589 */
1590 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1591 while(reconCtrlPtr->rb_lock) {
1592 ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlchs", 0, &reconCtrlPtr->rb_mutex);
1593 }
1594 reconCtrlPtr->rb_lock = 1;
1595 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1596 if ((raidPtr->headSepLimit >= 0) &&
1597 ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) {
1598 Dprintf5("raid%d: RECON: head sep stall: col %d hsCtr %ld minHSCtr %ld limit %ld\n",
1599 raidPtr->raidid, col, ctrl->headSepCounter,
1600 reconCtrlPtr->minHeadSepCounter,
1601 raidPtr->headSepLimit);
1602 cb = rf_AllocCallbackDesc();
1603 /* the minHeadSepCounter value we have to get to before we'll
1604 * wake up. build in 20% hysteresis. */
1605 cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5);
1606 cb->col = col;
1607 cb->next = NULL;
1608
1609 /* insert this callback descriptor into the sorted list of
1610 * pending head-sep callbacks */
1611 p = reconCtrlPtr->headSepCBList;
1612 if (!p)
1613 reconCtrlPtr->headSepCBList = cb;
1614 else
1615 if (cb->callbackArg.v < p->callbackArg.v) {
1616 cb->next = reconCtrlPtr->headSepCBList;
1617 reconCtrlPtr->headSepCBList = cb;
1618 } else {
1619 for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next);
1620 cb->next = p;
1621 pt->next = cb;
1622 }
1623 retval = 1;
1624 #if RF_RECON_STATS > 0
1625 ctrl->reconCtrl->reconDesc->hsStallCount++;
1626 #endif /* RF_RECON_STATS > 0 */
1627 }
1628 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1629 reconCtrlPtr->rb_lock = 0;
1630 wakeup(&reconCtrlPtr->rb_lock);
1631 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1632
1633 return (retval);
1634 }
1635 /*
1636 * checks to see if reconstruction has been either forced or blocked
1637 * by a user operation. if forced, we skip this RU entirely. else if
1638 * blocked, put ourselves on the wait list. else return 0.
1639 *
1640 * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY
1641 */
1642 static int
1643 CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr,
1644 RF_ReconParityStripeStatus_t *pssPtr,
1645 RF_PerDiskReconCtrl_t *ctrl,
1646 RF_RowCol_t col,
1647 RF_StripeNum_t psid,
1648 RF_ReconUnitNum_t which_ru)
1649 {
1650 RF_CallbackDesc_t *cb;
1651 int retcode = 0;
1652
1653 if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE))
1654 retcode = RF_PSS_FORCED_ON_WRITE;
1655 else
1656 if (pssPtr->flags & RF_PSS_RECON_BLOCKED) {
1657 Dprintf3("RECON: col %d blocked at psid %ld ru %d\n", col, psid, which_ru);
1658 cb = rf_AllocCallbackDesc(); /* append ourselves to
1659 * the blockage-wait
1660 * list */
1661 cb->col = col;
1662 cb->next = pssPtr->blockWaitList;
1663 pssPtr->blockWaitList = cb;
1664 retcode = RF_PSS_RECON_BLOCKED;
1665 }
1666 if (!retcode)
1667 pssPtr->flags |= RF_PSS_UNDER_RECON; /* mark this RU as under
1668 * reconstruction */
1669
1670 return (retcode);
1671 }
1672 /*
1673 * if reconstruction is currently ongoing for the indicated stripeID,
1674 * reconstruction is forced to completion and we return non-zero to
1675 * indicate that the caller must wait. If not, then reconstruction is
1676 * blocked on the indicated stripe and the routine returns zero. If
1677 * and only if we return non-zero, we'll cause the cbFunc to get
1678 * invoked with the cbArg when the reconstruction has completed.
1679 */
1680 int
1681 rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
1682 void (*cbFunc)(RF_Raid_t *, void *), void *cbArg)
1683 {
1684 RF_StripeNum_t stripeID = asmap->stripeID; /* the stripe ID we're
1685 * forcing recon on */
1686 RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; /* num sects in one RU */
1687 RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr; /* a pointer to the parity
1688 * stripe status structure */
1689 RF_StripeNum_t psid; /* parity stripe id */
1690 RF_SectorNum_t offset, fd_offset; /* disk offset, failed-disk
1691 * offset */
1692 RF_RowCol_t *diskids;
1693 RF_ReconUnitNum_t which_ru; /* RU within parity stripe */
1694 RF_RowCol_t fcol, diskno, i;
1695 RF_ReconBuffer_t *new_rbuf; /* ptr to newly allocated rbufs */
1696 RF_DiskQueueData_t *req;/* disk I/O req to be enqueued */
1697 RF_CallbackDesc_t *cb;
1698 int nPromoted;
1699
1700 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
1701
1702 /* allocate a new PSS in case we need it */
1703 newpssPtr = rf_AllocPSStatus(raidPtr);
1704
1705 RF_LOCK_PSS_MUTEX(raidPtr, psid);
1706
1707 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, newpssPtr);
1708
1709 if (pssPtr != newpssPtr) {
1710 rf_FreePSStatus(raidPtr, newpssPtr);
1711 }
1712
1713 /* if recon is not ongoing on this PS, just return */
1714 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
1715 RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1716 return (0);
1717 }
1718 /* otherwise, we have to wait for reconstruction to complete on this
1719 * RU. */
1720 /* In order to avoid waiting for a potentially large number of
1721 * low-priority accesses to complete, we force a normal-priority (i.e.
1722 * not low-priority) reconstruction on this RU. */
1723 if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
1724 DDprintf1("Forcing recon on psid %ld\n", psid);
1725 pssPtr->flags |= RF_PSS_FORCED_ON_WRITE; /* mark this RU as under
1726 * forced recon */
1727 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; /* clear the blockage
1728 * that we just set */
1729 fcol = raidPtr->reconControl->fcol;
1730
1731 /* get a listing of the disks comprising the indicated stripe */
1732 (raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids);
1733
1734 /* For previously issued reads, elevate them to normal
1735 * priority. If the I/O has already completed, it won't be
1736 * found in the queue, and hence this will be a no-op. For
1737 * unissued reads, allocate buffers and issue new reads. The
1738 * fact that we've set the FORCED bit means that the regular
1739 * recon procs will not re-issue these reqs */
1740 for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++)
1741 if ((diskno = diskids[i]) != fcol) {
1742 if (pssPtr->issued[diskno]) {
1743 nPromoted = rf_DiskIOPromote(&raidPtr->Queues[diskno], psid, which_ru);
1744 if (rf_reconDebug && nPromoted)
1745 printf("raid%d: promoted read from col %d\n", raidPtr->raidid, diskno);
1746 } else {
1747 new_rbuf = rf_MakeReconBuffer(raidPtr, diskno, RF_RBUF_TYPE_FORCED); /* create new buf */
1748 ComputePSDiskOffsets(raidPtr, psid, diskno, &offset, &fd_offset,
1749 &new_rbuf->spCol, &new_rbuf->spOffset); /* find offsets & spare
1750 * location */
1751 new_rbuf->parityStripeID = psid; /* fill in the buffer */
1752 new_rbuf->which_ru = which_ru;
1753 new_rbuf->failedDiskSectorOffset = fd_offset;
1754 new_rbuf->priority = RF_IO_NORMAL_PRIORITY;
1755
1756 /* use NULL b_proc b/c all addrs
1757 * should be in kernel space */
1758 req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer,
1759 psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf,
1760 NULL, (void *) raidPtr, 0, NULL, PR_WAITOK);
1761
1762 new_rbuf->arg = req;
1763 rf_DiskIOEnqueue(&raidPtr->Queues[diskno], req, RF_IO_NORMAL_PRIORITY); /* enqueue the I/O */
1764 Dprintf2("raid%d: Issued new read req on col %d\n", raidPtr->raidid, diskno);
1765 }
1766 }
1767 /* if the write is sitting in the disk queue, elevate its
1768 * priority */
1769 if (rf_DiskIOPromote(&raidPtr->Queues[fcol], psid, which_ru))
1770 if (rf_reconDebug)
1771 printf("raid%d: promoted write to col %d\n",
1772 raidPtr->raidid, fcol);
1773 }
1774 /* install a callback descriptor to be invoked when recon completes on
1775 * this parity stripe. */
1776 cb = rf_AllocCallbackDesc();
1777 /* XXX the following is bogus.. These functions don't really match!!
1778 * GO */
1779 cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc;
1780 cb->callbackArg.p = (void *) cbArg;
1781 cb->next = pssPtr->procWaitList;
1782 pssPtr->procWaitList = cb;
1783 DDprintf2("raid%d: Waiting for forced recon on psid %ld\n",
1784 raidPtr->raidid, psid);
1785
1786 RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1787 return (1);
1788 }
1789 /* called upon the completion of a forced reconstruction read.
1790 * all we do is schedule the FORCEDREADONE event.
1791 * called at interrupt context in the kernel, so don't do anything illegal here.
1792 */
1793 static void
1794 ForceReconReadDoneProc(void *arg, int status)
1795 {
1796 RF_ReconBuffer_t *rbuf = arg;
1797
1798 /* Detect that reconControl is no longer valid, and if that
1799 is the case, bail without calling rf_CauseReconEvent().
1800 There won't be anyone listening for this event anyway */
1801
1802 if (rbuf->raidPtr->reconControl == NULL)
1803 return;
1804
1805 if (status) {
1806 printf("raid%d: Forced recon read failed!\n", rbuf->raidPtr->raidid);
1807 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREAD_FAILED);
1808 return;
1809 }
1810 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE);
1811 }
1812 /* releases a block on the reconstruction of the indicated stripe */
1813 int
1814 rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
1815 {
1816 RF_StripeNum_t stripeID = asmap->stripeID;
1817 RF_ReconParityStripeStatus_t *pssPtr;
1818 RF_ReconUnitNum_t which_ru;
1819 RF_StripeNum_t psid;
1820 RF_CallbackDesc_t *cb;
1821
1822 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
1823 RF_LOCK_PSS_MUTEX(raidPtr, psid);
1824 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_NONE, NULL);
1825
1826 /* When recon is forced, the pss desc can get deleted before we get
1827 * back to unblock recon. But, this can _only_ happen when recon is
1828 * forced. It would be good to put some kind of sanity check here, but
1829 * how to decide if recon was just forced or not? */
1830 if (!pssPtr) {
1831 /* printf("Warning: no pss descriptor upon unblock on psid %ld
1832 * RU %d\n",psid,which_ru); */
1833 #if (RF_DEBUG_RECON > 0) || (RF_DEBUG_PSS > 0)
1834 if (rf_reconDebug || rf_pssDebug)
1835 printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n", (long) psid, which_ru);
1836 #endif
1837 goto out;
1838 }
1839 pssPtr->blockCount--;
1840 Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d\n",
1841 raidPtr->raidid, psid, pssPtr->blockCount);
1842 if (pssPtr->blockCount == 0) { /* if recon blockage has been released */
1843
1844 /* unblock recon before calling CauseReconEvent in case
1845 * CauseReconEvent causes us to try to issue a new read before
1846 * returning here. */
1847 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;
1848
1849
1850 while (pssPtr->blockWaitList) {
1851 /* spin through the block-wait list and
1852 release all the waiters */
1853 cb = pssPtr->blockWaitList;
1854 pssPtr->blockWaitList = cb->next;
1855 cb->next = NULL;
1856 rf_CauseReconEvent(raidPtr, cb->col, NULL, RF_REVENT_BLOCKCLEAR);
1857 rf_FreeCallbackDesc(cb);
1858 }
1859 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
1860 /* if no recon was requested while recon was blocked */
1861 rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
1862 }
1863 }
1864 out:
1865 RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1866 return (0);
1867 }
1868
1869 void
1870 rf_WakeupHeadSepCBWaiters(RF_Raid_t *raidPtr)
1871 {
1872 RF_CallbackDesc_t *p;
1873
1874 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1875 while(raidPtr->reconControl->rb_lock) {
1876 ltsleep(&raidPtr->reconControl->rb_lock, PRIBIO,
1877 "rf_wakeuphscbw", 0, &raidPtr->reconControl->rb_mutex);
1878 }
1879
1880 raidPtr->reconControl->rb_lock = 1;
1881 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1882
1883 while (raidPtr->reconControl->headSepCBList) {
1884 p = raidPtr->reconControl->headSepCBList;
1885 raidPtr->reconControl->headSepCBList = p->next;
1886 p->next = NULL;
1887 rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
1888 rf_FreeCallbackDesc(p);
1889 }
1890 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1891 raidPtr->reconControl->rb_lock = 0;
1892 wakeup(&raidPtr->reconControl->rb_lock);
1893 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1894
1895 }
1896
1897