rf_reconstruct.c revision 1.88.2.6 1 /* $NetBSD: rf_reconstruct.c,v 1.88.2.6 2008/02/04 09:23:34 yamt Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /************************************************************
30 *
31 * rf_reconstruct.c -- code to perform on-line reconstruction
32 *
33 ************************************************************/
34
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.88.2.6 2008/02/04 09:23:34 yamt Exp $");
37
38 #include <sys/param.h>
39 #include <sys/time.h>
40 #include <sys/buf.h>
41 #include <sys/errno.h>
42 #include <sys/systm.h>
43 #include <sys/proc.h>
44 #include <sys/ioctl.h>
45 #include <sys/fcntl.h>
46 #include <sys/vnode.h>
47 #include <dev/raidframe/raidframevar.h>
48
49 #include "rf_raid.h"
50 #include "rf_reconutil.h"
51 #include "rf_revent.h"
52 #include "rf_reconbuffer.h"
53 #include "rf_acctrace.h"
54 #include "rf_etimer.h"
55 #include "rf_dag.h"
56 #include "rf_desc.h"
57 #include "rf_debugprint.h"
58 #include "rf_general.h"
59 #include "rf_driver.h"
60 #include "rf_utils.h"
61 #include "rf_shutdown.h"
62
63 #include "rf_kintf.h"
64
65 /* setting these to -1 causes them to be set to their default values if not set by debug options */
66
/*
 * Reconstruction trace macros.
 *
 * When RF_DEBUG_RECON is non-zero, DprintfN(fmt, a1..aN) calls
 * rf_debug_printf() with the N arguments (each widened through
 * unsigned long to void *) padded out with NULLs, but only if
 * rf_reconDebug is set at run time.  Otherwise every macro expands to an
 * empty statement.
 *
 * Every form is wrapped in do { } while (0) so that a trace call followed
 * by a semicolon is exactly one statement.  This keeps
 *	if (x) Dprintf("..."); else ...
 * well-formed and prevents a following "else" from binding to the hidden
 * "if (rf_reconDebug)".  Arguments are parenthesised before being cast so
 * that expression arguments are converted as a whole.
 */
#if RF_DEBUG_RECON

/* Convert one trace argument to the void * form passed to rf_debug_printf(). */
#define RF_RECON_DBGARG(x) ((void *)((unsigned long)(x)))

#define Dprintf(s) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, NULL, NULL, NULL, NULL, \
			    NULL, NULL, NULL, NULL); \
	} while (0)
#define Dprintf1(s,a) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, RF_RECON_DBGARG(a), NULL, NULL, \
			    NULL, NULL, NULL, NULL, NULL); \
	} while (0)
#define Dprintf2(s,a,b) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, RF_RECON_DBGARG(a), \
			    RF_RECON_DBGARG(b), NULL, NULL, NULL, NULL, \
			    NULL, NULL); \
	} while (0)
#define Dprintf3(s,a,b,c) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, RF_RECON_DBGARG(a), \
			    RF_RECON_DBGARG(b), RF_RECON_DBGARG(c), NULL, \
			    NULL, NULL, NULL, NULL); \
	} while (0)
#define Dprintf4(s,a,b,c,d) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, RF_RECON_DBGARG(a), \
			    RF_RECON_DBGARG(b), RF_RECON_DBGARG(c), \
			    RF_RECON_DBGARG(d), NULL, NULL, NULL, NULL); \
	} while (0)
#define Dprintf5(s,a,b,c,d,e) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, RF_RECON_DBGARG(a), \
			    RF_RECON_DBGARG(b), RF_RECON_DBGARG(c), \
			    RF_RECON_DBGARG(d), RF_RECON_DBGARG(e), NULL, \
			    NULL, NULL); \
	} while (0)
#define Dprintf6(s,a,b,c,d,e,f) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, RF_RECON_DBGARG(a), \
			    RF_RECON_DBGARG(b), RF_RECON_DBGARG(c), \
			    RF_RECON_DBGARG(d), RF_RECON_DBGARG(e), \
			    RF_RECON_DBGARG(f), NULL, NULL); \
	} while (0)
#define Dprintf7(s,a,b,c,d,e,f,g) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, RF_RECON_DBGARG(a), \
			    RF_RECON_DBGARG(b), RF_RECON_DBGARG(c), \
			    RF_RECON_DBGARG(d), RF_RECON_DBGARG(e), \
			    RF_RECON_DBGARG(f), RF_RECON_DBGARG(g), NULL); \
	} while (0)

/* The DDprintf forms expand to the same call as the matching Dprintf forms. */
#define DDprintf1(s,a)		Dprintf1(s,a)
#define DDprintf2(s,a,b)	Dprintf2(s,a,b)

#else /* RF_DEBUG_RECON */

#define Dprintf(s)			do { } while (0)
#define Dprintf1(s,a)			do { } while (0)
#define Dprintf2(s,a,b)			do { } while (0)
#define Dprintf3(s,a,b,c)		do { } while (0)
#define Dprintf4(s,a,b,c,d)		do { } while (0)
#define Dprintf5(s,a,b,c,d,e)		do { } while (0)
#define Dprintf6(s,a,b,c,d,e,f)		do { } while (0)
#define Dprintf7(s,a,b,c,d,e,f,g)	do { } while (0)

#define DDprintf1(s,a)			do { } while (0)
#define DDprintf2(s,a,b)		do { } while (0)

#endif /* RF_DEBUG_RECON */
95
/*
 * Status codes that rf_ContinueReconstructFailedDisk() compares against
 * the value returned by ProcessReconEvent().
 */
#define RF_RECON_DONE_READS	1
#define RF_RECON_READ_ERROR	2
#define RF_RECON_WRITE_ERROR	3
#define RF_RECON_READ_STOPPED	4

/*
 * Limits handed to rf_pool_init() for the reconstruction-buffer pool
 * (minimum first, maximum second; see rf_ConfigureReconstruction()).
 */
#define RF_MAX_FREE_RECONBUFFER	32
#define RF_MIN_FREE_RECONBUFFER	16
103
104 static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *, RF_RowCol_t,
105 RF_RaidDisk_t *, int, RF_RowCol_t);
106 static void FreeReconDesc(RF_RaidReconDesc_t *);
107 static int ProcessReconEvent(RF_Raid_t *, RF_ReconEvent_t *);
108 static int IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t);
109 static int TryToRead(RF_Raid_t *, RF_RowCol_t);
110 static int ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t, RF_RowCol_t,
111 RF_SectorNum_t *, RF_SectorNum_t *, RF_RowCol_t *,
112 RF_SectorNum_t *);
113 static int IssueNextWriteRequest(RF_Raid_t *);
114 static int ReconReadDoneProc(void *, int);
115 static int ReconWriteDoneProc(void *, int);
116 static void CheckForNewMinHeadSep(RF_Raid_t *, RF_HeadSepLimit_t);
117 static int CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *,
118 RF_RowCol_t, RF_HeadSepLimit_t,
119 RF_ReconUnitNum_t);
120 static int CheckForcedOrBlockedReconstruction(RF_Raid_t *,
121 RF_ReconParityStripeStatus_t *,
122 RF_PerDiskReconCtrl_t *,
123 RF_RowCol_t, RF_StripeNum_t,
124 RF_ReconUnitNum_t);
125 static void ForceReconReadDoneProc(void *, int);
126 static void rf_ShutdownReconstruction(void *);
127
128 struct RF_ReconDoneProc_s {
129 void (*proc) (RF_Raid_t *, void *);
130 void *arg;
131 RF_ReconDoneProc_t *next;
132 };
133
134 /**************************************************************************
135 *
136 * sets up the parameters that will be used by the reconstruction process
137 * currently there are none, except for those that the layout-specific
138 * configuration (e.g. rf_ConfigureDeclustered) routine sets up.
139 *
140 * in the kernel, we fire off the recon thread.
141 *
142 **************************************************************************/
143 static void
144 rf_ShutdownReconstruction(void *ignored)
145 {
146 pool_destroy(&rf_pools.reconbuffer);
147 }
148
149 int
150 rf_ConfigureReconstruction(RF_ShutdownList_t **listp)
151 {
152
153 rf_pool_init(&rf_pools.reconbuffer, sizeof(RF_ReconBuffer_t),
154 "rf_reconbuffer_pl", RF_MIN_FREE_RECONBUFFER, RF_MAX_FREE_RECONBUFFER);
155 rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);
156
157 return (0);
158 }
159
160 static RF_RaidReconDesc_t *
161 AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t col,
162 RF_RaidDisk_t *spareDiskPtr, int numDisksDone,
163 RF_RowCol_t scol)
164 {
165
166 RF_RaidReconDesc_t *reconDesc;
167
168 RF_Malloc(reconDesc, sizeof(RF_RaidReconDesc_t),
169 (RF_RaidReconDesc_t *));
170 reconDesc->raidPtr = raidPtr;
171 reconDesc->col = col;
172 reconDesc->spareDiskPtr = spareDiskPtr;
173 reconDesc->numDisksDone = numDisksDone;
174 reconDesc->scol = scol;
175 reconDesc->next = NULL;
176
177 return (reconDesc);
178 }
179
180 static void
181 FreeReconDesc(RF_RaidReconDesc_t *reconDesc)
182 {
183 #if RF_RECON_STATS > 0
184 printf("raid%d: %lu recon event waits, %lu recon delays\n",
185 reconDesc->raidPtr->raidid,
186 (long) reconDesc->numReconEventWaits,
187 (long) reconDesc->numReconExecDelays);
188 #endif /* RF_RECON_STATS > 0 */
189 printf("raid%d: %lu max exec ticks\n",
190 reconDesc->raidPtr->raidid,
191 (long) reconDesc->maxReconExecTicks);
192 #if (RF_RECON_STATS > 0) || defined(KERNEL)
193 printf("\n");
194 #endif /* (RF_RECON_STATS > 0) || KERNEL */
195 RF_Free(reconDesc, sizeof(RF_RaidReconDesc_t));
196 }
197
198
199 /*****************************************************************************
200 *
201 * primary routine to reconstruct a failed disk. This should be called from
202 * within its own thread. It won't return until reconstruction completes,
203 * fails, or is aborted.
204 *****************************************************************************/
205 int
206 rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t col)
207 {
208 const RF_LayoutSW_t *lp;
209 int rc;
210
211 lp = raidPtr->Layout.map;
212 if (lp->SubmitReconBuffer) {
213 /*
214 * The current infrastructure only supports reconstructing one
215 * disk at a time for each array.
216 */
217 RF_LOCK_MUTEX(raidPtr->mutex);
218 while (raidPtr->reconInProgress) {
219 RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
220 }
221 raidPtr->reconInProgress++;
222 RF_UNLOCK_MUTEX(raidPtr->mutex);
223 rc = rf_ReconstructFailedDiskBasic(raidPtr, col);
224 RF_LOCK_MUTEX(raidPtr->mutex);
225 raidPtr->reconInProgress--;
226 RF_UNLOCK_MUTEX(raidPtr->mutex);
227 } else {
228 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
229 lp->parityConfig);
230 rc = EIO;
231 }
232 RF_SIGNAL_COND(raidPtr->waitForReconCond);
233 return (rc);
234 }
235
236 int
237 rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t col)
238 {
239 RF_ComponentLabel_t c_label;
240 RF_RaidDisk_t *spareDiskPtr = NULL;
241 RF_RaidReconDesc_t *reconDesc;
242 RF_RowCol_t scol;
243 int numDisksDone = 0, rc;
244
245 /* first look for a spare drive onto which to reconstruct the data */
246 /* spare disk descriptors are stored in row 0. This may have to
247 * change eventually */
248
249 RF_LOCK_MUTEX(raidPtr->mutex);
250 RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed);
251 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
252 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
253 if (raidPtr->status != rf_rs_degraded) {
254 RF_ERRORMSG1("Unable to reconstruct disk at col %d because status not degraded\n", col);
255 RF_UNLOCK_MUTEX(raidPtr->mutex);
256 return (EINVAL);
257 }
258 scol = (-1);
259 } else {
260 #endif
261 for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
262 if (raidPtr->Disks[scol].status == rf_ds_spare) {
263 spareDiskPtr = &raidPtr->Disks[scol];
264 spareDiskPtr->status = rf_ds_used_spare;
265 break;
266 }
267 }
268 if (!spareDiskPtr) {
269 RF_ERRORMSG1("Unable to reconstruct disk at col %d because no spares are available\n", col);
270 RF_UNLOCK_MUTEX(raidPtr->mutex);
271 return (ENOSPC);
272 }
273 printf("RECON: initiating reconstruction on col %d -> spare at col %d\n", col, scol);
274 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
275 }
276 #endif
277 RF_UNLOCK_MUTEX(raidPtr->mutex);
278
279 reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, numDisksDone, scol);
280 raidPtr->reconDesc = (void *) reconDesc;
281 #if RF_RECON_STATS > 0
282 reconDesc->hsStallCount = 0;
283 reconDesc->numReconExecDelays = 0;
284 reconDesc->numReconEventWaits = 0;
285 #endif /* RF_RECON_STATS > 0 */
286 reconDesc->reconExecTimerRunning = 0;
287 reconDesc->reconExecTicks = 0;
288 reconDesc->maxReconExecTicks = 0;
289 rc = rf_ContinueReconstructFailedDisk(reconDesc);
290
291 if (!rc) {
292 /* fix up the component label */
293 /* Don't actually need the read here.. */
294 raidread_component_label(
295 raidPtr->raid_cinfo[scol].ci_dev,
296 raidPtr->raid_cinfo[scol].ci_vp,
297 &c_label);
298
299 raid_init_component_label( raidPtr, &c_label);
300 c_label.row = 0;
301 c_label.column = col;
302 c_label.clean = RF_RAID_DIRTY;
303 c_label.status = rf_ds_optimal;
304 c_label.partitionSize = raidPtr->Disks[scol].partitionSize;
305
306 /* We've just done a rebuild based on all the other
307 disks, so at this point the parity is known to be
308 clean, even if it wasn't before. */
309
310 /* XXX doesn't hold for RAID 6!!*/
311
312 RF_LOCK_MUTEX(raidPtr->mutex);
313 raidPtr->parity_good = RF_RAID_CLEAN;
314 RF_UNLOCK_MUTEX(raidPtr->mutex);
315
316 /* XXXX MORE NEEDED HERE */
317
318 raidwrite_component_label(
319 raidPtr->raid_cinfo[scol].ci_dev,
320 raidPtr->raid_cinfo[scol].ci_vp,
321 &c_label);
322
323 } else {
324 /* Reconstruct failed. */
325
326 RF_LOCK_MUTEX(raidPtr->mutex);
327 /* Failed disk goes back to "failed" status */
328 raidPtr->Disks[col].status = rf_ds_failed;
329
330 /* Spare disk goes back to "spare" status. */
331 spareDiskPtr->status = rf_ds_spare;
332 RF_UNLOCK_MUTEX(raidPtr->mutex);
333
334 }
335 rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
336 return (rc);
337 }
338
339 /*
340
341 Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
342 and you don't get a spare until the next Monday. With this function
343 (and hot-swappable drives) you can now put your new disk containing
344 /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
345 rebuild the data "on the spot".
346
347 */
348
349 int
350 rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t col)
351 {
352 RF_RaidDisk_t *spareDiskPtr = NULL;
353 RF_RaidReconDesc_t *reconDesc;
354 const RF_LayoutSW_t *lp;
355 RF_ComponentLabel_t c_label;
356 int numDisksDone = 0, rc;
357 struct partinfo dpart;
358 struct vnode *vp;
359 struct vattr va;
360 int retcode;
361 int ac;
362
363 lp = raidPtr->Layout.map;
364 if (!lp->SubmitReconBuffer) {
365 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
366 lp->parityConfig);
367 /* wakeup anyone who might be waiting to do a reconstruct */
368 RF_SIGNAL_COND(raidPtr->waitForReconCond);
369 return(EIO);
370 }
371
372 /*
373 * The current infrastructure only supports reconstructing one
374 * disk at a time for each array.
375 */
376 RF_LOCK_MUTEX(raidPtr->mutex);
377
378 if (raidPtr->Disks[col].status != rf_ds_failed) {
379 /* "It's gone..." */
380 raidPtr->numFailures++;
381 raidPtr->Disks[col].status = rf_ds_failed;
382 raidPtr->status = rf_rs_degraded;
383 RF_UNLOCK_MUTEX(raidPtr->mutex);
384 rf_update_component_labels(raidPtr,
385 RF_NORMAL_COMPONENT_UPDATE);
386 RF_LOCK_MUTEX(raidPtr->mutex);
387 }
388
389 while (raidPtr->reconInProgress) {
390 RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
391 }
392
393 raidPtr->reconInProgress++;
394
395 /* first look for a spare drive onto which to reconstruct the
396 data. spare disk descriptors are stored in row 0. This
397 may have to change eventually */
398
399 /* Actually, we don't care if it's failed or not... On a RAID
400 set with correct parity, this function should be callable
401 on any component without ill effects. */
402 /* RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); */
403
404 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
405 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
406 RF_ERRORMSG1("Unable to reconstruct to disk at col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", col);
407
408 raidPtr->reconInProgress--;
409 RF_UNLOCK_MUTEX(raidPtr->mutex);
410 RF_SIGNAL_COND(raidPtr->waitForReconCond);
411 return (EINVAL);
412 }
413 #endif
414
415 /* This device may have been opened successfully the
416 first time. Close it before trying to open it again.. */
417
418 if (raidPtr->raid_cinfo[col].ci_vp != NULL) {
419 #if 0
420 printf("Closed the open device: %s\n",
421 raidPtr->Disks[col].devname);
422 #endif
423 vp = raidPtr->raid_cinfo[col].ci_vp;
424 ac = raidPtr->Disks[col].auto_configured;
425 RF_UNLOCK_MUTEX(raidPtr->mutex);
426 rf_close_component(raidPtr, vp, ac);
427 RF_LOCK_MUTEX(raidPtr->mutex);
428 raidPtr->raid_cinfo[col].ci_vp = NULL;
429 }
430 /* note that this disk was *not* auto_configured (any longer)*/
431 raidPtr->Disks[col].auto_configured = 0;
432
433 #if 0
434 printf("About to (re-)open the device for rebuilding: %s\n",
435 raidPtr->Disks[col].devname);
436 #endif
437 RF_UNLOCK_MUTEX(raidPtr->mutex);
438 retcode = dk_lookup(raidPtr->Disks[col].devname, curlwp, &vp, UIO_SYSSPACE);
439
440 if (retcode) {
441 printf("raid%d: rebuilding: dk_lookup on device: %s failed: %d!\n",raidPtr->raidid,
442 raidPtr->Disks[col].devname, retcode);
443
444 /* the component isn't responding properly...
445 must be still dead :-( */
446 RF_LOCK_MUTEX(raidPtr->mutex);
447 raidPtr->reconInProgress--;
448 RF_UNLOCK_MUTEX(raidPtr->mutex);
449 RF_SIGNAL_COND(raidPtr->waitForReconCond);
450 return(retcode);
451 }
452
453 /* Ok, so we can at least do a lookup...
454 How about actually getting a vp for it? */
455
456 if ((retcode = VOP_GETATTR(vp, &va, curlwp->l_cred)) != 0) {
457 RF_LOCK_MUTEX(raidPtr->mutex);
458 raidPtr->reconInProgress--;
459 RF_UNLOCK_MUTEX(raidPtr->mutex);
460 RF_SIGNAL_COND(raidPtr->waitForReconCond);
461 return(retcode);
462 }
463
464 retcode = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, curlwp->l_cred);
465 if (retcode) {
466 RF_LOCK_MUTEX(raidPtr->mutex);
467 raidPtr->reconInProgress--;
468 RF_UNLOCK_MUTEX(raidPtr->mutex);
469 RF_SIGNAL_COND(raidPtr->waitForReconCond);
470 return(retcode);
471 }
472 RF_LOCK_MUTEX(raidPtr->mutex);
473 raidPtr->Disks[col].blockSize = dpart.disklab->d_secsize;
474
475 raidPtr->Disks[col].numBlocks = dpart.part->p_size -
476 rf_protectedSectors;
477
478 raidPtr->raid_cinfo[col].ci_vp = vp;
479 raidPtr->raid_cinfo[col].ci_dev = va.va_rdev;
480
481 raidPtr->Disks[col].dev = va.va_rdev;
482
483 /* we allow the user to specify that only a fraction
484 of the disks should be used this is just for debug:
485 it speeds up * the parity scan */
486 raidPtr->Disks[col].numBlocks = raidPtr->Disks[col].numBlocks *
487 rf_sizePercentage / 100;
488 RF_UNLOCK_MUTEX(raidPtr->mutex);
489
490 spareDiskPtr = &raidPtr->Disks[col];
491 spareDiskPtr->status = rf_ds_used_spare;
492
493 printf("raid%d: initiating in-place reconstruction on column %d\n",
494 raidPtr->raidid, col);
495
496 reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr,
497 numDisksDone, col);
498 raidPtr->reconDesc = (void *) reconDesc;
499 #if RF_RECON_STATS > 0
500 reconDesc->hsStallCount = 0;
501 reconDesc->numReconExecDelays = 0;
502 reconDesc->numReconEventWaits = 0;
503 #endif /* RF_RECON_STATS > 0 */
504 reconDesc->reconExecTimerRunning = 0;
505 reconDesc->reconExecTicks = 0;
506 reconDesc->maxReconExecTicks = 0;
507 rc = rf_ContinueReconstructFailedDisk(reconDesc);
508
509 if (!rc) {
510 RF_LOCK_MUTEX(raidPtr->mutex);
511 /* Need to set these here, as at this point it'll be claiming
512 that the disk is in rf_ds_spared! But we know better :-) */
513
514 raidPtr->Disks[col].status = rf_ds_optimal;
515 raidPtr->status = rf_rs_optimal;
516 RF_UNLOCK_MUTEX(raidPtr->mutex);
517
518 /* fix up the component label */
519 /* Don't actually need the read here.. */
520 raidread_component_label(raidPtr->raid_cinfo[col].ci_dev,
521 raidPtr->raid_cinfo[col].ci_vp,
522 &c_label);
523
524 RF_LOCK_MUTEX(raidPtr->mutex);
525 raid_init_component_label(raidPtr, &c_label);
526
527 c_label.row = 0;
528 c_label.column = col;
529
530 /* We've just done a rebuild based on all the other
531 disks, so at this point the parity is known to be
532 clean, even if it wasn't before. */
533
534 /* XXX doesn't hold for RAID 6!!*/
535
536 raidPtr->parity_good = RF_RAID_CLEAN;
537 RF_UNLOCK_MUTEX(raidPtr->mutex);
538
539 raidwrite_component_label(raidPtr->raid_cinfo[col].ci_dev,
540 raidPtr->raid_cinfo[col].ci_vp,
541 &c_label);
542
543 } else {
544 /* Reconstruct-in-place failed. Disk goes back to
545 "failed" status, regardless of what it was before. */
546 RF_LOCK_MUTEX(raidPtr->mutex);
547 raidPtr->Disks[col].status = rf_ds_failed;
548 RF_UNLOCK_MUTEX(raidPtr->mutex);
549 }
550
551 rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
552
553 RF_LOCK_MUTEX(raidPtr->mutex);
554 raidPtr->reconInProgress--;
555 RF_UNLOCK_MUTEX(raidPtr->mutex);
556
557 RF_SIGNAL_COND(raidPtr->waitForReconCond);
558 return (rc);
559 }
560
561
562 int
563 rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc)
564 {
565 RF_Raid_t *raidPtr = reconDesc->raidPtr;
566 RF_RowCol_t col = reconDesc->col;
567 RF_RowCol_t scol = reconDesc->scol;
568 RF_ReconMap_t *mapPtr;
569 RF_ReconCtrl_t *tmp_reconctrl;
570 RF_ReconEvent_t *event;
571 RF_CallbackDesc_t *p;
572 struct timeval etime, elpsd;
573 unsigned long xor_s, xor_resid_us;
574 int i, ds;
575 int status;
576 int recon_error, write_error;
577
578 raidPtr->accumXorTimeUs = 0;
579 #if RF_ACC_TRACE > 0
580 /* create one trace record per physical disk */
581 RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
582 #endif
583
584 /* quiesce the array prior to starting recon. this is needed
585 * to assure no nasty interactions with pending user writes.
586 * We need to do this before we change the disk or row status. */
587
588 Dprintf("RECON: begin request suspend\n");
589 rf_SuspendNewRequestsAndWait(raidPtr);
590 Dprintf("RECON: end request suspend\n");
591
592 /* allocate our RF_ReconCTRL_t before we protect raidPtr->reconControl[row] */
593 tmp_reconctrl = rf_MakeReconControl(reconDesc, col, scol);
594
595 RF_LOCK_MUTEX(raidPtr->mutex);
596
597 /* create the reconstruction control pointer and install it in
598 * the right slot */
599 raidPtr->reconControl = tmp_reconctrl;
600 mapPtr = raidPtr->reconControl->reconMap;
601 raidPtr->reconControl->numRUsTotal = mapPtr->totalRUs;
602 raidPtr->reconControl->numRUsComplete = 0;
603 raidPtr->status = rf_rs_reconstructing;
604 raidPtr->Disks[col].status = rf_ds_reconstructing;
605 raidPtr->Disks[col].spareCol = scol;
606
607 RF_UNLOCK_MUTEX(raidPtr->mutex);
608
609 RF_GETTIME(raidPtr->reconControl->starttime);
610
611 /* now start up the actual reconstruction: issue a read for
612 * each surviving disk */
613
614 reconDesc->numDisksDone = 0;
615 for (i = 0; i < raidPtr->numCol; i++) {
616 if (i != col) {
617 /* find and issue the next I/O on the
618 * indicated disk */
619 if (IssueNextReadRequest(raidPtr, i)) {
620 Dprintf1("RECON: done issuing for c%d\n", i);
621 reconDesc->numDisksDone++;
622 }
623 }
624 }
625
626 Dprintf("RECON: resume requests\n");
627 rf_ResumeNewRequests(raidPtr);
628
629 /* process reconstruction events until all disks report that
630 * they've completed all work */
631
632 mapPtr = raidPtr->reconControl->reconMap;
633 recon_error = 0;
634 write_error = 0;
635
636 while (reconDesc->numDisksDone < raidPtr->numCol - 1) {
637
638 event = rf_GetNextReconEvent(reconDesc);
639 status = ProcessReconEvent(raidPtr, event);
640
641 /* the normal case is that a read completes, and all is well. */
642 if (status == RF_RECON_DONE_READS) {
643 reconDesc->numDisksDone++;
644 } else if ((status == RF_RECON_READ_ERROR) ||
645 (status == RF_RECON_WRITE_ERROR)) {
646 /* an error was encountered while reconstructing...
647 Pretend we've finished this disk.
648 */
649 recon_error = 1;
650 raidPtr->reconControl->error = 1;
651
652 /* bump the numDisksDone count for reads,
653 but not for writes */
654 if (status == RF_RECON_READ_ERROR)
655 reconDesc->numDisksDone++;
656
657 /* write errors are special -- when we are
658 done dealing with the reads that are
659 finished, we don't want to wait for any
660 writes */
661 if (status == RF_RECON_WRITE_ERROR)
662 write_error = 1;
663
664 } else if (status == RF_RECON_READ_STOPPED) {
665 /* count this component as being "done" */
666 reconDesc->numDisksDone++;
667 }
668
669 if (recon_error) {
670
671 /* make sure any stragglers are woken up so that
672 their theads will complete, and we can get out
673 of here with all IO processed */
674
675 while (raidPtr->reconControl->headSepCBList) {
676 p = raidPtr->reconControl->headSepCBList;
677 raidPtr->reconControl->headSepCBList = p->next;
678 p->next = NULL;
679 rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
680 rf_FreeCallbackDesc(p);
681 }
682 }
683
684 raidPtr->reconControl->numRUsTotal =
685 mapPtr->totalRUs;
686 raidPtr->reconControl->numRUsComplete =
687 mapPtr->totalRUs -
688 rf_UnitsLeftToReconstruct(mapPtr);
689
690 #if RF_DEBUG_RECON
691 raidPtr->reconControl->percentComplete =
692 (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
693 if (rf_prReconSched) {
694 rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
695 }
696 #endif
697 }
698
699 mapPtr = raidPtr->reconControl->reconMap;
700 if (rf_reconDebug) {
701 printf("RECON: all reads completed\n");
702 }
703 /* at this point all the reads have completed. We now wait
704 * for any pending writes to complete, and then we're done */
705
706 while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) {
707
708 event = rf_GetNextReconEvent(reconDesc);
709 status = ProcessReconEvent(raidPtr, event);
710
711 if (status == RF_RECON_WRITE_ERROR) {
712 recon_error = 1;
713 raidPtr->reconControl->error = 1;
714 /* an error was encountered at the very end... bail */
715 } else {
716 #if RF_DEBUG_RECON
717 raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
718 if (rf_prReconSched) {
719 rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
720 }
721 #endif
722 }
723 }
724
725 if (recon_error) {
726 /* we've encountered an error in reconstructing. */
727 printf("raid%d: reconstruction failed.\n", raidPtr->raidid);
728
729 /* we start by blocking IO to the RAID set. */
730 rf_SuspendNewRequestsAndWait(raidPtr);
731
732 RF_LOCK_MUTEX(raidPtr->mutex);
733 /* mark set as being degraded, rather than
734 rf_rs_reconstructing as we were before the problem.
735 After this is done we can update status of the
736 component disks without worrying about someone
737 trying to read from a failed component.
738 */
739 raidPtr->status = rf_rs_degraded;
740 RF_UNLOCK_MUTEX(raidPtr->mutex);
741
742 /* resume IO */
743 rf_ResumeNewRequests(raidPtr);
744
745 /* At this point there are two cases:
746 1) If we've experienced a read error, then we've
747 already waited for all the reads we're going to get,
748 and we just need to wait for the writes.
749
750 2) If we've experienced a write error, we've also
751 already waited for all the reads to complete,
752 but there is little point in waiting for the writes --
753 when they do complete, they will just be ignored.
754
755 So we just wait for writes to complete if we didn't have a
756 write error.
757 */
758
759 if (!write_error) {
760 /* wait for writes to complete */
761 while (raidPtr->reconControl->pending_writes > 0) {
762
763 event = rf_GetNextReconEvent(reconDesc);
764 status = ProcessReconEvent(raidPtr, event);
765
766 if (status == RF_RECON_WRITE_ERROR) {
767 raidPtr->reconControl->error = 1;
768 /* an error was encountered at the very end... bail.
769 This will be very bad news for the user, since
770 at this point there will have been a read error
771 on one component, and a write error on another!
772 */
773 break;
774 }
775 }
776 }
777
778
779 /* cleanup */
780
781 /* drain the event queue - after waiting for the writes above,
782 there shouldn't be much (if anything!) left in the queue. */
783
784 rf_DrainReconEventQueue(reconDesc);
785
786 /* XXX As much as we'd like to free the recon control structure
787 and the reconDesc, we have no way of knowing if/when those will
788 be touched by IO that has yet to occur. It is rather poor to be
789 basically causing a 'memory leak' here, but there doesn't seem to be
790 a cleaner alternative at this time. Perhaps when the reconstruct code
791 gets a makeover this problem will go away.
792 */
793 #if 0
794 rf_FreeReconControl(raidPtr);
795 #endif
796
797 #if RF_ACC_TRACE > 0
798 RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
799 #endif
800 /* XXX see comment above */
801 #if 0
802 FreeReconDesc(reconDesc);
803 #endif
804
805 return (1);
806 }
807
808 /* Success: mark the dead disk as reconstructed. We quiesce
809 * the array here to assure no nasty interactions with pending
810 * user accesses when we free up the psstatus structure as
811 * part of FreeReconControl() */
812
813 rf_SuspendNewRequestsAndWait(raidPtr);
814
815 RF_LOCK_MUTEX(raidPtr->mutex);
816 raidPtr->numFailures--;
817 ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
818 raidPtr->Disks[col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared;
819 raidPtr->status = (ds) ? rf_rs_reconfigured : rf_rs_optimal;
820 RF_UNLOCK_MUTEX(raidPtr->mutex);
821 RF_GETTIME(etime);
822 RF_TIMEVAL_DIFF(&(raidPtr->reconControl->starttime), &etime, &elpsd);
823
824 rf_ResumeNewRequests(raidPtr);
825
826 printf("raid%d: Reconstruction of disk at col %d completed\n",
827 raidPtr->raidid, col);
828 xor_s = raidPtr->accumXorTimeUs / 1000000;
829 xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
830 printf("raid%d: Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n",
831 raidPtr->raidid,
832 (int) elpsd.tv_sec, (int) elpsd.tv_usec,
833 raidPtr->accumXorTimeUs, xor_s, xor_resid_us);
834 printf("raid%d: (start time %d sec %d usec, end time %d sec %d usec)\n",
835 raidPtr->raidid,
836 (int) raidPtr->reconControl->starttime.tv_sec,
837 (int) raidPtr->reconControl->starttime.tv_usec,
838 (int) etime.tv_sec, (int) etime.tv_usec);
839 #if RF_RECON_STATS > 0
840 printf("raid%d: Total head-sep stall count was %d\n",
841 raidPtr->raidid, (int) reconDesc->hsStallCount);
842 #endif /* RF_RECON_STATS > 0 */
843 rf_FreeReconControl(raidPtr);
844 #if RF_ACC_TRACE > 0
845 RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
846 #endif
847 FreeReconDesc(reconDesc);
848
849 return (0);
850
851 }
852 /*****************************************************************************
853 * do the right thing upon each reconstruction event.
854 *****************************************************************************/
855 static int
856 ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event)
857 {
858 int retcode = 0, submitblocked;
859 RF_ReconBuffer_t *rbuf;
860 RF_SectorCount_t sectorsPerRU;
861
862 retcode = RF_RECON_READ_STOPPED;
863
864 Dprintf1("RECON: ProcessReconEvent type %d\n", event->type);
865 switch (event->type) {
866
867 /* a read I/O has completed */
868 case RF_REVENT_READDONE:
869 rbuf = raidPtr->reconControl->perDiskInfo[event->col].rbuf;
870 Dprintf2("RECON: READDONE EVENT: col %d psid %ld\n",
871 event->col, rbuf->parityStripeID);
872 Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x %02x %02x\n",
873 rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
874 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
875 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
876 if (!raidPtr->reconControl->error) {
877 submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
878 Dprintf1("RECON: submitblocked=%d\n", submitblocked);
879 if (!submitblocked)
880 retcode = IssueNextReadRequest(raidPtr, event->col);
881 else
882 retcode = 0;
883 }
884 break;
885
886 /* a write I/O has completed */
887 case RF_REVENT_WRITEDONE:
888 #if RF_DEBUG_RECON
889 if (rf_floatingRbufDebug) {
890 rf_CheckFloatingRbufCount(raidPtr, 1);
891 }
892 #endif
893 sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
894 rbuf = (RF_ReconBuffer_t *) event->arg;
895 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
896 Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n",
897 rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl->percentComplete);
898 rf_ReconMapUpdate(raidPtr, raidPtr->reconControl->reconMap,
899 rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
900 rf_RemoveFromActiveReconTable(raidPtr, rbuf->parityStripeID, rbuf->which_ru);
901
902 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
903 raidPtr->reconControl->pending_writes--;
904 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
905
906 if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
907 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
908 while(raidPtr->reconControl->rb_lock) {
909 ltsleep(&raidPtr->reconControl->rb_lock, PRIBIO, "reconctrlpre1", 0,
910 &raidPtr->reconControl->rb_mutex);
911 }
912 raidPtr->reconControl->rb_lock = 1;
913 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
914
915 raidPtr->numFullReconBuffers--;
916 rf_ReleaseFloatingReconBuffer(raidPtr, rbuf);
917
918 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
919 raidPtr->reconControl->rb_lock = 0;
920 wakeup(&raidPtr->reconControl->rb_lock);
921 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
922 } else
923 if (rbuf->type == RF_RBUF_TYPE_FORCED)
924 rf_FreeReconBuffer(rbuf);
925 else
926 RF_ASSERT(0);
927 retcode = 0;
928 break;
929
930 case RF_REVENT_BUFCLEAR: /* A buffer-stall condition has been
931 * cleared */
932 Dprintf1("RECON: BUFCLEAR EVENT: col %d\n", event->col);
933 if (!raidPtr->reconControl->error) {
934 submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl->perDiskInfo[event->col].rbuf,
935 0, (int) (long) event->arg);
936 RF_ASSERT(!submitblocked); /* we wouldn't have gotten the
937 * BUFCLEAR event if we
938 * couldn't submit */
939 retcode = IssueNextReadRequest(raidPtr, event->col);
940 }
941 break;
942
943 case RF_REVENT_BLOCKCLEAR: /* A user-write reconstruction
944 * blockage has been cleared */
945 DDprintf1("RECON: BLOCKCLEAR EVENT: col %d\n", event->col);
946 if (!raidPtr->reconControl->error) {
947 retcode = TryToRead(raidPtr, event->col);
948 }
949 break;
950
951 case RF_REVENT_HEADSEPCLEAR: /* A max-head-separation
952 * reconstruction blockage has been
953 * cleared */
954 Dprintf1("RECON: HEADSEPCLEAR EVENT: col %d\n", event->col);
955 if (!raidPtr->reconControl->error) {
956 retcode = TryToRead(raidPtr, event->col);
957 }
958 break;
959
960 /* a buffer has become ready to write */
961 case RF_REVENT_BUFREADY:
962 Dprintf1("RECON: BUFREADY EVENT: col %d\n", event->col);
963 if (!raidPtr->reconControl->error) {
964 retcode = IssueNextWriteRequest(raidPtr);
965 #if RF_DEBUG_RECON
966 if (rf_floatingRbufDebug) {
967 rf_CheckFloatingRbufCount(raidPtr, 1);
968 }
969 #endif
970 }
971 break;
972
973 /* we need to skip the current RU entirely because it got
974 * recon'd while we were waiting for something else to happen */
975 case RF_REVENT_SKIP:
976 DDprintf1("RECON: SKIP EVENT: col %d\n", event->col);
977 if (!raidPtr->reconControl->error) {
978 retcode = IssueNextReadRequest(raidPtr, event->col);
979 }
980 break;
981
982 /* a forced-reconstruction read access has completed. Just
983 * submit the buffer */
984 case RF_REVENT_FORCEDREADDONE:
985 rbuf = (RF_ReconBuffer_t *) event->arg;
986 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
987 DDprintf1("RECON: FORCEDREADDONE EVENT: col %d\n", event->col);
988 if (!raidPtr->reconControl->error) {
989 submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
990 RF_ASSERT(!submitblocked);
991 }
992 break;
993
994 /* A read I/O failed to complete */
995 case RF_REVENT_READ_FAILED:
996 retcode = RF_RECON_READ_ERROR;
997 break;
998
999 /* A write I/O failed to complete */
1000 case RF_REVENT_WRITE_FAILED:
1001 retcode = RF_RECON_WRITE_ERROR;
1002
1003 rbuf = (RF_ReconBuffer_t *) event->arg;
1004
1005 /* cleanup the disk queue data */
1006 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
1007
1008 /* At this point we're erroring out, badly, and floatingRbufs
1009 may not even be valid. Rather than putting this back onto
1010 the floatingRbufs list, just arrange for its immediate
1011 destruction.
1012 */
1013 rf_FreeReconBuffer(rbuf);
1014 break;
1015
1016 /* a forced read I/O failed to complete */
1017 case RF_REVENT_FORCEDREAD_FAILED:
1018 retcode = RF_RECON_READ_ERROR;
1019 break;
1020
1021 default:
1022 RF_PANIC();
1023 }
1024 rf_FreeReconEventDesc(event);
1025 return (retcode);
1026 }
1027 /*****************************************************************************
1028 *
1029 * find the next thing that's needed on the indicated disk, and issue
1030 * a read request for it. We assume that the reconstruction buffer
1031 * associated with this process is free to receive the data. If
1032 * reconstruction is blocked on the indicated RU, we issue a
1033 * blockage-release request instead of a physical disk read request.
1034 * If the current disk gets too far ahead of the others, we issue a
1035 * head-separation wait request and return.
1036 *
1037 * ctrl->{ru_count, curPSID, diskOffset} and
1038 * rbuf->failedDiskSectorOffset are maintained to point to the unit
1039 * we're currently accessing. Note that this deviates from the
1040 * standard C idiom of having counters point to the next thing to be
1041 * accessed. This allows us to easily retry when we're blocked by
1042 * head separation or reconstruction-blockage events.
1043 *
1044 *****************************************************************************/
1045 static int
1046 IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t col)
1047 {
1048 RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
1049 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1050 RF_ReconBuffer_t *rbuf = ctrl->rbuf;
1051 RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
1052 RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
1053 int do_new_check = 0, retcode = 0, status;
1054
1055 /* if we are currently the slowest disk, mark that we have to do a new
1056 * check */
1057 if (ctrl->headSepCounter <= raidPtr->reconControl->minHeadSepCounter)
1058 do_new_check = 1;
1059
1060 while (1) {
1061
1062 ctrl->ru_count++;
1063 if (ctrl->ru_count < RUsPerPU) {
1064 ctrl->diskOffset += sectorsPerRU;
1065 rbuf->failedDiskSectorOffset += sectorsPerRU;
1066 } else {
1067 ctrl->curPSID++;
1068 ctrl->ru_count = 0;
1069 /* code left over from when head-sep was based on
1070 * parity stripe id */
1071 if (ctrl->curPSID >= raidPtr->reconControl->lastPSID) {
1072 CheckForNewMinHeadSep(raidPtr, ++(ctrl->headSepCounter));
1073 return (RF_RECON_DONE_READS); /* finito! */
1074 }
1075 /* find the disk offsets of the start of the parity
1076 * stripe on both the current disk and the failed
1077 * disk. skip this entire parity stripe if either disk
1078 * does not appear in the indicated PS */
1079 status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset,
1080 &rbuf->spCol, &rbuf->spOffset);
1081 if (status) {
1082 ctrl->ru_count = RUsPerPU - 1;
1083 continue;
1084 }
1085 }
1086 rbuf->which_ru = ctrl->ru_count;
1087
1088 /* skip this RU if it's already been reconstructed */
1089 if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, rbuf->failedDiskSectorOffset)) {
1090 Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count);
1091 continue;
1092 }
1093 break;
1094 }
1095 ctrl->headSepCounter++;
1096 if (do_new_check)
1097 CheckForNewMinHeadSep(raidPtr, ctrl->headSepCounter); /* update min if needed */
1098
1099
1100 /* at this point, we have definitely decided what to do, and we have
1101 * only to see if we can actually do it now */
1102 rbuf->parityStripeID = ctrl->curPSID;
1103 rbuf->which_ru = ctrl->ru_count;
1104 #if RF_ACC_TRACE > 0
1105 memset((char *) &raidPtr->recon_tracerecs[col], 0,
1106 sizeof(raidPtr->recon_tracerecs[col]));
1107 raidPtr->recon_tracerecs[col].reconacc = 1;
1108 RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
1109 #endif
1110 retcode = TryToRead(raidPtr, col);
1111 return (retcode);
1112 }
1113
1114 /*
1115 * tries to issue the next read on the indicated disk. We may be
1116 * blocked by (a) the heads being too far apart, or (b) recon on the
1117 * indicated RU being blocked due to a write by a user thread. In
1118 * this case, we issue a head-sep or blockage wait request, which will
1119 * cause this same routine to be invoked again later when the blockage
1120 * has cleared.
1121 */
1122
1123 static int
1124 TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t col)
1125 {
1126 RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
1127 RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
1128 RF_StripeNum_t psid = ctrl->curPSID;
1129 RF_ReconUnitNum_t which_ru = ctrl->ru_count;
1130 RF_DiskQueueData_t *req;
1131 int status;
1132 RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;
1133
1134 /* if the current disk is too far ahead of the others, issue a
1135 * head-separation wait and return */
1136 if (CheckHeadSeparation(raidPtr, ctrl, col, ctrl->headSepCounter, which_ru))
1137 return (0);
1138
1139 /* allocate a new PSS in case we need it */
1140 newpssPtr = rf_AllocPSStatus(raidPtr);
1141
1142 RF_LOCK_PSS_MUTEX(raidPtr, psid);
1143 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE, newpssPtr);
1144
1145 if (pssPtr != newpssPtr) {
1146 rf_FreePSStatus(raidPtr, newpssPtr);
1147 }
1148
1149 /* if recon is blocked on the indicated parity stripe, issue a
1150 * block-wait request and return. this also must mark the indicated RU
1151 * in the stripe as under reconstruction if not blocked. */
1152 status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, col, psid, which_ru);
1153 if (status == RF_PSS_RECON_BLOCKED) {
1154 Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru);
1155 goto out;
1156 } else
1157 if (status == RF_PSS_FORCED_ON_WRITE) {
1158 rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
1159 goto out;
1160 }
1161 /* make one last check to be sure that the indicated RU didn't get
1162 * reconstructed while we were waiting for something else to happen.
1163 * This is unfortunate in that it causes us to make this check twice
1164 * in the normal case. Might want to make some attempt to re-work
1165 * this so that we only do this check if we've definitely blocked on
1166 * one of the above checks. When this condition is detected, we may
1167 * have just created a bogus status entry, which we need to delete. */
1168 if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, ctrl->rbuf->failedDiskSectorOffset)) {
1169 Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru);
1170 if (pssPtr == newpssPtr)
1171 rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
1172 rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
1173 goto out;
1174 }
1175 /* found something to read. issue the I/O */
1176 Dprintf4("RECON: Read for psid %ld on col %d offset %ld buf %lx\n",
1177 psid, col, ctrl->diskOffset, ctrl->rbuf->buffer);
1178 #if RF_ACC_TRACE > 0
1179 RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
1180 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
1181 raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
1182 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
1183 RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
1184 #endif
1185 /* should be ok to use a NULL proc pointer here, all the bufs we use
1186 * should be in kernel space */
1187 req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
1188 ReconReadDoneProc, (void *) ctrl,
1189 #if RF_ACC_TRACE > 0
1190 &raidPtr->recon_tracerecs[col],
1191 #else
1192 NULL,
1193 #endif
1194 (void *) raidPtr, 0, NULL, PR_WAITOK);
1195
1196 ctrl->rbuf->arg = (void *) req;
1197 rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_RECON_PRIORITY);
1198 pssPtr->issued[col] = 1;
1199
1200 out:
1201 RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1202 return (0);
1203 }
1204
1205
1206 /*
1207 * given a parity stripe ID, we want to find out whether both the
1208 * current disk and the failed disk exist in that parity stripe. If
1209 * not, we want to skip this whole PS. If so, we want to find the
1210 * disk offset of the start of the PS on both the current disk and the
1211 * failed disk.
1212 *
1213 * this works by getting a list of disks comprising the indicated
1214 * parity stripe, and searching the list for the current and failed
1215 * disks. Once we've decided they both exist in the parity stripe, we
1216 * need to decide whether each is data or parity, so that we'll know
1217 * which mapping function to call to get the corresponding disk
1218 * offsets.
1219 *
1220 * this is kind of unpleasant, but doing it this way allows the
1221 * reconstruction code to use parity stripe IDs rather than physical
1222 * disks address to march through the failed disk, which greatly
1223 * simplifies a lot of code, as well as eliminating the need for a
1224 * reverse-mapping function. I also think it will execute faster,
1225 * since the calls to the mapping module are kept to a minimum.
1226 *
1227 * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING
1228 * THE STRIPE IN THE CORRECT ORDER
1229 *
1230 * raidPtr - raid descriptor
1231 * psid - parity stripe identifier
1232 * col - column of disk to find the offsets for
1233 * spCol - out: col of spare unit for failed unit
1234 * spOffset - out: offset into disk containing spare unit
1235 *
1236 */
1237
1238
1239 static int
1240 ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid,
1241 RF_RowCol_t col, RF_SectorNum_t *outDiskOffset,
1242 RF_SectorNum_t *outFailedDiskSectorOffset,
1243 RF_RowCol_t *spCol, RF_SectorNum_t *spOffset)
1244 {
1245 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1246 RF_RowCol_t fcol = raidPtr->reconControl->fcol;
1247 RF_RaidAddr_t sosRaidAddress; /* start-of-stripe */
1248 RF_RowCol_t *diskids;
1249 u_int i, j, k, i_offset, j_offset;
1250 RF_RowCol_t pcol;
1251 int testcol;
1252 RF_SectorNum_t poffset;
1253 char i_is_parity = 0, j_is_parity = 0;
1254 RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
1255
1256 /* get a listing of the disks comprising that stripe */
1257 sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
1258 (layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids);
1259 RF_ASSERT(diskids);
1260
1261 /* reject this entire parity stripe if it does not contain the
1262 * indicated disk or it does not contain the failed disk */
1263
1264 for (i = 0; i < stripeWidth; i++) {
1265 if (col == diskids[i])
1266 break;
1267 }
1268 if (i == stripeWidth)
1269 goto skipit;
1270 for (j = 0; j < stripeWidth; j++) {
1271 if (fcol == diskids[j])
1272 break;
1273 }
1274 if (j == stripeWidth) {
1275 goto skipit;
1276 }
1277 /* find out which disk the parity is on */
1278 (layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &pcol, &poffset, RF_DONT_REMAP);
1279
1280 /* find out if either the current RU or the failed RU is parity */
1281 /* also, if the parity occurs in this stripe prior to the data and/or
1282 * failed col, we need to decrement i and/or j */
1283 for (k = 0; k < stripeWidth; k++)
1284 if (diskids[k] == pcol)
1285 break;
1286 RF_ASSERT(k < stripeWidth);
1287 i_offset = i;
1288 j_offset = j;
1289 if (k < i)
1290 i_offset--;
1291 else
1292 if (k == i) {
1293 i_is_parity = 1;
1294 i_offset = 0;
1295 } /* set offsets to zero to disable multiply
1296 * below */
1297 if (k < j)
1298 j_offset--;
1299 else
1300 if (k == j) {
1301 j_is_parity = 1;
1302 j_offset = 0;
1303 }
1304 /* at this point, [ij]_is_parity tells us whether the [current,failed]
1305 * disk is parity at the start of this RU, and, if data, "[ij]_offset"
1306 * tells us how far into the stripe the [current,failed] disk is. */
1307
1308 /* call the mapping routine to get the offset into the current disk,
1309 * repeat for failed disk. */
1310 if (i_is_parity)
1311 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
1312 else
1313 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
1314
1315 RF_ASSERT(col == testcol);
1316
1317 if (j_is_parity)
1318 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
1319 else
1320 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
1321 RF_ASSERT(fcol == testcol);
1322
1323 /* now locate the spare unit for the failed unit */
1324 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
1325 if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
1326 if (j_is_parity)
1327 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
1328 else
1329 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
1330 } else {
1331 #endif
1332 *spCol = raidPtr->reconControl->spareCol;
1333 *spOffset = *outFailedDiskSectorOffset;
1334 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
1335 }
1336 #endif
1337 return (0);
1338
1339 skipit:
1340 Dprintf2("RECON: Skipping psid %ld: nothing needed from c%d\n",
1341 psid, col);
1342 return (1);
1343 }
1344 /* this is called when a buffer has become ready to write to the replacement disk */
1345 static int
1346 IssueNextWriteRequest(RF_Raid_t *raidPtr)
1347 {
1348 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1349 RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
1350 #if RF_ACC_TRACE > 0
1351 RF_RowCol_t fcol = raidPtr->reconControl->fcol;
1352 #endif
1353 RF_ReconBuffer_t *rbuf;
1354 RF_DiskQueueData_t *req;
1355
1356 rbuf = rf_GetFullReconBuffer(raidPtr->reconControl);
1357 RF_ASSERT(rbuf); /* there must be one available, or we wouldn't
1358 * have gotten the event that sent us here */
1359 RF_ASSERT(rbuf->pssPtr);
1360
1361 rbuf->pssPtr->writeRbuf = rbuf;
1362 rbuf->pssPtr = NULL;
1363
1364 Dprintf6("RECON: New write (c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n",
1365 rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID,
1366 rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer);
1367 Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x\n",
1368 rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
1369 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
1370
1371 /* should be ok to use a NULL b_proc here b/c all addrs should be in
1372 * kernel space */
1373 req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset,
1374 sectorsPerRU, rbuf->buffer,
1375 rbuf->parityStripeID, rbuf->which_ru,
1376 ReconWriteDoneProc, (void *) rbuf,
1377 #if RF_ACC_TRACE > 0
1378 &raidPtr->recon_tracerecs[fcol],
1379 #else
1380 NULL,
1381 #endif
1382 (void *) raidPtr, 0, NULL, PR_WAITOK);
1383
1384 rbuf->arg = (void *) req;
1385 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1386 raidPtr->reconControl->pending_writes++;
1387 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1388 rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spCol], req, RF_IO_RECON_PRIORITY);
1389
1390 return (0);
1391 }
1392
1393 /*
1394 * this gets called upon the completion of a reconstruction read
1395 * operation the arg is a pointer to the per-disk reconstruction
1396 * control structure for the process that just finished a read.
1397 *
1398 * called at interrupt context in the kernel, so don't do anything
1399 * illegal here.
1400 */
1401 static int
1402 ReconReadDoneProc(void *arg, int status)
1403 {
1404 RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
1405 RF_Raid_t *raidPtr;
1406
1407 /* Detect that reconCtrl is no longer valid, and if that
1408 is the case, bail without calling rf_CauseReconEvent().
1409 There won't be anyone listening for this event anyway */
1410
1411 if (ctrl->reconCtrl == NULL)
1412 return(0);
1413
1414 raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;
1415
1416 if (status) {
1417 printf("raid%d: Recon read failed!\n", raidPtr->raidid);
1418 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READ_FAILED);
1419 return(0);
1420 }
1421 #if RF_ACC_TRACE > 0
1422 RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1423 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1424 raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us =
1425 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1426 RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1427 #endif
1428 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READDONE);
1429 return (0);
1430 }
1431 /* this gets called upon the completion of a reconstruction write operation.
1432 * the arg is a pointer to the rbuf that was just written
1433 *
1434 * called at interrupt context in the kernel, so don't do anything illegal here.
1435 */
1436 static int
1437 ReconWriteDoneProc(void *arg, int status)
1438 {
1439 RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;
1440
1441 /* Detect that reconControl is no longer valid, and if that
1442 is the case, bail without calling rf_CauseReconEvent().
1443 There won't be anyone listening for this event anyway */
1444
1445 if (rbuf->raidPtr->reconControl == NULL)
1446 return(0);
1447
1448 Dprintf2("Reconstruction completed on psid %ld ru %d\n", rbuf->parityStripeID, rbuf->which_ru);
1449 if (status) {
1450 printf("raid%d: Recon write failed!\n", rbuf->raidPtr->raidid);
1451 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITE_FAILED);
1452 return(0);
1453 }
1454 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITEDONE);
1455 return (0);
1456 }
1457
1458
1459 /*
1460 * computes a new minimum head sep, and wakes up anyone who needs to
1461 * be woken as a result
1462 */
1463 static void
1464 CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_HeadSepLimit_t hsCtr)
1465 {
1466 RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
1467 RF_HeadSepLimit_t new_min;
1468 RF_RowCol_t i;
1469 RF_CallbackDesc_t *p;
1470 RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter); /* from the definition
1471 * of a minimum */
1472
1473
1474 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1475 while(reconCtrlPtr->rb_lock) {
1476 ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlcnmhs", 0, &reconCtrlPtr->rb_mutex);
1477 }
1478 reconCtrlPtr->rb_lock = 1;
1479 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1480
1481 new_min = ~(1L << (8 * sizeof(long) - 1)); /* 0x7FFF....FFF */
1482 for (i = 0; i < raidPtr->numCol; i++)
1483 if (i != reconCtrlPtr->fcol) {
1484 if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min)
1485 new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter;
1486 }
1487 /* set the new minimum and wake up anyone who can now run again */
1488 if (new_min != reconCtrlPtr->minHeadSepCounter) {
1489 reconCtrlPtr->minHeadSepCounter = new_min;
1490 Dprintf1("RECON: new min head pos counter val is %ld\n", new_min);
1491 while (reconCtrlPtr->headSepCBList) {
1492 if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min)
1493 break;
1494 p = reconCtrlPtr->headSepCBList;
1495 reconCtrlPtr->headSepCBList = p->next;
1496 p->next = NULL;
1497 rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
1498 rf_FreeCallbackDesc(p);
1499 }
1500
1501 }
1502 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1503 reconCtrlPtr->rb_lock = 0;
1504 wakeup(&reconCtrlPtr->rb_lock);
1505 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1506 }
1507
1508 /*
1509 * checks to see that the maximum head separation will not be violated
1510 * if we initiate a reconstruction I/O on the indicated disk.
1511 * Limiting the maximum head separation between two disks eliminates
1512 * the nasty buffer-stall conditions that occur when one disk races
1513 * ahead of the others and consumes all of the floating recon buffers.
1514 * This code is complex and unpleasant but it's necessary to avoid
1515 * some very nasty, albeit fairly rare, reconstruction behavior.
1516 *
1517 * returns non-zero if and only if we have to stop working on the
1518 * indicated disk due to a head-separation delay.
1519 */
1520 static int
1521 CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl,
1522 RF_RowCol_t col, RF_HeadSepLimit_t hsCtr,
1523 RF_ReconUnitNum_t which_ru)
1524 {
1525 RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
1526 RF_CallbackDesc_t *cb, *p, *pt;
1527 int retval = 0;
1528
1529 /* if we're too far ahead of the slowest disk, stop working on this
1530 * disk until the slower ones catch up. We do this by scheduling a
1531 * wakeup callback for the time when the slowest disk has caught up.
1532 * We define "caught up" with 20% hysteresis, i.e. the head separation
1533 * must have fallen to at most 80% of the max allowable head
1534 * separation before we'll wake up.
1535 *
1536 */
1537 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1538 while(reconCtrlPtr->rb_lock) {
1539 ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlchs", 0, &reconCtrlPtr->rb_mutex);
1540 }
1541 reconCtrlPtr->rb_lock = 1;
1542 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1543 if ((raidPtr->headSepLimit >= 0) &&
1544 ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) {
1545 Dprintf5("raid%d: RECON: head sep stall: col %d hsCtr %ld minHSCtr %ld limit %ld\n",
1546 raidPtr->raidid, col, ctrl->headSepCounter,
1547 reconCtrlPtr->minHeadSepCounter,
1548 raidPtr->headSepLimit);
1549 cb = rf_AllocCallbackDesc();
1550 /* the minHeadSepCounter value we have to get to before we'll
1551 * wake up. build in 20% hysteresis. */
1552 cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5);
1553 cb->col = col;
1554 cb->next = NULL;
1555
1556 /* insert this callback descriptor into the sorted list of
1557 * pending head-sep callbacks */
1558 p = reconCtrlPtr->headSepCBList;
1559 if (!p)
1560 reconCtrlPtr->headSepCBList = cb;
1561 else
1562 if (cb->callbackArg.v < p->callbackArg.v) {
1563 cb->next = reconCtrlPtr->headSepCBList;
1564 reconCtrlPtr->headSepCBList = cb;
1565 } else {
1566 for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next);
1567 cb->next = p;
1568 pt->next = cb;
1569 }
1570 retval = 1;
1571 #if RF_RECON_STATS > 0
1572 ctrl->reconCtrl->reconDesc->hsStallCount++;
1573 #endif /* RF_RECON_STATS > 0 */
1574 }
1575 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1576 reconCtrlPtr->rb_lock = 0;
1577 wakeup(&reconCtrlPtr->rb_lock);
1578 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1579
1580 return (retval);
1581 }
1582 /*
1583 * checks to see if reconstruction has been either forced or blocked
1584 * by a user operation. if forced, we skip this RU entirely. else if
1585 * blocked, put ourselves on the wait list. else return 0.
1586 *
1587 * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY
1588 */
1589 static int
1590 CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr,
1591 RF_ReconParityStripeStatus_t *pssPtr,
1592 RF_PerDiskReconCtrl_t *ctrl,
1593 RF_RowCol_t col,
1594 RF_StripeNum_t psid,
1595 RF_ReconUnitNum_t which_ru)
1596 {
1597 RF_CallbackDesc_t *cb;
1598 int retcode = 0;
1599
1600 if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE))
1601 retcode = RF_PSS_FORCED_ON_WRITE;
1602 else
1603 if (pssPtr->flags & RF_PSS_RECON_BLOCKED) {
1604 Dprintf3("RECON: col %d blocked at psid %ld ru %d\n", col, psid, which_ru);
1605 cb = rf_AllocCallbackDesc(); /* append ourselves to
1606 * the blockage-wait
1607 * list */
1608 cb->col = col;
1609 cb->next = pssPtr->blockWaitList;
1610 pssPtr->blockWaitList = cb;
1611 retcode = RF_PSS_RECON_BLOCKED;
1612 }
1613 if (!retcode)
1614 pssPtr->flags |= RF_PSS_UNDER_RECON; /* mark this RU as under
1615 * reconstruction */
1616
1617 return (retcode);
1618 }
1619 /*
1620 * if reconstruction is currently ongoing for the indicated stripeID,
1621 * reconstruction is forced to completion and we return non-zero to
1622 * indicate that the caller must wait. If not, then reconstruction is
1623 * blocked on the indicated stripe and the routine returns zero. If
1624 * and only if we return non-zero, we'll cause the cbFunc to get
1625 * invoked with the cbArg when the reconstruction has completed.
1626 */
1627 int
1628 rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
1629 void (*cbFunc)(RF_Raid_t *, void *), void *cbArg)
1630 {
1631 RF_StripeNum_t stripeID = asmap->stripeID; /* the stripe ID we're
1632 * forcing recon on */
1633 RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; /* num sects in one RU */
1634 RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr; /* a pointer to the parity
1635 * stripe status structure */
1636 RF_StripeNum_t psid; /* parity stripe id */
1637 RF_SectorNum_t offset, fd_offset; /* disk offset, failed-disk
1638 * offset */
1639 RF_RowCol_t *diskids;
1640 RF_ReconUnitNum_t which_ru; /* RU within parity stripe */
1641 RF_RowCol_t fcol, diskno, i;
1642 RF_ReconBuffer_t *new_rbuf; /* ptr to newly allocated rbufs */
1643 RF_DiskQueueData_t *req;/* disk I/O req to be enqueued */
1644 RF_CallbackDesc_t *cb;
1645 int nPromoted;
1646
1647 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
1648
1649 /* allocate a new PSS in case we need it */
1650 newpssPtr = rf_AllocPSStatus(raidPtr);
1651
1652 RF_LOCK_PSS_MUTEX(raidPtr, psid);
1653
1654 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, newpssPtr);
1655
1656 if (pssPtr != newpssPtr) {
1657 rf_FreePSStatus(raidPtr, newpssPtr);
1658 }
1659
1660 /* if recon is not ongoing on this PS, just return */
1661 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
1662 RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1663 return (0);
1664 }
1665 /* otherwise, we have to wait for reconstruction to complete on this
1666 * RU. */
1667 /* In order to avoid waiting for a potentially large number of
1668 * low-priority accesses to complete, we force a normal-priority (i.e.
1669 * not low-priority) reconstruction on this RU. */
1670 if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
1671 DDprintf1("Forcing recon on psid %ld\n", psid);
1672 pssPtr->flags |= RF_PSS_FORCED_ON_WRITE; /* mark this RU as under
1673 * forced recon */
1674 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; /* clear the blockage
1675 * that we just set */
1676 fcol = raidPtr->reconControl->fcol;
1677
1678 /* get a listing of the disks comprising the indicated stripe */
1679 (raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids);
1680
1681 /* For previously issued reads, elevate them to normal
1682 * priority. If the I/O has already completed, it won't be
1683 * found in the queue, and hence this will be a no-op. For
1684 * unissued reads, allocate buffers and issue new reads. The
1685 * fact that we've set the FORCED bit means that the regular
1686 * recon procs will not re-issue these reqs */
1687 for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++)
1688 if ((diskno = diskids[i]) != fcol) {
1689 if (pssPtr->issued[diskno]) {
1690 nPromoted = rf_DiskIOPromote(&raidPtr->Queues[diskno], psid, which_ru);
1691 if (rf_reconDebug && nPromoted)
1692 printf("raid%d: promoted read from col %d\n", raidPtr->raidid, diskno);
1693 } else {
1694 new_rbuf = rf_MakeReconBuffer(raidPtr, diskno, RF_RBUF_TYPE_FORCED); /* create new buf */
1695 ComputePSDiskOffsets(raidPtr, psid, diskno, &offset, &fd_offset,
1696 &new_rbuf->spCol, &new_rbuf->spOffset); /* find offsets & spare
1697 * location */
1698 new_rbuf->parityStripeID = psid; /* fill in the buffer */
1699 new_rbuf->which_ru = which_ru;
1700 new_rbuf->failedDiskSectorOffset = fd_offset;
1701 new_rbuf->priority = RF_IO_NORMAL_PRIORITY;
1702
1703 /* use NULL b_proc b/c all addrs
1704 * should be in kernel space */
1705 req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer,
1706 psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf,
1707 NULL, (void *) raidPtr, 0, NULL, PR_WAITOK);
1708
1709 new_rbuf->arg = req;
1710 rf_DiskIOEnqueue(&raidPtr->Queues[diskno], req, RF_IO_NORMAL_PRIORITY); /* enqueue the I/O */
1711 Dprintf2("raid%d: Issued new read req on col %d\n", raidPtr->raidid, diskno);
1712 }
1713 }
1714 /* if the write is sitting in the disk queue, elevate its
1715 * priority */
1716 if (rf_DiskIOPromote(&raidPtr->Queues[fcol], psid, which_ru))
1717 printf("raid%d: promoted write to col %d\n",
1718 raidPtr->raidid, fcol);
1719 }
1720 /* install a callback descriptor to be invoked when recon completes on
1721 * this parity stripe. */
1722 cb = rf_AllocCallbackDesc();
1723 /* XXX the following is bogus.. These functions don't really match!!
1724 * GO */
1725 cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc;
1726 cb->callbackArg.p = (void *) cbArg;
1727 cb->next = pssPtr->procWaitList;
1728 pssPtr->procWaitList = cb;
1729 DDprintf2("raid%d: Waiting for forced recon on psid %ld\n",
1730 raidPtr->raidid, psid);
1731
1732 RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1733 return (1);
1734 }
1735 /* called upon the completion of a forced reconstruction read.
1736 * all we do is schedule the FORCEDREADONE event.
1737 * called at interrupt context in the kernel, so don't do anything illegal here.
1738 */
1739 static void
1740 ForceReconReadDoneProc(void *arg, int status)
1741 {
1742 RF_ReconBuffer_t *rbuf = arg;
1743
1744 /* Detect that reconControl is no longer valid, and if that
1745 is the case, bail without calling rf_CauseReconEvent().
1746 There won't be anyone listening for this event anyway */
1747
1748 if (rbuf->raidPtr->reconControl == NULL)
1749 return;
1750
1751 if (status) {
1752 printf("raid%d: Forced recon read failed!\n", rbuf->raidPtr->raidid);
1753 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREAD_FAILED);
1754 return;
1755 }
1756 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE);
1757 }
1758 /* releases a block on the reconstruction of the indicated stripe */
1759 int
1760 rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
1761 {
1762 RF_StripeNum_t stripeID = asmap->stripeID;
1763 RF_ReconParityStripeStatus_t *pssPtr;
1764 RF_ReconUnitNum_t which_ru;
1765 RF_StripeNum_t psid;
1766 RF_CallbackDesc_t *cb;
1767
1768 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
1769 RF_LOCK_PSS_MUTEX(raidPtr, psid);
1770 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_NONE, NULL);
1771
1772 /* When recon is forced, the pss desc can get deleted before we get
1773 * back to unblock recon. But, this can _only_ happen when recon is
1774 * forced. It would be good to put some kind of sanity check here, but
1775 * how to decide if recon was just forced or not? */
1776 if (!pssPtr) {
1777 /* printf("Warning: no pss descriptor upon unblock on psid %ld
1778 * RU %d\n",psid,which_ru); */
1779 #if (RF_DEBUG_RECON > 0) || (RF_DEBUG_PSS > 0)
1780 if (rf_reconDebug || rf_pssDebug)
1781 printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n", (long) psid, which_ru);
1782 #endif
1783 goto out;
1784 }
1785 pssPtr->blockCount--;
1786 Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d\n",
1787 raidPtr->raidid, psid, pssPtr->blockCount);
1788 if (pssPtr->blockCount == 0) { /* if recon blockage has been released */
1789
1790 /* unblock recon before calling CauseReconEvent in case
1791 * CauseReconEvent causes us to try to issue a new read before
1792 * returning here. */
1793 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;
1794
1795
1796 while (pssPtr->blockWaitList) {
1797 /* spin through the block-wait list and
1798 release all the waiters */
1799 cb = pssPtr->blockWaitList;
1800 pssPtr->blockWaitList = cb->next;
1801 cb->next = NULL;
1802 rf_CauseReconEvent(raidPtr, cb->col, NULL, RF_REVENT_BLOCKCLEAR);
1803 rf_FreeCallbackDesc(cb);
1804 }
1805 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
1806 /* if no recon was requested while recon was blocked */
1807 rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
1808 }
1809 }
1810 out:
1811 RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1812 return (0);
1813 }
1814