rf_reconstruct.c revision 1.88.2.4 1 /* $NetBSD: rf_reconstruct.c,v 1.88.2.4 2007/10/27 11:34:04 yamt Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /************************************************************
30 *
31 * rf_reconstruct.c -- code to perform on-line reconstruction
32 *
33 ************************************************************/
34
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.88.2.4 2007/10/27 11:34:04 yamt Exp $");
37
38 #include <sys/param.h>
39 #include <sys/time.h>
40 #include <sys/buf.h>
41 #include <sys/errno.h>
42 #include <sys/systm.h>
43 #include <sys/proc.h>
44 #include <sys/ioctl.h>
45 #include <sys/fcntl.h>
46 #include <sys/vnode.h>
47 #include <dev/raidframe/raidframevar.h>
48
49 #include "rf_raid.h"
50 #include "rf_reconutil.h"
51 #include "rf_revent.h"
52 #include "rf_reconbuffer.h"
53 #include "rf_acctrace.h"
54 #include "rf_etimer.h"
55 #include "rf_dag.h"
56 #include "rf_desc.h"
57 #include "rf_debugprint.h"
58 #include "rf_general.h"
59 #include "rf_driver.h"
60 #include "rf_utils.h"
61 #include "rf_shutdown.h"
62
63 #include "rf_kintf.h"
64
65 /* setting these to -1 causes them to be set to their default values if not set by debug options */
66
/*
 * Debug tracing helpers for the reconstruction code.
 *
 * With RF_DEBUG_RECON enabled, DprintfN(fmt, a1..aN) hands the format string
 * and its N arguments to rf_debug_printf() when the rf_reconDebug flag is
 * set.  Each argument is converted to unsigned long and then to void *, and
 * the unused trailing slots (rf_debug_printf is always called with eight)
 * are filled with NULL.  DDprintf1/DDprintf2 behave exactly like
 * Dprintf1/Dprintf2.
 *
 * With RF_DEBUG_RECON disabled, every macro expands to an empty statement
 * and its arguments are not evaluated.
 *
 * All forms are wrapped in do { } while (0) so that "Dprintf(...);" is a
 * single statement and can be used safely in an unbraced if/else, and every
 * macro argument is parenthesized before it is cast.
 */
#if RF_DEBUG_RECON

/* Convert one trace argument into the void * slot rf_debug_printf() takes. */
#define RF_RECON_DARG(x)	((void *)((unsigned long)(x)))

#define Dprintf(s) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf((s), NULL, NULL, NULL, NULL, \
			    NULL, NULL, NULL, NULL); \
	} while (0)
#define Dprintf1(s,a) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf((s), RF_RECON_DARG(a), NULL, NULL, \
			    NULL, NULL, NULL, NULL, NULL); \
	} while (0)
#define Dprintf2(s,a,b) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf((s), RF_RECON_DARG(a), \
			    RF_RECON_DARG(b), NULL, NULL, NULL, NULL, \
			    NULL, NULL); \
	} while (0)
#define Dprintf3(s,a,b,c) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf((s), RF_RECON_DARG(a), \
			    RF_RECON_DARG(b), RF_RECON_DARG(c), NULL, NULL, \
			    NULL, NULL, NULL); \
	} while (0)
#define Dprintf4(s,a,b,c,d) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf((s), RF_RECON_DARG(a), \
			    RF_RECON_DARG(b), RF_RECON_DARG(c), \
			    RF_RECON_DARG(d), NULL, NULL, NULL, NULL); \
	} while (0)
#define Dprintf5(s,a,b,c,d,e) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf((s), RF_RECON_DARG(a), \
			    RF_RECON_DARG(b), RF_RECON_DARG(c), \
			    RF_RECON_DARG(d), RF_RECON_DARG(e), NULL, NULL, \
			    NULL); \
	} while (0)
#define Dprintf6(s,a,b,c,d,e,f) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf((s), RF_RECON_DARG(a), \
			    RF_RECON_DARG(b), RF_RECON_DARG(c), \
			    RF_RECON_DARG(d), RF_RECON_DARG(e), \
			    RF_RECON_DARG(f), NULL, NULL); \
	} while (0)
#define Dprintf7(s,a,b,c,d,e,f,g) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf((s), RF_RECON_DARG(a), \
			    RF_RECON_DARG(b), RF_RECON_DARG(c), \
			    RF_RECON_DARG(d), RF_RECON_DARG(e), \
			    RF_RECON_DARG(f), RF_RECON_DARG(g), NULL); \
	} while (0)

#define DDprintf1(s,a)		Dprintf1(s,a)
#define DDprintf2(s,a,b)	Dprintf2(s,a,b)

#else /* RF_DEBUG_RECON */

#define Dprintf(s)			do { } while (0)
#define Dprintf1(s,a)			do { } while (0)
#define Dprintf2(s,a,b)			do { } while (0)
#define Dprintf3(s,a,b,c)		do { } while (0)
#define Dprintf4(s,a,b,c,d)		do { } while (0)
#define Dprintf5(s,a,b,c,d,e)		do { } while (0)
#define Dprintf6(s,a,b,c,d,e,f)		do { } while (0)
#define Dprintf7(s,a,b,c,d,e,f,g)	do { } while (0)

#define DDprintf1(s,a)			do { } while (0)
#define DDprintf2(s,a,b)		do { } while (0)

#endif /* RF_DEBUG_RECON */
95
/*
 * Status codes that ProcessReconEvent() hands back to the event loops in
 * rf_ContinueReconstructFailedDisk().
 */
#define RF_RECON_DONE_READS	1	/* loop counts one more disk as done */
#define RF_RECON_READ_ERROR	2	/* read failed; disk still counted as done */
#define RF_RECON_WRITE_ERROR	3	/* write failed; reconstruction is abandoned */
#define RF_RECON_READ_STOPPED	4	/* default status; disk counted as done */

/*
 * Limits handed to rf_pool_init() for the reconstruction-buffer pool
 * (passed as the minimum first, then the maximum).
 */
#define RF_MAX_FREE_RECONBUFFER	32
#define RF_MIN_FREE_RECONBUFFER	16
103
104 static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *, RF_RowCol_t,
105 RF_RaidDisk_t *, int, RF_RowCol_t);
106 static void FreeReconDesc(RF_RaidReconDesc_t *);
107 static int ProcessReconEvent(RF_Raid_t *, RF_ReconEvent_t *);
108 static int IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t);
109 static int TryToRead(RF_Raid_t *, RF_RowCol_t);
110 static int ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t, RF_RowCol_t,
111 RF_SectorNum_t *, RF_SectorNum_t *, RF_RowCol_t *,
112 RF_SectorNum_t *);
113 static int IssueNextWriteRequest(RF_Raid_t *);
114 static int ReconReadDoneProc(void *, int);
115 static int ReconWriteDoneProc(void *, int);
116 static void CheckForNewMinHeadSep(RF_Raid_t *, RF_HeadSepLimit_t);
117 static int CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *,
118 RF_RowCol_t, RF_HeadSepLimit_t,
119 RF_ReconUnitNum_t);
120 static int CheckForcedOrBlockedReconstruction(RF_Raid_t *,
121 RF_ReconParityStripeStatus_t *,
122 RF_PerDiskReconCtrl_t *,
123 RF_RowCol_t, RF_StripeNum_t,
124 RF_ReconUnitNum_t);
125 static void ForceReconReadDoneProc(void *, int);
126 static void rf_ShutdownReconstruction(void *);
127
128 struct RF_ReconDoneProc_s {
129 void (*proc) (RF_Raid_t *, void *);
130 void *arg;
131 RF_ReconDoneProc_t *next;
132 };
133
/**************************************************************************
 *
 * Sets up the parameters that will be used by the reconstruction process.
 * Currently there are none, except for those that the layout-specific
 * configuration routine (e.g. rf_ConfigureDeclustered) sets up.
 *
 * In the kernel, we fire off the recon thread.
 *
 **************************************************************************/
143 static void
144 rf_ShutdownReconstruction(void *ignored)
145 {
146 pool_destroy(&rf_pools.reconbuffer);
147 }
148
149 int
150 rf_ConfigureReconstruction(RF_ShutdownList_t **listp)
151 {
152
153 rf_pool_init(&rf_pools.reconbuffer, sizeof(RF_ReconBuffer_t),
154 "rf_reconbuffer_pl", RF_MIN_FREE_RECONBUFFER, RF_MAX_FREE_RECONBUFFER);
155 rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);
156
157 return (0);
158 }
159
160 static RF_RaidReconDesc_t *
161 AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t col,
162 RF_RaidDisk_t *spareDiskPtr, int numDisksDone,
163 RF_RowCol_t scol)
164 {
165
166 RF_RaidReconDesc_t *reconDesc;
167
168 RF_Malloc(reconDesc, sizeof(RF_RaidReconDesc_t),
169 (RF_RaidReconDesc_t *));
170 reconDesc->raidPtr = raidPtr;
171 reconDesc->col = col;
172 reconDesc->spareDiskPtr = spareDiskPtr;
173 reconDesc->numDisksDone = numDisksDone;
174 reconDesc->scol = scol;
175 reconDesc->next = NULL;
176
177 return (reconDesc);
178 }
179
180 static void
181 FreeReconDesc(RF_RaidReconDesc_t *reconDesc)
182 {
183 #if RF_RECON_STATS > 0
184 printf("raid%d: %lu recon event waits, %lu recon delays\n",
185 reconDesc->raidPtr->raidid,
186 (long) reconDesc->numReconEventWaits,
187 (long) reconDesc->numReconExecDelays);
188 #endif /* RF_RECON_STATS > 0 */
189 printf("raid%d: %lu max exec ticks\n",
190 reconDesc->raidPtr->raidid,
191 (long) reconDesc->maxReconExecTicks);
192 #if (RF_RECON_STATS > 0) || defined(KERNEL)
193 printf("\n");
194 #endif /* (RF_RECON_STATS > 0) || KERNEL */
195 RF_Free(reconDesc, sizeof(RF_RaidReconDesc_t));
196 }
197
198
199 /*****************************************************************************
200 *
201 * primary routine to reconstruct a failed disk. This should be called from
202 * within its own thread. It won't return until reconstruction completes,
203 * fails, or is aborted.
204 *****************************************************************************/
205 int
206 rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t col)
207 {
208 const RF_LayoutSW_t *lp;
209 int rc;
210
211 lp = raidPtr->Layout.map;
212 if (lp->SubmitReconBuffer) {
213 /*
214 * The current infrastructure only supports reconstructing one
215 * disk at a time for each array.
216 */
217 RF_LOCK_MUTEX(raidPtr->mutex);
218 while (raidPtr->reconInProgress) {
219 RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
220 }
221 raidPtr->reconInProgress++;
222 RF_UNLOCK_MUTEX(raidPtr->mutex);
223 rc = rf_ReconstructFailedDiskBasic(raidPtr, col);
224 RF_LOCK_MUTEX(raidPtr->mutex);
225 raidPtr->reconInProgress--;
226 RF_UNLOCK_MUTEX(raidPtr->mutex);
227 } else {
228 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
229 lp->parityConfig);
230 rc = EIO;
231 }
232 RF_SIGNAL_COND(raidPtr->waitForReconCond);
233 return (rc);
234 }
235
236 int
237 rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t col)
238 {
239 RF_ComponentLabel_t c_label;
240 RF_RaidDisk_t *spareDiskPtr = NULL;
241 RF_RaidReconDesc_t *reconDesc;
242 RF_RowCol_t scol;
243 int numDisksDone = 0, rc;
244
245 /* first look for a spare drive onto which to reconstruct the data */
246 /* spare disk descriptors are stored in row 0. This may have to
247 * change eventually */
248
249 RF_LOCK_MUTEX(raidPtr->mutex);
250 RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed);
251 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
252 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
253 if (raidPtr->status != rf_rs_degraded) {
254 RF_ERRORMSG1("Unable to reconstruct disk at col %d because status not degraded\n", col);
255 RF_UNLOCK_MUTEX(raidPtr->mutex);
256 return (EINVAL);
257 }
258 scol = (-1);
259 } else {
260 #endif
261 for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
262 if (raidPtr->Disks[scol].status == rf_ds_spare) {
263 spareDiskPtr = &raidPtr->Disks[scol];
264 spareDiskPtr->status = rf_ds_used_spare;
265 break;
266 }
267 }
268 if (!spareDiskPtr) {
269 RF_ERRORMSG1("Unable to reconstruct disk at col %d because no spares are available\n", col);
270 RF_UNLOCK_MUTEX(raidPtr->mutex);
271 return (ENOSPC);
272 }
273 printf("RECON: initiating reconstruction on col %d -> spare at col %d\n", col, scol);
274 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
275 }
276 #endif
277 RF_UNLOCK_MUTEX(raidPtr->mutex);
278
279 reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, numDisksDone, scol);
280 raidPtr->reconDesc = (void *) reconDesc;
281 #if RF_RECON_STATS > 0
282 reconDesc->hsStallCount = 0;
283 reconDesc->numReconExecDelays = 0;
284 reconDesc->numReconEventWaits = 0;
285 #endif /* RF_RECON_STATS > 0 */
286 reconDesc->reconExecTimerRunning = 0;
287 reconDesc->reconExecTicks = 0;
288 reconDesc->maxReconExecTicks = 0;
289 rc = rf_ContinueReconstructFailedDisk(reconDesc);
290
291 if (!rc) {
292 /* fix up the component label */
293 /* Don't actually need the read here.. */
294 raidread_component_label(
295 raidPtr->raid_cinfo[scol].ci_dev,
296 raidPtr->raid_cinfo[scol].ci_vp,
297 &c_label);
298
299 raid_init_component_label( raidPtr, &c_label);
300 c_label.row = 0;
301 c_label.column = col;
302 c_label.clean = RF_RAID_DIRTY;
303 c_label.status = rf_ds_optimal;
304 c_label.partitionSize = raidPtr->Disks[scol].partitionSize;
305
306 /* We've just done a rebuild based on all the other
307 disks, so at this point the parity is known to be
308 clean, even if it wasn't before. */
309
310 /* XXX doesn't hold for RAID 6!!*/
311
312 RF_LOCK_MUTEX(raidPtr->mutex);
313 raidPtr->parity_good = RF_RAID_CLEAN;
314 RF_UNLOCK_MUTEX(raidPtr->mutex);
315
316 /* XXXX MORE NEEDED HERE */
317
318 raidwrite_component_label(
319 raidPtr->raid_cinfo[scol].ci_dev,
320 raidPtr->raid_cinfo[scol].ci_vp,
321 &c_label);
322
323 } else {
324 /* Reconstruct failed. */
325
326 RF_LOCK_MUTEX(raidPtr->mutex);
327 /* Failed disk goes back to "failed" status */
328 raidPtr->Disks[col].status = rf_ds_failed;
329
330 /* Spare disk goes back to "spare" status. */
331 spareDiskPtr->status = rf_ds_spare;
332 RF_UNLOCK_MUTEX(raidPtr->mutex);
333
334 }
335 rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
336 return (rc);
337 }
338
339 /*
340
341 Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
342 and you don't get a spare until the next Monday. With this function
343 (and hot-swappable drives) you can now put your new disk containing
344 /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
345 rebuild the data "on the spot".
346
347 */
348
349 int
350 rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t col)
351 {
352 RF_RaidDisk_t *spareDiskPtr = NULL;
353 RF_RaidReconDesc_t *reconDesc;
354 const RF_LayoutSW_t *lp;
355 RF_ComponentLabel_t c_label;
356 int numDisksDone = 0, rc;
357 struct partinfo dpart;
358 struct vnode *vp;
359 struct vattr va;
360 struct lwp *lwp;
361 int retcode;
362 int ac;
363
364 lp = raidPtr->Layout.map;
365 if (!lp->SubmitReconBuffer) {
366 RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
367 lp->parityConfig);
368 /* wakeup anyone who might be waiting to do a reconstruct */
369 RF_SIGNAL_COND(raidPtr->waitForReconCond);
370 return(EIO);
371 }
372
373 /*
374 * The current infrastructure only supports reconstructing one
375 * disk at a time for each array.
376 */
377 RF_LOCK_MUTEX(raidPtr->mutex);
378
379 if (raidPtr->Disks[col].status != rf_ds_failed) {
380 /* "It's gone..." */
381 raidPtr->numFailures++;
382 raidPtr->Disks[col].status = rf_ds_failed;
383 raidPtr->status = rf_rs_degraded;
384 RF_UNLOCK_MUTEX(raidPtr->mutex);
385 rf_update_component_labels(raidPtr,
386 RF_NORMAL_COMPONENT_UPDATE);
387 RF_LOCK_MUTEX(raidPtr->mutex);
388 }
389
390 while (raidPtr->reconInProgress) {
391 RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
392 }
393
394 raidPtr->reconInProgress++;
395
396 /* first look for a spare drive onto which to reconstruct the
397 data. spare disk descriptors are stored in row 0. This
398 may have to change eventually */
399
400 /* Actually, we don't care if it's failed or not... On a RAID
401 set with correct parity, this function should be callable
402 on any component without ill effects. */
403 /* RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); */
404
405 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
406 if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
407 RF_ERRORMSG1("Unable to reconstruct to disk at col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", col);
408
409 raidPtr->reconInProgress--;
410 RF_UNLOCK_MUTEX(raidPtr->mutex);
411 RF_SIGNAL_COND(raidPtr->waitForReconCond);
412 return (EINVAL);
413 }
414 #endif
415 lwp = raidPtr->engine_thread;
416
417 /* This device may have been opened successfully the
418 first time. Close it before trying to open it again.. */
419
420 if (raidPtr->raid_cinfo[col].ci_vp != NULL) {
421 #if 0
422 printf("Closed the open device: %s\n",
423 raidPtr->Disks[col].devname);
424 #endif
425 vp = raidPtr->raid_cinfo[col].ci_vp;
426 ac = raidPtr->Disks[col].auto_configured;
427 RF_UNLOCK_MUTEX(raidPtr->mutex);
428 rf_close_component(raidPtr, vp, ac);
429 RF_LOCK_MUTEX(raidPtr->mutex);
430 raidPtr->raid_cinfo[col].ci_vp = NULL;
431 }
432 /* note that this disk was *not* auto_configured (any longer)*/
433 raidPtr->Disks[col].auto_configured = 0;
434
435 #if 0
436 printf("About to (re-)open the device for rebuilding: %s\n",
437 raidPtr->Disks[col].devname);
438 #endif
439 RF_UNLOCK_MUTEX(raidPtr->mutex);
440 retcode = dk_lookup(raidPtr->Disks[col].devname, lwp, &vp, UIO_SYSSPACE);
441
442 if (retcode) {
443 printf("raid%d: rebuilding: dk_lookup on device: %s failed: %d!\n",raidPtr->raidid,
444 raidPtr->Disks[col].devname, retcode);
445
446 /* the component isn't responding properly...
447 must be still dead :-( */
448 RF_LOCK_MUTEX(raidPtr->mutex);
449 raidPtr->reconInProgress--;
450 RF_UNLOCK_MUTEX(raidPtr->mutex);
451 RF_SIGNAL_COND(raidPtr->waitForReconCond);
452 return(retcode);
453 }
454
455 /* Ok, so we can at least do a lookup...
456 How about actually getting a vp for it? */
457
458 if ((retcode = VOP_GETATTR(vp, &va, lwp->l_cred, lwp)) != 0) {
459 RF_LOCK_MUTEX(raidPtr->mutex);
460 raidPtr->reconInProgress--;
461 RF_UNLOCK_MUTEX(raidPtr->mutex);
462 RF_SIGNAL_COND(raidPtr->waitForReconCond);
463 return(retcode);
464 }
465
466 retcode = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, lwp->l_cred, lwp);
467 if (retcode) {
468 RF_LOCK_MUTEX(raidPtr->mutex);
469 raidPtr->reconInProgress--;
470 RF_UNLOCK_MUTEX(raidPtr->mutex);
471 RF_SIGNAL_COND(raidPtr->waitForReconCond);
472 return(retcode);
473 }
474 RF_LOCK_MUTEX(raidPtr->mutex);
475 raidPtr->Disks[col].blockSize = dpart.disklab->d_secsize;
476
477 raidPtr->Disks[col].numBlocks = dpart.part->p_size -
478 rf_protectedSectors;
479
480 raidPtr->raid_cinfo[col].ci_vp = vp;
481 raidPtr->raid_cinfo[col].ci_dev = va.va_rdev;
482
483 raidPtr->Disks[col].dev = va.va_rdev;
484
485 /* we allow the user to specify that only a fraction
486 of the disks should be used this is just for debug:
487 it speeds up * the parity scan */
488 raidPtr->Disks[col].numBlocks = raidPtr->Disks[col].numBlocks *
489 rf_sizePercentage / 100;
490 RF_UNLOCK_MUTEX(raidPtr->mutex);
491
492 spareDiskPtr = &raidPtr->Disks[col];
493 spareDiskPtr->status = rf_ds_used_spare;
494
495 printf("raid%d: initiating in-place reconstruction on column %d\n",
496 raidPtr->raidid, col);
497
498 reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr,
499 numDisksDone, col);
500 raidPtr->reconDesc = (void *) reconDesc;
501 #if RF_RECON_STATS > 0
502 reconDesc->hsStallCount = 0;
503 reconDesc->numReconExecDelays = 0;
504 reconDesc->numReconEventWaits = 0;
505 #endif /* RF_RECON_STATS > 0 */
506 reconDesc->reconExecTimerRunning = 0;
507 reconDesc->reconExecTicks = 0;
508 reconDesc->maxReconExecTicks = 0;
509 rc = rf_ContinueReconstructFailedDisk(reconDesc);
510
511 if (!rc) {
512 RF_LOCK_MUTEX(raidPtr->mutex);
513 /* Need to set these here, as at this point it'll be claiming
514 that the disk is in rf_ds_spared! But we know better :-) */
515
516 raidPtr->Disks[col].status = rf_ds_optimal;
517 raidPtr->status = rf_rs_optimal;
518 RF_UNLOCK_MUTEX(raidPtr->mutex);
519
520 /* fix up the component label */
521 /* Don't actually need the read here.. */
522 raidread_component_label(raidPtr->raid_cinfo[col].ci_dev,
523 raidPtr->raid_cinfo[col].ci_vp,
524 &c_label);
525
526 RF_LOCK_MUTEX(raidPtr->mutex);
527 raid_init_component_label(raidPtr, &c_label);
528
529 c_label.row = 0;
530 c_label.column = col;
531
532 /* We've just done a rebuild based on all the other
533 disks, so at this point the parity is known to be
534 clean, even if it wasn't before. */
535
536 /* XXX doesn't hold for RAID 6!!*/
537
538 raidPtr->parity_good = RF_RAID_CLEAN;
539 RF_UNLOCK_MUTEX(raidPtr->mutex);
540
541 raidwrite_component_label(raidPtr->raid_cinfo[col].ci_dev,
542 raidPtr->raid_cinfo[col].ci_vp,
543 &c_label);
544
545 } else {
546 /* Reconstruct-in-place failed. Disk goes back to
547 "failed" status, regardless of what it was before. */
548 RF_LOCK_MUTEX(raidPtr->mutex);
549 raidPtr->Disks[col].status = rf_ds_failed;
550 RF_UNLOCK_MUTEX(raidPtr->mutex);
551 }
552
553 rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
554
555 RF_LOCK_MUTEX(raidPtr->mutex);
556 raidPtr->reconInProgress--;
557 RF_UNLOCK_MUTEX(raidPtr->mutex);
558
559 RF_SIGNAL_COND(raidPtr->waitForReconCond);
560 return (rc);
561 }
562
563
564 int
565 rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc)
566 {
567 RF_Raid_t *raidPtr = reconDesc->raidPtr;
568 RF_RowCol_t col = reconDesc->col;
569 RF_RowCol_t scol = reconDesc->scol;
570 RF_ReconMap_t *mapPtr;
571 RF_ReconCtrl_t *tmp_reconctrl;
572 RF_ReconEvent_t *event;
573 RF_CallbackDesc_t *p;
574 struct timeval etime, elpsd;
575 unsigned long xor_s, xor_resid_us;
576 int i, ds;
577 int status;
578 int recon_error, write_error;
579
580 raidPtr->accumXorTimeUs = 0;
581 #if RF_ACC_TRACE > 0
582 /* create one trace record per physical disk */
583 RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
584 #endif
585
586 /* quiesce the array prior to starting recon. this is needed
587 * to assure no nasty interactions with pending user writes.
588 * We need to do this before we change the disk or row status. */
589
590 Dprintf("RECON: begin request suspend\n");
591 rf_SuspendNewRequestsAndWait(raidPtr);
592 Dprintf("RECON: end request suspend\n");
593
594 /* allocate our RF_ReconCTRL_t before we protect raidPtr->reconControl[row] */
595 tmp_reconctrl = rf_MakeReconControl(reconDesc, col, scol);
596
597 RF_LOCK_MUTEX(raidPtr->mutex);
598
599 /* create the reconstruction control pointer and install it in
600 * the right slot */
601 raidPtr->reconControl = tmp_reconctrl;
602 mapPtr = raidPtr->reconControl->reconMap;
603 raidPtr->reconControl->numRUsTotal = mapPtr->totalRUs;
604 raidPtr->reconControl->numRUsComplete = 0;
605 raidPtr->status = rf_rs_reconstructing;
606 raidPtr->Disks[col].status = rf_ds_reconstructing;
607 raidPtr->Disks[col].spareCol = scol;
608
609 RF_UNLOCK_MUTEX(raidPtr->mutex);
610
611 RF_GETTIME(raidPtr->reconControl->starttime);
612
613 /* now start up the actual reconstruction: issue a read for
614 * each surviving disk */
615
616 reconDesc->numDisksDone = 0;
617 for (i = 0; i < raidPtr->numCol; i++) {
618 if (i != col) {
619 /* find and issue the next I/O on the
620 * indicated disk */
621 if (IssueNextReadRequest(raidPtr, i)) {
622 Dprintf1("RECON: done issuing for c%d\n", i);
623 reconDesc->numDisksDone++;
624 }
625 }
626 }
627
628 Dprintf("RECON: resume requests\n");
629 rf_ResumeNewRequests(raidPtr);
630
631 /* process reconstruction events until all disks report that
632 * they've completed all work */
633
634 mapPtr = raidPtr->reconControl->reconMap;
635 recon_error = 0;
636 write_error = 0;
637
638 while (reconDesc->numDisksDone < raidPtr->numCol - 1) {
639
640 event = rf_GetNextReconEvent(reconDesc);
641 status = ProcessReconEvent(raidPtr, event);
642
643 /* the normal case is that a read completes, and all is well. */
644 if (status == RF_RECON_DONE_READS) {
645 reconDesc->numDisksDone++;
646 } else if ((status == RF_RECON_READ_ERROR) ||
647 (status == RF_RECON_WRITE_ERROR)) {
648 /* an error was encountered while reconstructing...
649 Pretend we've finished this disk.
650 */
651 recon_error = 1;
652 raidPtr->reconControl->error = 1;
653
654 /* bump the numDisksDone count for reads,
655 but not for writes */
656 if (status == RF_RECON_READ_ERROR)
657 reconDesc->numDisksDone++;
658
659 /* write errors are special -- when we are
660 done dealing with the reads that are
661 finished, we don't want to wait for any
662 writes */
663 if (status == RF_RECON_WRITE_ERROR)
664 write_error = 1;
665
666 } else if (status == RF_RECON_READ_STOPPED) {
667 /* count this component as being "done" */
668 reconDesc->numDisksDone++;
669 }
670
671 if (recon_error) {
672
673 /* make sure any stragglers are woken up so that
674 their theads will complete, and we can get out
675 of here with all IO processed */
676
677 while (raidPtr->reconControl->headSepCBList) {
678 p = raidPtr->reconControl->headSepCBList;
679 raidPtr->reconControl->headSepCBList = p->next;
680 p->next = NULL;
681 rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
682 rf_FreeCallbackDesc(p);
683 }
684 }
685
686 raidPtr->reconControl->numRUsTotal =
687 mapPtr->totalRUs;
688 raidPtr->reconControl->numRUsComplete =
689 mapPtr->totalRUs -
690 rf_UnitsLeftToReconstruct(mapPtr);
691
692 #if RF_DEBUG_RECON
693 raidPtr->reconControl->percentComplete =
694 (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
695 if (rf_prReconSched) {
696 rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
697 }
698 #endif
699 }
700
701 mapPtr = raidPtr->reconControl->reconMap;
702 if (rf_reconDebug) {
703 printf("RECON: all reads completed\n");
704 }
705 /* at this point all the reads have completed. We now wait
706 * for any pending writes to complete, and then we're done */
707
708 while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) {
709
710 event = rf_GetNextReconEvent(reconDesc);
711 status = ProcessReconEvent(raidPtr, event);
712
713 if (status == RF_RECON_WRITE_ERROR) {
714 recon_error = 1;
715 raidPtr->reconControl->error = 1;
716 /* an error was encountered at the very end... bail */
717 } else {
718 #if RF_DEBUG_RECON
719 raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
720 if (rf_prReconSched) {
721 rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
722 }
723 #endif
724 }
725 }
726
727 if (recon_error) {
728 /* we've encountered an error in reconstructing. */
729 printf("raid%d: reconstruction failed.\n", raidPtr->raidid);
730
731 /* we start by blocking IO to the RAID set. */
732 rf_SuspendNewRequestsAndWait(raidPtr);
733
734 RF_LOCK_MUTEX(raidPtr->mutex);
735 /* mark set as being degraded, rather than
736 rf_rs_reconstructing as we were before the problem.
737 After this is done we can update status of the
738 component disks without worrying about someone
739 trying to read from a failed component.
740 */
741 raidPtr->status = rf_rs_degraded;
742 RF_UNLOCK_MUTEX(raidPtr->mutex);
743
744 /* resume IO */
745 rf_ResumeNewRequests(raidPtr);
746
747 /* At this point there are two cases:
748 1) If we've experienced a read error, then we've
749 already waited for all the reads we're going to get,
750 and we just need to wait for the writes.
751
752 2) If we've experienced a write error, we've also
753 already waited for all the reads to complete,
754 but there is little point in waiting for the writes --
755 when they do complete, they will just be ignored.
756
757 So we just wait for writes to complete if we didn't have a
758 write error.
759 */
760
761 if (!write_error) {
762 /* wait for writes to complete */
763 while (raidPtr->reconControl->pending_writes > 0) {
764
765 event = rf_GetNextReconEvent(reconDesc);
766 status = ProcessReconEvent(raidPtr, event);
767
768 if (status == RF_RECON_WRITE_ERROR) {
769 raidPtr->reconControl->error = 1;
770 /* an error was encountered at the very end... bail.
771 This will be very bad news for the user, since
772 at this point there will have been a read error
773 on one component, and a write error on another!
774 */
775 break;
776 }
777 }
778 }
779
780
781 /* cleanup */
782
783 /* drain the event queue - after waiting for the writes above,
784 there shouldn't be much (if anything!) left in the queue. */
785
786 rf_DrainReconEventQueue(reconDesc);
787
788 /* XXX As much as we'd like to free the recon control structure
789 and the reconDesc, we have no way of knowing if/when those will
790 be touched by IO that has yet to occur. It is rather poor to be
791 basically causing a 'memory leak' here, but there doesn't seem to be
792 a cleaner alternative at this time. Perhaps when the reconstruct code
793 gets a makeover this problem will go away.
794 */
795 #if 0
796 rf_FreeReconControl(raidPtr);
797 #endif
798
799 #if RF_ACC_TRACE > 0
800 RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
801 #endif
802 /* XXX see comment above */
803 #if 0
804 FreeReconDesc(reconDesc);
805 #endif
806
807 return (1);
808 }
809
810 /* Success: mark the dead disk as reconstructed. We quiesce
811 * the array here to assure no nasty interactions with pending
812 * user accesses when we free up the psstatus structure as
813 * part of FreeReconControl() */
814
815 rf_SuspendNewRequestsAndWait(raidPtr);
816
817 RF_LOCK_MUTEX(raidPtr->mutex);
818 raidPtr->numFailures--;
819 ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
820 raidPtr->Disks[col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared;
821 raidPtr->status = (ds) ? rf_rs_reconfigured : rf_rs_optimal;
822 RF_UNLOCK_MUTEX(raidPtr->mutex);
823 RF_GETTIME(etime);
824 RF_TIMEVAL_DIFF(&(raidPtr->reconControl->starttime), &etime, &elpsd);
825
826 rf_ResumeNewRequests(raidPtr);
827
828 printf("raid%d: Reconstruction of disk at col %d completed\n",
829 raidPtr->raidid, col);
830 xor_s = raidPtr->accumXorTimeUs / 1000000;
831 xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
832 printf("raid%d: Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n",
833 raidPtr->raidid,
834 (int) elpsd.tv_sec, (int) elpsd.tv_usec,
835 raidPtr->accumXorTimeUs, xor_s, xor_resid_us);
836 printf("raid%d: (start time %d sec %d usec, end time %d sec %d usec)\n",
837 raidPtr->raidid,
838 (int) raidPtr->reconControl->starttime.tv_sec,
839 (int) raidPtr->reconControl->starttime.tv_usec,
840 (int) etime.tv_sec, (int) etime.tv_usec);
841 #if RF_RECON_STATS > 0
842 printf("raid%d: Total head-sep stall count was %d\n",
843 raidPtr->raidid, (int) reconDesc->hsStallCount);
844 #endif /* RF_RECON_STATS > 0 */
845 rf_FreeReconControl(raidPtr);
846 #if RF_ACC_TRACE > 0
847 RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
848 #endif
849 FreeReconDesc(reconDesc);
850
851 return (0);
852
853 }
854 /*****************************************************************************
855 * do the right thing upon each reconstruction event.
856 *****************************************************************************/
857 static int
858 ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event)
859 {
860 int retcode = 0, submitblocked;
861 RF_ReconBuffer_t *rbuf;
862 RF_SectorCount_t sectorsPerRU;
863
864 retcode = RF_RECON_READ_STOPPED;
865
866 Dprintf1("RECON: ProcessReconEvent type %d\n", event->type);
867 switch (event->type) {
868
869 /* a read I/O has completed */
870 case RF_REVENT_READDONE:
871 rbuf = raidPtr->reconControl->perDiskInfo[event->col].rbuf;
872 Dprintf2("RECON: READDONE EVENT: col %d psid %ld\n",
873 event->col, rbuf->parityStripeID);
874 Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x %02x %02x\n",
875 rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
876 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
877 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
878 if (!raidPtr->reconControl->error) {
879 submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
880 Dprintf1("RECON: submitblocked=%d\n", submitblocked);
881 if (!submitblocked)
882 retcode = IssueNextReadRequest(raidPtr, event->col);
883 else
884 retcode = 0;
885 }
886 break;
887
888 /* a write I/O has completed */
889 case RF_REVENT_WRITEDONE:
890 #if RF_DEBUG_RECON
891 if (rf_floatingRbufDebug) {
892 rf_CheckFloatingRbufCount(raidPtr, 1);
893 }
894 #endif
895 sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
896 rbuf = (RF_ReconBuffer_t *) event->arg;
897 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
898 Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n",
899 rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl->percentComplete);
900 rf_ReconMapUpdate(raidPtr, raidPtr->reconControl->reconMap,
901 rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
902 rf_RemoveFromActiveReconTable(raidPtr, rbuf->parityStripeID, rbuf->which_ru);
903
904 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
905 raidPtr->reconControl->pending_writes--;
906 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
907
908 if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
909 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
910 while(raidPtr->reconControl->rb_lock) {
911 ltsleep(&raidPtr->reconControl->rb_lock, PRIBIO, "reconctrlpre1", 0,
912 &raidPtr->reconControl->rb_mutex);
913 }
914 raidPtr->reconControl->rb_lock = 1;
915 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
916
917 raidPtr->numFullReconBuffers--;
918 rf_ReleaseFloatingReconBuffer(raidPtr, rbuf);
919
920 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
921 raidPtr->reconControl->rb_lock = 0;
922 wakeup(&raidPtr->reconControl->rb_lock);
923 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
924 } else
925 if (rbuf->type == RF_RBUF_TYPE_FORCED)
926 rf_FreeReconBuffer(rbuf);
927 else
928 RF_ASSERT(0);
929 retcode = 0;
930 break;
931
932 case RF_REVENT_BUFCLEAR: /* A buffer-stall condition has been
933 * cleared */
934 Dprintf1("RECON: BUFCLEAR EVENT: col %d\n", event->col);
935 if (!raidPtr->reconControl->error) {
936 submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl->perDiskInfo[event->col].rbuf,
937 0, (int) (long) event->arg);
938 RF_ASSERT(!submitblocked); /* we wouldn't have gotten the
939 * BUFCLEAR event if we
940 * couldn't submit */
941 retcode = IssueNextReadRequest(raidPtr, event->col);
942 }
943 break;
944
945 case RF_REVENT_BLOCKCLEAR: /* A user-write reconstruction
946 * blockage has been cleared */
947 DDprintf1("RECON: BLOCKCLEAR EVENT: col %d\n", event->col);
948 if (!raidPtr->reconControl->error) {
949 retcode = TryToRead(raidPtr, event->col);
950 }
951 break;
952
953 case RF_REVENT_HEADSEPCLEAR: /* A max-head-separation
954 * reconstruction blockage has been
955 * cleared */
956 Dprintf1("RECON: HEADSEPCLEAR EVENT: col %d\n", event->col);
957 if (!raidPtr->reconControl->error) {
958 retcode = TryToRead(raidPtr, event->col);
959 }
960 break;
961
962 /* a buffer has become ready to write */
963 case RF_REVENT_BUFREADY:
964 Dprintf1("RECON: BUFREADY EVENT: col %d\n", event->col);
965 if (!raidPtr->reconControl->error) {
966 retcode = IssueNextWriteRequest(raidPtr);
967 #if RF_DEBUG_RECON
968 if (rf_floatingRbufDebug) {
969 rf_CheckFloatingRbufCount(raidPtr, 1);
970 }
971 #endif
972 }
973 break;
974
975 /* we need to skip the current RU entirely because it got
976 * recon'd while we were waiting for something else to happen */
977 case RF_REVENT_SKIP:
978 DDprintf1("RECON: SKIP EVENT: col %d\n", event->col);
979 if (!raidPtr->reconControl->error) {
980 retcode = IssueNextReadRequest(raidPtr, event->col);
981 }
982 break;
983
984 /* a forced-reconstruction read access has completed. Just
985 * submit the buffer */
986 case RF_REVENT_FORCEDREADDONE:
987 rbuf = (RF_ReconBuffer_t *) event->arg;
988 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
989 DDprintf1("RECON: FORCEDREADDONE EVENT: col %d\n", event->col);
990 if (!raidPtr->reconControl->error) {
991 submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
992 RF_ASSERT(!submitblocked);
993 }
994 break;
995
996 /* A read I/O failed to complete */
997 case RF_REVENT_READ_FAILED:
998 retcode = RF_RECON_READ_ERROR;
999 break;
1000
1001 /* A write I/O failed to complete */
1002 case RF_REVENT_WRITE_FAILED:
1003 retcode = RF_RECON_WRITE_ERROR;
1004
1005 rbuf = (RF_ReconBuffer_t *) event->arg;
1006
1007 /* cleanup the disk queue data */
1008 rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
1009
1010 /* At this point we're erroring out, badly, and floatingRbufs
1011 may not even be valid. Rather than putting this back onto
1012 the floatingRbufs list, just arrange for its immediate
1013 destruction.
1014 */
1015 rf_FreeReconBuffer(rbuf);
1016 break;
1017
1018 /* a forced read I/O failed to complete */
1019 case RF_REVENT_FORCEDREAD_FAILED:
1020 retcode = RF_RECON_READ_ERROR;
1021 break;
1022
1023 default:
1024 RF_PANIC();
1025 }
1026 rf_FreeReconEventDesc(event);
1027 return (retcode);
1028 }
1029 /*****************************************************************************
1030 *
1031 * find the next thing that's needed on the indicated disk, and issue
1032 * a read request for it. We assume that the reconstruction buffer
1033 * associated with this process is free to receive the data. If
1034 * reconstruction is blocked on the indicated RU, we issue a
1035 * blockage-release request instead of a physical disk read request.
1036 * If the current disk gets too far ahead of the others, we issue a
1037 * head-separation wait request and return.
1038 *
1039 * ctrl->{ru_count, curPSID, diskOffset} and
1040 * rbuf->failedDiskSectorOffset are maintained to point to the unit
1041 * we're currently accessing. Note that this deviates from the
1042 * standard C idiom of having counters point to the next thing to be
1043 * accessed. This allows us to easily retry when we're blocked by
1044 * head separation or reconstruction-blockage events.
1045 *
1046 *****************************************************************************/
1047 static int
1048 IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t col)
1049 {
1050 RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
1051 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1052 RF_ReconBuffer_t *rbuf = ctrl->rbuf;
1053 RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
1054 RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
1055 int do_new_check = 0, retcode = 0, status;
1056
1057 /* if we are currently the slowest disk, mark that we have to do a new
1058 * check */
1059 if (ctrl->headSepCounter <= raidPtr->reconControl->minHeadSepCounter)
1060 do_new_check = 1;
1061
1062 while (1) {
1063
1064 ctrl->ru_count++;
1065 if (ctrl->ru_count < RUsPerPU) {
1066 ctrl->diskOffset += sectorsPerRU;
1067 rbuf->failedDiskSectorOffset += sectorsPerRU;
1068 } else {
1069 ctrl->curPSID++;
1070 ctrl->ru_count = 0;
1071 /* code left over from when head-sep was based on
1072 * parity stripe id */
1073 if (ctrl->curPSID >= raidPtr->reconControl->lastPSID) {
1074 CheckForNewMinHeadSep(raidPtr, ++(ctrl->headSepCounter));
1075 return (RF_RECON_DONE_READS); /* finito! */
1076 }
1077 /* find the disk offsets of the start of the parity
1078 * stripe on both the current disk and the failed
1079 * disk. skip this entire parity stripe if either disk
1080 * does not appear in the indicated PS */
1081 status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset,
1082 &rbuf->spCol, &rbuf->spOffset);
1083 if (status) {
1084 ctrl->ru_count = RUsPerPU - 1;
1085 continue;
1086 }
1087 }
1088 rbuf->which_ru = ctrl->ru_count;
1089
1090 /* skip this RU if it's already been reconstructed */
1091 if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, rbuf->failedDiskSectorOffset)) {
1092 Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count);
1093 continue;
1094 }
1095 break;
1096 }
1097 ctrl->headSepCounter++;
1098 if (do_new_check)
1099 CheckForNewMinHeadSep(raidPtr, ctrl->headSepCounter); /* update min if needed */
1100
1101
1102 /* at this point, we have definitely decided what to do, and we have
1103 * only to see if we can actually do it now */
1104 rbuf->parityStripeID = ctrl->curPSID;
1105 rbuf->which_ru = ctrl->ru_count;
1106 #if RF_ACC_TRACE > 0
1107 memset((char *) &raidPtr->recon_tracerecs[col], 0,
1108 sizeof(raidPtr->recon_tracerecs[col]));
1109 raidPtr->recon_tracerecs[col].reconacc = 1;
1110 RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
1111 #endif
1112 retcode = TryToRead(raidPtr, col);
1113 return (retcode);
1114 }
1115
1116 /*
1117 * tries to issue the next read on the indicated disk. We may be
1118 * blocked by (a) the heads being too far apart, or (b) recon on the
1119 * indicated RU being blocked due to a write by a user thread. In
1120 * this case, we issue a head-sep or blockage wait request, which will
1121 * cause this same routine to be invoked again later when the blockage
1122 * has cleared.
1123 */
1124
1125 static int
1126 TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t col)
1127 {
1128 RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
1129 RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
1130 RF_StripeNum_t psid = ctrl->curPSID;
1131 RF_ReconUnitNum_t which_ru = ctrl->ru_count;
1132 RF_DiskQueueData_t *req;
1133 int status;
1134 RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;
1135
1136 /* if the current disk is too far ahead of the others, issue a
1137 * head-separation wait and return */
1138 if (CheckHeadSeparation(raidPtr, ctrl, col, ctrl->headSepCounter, which_ru))
1139 return (0);
1140
1141 /* allocate a new PSS in case we need it */
1142 newpssPtr = rf_AllocPSStatus(raidPtr);
1143
1144 RF_LOCK_PSS_MUTEX(raidPtr, psid);
1145 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE, newpssPtr);
1146
1147 if (pssPtr != newpssPtr) {
1148 rf_FreePSStatus(raidPtr, newpssPtr);
1149 }
1150
1151 /* if recon is blocked on the indicated parity stripe, issue a
1152 * block-wait request and return. this also must mark the indicated RU
1153 * in the stripe as under reconstruction if not blocked. */
1154 status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, col, psid, which_ru);
1155 if (status == RF_PSS_RECON_BLOCKED) {
1156 Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru);
1157 goto out;
1158 } else
1159 if (status == RF_PSS_FORCED_ON_WRITE) {
1160 rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
1161 goto out;
1162 }
1163 /* make one last check to be sure that the indicated RU didn't get
1164 * reconstructed while we were waiting for something else to happen.
1165 * This is unfortunate in that it causes us to make this check twice
1166 * in the normal case. Might want to make some attempt to re-work
1167 * this so that we only do this check if we've definitely blocked on
1168 * one of the above checks. When this condition is detected, we may
1169 * have just created a bogus status entry, which we need to delete. */
1170 if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, ctrl->rbuf->failedDiskSectorOffset)) {
1171 Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru);
1172 if (pssPtr == newpssPtr)
1173 rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
1174 rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
1175 goto out;
1176 }
1177 /* found something to read. issue the I/O */
1178 Dprintf4("RECON: Read for psid %ld on col %d offset %ld buf %lx\n",
1179 psid, col, ctrl->diskOffset, ctrl->rbuf->buffer);
1180 #if RF_ACC_TRACE > 0
1181 RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
1182 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
1183 raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
1184 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
1185 RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
1186 #endif
1187 /* should be ok to use a NULL proc pointer here, all the bufs we use
1188 * should be in kernel space */
1189 req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
1190 ReconReadDoneProc, (void *) ctrl,
1191 #if RF_ACC_TRACE > 0
1192 &raidPtr->recon_tracerecs[col],
1193 #else
1194 NULL,
1195 #endif
1196 (void *) raidPtr, 0, NULL, PR_WAITOK);
1197
1198 ctrl->rbuf->arg = (void *) req;
1199 rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_RECON_PRIORITY);
1200 pssPtr->issued[col] = 1;
1201
1202 out:
1203 RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1204 return (0);
1205 }
1206
1207
1208 /*
1209 * given a parity stripe ID, we want to find out whether both the
1210 * current disk and the failed disk exist in that parity stripe. If
1211 * not, we want to skip this whole PS. If so, we want to find the
1212 * disk offset of the start of the PS on both the current disk and the
1213 * failed disk.
1214 *
1215 * this works by getting a list of disks comprising the indicated
1216 * parity stripe, and searching the list for the current and failed
1217 * disks. Once we've decided they both exist in the parity stripe, we
1218 * need to decide whether each is data or parity, so that we'll know
1219 * which mapping function to call to get the corresponding disk
1220 * offsets.
1221 *
1222 * this is kind of unpleasant, but doing it this way allows the
1223 * reconstruction code to use parity stripe IDs rather than physical
1224 * disks address to march through the failed disk, which greatly
1225 * simplifies a lot of code, as well as eliminating the need for a
1226 * reverse-mapping function. I also think it will execute faster,
1227 * since the calls to the mapping module are kept to a minimum.
1228 *
1229 * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING
1230 * THE STRIPE IN THE CORRECT ORDER
1231 *
1232 * raidPtr - raid descriptor
1233 * psid - parity stripe identifier
1234 * col - column of disk to find the offsets for
1235 * spCol - out: col of spare unit for failed unit
1236 * spOffset - out: offset into disk containing spare unit
1237 *
1238 */
1239
1240
1241 static int
1242 ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid,
1243 RF_RowCol_t col, RF_SectorNum_t *outDiskOffset,
1244 RF_SectorNum_t *outFailedDiskSectorOffset,
1245 RF_RowCol_t *spCol, RF_SectorNum_t *spOffset)
1246 {
1247 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1248 RF_RowCol_t fcol = raidPtr->reconControl->fcol;
1249 RF_RaidAddr_t sosRaidAddress; /* start-of-stripe */
1250 RF_RowCol_t *diskids;
1251 u_int i, j, k, i_offset, j_offset;
1252 RF_RowCol_t pcol;
1253 int testcol;
1254 RF_SectorNum_t poffset;
1255 char i_is_parity = 0, j_is_parity = 0;
1256 RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
1257
1258 /* get a listing of the disks comprising that stripe */
1259 sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
1260 (layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids);
1261 RF_ASSERT(diskids);
1262
1263 /* reject this entire parity stripe if it does not contain the
1264 * indicated disk or it does not contain the failed disk */
1265
1266 for (i = 0; i < stripeWidth; i++) {
1267 if (col == diskids[i])
1268 break;
1269 }
1270 if (i == stripeWidth)
1271 goto skipit;
1272 for (j = 0; j < stripeWidth; j++) {
1273 if (fcol == diskids[j])
1274 break;
1275 }
1276 if (j == stripeWidth) {
1277 goto skipit;
1278 }
1279 /* find out which disk the parity is on */
1280 (layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &pcol, &poffset, RF_DONT_REMAP);
1281
1282 /* find out if either the current RU or the failed RU is parity */
1283 /* also, if the parity occurs in this stripe prior to the data and/or
1284 * failed col, we need to decrement i and/or j */
1285 for (k = 0; k < stripeWidth; k++)
1286 if (diskids[k] == pcol)
1287 break;
1288 RF_ASSERT(k < stripeWidth);
1289 i_offset = i;
1290 j_offset = j;
1291 if (k < i)
1292 i_offset--;
1293 else
1294 if (k == i) {
1295 i_is_parity = 1;
1296 i_offset = 0;
1297 } /* set offsets to zero to disable multiply
1298 * below */
1299 if (k < j)
1300 j_offset--;
1301 else
1302 if (k == j) {
1303 j_is_parity = 1;
1304 j_offset = 0;
1305 }
1306 /* at this point, [ij]_is_parity tells us whether the [current,failed]
1307 * disk is parity at the start of this RU, and, if data, "[ij]_offset"
1308 * tells us how far into the stripe the [current,failed] disk is. */
1309
1310 /* call the mapping routine to get the offset into the current disk,
1311 * repeat for failed disk. */
1312 if (i_is_parity)
1313 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
1314 else
1315 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
1316
1317 RF_ASSERT(col == testcol);
1318
1319 if (j_is_parity)
1320 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
1321 else
1322 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
1323 RF_ASSERT(fcol == testcol);
1324
1325 /* now locate the spare unit for the failed unit */
1326 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
1327 if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
1328 if (j_is_parity)
1329 layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
1330 else
1331 layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
1332 } else {
1333 #endif
1334 *spCol = raidPtr->reconControl->spareCol;
1335 *spOffset = *outFailedDiskSectorOffset;
1336 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
1337 }
1338 #endif
1339 return (0);
1340
1341 skipit:
1342 Dprintf2("RECON: Skipping psid %ld: nothing needed from c%d\n",
1343 psid, col);
1344 return (1);
1345 }
1346 /* this is called when a buffer has become ready to write to the replacement disk */
1347 static int
1348 IssueNextWriteRequest(RF_Raid_t *raidPtr)
1349 {
1350 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1351 RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
1352 #if RF_ACC_TRACE > 0
1353 RF_RowCol_t fcol = raidPtr->reconControl->fcol;
1354 #endif
1355 RF_ReconBuffer_t *rbuf;
1356 RF_DiskQueueData_t *req;
1357
1358 rbuf = rf_GetFullReconBuffer(raidPtr->reconControl);
1359 RF_ASSERT(rbuf); /* there must be one available, or we wouldn't
1360 * have gotten the event that sent us here */
1361 RF_ASSERT(rbuf->pssPtr);
1362
1363 rbuf->pssPtr->writeRbuf = rbuf;
1364 rbuf->pssPtr = NULL;
1365
1366 Dprintf6("RECON: New write (c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n",
1367 rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID,
1368 rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer);
1369 Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x\n",
1370 rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
1371 rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
1372
1373 /* should be ok to use a NULL b_proc here b/c all addrs should be in
1374 * kernel space */
1375 req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset,
1376 sectorsPerRU, rbuf->buffer,
1377 rbuf->parityStripeID, rbuf->which_ru,
1378 ReconWriteDoneProc, (void *) rbuf,
1379 #if RF_ACC_TRACE > 0
1380 &raidPtr->recon_tracerecs[fcol],
1381 #else
1382 NULL,
1383 #endif
1384 (void *) raidPtr, 0, NULL, PR_WAITOK);
1385
1386 rbuf->arg = (void *) req;
1387 RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1388 raidPtr->reconControl->pending_writes++;
1389 RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1390 rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spCol], req, RF_IO_RECON_PRIORITY);
1391
1392 return (0);
1393 }
1394
1395 /*
1396 * this gets called upon the completion of a reconstruction read
1397 * operation the arg is a pointer to the per-disk reconstruction
1398 * control structure for the process that just finished a read.
1399 *
1400 * called at interrupt context in the kernel, so don't do anything
1401 * illegal here.
1402 */
1403 static int
1404 ReconReadDoneProc(void *arg, int status)
1405 {
1406 RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
1407 RF_Raid_t *raidPtr;
1408
1409 /* Detect that reconCtrl is no longer valid, and if that
1410 is the case, bail without calling rf_CauseReconEvent().
1411 There won't be anyone listening for this event anyway */
1412
1413 if (ctrl->reconCtrl == NULL)
1414 return(0);
1415
1416 raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;
1417
1418 if (status) {
1419 printf("raid%d: Recon read failed!\n", raidPtr->raidid);
1420 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READ_FAILED);
1421 return(0);
1422 }
1423 #if RF_ACC_TRACE > 0
1424 RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1425 RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1426 raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us =
1427 RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1428 RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1429 #endif
1430 rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READDONE);
1431 return (0);
1432 }
1433 /* this gets called upon the completion of a reconstruction write operation.
1434 * the arg is a pointer to the rbuf that was just written
1435 *
1436 * called at interrupt context in the kernel, so don't do anything illegal here.
1437 */
1438 static int
1439 ReconWriteDoneProc(void *arg, int status)
1440 {
1441 RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;
1442
1443 /* Detect that reconControl is no longer valid, and if that
1444 is the case, bail without calling rf_CauseReconEvent().
1445 There won't be anyone listening for this event anyway */
1446
1447 if (rbuf->raidPtr->reconControl == NULL)
1448 return(0);
1449
1450 Dprintf2("Reconstruction completed on psid %ld ru %d\n", rbuf->parityStripeID, rbuf->which_ru);
1451 if (status) {
1452 printf("raid%d: Recon write failed!\n", rbuf->raidPtr->raidid);
1453 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITE_FAILED);
1454 return(0);
1455 }
1456 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITEDONE);
1457 return (0);
1458 }
1459
1460
1461 /*
1462 * computes a new minimum head sep, and wakes up anyone who needs to
1463 * be woken as a result
1464 */
1465 static void
1466 CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_HeadSepLimit_t hsCtr)
1467 {
1468 RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
1469 RF_HeadSepLimit_t new_min;
1470 RF_RowCol_t i;
1471 RF_CallbackDesc_t *p;
1472 RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter); /* from the definition
1473 * of a minimum */
1474
1475
1476 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1477 while(reconCtrlPtr->rb_lock) {
1478 ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlcnmhs", 0, &reconCtrlPtr->rb_mutex);
1479 }
1480 reconCtrlPtr->rb_lock = 1;
1481 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1482
1483 new_min = ~(1L << (8 * sizeof(long) - 1)); /* 0x7FFF....FFF */
1484 for (i = 0; i < raidPtr->numCol; i++)
1485 if (i != reconCtrlPtr->fcol) {
1486 if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min)
1487 new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter;
1488 }
1489 /* set the new minimum and wake up anyone who can now run again */
1490 if (new_min != reconCtrlPtr->minHeadSepCounter) {
1491 reconCtrlPtr->minHeadSepCounter = new_min;
1492 Dprintf1("RECON: new min head pos counter val is %ld\n", new_min);
1493 while (reconCtrlPtr->headSepCBList) {
1494 if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min)
1495 break;
1496 p = reconCtrlPtr->headSepCBList;
1497 reconCtrlPtr->headSepCBList = p->next;
1498 p->next = NULL;
1499 rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
1500 rf_FreeCallbackDesc(p);
1501 }
1502
1503 }
1504 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1505 reconCtrlPtr->rb_lock = 0;
1506 wakeup(&reconCtrlPtr->rb_lock);
1507 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1508 }
1509
1510 /*
1511 * checks to see that the maximum head separation will not be violated
1512 * if we initiate a reconstruction I/O on the indicated disk.
1513 * Limiting the maximum head separation between two disks eliminates
1514 * the nasty buffer-stall conditions that occur when one disk races
1515 * ahead of the others and consumes all of the floating recon buffers.
1516 * This code is complex and unpleasant but it's necessary to avoid
1517 * some very nasty, albeit fairly rare, reconstruction behavior.
1518 *
1519 * returns non-zero if and only if we have to stop working on the
1520 * indicated disk due to a head-separation delay.
1521 */
1522 static int
1523 CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl,
1524 RF_RowCol_t col, RF_HeadSepLimit_t hsCtr,
1525 RF_ReconUnitNum_t which_ru)
1526 {
1527 RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
1528 RF_CallbackDesc_t *cb, *p, *pt;
1529 int retval = 0;
1530
1531 /* if we're too far ahead of the slowest disk, stop working on this
1532 * disk until the slower ones catch up. We do this by scheduling a
1533 * wakeup callback for the time when the slowest disk has caught up.
1534 * We define "caught up" with 20% hysteresis, i.e. the head separation
1535 * must have fallen to at most 80% of the max allowable head
1536 * separation before we'll wake up.
1537 *
1538 */
1539 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1540 while(reconCtrlPtr->rb_lock) {
1541 ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlchs", 0, &reconCtrlPtr->rb_mutex);
1542 }
1543 reconCtrlPtr->rb_lock = 1;
1544 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1545 if ((raidPtr->headSepLimit >= 0) &&
1546 ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) {
1547 Dprintf5("raid%d: RECON: head sep stall: col %d hsCtr %ld minHSCtr %ld limit %ld\n",
1548 raidPtr->raidid, col, ctrl->headSepCounter,
1549 reconCtrlPtr->minHeadSepCounter,
1550 raidPtr->headSepLimit);
1551 cb = rf_AllocCallbackDesc();
1552 /* the minHeadSepCounter value we have to get to before we'll
1553 * wake up. build in 20% hysteresis. */
1554 cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5);
1555 cb->col = col;
1556 cb->next = NULL;
1557
1558 /* insert this callback descriptor into the sorted list of
1559 * pending head-sep callbacks */
1560 p = reconCtrlPtr->headSepCBList;
1561 if (!p)
1562 reconCtrlPtr->headSepCBList = cb;
1563 else
1564 if (cb->callbackArg.v < p->callbackArg.v) {
1565 cb->next = reconCtrlPtr->headSepCBList;
1566 reconCtrlPtr->headSepCBList = cb;
1567 } else {
1568 for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next);
1569 cb->next = p;
1570 pt->next = cb;
1571 }
1572 retval = 1;
1573 #if RF_RECON_STATS > 0
1574 ctrl->reconCtrl->reconDesc->hsStallCount++;
1575 #endif /* RF_RECON_STATS > 0 */
1576 }
1577 RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1578 reconCtrlPtr->rb_lock = 0;
1579 wakeup(&reconCtrlPtr->rb_lock);
1580 RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1581
1582 return (retval);
1583 }
1584 /*
1585 * checks to see if reconstruction has been either forced or blocked
1586 * by a user operation. if forced, we skip this RU entirely. else if
1587 * blocked, put ourselves on the wait list. else return 0.
1588 *
1589 * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY
1590 */
1591 static int
1592 CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr,
1593 RF_ReconParityStripeStatus_t *pssPtr,
1594 RF_PerDiskReconCtrl_t *ctrl,
1595 RF_RowCol_t col,
1596 RF_StripeNum_t psid,
1597 RF_ReconUnitNum_t which_ru)
1598 {
1599 RF_CallbackDesc_t *cb;
1600 int retcode = 0;
1601
1602 if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE))
1603 retcode = RF_PSS_FORCED_ON_WRITE;
1604 else
1605 if (pssPtr->flags & RF_PSS_RECON_BLOCKED) {
1606 Dprintf3("RECON: col %d blocked at psid %ld ru %d\n", col, psid, which_ru);
1607 cb = rf_AllocCallbackDesc(); /* append ourselves to
1608 * the blockage-wait
1609 * list */
1610 cb->col = col;
1611 cb->next = pssPtr->blockWaitList;
1612 pssPtr->blockWaitList = cb;
1613 retcode = RF_PSS_RECON_BLOCKED;
1614 }
1615 if (!retcode)
1616 pssPtr->flags |= RF_PSS_UNDER_RECON; /* mark this RU as under
1617 * reconstruction */
1618
1619 return (retcode);
1620 }
1621 /*
1622 * if reconstruction is currently ongoing for the indicated stripeID,
1623 * reconstruction is forced to completion and we return non-zero to
1624 * indicate that the caller must wait. If not, then reconstruction is
1625 * blocked on the indicated stripe and the routine returns zero. If
1626 * and only if we return non-zero, we'll cause the cbFunc to get
1627 * invoked with the cbArg when the reconstruction has completed.
1628 */
1629 int
1630 rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
1631 void (*cbFunc)(RF_Raid_t *, void *), void *cbArg)
1632 {
1633 RF_StripeNum_t stripeID = asmap->stripeID; /* the stripe ID we're
1634 * forcing recon on */
1635 RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; /* num sects in one RU */
1636 RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr; /* a pointer to the parity
1637 * stripe status structure */
1638 RF_StripeNum_t psid; /* parity stripe id */
1639 RF_SectorNum_t offset, fd_offset; /* disk offset, failed-disk
1640 * offset */
1641 RF_RowCol_t *diskids;
1642 RF_ReconUnitNum_t which_ru; /* RU within parity stripe */
1643 RF_RowCol_t fcol, diskno, i;
1644 RF_ReconBuffer_t *new_rbuf; /* ptr to newly allocated rbufs */
1645 RF_DiskQueueData_t *req;/* disk I/O req to be enqueued */
1646 RF_CallbackDesc_t *cb;
1647 int nPromoted;
1648
1649 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
1650
1651 /* allocate a new PSS in case we need it */
1652 newpssPtr = rf_AllocPSStatus(raidPtr);
1653
1654 RF_LOCK_PSS_MUTEX(raidPtr, psid);
1655
1656 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, newpssPtr);
1657
1658 if (pssPtr != newpssPtr) {
1659 rf_FreePSStatus(raidPtr, newpssPtr);
1660 }
1661
1662 /* if recon is not ongoing on this PS, just return */
1663 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
1664 RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1665 return (0);
1666 }
1667 /* otherwise, we have to wait for reconstruction to complete on this
1668 * RU. */
1669 /* In order to avoid waiting for a potentially large number of
1670 * low-priority accesses to complete, we force a normal-priority (i.e.
1671 * not low-priority) reconstruction on this RU. */
1672 if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
1673 DDprintf1("Forcing recon on psid %ld\n", psid);
1674 pssPtr->flags |= RF_PSS_FORCED_ON_WRITE; /* mark this RU as under
1675 * forced recon */
1676 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; /* clear the blockage
1677 * that we just set */
1678 fcol = raidPtr->reconControl->fcol;
1679
1680 /* get a listing of the disks comprising the indicated stripe */
1681 (raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids);
1682
1683 /* For previously issued reads, elevate them to normal
1684 * priority. If the I/O has already completed, it won't be
1685 * found in the queue, and hence this will be a no-op. For
1686 * unissued reads, allocate buffers and issue new reads. The
1687 * fact that we've set the FORCED bit means that the regular
1688 * recon procs will not re-issue these reqs */
1689 for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++)
1690 if ((diskno = diskids[i]) != fcol) {
1691 if (pssPtr->issued[diskno]) {
1692 nPromoted = rf_DiskIOPromote(&raidPtr->Queues[diskno], psid, which_ru);
1693 if (rf_reconDebug && nPromoted)
1694 printf("raid%d: promoted read from col %d\n", raidPtr->raidid, diskno);
1695 } else {
1696 new_rbuf = rf_MakeReconBuffer(raidPtr, diskno, RF_RBUF_TYPE_FORCED); /* create new buf */
1697 ComputePSDiskOffsets(raidPtr, psid, diskno, &offset, &fd_offset,
1698 &new_rbuf->spCol, &new_rbuf->spOffset); /* find offsets & spare
1699 * location */
1700 new_rbuf->parityStripeID = psid; /* fill in the buffer */
1701 new_rbuf->which_ru = which_ru;
1702 new_rbuf->failedDiskSectorOffset = fd_offset;
1703 new_rbuf->priority = RF_IO_NORMAL_PRIORITY;
1704
1705 /* use NULL b_proc b/c all addrs
1706 * should be in kernel space */
1707 req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer,
1708 psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf,
1709 NULL, (void *) raidPtr, 0, NULL, PR_WAITOK);
1710
1711 new_rbuf->arg = req;
1712 rf_DiskIOEnqueue(&raidPtr->Queues[diskno], req, RF_IO_NORMAL_PRIORITY); /* enqueue the I/O */
1713 Dprintf2("raid%d: Issued new read req on col %d\n", raidPtr->raidid, diskno);
1714 }
1715 }
1716 /* if the write is sitting in the disk queue, elevate its
1717 * priority */
1718 if (rf_DiskIOPromote(&raidPtr->Queues[fcol], psid, which_ru))
1719 printf("raid%d: promoted write to col %d\n",
1720 raidPtr->raidid, fcol);
1721 }
1722 /* install a callback descriptor to be invoked when recon completes on
1723 * this parity stripe. */
1724 cb = rf_AllocCallbackDesc();
1725 /* XXX the following is bogus.. These functions don't really match!!
1726 * GO */
1727 cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc;
1728 cb->callbackArg.p = (void *) cbArg;
1729 cb->next = pssPtr->procWaitList;
1730 pssPtr->procWaitList = cb;
1731 DDprintf2("raid%d: Waiting for forced recon on psid %ld\n",
1732 raidPtr->raidid, psid);
1733
1734 RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1735 return (1);
1736 }
1737 /* called upon the completion of a forced reconstruction read.
1738 * all we do is schedule the FORCEDREADONE event.
1739 * called at interrupt context in the kernel, so don't do anything illegal here.
1740 */
1741 static void
1742 ForceReconReadDoneProc(void *arg, int status)
1743 {
1744 RF_ReconBuffer_t *rbuf = arg;
1745
1746 /* Detect that reconControl is no longer valid, and if that
1747 is the case, bail without calling rf_CauseReconEvent().
1748 There won't be anyone listening for this event anyway */
1749
1750 if (rbuf->raidPtr->reconControl == NULL)
1751 return;
1752
1753 if (status) {
1754 printf("raid%d: Forced recon read failed!\n", rbuf->raidPtr->raidid);
1755 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREAD_FAILED);
1756 return;
1757 }
1758 rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE);
1759 }
1760 /* releases a block on the reconstruction of the indicated stripe */
1761 int
1762 rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
1763 {
1764 RF_StripeNum_t stripeID = asmap->stripeID;
1765 RF_ReconParityStripeStatus_t *pssPtr;
1766 RF_ReconUnitNum_t which_ru;
1767 RF_StripeNum_t psid;
1768 RF_CallbackDesc_t *cb;
1769
1770 psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
1771 RF_LOCK_PSS_MUTEX(raidPtr, psid);
1772 pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_NONE, NULL);
1773
1774 /* When recon is forced, the pss desc can get deleted before we get
1775 * back to unblock recon. But, this can _only_ happen when recon is
1776 * forced. It would be good to put some kind of sanity check here, but
1777 * how to decide if recon was just forced or not? */
1778 if (!pssPtr) {
1779 /* printf("Warning: no pss descriptor upon unblock on psid %ld
1780 * RU %d\n",psid,which_ru); */
1781 #if (RF_DEBUG_RECON > 0) || (RF_DEBUG_PSS > 0)
1782 if (rf_reconDebug || rf_pssDebug)
1783 printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n", (long) psid, which_ru);
1784 #endif
1785 goto out;
1786 }
1787 pssPtr->blockCount--;
1788 Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d\n",
1789 raidPtr->raidid, psid, pssPtr->blockCount);
1790 if (pssPtr->blockCount == 0) { /* if recon blockage has been released */
1791
1792 /* unblock recon before calling CauseReconEvent in case
1793 * CauseReconEvent causes us to try to issue a new read before
1794 * returning here. */
1795 pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;
1796
1797
1798 while (pssPtr->blockWaitList) {
1799 /* spin through the block-wait list and
1800 release all the waiters */
1801 cb = pssPtr->blockWaitList;
1802 pssPtr->blockWaitList = cb->next;
1803 cb->next = NULL;
1804 rf_CauseReconEvent(raidPtr, cb->col, NULL, RF_REVENT_BLOCKCLEAR);
1805 rf_FreeCallbackDesc(cb);
1806 }
1807 if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
1808 /* if no recon was requested while recon was blocked */
1809 rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
1810 }
1811 }
1812 out:
1813 RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1814 return (0);
1815 }
1816