1 /* $NetBSD: rf_dagutils.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Authors: Mark Holland, William V. Courtright II, Jim Zelenka
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /******************************************************************************
30 *
31 * rf_dagutils.c -- utility routines for manipulating dags
32 *
33 *****************************************************************************/
34
35 /*
36 * :
37 * Log: rf_dagutils.c,v
38 * Revision 1.55 1996/08/22 14:39:47 jimz
39 * reduce v/k fraction (better load balancing)
40 *
41 * Revision 1.54 1996/08/21 04:14:12 jimz
42 * minor workload shift tweaking
43 *
44 * Revision 1.53 1996/08/20 23:41:16 jimz
45 * fix up workload shift computation
46 *
47 * Revision 1.52 1996/08/20 22:34:16 jimz
48 * first cut at fixing workload shift
49 * needs work
50 *
51 * Revision 1.51 1996/08/20 16:51:16 jimz
52 * comment more verbosely compute_workload_shift()
53 *
54 * Revision 1.50 1996/08/11 00:40:50 jimz
55 * fix up broken comment
56 *
57 * Revision 1.49 1996/07/27 23:36:08 jimz
58 * Solaris port of simulator
59 *
60 * Revision 1.48 1996/07/27 18:40:01 jimz
61 * cleanup sweep
62 *
63 * Revision 1.47 1996/07/22 19:52:16 jimz
64 * switched node params to RF_DagParam_t, a union of
65 * a 64-bit int and a void *, for better portability
66 * attempted hpux port, but failed partway through for
67 * lack of a single C compiler capable of compiling all
68 * source files
69 *
70 * Revision 1.46 1996/07/18 22:57:14 jimz
71 * port simulator to AIX
72 *
73 * Revision 1.45 1996/06/17 03:24:59 jimz
74 * include shutdown.h for define of now-macroized ShutdownCreate
75 *
76 * Revision 1.44 1996/06/10 12:50:57 jimz
77 * Add counters to freelists to track number of allocations, frees,
78 * grows, max size, etc. Adjust a couple sets of PRIME params based
79 * on the results.
80 *
81 * Revision 1.43 1996/06/10 11:55:47 jimz
82 * Straightened out some per-array/not-per-array distinctions, fixed
83 * a couple bugs related to confusion. Added shutdown lists. Removed
84 * layout shutdown function (now subsumed by shutdown lists).
85 *
86 * Revision 1.42 1996/06/07 21:33:04 jimz
87 * begin using consistent types for sector numbers,
88 * stripe numbers, row+col numbers, recon unit numbers
89 *
90 * Revision 1.41 1996/06/06 17:28:58 jimz
91 * make PrintNodeInfoString aware of new mirroring funcs
92 *
93 * Revision 1.40 1996/06/05 18:06:02 jimz
94 * Major code cleanup. The Great Renaming is now done.
95 * Better modularity. Better typing. Fixed a bunch of
96 * synchronization bugs. Made a lot of global stuff
97 * per-desc or per-array. Removed dead code.
98 *
99 * Revision 1.39 1996/06/03 23:28:26 jimz
100 * more bugfixes
101 * check in tree to sync for IPDS runs with current bugfixes
102 * there still may be a problem with threads in the script test
103 * getting I/Os stuck- not trivially reproducible (runs ~50 times
104 * in a row without getting stuck)
105 *
106 * Revision 1.38 1996/06/02 17:31:48 jimz
107 * Moved a lot of global stuff into array structure, where it belongs.
108 * Fixed up paritylogging, pss modules in this manner. Some general
109 * code cleanup. Removed lots of dead code, some dead files.
110 *
111 * Revision 1.37 1996/05/31 22:26:54 jimz
112 * fix a lot of mapping problems, memory allocation problems
113 * found some weird lock issues, fixed 'em
114 * more code cleanup
115 *
116 * Revision 1.36 1996/05/30 23:22:16 jimz
117 * bugfixes of serialization, timing problems
118 * more cleanup
119 *
120 * Revision 1.35 1996/05/30 11:29:41 jimz
121 * Numerous bug fixes. Stripe lock release code disagreed with the taking code
122 * about when stripes should be locked (I made it consistent: no parity, no lock)
123 * There was a lot of extra serialization of I/Os which I've removed- a lot of
124 * it was to calculate values for the cache code, which is no longer with us.
125 * More types, function, macro cleanup. Added code to properly quiesce the array
126 * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
127 * before. Fixed memory allocation, freeing bugs.
128 *
129 * Revision 1.34 1996/05/27 18:56:37 jimz
130 * more code cleanup
131 * better typing
132 * compiles in all 3 environments
133 *
134 * Revision 1.33 1996/05/24 22:17:04 jimz
135 * continue code + namespace cleanup
136 * typed a bunch of flags
137 *
138 * Revision 1.32 1996/05/24 04:28:55 jimz
139 * release cleanup ckpt
140 *
141 * Revision 1.31 1996/05/23 21:46:35 jimz
142 * checkpoint in code cleanup (release prep)
143 * lots of types, function names have been fixed
144 *
145 * Revision 1.30 1996/05/23 00:33:23 jimz
146 * code cleanup: move all debug decls to rf_options.c, all extern
147 * debug decls to rf_options.h, all debug vars preceded by rf_
148 *
149 * Revision 1.29 1996/05/18 19:51:34 jimz
150 * major code cleanup- fix syntax, make some types consistent,
151 * add prototypes, clean out dead code, et cetera
152 *
153 * Revision 1.28 1996/05/16 23:05:52 jimz
154 * changed InitNode() to use dag_ptrs field of node when appropriate
155 * (see rf_dag.h or comments within InitNode() for details)
156 *
157 * Revision 1.27 1996/05/16 15:37:19 jimz
158 * convert to RF_FREELIST stuff for dag headers
159 *
160 * Revision 1.26 1996/05/08 21:01:24 jimz
161 * fixed up enum type names that were conflicting with other
162 * enums and function names (ie, "panic")
163 * future naming trends will be towards RF_ and rf_ for
164 * everything raidframe-related
165 *
166 * Revision 1.25 1996/05/03 19:56:15 wvcii
167 * added misc routines from old dag creation files
168 *
169 * Revision 1.24 1995/12/12 18:10:06 jimz
170 * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
171 * fix 80-column brain damage in comments
172 *
173 * Revision 1.23 1995/12/01 15:59:50 root
174 * added copyright info
175 *
176 * Revision 1.22 1995/11/17 15:14:12 wvcii
177 * PrintDAG now processes DiskReadMirrorFunc nodes
178 *
179 * Revision 1.21 1995/11/07 16:22:38 wvcii
180 * InitNode and InitNodeFromBuf now initialize commit fields
181 * beefed up ValidateDag
182 * prettied up PrintDAGList
183 *
184 */
185
186 #include "rf_archs.h"
187 #include "rf_types.h"
188 #include "rf_threadstuff.h"
189 #include "rf_raid.h"
190 #include "rf_dag.h"
191 #include "rf_dagutils.h"
192 #include "rf_dagfuncs.h"
193 #include "rf_general.h"
194 #include "rf_freelist.h"
195 #include "rf_map.h"
196 #include "rf_shutdown.h"
197 #include "rf_sys.h"
198
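/*
 * SNUM_DIFF: absolute difference of two sector numbers.  Used below in
 * rf_SelectMirrorDiskIdle() as a rough seek-distance estimate when breaking
 * ties between the two copies of a mirror pair.
 */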
199 #define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_)))
200
201 RF_RedFuncs_t rf_xorFuncs = {
202 rf_RegularXorFunc, "Reg Xr",
203 rf_SimpleXorFunc, "Simple Xr"};
204
205 RF_RedFuncs_t rf_xorRecoveryFuncs = {
206 rf_RecoveryXorFunc, "Recovery Xr",
207 rf_RecoveryXorFunc, "Recovery Xr"};
208
209 static void rf_RecurPrintDAG(RF_DagNode_t *, int, int);
210 static void rf_PrintDAG(RF_DagHeader_t *);
211 static int rf_ValidateBranch(RF_DagNode_t *, int *, int *,
212 RF_DagNode_t **, int );
213 static void rf_ValidateBranchVisitedBits(RF_DagNode_t *, int, int);
214 static void rf_ValidateVisitedBits(RF_DagHeader_t *);
215
216 /******************************************************************************
217 *
218 * InitNode - initialize a dag node
219 *
220 * the size of the propList array is always the same as that of the
221 * successors array.
222 *
223 *****************************************************************************/
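/*
 * Illustrative call (the parameter values here are hypothetical; see the
 * dag creation modules for real call sites):
 *
 *   rf_InitNode(&readNode, rf_wait, RF_FALSE,
 *       rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc,
 *       1, 1, 4, 0, dag_h, "Rod", allocList);
 *
 * would set up a "read old data" node with one succedent, one antecedent,
 * four params and no results, attached to dag header dag_h.
 */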
224 void rf_InitNode(
225 RF_DagNode_t *node,
226 RF_NodeStatus_t initstatus,
227 int commit,
228 int (*doFunc)(RF_DagNode_t *node),
229 int (*undoFunc)(RF_DagNode_t *node),
230 int (*wakeFunc)(RF_DagNode_t *node,int status),
231 int nSucc,
232 int nAnte,
233 int nParam,
234 int nResult,
235 RF_DagHeader_t *hdr,
236 char *name,
237 RF_AllocListElem_t *alist)
238 {
239 void **ptrs;
240 int nptrs;
241
242 if (nAnte > RF_MAX_ANTECEDENTS)
243 RF_PANIC();
244 node->status = initstatus;
245 node->commitNode = commit;
246 node->doFunc = doFunc;
247 node->undoFunc = undoFunc;
248 node->wakeFunc = wakeFunc;
249 node->numParams = nParam;
250 node->numResults = nResult;
251 node->numAntecedents = nAnte;
252 node->numAntDone = 0;
253 node->next = NULL;
254 node->numSuccedents = nSucc;
255 node->name = name;
256 node->dagHdr = hdr;
257 node->visited = 0;
258
259 /* allocate all the pointers with one call to malloc */
260 nptrs = nSucc+nAnte+nResult+nSucc;
261
262 if (nptrs <= RF_DAG_PTRCACHESIZE) {
263 /*
264 * The dag_ptrs field of the node is basically some scribble
265 * space to be used here. We could get rid of it, and always
266 * allocate the range of pointers, but that's expensive. So,
267 * we pick a "common case" size for the pointer cache. Hopefully,
268 * we'll find that:
269 * (1) nptrs rarely exceeds RF_DAG_PTRCACHESIZE by just a little
270 * bit (the least efficient case: dag_ptrs goes unused)
271 * (2) nptrs is rarely a lot less than RF_DAG_PTRCACHESIZE
272 * (which would waste dag_ptrs memory)
273 */
274 ptrs = (void **)node->dag_ptrs;
275 }
276 else {
277 RF_CallocAndAdd(ptrs, nptrs, sizeof(void *), (void **), alist);
278 }
279 node->succedents = (nSucc) ? (RF_DagNode_t **) ptrs : NULL;
280 node->antecedents = (nAnte) ? (RF_DagNode_t **) (ptrs+nSucc) : NULL;
281 node->results = (nResult) ? (void **) (ptrs+nSucc+nAnte) : NULL;
282 node->propList = (nSucc) ? (RF_PropHeader_t **) (ptrs+nSucc+nAnte+nResult) : NULL;
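/*
 * Layout of the single ptrs allocation carved up above (indices are in
 * units of (void *)):
 *   ptrs[0 .. nSucc-1]                              succedents
 *   ptrs[nSucc .. nSucc+nAnte-1]                    antecedents
 *   ptrs[nSucc+nAnte .. nSucc+nAnte+nResult-1]      results
 *   ptrs[nSucc+nAnte+nResult .. nptrs-1]            propList (nSucc entries)
 */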
283
284 if (nParam) {
285 if (nParam <= RF_DAG_PARAMCACHESIZE) {
286 node->params = (RF_DagParam_t *)node->dag_params;
287 }
288 else {
289 RF_CallocAndAdd(node->params, nParam, sizeof(RF_DagParam_t), (RF_DagParam_t *), alist);
290 }
291 }
292 else {
293 node->params = NULL;
294 }
295 }
296
297
298
299 /******************************************************************************
300 *
301 * allocation and deallocation routines
302 *
303 *****************************************************************************/
304
305 void rf_FreeDAG(dag_h)
306 RF_DagHeader_t *dag_h;
307 {
308 RF_AccessStripeMapHeader_t *asmap, *t_asmap;
309 RF_DagHeader_t *nextDag;
310 int i;
311
312 while (dag_h) {
313 nextDag = dag_h->next;
314 for (i=0; i < RF_MAXCHUNKS && dag_h->memChunk[i]; i++) {
315 /* release mem chunks */
316 rf_ReleaseMemChunk(dag_h->memChunk[i]);
317 dag_h->memChunk[i] = NULL;
318 }
319
320 RF_ASSERT(i == dag_h->chunkIndex);
321 if (dag_h->xtraChunkCnt > 0) {
322 /* free xtraMemChunks */
323 for (i=0; i < dag_h->xtraChunkIndex && dag_h->xtraMemChunk[i]; i++) {
324 rf_ReleaseMemChunk(dag_h->xtraMemChunk[i]);
325 dag_h->xtraMemChunk[i] = NULL;
326 }
327 RF_ASSERT(i == dag_h->xtraChunkIndex);
328 /* free ptrs to xtraMemChunks */
329 RF_Free(dag_h->xtraMemChunk, dag_h->xtraChunkCnt * sizeof(RF_ChunkDesc_t *));
330 }
331 rf_FreeAllocList(dag_h->allocList);
332 for (asmap = dag_h->asmList; asmap;) {
333 t_asmap = asmap;
334 asmap = asmap->next;
335 rf_FreeAccessStripeMap(t_asmap);
336 }
337 rf_FreeDAGHeader(dag_h);
338 dag_h = nextDag;
339 }
340 }
341
342 RF_PropHeader_t *rf_MakePropListEntry(
343 RF_DagHeader_t *dag_h,
344 int resultNum,
345 int paramNum,
346 RF_PropHeader_t *next,
347 RF_AllocListElem_t *allocList)
348 {
349 RF_PropHeader_t *p;
350
351 RF_CallocAndAdd(p, 1, sizeof(RF_PropHeader_t),
352 (RF_PropHeader_t *), allocList);
353 p->resultNum = resultNum;
354 p->paramNum = paramNum;
355 p->next = next;
356 return(p);
357 }
358
359 static RF_FreeList_t *rf_dagh_freelist;
360
361 #define RF_MAX_FREE_DAGH 128
362 #define RF_DAGH_INC 16
363 #define RF_DAGH_INITIAL 32
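/*
 * Free list tuning (the values above are the shipped defaults):
 *   RF_MAX_FREE_DAGH - most dag headers ever kept on the free list
 *   RF_DAGH_INC      - how many headers to add when the list grows
 *   RF_DAGH_INITIAL  - headers preallocated by RF_FREELIST_PRIME in
 *                      rf_ConfigureDAGs() below
 */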
364
365 static void rf_ShutdownDAGs(void *);
366 static void rf_ShutdownDAGs(ignored)
367 void *ignored;
368 {
369 RF_FREELIST_DESTROY(rf_dagh_freelist,next,(RF_DagHeader_t *));
370 }
371
372 int rf_ConfigureDAGs(listp)
373 RF_ShutdownList_t **listp;
374 {
375 int rc;
376
377 RF_FREELIST_CREATE(rf_dagh_freelist, RF_MAX_FREE_DAGH,
378 RF_DAGH_INC, sizeof(RF_DagHeader_t));
379 if (rf_dagh_freelist == NULL)
380 return(ENOMEM);
381 rc = rf_ShutdownCreate(listp, rf_ShutdownDAGs, NULL);
382 if (rc) {
383 RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
384 __FILE__, __LINE__, rc);
385 rf_ShutdownDAGs(NULL);
386 return(rc);
387 }
388 RF_FREELIST_PRIME(rf_dagh_freelist, RF_DAGH_INITIAL,next,
389 (RF_DagHeader_t *));
390 return(0);
391 }
392
393 RF_DagHeader_t *rf_AllocDAGHeader()
394 {
395 RF_DagHeader_t *dh;
396
397 RF_FREELIST_GET(rf_dagh_freelist,dh,next,(RF_DagHeader_t *));
398 if (dh) {
399 bzero((char *)dh, sizeof(RF_DagHeader_t));
400 }
401 return(dh);
402 }
403
404 void rf_FreeDAGHeader(RF_DagHeader_t *dh)
405 {
406 RF_FREELIST_FREE(rf_dagh_freelist,dh,next);
407 }
408
409 /* allocates a buffer big enough to hold the data described by pda */
410 void *rf_AllocBuffer(
411 RF_Raid_t *raidPtr,
412 RF_DagHeader_t *dag_h,
413 RF_PhysDiskAddr_t *pda,
414 RF_AllocListElem_t *allocList)
415 {
416 char *p;
417
418 RF_MallocAndAdd(p, pda->numSector << raidPtr->logBytesPerSector,
419 (char *), allocList);
420 return((void *)p);
421 }
422
423 /******************************************************************************
424 *
425 * debug routines
426 *
427 *****************************************************************************/
428
429 char *rf_NodeStatusString(RF_DagNode_t *node)
430 {
431 switch (node->status) {
432 case rf_wait: return("wait");
433 case rf_fired: return("fired");
434 case rf_good: return("good");
435 case rf_bad: return("bad");
436 default: return("?");
437 }
438 }
439
440 void rf_PrintNodeInfoString(RF_DagNode_t *node)
441 {
442 RF_PhysDiskAddr_t *pda;
443 int (*df)(RF_DagNode_t *) = node->doFunc;
444 int i, lk, unlk;
445 void *bufPtr;
446
447 if ((df==rf_DiskReadFunc) || (df==rf_DiskWriteFunc)
448 || (df==rf_DiskReadMirrorIdleFunc)
449 || (df == rf_DiskReadMirrorPartitionFunc))
450 {
451 pda = (RF_PhysDiskAddr_t *)node->params[0].p;
452 bufPtr = (void *)node->params[1].p;
453 lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
454 unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
455 RF_ASSERT( !(lk && unlk) );
456 printf("r %d c %d offs %ld nsect %d buf 0x%lx %s\n", pda->row, pda->col,
457 (long)pda->startSector, (int) pda->numSector, (long)bufPtr,
458 (lk) ? "LOCK" : ((unlk) ? "UNLK" : " "));
459 return;
460 }
461
462 if (df == rf_DiskUnlockFunc) {
463 pda = (RF_PhysDiskAddr_t *)node->params[0].p;
464 lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
465 unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
466 RF_ASSERT( !(lk && unlk) );
467 printf("r %d c %d %s\n", pda->row, pda->col,
468 (lk) ? "LOCK" : ((unlk) ? "UNLK" : "nop"));
469 return;
470 }
471
472 if ((df==rf_SimpleXorFunc) || (df==rf_RegularXorFunc)
473 || (df==rf_RecoveryXorFunc))
474 {
475 printf("result buf 0x%lx\n",(long) node->results[0]);
476 for (i=0; i<node->numParams-1; i+=2) {
477 pda = (RF_PhysDiskAddr_t *)node->params[i].p;
478 bufPtr = (RF_PhysDiskAddr_t *)node->params[i+1].p;
479 printf(" buf 0x%lx r%d c%d offs %ld nsect %d\n",
480 (long)bufPtr, pda->row, pda->col,
481 (long)pda->startSector, (int)pda->numSector);
482 }
483 return;
484 }
485
486 #if RF_INCLUDE_PARITYLOGGING > 0
487 if (df==rf_ParityLogOverwriteFunc || df==rf_ParityLogUpdateFunc) {
488 for (i=0; i<node->numParams-1; i+=2) {
489 pda = (RF_PhysDiskAddr_t *)node->params[i].p;
490 bufPtr = (RF_PhysDiskAddr_t *)node->params[i+1].p;
491 printf(" r%d c%d offs %ld nsect %d buf 0x%lx\n",
492 pda->row, pda->col, (long) pda->startSector,
493 (int) pda->numSector, (long) bufPtr);
494 }
495 return;
496 }
497 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
498
499 if ((df==rf_TerminateFunc) || (df==rf_NullNodeFunc)) {
500 printf("\n");
501 return;
502 }
503
504 printf("?\n");
505 }
506
507 static void rf_RecurPrintDAG(node, depth, unvisited)
508 RF_DagNode_t *node;
509 int depth;
510 int unvisited;
511 {
512 char *anttype;
513 int i;
514
515 node->visited = (unvisited) ? 0 : 1;
516 printf("(%d) %d C%d %s: %s,s%d %d/%d,a%d/%d,p%d,r%d S{", depth,
517 node->nodeNum, node->commitNode, node->name, rf_NodeStatusString(node),
518 node->numSuccedents, node->numSuccFired, node->numSuccDone,
519 node->numAntecedents, node->numAntDone, node->numParams,node->numResults);
520 for (i=0; i<node->numSuccedents; i++) {
521 printf("%d%s", node->succedents[i]->nodeNum,
522 ((i==node->numSuccedents-1) ? "\0" : " "));
523 }
524 printf("} A{");
525 for (i=0; i<node->numAntecedents; i++) {
526 switch (node->antType[i]) {
527 case rf_trueData :
528 anttype = "T";
529 break;
530 case rf_antiData :
531 anttype = "A";
532 break;
533 case rf_outputData :
534 anttype = "O";
535 break;
536 case rf_control :
537 anttype = "C";
538 break;
539 default :
540 anttype = "?";
541 break;
542 }
543 printf("%d(%s)%s", node->antecedents[i]->nodeNum, anttype, (i==node->numAntecedents-1) ? "\0" : " ");
544 }
545 printf("}; ");
546 rf_PrintNodeInfoString(node);
547 for (i=0; i<node->numSuccedents; i++) {
548 if (node->succedents[i]->visited == unvisited)
549 rf_RecurPrintDAG(node->succedents[i], depth+1, unvisited);
550 }
551 }
552
553 static void rf_PrintDAG(dag_h)
554 RF_DagHeader_t *dag_h;
555 {
556 int unvisited, i;
557 char *status;
558
559 /* set dag status */
560 switch (dag_h->status) {
561 case rf_enable :
562 status = "enable";
563 break;
564 case rf_rollForward :
565 status = "rollForward";
566 break;
567 case rf_rollBackward :
568 status = "rollBackward";
569 break;
570 default :
571 status = "illegal!";
572 break;
573 }
574 /* find out if visited bits are currently set or clear */
575 unvisited = dag_h->succedents[0]->visited;
576
577 printf("DAG type: %s\n", dag_h->creator);
578 printf("format is (depth) num commit type: status,nSucc nSuccFired/nSuccDone,nAnte/nAnteDone,nParam,nResult S{x} A{x(type)}; info\n");
579 printf("(0) %d Hdr: %s, s%d, (commit %d/%d) S{", dag_h->nodeNum,
580 status, dag_h->numSuccedents, dag_h->numCommitNodes, dag_h->numCommits);
581 for (i=0; i<dag_h->numSuccedents; i++) {
582 printf("%d%s", dag_h->succedents[i]->nodeNum,
583 ((i==dag_h->numSuccedents-1) ? "\0" : " "));
584 }
585 printf("};\n");
586 for (i=0; i<dag_h->numSuccedents; i++) {
587 if (dag_h->succedents[i]->visited == unvisited)
588 rf_RecurPrintDAG(dag_h->succedents[i], 1, unvisited);
589 }
590 }
591
592 /* assigns node numbers */
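/*
 * Note on the "visited" convention used by this and the other traversals
 * below: the current polarity of the visited bits is read from the first
 * succedent (unvisited = dag_h->succedents[0]->visited) and each node's bit
 * is flipped as it is reached, so repeated traversals never have to clear
 * the bits beforehand.
 */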
593 int rf_AssignNodeNums(RF_DagHeader_t *dag_h)
594 {
595 int unvisited, i, nnum;
596 RF_DagNode_t *node;
597
598 nnum = 0;
599 unvisited = dag_h->succedents[0]->visited;
600
601 dag_h->nodeNum = nnum++;
602 for (i=0; i<dag_h->numSuccedents; i++) {
603 node = dag_h->succedents[i];
604 if (node->visited == unvisited) {
605 nnum = rf_RecurAssignNodeNums(dag_h->succedents[i], nnum, unvisited);
606 }
607 }
608 return(nnum);
609 }
610
611 int rf_RecurAssignNodeNums(node, num, unvisited)
612 RF_DagNode_t *node;
613 int num;
614 int unvisited;
615 {
616 int i;
617
618 node->visited = (unvisited) ? 0 : 1;
619
620 node->nodeNum = num++;
621 for (i=0; i<node->numSuccedents; i++) {
622 if (node->succedents[i]->visited == unvisited) {
623 num = rf_RecurAssignNodeNums(node->succedents[i], num, unvisited);
624 }
625 }
626 return(num);
627 }
628
629 /* set the header pointers in each node to "newptr" */
630 void rf_ResetDAGHeaderPointers(dag_h, newptr)
631 RF_DagHeader_t *dag_h;
632 RF_DagHeader_t *newptr;
633 {
634 int i;
635 for (i=0; i<dag_h->numSuccedents; i++)
636 if (dag_h->succedents[i]->dagHdr != newptr)
637 rf_RecurResetDAGHeaderPointers(dag_h->succedents[i], newptr);
638 }
639
640 void rf_RecurResetDAGHeaderPointers(node, newptr)
641 RF_DagNode_t *node;
642 RF_DagHeader_t *newptr;
643 {
644 int i;
645 node->dagHdr = newptr;
646 for (i=0; i<node->numSuccedents; i++)
647 if (node->succedents[i]->dagHdr != newptr)
648 rf_RecurResetDAGHeaderPointers(node->succedents[i], newptr);
649 }
650
651
652 void rf_PrintDAGList(RF_DagHeader_t *dag_h)
653 {
654 int i=0;
655
656 for (; dag_h; dag_h=dag_h->next) {
657 rf_AssignNodeNums(dag_h);
658 printf("\n\nDAG %d IN LIST:\n",i++);
659 rf_PrintDAG(dag_h);
660 }
661 }
662
663 static int rf_ValidateBranch(node, scount, acount, nodes, unvisited)
664 RF_DagNode_t *node;
665 int *scount;
666 int *acount;
667 RF_DagNode_t **nodes;
668 int unvisited;
669 {
670 int i, retcode = 0;
671
672 /* construct an array of node pointers indexed by node num */
673 node->visited = (unvisited) ? 0 : 1;
674 nodes[ node->nodeNum ] = node;
675
676 if (node->next != NULL) {
677 printf("INVALID DAG: next pointer in node is not NULL\n");
678 retcode = 1;
679 }
680 if (node->status != rf_wait) {
681 printf("INVALID DAG: Node status is not wait\n");
682 retcode = 1;
683 }
684 if (node->numAntDone != 0) {
685 printf("INVALID DAG: numAntDone is not zero\n");
686 retcode = 1;
687 }
688 if (node->doFunc == rf_TerminateFunc) {
689 if (node->numSuccedents != 0) {
690 printf("INVALID DAG: Terminator node has succedents\n");
691 retcode = 1;
692 }
693 } else {
694 if (node->numSuccedents == 0) {
695 printf("INVALID DAG: Non-terminator node has no succedents\n");
696 retcode = 1;
697 }
698 }
699 for (i=0; i<node->numSuccedents; i++) {
700 if (!node->succedents[i]) {
701 printf("INVALID DAG: succedent %d of node %s is NULL\n",i,node->name);
702 retcode = 1;
703 }
704 scount[ node->succedents[i]->nodeNum ]++;
705 }
706 for (i=0; i<node->numAntecedents; i++) {
707 if (!node->antecedents[i]) {
708 printf("INVALID DAG: antecedent %d of node %s is NULL\n",i,node->name);
709 retcode = 1;
710 }
711 acount[ node->antecedents[i]->nodeNum ]++;
712 }
713 for (i=0; i<node->numSuccedents; i++) {
714 if (node->succedents[i]->visited == unvisited) {
715 if (rf_ValidateBranch(node->succedents[i], scount,
716 acount, nodes, unvisited))
717 {
718 retcode = 1;
719 }
720 }
721 }
722 return(retcode);
723 }
724
725 static void rf_ValidateBranchVisitedBits(node, unvisited, rl)
726 RF_DagNode_t *node;
727 int unvisited;
728 int rl;
729 {
730 int i;
731
732 RF_ASSERT(node->visited == unvisited);
733 for (i=0; i<node->numSuccedents; i++) {
734 if (node->succedents[i] == NULL) {
735 printf("node=%lx node->succedents[%d] is NULL\n", (long)node, i);
736 RF_ASSERT(0);
737 }
738 rf_ValidateBranchVisitedBits(node->succedents[i],unvisited, rl+1);
739 }
740 }
741
742 /* NOTE: never call this on a big dag: the recursion never flips the visited
743 * bits, so shared subgraphs are re-walked once per path to them and execution
744 * time is exponential in the size of the dag */
745 static void rf_ValidateVisitedBits(dag)
746 RF_DagHeader_t *dag;
747 {
748 int i, unvisited;
749
750 unvisited = dag->succedents[0]->visited;
751
752 for (i=0; i<dag->numSuccedents; i++) {
753 if (dag->succedents[i] == NULL) {
754 printf("dag=%lx dag->succedents[%d] is NULL\n", (long) dag, i);
755 RF_ASSERT(0);
756 }
757 rf_ValidateBranchVisitedBits(dag->succedents[i],unvisited,0);
758 }
759 }
760
761 /* validate a DAG. _at entry_ verify that:
762 * -- numNodesCompleted is zero
763 * -- node queue is null
764 * -- dag status is rf_enable
765 * -- next pointer is null on every node
766 * -- all nodes have status wait
767 * -- numAntDone is zero in all nodes
768 * -- terminator node has zero successors
769 * -- no other node besides terminator has zero successors
770 * -- no successor or antecedent pointer in a node is NULL
771 * -- number of times that each node appears as a successor of another node
772 * is equal to the antecedent count on that node
773 * -- number of times that each node appears as an antecedent of another node
774 * is equal to the succedent count on that node
775 * -- what else?
776 */
777 int rf_ValidateDAG(dag_h)
778 RF_DagHeader_t *dag_h;
779 {
780 int i, nodecount;
781 int *scount, *acount; /* per-node successor and antecedent counts */
782 RF_DagNode_t **nodes; /* array of ptrs to nodes in dag */
783 int retcode = 0;
784 int unvisited;
785 int commitNodeCount = 0;
786
787 if (rf_validateVisitedDebug)
788 rf_ValidateVisitedBits(dag_h);
789
790 if (dag_h->numNodesCompleted != 0) {
791 printf("INVALID DAG: num nodes completed is %d, should be 0\n",dag_h->numNodesCompleted);
792 retcode = 1; goto validate_dag_bad;
793 }
794 if (dag_h->status != rf_enable) {
795 printf("INVALID DAG: not enabled\n");
796 retcode = 1; goto validate_dag_bad;
797 }
798 if (dag_h->numCommits != 0) {
799 printf("INVALID DAG: numCommits != 0 (%d)\n",dag_h->numCommits);
800 retcode = 1; goto validate_dag_bad;
801 }
802 if (dag_h->numSuccedents != 1) {
803 /* currently, all dags must have only one succedent */
804 printf("INVALID DAG: numSuccedents !1 (%d)\n",dag_h->numSuccedents);
805 retcode = 1; goto validate_dag_bad;
806 }
807 nodecount = rf_AssignNodeNums(dag_h);
808
809 unvisited = dag_h->succedents[0]->visited;
810
811 RF_Calloc(scount, nodecount, sizeof(int), (int *));
812 RF_Calloc(acount, nodecount, sizeof(int), (int *));
813 RF_Calloc(nodes, nodecount, sizeof(RF_DagNode_t *), (RF_DagNode_t **));
814 for (i=0; i<dag_h->numSuccedents; i++) {
815 if ((dag_h->succedents[i]->visited == unvisited)
816 && rf_ValidateBranch(dag_h->succedents[i], scount,
817 acount, nodes, unvisited))
818 {
819 retcode = 1;
820 }
821 }
822 /* start at 1 to skip the header node */
823 for (i=1; i<nodecount; i++) {
824 if ( nodes[i]->commitNode )
825 commitNodeCount++;
826 if ( nodes[i]->doFunc == NULL ) {
827 printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
828 retcode = 1;
829 goto validate_dag_out;
830 }
831 if ( nodes[i]->undoFunc == NULL ) {
832 printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
833 retcode = 1;
834 goto validate_dag_out;
835 }
836 if ( nodes[i]->numAntecedents != scount[ nodes[i]->nodeNum ] ) {
837 printf("INVALID DAG: node %s has %d antecedents but appears as a succedent %d times\n",
838 nodes[i]->name, nodes[i]->numAntecedents, scount[nodes[i]->nodeNum]);
839 retcode = 1;
840 goto validate_dag_out;
841 }
842 if ( nodes[i]->numSuccedents != acount[ nodes[i]->nodeNum ] ) {
843 printf("INVALID DAG: node %s has %d succedents but appears as an antecedent %d times\n",
844 nodes[i]->name, nodes[i]->numSuccedents, acount[nodes[i]->nodeNum]);
845 retcode = 1;
846 goto validate_dag_out;
847 }
848 }
849
850 if ( dag_h->numCommitNodes != commitNodeCount ) {
851 printf("INVALID DAG: incorrect commit node count. hdr->numCommitNodes (%d) found (%d) commit nodes in graph\n",
852 dag_h->numCommitNodes, commitNodeCount);
853 retcode = 1;
854 goto validate_dag_out;
855 }
856
857 validate_dag_out:
858 RF_Free(scount, nodecount*sizeof(int));
859 RF_Free(acount, nodecount*sizeof(int));
860 RF_Free(nodes, nodecount*sizeof(RF_DagNode_t *));
861 if (retcode)
862 rf_PrintDAGList(dag_h);
863
864 if (rf_validateVisitedDebug)
865 rf_ValidateVisitedBits(dag_h);
866
867 return(retcode);
868
869 validate_dag_bad:
870 rf_PrintDAGList(dag_h);
871 return(retcode);
872 }
873
874
875 /******************************************************************************
876 *
877 * misc construction routines
878 *
879 *****************************************************************************/
880
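/*
 * rf_redirect_asm -- redirect any physical disk addresses in the access
 * that target the failed disk.  Under distributed sparing the layout's
 * MapSector/MapParity routines are re-invoked with RF_REMAP; otherwise the
 * pda is simply pointed at the dedicated spare [spareRow, spareCol].
 */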
881 void rf_redirect_asm(
882 RF_Raid_t *raidPtr,
883 RF_AccessStripeMap_t *asmap)
884 {
885 int ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) ? 1 : 0;
886 int row = asmap->physInfo->row;
887 int fcol = raidPtr->reconControl[row]->fcol;
888 int srow = raidPtr->reconControl[row]->spareRow;
889 int scol = raidPtr->reconControl[row]->spareCol;
890 RF_PhysDiskAddr_t *pda;
891
892 RF_ASSERT( raidPtr->status[row] == rf_rs_reconstructing );
893 for (pda = asmap->physInfo; pda; pda=pda->next) {
894 if (pda->col == fcol) {
895 if (rf_dagDebug) {
896 if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap,
897 pda->startSector))
898 {
899 RF_PANIC();
900 }
901 }
902 /*printf("Remapped data for large write\n");*/
903 if (ds) {
904 raidPtr->Layout.map->MapSector(raidPtr, pda->raidAddress,
905 &pda->row, &pda->col, &pda->startSector, RF_REMAP);
906 }
907 else {
908 pda->row = srow; pda->col = scol;
909 }
910 }
911 }
912 for (pda = asmap->parityInfo; pda; pda=pda->next) {
913 if (pda->col == fcol) {
914 if (rf_dagDebug) {
915 if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, pda->startSector)) {
916 RF_PANIC();
917 }
918 }
919 }
920 if (ds) {
921 (raidPtr->Layout.map->MapParity)(raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP);
922 }
923 else {
924 pda->row = srow; pda->col = scol;
925 }
926 }
927 }
928
929
930 /* this routine allocates read buffers and generates stripe maps for the
931 * regions of the array from the start of the stripe to the start of the
932 * access, and from the end of the access to the end of the stripe. It also
933 * computes and returns the number of DAG nodes needed to read all this data.
934 * Note that this routine does the wrong thing if the access is fully
935 * contained within one stripe unit, so we RF_ASSERT against this case at the
936 * start.
937 */
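/*
 * For example (sector numbers are illustrative only): with a stripe that
 * spans raid addresses [0, 120) and an access covering [40, 100), the
 * "sos" map below describes [0, 40) and the "eos" map describes [100, 120);
 * either map is omitted when the access already starts or ends on a stripe
 * boundary.
 */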
938 void rf_MapUnaccessedPortionOfStripe(
939 RF_Raid_t *raidPtr,
940 RF_RaidLayout_t *layoutPtr, /* in: layout information */
941 RF_AccessStripeMap_t *asmap, /* in: access stripe map */
942 RF_DagHeader_t *dag_h, /* in: header of the dag to create */
943 RF_AccessStripeMapHeader_t **new_asm_h, /* in: ptr to array of 2 headers, to be filled in */
944 int *nRodNodes, /* out: num nodes to be generated to read unaccessed data */
945 char **sosBuffer, /* out: pointers to newly allocated buffer */
946 char **eosBuffer,
947 RF_AllocListElem_t *allocList)
948 {
949 RF_RaidAddr_t sosRaidAddress, eosRaidAddress;
950 RF_SectorNum_t sosNumSector, eosNumSector;
951
952 RF_ASSERT( asmap->numStripeUnitsAccessed > (layoutPtr->numDataCol/2) );
953 /* generate an access map for the region of the array from start of stripe
954 * to start of access */
955 new_asm_h[0] = new_asm_h[1] = NULL; *nRodNodes = 0;
956 if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->raidAddress)) {
957 sosRaidAddress = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
958 sosNumSector = asmap->raidAddress - sosRaidAddress;
959 RF_MallocAndAdd(*sosBuffer, rf_RaidAddressToByte(raidPtr, sosNumSector), (char *), allocList);
960 new_asm_h[0] = rf_MapAccess(raidPtr, sosRaidAddress, sosNumSector, *sosBuffer, RF_DONT_REMAP);
961 new_asm_h[0]->next = dag_h->asmList;
962 dag_h->asmList = new_asm_h[0];
963 *nRodNodes += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
964
965 RF_ASSERT(new_asm_h[0]->stripeMap->next == NULL);
966 /* we're totally within one stripe here */
967 if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
968 rf_redirect_asm(raidPtr, new_asm_h[0]->stripeMap);
969 }
970 /* generate an access map for the region of the array from end of access
971 * to end of stripe */
972 if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->endRaidAddress)) {
973 eosRaidAddress = asmap->endRaidAddress;
974 eosNumSector = rf_RaidAddressOfNextStripeBoundary(layoutPtr, eosRaidAddress) - eosRaidAddress;
975 RF_MallocAndAdd(*eosBuffer, rf_RaidAddressToByte(raidPtr, eosNumSector), (char *), allocList);
976 new_asm_h[1] = rf_MapAccess(raidPtr, eosRaidAddress, eosNumSector, *eosBuffer, RF_DONT_REMAP);
977 new_asm_h[1]->next = dag_h->asmList;
978 dag_h->asmList = new_asm_h[1];
979 *nRodNodes += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
980
981 RF_ASSERT(new_asm_h[1]->stripeMap->next == NULL);
982 /* we're totally within one stripe here */
983 if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
984 rf_redirect_asm(raidPtr, new_asm_h[1]->stripeMap);
985 }
986 }
987
988
989
990 /* returns non-zero if the indicated ranges of stripe unit offsets overlap */
991 int rf_PDAOverlap(
992 RF_RaidLayout_t *layoutPtr,
993 RF_PhysDiskAddr_t *src,
994 RF_PhysDiskAddr_t *dest)
995 {
996 RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
997 RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
998 /* use -1 to be sure we stay within SU */
999 RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector-1);
1000 RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector-1);
1001 return( (RF_MAX(soffs,doffs) <= RF_MIN(send,dend)) ? 1 : 0 );
1002 }
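/*
 * This is the usual closed-interval overlap test: [soffs, send] and
 * [doffs, dend] intersect iff max(start) <= min(end).  E.g. (illustrative
 * offsets) a src covering SU offsets 10..19 and a dest covering 15..30
 * overlap because max(10,15) = 15 <= min(19,30) = 19.
 */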
1003
1004
1005 /* GenerateFailedAccessASMs
1006 *
1007 * this routine figures out what portion of the stripe needs to be read
1008 * to effect the degraded read or write operation. Its primary function
1009 * is to identify everything required to recover the data, and then
1010 * eliminate anything that is already being accessed by the user.
1011 *
1012 * The main result is two new ASMs, one for the region from the start of the
1013 * stripe to the start of the access, and one for the region from the end of
1014 * the access to the end of the stripe. These ASMs describe everything that
1015 * needs to be read to effect the degraded access. Other results are:
1016 * nXorBufs -- the total number of buffers that need to be XORed together to
1017 * recover the lost data,
1018 * rpBufPtr -- ptr to a newly-allocated buffer to hold the parity. If NULL
1019 * at entry, not allocated.
1020 * overlappingPDAs --
1021 * describes which of the non-failed PDAs in the user access
1022 * overlap data that needs to be read to effect recovery.
1023 * overlappingPDAs[i]==1 if and only if, neglecting the failed
1024 * PDA, the ith pda in the input asm overlaps data that needs
1025 * to be read for recovery.
1026 */
1027 /* in: asm - ASM for the actual access, one stripe only */
1028 /* in: failedPDA - which component of the access has failed */
1029 /* in: dag_h - header of the DAG we're going to create */
1030 /* out: new_asm_h - the two new ASMs */
1031 /* out: nXorBufs - the total number of xor bufs required */
1032 /* out: rpBufPtr - a buffer for the parity read */
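/*
 * Note for callers: overlappingPDAs must have one entry per pda in
 * asmap->physInfo and must be zeroed by the caller; this routine only ever
 * sets entries to 1, and the failed pda itself is skipped without
 * consuming an entry.
 */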
1033 void rf_GenerateFailedAccessASMs(
1034 RF_Raid_t *raidPtr,
1035 RF_AccessStripeMap_t *asmap,
1036 RF_PhysDiskAddr_t *failedPDA,
1037 RF_DagHeader_t *dag_h,
1038 RF_AccessStripeMapHeader_t **new_asm_h,
1039 int *nXorBufs,
1040 char **rpBufPtr,
1041 char *overlappingPDAs,
1042 RF_AllocListElem_t *allocList)
1043 {
1044 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
1045
1046 /* s=start, e=end, s=stripe, a=access, f=failed, su=stripe unit */
1047 RF_RaidAddr_t sosAddr, sosEndAddr, eosStartAddr, eosAddr;
1048
1049 RF_SectorCount_t numSect[2], numParitySect;
1050 RF_PhysDiskAddr_t *pda;
1051 char *rdBuf, *bufP;
1052 int foundit, i;
1053
1054 bufP = NULL;
1055 foundit = 0;
1056 /* first compute the following raid addresses:
1057 start of stripe, (sosAddr)
1058 MIN(start of access, start of failed SU), (sosEndAddr)
1059 MAX(end of access, end of failed SU), (eosStartAddr)
1060 end of stripe (i.e. start of next stripe) (eosAddr)
1061 */
1062 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
1063 sosEndAddr = RF_MIN(asmap->raidAddress, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,failedPDA->raidAddress));
1064 eosStartAddr = RF_MAX(asmap->endRaidAddress, rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, failedPDA->raidAddress));
1065 eosAddr = rf_RaidAddressOfNextStripeBoundary(layoutPtr, asmap->raidAddress);
1066
1067 /* now generate access stripe maps for each of the above regions of the
1068 * stripe. Use a dummy (NULL) buf ptr for now */
1069
1070 new_asm_h[0] = (sosAddr != sosEndAddr) ? rf_MapAccess(raidPtr, sosAddr, sosEndAddr-sosAddr, NULL, RF_DONT_REMAP) : NULL;
1071 new_asm_h[1] = (eosStartAddr != eosAddr) ? rf_MapAccess(raidPtr, eosStartAddr, eosAddr-eosStartAddr, NULL, RF_DONT_REMAP) : NULL;
1072
1073 /* walk through the PDAs and range-restrict each SU to the region of the
1074 * SU touched on the failed PDA. also compute total data buffer space
1075 * requirements in this step. Ignore the parity for now. */
1076
1077 numSect[0] = numSect[1] = 0;
1078 if (new_asm_h[0]) {
1079 new_asm_h[0]->next = dag_h->asmList; dag_h->asmList = new_asm_h[0];
1080 for (pda = new_asm_h[0]->stripeMap->physInfo; pda; pda = pda->next) {
1081 rf_RangeRestrictPDA(raidPtr,failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); numSect[0] += pda->numSector;
1082 }
1083 }
1084 if (new_asm_h[1]) {
1085 new_asm_h[1]->next = dag_h->asmList; dag_h->asmList = new_asm_h[1];
1086 for (pda = new_asm_h[1]->stripeMap->physInfo; pda; pda = pda->next) {
1087 rf_RangeRestrictPDA(raidPtr,failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); numSect[1] += pda->numSector;
1088 }
1089 }
1090 numParitySect = failedPDA->numSector;
1091
1092 /* allocate buffer space for the data & parity we have to read to recover
1093 * from the failure */
1094
1095 if (numSect[0]+numSect[1]+ ((rpBufPtr) ? numParitySect : 0)) { /* don't allocate parity buf if not needed */
1096 RF_MallocAndAdd(rdBuf, rf_RaidAddressToByte(raidPtr,numSect[0]+numSect[1]+numParitySect), (char *), allocList);
1097 bufP = rdBuf;
1098 if (rf_degDagDebug) printf("Newly allocated buffer (%d bytes) is 0x%lx\n",
1099 (int)rf_RaidAddressToByte(raidPtr,numSect[0]+numSect[1]+numParitySect), (unsigned long) bufP);
1100 }
1101
1102 /* now walk through the pdas one last time and assign buffer pointers
1103 * (ugh!). Again, ignore the parity. also, count nodes to find out how
1104 * many bufs need to be xored together */
1105 (*nXorBufs) = 1; /* in read case, 1 is for parity. In write case, 1 is for failed data */
1106 if (new_asm_h[0]) {
1107 for (pda=new_asm_h[0]->stripeMap->physInfo; pda; pda=pda->next) {pda->bufPtr = bufP; bufP += rf_RaidAddressToByte(raidPtr,pda->numSector);}
1108 *nXorBufs += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
1109 }
1110 if (new_asm_h[1]) {
1111 for (pda=new_asm_h[1]->stripeMap->physInfo; pda; pda=pda->next) {pda->bufPtr = bufP; bufP += rf_RaidAddressToByte(raidPtr,pda->numSector);}
1112 (*nXorBufs) += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
1113 }
1114 if (rpBufPtr) *rpBufPtr = bufP; /* the rest of the buffer is for parity */
1115
1116 /* the last step is to figure out how many more distinct buffers need to
1117 * get xor'd to produce the missing unit. there's one for each user-data
1118 * read node that overlaps the portion of the failed unit being accessed */
1119
1120 for (foundit=i=0,pda=asmap->physInfo; pda; i++,pda=pda->next) {
1121 if (pda == failedPDA) {i--; foundit=1; continue;}
1122 if (rf_PDAOverlap(layoutPtr, pda, failedPDA)) {
1123 overlappingPDAs[i] = 1;
1124 (*nXorBufs)++;
1125 }
1126 }
1127 if (!foundit) {RF_ERRORMSG("GenerateFailedAccessASMs: did not find failedPDA in asm list\n"); RF_ASSERT(0);}
1128
1129 if (rf_degDagDebug) {
1130 if (new_asm_h[0]) {
1131 printf("First asm:\n"); rf_PrintFullAccessStripeMap(new_asm_h[0], 1);
1132 }
1133 if (new_asm_h[1]) {
1134 printf("Second asm:\n"); rf_PrintFullAccessStripeMap(new_asm_h[1], 1);
1135 }
1136 }
1137 }
1138
1139
1140 /* adjusts the offset and number of sectors in the destination pda so that
1141 * it covers at most the region of the SU covered by the source PDA. This
1142 * is exclusively a restriction: the number of sectors indicated by the
1143 * target PDA can only shrink.
1144 *
1145 * For example: s = sectors within SU indicated by source PDA
1146 * d = sectors within SU indicated by dest PDA
1147 * r = results, stored in dest PDA
1148 *
1149 * |--------------- one stripe unit ---------------------|
1150 * | sssssssssssssssssssssssssssssssss |
1151 * | ddddddddddddddddddddddddddddddddddddddddddddd |
1152 * | rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr |
1153 *
1154 * Another example:
1155 *
1156 * |--------------- one stripe unit ---------------------|
1157 * | sssssssssssssssssssssssssssssssss |
1158 * | ddddddddddddddddddddddd |
1159 * | rrrrrrrrrrrrrrrr |
1160 *
1161 */
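/*
 * Numeric example (illustrative, 64-sector stripe units): if src covers SU
 * offsets 8..39 and dest covers offsets 0..63, the result leaves dest
 * covering offsets 8..39, i.e. startSector advances by 8 and numSector
 * shrinks to 32; with dobuffer set, bufPtr advances by 8 sectors' worth
 * of bytes.
 */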
1162 void rf_RangeRestrictPDA(
1163 RF_Raid_t *raidPtr,
1164 RF_PhysDiskAddr_t *src,
1165 RF_PhysDiskAddr_t *dest,
1166 int dobuffer,
1167 int doraidaddr)
1168 {
1169 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1170 RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
1171 RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
1172 RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector-1); /* use -1 to be sure we stay within SU */
1173 RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector-1);
1174 RF_SectorNum_t subAddr = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->startSector); /* stripe unit boundary */
1175
1176 dest->startSector = subAddr + RF_MAX(soffs,doffs);
1177 dest->numSector = subAddr + RF_MIN(send,dend) + 1 - dest->startSector;
1178
1179 if (dobuffer)
1180 dest->bufPtr += (soffs > doffs) ? rf_RaidAddressToByte(raidPtr,soffs-doffs) : 0;
1181 if (doraidaddr) {
1182 dest->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->raidAddress) +
1183 rf_StripeUnitOffset(layoutPtr, dest->startSector);
1184 }
1185 }
1186
1187 /*
1188 * Want the highest of these primes to be the largest one
1189 * less than the max expected number of columns (won't hurt
1190 * to be too small or too large, but won't be optimal, either)
1191 * --jimz
1192 */
1193 #define NLOWPRIMES 8
1194 static int lowprimes[NLOWPRIMES] = {2,3,5,7,11,13,17,19};
1195
1196 /*****************************************************************************
1197 * compute the workload shift factor. (chained declustering)
1198 *
1199 * return nonzero if access should shift to secondary, otherwise,
1200 * access is to primary
1201 *****************************************************************************/
1202 int rf_compute_workload_shift(
1203 RF_Raid_t *raidPtr,
1204 RF_PhysDiskAddr_t *pda)
1205 {
1206 /*
1207 * variables:
1208 * d = column of disk containing primary
1209 * f = column of failed disk
1210 * n = number of disks in array
1211 * sd = "shift distance" (number of columns that d is to the right of f)
1212 * row = row of array the access is in
1213 * v = numerator of redirection ratio
1214 * k = denominator of redirection ratio
1215 */
1216 RF_RowCol_t d, f, sd, row, n;
1217 int k, v, ret, i;
1218
1219 row = pda->row;
1220 n = raidPtr->numCol;
1221
1222 /* assign column of primary copy to d */
1223 d = pda->col;
1224
1225 /* assign column of dead disk to f */
1226 for(f=0;((f<n)&&(!RF_DEAD_DISK(raidPtr->Disks[row][f].status)));f++);
1227
1228 RF_ASSERT(f < n);
1229 RF_ASSERT(f != d);
1230
1231 sd = (f > d) ? (n + d - f) : (d - f);
1232 RF_ASSERT(sd < n);
1233
1234 /*
1235 * v of every k accesses should be redirected
1236 *
1237 * v/k := (n-1-sd)/(n-1)
1238 */
1239 v = (n-1-sd);
1240 k = (n-1);
1241
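/*
 * Worked example (numbers are illustrative): n = 5 disks, failed disk
 * f = 1, primary copy on d = 3, so sd = 2 and v/k = (5-1-2)/(5-1) = 2/4,
 * which the loop below reduces to 1/2: one of every two accesses whose
 * primary lives on disk d is redirected to the secondary copy.
 */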
1242 #if 1
1243 /*
1244 * XXX
1245 * Is this worth it?
1246 *
1247 * Now reduce the fraction, by repeatedly factoring
1248 * out primes (just like they teach in elementary school!)
1249 */
1250 for(i=0;i<NLOWPRIMES;i++) {
1251 if (lowprimes[i] > v)
1252 break;
1253 while (((v%lowprimes[i])==0) && ((k%lowprimes[i])==0)) {
1254 v /= lowprimes[i];
1255 k /= lowprimes[i];
1256 }
1257 }
1258 #endif
1259
1260 raidPtr->hist_diskreq[row][d]++;
1261 if (raidPtr->hist_diskreq[row][d] > v) {
1262 ret = 0; /* do not redirect */
1263 }
1264 else {
1265 ret = 1; /* redirect */
1266 }
1267
1268 #if 0
1269 printf("d=%d f=%d sd=%d v=%d k=%d ret=%d h=%d\n", d, f, sd, v, k, ret,
1270 raidPtr->hist_diskreq[row][d]);
1271 #endif
1272
1273 if (raidPtr->hist_diskreq[row][d] >= k) {
1274 /* reset counter */
1275 raidPtr->hist_diskreq[row][d] = 0;
1276 }
1277
1278 return(ret);
1279 }
1280
1281 /*
1282 * Disk selection routines
1283 */
1284
1285 /*
1286 * Selects the disk with the shortest queue from a mirror pair.
1287 * Both the disk I/Os queued in RAIDframe as well as those at the physical
1288 * disk are counted as members of the "queue"
1289 */
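/*
 * Ties (equal combined queue lengths) are broken by comparing, via
 * SNUM_DIFF above, whose most recently dequeued sector lies closer to the
 * requested start sector, i.e. by a rough seek-distance estimate.
 */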
1290 void rf_SelectMirrorDiskIdle(RF_DagNode_t *node)
1291 {
1292 RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
1293 RF_RowCol_t rowData, colData, rowMirror, colMirror;
1294 int dataQueueLength, mirrorQueueLength, usemirror;
1295 RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *)node->params[0].p;
1296 RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *)node->params[4].p;
1297 RF_PhysDiskAddr_t *tmp_pda;
1298 RF_RaidDisk_t **disks = raidPtr->Disks;
1299 RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
1300
1301 /* return the [row col] of the disk with the shortest queue */
1302 rowData = data_pda->row;
1303 colData = data_pda->col;
1304 rowMirror = mirror_pda->row;
1305 colMirror = mirror_pda->col;
1306 dataQueue = &(dqs[rowData][colData]);
1307 mirrorQueue = &(dqs[rowMirror][colMirror]);
1308
1309 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1310 RF_LOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
1311 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1312 dataQueueLength = dataQueue->queueLength + dataQueue->numOutstanding;
1313 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1314 RF_UNLOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
1315 RF_LOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
1316 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1317 mirrorQueueLength = mirrorQueue->queueLength + mirrorQueue->numOutstanding;
1318 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1319 RF_UNLOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
1320 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1321
1322 usemirror = 0;
1323 if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
1324 usemirror = 0;
1325 }
1326 else if (RF_DEAD_DISK(disks[rowData][colData].status)) {
1327 usemirror = 1;
1328 }
1329 else if (dataQueueLength < mirrorQueueLength) {
1330 usemirror = 0;
1331 }
1332 else if (mirrorQueueLength < dataQueueLength) {
1333 usemirror = 1;
1334 }
1335 else {
1336 /* queues are equal length. attempt cleverness. */
1337 if (SNUM_DIFF(dataQueue->last_deq_sector,data_pda->startSector)
1338 <= SNUM_DIFF(mirrorQueue->last_deq_sector,mirror_pda->startSector))
1339 {
1340 usemirror = 0;
1341 }
1342 else {
1343 usemirror = 1;
1344 }
1345 }
1346
1347 if (usemirror) {
1348 /* use mirror (parity) disk, swap params 0 & 4 */
1349 tmp_pda = data_pda;
1350 node->params[0].p = mirror_pda;
1351 node->params[4].p = tmp_pda;
1352 }
1353 else {
1354 /* use data disk, leave param 0 unchanged */
1355 }
1356 /* printf("dataQueueLength %d, mirrorQueueLength %d\n",dataQueueLength, mirrorQueueLength); */
1357 }
1358
1359 /*
1360 * Do simple partitioning. This assumes that
1361 * the data and parity disks are laid out identically.
1362 */
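/*
 * "Partitioning" here means statically splitting the block range: accesses
 * to the lower half of the primary disk's blocks go to the primary, those
 * to the upper half go to the mirror, unless one of the two disks is dead.
 */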
1363 void rf_SelectMirrorDiskPartition(RF_DagNode_t *node)
1364 {
1365 RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
1366 RF_RowCol_t rowData, colData, rowMirror, colMirror;
1367 RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *)node->params[0].p;
1368 RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *)node->params[4].p;
1369 RF_PhysDiskAddr_t *tmp_pda;
1370 RF_RaidDisk_t **disks = raidPtr->Disks;
1371 RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
1372 int usemirror;
1373
1374 /* choose the copy based on which half of the disk's block range the access targets */
1375 rowData = data_pda->row;
1376 colData = data_pda->col;
1377 rowMirror = mirror_pda->row;
1378 colMirror = mirror_pda->col;
1379 dataQueue = &(dqs[rowData][colData]);
1380 mirrorQueue = &(dqs[rowMirror][colMirror]);
1381
1382 usemirror = 0;
1383 if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
1384 usemirror = 0;
1385 }
1386 else if (RF_DEAD_DISK(disks[rowData][colData].status)) {
1387 usemirror = 1;
1388 }
1389 else if (data_pda->startSector < (disks[rowData][colData].numBlocks / 2)) {
1390 usemirror = 0;
1391 }
1392 else {
1393 usemirror = 1;
1394 }
1395
1396 if (usemirror) {
1397 /* use mirror (parity) disk, swap params 0 & 4 */
1398 tmp_pda = data_pda;
1399 node->params[0].p = mirror_pda;
1400 node->params[4].p = tmp_pda;
1401 }
1402 else {
1403 /* use data disk, leave param 0 unchanged */
1404 }
1405 }
1406