rf_dagutils.c revision 1.2 1 1.2 oster /* $NetBSD: rf_dagutils.c,v 1.2 1999/01/26 02:33:54 oster Exp $ */
2 1.1 oster /*
3 1.1 oster * Copyright (c) 1995 Carnegie-Mellon University.
4 1.1 oster * All rights reserved.
5 1.1 oster *
6 1.1 oster * Authors: Mark Holland, William V. Courtright II, Jim Zelenka
7 1.1 oster *
8 1.1 oster * Permission to use, copy, modify and distribute this software and
9 1.1 oster * its documentation is hereby granted, provided that both the copyright
10 1.1 oster * notice and this permission notice appear in all copies of the
11 1.1 oster * software, derivative works or modified versions, and any portions
12 1.1 oster * thereof, and that both notices appear in supporting documentation.
13 1.1 oster *
14 1.1 oster * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 1.1 oster * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 1.1 oster * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 1.1 oster *
18 1.1 oster * Carnegie Mellon requests users of this software to return to
19 1.1 oster *
20 1.1 oster * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 1.1 oster * School of Computer Science
22 1.1 oster * Carnegie Mellon University
23 1.1 oster * Pittsburgh PA 15213-3890
24 1.1 oster *
25 1.1 oster * any improvements or extensions that they make and grant Carnegie the
26 1.1 oster * rights to redistribute these changes.
27 1.1 oster */
28 1.1 oster
29 1.1 oster /******************************************************************************
30 1.1 oster *
31 1.1 oster * rf_dagutils.c -- utility routines for manipulating dags
32 1.1 oster *
33 1.1 oster *****************************************************************************/
34 1.1 oster
35 1.1 oster #include "rf_archs.h"
36 1.1 oster #include "rf_types.h"
37 1.1 oster #include "rf_threadstuff.h"
38 1.1 oster #include "rf_raid.h"
39 1.1 oster #include "rf_dag.h"
40 1.1 oster #include "rf_dagutils.h"
41 1.1 oster #include "rf_dagfuncs.h"
42 1.1 oster #include "rf_general.h"
43 1.1 oster #include "rf_freelist.h"
44 1.1 oster #include "rf_map.h"
45 1.1 oster #include "rf_shutdown.h"
46 1.1 oster #include "rf_sys.h"
47 1.1 oster
48 1.1 oster #define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_)))
49 1.1 oster
50 1.1 oster RF_RedFuncs_t rf_xorFuncs = {
51 1.1 oster rf_RegularXorFunc, "Reg Xr",
52 1.1 oster rf_SimpleXorFunc, "Simple Xr"};
53 1.1 oster
54 1.1 oster RF_RedFuncs_t rf_xorRecoveryFuncs = {
55 1.1 oster rf_RecoveryXorFunc, "Recovery Xr",
56 1.1 oster rf_RecoveryXorFunc, "Recovery Xr"};
57 1.1 oster
58 1.1 oster static void rf_RecurPrintDAG(RF_DagNode_t *, int, int);
59 1.1 oster static void rf_PrintDAG(RF_DagHeader_t *);
60 1.1 oster static int rf_ValidateBranch(RF_DagNode_t *, int *, int *,
61 1.1 oster RF_DagNode_t **, int );
62 1.1 oster static void rf_ValidateBranchVisitedBits(RF_DagNode_t *, int, int);
63 1.1 oster static void rf_ValidateVisitedBits(RF_DagHeader_t *);
64 1.1 oster
65 1.1 oster /******************************************************************************
66 1.1 oster *
67 1.1 oster * InitNode - initialize a dag node
68 1.1 oster *
69 1.1 oster * the size of the propList array is always the same as that of the
70 1.1 oster * successors array.
71 1.1 oster *
72 1.1 oster *****************************************************************************/
73 1.1 oster void rf_InitNode(
74 1.1 oster RF_DagNode_t *node,
75 1.1 oster RF_NodeStatus_t initstatus,
76 1.1 oster int commit,
77 1.1 oster int (*doFunc)(RF_DagNode_t *node),
78 1.1 oster int (*undoFunc)(RF_DagNode_t *node),
79 1.1 oster int (*wakeFunc)(RF_DagNode_t *node,int status),
80 1.1 oster int nSucc,
81 1.1 oster int nAnte,
82 1.1 oster int nParam,
83 1.1 oster int nResult,
84 1.1 oster RF_DagHeader_t *hdr,
85 1.1 oster char *name,
86 1.1 oster RF_AllocListElem_t *alist)
87 1.1 oster {
88 1.1 oster void **ptrs;
89 1.1 oster int nptrs;
90 1.1 oster
91 1.1 oster if (nAnte > RF_MAX_ANTECEDENTS)
92 1.1 oster RF_PANIC();
93 1.1 oster node->status = initstatus;
94 1.1 oster node->commitNode = commit;
95 1.1 oster node->doFunc = doFunc;
96 1.1 oster node->undoFunc = undoFunc;
97 1.1 oster node->wakeFunc = wakeFunc;
98 1.1 oster node->numParams = nParam;
99 1.1 oster node->numResults = nResult;
100 1.1 oster node->numAntecedents = nAnte;
101 1.1 oster node->numAntDone = 0;
102 1.1 oster node->next = NULL;
103 1.1 oster node->numSuccedents = nSucc;
104 1.1 oster node->name = name;
105 1.1 oster node->dagHdr = hdr;
106 1.1 oster node->visited = 0;
107 1.1 oster
108 1.1 oster /* allocate all the pointers with one call to malloc */
109 1.1 oster nptrs = nSucc+nAnte+nResult+nSucc;
110 1.1 oster
111 1.1 oster if (nptrs <= RF_DAG_PTRCACHESIZE) {
112 1.1 oster /*
113 1.1 oster * The dag_ptrs field of the node is basically some scribble
114 1.1 oster * space to be used here. We could get rid of it, and always
115 1.1 oster * allocate the range of pointers, but that's expensive. So,
116 1.1 oster * we pick a "common case" size for the pointer cache. Hopefully,
117 1.1 oster * we'll find that:
118 1.1 oster * (1) Generally, nptrs doesn't exceed RF_DAG_PTRCACHESIZE by
119 1.1 oster * only a little bit (least efficient case)
120 1.1 oster * (2) Generally, ntprs isn't a lot less than RF_DAG_PTRCACHESIZE
121 1.1 oster * (wasted memory)
122 1.1 oster */
123 1.1 oster ptrs = (void **)node->dag_ptrs;
124 1.1 oster }
125 1.1 oster else {
126 1.1 oster RF_CallocAndAdd(ptrs, nptrs, sizeof(void *), (void **), alist);
127 1.1 oster }
128 1.1 oster node->succedents = (nSucc) ? (RF_DagNode_t **) ptrs : NULL;
129 1.1 oster node->antecedents = (nAnte) ? (RF_DagNode_t **) (ptrs+nSucc) : NULL;
130 1.1 oster node->results = (nResult) ? (void **) (ptrs+nSucc+nAnte) : NULL;
131 1.1 oster node->propList = (nSucc) ? (RF_PropHeader_t **) (ptrs+nSucc+nAnte+nResult) : NULL;
132 1.1 oster
133 1.1 oster if (nParam) {
134 1.1 oster if (nParam <= RF_DAG_PARAMCACHESIZE) {
135 1.1 oster node->params = (RF_DagParam_t *)node->dag_params;
136 1.1 oster }
137 1.1 oster else {
138 1.1 oster RF_CallocAndAdd(node->params, nParam, sizeof(RF_DagParam_t), (RF_DagParam_t *), alist);
139 1.1 oster }
140 1.1 oster }
141 1.1 oster else {
142 1.1 oster node->params = NULL;
143 1.1 oster }
144 1.1 oster }
145 1.1 oster
146 1.1 oster
147 1.1 oster
148 1.1 oster /******************************************************************************
149 1.1 oster *
150 1.1 oster * allocation and deallocation routines
151 1.1 oster *
152 1.1 oster *****************************************************************************/
153 1.1 oster
154 1.1 oster void rf_FreeDAG(dag_h)
155 1.1 oster RF_DagHeader_t *dag_h;
156 1.1 oster {
157 1.1 oster RF_AccessStripeMapHeader_t *asmap, *t_asmap;
158 1.1 oster RF_DagHeader_t *nextDag;
159 1.1 oster int i;
160 1.1 oster
161 1.1 oster while (dag_h) {
162 1.1 oster nextDag = dag_h->next;
163 1.1 oster for (i=0; dag_h->memChunk[i] && i < RF_MAXCHUNKS; i++) {
164 1.1 oster /* release mem chunks */
165 1.1 oster rf_ReleaseMemChunk(dag_h->memChunk[i]);
166 1.1 oster dag_h->memChunk[i] = NULL;
167 1.1 oster }
168 1.1 oster
169 1.1 oster RF_ASSERT(i == dag_h->chunkIndex);
170 1.1 oster if (dag_h->xtraChunkCnt > 0) {
171 1.1 oster /* free xtraMemChunks */
172 1.1 oster for (i=0; dag_h->xtraMemChunk[i] && i < dag_h->xtraChunkIndex; i++) {
173 1.1 oster rf_ReleaseMemChunk(dag_h->xtraMemChunk[i]);
174 1.1 oster dag_h->xtraMemChunk[i] = NULL;
175 1.1 oster }
176 1.1 oster RF_ASSERT(i == dag_h->xtraChunkIndex);
177 1.1 oster /* free ptrs to xtraMemChunks */
178 1.1 oster RF_Free(dag_h->xtraMemChunk, dag_h->xtraChunkCnt * sizeof(RF_ChunkDesc_t *));
179 1.1 oster }
180 1.1 oster rf_FreeAllocList(dag_h->allocList);
181 1.1 oster for (asmap = dag_h->asmList; asmap;) {
182 1.1 oster t_asmap = asmap;
183 1.1 oster asmap = asmap->next;
184 1.1 oster rf_FreeAccessStripeMap(t_asmap);
185 1.1 oster }
186 1.1 oster rf_FreeDAGHeader(dag_h);
187 1.1 oster dag_h = nextDag;
188 1.1 oster }
189 1.1 oster }
190 1.1 oster
191 1.1 oster RF_PropHeader_t *rf_MakePropListEntry(
192 1.1 oster RF_DagHeader_t *dag_h,
193 1.1 oster int resultNum,
194 1.1 oster int paramNum,
195 1.1 oster RF_PropHeader_t *next,
196 1.1 oster RF_AllocListElem_t *allocList)
197 1.1 oster {
198 1.1 oster RF_PropHeader_t *p;
199 1.1 oster
200 1.1 oster RF_CallocAndAdd(p, 1, sizeof(RF_PropHeader_t),
201 1.1 oster (RF_PropHeader_t *), allocList);
202 1.1 oster p->resultNum = resultNum;
203 1.1 oster p->paramNum = paramNum;
204 1.1 oster p->next = next;
205 1.1 oster return(p);
206 1.1 oster }
207 1.1 oster
208 1.1 oster static RF_FreeList_t *rf_dagh_freelist;
209 1.1 oster
210 1.1 oster #define RF_MAX_FREE_DAGH 128
211 1.1 oster #define RF_DAGH_INC 16
212 1.1 oster #define RF_DAGH_INITIAL 32
213 1.1 oster
214 1.1 oster static void rf_ShutdownDAGs(void *);
215 1.1 oster static void rf_ShutdownDAGs(ignored)
216 1.1 oster void *ignored;
217 1.1 oster {
218 1.1 oster RF_FREELIST_DESTROY(rf_dagh_freelist,next,(RF_DagHeader_t *));
219 1.1 oster }
220 1.1 oster
221 1.1 oster int rf_ConfigureDAGs(listp)
222 1.1 oster RF_ShutdownList_t **listp;
223 1.1 oster {
224 1.1 oster int rc;
225 1.1 oster
226 1.1 oster RF_FREELIST_CREATE(rf_dagh_freelist, RF_MAX_FREE_DAGH,
227 1.1 oster RF_DAGH_INC, sizeof(RF_DagHeader_t));
228 1.1 oster if (rf_dagh_freelist == NULL)
229 1.1 oster return(ENOMEM);
230 1.1 oster rc = rf_ShutdownCreate(listp, rf_ShutdownDAGs, NULL);
231 1.1 oster if (rc) {
232 1.1 oster RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
233 1.1 oster __FILE__, __LINE__, rc);
234 1.1 oster rf_ShutdownDAGs(NULL);
235 1.1 oster return(rc);
236 1.1 oster }
237 1.1 oster RF_FREELIST_PRIME(rf_dagh_freelist, RF_DAGH_INITIAL,next,
238 1.1 oster (RF_DagHeader_t *));
239 1.1 oster return(0);
240 1.1 oster }
241 1.1 oster
242 1.1 oster RF_DagHeader_t *rf_AllocDAGHeader()
243 1.1 oster {
244 1.1 oster RF_DagHeader_t *dh;
245 1.1 oster
246 1.1 oster RF_FREELIST_GET(rf_dagh_freelist,dh,next,(RF_DagHeader_t *));
247 1.1 oster if (dh) {
248 1.1 oster bzero((char *)dh, sizeof(RF_DagHeader_t));
249 1.1 oster }
250 1.1 oster return(dh);
251 1.1 oster }
252 1.1 oster
253 1.1 oster void rf_FreeDAGHeader(RF_DagHeader_t *dh)
254 1.1 oster {
255 1.1 oster RF_FREELIST_FREE(rf_dagh_freelist,dh,next);
256 1.1 oster }
257 1.1 oster
258 1.1 oster /* allocates a buffer big enough to hold the data described by pda */
259 1.1 oster void *rf_AllocBuffer(
260 1.1 oster RF_Raid_t *raidPtr,
261 1.1 oster RF_DagHeader_t *dag_h,
262 1.1 oster RF_PhysDiskAddr_t *pda,
263 1.1 oster RF_AllocListElem_t *allocList)
264 1.1 oster {
265 1.1 oster char *p;
266 1.1 oster
267 1.1 oster RF_MallocAndAdd(p, pda->numSector << raidPtr->logBytesPerSector,
268 1.1 oster (char *), allocList);
269 1.1 oster return((void *)p);
270 1.1 oster }
271 1.1 oster
272 1.1 oster /******************************************************************************
273 1.1 oster *
274 1.1 oster * debug routines
275 1.1 oster *
276 1.1 oster *****************************************************************************/
277 1.1 oster
278 1.1 oster char *rf_NodeStatusString(RF_DagNode_t *node)
279 1.1 oster {
280 1.1 oster switch (node->status) {
281 1.1 oster case rf_wait: return("wait");
282 1.1 oster case rf_fired: return("fired");
283 1.1 oster case rf_good: return("good");
284 1.1 oster case rf_bad: return("bad");
285 1.1 oster default: return("?");
286 1.1 oster }
287 1.1 oster }
288 1.1 oster
289 1.1 oster void rf_PrintNodeInfoString(RF_DagNode_t *node)
290 1.1 oster {
291 1.1 oster RF_PhysDiskAddr_t *pda;
292 1.1 oster int (*df)(RF_DagNode_t *) = node->doFunc;
293 1.1 oster int i, lk, unlk;
294 1.1 oster void *bufPtr;
295 1.1 oster
296 1.1 oster if ((df==rf_DiskReadFunc) || (df==rf_DiskWriteFunc)
297 1.1 oster || (df==rf_DiskReadMirrorIdleFunc)
298 1.1 oster || (df == rf_DiskReadMirrorPartitionFunc))
299 1.1 oster {
300 1.1 oster pda = (RF_PhysDiskAddr_t *)node->params[0].p;
301 1.1 oster bufPtr = (void *)node->params[1].p;
302 1.1 oster lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
303 1.1 oster unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
304 1.1 oster RF_ASSERT( !(lk && unlk) );
305 1.1 oster printf("r %d c %d offs %ld nsect %d buf 0x%lx %s\n", pda->row, pda->col,
306 1.1 oster (long)pda->startSector, (int) pda->numSector, (long)bufPtr,
307 1.1 oster (lk) ? "LOCK" : ((unlk) ? "UNLK" : " "));
308 1.1 oster return;
309 1.1 oster }
310 1.1 oster
311 1.1 oster if (df == rf_DiskUnlockFunc) {
312 1.1 oster pda = (RF_PhysDiskAddr_t *)node->params[0].p;
313 1.1 oster lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
314 1.1 oster unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
315 1.1 oster RF_ASSERT( !(lk && unlk) );
316 1.1 oster printf("r %d c %d %s\n", pda->row, pda->col,
317 1.1 oster (lk) ? "LOCK" : ((unlk) ? "UNLK" : "nop"));
318 1.1 oster return;
319 1.1 oster }
320 1.1 oster
321 1.1 oster if ((df==rf_SimpleXorFunc) || (df==rf_RegularXorFunc)
322 1.1 oster || (df==rf_RecoveryXorFunc))
323 1.1 oster {
324 1.1 oster printf("result buf 0x%lx\n",(long) node->results[0]);
325 1.1 oster for (i=0; i<node->numParams-1; i+=2) {
326 1.1 oster pda = (RF_PhysDiskAddr_t *)node->params[i].p;
327 1.1 oster bufPtr = (RF_PhysDiskAddr_t *)node->params[i+1].p;
328 1.1 oster printf(" buf 0x%lx r%d c%d offs %ld nsect %d\n",
329 1.1 oster (long)bufPtr, pda->row, pda->col,
330 1.1 oster (long)pda->startSector, (int)pda->numSector);
331 1.1 oster }
332 1.1 oster return;
333 1.1 oster }
334 1.1 oster
335 1.1 oster #if RF_INCLUDE_PARITYLOGGING > 0
336 1.1 oster if (df==rf_ParityLogOverwriteFunc || df==rf_ParityLogUpdateFunc) {
337 1.1 oster for (i=0; i<node->numParams-1; i+=2) {
338 1.1 oster pda = (RF_PhysDiskAddr_t *)node->params[i].p;
339 1.1 oster bufPtr = (RF_PhysDiskAddr_t *)node->params[i+1].p;
340 1.1 oster printf(" r%d c%d offs %ld nsect %d buf 0x%lx\n",
341 1.1 oster pda->row, pda->col, (long) pda->startSector,
342 1.1 oster (int) pda->numSector, (long) bufPtr);
343 1.1 oster }
344 1.1 oster return;
345 1.1 oster }
346 1.1 oster #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
347 1.1 oster
348 1.1 oster if ((df==rf_TerminateFunc) || (df==rf_NullNodeFunc)) {
349 1.1 oster printf("\n");
350 1.1 oster return;
351 1.1 oster }
352 1.1 oster
353 1.1 oster printf("?\n");
354 1.1 oster }
355 1.1 oster
356 1.1 oster static void rf_RecurPrintDAG(node, depth, unvisited)
357 1.1 oster RF_DagNode_t *node;
358 1.1 oster int depth;
359 1.1 oster int unvisited;
360 1.1 oster {
361 1.1 oster char *anttype;
362 1.1 oster int i;
363 1.1 oster
364 1.1 oster node->visited = (unvisited) ? 0 : 1;
365 1.1 oster printf("(%d) %d C%d %s: %s,s%d %d/%d,a%d/%d,p%d,r%d S{", depth,
366 1.1 oster node->nodeNum, node->commitNode, node->name, rf_NodeStatusString(node),
367 1.1 oster node->numSuccedents, node->numSuccFired, node->numSuccDone,
368 1.1 oster node->numAntecedents, node->numAntDone, node->numParams,node->numResults);
369 1.1 oster for (i=0; i<node->numSuccedents; i++) {
370 1.1 oster printf("%d%s", node->succedents[i]->nodeNum,
371 1.1 oster ((i==node->numSuccedents-1) ? "\0" : " "));
372 1.1 oster }
373 1.1 oster printf("} A{");
374 1.1 oster for (i=0; i<node->numAntecedents; i++) {
375 1.1 oster switch (node->antType[i]) {
376 1.1 oster case rf_trueData :
377 1.1 oster anttype = "T";
378 1.1 oster break;
379 1.1 oster case rf_antiData :
380 1.1 oster anttype = "A";
381 1.1 oster break;
382 1.1 oster case rf_outputData :
383 1.1 oster anttype = "O";
384 1.1 oster break;
385 1.1 oster case rf_control :
386 1.1 oster anttype = "C";
387 1.1 oster break;
388 1.1 oster default :
389 1.1 oster anttype = "?";
390 1.1 oster break;
391 1.1 oster }
392 1.1 oster printf("%d(%s)%s", node->antecedents[i]->nodeNum, anttype, (i==node->numAntecedents-1) ? "\0" : " ");
393 1.1 oster }
394 1.1 oster printf("}; ");
395 1.1 oster rf_PrintNodeInfoString(node);
396 1.1 oster for (i=0; i<node->numSuccedents; i++) {
397 1.1 oster if (node->succedents[i]->visited == unvisited)
398 1.1 oster rf_RecurPrintDAG(node->succedents[i], depth+1, unvisited);
399 1.1 oster }
400 1.1 oster }
401 1.1 oster
402 1.1 oster static void rf_PrintDAG(dag_h)
403 1.1 oster RF_DagHeader_t *dag_h;
404 1.1 oster {
405 1.1 oster int unvisited, i;
406 1.1 oster char *status;
407 1.1 oster
408 1.1 oster /* set dag status */
409 1.1 oster switch (dag_h->status) {
410 1.1 oster case rf_enable :
411 1.1 oster status = "enable";
412 1.1 oster break;
413 1.1 oster case rf_rollForward :
414 1.1 oster status = "rollForward";
415 1.1 oster break;
416 1.1 oster case rf_rollBackward :
417 1.1 oster status = "rollBackward";
418 1.1 oster break;
419 1.1 oster default :
420 1.1 oster status = "illegal!";
421 1.1 oster break;
422 1.1 oster }
423 1.1 oster /* find out if visited bits are currently set or clear */
424 1.1 oster unvisited = dag_h->succedents[0]->visited;
425 1.1 oster
426 1.1 oster printf("DAG type: %s\n", dag_h->creator);
427 1.1 oster printf("format is (depth) num commit type: status,nSucc nSuccFired/nSuccDone,nAnte/nAnteDone,nParam,nResult S{x} A{x(type)}; info\n");
428 1.1 oster printf("(0) %d Hdr: %s, s%d, (commit %d/%d) S{", dag_h->nodeNum,
429 1.1 oster status, dag_h->numSuccedents, dag_h->numCommitNodes, dag_h->numCommits);
430 1.1 oster for (i=0; i<dag_h->numSuccedents; i++) {
431 1.1 oster printf("%d%s", dag_h->succedents[i]->nodeNum,
432 1.1 oster ((i==dag_h->numSuccedents-1) ? "\0" : " "));
433 1.1 oster }
434 1.1 oster printf("};\n");
435 1.1 oster for (i=0; i<dag_h->numSuccedents; i++) {
436 1.1 oster if (dag_h->succedents[i]->visited == unvisited)
437 1.1 oster rf_RecurPrintDAG(dag_h->succedents[i], 1, unvisited);
438 1.1 oster }
439 1.1 oster }
440 1.1 oster
441 1.1 oster /* assigns node numbers */
442 1.1 oster int rf_AssignNodeNums(RF_DagHeader_t *dag_h)
443 1.1 oster {
444 1.1 oster int unvisited, i, nnum;
445 1.1 oster RF_DagNode_t *node;
446 1.1 oster
447 1.1 oster nnum = 0;
448 1.1 oster unvisited = dag_h->succedents[0]->visited;
449 1.1 oster
450 1.1 oster dag_h->nodeNum = nnum++;
451 1.1 oster for (i=0; i<dag_h->numSuccedents; i++) {
452 1.1 oster node = dag_h->succedents[i];
453 1.1 oster if (node->visited == unvisited) {
454 1.1 oster nnum = rf_RecurAssignNodeNums(dag_h->succedents[i], nnum, unvisited);
455 1.1 oster }
456 1.1 oster }
457 1.1 oster return(nnum);
458 1.1 oster }
459 1.1 oster
460 1.1 oster int rf_RecurAssignNodeNums(node, num, unvisited)
461 1.1 oster RF_DagNode_t *node;
462 1.1 oster int num;
463 1.1 oster int unvisited;
464 1.1 oster {
465 1.1 oster int i;
466 1.1 oster
467 1.1 oster node->visited = (unvisited) ? 0 : 1;
468 1.1 oster
469 1.1 oster node->nodeNum = num++;
470 1.1 oster for (i=0; i<node->numSuccedents; i++) {
471 1.1 oster if (node->succedents[i]->visited == unvisited) {
472 1.1 oster num = rf_RecurAssignNodeNums(node->succedents[i], num, unvisited);
473 1.1 oster }
474 1.1 oster }
475 1.1 oster return(num);
476 1.1 oster }
477 1.1 oster
478 1.1 oster /* set the header pointers in each node to "newptr" */
479 1.1 oster void rf_ResetDAGHeaderPointers(dag_h, newptr)
480 1.1 oster RF_DagHeader_t *dag_h;
481 1.1 oster RF_DagHeader_t *newptr;
482 1.1 oster {
483 1.1 oster int i;
484 1.1 oster for (i=0; i<dag_h->numSuccedents; i++)
485 1.1 oster if (dag_h->succedents[i]->dagHdr != newptr)
486 1.1 oster rf_RecurResetDAGHeaderPointers(dag_h->succedents[i], newptr);
487 1.1 oster }
488 1.1 oster
489 1.1 oster void rf_RecurResetDAGHeaderPointers(node, newptr)
490 1.1 oster RF_DagNode_t *node;
491 1.1 oster RF_DagHeader_t *newptr;
492 1.1 oster {
493 1.1 oster int i;
494 1.1 oster node->dagHdr = newptr;
495 1.1 oster for (i=0; i<node->numSuccedents; i++)
496 1.1 oster if (node->succedents[i]->dagHdr != newptr)
497 1.1 oster rf_RecurResetDAGHeaderPointers(node->succedents[i], newptr);
498 1.1 oster }
499 1.1 oster
500 1.1 oster
501 1.1 oster void rf_PrintDAGList(RF_DagHeader_t *dag_h)
502 1.1 oster {
503 1.1 oster int i=0;
504 1.1 oster
505 1.1 oster for (; dag_h; dag_h=dag_h->next) {
506 1.1 oster rf_AssignNodeNums(dag_h);
507 1.1 oster printf("\n\nDAG %d IN LIST:\n",i++);
508 1.1 oster rf_PrintDAG(dag_h);
509 1.1 oster }
510 1.1 oster }
511 1.1 oster
512 1.1 oster static int rf_ValidateBranch(node, scount, acount, nodes, unvisited)
513 1.1 oster RF_DagNode_t *node;
514 1.1 oster int *scount;
515 1.1 oster int *acount;
516 1.1 oster RF_DagNode_t **nodes;
517 1.1 oster int unvisited;
518 1.1 oster {
519 1.1 oster int i, retcode = 0;
520 1.1 oster
521 1.1 oster /* construct an array of node pointers indexed by node num */
522 1.1 oster node->visited = (unvisited) ? 0 : 1;
523 1.1 oster nodes[ node->nodeNum ] = node;
524 1.1 oster
525 1.1 oster if (node->next != NULL) {
526 1.1 oster printf("INVALID DAG: next pointer in node is not NULL\n");
527 1.1 oster retcode = 1;
528 1.1 oster }
529 1.1 oster if (node->status != rf_wait) {
530 1.1 oster printf("INVALID DAG: Node status is not wait\n");
531 1.1 oster retcode = 1;
532 1.1 oster }
533 1.1 oster if (node->numAntDone != 0) {
534 1.1 oster printf("INVALID DAG: numAntDone is not zero\n");
535 1.1 oster retcode = 1;
536 1.1 oster }
537 1.1 oster if (node->doFunc == rf_TerminateFunc) {
538 1.1 oster if (node->numSuccedents != 0) {
539 1.1 oster printf("INVALID DAG: Terminator node has succedents\n");
540 1.1 oster retcode = 1;
541 1.1 oster }
542 1.1 oster } else {
543 1.1 oster if (node->numSuccedents == 0) {
544 1.1 oster printf("INVALID DAG: Non-terminator node has no succedents\n");
545 1.1 oster retcode = 1;
546 1.1 oster }
547 1.1 oster }
548 1.1 oster for (i=0; i<node->numSuccedents; i++) {
549 1.1 oster if (!node->succedents[i]) {
550 1.1 oster printf("INVALID DAG: succedent %d of node %s is NULL\n",i,node->name);
551 1.1 oster retcode = 1;
552 1.1 oster }
553 1.1 oster scount[ node->succedents[i]->nodeNum ]++;
554 1.1 oster }
555 1.1 oster for (i=0; i<node->numAntecedents; i++) {
556 1.1 oster if (!node->antecedents[i]) {
557 1.1 oster printf("INVALID DAG: antecedent %d of node %s is NULL\n",i,node->name);
558 1.1 oster retcode = 1;
559 1.1 oster }
560 1.1 oster acount[ node->antecedents[i]->nodeNum ]++;
561 1.1 oster }
562 1.1 oster for (i=0; i<node->numSuccedents; i++) {
563 1.1 oster if (node->succedents[i]->visited == unvisited) {
564 1.1 oster if (rf_ValidateBranch(node->succedents[i], scount,
565 1.1 oster acount, nodes, unvisited))
566 1.1 oster {
567 1.1 oster retcode = 1;
568 1.1 oster }
569 1.1 oster }
570 1.1 oster }
571 1.1 oster return(retcode);
572 1.1 oster }
573 1.1 oster
574 1.1 oster static void rf_ValidateBranchVisitedBits(node, unvisited, rl)
575 1.1 oster RF_DagNode_t *node;
576 1.1 oster int unvisited;
577 1.1 oster int rl;
578 1.1 oster {
579 1.1 oster int i;
580 1.1 oster
581 1.1 oster RF_ASSERT(node->visited == unvisited);
582 1.1 oster for (i=0; i<node->numSuccedents; i++) {
583 1.1 oster if (node->succedents[i] == NULL) {
584 1.1 oster printf("node=%lx node->succedents[%d] is NULL\n", (long)node, i);
585 1.1 oster RF_ASSERT(0);
586 1.1 oster }
587 1.1 oster rf_ValidateBranchVisitedBits(node->succedents[i],unvisited, rl+1);
588 1.1 oster }
589 1.1 oster }
590 1.1 oster
591 1.1 oster /* NOTE: never call this on a big dag, because it is exponential
592 1.1 oster * in execution time
593 1.1 oster */
594 1.1 oster static void rf_ValidateVisitedBits(dag)
595 1.1 oster RF_DagHeader_t *dag;
596 1.1 oster {
597 1.1 oster int i, unvisited;
598 1.1 oster
599 1.1 oster unvisited = dag->succedents[0]->visited;
600 1.1 oster
601 1.1 oster for (i=0; i<dag->numSuccedents; i++) {
602 1.1 oster if (dag->succedents[i] == NULL) {
603 1.1 oster printf("dag=%lx dag->succedents[%d] is NULL\n", (long) dag, i);
604 1.1 oster RF_ASSERT(0);
605 1.1 oster }
606 1.1 oster rf_ValidateBranchVisitedBits(dag->succedents[i],unvisited,0);
607 1.1 oster }
608 1.1 oster }
609 1.1 oster
610 1.1 oster /* validate a DAG. _at entry_ verify that:
611 1.1 oster * -- numNodesCompleted is zero
612 1.1 oster * -- node queue is null
613 1.1 oster * -- dag status is rf_enable
614 1.1 oster * -- next pointer is null on every node
615 1.1 oster * -- all nodes have status wait
616 1.1 oster * -- numAntDone is zero in all nodes
617 1.1 oster * -- terminator node has zero successors
618 1.1 oster * -- no other node besides terminator has zero successors
619 1.1 oster * -- no successor or antecedent pointer in a node is NULL
620 1.1 oster * -- number of times that each node appears as a successor of another node
621 1.1 oster * is equal to the antecedent count on that node
622 1.1 oster * -- number of times that each node appears as an antecedent of another node
623 1.1 oster * is equal to the succedent count on that node
624 1.1 oster * -- what else?
625 1.1 oster */
626 1.1 oster int rf_ValidateDAG(dag_h)
627 1.1 oster RF_DagHeader_t *dag_h;
628 1.1 oster {
629 1.1 oster int i, nodecount;
630 1.1 oster int *scount, *acount; /* per-node successor and antecedent counts */
631 1.1 oster RF_DagNode_t **nodes; /* array of ptrs to nodes in dag */
632 1.1 oster int retcode = 0;
633 1.1 oster int unvisited;
634 1.1 oster int commitNodeCount = 0;
635 1.1 oster
636 1.1 oster if (rf_validateVisitedDebug)
637 1.1 oster rf_ValidateVisitedBits(dag_h);
638 1.1 oster
639 1.1 oster if (dag_h->numNodesCompleted != 0) {
640 1.1 oster printf("INVALID DAG: num nodes completed is %d, should be 0\n",dag_h->numNodesCompleted);
641 1.1 oster retcode = 1; goto validate_dag_bad;
642 1.1 oster }
643 1.1 oster if (dag_h->status != rf_enable) {
644 1.1 oster printf("INVALID DAG: not enabled\n");
645 1.1 oster retcode = 1; goto validate_dag_bad;
646 1.1 oster }
647 1.1 oster if (dag_h->numCommits != 0) {
648 1.1 oster printf("INVALID DAG: numCommits != 0 (%d)\n",dag_h->numCommits);
649 1.1 oster retcode = 1; goto validate_dag_bad;
650 1.1 oster }
651 1.1 oster if (dag_h->numSuccedents != 1) {
652 1.1 oster /* currently, all dags must have only one succedent */
653 1.1 oster printf("INVALID DAG: numSuccedents !1 (%d)\n",dag_h->numSuccedents);
654 1.1 oster retcode = 1; goto validate_dag_bad;
655 1.1 oster }
656 1.1 oster nodecount = rf_AssignNodeNums(dag_h);
657 1.1 oster
658 1.1 oster unvisited = dag_h->succedents[0]->visited;
659 1.1 oster
660 1.1 oster RF_Calloc(scount, nodecount, sizeof(int), (int *));
661 1.1 oster RF_Calloc(acount, nodecount, sizeof(int), (int *));
662 1.1 oster RF_Calloc(nodes, nodecount, sizeof(RF_DagNode_t *), (RF_DagNode_t **));
663 1.1 oster for (i=0; i<dag_h->numSuccedents; i++) {
664 1.1 oster if ((dag_h->succedents[i]->visited == unvisited)
665 1.1 oster && rf_ValidateBranch(dag_h->succedents[i], scount,
666 1.1 oster acount, nodes, unvisited))
667 1.1 oster {
668 1.1 oster retcode = 1;
669 1.1 oster }
670 1.1 oster }
671 1.1 oster /* start at 1 to skip the header node */
672 1.1 oster for (i=1; i<nodecount; i++) {
673 1.1 oster if ( nodes[i]->commitNode )
674 1.1 oster commitNodeCount++;
675 1.1 oster if ( nodes[i]->doFunc == NULL ) {
676 1.1 oster printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
677 1.1 oster retcode = 1;
678 1.1 oster goto validate_dag_out;
679 1.1 oster }
680 1.1 oster if ( nodes[i]->undoFunc == NULL ) {
681 1.1 oster printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
682 1.1 oster retcode = 1;
683 1.1 oster goto validate_dag_out;
684 1.1 oster }
685 1.1 oster if ( nodes[i]->numAntecedents != scount[ nodes[i]->nodeNum ] ) {
686 1.1 oster printf("INVALID DAG: node %s has %d antecedents but appears as a succedent %d times\n",
687 1.1 oster nodes[i]->name, nodes[i]->numAntecedents, scount[nodes[i]->nodeNum]);
688 1.1 oster retcode = 1;
689 1.1 oster goto validate_dag_out;
690 1.1 oster }
691 1.1 oster if ( nodes[i]->numSuccedents != acount[ nodes[i]->nodeNum ] ) {
692 1.1 oster printf("INVALID DAG: node %s has %d succedents but appears as an antecedent %d times\n",
693 1.1 oster nodes[i]->name, nodes[i]->numSuccedents, acount[nodes[i]->nodeNum]);
694 1.1 oster retcode = 1;
695 1.1 oster goto validate_dag_out;
696 1.1 oster }
697 1.1 oster }
698 1.1 oster
699 1.1 oster if ( dag_h->numCommitNodes != commitNodeCount ) {
700 1.1 oster printf("INVALID DAG: incorrect commit node count. hdr->numCommitNodes (%d) found (%d) commit nodes in graph\n",
701 1.1 oster dag_h->numCommitNodes, commitNodeCount);
702 1.1 oster retcode = 1;
703 1.1 oster goto validate_dag_out;
704 1.1 oster }
705 1.1 oster
706 1.1 oster validate_dag_out:
707 1.1 oster RF_Free(scount, nodecount*sizeof(int));
708 1.1 oster RF_Free(acount, nodecount*sizeof(int));
709 1.1 oster RF_Free(nodes, nodecount*sizeof(RF_DagNode_t *));
710 1.1 oster if (retcode)
711 1.1 oster rf_PrintDAGList(dag_h);
712 1.1 oster
713 1.1 oster if (rf_validateVisitedDebug)
714 1.1 oster rf_ValidateVisitedBits(dag_h);
715 1.1 oster
716 1.1 oster return(retcode);
717 1.1 oster
718 1.1 oster validate_dag_bad:
719 1.1 oster rf_PrintDAGList(dag_h);
720 1.1 oster return(retcode);
721 1.1 oster }
722 1.1 oster
723 1.1 oster
724 1.1 oster /******************************************************************************
725 1.1 oster *
726 1.1 oster * misc construction routines
727 1.1 oster *
728 1.1 oster *****************************************************************************/
729 1.1 oster
730 1.1 oster void rf_redirect_asm(
731 1.1 oster RF_Raid_t *raidPtr,
732 1.1 oster RF_AccessStripeMap_t *asmap)
733 1.1 oster {
734 1.1 oster int ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) ? 1 : 0;
735 1.1 oster int row = asmap->physInfo->row;
736 1.1 oster int fcol = raidPtr->reconControl[row]->fcol;
737 1.1 oster int srow = raidPtr->reconControl[row]->spareRow;
738 1.1 oster int scol = raidPtr->reconControl[row]->spareCol;
739 1.1 oster RF_PhysDiskAddr_t *pda;
740 1.1 oster
741 1.1 oster RF_ASSERT( raidPtr->status[row] == rf_rs_reconstructing );
742 1.1 oster for (pda = asmap->physInfo; pda; pda=pda->next) {
743 1.1 oster if (pda->col == fcol) {
744 1.1 oster if (rf_dagDebug) {
745 1.1 oster if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap,
746 1.1 oster pda->startSector))
747 1.1 oster {
748 1.1 oster RF_PANIC();
749 1.1 oster }
750 1.1 oster }
751 1.1 oster /*printf("Remapped data for large write\n");*/
752 1.1 oster if (ds) {
753 1.1 oster raidPtr->Layout.map->MapSector(raidPtr, pda->raidAddress,
754 1.1 oster &pda->row, &pda->col, &pda->startSector, RF_REMAP);
755 1.1 oster }
756 1.1 oster else {
757 1.1 oster pda->row = srow; pda->col = scol;
758 1.1 oster }
759 1.1 oster }
760 1.1 oster }
761 1.1 oster for (pda = asmap->parityInfo; pda; pda=pda->next) {
762 1.1 oster if (pda->col == fcol) {
763 1.1 oster if (rf_dagDebug) {
764 1.1 oster if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, pda->startSector)) {
765 1.1 oster RF_PANIC();
766 1.1 oster }
767 1.1 oster }
768 1.1 oster }
769 1.1 oster if (ds) {
770 1.1 oster (raidPtr->Layout.map->MapParity)(raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP);
771 1.1 oster }
772 1.1 oster else {
773 1.1 oster pda->row = srow; pda->col = scol;
774 1.1 oster }
775 1.1 oster }
776 1.1 oster }
777 1.1 oster
778 1.1 oster
779 1.1 oster /* this routine allocates read buffers and generates stripe maps for the
780 1.1 oster * regions of the array from the start of the stripe to the start of the
781 1.1 oster * access, and from the end of the access to the end of the stripe. It also
782 1.1 oster * computes and returns the number of DAG nodes needed to read all this data.
783 1.1 oster * Note that this routine does the wrong thing if the access is fully
784 1.1 oster * contained within one stripe unit, so we RF_ASSERT against this case at the
785 1.1 oster * start.
786 1.1 oster */
787 1.1 oster void rf_MapUnaccessedPortionOfStripe(
788 1.1 oster RF_Raid_t *raidPtr,
789 1.1 oster RF_RaidLayout_t *layoutPtr, /* in: layout information */
790 1.1 oster RF_AccessStripeMap_t *asmap, /* in: access stripe map */
791 1.1 oster RF_DagHeader_t *dag_h, /* in: header of the dag to create */
792 1.1 oster RF_AccessStripeMapHeader_t **new_asm_h, /* in: ptr to array of 2 headers, to be filled in */
793 1.1 oster int *nRodNodes, /* out: num nodes to be generated to read unaccessed data */
794 1.1 oster char **sosBuffer, /* out: pointers to newly allocated buffer */
795 1.1 oster char **eosBuffer,
796 1.1 oster RF_AllocListElem_t *allocList)
797 1.1 oster {
798 1.1 oster RF_RaidAddr_t sosRaidAddress, eosRaidAddress;
799 1.1 oster RF_SectorNum_t sosNumSector, eosNumSector;
800 1.1 oster
801 1.1 oster RF_ASSERT( asmap->numStripeUnitsAccessed > (layoutPtr->numDataCol/2) );
802 1.1 oster /* generate an access map for the region of the array from start of stripe
803 1.1 oster * to start of access */
804 1.1 oster new_asm_h[0] = new_asm_h[1] = NULL; *nRodNodes = 0;
805 1.1 oster if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->raidAddress)) {
806 1.1 oster sosRaidAddress = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
807 1.1 oster sosNumSector = asmap->raidAddress - sosRaidAddress;
808 1.1 oster RF_MallocAndAdd(*sosBuffer, rf_RaidAddressToByte(raidPtr, sosNumSector), (char *), allocList);
809 1.1 oster new_asm_h[0] = rf_MapAccess(raidPtr, sosRaidAddress, sosNumSector, *sosBuffer, RF_DONT_REMAP);
810 1.1 oster new_asm_h[0]->next = dag_h->asmList;
811 1.1 oster dag_h->asmList = new_asm_h[0];
812 1.1 oster *nRodNodes += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
813 1.1 oster
814 1.1 oster RF_ASSERT(new_asm_h[0]->stripeMap->next == NULL);
815 1.1 oster /* we're totally within one stripe here */
816 1.1 oster if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
817 1.1 oster rf_redirect_asm(raidPtr, new_asm_h[0]->stripeMap);
818 1.1 oster }
819 1.1 oster /* generate an access map for the region of the array from end of access
820 1.1 oster * to end of stripe */
821 1.1 oster if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->endRaidAddress)) {
822 1.1 oster eosRaidAddress = asmap->endRaidAddress;
823 1.1 oster eosNumSector = rf_RaidAddressOfNextStripeBoundary(layoutPtr, eosRaidAddress) - eosRaidAddress;
824 1.1 oster RF_MallocAndAdd(*eosBuffer, rf_RaidAddressToByte(raidPtr, eosNumSector), (char *), allocList);
825 1.1 oster new_asm_h[1] = rf_MapAccess(raidPtr, eosRaidAddress, eosNumSector, *eosBuffer, RF_DONT_REMAP);
826 1.1 oster new_asm_h[1]->next = dag_h->asmList;
827 1.1 oster dag_h->asmList = new_asm_h[1];
828 1.1 oster *nRodNodes += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
829 1.1 oster
830 1.1 oster RF_ASSERT(new_asm_h[1]->stripeMap->next == NULL);
831 1.1 oster /* we're totally within one stripe here */
832 1.1 oster if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
833 1.1 oster rf_redirect_asm(raidPtr, new_asm_h[1]->stripeMap);
834 1.1 oster }
835 1.1 oster }
836 1.1 oster
837 1.1 oster
838 1.1 oster
839 1.1 oster /* returns non-zero if the indicated ranges of stripe unit offsets overlap */
840 1.1 oster int rf_PDAOverlap(
841 1.1 oster RF_RaidLayout_t *layoutPtr,
842 1.1 oster RF_PhysDiskAddr_t *src,
843 1.1 oster RF_PhysDiskAddr_t *dest)
844 1.1 oster {
845 1.1 oster RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
846 1.1 oster RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
847 1.1 oster /* use -1 to be sure we stay within SU */
848 1.1 oster RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector-1);
849 1.1 oster RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector-1);
850 1.1 oster return( (RF_MAX(soffs,doffs) <= RF_MIN(send,dend)) ? 1 : 0 );
851 1.1 oster }
852 1.1 oster
853 1.1 oster
854 1.1 oster /* GenerateFailedAccessASMs
855 1.1 oster *
856 1.1 oster * this routine figures out what portion of the stripe needs to be read
857 1.1 oster * to effect the degraded read or write operation. It's primary function
858 1.1 oster * is to identify everything required to recover the data, and then
859 1.1 oster * eliminate anything that is already being accessed by the user.
860 1.1 oster *
861 1.1 oster * The main result is two new ASMs, one for the region from the start of the
862 1.1 oster * stripe to the start of the access, and one for the region from the end of
863 1.1 oster * the access to the end of the stripe. These ASMs describe everything that
864 1.1 oster * needs to be read to effect the degraded access. Other results are:
865 1.1 oster * nXorBufs -- the total number of buffers that need to be XORed together to
866 1.1 oster * recover the lost data,
867 1.1 oster * rpBufPtr -- ptr to a newly-allocated buffer to hold the parity. If NULL
868 1.1 oster * at entry, not allocated.
869 1.1 oster * overlappingPDAs --
870 1.1 oster * describes which of the non-failed PDAs in the user access
871 1.1 oster * overlap data that needs to be read to effect recovery.
872 1.1 oster * overlappingPDAs[i]==1 if and only if, neglecting the failed
873 1.1 oster * PDA, the ith pda in the input asm overlaps data that needs
874 1.1 oster * to be read for recovery.
875 1.1 oster */
876 1.1 oster /* in: asm - ASM for the actual access, one stripe only */
877 1.1 oster /* in: faildPDA - which component of the access has failed */
878 1.1 oster /* in: dag_h - header of the DAG we're going to create */
879 1.1 oster /* out: new_asm_h - the two new ASMs */
880 1.1 oster /* out: nXorBufs - the total number of xor bufs required */
881 1.1 oster /* out: rpBufPtr - a buffer for the parity read */
882 1.1 oster void rf_GenerateFailedAccessASMs(
883 1.1 oster RF_Raid_t *raidPtr,
884 1.1 oster RF_AccessStripeMap_t *asmap,
885 1.1 oster RF_PhysDiskAddr_t *failedPDA,
886 1.1 oster RF_DagHeader_t *dag_h,
887 1.1 oster RF_AccessStripeMapHeader_t **new_asm_h,
888 1.1 oster int *nXorBufs,
889 1.1 oster char **rpBufPtr,
890 1.1 oster char *overlappingPDAs,
891 1.1 oster RF_AllocListElem_t *allocList)
892 1.1 oster {
893 1.1 oster RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
894 1.1 oster
895 1.1 oster /* s=start, e=end, s=stripe, a=access, f=failed, su=stripe unit */
896 1.1 oster RF_RaidAddr_t sosAddr, sosEndAddr, eosStartAddr, eosAddr;
897 1.1 oster
898 1.1 oster RF_SectorCount_t numSect[2], numParitySect;
899 1.1 oster RF_PhysDiskAddr_t *pda;
900 1.1 oster char *rdBuf, *bufP;
901 1.1 oster int foundit, i;
902 1.1 oster
903 1.1 oster bufP = NULL;
904 1.1 oster foundit = 0;
905 1.1 oster /* first compute the following raid addresses:
906 1.1 oster start of stripe, (sosAddr)
907 1.1 oster MIN(start of access, start of failed SU), (sosEndAddr)
908 1.1 oster MAX(end of access, end of failed SU), (eosStartAddr)
909 1.1 oster end of stripe (i.e. start of next stripe) (eosAddr)
910 1.1 oster */
911 1.1 oster sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
912 1.1 oster sosEndAddr = RF_MIN(asmap->raidAddress, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,failedPDA->raidAddress));
913 1.1 oster eosStartAddr = RF_MAX(asmap->endRaidAddress, rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, failedPDA->raidAddress));
914 1.1 oster eosAddr = rf_RaidAddressOfNextStripeBoundary(layoutPtr, asmap->raidAddress);
915 1.1 oster
916 1.1 oster /* now generate access stripe maps for each of the above regions of the
917 1.1 oster * stripe. Use a dummy (NULL) buf ptr for now */
918 1.1 oster
919 1.1 oster new_asm_h[0] = (sosAddr != sosEndAddr) ? rf_MapAccess(raidPtr, sosAddr, sosEndAddr-sosAddr, NULL, RF_DONT_REMAP) : NULL;
920 1.1 oster new_asm_h[1] = (eosStartAddr != eosAddr) ? rf_MapAccess(raidPtr, eosStartAddr, eosAddr-eosStartAddr, NULL, RF_DONT_REMAP) : NULL;
921 1.1 oster
922 1.1 oster /* walk through the PDAs and range-restrict each SU to the region of the
923 1.1 oster * SU touched on the failed PDA. also compute total data buffer space
924 1.1 oster * requirements in this step. Ignore the parity for now. */
925 1.1 oster
926 1.1 oster numSect[0] = numSect[1] = 0;
927 1.1 oster if (new_asm_h[0]) {
928 1.1 oster new_asm_h[0]->next = dag_h->asmList; dag_h->asmList = new_asm_h[0];
929 1.1 oster for (pda = new_asm_h[0]->stripeMap->physInfo; pda; pda = pda->next) {
930 1.1 oster rf_RangeRestrictPDA(raidPtr,failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); numSect[0] += pda->numSector;
931 1.1 oster }
932 1.1 oster }
933 1.1 oster if (new_asm_h[1]) {
934 1.1 oster new_asm_h[1]->next = dag_h->asmList; dag_h->asmList = new_asm_h[1];
935 1.1 oster for (pda = new_asm_h[1]->stripeMap->physInfo; pda; pda = pda->next) {
936 1.1 oster rf_RangeRestrictPDA(raidPtr,failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); numSect[1] += pda->numSector;
937 1.1 oster }
938 1.1 oster }
939 1.1 oster numParitySect = failedPDA->numSector;
940 1.1 oster
941 1.1 oster /* allocate buffer space for the data & parity we have to read to recover
942 1.1 oster * from the failure */
943 1.1 oster
944 1.1 oster if (numSect[0]+numSect[1]+ ((rpBufPtr) ? numParitySect : 0)) { /* don't allocate parity buf if not needed */
945 1.1 oster RF_MallocAndAdd(rdBuf, rf_RaidAddressToByte(raidPtr,numSect[0]+numSect[1]+numParitySect), (char *), allocList);
946 1.1 oster bufP = rdBuf;
947 1.1 oster if (rf_degDagDebug) printf("Newly allocated buffer (%d bytes) is 0x%lx\n",
948 1.1 oster (int)rf_RaidAddressToByte(raidPtr,numSect[0]+numSect[1]+numParitySect), (unsigned long) bufP);
949 1.1 oster }
950 1.1 oster
951 1.1 oster /* now walk through the pdas one last time and assign buffer pointers
952 1.1 oster * (ugh!). Again, ignore the parity. also, count nodes to find out how
953 1.1 oster * many bufs need to be xored together */
954 1.1 oster (*nXorBufs) = 1; /* in read case, 1 is for parity. In write case, 1 is for failed data */
955 1.1 oster if (new_asm_h[0]) {
956 1.1 oster for (pda=new_asm_h[0]->stripeMap->physInfo; pda; pda=pda->next) {pda->bufPtr = bufP; bufP += rf_RaidAddressToByte(raidPtr,pda->numSector);}
957 1.1 oster *nXorBufs += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
958 1.1 oster }
959 1.1 oster if (new_asm_h[1]) {
960 1.1 oster for (pda=new_asm_h[1]->stripeMap->physInfo; pda; pda=pda->next) {pda->bufPtr = bufP; bufP += rf_RaidAddressToByte(raidPtr,pda->numSector);}
961 1.1 oster (*nXorBufs) += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
962 1.1 oster }
963 1.1 oster if (rpBufPtr) *rpBufPtr = bufP; /* the rest of the buffer is for parity */
964 1.1 oster
965 1.1 oster /* the last step is to figure out how many more distinct buffers need to
966 1.1 oster * get xor'd to produce the missing unit. there's one for each user-data
967 1.1 oster * read node that overlaps the portion of the failed unit being accessed */
968 1.1 oster
969 1.1 oster for (foundit=i=0,pda=asmap->physInfo; pda; i++,pda=pda->next) {
970 1.1 oster if (pda == failedPDA) {i--; foundit=1; continue;}
971 1.1 oster if (rf_PDAOverlap(layoutPtr, pda, failedPDA)) {
972 1.1 oster overlappingPDAs[i] = 1;
973 1.1 oster (*nXorBufs)++;
974 1.1 oster }
975 1.1 oster }
976 1.1 oster if (!foundit) {RF_ERRORMSG("GenerateFailedAccessASMs: did not find failedPDA in asm list\n"); RF_ASSERT(0);}
977 1.1 oster
978 1.1 oster if (rf_degDagDebug) {
979 1.1 oster if (new_asm_h[0]) {
980 1.1 oster printf("First asm:\n"); rf_PrintFullAccessStripeMap(new_asm_h[0], 1);
981 1.1 oster }
982 1.1 oster if (new_asm_h[1]) {
983 1.1 oster printf("Second asm:\n"); rf_PrintFullAccessStripeMap(new_asm_h[1], 1);
984 1.1 oster }
985 1.1 oster }
986 1.1 oster }
987 1.1 oster
988 1.1 oster
989 1.1 oster /* adjusts the offset and number of sectors in the destination pda so that
990 1.1 oster * it covers at most the region of the SU covered by the source PDA. This
991 1.1 oster * is exclusively a restriction: the number of sectors indicated by the
992 1.1 oster * target PDA can only shrink.
993 1.1 oster *
994 1.1 oster * For example: s = sectors within SU indicated by source PDA
995 1.1 oster * d = sectors within SU indicated by dest PDA
996 1.1 oster * r = results, stored in dest PDA
997 1.1 oster *
998 1.1 oster * |--------------- one stripe unit ---------------------|
999 1.1 oster * | sssssssssssssssssssssssssssssssss |
1000 1.1 oster * | ddddddddddddddddddddddddddddddddddddddddddddd |
1001 1.1 oster * | rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr |
1002 1.1 oster *
1003 1.1 oster * Another example:
1004 1.1 oster *
1005 1.1 oster * |--------------- one stripe unit ---------------------|
1006 1.1 oster * | sssssssssssssssssssssssssssssssss |
1007 1.1 oster * | ddddddddddddddddddddddd |
1008 1.1 oster * | rrrrrrrrrrrrrrrr |
1009 1.1 oster *
1010 1.1 oster */
1011 1.1 oster void rf_RangeRestrictPDA(
1012 1.1 oster RF_Raid_t *raidPtr,
1013 1.1 oster RF_PhysDiskAddr_t *src,
1014 1.1 oster RF_PhysDiskAddr_t *dest,
1015 1.1 oster int dobuffer,
1016 1.1 oster int doraidaddr)
1017 1.1 oster {
1018 1.1 oster RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1019 1.1 oster RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
1020 1.1 oster RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
1021 1.1 oster RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector-1); /* use -1 to be sure we stay within SU */
1022 1.1 oster RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector-1);
1023 1.1 oster RF_SectorNum_t subAddr = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->startSector); /* stripe unit boundary */
1024 1.1 oster
1025 1.1 oster dest->startSector = subAddr + RF_MAX(soffs,doffs);
1026 1.1 oster dest->numSector = subAddr + RF_MIN(send,dend) + 1 - dest->startSector;
1027 1.1 oster
1028 1.1 oster if (dobuffer)
1029 1.1 oster dest->bufPtr += (soffs > doffs) ? rf_RaidAddressToByte(raidPtr,soffs-doffs) : 0;
1030 1.1 oster if (doraidaddr) {
1031 1.1 oster dest->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->raidAddress) +
1032 1.1 oster rf_StripeUnitOffset(layoutPtr, dest->startSector);
1033 1.1 oster }
1034 1.1 oster }
1035 1.1 oster
1036 1.1 oster /*
1037 1.1 oster * Want the highest of these primes to be the largest one
1038 1.1 oster * less than the max expected number of columns (won't hurt
1039 1.1 oster * to be too small or too large, but won't be optimal, either)
1040 1.1 oster * --jimz
1041 1.1 oster */
1042 1.1 oster #define NLOWPRIMES 8
1043 1.1 oster static int lowprimes[NLOWPRIMES] = {2,3,5,7,11,13,17,19};
1044 1.1 oster
1045 1.1 oster /*****************************************************************************
1046 1.1 oster * compute the workload shift factor. (chained declustering)
1047 1.1 oster *
1048 1.1 oster * return nonzero if access should shift to secondary, otherwise,
1049 1.1 oster * access is to primary
1050 1.1 oster *****************************************************************************/
1051 1.1 oster int rf_compute_workload_shift(
1052 1.1 oster RF_Raid_t *raidPtr,
1053 1.1 oster RF_PhysDiskAddr_t *pda)
1054 1.1 oster {
1055 1.1 oster /*
1056 1.1 oster * variables:
1057 1.1 oster * d = column of disk containing primary
1058 1.1 oster * f = column of failed disk
1059 1.1 oster * n = number of disks in array
1060 1.1 oster * sd = "shift distance" (number of columns that d is to the right of f)
1061 1.1 oster * row = row of array the access is in
1062 1.1 oster * v = numerator of redirection ratio
1063 1.1 oster * k = denominator of redirection ratio
1064 1.1 oster */
1065 1.1 oster RF_RowCol_t d, f, sd, row, n;
1066 1.1 oster int k, v, ret, i;
1067 1.1 oster
1068 1.1 oster row = pda->row;
1069 1.1 oster n = raidPtr->numCol;
1070 1.1 oster
1071 1.1 oster /* assign column of primary copy to d */
1072 1.1 oster d = pda->col;
1073 1.1 oster
1074 1.1 oster /* assign column of dead disk to f */
1075 1.1 oster for(f=0;((!RF_DEAD_DISK(raidPtr->Disks[row][f].status))&&(f<n));f++);
1076 1.1 oster
1077 1.1 oster RF_ASSERT(f < n);
1078 1.1 oster RF_ASSERT(f != d);
1079 1.1 oster
1080 1.1 oster sd = (f > d) ? (n + d - f) : (d - f);
1081 1.1 oster RF_ASSERT(sd < n);
1082 1.1 oster
1083 1.1 oster /*
1084 1.1 oster * v of every k accesses should be redirected
1085 1.1 oster *
1086 1.1 oster * v/k := (n-1-sd)/(n-1)
1087 1.1 oster */
1088 1.1 oster v = (n-1-sd);
1089 1.1 oster k = (n-1);
1090 1.1 oster
1091 1.1 oster #if 1
1092 1.1 oster /*
1093 1.1 oster * XXX
1094 1.1 oster * Is this worth it?
1095 1.1 oster *
1096 1.1 oster * Now reduce the fraction, by repeatedly factoring
1097 1.1 oster * out primes (just like they teach in elementary school!)
1098 1.1 oster */
1099 1.1 oster for(i=0;i<NLOWPRIMES;i++) {
1100 1.1 oster if (lowprimes[i] > v)
1101 1.1 oster break;
1102 1.1 oster while (((v%lowprimes[i])==0) && ((k%lowprimes[i])==0)) {
1103 1.1 oster v /= lowprimes[i];
1104 1.1 oster k /= lowprimes[i];
1105 1.1 oster }
1106 1.1 oster }
1107 1.1 oster #endif
1108 1.1 oster
1109 1.1 oster raidPtr->hist_diskreq[row][d]++;
1110 1.1 oster if (raidPtr->hist_diskreq[row][d] > v) {
1111 1.1 oster ret = 0; /* do not redirect */
1112 1.1 oster }
1113 1.1 oster else {
1114 1.1 oster ret = 1; /* redirect */
1115 1.1 oster }
1116 1.1 oster
1117 1.1 oster #if 0
1118 1.1 oster printf("d=%d f=%d sd=%d v=%d k=%d ret=%d h=%d\n", d, f, sd, v, k, ret,
1119 1.1 oster raidPtr->hist_diskreq[row][d]);
1120 1.1 oster #endif
1121 1.1 oster
1122 1.1 oster if (raidPtr->hist_diskreq[row][d] >= k) {
1123 1.1 oster /* reset counter */
1124 1.1 oster raidPtr->hist_diskreq[row][d] = 0;
1125 1.1 oster }
1126 1.1 oster
1127 1.1 oster return(ret);
1128 1.1 oster }
1129 1.1 oster
1130 1.1 oster /*
1131 1.1 oster * Disk selection routines
1132 1.1 oster */
1133 1.1 oster
1134 1.1 oster /*
1135 1.1 oster * Selects the disk with the shortest queue from a mirror pair.
1136 1.1 oster * Both the disk I/Os queued in RAIDframe as well as those at the physical
1137 1.1 oster * disk are counted as members of the "queue"
1138 1.1 oster */
1139 1.1 oster void rf_SelectMirrorDiskIdle(RF_DagNode_t *node)
1140 1.1 oster {
1141 1.1 oster RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
1142 1.1 oster RF_RowCol_t rowData, colData, rowMirror, colMirror;
1143 1.1 oster int dataQueueLength, mirrorQueueLength, usemirror;
1144 1.1 oster RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *)node->params[0].p;
1145 1.1 oster RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *)node->params[4].p;
1146 1.1 oster RF_PhysDiskAddr_t *tmp_pda;
1147 1.1 oster RF_RaidDisk_t **disks = raidPtr->Disks;
1148 1.1 oster RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
1149 1.1 oster
1150 1.1 oster /* return the [row col] of the disk with the shortest queue */
1151 1.1 oster rowData = data_pda->row;
1152 1.1 oster colData = data_pda->col;
1153 1.1 oster rowMirror = mirror_pda->row;
1154 1.1 oster colMirror = mirror_pda->col;
1155 1.1 oster dataQueue = &(dqs[rowData][colData]);
1156 1.1 oster mirrorQueue = &(dqs[rowMirror][colMirror]);
1157 1.1 oster
1158 1.1 oster #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1159 1.1 oster RF_LOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
1160 1.1 oster #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1161 1.1 oster dataQueueLength = dataQueue->queueLength + dataQueue->numOutstanding;
1162 1.1 oster #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1163 1.1 oster RF_UNLOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
1164 1.1 oster RF_LOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
1165 1.1 oster #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1166 1.1 oster mirrorQueueLength = mirrorQueue->queueLength + mirrorQueue->numOutstanding;
1167 1.1 oster #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1168 1.1 oster RF_UNLOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
1169 1.1 oster #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1170 1.1 oster
1171 1.1 oster usemirror = 0;
1172 1.1 oster if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
1173 1.1 oster usemirror = 0;
1174 1.1 oster }
1175 1.1 oster else if (RF_DEAD_DISK(disks[rowData][colData].status)) {
1176 1.1 oster usemirror = 1;
1177 1.1 oster }
1178 1.1 oster else if (dataQueueLength < mirrorQueueLength) {
1179 1.1 oster usemirror = 0;
1180 1.1 oster }
1181 1.1 oster else if (mirrorQueueLength < dataQueueLength) {
1182 1.1 oster usemirror = 1;
1183 1.1 oster }
1184 1.1 oster else {
1185 1.1 oster /* queues are equal length. attempt cleverness. */
1186 1.1 oster if (SNUM_DIFF(dataQueue->last_deq_sector,data_pda->startSector)
1187 1.1 oster <= SNUM_DIFF(mirrorQueue->last_deq_sector,mirror_pda->startSector))
1188 1.1 oster {
1189 1.1 oster usemirror = 0;
1190 1.1 oster }
1191 1.1 oster else {
1192 1.1 oster usemirror = 1;
1193 1.1 oster }
1194 1.1 oster }
1195 1.1 oster
1196 1.1 oster if (usemirror) {
1197 1.1 oster /* use mirror (parity) disk, swap params 0 & 4 */
1198 1.1 oster tmp_pda = data_pda;
1199 1.1 oster node->params[0].p = mirror_pda;
1200 1.1 oster node->params[4].p = tmp_pda;
1201 1.1 oster }
1202 1.1 oster else {
1203 1.1 oster /* use data disk, leave param 0 unchanged */
1204 1.1 oster }
1205 1.1 oster /* printf("dataQueueLength %d, mirrorQueueLength %d\n",dataQueueLength, mirrorQueueLength); */
1206 1.1 oster }
1207 1.1 oster
1208 1.1 oster /*
1209 1.1 oster * Do simple partitioning. This assumes that
1210 1.1 oster * the data and parity disks are laid out identically.
1211 1.1 oster */
1212 1.1 oster void rf_SelectMirrorDiskPartition(RF_DagNode_t *node)
1213 1.1 oster {
1214 1.1 oster RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
1215 1.1 oster RF_RowCol_t rowData, colData, rowMirror, colMirror;
1216 1.1 oster RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *)node->params[0].p;
1217 1.1 oster RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *)node->params[4].p;
1218 1.1 oster RF_PhysDiskAddr_t *tmp_pda;
1219 1.1 oster RF_RaidDisk_t **disks = raidPtr->Disks;
1220 1.1 oster RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
1221 1.1 oster int usemirror;
1222 1.1 oster
1223 1.1 oster /* return the [row col] of the disk with the shortest queue */
1224 1.1 oster rowData = data_pda->row;
1225 1.1 oster colData = data_pda->col;
1226 1.1 oster rowMirror = mirror_pda->row;
1227 1.1 oster colMirror = mirror_pda->col;
1228 1.1 oster dataQueue = &(dqs[rowData][colData]);
1229 1.1 oster mirrorQueue = &(dqs[rowMirror][colMirror]);
1230 1.1 oster
1231 1.1 oster usemirror = 0;
1232 1.1 oster if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
1233 1.1 oster usemirror = 0;
1234 1.1 oster }
1235 1.1 oster else if (RF_DEAD_DISK(disks[rowData][colData].status)) {
1236 1.1 oster usemirror = 1;
1237 1.1 oster }
1238 1.1 oster else if (data_pda->startSector < (disks[rowData][colData].numBlocks / 2)) {
1239 1.1 oster usemirror = 0;
1240 1.1 oster }
1241 1.1 oster else {
1242 1.1 oster usemirror = 1;
1243 1.1 oster }
1244 1.1 oster
1245 1.1 oster if (usemirror) {
1246 1.1 oster /* use mirror (parity) disk, swap params 0 & 4 */
1247 1.1 oster tmp_pda = data_pda;
1248 1.1 oster node->params[0].p = mirror_pda;
1249 1.1 oster node->params[4].p = tmp_pda;
1250 1.1 oster }
1251 1.1 oster else {
1252 1.1 oster /* use data disk, leave param 0 unchanged */
1253 1.1 oster }
1254 1.1 oster }
1255