rf_driver.c revision 1.5 1 /* $NetBSD: rf_driver.c,v 1.5 1999/01/26 04:40:03 oster Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland, Khalil Amiri, Claudson Bornstein, William V. Courtright II,
7 * Robby Findler, Daniel Stodolsky, Rachad Youssef, Jim Zelenka
8 *
9 * Permission to use, copy, modify and distribute this software and
10 * its documentation is hereby granted, provided that both the copyright
11 * notice and this permission notice appear in all copies of the
12 * software, derivative works or modified versions, and any portions
13 * thereof, and that both notices appear in supporting documentation.
14 *
15 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
16 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
17 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
18 *
19 * Carnegie Mellon requests users of this software to return to
20 *
21 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
22 * School of Computer Science
23 * Carnegie Mellon University
24 * Pittsburgh PA 15213-3890
25 *
26 * any improvements or extensions that they make and grant Carnegie the
27 * rights to redistribute these changes.
28 */
29
30 /******************************************************************************
31 *
32 * rf_driver.c -- main setup, teardown, and access routines for the RAID driver
33 *
34 * all routines are prefixed with rf_ (raidframe), to avoid conficts.
35 *
36 ******************************************************************************/
37
38
39 #include <sys/types.h>
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/ioctl.h>
43 #include <sys/fcntl.h>
44 #include <sys/vnode.h>
45
46
47 #include "rf_archs.h"
48 #include "rf_threadstuff.h"
49
50 #include <sys/errno.h>
51
52 #include "rf_raid.h"
53 #include "rf_dag.h"
54 #include "rf_aselect.h"
55 #include "rf_diskqueue.h"
56 #include "rf_parityscan.h"
57 #include "rf_alloclist.h"
58 #include "rf_threadid.h"
59 #include "rf_dagutils.h"
60 #include "rf_utils.h"
61 #include "rf_etimer.h"
62 #include "rf_acctrace.h"
63 #include "rf_configure.h"
64 #include "rf_general.h"
65 #include "rf_desc.h"
66 #include "rf_states.h"
67 #include "rf_freelist.h"
68 #include "rf_decluster.h"
69 #include "rf_map.h"
70 #include "rf_diskthreads.h"
71 #include "rf_revent.h"
72 #include "rf_callback.h"
73 #include "rf_engine.h"
74 #include "rf_memchunk.h"
75 #include "rf_mcpair.h"
76 #include "rf_nwayxor.h"
77 #include "rf_debugprint.h"
78 #include "rf_copyback.h"
79 #if !defined(__NetBSD__)
80 #include "rf_camlayer.h"
81 #endif
82 #include "rf_driver.h"
83 #include "rf_options.h"
84 #include "rf_shutdown.h"
85 #include "rf_sys.h"
86 #include "rf_cpuutil.h"
87
88 #include <sys/buf.h>
89
90 #if DKUSAGE > 0
91 #include <sys/dkusage.h>
92 #include <io/common/iotypes.h>
93 #include <io/cam/dec_cam.h>
94 #include <io/cam/cam.h>
95 #include <io/cam/pdrv.h>
96 #endif /* DKUSAGE > 0 */
97
/* rad == RF_RaidAccessDesc_t */
static RF_FreeList_t *rf_rad_freelist;  /* global pool of access descriptors */
#define RF_MAX_FREE_RAD 128     /* max descriptors kept cached on the freelist */
#define RF_RAD_INC 16           /* descriptors added per freelist refill */
#define RF_RAD_INITIAL 32       /* descriptors pre-allocated at configure time */

/* debug variables */
char rf_panicbuf[2048]; /* a buffer to hold an error msg when we panic */

/* main configuration routines */
static int raidframe_booted = 0;        /* nonzero once rf_BootRaidframe() has run */

/* forward declarations for file-local helpers */
static void rf_ConfigureDebug(RF_Config_t *cfgPtr);
static void set_debug_option(char *name, long val);
static void rf_UnconfigureArray(void);
static int init_rad(RF_RaidAccessDesc_t *);
static void clean_rad(RF_RaidAccessDesc_t *);
static void rf_ShutdownRDFreeList(void *);
static int rf_ConfigureRDFreeList(RF_ShutdownList_t **);


RF_DECLARE_MUTEX(rf_printf_mutex)       /* debug only: avoids interleaved printfs by different stripes */
RF_DECLARE_GLOBAL_THREADID      /* declarations for threadid.h */


/* sleep/wakeup pair used while waiting for the array to quiesce */
#define SIGNAL_QUIESCENT_COND(_raid_)  wakeup(&((_raid_)->accesses_suspended))
#define WAIT_FOR_QUIESCENCE(_raid_) \
        tsleep(&((_raid_)->accesses_suspended),PRIBIO|PCATCH,"raidframe quiesce", 0);
126
127 #if DKUSAGE > 0
128 #define IO_BUF_ERR(bp, err, unit) { \
129 bp->b_flags |= B_ERROR; \
130 bp->b_resid = bp->b_bcount; \
131 bp->b_error = err; \
132 RF_DKU_END_IO(unit, bp); \
133 biodone(bp); \
134 }
135 #else
136 #define IO_BUF_ERR(bp, err, unit) { \
137 bp->b_flags |= B_ERROR; \
138 bp->b_resid = bp->b_bcount; \
139 bp->b_error = err; \
140 RF_DKU_END_IO(unit); \
141 biodone(bp); \
142 }
143 #endif /* DKUSAGE > 0 */
144
145 static int configureCount=0; /* number of active configurations */
146 static int isconfigged=0; /* is basic raidframe (non per-array) stuff configged */
147 RF_DECLARE_STATIC_MUTEX(configureMutex) /* used to lock the configuration stuff */
148
149 static RF_ShutdownList_t *globalShutdown; /* non array-specific stuff */
150
151 static int rf_ConfigureRDFreeList(RF_ShutdownList_t **listp);
152
153 /* called at system boot time */
154 int rf_BootRaidframe()
155 {
156 int rc;
157
158 if (raidframe_booted)
159 return(EBUSY);
160 raidframe_booted = 1;
161
162 #if RF_DEBUG_ATOMIC > 0
163 rf_atent_init();
164 #endif /* RF_DEBUG_ATOMIC > 0 */
165
166 rf_setup_threadid();
167 rf_assign_threadid();
168
169 rc = rf_mutex_init(&configureMutex);
170 if (rc) {
171 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
172 __LINE__, rc);
173 RF_PANIC();
174 }
175 configureCount = 0;
176 isconfigged = 0;
177 globalShutdown = NULL;
178 return(0);
179 }
180
181 /*
182 * This function is really just for debugging user-level stuff: it
183 * frees up all memory, other RAIDframe resources which might otherwise
184 * be kept around. This is used with systems like "sentinel" to detect
185 * memory leaks.
186 */
187 int rf_UnbootRaidframe()
188 {
189 int rc;
190
191 RF_LOCK_MUTEX(configureMutex);
192 if (configureCount) {
193 RF_UNLOCK_MUTEX(configureMutex);
194 return(EBUSY);
195 }
196 raidframe_booted = 0;
197 RF_UNLOCK_MUTEX(configureMutex);
198 rc = rf_mutex_destroy(&configureMutex);
199 if (rc) {
200 RF_ERRORMSG3("Unable to destroy mutex file %s line %d rc=%d\n", __FILE__,
201 __LINE__, rc);
202 RF_PANIC();
203 }
204 #if RF_DEBUG_ATOMIC > 0
205 rf_atent_shutdown();
206 #endif /* RF_DEBUG_ATOMIC > 0 */
207 return(0);
208 }
209
/*
 * Called whenever an array is shutdown.
 * Drops one reference on the global configuration; when the last
 * configured array goes away, runs the global shutdown list and
 * tears down all non per-array RAIDframe state.
 */
static void rf_UnconfigureArray()
{
  int rc;

  RF_LOCK_MUTEX(configureMutex);
  if (--configureCount == 0) { /* if no active configurations, shut everything down */
    isconfigged = 0;

    /* run every handler queued on the global shutdown list */
    rc = rf_ShutdownList(&globalShutdown);
    if (rc) {
      RF_ERRORMSG1("RAIDFRAME: unable to do global shutdown, rc=%d\n", rc);
    }

    rf_shutdown_threadid();

    /*
     * We must wait until now, because the AllocList module
     * uses the DebugMem module.
     */
    if (rf_memDebug)
      rf_print_unfreed();
  }
  RF_UNLOCK_MUTEX(configureMutex);
}
237
/*
 * Called to shut down an array.
 *
 * Waits for all outstanding accesses to drain (using the rad freelist
 * lock -- see comment below), closes the component and spare vnodes,
 * runs the array's shutdown list, and drops the global config reference.
 * Returns EINVAL if the array was never configured, EBUSY if a shutdown
 * is already in progress, else 0.
 */
int rf_Shutdown(raidPtr)
  RF_Raid_t *raidPtr;
{
  int r,c;

  struct proc *p;

  if (!raidPtr->valid) {
    RF_ERRORMSG("Attempt to shut down unconfigured RAIDframe driver. Aborting shutdown\n");
    return(EINVAL);
  }

  /*
   * wait for outstanding IOs to land
   * As described in rf_raid.h, we use the rad_freelist lock
   * to protect the per-array info about outstanding descs
   * since we need to do freelist locking anyway, and this
   * cuts down on the amount of serialization we've got going
   * on.
   */
  RF_FREELIST_DO_LOCK(rf_rad_freelist);
  if (raidPtr->waitShutdown) {
    /* someone else is already shutting this array down */
    RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
    return(EBUSY);
  }
  raidPtr->waitShutdown = 1;
  /* block until every in-flight access has called rf_FreeRaidAccDesc() */
  while (raidPtr->nAccOutstanding) {
    RF_WAIT_COND(raidPtr->outstandingCond, RF_FREELIST_MUTEX_OF(rf_rad_freelist));
  }
  RF_FREELIST_DO_UNLOCK(rf_rad_freelist);

  raidPtr->valid = 0;


  /* We take this opportunity to close the vnodes like we should.. */

  p = raidPtr->proc; /* XXX */

  /* close each component vnode (only one row is supported -- see rf_Configure) */
  for(r=0;r<raidPtr->numRow;r++) {
    for(c=0;c<raidPtr->numCol;c++) {
      printf("Closing vnode for row: %d col: %d\n",r,c);
      if (raidPtr->raid_cinfo[r][c].ci_vp) {
        (void)vn_close(raidPtr->raid_cinfo[r][c].ci_vp,
            FREAD|FWRITE, p->p_ucred, p);
      } else {
        printf("vnode was NULL\n");
      }

    }
  }
  /* spare vnodes live past the end of row 0's column array */
  for(r=0;r<raidPtr->numSpare;r++) {
    printf("Closing vnode for spare: %d\n",r);
    if (raidPtr->raid_cinfo[0][raidPtr->numCol+r].ci_vp) {
      (void)vn_close(raidPtr->raid_cinfo[0][raidPtr->numCol+r].ci_vp,
          FREAD|FWRITE, p->p_ucred, p);
    } else {
      printf("vnode was NULL\n");
    }
  }



  rf_ShutdownList(&raidPtr->shutdownList);

  rf_UnconfigureArray();

  return(0);
}
309
/*
 * Helper macros for rf_Configure().  Each wraps one configuration step
 * and bails out of the *calling function* on failure, after undoing any
 * partial configuration.  All are wrapped in do { ... } while (0) so
 * they behave as single statements (CERT PRE10-C); note they expand a
 * `return', so they may only be used inside rf_Configure()-style
 * functions with local `rc', `raidPtr' and `cfgPtr' as appropriate.
 */
#define DO_INIT_CONFIGURE(f) do { \
        rc = f (&globalShutdown); \
        if (rc) { \
                RF_ERRORMSG2("RAIDFRAME: failed %s with %d\n", RF_STRING(f), rc); \
                rf_ShutdownList(&globalShutdown); \
                configureCount--; \
                RF_UNLOCK_MUTEX(configureMutex); \
                return(rc); \
        } \
} while (0)

/* tear down a partially-configured array */
#define DO_RAID_FAIL() do { \
        rf_ShutdownList(&raidPtr->shutdownList); \
        rf_UnconfigureArray(); \
} while (0)

#define DO_RAID_INIT_CONFIGURE(f) do { \
        rc = f (&raidPtr->shutdownList, raidPtr, cfgPtr); \
        if (rc) { \
                RF_ERRORMSG2("RAIDFRAME: failed %s with %d\n", RF_STRING(f), rc); \
                DO_RAID_FAIL(); \
                return(rc); \
        } \
} while (0)

#define DO_RAID_MUTEX(_m_) do { \
        rc = rf_create_managed_mutex(&raidPtr->shutdownList, (_m_)); \
        if (rc) { \
                RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", \
                        __FILE__, __LINE__, rc); \
                DO_RAID_FAIL(); \
                return(rc); \
        } \
} while (0)

#define DO_RAID_COND(_c_) do { \
        rc = rf_create_managed_cond(&raidPtr->shutdownList, (_c_)); \
        if (rc) { \
                RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", \
                        __FILE__, __LINE__, rc); \
                DO_RAID_FAIL(); \
                return(rc); \
        } \
} while (0)
354
355 int rf_Configure(raidPtr, cfgPtr)
356 RF_Raid_t *raidPtr;
357 RF_Config_t *cfgPtr;
358 {
359 RF_RowCol_t row, col;
360 int i, rc;
361 int unit;
362 struct proc *p;
363
364 if (raidPtr->valid) {
365 RF_ERRORMSG("RAIDframe configuration not shut down. Aborting configure.\n");
366 return(EINVAL);
367 }
368
369 RF_LOCK_MUTEX(configureMutex);
370 configureCount++;
371 if (isconfigged == 0) {
372 rc = rf_create_managed_mutex(&globalShutdown, &rf_printf_mutex);
373 if (rc) {
374 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
375 __LINE__, rc);
376 rf_ShutdownList(&globalShutdown);
377 return(rc);
378 }
379
380 /* initialize globals */
381 printf("RAIDFRAME: protectedSectors is %ld\n",rf_protectedSectors);
382
383 rf_clear_debug_print_buffer();
384
385 DO_INIT_CONFIGURE(rf_ConfigureAllocList);
386 DO_INIT_CONFIGURE(rf_ConfigureEtimer);
387 /*
388 * Yes, this does make debugging general to the whole system instead
389 * of being array specific. Bummer, drag.
390 */
391 rf_ConfigureDebug(cfgPtr);
392 DO_INIT_CONFIGURE(rf_ConfigureDebugMem);
393 DO_INIT_CONFIGURE(rf_ConfigureAccessTrace);
394 DO_INIT_CONFIGURE(rf_ConfigureMapModule);
395 DO_INIT_CONFIGURE(rf_ConfigureReconEvent);
396 DO_INIT_CONFIGURE(rf_ConfigureCallback);
397 DO_INIT_CONFIGURE(rf_ConfigureMemChunk);
398 DO_INIT_CONFIGURE(rf_ConfigureRDFreeList);
399 DO_INIT_CONFIGURE(rf_ConfigureNWayXor);
400 DO_INIT_CONFIGURE(rf_ConfigureStripeLockFreeList);
401 DO_INIT_CONFIGURE(rf_ConfigureMCPair);
402 #if !defined(__NetBSD__)
403 DO_INIT_CONFIGURE(rf_ConfigureCamLayer);
404 #endif
405 DO_INIT_CONFIGURE(rf_ConfigureDAGs);
406 DO_INIT_CONFIGURE(rf_ConfigureDAGFuncs);
407 DO_INIT_CONFIGURE(rf_ConfigureDebugPrint);
408 DO_INIT_CONFIGURE(rf_ConfigureReconstruction);
409 DO_INIT_CONFIGURE(rf_ConfigureCopyback);
410 DO_INIT_CONFIGURE(rf_ConfigureDiskQueueSystem);
411 DO_INIT_CONFIGURE(rf_ConfigureCpuMonitor);
412 isconfigged = 1;
413 }
414 RF_UNLOCK_MUTEX(configureMutex);
415
416 /*
417 * Null out the entire raid descriptor to avoid problems when we reconfig.
418 * This also clears the valid bit.
419 */
420 /* XXX this clearing should be moved UP to outside of here.... that, or
421 rf_Configure() needs to take more arguments... XXX */
422 unit = raidPtr->raidid;
423 p = raidPtr->proc; /* XXX save these... */
424 bzero((char *)raidPtr, sizeof(RF_Raid_t));
425 raidPtr->raidid = unit;
426 raidPtr->proc = p; /* XXX and then recover them..*/
427 DO_RAID_MUTEX(&raidPtr->mutex);
428 /* set up the cleanup list. Do this after ConfigureDebug so that value of memDebug will be set */
429
430 rf_MakeAllocList(raidPtr->cleanupList);
431 if (raidPtr->cleanupList == NULL) {
432 DO_RAID_FAIL();
433 return(ENOMEM);
434 }
435
436 rc = rf_ShutdownCreate(&raidPtr->shutdownList,
437 (void (*)(void *))rf_FreeAllocList,
438 raidPtr->cleanupList);
439 if (rc) {
440 RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
441 __FILE__, __LINE__, rc);
442 DO_RAID_FAIL();
443 return(rc);
444 }
445
446 raidPtr->numRow = cfgPtr->numRow;
447 raidPtr->numCol = cfgPtr->numCol;
448 raidPtr->numSpare = cfgPtr->numSpare;
449
450 /* XXX we don't even pretend to support more than one row
451 in the kernel... */
452 if (raidPtr->numRow != 1) {
453 RF_ERRORMSG("Only one row supported in kernel.\n");
454 DO_RAID_FAIL();
455 return(EINVAL);
456 }
457
458
459
460 RF_CallocAndAdd(raidPtr->status, raidPtr->numRow, sizeof(RF_RowStatus_t),
461 (RF_RowStatus_t *), raidPtr->cleanupList);
462 if (raidPtr->status == NULL) {
463 DO_RAID_FAIL();
464 return(ENOMEM);
465 }
466
467 RF_CallocAndAdd(raidPtr->reconControl, raidPtr->numRow,
468 sizeof(RF_ReconCtrl_t *), (RF_ReconCtrl_t **), raidPtr->cleanupList);
469 if (raidPtr->reconControl == NULL) {
470 DO_RAID_FAIL();
471 return(ENOMEM);
472 }
473 for (i=0; i<raidPtr->numRow; i++) {
474 raidPtr->status[i] = rf_rs_optimal;
475 raidPtr->reconControl[i] = NULL;
476 }
477
478 DO_RAID_INIT_CONFIGURE(rf_ConfigureEngine);
479 DO_RAID_INIT_CONFIGURE(rf_ConfigureStripeLocks);
480
481 DO_RAID_COND(&raidPtr->outstandingCond);
482
483 raidPtr->nAccOutstanding = 0;
484 raidPtr->waitShutdown = 0;
485
486 DO_RAID_MUTEX(&raidPtr->access_suspend_mutex);
487 DO_RAID_COND(&raidPtr->quiescent_cond);
488
489 DO_RAID_COND(&raidPtr->waitForReconCond);
490
491 DO_RAID_MUTEX(&raidPtr->recon_done_proc_mutex);
492 DO_RAID_INIT_CONFIGURE(rf_ConfigureDisks);
493 DO_RAID_INIT_CONFIGURE(rf_ConfigureSpareDisks);
494 /* do this after ConfigureDisks & ConfigureSpareDisks to be sure dev no. is set */
495 DO_RAID_INIT_CONFIGURE(rf_ConfigureDiskQueues);
496
497 DO_RAID_INIT_CONFIGURE(rf_ConfigureLayout);
498
499 DO_RAID_INIT_CONFIGURE(rf_ConfigurePSStatus);
500
501 for(row=0;row<raidPtr->numRow;row++) {
502 for(col=0;col<raidPtr->numCol;col++) {
503 /*
504 * XXX better distribution
505 */
506 raidPtr->hist_diskreq[row][col] = 0;
507 }
508 }
509
510 if (rf_keepAccTotals) {
511 raidPtr->keep_acc_totals = 1;
512 }
513
514 rf_StartUserStats(raidPtr);
515
516 raidPtr->valid = 1;
517 return(0);
518 }
519
520 static int init_rad(desc)
521 RF_RaidAccessDesc_t *desc;
522 {
523 int rc;
524
525 rc = rf_mutex_init(&desc->mutex);
526 if (rc) {
527 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
528 __LINE__, rc);
529 return(rc);
530 }
531 rc = rf_cond_init(&desc->cond);
532 if (rc) {
533 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
534 __LINE__, rc);
535 rf_mutex_destroy(&desc->mutex);
536 return(rc);
537 }
538 return(0);
539 }
540
541 static void clean_rad(desc)
542 RF_RaidAccessDesc_t *desc;
543 {
544 rf_mutex_destroy(&desc->mutex);
545 rf_cond_destroy(&desc->cond);
546 }
547
548 static void rf_ShutdownRDFreeList(ignored)
549 void *ignored;
550 {
551 RF_FREELIST_DESTROY_CLEAN(rf_rad_freelist,next,(RF_RaidAccessDesc_t *),clean_rad);
552 }
553
554 static int rf_ConfigureRDFreeList(listp)
555 RF_ShutdownList_t **listp;
556 {
557 int rc;
558
559 RF_FREELIST_CREATE(rf_rad_freelist, RF_MAX_FREE_RAD,
560 RF_RAD_INC, sizeof(RF_RaidAccessDesc_t));
561 if (rf_rad_freelist == NULL) {
562 return(ENOMEM);
563 }
564 rc = rf_ShutdownCreate(listp, rf_ShutdownRDFreeList, NULL);
565 if (rc) {
566 RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
567 __LINE__, rc);
568 rf_ShutdownRDFreeList(NULL);
569 return(rc);
570 }
571 RF_FREELIST_PRIME_INIT(rf_rad_freelist, RF_RAD_INITIAL,next,
572 (RF_RaidAccessDesc_t *),init_rad);
573 return(0);
574 }
575
576 RF_RaidAccessDesc_t *rf_AllocRaidAccDesc(
577 RF_Raid_t *raidPtr,
578 RF_IoType_t type,
579 RF_RaidAddr_t raidAddress,
580 RF_SectorCount_t numBlocks,
581 caddr_t bufPtr,
582 void *bp,
583 RF_DagHeader_t **paramDAG,
584 RF_AccessStripeMapHeader_t **paramASM,
585 RF_RaidAccessFlags_t flags,
586 void (*cbF)(struct buf *),
587 void *cbA,
588 RF_AccessState_t *states)
589 {
590 RF_RaidAccessDesc_t *desc;
591
592 RF_FREELIST_GET_INIT_NOUNLOCK(rf_rad_freelist,desc,next,(RF_RaidAccessDesc_t *),init_rad);
593 if (raidPtr->waitShutdown) {
594 /*
595 * Actually, we're shutting the array down. Free the desc
596 * and return NULL.
597 */
598 RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
599 RF_FREELIST_FREE_CLEAN(rf_rad_freelist,desc,next,clean_rad);
600 return(NULL);
601 }
602 raidPtr->nAccOutstanding++;
603 RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
604
605 desc->raidPtr = (void*)raidPtr;
606 desc->type = type;
607 desc->raidAddress = raidAddress;
608 desc->numBlocks = numBlocks;
609 desc->bufPtr = bufPtr;
610 desc->bp = bp;
611 desc->paramDAG = paramDAG;
612 desc->paramASM = paramASM;
613 desc->flags = flags;
614 desc -> states = states;
615 desc -> state = 0;
616
617 desc->status = 0;
618 bzero((char *)&desc->tracerec, sizeof(RF_AccTraceEntry_t));
619 desc->callbackFunc= (void (*)(RF_CBParam_t))cbF; /* XXX */
620 desc->callbackArg = cbA;
621 desc->next = NULL;
622 desc->head = desc;
623 desc->numPending = 0;
624 desc->cleanupList = NULL;
625 rf_MakeAllocList(desc->cleanupList);
626 rf_get_threadid(desc->tid);
627 return(desc);
628 }
629
630 void rf_FreeRaidAccDesc(RF_RaidAccessDesc_t *desc)
631 {
632 RF_Raid_t *raidPtr = desc->raidPtr;
633
634 RF_ASSERT(desc);
635
636 rf_FreeAllocList(desc->cleanupList);
637 RF_FREELIST_FREE_CLEAN_NOUNLOCK(rf_rad_freelist,desc,next,clean_rad);
638 raidPtr->nAccOutstanding--;
639 if (raidPtr->waitShutdown) {
640 RF_SIGNAL_COND(raidPtr->outstandingCond);
641 }
642 RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
643 }
644
645 /*********************************************************************
646 * Main routine for performing an access.
647 * Accesses are retried until a DAG can not be selected. This occurs
648 * when either the DAG library is incomplete or there are too many
649 * failures in a parity group.
650 ********************************************************************/
651 int rf_DoAccess(
652 RF_Raid_t *raidPtr,
653 RF_IoType_t type,
654 int async_flag,
655 RF_RaidAddr_t raidAddress,
656 RF_SectorCount_t numBlocks,
657 caddr_t bufPtr,
658 void *bp_in,
659 RF_DagHeader_t **paramDAG,
660 RF_AccessStripeMapHeader_t **paramASM,
661 RF_RaidAccessFlags_t flags,
662 RF_RaidAccessDesc_t **paramDesc,
663 void (*cbF)(struct buf *),
664 void *cbA)
665 /*
666 type should be read or write
667 async_flag should be RF_TRUE or RF_FALSE
668 bp_in is a buf pointer. void * to facilitate ignoring it outside the kernel
669 */
670 {
671 int tid;
672 RF_RaidAccessDesc_t *desc;
673 caddr_t lbufPtr = bufPtr;
674 struct buf *bp = (struct buf *) bp_in;
675 #if DFSTRACE > 0
676 struct { RF_uint64 raidAddr; int numBlocks; char type;} dfsrecord;
677 #endif /* DFSTRACE > 0 */
678
679 raidAddress += rf_raidSectorOffset;
680
681 if (!raidPtr->valid) {
682 RF_ERRORMSG("RAIDframe driver not successfully configured. Rejecting access.\n");
683 IO_BUF_ERR(bp, EINVAL, raidPtr->raidid);
684 return(EINVAL);
685 }
686
687 #if defined(KERNEL) && DFSTRACE > 0
688 if (rf_DFSTraceAccesses) {
689 dfsrecord.raidAddr = raidAddress;
690 dfsrecord.numBlocks = numBlocks;
691 dfsrecord.type = type;
692 dfs_log(DFS_NOTE, (char *) &dfsrecord, sizeof(dfsrecord), 0);
693 }
694 #endif /* KERNEL && DFSTRACE > 0 */
695
696 rf_get_threadid(tid);
697 if (rf_accessDebug) {
698
699 printf("logBytes is: %d %d %d\n",raidPtr->raidid,
700 raidPtr->logBytesPerSector,
701 (int)rf_RaidAddressToByte(raidPtr,numBlocks));
702 printf("[%d] %s raidAddr %d (stripeid %d-%d) numBlocks %d (%d bytes) buf 0x%lx\n",tid,
703 (type==RF_IO_TYPE_READ) ? "READ":"WRITE", (int)raidAddress,
704 (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress),
705 (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress+numBlocks-1),
706 (int) numBlocks,
707 (int) rf_RaidAddressToByte(raidPtr,numBlocks),
708 (long) bufPtr);
709 }
710
711 if (raidAddress + numBlocks > raidPtr->totalSectors) {
712
713 printf("DoAccess: raid addr %lu too large to access %lu sectors. Max legal addr is %lu\n",
714 (u_long)raidAddress,(u_long)numBlocks,(u_long)raidPtr->totalSectors);
715
716 if (type == RF_IO_TYPE_READ) {
717 IO_BUF_ERR(bp, ENOSPC, raidPtr->raidid);
718 return(ENOSPC);
719 } else {
720 IO_BUF_ERR(bp, ENOSPC, raidPtr->raidid);
721 return(ENOSPC);
722 }
723 }
724
725 desc = rf_AllocRaidAccDesc(raidPtr, type, raidAddress,
726 numBlocks, lbufPtr, bp, paramDAG, paramASM,
727 flags, cbF, cbA, raidPtr->Layout.map->states);
728
729 if (desc == NULL) {
730 return(ENOMEM);
731 }
732
733 RF_ETIMER_START(desc->tracerec.tot_timer);
734
735 desc->async_flag = async_flag;
736
737 rf_ContinueRaidAccess(desc);
738
739 return(0);
740 }
741
742 /* force the array into reconfigured mode without doing reconstruction */
743 int rf_SetReconfiguredMode(raidPtr, row, col)
744 RF_Raid_t *raidPtr;
745 int row;
746 int col;
747 {
748 if (!(raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
749 printf("Can't set reconfigured mode in dedicated-spare array\n");
750 RF_PANIC();
751 }
752 RF_LOCK_MUTEX(raidPtr->mutex);
753 raidPtr->numFailures++;
754 raidPtr->Disks[row][col].status = rf_ds_dist_spared;
755 raidPtr->status[row] = rf_rs_reconfigured;
756 /* install spare table only if declustering + distributed sparing architecture. */
757 if ( raidPtr->Layout.map->flags & RF_BD_DECLUSTERED )
758 rf_InstallSpareTable(raidPtr, row, col);
759 RF_UNLOCK_MUTEX(raidPtr->mutex);
760 return(0);
761 }
762
763 extern int fail_row, fail_col, fail_time;
764 extern int delayed_recon;
765
766 int rf_FailDisk(
767 RF_Raid_t *raidPtr,
768 int frow,
769 int fcol,
770 int initRecon)
771 {
772 int tid;
773
774 rf_get_threadid(tid);
775 printf("[%d] Failing disk r%d c%d\n",tid,frow,fcol);
776 RF_LOCK_MUTEX(raidPtr->mutex);
777 raidPtr->numFailures++;
778 raidPtr->Disks[frow][fcol].status = rf_ds_failed;
779 raidPtr->status[frow] = rf_rs_degraded;
780 RF_UNLOCK_MUTEX(raidPtr->mutex);
781 if (initRecon)
782 rf_ReconstructFailedDisk(raidPtr, frow, fcol);
783 return(0);
784 }
785
786 /* releases a thread that is waiting for the array to become quiesced.
787 * access_suspend_mutex should be locked upon calling this
788 */
789 void rf_SignalQuiescenceLock(raidPtr, reconDesc)
790 RF_Raid_t *raidPtr;
791 RF_RaidReconDesc_t *reconDesc;
792 {
793 int tid;
794
795 if (rf_quiesceDebug) {
796 rf_get_threadid(tid);
797 printf("[%d] Signalling quiescence lock\n", tid);
798 }
799 raidPtr->access_suspend_release = 1;
800
801 if (raidPtr->waiting_for_quiescence) {
802 SIGNAL_QUIESCENT_COND(raidPtr);
803 }
804 }
805
806 /* suspends all new requests to the array. No effect on accesses that are in flight. */
807 int rf_SuspendNewRequestsAndWait(raidPtr)
808 RF_Raid_t *raidPtr;
809 {
810 if (rf_quiesceDebug)
811 printf("Suspending new reqs\n");
812
813 RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
814 raidPtr->accesses_suspended++;
815 raidPtr->waiting_for_quiescence = (raidPtr->accs_in_flight == 0) ? 0 : 1;
816
817 if (raidPtr->waiting_for_quiescence) {
818 raidPtr->access_suspend_release=0;
819 while (!raidPtr->access_suspend_release) {
820 printf("Suspending: Waiting for Quiesence\n");
821 WAIT_FOR_QUIESCENCE(raidPtr);
822 raidPtr->waiting_for_quiescence = 0;
823 }
824 }
825 printf("Quiesence reached..\n");
826
827 RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
828 return (raidPtr->waiting_for_quiescence);
829 }
830
831 /* wake up everyone waiting for quiescence to be released */
832 void rf_ResumeNewRequests(raidPtr)
833 RF_Raid_t *raidPtr;
834 {
835 RF_CallbackDesc_t *t, *cb;
836
837 if (rf_quiesceDebug)
838 printf("Resuming new reqs\n");
839
840 RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
841 raidPtr->accesses_suspended--;
842 if (raidPtr->accesses_suspended == 0)
843 cb = raidPtr->quiesce_wait_list;
844 else
845 cb = NULL;
846 raidPtr->quiesce_wait_list = NULL;
847 RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
848
849 while (cb) {
850 t = cb;
851 cb = cb->next;
852 (t->callbackFunc)(t->callbackArg);
853 rf_FreeCallbackDesc(t);
854 }
855 }
856
857 /*****************************************************************************************
858 *
859 * debug routines
860 *
861 ****************************************************************************************/
862
863 static void set_debug_option(name, val)
864 char *name;
865 long val;
866 {
867 RF_DebugName_t *p;
868
869 for (p = rf_debugNames; p->name; p++) {
870 if (!strcmp(p->name, name)) {
871 *(p->ptr) = val;
872 printf("[Set debug variable %s to %ld]\n",name,val);
873 return;
874 }
875 }
876 RF_ERRORMSG1("Unknown debug string \"%s\"\n",name);
877 }
878
879
880 /* would like to use sscanf here, but apparently not available in kernel */
881 /*ARGSUSED*/
882 static void rf_ConfigureDebug(cfgPtr)
883 RF_Config_t *cfgPtr;
884 {
885 char *val_p, *name_p, *white_p;
886 long val;
887 int i;
888
889 rf_ResetDebugOptions();
890 for (i=0; cfgPtr->debugVars[i][0] && i < RF_MAXDBGV; i++) {
891 name_p = rf_find_non_white(&cfgPtr->debugVars[i][0]);
892 white_p = rf_find_white(name_p); /* skip to start of 2nd word */
893 val_p = rf_find_non_white(white_p);
894 if (*val_p == '0' && *(val_p+1) == 'x') val = rf_htoi(val_p+2);
895 else val = rf_atoi(val_p);
896 *white_p = '\0';
897 set_debug_option(name_p, val);
898 }
899 }
900
/* performance monitoring stuff */

/*
 * Convert a struct timeval to microseconds (as a long).
 * The argument is parenthesized so expressions like TIMEVAL_TO_US(*tp)
 * expand correctly (standard macro hygiene).
 */
#define TIMEVAL_TO_US(_t_) (((long) (_t_).tv_sec) * 1000000L + (long) (_t_).tv_usec)
904
905 #if !defined(_KERNEL) && !defined(SIMULATE)
906
907 /*
908 * Throughput stats currently only used in user-level RAIDframe
909 */
910
911 static int rf_InitThroughputStats(
912 RF_ShutdownList_t **listp,
913 RF_Raid_t *raidPtr,
914 RF_Config_t *cfgPtr)
915 {
916 int rc;
917
918 /* these used by user-level raidframe only */
919 rc = rf_create_managed_mutex(listp, &raidPtr->throughputstats.mutex);
920 if (rc) {
921 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
922 __LINE__, rc);
923 return(rc);
924 }
925 raidPtr->throughputstats.sum_io_us = 0;
926 raidPtr->throughputstats.num_ios = 0;
927 raidPtr->throughputstats.num_out_ios = 0;
928 return(0);
929 }
930
931 void rf_StartThroughputStats(RF_Raid_t *raidPtr)
932 {
933 RF_LOCK_MUTEX(raidPtr->throughputstats.mutex);
934 raidPtr->throughputstats.num_ios++;
935 raidPtr->throughputstats.num_out_ios++;
936 if (raidPtr->throughputstats.num_out_ios == 1)
937 RF_GETTIME(raidPtr->throughputstats.start);
938 RF_UNLOCK_MUTEX(raidPtr->throughputstats.mutex);
939 }
940
941 static void rf_StopThroughputStats(RF_Raid_t *raidPtr)
942 {
943 struct timeval diff;
944
945 RF_LOCK_MUTEX(raidPtr->throughputstats.mutex);
946 raidPtr->throughputstats.num_out_ios--;
947 if (raidPtr->throughputstats.num_out_ios == 0) {
948 RF_GETTIME(raidPtr->throughputstats.stop);
949 RF_TIMEVAL_DIFF(&raidPtr->throughputstats.start, &raidPtr->throughputstats.stop, &diff);
950 raidPtr->throughputstats.sum_io_us += TIMEVAL_TO_US(diff);
951 }
952 RF_UNLOCK_MUTEX(raidPtr->throughputstats.mutex);
953 }
954
955 static void rf_PrintThroughputStats(RF_Raid_t *raidPtr)
956 {
957 RF_ASSERT(raidPtr->throughputstats.num_out_ios == 0);
958 if ( raidPtr->throughputstats.sum_io_us != 0 ) {
959 printf("[Througphut: %8.2f IOs/second]\n", raidPtr->throughputstats.num_ios
960 / (raidPtr->throughputstats.sum_io_us / 1000000.0));
961 }
962 }
963
964 #endif /* !KERNEL && !SIMULATE */
965
966 void rf_StartUserStats(RF_Raid_t *raidPtr)
967 {
968 RF_GETTIME(raidPtr->userstats.start);
969 raidPtr->userstats.sum_io_us = 0;
970 raidPtr->userstats.num_ios = 0;
971 raidPtr->userstats.num_sect_moved = 0;
972 }
973
/* Record the end time of the user-level statistics interval. */
void rf_StopUserStats(RF_Raid_t *raidPtr)
{
  RF_GETTIME(raidPtr->userstats.stop);
}
978
979 void rf_UpdateUserStats(raidPtr, rt, numsect)
980 RF_Raid_t *raidPtr;
981 int rt; /* resp time in us */
982 int numsect; /* number of sectors for this access */
983 {
984 raidPtr->userstats.sum_io_us += rt;
985 raidPtr->userstats.num_ios++;
986 raidPtr->userstats.num_sect_moved += numsect;
987 }
988
989 void rf_PrintUserStats(RF_Raid_t *raidPtr)
990 {
991 long elapsed_us, mbs, mbs_frac;
992 struct timeval diff;
993
994 RF_TIMEVAL_DIFF(&raidPtr->userstats.start, &raidPtr->userstats.stop, &diff);
995 elapsed_us = TIMEVAL_TO_US(diff);
996
997 /* 2000 sectors per megabyte, 10000000 microseconds per second */
998 if (elapsed_us)
999 mbs = (raidPtr->userstats.num_sect_moved / 2000) / (elapsed_us / 1000000);
1000 else
1001 mbs = 0;
1002
1003 /* this computes only the first digit of the fractional mb/s moved */
1004 if (elapsed_us) {
1005 mbs_frac = ((raidPtr->userstats.num_sect_moved / 200) / (elapsed_us / 1000000))
1006 - (mbs * 10);
1007 }
1008 else {
1009 mbs_frac = 0;
1010 }
1011
1012 printf("Number of I/Os: %ld\n",raidPtr->userstats.num_ios);
1013 printf("Elapsed time (us): %ld\n",elapsed_us);
1014 printf("User I/Os per second: %ld\n",RF_DB0_CHECK(raidPtr->userstats.num_ios, (elapsed_us/1000000)));
1015 printf("Average user response time: %ld us\n",RF_DB0_CHECK(raidPtr->userstats.sum_io_us, raidPtr->userstats.num_ios));
1016 printf("Total sectors moved: %ld\n",raidPtr->userstats.num_sect_moved);
1017 printf("Average access size (sect): %ld\n",RF_DB0_CHECK(raidPtr->userstats.num_sect_moved, raidPtr->userstats.num_ios));
1018 printf("Achieved data rate: %ld.%ld MB/sec\n",mbs,mbs_frac);
1019 }
1020