/* $NetBSD: rf_driver.c,v 1.2 1998/11/13 13:45:15 drochner Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland, Khalil Amiri, Claudson Bornstein, William V. Courtright II,
7 * Robby Findler, Daniel Stodolsky, Rachad Youssef, Jim Zelenka
8 *
9 * Permission to use, copy, modify and distribute this software and
10 * its documentation is hereby granted, provided that both the copyright
11 * notice and this permission notice appear in all copies of the
12 * software, derivative works or modified versions, and any portions
13 * thereof, and that both notices appear in supporting documentation.
14 *
15 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
16 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
17 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
18 *
19 * Carnegie Mellon requests users of this software to return to
20 *
21 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
22 * School of Computer Science
23 * Carnegie Mellon University
24 * Pittsburgh PA 15213-3890
25 *
26 * any improvements or extensions that they make and grant Carnegie the
27 * rights to redistribute these changes.
28 */
29
30 /******************************************************************************
31 *
32 * rf_driver.c -- main setup, teardown, and access routines for the RAID driver
33 *
 * all routines are prefixed with rf_ (raidframe), to avoid conflicts.
35 *
36 ******************************************************************************/
37
38 /*
39 * :
40 * Log: rf_driver.c,v
41 * Revision 1.147 1996/08/21 04:12:46 jimz
42 * added hook for starting out req_hist w/ more distributed values
43 * (currently not done)
44 *
45 * Revision 1.146 1996/07/29 14:05:12 jimz
46 * fix numPUs/numRUs confusion (everything is now numRUs)
47 * clean up some commenting, return values
48 *
49 * Revision 1.145 1996/07/28 20:31:39 jimz
50 * i386netbsd port
51 * true/false fixup
52 *
53 * Revision 1.144 1996/07/27 18:40:24 jimz
54 * cleanup sweep
55 *
56 * Revision 1.143 1996/07/22 21:11:53 jimz
57 * fix formatting on DoAccess error msg
58 *
59 * Revision 1.142 1996/07/19 16:10:06 jimz
60 * added call to rf_ResetDebugOptions() in rf_ConfigureDebug()
61 *
62 * Revision 1.141 1996/07/18 22:57:14 jimz
63 * port simulator to AIX
64 *
65 * Revision 1.140 1996/07/17 21:00:58 jimz
66 * clean up timer interface, tracing
67 *
68 * Revision 1.139 1996/07/15 05:40:41 jimz
69 * some recon datastructure cleanup
70 * better handling of multiple failures
71 * added undocumented double-recon test
72 *
73 * Revision 1.138 1996/07/11 19:08:00 jimz
74 * generalize reconstruction mechanism
75 * allow raid1 reconstructs via copyback (done with array
76 * quiesced, not online, therefore not disk-directed)
77 *
78 * Revision 1.137 1996/07/10 22:28:00 jimz
79 * get rid of obsolete row statuses (dead,degraded2)
80 *
81 * Revision 1.136 1996/06/17 14:38:33 jimz
82 * properly #if out RF_DEMO code
83 * fix bug in MakeConfig that was causing weird behavior
84 * in configuration routines (config was not zeroed at start)
85 * clean up genplot handling of stacks
86 *
87 * Revision 1.135 1996/06/17 03:20:32 jimz
88 * move out raidframe_attr_default
89 * don't monkey with stack sizes
90 *
91 * Revision 1.134 1996/06/14 23:15:38 jimz
92 * attempt to deal with thread GC problem
93 *
94 * Revision 1.133 1996/06/14 21:24:08 jimz
95 * new ConfigureEtimer init
96 * moved out timer vars
97 *
98 * Revision 1.132 1996/06/14 16:19:03 jimz
99 * remove include of pdllib.h (beginning of PDL cleanup)
100 *
101 * Revision 1.131 1996/06/14 14:35:24 jimz
102 * clean up dfstrace protection
103 *
104 * Revision 1.130 1996/06/14 14:16:09 jimz
105 * engine config is now array-specific
106 *
107 * Revision 1.129 1996/06/13 19:08:10 jimz
108 * add debug var to force keep_acc_totals on
109 *
110 * Revision 1.128 1996/06/11 10:57:08 jimz
111 * init recon_done_proc_mutex
112 *
113 * Revision 1.127 1996/06/10 14:18:58 jimz
114 * move user, throughput stats into per-array structure
115 *
116 * Revision 1.126 1996/06/10 11:55:47 jimz
117 * Straightened out some per-array/not-per-array distinctions, fixed
118 * a couple bugs related to confusion. Added shutdown lists. Removed
119 * layout shutdown function (now subsumed by shutdown lists).
120 *
121 * Revision 1.125 1996/06/09 02:36:46 jimz
122 * lots of little crufty cleanup- fixup whitespace
123 * issues, comment #ifdefs, improve typing in some
124 * places (esp size-related)
125 *
126 * Revision 1.124 1996/06/07 21:33:04 jimz
127 * begin using consistent types for sector numbers,
128 * stripe numbers, row+col numbers, recon unit numbers
129 *
130 * Revision 1.123 1996/06/05 19:38:32 jimz
131 * fixed up disk queueing types config
132 * added sstf disk queueing
133 * fixed exit bug on diskthreads (ref-ing bad mem)
134 *
135 * Revision 1.122 1996/06/05 18:06:02 jimz
136 * Major code cleanup. The Great Renaming is now done.
137 * Better modularity. Better typing. Fixed a bunch of
138 * synchronization bugs. Made a lot of global stuff
139 * per-desc or per-array. Removed dead code.
140 *
141 * Revision 1.121 1996/06/03 23:28:26 jimz
142 * more bugfixes
143 * check in tree to sync for IPDS runs with current bugfixes
144 * there still may be a problem with threads in the script test
145 * getting I/Os stuck- not trivially reproducible (runs ~50 times
146 * in a row without getting stuck)
147 *
148 * Revision 1.120 1996/06/02 17:31:48 jimz
149 * Moved a lot of global stuff into array structure, where it belongs.
150 * Fixed up paritylogging, pss modules in this manner. Some general
151 * code cleanup. Removed lots of dead code, some dead files.
152 *
153 * Revision 1.119 1996/05/31 22:26:54 jimz
154 * fix a lot of mapping problems, memory allocation problems
155 * found some weird lock issues, fixed 'em
156 * more code cleanup
157 *
158 * Revision 1.118 1996/05/30 23:22:16 jimz
159 * bugfixes of serialization, timing problems
160 * more cleanup
161 *
162 * Revision 1.117 1996/05/30 16:28:33 jimz
163 * typo in rf_SignalQuiescenceLock() fixed
164 *
165 * Revision 1.116 1996/05/30 12:59:18 jimz
166 * make etimer happier, more portable
167 *
168 * Revision 1.115 1996/05/30 11:29:41 jimz
169 * Numerous bug fixes. Stripe lock release code disagreed with the taking code
170 * about when stripes should be locked (I made it consistent: no parity, no lock)
171 * There was a lot of extra serialization of I/Os which I've removed- a lot of
172 * it was to calculate values for the cache code, which is no longer with us.
173 * More types, function, macro cleanup. Added code to properly quiesce the array
174 * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
175 * before. Fixed memory allocation, freeing bugs.
176 *
177 * Revision 1.114 1996/05/27 18:56:37 jimz
178 * more code cleanup
179 * better typing
180 * compiles in all 3 environments
181 *
182 * Revision 1.113 1996/05/24 22:17:04 jimz
183 * continue code + namespace cleanup
184 * typed a bunch of flags
185 *
186 * Revision 1.112 1996/05/24 04:28:55 jimz
187 * release cleanup ckpt
188 *
189 * Revision 1.111 1996/05/24 01:59:45 jimz
190 * another checkpoint in code cleanup for release
191 * time to sync kernel tree
192 *
193 * Revision 1.110 1996/05/23 21:46:35 jimz
194 * checkpoint in code cleanup (release prep)
195 * lots of types, function names have been fixed
196 *
197 * Revision 1.109 1996/05/23 00:39:56 jimz
198 * demoMode -> rf_demoMode
199 *
200 * Revision 1.108 1996/05/23 00:33:23 jimz
201 * code cleanup: move all debug decls to rf_options.c, all extern
202 * debug decls to rf_options.h, all debug vars preceded by rf_
203 *
204 * Revision 1.107 1996/05/21 14:30:04 jimz
205 * idler_desc_mutex should be ifndef SIMULATE
206 *
207 * Revision 1.106 1996/05/20 19:31:12 jimz
208 * add atomic debug (mutex and cond leak finder) stuff
209 *
210 * Revision 1.105 1996/05/20 16:12:45 jimz
211 * switch to rf_{mutex,cond}_{init,destroy}
212 *
213 * Revision 1.104 1996/05/18 20:09:41 jimz
214 * bit of cleanup to compile cleanly in kernel, once again
215 *
216 * Revision 1.103 1996/05/18 19:51:34 jimz
217 * major code cleanup- fix syntax, make some types consistent,
218 * add prototypes, clean out dead code, et cetera
219 *
220 * Revision 1.102 1996/05/16 21:20:51 jimz
221 * use FREELIST stuff to manage access descriptors
222 *
223 * Revision 1.101 1996/05/16 14:21:10 jimz
224 * remove bogus copies from write path on user
225 *
226 * Revision 1.100 1996/05/15 22:33:54 jimz
227 * appropriately #ifdef cache stuff
228 *
229 * Revision 1.99 1996/05/08 21:34:41 jimz
230 * #if 0 ShutdownCache() and ConfigureCache()
231 *
232 * Revision 1.98 1996/05/08 21:01:24 jimz
233 * fixed up enum type names that were conflicting with other
234 * enums and function names (ie, "panic")
235 * future naming trends will be towards RF_ and rf_ for
236 * everything raidframe-related
237 *
238 * Revision 1.97 1996/05/07 19:02:58 wvcii
239 * corrected header comment of rf_DoAccess()
240 * reordered free of desc in FreeRaidAccDesc() The desc is now
241 * freed last.
242 *
243 * Revision 1.96 1996/05/07 17:40:50 jimz
244 * add doDebug
245 *
246 * Revision 1.95 1996/05/06 21:35:23 jimz
247 * fixed ordering of cleanup and removed extra decrement of configureCount
248 *
249 * Revision 1.94 1996/05/06 18:44:14 jimz
250 * reorder cleanup to not blow alloclist out from under various modules
251 * zero raidPtr contents on config
252 *
253 * Revision 1.93 1996/05/04 17:06:53 jimz
254 * Fail the I/O with ENOSPC if reading past end of the array in the kernel.
255 *
256 * Revision 1.92 1996/05/03 19:44:22 wvcii
257 * debug vars degDagDebug and enableAtomicRMW now defined
258 * in this file.
259 *
260 * Revision 1.91 1995/12/12 18:10:06 jimz
261 * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
262 * fix 80-column brain damage in comments
263 *
264 * Revision 1.90 1995/12/08 15:07:03 arw
265 * cache code cleanup
266 *
267 * Revision 1.89 1995/12/06 20:53:58 wvcii
268 * created debug var forceParityLogReint
269 * this variable forces reintegration of all parity logs at shutdown
270 *
271 * Revision 1.88 1995/12/01 15:59:10 root
272 * added copyright info
273 *
274 * Revision 1.87 1995/11/28 21:34:02 amiri
275 * modified SetReconfiguredMode so that it installs the
276 * spare table only if arch is declustered based on block designs
277 *
278 * Revision 1.86 1995/11/21 23:06:11 amiri
279 * added division by zero check in printing
280 * throughput stats.
281 *
282 * Revision 1.85 1995/11/19 16:27:25 wvcii
283 * disableParityVerify now defined locally, only read from config
284 * file for !KERNEL compiles
285 *
286 * Revision 1.84 1995/11/17 15:08:31 wvcii
287 * added debug var disableParityVerify
288 * used in RealLoopTest to disable parity verification
289 *
290 * Revision 1.83 1995/11/07 15:48:43 wvcii
291 * deleted debug vars: suppressAtomicRMW, enableRollAway, concatDagDebug
292 * deleted debug vars: debugSelectUnit, debugSelectBlock
293 * added debug var: enableAtomicRMW
294 *
295 * Revision 1.82 1995/10/18 19:28:45 amiri
296 * added support for reconstruction demos in the
297 * simulator, by updating some simulator
298 * variables in Faildisk.
299 *
300 * Revision 1.81 1995/10/09 18:36:33 jimz
301 * move rf_StopThroughputStats() into FreeAccDesc()
302 * changed throughput output print format
303 * added user-level copy to write path to emulate kernel hack
304 *
305 * Revision 1.80 1995/10/09 18:07:47 wvcii
306 * moved call to rf_StopThroughputStats to rf_states.c
307 *
308 * Revision 1.79 1995/10/09 17:38:53 jimz
309 * quiesce an array for user-level testing before shutting it down
310 * (should this also be done in the kernel?)
311 *
312 * Revision 1.78 1995/10/09 15:35:43 wvcii
313 * added code to measure throughput in user mode
314 *
315 * Revision 1.77 1995/10/05 06:18:59 jimz
316 * Changed DDEventRequest() to take additional arg, used by simulator
317 * to cache diskid so queue length can be decremented on io complete
318 * (this is a hack to get around the fact that the event mechanism
319 * assumes it can dereference arbitrary handles on enqueued events)
320 *
321 * Revision 1.76 1995/10/04 07:25:10 jimz
322 * turn off bigstacks by default
323 *
324 * Revision 1.75 1995/10/04 07:24:34 jimz
325 * code for bigstacks in user process
326 *
327 * Revision 1.74 1995/09/26 21:42:51 wvcii
328 * removed calls to ConfigureCache, ShutdownCache when building kernel
329 * kernel currently does not support any cached architectures
330 *
331 * Revision 1.73 1995/09/20 21:05:35 jimz
332 * add missing unit arg to IO_BUF_ERR() in non-kernel case
333 *
334 * Revision 1.72 1995/09/19 23:02:44 jimz
335 * call RF_DKU_END_IO in the appropriate places
336 *
337 * Revision 1.71 1995/09/07 19:02:31 jimz
338 * mods to get raidframe to compile and link
339 * in kernel environment
340 *
341 * Revision 1.70 1995/09/06 19:24:01 wvcii
342 * added debug vars enableRollAway and debugRecovery
343 *
344 * Revision 1.69 1995/08/24 19:25:36 rachad
345 * Fixes to LSS GC in the simulater
346 *
347 * Revision 1.68 1995/07/28 21:43:42 robby
348 * checkin after leaving for Rice. Bye
349 *
350 * Revision 1.67 1995/07/26 18:06:52 cfb
351 * *** empty log message ***
352 *
353 * Revision 1.66 1995/07/26 03:25:24 robby
354 * fixed accesses mutex and updated call to ConfigureCache
355 *
356 * Revision 1.65 1995/07/25 14:36:52 rachad
357 * *** empty log message ***
358 *
359 * Revision 1.64 1995/07/21 19:29:05 robby
360 * added total_accesses
361 *
362 * Revision 1.63 1995/07/20 19:43:35 cfb
363 * *** empty log message ***
364 *
365 * Revision 1.62 1995/07/20 16:10:24 rachad
366 * *** empty log message ***
367 *
368 * Revision 1.61 1995/07/20 03:36:53 rachad
369 * Added suport for cache warming
370 *
371 * Revision 1.60 1995/07/17 22:31:31 cfb
372 * *** empty log message ***
373 *
374 * Revision 1.59 1995/07/16 17:02:23 cfb
375 * *** empty log message ***
376 *
377 * Revision 1.58 1995/07/16 15:19:27 cfb
378 * *** empty log message ***
379 *
380 * Revision 1.57 1995/07/16 03:17:01 cfb
381 * *** empty log message ***
382 *
383 * Revision 1.56 1995/07/13 16:11:59 cfb
384 * *** empty log message ***
385 *
386 * Revision 1.55 1995/07/13 15:42:40 cfb
387 * added cacheDebug variable ...
388 *
389 * Revision 1.54 1995/07/13 14:28:27 rachad
390 * *** empty log message ***
391 *
392 * Revision 1.53 1995/07/10 21:48:52 robby
393 * added virtualStripingWarnings
394 *
395 * Revision 1.52 1995/07/10 20:41:13 rachad
396 * *** empty log message ***
397 *
398 * Revision 1.51 1995/07/09 19:46:49 cfb
399 * Added cache Shutdown
400 *
401 * Revision 1.50 1995/07/08 21:38:53 rachad
402 * Added support for interactive traces
403 * in the simulator
404 *
405 * Revision 1.49 1995/07/08 18:05:39 rachad
406 * Linked up Claudsons code with the real cache
407 *
408 * Revision 1.48 1995/07/07 16:00:22 cfb
409 * Added initialization of cacheDesc to AllocRaidAccDesc
410 *
411 * Revision 1.47 1995/07/06 14:22:37 rachad
412 * Merge complete
413 *
414 * Revision 1.46.50.2 1995/06/21 17:48:30 robby
415 * test
416 *
417 * Revision 1.46.50.1 1995/06/21 17:34:49 robby
418 * branching to work on "meta-dag" capabilities
419 *
420 * Revision 1.46.10.5 1995/07/03 21:58:34 holland
421 * added support for suppressing both stripe locks & large writes
422 *
423 * Revision 1.46.10.4 1995/06/27 03:42:48 holland
424 * typo fix
425 *
426 * Revision 1.46.10.3 1995/06/27 03:31:42 holland
427 * prototypes
428 *
429 * Revision 1.46.10.2 1995/06/27 03:17:57 holland
430 * fixed callback bug in kernel rf_DoAccess
431 *
432 * Revision 1.46.10.1 1995/06/25 14:32:44 holland
433 * initial checkin on new branch
434 *
435 * Revision 1.46 1995/06/13 17:52:41 holland
436 * added UserStats stuff
437 *
438 * Revision 1.45 1995/06/13 16:03:41 rachad
439 * *** empty log message ***
440 *
441 * Revision 1.44 1995/06/12 15:54:40 rachad
442 * Added garbege collection for log structured storage
443 *
444 * Revision 1.43 1995/06/09 18:01:09 holland
445 * various changes related to in-kernel recon, multiple-row arrays,
446 * trace extraction from kernel, etc.
447 *
448 * Revision 1.42 1995/06/08 19:52:28 rachad
449 * *** empty log message ***
450 *
451 * Revision 1.41 1995/06/08 00:11:49 robby
452 * added a debug variable -- showVirtualSizeRequirements
453 *
454 * Revision 1.40 1995/06/05 00:33:30 holland
455 * protectedSectors bug fix
456 *
457 * Revision 1.39 1995/06/01 22:45:03 holland
458 * made compilation of parity logging and virtual striping
459 * stuff conditional on some constants defined in rf_archs.h
460 *
461 * Revision 1.38 1995/06/01 21:52:37 holland
462 * replaced NULL sizes in calls to Free() by -1, and caused this
463 * to suppress the size-mismatch error
464 *
465 * Revision 1.37 1995/05/26 20:04:54 wvcii
466 * modified parity logging debug vars
467 *
468 * Revision 1.36 95/05/21 15:32:41 wvcii
469 * added debug vars: parityLogDebug, numParityRegions, numParityLogs,
470 * numReintegrationThreads
471 *
472 * Revision 1.35 95/05/19 20:58:21 holland
473 * cleanups on error cases in rf_DoAccess
474 *
475 * Revision 1.34 1995/05/16 17:35:53 holland
476 * added rf_copyback_in_progress. this is debug-only.
477 *
478 * Revision 1.33 1995/05/15 12:25:35 holland
479 * bug fix in test code: no stripe locks were getting acquired in RAID0 mode
480 *
481 * Revision 1.32 1995/05/10 18:54:12 holland
482 * bug fixes related to deadlock problem at time of disk failure
483 * eliminated read-op-write code
484 * beefed up parity checking in loop test
485 * various small changes & new ASSERTs
486 *
487 * Revision 1.31 1995/05/02 22:49:02 holland
488 * add shutdown calls for each architecture
489 *
490 * Revision 1.30 1995/05/01 14:43:37 holland
491 * merged changes from Bill
492 *
493 * Revision 1.29 1995/05/01 13:28:00 holland
494 * parity range locks, locking disk requests, recon+parityscan in kernel, etc.
495 *
496 * Revision 1.28 1995/04/24 13:25:51 holland
497 * rewrite to move disk queues, recon, & atomic RMW to kernel
498 *
499 * Revision 1.27 1995/04/06 14:47:56 rachad
500 * merge completed
501 *
502 * Revision 1.26 1995/04/03 20:32:35 rachad
503 * added reconstruction to simulator
504 *
505 * Revision 1.25.10.2 1995/04/03 20:41:00 holland
506 * misc changes related to distributed sparing
507 *
508 * Revision 1.25.10.1 1995/03/17 20:04:01 holland
509 * initial checkin on new branch
510 *
511 * Revision 1.25 1995/03/15 20:34:30 holland
512 * changes for distributed sparing.
513 *
514 * Revision 1.24 1995/03/09 19:53:05 rachad
515 * *** empty log message ***
516 *
517 * Revision 1.23 1995/03/03 18:36:16 rachad
518 * Simulator mechanism added
519 *
520 * Revision 1.22 1995/03/01 20:25:48 holland
521 * kernelization changes
522 *
523 * Revision 1.21 1995/02/17 19:39:56 holland
524 * added size param to all calls to Free().
525 * this is ignored at user level, but necessary in the kernel.
526 *
527 * Revision 1.20 1995/02/17 13:37:49 holland
528 * kernelization changes -- not yet complete
529 *
530 * Revision 1.19 1995/02/10 18:08:07 holland
531 * fixed a few things I broke during kernelization
532 *
533 * Revision 1.18 1995/02/10 17:34:10 holland
534 * kernelization changes
535 *
536 * Revision 1.17 1995/02/04 15:51:35 holland
537 * kernelization changes
538 *
539 * Revision 1.16 1995/02/03 22:31:36 holland
540 * many changes related to kernelization
541 *
542 * Revision 1.15 1995/02/01 15:13:05 holland
543 * moved #include of general.h out of raid.h and into each file
544 *
545 * Revision 1.14 1995/02/01 14:25:19 holland
546 * began changes for kernelization:
547 * changed all instances of mutex_t and cond_t to DECLARE macros
548 * converted configuration code to use config structure
549 *
550 * Revision 1.13 1995/01/30 14:53:46 holland
551 * extensive changes related to making DoIO non-blocking
552 *
553 * Revision 1.12 1995/01/25 00:26:21 holland
554 * eliminated support for aio
555 *
556 * Revision 1.11 1995/01/24 23:58:46 holland
557 * multi-way recon XOR, plus various small changes
558 *
559 * Revision 1.10 1995/01/11 19:27:02 holland
560 * various changes related to performance tuning
561 *
562 * Revision 1.9 1994/12/05 15:29:09 holland
563 * added trace run time limitation (maxTraceRunTimeSec)
564 *
565 * Revision 1.8 1994/12/05 04:18:12 holland
566 * various new control vars in the config file
567 *
568 * Revision 1.7 1994/11/29 23:11:36 holland
569 * tracerec bug on dag retry fixed
570 *
571 * Revision 1.6 1994/11/29 22:11:38 danner
572 * holland updates
573 *
574 * Revision 1.5 1994/11/29 21:09:47 danner
575 * Detailed tracing support (holland).
576 *
577 * Revision 1.4 1994/11/29 20:36:02 danner
578 * Added suppressAtomicRMW option.
579 *
580 * Revision 1.3 1994/11/21 15:34:06 danner
581 * Added ConfigureAllocList() call.
582 *
583 */
584
585 #ifdef _KERNEL
586 #define KERNEL
587 #endif
588
589 #if defined(__NetBSD__) && defined(_KERNEL)
590 #include <sys/types.h>
591 #include <sys/param.h>
592 #include <sys/systm.h>
593 #include <sys/ioctl.h>
594 #include <sys/fcntl.h>
595 #include <sys/vnode.h>
596 #endif
597
598 #ifdef KERNEL
599 #ifndef __NetBSD__
600 #include <dkusage.h>
601 #include <dfstrace.h>
602 #endif /* !__NetBSD__ */
603 #endif /* KERNEL */
604
605 #include "rf_archs.h"
606 #include "rf_threadstuff.h"
607
608 #ifndef KERNEL
609 #include <stdio.h>
610 #include <stdlib.h>
611 #endif /* KERNEL */
612
613 #include <sys/errno.h>
614
615 #include "rf_raid.h"
616 #include "rf_dag.h"
617 #include "rf_aselect.h"
618 #include "rf_diskqueue.h"
619 #include "rf_parityscan.h"
620 #include "rf_alloclist.h"
621 #include "rf_threadid.h"
622 #include "rf_dagutils.h"
623 #include "rf_utils.h"
624 #include "rf_etimer.h"
625 #include "rf_acctrace.h"
626 #include "rf_configure.h"
627 #include "rf_general.h"
628 #include "rf_desc.h"
629 #include "rf_states.h"
630 #include "rf_freelist.h"
631 #include "rf_decluster.h"
632 #include "rf_map.h"
633 #include "rf_diskthreads.h"
634 #include "rf_revent.h"
635 #include "rf_callback.h"
636 #include "rf_engine.h"
637 #include "rf_memchunk.h"
638 #include "rf_mcpair.h"
639 #include "rf_nwayxor.h"
640 #include "rf_debugprint.h"
641 #include "rf_copyback.h"
642 #if !defined(__NetBSD__)
643 #include "rf_camlayer.h"
644 #endif
645 #include "rf_driver.h"
646 #include "rf_options.h"
647 #include "rf_shutdown.h"
648 #include "rf_sys.h"
649 #include "rf_cpuutil.h"
650
651 #ifdef SIMULATE
652 #include "rf_diskevent.h"
653 #endif /* SIMULATE */
654
655 #ifdef KERNEL
656 #include <sys/buf.h>
657 #ifndef __NetBSD__
658 #include <io/common/devdriver.h>
659 #endif /* !__NetBSD__ */
660
661 #if DFSTRACE > 0
662 #include <sys/dfs_log.h>
663 #include <sys/dfstracebuf.h>
664 #endif /* DFSTRACE > 0 */
665
666 #if DKUSAGE > 0
667 #include <sys/dkusage.h>
668 #include <io/common/iotypes.h>
669 #include <io/cam/dec_cam.h>
670 #include <io/cam/cam.h>
671 #include <io/cam/pdrv.h>
672 #endif /* DKUSAGE > 0 */
673 #endif /* KERNEL */
674
675 #if RF_DEMO > 0
676 #include "rf_demo.h"
677 #endif /* RF_DEMO > 0 */
678
/*
 * rad == RF_RaidAccessDesc_t
 *
 * Free list of access descriptors.  rf_Shutdown() also takes this
 * list's lock while it waits for an array's outstanding accesses.
 */
static RF_FreeList_t *rf_rad_freelist;
/*
 * NOTE(review): presumably the maximum / increment / initial sizes used
 * when rf_rad_freelist is created -- the consumer is not visible in this
 * part of the file; confirm against rf_ConfigureRDFreeList().
 */
#define RF_MAX_FREE_RAD 128
#define RF_RAD_INC 16
#define RF_RAD_INITIAL 32

/* debug variables */
char rf_panicbuf[2048]; /* a buffer to hold an error msg when we panic */

/* main configuration routines */
/* set by rf_BootRaidframe(), cleared again by rf_UnbootRaidframe() */
static int raidframe_booted = 0;

/* forward declarations of file-local helpers */
static void rf_ConfigureDebug(RF_Config_t *cfgPtr);
static void set_debug_option(char *name, long val);
static void rf_UnconfigureArray(void);
static int init_rad(RF_RaidAccessDesc_t *);
static void clean_rad(RF_RaidAccessDesc_t *);
static void rf_ShutdownRDFreeList(void *);
static int rf_ConfigureRDFreeList(RF_ShutdownList_t **);


RF_DECLARE_MUTEX(rf_printf_mutex) /* debug only: avoids interleaved printfs by different stripes */
RF_DECLARE_GLOBAL_THREADID /* declarations for threadid.h */

/* throughput statistics are only built for the user-level (non-kernel, non-simulator) driver */
#if !defined(KERNEL) && !defined(SIMULATE)
static int rf_InitThroughputStats(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, RF_Config_t *cfgPtr);
static void rf_StopThroughputStats(RF_Raid_t *raidPtr);
static void rf_PrintThroughputStats(RF_Raid_t *raidPtr);
#endif /* !KERNEL && !SIMULATE */
708
/*
 * Platform glue for array quiescence and for failing a struct buf.
 *
 *   SIGNAL_QUIESCENT_COND(raid)  wake whoever is waiting for the array
 *                                to become quiescent
 *   WAIT_FOR_QUIESCENCE(raid)    block until that wakeup arrives
 *   IO_BUF_ERR(bp, err, unit)    mark bp as failed with error err and
 *                                complete it (expands to nothing outside
 *                                the kernel)
 *
 * None of these macros supplies its own trailing semicolon; the caller
 * writes it.  That is required for the same call site to compile on
 * every platform variant below.
 */
#ifdef KERNEL
#define SIGNAL_QUIESCENT_COND(_raid_) wakeup(&((_raid_)->accesses_suspended))
#ifndef __NetBSD__
#define WAIT_FOR_QUIESCENCE(_raid_) \
	mpsleep(&((_raid_)->accesses_suspended), PZERO, "raidframe quiesce", 0, \
	    (void *) simple_lock_addr((_raid_)->access_suspend_mutex), MS_LOCK_SIMPLE)
#else
/*
 * The definition used to end in ';', which made
 * "if (x) WAIT_FOR_QUIESCENCE(r); else ..." a syntax error on NetBSD only.
 */
#define WAIT_FOR_QUIESCENCE(_raid_) \
	tsleep(&((_raid_)->accesses_suspended), PRIBIO|PCATCH, "raidframe quiesce", 0)

#endif
#if DKUSAGE > 0
#define IO_BUF_ERR(bp, err, unit) { \
	(bp)->b_flags |= B_ERROR; \
	(bp)->b_resid = (bp)->b_bcount; \
	(bp)->b_error = (err); \
	RF_DKU_END_IO(unit, bp); \
	biodone(bp); \
}
#else
#define IO_BUF_ERR(bp, err, unit) { \
	(bp)->b_flags |= B_ERROR; \
	(bp)->b_resid = (bp)->b_bcount; \
	(bp)->b_error = (err); \
	RF_DKU_END_IO(unit); \
	biodone(bp); \
}
#endif /* DKUSAGE > 0 */
#else /* KERNEL */

#define SIGNAL_QUIESCENT_COND(_raid_) RF_SIGNAL_COND((_raid_)->quiescent_cond)
#define WAIT_FOR_QUIESCENCE(_raid_) RF_WAIT_COND((_raid_)->quiescent_cond, (_raid_)->access_suspend_mutex)
#define IO_BUF_ERR(bp, err, unit)

#endif /* KERNEL */
744
745 static int configureCount=0; /* number of active configurations */
746 static int isconfigged=0; /* is basic raidframe (non per-array) stuff configged */
747 RF_DECLARE_STATIC_MUTEX(configureMutex) /* used to lock the configuration stuff */
748
749 static RF_ShutdownList_t *globalShutdown; /* non array-specific stuff */
750
751 static int rf_ConfigureRDFreeList(RF_ShutdownList_t **listp);
752
753 /* called at system boot time */
754 int rf_BootRaidframe()
755 {
756 #if 0
757 long stacksize;
758 #endif
759 int rc;
760
761 if (raidframe_booted)
762 return(EBUSY);
763 raidframe_booted = 1;
764
765 #if RF_DEBUG_ATOMIC > 0
766 rf_atent_init();
767 #endif /* RF_DEBUG_ATOMIC > 0 */
768
769 rf_setup_threadid();
770 rf_assign_threadid();
771
772 #if !defined(KERNEL) && !defined(SIMULATE)
773 if (RF_THREAD_ATTR_CREATE(raidframe_attr_default)) {
774 fprintf(stderr, "Unable to create default thread attr\n");
775 exit(1);
776 }
777 #if 0
778 stacksize = RF_THREAD_ATTR_GETSTACKSIZE(raidframe_attr_default);
779 if (stacksize < 0) {
780 fprintf(stderr, "Unable to get stack size of default thread attr\n");
781 exit(1);
782 }
783 stacksize += 16384;
784 rc = RF_THREAD_ATTR_SETSTACKSIZE(raidframe_attr_default, stacksize);
785 if (rc) {
786 fprintf(stderr, "Unable to set stack size of default thread attr\n");
787 exit(1);
788 }
789 #endif /* 0 */
790 #endif /* !KERNEL && !SIMULATE */
791 rc = rf_mutex_init(&configureMutex);
792 if (rc) {
793 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
794 __LINE__, rc);
795 RF_PANIC();
796 }
797 configureCount = 0;
798 isconfigged = 0;
799 globalShutdown = NULL;
800 return(0);
801 }
802
803 /*
804 * This function is really just for debugging user-level stuff: it
805 * frees up all memory, other RAIDframe resources which might otherwise
806 * be kept around. This is used with systems like "sentinel" to detect
807 * memory leaks.
808 */
809 int rf_UnbootRaidframe()
810 {
811 int rc;
812
813 RF_LOCK_MUTEX(configureMutex);
814 if (configureCount) {
815 RF_UNLOCK_MUTEX(configureMutex);
816 return(EBUSY);
817 }
818 raidframe_booted = 0;
819 RF_UNLOCK_MUTEX(configureMutex);
820 rc = rf_mutex_destroy(&configureMutex);
821 if (rc) {
822 RF_ERRORMSG3("Unable to destroy mutex file %s line %d rc=%d\n", __FILE__,
823 __LINE__, rc);
824 RF_PANIC();
825 }
826 #if RF_DEBUG_ATOMIC > 0
827 rf_atent_shutdown();
828 #endif /* RF_DEBUG_ATOMIC > 0 */
829 return(0);
830 }
831
832 /*
833 * Called whenever an array is shutdown
834 */
835 static void rf_UnconfigureArray()
836 {
837 int rc;
838
839 RF_LOCK_MUTEX(configureMutex);
840 if (--configureCount == 0) { /* if no active configurations, shut everything down */
841 isconfigged = 0;
842
843 rc = rf_ShutdownList(&globalShutdown);
844 if (rc) {
845 RF_ERRORMSG1("RAIDFRAME: unable to do global shutdown, rc=%d\n", rc);
846 }
847
848 rf_shutdown_threadid();
849
850 /*
851 * We must wait until now, because the AllocList module
852 * uses the DebugMem module.
853 */
854 if (rf_memDebug)
855 rf_print_unfreed();
856 }
857 RF_UNLOCK_MUTEX(configureMutex);
858 }
859
860 /*
861 * Called to shut down an array.
862 */
863 int rf_Shutdown(raidPtr)
864 RF_Raid_t *raidPtr;
865 {
866 #if !defined(__NetBSD) && !defined(_KERNEL)
867 int rc;
868 #endif
869 int r,c;
870
871 #if defined(__NetBSD__) && defined(_KERNEL)
872 struct proc *p;
873 #endif
874
875 if (!raidPtr->valid) {
876 RF_ERRORMSG("Attempt to shut down unconfigured RAIDframe driver. Aborting shutdown\n");
877 return(EINVAL);
878 }
879
880 /*
881 * wait for outstanding IOs to land
882 * As described in rf_raid.h, we use the rad_freelist lock
883 * to protect the per-array info about outstanding descs
884 * since we need to do freelist locking anyway, and this
885 * cuts down on the amount of serialization we've got going
886 * on.
887 */
888 RF_FREELIST_DO_LOCK(rf_rad_freelist);
889 if (raidPtr->waitShutdown) {
890 RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
891 return(EBUSY);
892 }
893 raidPtr->waitShutdown = 1;
894 while (raidPtr->nAccOutstanding) {
895 RF_WAIT_COND(raidPtr->outstandingCond, RF_FREELIST_MUTEX_OF(rf_rad_freelist));
896 }
897 RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
898
899 #if !defined(KERNEL) && !defined(SIMULATE)
900 rf_PrintThroughputStats(raidPtr);
901 #endif /* !KERNEL && !SIMULATE */
902
903 raidPtr->valid = 0;
904
905 #if !defined(KERNEL) && !defined(SIMULATE)
906 rf_TerminateDiskQueues(raidPtr); /* tell all disk queues to release any waiting threads */
907 rf_ShutdownDiskThreads(raidPtr); /* wait for all threads to exit */
908 #endif /* !KERNEL && !SIMULATE */
909
910 #if defined(__NetBSD__) && defined(_KERNEL)
911 /* We take this opportunity to close the vnodes like we should.. */
912
913 p = raidPtr->proc; /* XXX */
914
915 for(r=0;r<raidPtr->numRow;r++) {
916 for(c=0;c<raidPtr->numCol;c++) {
917 printf("Closing vnode for row: %d col: %d\n",r,c);
918 if (raidPtr->raid_cinfo[r][c].ci_vp) {
919 (void)vn_close(raidPtr->raid_cinfo[r][c].ci_vp,
920 FREAD|FWRITE, p->p_ucred, p);
921 } else {
922 printf("vnode was NULL\n");
923 }
924
925 }
926 }
927 for(r=0;r<raidPtr->numSpare;r++) {
928 printf("Closing vnode for spare: %d\n",r);
929 if (raidPtr->raid_cinfo[0][raidPtr->numCol+r].ci_vp) {
930 (void)vn_close(raidPtr->raid_cinfo[0][raidPtr->numCol+r].ci_vp,
931 FREAD|FWRITE, p->p_ucred, p);
932 } else {
933 printf("vnode was NULL\n");
934 }
935 }
936
937
938 #endif
939
940 rf_ShutdownList(&raidPtr->shutdownList);
941
942 rf_UnconfigureArray();
943
944 return(0);
945 }
946
/*
 * DO_INIT_CONFIGURE(f) -- run the global configuration step
 * f(&globalShutdown).
 *
 * If f fails, the error is logged, everything already registered on
 * globalShutdown is torn down, the configuration reference taken by the
 * caller is dropped, configureMutex is released and the error is
 * returned from the enclosing function.
 *
 * Requires an int `rc` in scope and configureMutex held by the caller.
 * Wrapped in do/while(0) so that it behaves as one statement (safe in
 * an unbraced if/else); the caller supplies the trailing semicolon.
 */
#define DO_INIT_CONFIGURE(f) do { \
	rc = f (&globalShutdown); \
	if (rc) { \
		RF_ERRORMSG2("RAIDFRAME: failed %s with %d\n", RF_STRING(f), rc); \
		rf_ShutdownList(&globalShutdown); \
		configureCount--; \
		RF_UNLOCK_MUTEX(configureMutex); \
		return(rc); \
	} \
} while (0)
957
/*
 * DO_RAID_FAIL() -- back out of a partially completed array
 * configuration: run the array's shutdown list, then drop the array's
 * reference on the global state via rf_UnconfigureArray().
 *
 * Requires `raidPtr` in scope.  Wrapped in do/while(0) so that it
 * behaves as one statement; the caller supplies the trailing semicolon.
 */
#define DO_RAID_FAIL() do { \
	rf_ShutdownList(&raidPtr->shutdownList); \
	rf_UnconfigureArray(); \
} while (0)
962
/*
 * DO_RAID_INIT_CONFIGURE(f) -- run the per-array configuration step
 * f(&raidPtr->shutdownList, raidPtr, cfgPtr).
 *
 * If f fails, the error is logged, the array is backed out with
 * DO_RAID_FAIL() and the error is returned from the enclosing function.
 *
 * Requires `rc`, `raidPtr` and `cfgPtr` in scope.  Wrapped in
 * do/while(0) so that it behaves as one statement; the caller supplies
 * the trailing semicolon.
 */
#define DO_RAID_INIT_CONFIGURE(f) do { \
	rc = f (&raidPtr->shutdownList, raidPtr, cfgPtr); \
	if (rc) { \
		RF_ERRORMSG2("RAIDFRAME: failed %s with %d\n", RF_STRING(f), rc); \
		DO_RAID_FAIL(); \
		return(rc); \
	} \
} while (0)
971
/*
 * DO_RAID_MUTEX(m) -- create mutex m and register it on the array's
 * shutdown list via rf_create_managed_mutex().
 *
 * On failure the error is logged, the array is backed out with
 * DO_RAID_FAIL() and the error is returned from the enclosing function.
 *
 * Requires `rc` and `raidPtr` in scope.  Wrapped in do/while(0) so that
 * it behaves as one statement; the caller supplies the trailing
 * semicolon.
 */
#define DO_RAID_MUTEX(_m_) do { \
	rc = rf_create_managed_mutex(&raidPtr->shutdownList, (_m_)); \
	if (rc) { \
		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", \
		    __FILE__, __LINE__, rc); \
		DO_RAID_FAIL(); \
		return(rc); \
	} \
} while (0)
981
/*
 * DO_RAID_COND(c) -- create condition variable c and register it on the
 * array's shutdown list via rf_create_managed_cond().
 *
 * On failure the error is logged, the array is backed out with
 * DO_RAID_FAIL() and the error is returned from the enclosing function.
 *
 * Requires `rc` and `raidPtr` in scope.  Wrapped in do/while(0) so that
 * it behaves as one statement; the caller supplies the trailing
 * semicolon.
 */
#define DO_RAID_COND(_c_) do { \
	rc = rf_create_managed_cond(&raidPtr->shutdownList, (_c_)); \
	if (rc) { \
		RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", \
		    __FILE__, __LINE__, rc); \
		DO_RAID_FAIL(); \
		return(rc); \
	} \
} while (0)
991
992 int rf_Configure(raidPtr, cfgPtr)
993 RF_Raid_t *raidPtr;
994 RF_Config_t *cfgPtr;
995 {
996 RF_RowCol_t row, col;
997 int i, rc;
998 int unit;
999 struct proc *p;
1000
1001 if (raidPtr->valid) {
1002 RF_ERRORMSG("RAIDframe configuration not shut down. Aborting configure.\n");
1003 return(EINVAL);
1004 }
1005
1006 RF_LOCK_MUTEX(configureMutex);
1007 configureCount++;
1008 if (isconfigged == 0) {
1009 rc = rf_create_managed_mutex(&globalShutdown, &rf_printf_mutex);
1010 if (rc) {
1011 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
1012 __LINE__, rc);
1013 rf_ShutdownList(&globalShutdown);
1014 return(rc);
1015 }
1016
1017 /* initialize globals */
1018 printf("RAIDFRAME: protectedSectors is %ld\n",rf_protectedSectors);
1019
1020 rf_clear_debug_print_buffer();
1021
1022 DO_INIT_CONFIGURE(rf_ConfigureAllocList);
1023 DO_INIT_CONFIGURE(rf_ConfigureEtimer);
1024 /*
1025 * Yes, this does make debugging general to the whole system instead
1026 * of being array specific. Bummer, drag.
1027 */
1028 rf_ConfigureDebug(cfgPtr);
1029 DO_INIT_CONFIGURE(rf_ConfigureDebugMem);
1030 #ifdef SIMULATE
1031 rf_default_disk_names();
1032 DO_INIT_CONFIGURE(rf_DDEventInit);
1033 #endif /* SIMULATE */
1034 DO_INIT_CONFIGURE(rf_ConfigureAccessTrace);
1035 DO_INIT_CONFIGURE(rf_ConfigureMapModule);
1036 DO_INIT_CONFIGURE(rf_ConfigureReconEvent);
1037 DO_INIT_CONFIGURE(rf_ConfigureCallback);
1038 DO_INIT_CONFIGURE(rf_ConfigureMemChunk);
1039 DO_INIT_CONFIGURE(rf_ConfigureRDFreeList);
1040 DO_INIT_CONFIGURE(rf_ConfigureNWayXor);
1041 DO_INIT_CONFIGURE(rf_ConfigureStripeLockFreeList);
1042 DO_INIT_CONFIGURE(rf_ConfigureMCPair);
1043 #ifndef SIMULATE
1044 #if !defined(__NetBSD__)
1045 DO_INIT_CONFIGURE(rf_ConfigureCamLayer);
1046 #endif
1047 #endif /* !SIMULATE */
1048 DO_INIT_CONFIGURE(rf_ConfigureDAGs);
1049 DO_INIT_CONFIGURE(rf_ConfigureDAGFuncs);
1050 DO_INIT_CONFIGURE(rf_ConfigureDebugPrint);
1051 DO_INIT_CONFIGURE(rf_ConfigureReconstruction);
1052 DO_INIT_CONFIGURE(rf_ConfigureCopyback);
1053 DO_INIT_CONFIGURE(rf_ConfigureDiskQueueSystem);
1054 DO_INIT_CONFIGURE(rf_ConfigureCpuMonitor);
1055 isconfigged = 1;
1056 }
1057 RF_UNLOCK_MUTEX(configureMutex);
1058
1059 /*
1060 * Null out the entire raid descriptor to avoid problems when we reconfig.
1061 * This also clears the valid bit.
1062 */
1063 /* XXX this clearing should be moved UP to outside of here.... that, or
1064 rf_Configure() needs to take more arguments... XXX */
1065 #if defined(__NetBSD__) && defined(_KERNEL)
1066 unit = raidPtr->raidid;
1067 p = raidPtr->proc; /* XXX save these... */
1068 #endif
1069 bzero((char *)raidPtr, sizeof(RF_Raid_t));
1070 #if defined(__NetBSD__) && defined(_KERNEL)
1071 raidPtr->raidid = unit;
1072 raidPtr->proc = p; /* XXX and then recover them..*/
1073 #endif
1074 DO_RAID_MUTEX(&raidPtr->mutex);
1075 /* set up the cleanup list. Do this after ConfigureDebug so that value of memDebug will be set */
1076
1077 rf_MakeAllocList(raidPtr->cleanupList);
1078 if (raidPtr->cleanupList == NULL) {
1079 DO_RAID_FAIL();
1080 return(ENOMEM);
1081 }
1082
1083 rc = rf_ShutdownCreate(&raidPtr->shutdownList,
1084 (void (*)(void *))rf_FreeAllocList,
1085 raidPtr->cleanupList);
1086 if (rc) {
1087 RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
1088 __FILE__, __LINE__, rc);
1089 DO_RAID_FAIL();
1090 return(rc);
1091 }
1092
1093 raidPtr->numRow = cfgPtr->numRow;
1094 raidPtr->numCol = cfgPtr->numCol;
1095 raidPtr->numSpare = cfgPtr->numSpare;
1096
1097 #if defined(__NetBSD__) && defined(_KERNEL)
1098 /* XXX we don't even pretend to support more than one row
1099 in the kernel... */
1100 if (raidPtr->numRow != 1) {
1101 RF_ERRORMSG("Only one row supported in kernel.\n");
1102 DO_RAID_FAIL();
1103 return(EINVAL);
1104 }
1105 #endif
1106
1107
1108
1109 RF_CallocAndAdd(raidPtr->status, raidPtr->numRow, sizeof(RF_RowStatus_t),
1110 (RF_RowStatus_t *), raidPtr->cleanupList);
1111 if (raidPtr->status == NULL) {
1112 DO_RAID_FAIL();
1113 return(ENOMEM);
1114 }
1115
1116 RF_CallocAndAdd(raidPtr->reconControl, raidPtr->numRow,
1117 sizeof(RF_ReconCtrl_t *), (RF_ReconCtrl_t **), raidPtr->cleanupList);
1118 if (raidPtr->reconControl == NULL) {
1119 DO_RAID_FAIL();
1120 return(ENOMEM);
1121 }
1122 for (i=0; i<raidPtr->numRow; i++) {
1123 raidPtr->status[i] = rf_rs_optimal;
1124 raidPtr->reconControl[i] = NULL;
1125 }
1126
1127 DO_RAID_INIT_CONFIGURE(rf_ConfigureEngine);
1128 #if !defined(KERNEL) && !defined(SIMULATE)
1129 DO_RAID_INIT_CONFIGURE(rf_InitThroughputStats);
1130 #endif /* !KERNEL && !SIMULATE */
1131
1132 DO_RAID_INIT_CONFIGURE(rf_ConfigureStripeLocks);
1133
1134 DO_RAID_COND(&raidPtr->outstandingCond);
1135
1136 raidPtr->nAccOutstanding = 0;
1137 raidPtr->waitShutdown = 0;
1138
1139 DO_RAID_MUTEX(&raidPtr->access_suspend_mutex);
1140 DO_RAID_COND(&raidPtr->quiescent_cond);
1141
1142 DO_RAID_COND(&raidPtr->waitForReconCond);
1143
1144 DO_RAID_MUTEX(&raidPtr->recon_done_proc_mutex);
1145 DO_RAID_INIT_CONFIGURE(rf_ConfigureDisks);
1146 DO_RAID_INIT_CONFIGURE(rf_ConfigureSpareDisks);
1147 /* do this after ConfigureDisks & ConfigureSpareDisks to be sure dev no. is set */
1148 DO_RAID_INIT_CONFIGURE(rf_ConfigureDiskQueues);
1149 #ifndef KERNEL
1150 DO_RAID_INIT_CONFIGURE(rf_ConfigureDiskThreads);
1151 #endif /* !KERNEL */
1152
1153 DO_RAID_INIT_CONFIGURE(rf_ConfigureLayout);
1154
1155 DO_RAID_INIT_CONFIGURE(rf_ConfigurePSStatus);
1156
1157 for(row=0;row<raidPtr->numRow;row++) {
1158 for(col=0;col<raidPtr->numCol;col++) {
1159 /*
1160 * XXX better distribution
1161 */
1162 raidPtr->hist_diskreq[row][col] = 0;
1163 }
1164 }
1165
1166 if (rf_keepAccTotals) {
1167 raidPtr->keep_acc_totals = 1;
1168 }
1169
1170 rf_StartUserStats(raidPtr);
1171
1172 raidPtr->valid = 1;
1173 return(0);
1174 }
1175
1176 static int init_rad(desc)
1177 RF_RaidAccessDesc_t *desc;
1178 {
1179 int rc;
1180
1181 rc = rf_mutex_init(&desc->mutex);
1182 if (rc) {
1183 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
1184 __LINE__, rc);
1185 return(rc);
1186 }
1187 rc = rf_cond_init(&desc->cond);
1188 if (rc) {
1189 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
1190 __LINE__, rc);
1191 rf_mutex_destroy(&desc->mutex);
1192 return(rc);
1193 }
1194 return(0);
1195 }
1196
1197 static void clean_rad(desc)
1198 RF_RaidAccessDesc_t *desc;
1199 {
1200 rf_mutex_destroy(&desc->mutex);
1201 rf_cond_destroy(&desc->cond);
1202 }
1203
1204 static void rf_ShutdownRDFreeList(ignored)
1205 void *ignored;
1206 {
1207 RF_FREELIST_DESTROY_CLEAN(rf_rad_freelist,next,(RF_RaidAccessDesc_t *),clean_rad);
1208 }
1209
1210 static int rf_ConfigureRDFreeList(listp)
1211 RF_ShutdownList_t **listp;
1212 {
1213 int rc;
1214
1215 RF_FREELIST_CREATE(rf_rad_freelist, RF_MAX_FREE_RAD,
1216 RF_RAD_INC, sizeof(RF_RaidAccessDesc_t));
1217 if (rf_rad_freelist == NULL) {
1218 return(ENOMEM);
1219 }
1220 rc = rf_ShutdownCreate(listp, rf_ShutdownRDFreeList, NULL);
1221 if (rc) {
1222 RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
1223 __LINE__, rc);
1224 rf_ShutdownRDFreeList(NULL);
1225 return(rc);
1226 }
1227 RF_FREELIST_PRIME_INIT(rf_rad_freelist, RF_RAD_INITIAL,next,
1228 (RF_RaidAccessDesc_t *),init_rad);
1229 return(0);
1230 }
1231
1232 RF_RaidAccessDesc_t *rf_AllocRaidAccDesc(
1233 RF_Raid_t *raidPtr,
1234 RF_IoType_t type,
1235 RF_RaidAddr_t raidAddress,
1236 RF_SectorCount_t numBlocks,
1237 caddr_t bufPtr,
1238 void *bp,
1239 RF_DagHeader_t **paramDAG,
1240 RF_AccessStripeMapHeader_t **paramASM,
1241 RF_RaidAccessFlags_t flags,
1242 void (*cbF)(struct buf *),
1243 void *cbA,
1244 RF_AccessState_t *states)
1245 {
1246 RF_RaidAccessDesc_t *desc;
1247
1248 RF_FREELIST_GET_INIT_NOUNLOCK(rf_rad_freelist,desc,next,(RF_RaidAccessDesc_t *),init_rad);
1249 if (raidPtr->waitShutdown) {
1250 /*
1251 * Actually, we're shutting the array down. Free the desc
1252 * and return NULL.
1253 */
1254 RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
1255 RF_FREELIST_FREE_CLEAN(rf_rad_freelist,desc,next,clean_rad);
1256 return(NULL);
1257 }
1258 raidPtr->nAccOutstanding++;
1259 RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
1260
1261 desc->raidPtr = (void*)raidPtr;
1262 desc->type = type;
1263 desc->raidAddress = raidAddress;
1264 desc->numBlocks = numBlocks;
1265 desc->bufPtr = bufPtr;
1266 desc->bp = bp;
1267 desc->paramDAG = paramDAG;
1268 desc->paramASM = paramASM;
1269 desc->flags = flags;
1270 desc -> states = states;
1271 desc -> state = 0;
1272
1273 desc->status = 0;
1274 bzero((char *)&desc->tracerec, sizeof(RF_AccTraceEntry_t));
1275 desc->callbackFunc= (void (*)(RF_CBParam_t))cbF; /* XXX */
1276 desc->callbackArg = cbA;
1277 desc->next = NULL;
1278 desc->head = desc;
1279 desc->numPending = 0;
1280 desc->cleanupList = NULL;
1281 rf_MakeAllocList(desc->cleanupList);
1282 rf_get_threadid(desc->tid);
1283 #ifdef SIMULATE
1284 desc->owner = rf_GetCurrentOwner();
1285 #endif /* SIMULATE */
1286 return(desc);
1287 }
1288
1289 void rf_FreeRaidAccDesc(RF_RaidAccessDesc_t *desc)
1290 {
1291 RF_Raid_t *raidPtr = desc->raidPtr;
1292
1293 RF_ASSERT(desc);
1294
1295 #if !defined(KERNEL) && !defined(SIMULATE)
1296 rf_StopThroughputStats(raidPtr);
1297 #endif /* !KERNEL && !SIMULATE */
1298
1299 rf_FreeAllocList(desc->cleanupList);
1300 RF_FREELIST_FREE_CLEAN_NOUNLOCK(rf_rad_freelist,desc,next,clean_rad);
1301 raidPtr->nAccOutstanding--;
1302 if (raidPtr->waitShutdown) {
1303 RF_SIGNAL_COND(raidPtr->outstandingCond);
1304 }
1305 RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
1306 }
1307
#ifdef JIMZ
/*
 * Debug-only bookkeeping, compiled in when JIMZ is defined.
 * dest_hist holds THREAD_NUMDESC descriptor slots for each of THREAD_NUM
 * threads; jimz_access_num[] is the per-thread slot counter that
 * rf_DoAccess bumps on every access.
 */
#define THREAD_NUM     600
#define THREAD_NUMDESC 1024
static RF_RaidAccessDesc_t *dest_hist[THREAD_NUM*THREAD_NUMDESC];
int jimz_access_num[THREAD_NUM];
#endif /* JIMZ */
1314
1315 /*********************************************************************
1316 * Main routine for performing an access.
1317 * Accesses are retried until a DAG can not be selected. This occurs
1318 * when either the DAG library is incomplete or there are too many
1319 * failures in a parity group.
1320 ********************************************************************/
1321 int rf_DoAccess(
1322 RF_Raid_t *raidPtr,
1323 RF_IoType_t type,
1324 int async_flag,
1325 RF_RaidAddr_t raidAddress,
1326 RF_SectorCount_t numBlocks,
1327 caddr_t bufPtr,
1328 void *bp_in,
1329 RF_DagHeader_t **paramDAG,
1330 RF_AccessStripeMapHeader_t **paramASM,
1331 RF_RaidAccessFlags_t flags,
1332 RF_RaidAccessDesc_t **paramDesc,
1333 void (*cbF)(struct buf *),
1334 void *cbA)
1335 /*
1336 type should be read or write
1337 async_flag should be RF_TRUE or RF_FALSE
1338 bp_in is a buf pointer. void * to facilitate ignoring it outside the kernel
1339 */
1340 {
1341 int tid;
1342 RF_RaidAccessDesc_t *desc;
1343 caddr_t lbufPtr = bufPtr;
1344 #ifdef KERNEL
1345 struct buf *bp = (struct buf *) bp_in;
1346 #if DFSTRACE > 0
1347 struct { RF_uint64 raidAddr; int numBlocks; char type;} dfsrecord;
1348 #endif /* DFSTRACE > 0 */
1349 #else /* KERNEL */
1350 void *bp = bp_in;
1351 #endif /* KERNEL */
1352
1353 raidAddress += rf_raidSectorOffset;
1354
1355 if (!raidPtr->valid) {
1356 RF_ERRORMSG("RAIDframe driver not successfully configured. Rejecting access.\n");
1357 IO_BUF_ERR(bp, EINVAL, raidPtr->raidid);
1358 return(EINVAL);
1359 }
1360
1361 #if defined(KERNEL) && DFSTRACE > 0
1362 if (rf_DFSTraceAccesses) {
1363 dfsrecord.raidAddr = raidAddress;
1364 dfsrecord.numBlocks = numBlocks;
1365 dfsrecord.type = type;
1366 dfs_log(DFS_NOTE, (char *) &dfsrecord, sizeof(dfsrecord), 0);
1367 }
1368 #endif /* KERNEL && DFSTRACE > 0 */
1369
1370 rf_get_threadid(tid);
1371 if (rf_accessDebug) {
1372
1373 printf("logBytes is: %d %d %d\n",raidPtr->raidid,
1374 raidPtr->logBytesPerSector,
1375 (int)rf_RaidAddressToByte(raidPtr,numBlocks));
1376 printf("[%d] %s raidAddr %d (stripeid %d-%d) numBlocks %d (%d bytes) buf 0x%lx\n",tid,
1377 (type==RF_IO_TYPE_READ) ? "READ":"WRITE", (int)raidAddress,
1378 (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress),
1379 (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress+numBlocks-1),
1380 (int) numBlocks,
1381 (int) rf_RaidAddressToByte(raidPtr,numBlocks),
1382 (long) bufPtr);
1383 }
1384
1385 if (raidAddress + numBlocks > raidPtr->totalSectors) {
1386
1387 printf("DoAccess: raid addr %lu too large to access %lu sectors. Max legal addr is %lu\n",
1388 (u_long)raidAddress,(u_long)numBlocks,(u_long)raidPtr->totalSectors);
1389
1390 #ifdef KERNEL
1391 if (type == RF_IO_TYPE_READ) {
1392 IO_BUF_ERR(bp, ENOSPC, raidPtr->raidid);
1393 return(ENOSPC);
1394 } else {
1395 IO_BUF_ERR(bp, ENOSPC, raidPtr->raidid);
1396 return(ENOSPC);
1397 }
1398 #elif defined(SIMULATE)
1399 RF_PANIC();
1400 #else /* SIMULATE */
1401 return(EIO);
1402 #endif /* SIMULATE */
1403 }
1404
1405 #if !defined(KERNEL) && !defined(SIMULATE)
1406 rf_StartThroughputStats(raidPtr);
1407 #endif /* !KERNEL && !SIMULATE */
1408
1409 desc = rf_AllocRaidAccDesc(raidPtr, type, raidAddress,
1410 numBlocks, lbufPtr, bp, paramDAG, paramASM,
1411 flags, cbF, cbA, raidPtr->Layout.map->states);
1412
1413 if (desc == NULL) {
1414 return(ENOMEM);
1415 }
1416 #ifdef JIMZ
1417 dest_hist[(tid*THREAD_NUMDESC)+jimz_access_num[tid]]; jimz_access_num[tid]++;
1418 #endif /* JIMZ */
1419
1420 RF_ETIMER_START(desc->tracerec.tot_timer);
1421
1422 #ifdef SIMULATE
1423 /* simulator uses paramDesc to continue dag from test function */
1424 desc->async_flag=async_flag;
1425
1426 *paramDesc=desc;
1427
1428 return(0);
1429 #endif /* SIMULATE */
1430
1431 rf_ContinueRaidAccess(desc);
1432
1433 #ifndef KERNEL
1434 if (!(flags & RF_DAG_NONBLOCKING_IO)) {
1435 RF_LOCK_MUTEX(desc->mutex);
1436 while (!(desc->flags & RF_DAG_ACCESS_COMPLETE)) {
1437 RF_WAIT_COND(desc->cond, desc->mutex);
1438 }
1439 RF_UNLOCK_MUTEX(desc->mutex);
1440 rf_FreeRaidAccDesc(desc);
1441 }
1442 #endif /* !KERNEL */
1443
1444 return(0);
1445 }
1446
1447 /* force the array into reconfigured mode without doing reconstruction */
1448 int rf_SetReconfiguredMode(raidPtr, row, col)
1449 RF_Raid_t *raidPtr;
1450 int row;
1451 int col;
1452 {
1453 if (!(raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
1454 printf("Can't set reconfigured mode in dedicated-spare array\n");
1455 RF_PANIC();
1456 }
1457 RF_LOCK_MUTEX(raidPtr->mutex);
1458 raidPtr->numFailures++;
1459 raidPtr->Disks[row][col].status = rf_ds_dist_spared;
1460 raidPtr->status[row] = rf_rs_reconfigured;
1461 /* install spare table only if declustering + distributed sparing architecture. */
1462 if ( raidPtr->Layout.map->flags & RF_BD_DECLUSTERED )
1463 rf_InstallSpareTable(raidPtr, row, col);
1464 RF_UNLOCK_MUTEX(raidPtr->mutex);
1465 return(0);
1466 }
1467
1468 extern int fail_row, fail_col, fail_time;
1469 extern int delayed_recon;
1470
1471 int rf_FailDisk(
1472 RF_Raid_t *raidPtr,
1473 int frow,
1474 int fcol,
1475 int initRecon)
1476 {
1477 int tid;
1478
1479 rf_get_threadid(tid);
1480 printf("[%d] Failing disk r%d c%d\n",tid,frow,fcol);
1481 RF_LOCK_MUTEX(raidPtr->mutex);
1482 raidPtr->numFailures++;
1483 raidPtr->Disks[frow][fcol].status = rf_ds_failed;
1484 raidPtr->status[frow] = rf_rs_degraded;
1485 RF_UNLOCK_MUTEX(raidPtr->mutex);
1486 #ifdef SIMULATE
1487 #if RF_DEMO > 0
1488 if (rf_demoMode) {
1489 rf_demo_update_mode (RF_DEMO_DEGRADED);
1490 fail_col = fcol; fail_row = frow;
1491 fail_time = rf_CurTime(); /* XXX */
1492 if (initRecon)
1493 delayed_recon = RF_TRUE;
1494 }
1495 else {
1496 if (initRecon)
1497 rf_ReconstructFailedDisk(raidPtr, frow, fcol);
1498 }
1499 #else /* RF_DEMO > 0 */
1500 if (initRecon)
1501 rf_ReconstructFailedDisk(raidPtr, frow, fcol);
1502 #endif /* RF_DEMO > 0 */
1503 #else /* SIMULATE */
1504 if (initRecon)
1505 rf_ReconstructFailedDisk(raidPtr, frow, fcol);
1506 #endif /* SIMULATE */
1507 return(0);
1508 }
1509
#ifdef SIMULATE
extern RF_Owner_t recon_owner;

/*
 * rf_ScheduleContinueReconstructFailedDisk -- simulator only: post a
 * disk-driver event at the current simulated time that calls
 * rf_ContinueReconstructFailedDisk with reconDesc, on behalf of
 * recon_owner.
 */
void rf_ScheduleContinueReconstructFailedDisk(reconDesc)
	RF_RaidReconDesc_t *reconDesc;
{
	rf_DDEventRequest(rf_CurTime(),
		rf_ContinueReconstructFailedDisk,
		(void *) reconDesc,
		recon_owner,
		-4, -4,
		reconDesc->raidPtr,
		NULL);
}
#endif /* SIMULATE */
1520
1521 /* releases a thread that is waiting for the array to become quiesced.
1522 * access_suspend_mutex should be locked upon calling this
1523 */
1524 void rf_SignalQuiescenceLock(raidPtr, reconDesc)
1525 RF_Raid_t *raidPtr;
1526 RF_RaidReconDesc_t *reconDesc;
1527 {
1528 int tid;
1529
1530 if (rf_quiesceDebug) {
1531 rf_get_threadid(tid);
1532 printf("[%d] Signalling quiescence lock\n", tid);
1533 }
1534 raidPtr->access_suspend_release = 1;
1535
1536 if (raidPtr->waiting_for_quiescence) {
1537 #ifndef SIMULATE
1538 SIGNAL_QUIESCENT_COND(raidPtr);
1539 #else /* !SIMULATE */
1540 if (reconDesc) {
1541 rf_ScheduleContinueReconstructFailedDisk(reconDesc);
1542 }
1543 #endif /* !SIMULATE */
1544 }
1545 }
1546
1547 /* suspends all new requests to the array. No effect on accesses that are in flight. */
1548 int rf_SuspendNewRequestsAndWait(raidPtr)
1549 RF_Raid_t *raidPtr;
1550 {
1551 if (rf_quiesceDebug)
1552 printf("Suspending new reqs\n");
1553
1554 RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
1555 raidPtr->accesses_suspended++;
1556 raidPtr->waiting_for_quiescence = (raidPtr->accs_in_flight == 0) ? 0 : 1;
1557
1558 #ifndef SIMULATE
1559 if (raidPtr->waiting_for_quiescence) {
1560 raidPtr->access_suspend_release=0;
1561 while (!raidPtr->access_suspend_release) {
1562 printf("Suspending: Waiting for Quiesence\n");
1563 WAIT_FOR_QUIESCENCE(raidPtr);
1564 raidPtr->waiting_for_quiescence = 0;
1565 }
1566 }
1567 printf("Quiesence reached..\n");
1568 #endif /* !SIMULATE */
1569
1570 RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
1571 return (raidPtr->waiting_for_quiescence);
1572 }
1573
1574 /* wake up everyone waiting for quiescence to be released */
1575 void rf_ResumeNewRequests(raidPtr)
1576 RF_Raid_t *raidPtr;
1577 {
1578 RF_CallbackDesc_t *t, *cb;
1579
1580 if (rf_quiesceDebug)
1581 printf("Resuming new reqs\n");
1582
1583 RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
1584 raidPtr->accesses_suspended--;
1585 if (raidPtr->accesses_suspended == 0)
1586 cb = raidPtr->quiesce_wait_list;
1587 else
1588 cb = NULL;
1589 raidPtr->quiesce_wait_list = NULL;
1590 RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
1591
1592 while (cb) {
1593 t = cb;
1594 cb = cb->next;
1595 (t->callbackFunc)(t->callbackArg);
1596 rf_FreeCallbackDesc(t);
1597 }
1598 }
1599
1600 /*****************************************************************************************
1601 *
1602 * debug routines
1603 *
1604 ****************************************************************************************/
1605
1606 static void set_debug_option(name, val)
1607 char *name;
1608 long val;
1609 {
1610 RF_DebugName_t *p;
1611
1612 for (p = rf_debugNames; p->name; p++) {
1613 if (!strcmp(p->name, name)) {
1614 *(p->ptr) = val;
1615 printf("[Set debug variable %s to %ld]\n",name,val);
1616 return;
1617 }
1618 }
1619 RF_ERRORMSG1("Unknown debug string \"%s\"\n",name);
1620 }
1621
1622
1623 /* would like to use sscanf here, but apparently not available in kernel */
1624 /*ARGSUSED*/
1625 static void rf_ConfigureDebug(cfgPtr)
1626 RF_Config_t *cfgPtr;
1627 {
1628 char *val_p, *name_p, *white_p;
1629 long val;
1630 int i;
1631
1632 rf_ResetDebugOptions();
1633 for (i=0; cfgPtr->debugVars[i][0] && i < RF_MAXDBGV; i++) {
1634 name_p = rf_find_non_white(&cfgPtr->debugVars[i][0]);
1635 white_p = rf_find_white(name_p); /* skip to start of 2nd word */
1636 val_p = rf_find_non_white(white_p);
1637 if (*val_p == '0' && *(val_p+1) == 'x') val = rf_htoi(val_p+2);
1638 else val = rf_atoi(val_p);
1639 *white_p = '\0';
1640 set_debug_option(name_p, val);
1641 }
1642 }
1643
1644 /* performance monitoring stuff */
1645
/*
 * Convert a struct timeval (passed by value, not by pointer) to a count of
 * microseconds as a long.  The argument is parenthesized so expressions
 * such as *ptr or array[i] expand correctly.
 */
#define TIMEVAL_TO_US(t) (((long) (t).tv_sec) * 1000000L + (long) (t).tv_usec)
1647
1648 #if !defined(KERNEL) && !defined(SIMULATE)
1649
1650 /*
1651 * Throughput stats currently only used in user-level RAIDframe
1652 */
1653
1654 static int rf_InitThroughputStats(
1655 RF_ShutdownList_t **listp,
1656 RF_Raid_t *raidPtr,
1657 RF_Config_t *cfgPtr)
1658 {
1659 int rc;
1660
1661 /* these used by user-level raidframe only */
1662 rc = rf_create_managed_mutex(listp, &raidPtr->throughputstats.mutex);
1663 if (rc) {
1664 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
1665 __LINE__, rc);
1666 return(rc);
1667 }
1668 raidPtr->throughputstats.sum_io_us = 0;
1669 raidPtr->throughputstats.num_ios = 0;
1670 raidPtr->throughputstats.num_out_ios = 0;
1671 return(0);
1672 }
1673
1674 void rf_StartThroughputStats(RF_Raid_t *raidPtr)
1675 {
1676 RF_LOCK_MUTEX(raidPtr->throughputstats.mutex);
1677 raidPtr->throughputstats.num_ios++;
1678 raidPtr->throughputstats.num_out_ios++;
1679 if (raidPtr->throughputstats.num_out_ios == 1)
1680 RF_GETTIME(raidPtr->throughputstats.start);
1681 RF_UNLOCK_MUTEX(raidPtr->throughputstats.mutex);
1682 }
1683
1684 static void rf_StopThroughputStats(RF_Raid_t *raidPtr)
1685 {
1686 struct timeval diff;
1687
1688 RF_LOCK_MUTEX(raidPtr->throughputstats.mutex);
1689 raidPtr->throughputstats.num_out_ios--;
1690 if (raidPtr->throughputstats.num_out_ios == 0) {
1691 RF_GETTIME(raidPtr->throughputstats.stop);
1692 RF_TIMEVAL_DIFF(&raidPtr->throughputstats.start, &raidPtr->throughputstats.stop, &diff);
1693 raidPtr->throughputstats.sum_io_us += TIMEVAL_TO_US(diff);
1694 }
1695 RF_UNLOCK_MUTEX(raidPtr->throughputstats.mutex);
1696 }
1697
1698 static void rf_PrintThroughputStats(RF_Raid_t *raidPtr)
1699 {
1700 RF_ASSERT(raidPtr->throughputstats.num_out_ios == 0);
1701 if ( raidPtr->throughputstats.sum_io_us != 0 ) {
1702 printf("[Througphut: %8.2f IOs/second]\n", raidPtr->throughputstats.num_ios
1703 / (raidPtr->throughputstats.sum_io_us / 1000000.0));
1704 }
1705 }
1706
1707 #endif /* !KERNEL && !SIMULATE */
1708
1709 void rf_StartUserStats(RF_Raid_t *raidPtr)
1710 {
1711 RF_GETTIME(raidPtr->userstats.start);
1712 raidPtr->userstats.sum_io_us = 0;
1713 raidPtr->userstats.num_ios = 0;
1714 raidPtr->userstats.num_sect_moved = 0;
1715 }
1716
1717 void rf_StopUserStats(RF_Raid_t *raidPtr)
1718 {
1719 RF_GETTIME(raidPtr->userstats.stop);
1720 }
1721
1722 void rf_UpdateUserStats(raidPtr, rt, numsect)
1723 RF_Raid_t *raidPtr;
1724 int rt; /* resp time in us */
1725 int numsect; /* number of sectors for this access */
1726 {
1727 raidPtr->userstats.sum_io_us += rt;
1728 raidPtr->userstats.num_ios++;
1729 raidPtr->userstats.num_sect_moved += numsect;
1730 }
1731
1732 void rf_PrintUserStats(RF_Raid_t *raidPtr)
1733 {
1734 long elapsed_us, mbs, mbs_frac;
1735 struct timeval diff;
1736
1737 RF_TIMEVAL_DIFF(&raidPtr->userstats.start, &raidPtr->userstats.stop, &diff);
1738 elapsed_us = TIMEVAL_TO_US(diff);
1739
1740 /* 2000 sectors per megabyte, 10000000 microseconds per second */
1741 if (elapsed_us)
1742 mbs = (raidPtr->userstats.num_sect_moved / 2000) / (elapsed_us / 1000000);
1743 else
1744 mbs = 0;
1745
1746 /* this computes only the first digit of the fractional mb/s moved */
1747 if (elapsed_us) {
1748 mbs_frac = ((raidPtr->userstats.num_sect_moved / 200) / (elapsed_us / 1000000))
1749 - (mbs * 10);
1750 }
1751 else {
1752 mbs_frac = 0;
1753 }
1754
1755 printf("Number of I/Os: %ld\n",raidPtr->userstats.num_ios);
1756 printf("Elapsed time (us): %ld\n",elapsed_us);
1757 printf("User I/Os per second: %ld\n",RF_DB0_CHECK(raidPtr->userstats.num_ios, (elapsed_us/1000000)));
1758 printf("Average user response time: %ld us\n",RF_DB0_CHECK(raidPtr->userstats.sum_io_us, raidPtr->userstats.num_ios));
1759 printf("Total sectors moved: %ld\n",raidPtr->userstats.num_sect_moved);
1760 printf("Average access size (sect): %ld\n",RF_DB0_CHECK(raidPtr->userstats.num_sect_moved, raidPtr->userstats.num_ios));
1761 printf("Achieved data rate: %ld.%ld MB/sec\n",mbs,mbs_frac);
1762 }
1763