rf_raid.h revision 1.1 1 1.1 oster /* $NetBSD: rf_raid.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */
2 1.1 oster /*
3 1.1 oster * Copyright (c) 1995 Carnegie-Mellon University.
4 1.1 oster * All rights reserved.
5 1.1 oster *
6 1.1 oster * Author: Mark Holland
7 1.1 oster *
8 1.1 oster * Permission to use, copy, modify and distribute this software and
9 1.1 oster * its documentation is hereby granted, provided that both the copyright
10 1.1 oster * notice and this permission notice appear in all copies of the
11 1.1 oster * software, derivative works or modified versions, and any portions
12 1.1 oster * thereof, and that both notices appear in supporting documentation.
13 1.1 oster *
14 1.1 oster * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 1.1 oster * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 1.1 oster * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 1.1 oster *
18 1.1 oster * Carnegie Mellon requests users of this software to return to
19 1.1 oster *
20 1.1 oster * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 1.1 oster * School of Computer Science
22 1.1 oster * Carnegie Mellon University
23 1.1 oster * Pittsburgh PA 15213-3890
24 1.1 oster *
25 1.1 oster * any improvements or extensions that they make and grant Carnegie the
26 1.1 oster * rights to redistribute these changes.
27 1.1 oster */
28 1.1 oster
29 1.1 oster /**********************************************
30 1.1 oster * rf_raid.h -- main header file for RAID driver
31 1.1 oster **********************************************/
32 1.1 oster
33 1.1 oster /*
34 1.1 oster * :
35 1.1 oster * Log: rf_raid.h,v
36 1.1 oster * Revision 1.48 1996/08/20 22:33:54 jimz
37 1.1 oster * make hist_diskreq a doubly-indexed array
38 1.1 oster *
39 1.1 oster * Revision 1.47 1996/07/15 05:40:41 jimz
40 1.1 oster * some recon datastructure cleanup
41 1.1 oster * better handling of multiple failures
42 1.1 oster * added undocumented double-recon test
43 1.1 oster *
44 1.1 oster * Revision 1.46 1996/07/10 22:28:51 jimz
45 1.1 oster * get rid of obsolete row statuses (dead,degraded2)
46 1.1 oster *
47 1.1 oster * Revision 1.45 1996/06/14 14:56:29 jimz
48 1.1 oster * make engine threading stuff ifndef SIMULATE
49 1.1 oster *
50 1.1 oster * Revision 1.44 1996/06/14 14:16:54 jimz
51 1.1 oster * move in engine node queue, atomicity control
52 1.1 oster *
53 1.1 oster * Revision 1.43 1996/06/12 04:41:26 jimz
54 1.1 oster * tweaks to make genplot work with user-level driver
55 1.1 oster * (mainly change stat collection)
56 1.1 oster *
57 1.1 oster * Revision 1.42 1996/06/11 10:57:17 jimz
58 1.1 oster * add recon_done_procs, recon_done_proc_mutex
59 1.1 oster *
60 1.1 oster * Revision 1.41 1996/06/11 01:26:48 jimz
61 1.1 oster * added mechanism for user-level to sync diskthread startup,
62 1.1 oster * shutdown
63 1.1 oster *
64 1.1 oster * Revision 1.40 1996/06/10 14:18:58 jimz
65 1.1 oster * move user, throughput stats into per-array structure
66 1.1 oster *
67 1.1 oster * Revision 1.39 1996/06/10 11:55:47 jimz
68 1.1 oster * Straightened out some per-array/not-per-array distinctions, fixed
69 1.1 oster * a couple bugs related to confusion. Added shutdown lists. Removed
70 1.1 oster * layout shutdown function (now subsumed by shutdown lists).
71 1.1 oster *
72 1.1 oster * Revision 1.38 1996/06/07 21:33:04 jimz
73 1.1 oster * begin using consistent types for sector numbers,
74 1.1 oster * stripe numbers, row+col numbers, recon unit numbers
75 1.1 oster *
76 1.1 oster * Revision 1.37 1996/06/05 19:38:32 jimz
77 1.1 oster * fixed up disk queueing types config
78 1.1 oster * added sstf disk queueing
79 1.1 oster * fixed exit bug on diskthreads (ref-ing bad mem)
80 1.1 oster *
81 1.1 oster * Revision 1.36 1996/06/05 18:06:02 jimz
82 1.1 oster * Major code cleanup. The Great Renaming is now done.
83 1.1 oster * Better modularity. Better typing. Fixed a bunch of
84 1.1 oster * synchronization bugs. Made a lot of global stuff
85 1.1 oster * per-desc or per-array. Removed dead code.
86 1.1 oster *
87 1.1 oster * Revision 1.35 1996/06/03 23:28:26 jimz
88 1.1 oster * more bugfixes
89 1.1 oster * check in tree to sync for IPDS runs with current bugfixes
90 1.1 oster * there still may be a problem with threads in the script test
91 1.1 oster * getting I/Os stuck- not trivially reproducible (runs ~50 times
92 1.1 oster * in a row without getting stuck)
93 1.1 oster *
94 1.1 oster * Revision 1.34 1996/06/02 17:31:48 jimz
95 1.1 oster * Moved a lot of global stuff into array structure, where it belongs.
96 1.1 oster * Fixed up paritylogging, pss modules in this manner. Some general
97 1.1 oster * code cleanup. Removed lots of dead code, some dead files.
98 1.1 oster *
99 1.1 oster * Revision 1.33 1996/05/30 23:22:16 jimz
100 1.1 oster * bugfixes of serialization, timing problems
101 1.1 oster * more cleanup
102 1.1 oster *
103 1.1 oster * Revision 1.32 1996/05/30 11:29:41 jimz
104 1.1 oster * Numerous bug fixes. Stripe lock release code disagreed with the taking code
105 1.1 oster * about when stripes should be locked (I made it consistent: no parity, no lock)
106 1.1 oster * There was a lot of extra serialization of I/Os which I've removed- a lot of
107 1.1 oster * it was to calculate values for the cache code, which is no longer with us.
108 1.1 oster * More types, function, macro cleanup. Added code to properly quiesce the array
109 1.1 oster * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
110 1.1 oster * before. Fixed memory allocation, freeing bugs.
111 1.1 oster *
112 1.1 oster * Revision 1.31 1996/05/27 18:56:37 jimz
113 1.1 oster * more code cleanup
114 1.1 oster * better typing
115 1.1 oster * compiles in all 3 environments
116 1.1 oster *
117 1.1 oster * Revision 1.30 1996/05/24 22:17:04 jimz
118 1.1 oster * continue code + namespace cleanup
119 1.1 oster * typed a bunch of flags
120 1.1 oster *
121 1.1 oster * Revision 1.29 1996/05/23 21:46:35 jimz
122 1.1 oster * checkpoint in code cleanup (release prep)
123 1.1 oster * lots of types, function names have been fixed
124 1.1 oster *
125 1.1 oster * Revision 1.28 1996/05/23 00:33:23 jimz
126 1.1 oster * code cleanup: move all debug decls to rf_options.c, all extern
127 1.1 oster * debug decls to rf_options.h, all debug vars preceded by rf_
128 1.1 oster *
129 1.1 oster * Revision 1.27 1996/05/18 19:51:34 jimz
130 1.1 oster * major code cleanup- fix syntax, make some types consistent,
131 1.1 oster * add prototypes, clean out dead code, et cetera
132 1.1 oster *
133 1.1 oster * Revision 1.26 1996/05/08 21:01:24 jimz
134 1.1 oster * fixed up enum type names that were conflicting with other
135 1.1 oster * enums and function names (ie, "panic")
136 1.1 oster * future naming trends will be towards RF_ and rf_ for
137 1.1 oster * everything raidframe-related
138 1.1 oster *
139 1.1 oster * Revision 1.25 1996/05/02 14:57:55 jimz
140 1.1 oster * add sectorMask
141 1.1 oster *
142 1.1 oster * Revision 1.24 1996/04/22 15:53:13 jimz
143 1.1 oster * MAX_RAIDS -> NRAIDFRAME
144 1.1 oster *
145 1.1 oster * Revision 1.23 1995/12/14 18:39:46 jimz
146 1.1 oster * convert to rf_types.h types
147 1.1 oster *
148 1.1 oster * Revision 1.22 1995/12/06 15:02:26 root
149 1.1 oster * added copyright info
150 1.1 oster *
151 1.1 oster * Revision 1.21 1995/10/09 17:39:24 jimz
152 1.1 oster * added info for tracking number of outstanding accesses
153 1.1 oster * at user-level
154 1.1 oster *
155 1.1 oster * Revision 1.20 1995/09/30 20:37:46 jimz
156 1.1 oster * added acc_totals to Raid for kernel
157 1.1 oster *
158 1.1 oster * Revision 1.19 1995/09/19 22:57:14 jimz
159 1.1 oster * add cache of raidid for kernel
160 1.1 oster *
161 1.1 oster * Revision 1.18 1995/09/18 16:50:04 jimz
162 1.1 oster * added RF_MAX_DISKS (for config ioctls)
163 1.1 oster *
164 1.1 oster * Revision 1.17 1995/09/07 19:02:31 jimz
165 1.1 oster * mods to get raidframe to compile and link
166 1.1 oster * in kernel environment
167 1.1 oster *
168 1.1 oster * Revision 1.16 1995/07/21 19:29:51 robby
169 1.1 oster * added some info for the idler to the Raid
170 1.1 oster *
171 1.1 oster * Revision 1.15 1995/07/16 03:19:14 cfb
172 1.1 oster * added cachePtr to *raidPtr
173 1.1 oster *
174 1.1 oster * Revision 1.14 1995/06/23 13:39:36 robby
175 1.1 oster * updeated to prototypes in rf_layout.h
176 1.1 oster *
177 1.1 oster */
178 1.1 oster
179 1.1 oster #ifndef _RF__RF_RAID_H_
180 1.1 oster #define _RF__RF_RAID_H_
181 1.1 oster
182 1.1 oster #ifdef _KERNEL
183 1.1 oster #define KERNEL
184 1.1 oster #endif
185 1.1 oster
186 1.1 oster #include "rf_archs.h"
187 1.1 oster #include "rf_types.h"
188 1.1 oster #include "rf_threadstuff.h"
189 1.1 oster
190 1.1 oster #if defined(__NetBSD__) && defined(_KERNEL)
191 1.1 oster #include "rf_netbsd.h"
192 1.1 oster #endif
193 1.1 oster
194 1.1 oster #ifdef KERNEL
195 1.1 oster /* XXX Needs to be added. GO
196 1.1 oster #include <raidframe.h>
197 1.1 oster */
198 1.1 oster #include <sys/disklabel.h>
199 1.1 oster #else /* KERNEL */
200 1.1 oster #include <stdio.h>
201 1.1 oster #include <assert.h>
202 1.1 oster #endif /* KERNEL */
203 1.1 oster #include <sys/types.h>
204 1.1 oster
205 1.1 oster #include "rf_alloclist.h"
206 1.1 oster #include "rf_stripelocks.h"
207 1.1 oster #include "rf_layout.h"
208 1.1 oster #include "rf_disks.h"
209 1.1 oster #include "rf_debugMem.h"
210 1.1 oster #include "rf_diskqueue.h"
211 1.1 oster #include "rf_reconstruct.h"
212 1.1 oster #include "rf_acctrace.h"
213 1.1 oster
214 1.1 oster #if RF_INCLUDE_PARITYLOGGING > 0
215 1.1 oster #include "rf_paritylog.h"
216 1.1 oster #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
217 1.1 oster
218 1.1 oster #define RF_MAX_DISKS 128 /* max disks per array (added for the config ioctls) */
219 1.1 oster #ifdef __NetBSD__
220 1.1 oster #define RF_DEV2RAIDID(_dev) (DISKUNIT(_dev)) /* convert dev_t to raid id via DISKUNIT() */
221 1.1 oster #else
222 1.1 oster #define RF_DEV2RAIDID(_dev) (minor(_dev)>>6) /* convert dev_t to raid id: minor number shifted right by 6 bits */
223 1.1 oster #endif /* __NetBSD__ */
224 1.1 oster
225 1.1 oster /*
226 1.1 oster * Each row in the array is a distinct parity group, so
227 1.1 oster * each has its own status, which is one of the following.
228 1.1 oster */
229 1.1 oster typedef enum RF_RowStatus_e { /* RF_Raid_s.status points at an array of these, one per row */
230 1.1 oster rf_rs_optimal,
231 1.1 oster rf_rs_degraded,
232 1.1 oster rf_rs_reconstructing,
233 1.1 oster rf_rs_reconfigured
234 1.1 oster } RF_RowStatus_t;
235 1.1 oster
236 1.1 oster struct RF_CumulativeStats_s { /* cumulative user I/O statistics (response time, I/O count, sectors moved) */
237 1.1 oster struct timeval start; /* the time when the stats were last started */
238 1.1 oster struct timeval stop; /* the time when the stats were last stopped */
239 1.1 oster long sum_io_us; /* sum of all user response times (us) */
240 1.1 oster long num_ios; /* total number of I/Os serviced */
241 1.1 oster long num_sect_moved; /* total number of sectors read or written */
242 1.1 oster };
243 1.1 oster
244 1.1 oster struct RF_ThroughputStats_s { /* throughput accounting: time during which at least one I/O is outstanding */
245 1.1 oster RF_DECLARE_MUTEX(mutex)/* a mutex used to lock the configuration stuff -- NOTE(review): presumably guards the fields below; wording looks copied from elsewhere, confirm */
246 1.1 oster struct timeval start; /* timer started when numOutstandingRequests moves from 0 to 1 */
247 1.1 oster struct timeval stop; /* timer stopped when numOutstandingRequests moves from 1 to 0 */
248 1.1 oster RF_uint64 sum_io_us; /* total time timer is enabled */
249 1.1 oster RF_uint64 num_ios; /* total number of ios processed by RAIDframe */
250 1.1 oster long num_out_ios; /* number of outstanding ios */
251 1.1 oster };
252 1.1 oster
253 1.1 oster #ifdef SIMULATE /* this type exists only in SIMULATE builds */
254 1.1 oster typedef struct RF_PendingRecon_s RF_PendingRecon_t;
255 1.1 oster struct RF_PendingRecon_s { /* list node; RF_Raid_s.pendingRecon points at one of these */
256 1.1 oster RF_RowCol_t row; /* presumably the row of the disk awaiting reconstruction -- TODO confirm */
257 1.1 oster RF_RowCol_t col; /* presumably the column of that disk -- TODO confirm */
258 1.1 oster RF_PendingRecon_t *next; /* link to the next node of the same type */
259 1.1 oster };
260 1.1 oster #endif /* SIMULATE */
261 1.1 oster
262 1.1 oster struct RF_Raid_s { /* per-array state: geometry, disks, queues, recon, quiescence, stats, engine control */
263 1.1 oster /* This portion never changes, and can be accessed without locking */
264 1.1 oster /* an exception is Disks[][].status, which requires locking when it is changed */
265 1.1 oster u_int numRow; /* number of rows of disks, typically == # of ranks */
266 1.1 oster u_int numCol; /* number of columns of disks, typically == # of disks/rank */
267 1.1 oster u_int numSpare; /* number of spare disks */
268 1.1 oster int maxQueueDepth; /* max disk queue depth */
269 1.1 oster RF_SectorCount_t totalSectors; /* total number of sectors in the array */
270 1.1 oster RF_SectorCount_t sectorsPerDisk; /* number of sectors on each disk */
271 1.1 oster u_int logBytesPerSector; /* base-2 log of the number of bytes in a sector */
272 1.1 oster u_int bytesPerSector; /* bytes in a sector */
273 1.1 oster RF_int32 sectorMask; /* mask of bytes-per-sector */
274 1.1 oster
275 1.1 oster RF_RaidLayout_t Layout; /* all information related to layout */
276 1.1 oster RF_RaidDisk_t **Disks; /* all information related to physical disks */
277 1.1 oster RF_DiskQueue_t **Queues; /* all information related to disk queues */
278 1.1 oster /* NOTE: This is an anchor point via which the queues can be accessed,
279 1.1 oster * but the enqueue/dequeue routines in diskqueue.c use a local copy of
280 1.1 oster * this pointer for the actual accesses.
281 1.1 oster */
282 1.1 oster /* The remainder of the structure can change, and therefore requires locking on reads and updates */
283 1.1 oster RF_DECLARE_MUTEX(mutex) /* mutex used to serialize access to the fields below */
284 1.1 oster RF_RowStatus_t *status; /* the status of each row in the array */
285 1.1 oster int valid; /* indicates successful configuration */
286 1.1 oster RF_LockTableEntry_t *lockTable; /* stripe-lock table */
287 1.1 oster RF_LockTableEntry_t *quiesceLock; /* quiescence table */
288 1.1 oster int numFailures; /* total number of failures in the array */
289 1.1 oster
290 1.1 oster /*
291 1.1 oster * Cleanup stuff
292 1.1 oster */
293 1.1 oster RF_ShutdownList_t *shutdownList; /* shutdown activities */
294 1.1 oster RF_AllocListElem_t *cleanupList; /* memory to be freed at shutdown time */
295 1.1 oster
296 1.1 oster /*
297 1.1 oster * Recon stuff
298 1.1 oster */
299 1.1 oster RF_HeadSepLimit_t headSepLimit;
300 1.1 oster int numFloatingReconBufs;
301 1.1 oster int reconInProgress;
302 1.1 oster #ifdef SIMULATE
303 1.1 oster RF_PendingRecon_t *pendingRecon; /* head of the RF_PendingRecon_t list (SIMULATE builds only) */
304 1.1 oster #endif /* SIMULATE */
305 1.1 oster RF_DECLARE_COND(waitForReconCond)
306 1.1 oster RF_RaidReconDesc_t *reconDesc; /* reconstruction descriptor */
307 1.1 oster RF_ReconCtrl_t **reconControl; /* reconstruction control structure pointers for each row in the array */
308 1.1 oster
309 1.1 oster #if !defined(KERNEL) && !defined(SIMULATE)
310 1.1 oster /*
311 1.1 oster * Disk thread stuff (user-level builds only: neither KERNEL nor SIMULATE)
312 1.1 oster */
313 1.1 oster int diskthreads_created;
314 1.1 oster int diskthreads_running;
315 1.1 oster int diskthreads_shutdown;
316 1.1 oster RF_DECLARE_MUTEX(diskthread_count_mutex)
317 1.1 oster RF_DECLARE_COND(diskthread_count_cond)
318 1.1 oster #endif /* !KERNEL && !SIMULATE */
319 1.1 oster
320 1.1 oster /*
321 1.1 oster * Array-quiescence stuff
322 1.1 oster */
323 1.1 oster RF_DECLARE_MUTEX(access_suspend_mutex)
324 1.1 oster RF_DECLARE_COND(quiescent_cond)
325 1.1 oster RF_IoCount_t accesses_suspended;
326 1.1 oster RF_IoCount_t accs_in_flight;
327 1.1 oster int access_suspend_release;
328 1.1 oster int waiting_for_quiescence;
329 1.1 oster RF_CallbackDesc_t *quiesce_wait_list;
330 1.1 oster
331 1.1 oster /*
332 1.1 oster * Statistics
333 1.1 oster */
334 1.1 oster #if !defined(KERNEL) && !defined(SIMULATE)
335 1.1 oster RF_ThroughputStats_t throughputstats; /* present in user-level builds only */
336 1.1 oster #endif /* !KERNEL && !SIMULATE */
337 1.1 oster RF_CumulativeStats_t userstats;
338 1.1 oster
339 1.1 oster /*
340 1.1 oster * Engine thread control
341 1.1 oster */
342 1.1 oster RF_DECLARE_MUTEX(node_queue_mutex)
343 1.1 oster RF_DECLARE_COND(node_queue_cond)
344 1.1 oster RF_DagNode_t *node_queue;
345 1.1 oster #ifndef SIMULATE
346 1.1 oster RF_Thread_t engine_thread;
347 1.1 oster RF_ThreadGroup_t engine_tg;
348 1.1 oster #endif /* !SIMULATE */
349 1.1 oster int shutdown_engine;
350 1.1 oster int dags_in_flight; /* debug */
351 1.1 oster
352 1.1 oster /*
353 1.1 oster * PSS (Parity Stripe Status) stuff
354 1.1 oster */
355 1.1 oster RF_FreeList_t *pss_freelist;
356 1.1 oster long pssTableSize;
357 1.1 oster
358 1.1 oster /*
359 1.1 oster * Reconstruction stuff
360 1.1 oster */
361 1.1 oster int procsInBufWait;
362 1.1 oster int numFullReconBuffers;
363 1.1 oster RF_AccTraceEntry_t *recon_tracerecs;
364 1.1 oster unsigned long accumXorTimeUs;
365 1.1 oster RF_ReconDoneProc_t *recon_done_procs;
366 1.1 oster RF_DECLARE_MUTEX(recon_done_proc_mutex)
367 1.1 oster
368 1.1 oster #if !defined(KERNEL) && !defined(SIMULATE)
369 1.1 oster RF_Thread_t **diskthreads, *sparediskthreads; /* thread descriptors for disk threads in user-level version */
370 1.1 oster #endif /* !KERNEL && !SIMULATE */
371 1.1 oster
372 1.1 oster /*
373 1.1 oster * nAccOutstanding, waitShutdown protected by desc freelist lock
374 1.1 oster * (This may seem strange, since that's a central serialization point
375 1.1 oster * for a per-array piece of data, but otherwise, it'd be an extra
376 1.1 oster * per-array lock, and that'd only be less efficient...)
377 1.1 oster */
378 1.1 oster RF_DECLARE_COND(outstandingCond)
379 1.1 oster int waitShutdown;
380 1.1 oster int nAccOutstanding;
381 1.1 oster
382 1.1 oster RF_DiskId_t **diskids;
383 1.1 oster RF_DiskId_t *sparediskids;
384 1.1 oster
385 1.1 oster #ifdef KERNEL /* KERNEL is defined near the top of this header whenever _KERNEL is */
386 1.1 oster int raidid;
387 1.1 oster #endif /* KERNEL */
388 1.1 oster RF_AccTotals_t acc_totals;
389 1.1 oster int keep_acc_totals;
390 1.1 oster
391 1.1 oster #ifdef _KERNEL
392 1.1 oster struct raidcinfo **raid_cinfo; /* array of component info */
393 1.1 oster struct proc *proc; /* XXX shouldn't be needed here.. :-p */
394 1.1 oster #endif /* _KERNEL */
395 1.1 oster
396 1.1 oster int terminate_disk_queues;
397 1.1 oster
398 1.1 oster /*
399 1.1 oster * XXX
400 1.1 oster *
401 1.1 oster * config-specific information should be moved
402 1.1 oster * somewhere else, or at least hung off this
403 1.1 oster * in some generic way
404 1.1 oster */
405 1.1 oster
406 1.1 oster /* used by rf_compute_workload_shift */
407 1.1 oster RF_RowCol_t hist_diskreq[RF_MAXROW][RF_MAXCOL]; /* doubly-indexed: [row][col] */
408 1.1 oster
409 1.1 oster /* used by declustering */
410 1.1 oster int noRotate;
411 1.1 oster
412 1.1 oster #if RF_INCLUDE_PARITYLOGGING > 0
413 1.1 oster /* used by parity logging */
414 1.1 oster RF_SectorCount_t regionLogCapacity;
415 1.1 oster RF_ParityLogQueue_t parityLogPool; /* pool of unused parity logs */
416 1.1 oster RF_RegionInfo_t *regionInfo; /* array of region state */
417 1.1 oster int numParityLogs;
418 1.1 oster int numSectorsPerLog;
419 1.1 oster int regionParityRange;
420 1.1 oster int logsInUse; /* debugging */
421 1.1 oster RF_ParityLogDiskQueue_t parityLogDiskQueue; /* state of parity logging disk work */
422 1.1 oster RF_RegionBufferQueue_t regionBufferPool; /* buffers for holding region log */
423 1.1 oster RF_RegionBufferQueue_t parityBufferPool; /* buffers for holding parity */
424 1.1 oster caddr_t parityLogBufferHeap; /* pool of unused parity logs -- NOTE(review): same wording as parityLogPool; presumably the backing memory for the log buffers, confirm */
425 1.1 oster #ifndef SIMULATE
426 1.1 oster RF_Thread_t pLogDiskThreadHandle;
427 1.1 oster #endif /* !SIMULATE */
428 1.1 oster
429 1.1 oster #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
430 1.1 oster };
431 1.1 oster
432 1.1 oster #endif /* !_RF__RF_RAID_H_ */
433