rf_driver.c revision 1.125 1 1.125 mrg /* $NetBSD: rf_driver.c,v 1.125 2011/04/27 07:55:14 mrg Exp $ */
2 1.9 oster /*-
3 1.9 oster * Copyright (c) 1999 The NetBSD Foundation, Inc.
4 1.9 oster * All rights reserved.
5 1.9 oster *
6 1.9 oster * This code is derived from software contributed to The NetBSD Foundation
7 1.9 oster * by Greg Oster
8 1.9 oster *
9 1.9 oster * Redistribution and use in source and binary forms, with or without
10 1.9 oster * modification, are permitted provided that the following conditions
11 1.9 oster * are met:
12 1.9 oster * 1. Redistributions of source code must retain the above copyright
13 1.9 oster * notice, this list of conditions and the following disclaimer.
14 1.9 oster * 2. Redistributions in binary form must reproduce the above copyright
15 1.9 oster * notice, this list of conditions and the following disclaimer in the
16 1.9 oster * documentation and/or other materials provided with the distribution.
17 1.9 oster *
18 1.9 oster * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19 1.9 oster * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 1.9 oster * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 1.9 oster * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22 1.9 oster * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 1.9 oster * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 1.9 oster * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 1.9 oster * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 1.9 oster * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 1.9 oster * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 1.9 oster * POSSIBILITY OF SUCH DAMAGE.
29 1.9 oster */
30 1.9 oster
31 1.1 oster /*
32 1.1 oster * Copyright (c) 1995 Carnegie-Mellon University.
33 1.1 oster * All rights reserved.
34 1.1 oster *
35 1.1 oster * Author: Mark Holland, Khalil Amiri, Claudson Bornstein, William V. Courtright II,
36 1.1 oster * Robby Findler, Daniel Stodolsky, Rachad Youssef, Jim Zelenka
37 1.1 oster *
38 1.1 oster * Permission to use, copy, modify and distribute this software and
39 1.1 oster * its documentation is hereby granted, provided that both the copyright
40 1.1 oster * notice and this permission notice appear in all copies of the
41 1.1 oster * software, derivative works or modified versions, and any portions
42 1.1 oster * thereof, and that both notices appear in supporting documentation.
43 1.1 oster *
44 1.1 oster * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
45 1.1 oster * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
46 1.1 oster * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
47 1.1 oster *
48 1.1 oster * Carnegie Mellon requests users of this software to return to
49 1.1 oster *
50 1.1 oster * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
51 1.1 oster * School of Computer Science
52 1.1 oster * Carnegie Mellon University
53 1.1 oster * Pittsburgh PA 15213-3890
54 1.1 oster *
55 1.1 oster * any improvements or extensions that they make and grant Carnegie the
56 1.1 oster * rights to redistribute these changes.
57 1.1 oster */
58 1.1 oster
59 1.1 oster /******************************************************************************
60 1.1 oster *
61 1.1 oster * rf_driver.c -- main setup, teardown, and access routines for the RAID driver
62 1.1 oster *
63 1.1 oster * all routines are prefixed with rf_ (raidframe), to avoid conflicts.
64 1.1 oster *
65 1.1 oster ******************************************************************************/
66 1.1 oster
67 1.44 lukem
68 1.44 lukem #include <sys/cdefs.h>
69 1.125 mrg __KERNEL_RCSID(0, "$NetBSD: rf_driver.c,v 1.125 2011/04/27 07:55:14 mrg Exp $");
70 1.71 martin
71 1.119 ad #ifdef _KERNEL_OPT
72 1.71 martin #include "opt_raid_diagnostic.h"
73 1.119 ad #endif
74 1.1 oster
75 1.1 oster #include <sys/param.h>
76 1.1 oster #include <sys/systm.h>
77 1.1 oster #include <sys/ioctl.h>
78 1.1 oster #include <sys/fcntl.h>
79 1.1 oster #include <sys/vnode.h>
80 1.1 oster
81 1.1 oster
82 1.1 oster #include "rf_archs.h"
83 1.1 oster #include "rf_threadstuff.h"
84 1.1 oster
85 1.1 oster #include <sys/errno.h>
86 1.1 oster
87 1.1 oster #include "rf_raid.h"
88 1.1 oster #include "rf_dag.h"
89 1.1 oster #include "rf_aselect.h"
90 1.1 oster #include "rf_diskqueue.h"
91 1.1 oster #include "rf_parityscan.h"
92 1.1 oster #include "rf_alloclist.h"
93 1.1 oster #include "rf_dagutils.h"
94 1.1 oster #include "rf_utils.h"
95 1.1 oster #include "rf_etimer.h"
96 1.1 oster #include "rf_acctrace.h"
97 1.1 oster #include "rf_general.h"
98 1.1 oster #include "rf_desc.h"
99 1.1 oster #include "rf_states.h"
100 1.1 oster #include "rf_decluster.h"
101 1.1 oster #include "rf_map.h"
102 1.1 oster #include "rf_revent.h"
103 1.1 oster #include "rf_callback.h"
104 1.1 oster #include "rf_engine.h"
105 1.1 oster #include "rf_mcpair.h"
106 1.1 oster #include "rf_nwayxor.h"
107 1.1 oster #include "rf_copyback.h"
108 1.1 oster #include "rf_driver.h"
109 1.1 oster #include "rf_options.h"
110 1.1 oster #include "rf_shutdown.h"
111 1.24 oster #include "rf_kintf.h"
112 1.122 jld #include "rf_paritymap.h"
113 1.1 oster
114 1.1 oster #include <sys/buf.h>
115 1.1 oster
116 1.61 oster #ifndef RF_ACCESS_DEBUG
117 1.61 oster #define RF_ACCESS_DEBUG 0
118 1.61 oster #endif
119 1.61 oster
120 1.1 oster /* rad == RF_RaidAccessDesc_t */
121 1.91 oster RF_DECLARE_MUTEX(rf_rad_lock)
122 1.1 oster #define RF_MAX_FREE_RAD 128
123 1.88 oster #define RF_MIN_FREE_RAD 32
124 1.1 oster
125 1.1 oster /* debug variables */
126 1.6 oster char rf_panicbuf[2048]; /* a buffer to hold an error msg when we panic */
127 1.1 oster
128 1.1 oster /* main configuration routines */
129 1.1 oster static int raidframe_booted = 0;
130 1.1 oster
131 1.6 oster static void rf_ConfigureDebug(RF_Config_t * cfgPtr);
132 1.1 oster static void set_debug_option(char *name, long val);
133 1.1 oster static void rf_UnconfigureArray(void);
134 1.1 oster static void rf_ShutdownRDFreeList(void *);
135 1.1 oster static int rf_ConfigureRDFreeList(RF_ShutdownList_t **);
136 1.1 oster
137 1.6 oster RF_DECLARE_MUTEX(rf_printf_mutex) /* debug only: avoids interleaved
138 1.6 oster * printfs by different stripes */
139 1.1 oster
140 1.1 oster #define SIGNAL_QUIESCENT_COND(_raid_) wakeup(&((_raid_)->accesses_suspended))
141 1.1 oster #define WAIT_FOR_QUIESCENCE(_raid_) \
142 1.38 oster ltsleep(&((_raid_)->accesses_suspended), PRIBIO, \
143 1.38 oster "raidframe quiesce", 0, &((_raid_)->access_suspend_mutex))
144 1.1 oster
145 1.9 oster static int configureCount = 0; /* number of active configurations */
146 1.9 oster static int isconfigged = 0; /* is basic raidframe (non per-array)
147 1.116 oster * stuff configured */
148 1.125 mrg static rf_declare_mutex2(configureMutex); /* used to lock the configuration
149 1.125 mrg * stuff */
150 1.9 oster static RF_ShutdownList_t *globalShutdown; /* non array-specific
151 1.9 oster * stuff */
152 1.1 oster
153 1.9 oster static int rf_ConfigureRDFreeList(RF_ShutdownList_t ** listp);
154 1.103 oster static int rf_AllocEmergBuffers(RF_Raid_t *);
155 1.103 oster static void rf_FreeEmergBuffers(RF_Raid_t *);
156 1.1 oster
157 1.1 oster /* called at system boot time */
158 1.107 perry int
159 1.121 cegger rf_BootRaidframe(void)
160 1.1 oster {
161 1.1 oster
162 1.6 oster if (raidframe_booted)
163 1.6 oster return (EBUSY);
164 1.6 oster raidframe_booted = 1;
165 1.125 mrg rf_init_mutex2(configureMutex, IPL_NONE);
166 1.79 oster configureCount = 0;
167 1.6 oster isconfigged = 0;
168 1.6 oster globalShutdown = NULL;
169 1.6 oster return (0);
170 1.1 oster }
171 1.1 oster
172 1.1 oster /*
173 1.1 oster * Called whenever an array is shutdown
174 1.1 oster */
175 1.107 perry static void
176 1.121 cegger rf_UnconfigureArray(void)
177 1.1 oster {
178 1.1 oster
179 1.125 mrg rf_lock_mutex2(configureMutex);
180 1.6 oster if (--configureCount == 0) { /* if no active configurations, shut
181 1.6 oster * everything down */
182 1.6 oster isconfigged = 0;
183 1.92 oster rf_ShutdownList(&globalShutdown);
184 1.6 oster
185 1.6 oster /*
186 1.6 oster * We must wait until now, because the AllocList module
187 1.6 oster * uses the DebugMem module.
188 1.6 oster */
189 1.60 oster #if RF_DEBUG_MEM
190 1.6 oster if (rf_memDebug)
191 1.6 oster rf_print_unfreed();
192 1.60 oster #endif
193 1.6 oster }
194 1.125 mrg rf_unlock_mutex2(configureMutex);
195 1.9 oster }
196 1.9 oster
197 1.1 oster /*
198 1.1 oster * Called to shut down an array.
199 1.1 oster */
200 1.107 perry int
201 1.80 oster rf_Shutdown(RF_Raid_t *raidPtr)
202 1.1 oster {
203 1.100 oster
204 1.6 oster if (!raidPtr->valid) {
205 1.6 oster RF_ERRORMSG("Attempt to shut down unconfigured RAIDframe driver. Aborting shutdown\n");
206 1.6 oster return (EINVAL);
207 1.6 oster }
208 1.6 oster /*
209 1.6 oster * wait for outstanding IOs to land
210 1.6 oster * As described in rf_raid.h, we use the rad_freelist lock
211 1.6 oster * to protect the per-array info about outstanding descs
212 1.6 oster * since we need to do freelist locking anyway, and this
213 1.6 oster * cuts down on the amount of serialization we've got going
214 1.6 oster * on.
215 1.6 oster */
216 1.91 oster RF_LOCK_MUTEX(rf_rad_lock);
217 1.6 oster if (raidPtr->waitShutdown) {
218 1.91 oster RF_UNLOCK_MUTEX(rf_rad_lock);
219 1.6 oster return (EBUSY);
220 1.6 oster }
221 1.6 oster raidPtr->waitShutdown = 1;
222 1.6 oster while (raidPtr->nAccOutstanding) {
223 1.91 oster RF_WAIT_COND(raidPtr->outstandingCond, rf_rad_lock);
224 1.6 oster }
225 1.91 oster RF_UNLOCK_MUTEX(rf_rad_lock);
226 1.35 oster
227 1.35 oster /* Wait for any parity re-writes to stop... */
228 1.35 oster while (raidPtr->parity_rewrite_in_progress) {
229 1.120 oster printf("raid%d: Waiting for parity re-write to exit...\n",
230 1.120 oster raidPtr->raidid);
231 1.35 oster tsleep(&raidPtr->parity_rewrite_in_progress, PRIBIO,
232 1.35 oster "rfprwshutdown", 0);
233 1.35 oster }
234 1.6 oster
235 1.120 oster /* Wait for any reconstruction to stop... */
236 1.120 oster while (raidPtr->reconInProgress) {
237 1.120 oster printf("raid%d: Waiting for reconstruction to stop...\n",
238 1.120 oster raidPtr->raidid);
239 1.120 oster tsleep(&raidPtr->waitForReconCond, PRIBIO,
240 1.120 oster "rfreshutdown",0);
241 1.120 oster }
242 1.120 oster
243 1.6 oster raidPtr->valid = 0;
244 1.6 oster
245 1.122 jld if (raidPtr->parity_map != NULL)
246 1.122 jld rf_paritymap_detach(raidPtr);
247 1.122 jld
248 1.37 oster rf_update_component_labels(raidPtr, RF_FINAL_COMPONENT_UPDATE);
249 1.6 oster
250 1.7 oster rf_UnconfigureVnodes(raidPtr);
251 1.7 oster
252 1.103 oster rf_FreeEmergBuffers(raidPtr);
253 1.100 oster
254 1.7 oster rf_ShutdownList(&raidPtr->shutdownList);
255 1.7 oster
256 1.7 oster rf_UnconfigureArray();
257 1.7 oster
258 1.7 oster return (0);
259 1.7 oster }
260 1.1 oster
261 1.6 oster
/*
 * Configuration helper macros used by rf_Configure().
 *
 * Each one is wrapped in do { } while (0) so that it behaves as a
 * single statement (safe inside an unbraced if/else); every use must be
 * followed by a semicolon.
 */

/*
 * Run global configuration routine f against globalShutdown.  Expects
 * an int `rc' in scope and configureMutex held by the caller: on
 * failure it unwinds the global shutdown list, drops the configuration
 * count, releases configureMutex and returns rc from the enclosing
 * function.
 */
#define DO_INIT_CONFIGURE(f) do { \
	rc = f (&globalShutdown); \
	if (rc) { \
		RF_ERRORMSG2("RAIDFRAME: failed %s with %d\n", RF_STRING(f), rc); \
		rf_ShutdownList(&globalShutdown); \
		configureCount--; \
		rf_unlock_mutex2(configureMutex); \
		return(rc); \
	} \
} while (/*CONSTCOND*/ 0)

/*
 * Undo a partially completed per-array configuration.  Expects
 * `raidPtr' in scope.
 */
#define DO_RAID_FAIL() do { \
	rf_UnconfigureVnodes(raidPtr); \
	rf_FreeEmergBuffers(raidPtr); \
	rf_ShutdownList(&raidPtr->shutdownList); \
	rf_UnconfigureArray(); \
} while (/*CONSTCOND*/ 0)

/*
 * Run per-array configuration routine f.  Expects `rc', `raidPtr' and
 * `cfgPtr' in scope; on failure it performs DO_RAID_FAIL() and returns
 * rc from the enclosing function.
 */
#define DO_RAID_INIT_CONFIGURE(f) do { \
	rc = f (&raidPtr->shutdownList, raidPtr, cfgPtr); \
	if (rc) { \
		RF_ERRORMSG2("RAIDFRAME: failed %s with %d\n", RF_STRING(f), rc); \
		DO_RAID_FAIL(); \
		return(rc); \
	} \
} while (/*CONSTCOND*/ 0)

/* Initialise the mutex pointed to by _m_. */
#define DO_RAID_MUTEX(_m_) do { \
	rf_mutex_init((_m_)); \
} while (/*CONSTCOND*/ 0)
292 1.1 oster
293 1.107 perry int
294 1.80 oster rf_Configure(RF_Raid_t *raidPtr, RF_Config_t *cfgPtr, RF_AutoConfig_t *ac)
295 1.6 oster {
296 1.72 oster RF_RowCol_t col;
297 1.103 oster int rc;
298 1.6 oster
299 1.125 mrg rf_lock_mutex2(configureMutex);
300 1.6 oster configureCount++;
301 1.6 oster if (isconfigged == 0) {
302 1.75 oster rf_mutex_init(&rf_printf_mutex);
303 1.75 oster
304 1.6 oster /* initialize globals */
305 1.6 oster
306 1.6 oster DO_INIT_CONFIGURE(rf_ConfigureAllocList);
307 1.28 oster
308 1.6 oster /*
309 1.28 oster * Yes, this does make debugging general to the whole
310 1.107 perry * system instead of being array specific. Bummer, drag.
311 1.28 oster */
312 1.6 oster rf_ConfigureDebug(cfgPtr);
313 1.6 oster DO_INIT_CONFIGURE(rf_ConfigureDebugMem);
314 1.87 oster #if RF_ACC_TRACE > 0
315 1.6 oster DO_INIT_CONFIGURE(rf_ConfigureAccessTrace);
316 1.87 oster #endif
317 1.6 oster DO_INIT_CONFIGURE(rf_ConfigureMapModule);
318 1.6 oster DO_INIT_CONFIGURE(rf_ConfigureReconEvent);
319 1.6 oster DO_INIT_CONFIGURE(rf_ConfigureCallback);
320 1.6 oster DO_INIT_CONFIGURE(rf_ConfigureRDFreeList);
321 1.6 oster DO_INIT_CONFIGURE(rf_ConfigureNWayXor);
322 1.6 oster DO_INIT_CONFIGURE(rf_ConfigureStripeLockFreeList);
323 1.6 oster DO_INIT_CONFIGURE(rf_ConfigureMCPair);
324 1.6 oster DO_INIT_CONFIGURE(rf_ConfigureDAGs);
325 1.6 oster DO_INIT_CONFIGURE(rf_ConfigureDAGFuncs);
326 1.6 oster DO_INIT_CONFIGURE(rf_ConfigureReconstruction);
327 1.6 oster DO_INIT_CONFIGURE(rf_ConfigureCopyback);
328 1.6 oster DO_INIT_CONFIGURE(rf_ConfigureDiskQueueSystem);
329 1.110 oster DO_INIT_CONFIGURE(rf_ConfigurePSStatus);
330 1.6 oster isconfigged = 1;
331 1.6 oster }
332 1.125 mrg rf_unlock_mutex2(configureMutex);
333 1.6 oster
334 1.6 oster DO_RAID_MUTEX(&raidPtr->mutex);
335 1.6 oster /* set up the cleanup list. Do this after ConfigureDebug so that
336 1.6 oster * value of memDebug will be set */
337 1.6 oster
338 1.6 oster rf_MakeAllocList(raidPtr->cleanupList);
339 1.6 oster if (raidPtr->cleanupList == NULL) {
340 1.6 oster DO_RAID_FAIL();
341 1.6 oster return (ENOMEM);
342 1.6 oster }
343 1.86 oster rf_ShutdownCreate(&raidPtr->shutdownList,
344 1.86 oster (void (*) (void *)) rf_FreeAllocList,
345 1.86 oster raidPtr->cleanupList);
346 1.86 oster
347 1.6 oster raidPtr->numCol = cfgPtr->numCol;
348 1.6 oster raidPtr->numSpare = cfgPtr->numSpare;
349 1.6 oster
350 1.72 oster raidPtr->status = rf_rs_optimal;
351 1.72 oster raidPtr->reconControl = NULL;
352 1.107 perry
353 1.6 oster DO_RAID_INIT_CONFIGURE(rf_ConfigureEngine);
354 1.6 oster DO_RAID_INIT_CONFIGURE(rf_ConfigureStripeLocks);
355 1.6 oster
356 1.76 oster raidPtr->outstandingCond = 0;
357 1.6 oster
358 1.6 oster raidPtr->nAccOutstanding = 0;
359 1.6 oster raidPtr->waitShutdown = 0;
360 1.6 oster
361 1.6 oster DO_RAID_MUTEX(&raidPtr->access_suspend_mutex);
362 1.6 oster
363 1.76 oster raidPtr->waitForReconCond = 0;
364 1.6 oster
365 1.28 oster if (ac!=NULL) {
366 1.28 oster /* We have an AutoConfig structure.. Don't do the
367 1.28 oster normal disk configuration... call the auto config
368 1.28 oster stuff */
369 1.28 oster rf_AutoConfigureDisks(raidPtr, cfgPtr, ac);
370 1.28 oster } else {
371 1.28 oster DO_RAID_INIT_CONFIGURE(rf_ConfigureDisks);
372 1.28 oster DO_RAID_INIT_CONFIGURE(rf_ConfigureSpareDisks);
373 1.28 oster }
374 1.6 oster /* do this after ConfigureDisks & ConfigureSpareDisks to be sure dev
375 1.6 oster * no. is set */
376 1.6 oster DO_RAID_INIT_CONFIGURE(rf_ConfigureDiskQueues);
377 1.6 oster
378 1.6 oster DO_RAID_INIT_CONFIGURE(rf_ConfigureLayout);
379 1.6 oster
380 1.110 oster /* Initialize per-RAID PSS bits */
381 1.110 oster rf_InitPSStatus(raidPtr);
382 1.6 oster
383 1.82 oster #if RF_INCLUDE_CHAINDECLUSTER > 0
384 1.72 oster for (col = 0; col < raidPtr->numCol; col++) {
385 1.72 oster /*
386 1.72 oster * XXX better distribution
387 1.72 oster */
388 1.72 oster raidPtr->hist_diskreq[col] = 0;
389 1.6 oster }
390 1.82 oster #endif
391 1.30 oster raidPtr->numNewFailures = 0;
392 1.28 oster raidPtr->copyback_in_progress = 0;
393 1.28 oster raidPtr->parity_rewrite_in_progress = 0;
394 1.66 oster raidPtr->adding_hot_spare = 0;
395 1.28 oster raidPtr->recon_in_progress = 0;
396 1.29 oster raidPtr->maxOutstanding = cfgPtr->maxOutstandingDiskReqs;
397 1.29 oster
398 1.107 perry /* autoconfigure and root_partition will actually get filled in
399 1.29 oster after the config is done */
400 1.29 oster raidPtr->autoconfigure = 0;
401 1.29 oster raidPtr->root_partition = 0;
402 1.29 oster raidPtr->last_unit = raidPtr->raidid;
403 1.29 oster raidPtr->config_order = 0;
404 1.6 oster
405 1.6 oster if (rf_keepAccTotals) {
406 1.6 oster raidPtr->keep_acc_totals = 1;
407 1.6 oster }
408 1.1 oster
409 1.97 oster /* Allocate a bunch of buffers to be used in low-memory conditions */
410 1.97 oster raidPtr->iobuf = NULL;
411 1.103 oster
412 1.107 perry rc = rf_AllocEmergBuffers(raidPtr);
413 1.103 oster if (rc) {
414 1.103 oster printf("raid%d: Unable to allocate emergency buffers.\n",
415 1.103 oster raidPtr->raidid);
416 1.103 oster DO_RAID_FAIL();
417 1.103 oster return(rc);
418 1.103 oster }
419 1.103 oster
420 1.122 jld /* Set up parity map stuff, if applicable. */
421 1.122 jld #ifndef RF_NO_PARITY_MAP
422 1.122 jld rf_paritymap_attach(raidPtr, cfgPtr->force);
423 1.122 jld #endif
424 1.122 jld
425 1.103 oster raidPtr->valid = 1;
426 1.103 oster
427 1.103 oster printf("raid%d: %s\n", raidPtr->raidid,
428 1.103 oster raidPtr->Layout.map->configName);
429 1.103 oster printf("raid%d: Components:", raidPtr->raidid);
430 1.103 oster
431 1.103 oster for (col = 0; col < raidPtr->numCol; col++) {
432 1.103 oster printf(" %s", raidPtr->Disks[col].devname);
433 1.103 oster if (RF_DEAD_DISK(raidPtr->Disks[col].status)) {
434 1.103 oster printf("[**FAILED**]");
435 1.103 oster }
436 1.103 oster }
437 1.103 oster printf("\n");
438 1.117 sborrill printf("raid%d: Total Sectors: %" PRIu64 " (%" PRIu64 " MB)\n",
439 1.103 oster raidPtr->raidid,
440 1.117 sborrill raidPtr->totalSectors,
441 1.117 sborrill (raidPtr->totalSectors / 1024 *
442 1.103 oster (1 << raidPtr->logBytesPerSector) / 1024));
443 1.103 oster
444 1.103 oster return (0);
445 1.103 oster }
446 1.103 oster
447 1.103 oster
448 1.103 oster /*
449 1.103 oster
450 1.103 oster Routines to allocate and free the "emergency buffers" for a given
451 1.103 oster RAID set. These emergency buffers will be used when the kernel runs
452 1.107 perry out of kernel memory.
453 1.107 perry
454 1.103 oster */
455 1.103 oster
456 1.107 perry static int
457 1.103 oster rf_AllocEmergBuffers(RF_Raid_t *raidPtr)
458 1.103 oster {
459 1.103 oster void *tmpbuf;
460 1.103 oster RF_VoidPointerListElem_t *vple;
461 1.103 oster int i;
462 1.103 oster
463 1.97 oster /* XXX next line needs tuning... */
464 1.97 oster raidPtr->numEmergencyBuffers = 10 * raidPtr->numCol;
465 1.97 oster #if DEBUG
466 1.97 oster printf("raid%d: allocating %d buffers of %d bytes.\n",
467 1.97 oster raidPtr->raidid,
468 1.107 perry raidPtr->numEmergencyBuffers,
469 1.107 perry (int)(raidPtr->Layout.sectorsPerStripeUnit <<
470 1.97 oster raidPtr->logBytesPerSector));
471 1.97 oster #endif
472 1.97 oster for (i = 0; i < raidPtr->numEmergencyBuffers; i++) {
473 1.107 perry tmpbuf = malloc( raidPtr->Layout.sectorsPerStripeUnit <<
474 1.107 perry raidPtr->logBytesPerSector,
475 1.111 oster M_RAIDFRAME, M_WAITOK);
476 1.97 oster if (tmpbuf) {
477 1.99 oster vple = rf_AllocVPListElem();
478 1.99 oster vple->p= tmpbuf;
479 1.99 oster vple->next = raidPtr->iobuf;
480 1.99 oster raidPtr->iobuf = vple;
481 1.97 oster raidPtr->iobuf_count++;
482 1.97 oster } else {
483 1.97 oster printf("raid%d: failed to allocate emergency buffer!\n",
484 1.97 oster raidPtr->raidid);
485 1.111 oster return 1;
486 1.97 oster }
487 1.97 oster }
488 1.97 oster
489 1.99 oster /* XXX next line needs tuning too... */
490 1.99 oster raidPtr->numEmergencyStripeBuffers = 10;
491 1.99 oster for (i = 0; i < raidPtr->numEmergencyStripeBuffers; i++) {
492 1.99 oster tmpbuf = malloc( raidPtr->numCol * (raidPtr->Layout.sectorsPerStripeUnit <<
493 1.99 oster raidPtr->logBytesPerSector),
494 1.111 oster M_RAIDFRAME, M_WAITOK);
495 1.99 oster if (tmpbuf) {
496 1.99 oster vple = rf_AllocVPListElem();
497 1.99 oster vple->p= tmpbuf;
498 1.99 oster vple->next = raidPtr->stripebuf;
499 1.99 oster raidPtr->stripebuf = vple;
500 1.99 oster raidPtr->stripebuf_count++;
501 1.99 oster } else {
502 1.99 oster printf("raid%d: failed to allocate emergency stripe buffer!\n",
503 1.99 oster raidPtr->raidid);
504 1.111 oster return 1;
505 1.99 oster }
506 1.99 oster }
507 1.107 perry
508 1.103 oster return (0);
509 1.103 oster }
510 1.99 oster
511 1.103 oster static void
512 1.103 oster rf_FreeEmergBuffers(RF_Raid_t *raidPtr)
513 1.103 oster {
514 1.103 oster RF_VoidPointerListElem_t *tmp;
515 1.99 oster
516 1.103 oster /* Free the emergency IO buffers */
517 1.103 oster while (raidPtr->iobuf != NULL) {
518 1.103 oster tmp = raidPtr->iobuf;
519 1.103 oster raidPtr->iobuf = raidPtr->iobuf->next;
520 1.103 oster free(tmp->p, M_RAIDFRAME);
521 1.103 oster rf_FreeVPListElem(tmp);
522 1.103 oster }
523 1.52 oster
524 1.103 oster /* Free the emergency stripe buffers */
525 1.103 oster while (raidPtr->stripebuf != NULL) {
526 1.103 oster tmp = raidPtr->stripebuf;
527 1.103 oster raidPtr->stripebuf = raidPtr->stripebuf->next;
528 1.103 oster free(tmp->p, M_RAIDFRAME);
529 1.103 oster rf_FreeVPListElem(tmp);
530 1.52 oster }
531 1.103 oster }
532 1.50 oster
533 1.1 oster
534 1.107 perry static void
535 1.113 christos rf_ShutdownRDFreeList(void *ignored)
536 1.1 oster {
537 1.89 oster pool_destroy(&rf_pools.rad);
538 1.1 oster }
539 1.1 oster
540 1.107 perry static int
541 1.80 oster rf_ConfigureRDFreeList(RF_ShutdownList_t **listp)
542 1.1 oster {
543 1.1 oster
544 1.89 oster rf_pool_init(&rf_pools.rad, sizeof(RF_RaidAccessDesc_t),
545 1.89 oster "rf_rad_pl", RF_MIN_FREE_RAD, RF_MAX_FREE_RAD);
546 1.86 oster rf_ShutdownCreate(listp, rf_ShutdownRDFreeList, NULL);
547 1.91 oster simple_lock_init(&rf_rad_lock);
548 1.6 oster return (0);
549 1.6 oster }
550 1.6 oster
551 1.6 oster RF_RaidAccessDesc_t *
552 1.80 oster rf_AllocRaidAccDesc(RF_Raid_t *raidPtr, RF_IoType_t type,
553 1.80 oster RF_RaidAddr_t raidAddress, RF_SectorCount_t numBlocks,
554 1.114 christos void *bufPtr, void *bp, RF_RaidAccessFlags_t flags,
555 1.102 drochner const RF_AccessState_t *states)
556 1.6 oster {
557 1.6 oster RF_RaidAccessDesc_t *desc;
558 1.6 oster
559 1.89 oster desc = pool_get(&rf_pools.rad, PR_WAITOK);
560 1.73 oster
561 1.91 oster RF_LOCK_MUTEX(rf_rad_lock);
562 1.6 oster if (raidPtr->waitShutdown) {
563 1.6 oster /*
564 1.6 oster * Actually, we're shutting the array down. Free the desc
565 1.6 oster * and return NULL.
566 1.6 oster */
567 1.73 oster
568 1.91 oster RF_UNLOCK_MUTEX(rf_rad_lock);
569 1.89 oster pool_put(&rf_pools.rad, desc);
570 1.6 oster return (NULL);
571 1.6 oster }
572 1.6 oster raidPtr->nAccOutstanding++;
573 1.73 oster
574 1.91 oster RF_UNLOCK_MUTEX(rf_rad_lock);
575 1.6 oster
576 1.6 oster desc->raidPtr = (void *) raidPtr;
577 1.6 oster desc->type = type;
578 1.6 oster desc->raidAddress = raidAddress;
579 1.6 oster desc->numBlocks = numBlocks;
580 1.6 oster desc->bufPtr = bufPtr;
581 1.6 oster desc->bp = bp;
582 1.6 oster desc->flags = flags;
583 1.6 oster desc->states = states;
584 1.6 oster desc->state = 0;
585 1.99 oster desc->dagList = NULL;
586 1.6 oster
587 1.6 oster desc->status = 0;
588 1.105 oster desc->numRetries = 0;
589 1.87 oster #if RF_ACC_TRACE > 0
590 1.40 thorpej memset((char *) &desc->tracerec, 0, sizeof(RF_AccTraceEntry_t));
591 1.87 oster #endif
592 1.41 oster desc->callbackFunc = NULL;
593 1.41 oster desc->callbackArg = NULL;
594 1.6 oster desc->next = NULL;
595 1.99 oster desc->iobufs = NULL;
596 1.99 oster desc->stripebufs = NULL;
597 1.99 oster
598 1.6 oster return (desc);
599 1.6 oster }
600 1.6 oster
601 1.107 perry void
602 1.80 oster rf_FreeRaidAccDesc(RF_RaidAccessDesc_t *desc)
603 1.6 oster {
604 1.6 oster RF_Raid_t *raidPtr = desc->raidPtr;
605 1.85 oster RF_DagList_t *dagList, *temp;
606 1.99 oster RF_VoidPointerListElem_t *tmp;
607 1.6 oster
608 1.6 oster RF_ASSERT(desc);
609 1.6 oster
610 1.85 oster /* Cleanup the dagList(s) */
611 1.85 oster dagList = desc->dagList;
612 1.85 oster while(dagList != NULL) {
613 1.85 oster temp = dagList;
614 1.85 oster dagList = dagList->next;
615 1.85 oster rf_FreeDAGList(temp);
616 1.85 oster }
617 1.85 oster
618 1.99 oster while (desc->iobufs) {
619 1.99 oster tmp = desc->iobufs;
620 1.99 oster desc->iobufs = desc->iobufs->next;
621 1.99 oster rf_FreeIOBuffer(raidPtr, tmp);
622 1.99 oster }
623 1.99 oster
624 1.99 oster while (desc->stripebufs) {
625 1.99 oster tmp = desc->stripebufs;
626 1.99 oster desc->stripebufs = desc->stripebufs->next;
627 1.99 oster rf_FreeStripeBuffer(raidPtr, tmp);
628 1.99 oster }
629 1.99 oster
630 1.89 oster pool_put(&rf_pools.rad, desc);
631 1.91 oster RF_LOCK_MUTEX(rf_rad_lock);
632 1.6 oster raidPtr->nAccOutstanding--;
633 1.6 oster if (raidPtr->waitShutdown) {
634 1.6 oster RF_SIGNAL_COND(raidPtr->outstandingCond);
635 1.6 oster }
636 1.91 oster RF_UNLOCK_MUTEX(rf_rad_lock);
637 1.1 oster }
638 1.1 oster /*********************************************************************
639 1.1 oster * Main routine for performing an access.
640 1.1 oster * Accesses are retried until a DAG can not be selected. This occurs
641 1.1 oster * when either the DAG library is incomplete or there are too many
642 1.1 oster * failures in a parity group.
643 1.80 oster *
644 1.80 oster * type should be read or write async_flag should be RF_TRUE or
645 1.114 christos * RF_FALSE bp_in is a buf pointer. void *to facilitate ignoring it
646 1.80 oster * outside the kernel
647 1.1 oster ********************************************************************/
648 1.107 perry int
649 1.80 oster rf_DoAccess(RF_Raid_t * raidPtr, RF_IoType_t type, int async_flag,
650 1.80 oster RF_RaidAddr_t raidAddress, RF_SectorCount_t numBlocks,
651 1.114 christos void *bufPtr, struct buf *bp, RF_RaidAccessFlags_t flags)
652 1.1 oster {
653 1.6 oster RF_RaidAccessDesc_t *desc;
654 1.114 christos void *lbufPtr = bufPtr;
655 1.6 oster
656 1.6 oster raidAddress += rf_raidSectorOffset;
657 1.6 oster
658 1.61 oster #if RF_ACCESS_DEBUG
659 1.6 oster if (rf_accessDebug) {
660 1.1 oster
661 1.6 oster printf("logBytes is: %d %d %d\n", raidPtr->raidid,
662 1.6 oster raidPtr->logBytesPerSector,
663 1.6 oster (int) rf_RaidAddressToByte(raidPtr, numBlocks));
664 1.22 oster printf("raid%d: %s raidAddr %d (stripeid %d-%d) numBlocks %d (%d bytes) buf 0x%lx\n", raidPtr->raidid,
665 1.6 oster (type == RF_IO_TYPE_READ) ? "READ" : "WRITE", (int) raidAddress,
666 1.6 oster (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress),
667 1.6 oster (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress + numBlocks - 1),
668 1.6 oster (int) numBlocks,
669 1.6 oster (int) rf_RaidAddressToByte(raidPtr, numBlocks),
670 1.6 oster (long) bufPtr);
671 1.6 oster }
672 1.61 oster #endif
673 1.1 oster
674 1.6 oster desc = rf_AllocRaidAccDesc(raidPtr, type, raidAddress,
675 1.41 oster numBlocks, lbufPtr, bp, flags, raidPtr->Layout.map->states);
676 1.1 oster
677 1.6 oster if (desc == NULL) {
678 1.6 oster return (ENOMEM);
679 1.6 oster }
680 1.87 oster #if RF_ACC_TRACE > 0
681 1.6 oster RF_ETIMER_START(desc->tracerec.tot_timer);
682 1.87 oster #endif
683 1.6 oster desc->async_flag = async_flag;
684 1.3 explorer
685 1.122 jld if (raidPtr->parity_map != NULL &&
686 1.122 jld type == RF_IO_TYPE_WRITE)
687 1.122 jld rf_paritymap_begin(raidPtr->parity_map, raidAddress,
688 1.122 jld numBlocks);
689 1.122 jld
690 1.6 oster rf_ContinueRaidAccess(desc);
691 1.1 oster
692 1.6 oster return (0);
693 1.1 oster }
#if 0
/*
 * Force the array into reconfigured mode without doing reconstruction.
 * (Compiled out.)
 */
int
rf_SetReconfiguredMode(RF_Raid_t *raidPtr, int col)
{
	if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) == 0) {
		printf("Can't set reconfigured mode in dedicated-spare array\n");
		RF_PANIC();
	}

	RF_LOCK_MUTEX(raidPtr->mutex);
	raidPtr->numFailures++;
	raidPtr->Disks[col].status = rf_ds_dist_spared;
	raidPtr->status = rf_rs_reconfigured;
	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);

	/*
	 * The spare table is installed only for the declustering +
	 * distributed sparing architecture.
	 */
	if ((raidPtr->Layout.map->flags & RF_BD_DECLUSTERED) != 0) {
		rf_InstallSpareTable(raidPtr, col);
	}
	RF_UNLOCK_MUTEX(raidPtr->mutex);

	return (0);
}
#endif
716 1.1 oster
717 1.107 perry int
718 1.80 oster rf_FailDisk(RF_Raid_t *raidPtr, int fcol, int initRecon)
719 1.6 oster {
720 1.98 oster
721 1.98 oster /* need to suspend IO's here -- if there are DAGs in flight
722 1.107 perry and we pull the rug out from under ci_vp, Bad Things
723 1.98 oster can happen. */
724 1.98 oster
725 1.98 oster rf_SuspendNewRequestsAndWait(raidPtr);
726 1.98 oster
727 1.6 oster RF_LOCK_MUTEX(raidPtr->mutex);
728 1.72 oster if (raidPtr->Disks[fcol].status != rf_ds_failed) {
729 1.68 oster /* must be failing something that is valid, or else it's
730 1.107 perry already marked as failed (in which case we don't
731 1.68 oster want to mark it failed again!) */
732 1.68 oster raidPtr->numFailures++;
733 1.72 oster raidPtr->Disks[fcol].status = rf_ds_failed;
734 1.107 perry raidPtr->status = rf_rs_degraded;
735 1.68 oster }
736 1.65 oster RF_UNLOCK_MUTEX(raidPtr->mutex);
737 1.107 perry
738 1.37 oster rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
739 1.107 perry
740 1.107 perry /* Close the component, so that it's not "locked" if someone
741 1.56 oster else want's to use it! */
742 1.56 oster
743 1.72 oster rf_close_component(raidPtr, raidPtr->raid_cinfo[fcol].ci_vp,
744 1.72 oster raidPtr->Disks[fcol].auto_configured);
745 1.65 oster
746 1.65 oster RF_LOCK_MUTEX(raidPtr->mutex);
747 1.72 oster raidPtr->raid_cinfo[fcol].ci_vp = NULL;
748 1.56 oster
749 1.107 perry /* Need to mark the component as not being auto_configured
750 1.56 oster (in case it was previously). */
751 1.56 oster
752 1.72 oster raidPtr->Disks[fcol].auto_configured = 0;
753 1.65 oster RF_UNLOCK_MUTEX(raidPtr->mutex);
754 1.98 oster /* now we can allow IO to continue -- we'll be suspending it
755 1.98 oster again in rf_ReconstructFailedDisk() if we have to.. */
756 1.98 oster
757 1.98 oster rf_ResumeNewRequests(raidPtr);
758 1.56 oster
759 1.6 oster if (initRecon)
760 1.72 oster rf_ReconstructFailedDisk(raidPtr, fcol);
761 1.6 oster return (0);
762 1.1 oster }
763 1.1 oster /* releases a thread that is waiting for the array to become quiesced.
764 1.1 oster * access_suspend_mutex should be locked upon calling this
765 1.1 oster */
766 1.107 perry void
767 1.80 oster rf_SignalQuiescenceLock(RF_Raid_t *raidPtr)
768 1.6 oster {
769 1.61 oster #if RF_DEBUG_QUIESCE
770 1.6 oster if (rf_quiesceDebug) {
771 1.107 perry printf("raid%d: Signalling quiescence lock\n",
772 1.22 oster raidPtr->raidid);
773 1.6 oster }
774 1.61 oster #endif
775 1.6 oster raidPtr->access_suspend_release = 1;
776 1.6 oster
777 1.6 oster if (raidPtr->waiting_for_quiescence) {
778 1.6 oster SIGNAL_QUIESCENT_COND(raidPtr);
779 1.6 oster }
780 1.1 oster }
781 1.1 oster /* suspends all new requests to the array. No effect on accesses that are in flight. */
782 1.107 perry int
783 1.80 oster rf_SuspendNewRequestsAndWait(RF_Raid_t *raidPtr)
784 1.6 oster {
785 1.61 oster #if RF_DEBUG_QUIESCE
786 1.6 oster if (rf_quiesceDebug)
787 1.53 oster printf("raid%d: Suspending new reqs\n", raidPtr->raidid);
788 1.61 oster #endif
789 1.6 oster RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
790 1.6 oster raidPtr->accesses_suspended++;
791 1.6 oster raidPtr->waiting_for_quiescence = (raidPtr->accs_in_flight == 0) ? 0 : 1;
792 1.6 oster
793 1.6 oster if (raidPtr->waiting_for_quiescence) {
794 1.6 oster raidPtr->access_suspend_release = 0;
795 1.6 oster while (!raidPtr->access_suspend_release) {
796 1.93 oster #if RF_DEBUG_QUIESCE
797 1.53 oster printf("raid%d: Suspending: Waiting for Quiescence\n",
798 1.53 oster raidPtr->raidid);
799 1.93 oster #endif
800 1.6 oster WAIT_FOR_QUIESCENCE(raidPtr);
801 1.6 oster raidPtr->waiting_for_quiescence = 0;
802 1.6 oster }
803 1.6 oster }
804 1.93 oster #if RF_DEBUG_QUIESCE
805 1.53 oster printf("raid%d: Quiescence reached..\n", raidPtr->raidid);
806 1.93 oster #endif
807 1.1 oster
808 1.6 oster RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
809 1.6 oster return (raidPtr->waiting_for_quiescence);
810 1.1 oster }
811 1.1 oster /* wake up everyone waiting for quiescence to be released */
812 1.107 perry void
813 1.80 oster rf_ResumeNewRequests(RF_Raid_t *raidPtr)
814 1.6 oster {
815 1.6 oster RF_CallbackDesc_t *t, *cb;
816 1.6 oster
817 1.61 oster #if RF_DEBUG_QUIESCE
818 1.6 oster if (rf_quiesceDebug)
819 1.116 oster printf("raid%d: Resuming new requests\n", raidPtr->raidid);
820 1.61 oster #endif
821 1.6 oster
822 1.6 oster RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
823 1.6 oster raidPtr->accesses_suspended--;
824 1.6 oster if (raidPtr->accesses_suspended == 0)
825 1.6 oster cb = raidPtr->quiesce_wait_list;
826 1.6 oster else
827 1.6 oster cb = NULL;
828 1.6 oster raidPtr->quiesce_wait_list = NULL;
829 1.6 oster RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
830 1.6 oster
831 1.6 oster while (cb) {
832 1.6 oster t = cb;
833 1.6 oster cb = cb->next;
834 1.6 oster (t->callbackFunc) (t->callbackArg);
835 1.6 oster rf_FreeCallbackDesc(t);
836 1.6 oster }
837 1.1 oster }
838 1.1 oster /*****************************************************************************************
839 1.1 oster *
840 1.1 oster * debug routines
841 1.1 oster *
842 1.1 oster ****************************************************************************************/
843 1.1 oster
844 1.107 perry static void
845 1.80 oster set_debug_option(char *name, long val)
846 1.6 oster {
847 1.6 oster RF_DebugName_t *p;
848 1.6 oster
849 1.6 oster for (p = rf_debugNames; p->name; p++) {
850 1.6 oster if (!strcmp(p->name, name)) {
851 1.6 oster *(p->ptr) = val;
852 1.6 oster printf("[Set debug variable %s to %ld]\n", name, val);
853 1.6 oster return;
854 1.6 oster }
855 1.6 oster }
856 1.6 oster RF_ERRORMSG1("Unknown debug string \"%s\"\n", name);
857 1.1 oster }
858 1.1 oster
859 1.1 oster
860 1.1 oster /* would like to use sscanf here, but apparently not available in kernel */
861 1.1 oster /*ARGSUSED*/
862 1.107 perry static void
863 1.80 oster rf_ConfigureDebug(RF_Config_t *cfgPtr)
864 1.6 oster {
865 1.6 oster char *val_p, *name_p, *white_p;
866 1.6 oster long val;
867 1.6 oster int i;
868 1.6 oster
869 1.6 oster rf_ResetDebugOptions();
870 1.6 oster for (i = 0; cfgPtr->debugVars[i][0] && i < RF_MAXDBGV; i++) {
871 1.6 oster name_p = rf_find_non_white(&cfgPtr->debugVars[i][0]);
872 1.6 oster white_p = rf_find_white(name_p); /* skip to start of 2nd
873 1.6 oster * word */
874 1.6 oster val_p = rf_find_non_white(white_p);
875 1.6 oster if (*val_p == '0' && *(val_p + 1) == 'x')
876 1.6 oster val = rf_htoi(val_p + 2);
877 1.6 oster else
878 1.6 oster val = rf_atoi(val_p);
879 1.6 oster *white_p = '\0';
880 1.6 oster set_debug_option(name_p, val);
881 1.6 oster }
882 1.1 oster }
883 1.39 oster
884 1.39 oster void
885 1.108 christos rf_print_panic_message(int line, const char *file)
886 1.39 oster {
887 1.101 itojun snprintf(rf_panicbuf, sizeof(rf_panicbuf),
888 1.101 itojun "raidframe error at line %d file %s", line, file);
889 1.39 oster }
890 1.39 oster
#ifdef RAID_DIAGNOSTIC
/*
 * Format a failed-assertion message (source line, file and the text of
 * the condition) into rf_panicbuf.  Only fills the buffer; compiled in
 * only when RAID_DIAGNOSTIC is defined.
 */
void
rf_print_assert_panic_message(int line, const char *file, const char *condition)
{
	snprintf(rf_panicbuf, sizeof(rf_panicbuf),
	    "raidframe error at line %d file %s (failed asserting %s)\n",
	    line, file, condition);
}
#endif
900 1.58 oster
/*
 * Report, via RF_ERRORMSG3, a mutex initialisation failure together
 * with the source file, line and return code supplied by the caller.
 */
void
rf_print_unable_to_init_mutex(const char *file, int line, int rc)
{
	RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
	    file, line, rc);
}
907 1.58 oster
/*
 * Report, via RF_ERRORMSG3, a failure to add an entry to the shutdown
 * list together with the source file, line and return code supplied
 * by the caller.
 */
void
rf_print_unable_to_add_shutdown(const char *file, int line, int rc)
{
	RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
	    file, line, rc);
}
914