/* $NetBSD: rf_paritymap.c,v 1.8 2011/04/27 07:55:15 mrg Exp $ */

/*-
 * Copyright (c) 2009 Jed Davis.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
28 1.1 jld
29 1.1 jld #include <sys/cdefs.h>
30 1.8 mrg __KERNEL_RCSID(0, "$NetBSD: rf_paritymap.c,v 1.8 2011/04/27 07:55:15 mrg Exp $");
31 1.1 jld
32 1.3 pooka #include <sys/param.h>
33 1.1 jld #include <sys/callout.h>
34 1.1 jld #include <sys/kmem.h>
35 1.1 jld #include <sys/mutex.h>
36 1.1 jld #include <sys/rwlock.h>
37 1.1 jld #include <sys/systm.h>
38 1.1 jld #include <sys/types.h>
39 1.1 jld
40 1.1 jld #include <dev/raidframe/rf_paritymap.h>
41 1.1 jld #include <dev/raidframe/rf_stripelocks.h>
42 1.1 jld #include <dev/raidframe/rf_layout.h>
43 1.1 jld #include <dev/raidframe/rf_raid.h>
44 1.1 jld #include <dev/raidframe/rf_parityscan.h>
45 1.1 jld #include <dev/raidframe/rf_kintf.h>
46 1.1 jld
47 1.1 jld /* Important parameters: */
48 1.1 jld #define REGION_MINSIZE (25ULL << 20)
49 1.1 jld #define DFL_TICKMS 40000
50 1.1 jld #define DFL_COOLDOWN 8 /* 7-8 intervals of 40s = 5min +/- 20s */
51 1.1 jld
52 1.1 jld /* Internal-use flag bits. */
53 1.1 jld #define TICKING 1
54 1.1 jld #define TICKED 2
55 1.1 jld
56 1.1 jld /* Prototypes! */
57 1.1 jld static void rf_paritymap_write_locked(struct rf_paritymap *);
58 1.1 jld static void rf_paritymap_tick(void *);
59 1.1 jld static u_int rf_paritymap_nreg(RF_Raid_t *);
60 1.1 jld
61 1.1 jld /* Extract the current status of the parity map. */
62 1.1 jld void
63 1.1 jld rf_paritymap_status(struct rf_paritymap *pm, struct rf_pmstat *ps)
64 1.1 jld {
65 1.1 jld memset(ps, 0, sizeof(*ps));
66 1.1 jld if (pm == NULL)
67 1.1 jld ps->enabled = 0;
68 1.1 jld else {
69 1.1 jld ps->enabled = 1;
70 1.1 jld ps->region_size = pm->region_size;
71 1.1 jld mutex_enter(&pm->lock);
72 1.1 jld memcpy(&ps->params, &pm->params, sizeof(ps->params));
73 1.1 jld memcpy(ps->dirty, pm->disk_now, sizeof(ps->dirty));
74 1.1 jld memcpy(&ps->ctrs, &pm->ctrs, sizeof(ps->ctrs));
75 1.1 jld mutex_exit(&pm->lock);
76 1.1 jld }
77 1.1 jld }
78 1.1 jld
79 1.1 jld /*
80 1.1 jld * Test whether parity in a given sector is suspected of being inconsistent
81 1.1 jld * on disk (assuming that any pending I/O to it is allowed to complete).
82 1.1 jld * This may be of interest to future work on parity scrubbing.
83 1.1 jld */
84 1.1 jld int
85 1.1 jld rf_paritymap_test(struct rf_paritymap *pm, daddr_t sector)
86 1.1 jld {
87 1.1 jld unsigned region = sector / pm->region_size;
88 1.1 jld int retval;
89 1.1 jld
90 1.1 jld mutex_enter(&pm->lock);
91 1.1 jld retval = isset(pm->disk_boot->bits, region) ? 1 : 0;
92 1.1 jld mutex_exit(&pm->lock);
93 1.1 jld return retval;
94 1.1 jld }
95 1.1 jld
96 1.1 jld /* To be called before a write to the RAID is submitted. */
97 1.1 jld void
98 1.1 jld rf_paritymap_begin(struct rf_paritymap *pm, daddr_t offset, daddr_t size)
99 1.1 jld {
100 1.1 jld unsigned i, b, e;
101 1.1 jld
102 1.1 jld b = offset / pm->region_size;
103 1.1 jld e = (offset + size - 1) / pm->region_size;
104 1.1 jld
105 1.1 jld for (i = b; i <= e; i++)
106 1.1 jld rf_paritymap_begin_region(pm, i);
107 1.1 jld }
108 1.1 jld
109 1.1 jld /* To be called after a write to the RAID completes. */
110 1.1 jld void
111 1.1 jld rf_paritymap_end(struct rf_paritymap *pm, daddr_t offset, daddr_t size)
112 1.1 jld {
113 1.1 jld unsigned i, b, e;
114 1.1 jld
115 1.1 jld b = offset / pm->region_size;
116 1.1 jld e = (offset + size - 1) / pm->region_size;
117 1.1 jld
118 1.1 jld for (i = b; i <= e; i++)
119 1.1 jld rf_paritymap_end_region(pm, i);
120 1.1 jld }
121 1.1 jld
122 1.1 jld void
123 1.1 jld rf_paritymap_begin_region(struct rf_paritymap *pm, unsigned region)
124 1.1 jld {
125 1.1 jld int needs_write;
126 1.1 jld
127 1.1 jld KASSERT(region < RF_PARITYMAP_NREG);
128 1.1 jld pm->ctrs.nwrite++;
129 1.1 jld
130 1.1 jld /* If it was being kept warm, deal with that. */
131 1.1 jld mutex_enter(&pm->lock);
132 1.1 jld if (pm->current->state[region] < 0)
133 1.1 jld pm->current->state[region] = 0;
134 1.1 jld
135 1.1 jld /* This shouldn't happen unless RAIDOUTSTANDING is set too high. */
136 1.1 jld KASSERT(pm->current->state[region] < 127);
137 1.1 jld pm->current->state[region]++;
138 1.1 jld
139 1.1 jld needs_write = isclr(pm->disk_now->bits, region);
140 1.1 jld
141 1.1 jld if (needs_write) {
142 1.1 jld KASSERT(pm->current->state[region] == 1);
143 1.1 jld rf_paritymap_write_locked(pm);
144 1.1 jld }
145 1.1 jld
146 1.1 jld mutex_exit(&pm->lock);
147 1.1 jld }
148 1.1 jld
149 1.1 jld void
150 1.1 jld rf_paritymap_end_region(struct rf_paritymap *pm, unsigned region)
151 1.1 jld {
152 1.1 jld KASSERT(region < RF_PARITYMAP_NREG);
153 1.1 jld
154 1.1 jld mutex_enter(&pm->lock);
155 1.1 jld KASSERT(pm->current->state[region] > 0);
156 1.1 jld --pm->current->state[region];
157 1.1 jld
158 1.1 jld if (pm->current->state[region] <= 0) {
159 1.1 jld pm->current->state[region] = -pm->params.cooldown;
160 1.1 jld KASSERT(pm->current->state[region] <= 0);
161 1.1 jld mutex_enter(&pm->lk_flags);
162 1.1 jld if (!(pm->flags & TICKING)) {
163 1.1 jld pm->flags |= TICKING;
164 1.1 jld mutex_exit(&pm->lk_flags);
165 1.1 jld callout_schedule(&pm->ticker,
166 1.1 jld mstohz(pm->params.tickms));
167 1.1 jld } else
168 1.1 jld mutex_exit(&pm->lk_flags);
169 1.1 jld }
170 1.1 jld mutex_exit(&pm->lock);
171 1.1 jld }
172 1.1 jld
173 1.1 jld /*
174 1.1 jld * Updates the parity map to account for any changes in current activity
175 1.1 jld * and/or an ongoing parity scan, then writes it to disk with appropriate
176 1.1 jld * synchronization.
177 1.1 jld */
178 1.1 jld void
179 1.1 jld rf_paritymap_write(struct rf_paritymap *pm)
180 1.1 jld {
181 1.1 jld mutex_enter(&pm->lock);
182 1.1 jld rf_paritymap_write_locked(pm);
183 1.1 jld mutex_exit(&pm->lock);
184 1.1 jld }
185 1.1 jld
186 1.1 jld /* As above, but to be used when pm->lock is already held. */
187 1.1 jld static void
188 1.1 jld rf_paritymap_write_locked(struct rf_paritymap *pm)
189 1.1 jld {
190 1.1 jld char w, w0;
191 1.1 jld int i, j, setting, clearing;
192 1.1 jld
193 1.1 jld setting = clearing = 0;
194 1.1 jld for (i = 0; i < RF_PARITYMAP_NBYTE; i++) {
195 1.1 jld w0 = pm->disk_now->bits[i];
196 1.1 jld w = pm->disk_boot->bits[i];
197 1.1 jld
198 1.1 jld for (j = 0; j < NBBY; j++)
199 1.1 jld if (pm->current->state[i * NBBY + j] != 0)
200 1.1 jld w |= 1 << j;
201 1.1 jld
202 1.1 jld if (w & ~w0)
203 1.1 jld setting = 1;
204 1.1 jld if (w0 & ~w)
205 1.1 jld clearing = 1;
206 1.1 jld
207 1.1 jld pm->disk_now->bits[i] = w;
208 1.1 jld }
209 1.1 jld pm->ctrs.ncachesync += setting + clearing;
210 1.1 jld pm->ctrs.nclearing += clearing;
211 1.1 jld
212 1.1 jld /*
213 1.1 jld * If bits are being set in the parity map, then a sync is
214 1.1 jld * required afterwards, so that the regions are marked dirty
215 1.1 jld * on disk before any writes to them take place. If bits are
216 1.1 jld * being cleared, then a sync is required before the write, so
217 1.1 jld * that any writes to those regions are processed before the
218 1.1 jld * region is marked clean. (Synchronization is somewhat
219 1.1 jld * overkill; a write ordering barrier would suffice, but we
220 1.1 jld * currently have no way to express that directly.)
221 1.1 jld */
222 1.1 jld if (clearing)
223 1.1 jld rf_sync_component_caches(pm->raid);
224 1.1 jld rf_paritymap_kern_write(pm->raid, pm->disk_now);
225 1.1 jld if (setting)
226 1.1 jld rf_sync_component_caches(pm->raid);
227 1.1 jld }
228 1.1 jld
229 1.1 jld /* Mark all parity as being in need of rewrite. */
230 1.1 jld void
231 1.1 jld rf_paritymap_invalidate(struct rf_paritymap *pm)
232 1.1 jld {
233 1.1 jld mutex_enter(&pm->lock);
234 1.1 jld memset(pm->disk_boot, ~(unsigned char)0,
235 1.1 jld sizeof(struct rf_paritymap_ondisk));
236 1.1 jld mutex_exit(&pm->lock);
237 1.1 jld }
238 1.1 jld
239 1.1 jld /* Mark all parity as being correct. */
240 1.1 jld void
241 1.1 jld rf_paritymap_forceclean(struct rf_paritymap *pm)
242 1.1 jld {
243 1.1 jld mutex_enter(&pm->lock);
244 1.1 jld memset(pm->disk_boot, (unsigned char)0,
245 1.1 jld sizeof(struct rf_paritymap_ondisk));
246 1.1 jld mutex_exit(&pm->lock);
247 1.1 jld }
248 1.1 jld
249 1.1 jld /*
250 1.1 jld * The cooldown callout routine just defers its work to a thread; it can't do
251 1.1 jld * the parity map write itself as it would block, and although mutex-induced
252 1.1 jld * blocking is permitted it seems wise to avoid tying up the softint.
253 1.1 jld */
254 1.1 jld static void
255 1.1 jld rf_paritymap_tick(void *arg)
256 1.1 jld {
257 1.1 jld struct rf_paritymap *pm = arg;
258 1.1 jld
259 1.1 jld mutex_enter(&pm->lk_flags);
260 1.1 jld pm->flags |= TICKED;
261 1.1 jld mutex_exit(&pm->lk_flags);
262 1.7 mrg
263 1.8 mrg rf_lock_mutex2(pm->raid->iodone_lock);
264 1.8 mrg rf_signal_cond2(pm->raid->iodone_cv); /* XXX */
265 1.8 mrg rf_unlock_mutex2(pm->raid->iodone_lock);
266 1.1 jld }
267 1.1 jld
268 1.1 jld /*
269 1.1 jld * This is where the parity cooling work (and rearming the callout if needed)
270 1.1 jld * is done; the raidio thread calls it when woken up, as by the above.
271 1.1 jld */
272 1.1 jld void
273 1.1 jld rf_paritymap_checkwork(struct rf_paritymap *pm)
274 1.1 jld {
275 1.1 jld int i, zerop, progressp;
276 1.1 jld
277 1.1 jld mutex_enter(&pm->lk_flags);
278 1.1 jld if (pm->flags & TICKED) {
279 1.1 jld zerop = progressp = 0;
280 1.1 jld
281 1.1 jld pm->flags &= ~TICKED;
282 1.1 jld mutex_exit(&pm->lk_flags);
283 1.1 jld
284 1.1 jld mutex_enter(&pm->lock);
285 1.1 jld for (i = 0; i < RF_PARITYMAP_NREG; i++) {
286 1.1 jld if (pm->current->state[i] < 0) {
287 1.1 jld progressp = 1;
288 1.1 jld pm->current->state[i]++;
289 1.1 jld if (pm->current->state[i] == 0)
290 1.1 jld zerop = 1;
291 1.1 jld }
292 1.1 jld }
293 1.1 jld
294 1.1 jld if (progressp)
295 1.1 jld callout_schedule(&pm->ticker,
296 1.1 jld mstohz(pm->params.tickms));
297 1.1 jld else {
298 1.1 jld mutex_enter(&pm->lk_flags);
299 1.1 jld pm->flags &= ~TICKING;
300 1.1 jld mutex_exit(&pm->lk_flags);
301 1.1 jld }
302 1.1 jld
303 1.1 jld if (zerop)
304 1.1 jld rf_paritymap_write_locked(pm);
305 1.1 jld mutex_exit(&pm->lock);
306 1.1 jld } else
307 1.1 jld mutex_exit(&pm->lk_flags);
308 1.1 jld }
309 1.1 jld
310 1.1 jld /*
311 1.1 jld * Set parity map parameters; used both to alter parameters on the fly and to
312 1.1 jld * establish their initial values. Note that setting a parameter to 0 means
313 1.1 jld * to leave the previous setting unchanged, and that if this is done for the
314 1.1 jld * initial setting of "regions", then a default value will be computed based
315 1.1 jld * on the RAID component size.
316 1.1 jld */
317 1.1 jld int
318 1.1 jld rf_paritymap_set_params(struct rf_paritymap *pm,
319 1.1 jld const struct rf_pmparams *params, int todisk)
320 1.1 jld {
321 1.1 jld int cooldown, tickms;
322 1.1 jld u_int regions;
323 1.1 jld RF_RowCol_t col;
324 1.1 jld RF_ComponentLabel_t *clabel;
325 1.1 jld RF_Raid_t *raidPtr;
326 1.1 jld
327 1.1 jld cooldown = params->cooldown != 0
328 1.1 jld ? params->cooldown : pm->params.cooldown;
329 1.1 jld tickms = params->tickms != 0
330 1.1 jld ? params->tickms : pm->params.tickms;
331 1.1 jld regions = params->regions != 0
332 1.1 jld ? params->regions : pm->params.regions;
333 1.1 jld
334 1.1 jld if (cooldown < 1 || cooldown > 128) {
335 1.1 jld printf("raid%d: cooldown %d out of range\n", pm->raid->raidid,
336 1.1 jld cooldown);
337 1.1 jld return (-1);
338 1.1 jld }
339 1.1 jld if (tickms < 10) {
340 1.1 jld printf("raid%d: tick time %dms out of range\n",
341 1.1 jld pm->raid->raidid, tickms);
342 1.1 jld return (-1);
343 1.1 jld }
344 1.1 jld if (regions == 0) {
345 1.1 jld regions = rf_paritymap_nreg(pm->raid);
346 1.1 jld } else if (regions > RF_PARITYMAP_NREG) {
347 1.1 jld printf("raid%d: region count %u too large (more than %u)\n",
348 1.1 jld pm->raid->raidid, regions, RF_PARITYMAP_NREG);
349 1.1 jld return (-1);
350 1.1 jld }
351 1.1 jld
352 1.1 jld /* XXX any currently warm parity will be used with the new tickms! */
353 1.1 jld pm->params.cooldown = cooldown;
354 1.1 jld pm->params.tickms = tickms;
355 1.1 jld /* Apply the initial region count, but do not change it after that. */
356 1.1 jld if (pm->params.regions == 0)
357 1.1 jld pm->params.regions = regions;
358 1.1 jld
359 1.1 jld /* So that the newly set parameters can be tested: */
360 1.1 jld pm->ctrs.nwrite = pm->ctrs.ncachesync = pm->ctrs.nclearing = 0;
361 1.1 jld
362 1.1 jld if (todisk) {
363 1.1 jld raidPtr = pm->raid;
364 1.1 jld for (col = 0; col < raidPtr->numCol; col++) {
365 1.4 oster if (RF_DEAD_DISK(raidPtr->Disks[col].status))
366 1.4 oster continue;
367 1.4 oster
368 1.1 jld clabel = raidget_component_label(raidPtr, col);
369 1.1 jld clabel->parity_map_ntick = cooldown;
370 1.1 jld clabel->parity_map_tickms = tickms;
371 1.1 jld clabel->parity_map_regions = regions;
372 1.4 oster
373 1.4 oster /* Don't touch the disk if it's been spared */
374 1.4 oster if (clabel->status == rf_ds_spared)
375 1.4 oster continue;
376 1.4 oster
377 1.1 jld raidflush_component_label(raidPtr, col);
378 1.1 jld }
379 1.4 oster
380 1.4 oster /* handle the spares too... */
381 1.4 oster for (col = 0; col < raidPtr->numSpare; col++) {
382 1.4 oster if (raidPtr->Disks[raidPtr->numCol+col].status == rf_ds_used_spare) {
383 1.4 oster clabel = raidget_component_label(raidPtr, raidPtr->numCol+col);
384 1.4 oster clabel->parity_map_ntick = cooldown;
385 1.4 oster clabel->parity_map_tickms = tickms;
386 1.4 oster clabel->parity_map_regions = regions;
387 1.4 oster raidflush_component_label(raidPtr, raidPtr->numCol+col);
388 1.4 oster }
389 1.4 oster }
390 1.1 jld }
391 1.1 jld return 0;
392 1.1 jld }
393 1.1 jld
394 1.1 jld /*
395 1.1 jld * The number of regions may not be as many as can fit into the map, because
396 1.1 jld * when regions are too small, the overhead of setting parity map bits
397 1.1 jld * becomes significant in comparison to the actual I/O, while the
398 1.1 jld * corresponding gains in parity verification time become negligible. Thus,
399 1.1 jld * a minimum region size (defined above) is imposed.
400 1.1 jld *
401 1.1 jld * Note that, if the number of regions is less than the maximum, then some of
402 1.1 jld * the regions will be "fictional", corresponding to no actual disk; some
403 1.1 jld * parts of the code may process them as normal, but they can not ever be
404 1.1 jld * written to.
405 1.1 jld */
406 1.1 jld static u_int
407 1.1 jld rf_paritymap_nreg(RF_Raid_t *raid)
408 1.1 jld {
409 1.1 jld daddr_t bytes_per_disk, nreg;
410 1.1 jld
411 1.1 jld bytes_per_disk = raid->sectorsPerDisk << raid->logBytesPerSector;
412 1.1 jld nreg = bytes_per_disk / REGION_MINSIZE;
413 1.1 jld if (nreg > RF_PARITYMAP_NREG)
414 1.1 jld nreg = RF_PARITYMAP_NREG;
415 1.6 riz if (nreg < 1)
416 1.6 riz nreg = 1;
417 1.1 jld
418 1.1 jld return (u_int)nreg;
419 1.1 jld }
420 1.1 jld
421 1.1 jld /*
422 1.1 jld * Initialize a parity map given specific parameters. This neither reads nor
423 1.1 jld * writes the parity map config in the component labels; for that, see below.
424 1.1 jld */
425 1.1 jld int
426 1.1 jld rf_paritymap_init(struct rf_paritymap *pm, RF_Raid_t *raid,
427 1.1 jld const struct rf_pmparams *params)
428 1.1 jld {
429 1.1 jld daddr_t rstripes;
430 1.1 jld struct rf_pmparams safe;
431 1.1 jld
432 1.1 jld pm->raid = raid;
433 1.1 jld pm->params.regions = 0;
434 1.1 jld if (0 != rf_paritymap_set_params(pm, params, 0)) {
435 1.1 jld /*
436 1.1 jld * If the parameters are out-of-range, then bring the
437 1.1 jld * parity map up with something reasonable, so that
438 1.1 jld * the admin can at least go and fix it (or ignore it
439 1.1 jld * entirely).
440 1.1 jld */
441 1.1 jld safe.cooldown = DFL_COOLDOWN;
442 1.1 jld safe.tickms = DFL_TICKMS;
443 1.1 jld safe.regions = 0;
444 1.1 jld
445 1.1 jld if (0 != rf_paritymap_set_params(pm, &safe, 0))
446 1.1 jld return (-1);
447 1.1 jld }
448 1.1 jld
449 1.1 jld rstripes = howmany(raid->Layout.numStripe, pm->params.regions);
450 1.1 jld pm->region_size = rstripes * raid->Layout.dataSectorsPerStripe;
451 1.1 jld
452 1.1 jld callout_init(&pm->ticker, CALLOUT_MPSAFE);
453 1.1 jld callout_setfunc(&pm->ticker, rf_paritymap_tick, pm);
454 1.1 jld pm->flags = 0;
455 1.1 jld
456 1.1 jld pm->disk_boot = kmem_alloc(sizeof(struct rf_paritymap_ondisk),
457 1.1 jld KM_SLEEP);
458 1.1 jld pm->disk_now = kmem_alloc(sizeof(struct rf_paritymap_ondisk),
459 1.1 jld KM_SLEEP);
460 1.1 jld pm->current = kmem_zalloc(sizeof(struct rf_paritymap_current),
461 1.1 jld KM_SLEEP);
462 1.1 jld
463 1.1 jld rf_paritymap_kern_read(pm->raid, pm->disk_boot);
464 1.1 jld memcpy(pm->disk_now, pm->disk_boot, sizeof(*pm->disk_now));
465 1.1 jld
466 1.1 jld mutex_init(&pm->lock, MUTEX_DEFAULT, IPL_NONE);
467 1.1 jld mutex_init(&pm->lk_flags, MUTEX_DEFAULT, IPL_SOFTCLOCK);
468 1.1 jld
469 1.1 jld return 0;
470 1.1 jld }
471 1.1 jld
472 1.1 jld /*
473 1.1 jld * Destroys a parity map; unless "force" is set, also cleans parity for any
474 1.1 jld * regions which were still in cooldown (but are not dirty on disk).
475 1.1 jld */
476 1.1 jld void
477 1.1 jld rf_paritymap_destroy(struct rf_paritymap *pm, int force)
478 1.1 jld {
479 1.1 jld int i;
480 1.1 jld
481 1.1 jld callout_halt(&pm->ticker, NULL); /* XXX stop? halt? */
482 1.1 jld callout_destroy(&pm->ticker);
483 1.1 jld
484 1.1 jld if (!force) {
485 1.1 jld for (i = 0; i < RF_PARITYMAP_NREG; i++) {
486 1.1 jld /* XXX check for > 0 ? */
487 1.1 jld if (pm->current->state[i] < 0)
488 1.1 jld pm->current->state[i] = 0;
489 1.1 jld }
490 1.1 jld
491 1.1 jld rf_paritymap_write_locked(pm);
492 1.1 jld }
493 1.1 jld
494 1.1 jld mutex_destroy(&pm->lock);
495 1.1 jld mutex_destroy(&pm->lk_flags);
496 1.1 jld
497 1.1 jld kmem_free(pm->disk_boot, sizeof(struct rf_paritymap_ondisk));
498 1.1 jld kmem_free(pm->disk_now, sizeof(struct rf_paritymap_ondisk));
499 1.1 jld kmem_free(pm->current, sizeof(struct rf_paritymap_current));
500 1.1 jld }
501 1.1 jld
502 1.1 jld /*
503 1.1 jld * Rewrite parity, taking parity map into account; this is the equivalent of
504 1.1 jld * the old rf_RewriteParity, and is likewise to be called from a suitable
505 1.1 jld * thread and shouldn't have multiple copies running in parallel and so on.
506 1.1 jld *
507 1.1 jld * Note that the fictional regions are "cleaned" in one shot, so that very
508 1.1 jld * small RAIDs (useful for testing) will not experience potentially severe
509 1.1 jld * regressions in rewrite time.
510 1.1 jld */
511 1.1 jld int
512 1.1 jld rf_paritymap_rewrite(struct rf_paritymap *pm)
513 1.1 jld {
514 1.1 jld int i, ret_val = 0;
515 1.1 jld daddr_t reg_b, reg_e;
516 1.1 jld
517 1.1 jld /* Process only the actual regions. */
518 1.1 jld for (i = 0; i < pm->params.regions; i++) {
519 1.1 jld mutex_enter(&pm->lock);
520 1.1 jld if (isset(pm->disk_boot->bits, i)) {
521 1.1 jld mutex_exit(&pm->lock);
522 1.1 jld
523 1.1 jld reg_b = i * pm->region_size;
524 1.1 jld reg_e = reg_b + pm->region_size;
525 1.1 jld if (reg_e > pm->raid->totalSectors)
526 1.1 jld reg_e = pm->raid->totalSectors;
527 1.1 jld
528 1.1 jld if (rf_RewriteParityRange(pm->raid, reg_b,
529 1.1 jld reg_e - reg_b)) {
530 1.1 jld ret_val = 1;
531 1.1 jld if (pm->raid->waitShutdown)
532 1.1 jld return ret_val;
533 1.1 jld } else {
534 1.1 jld mutex_enter(&pm->lock);
535 1.1 jld clrbit(pm->disk_boot->bits, i);
536 1.1 jld rf_paritymap_write_locked(pm);
537 1.1 jld mutex_exit(&pm->lock);
538 1.1 jld }
539 1.1 jld } else {
540 1.1 jld mutex_exit(&pm->lock);
541 1.1 jld }
542 1.1 jld }
543 1.1 jld
544 1.1 jld /* Now, clear the fictional regions, if any. */
545 1.1 jld rf_paritymap_forceclean(pm);
546 1.1 jld rf_paritymap_write(pm);
547 1.1 jld
548 1.1 jld return ret_val;
549 1.1 jld }
550 1.1 jld
551 1.1 jld /*
552 1.1 jld * How to merge the on-disk parity maps when reading them in from the
553 1.1 jld * various components; returns whether they differ. In the case that
554 1.1 jld * they do differ, sets *dst to the union of *dst and *src.
555 1.1 jld *
556 1.1 jld * In theory, it should be safe to take the intersection (or just pick
557 1.1 jld * a single component arbitrarily), but the paranoid approach costs
558 1.1 jld * little.
559 1.1 jld *
560 1.1 jld * Appropriate locking, if any, is the responsibility of the caller.
561 1.1 jld */
562 1.1 jld int
563 1.1 jld rf_paritymap_merge(struct rf_paritymap_ondisk *dst,
564 1.1 jld struct rf_paritymap_ondisk *src)
565 1.1 jld {
566 1.1 jld int i, discrep = 0;
567 1.1 jld
568 1.1 jld for (i = 0; i < RF_PARITYMAP_NBYTE; i++) {
569 1.1 jld if (dst->bits[i] != src->bits[i])
570 1.1 jld discrep = 1;
571 1.1 jld dst->bits[i] |= src->bits[i];
572 1.1 jld }
573 1.1 jld
574 1.1 jld return discrep;
575 1.1 jld }
576 1.1 jld
577 1.1 jld /*
578 1.1 jld * Detach a parity map from its RAID. This is not meant to be applied except
579 1.1 jld * when unconfiguring the RAID after all I/O has been resolved, as otherwise
580 1.1 jld * an out-of-date parity map could be treated as current.
581 1.1 jld */
582 1.1 jld void
583 1.1 jld rf_paritymap_detach(RF_Raid_t *raidPtr)
584 1.1 jld {
585 1.1 jld if (raidPtr->parity_map == NULL)
586 1.1 jld return;
587 1.1 jld
588 1.8 mrg rf_lock_mutex2(raidPtr->iodone_lock);
589 1.1 jld struct rf_paritymap *pm = raidPtr->parity_map;
590 1.1 jld raidPtr->parity_map = NULL;
591 1.8 mrg rf_unlock_mutex2(raidPtr->iodone_lock);
592 1.1 jld /* XXXjld is that enough locking? Or too much? */
593 1.1 jld rf_paritymap_destroy(pm, 0);
594 1.1 jld kmem_free(pm, sizeof(*pm));
595 1.1 jld }
596 1.1 jld
597 1.1 jld /*
598 1.5 jld * Is this RAID set ineligible for parity-map use due to not actually
599 1.5 jld * having any parity? (If so, rf_paritymap_attach is a no-op, but
600 1.5 jld * rf_paritymap_{get,set}_disable will still pointlessly act on the
601 1.5 jld * component labels.)
602 1.5 jld */
603 1.5 jld int
604 1.5 jld rf_paritymap_ineligible(RF_Raid_t *raidPtr)
605 1.5 jld {
606 1.5 jld return raidPtr->Layout.map->faultsTolerated == 0;
607 1.5 jld }
608 1.5 jld
609 1.5 jld /*
610 1.1 jld * Attach a parity map to a RAID set if appropriate. Includes
611 1.1 jld * configure-time processing of parity-map fields of component label.
612 1.1 jld */
613 1.1 jld void
614 1.1 jld rf_paritymap_attach(RF_Raid_t *raidPtr, int force)
615 1.1 jld {
616 1.1 jld RF_RowCol_t col;
617 1.1 jld int pm_use, pm_zap;
618 1.1 jld int g_tickms, g_ntick, g_regions;
619 1.1 jld int good;
620 1.1 jld RF_ComponentLabel_t *clabel;
621 1.1 jld u_int flags, regions;
622 1.1 jld struct rf_pmparams params;
623 1.1 jld
624 1.5 jld if (rf_paritymap_ineligible(raidPtr)) {
625 1.1 jld /* There isn't any parity. */
626 1.1 jld return;
627 1.1 jld }
628 1.1 jld
629 1.1 jld pm_use = 1;
630 1.1 jld pm_zap = 0;
631 1.1 jld g_tickms = DFL_TICKMS;
632 1.1 jld g_ntick = DFL_COOLDOWN;
633 1.1 jld g_regions = 0;
634 1.1 jld
635 1.1 jld /*
636 1.1 jld * Collect opinions on the set config. If this is the initial
637 1.1 jld * config (raidctl -C), treat all labels as invalid, since
638 1.1 jld * there may be random data present.
639 1.1 jld */
640 1.1 jld if (!force) {
641 1.1 jld for (col = 0; col < raidPtr->numCol; col++) {
642 1.4 oster if (RF_DEAD_DISK(raidPtr->Disks[col].status))
643 1.4 oster continue;
644 1.1 jld clabel = raidget_component_label(raidPtr, col);
645 1.1 jld flags = clabel->parity_map_flags;
646 1.1 jld /* Check for use by non-parity-map kernel. */
647 1.1 jld if (clabel->parity_map_modcount
648 1.1 jld != clabel->mod_counter) {
649 1.1 jld flags &= ~RF_PMLABEL_WASUSED;
650 1.1 jld }
651 1.1 jld
652 1.1 jld if (flags & RF_PMLABEL_VALID) {
653 1.1 jld g_tickms = clabel->parity_map_tickms;
654 1.1 jld g_ntick = clabel->parity_map_ntick;
655 1.1 jld regions = clabel->parity_map_regions;
656 1.1 jld if (g_regions == 0)
657 1.1 jld g_regions = regions;
658 1.1 jld else if (g_regions != regions) {
659 1.1 jld pm_zap = 1; /* important! */
660 1.1 jld }
661 1.1 jld
662 1.1 jld if (flags & RF_PMLABEL_DISABLE) {
663 1.1 jld pm_use = 0;
664 1.1 jld }
665 1.1 jld if (!(flags & RF_PMLABEL_WASUSED)) {
666 1.1 jld pm_zap = 1;
667 1.1 jld }
668 1.1 jld } else {
669 1.1 jld pm_zap = 1;
670 1.1 jld }
671 1.1 jld }
672 1.1 jld } else {
673 1.1 jld pm_zap = 1;
674 1.1 jld }
675 1.1 jld
676 1.1 jld /* Finally, create and attach the parity map. */
677 1.1 jld if (pm_use) {
678 1.1 jld params.cooldown = g_ntick;
679 1.1 jld params.tickms = g_tickms;
680 1.1 jld params.regions = g_regions;
681 1.1 jld
682 1.1 jld raidPtr->parity_map = kmem_alloc(sizeof(struct rf_paritymap),
683 1.1 jld KM_SLEEP);
684 1.1 jld if (0 != rf_paritymap_init(raidPtr->parity_map, raidPtr,
685 1.1 jld ¶ms)) {
686 1.1 jld /* It failed; do without. */
687 1.1 jld kmem_free(raidPtr->parity_map,
688 1.1 jld sizeof(struct rf_paritymap));
689 1.1 jld raidPtr->parity_map = NULL;
690 1.1 jld return;
691 1.1 jld }
692 1.1 jld
693 1.1 jld if (g_regions == 0)
694 1.1 jld /* Pick up the autoconfigured region count. */
695 1.1 jld g_regions = raidPtr->parity_map->params.regions;
696 1.1 jld
697 1.1 jld if (pm_zap) {
698 1.1 jld good = raidPtr->parity_good && !force;
699 1.1 jld
700 1.1 jld if (good)
701 1.1 jld rf_paritymap_forceclean(raidPtr->parity_map);
702 1.1 jld else
703 1.1 jld rf_paritymap_invalidate(raidPtr->parity_map);
704 1.1 jld /* This needs to be on disk before WASUSED is set. */
705 1.1 jld rf_paritymap_write(raidPtr->parity_map);
706 1.1 jld }
707 1.1 jld }
708 1.1 jld
709 1.1 jld /* Alter labels in-core to reflect the current view of things. */
710 1.1 jld for (col = 0; col < raidPtr->numCol; col++) {
711 1.4 oster if (RF_DEAD_DISK(raidPtr->Disks[col].status))
712 1.4 oster continue;
713 1.1 jld clabel = raidget_component_label(raidPtr, col);
714 1.1 jld
715 1.1 jld if (pm_use)
716 1.1 jld flags = RF_PMLABEL_VALID | RF_PMLABEL_WASUSED;
717 1.1 jld else
718 1.1 jld flags = RF_PMLABEL_VALID | RF_PMLABEL_DISABLE;
719 1.1 jld
720 1.1 jld clabel->parity_map_flags = flags;
721 1.1 jld clabel->parity_map_tickms = g_tickms;
722 1.1 jld clabel->parity_map_ntick = g_ntick;
723 1.1 jld clabel->parity_map_regions = g_regions;
724 1.1 jld raidflush_component_label(raidPtr, col);
725 1.1 jld }
726 1.4 oster /* Note that we're just in 'attach' here, and there won't
727 1.4 oster be any spare disks at this point. */
728 1.1 jld }
729 1.1 jld
730 1.1 jld /*
731 1.1 jld * For initializing the parity-map fields of a component label, both on
732 1.4 oster * initial creation and on reconstruct/copyback/etc. */
733 1.1 jld void
734 1.1 jld rf_paritymap_init_label(struct rf_paritymap *pm, RF_ComponentLabel_t *clabel)
735 1.1 jld {
736 1.1 jld if (pm != NULL) {
737 1.1 jld clabel->parity_map_flags =
738 1.1 jld RF_PMLABEL_VALID | RF_PMLABEL_WASUSED;
739 1.1 jld clabel->parity_map_tickms = pm->params.tickms;
740 1.1 jld clabel->parity_map_ntick = pm->params.cooldown;
741 1.1 jld /*
742 1.1 jld * XXXjld: If the number of regions is changed on disk, and
743 1.1 jld * then a new component is labeled before the next configure,
744 1.1 jld * then it will get the old value and they will conflict on
745 1.1 jld * the next boot (and the default will be used instead).
746 1.1 jld */
747 1.1 jld clabel->parity_map_regions = pm->params.regions;
748 1.1 jld } else {
749 1.1 jld /*
750 1.1 jld * XXXjld: if the map is disabled, and all the components are
751 1.1 jld * replaced without an intervening unconfigure/reconfigure,
752 1.1 jld * then it will become enabled on the next unconfig/reconfig.
753 1.1 jld */
754 1.1 jld }
755 1.1 jld }
756 1.1 jld
757 1.1 jld
758 1.1 jld /* Will the parity map be disabled next time? */
759 1.1 jld int
760 1.1 jld rf_paritymap_get_disable(RF_Raid_t *raidPtr)
761 1.1 jld {
762 1.1 jld RF_ComponentLabel_t *clabel;
763 1.1 jld RF_RowCol_t col;
764 1.1 jld int dis;
765 1.1 jld
766 1.1 jld dis = 0;
767 1.1 jld for (col = 0; col < raidPtr->numCol; col++) {
768 1.4 oster if (RF_DEAD_DISK(raidPtr->Disks[col].status))
769 1.4 oster continue;
770 1.1 jld clabel = raidget_component_label(raidPtr, col);
771 1.1 jld if (clabel->parity_map_flags & RF_PMLABEL_DISABLE)
772 1.1 jld dis = 1;
773 1.1 jld }
774 1.4 oster for (col = 0; col < raidPtr->numSpare; col++) {
775 1.4 oster if (raidPtr->Disks[raidPtr->numCol+col].status != rf_ds_used_spare)
776 1.4 oster continue;
777 1.4 oster clabel = raidget_component_label(raidPtr, raidPtr->numCol+col);
778 1.4 oster if (clabel->parity_map_flags & RF_PMLABEL_DISABLE)
779 1.4 oster dis = 1;
780 1.4 oster }
781 1.1 jld
782 1.1 jld return dis;
783 1.1 jld }
784 1.1 jld
785 1.1 jld /* Set whether the parity map will be disabled next time. */
786 1.1 jld void
787 1.1 jld rf_paritymap_set_disable(RF_Raid_t *raidPtr, int dis)
788 1.1 jld {
789 1.1 jld RF_ComponentLabel_t *clabel;
790 1.1 jld RF_RowCol_t col;
791 1.1 jld
792 1.1 jld for (col = 0; col < raidPtr->numCol; col++) {
793 1.4 oster if (RF_DEAD_DISK(raidPtr->Disks[col].status))
794 1.4 oster continue;
795 1.1 jld clabel = raidget_component_label(raidPtr, col);
796 1.1 jld if (dis)
797 1.1 jld clabel->parity_map_flags |= RF_PMLABEL_DISABLE;
798 1.1 jld else
799 1.1 jld clabel->parity_map_flags &= ~RF_PMLABEL_DISABLE;
800 1.1 jld raidflush_component_label(raidPtr, col);
801 1.1 jld }
802 1.4 oster
803 1.4 oster /* update any used spares as well */
804 1.4 oster for (col = 0; col < raidPtr->numSpare; col++) {
805 1.4 oster if (raidPtr->Disks[raidPtr->numCol+col].status != rf_ds_used_spare)
806 1.4 oster continue;
807 1.4 oster
808 1.4 oster clabel = raidget_component_label(raidPtr, raidPtr->numCol+col);
809 1.4 oster if (dis)
810 1.4 oster clabel->parity_map_flags |= RF_PMLABEL_DISABLE;
811 1.4 oster else
812 1.4 oster clabel->parity_map_flags &= ~RF_PMLABEL_DISABLE;
813 1.4 oster raidflush_component_label(raidPtr, raidPtr->numCol+col);
814 1.4 oster }
815 1.1 jld }
816