rf_paritymap.c revision 1.5.4.2 1 1.5.4.2 matt /* $NetBSD: rf_paritymap.c,v 1.5.4.2 2010/04/21 00:27:51 matt Exp $ */
2 1.5.4.2 matt
3 1.5.4.2 matt /*-
4 1.5.4.2 matt * Copyright (c) 2009 Jed Davis.
5 1.5.4.2 matt * All rights reserved.
6 1.5.4.2 matt *
7 1.5.4.2 matt * Redistribution and use in source and binary forms, with or without
8 1.5.4.2 matt * modification, are permitted provided that the following conditions
9 1.5.4.2 matt * are met:
10 1.5.4.2 matt * 1. Redistributions of source code must retain the above copyright
11 1.5.4.2 matt * notice, this list of conditions and the following disclaimer.
12 1.5.4.2 matt * 2. Redistributions in binary form must reproduce the above copyright
13 1.5.4.2 matt * notice, this list of conditions and the following disclaimer in the
14 1.5.4.2 matt * documentation and/or other materials provided with the distribution.
15 1.5.4.2 matt *
16 1.5.4.2 matt * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 1.5.4.2 matt * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 1.5.4.2 matt * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 1.5.4.2 matt * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 1.5.4.2 matt * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 1.5.4.2 matt * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 1.5.4.2 matt * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 1.5.4.2 matt * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 1.5.4.2 matt * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 1.5.4.2 matt * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 1.5.4.2 matt * POSSIBILITY OF SUCH DAMAGE.
27 1.5.4.2 matt */
28 1.5.4.2 matt
29 1.5.4.2 matt #include <sys/cdefs.h>
30 1.5.4.2 matt __KERNEL_RCSID(0, "$NetBSD: rf_paritymap.c,v 1.5.4.2 2010/04/21 00:27:51 matt Exp $");
31 1.5.4.2 matt
32 1.5.4.2 matt #include <sys/param.h>
33 1.5.4.2 matt #include <sys/callout.h>
34 1.5.4.2 matt #include <sys/kmem.h>
35 1.5.4.2 matt #include <sys/mutex.h>
36 1.5.4.2 matt #include <sys/rwlock.h>
37 1.5.4.2 matt #include <sys/systm.h>
38 1.5.4.2 matt #include <sys/types.h>
39 1.5.4.2 matt
40 1.5.4.2 matt #include <dev/raidframe/rf_paritymap.h>
41 1.5.4.2 matt #include <dev/raidframe/rf_stripelocks.h>
42 1.5.4.2 matt #include <dev/raidframe/rf_layout.h>
43 1.5.4.2 matt #include <dev/raidframe/rf_raid.h>
44 1.5.4.2 matt #include <dev/raidframe/rf_parityscan.h>
45 1.5.4.2 matt #include <dev/raidframe/rf_kintf.h>
46 1.5.4.2 matt
/*
 * Tunable defaults.
 *
 * REGION_MINSIZE is the divisor used when a default region count is
 * derived from the component size, i.e. the smallest region (in bytes)
 * that the default will produce.  DFL_TICKMS and DFL_COOLDOWN are the
 * fallback tick interval (milliseconds) and cooldown (ticks).
 */
#define	REGION_MINSIZE	(25ULL << 20)
#define	DFL_TICKMS	40000
#define	DFL_COOLDOWN	8	/* 7-8 intervals of 40s = 5min +/- 20s */

/* Bits stored in pm->flags; private to this file. */
#define	TICKING		1
#define	TICKED		2
55 1.5.4.2 matt
56 1.5.4.2 matt /* Prototypes! */
57 1.5.4.2 matt static void rf_paritymap_write_locked(struct rf_paritymap *);
58 1.5.4.2 matt static void rf_paritymap_tick(void *);
59 1.5.4.2 matt static u_int rf_paritymap_nreg(RF_Raid_t *);
60 1.5.4.2 matt
61 1.5.4.2 matt /* Extract the current status of the parity map. */
62 1.5.4.2 matt void
63 1.5.4.2 matt rf_paritymap_status(struct rf_paritymap *pm, struct rf_pmstat *ps)
64 1.5.4.2 matt {
65 1.5.4.2 matt memset(ps, 0, sizeof(*ps));
66 1.5.4.2 matt if (pm == NULL)
67 1.5.4.2 matt ps->enabled = 0;
68 1.5.4.2 matt else {
69 1.5.4.2 matt ps->enabled = 1;
70 1.5.4.2 matt ps->region_size = pm->region_size;
71 1.5.4.2 matt mutex_enter(&pm->lock);
72 1.5.4.2 matt memcpy(&ps->params, &pm->params, sizeof(ps->params));
73 1.5.4.2 matt memcpy(ps->dirty, pm->disk_now, sizeof(ps->dirty));
74 1.5.4.2 matt memcpy(&ps->ctrs, &pm->ctrs, sizeof(ps->ctrs));
75 1.5.4.2 matt mutex_exit(&pm->lock);
76 1.5.4.2 matt }
77 1.5.4.2 matt }
78 1.5.4.2 matt
79 1.5.4.2 matt /*
80 1.5.4.2 matt * Test whether parity in a given sector is suspected of being inconsistent
81 1.5.4.2 matt * on disk (assuming that any pending I/O to it is allowed to complete).
82 1.5.4.2 matt * This may be of interest to future work on parity scrubbing.
83 1.5.4.2 matt */
84 1.5.4.2 matt int
85 1.5.4.2 matt rf_paritymap_test(struct rf_paritymap *pm, daddr_t sector)
86 1.5.4.2 matt {
87 1.5.4.2 matt unsigned region = sector / pm->region_size;
88 1.5.4.2 matt int retval;
89 1.5.4.2 matt
90 1.5.4.2 matt mutex_enter(&pm->lock);
91 1.5.4.2 matt retval = isset(pm->disk_boot->bits, region) ? 1 : 0;
92 1.5.4.2 matt mutex_exit(&pm->lock);
93 1.5.4.2 matt return retval;
94 1.5.4.2 matt }
95 1.5.4.2 matt
96 1.5.4.2 matt /* To be called before a write to the RAID is submitted. */
97 1.5.4.2 matt void
98 1.5.4.2 matt rf_paritymap_begin(struct rf_paritymap *pm, daddr_t offset, daddr_t size)
99 1.5.4.2 matt {
100 1.5.4.2 matt unsigned i, b, e;
101 1.5.4.2 matt
102 1.5.4.2 matt b = offset / pm->region_size;
103 1.5.4.2 matt e = (offset + size - 1) / pm->region_size;
104 1.5.4.2 matt
105 1.5.4.2 matt for (i = b; i <= e; i++)
106 1.5.4.2 matt rf_paritymap_begin_region(pm, i);
107 1.5.4.2 matt }
108 1.5.4.2 matt
109 1.5.4.2 matt /* To be called after a write to the RAID completes. */
110 1.5.4.2 matt void
111 1.5.4.2 matt rf_paritymap_end(struct rf_paritymap *pm, daddr_t offset, daddr_t size)
112 1.5.4.2 matt {
113 1.5.4.2 matt unsigned i, b, e;
114 1.5.4.2 matt
115 1.5.4.2 matt b = offset / pm->region_size;
116 1.5.4.2 matt e = (offset + size - 1) / pm->region_size;
117 1.5.4.2 matt
118 1.5.4.2 matt for (i = b; i <= e; i++)
119 1.5.4.2 matt rf_paritymap_end_region(pm, i);
120 1.5.4.2 matt }
121 1.5.4.2 matt
122 1.5.4.2 matt void
123 1.5.4.2 matt rf_paritymap_begin_region(struct rf_paritymap *pm, unsigned region)
124 1.5.4.2 matt {
125 1.5.4.2 matt int needs_write;
126 1.5.4.2 matt
127 1.5.4.2 matt KASSERT(region < RF_PARITYMAP_NREG);
128 1.5.4.2 matt pm->ctrs.nwrite++;
129 1.5.4.2 matt
130 1.5.4.2 matt /* If it was being kept warm, deal with that. */
131 1.5.4.2 matt mutex_enter(&pm->lock);
132 1.5.4.2 matt if (pm->current->state[region] < 0)
133 1.5.4.2 matt pm->current->state[region] = 0;
134 1.5.4.2 matt
135 1.5.4.2 matt /* This shouldn't happen unless RAIDOUTSTANDING is set too high. */
136 1.5.4.2 matt KASSERT(pm->current->state[region] < 127);
137 1.5.4.2 matt pm->current->state[region]++;
138 1.5.4.2 matt
139 1.5.4.2 matt needs_write = isclr(pm->disk_now->bits, region);
140 1.5.4.2 matt
141 1.5.4.2 matt if (needs_write) {
142 1.5.4.2 matt KASSERT(pm->current->state[region] == 1);
143 1.5.4.2 matt rf_paritymap_write_locked(pm);
144 1.5.4.2 matt }
145 1.5.4.2 matt
146 1.5.4.2 matt mutex_exit(&pm->lock);
147 1.5.4.2 matt }
148 1.5.4.2 matt
149 1.5.4.2 matt void
150 1.5.4.2 matt rf_paritymap_end_region(struct rf_paritymap *pm, unsigned region)
151 1.5.4.2 matt {
152 1.5.4.2 matt KASSERT(region < RF_PARITYMAP_NREG);
153 1.5.4.2 matt
154 1.5.4.2 matt mutex_enter(&pm->lock);
155 1.5.4.2 matt KASSERT(pm->current->state[region] > 0);
156 1.5.4.2 matt --pm->current->state[region];
157 1.5.4.2 matt
158 1.5.4.2 matt if (pm->current->state[region] <= 0) {
159 1.5.4.2 matt pm->current->state[region] = -pm->params.cooldown;
160 1.5.4.2 matt KASSERT(pm->current->state[region] <= 0);
161 1.5.4.2 matt mutex_enter(&pm->lk_flags);
162 1.5.4.2 matt if (!(pm->flags & TICKING)) {
163 1.5.4.2 matt pm->flags |= TICKING;
164 1.5.4.2 matt mutex_exit(&pm->lk_flags);
165 1.5.4.2 matt callout_schedule(&pm->ticker,
166 1.5.4.2 matt mstohz(pm->params.tickms));
167 1.5.4.2 matt } else
168 1.5.4.2 matt mutex_exit(&pm->lk_flags);
169 1.5.4.2 matt }
170 1.5.4.2 matt mutex_exit(&pm->lock);
171 1.5.4.2 matt }
172 1.5.4.2 matt
173 1.5.4.2 matt /*
174 1.5.4.2 matt * Updates the parity map to account for any changes in current activity
175 1.5.4.2 matt * and/or an ongoing parity scan, then writes it to disk with appropriate
176 1.5.4.2 matt * synchronization.
177 1.5.4.2 matt */
178 1.5.4.2 matt void
179 1.5.4.2 matt rf_paritymap_write(struct rf_paritymap *pm)
180 1.5.4.2 matt {
181 1.5.4.2 matt mutex_enter(&pm->lock);
182 1.5.4.2 matt rf_paritymap_write_locked(pm);
183 1.5.4.2 matt mutex_exit(&pm->lock);
184 1.5.4.2 matt }
185 1.5.4.2 matt
186 1.5.4.2 matt /* As above, but to be used when pm->lock is already held. */
187 1.5.4.2 matt static void
188 1.5.4.2 matt rf_paritymap_write_locked(struct rf_paritymap *pm)
189 1.5.4.2 matt {
190 1.5.4.2 matt char w, w0;
191 1.5.4.2 matt int i, j, setting, clearing;
192 1.5.4.2 matt
193 1.5.4.2 matt setting = clearing = 0;
194 1.5.4.2 matt for (i = 0; i < RF_PARITYMAP_NBYTE; i++) {
195 1.5.4.2 matt w0 = pm->disk_now->bits[i];
196 1.5.4.2 matt w = pm->disk_boot->bits[i];
197 1.5.4.2 matt
198 1.5.4.2 matt for (j = 0; j < NBBY; j++)
199 1.5.4.2 matt if (pm->current->state[i * NBBY + j] != 0)
200 1.5.4.2 matt w |= 1 << j;
201 1.5.4.2 matt
202 1.5.4.2 matt if (w & ~w0)
203 1.5.4.2 matt setting = 1;
204 1.5.4.2 matt if (w0 & ~w)
205 1.5.4.2 matt clearing = 1;
206 1.5.4.2 matt
207 1.5.4.2 matt pm->disk_now->bits[i] = w;
208 1.5.4.2 matt }
209 1.5.4.2 matt pm->ctrs.ncachesync += setting + clearing;
210 1.5.4.2 matt pm->ctrs.nclearing += clearing;
211 1.5.4.2 matt
212 1.5.4.2 matt /*
213 1.5.4.2 matt * If bits are being set in the parity map, then a sync is
214 1.5.4.2 matt * required afterwards, so that the regions are marked dirty
215 1.5.4.2 matt * on disk before any writes to them take place. If bits are
216 1.5.4.2 matt * being cleared, then a sync is required before the write, so
217 1.5.4.2 matt * that any writes to those regions are processed before the
218 1.5.4.2 matt * region is marked clean. (Synchronization is somewhat
219 1.5.4.2 matt * overkill; a write ordering barrier would suffice, but we
220 1.5.4.2 matt * currently have no way to express that directly.)
221 1.5.4.2 matt */
222 1.5.4.2 matt if (clearing)
223 1.5.4.2 matt rf_sync_component_caches(pm->raid);
224 1.5.4.2 matt rf_paritymap_kern_write(pm->raid, pm->disk_now);
225 1.5.4.2 matt if (setting)
226 1.5.4.2 matt rf_sync_component_caches(pm->raid);
227 1.5.4.2 matt }
228 1.5.4.2 matt
229 1.5.4.2 matt /* Mark all parity as being in need of rewrite. */
230 1.5.4.2 matt void
231 1.5.4.2 matt rf_paritymap_invalidate(struct rf_paritymap *pm)
232 1.5.4.2 matt {
233 1.5.4.2 matt mutex_enter(&pm->lock);
234 1.5.4.2 matt memset(pm->disk_boot, ~(unsigned char)0,
235 1.5.4.2 matt sizeof(struct rf_paritymap_ondisk));
236 1.5.4.2 matt mutex_exit(&pm->lock);
237 1.5.4.2 matt }
238 1.5.4.2 matt
239 1.5.4.2 matt /* Mark all parity as being correct. */
240 1.5.4.2 matt void
241 1.5.4.2 matt rf_paritymap_forceclean(struct rf_paritymap *pm)
242 1.5.4.2 matt {
243 1.5.4.2 matt mutex_enter(&pm->lock);
244 1.5.4.2 matt memset(pm->disk_boot, (unsigned char)0,
245 1.5.4.2 matt sizeof(struct rf_paritymap_ondisk));
246 1.5.4.2 matt mutex_exit(&pm->lock);
247 1.5.4.2 matt }
248 1.5.4.2 matt
249 1.5.4.2 matt /*
250 1.5.4.2 matt * The cooldown callout routine just defers its work to a thread; it can't do
251 1.5.4.2 matt * the parity map write itself as it would block, and although mutex-induced
252 1.5.4.2 matt * blocking is permitted it seems wise to avoid tying up the softint.
253 1.5.4.2 matt */
254 1.5.4.2 matt static void
255 1.5.4.2 matt rf_paritymap_tick(void *arg)
256 1.5.4.2 matt {
257 1.5.4.2 matt struct rf_paritymap *pm = arg;
258 1.5.4.2 matt
259 1.5.4.2 matt mutex_enter(&pm->lk_flags);
260 1.5.4.2 matt pm->flags |= TICKED;
261 1.5.4.2 matt mutex_exit(&pm->lk_flags);
262 1.5.4.2 matt wakeup(&(pm->raid->iodone)); /* XXX */
263 1.5.4.2 matt }
264 1.5.4.2 matt
265 1.5.4.2 matt /*
266 1.5.4.2 matt * This is where the parity cooling work (and rearming the callout if needed)
267 1.5.4.2 matt * is done; the raidio thread calls it when woken up, as by the above.
268 1.5.4.2 matt */
269 1.5.4.2 matt void
270 1.5.4.2 matt rf_paritymap_checkwork(struct rf_paritymap *pm)
271 1.5.4.2 matt {
272 1.5.4.2 matt int i, zerop, progressp;
273 1.5.4.2 matt
274 1.5.4.2 matt mutex_enter(&pm->lk_flags);
275 1.5.4.2 matt if (pm->flags & TICKED) {
276 1.5.4.2 matt zerop = progressp = 0;
277 1.5.4.2 matt
278 1.5.4.2 matt pm->flags &= ~TICKED;
279 1.5.4.2 matt mutex_exit(&pm->lk_flags);
280 1.5.4.2 matt
281 1.5.4.2 matt mutex_enter(&pm->lock);
282 1.5.4.2 matt for (i = 0; i < RF_PARITYMAP_NREG; i++) {
283 1.5.4.2 matt if (pm->current->state[i] < 0) {
284 1.5.4.2 matt progressp = 1;
285 1.5.4.2 matt pm->current->state[i]++;
286 1.5.4.2 matt if (pm->current->state[i] == 0)
287 1.5.4.2 matt zerop = 1;
288 1.5.4.2 matt }
289 1.5.4.2 matt }
290 1.5.4.2 matt
291 1.5.4.2 matt if (progressp)
292 1.5.4.2 matt callout_schedule(&pm->ticker,
293 1.5.4.2 matt mstohz(pm->params.tickms));
294 1.5.4.2 matt else {
295 1.5.4.2 matt mutex_enter(&pm->lk_flags);
296 1.5.4.2 matt pm->flags &= ~TICKING;
297 1.5.4.2 matt mutex_exit(&pm->lk_flags);
298 1.5.4.2 matt }
299 1.5.4.2 matt
300 1.5.4.2 matt if (zerop)
301 1.5.4.2 matt rf_paritymap_write_locked(pm);
302 1.5.4.2 matt mutex_exit(&pm->lock);
303 1.5.4.2 matt } else
304 1.5.4.2 matt mutex_exit(&pm->lk_flags);
305 1.5.4.2 matt }
306 1.5.4.2 matt
307 1.5.4.2 matt /*
308 1.5.4.2 matt * Set parity map parameters; used both to alter parameters on the fly and to
309 1.5.4.2 matt * establish their initial values. Note that setting a parameter to 0 means
310 1.5.4.2 matt * to leave the previous setting unchanged, and that if this is done for the
311 1.5.4.2 matt * initial setting of "regions", then a default value will be computed based
312 1.5.4.2 matt * on the RAID component size.
313 1.5.4.2 matt */
314 1.5.4.2 matt int
315 1.5.4.2 matt rf_paritymap_set_params(struct rf_paritymap *pm,
316 1.5.4.2 matt const struct rf_pmparams *params, int todisk)
317 1.5.4.2 matt {
318 1.5.4.2 matt int cooldown, tickms;
319 1.5.4.2 matt u_int regions;
320 1.5.4.2 matt RF_RowCol_t col;
321 1.5.4.2 matt RF_ComponentLabel_t *clabel;
322 1.5.4.2 matt RF_Raid_t *raidPtr;
323 1.5.4.2 matt
324 1.5.4.2 matt cooldown = params->cooldown != 0
325 1.5.4.2 matt ? params->cooldown : pm->params.cooldown;
326 1.5.4.2 matt tickms = params->tickms != 0
327 1.5.4.2 matt ? params->tickms : pm->params.tickms;
328 1.5.4.2 matt regions = params->regions != 0
329 1.5.4.2 matt ? params->regions : pm->params.regions;
330 1.5.4.2 matt
331 1.5.4.2 matt if (cooldown < 1 || cooldown > 128) {
332 1.5.4.2 matt printf("raid%d: cooldown %d out of range\n", pm->raid->raidid,
333 1.5.4.2 matt cooldown);
334 1.5.4.2 matt return (-1);
335 1.5.4.2 matt }
336 1.5.4.2 matt if (tickms < 10) {
337 1.5.4.2 matt printf("raid%d: tick time %dms out of range\n",
338 1.5.4.2 matt pm->raid->raidid, tickms);
339 1.5.4.2 matt return (-1);
340 1.5.4.2 matt }
341 1.5.4.2 matt if (regions == 0) {
342 1.5.4.2 matt regions = rf_paritymap_nreg(pm->raid);
343 1.5.4.2 matt } else if (regions > RF_PARITYMAP_NREG) {
344 1.5.4.2 matt printf("raid%d: region count %u too large (more than %u)\n",
345 1.5.4.2 matt pm->raid->raidid, regions, RF_PARITYMAP_NREG);
346 1.5.4.2 matt return (-1);
347 1.5.4.2 matt }
348 1.5.4.2 matt
349 1.5.4.2 matt /* XXX any currently warm parity will be used with the new tickms! */
350 1.5.4.2 matt pm->params.cooldown = cooldown;
351 1.5.4.2 matt pm->params.tickms = tickms;
352 1.5.4.2 matt /* Apply the initial region count, but do not change it after that. */
353 1.5.4.2 matt if (pm->params.regions == 0)
354 1.5.4.2 matt pm->params.regions = regions;
355 1.5.4.2 matt
356 1.5.4.2 matt /* So that the newly set parameters can be tested: */
357 1.5.4.2 matt pm->ctrs.nwrite = pm->ctrs.ncachesync = pm->ctrs.nclearing = 0;
358 1.5.4.2 matt
359 1.5.4.2 matt if (todisk) {
360 1.5.4.2 matt raidPtr = pm->raid;
361 1.5.4.2 matt for (col = 0; col < raidPtr->numCol; col++) {
362 1.5.4.2 matt if (RF_DEAD_DISK(raidPtr->Disks[col].status))
363 1.5.4.2 matt continue;
364 1.5.4.2 matt
365 1.5.4.2 matt clabel = raidget_component_label(raidPtr, col);
366 1.5.4.2 matt clabel->parity_map_ntick = cooldown;
367 1.5.4.2 matt clabel->parity_map_tickms = tickms;
368 1.5.4.2 matt clabel->parity_map_regions = regions;
369 1.5.4.2 matt
370 1.5.4.2 matt /* Don't touch the disk if it's been spared */
371 1.5.4.2 matt if (clabel->status == rf_ds_spared)
372 1.5.4.2 matt continue;
373 1.5.4.2 matt
374 1.5.4.2 matt raidflush_component_label(raidPtr, col);
375 1.5.4.2 matt }
376 1.5.4.2 matt
377 1.5.4.2 matt /* handle the spares too... */
378 1.5.4.2 matt for (col = 0; col < raidPtr->numSpare; col++) {
379 1.5.4.2 matt if (raidPtr->Disks[raidPtr->numCol+col].status == rf_ds_used_spare) {
380 1.5.4.2 matt clabel = raidget_component_label(raidPtr, raidPtr->numCol+col);
381 1.5.4.2 matt clabel->parity_map_ntick = cooldown;
382 1.5.4.2 matt clabel->parity_map_tickms = tickms;
383 1.5.4.2 matt clabel->parity_map_regions = regions;
384 1.5.4.2 matt raidflush_component_label(raidPtr, raidPtr->numCol+col);
385 1.5.4.2 matt }
386 1.5.4.2 matt }
387 1.5.4.2 matt }
388 1.5.4.2 matt return 0;
389 1.5.4.2 matt }
390 1.5.4.2 matt
391 1.5.4.2 matt /*
392 1.5.4.2 matt * The number of regions may not be as many as can fit into the map, because
393 1.5.4.2 matt * when regions are too small, the overhead of setting parity map bits
394 1.5.4.2 matt * becomes significant in comparison to the actual I/O, while the
395 1.5.4.2 matt * corresponding gains in parity verification time become negligible. Thus,
396 1.5.4.2 matt * a minimum region size (defined above) is imposed.
397 1.5.4.2 matt *
398 1.5.4.2 matt * Note that, if the number of regions is less than the maximum, then some of
399 1.5.4.2 matt * the regions will be "fictional", corresponding to no actual disk; some
400 1.5.4.2 matt * parts of the code may process them as normal, but they can not ever be
401 1.5.4.2 matt * written to.
402 1.5.4.2 matt */
403 1.5.4.2 matt static u_int
404 1.5.4.2 matt rf_paritymap_nreg(RF_Raid_t *raid)
405 1.5.4.2 matt {
406 1.5.4.2 matt daddr_t bytes_per_disk, nreg;
407 1.5.4.2 matt
408 1.5.4.2 matt bytes_per_disk = raid->sectorsPerDisk << raid->logBytesPerSector;
409 1.5.4.2 matt nreg = bytes_per_disk / REGION_MINSIZE;
410 1.5.4.2 matt if (nreg > RF_PARITYMAP_NREG)
411 1.5.4.2 matt nreg = RF_PARITYMAP_NREG;
412 1.5.4.2 matt
413 1.5.4.2 matt return (u_int)nreg;
414 1.5.4.2 matt }
415 1.5.4.2 matt
416 1.5.4.2 matt /*
417 1.5.4.2 matt * Initialize a parity map given specific parameters. This neither reads nor
418 1.5.4.2 matt * writes the parity map config in the component labels; for that, see below.
419 1.5.4.2 matt */
420 1.5.4.2 matt int
421 1.5.4.2 matt rf_paritymap_init(struct rf_paritymap *pm, RF_Raid_t *raid,
422 1.5.4.2 matt const struct rf_pmparams *params)
423 1.5.4.2 matt {
424 1.5.4.2 matt daddr_t rstripes;
425 1.5.4.2 matt struct rf_pmparams safe;
426 1.5.4.2 matt
427 1.5.4.2 matt pm->raid = raid;
428 1.5.4.2 matt pm->params.regions = 0;
429 1.5.4.2 matt if (0 != rf_paritymap_set_params(pm, params, 0)) {
430 1.5.4.2 matt /*
431 1.5.4.2 matt * If the parameters are out-of-range, then bring the
432 1.5.4.2 matt * parity map up with something reasonable, so that
433 1.5.4.2 matt * the admin can at least go and fix it (or ignore it
434 1.5.4.2 matt * entirely).
435 1.5.4.2 matt */
436 1.5.4.2 matt safe.cooldown = DFL_COOLDOWN;
437 1.5.4.2 matt safe.tickms = DFL_TICKMS;
438 1.5.4.2 matt safe.regions = 0;
439 1.5.4.2 matt
440 1.5.4.2 matt if (0 != rf_paritymap_set_params(pm, &safe, 0))
441 1.5.4.2 matt return (-1);
442 1.5.4.2 matt }
443 1.5.4.2 matt
444 1.5.4.2 matt rstripes = howmany(raid->Layout.numStripe, pm->params.regions);
445 1.5.4.2 matt pm->region_size = rstripes * raid->Layout.dataSectorsPerStripe;
446 1.5.4.2 matt
447 1.5.4.2 matt callout_init(&pm->ticker, CALLOUT_MPSAFE);
448 1.5.4.2 matt callout_setfunc(&pm->ticker, rf_paritymap_tick, pm);
449 1.5.4.2 matt pm->flags = 0;
450 1.5.4.2 matt
451 1.5.4.2 matt pm->disk_boot = kmem_alloc(sizeof(struct rf_paritymap_ondisk),
452 1.5.4.2 matt KM_SLEEP);
453 1.5.4.2 matt pm->disk_now = kmem_alloc(sizeof(struct rf_paritymap_ondisk),
454 1.5.4.2 matt KM_SLEEP);
455 1.5.4.2 matt pm->current = kmem_zalloc(sizeof(struct rf_paritymap_current),
456 1.5.4.2 matt KM_SLEEP);
457 1.5.4.2 matt
458 1.5.4.2 matt rf_paritymap_kern_read(pm->raid, pm->disk_boot);
459 1.5.4.2 matt memcpy(pm->disk_now, pm->disk_boot, sizeof(*pm->disk_now));
460 1.5.4.2 matt
461 1.5.4.2 matt mutex_init(&pm->lock, MUTEX_DEFAULT, IPL_NONE);
462 1.5.4.2 matt mutex_init(&pm->lk_flags, MUTEX_DEFAULT, IPL_SOFTCLOCK);
463 1.5.4.2 matt
464 1.5.4.2 matt return 0;
465 1.5.4.2 matt }
466 1.5.4.2 matt
467 1.5.4.2 matt /*
468 1.5.4.2 matt * Destroys a parity map; unless "force" is set, also cleans parity for any
469 1.5.4.2 matt * regions which were still in cooldown (but are not dirty on disk).
470 1.5.4.2 matt */
471 1.5.4.2 matt void
472 1.5.4.2 matt rf_paritymap_destroy(struct rf_paritymap *pm, int force)
473 1.5.4.2 matt {
474 1.5.4.2 matt int i;
475 1.5.4.2 matt
476 1.5.4.2 matt callout_halt(&pm->ticker, NULL); /* XXX stop? halt? */
477 1.5.4.2 matt callout_destroy(&pm->ticker);
478 1.5.4.2 matt
479 1.5.4.2 matt if (!force) {
480 1.5.4.2 matt for (i = 0; i < RF_PARITYMAP_NREG; i++) {
481 1.5.4.2 matt /* XXX check for > 0 ? */
482 1.5.4.2 matt if (pm->current->state[i] < 0)
483 1.5.4.2 matt pm->current->state[i] = 0;
484 1.5.4.2 matt }
485 1.5.4.2 matt
486 1.5.4.2 matt rf_paritymap_write_locked(pm);
487 1.5.4.2 matt }
488 1.5.4.2 matt
489 1.5.4.2 matt mutex_destroy(&pm->lock);
490 1.5.4.2 matt mutex_destroy(&pm->lk_flags);
491 1.5.4.2 matt
492 1.5.4.2 matt kmem_free(pm->disk_boot, sizeof(struct rf_paritymap_ondisk));
493 1.5.4.2 matt kmem_free(pm->disk_now, sizeof(struct rf_paritymap_ondisk));
494 1.5.4.2 matt kmem_free(pm->current, sizeof(struct rf_paritymap_current));
495 1.5.4.2 matt }
496 1.5.4.2 matt
497 1.5.4.2 matt /*
498 1.5.4.2 matt * Rewrite parity, taking parity map into account; this is the equivalent of
499 1.5.4.2 matt * the old rf_RewriteParity, and is likewise to be called from a suitable
500 1.5.4.2 matt * thread and shouldn't have multiple copies running in parallel and so on.
501 1.5.4.2 matt *
502 1.5.4.2 matt * Note that the fictional regions are "cleaned" in one shot, so that very
503 1.5.4.2 matt * small RAIDs (useful for testing) will not experience potentially severe
504 1.5.4.2 matt * regressions in rewrite time.
505 1.5.4.2 matt */
506 1.5.4.2 matt int
507 1.5.4.2 matt rf_paritymap_rewrite(struct rf_paritymap *pm)
508 1.5.4.2 matt {
509 1.5.4.2 matt int i, ret_val = 0;
510 1.5.4.2 matt daddr_t reg_b, reg_e;
511 1.5.4.2 matt
512 1.5.4.2 matt /* Process only the actual regions. */
513 1.5.4.2 matt for (i = 0; i < pm->params.regions; i++) {
514 1.5.4.2 matt mutex_enter(&pm->lock);
515 1.5.4.2 matt if (isset(pm->disk_boot->bits, i)) {
516 1.5.4.2 matt mutex_exit(&pm->lock);
517 1.5.4.2 matt
518 1.5.4.2 matt reg_b = i * pm->region_size;
519 1.5.4.2 matt reg_e = reg_b + pm->region_size;
520 1.5.4.2 matt if (reg_e > pm->raid->totalSectors)
521 1.5.4.2 matt reg_e = pm->raid->totalSectors;
522 1.5.4.2 matt
523 1.5.4.2 matt if (rf_RewriteParityRange(pm->raid, reg_b,
524 1.5.4.2 matt reg_e - reg_b)) {
525 1.5.4.2 matt ret_val = 1;
526 1.5.4.2 matt if (pm->raid->waitShutdown)
527 1.5.4.2 matt return ret_val;
528 1.5.4.2 matt } else {
529 1.5.4.2 matt mutex_enter(&pm->lock);
530 1.5.4.2 matt clrbit(pm->disk_boot->bits, i);
531 1.5.4.2 matt rf_paritymap_write_locked(pm);
532 1.5.4.2 matt mutex_exit(&pm->lock);
533 1.5.4.2 matt }
534 1.5.4.2 matt } else {
535 1.5.4.2 matt mutex_exit(&pm->lock);
536 1.5.4.2 matt }
537 1.5.4.2 matt }
538 1.5.4.2 matt
539 1.5.4.2 matt /* Now, clear the fictional regions, if any. */
540 1.5.4.2 matt rf_paritymap_forceclean(pm);
541 1.5.4.2 matt rf_paritymap_write(pm);
542 1.5.4.2 matt
543 1.5.4.2 matt return ret_val;
544 1.5.4.2 matt }
545 1.5.4.2 matt
546 1.5.4.2 matt /*
547 1.5.4.2 matt * How to merge the on-disk parity maps when reading them in from the
548 1.5.4.2 matt * various components; returns whether they differ. In the case that
549 1.5.4.2 matt * they do differ, sets *dst to the union of *dst and *src.
550 1.5.4.2 matt *
551 1.5.4.2 matt * In theory, it should be safe to take the intersection (or just pick
552 1.5.4.2 matt * a single component arbitrarily), but the paranoid approach costs
553 1.5.4.2 matt * little.
554 1.5.4.2 matt *
555 1.5.4.2 matt * Appropriate locking, if any, is the responsibility of the caller.
556 1.5.4.2 matt */
557 1.5.4.2 matt int
558 1.5.4.2 matt rf_paritymap_merge(struct rf_paritymap_ondisk *dst,
559 1.5.4.2 matt struct rf_paritymap_ondisk *src)
560 1.5.4.2 matt {
561 1.5.4.2 matt int i, discrep = 0;
562 1.5.4.2 matt
563 1.5.4.2 matt for (i = 0; i < RF_PARITYMAP_NBYTE; i++) {
564 1.5.4.2 matt if (dst->bits[i] != src->bits[i])
565 1.5.4.2 matt discrep = 1;
566 1.5.4.2 matt dst->bits[i] |= src->bits[i];
567 1.5.4.2 matt }
568 1.5.4.2 matt
569 1.5.4.2 matt return discrep;
570 1.5.4.2 matt }
571 1.5.4.2 matt
572 1.5.4.2 matt /*
573 1.5.4.2 matt * Detach a parity map from its RAID. This is not meant to be applied except
574 1.5.4.2 matt * when unconfiguring the RAID after all I/O has been resolved, as otherwise
575 1.5.4.2 matt * an out-of-date parity map could be treated as current.
576 1.5.4.2 matt */
577 1.5.4.2 matt void
578 1.5.4.2 matt rf_paritymap_detach(RF_Raid_t *raidPtr)
579 1.5.4.2 matt {
580 1.5.4.2 matt if (raidPtr->parity_map == NULL)
581 1.5.4.2 matt return;
582 1.5.4.2 matt
583 1.5.4.2 matt simple_lock(&(raidPtr->iodone_lock));
584 1.5.4.2 matt struct rf_paritymap *pm = raidPtr->parity_map;
585 1.5.4.2 matt raidPtr->parity_map = NULL;
586 1.5.4.2 matt simple_unlock(&(raidPtr->iodone_lock));
587 1.5.4.2 matt /* XXXjld is that enough locking? Or too much? */
588 1.5.4.2 matt rf_paritymap_destroy(pm, 0);
589 1.5.4.2 matt kmem_free(pm, sizeof(*pm));
590 1.5.4.2 matt }
591 1.5.4.2 matt
592 1.5.4.2 matt /*
593 1.5.4.2 matt * Attach a parity map to a RAID set if appropriate. Includes
594 1.5.4.2 matt * configure-time processing of parity-map fields of component label.
595 1.5.4.2 matt */
596 1.5.4.2 matt void
597 1.5.4.2 matt rf_paritymap_attach(RF_Raid_t *raidPtr, int force)
598 1.5.4.2 matt {
599 1.5.4.2 matt RF_RowCol_t col;
600 1.5.4.2 matt int pm_use, pm_zap;
601 1.5.4.2 matt int g_tickms, g_ntick, g_regions;
602 1.5.4.2 matt int good;
603 1.5.4.2 matt RF_ComponentLabel_t *clabel;
604 1.5.4.2 matt u_int flags, regions;
605 1.5.4.2 matt struct rf_pmparams params;
606 1.5.4.2 matt
607 1.5.4.2 matt if (raidPtr->Layout.map->faultsTolerated == 0) {
608 1.5.4.2 matt /* There isn't any parity. */
609 1.5.4.2 matt return;
610 1.5.4.2 matt }
611 1.5.4.2 matt
612 1.5.4.2 matt pm_use = 1;
613 1.5.4.2 matt pm_zap = 0;
614 1.5.4.2 matt g_tickms = DFL_TICKMS;
615 1.5.4.2 matt g_ntick = DFL_COOLDOWN;
616 1.5.4.2 matt g_regions = 0;
617 1.5.4.2 matt
618 1.5.4.2 matt /*
619 1.5.4.2 matt * Collect opinions on the set config. If this is the initial
620 1.5.4.2 matt * config (raidctl -C), treat all labels as invalid, since
621 1.5.4.2 matt * there may be random data present.
622 1.5.4.2 matt */
623 1.5.4.2 matt if (!force) {
624 1.5.4.2 matt for (col = 0; col < raidPtr->numCol; col++) {
625 1.5.4.2 matt if (RF_DEAD_DISK(raidPtr->Disks[col].status))
626 1.5.4.2 matt continue;
627 1.5.4.2 matt clabel = raidget_component_label(raidPtr, col);
628 1.5.4.2 matt flags = clabel->parity_map_flags;
629 1.5.4.2 matt /* Check for use by non-parity-map kernel. */
630 1.5.4.2 matt if (clabel->parity_map_modcount
631 1.5.4.2 matt != clabel->mod_counter) {
632 1.5.4.2 matt flags &= ~RF_PMLABEL_WASUSED;
633 1.5.4.2 matt }
634 1.5.4.2 matt
635 1.5.4.2 matt if (flags & RF_PMLABEL_VALID) {
636 1.5.4.2 matt g_tickms = clabel->parity_map_tickms;
637 1.5.4.2 matt g_ntick = clabel->parity_map_ntick;
638 1.5.4.2 matt regions = clabel->parity_map_regions;
639 1.5.4.2 matt if (g_regions == 0)
640 1.5.4.2 matt g_regions = regions;
641 1.5.4.2 matt else if (g_regions != regions) {
642 1.5.4.2 matt pm_zap = 1; /* important! */
643 1.5.4.2 matt }
644 1.5.4.2 matt
645 1.5.4.2 matt if (flags & RF_PMLABEL_DISABLE) {
646 1.5.4.2 matt pm_use = 0;
647 1.5.4.2 matt }
648 1.5.4.2 matt if (!(flags & RF_PMLABEL_WASUSED)) {
649 1.5.4.2 matt pm_zap = 1;
650 1.5.4.2 matt }
651 1.5.4.2 matt } else {
652 1.5.4.2 matt pm_zap = 1;
653 1.5.4.2 matt }
654 1.5.4.2 matt }
655 1.5.4.2 matt } else {
656 1.5.4.2 matt pm_zap = 1;
657 1.5.4.2 matt }
658 1.5.4.2 matt
659 1.5.4.2 matt /* Finally, create and attach the parity map. */
660 1.5.4.2 matt if (pm_use) {
661 1.5.4.2 matt params.cooldown = g_ntick;
662 1.5.4.2 matt params.tickms = g_tickms;
663 1.5.4.2 matt params.regions = g_regions;
664 1.5.4.2 matt
665 1.5.4.2 matt raidPtr->parity_map = kmem_alloc(sizeof(struct rf_paritymap),
666 1.5.4.2 matt KM_SLEEP);
667 1.5.4.2 matt if (0 != rf_paritymap_init(raidPtr->parity_map, raidPtr,
668 1.5.4.2 matt ¶ms)) {
669 1.5.4.2 matt /* It failed; do without. */
670 1.5.4.2 matt kmem_free(raidPtr->parity_map,
671 1.5.4.2 matt sizeof(struct rf_paritymap));
672 1.5.4.2 matt raidPtr->parity_map = NULL;
673 1.5.4.2 matt return;
674 1.5.4.2 matt }
675 1.5.4.2 matt
676 1.5.4.2 matt if (g_regions == 0)
677 1.5.4.2 matt /* Pick up the autoconfigured region count. */
678 1.5.4.2 matt g_regions = raidPtr->parity_map->params.regions;
679 1.5.4.2 matt
680 1.5.4.2 matt if (pm_zap) {
681 1.5.4.2 matt good = raidPtr->parity_good && !force;
682 1.5.4.2 matt
683 1.5.4.2 matt if (good)
684 1.5.4.2 matt rf_paritymap_forceclean(raidPtr->parity_map);
685 1.5.4.2 matt else
686 1.5.4.2 matt rf_paritymap_invalidate(raidPtr->parity_map);
687 1.5.4.2 matt /* This needs to be on disk before WASUSED is set. */
688 1.5.4.2 matt rf_paritymap_write(raidPtr->parity_map);
689 1.5.4.2 matt }
690 1.5.4.2 matt }
691 1.5.4.2 matt
692 1.5.4.2 matt /* Alter labels in-core to reflect the current view of things. */
693 1.5.4.2 matt for (col = 0; col < raidPtr->numCol; col++) {
694 1.5.4.2 matt if (RF_DEAD_DISK(raidPtr->Disks[col].status))
695 1.5.4.2 matt continue;
696 1.5.4.2 matt clabel = raidget_component_label(raidPtr, col);
697 1.5.4.2 matt
698 1.5.4.2 matt if (pm_use)
699 1.5.4.2 matt flags = RF_PMLABEL_VALID | RF_PMLABEL_WASUSED;
700 1.5.4.2 matt else
701 1.5.4.2 matt flags = RF_PMLABEL_VALID | RF_PMLABEL_DISABLE;
702 1.5.4.2 matt
703 1.5.4.2 matt clabel->parity_map_flags = flags;
704 1.5.4.2 matt clabel->parity_map_tickms = g_tickms;
705 1.5.4.2 matt clabel->parity_map_ntick = g_ntick;
706 1.5.4.2 matt clabel->parity_map_regions = g_regions;
707 1.5.4.2 matt raidflush_component_label(raidPtr, col);
708 1.5.4.2 matt }
709 1.5.4.2 matt /* Note that we're just in 'attach' here, and there won't
710 1.5.4.2 matt be any spare disks at this point. */
711 1.5.4.2 matt }
712 1.5.4.2 matt
713 1.5.4.2 matt /*
714 1.5.4.2 matt * For initializing the parity-map fields of a component label, both on
715 1.5.4.2 matt * initial creation and on reconstruct/copyback/etc. */
716 1.5.4.2 matt void
717 1.5.4.2 matt rf_paritymap_init_label(struct rf_paritymap *pm, RF_ComponentLabel_t *clabel)
718 1.5.4.2 matt {
719 1.5.4.2 matt if (pm != NULL) {
720 1.5.4.2 matt clabel->parity_map_flags =
721 1.5.4.2 matt RF_PMLABEL_VALID | RF_PMLABEL_WASUSED;
722 1.5.4.2 matt clabel->parity_map_tickms = pm->params.tickms;
723 1.5.4.2 matt clabel->parity_map_ntick = pm->params.cooldown;
724 1.5.4.2 matt /*
725 1.5.4.2 matt * XXXjld: If the number of regions is changed on disk, and
726 1.5.4.2 matt * then a new component is labeled before the next configure,
727 1.5.4.2 matt * then it will get the old value and they will conflict on
728 1.5.4.2 matt * the next boot (and the default will be used instead).
729 1.5.4.2 matt */
730 1.5.4.2 matt clabel->parity_map_regions = pm->params.regions;
731 1.5.4.2 matt } else {
732 1.5.4.2 matt /*
733 1.5.4.2 matt * XXXjld: if the map is disabled, and all the components are
734 1.5.4.2 matt * replaced without an intervening unconfigure/reconfigure,
735 1.5.4.2 matt * then it will become enabled on the next unconfig/reconfig.
736 1.5.4.2 matt */
737 1.5.4.2 matt }
738 1.5.4.2 matt }
739 1.5.4.2 matt
740 1.5.4.2 matt
741 1.5.4.2 matt /* Will the parity map be disabled next time? */
742 1.5.4.2 matt int
743 1.5.4.2 matt rf_paritymap_get_disable(RF_Raid_t *raidPtr)
744 1.5.4.2 matt {
745 1.5.4.2 matt RF_ComponentLabel_t *clabel;
746 1.5.4.2 matt RF_RowCol_t col;
747 1.5.4.2 matt int dis;
748 1.5.4.2 matt
749 1.5.4.2 matt dis = 0;
750 1.5.4.2 matt for (col = 0; col < raidPtr->numCol; col++) {
751 1.5.4.2 matt if (RF_DEAD_DISK(raidPtr->Disks[col].status))
752 1.5.4.2 matt continue;
753 1.5.4.2 matt clabel = raidget_component_label(raidPtr, col);
754 1.5.4.2 matt if (clabel->parity_map_flags & RF_PMLABEL_DISABLE)
755 1.5.4.2 matt dis = 1;
756 1.5.4.2 matt }
757 1.5.4.2 matt for (col = 0; col < raidPtr->numSpare; col++) {
758 1.5.4.2 matt if (raidPtr->Disks[raidPtr->numCol+col].status != rf_ds_used_spare)
759 1.5.4.2 matt continue;
760 1.5.4.2 matt clabel = raidget_component_label(raidPtr, raidPtr->numCol+col);
761 1.5.4.2 matt if (clabel->parity_map_flags & RF_PMLABEL_DISABLE)
762 1.5.4.2 matt dis = 1;
763 1.5.4.2 matt }
764 1.5.4.2 matt
765 1.5.4.2 matt return dis;
766 1.5.4.2 matt }
767 1.5.4.2 matt
768 1.5.4.2 matt /* Set whether the parity map will be disabled next time. */
769 1.5.4.2 matt void
770 1.5.4.2 matt rf_paritymap_set_disable(RF_Raid_t *raidPtr, int dis)
771 1.5.4.2 matt {
772 1.5.4.2 matt RF_ComponentLabel_t *clabel;
773 1.5.4.2 matt RF_RowCol_t col;
774 1.5.4.2 matt
775 1.5.4.2 matt for (col = 0; col < raidPtr->numCol; col++) {
776 1.5.4.2 matt if (RF_DEAD_DISK(raidPtr->Disks[col].status))
777 1.5.4.2 matt continue;
778 1.5.4.2 matt clabel = raidget_component_label(raidPtr, col);
779 1.5.4.2 matt if (dis)
780 1.5.4.2 matt clabel->parity_map_flags |= RF_PMLABEL_DISABLE;
781 1.5.4.2 matt else
782 1.5.4.2 matt clabel->parity_map_flags &= ~RF_PMLABEL_DISABLE;
783 1.5.4.2 matt raidflush_component_label(raidPtr, col);
784 1.5.4.2 matt }
785 1.5.4.2 matt
786 1.5.4.2 matt /* update any used spares as well */
787 1.5.4.2 matt for (col = 0; col < raidPtr->numSpare; col++) {
788 1.5.4.2 matt if (raidPtr->Disks[raidPtr->numCol+col].status != rf_ds_used_spare)
789 1.5.4.2 matt continue;
790 1.5.4.2 matt
791 1.5.4.2 matt clabel = raidget_component_label(raidPtr, raidPtr->numCol+col);
792 1.5.4.2 matt if (dis)
793 1.5.4.2 matt clabel->parity_map_flags |= RF_PMLABEL_DISABLE;
794 1.5.4.2 matt else
795 1.5.4.2 matt clabel->parity_map_flags &= ~RF_PMLABEL_DISABLE;
796 1.5.4.2 matt raidflush_component_label(raidPtr, raidPtr->numCol+col);
797 1.5.4.2 matt }
798 1.5.4.2 matt }
799