1 1.11 oster /* $NetBSD: rf_paritymap.c,v 1.11 2023/09/25 21:59:38 oster Exp $ */ 2 1.1 jld 3 1.1 jld /*- 4 1.1 jld * Copyright (c) 2009 Jed Davis. 5 1.1 jld * All rights reserved. 6 1.1 jld * 7 1.1 jld * Redistribution and use in source and binary forms, with or without 8 1.1 jld * modification, are permitted provided that the following conditions 9 1.1 jld * are met: 10 1.1 jld * 1. Redistributions of source code must retain the above copyright 11 1.1 jld * notice, this list of conditions and the following disclaimer. 12 1.1 jld * 2. Redistributions in binary form must reproduce the above copyright 13 1.1 jld * notice, this list of conditions and the following disclaimer in the 14 1.1 jld * documentation and/or other materials provided with the distribution. 15 1.1 jld * 16 1.1 jld * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 1.1 jld * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 1.1 jld * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 1.1 jld * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 1.1 jld * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 1.1 jld * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 1.1 jld * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 1.1 jld * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 1.1 jld * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 1.1 jld * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 1.1 jld * POSSIBILITY OF SUCH DAMAGE. 
27 1.1 jld */ 28 1.1 jld 29 1.1 jld #include <sys/cdefs.h> 30 1.11 oster __KERNEL_RCSID(0, "$NetBSD: rf_paritymap.c,v 1.11 2023/09/25 21:59:38 oster Exp $"); 31 1.1 jld 32 1.3 pooka #include <sys/param.h> 33 1.1 jld #include <sys/callout.h> 34 1.1 jld #include <sys/kmem.h> 35 1.1 jld #include <sys/mutex.h> 36 1.1 jld #include <sys/rwlock.h> 37 1.1 jld #include <sys/systm.h> 38 1.1 jld #include <sys/types.h> 39 1.1 jld 40 1.1 jld #include <dev/raidframe/rf_paritymap.h> 41 1.1 jld #include <dev/raidframe/rf_stripelocks.h> 42 1.1 jld #include <dev/raidframe/rf_layout.h> 43 1.1 jld #include <dev/raidframe/rf_raid.h> 44 1.1 jld #include <dev/raidframe/rf_parityscan.h> 45 1.1 jld #include <dev/raidframe/rf_kintf.h> 46 1.1 jld 47 1.1 jld /* Important parameters: */ 48 1.1 jld #define REGION_MINSIZE (25ULL << 20) 49 1.1 jld #define DFL_TICKMS 40000 50 1.1 jld #define DFL_COOLDOWN 8 /* 7-8 intervals of 40s = 5min +/- 20s */ 51 1.1 jld 52 1.1 jld /* Internal-use flag bits. */ 53 1.1 jld #define TICKING 1 54 1.1 jld #define TICKED 2 55 1.1 jld 56 1.1 jld /* Prototypes! */ 57 1.1 jld static void rf_paritymap_write_locked(struct rf_paritymap *); 58 1.1 jld static void rf_paritymap_tick(void *); 59 1.1 jld static u_int rf_paritymap_nreg(RF_Raid_t *); 60 1.1 jld 61 1.1 jld /* Extract the current status of the parity map. 
*/ 62 1.1 jld void 63 1.1 jld rf_paritymap_status(struct rf_paritymap *pm, struct rf_pmstat *ps) 64 1.1 jld { 65 1.1 jld memset(ps, 0, sizeof(*ps)); 66 1.1 jld if (pm == NULL) 67 1.1 jld ps->enabled = 0; 68 1.1 jld else { 69 1.1 jld ps->enabled = 1; 70 1.1 jld ps->region_size = pm->region_size; 71 1.1 jld mutex_enter(&pm->lock); 72 1.1 jld memcpy(&ps->params, &pm->params, sizeof(ps->params)); 73 1.1 jld memcpy(ps->dirty, pm->disk_now, sizeof(ps->dirty)); 74 1.1 jld memcpy(&ps->ctrs, &pm->ctrs, sizeof(ps->ctrs)); 75 1.1 jld mutex_exit(&pm->lock); 76 1.1 jld } 77 1.1 jld } 78 1.1 jld 79 1.1 jld /* 80 1.1 jld * Test whether parity in a given sector is suspected of being inconsistent 81 1.1 jld * on disk (assuming that any pending I/O to it is allowed to complete). 82 1.1 jld * This may be of interest to future work on parity scrubbing. 83 1.1 jld */ 84 1.1 jld int 85 1.1 jld rf_paritymap_test(struct rf_paritymap *pm, daddr_t sector) 86 1.1 jld { 87 1.1 jld unsigned region = sector / pm->region_size; 88 1.1 jld int retval; 89 1.1 jld 90 1.1 jld mutex_enter(&pm->lock); 91 1.1 jld retval = isset(pm->disk_boot->bits, region) ? 1 : 0; 92 1.1 jld mutex_exit(&pm->lock); 93 1.1 jld return retval; 94 1.1 jld } 95 1.1 jld 96 1.1 jld /* To be called before a write to the RAID is submitted. */ 97 1.1 jld void 98 1.1 jld rf_paritymap_begin(struct rf_paritymap *pm, daddr_t offset, daddr_t size) 99 1.1 jld { 100 1.1 jld unsigned i, b, e; 101 1.1 jld 102 1.1 jld b = offset / pm->region_size; 103 1.1 jld e = (offset + size - 1) / pm->region_size; 104 1.1 jld 105 1.1 jld for (i = b; i <= e; i++) 106 1.1 jld rf_paritymap_begin_region(pm, i); 107 1.1 jld } 108 1.1 jld 109 1.1 jld /* To be called after a write to the RAID completes. 
*/ 110 1.1 jld void 111 1.1 jld rf_paritymap_end(struct rf_paritymap *pm, daddr_t offset, daddr_t size) 112 1.1 jld { 113 1.1 jld unsigned i, b, e; 114 1.1 jld 115 1.1 jld b = offset / pm->region_size; 116 1.1 jld e = (offset + size - 1) / pm->region_size; 117 1.1 jld 118 1.1 jld for (i = b; i <= e; i++) 119 1.1 jld rf_paritymap_end_region(pm, i); 120 1.1 jld } 121 1.1 jld 122 1.1 jld void 123 1.1 jld rf_paritymap_begin_region(struct rf_paritymap *pm, unsigned region) 124 1.1 jld { 125 1.1 jld int needs_write; 126 1.1 jld 127 1.1 jld KASSERT(region < RF_PARITYMAP_NREG); 128 1.1 jld pm->ctrs.nwrite++; 129 1.1 jld 130 1.1 jld /* If it was being kept warm, deal with that. */ 131 1.1 jld mutex_enter(&pm->lock); 132 1.1 jld if (pm->current->state[region] < 0) 133 1.1 jld pm->current->state[region] = 0; 134 1.1 jld 135 1.1 jld /* This shouldn't happen unless RAIDOUTSTANDING is set too high. */ 136 1.1 jld KASSERT(pm->current->state[region] < 127); 137 1.1 jld pm->current->state[region]++; 138 1.1 jld 139 1.1 jld needs_write = isclr(pm->disk_now->bits, region); 140 1.1 jld 141 1.1 jld if (needs_write) { 142 1.1 jld KASSERT(pm->current->state[region] == 1); 143 1.1 jld rf_paritymap_write_locked(pm); 144 1.1 jld } 145 1.1 jld 146 1.1 jld mutex_exit(&pm->lock); 147 1.1 jld } 148 1.1 jld 149 1.1 jld void 150 1.1 jld rf_paritymap_end_region(struct rf_paritymap *pm, unsigned region) 151 1.1 jld { 152 1.1 jld KASSERT(region < RF_PARITYMAP_NREG); 153 1.1 jld 154 1.1 jld mutex_enter(&pm->lock); 155 1.1 jld KASSERT(pm->current->state[region] > 0); 156 1.1 jld --pm->current->state[region]; 157 1.1 jld 158 1.1 jld if (pm->current->state[region] <= 0) { 159 1.1 jld pm->current->state[region] = -pm->params.cooldown; 160 1.1 jld KASSERT(pm->current->state[region] <= 0); 161 1.1 jld mutex_enter(&pm->lk_flags); 162 1.1 jld if (!(pm->flags & TICKING)) { 163 1.1 jld pm->flags |= TICKING; 164 1.1 jld mutex_exit(&pm->lk_flags); 165 1.1 jld callout_schedule(&pm->ticker, 166 1.1 jld 
mstohz(pm->params.tickms)); 167 1.1 jld } else 168 1.1 jld mutex_exit(&pm->lk_flags); 169 1.1 jld } 170 1.1 jld mutex_exit(&pm->lock); 171 1.1 jld } 172 1.1 jld 173 1.1 jld /* 174 1.1 jld * Updates the parity map to account for any changes in current activity 175 1.1 jld * and/or an ongoing parity scan, then writes it to disk with appropriate 176 1.1 jld * synchronization. 177 1.1 jld */ 178 1.1 jld void 179 1.1 jld rf_paritymap_write(struct rf_paritymap *pm) 180 1.1 jld { 181 1.1 jld mutex_enter(&pm->lock); 182 1.1 jld rf_paritymap_write_locked(pm); 183 1.1 jld mutex_exit(&pm->lock); 184 1.1 jld } 185 1.1 jld 186 1.1 jld /* As above, but to be used when pm->lock is already held. */ 187 1.1 jld static void 188 1.1 jld rf_paritymap_write_locked(struct rf_paritymap *pm) 189 1.1 jld { 190 1.1 jld char w, w0; 191 1.1 jld int i, j, setting, clearing; 192 1.1 jld 193 1.1 jld setting = clearing = 0; 194 1.1 jld for (i = 0; i < RF_PARITYMAP_NBYTE; i++) { 195 1.1 jld w0 = pm->disk_now->bits[i]; 196 1.1 jld w = pm->disk_boot->bits[i]; 197 1.1 jld 198 1.1 jld for (j = 0; j < NBBY; j++) 199 1.1 jld if (pm->current->state[i * NBBY + j] != 0) 200 1.1 jld w |= 1 << j; 201 1.1 jld 202 1.1 jld if (w & ~w0) 203 1.1 jld setting = 1; 204 1.1 jld if (w0 & ~w) 205 1.1 jld clearing = 1; 206 1.1 jld 207 1.1 jld pm->disk_now->bits[i] = w; 208 1.1 jld } 209 1.1 jld pm->ctrs.ncachesync += setting + clearing; 210 1.1 jld pm->ctrs.nclearing += clearing; 211 1.1 jld 212 1.1 jld /* 213 1.1 jld * If bits are being set in the parity map, then a sync is 214 1.1 jld * required afterwards, so that the regions are marked dirty 215 1.1 jld * on disk before any writes to them take place. If bits are 216 1.1 jld * being cleared, then a sync is required before the write, so 217 1.1 jld * that any writes to those regions are processed before the 218 1.1 jld * region is marked clean. 
(Synchronization is somewhat 219 1.1 jld * overkill; a write ordering barrier would suffice, but we 220 1.1 jld * currently have no way to express that directly.) 221 1.1 jld */ 222 1.1 jld if (clearing) 223 1.10 christos rf_sync_component_caches(pm->raid, 1); 224 1.1 jld rf_paritymap_kern_write(pm->raid, pm->disk_now); 225 1.1 jld if (setting) 226 1.10 christos rf_sync_component_caches(pm->raid, 1); 227 1.1 jld } 228 1.1 jld 229 1.1 jld /* Mark all parity as being in need of rewrite. */ 230 1.1 jld void 231 1.1 jld rf_paritymap_invalidate(struct rf_paritymap *pm) 232 1.1 jld { 233 1.1 jld mutex_enter(&pm->lock); 234 1.9 christos memset(pm->disk_boot, (unsigned char)~0, sizeof(*pm->disk_boot)); 235 1.1 jld mutex_exit(&pm->lock); 236 1.1 jld } 237 1.1 jld 238 1.1 jld /* Mark all parity as being correct. */ 239 1.1 jld void 240 1.1 jld rf_paritymap_forceclean(struct rf_paritymap *pm) 241 1.1 jld { 242 1.1 jld mutex_enter(&pm->lock); 243 1.9 christos memset(pm->disk_boot, 0, sizeof(*pm->disk_boot)); 244 1.1 jld mutex_exit(&pm->lock); 245 1.1 jld } 246 1.1 jld 247 1.1 jld /* 248 1.1 jld * The cooldown callout routine just defers its work to a thread; it can't do 249 1.1 jld * the parity map write itself as it would block, and although mutex-induced 250 1.1 jld * blocking is permitted it seems wise to avoid tying up the softint. 251 1.1 jld */ 252 1.1 jld static void 253 1.1 jld rf_paritymap_tick(void *arg) 254 1.1 jld { 255 1.1 jld struct rf_paritymap *pm = arg; 256 1.1 jld 257 1.1 jld mutex_enter(&pm->lk_flags); 258 1.1 jld pm->flags |= TICKED; 259 1.1 jld mutex_exit(&pm->lk_flags); 260 1.7 mrg 261 1.8 mrg rf_lock_mutex2(pm->raid->iodone_lock); 262 1.8 mrg rf_signal_cond2(pm->raid->iodone_cv); /* XXX */ 263 1.8 mrg rf_unlock_mutex2(pm->raid->iodone_lock); 264 1.1 jld } 265 1.1 jld 266 1.1 jld /* 267 1.1 jld * This is where the parity cooling work (and rearming the callout if needed) 268 1.1 jld * is done; the raidio thread calls it when woken up, as by the above. 
269 1.1 jld */ 270 1.1 jld void 271 1.1 jld rf_paritymap_checkwork(struct rf_paritymap *pm) 272 1.1 jld { 273 1.1 jld int i, zerop, progressp; 274 1.1 jld 275 1.1 jld mutex_enter(&pm->lk_flags); 276 1.1 jld if (pm->flags & TICKED) { 277 1.1 jld zerop = progressp = 0; 278 1.1 jld 279 1.1 jld pm->flags &= ~TICKED; 280 1.1 jld mutex_exit(&pm->lk_flags); 281 1.1 jld 282 1.1 jld mutex_enter(&pm->lock); 283 1.1 jld for (i = 0; i < RF_PARITYMAP_NREG; i++) { 284 1.1 jld if (pm->current->state[i] < 0) { 285 1.1 jld progressp = 1; 286 1.1 jld pm->current->state[i]++; 287 1.1 jld if (pm->current->state[i] == 0) 288 1.1 jld zerop = 1; 289 1.1 jld } 290 1.1 jld } 291 1.1 jld 292 1.1 jld if (progressp) 293 1.1 jld callout_schedule(&pm->ticker, 294 1.1 jld mstohz(pm->params.tickms)); 295 1.1 jld else { 296 1.1 jld mutex_enter(&pm->lk_flags); 297 1.1 jld pm->flags &= ~TICKING; 298 1.1 jld mutex_exit(&pm->lk_flags); 299 1.1 jld } 300 1.1 jld 301 1.1 jld if (zerop) 302 1.1 jld rf_paritymap_write_locked(pm); 303 1.1 jld mutex_exit(&pm->lock); 304 1.1 jld } else 305 1.1 jld mutex_exit(&pm->lk_flags); 306 1.1 jld } 307 1.1 jld 308 1.1 jld /* 309 1.1 jld * Set parity map parameters; used both to alter parameters on the fly and to 310 1.1 jld * establish their initial values. Note that setting a parameter to 0 means 311 1.1 jld * to leave the previous setting unchanged, and that if this is done for the 312 1.1 jld * initial setting of "regions", then a default value will be computed based 313 1.1 jld * on the RAID component size. 314 1.1 jld */ 315 1.1 jld int 316 1.1 jld rf_paritymap_set_params(struct rf_paritymap *pm, 317 1.1 jld const struct rf_pmparams *params, int todisk) 318 1.1 jld { 319 1.1 jld int cooldown, tickms; 320 1.1 jld u_int regions; 321 1.1 jld RF_RowCol_t col; 322 1.1 jld RF_ComponentLabel_t *clabel; 323 1.1 jld RF_Raid_t *raidPtr; 324 1.1 jld 325 1.1 jld cooldown = params->cooldown != 0 326 1.1 jld ? 
params->cooldown : pm->params.cooldown; 327 1.1 jld tickms = params->tickms != 0 328 1.1 jld ? params->tickms : pm->params.tickms; 329 1.1 jld regions = params->regions != 0 330 1.1 jld ? params->regions : pm->params.regions; 331 1.1 jld 332 1.1 jld if (cooldown < 1 || cooldown > 128) { 333 1.1 jld printf("raid%d: cooldown %d out of range\n", pm->raid->raidid, 334 1.1 jld cooldown); 335 1.1 jld return (-1); 336 1.1 jld } 337 1.1 jld if (tickms < 10) { 338 1.1 jld printf("raid%d: tick time %dms out of range\n", 339 1.1 jld pm->raid->raidid, tickms); 340 1.1 jld return (-1); 341 1.1 jld } 342 1.1 jld if (regions == 0) { 343 1.1 jld regions = rf_paritymap_nreg(pm->raid); 344 1.1 jld } else if (regions > RF_PARITYMAP_NREG) { 345 1.1 jld printf("raid%d: region count %u too large (more than %u)\n", 346 1.1 jld pm->raid->raidid, regions, RF_PARITYMAP_NREG); 347 1.1 jld return (-1); 348 1.1 jld } 349 1.1 jld 350 1.1 jld /* XXX any currently warm parity will be used with the new tickms! */ 351 1.1 jld pm->params.cooldown = cooldown; 352 1.1 jld pm->params.tickms = tickms; 353 1.1 jld /* Apply the initial region count, but do not change it after that. 
*/ 354 1.1 jld if (pm->params.regions == 0) 355 1.1 jld pm->params.regions = regions; 356 1.1 jld 357 1.1 jld /* So that the newly set parameters can be tested: */ 358 1.1 jld pm->ctrs.nwrite = pm->ctrs.ncachesync = pm->ctrs.nclearing = 0; 359 1.1 jld 360 1.1 jld if (todisk) { 361 1.1 jld raidPtr = pm->raid; 362 1.1 jld for (col = 0; col < raidPtr->numCol; col++) { 363 1.4 oster if (RF_DEAD_DISK(raidPtr->Disks[col].status)) 364 1.4 oster continue; 365 1.4 oster 366 1.1 jld clabel = raidget_component_label(raidPtr, col); 367 1.1 jld clabel->parity_map_ntick = cooldown; 368 1.1 jld clabel->parity_map_tickms = tickms; 369 1.1 jld clabel->parity_map_regions = regions; 370 1.4 oster 371 1.4 oster /* Don't touch the disk if it's been spared */ 372 1.4 oster if (clabel->status == rf_ds_spared) 373 1.4 oster continue; 374 1.4 oster 375 1.1 jld raidflush_component_label(raidPtr, col); 376 1.1 jld } 377 1.4 oster 378 1.4 oster /* handle the spares too... */ 379 1.4 oster for (col = 0; col < raidPtr->numSpare; col++) { 380 1.4 oster if (raidPtr->Disks[raidPtr->numCol+col].status == rf_ds_used_spare) { 381 1.4 oster clabel = raidget_component_label(raidPtr, raidPtr->numCol+col); 382 1.4 oster clabel->parity_map_ntick = cooldown; 383 1.4 oster clabel->parity_map_tickms = tickms; 384 1.4 oster clabel->parity_map_regions = regions; 385 1.4 oster raidflush_component_label(raidPtr, raidPtr->numCol+col); 386 1.4 oster } 387 1.4 oster } 388 1.1 jld } 389 1.1 jld return 0; 390 1.1 jld } 391 1.1 jld 392 1.1 jld /* 393 1.1 jld * The number of regions may not be as many as can fit into the map, because 394 1.1 jld * when regions are too small, the overhead of setting parity map bits 395 1.1 jld * becomes significant in comparison to the actual I/O, while the 396 1.1 jld * corresponding gains in parity verification time become negligible. Thus, 397 1.1 jld * a minimum region size (defined above) is imposed. 
398 1.1 jld * 399 1.1 jld * Note that, if the number of regions is less than the maximum, then some of 400 1.1 jld * the regions will be "fictional", corresponding to no actual disk; some 401 1.1 jld * parts of the code may process them as normal, but they can not ever be 402 1.1 jld * written to. 403 1.1 jld */ 404 1.1 jld static u_int 405 1.1 jld rf_paritymap_nreg(RF_Raid_t *raid) 406 1.1 jld { 407 1.1 jld daddr_t bytes_per_disk, nreg; 408 1.1 jld 409 1.1 jld bytes_per_disk = raid->sectorsPerDisk << raid->logBytesPerSector; 410 1.1 jld nreg = bytes_per_disk / REGION_MINSIZE; 411 1.1 jld if (nreg > RF_PARITYMAP_NREG) 412 1.1 jld nreg = RF_PARITYMAP_NREG; 413 1.6 riz if (nreg < 1) 414 1.6 riz nreg = 1; 415 1.1 jld 416 1.1 jld return (u_int)nreg; 417 1.1 jld } 418 1.1 jld 419 1.1 jld /* 420 1.1 jld * Initialize a parity map given specific parameters. This neither reads nor 421 1.1 jld * writes the parity map config in the component labels; for that, see below. 422 1.1 jld */ 423 1.1 jld int 424 1.1 jld rf_paritymap_init(struct rf_paritymap *pm, RF_Raid_t *raid, 425 1.1 jld const struct rf_pmparams *params) 426 1.1 jld { 427 1.1 jld daddr_t rstripes; 428 1.1 jld struct rf_pmparams safe; 429 1.1 jld 430 1.1 jld pm->raid = raid; 431 1.1 jld pm->params.regions = 0; 432 1.1 jld if (0 != rf_paritymap_set_params(pm, params, 0)) { 433 1.1 jld /* 434 1.1 jld * If the parameters are out-of-range, then bring the 435 1.1 jld * parity map up with something reasonable, so that 436 1.1 jld * the admin can at least go and fix it (or ignore it 437 1.1 jld * entirely). 
438 1.1 jld */ 439 1.1 jld safe.cooldown = DFL_COOLDOWN; 440 1.1 jld safe.tickms = DFL_TICKMS; 441 1.1 jld safe.regions = 0; 442 1.1 jld 443 1.1 jld if (0 != rf_paritymap_set_params(pm, &safe, 0)) 444 1.1 jld return (-1); 445 1.1 jld } 446 1.1 jld 447 1.1 jld rstripes = howmany(raid->Layout.numStripe, pm->params.regions); 448 1.1 jld pm->region_size = rstripes * raid->Layout.dataSectorsPerStripe; 449 1.1 jld 450 1.1 jld callout_init(&pm->ticker, CALLOUT_MPSAFE); 451 1.1 jld callout_setfunc(&pm->ticker, rf_paritymap_tick, pm); 452 1.1 jld pm->flags = 0; 453 1.1 jld 454 1.1 jld pm->disk_boot = kmem_alloc(sizeof(struct rf_paritymap_ondisk), 455 1.1 jld KM_SLEEP); 456 1.1 jld pm->disk_now = kmem_alloc(sizeof(struct rf_paritymap_ondisk), 457 1.1 jld KM_SLEEP); 458 1.1 jld pm->current = kmem_zalloc(sizeof(struct rf_paritymap_current), 459 1.1 jld KM_SLEEP); 460 1.1 jld 461 1.1 jld rf_paritymap_kern_read(pm->raid, pm->disk_boot); 462 1.1 jld memcpy(pm->disk_now, pm->disk_boot, sizeof(*pm->disk_now)); 463 1.1 jld 464 1.1 jld mutex_init(&pm->lock, MUTEX_DEFAULT, IPL_NONE); 465 1.1 jld mutex_init(&pm->lk_flags, MUTEX_DEFAULT, IPL_SOFTCLOCK); 466 1.1 jld 467 1.1 jld return 0; 468 1.1 jld } 469 1.1 jld 470 1.1 jld /* 471 1.1 jld * Destroys a parity map; unless "force" is set, also cleans parity for any 472 1.1 jld * regions which were still in cooldown (but are not dirty on disk). 473 1.1 jld */ 474 1.1 jld void 475 1.1 jld rf_paritymap_destroy(struct rf_paritymap *pm, int force) 476 1.1 jld { 477 1.1 jld int i; 478 1.1 jld 479 1.1 jld callout_halt(&pm->ticker, NULL); /* XXX stop? halt? */ 480 1.1 jld callout_destroy(&pm->ticker); 481 1.1 jld 482 1.1 jld if (!force) { 483 1.1 jld for (i = 0; i < RF_PARITYMAP_NREG; i++) { 484 1.1 jld /* XXX check for > 0 ? 
*/ 485 1.1 jld if (pm->current->state[i] < 0) 486 1.1 jld pm->current->state[i] = 0; 487 1.1 jld } 488 1.1 jld 489 1.1 jld rf_paritymap_write_locked(pm); 490 1.1 jld } 491 1.1 jld 492 1.1 jld mutex_destroy(&pm->lock); 493 1.1 jld mutex_destroy(&pm->lk_flags); 494 1.1 jld 495 1.1 jld kmem_free(pm->disk_boot, sizeof(struct rf_paritymap_ondisk)); 496 1.1 jld kmem_free(pm->disk_now, sizeof(struct rf_paritymap_ondisk)); 497 1.1 jld kmem_free(pm->current, sizeof(struct rf_paritymap_current)); 498 1.1 jld } 499 1.1 jld 500 1.1 jld /* 501 1.1 jld * Rewrite parity, taking parity map into account; this is the equivalent of 502 1.1 jld * the old rf_RewriteParity, and is likewise to be called from a suitable 503 1.1 jld * thread and shouldn't have multiple copies running in parallel and so on. 504 1.1 jld * 505 1.1 jld * Note that the fictional regions are "cleaned" in one shot, so that very 506 1.1 jld * small RAIDs (useful for testing) will not experience potentially severe 507 1.1 jld * regressions in rewrite time. 508 1.1 jld */ 509 1.1 jld int 510 1.1 jld rf_paritymap_rewrite(struct rf_paritymap *pm) 511 1.1 jld { 512 1.1 jld int i, ret_val = 0; 513 1.1 jld daddr_t reg_b, reg_e; 514 1.1 jld 515 1.1 jld /* Process only the actual regions. 
*/ 516 1.1 jld for (i = 0; i < pm->params.regions; i++) { 517 1.1 jld mutex_enter(&pm->lock); 518 1.1 jld if (isset(pm->disk_boot->bits, i)) { 519 1.1 jld mutex_exit(&pm->lock); 520 1.1 jld 521 1.1 jld reg_b = i * pm->region_size; 522 1.1 jld reg_e = reg_b + pm->region_size; 523 1.1 jld if (reg_e > pm->raid->totalSectors) 524 1.1 jld reg_e = pm->raid->totalSectors; 525 1.1 jld 526 1.1 jld if (rf_RewriteParityRange(pm->raid, reg_b, 527 1.1 jld reg_e - reg_b)) { 528 1.1 jld ret_val = 1; 529 1.1 jld if (pm->raid->waitShutdown) 530 1.1 jld return ret_val; 531 1.1 jld } else { 532 1.1 jld mutex_enter(&pm->lock); 533 1.1 jld clrbit(pm->disk_boot->bits, i); 534 1.1 jld rf_paritymap_write_locked(pm); 535 1.1 jld mutex_exit(&pm->lock); 536 1.1 jld } 537 1.1 jld } else { 538 1.1 jld mutex_exit(&pm->lock); 539 1.1 jld } 540 1.1 jld } 541 1.1 jld 542 1.1 jld /* Now, clear the fictional regions, if any. */ 543 1.1 jld rf_paritymap_forceclean(pm); 544 1.1 jld rf_paritymap_write(pm); 545 1.1 jld 546 1.1 jld return ret_val; 547 1.1 jld } 548 1.1 jld 549 1.1 jld /* 550 1.1 jld * How to merge the on-disk parity maps when reading them in from the 551 1.1 jld * various components; returns whether they differ. In the case that 552 1.1 jld * they do differ, sets *dst to the union of *dst and *src. 553 1.1 jld * 554 1.1 jld * In theory, it should be safe to take the intersection (or just pick 555 1.1 jld * a single component arbitrarily), but the paranoid approach costs 556 1.1 jld * little. 557 1.1 jld * 558 1.1 jld * Appropriate locking, if any, is the responsibility of the caller. 
559 1.1 jld */ 560 1.1 jld int 561 1.1 jld rf_paritymap_merge(struct rf_paritymap_ondisk *dst, 562 1.1 jld struct rf_paritymap_ondisk *src) 563 1.1 jld { 564 1.1 jld int i, discrep = 0; 565 1.1 jld 566 1.1 jld for (i = 0; i < RF_PARITYMAP_NBYTE; i++) { 567 1.1 jld if (dst->bits[i] != src->bits[i]) 568 1.1 jld discrep = 1; 569 1.1 jld dst->bits[i] |= src->bits[i]; 570 1.1 jld } 571 1.1 jld 572 1.1 jld return discrep; 573 1.1 jld } 574 1.1 jld 575 1.1 jld /* 576 1.1 jld * Detach a parity map from its RAID. This is not meant to be applied except 577 1.1 jld * when unconfiguring the RAID after all I/O has been resolved, as otherwise 578 1.1 jld * an out-of-date parity map could be treated as current. 579 1.1 jld */ 580 1.1 jld void 581 1.1 jld rf_paritymap_detach(RF_Raid_t *raidPtr) 582 1.1 jld { 583 1.1 jld if (raidPtr->parity_map == NULL) 584 1.1 jld return; 585 1.1 jld 586 1.8 mrg rf_lock_mutex2(raidPtr->iodone_lock); 587 1.1 jld struct rf_paritymap *pm = raidPtr->parity_map; 588 1.1 jld raidPtr->parity_map = NULL; 589 1.8 mrg rf_unlock_mutex2(raidPtr->iodone_lock); 590 1.1 jld /* XXXjld is that enough locking? Or too much? */ 591 1.1 jld rf_paritymap_destroy(pm, 0); 592 1.1 jld kmem_free(pm, sizeof(*pm)); 593 1.1 jld } 594 1.1 jld 595 1.1 jld /* 596 1.5 jld * Is this RAID set ineligible for parity-map use due to not actually 597 1.5 jld * having any parity? (If so, rf_paritymap_attach is a no-op, but 598 1.5 jld * rf_paritymap_{get,set}_disable will still pointlessly act on the 599 1.5 jld * component labels.) 600 1.5 jld */ 601 1.5 jld int 602 1.5 jld rf_paritymap_ineligible(RF_Raid_t *raidPtr) 603 1.5 jld { 604 1.5 jld return raidPtr->Layout.map->faultsTolerated == 0; 605 1.5 jld } 606 1.5 jld 607 1.5 jld /* 608 1.1 jld * Attach a parity map to a RAID set if appropriate. Includes 609 1.1 jld * configure-time processing of parity-map fields of component label. 
610 1.1 jld */ 611 1.1 jld void 612 1.1 jld rf_paritymap_attach(RF_Raid_t *raidPtr, int force) 613 1.1 jld { 614 1.1 jld RF_RowCol_t col; 615 1.1 jld int pm_use, pm_zap; 616 1.1 jld int g_tickms, g_ntick, g_regions; 617 1.1 jld int good; 618 1.1 jld RF_ComponentLabel_t *clabel; 619 1.1 jld u_int flags, regions; 620 1.1 jld struct rf_pmparams params; 621 1.1 jld 622 1.5 jld if (rf_paritymap_ineligible(raidPtr)) { 623 1.1 jld /* There isn't any parity. */ 624 1.1 jld return; 625 1.1 jld } 626 1.1 jld 627 1.1 jld pm_use = 1; 628 1.1 jld pm_zap = 0; 629 1.1 jld g_tickms = DFL_TICKMS; 630 1.1 jld g_ntick = DFL_COOLDOWN; 631 1.1 jld g_regions = 0; 632 1.1 jld 633 1.1 jld /* 634 1.1 jld * Collect opinions on the set config. If this is the initial 635 1.1 jld * config (raidctl -C), treat all labels as invalid, since 636 1.1 jld * there may be random data present. 637 1.1 jld */ 638 1.1 jld if (!force) { 639 1.1 jld for (col = 0; col < raidPtr->numCol; col++) { 640 1.4 oster if (RF_DEAD_DISK(raidPtr->Disks[col].status)) 641 1.4 oster continue; 642 1.1 jld clabel = raidget_component_label(raidPtr, col); 643 1.1 jld flags = clabel->parity_map_flags; 644 1.1 jld /* Check for use by non-parity-map kernel. */ 645 1.1 jld if (clabel->parity_map_modcount 646 1.1 jld != clabel->mod_counter) { 647 1.1 jld flags &= ~RF_PMLABEL_WASUSED; 648 1.1 jld } 649 1.1 jld 650 1.1 jld if (flags & RF_PMLABEL_VALID) { 651 1.1 jld g_tickms = clabel->parity_map_tickms; 652 1.1 jld g_ntick = clabel->parity_map_ntick; 653 1.1 jld regions = clabel->parity_map_regions; 654 1.1 jld if (g_regions == 0) 655 1.1 jld g_regions = regions; 656 1.1 jld else if (g_regions != regions) { 657 1.1 jld pm_zap = 1; /* important! 
*/ 658 1.1 jld } 659 1.1 jld 660 1.1 jld if (flags & RF_PMLABEL_DISABLE) { 661 1.1 jld pm_use = 0; 662 1.1 jld } 663 1.1 jld if (!(flags & RF_PMLABEL_WASUSED)) { 664 1.1 jld pm_zap = 1; 665 1.1 jld } 666 1.1 jld } else { 667 1.1 jld pm_zap = 1; 668 1.1 jld } 669 1.1 jld } 670 1.1 jld } else { 671 1.1 jld pm_zap = 1; 672 1.1 jld } 673 1.1 jld 674 1.1 jld /* Finally, create and attach the parity map. */ 675 1.1 jld if (pm_use) { 676 1.1 jld params.cooldown = g_ntick; 677 1.1 jld params.tickms = g_tickms; 678 1.1 jld params.regions = g_regions; 679 1.1 jld 680 1.1 jld raidPtr->parity_map = kmem_alloc(sizeof(struct rf_paritymap), 681 1.1 jld KM_SLEEP); 682 1.1 jld if (0 != rf_paritymap_init(raidPtr->parity_map, raidPtr, 683 1.1 jld ¶ms)) { 684 1.1 jld /* It failed; do without. */ 685 1.1 jld kmem_free(raidPtr->parity_map, 686 1.1 jld sizeof(struct rf_paritymap)); 687 1.1 jld raidPtr->parity_map = NULL; 688 1.1 jld return; 689 1.1 jld } 690 1.1 jld 691 1.1 jld if (g_regions == 0) 692 1.1 jld /* Pick up the autoconfigured region count. */ 693 1.1 jld g_regions = raidPtr->parity_map->params.regions; 694 1.1 jld 695 1.1 jld if (pm_zap) { 696 1.1 jld good = raidPtr->parity_good && !force; 697 1.1 jld 698 1.1 jld if (good) 699 1.1 jld rf_paritymap_forceclean(raidPtr->parity_map); 700 1.1 jld else 701 1.1 jld rf_paritymap_invalidate(raidPtr->parity_map); 702 1.1 jld /* This needs to be on disk before WASUSED is set. */ 703 1.1 jld rf_paritymap_write(raidPtr->parity_map); 704 1.1 jld } 705 1.1 jld } 706 1.1 jld 707 1.1 jld /* Alter labels in-core to reflect the current view of things. 
*/ 708 1.1 jld for (col = 0; col < raidPtr->numCol; col++) { 709 1.4 oster if (RF_DEAD_DISK(raidPtr->Disks[col].status)) 710 1.4 oster continue; 711 1.1 jld clabel = raidget_component_label(raidPtr, col); 712 1.1 jld 713 1.1 jld if (pm_use) 714 1.1 jld flags = RF_PMLABEL_VALID | RF_PMLABEL_WASUSED; 715 1.1 jld else 716 1.1 jld flags = RF_PMLABEL_VALID | RF_PMLABEL_DISABLE; 717 1.1 jld 718 1.1 jld clabel->parity_map_flags = flags; 719 1.1 jld clabel->parity_map_tickms = g_tickms; 720 1.1 jld clabel->parity_map_ntick = g_ntick; 721 1.1 jld clabel->parity_map_regions = g_regions; 722 1.1 jld raidflush_component_label(raidPtr, col); 723 1.1 jld } 724 1.4 oster /* Note that we're just in 'attach' here, and there won't 725 1.4 oster be any spare disks at this point. */ 726 1.1 jld } 727 1.1 jld 728 1.1 jld /* 729 1.1 jld * For initializing the parity-map fields of a component label, both on 730 1.11 oster * initial creation and on reconstruct. */ 731 1.1 jld void 732 1.1 jld rf_paritymap_init_label(struct rf_paritymap *pm, RF_ComponentLabel_t *clabel) 733 1.1 jld { 734 1.1 jld if (pm != NULL) { 735 1.1 jld clabel->parity_map_flags = 736 1.1 jld RF_PMLABEL_VALID | RF_PMLABEL_WASUSED; 737 1.1 jld clabel->parity_map_tickms = pm->params.tickms; 738 1.1 jld clabel->parity_map_ntick = pm->params.cooldown; 739 1.1 jld /* 740 1.1 jld * XXXjld: If the number of regions is changed on disk, and 741 1.1 jld * then a new component is labeled before the next configure, 742 1.1 jld * then it will get the old value and they will conflict on 743 1.1 jld * the next boot (and the default will be used instead). 744 1.1 jld */ 745 1.1 jld clabel->parity_map_regions = pm->params.regions; 746 1.1 jld } else { 747 1.1 jld /* 748 1.1 jld * XXXjld: if the map is disabled, and all the components are 749 1.1 jld * replaced without an intervening unconfigure/reconfigure, 750 1.1 jld * then it will become enabled on the next unconfig/reconfig. 
751 1.1 jld */ 752 1.1 jld } 753 1.1 jld } 754 1.1 jld 755 1.1 jld 756 1.1 jld /* Will the parity map be disabled next time? */ 757 1.1 jld int 758 1.1 jld rf_paritymap_get_disable(RF_Raid_t *raidPtr) 759 1.1 jld { 760 1.1 jld RF_ComponentLabel_t *clabel; 761 1.1 jld RF_RowCol_t col; 762 1.1 jld int dis; 763 1.1 jld 764 1.1 jld dis = 0; 765 1.1 jld for (col = 0; col < raidPtr->numCol; col++) { 766 1.4 oster if (RF_DEAD_DISK(raidPtr->Disks[col].status)) 767 1.4 oster continue; 768 1.1 jld clabel = raidget_component_label(raidPtr, col); 769 1.1 jld if (clabel->parity_map_flags & RF_PMLABEL_DISABLE) 770 1.1 jld dis = 1; 771 1.1 jld } 772 1.4 oster for (col = 0; col < raidPtr->numSpare; col++) { 773 1.4 oster if (raidPtr->Disks[raidPtr->numCol+col].status != rf_ds_used_spare) 774 1.4 oster continue; 775 1.4 oster clabel = raidget_component_label(raidPtr, raidPtr->numCol+col); 776 1.4 oster if (clabel->parity_map_flags & RF_PMLABEL_DISABLE) 777 1.4 oster dis = 1; 778 1.4 oster } 779 1.1 jld 780 1.1 jld return dis; 781 1.1 jld } 782 1.1 jld 783 1.1 jld /* Set whether the parity map will be disabled next time. 
*/
void
rf_paritymap_set_disable(RF_Raid_t *raidPtr, int dis)
{
	RF_ComponentLabel_t *label;
	RF_RowCol_t c, sparecol;

	/* Regular columns: update and flush every live component's label. */
	for (c = 0; c < raidPtr->numCol; c++) {
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;

		label = raidget_component_label(raidPtr, c);
		if (!dis)
			label->parity_map_flags &= ~RF_PMLABEL_DISABLE;
		else
			label->parity_map_flags |= RF_PMLABEL_DISABLE;
		raidflush_component_label(raidPtr, c);
	}

	/* update any used spares as well */
	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status != rf_ds_used_spare)
			continue;

		label = raidget_component_label(raidPtr, sparecol);
		if (!dis)
			label->parity_map_flags &= ~RF_PMLABEL_DISABLE;
		else
			label->parity_map_flags |= RF_PMLABEL_DISABLE;
		raidflush_component_label(raidPtr, sparecol);
	}
}