Home | History | Annotate | Line # | Download | only in raidframe
rf_paritymap.c revision 1.3.4.1
      1  1.3.4.1  uebayasi /* $NetBSD: rf_paritymap.c,v 1.3.4.1 2010/04/30 14:43:47 uebayasi Exp $ */
      2      1.1       jld 
      3      1.1       jld /*-
      4      1.1       jld  * Copyright (c) 2009 Jed Davis.
      5      1.1       jld  * All rights reserved.
      6      1.1       jld  *
      7      1.1       jld  * Redistribution and use in source and binary forms, with or without
      8      1.1       jld  * modification, are permitted provided that the following conditions
      9      1.1       jld  * are met:
     10      1.1       jld  * 1. Redistributions of source code must retain the above copyright
     11      1.1       jld  *    notice, this list of conditions and the following disclaimer.
     12      1.1       jld  * 2. Redistributions in binary form must reproduce the above copyright
     13      1.1       jld  *    notice, this list of conditions and the following disclaimer in the
     14      1.1       jld  *    documentation and/or other materials provided with the distribution.
     15      1.1       jld  *
     16      1.1       jld  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17      1.1       jld  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18      1.1       jld  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19      1.1       jld  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20      1.1       jld  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21      1.1       jld  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22      1.1       jld  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23      1.1       jld  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24      1.1       jld  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25      1.1       jld  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26      1.1       jld  * POSSIBILITY OF SUCH DAMAGE.
     27      1.1       jld  */
     28      1.1       jld 
     29      1.1       jld #include <sys/cdefs.h>
     30  1.3.4.1  uebayasi __KERNEL_RCSID(0, "$NetBSD: rf_paritymap.c,v 1.3.4.1 2010/04/30 14:43:47 uebayasi Exp $");
     31      1.1       jld 
     32      1.3     pooka #include <sys/param.h>
     33      1.1       jld #include <sys/callout.h>
     34      1.1       jld #include <sys/kmem.h>
     35      1.1       jld #include <sys/mutex.h>
     36      1.1       jld #include <sys/rwlock.h>
     37      1.1       jld #include <sys/systm.h>
     38      1.1       jld #include <sys/types.h>
     39      1.1       jld 
     40      1.1       jld #include <dev/raidframe/rf_paritymap.h>
     41      1.1       jld #include <dev/raidframe/rf_stripelocks.h>
     42      1.1       jld #include <dev/raidframe/rf_layout.h>
     43      1.1       jld #include <dev/raidframe/rf_raid.h>
     44      1.1       jld #include <dev/raidframe/rf_parityscan.h>
     45      1.1       jld #include <dev/raidframe/rf_kintf.h>
     46      1.1       jld 
     47      1.1       jld /* Important parameters: */
     48      1.1       jld #define REGION_MINSIZE (25ULL << 20)
     49      1.1       jld #define DFL_TICKMS      40000
     50      1.1       jld #define DFL_COOLDOWN    8     /* 7-8 intervals of 40s = 5min +/- 20s */
     51      1.1       jld 
     52      1.1       jld /* Internal-use flag bits. */
     53      1.1       jld #define TICKING 1
     54      1.1       jld #define TICKED 2
     55      1.1       jld 
     /*
      * Prototypes for the file-local (static) helpers defined further down:
      * the lock-held variant of the map writer, the callout handler, and the
      * default region-count computation.
      */
     static void rf_paritymap_write_locked(struct rf_paritymap *);
     static void rf_paritymap_tick(void *);
     static u_int rf_paritymap_nreg(RF_Raid_t *);
     60      1.1       jld 
     61      1.1       jld /* Extract the current status of the parity map. */
     62      1.1       jld void
     63      1.1       jld rf_paritymap_status(struct rf_paritymap *pm, struct rf_pmstat *ps)
     64      1.1       jld {
     65      1.1       jld 	memset(ps, 0, sizeof(*ps));
     66      1.1       jld 	if (pm == NULL)
     67      1.1       jld 		ps->enabled = 0;
     68      1.1       jld 	else {
     69      1.1       jld 		ps->enabled = 1;
     70      1.1       jld 		ps->region_size = pm->region_size;
     71      1.1       jld 		mutex_enter(&pm->lock);
     72      1.1       jld 		memcpy(&ps->params, &pm->params, sizeof(ps->params));
     73      1.1       jld 		memcpy(ps->dirty, pm->disk_now, sizeof(ps->dirty));
     74      1.1       jld 		memcpy(&ps->ctrs, &pm->ctrs, sizeof(ps->ctrs));
     75      1.1       jld 		mutex_exit(&pm->lock);
     76      1.1       jld 	}
     77      1.1       jld }
     78      1.1       jld 
     79      1.1       jld /*
     80      1.1       jld  * Test whether parity in a given sector is suspected of being inconsistent
     81      1.1       jld  * on disk (assuming that any pending I/O to it is allowed to complete).
     82      1.1       jld  * This may be of interest to future work on parity scrubbing.
     83      1.1       jld  */
     84      1.1       jld int
     85      1.1       jld rf_paritymap_test(struct rf_paritymap *pm, daddr_t sector)
     86      1.1       jld {
     87      1.1       jld 	unsigned region = sector / pm->region_size;
     88      1.1       jld 	int retval;
     89      1.1       jld 
     90      1.1       jld 	mutex_enter(&pm->lock);
     91      1.1       jld 	retval = isset(pm->disk_boot->bits, region) ? 1 : 0;
     92      1.1       jld 	mutex_exit(&pm->lock);
     93      1.1       jld 	return retval;
     94      1.1       jld }
     95      1.1       jld 
     96      1.1       jld /* To be called before a write to the RAID is submitted. */
     97      1.1       jld void
     98      1.1       jld rf_paritymap_begin(struct rf_paritymap *pm, daddr_t offset, daddr_t size)
     99      1.1       jld {
    100      1.1       jld 	unsigned i, b, e;
    101      1.1       jld 
    102      1.1       jld 	b = offset / pm->region_size;
    103      1.1       jld 	e = (offset + size - 1) / pm->region_size;
    104      1.1       jld 
    105      1.1       jld 	for (i = b; i <= e; i++)
    106      1.1       jld 		rf_paritymap_begin_region(pm, i);
    107      1.1       jld }
    108      1.1       jld 
    109      1.1       jld /* To be called after a write to the RAID completes. */
    110      1.1       jld void
    111      1.1       jld rf_paritymap_end(struct rf_paritymap *pm, daddr_t offset, daddr_t size)
    112      1.1       jld {
    113      1.1       jld 	unsigned i, b, e;
    114      1.1       jld 
    115      1.1       jld 	b = offset / pm->region_size;
    116      1.1       jld 	e = (offset + size - 1) / pm->region_size;
    117      1.1       jld 
    118      1.1       jld 	for (i = b; i <= e; i++)
    119      1.1       jld 		rf_paritymap_end_region(pm, i);
    120      1.1       jld }
    121      1.1       jld 
    122      1.1       jld void
    123      1.1       jld rf_paritymap_begin_region(struct rf_paritymap *pm, unsigned region)
    124      1.1       jld {
    125      1.1       jld 	int needs_write;
    126      1.1       jld 
    127      1.1       jld 	KASSERT(region < RF_PARITYMAP_NREG);
    128      1.1       jld 	pm->ctrs.nwrite++;
    129      1.1       jld 
    130      1.1       jld 	/* If it was being kept warm, deal with that. */
    131      1.1       jld 	mutex_enter(&pm->lock);
    132      1.1       jld 	if (pm->current->state[region] < 0)
    133      1.1       jld 		pm->current->state[region] = 0;
    134      1.1       jld 
    135      1.1       jld 	/* This shouldn't happen unless RAIDOUTSTANDING is set too high. */
    136      1.1       jld 	KASSERT(pm->current->state[region] < 127);
    137      1.1       jld 	pm->current->state[region]++;
    138      1.1       jld 
    139      1.1       jld 	needs_write = isclr(pm->disk_now->bits, region);
    140      1.1       jld 
    141      1.1       jld 	if (needs_write) {
    142      1.1       jld 		KASSERT(pm->current->state[region] == 1);
    143      1.1       jld 		rf_paritymap_write_locked(pm);
    144      1.1       jld 	}
    145      1.1       jld 
    146      1.1       jld 	mutex_exit(&pm->lock);
    147      1.1       jld }
    148      1.1       jld 
    149      1.1       jld void
    150      1.1       jld rf_paritymap_end_region(struct rf_paritymap *pm, unsigned region)
    151      1.1       jld {
    152      1.1       jld 	KASSERT(region < RF_PARITYMAP_NREG);
    153      1.1       jld 
    154      1.1       jld 	mutex_enter(&pm->lock);
    155      1.1       jld 	KASSERT(pm->current->state[region] > 0);
    156      1.1       jld 	--pm->current->state[region];
    157      1.1       jld 
    158      1.1       jld 	if (pm->current->state[region] <= 0) {
    159      1.1       jld 		pm->current->state[region] = -pm->params.cooldown;
    160      1.1       jld 		KASSERT(pm->current->state[region] <= 0);
    161      1.1       jld 		mutex_enter(&pm->lk_flags);
    162      1.1       jld 		if (!(pm->flags & TICKING)) {
    163      1.1       jld 			pm->flags |= TICKING;
    164      1.1       jld 			mutex_exit(&pm->lk_flags);
    165      1.1       jld 			callout_schedule(&pm->ticker,
    166      1.1       jld 			    mstohz(pm->params.tickms));
    167      1.1       jld 		} else
    168      1.1       jld 			mutex_exit(&pm->lk_flags);
    169      1.1       jld 	}
    170      1.1       jld 	mutex_exit(&pm->lock);
    171      1.1       jld }
    172      1.1       jld 
    /*
     * Updates the parity map to account for any changes in current activity
     * and/or an ongoing parity scan, then writes it to disk with appropriate
     * synchronization.
     *
     * This variant acquires pm->lock itself and releases it before returning;
     * the actual work is done by rf_paritymap_write_locked().
     */
    void
    rf_paritymap_write(struct rf_paritymap *pm)
    {
    	mutex_enter(&pm->lock);
    	rf_paritymap_write_locked(pm);
    	mutex_exit(&pm->lock);
    }
    185      1.1       jld 
    186      1.1       jld /* As above, but to be used when pm->lock is already held. */
    187      1.1       jld static void
    188      1.1       jld rf_paritymap_write_locked(struct rf_paritymap *pm)
    189      1.1       jld {
    190      1.1       jld 	char w, w0;
    191      1.1       jld 	int i, j, setting, clearing;
    192      1.1       jld 
    193      1.1       jld 	setting = clearing = 0;
    194      1.1       jld 	for (i = 0; i < RF_PARITYMAP_NBYTE; i++) {
    195      1.1       jld 		w0 = pm->disk_now->bits[i];
    196      1.1       jld 		w = pm->disk_boot->bits[i];
    197      1.1       jld 
    198      1.1       jld 		for (j = 0; j < NBBY; j++)
    199      1.1       jld 			if (pm->current->state[i * NBBY + j] != 0)
    200      1.1       jld 				w |= 1 << j;
    201      1.1       jld 
    202      1.1       jld 		if (w & ~w0)
    203      1.1       jld 			setting = 1;
    204      1.1       jld 		if (w0 & ~w)
    205      1.1       jld 			clearing = 1;
    206      1.1       jld 
    207      1.1       jld 		pm->disk_now->bits[i] = w;
    208      1.1       jld 	}
    209      1.1       jld 	pm->ctrs.ncachesync += setting + clearing;
    210      1.1       jld 	pm->ctrs.nclearing += clearing;
    211      1.1       jld 
    212      1.1       jld 	/*
    213      1.1       jld 	 * If bits are being set in the parity map, then a sync is
    214      1.1       jld 	 * required afterwards, so that the regions are marked dirty
    215      1.1       jld 	 * on disk before any writes to them take place.  If bits are
    216      1.1       jld 	 * being cleared, then a sync is required before the write, so
    217      1.1       jld 	 * that any writes to those regions are processed before the
    218      1.1       jld 	 * region is marked clean.  (Synchronization is somewhat
    219      1.1       jld 	 * overkill; a write ordering barrier would suffice, but we
    220      1.1       jld 	 * currently have no way to express that directly.)
    221      1.1       jld 	 */
    222      1.1       jld 	if (clearing)
    223      1.1       jld 		rf_sync_component_caches(pm->raid);
    224      1.1       jld 	rf_paritymap_kern_write(pm->raid, pm->disk_now);
    225      1.1       jld 	if (setting)
    226      1.1       jld 		rf_sync_component_caches(pm->raid);
    227      1.1       jld }
    228      1.1       jld 
    229      1.1       jld /* Mark all parity as being in need of rewrite. */
    230      1.1       jld void
    231      1.1       jld rf_paritymap_invalidate(struct rf_paritymap *pm)
    232      1.1       jld {
    233      1.1       jld 	mutex_enter(&pm->lock);
    234      1.1       jld 	memset(pm->disk_boot, ~(unsigned char)0,
    235      1.1       jld 	    sizeof(struct rf_paritymap_ondisk));
    236      1.1       jld 	mutex_exit(&pm->lock);
    237      1.1       jld }
    238      1.1       jld 
    239      1.1       jld /* Mark all parity as being correct. */
    240      1.1       jld void
    241      1.1       jld rf_paritymap_forceclean(struct rf_paritymap *pm)
    242      1.1       jld {
    243      1.1       jld 	mutex_enter(&pm->lock);
    244      1.1       jld 	memset(pm->disk_boot, (unsigned char)0,
    245      1.1       jld 	    sizeof(struct rf_paritymap_ondisk));
    246      1.1       jld 	mutex_exit(&pm->lock);
    247      1.1       jld }
    248      1.1       jld 
    249      1.1       jld /*
    250      1.1       jld  * The cooldown callout routine just defers its work to a thread; it can't do
    251      1.1       jld  * the parity map write itself as it would block, and although mutex-induced
    252      1.1       jld  * blocking is permitted it seems wise to avoid tying up the softint.
    253      1.1       jld  */
    254      1.1       jld static void
    255      1.1       jld rf_paritymap_tick(void *arg)
    256      1.1       jld {
    257      1.1       jld 	struct rf_paritymap *pm = arg;
    258      1.1       jld 
    259      1.1       jld 	mutex_enter(&pm->lk_flags);
    260      1.1       jld 	pm->flags |= TICKED;
    261      1.1       jld 	mutex_exit(&pm->lk_flags);
    262      1.1       jld 	wakeup(&(pm->raid->iodone)); /* XXX */
    263      1.1       jld }
    264      1.1       jld 
    265      1.1       jld /*
    266      1.1       jld  * This is where the parity cooling work (and rearming the callout if needed)
    267      1.1       jld  * is done; the raidio thread calls it when woken up, as by the above.
    268      1.1       jld  */
    269      1.1       jld void
    270      1.1       jld rf_paritymap_checkwork(struct rf_paritymap *pm)
    271      1.1       jld {
    272      1.1       jld 	int i, zerop, progressp;
    273      1.1       jld 
    274      1.1       jld 	mutex_enter(&pm->lk_flags);
    275      1.1       jld 	if (pm->flags & TICKED) {
    276      1.1       jld 		zerop = progressp = 0;
    277      1.1       jld 
    278      1.1       jld 		pm->flags &= ~TICKED;
    279      1.1       jld 		mutex_exit(&pm->lk_flags);
    280      1.1       jld 
    281      1.1       jld 		mutex_enter(&pm->lock);
    282      1.1       jld 		for (i = 0; i < RF_PARITYMAP_NREG; i++) {
    283      1.1       jld 			if (pm->current->state[i] < 0) {
    284      1.1       jld 				progressp = 1;
    285      1.1       jld 				pm->current->state[i]++;
    286      1.1       jld 				if (pm->current->state[i] == 0)
    287      1.1       jld 					zerop = 1;
    288      1.1       jld 			}
    289      1.1       jld 		}
    290      1.1       jld 
    291      1.1       jld 		if (progressp)
    292      1.1       jld 			callout_schedule(&pm->ticker,
    293      1.1       jld 			    mstohz(pm->params.tickms));
    294      1.1       jld 		else {
    295      1.1       jld 			mutex_enter(&pm->lk_flags);
    296      1.1       jld 			pm->flags &= ~TICKING;
    297      1.1       jld 			mutex_exit(&pm->lk_flags);
    298      1.1       jld 		}
    299      1.1       jld 
    300      1.1       jld 		if (zerop)
    301      1.1       jld 			rf_paritymap_write_locked(pm);
    302      1.1       jld 		mutex_exit(&pm->lock);
    303      1.1       jld 	} else
    304      1.1       jld 		mutex_exit(&pm->lk_flags);
    305      1.1       jld }
    306      1.1       jld 
    307      1.1       jld /*
    308      1.1       jld  * Set parity map parameters; used both to alter parameters on the fly and to
    309      1.1       jld  * establish their initial values.  Note that setting a parameter to 0 means
    310      1.1       jld  * to leave the previous setting unchanged, and that if this is done for the
    311      1.1       jld  * initial setting of "regions", then a default value will be computed based
    312      1.1       jld  * on the RAID component size.
    313      1.1       jld  */
    314      1.1       jld int
    315      1.1       jld rf_paritymap_set_params(struct rf_paritymap *pm,
    316      1.1       jld     const struct rf_pmparams *params, int todisk)
    317      1.1       jld {
    318      1.1       jld 	int cooldown, tickms;
    319      1.1       jld 	u_int regions;
    320      1.1       jld 	RF_RowCol_t col;
    321      1.1       jld 	RF_ComponentLabel_t *clabel;
    322      1.1       jld 	RF_Raid_t *raidPtr;
    323      1.1       jld 
    324      1.1       jld 	cooldown = params->cooldown != 0
    325      1.1       jld 	    ? params->cooldown : pm->params.cooldown;
    326      1.1       jld 	tickms = params->tickms != 0
    327      1.1       jld 	    ? params->tickms : pm->params.tickms;
    328      1.1       jld 	regions = params->regions != 0
    329      1.1       jld 	    ? params->regions : pm->params.regions;
    330      1.1       jld 
    331      1.1       jld 	if (cooldown < 1 || cooldown > 128) {
    332      1.1       jld 		printf("raid%d: cooldown %d out of range\n", pm->raid->raidid,
    333      1.1       jld 		    cooldown);
    334      1.1       jld 		return (-1);
    335      1.1       jld 	}
    336      1.1       jld 	if (tickms < 10) {
    337      1.1       jld 		printf("raid%d: tick time %dms out of range\n",
    338      1.1       jld 		    pm->raid->raidid, tickms);
    339      1.1       jld 		return (-1);
    340      1.1       jld 	}
    341      1.1       jld 	if (regions == 0) {
    342      1.1       jld 		regions = rf_paritymap_nreg(pm->raid);
    343      1.1       jld 	} else if (regions > RF_PARITYMAP_NREG) {
    344      1.1       jld 		printf("raid%d: region count %u too large (more than %u)\n",
    345      1.1       jld 		    pm->raid->raidid, regions, RF_PARITYMAP_NREG);
    346      1.1       jld 		return (-1);
    347      1.1       jld 	}
    348      1.1       jld 
    349      1.1       jld 	/* XXX any currently warm parity will be used with the new tickms! */
    350      1.1       jld 	pm->params.cooldown = cooldown;
    351      1.1       jld 	pm->params.tickms = tickms;
    352      1.1       jld 	/* Apply the initial region count, but do not change it after that. */
    353      1.1       jld 	if (pm->params.regions == 0)
    354      1.1       jld 		pm->params.regions = regions;
    355      1.1       jld 
    356      1.1       jld 	/* So that the newly set parameters can be tested: */
    357      1.1       jld 	pm->ctrs.nwrite = pm->ctrs.ncachesync = pm->ctrs.nclearing = 0;
    358      1.1       jld 
    359      1.1       jld 	if (todisk) {
    360      1.1       jld 		raidPtr = pm->raid;
    361      1.1       jld 		for (col = 0; col < raidPtr->numCol; col++) {
    362  1.3.4.1  uebayasi 			if (RF_DEAD_DISK(raidPtr->Disks[col].status))
    363  1.3.4.1  uebayasi 				continue;
    364  1.3.4.1  uebayasi 
    365      1.1       jld 			clabel = raidget_component_label(raidPtr, col);
    366      1.1       jld 			clabel->parity_map_ntick = cooldown;
    367      1.1       jld 			clabel->parity_map_tickms = tickms;
    368      1.1       jld 			clabel->parity_map_regions = regions;
    369  1.3.4.1  uebayasi 
    370  1.3.4.1  uebayasi 			/* Don't touch the disk if it's been spared */
    371  1.3.4.1  uebayasi 			if (clabel->status == rf_ds_spared)
    372  1.3.4.1  uebayasi 				continue;
    373  1.3.4.1  uebayasi 
    374      1.1       jld 			raidflush_component_label(raidPtr, col);
    375      1.1       jld 		}
    376  1.3.4.1  uebayasi 
    377  1.3.4.1  uebayasi 		/* handle the spares too... */
    378  1.3.4.1  uebayasi 		for (col = 0; col < raidPtr->numSpare; col++) {
    379  1.3.4.1  uebayasi 			if (raidPtr->Disks[raidPtr->numCol+col].status == rf_ds_used_spare) {
    380  1.3.4.1  uebayasi 				clabel = raidget_component_label(raidPtr, raidPtr->numCol+col);
    381  1.3.4.1  uebayasi 				clabel->parity_map_ntick = cooldown;
    382  1.3.4.1  uebayasi 				clabel->parity_map_tickms = tickms;
    383  1.3.4.1  uebayasi 				clabel->parity_map_regions = regions;
    384  1.3.4.1  uebayasi 				raidflush_component_label(raidPtr, raidPtr->numCol+col);
    385  1.3.4.1  uebayasi 			}
    386  1.3.4.1  uebayasi 		}
    387      1.1       jld 	}
    388      1.1       jld 	return 0;
    389      1.1       jld }
    390      1.1       jld 
    391      1.1       jld /*
    392      1.1       jld  * The number of regions may not be as many as can fit into the map, because
    393      1.1       jld  * when regions are too small, the overhead of setting parity map bits
    394      1.1       jld  * becomes significant in comparison to the actual I/O, while the
    395      1.1       jld  * corresponding gains in parity verification time become negligible.  Thus,
    396      1.1       jld  * a minimum region size (defined above) is imposed.
    397      1.1       jld  *
    398      1.1       jld  * Note that, if the number of regions is less than the maximum, then some of
    399      1.1       jld  * the regions will be "fictional", corresponding to no actual disk; some
    400      1.1       jld  * parts of the code may process them as normal, but they can not ever be
    401      1.1       jld  * written to.
    402      1.1       jld  */
    403      1.1       jld static u_int
    404      1.1       jld rf_paritymap_nreg(RF_Raid_t *raid)
    405      1.1       jld {
    406      1.1       jld 	daddr_t bytes_per_disk, nreg;
    407      1.1       jld 
    408      1.1       jld 	bytes_per_disk = raid->sectorsPerDisk << raid->logBytesPerSector;
    409      1.1       jld 	nreg = bytes_per_disk / REGION_MINSIZE;
    410      1.1       jld 	if (nreg > RF_PARITYMAP_NREG)
    411      1.1       jld 		nreg = RF_PARITYMAP_NREG;
    412      1.1       jld 
    413      1.1       jld 	return (u_int)nreg;
    414      1.1       jld }
    415      1.1       jld 
    416      1.1       jld /*
    417      1.1       jld  * Initialize a parity map given specific parameters.  This neither reads nor
    418      1.1       jld  * writes the parity map config in the component labels; for that, see below.
    419      1.1       jld  */
    420      1.1       jld int
    421      1.1       jld rf_paritymap_init(struct rf_paritymap *pm, RF_Raid_t *raid,
    422      1.1       jld     const struct rf_pmparams *params)
    423      1.1       jld {
    424      1.1       jld 	daddr_t rstripes;
    425      1.1       jld 	struct rf_pmparams safe;
    426      1.1       jld 
    427      1.1       jld 	pm->raid = raid;
    428      1.1       jld 	pm->params.regions = 0;
    429      1.1       jld 	if (0 != rf_paritymap_set_params(pm, params, 0)) {
    430      1.1       jld 		/*
    431      1.1       jld 		 * If the parameters are out-of-range, then bring the
    432      1.1       jld 		 * parity map up with something reasonable, so that
    433      1.1       jld 		 * the admin can at least go and fix it (or ignore it
    434      1.1       jld 		 * entirely).
    435      1.1       jld 		 */
    436      1.1       jld 		safe.cooldown = DFL_COOLDOWN;
    437      1.1       jld 		safe.tickms = DFL_TICKMS;
    438      1.1       jld 		safe.regions = 0;
    439      1.1       jld 
    440      1.1       jld 		if (0 != rf_paritymap_set_params(pm, &safe, 0))
    441      1.1       jld 			return (-1);
    442      1.1       jld 	}
    443      1.1       jld 
    444      1.1       jld 	rstripes = howmany(raid->Layout.numStripe, pm->params.regions);
    445      1.1       jld 	pm->region_size = rstripes * raid->Layout.dataSectorsPerStripe;
    446      1.1       jld 
    447      1.1       jld 	callout_init(&pm->ticker, CALLOUT_MPSAFE);
    448      1.1       jld 	callout_setfunc(&pm->ticker, rf_paritymap_tick, pm);
    449      1.1       jld 	pm->flags = 0;
    450      1.1       jld 
    451      1.1       jld 	pm->disk_boot = kmem_alloc(sizeof(struct rf_paritymap_ondisk),
    452      1.1       jld 	    KM_SLEEP);
    453      1.1       jld 	pm->disk_now = kmem_alloc(sizeof(struct rf_paritymap_ondisk),
    454      1.1       jld 	    KM_SLEEP);
    455      1.1       jld 	pm->current = kmem_zalloc(sizeof(struct rf_paritymap_current),
    456      1.1       jld 	    KM_SLEEP);
    457      1.1       jld 
    458      1.1       jld 	rf_paritymap_kern_read(pm->raid, pm->disk_boot);
    459      1.1       jld 	memcpy(pm->disk_now, pm->disk_boot, sizeof(*pm->disk_now));
    460      1.1       jld 
    461      1.1       jld 	mutex_init(&pm->lock, MUTEX_DEFAULT, IPL_NONE);
    462      1.1       jld 	mutex_init(&pm->lk_flags, MUTEX_DEFAULT, IPL_SOFTCLOCK);
    463      1.1       jld 
    464      1.1       jld 	return 0;
    465      1.1       jld }
    466      1.1       jld 
    467      1.1       jld /*
    468      1.1       jld  * Destroys a parity map; unless "force" is set, also cleans parity for any
    469      1.1       jld  * regions which were still in cooldown (but are not dirty on disk).
    470      1.1       jld  */
    471      1.1       jld void
    472      1.1       jld rf_paritymap_destroy(struct rf_paritymap *pm, int force)
    473      1.1       jld {
    474      1.1       jld 	int i;
    475      1.1       jld 
    476      1.1       jld 	callout_halt(&pm->ticker, NULL); /* XXX stop? halt? */
    477      1.1       jld 	callout_destroy(&pm->ticker);
    478      1.1       jld 
    479      1.1       jld 	if (!force) {
    480      1.1       jld 		for (i = 0; i < RF_PARITYMAP_NREG; i++) {
    481      1.1       jld 			/* XXX check for > 0 ? */
    482      1.1       jld 			if (pm->current->state[i] < 0)
    483      1.1       jld 				pm->current->state[i] = 0;
    484      1.1       jld 		}
    485      1.1       jld 
    486      1.1       jld 		rf_paritymap_write_locked(pm);
    487      1.1       jld 	}
    488      1.1       jld 
    489      1.1       jld 	mutex_destroy(&pm->lock);
    490      1.1       jld 	mutex_destroy(&pm->lk_flags);
    491      1.1       jld 
    492      1.1       jld 	kmem_free(pm->disk_boot, sizeof(struct rf_paritymap_ondisk));
    493      1.1       jld 	kmem_free(pm->disk_now, sizeof(struct rf_paritymap_ondisk));
    494      1.1       jld 	kmem_free(pm->current, sizeof(struct rf_paritymap_current));
    495      1.1       jld }
    496      1.1       jld 
    497      1.1       jld /*
    498      1.1       jld  * Rewrite parity, taking parity map into account; this is the equivalent of
    499      1.1       jld  * the old rf_RewriteParity, and is likewise to be called from a suitable
    500      1.1       jld  * thread and shouldn't have multiple copies running in parallel and so on.
    501      1.1       jld  *
    502      1.1       jld  * Note that the fictional regions are "cleaned" in one shot, so that very
    503      1.1       jld  * small RAIDs (useful for testing) will not experience potentially severe
    504      1.1       jld  * regressions in rewrite time.
    505      1.1       jld  */
    506      1.1       jld int
    507      1.1       jld rf_paritymap_rewrite(struct rf_paritymap *pm)
    508      1.1       jld {
    509      1.1       jld 	int i, ret_val = 0;
    510      1.1       jld 	daddr_t reg_b, reg_e;
    511      1.1       jld 
    512      1.1       jld 	/* Process only the actual regions. */
    513      1.1       jld 	for (i = 0; i < pm->params.regions; i++) {
    514      1.1       jld 		mutex_enter(&pm->lock);
    515      1.1       jld 		if (isset(pm->disk_boot->bits, i)) {
    516      1.1       jld 			mutex_exit(&pm->lock);
    517      1.1       jld 
    518      1.1       jld 			reg_b = i * pm->region_size;
    519      1.1       jld 			reg_e = reg_b + pm->region_size;
    520      1.1       jld 			if (reg_e > pm->raid->totalSectors)
    521      1.1       jld 				reg_e = pm->raid->totalSectors;
    522      1.1       jld 
    523      1.1       jld 			if (rf_RewriteParityRange(pm->raid, reg_b,
    524      1.1       jld 			    reg_e - reg_b)) {
    525      1.1       jld 				ret_val = 1;
    526      1.1       jld 				if (pm->raid->waitShutdown)
    527      1.1       jld 					return ret_val;
    528      1.1       jld 			} else {
    529      1.1       jld 				mutex_enter(&pm->lock);
    530      1.1       jld 				clrbit(pm->disk_boot->bits, i);
    531      1.1       jld 				rf_paritymap_write_locked(pm);
    532      1.1       jld 				mutex_exit(&pm->lock);
    533      1.1       jld 			}
    534      1.1       jld 		} else {
    535      1.1       jld 			mutex_exit(&pm->lock);
    536      1.1       jld 		}
    537      1.1       jld 	}
    538      1.1       jld 
    539      1.1       jld 	/* Now, clear the fictional regions, if any. */
    540      1.1       jld 	rf_paritymap_forceclean(pm);
    541      1.1       jld 	rf_paritymap_write(pm);
    542      1.1       jld 
    543      1.1       jld 	return ret_val;
    544      1.1       jld }
    545      1.1       jld 
    546      1.1       jld /*
    547      1.1       jld  * How to merge the on-disk parity maps when reading them in from the
    548      1.1       jld  * various components; returns whether they differ.  In the case that
    549      1.1       jld  * they do differ, sets *dst to the union of *dst and *src.
    550      1.1       jld  *
    551      1.1       jld  * In theory, it should be safe to take the intersection (or just pick
    552      1.1       jld  * a single component arbitrarily), but the paranoid approach costs
    553      1.1       jld  * little.
    554      1.1       jld  *
    555      1.1       jld  * Appropriate locking, if any, is the responsibility of the caller.
    556      1.1       jld  */
    557      1.1       jld int
    558      1.1       jld rf_paritymap_merge(struct rf_paritymap_ondisk *dst,
    559      1.1       jld     struct rf_paritymap_ondisk *src)
    560      1.1       jld {
    561      1.1       jld 	int i, discrep = 0;
    562      1.1       jld 
    563      1.1       jld 	for (i = 0; i < RF_PARITYMAP_NBYTE; i++) {
    564      1.1       jld 		if (dst->bits[i] != src->bits[i])
    565      1.1       jld 			discrep = 1;
    566      1.1       jld 		dst->bits[i] |= src->bits[i];
    567      1.1       jld 	}
    568      1.1       jld 
    569      1.1       jld 	return discrep;
    570      1.1       jld }
    571      1.1       jld 
    572      1.1       jld /*
    573      1.1       jld  * Detach a parity map from its RAID.  This is not meant to be applied except
    574      1.1       jld  * when unconfiguring the RAID after all I/O has been resolved, as otherwise
    575      1.1       jld  * an out-of-date parity map could be treated as current.
    576      1.1       jld  */
    577      1.1       jld void
    578      1.1       jld rf_paritymap_detach(RF_Raid_t *raidPtr)
    579      1.1       jld {
    580      1.1       jld 	if (raidPtr->parity_map == NULL)
    581      1.1       jld 		return;
    582      1.1       jld 
    583      1.1       jld 	simple_lock(&(raidPtr->iodone_lock));
    584      1.1       jld 	struct rf_paritymap *pm = raidPtr->parity_map;
    585      1.1       jld 	raidPtr->parity_map = NULL;
    586      1.1       jld 	simple_unlock(&(raidPtr->iodone_lock));
    587      1.1       jld 	/* XXXjld is that enough locking?  Or too much? */
    588      1.1       jld 	rf_paritymap_destroy(pm, 0);
    589      1.1       jld 	kmem_free(pm, sizeof(*pm));
    590      1.1       jld }
    591      1.1       jld 
    592      1.1       jld /*
    593  1.3.4.1  uebayasi  * Is this RAID set ineligible for parity-map use due to not actually
    594  1.3.4.1  uebayasi  * having any parity?  (If so, rf_paritymap_attach is a no-op, but
    595  1.3.4.1  uebayasi  * rf_paritymap_{get,set}_disable will still pointlessly act on the
    596  1.3.4.1  uebayasi  * component labels.)
    597  1.3.4.1  uebayasi  */
    598  1.3.4.1  uebayasi int
    599  1.3.4.1  uebayasi rf_paritymap_ineligible(RF_Raid_t *raidPtr)
    600  1.3.4.1  uebayasi {
    601  1.3.4.1  uebayasi 	return raidPtr->Layout.map->faultsTolerated == 0;
    602  1.3.4.1  uebayasi }
    603  1.3.4.1  uebayasi 
    604  1.3.4.1  uebayasi /*
    605      1.1       jld  * Attach a parity map to a RAID set if appropriate.  Includes
    606      1.1       jld  * configure-time processing of parity-map fields of component label.
    607      1.1       jld  */
    608      1.1       jld void
    609      1.1       jld rf_paritymap_attach(RF_Raid_t *raidPtr, int force)
    610      1.1       jld {
    611      1.1       jld 	RF_RowCol_t col;
    612      1.1       jld 	int pm_use, pm_zap;
    613      1.1       jld 	int g_tickms, g_ntick, g_regions;
    614      1.1       jld 	int good;
    615      1.1       jld 	RF_ComponentLabel_t *clabel;
    616      1.1       jld 	u_int flags, regions;
    617      1.1       jld 	struct rf_pmparams params;
    618      1.1       jld 
    619  1.3.4.1  uebayasi 	if (rf_paritymap_ineligible(raidPtr)) {
    620      1.1       jld 		/* There isn't any parity. */
    621      1.1       jld 		return;
    622      1.1       jld 	}
    623      1.1       jld 
    624      1.1       jld 	pm_use = 1;
    625      1.1       jld 	pm_zap = 0;
    626      1.1       jld 	g_tickms = DFL_TICKMS;
    627      1.1       jld 	g_ntick = DFL_COOLDOWN;
    628      1.1       jld 	g_regions = 0;
    629      1.1       jld 
    630      1.1       jld 	/*
    631      1.1       jld 	 * Collect opinions on the set config.  If this is the initial
    632      1.1       jld 	 * config (raidctl -C), treat all labels as invalid, since
    633      1.1       jld 	 * there may be random data present.
    634      1.1       jld 	 */
    635      1.1       jld 	if (!force) {
    636      1.1       jld 		for (col = 0; col < raidPtr->numCol; col++) {
    637  1.3.4.1  uebayasi 			if (RF_DEAD_DISK(raidPtr->Disks[col].status))
    638  1.3.4.1  uebayasi 				continue;
    639      1.1       jld 			clabel = raidget_component_label(raidPtr, col);
    640      1.1       jld 			flags = clabel->parity_map_flags;
    641      1.1       jld 			/* Check for use by non-parity-map kernel. */
    642      1.1       jld 			if (clabel->parity_map_modcount
    643      1.1       jld 			    != clabel->mod_counter) {
    644      1.1       jld 				flags &= ~RF_PMLABEL_WASUSED;
    645      1.1       jld 			}
    646      1.1       jld 
    647      1.1       jld 			if (flags & RF_PMLABEL_VALID) {
    648      1.1       jld 				g_tickms = clabel->parity_map_tickms;
    649      1.1       jld 				g_ntick = clabel->parity_map_ntick;
    650      1.1       jld 				regions = clabel->parity_map_regions;
    651      1.1       jld 				if (g_regions == 0)
    652      1.1       jld 					g_regions = regions;
    653      1.1       jld 				else if (g_regions != regions) {
    654      1.1       jld 					pm_zap = 1; /* important! */
    655      1.1       jld 				}
    656      1.1       jld 
    657      1.1       jld 				if (flags & RF_PMLABEL_DISABLE) {
    658      1.1       jld 					pm_use = 0;
    659      1.1       jld 				}
    660      1.1       jld 				if (!(flags & RF_PMLABEL_WASUSED)) {
    661      1.1       jld 					pm_zap = 1;
    662      1.1       jld 				}
    663      1.1       jld 			} else {
    664      1.1       jld 				pm_zap = 1;
    665      1.1       jld 			}
    666      1.1       jld 		}
    667      1.1       jld 	} else {
    668      1.1       jld 		pm_zap = 1;
    669      1.1       jld 	}
    670      1.1       jld 
    671      1.1       jld 	/* Finally, create and attach the parity map. */
    672      1.1       jld 	if (pm_use) {
    673      1.1       jld 		params.cooldown = g_ntick;
    674      1.1       jld 		params.tickms = g_tickms;
    675      1.1       jld 		params.regions = g_regions;
    676      1.1       jld 
    677      1.1       jld 		raidPtr->parity_map = kmem_alloc(sizeof(struct rf_paritymap),
    678      1.1       jld 		    KM_SLEEP);
    679      1.1       jld 		if (0 != rf_paritymap_init(raidPtr->parity_map, raidPtr,
    680      1.1       jld 			&params)) {
    681      1.1       jld 			/* It failed; do without. */
    682      1.1       jld 			kmem_free(raidPtr->parity_map,
    683      1.1       jld 			    sizeof(struct rf_paritymap));
    684      1.1       jld 			raidPtr->parity_map = NULL;
    685      1.1       jld 			return;
    686      1.1       jld 		}
    687      1.1       jld 
    688      1.1       jld 		if (g_regions == 0)
    689      1.1       jld 			/* Pick up the autoconfigured region count. */
    690      1.1       jld 			g_regions = raidPtr->parity_map->params.regions;
    691      1.1       jld 
    692      1.1       jld 		if (pm_zap) {
    693      1.1       jld 			good = raidPtr->parity_good && !force;
    694      1.1       jld 
    695      1.1       jld 			if (good)
    696      1.1       jld 				rf_paritymap_forceclean(raidPtr->parity_map);
    697      1.1       jld 			else
    698      1.1       jld 				rf_paritymap_invalidate(raidPtr->parity_map);
    699      1.1       jld 			/* This needs to be on disk before WASUSED is set. */
    700      1.1       jld 			rf_paritymap_write(raidPtr->parity_map);
    701      1.1       jld 		}
    702      1.1       jld 	}
    703      1.1       jld 
    704      1.1       jld 	/* Alter labels in-core to reflect the current view of things. */
    705      1.1       jld 	for (col = 0; col < raidPtr->numCol; col++) {
    706  1.3.4.1  uebayasi 		if (RF_DEAD_DISK(raidPtr->Disks[col].status))
    707  1.3.4.1  uebayasi 			continue;
    708      1.1       jld 		clabel = raidget_component_label(raidPtr, col);
    709      1.1       jld 
    710      1.1       jld 		if (pm_use)
    711      1.1       jld 			flags = RF_PMLABEL_VALID | RF_PMLABEL_WASUSED;
    712      1.1       jld 		else
    713      1.1       jld 			flags = RF_PMLABEL_VALID | RF_PMLABEL_DISABLE;
    714      1.1       jld 
    715      1.1       jld 		clabel->parity_map_flags = flags;
    716      1.1       jld 		clabel->parity_map_tickms = g_tickms;
    717      1.1       jld 		clabel->parity_map_ntick = g_ntick;
    718      1.1       jld 		clabel->parity_map_regions = g_regions;
    719      1.1       jld 		raidflush_component_label(raidPtr, col);
    720      1.1       jld 	}
    721  1.3.4.1  uebayasi 	/* Note that we're just in 'attach' here, and there won't
    722  1.3.4.1  uebayasi 	   be any spare disks at this point. */
    723      1.1       jld }
    724      1.1       jld 
    725      1.1       jld /*
    726      1.1       jld  * For initializing the parity-map fields of a component label, both on
    727  1.3.4.1  uebayasi  * initial creation and on reconstruct/copyback/etc.  */
    728      1.1       jld void
    729      1.1       jld rf_paritymap_init_label(struct rf_paritymap *pm, RF_ComponentLabel_t *clabel)
    730      1.1       jld {
    731      1.1       jld 	if (pm != NULL) {
    732      1.1       jld 		clabel->parity_map_flags =
    733      1.1       jld 		    RF_PMLABEL_VALID | RF_PMLABEL_WASUSED;
    734      1.1       jld 		clabel->parity_map_tickms = pm->params.tickms;
    735      1.1       jld 		clabel->parity_map_ntick = pm->params.cooldown;
    736      1.1       jld 		/*
    737      1.1       jld 		 * XXXjld: If the number of regions is changed on disk, and
    738      1.1       jld 		 * then a new component is labeled before the next configure,
    739      1.1       jld 		 * then it will get the old value and they will conflict on
    740      1.1       jld 		 * the next boot (and the default will be used instead).
    741      1.1       jld 		 */
    742      1.1       jld 		clabel->parity_map_regions = pm->params.regions;
    743      1.1       jld 	} else {
    744      1.1       jld 		/*
    745      1.1       jld 		 * XXXjld: if the map is disabled, and all the components are
    746      1.1       jld 		 * replaced without an intervening unconfigure/reconfigure,
    747      1.1       jld 		 * then it will become enabled on the next unconfig/reconfig.
    748      1.1       jld 		 */
    749      1.1       jld 	}
    750      1.1       jld }
    751      1.1       jld 
    752      1.1       jld 
    753      1.1       jld /* Will the parity map be disabled next time? */
    754      1.1       jld int
    755      1.1       jld rf_paritymap_get_disable(RF_Raid_t *raidPtr)
    756      1.1       jld {
    757      1.1       jld 	RF_ComponentLabel_t *clabel;
    758      1.1       jld 	RF_RowCol_t col;
    759      1.1       jld 	int dis;
    760      1.1       jld 
    761      1.1       jld 	dis = 0;
    762      1.1       jld 	for (col = 0; col < raidPtr->numCol; col++) {
    763  1.3.4.1  uebayasi 		if (RF_DEAD_DISK(raidPtr->Disks[col].status))
    764  1.3.4.1  uebayasi 			continue;
    765      1.1       jld 		clabel = raidget_component_label(raidPtr, col);
    766      1.1       jld 		if (clabel->parity_map_flags & RF_PMLABEL_DISABLE)
    767      1.1       jld 			dis = 1;
    768      1.1       jld 	}
    769  1.3.4.1  uebayasi         for (col = 0; col < raidPtr->numSpare; col++) {
    770  1.3.4.1  uebayasi 		if (raidPtr->Disks[raidPtr->numCol+col].status != rf_ds_used_spare)
    771  1.3.4.1  uebayasi                         continue;
    772  1.3.4.1  uebayasi                 clabel = raidget_component_label(raidPtr, raidPtr->numCol+col);
    773  1.3.4.1  uebayasi                 if (clabel->parity_map_flags & RF_PMLABEL_DISABLE)
    774  1.3.4.1  uebayasi                         dis = 1;
    775  1.3.4.1  uebayasi         }
    776      1.1       jld 
    777      1.1       jld 	return dis;
    778      1.1       jld }
    779      1.1       jld 
    780      1.1       jld /* Set whether the parity map will be disabled next time. */
    781      1.1       jld void
    782      1.1       jld rf_paritymap_set_disable(RF_Raid_t *raidPtr, int dis)
    783      1.1       jld {
    784      1.1       jld 	RF_ComponentLabel_t *clabel;
    785      1.1       jld 	RF_RowCol_t col;
    786      1.1       jld 
    787      1.1       jld 	for (col = 0; col < raidPtr->numCol; col++) {
    788  1.3.4.1  uebayasi 		if (RF_DEAD_DISK(raidPtr->Disks[col].status))
    789  1.3.4.1  uebayasi 			continue;
    790      1.1       jld 		clabel = raidget_component_label(raidPtr, col);
    791      1.1       jld 		if (dis)
    792      1.1       jld 			clabel->parity_map_flags |= RF_PMLABEL_DISABLE;
    793      1.1       jld 		else
    794      1.1       jld 			clabel->parity_map_flags &= ~RF_PMLABEL_DISABLE;
    795      1.1       jld 		raidflush_component_label(raidPtr, col);
    796      1.1       jld 	}
    797  1.3.4.1  uebayasi 
    798  1.3.4.1  uebayasi 	/* update any used spares as well */
    799  1.3.4.1  uebayasi 	for (col = 0; col < raidPtr->numSpare; col++) {
    800  1.3.4.1  uebayasi 		if (raidPtr->Disks[raidPtr->numCol+col].status != rf_ds_used_spare)
    801  1.3.4.1  uebayasi 			continue;
    802  1.3.4.1  uebayasi 
    803  1.3.4.1  uebayasi 		clabel = raidget_component_label(raidPtr, raidPtr->numCol+col);
    804  1.3.4.1  uebayasi 		if (dis)
    805  1.3.4.1  uebayasi 			clabel->parity_map_flags |= RF_PMLABEL_DISABLE;
    806  1.3.4.1  uebayasi 		else
    807  1.3.4.1  uebayasi 			clabel->parity_map_flags &= ~RF_PMLABEL_DISABLE;
    808  1.3.4.1  uebayasi 		raidflush_component_label(raidPtr, raidPtr->numCol+col);
    809  1.3.4.1  uebayasi 	}
    810      1.1       jld }
    811