Home | History | Annotate | Line # | Download | only in raidframe
rf_paritymap.c revision 1.3
      1  1.3  pooka /* $NetBSD: rf_paritymap.c,v 1.3 2009/11/26 07:35:39 pooka Exp $ */
      2  1.1    jld 
      3  1.1    jld /*-
      4  1.1    jld  * Copyright (c) 2009 Jed Davis.
      5  1.1    jld  * All rights reserved.
      6  1.1    jld  *
      7  1.1    jld  * Redistribution and use in source and binary forms, with or without
      8  1.1    jld  * modification, are permitted provided that the following conditions
      9  1.1    jld  * are met:
     10  1.1    jld  * 1. Redistributions of source code must retain the above copyright
     11  1.1    jld  *    notice, this list of conditions and the following disclaimer.
     12  1.1    jld  * 2. Redistributions in binary form must reproduce the above copyright
     13  1.1    jld  *    notice, this list of conditions and the following disclaimer in the
     14  1.1    jld  *    documentation and/or other materials provided with the distribution.
     15  1.1    jld  *
     16  1.1    jld  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17  1.1    jld  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18  1.1    jld  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19  1.1    jld  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20  1.1    jld  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21  1.1    jld  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22  1.1    jld  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23  1.1    jld  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24  1.1    jld  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25  1.1    jld  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26  1.1    jld  * POSSIBILITY OF SUCH DAMAGE.
     27  1.1    jld  */
     28  1.1    jld 
     29  1.1    jld #include <sys/cdefs.h>
     30  1.3  pooka __KERNEL_RCSID(0, "$NetBSD: rf_paritymap.c,v 1.3 2009/11/26 07:35:39 pooka Exp $");
     31  1.1    jld 
     32  1.3  pooka #include <sys/param.h>
     33  1.1    jld #include <sys/callout.h>
     34  1.1    jld #include <sys/kmem.h>
     35  1.1    jld #include <sys/mutex.h>
     36  1.1    jld #include <sys/rwlock.h>
     37  1.1    jld #include <sys/systm.h>
     38  1.1    jld #include <sys/types.h>
     39  1.1    jld 
     40  1.1    jld #include <dev/raidframe/rf_paritymap.h>
     41  1.1    jld #include <dev/raidframe/rf_stripelocks.h>
     42  1.1    jld #include <dev/raidframe/rf_layout.h>
     43  1.1    jld #include <dev/raidframe/rf_raid.h>
     44  1.1    jld #include <dev/raidframe/rf_parityscan.h>
     45  1.1    jld #include <dev/raidframe/rf_kintf.h>
     46  1.1    jld 
     47  1.1    jld /* Important parameters: */
     48  1.1    jld #define REGION_MINSIZE (25ULL << 20)
     49  1.1    jld #define DFL_TICKMS      40000
     50  1.1    jld #define DFL_COOLDOWN    8     /* 7-8 intervals of 40s = 5min +/- 20s */
     51  1.1    jld 
     52  1.1    jld /* Internal-use flag bits. */
     53  1.1    jld #define TICKING 1
     54  1.1    jld #define TICKED 2
     55  1.1    jld 
     56  1.1    jld /* Prototypes! */
     57  1.1    jld static void rf_paritymap_write_locked(struct rf_paritymap *);
     58  1.1    jld static void rf_paritymap_tick(void *);
     59  1.1    jld static u_int rf_paritymap_nreg(RF_Raid_t *);
     60  1.1    jld 
     61  1.1    jld /* Extract the current status of the parity map. */
     62  1.1    jld void
     63  1.1    jld rf_paritymap_status(struct rf_paritymap *pm, struct rf_pmstat *ps)
     64  1.1    jld {
     65  1.1    jld 	memset(ps, 0, sizeof(*ps));
     66  1.1    jld 	if (pm == NULL)
     67  1.1    jld 		ps->enabled = 0;
     68  1.1    jld 	else {
     69  1.1    jld 		ps->enabled = 1;
     70  1.1    jld 		ps->region_size = pm->region_size;
     71  1.1    jld 		mutex_enter(&pm->lock);
     72  1.1    jld 		memcpy(&ps->params, &pm->params, sizeof(ps->params));
     73  1.1    jld 		memcpy(ps->dirty, pm->disk_now, sizeof(ps->dirty));
     74  1.1    jld 		memcpy(&ps->ctrs, &pm->ctrs, sizeof(ps->ctrs));
     75  1.1    jld 		mutex_exit(&pm->lock);
     76  1.1    jld 	}
     77  1.1    jld }
     78  1.1    jld 
     79  1.1    jld /*
     80  1.1    jld  * Test whether parity in a given sector is suspected of being inconsistent
     81  1.1    jld  * on disk (assuming that any pending I/O to it is allowed to complete).
     82  1.1    jld  * This may be of interest to future work on parity scrubbing.
     83  1.1    jld  */
     84  1.1    jld int
     85  1.1    jld rf_paritymap_test(struct rf_paritymap *pm, daddr_t sector)
     86  1.1    jld {
     87  1.1    jld 	unsigned region = sector / pm->region_size;
     88  1.1    jld 	int retval;
     89  1.1    jld 
     90  1.1    jld 	mutex_enter(&pm->lock);
     91  1.1    jld 	retval = isset(pm->disk_boot->bits, region) ? 1 : 0;
     92  1.1    jld 	mutex_exit(&pm->lock);
     93  1.1    jld 	return retval;
     94  1.1    jld }
     95  1.1    jld 
     96  1.1    jld /* To be called before a write to the RAID is submitted. */
     97  1.1    jld void
     98  1.1    jld rf_paritymap_begin(struct rf_paritymap *pm, daddr_t offset, daddr_t size)
     99  1.1    jld {
    100  1.1    jld 	unsigned i, b, e;
    101  1.1    jld 
    102  1.1    jld 	b = offset / pm->region_size;
    103  1.1    jld 	e = (offset + size - 1) / pm->region_size;
    104  1.1    jld 
    105  1.1    jld 	for (i = b; i <= e; i++)
    106  1.1    jld 		rf_paritymap_begin_region(pm, i);
    107  1.1    jld }
    108  1.1    jld 
    109  1.1    jld /* To be called after a write to the RAID completes. */
    110  1.1    jld void
    111  1.1    jld rf_paritymap_end(struct rf_paritymap *pm, daddr_t offset, daddr_t size)
    112  1.1    jld {
    113  1.1    jld 	unsigned i, b, e;
    114  1.1    jld 
    115  1.1    jld 	b = offset / pm->region_size;
    116  1.1    jld 	e = (offset + size - 1) / pm->region_size;
    117  1.1    jld 
    118  1.1    jld 	for (i = b; i <= e; i++)
    119  1.1    jld 		rf_paritymap_end_region(pm, i);
    120  1.1    jld }
    121  1.1    jld 
    122  1.1    jld void
    123  1.1    jld rf_paritymap_begin_region(struct rf_paritymap *pm, unsigned region)
    124  1.1    jld {
    125  1.1    jld 	int needs_write;
    126  1.1    jld 
    127  1.1    jld 	KASSERT(region < RF_PARITYMAP_NREG);
    128  1.1    jld 	pm->ctrs.nwrite++;
    129  1.1    jld 
    130  1.1    jld 	/* If it was being kept warm, deal with that. */
    131  1.1    jld 	mutex_enter(&pm->lock);
    132  1.1    jld 	if (pm->current->state[region] < 0)
    133  1.1    jld 		pm->current->state[region] = 0;
    134  1.1    jld 
    135  1.1    jld 	/* This shouldn't happen unless RAIDOUTSTANDING is set too high. */
    136  1.1    jld 	KASSERT(pm->current->state[region] < 127);
    137  1.1    jld 	pm->current->state[region]++;
    138  1.1    jld 
    139  1.1    jld 	needs_write = isclr(pm->disk_now->bits, region);
    140  1.1    jld 
    141  1.1    jld 	if (needs_write) {
    142  1.1    jld 		KASSERT(pm->current->state[region] == 1);
    143  1.1    jld 		rf_paritymap_write_locked(pm);
    144  1.1    jld 	}
    145  1.1    jld 
    146  1.1    jld 	mutex_exit(&pm->lock);
    147  1.1    jld }
    148  1.1    jld 
    149  1.1    jld void
    150  1.1    jld rf_paritymap_end_region(struct rf_paritymap *pm, unsigned region)
    151  1.1    jld {
    152  1.1    jld 	KASSERT(region < RF_PARITYMAP_NREG);
    153  1.1    jld 
    154  1.1    jld 	mutex_enter(&pm->lock);
    155  1.1    jld 	KASSERT(pm->current->state[region] > 0);
    156  1.1    jld 	--pm->current->state[region];
    157  1.1    jld 
    158  1.1    jld 	if (pm->current->state[region] <= 0) {
    159  1.1    jld 		pm->current->state[region] = -pm->params.cooldown;
    160  1.1    jld 		KASSERT(pm->current->state[region] <= 0);
    161  1.1    jld 		mutex_enter(&pm->lk_flags);
    162  1.1    jld 		if (!(pm->flags & TICKING)) {
    163  1.1    jld 			pm->flags |= TICKING;
    164  1.1    jld 			mutex_exit(&pm->lk_flags);
    165  1.1    jld 			callout_schedule(&pm->ticker,
    166  1.1    jld 			    mstohz(pm->params.tickms));
    167  1.1    jld 		} else
    168  1.1    jld 			mutex_exit(&pm->lk_flags);
    169  1.1    jld 	}
    170  1.1    jld 	mutex_exit(&pm->lock);
    171  1.1    jld }
    172  1.1    jld 
    173  1.1    jld /*
    174  1.1    jld  * Updates the parity map to account for any changes in current activity
    175  1.1    jld  * and/or an ongoing parity scan, then writes it to disk with appropriate
    176  1.1    jld  * synchronization.
    177  1.1    jld  */
    178  1.1    jld void
    179  1.1    jld rf_paritymap_write(struct rf_paritymap *pm)
    180  1.1    jld {
    181  1.1    jld 	mutex_enter(&pm->lock);
    182  1.1    jld 	rf_paritymap_write_locked(pm);
    183  1.1    jld 	mutex_exit(&pm->lock);
    184  1.1    jld }
    185  1.1    jld 
    186  1.1    jld /* As above, but to be used when pm->lock is already held. */
    187  1.1    jld static void
    188  1.1    jld rf_paritymap_write_locked(struct rf_paritymap *pm)
    189  1.1    jld {
    190  1.1    jld 	char w, w0;
    191  1.1    jld 	int i, j, setting, clearing;
    192  1.1    jld 
    193  1.1    jld 	setting = clearing = 0;
    194  1.1    jld 	for (i = 0; i < RF_PARITYMAP_NBYTE; i++) {
    195  1.1    jld 		w0 = pm->disk_now->bits[i];
    196  1.1    jld 		w = pm->disk_boot->bits[i];
    197  1.1    jld 
    198  1.1    jld 		for (j = 0; j < NBBY; j++)
    199  1.1    jld 			if (pm->current->state[i * NBBY + j] != 0)
    200  1.1    jld 				w |= 1 << j;
    201  1.1    jld 
    202  1.1    jld 		if (w & ~w0)
    203  1.1    jld 			setting = 1;
    204  1.1    jld 		if (w0 & ~w)
    205  1.1    jld 			clearing = 1;
    206  1.1    jld 
    207  1.1    jld 		pm->disk_now->bits[i] = w;
    208  1.1    jld 	}
    209  1.1    jld 	pm->ctrs.ncachesync += setting + clearing;
    210  1.1    jld 	pm->ctrs.nclearing += clearing;
    211  1.1    jld 
    212  1.1    jld 	/*
    213  1.1    jld 	 * If bits are being set in the parity map, then a sync is
    214  1.1    jld 	 * required afterwards, so that the regions are marked dirty
    215  1.1    jld 	 * on disk before any writes to them take place.  If bits are
    216  1.1    jld 	 * being cleared, then a sync is required before the write, so
    217  1.1    jld 	 * that any writes to those regions are processed before the
    218  1.1    jld 	 * region is marked clean.  (Synchronization is somewhat
    219  1.1    jld 	 * overkill; a write ordering barrier would suffice, but we
    220  1.1    jld 	 * currently have no way to express that directly.)
    221  1.1    jld 	 */
    222  1.1    jld 	if (clearing)
    223  1.1    jld 		rf_sync_component_caches(pm->raid);
    224  1.1    jld 	rf_paritymap_kern_write(pm->raid, pm->disk_now);
    225  1.1    jld 	if (setting)
    226  1.1    jld 		rf_sync_component_caches(pm->raid);
    227  1.1    jld }
    228  1.1    jld 
    229  1.1    jld /* Mark all parity as being in need of rewrite. */
    230  1.1    jld void
    231  1.1    jld rf_paritymap_invalidate(struct rf_paritymap *pm)
    232  1.1    jld {
    233  1.1    jld 	mutex_enter(&pm->lock);
    234  1.1    jld 	memset(pm->disk_boot, ~(unsigned char)0,
    235  1.1    jld 	    sizeof(struct rf_paritymap_ondisk));
    236  1.1    jld 	mutex_exit(&pm->lock);
    237  1.1    jld }
    238  1.1    jld 
    239  1.1    jld /* Mark all parity as being correct. */
    240  1.1    jld void
    241  1.1    jld rf_paritymap_forceclean(struct rf_paritymap *pm)
    242  1.1    jld {
    243  1.1    jld 	mutex_enter(&pm->lock);
    244  1.1    jld 	memset(pm->disk_boot, (unsigned char)0,
    245  1.1    jld 	    sizeof(struct rf_paritymap_ondisk));
    246  1.1    jld 	mutex_exit(&pm->lock);
    247  1.1    jld }
    248  1.1    jld 
    249  1.1    jld /*
    250  1.1    jld  * The cooldown callout routine just defers its work to a thread; it can't do
    251  1.1    jld  * the parity map write itself as it would block, and although mutex-induced
    252  1.1    jld  * blocking is permitted it seems wise to avoid tying up the softint.
    253  1.1    jld  */
    254  1.1    jld static void
    255  1.1    jld rf_paritymap_tick(void *arg)
    256  1.1    jld {
    257  1.1    jld 	struct rf_paritymap *pm = arg;
    258  1.1    jld 
    259  1.1    jld 	mutex_enter(&pm->lk_flags);
    260  1.1    jld 	pm->flags |= TICKED;
    261  1.1    jld 	mutex_exit(&pm->lk_flags);
    262  1.1    jld 	wakeup(&(pm->raid->iodone)); /* XXX */
    263  1.1    jld }
    264  1.1    jld 
    265  1.1    jld /*
    266  1.1    jld  * This is where the parity cooling work (and rearming the callout if needed)
    267  1.1    jld  * is done; the raidio thread calls it when woken up, as by the above.
    268  1.1    jld  */
    269  1.1    jld void
    270  1.1    jld rf_paritymap_checkwork(struct rf_paritymap *pm)
    271  1.1    jld {
    272  1.1    jld 	int i, zerop, progressp;
    273  1.1    jld 
    274  1.1    jld 	mutex_enter(&pm->lk_flags);
    275  1.1    jld 	if (pm->flags & TICKED) {
    276  1.1    jld 		zerop = progressp = 0;
    277  1.1    jld 
    278  1.1    jld 		pm->flags &= ~TICKED;
    279  1.1    jld 		mutex_exit(&pm->lk_flags);
    280  1.1    jld 
    281  1.1    jld 		mutex_enter(&pm->lock);
    282  1.1    jld 		for (i = 0; i < RF_PARITYMAP_NREG; i++) {
    283  1.1    jld 			if (pm->current->state[i] < 0) {
    284  1.1    jld 				progressp = 1;
    285  1.1    jld 				pm->current->state[i]++;
    286  1.1    jld 				if (pm->current->state[i] == 0)
    287  1.1    jld 					zerop = 1;
    288  1.1    jld 			}
    289  1.1    jld 		}
    290  1.1    jld 
    291  1.1    jld 		if (progressp)
    292  1.1    jld 			callout_schedule(&pm->ticker,
    293  1.1    jld 			    mstohz(pm->params.tickms));
    294  1.1    jld 		else {
    295  1.1    jld 			mutex_enter(&pm->lk_flags);
    296  1.1    jld 			pm->flags &= ~TICKING;
    297  1.1    jld 			mutex_exit(&pm->lk_flags);
    298  1.1    jld 		}
    299  1.1    jld 
    300  1.1    jld 		if (zerop)
    301  1.1    jld 			rf_paritymap_write_locked(pm);
    302  1.1    jld 		mutex_exit(&pm->lock);
    303  1.1    jld 	} else
    304  1.1    jld 		mutex_exit(&pm->lk_flags);
    305  1.1    jld }
    306  1.1    jld 
    307  1.1    jld /*
    308  1.1    jld  * Set parity map parameters; used both to alter parameters on the fly and to
    309  1.1    jld  * establish their initial values.  Note that setting a parameter to 0 means
    310  1.1    jld  * to leave the previous setting unchanged, and that if this is done for the
    311  1.1    jld  * initial setting of "regions", then a default value will be computed based
    312  1.1    jld  * on the RAID component size.
    313  1.1    jld  */
    314  1.1    jld int
    315  1.1    jld rf_paritymap_set_params(struct rf_paritymap *pm,
    316  1.1    jld     const struct rf_pmparams *params, int todisk)
    317  1.1    jld {
    318  1.1    jld 	int cooldown, tickms;
    319  1.1    jld 	u_int regions;
    320  1.1    jld 	RF_RowCol_t col;
    321  1.1    jld 	RF_ComponentLabel_t *clabel;
    322  1.1    jld 	RF_Raid_t *raidPtr;
    323  1.1    jld 
    324  1.1    jld 	cooldown = params->cooldown != 0
    325  1.1    jld 	    ? params->cooldown : pm->params.cooldown;
    326  1.1    jld 	tickms = params->tickms != 0
    327  1.1    jld 	    ? params->tickms : pm->params.tickms;
    328  1.1    jld 	regions = params->regions != 0
    329  1.1    jld 	    ? params->regions : pm->params.regions;
    330  1.1    jld 
    331  1.1    jld 	if (cooldown < 1 || cooldown > 128) {
    332  1.1    jld 		printf("raid%d: cooldown %d out of range\n", pm->raid->raidid,
    333  1.1    jld 		    cooldown);
    334  1.1    jld 		return (-1);
    335  1.1    jld 	}
    336  1.1    jld 	if (tickms < 10) {
    337  1.1    jld 		printf("raid%d: tick time %dms out of range\n",
    338  1.1    jld 		    pm->raid->raidid, tickms);
    339  1.1    jld 		return (-1);
    340  1.1    jld 	}
    341  1.1    jld 	if (regions == 0) {
    342  1.1    jld 		regions = rf_paritymap_nreg(pm->raid);
    343  1.1    jld 	} else if (regions > RF_PARITYMAP_NREG) {
    344  1.1    jld 		printf("raid%d: region count %u too large (more than %u)\n",
    345  1.1    jld 		    pm->raid->raidid, regions, RF_PARITYMAP_NREG);
    346  1.1    jld 		return (-1);
    347  1.1    jld 	}
    348  1.1    jld 
    349  1.1    jld 	/* XXX any currently warm parity will be used with the new tickms! */
    350  1.1    jld 	pm->params.cooldown = cooldown;
    351  1.1    jld 	pm->params.tickms = tickms;
    352  1.1    jld 	/* Apply the initial region count, but do not change it after that. */
    353  1.1    jld 	if (pm->params.regions == 0)
    354  1.1    jld 		pm->params.regions = regions;
    355  1.1    jld 
    356  1.1    jld 	/* So that the newly set parameters can be tested: */
    357  1.1    jld 	pm->ctrs.nwrite = pm->ctrs.ncachesync = pm->ctrs.nclearing = 0;
    358  1.1    jld 
    359  1.1    jld 	if (todisk) {
    360  1.1    jld 		raidPtr = pm->raid;
    361  1.1    jld 		for (col = 0; col < raidPtr->numCol; col++) {
    362  1.1    jld 			clabel = raidget_component_label(raidPtr, col);
    363  1.1    jld 			clabel->parity_map_ntick = cooldown;
    364  1.1    jld 			clabel->parity_map_tickms = tickms;
    365  1.1    jld 			clabel->parity_map_regions = regions;
    366  1.1    jld 			raidflush_component_label(raidPtr, col);
    367  1.1    jld 		}
    368  1.1    jld 	}
    369  1.1    jld 	return 0;
    370  1.1    jld }
    371  1.1    jld 
    372  1.1    jld /*
    373  1.1    jld  * The number of regions may not be as many as can fit into the map, because
    374  1.1    jld  * when regions are too small, the overhead of setting parity map bits
    375  1.1    jld  * becomes significant in comparison to the actual I/O, while the
    376  1.1    jld  * corresponding gains in parity verification time become negligible.  Thus,
    377  1.1    jld  * a minimum region size (defined above) is imposed.
    378  1.1    jld  *
    379  1.1    jld  * Note that, if the number of regions is less than the maximum, then some of
    380  1.1    jld  * the regions will be "fictional", corresponding to no actual disk; some
    381  1.1    jld  * parts of the code may process them as normal, but they can not ever be
    382  1.1    jld  * written to.
    383  1.1    jld  */
    384  1.1    jld static u_int
    385  1.1    jld rf_paritymap_nreg(RF_Raid_t *raid)
    386  1.1    jld {
    387  1.1    jld 	daddr_t bytes_per_disk, nreg;
    388  1.1    jld 
    389  1.1    jld 	bytes_per_disk = raid->sectorsPerDisk << raid->logBytesPerSector;
    390  1.1    jld 	nreg = bytes_per_disk / REGION_MINSIZE;
    391  1.1    jld 	if (nreg > RF_PARITYMAP_NREG)
    392  1.1    jld 		nreg = RF_PARITYMAP_NREG;
    393  1.1    jld 
    394  1.1    jld 	return (u_int)nreg;
    395  1.1    jld }
    396  1.1    jld 
    397  1.1    jld /*
    398  1.1    jld  * Initialize a parity map given specific parameters.  This neither reads nor
    399  1.1    jld  * writes the parity map config in the component labels; for that, see below.
    400  1.1    jld  */
    401  1.1    jld int
    402  1.1    jld rf_paritymap_init(struct rf_paritymap *pm, RF_Raid_t *raid,
    403  1.1    jld     const struct rf_pmparams *params)
    404  1.1    jld {
    405  1.1    jld 	daddr_t rstripes;
    406  1.1    jld 	struct rf_pmparams safe;
    407  1.1    jld 
    408  1.1    jld 	pm->raid = raid;
    409  1.1    jld 	pm->params.regions = 0;
    410  1.1    jld 	if (0 != rf_paritymap_set_params(pm, params, 0)) {
    411  1.1    jld 		/*
    412  1.1    jld 		 * If the parameters are out-of-range, then bring the
    413  1.1    jld 		 * parity map up with something reasonable, so that
    414  1.1    jld 		 * the admin can at least go and fix it (or ignore it
    415  1.1    jld 		 * entirely).
    416  1.1    jld 		 */
    417  1.1    jld 		safe.cooldown = DFL_COOLDOWN;
    418  1.1    jld 		safe.tickms = DFL_TICKMS;
    419  1.1    jld 		safe.regions = 0;
    420  1.1    jld 
    421  1.1    jld 		if (0 != rf_paritymap_set_params(pm, &safe, 0))
    422  1.1    jld 			return (-1);
    423  1.1    jld 	}
    424  1.1    jld 
    425  1.1    jld 	rstripes = howmany(raid->Layout.numStripe, pm->params.regions);
    426  1.1    jld 	pm->region_size = rstripes * raid->Layout.dataSectorsPerStripe;
    427  1.1    jld 
    428  1.1    jld 	callout_init(&pm->ticker, CALLOUT_MPSAFE);
    429  1.1    jld 	callout_setfunc(&pm->ticker, rf_paritymap_tick, pm);
    430  1.1    jld 	pm->flags = 0;
    431  1.1    jld 
    432  1.1    jld 	pm->disk_boot = kmem_alloc(sizeof(struct rf_paritymap_ondisk),
    433  1.1    jld 	    KM_SLEEP);
    434  1.1    jld 	pm->disk_now = kmem_alloc(sizeof(struct rf_paritymap_ondisk),
    435  1.1    jld 	    KM_SLEEP);
    436  1.1    jld 	pm->current = kmem_zalloc(sizeof(struct rf_paritymap_current),
    437  1.1    jld 	    KM_SLEEP);
    438  1.1    jld 
    439  1.1    jld 	rf_paritymap_kern_read(pm->raid, pm->disk_boot);
    440  1.1    jld 	memcpy(pm->disk_now, pm->disk_boot, sizeof(*pm->disk_now));
    441  1.1    jld 
    442  1.1    jld 	mutex_init(&pm->lock, MUTEX_DEFAULT, IPL_NONE);
    443  1.1    jld 	mutex_init(&pm->lk_flags, MUTEX_DEFAULT, IPL_SOFTCLOCK);
    444  1.1    jld 
    445  1.1    jld 	return 0;
    446  1.1    jld }
    447  1.1    jld 
    448  1.1    jld /*
    449  1.1    jld  * Destroys a parity map; unless "force" is set, also cleans parity for any
    450  1.1    jld  * regions which were still in cooldown (but are not dirty on disk).
    451  1.1    jld  */
    452  1.1    jld void
    453  1.1    jld rf_paritymap_destroy(struct rf_paritymap *pm, int force)
    454  1.1    jld {
    455  1.1    jld 	int i;
    456  1.1    jld 
    457  1.1    jld 	callout_halt(&pm->ticker, NULL); /* XXX stop? halt? */
    458  1.1    jld 	callout_destroy(&pm->ticker);
    459  1.1    jld 
    460  1.1    jld 	if (!force) {
    461  1.1    jld 		for (i = 0; i < RF_PARITYMAP_NREG; i++) {
    462  1.1    jld 			/* XXX check for > 0 ? */
    463  1.1    jld 			if (pm->current->state[i] < 0)
    464  1.1    jld 				pm->current->state[i] = 0;
    465  1.1    jld 		}
    466  1.1    jld 
    467  1.1    jld 		rf_paritymap_write_locked(pm);
    468  1.1    jld 	}
    469  1.1    jld 
    470  1.1    jld 	mutex_destroy(&pm->lock);
    471  1.1    jld 	mutex_destroy(&pm->lk_flags);
    472  1.1    jld 
    473  1.1    jld 	kmem_free(pm->disk_boot, sizeof(struct rf_paritymap_ondisk));
    474  1.1    jld 	kmem_free(pm->disk_now, sizeof(struct rf_paritymap_ondisk));
    475  1.1    jld 	kmem_free(pm->current, sizeof(struct rf_paritymap_current));
    476  1.1    jld }
    477  1.1    jld 
    478  1.1    jld /*
    479  1.1    jld  * Rewrite parity, taking parity map into account; this is the equivalent of
    480  1.1    jld  * the old rf_RewriteParity, and is likewise to be called from a suitable
    481  1.1    jld  * thread and shouldn't have multiple copies running in parallel and so on.
    482  1.1    jld  *
    483  1.1    jld  * Note that the fictional regions are "cleaned" in one shot, so that very
    484  1.1    jld  * small RAIDs (useful for testing) will not experience potentially severe
    485  1.1    jld  * regressions in rewrite time.
    486  1.1    jld  */
    487  1.1    jld int
    488  1.1    jld rf_paritymap_rewrite(struct rf_paritymap *pm)
    489  1.1    jld {
    490  1.1    jld 	int i, ret_val = 0;
    491  1.1    jld 	daddr_t reg_b, reg_e;
    492  1.1    jld 
    493  1.1    jld 	/* Process only the actual regions. */
    494  1.1    jld 	for (i = 0; i < pm->params.regions; i++) {
    495  1.1    jld 		mutex_enter(&pm->lock);
    496  1.1    jld 		if (isset(pm->disk_boot->bits, i)) {
    497  1.1    jld 			mutex_exit(&pm->lock);
    498  1.1    jld 
    499  1.1    jld 			reg_b = i * pm->region_size;
    500  1.1    jld 			reg_e = reg_b + pm->region_size;
    501  1.1    jld 			if (reg_e > pm->raid->totalSectors)
    502  1.1    jld 				reg_e = pm->raid->totalSectors;
    503  1.1    jld 
    504  1.1    jld 			if (rf_RewriteParityRange(pm->raid, reg_b,
    505  1.1    jld 			    reg_e - reg_b)) {
    506  1.1    jld 				ret_val = 1;
    507  1.1    jld 				if (pm->raid->waitShutdown)
    508  1.1    jld 					return ret_val;
    509  1.1    jld 			} else {
    510  1.1    jld 				mutex_enter(&pm->lock);
    511  1.1    jld 				clrbit(pm->disk_boot->bits, i);
    512  1.1    jld 				rf_paritymap_write_locked(pm);
    513  1.1    jld 				mutex_exit(&pm->lock);
    514  1.1    jld 			}
    515  1.1    jld 		} else {
    516  1.1    jld 			mutex_exit(&pm->lock);
    517  1.1    jld 		}
    518  1.1    jld 	}
    519  1.1    jld 
    520  1.1    jld 	/* Now, clear the fictional regions, if any. */
    521  1.1    jld 	rf_paritymap_forceclean(pm);
    522  1.1    jld 	rf_paritymap_write(pm);
    523  1.1    jld 
    524  1.1    jld 	return ret_val;
    525  1.1    jld }
    526  1.1    jld 
    527  1.1    jld /*
    528  1.1    jld  * How to merge the on-disk parity maps when reading them in from the
    529  1.1    jld  * various components; returns whether they differ.  In the case that
    530  1.1    jld  * they do differ, sets *dst to the union of *dst and *src.
    531  1.1    jld  *
    532  1.1    jld  * In theory, it should be safe to take the intersection (or just pick
    533  1.1    jld  * a single component arbitrarily), but the paranoid approach costs
    534  1.1    jld  * little.
    535  1.1    jld  *
    536  1.1    jld  * Appropriate locking, if any, is the responsibility of the caller.
    537  1.1    jld  */
    538  1.1    jld int
    539  1.1    jld rf_paritymap_merge(struct rf_paritymap_ondisk *dst,
    540  1.1    jld     struct rf_paritymap_ondisk *src)
    541  1.1    jld {
    542  1.1    jld 	int i, discrep = 0;
    543  1.1    jld 
    544  1.1    jld 	for (i = 0; i < RF_PARITYMAP_NBYTE; i++) {
    545  1.1    jld 		if (dst->bits[i] != src->bits[i])
    546  1.1    jld 			discrep = 1;
    547  1.1    jld 		dst->bits[i] |= src->bits[i];
    548  1.1    jld 	}
    549  1.1    jld 
    550  1.1    jld 	return discrep;
    551  1.1    jld }
    552  1.1    jld 
    553  1.1    jld /*
    554  1.1    jld  * Detach a parity map from its RAID.  This is not meant to be applied except
    555  1.1    jld  * when unconfiguring the RAID after all I/O has been resolved, as otherwise
    556  1.1    jld  * an out-of-date parity map could be treated as current.
    557  1.1    jld  */
    558  1.1    jld void
    559  1.1    jld rf_paritymap_detach(RF_Raid_t *raidPtr)
    560  1.1    jld {
    561  1.1    jld 	if (raidPtr->parity_map == NULL)
    562  1.1    jld 		return;
    563  1.1    jld 
    564  1.1    jld 	simple_lock(&(raidPtr->iodone_lock));
    565  1.1    jld 	struct rf_paritymap *pm = raidPtr->parity_map;
    566  1.1    jld 	raidPtr->parity_map = NULL;
    567  1.1    jld 	simple_unlock(&(raidPtr->iodone_lock));
    568  1.1    jld 	/* XXXjld is that enough locking?  Or too much? */
    569  1.1    jld 	rf_paritymap_destroy(pm, 0);
    570  1.1    jld 	kmem_free(pm, sizeof(*pm));
    571  1.1    jld }
    572  1.1    jld 
    573  1.1    jld /*
    574  1.1    jld  * Attach a parity map to a RAID set if appropriate.  Includes
    575  1.1    jld  * configure-time processing of parity-map fields of component label.
    576  1.1    jld  */
    577  1.1    jld void
    578  1.1    jld rf_paritymap_attach(RF_Raid_t *raidPtr, int force)
    579  1.1    jld {
    580  1.1    jld 	RF_RowCol_t col;
    581  1.1    jld 	int pm_use, pm_zap;
    582  1.1    jld 	int g_tickms, g_ntick, g_regions;
    583  1.1    jld 	int good;
    584  1.1    jld 	RF_ComponentLabel_t *clabel;
    585  1.1    jld 	u_int flags, regions;
    586  1.1    jld 	struct rf_pmparams params;
    587  1.1    jld 
    588  1.1    jld 	if (raidPtr->Layout.map->faultsTolerated == 0) {
    589  1.1    jld 		/* There isn't any parity. */
    590  1.1    jld 		return;
    591  1.1    jld 	}
    592  1.1    jld 
    593  1.1    jld 	pm_use = 1;
    594  1.1    jld 	pm_zap = 0;
    595  1.1    jld 	g_tickms = DFL_TICKMS;
    596  1.1    jld 	g_ntick = DFL_COOLDOWN;
    597  1.1    jld 	g_regions = 0;
    598  1.1    jld 
    599  1.1    jld 	/*
    600  1.1    jld 	 * Collect opinions on the set config.  If this is the initial
    601  1.1    jld 	 * config (raidctl -C), treat all labels as invalid, since
    602  1.1    jld 	 * there may be random data present.
    603  1.1    jld 	 */
    604  1.1    jld 	if (!force) {
    605  1.1    jld 		for (col = 0; col < raidPtr->numCol; col++) {
    606  1.1    jld 			clabel = raidget_component_label(raidPtr, col);
    607  1.1    jld 			flags = clabel->parity_map_flags;
    608  1.1    jld 			/* Check for use by non-parity-map kernel. */
    609  1.1    jld 			if (clabel->parity_map_modcount
    610  1.1    jld 			    != clabel->mod_counter) {
    611  1.1    jld 				flags &= ~RF_PMLABEL_WASUSED;
    612  1.1    jld 			}
    613  1.1    jld 
    614  1.1    jld 			if (flags & RF_PMLABEL_VALID) {
    615  1.1    jld 				g_tickms = clabel->parity_map_tickms;
    616  1.1    jld 				g_ntick = clabel->parity_map_ntick;
    617  1.1    jld 				regions = clabel->parity_map_regions;
    618  1.1    jld 				if (g_regions == 0)
    619  1.1    jld 					g_regions = regions;
    620  1.1    jld 				else if (g_regions != regions) {
    621  1.1    jld 					pm_zap = 1; /* important! */
    622  1.1    jld 				}
    623  1.1    jld 
    624  1.1    jld 				if (flags & RF_PMLABEL_DISABLE) {
    625  1.1    jld 					pm_use = 0;
    626  1.1    jld 				}
    627  1.1    jld 				if (!(flags & RF_PMLABEL_WASUSED)) {
    628  1.1    jld 					pm_zap = 1;
    629  1.1    jld 				}
    630  1.1    jld 			} else {
    631  1.1    jld 				pm_zap = 1;
    632  1.1    jld 			}
    633  1.1    jld 		}
    634  1.1    jld 	} else {
    635  1.1    jld 		pm_zap = 1;
    636  1.1    jld 	}
    637  1.1    jld 
    638  1.1    jld 	/* Finally, create and attach the parity map. */
    639  1.1    jld 	if (pm_use) {
    640  1.1    jld 		params.cooldown = g_ntick;
    641  1.1    jld 		params.tickms = g_tickms;
    642  1.1    jld 		params.regions = g_regions;
    643  1.1    jld 
    644  1.1    jld 		raidPtr->parity_map = kmem_alloc(sizeof(struct rf_paritymap),
    645  1.1    jld 		    KM_SLEEP);
    646  1.1    jld 		if (0 != rf_paritymap_init(raidPtr->parity_map, raidPtr,
    647  1.1    jld 			&params)) {
    648  1.1    jld 			/* It failed; do without. */
    649  1.1    jld 			kmem_free(raidPtr->parity_map,
    650  1.1    jld 			    sizeof(struct rf_paritymap));
    651  1.1    jld 			raidPtr->parity_map = NULL;
    652  1.1    jld 			return;
    653  1.1    jld 		}
    654  1.1    jld 
    655  1.1    jld 		if (g_regions == 0)
    656  1.1    jld 			/* Pick up the autoconfigured region count. */
    657  1.1    jld 			g_regions = raidPtr->parity_map->params.regions;
    658  1.1    jld 
    659  1.1    jld 		if (pm_zap) {
    660  1.1    jld 			good = raidPtr->parity_good && !force;
    661  1.1    jld 
    662  1.1    jld 			if (good)
    663  1.1    jld 				rf_paritymap_forceclean(raidPtr->parity_map);
    664  1.1    jld 			else
    665  1.1    jld 				rf_paritymap_invalidate(raidPtr->parity_map);
    666  1.1    jld 			/* This needs to be on disk before WASUSED is set. */
    667  1.1    jld 			rf_paritymap_write(raidPtr->parity_map);
    668  1.1    jld 		}
    669  1.1    jld 	}
    670  1.1    jld 
    671  1.1    jld 	/* Alter labels in-core to reflect the current view of things. */
    672  1.1    jld 	for (col = 0; col < raidPtr->numCol; col++) {
    673  1.1    jld 		clabel = raidget_component_label(raidPtr, col);
    674  1.1    jld 
    675  1.1    jld 		if (pm_use)
    676  1.1    jld 			flags = RF_PMLABEL_VALID | RF_PMLABEL_WASUSED;
    677  1.1    jld 		else
    678  1.1    jld 			flags = RF_PMLABEL_VALID | RF_PMLABEL_DISABLE;
    679  1.1    jld 
    680  1.1    jld 		clabel->parity_map_flags = flags;
    681  1.1    jld 		clabel->parity_map_tickms = g_tickms;
    682  1.1    jld 		clabel->parity_map_ntick = g_ntick;
    683  1.1    jld 		clabel->parity_map_regions = g_regions;
    684  1.1    jld 		raidflush_component_label(raidPtr, col);
    685  1.1    jld 	}
    686  1.1    jld }
    687  1.1    jld 
    688  1.1    jld /*
    689  1.1    jld  * For initializing the parity-map fields of a component label, both on
    690  1.1    jld  * initial creation and on reconstruct/copyback/etc.
    691  1.1    jld  */
    692  1.1    jld void
    693  1.1    jld rf_paritymap_init_label(struct rf_paritymap *pm, RF_ComponentLabel_t *clabel)
    694  1.1    jld {
    695  1.1    jld 	if (pm != NULL) {
    696  1.1    jld 		clabel->parity_map_flags =
    697  1.1    jld 		    RF_PMLABEL_VALID | RF_PMLABEL_WASUSED;
    698  1.1    jld 		clabel->parity_map_tickms = pm->params.tickms;
    699  1.1    jld 		clabel->parity_map_ntick = pm->params.cooldown;
    700  1.1    jld 		/*
    701  1.1    jld 		 * XXXjld: If the number of regions is changed on disk, and
    702  1.1    jld 		 * then a new component is labeled before the next configure,
    703  1.1    jld 		 * then it will get the old value and they will conflict on
    704  1.1    jld 		 * the next boot (and the default will be used instead).
    705  1.1    jld 		 */
    706  1.1    jld 		clabel->parity_map_regions = pm->params.regions;
    707  1.1    jld 	} else {
    708  1.1    jld 		/*
    709  1.1    jld 		 * XXXjld: if the map is disabled, and all the components are
    710  1.1    jld 		 * replaced without an intervening unconfigure/reconfigure,
    711  1.1    jld 		 * then it will become enabled on the next unconfig/reconfig.
    712  1.1    jld 		 */
    713  1.1    jld 	}
    714  1.1    jld }
    715  1.1    jld 
    716  1.1    jld 
    717  1.1    jld /* Will the parity map be disabled next time? */
    718  1.1    jld int
    719  1.1    jld rf_paritymap_get_disable(RF_Raid_t *raidPtr)
    720  1.1    jld {
    721  1.1    jld 	RF_ComponentLabel_t *clabel;
    722  1.1    jld 	RF_RowCol_t col;
    723  1.1    jld 	int dis;
    724  1.1    jld 
    725  1.1    jld 	dis = 0;
    726  1.1    jld 	for (col = 0; col < raidPtr->numCol; col++) {
    727  1.1    jld 		clabel = raidget_component_label(raidPtr, col);
    728  1.1    jld 		if (clabel->parity_map_flags & RF_PMLABEL_DISABLE)
    729  1.1    jld 			dis = 1;
    730  1.1    jld 	}
    731  1.1    jld 
    732  1.1    jld 	return dis;
    733  1.1    jld }
    734  1.1    jld 
    735  1.1    jld /* Set whether the parity map will be disabled next time. */
    736  1.1    jld void
    737  1.1    jld rf_paritymap_set_disable(RF_Raid_t *raidPtr, int dis)
    738  1.1    jld {
    739  1.1    jld 	RF_ComponentLabel_t *clabel;
    740  1.1    jld 	RF_RowCol_t col;
    741  1.1    jld 
    742  1.1    jld 	for (col = 0; col < raidPtr->numCol; col++) {
    743  1.1    jld 		clabel = raidget_component_label(raidPtr, col);
    744  1.1    jld 		if (dis)
    745  1.1    jld 			clabel->parity_map_flags |= RF_PMLABEL_DISABLE;
    746  1.1    jld 		else
    747  1.1    jld 			clabel->parity_map_flags &= ~RF_PMLABEL_DISABLE;
    748  1.1    jld 		raidflush_component_label(raidPtr, col);
    749  1.1    jld 	}
    750  1.1    jld }
    751