Home | History | Annotate | Line # | Download | only in raidframe
rf_paritymap.c revision 1.10
      1  1.10  christos /* $NetBSD: rf_paritymap.c,v 1.10 2020/09/27 21:39:08 christos Exp $ */
      2   1.1       jld 
      3   1.1       jld /*-
      4   1.1       jld  * Copyright (c) 2009 Jed Davis.
      5   1.1       jld  * All rights reserved.
      6   1.1       jld  *
      7   1.1       jld  * Redistribution and use in source and binary forms, with or without
      8   1.1       jld  * modification, are permitted provided that the following conditions
      9   1.1       jld  * are met:
     10   1.1       jld  * 1. Redistributions of source code must retain the above copyright
     11   1.1       jld  *    notice, this list of conditions and the following disclaimer.
     12   1.1       jld  * 2. Redistributions in binary form must reproduce the above copyright
     13   1.1       jld  *    notice, this list of conditions and the following disclaimer in the
     14   1.1       jld  *    documentation and/or other materials provided with the distribution.
     15   1.1       jld  *
     16   1.1       jld  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     17   1.1       jld  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     18   1.1       jld  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     19   1.1       jld  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     20   1.1       jld  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     21   1.1       jld  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     22   1.1       jld  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     23   1.1       jld  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     24   1.1       jld  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     25   1.1       jld  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     26   1.1       jld  * POSSIBILITY OF SUCH DAMAGE.
     27   1.1       jld  */
     28   1.1       jld 
     29   1.1       jld #include <sys/cdefs.h>
     30  1.10  christos __KERNEL_RCSID(0, "$NetBSD: rf_paritymap.c,v 1.10 2020/09/27 21:39:08 christos Exp $");
     31   1.1       jld 
     32   1.3     pooka #include <sys/param.h>
     33   1.1       jld #include <sys/callout.h>
     34   1.1       jld #include <sys/kmem.h>
     35   1.1       jld #include <sys/mutex.h>
     36   1.1       jld #include <sys/rwlock.h>
     37   1.1       jld #include <sys/systm.h>
     38   1.1       jld #include <sys/types.h>
     39   1.1       jld 
     40   1.1       jld #include <dev/raidframe/rf_paritymap.h>
     41   1.1       jld #include <dev/raidframe/rf_stripelocks.h>
     42   1.1       jld #include <dev/raidframe/rf_layout.h>
     43   1.1       jld #include <dev/raidframe/rf_raid.h>
     44   1.1       jld #include <dev/raidframe/rf_parityscan.h>
     45   1.1       jld #include <dev/raidframe/rf_kintf.h>
     46   1.1       jld 
     47   1.1       jld /* Important parameters: minimum region size (bytes), default tick period (ms), default cooldown (ticks). */
     48   1.1       jld #define REGION_MINSIZE (25ULL << 20) /* 25 MiB; rf_paritymap_nreg() divides the per-disk byte count by this */
     49   1.1       jld #define DFL_TICKMS      40000 /* milliseconds; converted with mstohz() when the ticker is scheduled */
     50   1.1       jld #define DFL_COOLDOWN    8     /* 7-8 intervals of 40s = 5min +/- 20s */
     51   1.1       jld 
     52   1.1       jld /* Internal-use flag bits, kept in pm->flags and accessed under pm->lk_flags. */
     53   1.1       jld #define TICKING 1 /* set when the ticker callout is first scheduled; cleared by rf_paritymap_checkwork() once no region is cooling */
     54   1.1       jld #define TICKED 2 /* set by rf_paritymap_tick(); consumed by rf_paritymap_checkwork() */
     55   1.1       jld 
     56   1.1       jld /* Prototypes for the file-local helpers defined further down. */
     57   1.1       jld static void rf_paritymap_write_locked(struct rf_paritymap *);
     58   1.1       jld static void rf_paritymap_tick(void *);
     59   1.1       jld static u_int rf_paritymap_nreg(RF_Raid_t *);
     60   1.1       jld 
     61   1.1       jld /* Extract the current status of the parity map. */
     62   1.1       jld void
     63   1.1       jld rf_paritymap_status(struct rf_paritymap *pm, struct rf_pmstat *ps)
     64   1.1       jld {
     65   1.1       jld 	memset(ps, 0, sizeof(*ps));
     66   1.1       jld 	if (pm == NULL)
     67   1.1       jld 		ps->enabled = 0;
     68   1.1       jld 	else {
     69   1.1       jld 		ps->enabled = 1;
     70   1.1       jld 		ps->region_size = pm->region_size;
     71   1.1       jld 		mutex_enter(&pm->lock);
     72   1.1       jld 		memcpy(&ps->params, &pm->params, sizeof(ps->params));
     73   1.1       jld 		memcpy(ps->dirty, pm->disk_now, sizeof(ps->dirty));
     74   1.1       jld 		memcpy(&ps->ctrs, &pm->ctrs, sizeof(ps->ctrs));
     75   1.1       jld 		mutex_exit(&pm->lock);
     76   1.1       jld 	}
     77   1.1       jld }
     78   1.1       jld 
     79   1.1       jld /*
     80   1.1       jld  * Test whether parity in a given sector is suspected of being inconsistent
     81   1.1       jld  * on disk (assuming that any pending I/O to it is allowed to complete).
     82   1.1       jld  * This may be of interest to future work on parity scrubbing.
     83   1.1       jld  */
     84   1.1       jld int
     85   1.1       jld rf_paritymap_test(struct rf_paritymap *pm, daddr_t sector)
     86   1.1       jld {
     87   1.1       jld 	unsigned region = sector / pm->region_size;
     88   1.1       jld 	int retval;
     89   1.1       jld 
     90   1.1       jld 	mutex_enter(&pm->lock);
     91   1.1       jld 	retval = isset(pm->disk_boot->bits, region) ? 1 : 0;
     92   1.1       jld 	mutex_exit(&pm->lock);
     93   1.1       jld 	return retval;
     94   1.1       jld }
     95   1.1       jld 
     96   1.1       jld /* To be called before a write to the RAID is submitted. */
     97   1.1       jld void
     98   1.1       jld rf_paritymap_begin(struct rf_paritymap *pm, daddr_t offset, daddr_t size)
     99   1.1       jld {
    100   1.1       jld 	unsigned i, b, e;
    101   1.1       jld 
    102   1.1       jld 	b = offset / pm->region_size;
    103   1.1       jld 	e = (offset + size - 1) / pm->region_size;
    104   1.1       jld 
    105   1.1       jld 	for (i = b; i <= e; i++)
    106   1.1       jld 		rf_paritymap_begin_region(pm, i);
    107   1.1       jld }
    108   1.1       jld 
    109   1.1       jld /* To be called after a write to the RAID completes. */
    110   1.1       jld void
    111   1.1       jld rf_paritymap_end(struct rf_paritymap *pm, daddr_t offset, daddr_t size)
    112   1.1       jld {
    113   1.1       jld 	unsigned i, b, e;
    114   1.1       jld 
    115   1.1       jld 	b = offset / pm->region_size;
    116   1.1       jld 	e = (offset + size - 1) / pm->region_size;
    117   1.1       jld 
    118   1.1       jld 	for (i = b; i <= e; i++)
    119   1.1       jld 		rf_paritymap_end_region(pm, i);
    120   1.1       jld }
    121   1.1       jld 
    122   1.1       jld void
    123   1.1       jld rf_paritymap_begin_region(struct rf_paritymap *pm, unsigned region)
    124   1.1       jld {
    125   1.1       jld 	int needs_write;
    126   1.1       jld 
    127   1.1       jld 	KASSERT(region < RF_PARITYMAP_NREG);
    128   1.1       jld 	pm->ctrs.nwrite++;
    129   1.1       jld 
    130   1.1       jld 	/* If it was being kept warm, deal with that. */
    131   1.1       jld 	mutex_enter(&pm->lock);
    132   1.1       jld 	if (pm->current->state[region] < 0)
    133   1.1       jld 		pm->current->state[region] = 0;
    134   1.1       jld 
    135   1.1       jld 	/* This shouldn't happen unless RAIDOUTSTANDING is set too high. */
    136   1.1       jld 	KASSERT(pm->current->state[region] < 127);
    137   1.1       jld 	pm->current->state[region]++;
    138   1.1       jld 
    139   1.1       jld 	needs_write = isclr(pm->disk_now->bits, region);
    140   1.1       jld 
    141   1.1       jld 	if (needs_write) {
    142   1.1       jld 		KASSERT(pm->current->state[region] == 1);
    143   1.1       jld 		rf_paritymap_write_locked(pm);
    144   1.1       jld 	}
    145   1.1       jld 
    146   1.1       jld 	mutex_exit(&pm->lock);
    147   1.1       jld }
    148   1.1       jld 
    149   1.1       jld void
    150   1.1       jld rf_paritymap_end_region(struct rf_paritymap *pm, unsigned region)
    151   1.1       jld {
    152   1.1       jld 	KASSERT(region < RF_PARITYMAP_NREG);
    153   1.1       jld 
    154   1.1       jld 	mutex_enter(&pm->lock);
    155   1.1       jld 	KASSERT(pm->current->state[region] > 0);
    156   1.1       jld 	--pm->current->state[region];
    157   1.1       jld 
    158   1.1       jld 	if (pm->current->state[region] <= 0) {
    159   1.1       jld 		pm->current->state[region] = -pm->params.cooldown;
    160   1.1       jld 		KASSERT(pm->current->state[region] <= 0);
    161   1.1       jld 		mutex_enter(&pm->lk_flags);
    162   1.1       jld 		if (!(pm->flags & TICKING)) {
    163   1.1       jld 			pm->flags |= TICKING;
    164   1.1       jld 			mutex_exit(&pm->lk_flags);
    165   1.1       jld 			callout_schedule(&pm->ticker,
    166   1.1       jld 			    mstohz(pm->params.tickms));
    167   1.1       jld 		} else
    168   1.1       jld 			mutex_exit(&pm->lk_flags);
    169   1.1       jld 	}
    170   1.1       jld 	mutex_exit(&pm->lock);
    171   1.1       jld }
    172   1.1       jld 
    173   1.1       jld /*
    174   1.1       jld  * Updates the parity map to account for any changes in current activity
    175   1.1       jld  * and/or an ongoing parity scan, then writes it to disk with appropriate
    176   1.1       jld  * synchronization.
    177   1.1       jld  */
    178   1.1       jld void
    179   1.1       jld rf_paritymap_write(struct rf_paritymap *pm)
    180   1.1       jld {
    181   1.1       jld 	mutex_enter(&pm->lock);
    182   1.1       jld 	rf_paritymap_write_locked(pm);
    183   1.1       jld 	mutex_exit(&pm->lock);
    184   1.1       jld }
    185   1.1       jld 
    186   1.1       jld /* As above, but to be used when pm->lock is already held. */
    187   1.1       jld static void
    188   1.1       jld rf_paritymap_write_locked(struct rf_paritymap *pm)
    189   1.1       jld {
    190   1.1       jld 	char w, w0;
    191   1.1       jld 	int i, j, setting, clearing;
    192   1.1       jld 
    193   1.1       jld 	setting = clearing = 0;
    194   1.1       jld 	for (i = 0; i < RF_PARITYMAP_NBYTE; i++) {
    195   1.1       jld 		w0 = pm->disk_now->bits[i];
    196   1.1       jld 		w = pm->disk_boot->bits[i];
    197   1.1       jld 
    198   1.1       jld 		for (j = 0; j < NBBY; j++)
    199   1.1       jld 			if (pm->current->state[i * NBBY + j] != 0)
    200   1.1       jld 				w |= 1 << j;
    201   1.1       jld 
    202   1.1       jld 		if (w & ~w0)
    203   1.1       jld 			setting = 1;
    204   1.1       jld 		if (w0 & ~w)
    205   1.1       jld 			clearing = 1;
    206   1.1       jld 
    207   1.1       jld 		pm->disk_now->bits[i] = w;
    208   1.1       jld 	}
    209   1.1       jld 	pm->ctrs.ncachesync += setting + clearing;
    210   1.1       jld 	pm->ctrs.nclearing += clearing;
    211   1.1       jld 
    212   1.1       jld 	/*
    213   1.1       jld 	 * If bits are being set in the parity map, then a sync is
    214   1.1       jld 	 * required afterwards, so that the regions are marked dirty
    215   1.1       jld 	 * on disk before any writes to them take place.  If bits are
    216   1.1       jld 	 * being cleared, then a sync is required before the write, so
    217   1.1       jld 	 * that any writes to those regions are processed before the
    218   1.1       jld 	 * region is marked clean.  (Synchronization is somewhat
    219   1.1       jld 	 * overkill; a write ordering barrier would suffice, but we
    220   1.1       jld 	 * currently have no way to express that directly.)
    221   1.1       jld 	 */
    222   1.1       jld 	if (clearing)
    223  1.10  christos 		rf_sync_component_caches(pm->raid, 1);
    224   1.1       jld 	rf_paritymap_kern_write(pm->raid, pm->disk_now);
    225   1.1       jld 	if (setting)
    226  1.10  christos 		rf_sync_component_caches(pm->raid, 1);
    227   1.1       jld }
    228   1.1       jld 
    229   1.1       jld /* Mark all parity as being in need of rewrite. */
    230   1.1       jld void
    231   1.1       jld rf_paritymap_invalidate(struct rf_paritymap *pm)
    232   1.1       jld {
    233   1.1       jld 	mutex_enter(&pm->lock);
    234   1.9  christos 	memset(pm->disk_boot, (unsigned char)~0, sizeof(*pm->disk_boot));
    235   1.1       jld 	mutex_exit(&pm->lock);
    236   1.1       jld }
    237   1.1       jld 
    238   1.1       jld /* Mark all parity as being correct. */
    239   1.1       jld void
    240   1.1       jld rf_paritymap_forceclean(struct rf_paritymap *pm)
    241   1.1       jld {
    242   1.1       jld 	mutex_enter(&pm->lock);
    243   1.9  christos 	memset(pm->disk_boot, 0, sizeof(*pm->disk_boot));
    244   1.1       jld 	mutex_exit(&pm->lock);
    245   1.1       jld }
    246   1.1       jld 
    247   1.1       jld /*
    248   1.1       jld  * The cooldown callout routine just defers its work to a thread; it can't do
    249   1.1       jld  * the parity map write itself as it would block, and although mutex-induced
    250   1.1       jld  * blocking is permitted it seems wise to avoid tying up the softint.
    251   1.1       jld  */
    252   1.1       jld static void
    253   1.1       jld rf_paritymap_tick(void *arg)
    254   1.1       jld {
    255   1.1       jld 	struct rf_paritymap *pm = arg;
    256   1.1       jld 
    257   1.1       jld 	mutex_enter(&pm->lk_flags);
    258   1.1       jld 	pm->flags |= TICKED;
    259   1.1       jld 	mutex_exit(&pm->lk_flags);
    260   1.7       mrg 
    261   1.8       mrg 	rf_lock_mutex2(pm->raid->iodone_lock);
    262   1.8       mrg 	rf_signal_cond2(pm->raid->iodone_cv); /* XXX */
    263   1.8       mrg 	rf_unlock_mutex2(pm->raid->iodone_lock);
    264   1.1       jld }
    265   1.1       jld 
    266   1.1       jld /*
    267   1.1       jld  * This is where the parity cooling work (and rearming the callout if needed)
    268   1.1       jld  * is done; the raidio thread calls it when woken up, as by the above.
    269   1.1       jld  */
    270   1.1       jld void
    271   1.1       jld rf_paritymap_checkwork(struct rf_paritymap *pm)
    272   1.1       jld {
    273   1.1       jld 	int i, zerop, progressp;
    274   1.1       jld 
    275   1.1       jld 	mutex_enter(&pm->lk_flags);
    276   1.1       jld 	if (pm->flags & TICKED) {
    277   1.1       jld 		zerop = progressp = 0;
    278   1.1       jld 
    279   1.1       jld 		pm->flags &= ~TICKED;
    280   1.1       jld 		mutex_exit(&pm->lk_flags);
    281   1.1       jld 
    282   1.1       jld 		mutex_enter(&pm->lock);
    283   1.1       jld 		for (i = 0; i < RF_PARITYMAP_NREG; i++) {
    284   1.1       jld 			if (pm->current->state[i] < 0) {
    285   1.1       jld 				progressp = 1;
    286   1.1       jld 				pm->current->state[i]++;
    287   1.1       jld 				if (pm->current->state[i] == 0)
    288   1.1       jld 					zerop = 1;
    289   1.1       jld 			}
    290   1.1       jld 		}
    291   1.1       jld 
    292   1.1       jld 		if (progressp)
    293   1.1       jld 			callout_schedule(&pm->ticker,
    294   1.1       jld 			    mstohz(pm->params.tickms));
    295   1.1       jld 		else {
    296   1.1       jld 			mutex_enter(&pm->lk_flags);
    297   1.1       jld 			pm->flags &= ~TICKING;
    298   1.1       jld 			mutex_exit(&pm->lk_flags);
    299   1.1       jld 		}
    300   1.1       jld 
    301   1.1       jld 		if (zerop)
    302   1.1       jld 			rf_paritymap_write_locked(pm);
    303   1.1       jld 		mutex_exit(&pm->lock);
    304   1.1       jld 	} else
    305   1.1       jld 		mutex_exit(&pm->lk_flags);
    306   1.1       jld }
    307   1.1       jld 
    308   1.1       jld /*
    309   1.1       jld  * Set parity map parameters; used both to alter parameters on the fly and to
    310   1.1       jld  * establish their initial values.  Note that setting a parameter to 0 means
    311   1.1       jld  * to leave the previous setting unchanged, and that if this is done for the
    312   1.1       jld  * initial setting of "regions", then a default value will be computed based
    313   1.1       jld  * on the RAID component size.
    314   1.1       jld  */
    315   1.1       jld int
    316   1.1       jld rf_paritymap_set_params(struct rf_paritymap *pm,
    317   1.1       jld     const struct rf_pmparams *params, int todisk)
    318   1.1       jld {
    319   1.1       jld 	int cooldown, tickms;
    320   1.1       jld 	u_int regions;
    321   1.1       jld 	RF_RowCol_t col;
    322   1.1       jld 	RF_ComponentLabel_t *clabel;
    323   1.1       jld 	RF_Raid_t *raidPtr;
    324   1.1       jld 
    325   1.1       jld 	cooldown = params->cooldown != 0
    326   1.1       jld 	    ? params->cooldown : pm->params.cooldown;
    327   1.1       jld 	tickms = params->tickms != 0
    328   1.1       jld 	    ? params->tickms : pm->params.tickms;
    329   1.1       jld 	regions = params->regions != 0
    330   1.1       jld 	    ? params->regions : pm->params.regions;
    331   1.1       jld 
    332   1.1       jld 	if (cooldown < 1 || cooldown > 128) {
    333   1.1       jld 		printf("raid%d: cooldown %d out of range\n", pm->raid->raidid,
    334   1.1       jld 		    cooldown);
    335   1.1       jld 		return (-1);
    336   1.1       jld 	}
    337   1.1       jld 	if (tickms < 10) {
    338   1.1       jld 		printf("raid%d: tick time %dms out of range\n",
    339   1.1       jld 		    pm->raid->raidid, tickms);
    340   1.1       jld 		return (-1);
    341   1.1       jld 	}
    342   1.1       jld 	if (regions == 0) {
    343   1.1       jld 		regions = rf_paritymap_nreg(pm->raid);
    344   1.1       jld 	} else if (regions > RF_PARITYMAP_NREG) {
    345   1.1       jld 		printf("raid%d: region count %u too large (more than %u)\n",
    346   1.1       jld 		    pm->raid->raidid, regions, RF_PARITYMAP_NREG);
    347   1.1       jld 		return (-1);
    348   1.1       jld 	}
    349   1.1       jld 
    350   1.1       jld 	/* XXX any currently warm parity will be used with the new tickms! */
    351   1.1       jld 	pm->params.cooldown = cooldown;
    352   1.1       jld 	pm->params.tickms = tickms;
    353   1.1       jld 	/* Apply the initial region count, but do not change it after that. */
    354   1.1       jld 	if (pm->params.regions == 0)
    355   1.1       jld 		pm->params.regions = regions;
    356   1.1       jld 
    357   1.1       jld 	/* So that the newly set parameters can be tested: */
    358   1.1       jld 	pm->ctrs.nwrite = pm->ctrs.ncachesync = pm->ctrs.nclearing = 0;
    359   1.1       jld 
    360   1.1       jld 	if (todisk) {
    361   1.1       jld 		raidPtr = pm->raid;
    362   1.1       jld 		for (col = 0; col < raidPtr->numCol; col++) {
    363   1.4     oster 			if (RF_DEAD_DISK(raidPtr->Disks[col].status))
    364   1.4     oster 				continue;
    365   1.4     oster 
    366   1.1       jld 			clabel = raidget_component_label(raidPtr, col);
    367   1.1       jld 			clabel->parity_map_ntick = cooldown;
    368   1.1       jld 			clabel->parity_map_tickms = tickms;
    369   1.1       jld 			clabel->parity_map_regions = regions;
    370   1.4     oster 
    371   1.4     oster 			/* Don't touch the disk if it's been spared */
    372   1.4     oster 			if (clabel->status == rf_ds_spared)
    373   1.4     oster 				continue;
    374   1.4     oster 
    375   1.1       jld 			raidflush_component_label(raidPtr, col);
    376   1.1       jld 		}
    377   1.4     oster 
    378   1.4     oster 		/* handle the spares too... */
    379   1.4     oster 		for (col = 0; col < raidPtr->numSpare; col++) {
    380   1.4     oster 			if (raidPtr->Disks[raidPtr->numCol+col].status == rf_ds_used_spare) {
    381   1.4     oster 				clabel = raidget_component_label(raidPtr, raidPtr->numCol+col);
    382   1.4     oster 				clabel->parity_map_ntick = cooldown;
    383   1.4     oster 				clabel->parity_map_tickms = tickms;
    384   1.4     oster 				clabel->parity_map_regions = regions;
    385   1.4     oster 				raidflush_component_label(raidPtr, raidPtr->numCol+col);
    386   1.4     oster 			}
    387   1.4     oster 		}
    388   1.1       jld 	}
    389   1.1       jld 	return 0;
    390   1.1       jld }
    391   1.1       jld 
    392   1.1       jld /*
    393   1.1       jld  * The number of regions may not be as many as can fit into the map, because
    394   1.1       jld  * when regions are too small, the overhead of setting parity map bits
    395   1.1       jld  * becomes significant in comparison to the actual I/O, while the
    396   1.1       jld  * corresponding gains in parity verification time become negligible.  Thus,
    397   1.1       jld  * a minimum region size (defined above) is imposed.
    398   1.1       jld  *
    399   1.1       jld  * Note that, if the number of regions is less than the maximum, then some of
    400   1.1       jld  * the regions will be "fictional", corresponding to no actual disk; some
    401   1.1       jld  * parts of the code may process them as normal, but they can not ever be
    402   1.1       jld  * written to.
    403   1.1       jld  */
    404   1.1       jld static u_int
    405   1.1       jld rf_paritymap_nreg(RF_Raid_t *raid)
    406   1.1       jld {
    407   1.1       jld 	daddr_t bytes_per_disk, nreg;
    408   1.1       jld 
    409   1.1       jld 	bytes_per_disk = raid->sectorsPerDisk << raid->logBytesPerSector;
    410   1.1       jld 	nreg = bytes_per_disk / REGION_MINSIZE;
    411   1.1       jld 	if (nreg > RF_PARITYMAP_NREG)
    412   1.1       jld 		nreg = RF_PARITYMAP_NREG;
    413   1.6       riz 	if (nreg < 1)
    414   1.6       riz 		nreg = 1;
    415   1.1       jld 
    416   1.1       jld 	return (u_int)nreg;
    417   1.1       jld }
    418   1.1       jld 
    419   1.1       jld /*
    420   1.1       jld  * Initialize a parity map given specific parameters.  This neither reads nor
    421   1.1       jld  * writes the parity map config in the component labels; for that, see below.
    422   1.1       jld  */
    423   1.1       jld int
    424   1.1       jld rf_paritymap_init(struct rf_paritymap *pm, RF_Raid_t *raid,
    425   1.1       jld     const struct rf_pmparams *params)
    426   1.1       jld {
    427   1.1       jld 	daddr_t rstripes;
    428   1.1       jld 	struct rf_pmparams safe;
    429   1.1       jld 
    430   1.1       jld 	pm->raid = raid;
    431   1.1       jld 	pm->params.regions = 0;
    432   1.1       jld 	if (0 != rf_paritymap_set_params(pm, params, 0)) {
    433   1.1       jld 		/*
    434   1.1       jld 		 * If the parameters are out-of-range, then bring the
    435   1.1       jld 		 * parity map up with something reasonable, so that
    436   1.1       jld 		 * the admin can at least go and fix it (or ignore it
    437   1.1       jld 		 * entirely).
    438   1.1       jld 		 */
    439   1.1       jld 		safe.cooldown = DFL_COOLDOWN;
    440   1.1       jld 		safe.tickms = DFL_TICKMS;
    441   1.1       jld 		safe.regions = 0;
    442   1.1       jld 
    443   1.1       jld 		if (0 != rf_paritymap_set_params(pm, &safe, 0))
    444   1.1       jld 			return (-1);
    445   1.1       jld 	}
    446   1.1       jld 
    447   1.1       jld 	rstripes = howmany(raid->Layout.numStripe, pm->params.regions);
    448   1.1       jld 	pm->region_size = rstripes * raid->Layout.dataSectorsPerStripe;
    449   1.1       jld 
    450   1.1       jld 	callout_init(&pm->ticker, CALLOUT_MPSAFE);
    451   1.1       jld 	callout_setfunc(&pm->ticker, rf_paritymap_tick, pm);
    452   1.1       jld 	pm->flags = 0;
    453   1.1       jld 
    454   1.1       jld 	pm->disk_boot = kmem_alloc(sizeof(struct rf_paritymap_ondisk),
    455   1.1       jld 	    KM_SLEEP);
    456   1.1       jld 	pm->disk_now = kmem_alloc(sizeof(struct rf_paritymap_ondisk),
    457   1.1       jld 	    KM_SLEEP);
    458   1.1       jld 	pm->current = kmem_zalloc(sizeof(struct rf_paritymap_current),
    459   1.1       jld 	    KM_SLEEP);
    460   1.1       jld 
    461   1.1       jld 	rf_paritymap_kern_read(pm->raid, pm->disk_boot);
    462   1.1       jld 	memcpy(pm->disk_now, pm->disk_boot, sizeof(*pm->disk_now));
    463   1.1       jld 
    464   1.1       jld 	mutex_init(&pm->lock, MUTEX_DEFAULT, IPL_NONE);
    465   1.1       jld 	mutex_init(&pm->lk_flags, MUTEX_DEFAULT, IPL_SOFTCLOCK);
    466   1.1       jld 
    467   1.1       jld 	return 0;
    468   1.1       jld }
    469   1.1       jld 
    470   1.1       jld /*
    471   1.1       jld  * Destroys a parity map; unless "force" is set, also cleans parity for any
    472   1.1       jld  * regions which were still in cooldown (but are not dirty on disk).
    473   1.1       jld  */
    474   1.1       jld void
    475   1.1       jld rf_paritymap_destroy(struct rf_paritymap *pm, int force)
    476   1.1       jld {
    477   1.1       jld 	int i;
    478   1.1       jld 
    479   1.1       jld 	callout_halt(&pm->ticker, NULL); /* XXX stop? halt? */
    480   1.1       jld 	callout_destroy(&pm->ticker);
    481   1.1       jld 
    482   1.1       jld 	if (!force) {
    483   1.1       jld 		for (i = 0; i < RF_PARITYMAP_NREG; i++) {
    484   1.1       jld 			/* XXX check for > 0 ? */
    485   1.1       jld 			if (pm->current->state[i] < 0)
    486   1.1       jld 				pm->current->state[i] = 0;
    487   1.1       jld 		}
    488   1.1       jld 
    489   1.1       jld 		rf_paritymap_write_locked(pm);
    490   1.1       jld 	}
    491   1.1       jld 
    492   1.1       jld 	mutex_destroy(&pm->lock);
    493   1.1       jld 	mutex_destroy(&pm->lk_flags);
    494   1.1       jld 
    495   1.1       jld 	kmem_free(pm->disk_boot, sizeof(struct rf_paritymap_ondisk));
    496   1.1       jld 	kmem_free(pm->disk_now, sizeof(struct rf_paritymap_ondisk));
    497   1.1       jld 	kmem_free(pm->current, sizeof(struct rf_paritymap_current));
    498   1.1       jld }
    499   1.1       jld 
    500   1.1       jld /*
    501   1.1       jld  * Rewrite parity, taking parity map into account; this is the equivalent of
    502   1.1       jld  * the old rf_RewriteParity, and is likewise to be called from a suitable
    503   1.1       jld  * thread and shouldn't have multiple copies running in parallel and so on.
    504   1.1       jld  *
    505   1.1       jld  * Note that the fictional regions are "cleaned" in one shot, so that very
    506   1.1       jld  * small RAIDs (useful for testing) will not experience potentially severe
    507   1.1       jld  * regressions in rewrite time.
    508   1.1       jld  */
    509   1.1       jld int
    510   1.1       jld rf_paritymap_rewrite(struct rf_paritymap *pm)
    511   1.1       jld {
    512   1.1       jld 	int i, ret_val = 0;
    513   1.1       jld 	daddr_t reg_b, reg_e;
    514   1.1       jld 
    515   1.1       jld 	/* Process only the actual regions. */
    516   1.1       jld 	for (i = 0; i < pm->params.regions; i++) {
    517   1.1       jld 		mutex_enter(&pm->lock);
    518   1.1       jld 		if (isset(pm->disk_boot->bits, i)) {
    519   1.1       jld 			mutex_exit(&pm->lock);
    520   1.1       jld 
    521   1.1       jld 			reg_b = i * pm->region_size;
    522   1.1       jld 			reg_e = reg_b + pm->region_size;
    523   1.1       jld 			if (reg_e > pm->raid->totalSectors)
    524   1.1       jld 				reg_e = pm->raid->totalSectors;
    525   1.1       jld 
    526   1.1       jld 			if (rf_RewriteParityRange(pm->raid, reg_b,
    527   1.1       jld 			    reg_e - reg_b)) {
    528   1.1       jld 				ret_val = 1;
    529   1.1       jld 				if (pm->raid->waitShutdown)
    530   1.1       jld 					return ret_val;
    531   1.1       jld 			} else {
    532   1.1       jld 				mutex_enter(&pm->lock);
    533   1.1       jld 				clrbit(pm->disk_boot->bits, i);
    534   1.1       jld 				rf_paritymap_write_locked(pm);
    535   1.1       jld 				mutex_exit(&pm->lock);
    536   1.1       jld 			}
    537   1.1       jld 		} else {
    538   1.1       jld 			mutex_exit(&pm->lock);
    539   1.1       jld 		}
    540   1.1       jld 	}
    541   1.1       jld 
    542   1.1       jld 	/* Now, clear the fictional regions, if any. */
    543   1.1       jld 	rf_paritymap_forceclean(pm);
    544   1.1       jld 	rf_paritymap_write(pm);
    545   1.1       jld 
    546   1.1       jld 	return ret_val;
    547   1.1       jld }
    548   1.1       jld 
    549   1.1       jld /*
    550   1.1       jld  * How to merge the on-disk parity maps when reading them in from the
    551   1.1       jld  * various components; returns whether they differ.  In the case that
    552   1.1       jld  * they do differ, sets *dst to the union of *dst and *src.
    553   1.1       jld  *
    554   1.1       jld  * In theory, it should be safe to take the intersection (or just pick
    555   1.1       jld  * a single component arbitrarily), but the paranoid approach costs
    556   1.1       jld  * little.
    557   1.1       jld  *
    558   1.1       jld  * Appropriate locking, if any, is the responsibility of the caller.
    559   1.1       jld  */
    560   1.1       jld int
    561   1.1       jld rf_paritymap_merge(struct rf_paritymap_ondisk *dst,
    562   1.1       jld     struct rf_paritymap_ondisk *src)
    563   1.1       jld {
    564   1.1       jld 	int i, discrep = 0;
    565   1.1       jld 
    566   1.1       jld 	for (i = 0; i < RF_PARITYMAP_NBYTE; i++) {
    567   1.1       jld 		if (dst->bits[i] != src->bits[i])
    568   1.1       jld 			discrep = 1;
    569   1.1       jld 		dst->bits[i] |= src->bits[i];
    570   1.1       jld 	}
    571   1.1       jld 
    572   1.1       jld 	return discrep;
    573   1.1       jld }
    574   1.1       jld 
    575   1.1       jld /*
    576   1.1       jld  * Detach a parity map from its RAID.  This is not meant to be applied except
    577   1.1       jld  * when unconfiguring the RAID after all I/O has been resolved, as otherwise
    578   1.1       jld  * an out-of-date parity map could be treated as current.
    579   1.1       jld  */
    580   1.1       jld void
    581   1.1       jld rf_paritymap_detach(RF_Raid_t *raidPtr)
    582   1.1       jld {
    583   1.1       jld 	if (raidPtr->parity_map == NULL)
    584   1.1       jld 		return;
    585   1.1       jld 
    586   1.8       mrg 	rf_lock_mutex2(raidPtr->iodone_lock);
    587   1.1       jld 	struct rf_paritymap *pm = raidPtr->parity_map;
    588   1.1       jld 	raidPtr->parity_map = NULL;
    589   1.8       mrg 	rf_unlock_mutex2(raidPtr->iodone_lock);
    590   1.1       jld 	/* XXXjld is that enough locking?  Or too much? */
    591   1.1       jld 	rf_paritymap_destroy(pm, 0);
    592   1.1       jld 	kmem_free(pm, sizeof(*pm));
    593   1.1       jld }
    594   1.1       jld 
    595   1.1       jld /*
    596   1.5       jld  * Is this RAID set ineligible for parity-map use due to not actually
    597   1.5       jld  * having any parity?  (If so, rf_paritymap_attach is a no-op, but
    598   1.5       jld  * rf_paritymap_{get,set}_disable will still pointlessly act on the
    599   1.5       jld  * component labels.)
    600   1.5       jld  */
    601   1.5       jld int
    602   1.5       jld rf_paritymap_ineligible(RF_Raid_t *raidPtr)
    603   1.5       jld {
    604   1.5       jld 	return raidPtr->Layout.map->faultsTolerated == 0;
    605   1.5       jld }
    606   1.5       jld 
    607   1.5       jld /*
    608   1.1       jld  * Attach a parity map to a RAID set if appropriate.  Includes
    609   1.1       jld  * configure-time processing of parity-map fields of component label.
    610   1.1       jld  */
    611   1.1       jld void
    612   1.1       jld rf_paritymap_attach(RF_Raid_t *raidPtr, int force)
    613   1.1       jld {
    614   1.1       jld 	RF_RowCol_t col;
    615   1.1       jld 	int pm_use, pm_zap;
    616   1.1       jld 	int g_tickms, g_ntick, g_regions;
    617   1.1       jld 	int good;
    618   1.1       jld 	RF_ComponentLabel_t *clabel;
    619   1.1       jld 	u_int flags, regions;
    620   1.1       jld 	struct rf_pmparams params;
    621   1.1       jld 
    622   1.5       jld 	if (rf_paritymap_ineligible(raidPtr)) {
    623   1.1       jld 		/* There isn't any parity. */
    624   1.1       jld 		return;
    625   1.1       jld 	}
    626   1.1       jld 
    627   1.1       jld 	pm_use = 1;
    628   1.1       jld 	pm_zap = 0;
    629   1.1       jld 	g_tickms = DFL_TICKMS;
    630   1.1       jld 	g_ntick = DFL_COOLDOWN;
    631   1.1       jld 	g_regions = 0;
    632   1.1       jld 
    633   1.1       jld 	/*
    634   1.1       jld 	 * Collect opinions on the set config.  If this is the initial
    635   1.1       jld 	 * config (raidctl -C), treat all labels as invalid, since
    636   1.1       jld 	 * there may be random data present.
    637   1.1       jld 	 */
    638   1.1       jld 	if (!force) {
    639   1.1       jld 		for (col = 0; col < raidPtr->numCol; col++) {
    640   1.4     oster 			if (RF_DEAD_DISK(raidPtr->Disks[col].status))
    641   1.4     oster 				continue;
    642   1.1       jld 			clabel = raidget_component_label(raidPtr, col);
    643   1.1       jld 			flags = clabel->parity_map_flags;
    644   1.1       jld 			/* Check for use by non-parity-map kernel. */
    645   1.1       jld 			if (clabel->parity_map_modcount
    646   1.1       jld 			    != clabel->mod_counter) {
    647   1.1       jld 				flags &= ~RF_PMLABEL_WASUSED;
    648   1.1       jld 			}
    649   1.1       jld 
    650   1.1       jld 			if (flags & RF_PMLABEL_VALID) {
    651   1.1       jld 				g_tickms = clabel->parity_map_tickms;
    652   1.1       jld 				g_ntick = clabel->parity_map_ntick;
    653   1.1       jld 				regions = clabel->parity_map_regions;
    654   1.1       jld 				if (g_regions == 0)
    655   1.1       jld 					g_regions = regions;
    656   1.1       jld 				else if (g_regions != regions) {
    657   1.1       jld 					pm_zap = 1; /* important! */
    658   1.1       jld 				}
    659   1.1       jld 
    660   1.1       jld 				if (flags & RF_PMLABEL_DISABLE) {
    661   1.1       jld 					pm_use = 0;
    662   1.1       jld 				}
    663   1.1       jld 				if (!(flags & RF_PMLABEL_WASUSED)) {
    664   1.1       jld 					pm_zap = 1;
    665   1.1       jld 				}
    666   1.1       jld 			} else {
    667   1.1       jld 				pm_zap = 1;
    668   1.1       jld 			}
    669   1.1       jld 		}
    670   1.1       jld 	} else {
    671   1.1       jld 		pm_zap = 1;
    672   1.1       jld 	}
    673   1.1       jld 
    674   1.1       jld 	/* Finally, create and attach the parity map. */
    675   1.1       jld 	if (pm_use) {
    676   1.1       jld 		params.cooldown = g_ntick;
    677   1.1       jld 		params.tickms = g_tickms;
    678   1.1       jld 		params.regions = g_regions;
    679   1.1       jld 
    680   1.1       jld 		raidPtr->parity_map = kmem_alloc(sizeof(struct rf_paritymap),
    681   1.1       jld 		    KM_SLEEP);
    682   1.1       jld 		if (0 != rf_paritymap_init(raidPtr->parity_map, raidPtr,
    683   1.1       jld 			&params)) {
    684   1.1       jld 			/* It failed; do without. */
    685   1.1       jld 			kmem_free(raidPtr->parity_map,
    686   1.1       jld 			    sizeof(struct rf_paritymap));
    687   1.1       jld 			raidPtr->parity_map = NULL;
    688   1.1       jld 			return;
    689   1.1       jld 		}
    690   1.1       jld 
    691   1.1       jld 		if (g_regions == 0)
    692   1.1       jld 			/* Pick up the autoconfigured region count. */
    693   1.1       jld 			g_regions = raidPtr->parity_map->params.regions;
    694   1.1       jld 
    695   1.1       jld 		if (pm_zap) {
    696   1.1       jld 			good = raidPtr->parity_good && !force;
    697   1.1       jld 
    698   1.1       jld 			if (good)
    699   1.1       jld 				rf_paritymap_forceclean(raidPtr->parity_map);
    700   1.1       jld 			else
    701   1.1       jld 				rf_paritymap_invalidate(raidPtr->parity_map);
    702   1.1       jld 			/* This needs to be on disk before WASUSED is set. */
    703   1.1       jld 			rf_paritymap_write(raidPtr->parity_map);
    704   1.1       jld 		}
    705   1.1       jld 	}
    706   1.1       jld 
    707   1.1       jld 	/* Alter labels in-core to reflect the current view of things. */
    708   1.1       jld 	for (col = 0; col < raidPtr->numCol; col++) {
    709   1.4     oster 		if (RF_DEAD_DISK(raidPtr->Disks[col].status))
    710   1.4     oster 			continue;
    711   1.1       jld 		clabel = raidget_component_label(raidPtr, col);
    712   1.1       jld 
    713   1.1       jld 		if (pm_use)
    714   1.1       jld 			flags = RF_PMLABEL_VALID | RF_PMLABEL_WASUSED;
    715   1.1       jld 		else
    716   1.1       jld 			flags = RF_PMLABEL_VALID | RF_PMLABEL_DISABLE;
    717   1.1       jld 
    718   1.1       jld 		clabel->parity_map_flags = flags;
    719   1.1       jld 		clabel->parity_map_tickms = g_tickms;
    720   1.1       jld 		clabel->parity_map_ntick = g_ntick;
    721   1.1       jld 		clabel->parity_map_regions = g_regions;
    722   1.1       jld 		raidflush_component_label(raidPtr, col);
    723   1.1       jld 	}
    724   1.4     oster 	/* Note that we're just in 'attach' here, and there won't
    725   1.4     oster 	   be any spare disks at this point. */
    726   1.1       jld }
    727   1.1       jld 
    728   1.1       jld /*
    729   1.1       jld  * For initializing the parity-map fields of a component label, both on
    730   1.4     oster  * initial creation and on reconstruct/copyback/etc.  */
    731   1.1       jld void
    732   1.1       jld rf_paritymap_init_label(struct rf_paritymap *pm, RF_ComponentLabel_t *clabel)
    733   1.1       jld {
    734   1.1       jld 	if (pm != NULL) {
    735   1.1       jld 		clabel->parity_map_flags =
    736   1.1       jld 		    RF_PMLABEL_VALID | RF_PMLABEL_WASUSED;
    737   1.1       jld 		clabel->parity_map_tickms = pm->params.tickms;
    738   1.1       jld 		clabel->parity_map_ntick = pm->params.cooldown;
    739   1.1       jld 		/*
    740   1.1       jld 		 * XXXjld: If the number of regions is changed on disk, and
    741   1.1       jld 		 * then a new component is labeled before the next configure,
    742   1.1       jld 		 * then it will get the old value and they will conflict on
    743   1.1       jld 		 * the next boot (and the default will be used instead).
    744   1.1       jld 		 */
    745   1.1       jld 		clabel->parity_map_regions = pm->params.regions;
    746   1.1       jld 	} else {
    747   1.1       jld 		/*
    748   1.1       jld 		 * XXXjld: if the map is disabled, and all the components are
    749   1.1       jld 		 * replaced without an intervening unconfigure/reconfigure,
    750   1.1       jld 		 * then it will become enabled on the next unconfig/reconfig.
    751   1.1       jld 		 */
    752   1.1       jld 	}
    753   1.1       jld }
    754   1.1       jld 
    755   1.1       jld 
    756   1.1       jld /* Will the parity map be disabled next time? */
    757   1.1       jld int
    758   1.1       jld rf_paritymap_get_disable(RF_Raid_t *raidPtr)
    759   1.1       jld {
    760   1.1       jld 	RF_ComponentLabel_t *clabel;
    761   1.1       jld 	RF_RowCol_t col;
    762   1.1       jld 	int dis;
    763   1.1       jld 
    764   1.1       jld 	dis = 0;
    765   1.1       jld 	for (col = 0; col < raidPtr->numCol; col++) {
    766   1.4     oster 		if (RF_DEAD_DISK(raidPtr->Disks[col].status))
    767   1.4     oster 			continue;
    768   1.1       jld 		clabel = raidget_component_label(raidPtr, col);
    769   1.1       jld 		if (clabel->parity_map_flags & RF_PMLABEL_DISABLE)
    770   1.1       jld 			dis = 1;
    771   1.1       jld 	}
    772   1.4     oster         for (col = 0; col < raidPtr->numSpare; col++) {
    773   1.4     oster 		if (raidPtr->Disks[raidPtr->numCol+col].status != rf_ds_used_spare)
    774   1.4     oster                         continue;
    775   1.4     oster                 clabel = raidget_component_label(raidPtr, raidPtr->numCol+col);
    776   1.4     oster                 if (clabel->parity_map_flags & RF_PMLABEL_DISABLE)
    777   1.4     oster                         dis = 1;
    778   1.4     oster         }
    779   1.1       jld 
    780   1.1       jld 	return dis;
    781   1.1       jld }
    782   1.1       jld 
    783   1.1       jld /* Set whether the parity map will be disabled next time. */
    784   1.1       jld void
    785   1.1       jld rf_paritymap_set_disable(RF_Raid_t *raidPtr, int dis)
    786   1.1       jld {
    787   1.1       jld 	RF_ComponentLabel_t *clabel;
    788   1.1       jld 	RF_RowCol_t col;
    789   1.1       jld 
    790   1.1       jld 	for (col = 0; col < raidPtr->numCol; col++) {
    791   1.4     oster 		if (RF_DEAD_DISK(raidPtr->Disks[col].status))
    792   1.4     oster 			continue;
    793   1.1       jld 		clabel = raidget_component_label(raidPtr, col);
    794   1.1       jld 		if (dis)
    795   1.1       jld 			clabel->parity_map_flags |= RF_PMLABEL_DISABLE;
    796   1.1       jld 		else
    797   1.1       jld 			clabel->parity_map_flags &= ~RF_PMLABEL_DISABLE;
    798   1.1       jld 		raidflush_component_label(raidPtr, col);
    799   1.1       jld 	}
    800   1.4     oster 
    801   1.4     oster 	/* update any used spares as well */
    802   1.4     oster 	for (col = 0; col < raidPtr->numSpare; col++) {
    803   1.4     oster 		if (raidPtr->Disks[raidPtr->numCol+col].status != rf_ds_used_spare)
    804   1.4     oster 			continue;
    805   1.4     oster 
    806   1.4     oster 		clabel = raidget_component_label(raidPtr, raidPtr->numCol+col);
    807   1.4     oster 		if (dis)
    808   1.4     oster 			clabel->parity_map_flags |= RF_PMLABEL_DISABLE;
    809   1.4     oster 		else
    810   1.4     oster 			clabel->parity_map_flags &= ~RF_PMLABEL_DISABLE;
    811   1.4     oster 		raidflush_component_label(raidPtr, raidPtr->numCol+col);
    812   1.4     oster 	}
    813   1.1       jld }
    814