Home | History | Annotate | Line # | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
     24  */
     25 
     26 #include <sys/zfs_context.h>
     27 #include <sys/zfeature.h>
     28 #include <sys/dmu.h>
     29 #include <sys/nvpair.h>
     30 #include <sys/zap.h>
     31 #include <sys/dmu_tx.h>
     32 #include "zfeature_common.h"
     33 #include <sys/spa_impl.h>
     34 
     35 /*
     36  * ZFS Feature Flags
     37  * -----------------
     38  *
     39  * ZFS feature flags are used to provide fine-grained versioning to the ZFS
     40  * on-disk format. Once enabled on a pool feature flags replace the old
     41  * spa_version() number.
     42  *
     43  * Each new on-disk format change will be given a uniquely identifying string
     44  * guid rather than a version number. This avoids the problem of different
     45  * organizations creating new on-disk formats with the same version number. To
     46  * keep feature guids unique they should consist of the reverse dns name of the
     47  * organization which implemented the feature and a short name for the feature,
     48  * separated by a colon (e.g. com.delphix:async_destroy).
     49  *
     50  * Reference Counts
     51  * ----------------
     52  *
     53  * Within each pool features can be in one of three states: disabled, enabled,
     54  * or active. These states are differentiated by a reference count stored on
     55  * disk for each feature:
     56  *
     57  *   1) If there is no reference count stored on disk the feature is disabled.
     58  *   2) If the reference count is 0 a system administrator has enabled the
     59  *      feature, but the feature has not been used yet, so no on-disk
     60  *      format changes have been made.
     61  *   3) If the reference count is greater than 0 the feature is active.
     62  *      The format changes required by the feature are currently on disk.
     63  *      Note that if the feature's format changes are reversed the feature
     64  *      may choose to set its reference count back to 0.
     65  *
     66  * Feature flags make no differentiation between non-zero reference counts
     67  * for an active feature (e.g. a reference count of 1 means the same thing as a
     68  * reference count of 27834721), but feature implementations may choose to use
     69  * the reference count to store meaningful information. For example, a new RAID
     70  * implementation might set the reference count to the number of vdevs using
     71  * it. If all those disks are removed from the pool the feature goes back to
     72  * having a reference count of 0.
     73  *
     74  * It is the responsibility of the individual features to maintain a non-zero
     75  * reference count as long as the feature's format changes are present on disk.
     76  *
     77  * Dependencies
     78  * ------------
     79  *
     80  * Each feature may depend on other features. The only effect of this
     81  * relationship is that when a feature is enabled all of its dependencies are
     82  * automatically enabled as well. Any future work to support disabling of
     83  * features would need to ensure that features cannot be disabled if other
     84  * enabled features depend on them.
     85  *
     86  * On-disk Format
     87  * --------------
     88  *
     89  * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES
     90  * (5000). In order for this to work the pool is automatically upgraded to
     91  * SPA_VERSION_BEFORE_FEATURES (28) first, so all pre-feature flags on disk
     92  * format changes will be in use.
     93  *
     94  * Information about features is stored in 3 ZAP objects in the pool's MOS.
     95  * These objects are linked to by the following names in the pool directory
     96  * object:
     97  *
     98  * 1) features_for_read: feature guid -> reference count
     99  *    Features needed to open the pool for reading.
    100  * 2) features_for_write: feature guid -> reference count
    101  *    Features needed to open the pool for writing.
    102  * 3) feature_descriptions: feature guid -> descriptive string
    103  *    A human readable string.
    104  *
    105  * All enabled features appear in either features_for_read or
    106  * features_for_write, but not both.
    107  *
    108  * To open a pool in read-only mode only the features listed in
    109  * features_for_read need to be supported.
    110  *
    111  * To open the pool in read-write mode features in both features_for_read and
    112  * features_for_write need to be supported.
    113  *
    114  * Some features may be required to read the ZAP objects containing feature
    115  * information. To allow software to check for compatibility with these features
    116  * before the pool is opened their names must be stored in the label in a
    117  * new "features_for_read" entry (note that features that are only required
    118  * to write to a pool never need to be stored in the label since the
    119  * features_for_write ZAP object can be read before the pool is written to).
    120  * To save space in the label features must be explicitly marked as needing to
    121  * be written to the label. Also, reference counts are not stored in the label,
    122  * instead any feature whose reference count drops to 0 is removed from the
    123  * label.
    124  *
    125  * Adding New Features
    126  * -------------------
    127  *
    128  * Features must be registered in the zpool_feature_init() function in
    129  * zfeature_common.c using the zfeature_register() function. This function
    130  * has arguments to specify if the feature should be stored in the
    131  * features_for_read or features_for_write ZAP object and if it needs to be
    132  * written to the label when active.
    133  *
    134  * Once a feature is registered it will appear as a "feature@<feature name>"
    135  * property which can be set by an administrator. Feature implementors should
    136  * use the spa_feature_is_enabled() and spa_feature_is_active() functions to
    137  * query the state of a feature and the spa_feature_incr() and
    138  * spa_feature_decr() functions to change an enabled feature's reference count.
    139  * Reference counts may only be updated in the syncing context.
    140  *
    141  * Features may not perform enable-time initialization. Instead, any such
    142  * initialization should occur when the feature is first used. This design
    143  * enforces that on-disk changes be made only when features are used. Code
    144  * should only check if a feature is enabled using spa_feature_is_enabled(),
    145  * not by relying on any feature specific metadata existing. If a feature is
    146  * enabled, but the feature's metadata is not on disk yet then it should be
    147  * created as needed.
    148  *
    149  * As an example, consider the com.delphix:async_destroy feature. This feature
    150  * relies on the existence of a bptree in the MOS that stores blocks for
    151  * asynchronous freeing. This bptree is not created when async_destroy is
    152  * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is
    153  * called to check if async_destroy is enabled. If it is and the bptree object
    154  * does not exist yet, the bptree object is created as part of the dataset
    155  * destroy and async_destroy's reference count is incremented to indicate it
    156  * has made an on-disk format change. Later, after the destroyed dataset's
    157  * blocks have all been asynchronously freed there is no longer any use for the
    158  * bptree object, so it is destroyed and async_destroy's reference count is
    159  * decremented back to 0 to indicate that it has undone its on-disk format
    160  * changes.
    161  */
    162 
/*
 * The two ways feature_do_action() can adjust a feature's reference count.
 */
typedef enum {
	FEATURE_ACTION_INCR = 0,	/* add one to the reference count */
	FEATURE_ACTION_DECR = 1		/* subtract one from the count */
} feature_action_t;
    167 
    168 /*
    169  * Checks that the active features in the pool are supported by
    170  * this software.  Adds each unsupported feature (name -> description) to
    171  * the supplied nvlist.
    172  */
    173 boolean_t
    174 spa_features_check(spa_t *spa, boolean_t for_write,
    175     nvlist_t *unsup_feat, nvlist_t *enabled_feat)
    176 {
    177 	objset_t *os = spa->spa_meta_objset;
    178 	boolean_t supported;
    179 	zap_cursor_t zc;
    180 	zap_attribute_t za;
    181 	uint64_t obj = for_write ?
    182 	    spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
    183 
    184 	supported = B_TRUE;
    185 	for (zap_cursor_init(&zc, os, obj);
    186 	    zap_cursor_retrieve(&zc, &za) == 0;
    187 	    zap_cursor_advance(&zc)) {
    188 		ASSERT(za.za_integer_length == sizeof (uint64_t) &&
    189 		    za.za_num_integers == 1);
    190 
    191 		if (NULL != enabled_feat) {
    192 			fnvlist_add_uint64(enabled_feat, za.za_name,
    193 			    za.za_first_integer);
    194 		}
    195 
    196 		if (za.za_first_integer != 0 &&
    197 		    !zfeature_is_supported(za.za_name)) {
    198 			supported = B_FALSE;
    199 
    200 			if (NULL != unsup_feat) {
    201 				char *desc = "";
    202 				char buf[MAXPATHLEN];
    203 
    204 				if (zap_lookup(os, spa->spa_feat_desc_obj,
    205 				    za.za_name, 1, sizeof (buf), buf) == 0)
    206 					desc = buf;
    207 
    208 				VERIFY(nvlist_add_string(unsup_feat, za.za_name,
    209 				    desc) == 0);
    210 			}
    211 		}
    212 	}
    213 	zap_cursor_fini(&zc);
    214 
    215 	return (supported);
    216 }
    217 
    218 /*
    219  * Use an in-memory cache of feature refcounts for quick retrieval.
    220  *
    221  * Note: well-designed features will not need to use this; they should
    222  * use spa_feature_is_enabled() and spa_feature_is_active() instead.
    223  * However, this is non-static for zdb and zhack.
    224  */
    225 int
    226 feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
    227 {
    228 	ASSERT(VALID_FEATURE_FID(feature->fi_feature));
    229 	if (spa->spa_feat_refcount_cache[feature->fi_feature] ==
    230 	    SPA_FEATURE_DISABLED) {
    231 		return (SET_ERROR(ENOTSUP));
    232 	}
    233 	*res = spa->spa_feat_refcount_cache[feature->fi_feature];
    234 	return (0);
    235 }
    236 
    237 /*
    238  * Note: well-designed features will not need to use this; they should
    239  * use spa_feature_is_enabled() and spa_feature_is_active() instead.
    240  * However, this is non-static for zdb and zhack.
    241  */
    242 int
    243 feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
    244     uint64_t *res)
    245 {
    246 	int err;
    247 	uint64_t refcount;
    248 	uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
    249 	    spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
    250 
    251 	/*
    252 	 * If the pool is currently being created, the feature objects may not
    253 	 * have been allocated yet.  Act as though all features are disabled.
    254 	 */
    255 	if (zapobj == 0)
    256 		return (SET_ERROR(ENOTSUP));
    257 
    258 	err = zap_lookup(spa->spa_meta_objset, zapobj,
    259 	    feature->fi_guid, sizeof (uint64_t), 1, &refcount);
    260 	if (err != 0) {
    261 		if (err == ENOENT)
    262 			return (SET_ERROR(ENOTSUP));
    263 		else
    264 			return (err);
    265 	}
    266 	*res = refcount;
    267 	return (0);
    268 }
    269 
    270 
    271 static int
    272 feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
    273 {
    274 	uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj;
    275 
    276 	ASSERT(zfeature_depends_on(feature->fi_feature,
    277 	    SPA_FEATURE_ENABLED_TXG));
    278 
    279 	if (!spa_feature_is_enabled(spa, feature->fi_feature)) {
    280 		return (SET_ERROR(ENOTSUP));
    281 	}
    282 
    283 	ASSERT(enabled_txg_obj != 0);
    284 
    285 	VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj,
    286 	    feature->fi_guid, sizeof (uint64_t), 1, res));
    287 
    288 	return (0);
    289 }
    290 
    291 /*
    292  * This function is non-static for zhack; it should otherwise not be used
    293  * outside this file.
    294  */
    295 void
    296 feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount,
    297     dmu_tx_t *tx)
    298 {
    299 	ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature));
    300 	uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
    301 	    spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
    302 
    303 	VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid,
    304 	    sizeof (uint64_t), 1, &refcount, tx));
    305 
    306 	/*
    307 	 * feature_sync is called directly from zhack, allowing the
    308 	 * creation of arbitrary features whose fi_feature field may
    309 	 * be greater than SPA_FEATURES. When called from zhack, the
    310 	 * zfeature_info_t object's fi_feature field will be set to
    311 	 * SPA_FEATURE_NONE.
    312 	 */
    313 	if (feature->fi_feature != SPA_FEATURE_NONE) {
    314 		uint64_t *refcount_cache =
    315 		    &spa->spa_feat_refcount_cache[feature->fi_feature];
    316 #ifdef atomic_swap_64
    317 		VERIFY3U(*refcount_cache, ==,
    318 		    atomic_swap_64(refcount_cache, refcount));
    319 #else
    320 		*refcount_cache = refcount;
    321 #endif
    322 	}
    323 
    324 	if (refcount == 0)
    325 		spa_deactivate_mos_feature(spa, feature->fi_guid);
    326 	else if (feature->fi_flags & ZFEATURE_FLAG_MOS)
    327 		spa_activate_mos_feature(spa, feature->fi_guid, tx);
    328 }
    329 
    330 /*
    331  * This function is non-static for zhack; it should otherwise not be used
    332  * outside this file.
    333  */
    334 void
    335 feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
    336 {
    337 	uint64_t initial_refcount =
    338 	    (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0;
    339 	uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
    340 	    spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
    341 
    342 	ASSERT(0 != zapobj);
    343 	ASSERT(zfeature_is_valid_guid(feature->fi_guid));
    344 	ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
    345 
    346 	/*
    347 	 * If the feature is already enabled, ignore the request.
    348 	 */
    349 	if (zap_contains(spa->spa_meta_objset, zapobj, feature->fi_guid) == 0)
    350 		return;
    351 
    352 	for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++)
    353 		spa_feature_enable(spa, feature->fi_depends[i], tx);
    354 
    355 	VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj,
    356 	    feature->fi_guid, 1, strlen(feature->fi_desc) + 1,
    357 	    feature->fi_desc, tx));
    358 
    359 	feature_sync(spa, feature, initial_refcount, tx);
    360 
    361 	if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) {
    362 		uint64_t enabling_txg = dmu_tx_get_txg(tx);
    363 
    364 		if (spa->spa_feat_enabled_txg_obj == 0ULL) {
    365 			spa->spa_feat_enabled_txg_obj =
    366 			    zap_create_link(spa->spa_meta_objset,
    367 			    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
    368 			    DMU_POOL_FEATURE_ENABLED_TXG, tx);
    369 		}
    370 		spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx);
    371 
    372 		VERIFY0(zap_add(spa->spa_meta_objset,
    373 		    spa->spa_feat_enabled_txg_obj, feature->fi_guid,
    374 		    sizeof (uint64_t), 1, &enabling_txg, tx));
    375 	}
    376 }
    377 
    378 static void
    379 feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action,
    380     dmu_tx_t *tx)
    381 {
    382 	uint64_t refcount;
    383 	zfeature_info_t *feature = &spa_feature_table[fid];
    384 	uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
    385 	    spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
    386 
    387 	ASSERT(VALID_FEATURE_FID(fid));
    388 	ASSERT(0 != zapobj);
    389 	ASSERT(zfeature_is_valid_guid(feature->fi_guid));
    390 
    391 	ASSERT(dmu_tx_is_syncing(tx));
    392 	ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
    393 
    394 	VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP);
    395 
    396 	switch (action) {
    397 	case FEATURE_ACTION_INCR:
    398 		VERIFY3U(refcount, !=, UINT64_MAX);
    399 		refcount++;
    400 		break;
    401 	case FEATURE_ACTION_DECR:
    402 		VERIFY3U(refcount, !=, 0);
    403 		refcount--;
    404 		break;
    405 	default:
    406 		ASSERT(0);
    407 		break;
    408 	}
    409 
    410 	feature_sync(spa, feature, refcount, tx);
    411 }
    412 
    413 void
    414 spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx)
    415 {
    416 	/*
    417 	 * We create feature flags ZAP objects in two instances: during pool
    418 	 * creation and during pool upgrade.
    419 	 */
    420 	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on &&
    421 	    tx->tx_txg == TXG_INITIAL));
    422 
    423 	spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset,
    424 	    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
    425 	    DMU_POOL_FEATURES_FOR_READ, tx);
    426 	spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset,
    427 	    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
    428 	    DMU_POOL_FEATURES_FOR_WRITE, tx);
    429 	spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset,
    430 	    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
    431 	    DMU_POOL_FEATURE_DESCRIPTIONS, tx);
    432 }
    433 
    434 /*
    435  * Enable any required dependencies, then enable the requested feature.
    436  */
    437 void
    438 spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
    439 {
    440 	ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
    441 	ASSERT(VALID_FEATURE_FID(fid));
    442 	feature_enable_sync(spa, &spa_feature_table[fid], tx);
    443 }
    444 
    445 void
    446 spa_feature_incr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
    447 {
    448 	feature_do_action(spa, fid, FEATURE_ACTION_INCR, tx);
    449 }
    450 
    451 void
    452 spa_feature_decr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
    453 {
    454 	feature_do_action(spa, fid, FEATURE_ACTION_DECR, tx);
    455 }
    456 
    457 boolean_t
    458 spa_feature_is_enabled(spa_t *spa, spa_feature_t fid)
    459 {
    460 	int err;
    461 	uint64_t refcount;
    462 
    463 	ASSERT(VALID_FEATURE_FID(fid));
    464 	if (spa_version(spa) < SPA_VERSION_FEATURES)
    465 		return (B_FALSE);
    466 
    467 	err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount);
    468 	ASSERT(err == 0 || err == ENOTSUP);
    469 	return (err == 0);
    470 }
    471 
    472 boolean_t
    473 spa_feature_is_active(spa_t *spa, spa_feature_t fid)
    474 {
    475 	int err;
    476 	uint64_t refcount;
    477 
    478 	ASSERT(VALID_FEATURE_FID(fid));
    479 	if (spa_version(spa) < SPA_VERSION_FEATURES)
    480 		return (B_FALSE);
    481 
    482 	err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount);
    483 	ASSERT(err == 0 || err == ENOTSUP);
    484 	return (err == 0 && refcount > 0);
    485 }
    486 
    487 /*
    488  * For the feature specified by fid (which must depend on
    489  * SPA_FEATURE_ENABLED_TXG), return the TXG at which it was enabled in the
    490  * OUT txg argument.
    491  *
    492  * Returns B_TRUE if the feature is enabled, in which case txg will be filled
    493  * with the transaction group in which the specified feature was enabled.
    494  * Returns B_FALSE otherwise (i.e. if the feature is not enabled).
    495  */
    496 boolean_t
    497 spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg)
    498 {
    499 	int err;
    500 
    501 	ASSERT(VALID_FEATURE_FID(fid));
    502 	if (spa_version(spa) < SPA_VERSION_FEATURES)
    503 		return (B_FALSE);
    504 
    505 	err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg);
    506 	ASSERT(err == 0 || err == ENOTSUP);
    507 
    508 	return (err == 0);
    509 }
    510