Home | History | Annotate | Line # | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /*
     26  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
     27  */
     28 
     29 #include <sys/zfs_context.h>
     30 #include <sys/spa.h>
     31 #include <sys/dmu.h>
     32 #include <sys/dmu_tx.h>
     33 #include <sys/dnode.h>
     34 #include <sys/dsl_pool.h>
     35 #include <sys/zio.h>
     36 #include <sys/space_map.h>
     37 #include <sys/refcount.h>
     38 #include <sys/zfeature.h>
     39 
     40 SYSCTL_DECL(_vfs_zfs);
     41 
     42 /*
     43  * The data for a given space map can be kept on blocks of any size.
     44  * Larger blocks entail fewer i/o operations, but they also cause the
     45  * DMU to keep more data in-core, and also to waste more i/o bandwidth
     46  * when only a few blocks have changed since the last transaction group.
     47  */
     48 int space_map_blksz = (1 << 12);
     49 SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_blksz, CTLFLAG_RDTUN, &space_map_blksz, 0,
     50     "Maximum block size for space map.  Must be power of 2 and greater than 4096.");
     51 
     52 /*
     53  * Load the space map disk into the specified range tree. Segments of maptype
     54  * are added to the range tree, other segment types are removed.
     55  *
     56  * Note: space_map_load() will drop sm_lock across dmu_read() calls.
     57  * The caller must be OK with this.
     58  */
     59 int
     60 space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
     61 {
     62 	uint64_t *entry, *entry_map, *entry_map_end;
     63 	uint64_t bufsize, size, offset, end, space;
     64 	int error = 0;
     65 
     66 	ASSERT(MUTEX_HELD(sm->sm_lock));
     67 
     68 	end = space_map_length(sm);
     69 	space = space_map_allocated(sm);
     70 
     71 	VERIFY0(range_tree_space(rt));
     72 
     73 	if (maptype == SM_FREE) {
     74 		range_tree_add(rt, sm->sm_start, sm->sm_size);
     75 		space = sm->sm_size - space;
     76 	}
     77 
     78 	bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
     79 	entry_map = zio_buf_alloc(bufsize);
     80 
     81 	mutex_exit(sm->sm_lock);
     82 	if (end > bufsize) {
     83 		dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
     84 		    end - bufsize, ZIO_PRIORITY_SYNC_READ);
     85 	}
     86 	mutex_enter(sm->sm_lock);
     87 
     88 	for (offset = 0; offset < end; offset += bufsize) {
     89 		size = MIN(end - offset, bufsize);
     90 		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
     91 		VERIFY(size != 0);
     92 		ASSERT3U(sm->sm_blksz, !=, 0);
     93 
     94 		dprintf("object=%llu  offset=%llx  size=%llx\n",
     95 		    space_map_object(sm), offset, size);
     96 
     97 		mutex_exit(sm->sm_lock);
     98 		error = dmu_read(sm->sm_os, space_map_object(sm), offset, size,
     99 		    entry_map, DMU_READ_PREFETCH);
    100 		mutex_enter(sm->sm_lock);
    101 		if (error != 0)
    102 			break;
    103 
    104 		entry_map_end = entry_map + (size / sizeof (uint64_t));
    105 		for (entry = entry_map; entry < entry_map_end; entry++) {
    106 			uint64_t e = *entry;
    107 			uint64_t offset, size;
    108 
    109 			if (SM_DEBUG_DECODE(e))		/* Skip debug entries */
    110 				continue;
    111 
    112 			offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
    113 			    sm->sm_start;
    114 			size = SM_RUN_DECODE(e) << sm->sm_shift;
    115 
    116 			VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift));
    117 			VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift));
    118 			VERIFY3U(offset, >=, sm->sm_start);
    119 			VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size);
    120 			if (SM_TYPE_DECODE(e) == maptype) {
    121 				VERIFY3U(range_tree_space(rt) + size, <=,
    122 				    sm->sm_size);
    123 				range_tree_add(rt, offset, size);
    124 			} else {
    125 				range_tree_remove(rt, offset, size);
    126 			}
    127 		}
    128 	}
    129 
    130 	if (error == 0)
    131 		VERIFY3U(range_tree_space(rt), ==, space);
    132 	else
    133 		range_tree_vacate(rt, NULL, NULL);
    134 
    135 	zio_buf_free(entry_map, bufsize);
    136 	return (error);
    137 }
    138 
    139 void
    140 space_map_histogram_clear(space_map_t *sm)
    141 {
    142 	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
    143 		return;
    144 
    145 	bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
    146 }
    147 
    148 boolean_t
    149 space_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
    150 {
    151 	/*
    152 	 * Verify that the in-core range tree does not have any
    153 	 * ranges smaller than our sm_shift size.
    154 	 */
    155 	for (int i = 0; i < sm->sm_shift; i++) {
    156 		if (rt->rt_histogram[i] != 0)
    157 			return (B_FALSE);
    158 	}
    159 	return (B_TRUE);
    160 }
    161 
    162 void
    163 space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
    164 {
    165 	int idx = 0;
    166 
    167 	ASSERT(MUTEX_HELD(rt->rt_lock));
    168 	ASSERT(dmu_tx_is_syncing(tx));
    169 	VERIFY3U(space_map_object(sm), !=, 0);
    170 
    171 	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
    172 		return;
    173 
    174 	dmu_buf_will_dirty(sm->sm_dbuf, tx);
    175 
    176 	ASSERT(space_map_histogram_verify(sm, rt));
    177 	/*
    178 	 * Transfer the content of the range tree histogram to the space
    179 	 * map histogram. The space map histogram contains 32 buckets ranging
    180 	 * between 2^sm_shift to 2^(32+sm_shift-1). The range tree,
    181 	 * however, can represent ranges from 2^0 to 2^63. Since the space
    182 	 * map only cares about allocatable blocks (minimum of sm_shift) we
    183 	 * can safely ignore all ranges in the range tree smaller than sm_shift.
    184 	 */
    185 	for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
    186 
    187 		/*
    188 		 * Since the largest histogram bucket in the space map is
    189 		 * 2^(32+sm_shift-1), we need to normalize the values in
    190 		 * the range tree for any bucket larger than that size. For
    191 		 * example given an sm_shift of 9, ranges larger than 2^40
    192 		 * would get normalized as if they were 1TB ranges. Assume
    193 		 * the range tree had a count of 5 in the 2^44 (16TB) bucket,
    194 		 * the calculation below would normalize this to 5 * 2^4 (16).
    195 		 */
    196 		ASSERT3U(i, >=, idx + sm->sm_shift);
    197 		sm->sm_phys->smp_histogram[idx] +=
    198 		    rt->rt_histogram[i] << (i - idx - sm->sm_shift);
    199 
    200 		/*
    201 		 * Increment the space map's index as long as we haven't
    202 		 * reached the maximum bucket size. Accumulate all ranges
    203 		 * larger than the max bucket size into the last bucket.
    204 		 */
    205 		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
    206 			ASSERT3U(idx + sm->sm_shift, ==, i);
    207 			idx++;
    208 			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
    209 		}
    210 	}
    211 }
    212 
    213 uint64_t
    214 space_map_entries(space_map_t *sm, range_tree_t *rt)
    215 {
    216 	avl_tree_t *t = &rt->rt_root;
    217 	range_seg_t *rs;
    218 	uint64_t size, entries;
    219 
    220 	/*
    221 	 * All space_maps always have a debug entry so account for it here.
    222 	 */
    223 	entries = 1;
    224 
    225 	/*
    226 	 * Traverse the range tree and calculate the number of space map
    227 	 * entries that would be required to write out the range tree.
    228 	 */
    229 	for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
    230 		size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
    231 		entries += howmany(size, SM_RUN_MAX);
    232 	}
    233 	return (entries);
    234 }
    235 
    236 /*
    237  * Note: space_map_write() will drop sm_lock across dmu_write() calls.
    238  */
    239 void
    240 space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
    241     dmu_tx_t *tx)
    242 {
    243 	objset_t *os = sm->sm_os;
    244 	spa_t *spa = dmu_objset_spa(os);
    245 	avl_tree_t *t = &rt->rt_root;
    246 	range_seg_t *rs;
    247 	uint64_t size, total, rt_space, nodes;
    248 	uint64_t *entry, *entry_map, *entry_map_end;
    249 	uint64_t expected_entries, actual_entries = 1;
    250 
    251 	ASSERT(MUTEX_HELD(rt->rt_lock));
    252 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
    253 	VERIFY3U(space_map_object(sm), !=, 0);
    254 	dmu_buf_will_dirty(sm->sm_dbuf, tx);
    255 
    256 	/*
    257 	 * This field is no longer necessary since the in-core space map
    258 	 * now contains the object number but is maintained for backwards
    259 	 * compatibility.
    260 	 */
    261 	sm->sm_phys->smp_object = sm->sm_object;
    262 
    263 	if (range_tree_space(rt) == 0) {
    264 		VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
    265 		return;
    266 	}
    267 
    268 	if (maptype == SM_ALLOC)
    269 		sm->sm_phys->smp_alloc += range_tree_space(rt);
    270 	else
    271 		sm->sm_phys->smp_alloc -= range_tree_space(rt);
    272 
    273 	expected_entries = space_map_entries(sm, rt);
    274 
    275 	entry_map = zio_buf_alloc(sm->sm_blksz);
    276 	entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t));
    277 	entry = entry_map;
    278 
    279 	*entry++ = SM_DEBUG_ENCODE(1) |
    280 	    SM_DEBUG_ACTION_ENCODE(maptype) |
    281 	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
    282 	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
    283 
    284 	total = 0;
    285 	nodes = avl_numnodes(&rt->rt_root);
    286 	rt_space = range_tree_space(rt);
    287 	for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
    288 		uint64_t start;
    289 
    290 		size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
    291 		start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
    292 
    293 		total += size << sm->sm_shift;
    294 
    295 		while (size != 0) {
    296 			uint64_t run_len;
    297 
    298 			run_len = MIN(size, SM_RUN_MAX);
    299 
    300 			if (entry == entry_map_end) {
    301 				mutex_exit(rt->rt_lock);
    302 				dmu_write(os, space_map_object(sm),
    303 				    sm->sm_phys->smp_objsize, sm->sm_blksz,
    304 				    entry_map, tx);
    305 				mutex_enter(rt->rt_lock);
    306 				sm->sm_phys->smp_objsize += sm->sm_blksz;
    307 				entry = entry_map;
    308 			}
    309 
    310 			*entry++ = SM_OFFSET_ENCODE(start) |
    311 			    SM_TYPE_ENCODE(maptype) |
    312 			    SM_RUN_ENCODE(run_len);
    313 
    314 			start += run_len;
    315 			size -= run_len;
    316 			actual_entries++;
    317 		}
    318 	}
    319 
    320 	if (entry != entry_map) {
    321 		size = (entry - entry_map) * sizeof (uint64_t);
    322 		mutex_exit(rt->rt_lock);
    323 		dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize,
    324 		    size, entry_map, tx);
    325 		mutex_enter(rt->rt_lock);
    326 		sm->sm_phys->smp_objsize += size;
    327 	}
    328 	ASSERT3U(expected_entries, ==, actual_entries);
    329 
    330 	/*
    331 	 * Ensure that the space_map's accounting wasn't changed
    332 	 * while we were in the middle of writing it out.
    333 	 */
    334 	VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root));
    335 	VERIFY3U(range_tree_space(rt), ==, rt_space);
    336 	VERIFY3U(range_tree_space(rt), ==, total);
    337 
    338 	zio_buf_free(entry_map, sm->sm_blksz);
    339 }
    340 
    341 static int
    342 space_map_open_impl(space_map_t *sm)
    343 {
    344 	int error;
    345 	u_longlong_t blocks;
    346 
    347 	error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
    348 	if (error)
    349 		return (error);
    350 
    351 	dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
    352 	sm->sm_phys = sm->sm_dbuf->db_data;
    353 	return (0);
    354 }
    355 
    356 int
    357 space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
    358     uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp)
    359 {
    360 	space_map_t *sm;
    361 	int error;
    362 
    363 	ASSERT(*smp == NULL);
    364 	ASSERT(os != NULL);
    365 	ASSERT(object != 0);
    366 
    367 	sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
    368 
    369 	sm->sm_start = start;
    370 	sm->sm_size = size;
    371 	sm->sm_shift = shift;
    372 	sm->sm_lock = lp;
    373 	sm->sm_os = os;
    374 	sm->sm_object = object;
    375 
    376 	error = space_map_open_impl(sm);
    377 	if (error != 0) {
    378 		space_map_close(sm);
    379 		return (error);
    380 	}
    381 
    382 	*smp = sm;
    383 
    384 	return (0);
    385 }
    386 
    387 void
    388 space_map_close(space_map_t *sm)
    389 {
    390 	if (sm == NULL)
    391 		return;
    392 
    393 	if (sm->sm_dbuf != NULL)
    394 		dmu_buf_rele(sm->sm_dbuf, sm);
    395 	sm->sm_dbuf = NULL;
    396 	sm->sm_phys = NULL;
    397 
    398 	kmem_free(sm, sizeof (*sm));
    399 }
    400 
    401 void
    402 space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
    403 {
    404 	objset_t *os = sm->sm_os;
    405 	spa_t *spa = dmu_objset_spa(os);
    406 	dmu_object_info_t doi;
    407 
    408 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
    409 	ASSERT(dmu_tx_is_syncing(tx));
    410 
    411 	dmu_object_info_from_db(sm->sm_dbuf, &doi);
    412 
    413 	/*
    414 	 * If the space map has the wrong bonus size (because
    415 	 * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or
    416 	 * the wrong block size (because space_map_blksz has changed),
    417 	 * free and re-allocate its object with the updated sizes.
    418 	 *
    419 	 * Otherwise, just truncate the current object.
    420 	 */
    421 	if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
    422 	    doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
    423 	    doi.doi_data_block_size != space_map_blksz) {
    424 		zfs_dbgmsg("txg %llu, spa %s, reallocating: "
    425 		    "old bonus %u, old blocksz %u", dmu_tx_get_txg(tx),
    426 		    spa_name(spa), doi.doi_bonus_size, doi.doi_data_block_size);
    427 
    428 		space_map_free(sm, tx);
    429 		dmu_buf_rele(sm->sm_dbuf, sm);
    430 
    431 		sm->sm_object = space_map_alloc(sm->sm_os, tx);
    432 		VERIFY0(space_map_open_impl(sm));
    433 	} else {
    434 		VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
    435 
    436 		/*
    437 		 * If the spacemap is reallocated, its histogram
    438 		 * will be reset.  Do the same in the common case so that
    439 		 * bugs related to the uncommon case do not go unnoticed.
    440 		 */
    441 		bzero(sm->sm_phys->smp_histogram,
    442 		    sizeof (sm->sm_phys->smp_histogram));
    443 	}
    444 
    445 	dmu_buf_will_dirty(sm->sm_dbuf, tx);
    446 	sm->sm_phys->smp_objsize = 0;
    447 	sm->sm_phys->smp_alloc = 0;
    448 }
    449 
    450 /*
    451  * Update the in-core space_map allocation and length values.
    452  */
    453 void
    454 space_map_update(space_map_t *sm)
    455 {
    456 	if (sm == NULL)
    457 		return;
    458 
    459 	ASSERT(MUTEX_HELD(sm->sm_lock));
    460 
    461 	sm->sm_alloc = sm->sm_phys->smp_alloc;
    462 	sm->sm_length = sm->sm_phys->smp_objsize;
    463 }
    464 
    465 uint64_t
    466 space_map_alloc(objset_t *os, dmu_tx_t *tx)
    467 {
    468 	spa_t *spa = dmu_objset_spa(os);
    469 	uint64_t object;
    470 	int bonuslen;
    471 
    472 	if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
    473 		spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
    474 		bonuslen = sizeof (space_map_phys_t);
    475 		ASSERT3U(bonuslen, <=, dmu_bonus_max());
    476 	} else {
    477 		bonuslen = SPACE_MAP_SIZE_V0;
    478 	}
    479 
    480 	object = dmu_object_alloc(os,
    481 	    DMU_OT_SPACE_MAP, space_map_blksz,
    482 	    DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
    483 
    484 	return (object);
    485 }
    486 
    487 void
    488 space_map_free(space_map_t *sm, dmu_tx_t *tx)
    489 {
    490 	spa_t *spa;
    491 
    492 	if (sm == NULL)
    493 		return;
    494 
    495 	spa = dmu_objset_spa(sm->sm_os);
    496 	if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
    497 		dmu_object_info_t doi;
    498 
    499 		dmu_object_info_from_db(sm->sm_dbuf, &doi);
    500 		if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) {
    501 			VERIFY(spa_feature_is_active(spa,
    502 			    SPA_FEATURE_SPACEMAP_HISTOGRAM));
    503 			spa_feature_decr(spa,
    504 			    SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
    505 		}
    506 	}
    507 
    508 	VERIFY3U(dmu_object_free(sm->sm_os, space_map_object(sm), tx), ==, 0);
    509 	sm->sm_object = 0;
    510 }
    511 
    512 uint64_t
    513 space_map_object(space_map_t *sm)
    514 {
    515 	return (sm != NULL ? sm->sm_object : 0);
    516 }
    517 
    518 /*
    519  * Returns the already synced, on-disk allocated space.
    520  */
    521 uint64_t
    522 space_map_allocated(space_map_t *sm)
    523 {
    524 	return (sm != NULL ? sm->sm_alloc : 0);
    525 }
    526 
    527 /*
    528  * Returns the already synced, on-disk length;
    529  */
    530 uint64_t
    531 space_map_length(space_map_t *sm)
    532 {
    533 	return (sm != NULL ? sm->sm_length : 0);
    534 }
    535 
    536 /*
    537  * Returns the allocated space that is currently syncing.
    538  */
    539 int64_t
    540 space_map_alloc_delta(space_map_t *sm)
    541 {
    542 	if (sm == NULL)
    543 		return (0);
    544 	ASSERT(sm->sm_dbuf != NULL);
    545 	return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
    546 }
    547