Home | History | Annotate | Line # | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 /*
     26  * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
     27  */
     28 
     29 #include <sys/zfs_context.h>
     30 #include <sys/spa.h>
     31 #include <sys/dmu.h>
     32 #include <sys/dnode.h>
     33 #include <sys/zio.h>
     34 #include <sys/range_tree.h>
     35 
     36 kmem_cache_t *range_seg_cache;
     37 
     38 void
     39 range_tree_init(void)
     40 {
     41 	ASSERT(range_seg_cache == NULL);
     42 	range_seg_cache = kmem_cache_create("range_seg_cache",
     43 	    sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
     44 }
     45 
     46 void
     47 range_tree_fini(void)
     48 {
     49 	kmem_cache_destroy(range_seg_cache);
     50 	range_seg_cache = NULL;
     51 }
     52 
     53 void
     54 range_tree_stat_verify(range_tree_t *rt)
     55 {
     56 	range_seg_t *rs;
     57 	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
     58 	int i;
     59 
     60 	for (rs = avl_first(&rt->rt_root); rs != NULL;
     61 	    rs = AVL_NEXT(&rt->rt_root, rs)) {
     62 		uint64_t size = rs->rs_end - rs->rs_start;
     63 		int idx	= highbit64(size) - 1;
     64 
     65 		hist[idx]++;
     66 		ASSERT3U(hist[idx], !=, 0);
     67 	}
     68 
     69 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
     70 		if (hist[i] != rt->rt_histogram[i]) {
     71 			zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
     72 			    i, hist, hist[i], rt->rt_histogram[i]);
     73 		}
     74 		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
     75 	}
     76 }
     77 
     78 static void
     79 range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
     80 {
     81 	uint64_t size = rs->rs_end - rs->rs_start;
     82 	int idx = highbit64(size) - 1;
     83 
     84 	ASSERT(size != 0);
     85 	ASSERT3U(idx, <,
     86 	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
     87 
     88 	ASSERT(MUTEX_HELD(rt->rt_lock));
     89 	rt->rt_histogram[idx]++;
     90 	ASSERT3U(rt->rt_histogram[idx], !=, 0);
     91 }
     92 
     93 static void
     94 range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
     95 {
     96 	uint64_t size = rs->rs_end - rs->rs_start;
     97 	int idx = highbit64(size) - 1;
     98 
     99 	ASSERT(size != 0);
    100 	ASSERT3U(idx, <,
    101 	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
    102 
    103 	ASSERT(MUTEX_HELD(rt->rt_lock));
    104 	ASSERT3U(rt->rt_histogram[idx], !=, 0);
    105 	rt->rt_histogram[idx]--;
    106 }
    107 
    108 /*
    109  * NOTE: caller is responsible for all locking.
    110  */
    111 static int
    112 range_tree_seg_compare(const void *x1, const void *x2)
    113 {
    114 	const range_seg_t *r1 = x1;
    115 	const range_seg_t *r2 = x2;
    116 
    117 	if (r1->rs_start < r2->rs_start) {
    118 		if (r1->rs_end > r2->rs_start)
    119 			return (0);
    120 		return (-1);
    121 	}
    122 	if (r1->rs_start > r2->rs_start) {
    123 		if (r1->rs_start < r2->rs_end)
    124 			return (0);
    125 		return (1);
    126 	}
    127 	return (0);
    128 }
    129 
    130 range_tree_t *
    131 range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
    132 {
    133 	range_tree_t *rt;
    134 
    135 	rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
    136 
    137 	avl_create(&rt->rt_root, range_tree_seg_compare,
    138 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
    139 
    140 	rt->rt_lock = lp;
    141 	rt->rt_ops = ops;
    142 	rt->rt_arg = arg;
    143 
    144 	if (rt->rt_ops != NULL)
    145 		rt->rt_ops->rtop_create(rt, rt->rt_arg);
    146 
    147 	return (rt);
    148 }
    149 
    150 void
    151 range_tree_destroy(range_tree_t *rt)
    152 {
    153 	VERIFY0(rt->rt_space);
    154 
    155 	if (rt->rt_ops != NULL)
    156 		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
    157 
    158 	avl_destroy(&rt->rt_root);
    159 	kmem_free(rt, sizeof (*rt));
    160 }
    161 
    162 void
    163 range_tree_add(void *arg, uint64_t start, uint64_t size)
    164 {
    165 	range_tree_t *rt = arg;
    166 	avl_index_t where;
    167 	range_seg_t rsearch, *rs_before, *rs_after, *rs;
    168 	uint64_t end = start + size;
    169 	boolean_t merge_before, merge_after;
    170 
    171 	ASSERT(MUTEX_HELD(rt->rt_lock));
    172 	VERIFY(size != 0);
    173 
    174 	rsearch.rs_start = start;
    175 	rsearch.rs_end = end;
    176 	rs = avl_find(&rt->rt_root, &rsearch, &where);
    177 
    178 	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
    179 		zfs_panic_recover("zfs: allocating allocated segment"
    180 		    "(offset=%llu size=%llu)\n",
    181 		    (longlong_t)start, (longlong_t)size);
    182 		return;
    183 	}
    184 
    185 	/* Make sure we don't overlap with either of our neighbors */
    186 	VERIFY(rs == NULL);
    187 
    188 	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
    189 	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
    190 
    191 	merge_before = (rs_before != NULL && rs_before->rs_end == start);
    192 	merge_after = (rs_after != NULL && rs_after->rs_start == end);
    193 
    194 	if (merge_before && merge_after) {
    195 		avl_remove(&rt->rt_root, rs_before);
    196 		if (rt->rt_ops != NULL) {
    197 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
    198 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
    199 		}
    200 
    201 		range_tree_stat_decr(rt, rs_before);
    202 		range_tree_stat_decr(rt, rs_after);
    203 
    204 		rs_after->rs_start = rs_before->rs_start;
    205 		kmem_cache_free(range_seg_cache, rs_before);
    206 		rs = rs_after;
    207 	} else if (merge_before) {
    208 		if (rt->rt_ops != NULL)
    209 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
    210 
    211 		range_tree_stat_decr(rt, rs_before);
    212 
    213 		rs_before->rs_end = end;
    214 		rs = rs_before;
    215 	} else if (merge_after) {
    216 		if (rt->rt_ops != NULL)
    217 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
    218 
    219 		range_tree_stat_decr(rt, rs_after);
    220 
    221 		rs_after->rs_start = start;
    222 		rs = rs_after;
    223 	} else {
    224 		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
    225 		rs->rs_start = start;
    226 		rs->rs_end = end;
    227 		avl_insert(&rt->rt_root, rs, where);
    228 	}
    229 
    230 	if (rt->rt_ops != NULL)
    231 		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
    232 
    233 	range_tree_stat_incr(rt, rs);
    234 	rt->rt_space += size;
    235 }
    236 
    237 void
    238 range_tree_remove(void *arg, uint64_t start, uint64_t size)
    239 {
    240 	range_tree_t *rt = arg;
    241 	avl_index_t where;
    242 	range_seg_t rsearch, *rs, *newseg;
    243 	uint64_t end = start + size;
    244 	boolean_t left_over, right_over;
    245 
    246 	ASSERT(MUTEX_HELD(rt->rt_lock));
    247 	VERIFY3U(size, !=, 0);
    248 	VERIFY3U(size, <=, rt->rt_space);
    249 
    250 	rsearch.rs_start = start;
    251 	rsearch.rs_end = end;
    252 	rs = avl_find(&rt->rt_root, &rsearch, &where);
    253 
    254 	/* Make sure we completely overlap with someone */
    255 	if (rs == NULL) {
    256 		zfs_panic_recover("zfs: freeing free segment "
    257 		    "(offset=%llu size=%llu)",
    258 		    (longlong_t)start, (longlong_t)size);
    259 		return;
    260 	}
    261 	VERIFY3U(rs->rs_start, <=, start);
    262 	VERIFY3U(rs->rs_end, >=, end);
    263 
    264 	left_over = (rs->rs_start != start);
    265 	right_over = (rs->rs_end != end);
    266 
    267 	range_tree_stat_decr(rt, rs);
    268 
    269 	if (rt->rt_ops != NULL)
    270 		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
    271 
    272 	if (left_over && right_over) {
    273 		newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
    274 		newseg->rs_start = end;
    275 		newseg->rs_end = rs->rs_end;
    276 		range_tree_stat_incr(rt, newseg);
    277 
    278 		rs->rs_end = start;
    279 
    280 		avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
    281 		if (rt->rt_ops != NULL)
    282 			rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
    283 	} else if (left_over) {
    284 		rs->rs_end = start;
    285 	} else if (right_over) {
    286 		rs->rs_start = end;
    287 	} else {
    288 		avl_remove(&rt->rt_root, rs);
    289 		kmem_cache_free(range_seg_cache, rs);
    290 		rs = NULL;
    291 	}
    292 
    293 	if (rs != NULL) {
    294 		range_tree_stat_incr(rt, rs);
    295 
    296 		if (rt->rt_ops != NULL)
    297 			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
    298 	}
    299 
    300 	rt->rt_space -= size;
    301 }
    302 
    303 static range_seg_t *
    304 range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
    305 {
    306 	avl_index_t where;
    307 	range_seg_t rsearch;
    308 	uint64_t end = start + size;
    309 
    310 	ASSERT(MUTEX_HELD(rt->rt_lock));
    311 	VERIFY(size != 0);
    312 
    313 	rsearch.rs_start = start;
    314 	rsearch.rs_end = end;
    315 	return (avl_find(&rt->rt_root, &rsearch, &where));
    316 }
    317 
    318 static range_seg_t *
    319 range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
    320 {
    321 	range_seg_t *rs = range_tree_find_impl(rt, start, size);
    322 	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
    323 		return (rs);
    324 	return (NULL);
    325 }
    326 
    327 void
    328 range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
    329 {
    330 	range_seg_t *rs;
    331 
    332 	mutex_enter(rt->rt_lock);
    333 	rs = range_tree_find(rt, off, size);
    334 	if (rs != NULL)
    335 		panic("freeing free block; rs=%p", (void *)rs);
    336 	mutex_exit(rt->rt_lock);
    337 }
    338 
    339 boolean_t
    340 range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
    341 {
    342 	return (range_tree_find(rt, start, size) != NULL);
    343 }
    344 
    345 /*
    346  * Ensure that this range is not in the tree, regardless of whether
    347  * it is currently in the tree.
    348  */
    349 void
    350 range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
    351 {
    352 	range_seg_t *rs;
    353 
    354 	while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
    355 		uint64_t free_start = MAX(rs->rs_start, start);
    356 		uint64_t free_end = MIN(rs->rs_end, start + size);
    357 		range_tree_remove(rt, free_start, free_end - free_start);
    358 	}
    359 }
    360 
    361 void
    362 range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
    363 {
    364 	range_tree_t *rt;
    365 
    366 	ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
    367 	ASSERT0(range_tree_space(*rtdst));
    368 	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
    369 
    370 	rt = *rtsrc;
    371 	*rtsrc = *rtdst;
    372 	*rtdst = rt;
    373 }
    374 
    375 void
    376 range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
    377 {
    378 	range_seg_t *rs;
    379 	void *cookie = NULL;
    380 
    381 	ASSERT(MUTEX_HELD(rt->rt_lock));
    382 
    383 	if (rt->rt_ops != NULL)
    384 		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
    385 
    386 	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
    387 		if (func != NULL)
    388 			func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
    389 		kmem_cache_free(range_seg_cache, rs);
    390 	}
    391 
    392 	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
    393 	rt->rt_space = 0;
    394 }
    395 
    396 void
    397 range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
    398 {
    399 	range_seg_t *rs;
    400 
    401 	ASSERT(MUTEX_HELD(rt->rt_lock));
    402 
    403 	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
    404 		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
    405 }
    406 
    407 uint64_t
    408 range_tree_space(range_tree_t *rt)
    409 {
    410 	return (rt->rt_space);
    411 }
    412