Home | History | Annotate | Line # | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
     28  */
     29 
     30 #include <sys/zfs_context.h>
     31 #include <sys/dnode.h>
     32 #include <sys/dmu_objset.h>
     33 #include <sys/dmu_zfetch.h>
     34 #include <sys/dmu.h>
     35 #include <sys/dbuf.h>
     36 #include <sys/kstat.h>
     37 
/*
 * This tunable disables predictive prefetch.  Note that it leaves "prescient"
 * prefetch (e.g. prefetch for zfs send) intact.  Unlike predictive prefetch,
 * prescient prefetch never issues i/os that end up not being needed,
 * so it can't hurt performance.
 *
 * When this is set, dmu_zfetch() returns before looking at any stream.
 */
boolean_t zfs_prefetch_disable = B_FALSE;

/*
 * Max # of streams per zfetch.  dmu_zfetch_stream_create() treats this as
 * an upper bound and may apply a lower limit for small files.
 */
uint32_t	zfetch_max_streams = 8;
/*
 * Min time before stream reclaim, in seconds.  A stream whose last access
 * is older than this is removed the next time a stream is created.
 */
uint32_t	zfetch_min_sec_reap = 2;
/*
 * Max bytes to prefetch per stream (default 8MB).  dmu_zfetch() converts
 * this to a block count by shifting by the dnode's dn_datablkshift.
 */
uint32_t	zfetch_max_distance = 8 * 1024 * 1024;
/*
 * Max bytes to prefetch indirects for per stream (default 64MB).  Converted
 * to a block count the same way as zfetch_max_distance.
 */
uint32_t	zfetch_max_idistance = 64 * 1024 * 1024;
/*
 * Max number of bytes in an array_read in which we allow prefetching (1MB).
 * NOTE(review): not referenced anywhere in this file; presumably consulted
 * by the callers that decide whether to invoke prefetch -- confirm.
 */
uint64_t	zfetch_array_rd_sz = 1024 * 1024;
     56 
/*
 * sysctl knobs for the tunables defined in this file.  prefetch_disable
 * hangs directly off the _vfs_zfs node; the remaining knobs live under a
 * "zfetch" child node created here.  The zfetch knobs are declared with
 * CTLFLAG_RWTUN, prefetch_disable with plain CTLFLAG_RW.
 */
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW,
    &zfs_prefetch_disable, 0, "Disable prefetch");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH");
SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RWTUN,
    &zfetch_max_streams, 0, "Max # of streams per zfetch");
SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RWTUN,
    &zfetch_min_sec_reap, 0, "Min time before stream reclaim");
SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN,
    &zfetch_max_distance, 0, "Max bytes to prefetch per stream");
SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN,
    &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream");
/* 64-bit knob: zfetch_array_rd_sz is a uint64_t, hence SYSCTL_UQUAD. */
SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RWTUN,
    &zfetch_array_rd_sz, 0,
    "Number of bytes in a array_read at which we stop prefetching");
     72 
/*
 * Counters exported through the "zfetchstats" kstat.  The structure is
 * handed to the kstat framework as an array of kstat_named_t, so it must
 * contain nothing but kstat_named_t members.
 */
typedef struct zfetch_stats {
	/* bumped by dmu_zfetch() when an access matches an existing stream */
	kstat_named_t zfetchstat_hits;
	/* bumped by dmu_zfetch() when an access matches no stream */
	kstat_named_t zfetchstat_misses;
	/* bumped when a new stream is refused because the limit is reached */
	kstat_named_t zfetchstat_max_streams;
} zfetch_stats_t;
     78 
     79 static zfetch_stats_t zfetch_stats = {
     80 	{ "hits",			KSTAT_DATA_UINT64 },
     81 	{ "misses",			KSTAT_DATA_UINT64 },
     82 	{ "max_streams",		KSTAT_DATA_UINT64 },
     83 };
     84 
/*
 * Atomically increment the counter named by "stat", which must be a member
 * of zfetch_stats.
 *
 * The expansion intentionally carries no trailing semicolon: the caller
 * supplies it, so the macro behaves like a single statement and can be used
 * as the body of an if/else without producing an empty extra statement.
 */
#define	ZFETCHSTAT_BUMP(stat) \
	atomic_inc_64(&zfetch_stats.stat.value.ui64)
     87 
/*
 * Handle of the "zfetchstats" kstat.  Assigned by zfetch_init() (NULL if
 * kstat_create() failed) and reset to NULL by zfetch_fini().
 */
kstat_t		*zfetch_ksp;
     89 
     90 void
     91 zfetch_init(void)
     92 {
     93 	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
     94 	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
     95 	    KSTAT_FLAG_VIRTUAL);
     96 
     97 	if (zfetch_ksp != NULL) {
     98 		zfetch_ksp->ks_data = &zfetch_stats;
     99 		kstat_install(zfetch_ksp);
    100 	}
    101 }
    102 
    103 void
    104 zfetch_fini(void)
    105 {
    106 	if (zfetch_ksp != NULL) {
    107 		kstat_delete(zfetch_ksp);
    108 		zfetch_ksp = NULL;
    109 	}
    110 }
    111 
    112 /*
    113  * This takes a pointer to a zfetch structure and a dnode.  It performs the
    114  * necessary setup for the zfetch structure, grokking data from the
    115  * associated dnode.
    116  */
    117 void
    118 dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
    119 {
    120 	if (zf == NULL)
    121 		return;
    122 
    123 	zf->zf_dnode = dno;
    124 
    125 	list_create(&zf->zf_stream, sizeof (zstream_t),
    126 	    offsetof(zstream_t, zs_node));
    127 
    128 	rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
    129 }
    130 
    131 static void
    132 dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
    133 {
    134 	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
    135 	list_remove(&zf->zf_stream, zs);
    136 	mutex_destroy(&zs->zs_lock);
    137 	kmem_free(zs, sizeof (*zs));
    138 }
    139 
    140 /*
    141  * Clean-up state associated with a zfetch structure (e.g. destroy the
    142  * streams).  This doesn't free the zfetch_t itself, that's left to the caller.
    143  */
    144 void
    145 dmu_zfetch_fini(zfetch_t *zf)
    146 {
    147 	zstream_t *zs;
    148 
    149 	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
    150 
    151 	rw_enter(&zf->zf_rwlock, RW_WRITER);
    152 	while ((zs = list_head(&zf->zf_stream)) != NULL)
    153 		dmu_zfetch_stream_remove(zf, zs);
    154 	rw_exit(&zf->zf_rwlock);
    155 	list_destroy(&zf->zf_stream);
    156 	rw_destroy(&zf->zf_rwlock);
    157 
    158 	zf->zf_dnode = NULL;
    159 }
    160 
    161 /*
    162  * If there aren't too many streams already, create a new stream.
    163  * The "blkid" argument is the next block that we expect this stream to access.
    164  * While we're here, clean up old streams (which haven't been
    165  * accessed for at least zfetch_min_sec_reap seconds).
    166  */
    167 static void
    168 dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
    169 {
    170 	zstream_t *zs_next;
    171 	int numstreams = 0;
    172 
    173 	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
    174 
    175 	/*
    176 	 * Clean up old streams.
    177 	 */
    178 	for (zstream_t *zs = list_head(&zf->zf_stream);
    179 	    zs != NULL; zs = zs_next) {
    180 		zs_next = list_next(&zf->zf_stream, zs);
    181 		if (((gethrtime() - zs->zs_atime) / NANOSEC) >
    182 		    zfetch_min_sec_reap)
    183 			dmu_zfetch_stream_remove(zf, zs);
    184 		else
    185 			numstreams++;
    186 	}
    187 
    188 	/*
    189 	 * The maximum number of streams is normally zfetch_max_streams,
    190 	 * but for small files we lower it such that it's at least possible
    191 	 * for all the streams to be non-overlapping.
    192 	 *
    193 	 * If we are already at the maximum number of streams for this file,
    194 	 * even after removing old streams, then don't create this stream.
    195 	 */
    196 	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
    197 	    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
    198 	    zfetch_max_distance));
    199 	if (numstreams >= max_streams) {
    200 		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
    201 		return;
    202 	}
    203 
    204 	zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
    205 	zs->zs_blkid = blkid;
    206 	zs->zs_pf_blkid = blkid;
    207 	zs->zs_ipf_blkid = blkid;
    208 	zs->zs_atime = gethrtime();
    209 	mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
    210 
    211 	list_insert_head(&zf->zf_stream, zs);
    212 }
    213 
    214 /*
    215  * This is the predictive prefetch entry point.  It associates dnode access
    216  * specified with blkid and nblks arguments with prefetch stream, predicts
    217  * further accesses based on that stats and initiates speculative prefetch.
    218  * fetch_data argument specifies whether actual data blocks should be fetched:
    219  *   FALSE -- prefetch only indirect blocks for predicted data blocks;
    220  *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
    221  */
    222 void
    223 dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
    224 {
    225 	zstream_t *zs;
    226 	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
    227 	int64_t pf_ahead_blks, max_blks;
    228 	int epbs, max_dist_blks, pf_nblks, ipf_nblks;
    229 	uint64_t end_of_access_blkid = blkid + nblks;
    230 
    231 	if (zfs_prefetch_disable)
    232 		return;
    233 
    234 	/*
    235 	 * As a fast path for small (single-block) files, ignore access
    236 	 * to the first block.
    237 	 */
    238 	if (blkid == 0)
    239 		return;
    240 
    241 	rw_enter(&zf->zf_rwlock, RW_READER);
    242 
    243 	for (zs = list_head(&zf->zf_stream); zs != NULL;
    244 	    zs = list_next(&zf->zf_stream, zs)) {
    245 		if (blkid == zs->zs_blkid) {
    246 			mutex_enter(&zs->zs_lock);
    247 			/*
    248 			 * zs_blkid could have changed before we
    249 			 * acquired zs_lock; re-check them here.
    250 			 */
    251 			if (blkid != zs->zs_blkid) {
    252 				mutex_exit(&zs->zs_lock);
    253 				continue;
    254 			}
    255 			break;
    256 		}
    257 	}
    258 
    259 	if (zs == NULL) {
    260 		/*
    261 		 * This access is not part of any existing stream.  Create
    262 		 * a new stream for it.
    263 		 */
    264 		ZFETCHSTAT_BUMP(zfetchstat_misses);
    265 		if (rw_tryupgrade(&zf->zf_rwlock))
    266 			dmu_zfetch_stream_create(zf, end_of_access_blkid);
    267 		rw_exit(&zf->zf_rwlock);
    268 		return;
    269 	}
    270 
    271 	/*
    272 	 * This access was to a block that we issued a prefetch for on
    273 	 * behalf of this stream. Issue further prefetches for this stream.
    274 	 *
    275 	 * Normally, we start prefetching where we stopped
    276 	 * prefetching last (zs_pf_blkid).  But when we get our first
    277 	 * hit on this stream, zs_pf_blkid == zs_blkid, we don't
    278 	 * want to prefetch the block we just accessed.  In this case,
    279 	 * start just after the block we just accessed.
    280 	 */
    281 	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
    282 
    283 	/*
    284 	 * Double our amount of prefetched data, but don't let the
    285 	 * prefetch get further ahead than zfetch_max_distance.
    286 	 */
    287 	if (fetch_data) {
    288 		max_dist_blks =
    289 		    zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
    290 		/*
    291 		 * Previously, we were (zs_pf_blkid - blkid) ahead.  We
    292 		 * want to now be double that, so read that amount again,
    293 		 * plus the amount we are catching up by (i.e. the amount
    294 		 * read just now).
    295 		 */
    296 		pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
    297 		max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
    298 		pf_nblks = MIN(pf_ahead_blks, max_blks);
    299 	} else {
    300 		pf_nblks = 0;
    301 	}
    302 
    303 	zs->zs_pf_blkid = pf_start + pf_nblks;
    304 
    305 	/*
    306 	 * Do the same for indirects, starting from where we stopped last,
    307 	 * or where we will stop reading data blocks (and the indirects
    308 	 * that point to them).
    309 	 */
    310 	ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
    311 	max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
    312 	/*
    313 	 * We want to double our distance ahead of the data prefetch
    314 	 * (or reader, if we are not prefetching data).  Previously, we
    315 	 * were (zs_ipf_blkid - blkid) ahead.  To double that, we read
    316 	 * that amount again, plus the amount we are catching up by
    317 	 * (i.e. the amount read now + the amount of data prefetched now).
    318 	 */
    319 	pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
    320 	max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
    321 	ipf_nblks = MIN(pf_ahead_blks, max_blks);
    322 	zs->zs_ipf_blkid = ipf_start + ipf_nblks;
    323 
    324 	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
    325 	ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
    326 	ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
    327 
    328 	zs->zs_atime = gethrtime();
    329 	zs->zs_blkid = end_of_access_blkid;
    330 	mutex_exit(&zs->zs_lock);
    331 	rw_exit(&zf->zf_rwlock);
    332 
    333 	/*
    334 	 * dbuf_prefetch() is asynchronous (even when it needs to read
    335 	 * indirect blocks), but we still prefer to drop our locks before
    336 	 * calling it to reduce the time we hold them.
    337 	 */
    338 
    339 	for (int i = 0; i < pf_nblks; i++) {
    340 		dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
    341 		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
    342 	}
    343 	for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
    344 		dbuf_prefetch(zf->zf_dnode, 1, iblk,
    345 		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
    346 	}
    347 	ZFETCHSTAT_BUMP(zfetchstat_hits);
    348 }
    349