Home | History | Annotate | Line # | Download | only in zfs
      1      1.1  haad /*
      2      1.1  haad  * CDDL HEADER START
      3      1.1  haad  *
      4      1.1  haad  * The contents of this file are subject to the terms of the
      5      1.1  haad  * Common Development and Distribution License (the "License").
      6      1.1  haad  * You may not use this file except in compliance with the License.
      7      1.1  haad  *
      8      1.1  haad  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9      1.1  haad  * or http://www.opensolaris.org/os/licensing.
     10      1.1  haad  * See the License for the specific language governing permissions
     11      1.1  haad  * and limitations under the License.
     12      1.1  haad  *
     13      1.1  haad  * When distributing Covered Code, include this CDDL HEADER in each
     14      1.1  haad  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15      1.1  haad  * If applicable, add the following below this CDDL HEADER, with the
     16      1.1  haad  * fields enclosed by brackets "[]" replaced with your own identifying
     17      1.1  haad  * information: Portions Copyright [yyyy] [name of copyright owner]
     18      1.1  haad  *
     19      1.1  haad  * CDDL HEADER END
     20      1.1  haad  */
     21      1.1  haad /*
     22  1.1.1.2  haad  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23      1.1  haad  * Use is subject to license terms.
     24      1.1  haad  */
     25      1.1  haad 
     26  1.1.1.3   chs /*
     27  1.1.1.3   chs  * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
     28  1.1.1.3   chs  */
     29  1.1.1.3   chs 
     30      1.1  haad #include <sys/zfs_context.h>
     31      1.1  haad #include <sys/dnode.h>
     32      1.1  haad #include <sys/dmu_objset.h>
     33      1.1  haad #include <sys/dmu_zfetch.h>
     34      1.1  haad #include <sys/dmu.h>
     35      1.1  haad #include <sys/dbuf.h>
     36  1.1.1.2  haad #include <sys/kstat.h>
     37      1.1  haad 
     38      1.1  haad /*
     39  1.1.1.3   chs  * This tunable disables predictive prefetch.  Note that it leaves "prescient"
     40  1.1.1.3   chs  * prefetch (e.g. prefetch for zfs send) intact.  Unlike predictive prefetch,
     41  1.1.1.3   chs  * prescient prefetch never issues i/os that end up not being needed,
     42  1.1.1.3   chs  * so it can't hurt performance.
     43      1.1  haad  */
     44  1.1.1.3   chs boolean_t zfs_prefetch_disable = B_FALSE;
     45      1.1  haad 
/* Upper bound on the number of streams kept per zfetch. */
uint32_t	zfetch_max_streams = 8;
/* A stream must be idle for more than this many seconds to be reclaimed. */
uint32_t	zfetch_min_sec_reap = 2;
/* Per-stream limit on prefetched data, in bytes (default 8MB). */
uint32_t	zfetch_max_distance = 8U << 20;
/* Per-stream limit on indirect-block prefetch, in bytes (default 64MB). */
uint32_t	zfetch_max_idistance = 64U << 20;
/* Largest array_read, in bytes, for which prefetching is allowed (1MB). */
uint64_t	zfetch_array_rd_sz = 1ULL << 20;
     56      1.1  haad 
     57  1.1.1.3   chs SYSCTL_DECL(_vfs_zfs);
     58  1.1.1.3   chs SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW,
     59  1.1.1.3   chs     &zfs_prefetch_disable, 0, "Disable prefetch");
     60  1.1.1.3   chs SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH");
     61  1.1.1.3   chs SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RWTUN,
     62  1.1.1.3   chs     &zfetch_max_streams, 0, "Max # of streams per zfetch");
     63  1.1.1.3   chs SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RWTUN,
     64  1.1.1.3   chs     &zfetch_min_sec_reap, 0, "Min time before stream reclaim");
     65  1.1.1.3   chs SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN,
     66  1.1.1.3   chs     &zfetch_max_distance, 0, "Max bytes to prefetch per stream");
     67  1.1.1.3   chs SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN,
     68  1.1.1.3   chs     &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream");
     69  1.1.1.3   chs SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RWTUN,
     70  1.1.1.3   chs     &zfetch_array_rd_sz, 0,
     71  1.1.1.3   chs     "Number of bytes in a array_read at which we stop prefetching");
     72      1.1  haad 
     73  1.1.1.2  haad typedef struct zfetch_stats {
     74  1.1.1.2  haad 	kstat_named_t zfetchstat_hits;
     75  1.1.1.2  haad 	kstat_named_t zfetchstat_misses;
     76  1.1.1.3   chs 	kstat_named_t zfetchstat_max_streams;
     77  1.1.1.2  haad } zfetch_stats_t;
     78  1.1.1.2  haad 
     79  1.1.1.2  haad static zfetch_stats_t zfetch_stats = {
     80  1.1.1.2  haad 	{ "hits",			KSTAT_DATA_UINT64 },
     81  1.1.1.2  haad 	{ "misses",			KSTAT_DATA_UINT64 },
     82  1.1.1.3   chs 	{ "max_streams",		KSTAT_DATA_UINT64 },
     83  1.1.1.2  haad };
     84  1.1.1.2  haad 
/*
 * Atomically increment one of the zfetch_stats counters.  "stat" is the
 * member name inside zfetch_stats (e.g. zfetchstat_hits).
 *
 * The expansion deliberately carries no trailing semicolon so that
 * "ZFETCHSTAT_BUMP(x);" is exactly one statement and stays safe inside an
 * unbraced if/else.
 */
#define	ZFETCHSTAT_BUMP(stat) \
	atomic_inc_64(&zfetch_stats.stat.value.ui64)
     87  1.1.1.2  haad 
     88  1.1.1.2  haad kstat_t		*zfetch_ksp;
     89  1.1.1.2  haad 
     90  1.1.1.2  haad void
     91  1.1.1.2  haad zfetch_init(void)
     92  1.1.1.2  haad {
     93  1.1.1.2  haad 	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
     94  1.1.1.2  haad 	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
     95  1.1.1.2  haad 	    KSTAT_FLAG_VIRTUAL);
     96  1.1.1.2  haad 
     97  1.1.1.2  haad 	if (zfetch_ksp != NULL) {
     98  1.1.1.2  haad 		zfetch_ksp->ks_data = &zfetch_stats;
     99  1.1.1.2  haad 		kstat_install(zfetch_ksp);
    100  1.1.1.2  haad 	}
    101  1.1.1.2  haad }
    102  1.1.1.2  haad 
    103  1.1.1.2  haad void
    104  1.1.1.2  haad zfetch_fini(void)
    105  1.1.1.2  haad {
    106  1.1.1.2  haad 	if (zfetch_ksp != NULL) {
    107  1.1.1.2  haad 		kstat_delete(zfetch_ksp);
    108  1.1.1.2  haad 		zfetch_ksp = NULL;
    109  1.1.1.2  haad 	}
    110      1.1  haad }
    111      1.1  haad 
    112      1.1  haad /*
    113      1.1  haad  * This takes a pointer to a zfetch structure and a dnode.  It performs the
    114      1.1  haad  * necessary setup for the zfetch structure, grokking data from the
    115      1.1  haad  * associated dnode.
    116      1.1  haad  */
    117      1.1  haad void
    118      1.1  haad dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
    119      1.1  haad {
    120  1.1.1.3   chs 	if (zf == NULL)
    121      1.1  haad 		return;
    122      1.1  haad 
    123      1.1  haad 	zf->zf_dnode = dno;
    124      1.1  haad 
    125      1.1  haad 	list_create(&zf->zf_stream, sizeof (zstream_t),
    126  1.1.1.3   chs 	    offsetof(zstream_t, zs_node));
    127      1.1  haad 
    128      1.1  haad 	rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
    129      1.1  haad }
    130      1.1  haad 
    131  1.1.1.3   chs static void
    132  1.1.1.3   chs dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
    133      1.1  haad {
    134  1.1.1.3   chs 	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
    135  1.1.1.3   chs 	list_remove(&zf->zf_stream, zs);
    136  1.1.1.3   chs 	mutex_destroy(&zs->zs_lock);
    137  1.1.1.3   chs 	kmem_free(zs, sizeof (*zs));
    138      1.1  haad }
    139      1.1  haad 
    140      1.1  haad /*
    141  1.1.1.3   chs  * Clean-up state associated with a zfetch structure (e.g. destroy the
    142  1.1.1.3   chs  * streams).  This doesn't free the zfetch_t itself, that's left to the caller.
    143      1.1  haad  */
    144      1.1  haad void
    145  1.1.1.3   chs dmu_zfetch_fini(zfetch_t *zf)
    146      1.1  haad {
    147  1.1.1.3   chs 	zstream_t *zs;
    148      1.1  haad 
    149      1.1  haad 	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
    150      1.1  haad 
    151  1.1.1.3   chs 	rw_enter(&zf->zf_rwlock, RW_WRITER);
    152  1.1.1.3   chs 	while ((zs = list_head(&zf->zf_stream)) != NULL)
    153  1.1.1.3   chs 		dmu_zfetch_stream_remove(zf, zs);
    154  1.1.1.3   chs 	rw_exit(&zf->zf_rwlock);
    155      1.1  haad 	list_destroy(&zf->zf_stream);
    156      1.1  haad 	rw_destroy(&zf->zf_rwlock);
    157      1.1  haad 
    158      1.1  haad 	zf->zf_dnode = NULL;
    159      1.1  haad }
    160      1.1  haad 
    161      1.1  haad /*
    162  1.1.1.3   chs  * If there aren't too many streams already, create a new stream.
    163  1.1.1.3   chs  * The "blkid" argument is the next block that we expect this stream to access.
    164  1.1.1.3   chs  * While we're here, clean up old streams (which haven't been
    165  1.1.1.3   chs  * accessed for at least zfetch_min_sec_reap seconds).
    166      1.1  haad  */
    167  1.1.1.3   chs static void
    168  1.1.1.3   chs dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
    169      1.1  haad {
    170  1.1.1.3   chs 	zstream_t *zs_next;
    171  1.1.1.3   chs 	int numstreams = 0;
    172      1.1  haad 
    173      1.1  haad 	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
    174      1.1  haad 
    175  1.1.1.3   chs 	/*
    176  1.1.1.3   chs 	 * Clean up old streams.
    177  1.1.1.3   chs 	 */
    178  1.1.1.3   chs 	for (zstream_t *zs = list_head(&zf->zf_stream);
    179  1.1.1.3   chs 	    zs != NULL; zs = zs_next) {
    180  1.1.1.3   chs 		zs_next = list_next(&zf->zf_stream, zs);
    181  1.1.1.3   chs 		if (((gethrtime() - zs->zs_atime) / NANOSEC) >
    182  1.1.1.3   chs 		    zfetch_min_sec_reap)
    183  1.1.1.3   chs 			dmu_zfetch_stream_remove(zf, zs);
    184  1.1.1.3   chs 		else
    185  1.1.1.3   chs 			numstreams++;
    186      1.1  haad 	}
    187      1.1  haad 
    188  1.1.1.3   chs 	/*
    189  1.1.1.3   chs 	 * The maximum number of streams is normally zfetch_max_streams,
    190  1.1.1.3   chs 	 * but for small files we lower it such that it's at least possible
    191  1.1.1.3   chs 	 * for all the streams to be non-overlapping.
    192  1.1.1.3   chs 	 *
    193  1.1.1.3   chs 	 * If we are already at the maximum number of streams for this file,
    194  1.1.1.3   chs 	 * even after removing old streams, then don't create this stream.
    195  1.1.1.3   chs 	 */
    196  1.1.1.3   chs 	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
    197  1.1.1.3   chs 	    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
    198  1.1.1.3   chs 	    zfetch_max_distance));
    199  1.1.1.3   chs 	if (numstreams >= max_streams) {
    200  1.1.1.3   chs 		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
    201  1.1.1.3   chs 		return;
    202      1.1  haad 	}
    203      1.1  haad 
    204  1.1.1.3   chs 	zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
    205  1.1.1.3   chs 	zs->zs_blkid = blkid;
    206  1.1.1.3   chs 	zs->zs_pf_blkid = blkid;
    207  1.1.1.3   chs 	zs->zs_ipf_blkid = blkid;
    208  1.1.1.3   chs 	zs->zs_atime = gethrtime();
    209  1.1.1.3   chs 	mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
    210      1.1  haad 
    211  1.1.1.3   chs 	list_insert_head(&zf->zf_stream, zs);
    212      1.1  haad }
    213      1.1  haad 
    214      1.1  haad /*
    215  1.1.1.3   chs  * This is the predictive prefetch entry point.  It associates dnode access
    216  1.1.1.3   chs  * specified with blkid and nblks arguments with prefetch stream, predicts
    217  1.1.1.3   chs  * further accesses based on that stats and initiates speculative prefetch.
    218  1.1.1.3   chs  * fetch_data argument specifies whether actual data blocks should be fetched:
    219  1.1.1.3   chs  *   FALSE -- prefetch only indirect blocks for predicted data blocks;
    220  1.1.1.3   chs  *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
    221      1.1  haad  */
    222      1.1  haad void
    223  1.1.1.3   chs dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
    224      1.1  haad {
    225  1.1.1.3   chs 	zstream_t *zs;
    226  1.1.1.3   chs 	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
    227  1.1.1.3   chs 	int64_t pf_ahead_blks, max_blks;
    228  1.1.1.3   chs 	int epbs, max_dist_blks, pf_nblks, ipf_nblks;
    229  1.1.1.3   chs 	uint64_t end_of_access_blkid = blkid + nblks;
    230      1.1  haad 
    231      1.1  haad 	if (zfs_prefetch_disable)
    232      1.1  haad 		return;
    233      1.1  haad 
    234  1.1.1.3   chs 	/*
    235  1.1.1.3   chs 	 * As a fast path for small (single-block) files, ignore access
    236  1.1.1.3   chs 	 * to the first block.
    237  1.1.1.3   chs 	 */
    238  1.1.1.3   chs 	if (blkid == 0)
    239      1.1  haad 		return;
    240      1.1  haad 
    241  1.1.1.3   chs 	rw_enter(&zf->zf_rwlock, RW_READER);
    242  1.1.1.3   chs 
    243  1.1.1.3   chs 	for (zs = list_head(&zf->zf_stream); zs != NULL;
    244  1.1.1.3   chs 	    zs = list_next(&zf->zf_stream, zs)) {
    245  1.1.1.3   chs 		if (blkid == zs->zs_blkid) {
    246  1.1.1.3   chs 			mutex_enter(&zs->zs_lock);
    247  1.1.1.3   chs 			/*
    248  1.1.1.3   chs 			 * zs_blkid could have changed before we
    249  1.1.1.3   chs 			 * acquired zs_lock; re-check them here.
    250  1.1.1.3   chs 			 */
    251  1.1.1.3   chs 			if (blkid != zs->zs_blkid) {
    252  1.1.1.3   chs 				mutex_exit(&zs->zs_lock);
    253  1.1.1.3   chs 				continue;
    254  1.1.1.3   chs 			}
    255  1.1.1.3   chs 			break;
    256  1.1.1.2  haad 		}
    257      1.1  haad 	}
    258      1.1  haad 
    259  1.1.1.3   chs 	if (zs == NULL) {
    260      1.1  haad 		/*
    261  1.1.1.3   chs 		 * This access is not part of any existing stream.  Create
    262  1.1.1.3   chs 		 * a new stream for it.
    263      1.1  haad 		 */
    264  1.1.1.3   chs 		ZFETCHSTAT_BUMP(zfetchstat_misses);
    265  1.1.1.3   chs 		if (rw_tryupgrade(&zf->zf_rwlock))
    266  1.1.1.3   chs 			dmu_zfetch_stream_create(zf, end_of_access_blkid);
    267  1.1.1.3   chs 		rw_exit(&zf->zf_rwlock);
    268  1.1.1.3   chs 		return;
    269  1.1.1.3   chs 	}
    270      1.1  haad 
    271  1.1.1.3   chs 	/*
    272  1.1.1.3   chs 	 * This access was to a block that we issued a prefetch for on
    273  1.1.1.3   chs 	 * behalf of this stream. Issue further prefetches for this stream.
    274  1.1.1.3   chs 	 *
    275  1.1.1.3   chs 	 * Normally, we start prefetching where we stopped
    276  1.1.1.3   chs 	 * prefetching last (zs_pf_blkid).  But when we get our first
    277  1.1.1.3   chs 	 * hit on this stream, zs_pf_blkid == zs_blkid, we don't
    278  1.1.1.3   chs 	 * want to prefetch the block we just accessed.  In this case,
    279  1.1.1.3   chs 	 * start just after the block we just accessed.
    280  1.1.1.3   chs 	 */
    281  1.1.1.3   chs 	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
    282  1.1.1.3   chs 
    283  1.1.1.3   chs 	/*
    284  1.1.1.3   chs 	 * Double our amount of prefetched data, but don't let the
    285  1.1.1.3   chs 	 * prefetch get further ahead than zfetch_max_distance.
    286  1.1.1.3   chs 	 */
    287  1.1.1.3   chs 	if (fetch_data) {
    288  1.1.1.3   chs 		max_dist_blks =
    289  1.1.1.3   chs 		    zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
    290  1.1.1.3   chs 		/*
    291  1.1.1.3   chs 		 * Previously, we were (zs_pf_blkid - blkid) ahead.  We
    292  1.1.1.3   chs 		 * want to now be double that, so read that amount again,
    293  1.1.1.3   chs 		 * plus the amount we are catching up by (i.e. the amount
    294  1.1.1.3   chs 		 * read just now).
    295  1.1.1.3   chs 		 */
    296  1.1.1.3   chs 		pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
    297  1.1.1.3   chs 		max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
    298  1.1.1.3   chs 		pf_nblks = MIN(pf_ahead_blks, max_blks);
    299  1.1.1.3   chs 	} else {
    300  1.1.1.3   chs 		pf_nblks = 0;
    301  1.1.1.3   chs 	}
    302      1.1  haad 
    303  1.1.1.3   chs 	zs->zs_pf_blkid = pf_start + pf_nblks;
    304      1.1  haad 
    305  1.1.1.3   chs 	/*
    306  1.1.1.3   chs 	 * Do the same for indirects, starting from where we stopped last,
    307  1.1.1.3   chs 	 * or where we will stop reading data blocks (and the indirects
    308  1.1.1.3   chs 	 * that point to them).
    309  1.1.1.3   chs 	 */
    310  1.1.1.3   chs 	ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
    311  1.1.1.3   chs 	max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
    312  1.1.1.3   chs 	/*
    313  1.1.1.3   chs 	 * We want to double our distance ahead of the data prefetch
    314  1.1.1.3   chs 	 * (or reader, if we are not prefetching data).  Previously, we
    315  1.1.1.3   chs 	 * were (zs_ipf_blkid - blkid) ahead.  To double that, we read
    316  1.1.1.3   chs 	 * that amount again, plus the amount we are catching up by
    317  1.1.1.3   chs 	 * (i.e. the amount read now + the amount of data prefetched now).
    318  1.1.1.3   chs 	 */
    319  1.1.1.3   chs 	pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
    320  1.1.1.3   chs 	max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
    321  1.1.1.3   chs 	ipf_nblks = MIN(pf_ahead_blks, max_blks);
    322  1.1.1.3   chs 	zs->zs_ipf_blkid = ipf_start + ipf_nblks;
    323  1.1.1.3   chs 
    324  1.1.1.3   chs 	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
    325  1.1.1.3   chs 	ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
    326  1.1.1.3   chs 	ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
    327  1.1.1.3   chs 
    328  1.1.1.3   chs 	zs->zs_atime = gethrtime();
    329  1.1.1.3   chs 	zs->zs_blkid = end_of_access_blkid;
    330  1.1.1.3   chs 	mutex_exit(&zs->zs_lock);
    331  1.1.1.3   chs 	rw_exit(&zf->zf_rwlock);
    332      1.1  haad 
    333  1.1.1.3   chs 	/*
    334  1.1.1.3   chs 	 * dbuf_prefetch() is asynchronous (even when it needs to read
    335  1.1.1.3   chs 	 * indirect blocks), but we still prefer to drop our locks before
    336  1.1.1.3   chs 	 * calling it to reduce the time we hold them.
    337  1.1.1.3   chs 	 */
    338      1.1  haad 
    339  1.1.1.3   chs 	for (int i = 0; i < pf_nblks; i++) {
    340  1.1.1.3   chs 		dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
    341  1.1.1.3   chs 		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
    342  1.1.1.3   chs 	}
    343  1.1.1.3   chs 	for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
    344  1.1.1.3   chs 		dbuf_prefetch(zf->zf_dnode, 1, iblk,
    345  1.1.1.3   chs 		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
    346      1.1  haad 	}
    347  1.1.1.3   chs 	ZFETCHSTAT_BUMP(zfetchstat_hits);
    348      1.1  haad }
    349