Home | History | Annotate | Line # | Download | only in zfs
      1  1.1      haad /*
      2  1.1      haad  * CDDL HEADER START
      3  1.1      haad  *
      4  1.1      haad  * The contents of this file are subject to the terms of the
      5  1.1      haad  * Common Development and Distribution License (the "License").
      6  1.1      haad  * You may not use this file except in compliance with the License.
      7  1.1      haad  *
      8  1.1      haad  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  1.1      haad  * or http://www.opensolaris.org/os/licensing.
     10  1.1      haad  * See the License for the specific language governing permissions
     11  1.1      haad  * and limitations under the License.
     12  1.1      haad  *
     13  1.1      haad  * When distributing Covered Code, include this CDDL HEADER in each
     14  1.1      haad  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  1.1      haad  * If applicable, add the following below this CDDL HEADER, with the
     16  1.1      haad  * fields enclosed by brackets "[]" replaced with your own identifying
     17  1.1      haad  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  1.1      haad  *
     19  1.1      haad  * CDDL HEADER END
     20  1.1      haad  */
     21  1.1      haad /*
     22  1.6       chs  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
     23  1.6       chs  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
     24  1.1      haad  */
     25  1.6       chs /* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
     26  1.6       chs /* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
     27  1.6       chs /* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. */
     28  1.1      haad 
     29  1.1      haad #include <sys/dmu.h>
     30  1.1      haad #include <sys/dmu_impl.h>
     31  1.1      haad #include <sys/dmu_tx.h>
     32  1.1      haad #include <sys/dbuf.h>
     33  1.1      haad #include <sys/dnode.h>
     34  1.1      haad #include <sys/zfs_context.h>
     35  1.1      haad #include <sys/dmu_objset.h>
     36  1.1      haad #include <sys/dmu_traverse.h>
     37  1.1      haad #include <sys/dsl_dataset.h>
     38  1.1      haad #include <sys/dsl_dir.h>
     39  1.1      haad #include <sys/dsl_pool.h>
     40  1.1      haad #include <sys/dsl_synctask.h>
     41  1.1      haad #include <sys/dsl_prop.h>
     42  1.1      haad #include <sys/dmu_zfetch.h>
     43  1.1      haad #include <sys/zfs_ioctl.h>
     44  1.1      haad #include <sys/zap.h>
     45  1.1      haad #include <sys/zio_checksum.h>
     46  1.6       chs #include <sys/zio_compress.h>
     47  1.6       chs #include <sys/sa.h>
     48  1.6       chs #include <sys/zfeature.h>
     49  1.1      haad #ifdef _KERNEL
     50  1.6       chs #include <sys/racct.h>
     51  1.6       chs #include <sys/vm.h>
     52  1.1      haad #include <sys/zfs_znode.h>
     53  1.1      haad #endif
     54  1.1      haad 
/*
 * Enable/disable nopwrite feature.
 *
 * Defaults to 1.  Registered below as "nopwrite_enabled" under the
 * _vfs_zfs sysctl node with CTLFLAG_RDTUN.
 */
int zfs_nopwrite_enabled = 1;
SYSCTL_DECL(_vfs_zfs);
SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN,
    &zfs_nopwrite_enabled, 0, "Enable nopwrite feature");

/*
 * Tunable to control percentage of dirtied blocks from frees in one TXG.
 * After this threshold is crossed, additional dirty blocks from frees
 * wait until the next TXG.
 * A value of zero will disable this throttle.
 *
 * Defaults to 30.  Registered below as "per_txg_dirty_frees_percent"
 * under the _vfs_zfs sysctl node with CTLFLAG_RWTUN.
 *
 * NOTE(review): the variable is a uint32_t but is registered with
 * SYSCTL_INT -- confirm that the signedness mismatch is intended.
 */
uint32_t zfs_per_txg_dirty_frees_percent = 30;
SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN,
	&zfs_per_txg_dirty_frees_percent, 0, "Percentage of dirtied blocks from frees in one txg");
     72  1.6       chs 
/*
 * Table of DMU_OT_NUMTYPES object type descriptors.  Each row holds, in
 * order: a DMU_BSWAP_* selector, a boolean flag, and a human-readable
 * name for the type.
 *
 * The initializers are positional, so a row's position in this list is
 * its array index; new rows must not be inserted in the middle.
 *
 * NOTE(review): the selector presumably indexes dmu_ot_byteswap[] and the
 * boolean presumably marks metadata types -- confirm against the
 * dmu_object_type_info_t definition in <sys/dmu.h>.
 */
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
	{	DMU_BSWAP_UINT8,	TRUE,	"unallocated"		},
	{	DMU_BSWAP_ZAP,		TRUE,	"object directory"	},
	{	DMU_BSWAP_UINT64,	TRUE,	"object array"		},
	{	DMU_BSWAP_UINT8,	TRUE,	"packed nvlist"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"packed nvlist size"	},
	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj"			},
	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj header"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map header"	},
	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"ZIL intent log"	},
	{	DMU_BSWAP_DNODE,	TRUE,	"DMU dnode"		},
	{	DMU_BSWAP_OBJSET,	TRUE,	"DMU objset"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"DSL directory"		},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL directory child map"},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset snap map"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL props"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"DSL dataset"		},
	{	DMU_BSWAP_ZNODE,	TRUE,	"ZFS znode"		},
	{	DMU_BSWAP_OLDACL,	TRUE,	"ZFS V0 ACL"		},
	{	DMU_BSWAP_UINT8,	FALSE,	"ZFS plain file"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS directory"		},
	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS master node"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS delete queue"	},
	{	DMU_BSWAP_UINT8,	FALSE,	"zvol object"		},
	{	DMU_BSWAP_ZAP,		TRUE,	"zvol prop"		},
	{	DMU_BSWAP_UINT8,	FALSE,	"other uint8[]"		},
	{	DMU_BSWAP_UINT64,	FALSE,	"other uint64[]"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"other ZAP"		},
	{	DMU_BSWAP_ZAP,		TRUE,	"persistent error log"	},
	{	DMU_BSWAP_UINT8,	TRUE,	"SPA history"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"SPA history offsets"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"Pool properties"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL permissions"	},
	{	DMU_BSWAP_ACL,		TRUE,	"ZFS ACL"		},
	{	DMU_BSWAP_UINT8,	TRUE,	"ZFS SYSACL"		},
	{	DMU_BSWAP_UINT8,	TRUE,	"FUID table"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"FUID table size"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset next clones"},
	{	DMU_BSWAP_ZAP,		TRUE,	"scan work queue"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group used"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group quota"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"snapshot refcount tags"},
	{	DMU_BSWAP_ZAP,		TRUE,	"DDT ZAP algorithm"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DDT statistics"	},
	{	DMU_BSWAP_UINT8,	TRUE,	"System attributes"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"SA master node"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr registration"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr layouts"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"scan translations"	},
	{	DMU_BSWAP_UINT8,	FALSE,	"deduplicated block"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL deadlist map"	},
	{	DMU_BSWAP_UINT64,	TRUE,	"DSL deadlist map hdr"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dir clones"	},
	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj subobj"		}
};

    129  1.6       chs 
/*
 * Table of DMU_BSWAP_NUMFUNCS entries, each pairing a byteswap routine
 * with a short name for the data layout it handles.
 *
 * The initializers are positional, so a row's position in this list is
 * its array index.
 *
 * NOTE(review): rows are presumably indexed by the DMU_BSWAP_* values used
 * in dmu_ot[] (uint8, uint16, ..., acl) -- confirm the ordering against
 * the DMU_BSWAP_* enumeration in <sys/dmu.h>.
 */
const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
	{	byteswap_uint8_array,	"uint8"		},
	{	byteswap_uint16_array,	"uint16"	},
	{	byteswap_uint32_array,	"uint32"	},
	{	byteswap_uint64_array,	"uint64"	},
	{	zap_byteswap,		"zap"		},
	{	dnode_buf_byteswap,	"dnode"		},
	{	dmu_objset_byteswap,	"objset"	},
	{	zfs_znode_byteswap,	"znode"		},
	{	zfs_oldacl_byteswap,	"oldacl"	},
	{	zfs_acl_byteswap,	"acl"		}
};
    142  1.1      haad 
    143  1.1      haad int
    144  1.6       chs dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
    145  1.6       chs     void *tag, dmu_buf_t **dbp)
    146  1.6       chs {
    147  1.6       chs 	uint64_t blkid;
    148  1.6       chs 	dmu_buf_impl_t *db;
    149  1.6       chs 
    150  1.6       chs 	blkid = dbuf_whichblock(dn, 0, offset);
    151  1.6       chs 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
    152  1.6       chs 	db = dbuf_hold(dn, blkid, tag);
    153  1.6       chs 	rw_exit(&dn->dn_struct_rwlock);
    154  1.6       chs 
    155  1.6       chs 	if (db == NULL) {
    156  1.6       chs 		*dbp = NULL;
    157  1.6       chs 		return (SET_ERROR(EIO));
    158  1.6       chs 	}
    159  1.6       chs 
    160  1.6       chs 	*dbp = &db->db;
    161  1.6       chs 	return (0);
    162  1.6       chs }
    163  1.6       chs int
    164  1.6       chs dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
    165  1.1      haad     void *tag, dmu_buf_t **dbp)
    166  1.1      haad {
    167  1.1      haad 	dnode_t *dn;
    168  1.1      haad 	uint64_t blkid;
    169  1.1      haad 	dmu_buf_impl_t *db;
    170  1.1      haad 	int err;
    171  1.1      haad 
    172  1.3      haad 	err = dnode_hold(os, object, FTAG, &dn);
    173  1.1      haad 	if (err)
    174  1.1      haad 		return (err);
    175  1.6       chs 	blkid = dbuf_whichblock(dn, 0, offset);
    176  1.1      haad 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
    177  1.1      haad 	db = dbuf_hold(dn, blkid, tag);
    178  1.1      haad 	rw_exit(&dn->dn_struct_rwlock);
    179  1.6       chs 	dnode_rele(dn, FTAG);
    180  1.6       chs 
    181  1.1      haad 	if (db == NULL) {
    182  1.6       chs 		*dbp = NULL;
    183  1.6       chs 		return (SET_ERROR(EIO));
    184  1.6       chs 	}
    185  1.6       chs 
    186  1.6       chs 	*dbp = &db->db;
    187  1.6       chs 	return (err);
    188  1.6       chs }
    189  1.6       chs 
    190  1.6       chs int
    191  1.6       chs dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
    192  1.6       chs     void *tag, dmu_buf_t **dbp, int flags)
    193  1.6       chs {
    194  1.6       chs 	int err;
    195  1.6       chs 	int db_flags = DB_RF_CANFAIL;
    196  1.6       chs 
    197  1.6       chs 	if (flags & DMU_READ_NO_PREFETCH)
    198  1.6       chs 		db_flags |= DB_RF_NOPREFETCH;
    199  1.6       chs 
    200  1.6       chs 	err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
    201  1.6       chs 	if (err == 0) {
    202  1.6       chs 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
    203  1.6       chs 		err = dbuf_read(db, NULL, db_flags);
    204  1.6       chs 		if (err != 0) {
    205  1.6       chs 			dbuf_rele(db, tag);
    206  1.6       chs 			*dbp = NULL;
    207  1.6       chs 		}
    208  1.6       chs 	}
    209  1.6       chs 
    210  1.6       chs 	return (err);
    211  1.6       chs }
    212  1.6       chs 
    213  1.6       chs int
    214  1.6       chs dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
    215  1.6       chs     void *tag, dmu_buf_t **dbp, int flags)
    216  1.6       chs {
    217  1.6       chs 	int err;
    218  1.6       chs 	int db_flags = DB_RF_CANFAIL;
    219  1.6       chs 
    220  1.6       chs 	if (flags & DMU_READ_NO_PREFETCH)
    221  1.6       chs 		db_flags |= DB_RF_NOPREFETCH;
    222  1.6       chs 
    223  1.6       chs 	err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
    224  1.6       chs 	if (err == 0) {
    225  1.6       chs 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
    226  1.6       chs 		err = dbuf_read(db, NULL, db_flags);
    227  1.6       chs 		if (err != 0) {
    228  1.1      haad 			dbuf_rele(db, tag);
    229  1.6       chs 			*dbp = NULL;
    230  1.1      haad 		}
    231  1.1      haad 	}
    232  1.1      haad 
    233  1.1      haad 	return (err);
    234  1.1      haad }
    235  1.1      haad 
/*
 * Return the DN_MAX_BONUSLEN constant (presumably the largest bonus
 * buffer size in bytes -- confirm against <sys/dnode.h>).
 */
int
dmu_bonus_max(void)
{
	return (DN_MAX_BONUSLEN);
}
    241  1.1      haad 
    242  1.1      haad int
    243  1.6       chs dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
    244  1.6       chs {
    245  1.6       chs 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
    246  1.6       chs 	dnode_t *dn;
    247  1.6       chs 	int error;
    248  1.6       chs 
    249  1.6       chs 	DB_DNODE_ENTER(db);
    250  1.6       chs 	dn = DB_DNODE(db);
    251  1.6       chs 
    252  1.6       chs 	if (dn->dn_bonus != db) {
    253  1.6       chs 		error = SET_ERROR(EINVAL);
    254  1.6       chs 	} else if (newsize < 0 || newsize > db_fake->db_size) {
    255  1.6       chs 		error = SET_ERROR(EINVAL);
    256  1.6       chs 	} else {
    257  1.6       chs 		dnode_setbonuslen(dn, newsize, tx);
    258  1.6       chs 		error = 0;
    259  1.6       chs 	}
    260  1.6       chs 
    261  1.6       chs 	DB_DNODE_EXIT(db);
    262  1.6       chs 	return (error);
    263  1.6       chs }
    264  1.6       chs 
    265  1.6       chs int
    266  1.6       chs dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
    267  1.6       chs {
    268  1.6       chs 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
    269  1.6       chs 	dnode_t *dn;
    270  1.6       chs 	int error;
    271  1.6       chs 
    272  1.6       chs 	DB_DNODE_ENTER(db);
    273  1.6       chs 	dn = DB_DNODE(db);
    274  1.6       chs 
    275  1.6       chs 	if (!DMU_OT_IS_VALID(type)) {
    276  1.6       chs 		error = SET_ERROR(EINVAL);
    277  1.6       chs 	} else if (dn->dn_bonus != db) {
    278  1.6       chs 		error = SET_ERROR(EINVAL);
    279  1.6       chs 	} else {
    280  1.6       chs 		dnode_setbonus_type(dn, type, tx);
    281  1.6       chs 		error = 0;
    282  1.6       chs 	}
    283  1.6       chs 
    284  1.6       chs 	DB_DNODE_EXIT(db);
    285  1.6       chs 	return (error);
    286  1.6       chs }
    287  1.6       chs 
    288  1.6       chs dmu_object_type_t
    289  1.6       chs dmu_get_bonustype(dmu_buf_t *db_fake)
    290  1.6       chs {
    291  1.6       chs 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
    292  1.6       chs 	dnode_t *dn;
    293  1.6       chs 	dmu_object_type_t type;
    294  1.6       chs 
    295  1.6       chs 	DB_DNODE_ENTER(db);
    296  1.6       chs 	dn = DB_DNODE(db);
    297  1.6       chs 	type = dn->dn_bonustype;
    298  1.6       chs 	DB_DNODE_EXIT(db);
    299  1.6       chs 
    300  1.6       chs 	return (type);
    301  1.6       chs }
    302  1.6       chs 
    303  1.6       chs int
    304  1.6       chs dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
    305  1.1      haad {
    306  1.6       chs 	dnode_t *dn;
    307  1.6       chs 	int error;
    308  1.1      haad 
    309  1.6       chs 	error = dnode_hold(os, object, FTAG, &dn);
    310  1.6       chs 	dbuf_rm_spill(dn, tx);
    311  1.6       chs 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
    312  1.6       chs 	dnode_rm_spill(dn, tx);
    313  1.6       chs 	rw_exit(&dn->dn_struct_rwlock);
    314  1.6       chs 	dnode_rele(dn, FTAG);
    315  1.6       chs 	return (error);
    316  1.1      haad }
    317  1.1      haad 
    318  1.1      haad /*
    319  1.1      haad  * returns ENOENT, EIO, or 0.
    320  1.1      haad  */
    321  1.1      haad int
    322  1.1      haad dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
    323  1.1      haad {
    324  1.1      haad 	dnode_t *dn;
    325  1.1      haad 	dmu_buf_impl_t *db;
    326  1.1      haad 	int error;
    327  1.1      haad 
    328  1.3      haad 	error = dnode_hold(os, object, FTAG, &dn);
    329  1.1      haad 	if (error)
    330  1.1      haad 		return (error);
    331  1.1      haad 
    332  1.1      haad 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
    333  1.1      haad 	if (dn->dn_bonus == NULL) {
    334  1.1      haad 		rw_exit(&dn->dn_struct_rwlock);
    335  1.1      haad 		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
    336  1.1      haad 		if (dn->dn_bonus == NULL)
    337  1.1      haad 			dbuf_create_bonus(dn);
    338  1.1      haad 	}
    339  1.1      haad 	db = dn->dn_bonus;
    340  1.1      haad 
    341  1.1      haad 	/* as long as the bonus buf is held, the dnode will be held */
    342  1.6       chs 	if (refcount_add(&db->db_holds, tag) == 1) {
    343  1.1      haad 		VERIFY(dnode_add_ref(dn, db));
    344  1.6       chs 		atomic_inc_32(&dn->dn_dbufs_count);
    345  1.6       chs 	}
    346  1.6       chs 
    347  1.6       chs 	/*
    348  1.6       chs 	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
    349  1.6       chs 	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
    350  1.6       chs 	 * a dnode hold for every dbuf.
    351  1.6       chs 	 */
    352  1.6       chs 	rw_exit(&dn->dn_struct_rwlock);
    353  1.1      haad 
    354  1.1      haad 	dnode_rele(dn, FTAG);
    355  1.1      haad 
    356  1.6       chs 	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
    357  1.1      haad 
    358  1.1      haad 	*dbp = &db->db;
    359  1.1      haad 	return (0);
    360  1.1      haad }
    361  1.1      haad 
    362  1.1      haad /*
    363  1.6       chs  * returns ENOENT, EIO, or 0.
    364  1.6       chs  *
    365  1.6       chs  * This interface will allocate a blank spill dbuf when a spill blk
    366  1.6       chs  * doesn't already exist on the dnode.
    367  1.6       chs  *
    368  1.6       chs  * if you only want to find an already existing spill db, then
    369  1.6       chs  * dmu_spill_hold_existing() should be used.
    370  1.6       chs  */
    371  1.6       chs int
    372  1.6       chs dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
    373  1.6       chs {
    374  1.6       chs 	dmu_buf_impl_t *db = NULL;
    375  1.6       chs 	int err;
    376  1.6       chs 
    377  1.6       chs 	if ((flags & DB_RF_HAVESTRUCT) == 0)
    378  1.6       chs 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
    379  1.6       chs 
    380  1.6       chs 	db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
    381  1.6       chs 
    382  1.6       chs 	if ((flags & DB_RF_HAVESTRUCT) == 0)
    383  1.6       chs 		rw_exit(&dn->dn_struct_rwlock);
    384  1.6       chs 
    385  1.6       chs 	ASSERT(db != NULL);
    386  1.6       chs 	err = dbuf_read(db, NULL, flags);
    387  1.6       chs 	if (err == 0)
    388  1.6       chs 		*dbp = &db->db;
    389  1.6       chs 	else
    390  1.6       chs 		dbuf_rele(db, tag);
    391  1.6       chs 	return (err);
    392  1.6       chs }
    393  1.6       chs 
    394  1.6       chs int
    395  1.6       chs dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
    396  1.6       chs {
    397  1.6       chs 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
    398  1.6       chs 	dnode_t *dn;
    399  1.6       chs 	int err;
    400  1.6       chs 
    401  1.6       chs 	DB_DNODE_ENTER(db);
    402  1.6       chs 	dn = DB_DNODE(db);
    403  1.6       chs 
    404  1.6       chs 	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
    405  1.6       chs 		err = SET_ERROR(EINVAL);
    406  1.6       chs 	} else {
    407  1.6       chs 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
    408  1.6       chs 
    409  1.6       chs 		if (!dn->dn_have_spill) {
    410  1.6       chs 			err = SET_ERROR(ENOENT);
    411  1.6       chs 		} else {
    412  1.6       chs 			err = dmu_spill_hold_by_dnode(dn,
    413  1.6       chs 			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
    414  1.6       chs 		}
    415  1.6       chs 
    416  1.6       chs 		rw_exit(&dn->dn_struct_rwlock);
    417  1.6       chs 	}
    418  1.6       chs 
    419  1.6       chs 	DB_DNODE_EXIT(db);
    420  1.6       chs 	return (err);
    421  1.6       chs }
    422  1.6       chs 
    423  1.6       chs int
    424  1.6       chs dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
    425  1.6       chs {
    426  1.6       chs 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
    427  1.6       chs 	dnode_t *dn;
    428  1.6       chs 	int err;
    429  1.6       chs 
    430  1.6       chs 	DB_DNODE_ENTER(db);
    431  1.6       chs 	dn = DB_DNODE(db);
    432  1.6       chs 	err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
    433  1.6       chs 	DB_DNODE_EXIT(db);
    434  1.6       chs 
    435  1.6       chs 	return (err);
    436  1.6       chs }
    437  1.6       chs 
/*
 * Hold every level-0 dbuf of dnode 'dn' that overlaps the byte range
 * [offset, offset + length) on behalf of 'tag'.
 *
 * On success returns 0, stores a kmem-allocated array of the held buffers
 * in *dbpp and its element count in *numbufsp; the array and the holds
 * are given back with dmu_buf_rele_array().  When 'read' is set, a read is
 * started for every buffer and the function waits for the buffers to
 * settle before returning.  DMU_READ_NO_PREFETCH in 'flags' suppresses
 * the dmu_zfetch() notification.
 *
 * On failure every hold taken so far is released, the array is freed,
 * and EIO or the zio error is returned; *numbufsp and *dbpp are not
 * written.
 *
 * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
 * to take a held dnode rather than <os, object> -- the lookup is wasteful,
 * and can induce severe lock contention when writing to several files
 * whose dnodes are in the same block.
 */
static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
    boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
	dmu_buf_t **dbp;
	uint64_t blkid, nblks, i;
	uint32_t dbuf_flags;
	int err;
	zio_t *zio;

	ASSERT(length <= DMU_MAX_ACCESS);

	/*
	 * Note: We directly notify the prefetch code of this read, so that
	 * we can tell it about the multi-block read.  dbuf_read() only knows
	 * about the one block it is accessing.
	 */
	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
	    DB_RF_NOPREFETCH;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		/*
		 * Count the blocks touched by widening the range to block
		 * boundaries and dividing by the block size.
		 */
		int blkshift = dn->dn_datablkshift;
		nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
		    P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
	} else {
		/*
		 * dn_datablkshift is zero: the object is treated as a single
		 * block of dn_datablksz bytes, so the range must fit in it.
		 */
		if (offset + length > dn->dn_datablksz) {
			zfs_panic_recover("zfs: accessing past end of object "
			    "%llx/%llx (size=%u access=%llu+%llu)",
			    (longlong_t)dn->dn_objset->
			    os_dsl_dataset->ds_object,
			    (longlong_t)dn->dn_object, dn->dn_datablksz,
			    (longlong_t)offset, (longlong_t)length);
			rw_exit(&dn->dn_struct_rwlock);
			return (SET_ERROR(EIO));
		}
		nblks = 1;
	}
	/* Zero-filled so that unfilled slots are skipped on early release. */
	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);

#if defined(_KERNEL) && defined(RACCT)
	/* Charge the write (bytes and block count) to the current process. */
	if (racct_enable && !read) {
		PROC_LOCK(curproc);
		racct_add_force(curproc, RACCT_WRITEBPS, length);
		racct_add_force(curproc, RACCT_WRITEIOPS, nblks);
		PROC_UNLOCK(curproc);
	}
#endif

	/* Root zio handed to every dbuf_read() below and waited on once. */
	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	blkid = dbuf_whichblock(dn, 0, offset);
	for (i = 0; i < nblks; i++) {
		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
		if (db == NULL) {
			/*
			 * Undo: drop the lock, release the buffers held so
			 * far (NULL slots are skipped) and dispose of the
			 * root zio without waiting on it.
			 */
			rw_exit(&dn->dn_struct_rwlock);
			dmu_buf_rele_array(dbp, nblks, tag);
			zio_nowait(zio);
			return (SET_ERROR(EIO));
		}

		/* initiate async i/o */
		if (read)
			(void) dbuf_read(db, zio, dbuf_flags);

#ifdef _KERNEL
		/* In kernel builds a non-read hold bumps the thread's ru_oublock. */
		else
			curthread->td_ru.ru_oublock++;
#endif
		dbp[i] = &db->db;
	}

	/* Tell the prefetcher about the whole multi-block access at once. */
	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
	    DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
		dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
		    read && DNODE_IS_CACHEABLE(dn));
	}
	rw_exit(&dn->dn_struct_rwlock);

	/* wait for async i/o */
	err = zio_wait(zio);
	if (err) {
		dmu_buf_rele_array(dbp, nblks, tag);
		return (err);
	}

	/* wait for other io to complete */
	if (read) {
		for (i = 0; i < nblks; i++) {
			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
			mutex_enter(&db->db_mtx);
			/* Sleep on db_changed until the buffer leaves READ/FILL. */
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL)
				cv_wait(&db->db_changed, &db->db_mtx);
			/* A buffer that settled as UNCACHED is reported as EIO. */
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
			mutex_exit(&db->db_mtx);
			if (err) {
				dmu_buf_rele_array(dbp, nblks, tag);
				return (err);
			}
		}
	}

	*numbufsp = nblks;
	*dbpp = dbp;
	return (0);
}
    551  1.1      haad 
    552  1.1      haad static int
    553  1.1      haad dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
    554  1.1      haad     uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
    555  1.1      haad {
    556  1.1      haad 	dnode_t *dn;
    557  1.1      haad 	int err;
    558  1.1      haad 
    559  1.3      haad 	err = dnode_hold(os, object, FTAG, &dn);
    560  1.1      haad 	if (err)
    561  1.1      haad 		return (err);
    562  1.1      haad 
    563  1.1      haad 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
    564  1.3      haad 	    numbufsp, dbpp, DMU_READ_PREFETCH);
    565  1.1      haad 
    566  1.1      haad 	dnode_rele(dn, FTAG);
    567  1.1      haad 
    568  1.1      haad 	return (err);
    569  1.1      haad }
    570  1.1      haad 
    571  1.1      haad int
    572  1.6       chs dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
    573  1.6       chs     uint64_t length, boolean_t read, void *tag, int *numbufsp,
    574  1.6       chs     dmu_buf_t ***dbpp)
    575  1.1      haad {
    576  1.6       chs 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
    577  1.6       chs 	dnode_t *dn;
    578  1.1      haad 	int err;
    579  1.1      haad 
    580  1.6       chs 	DB_DNODE_ENTER(db);
    581  1.6       chs 	dn = DB_DNODE(db);
    582  1.1      haad 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
    583  1.3      haad 	    numbufsp, dbpp, DMU_READ_PREFETCH);
    584  1.6       chs 	DB_DNODE_EXIT(db);
    585  1.1      haad 
    586  1.1      haad 	return (err);
    587  1.1      haad }
    588  1.1      haad 
    589  1.1      haad void
    590  1.1      haad dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
    591  1.1      haad {
    592  1.1      haad 	int i;
    593  1.1      haad 	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
    594  1.1      haad 
    595  1.1      haad 	if (numbufs == 0)
    596  1.1      haad 		return;
    597  1.1      haad 
    598  1.1      haad 	for (i = 0; i < numbufs; i++) {
    599  1.1      haad 		if (dbp[i])
    600  1.1      haad 			dbuf_rele(dbp[i], tag);
    601  1.1      haad 	}
    602  1.1      haad 
    603  1.1      haad 	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
    604  1.1      haad }
    605  1.1      haad 
    606  1.6       chs /*
    607  1.6       chs  * Issue prefetch i/os for the given blocks.  If level is greater than 0, the
    608  1.6       chs  * indirect blocks prefeteched will be those that point to the blocks containing
    609  1.6       chs  * the data starting at offset, and continuing to offset + len.
    610  1.6       chs  *
    611  1.6       chs  * Note that if the indirect blocks above the blocks being prefetched are not in
    612  1.6       chs  * cache, they will be asychronously read in.
    613  1.6       chs  */
    614  1.1      haad void
    615  1.6       chs dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
    616  1.6       chs     uint64_t len, zio_priority_t pri)
    617  1.1      haad {
    618  1.1      haad 	dnode_t *dn;
    619  1.1      haad 	uint64_t blkid;
    620  1.6       chs 	int nblks, err;
    621  1.1      haad 
    622  1.1      haad 	if (len == 0) {  /* they're interested in the bonus buffer */
    623  1.6       chs 		dn = DMU_META_DNODE(os);
    624  1.1      haad 
    625  1.1      haad 		if (object == 0 || object >= DN_MAX_OBJECT)
    626  1.1      haad 			return;
    627  1.1      haad 
    628  1.1      haad 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
    629  1.6       chs 		blkid = dbuf_whichblock(dn, level,
    630  1.6       chs 		    object * sizeof (dnode_phys_t));
    631  1.6       chs 		dbuf_prefetch(dn, level, blkid, pri, 0);
    632  1.1      haad 		rw_exit(&dn->dn_struct_rwlock);
    633  1.1      haad 		return;
    634  1.1      haad 	}
    635  1.1      haad 
    636  1.1      haad 	/*
    637  1.1      haad 	 * XXX - Note, if the dnode for the requested object is not
    638  1.1      haad 	 * already cached, we will do a *synchronous* read in the
    639  1.1      haad 	 * dnode_hold() call.  The same is true for any indirects.
    640  1.1      haad 	 */
    641  1.3      haad 	err = dnode_hold(os, object, FTAG, &dn);
    642  1.1      haad 	if (err != 0)
    643  1.1      haad 		return;
    644  1.1      haad 
    645  1.1      haad 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
    646  1.6       chs 	/*
    647  1.6       chs 	 * offset + len - 1 is the last byte we want to prefetch for, and offset
    648  1.6       chs 	 * is the first.  Then dbuf_whichblk(dn, level, off + len - 1) is the
    649  1.6       chs 	 * last block we want to prefetch, and dbuf_whichblock(dn, level,
    650  1.6       chs 	 * offset)  is the first.  Then the number we need to prefetch is the
    651  1.6       chs 	 * last - first + 1.
    652  1.6       chs 	 */
    653  1.6       chs 	if (level > 0 || dn->dn_datablkshift != 0) {
    654  1.6       chs 		nblks = dbuf_whichblock(dn, level, offset + len - 1) -
    655  1.6       chs 		    dbuf_whichblock(dn, level, offset) + 1;
    656  1.1      haad 	} else {
    657  1.1      haad 		nblks = (offset < dn->dn_datablksz);
    658  1.1      haad 	}
    659  1.1      haad 
    660  1.1      haad 	if (nblks != 0) {
    661  1.6       chs 		blkid = dbuf_whichblock(dn, level, offset);
    662  1.6       chs 		for (int i = 0; i < nblks; i++)
    663  1.6       chs 			dbuf_prefetch(dn, level, blkid + i, pri, 0);
    664  1.1      haad 	}
    665  1.1      haad 
    666  1.1      haad 	rw_exit(&dn->dn_struct_rwlock);
    667  1.1      haad 
    668  1.1      haad 	dnode_rele(dn, FTAG);
    669  1.1      haad }
    670  1.1      haad 
    671  1.3      haad /*
    672  1.3      haad  * Get the next "chunk" of file data to free.  We traverse the file from
    673  1.3      haad  * the end so that the file gets shorter over time (if we crashes in the
    674  1.3      haad  * middle, this will leave us in a better state).  We find allocated file
    675  1.3      haad  * data by simply searching the allocated level 1 indirects.
    676  1.6       chs  *
    677  1.6       chs  * On input, *start should be the first offset that does not need to be
    678  1.6       chs  * freed (e.g. "offset + length").  On return, *start will be the first
    679  1.6       chs  * offset that should be freed.
    680  1.3      haad  */
    681  1.1      haad static int
    682  1.6       chs get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
    683  1.1      haad {
    684  1.6       chs 	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
    685  1.6       chs 	/* bytes of data covered by a level-1 indirect block */
    686  1.3      haad 	uint64_t iblkrange =
    687  1.1      haad 	    dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
    688  1.1      haad 
    689  1.6       chs 	ASSERT3U(minimum, <=, *start);
    690  1.1      haad 
    691  1.6       chs 	if (*start - minimum <= iblkrange * maxblks) {
    692  1.6       chs 		*start = minimum;
    693  1.1      haad 		return (0);
    694  1.1      haad 	}
    695  1.3      haad 	ASSERT(ISP2(iblkrange));
    696  1.1      haad 
    697  1.6       chs 	for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
    698  1.1      haad 		int err;
    699  1.1      haad 
    700  1.6       chs 		/*
    701  1.6       chs 		 * dnode_next_offset(BACKWARDS) will find an allocated L1
    702  1.6       chs 		 * indirect block at or before the input offset.  We must
    703  1.6       chs 		 * decrement *start so that it is at the end of the region
    704  1.6       chs 		 * to search.
    705  1.6       chs 		 */
    706  1.6       chs 		(*start)--;
    707  1.1      haad 		err = dnode_next_offset(dn,
    708  1.3      haad 		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);
    709  1.1      haad 
    710  1.6       chs 		/* if there are no indirect blocks before start, we are done */
    711  1.3      haad 		if (err == ESRCH) {
    712  1.6       chs 			*start = minimum;
    713  1.6       chs 			break;
    714  1.6       chs 		} else if (err != 0) {
    715  1.3      haad 			return (err);
    716  1.1      haad 		}
    717  1.1      haad 
    718  1.6       chs 		/* set start to the beginning of this L1 indirect */
    719  1.3      haad 		*start = P2ALIGN(*start, iblkrange);
    720  1.1      haad 	}
    721  1.6       chs 	if (*start < minimum)
    722  1.6       chs 		*start = minimum;
    723  1.1      haad 	return (0);
    724  1.1      haad }
    725  1.1      haad 
    726  1.1      haad static int
    727  1.1      haad dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
    728  1.6       chs     uint64_t length)
    729  1.1      haad {
    730  1.6       chs 	uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
    731  1.6       chs 	int err;
    732  1.6       chs 	uint64_t dirty_frees_threshold;
    733  1.6       chs 	dsl_pool_t *dp = dmu_objset_pool(os);
    734  1.6       chs 
    735  1.6       chs 	if (offset >= object_size)
    736  1.1      haad 		return (0);
    737  1.1      haad 
    738  1.6       chs 	if (zfs_per_txg_dirty_frees_percent <= 100)
    739  1.6       chs 		dirty_frees_threshold =
    740  1.6       chs 		    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
    741  1.6       chs 	else
    742  1.6       chs 		dirty_frees_threshold = zfs_dirty_data_max / 4;
    743  1.6       chs 
    744  1.6       chs 	if (length == DMU_OBJECT_END || offset + length > object_size)
    745  1.6       chs 		length = object_size - offset;
    746  1.6       chs 
    747  1.6       chs 	while (length != 0) {
    748  1.6       chs 		uint64_t chunk_end, chunk_begin, chunk_len;
    749  1.6       chs 		uint64_t long_free_dirty_all_txgs = 0;
    750  1.6       chs 		dmu_tx_t *tx;
    751  1.6       chs 
    752  1.6       chs 		chunk_end = chunk_begin = offset + length;
    753  1.6       chs 
    754  1.6       chs 		/* move chunk_begin backwards to the beginning of this chunk */
    755  1.6       chs 		err = get_next_chunk(dn, &chunk_begin, offset);
    756  1.1      haad 		if (err)
    757  1.1      haad 			return (err);
    758  1.6       chs 		ASSERT3U(chunk_begin, >=, offset);
    759  1.6       chs 		ASSERT3U(chunk_begin, <=, chunk_end);
    760  1.6       chs 
    761  1.6       chs 		chunk_len = chunk_end - chunk_begin;
    762  1.6       chs 
    763  1.6       chs 		mutex_enter(&dp->dp_lock);
    764  1.6       chs 		for (int t = 0; t < TXG_SIZE; t++) {
    765  1.6       chs 			long_free_dirty_all_txgs +=
    766  1.6       chs 			    dp->dp_long_free_dirty_pertxg[t];
    767  1.6       chs 		}
    768  1.6       chs 		mutex_exit(&dp->dp_lock);
    769  1.6       chs 
    770  1.6       chs 		/*
    771  1.6       chs 		 * To avoid filling up a TXG with just frees wait for
    772  1.6       chs 		 * the next TXG to open before freeing more chunks if
    773  1.6       chs 		 * we have reached the threshold of frees
    774  1.6       chs 		 */
    775  1.6       chs 		if (dirty_frees_threshold != 0 &&
    776  1.6       chs 		    long_free_dirty_all_txgs >= dirty_frees_threshold) {
    777  1.6       chs 			txg_wait_open(dp, 0);
    778  1.6       chs 			continue;
    779  1.6       chs 		}
    780  1.1      haad 
    781  1.1      haad 		tx = dmu_tx_create(os);
    782  1.6       chs 		dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
    783  1.6       chs 
    784  1.6       chs 		/*
    785  1.6       chs 		 * Mark this transaction as typically resulting in a net
    786  1.6       chs 		 * reduction in space used.
    787  1.6       chs 		 */
    788  1.6       chs 		dmu_tx_mark_netfree(tx);
    789  1.1      haad 		err = dmu_tx_assign(tx, TXG_WAIT);
    790  1.1      haad 		if (err) {
    791  1.1      haad 			dmu_tx_abort(tx);
    792  1.1      haad 			return (err);
    793  1.1      haad 		}
    794  1.1      haad 
    795  1.6       chs 		mutex_enter(&dp->dp_lock);
    796  1.6       chs 		dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
    797  1.6       chs 		    chunk_len;
    798  1.6       chs 		mutex_exit(&dp->dp_lock);
    799  1.6       chs 		DTRACE_PROBE3(free__long__range,
    800  1.6       chs 		    uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
    801  1.6       chs 		    uint64_t, dmu_tx_get_txg(tx));
    802  1.6       chs 		dnode_free_range(dn, chunk_begin, chunk_len, tx);
    803  1.6       chs 		dmu_tx_commit(tx);
    804  1.1      haad 
    805  1.6       chs 		length -= chunk_len;
    806  1.1      haad 	}
    807  1.1      haad 	return (0);
    808  1.1      haad }
    809  1.1      haad 
    810  1.1      haad int
    811  1.1      haad dmu_free_long_range(objset_t *os, uint64_t object,
    812  1.1      haad     uint64_t offset, uint64_t length)
    813  1.1      haad {
    814  1.1      haad 	dnode_t *dn;
    815  1.1      haad 	int err;
    816  1.1      haad 
    817  1.3      haad 	err = dnode_hold(os, object, FTAG, &dn);
    818  1.1      haad 	if (err != 0)
    819  1.1      haad 		return (err);
    820  1.6       chs 	err = dmu_free_long_range_impl(os, dn, offset, length);
    821  1.6       chs 
    822  1.6       chs 	/*
    823  1.6       chs 	 * It is important to zero out the maxblkid when freeing the entire
    824  1.6       chs 	 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
    825  1.6       chs 	 * will take the fast path, and (b) dnode_reallocate() can verify
    826  1.6       chs 	 * that the entire file has been freed.
    827  1.6       chs 	 */
    828  1.6       chs 	if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
    829  1.6       chs 		dn->dn_maxblkid = 0;
    830  1.6       chs 
    831  1.1      haad 	dnode_rele(dn, FTAG);
    832  1.1      haad 	return (err);
    833  1.1      haad }
    834  1.1      haad 
    835  1.1      haad int
    836  1.6       chs dmu_free_long_object(objset_t *os, uint64_t object)
    837  1.1      haad {
    838  1.1      haad 	dmu_tx_t *tx;
    839  1.1      haad 	int err;
    840  1.1      haad 
    841  1.6       chs 	err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
    842  1.1      haad 	if (err != 0)
    843  1.1      haad 		return (err);
    844  1.6       chs 
    845  1.6       chs 	tx = dmu_tx_create(os);
    846  1.6       chs 	dmu_tx_hold_bonus(tx, object);
    847  1.6       chs 	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
    848  1.6       chs 	dmu_tx_mark_netfree(tx);
    849  1.6       chs 	err = dmu_tx_assign(tx, TXG_WAIT);
    850  1.6       chs 	if (err == 0) {
    851  1.6       chs 		err = dmu_object_free(os, object, tx);
    852  1.6       chs 		dmu_tx_commit(tx);
    853  1.1      haad 	} else {
    854  1.6       chs 		dmu_tx_abort(tx);
    855  1.1      haad 	}
    856  1.6       chs 
    857  1.1      haad 	return (err);
    858  1.1      haad }
    859  1.1      haad 
    860  1.1      haad int
    861  1.1      haad dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
    862  1.1      haad     uint64_t size, dmu_tx_t *tx)
    863  1.1      haad {
    864  1.1      haad 	dnode_t *dn;
    865  1.3      haad 	int err = dnode_hold(os, object, FTAG, &dn);
    866  1.1      haad 	if (err)
    867  1.1      haad 		return (err);
    868  1.1      haad 	ASSERT(offset < UINT64_MAX);
    869  1.1      haad 	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
    870  1.1      haad 	dnode_free_range(dn, offset, size, tx);
    871  1.1      haad 	dnode_rele(dn, FTAG);
    872  1.1      haad 	return (0);
    873  1.1      haad }
    874  1.1      haad 
    875  1.1      haad int
    876  1.1      haad dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    877  1.3      haad     void *buf, uint32_t flags)
    878  1.1      haad {
    879  1.1      haad 	dnode_t *dn;
    880  1.1      haad 	dmu_buf_t **dbp;
    881  1.3      haad 	int numbufs, err;
    882  1.1      haad 
    883  1.3      haad 	err = dnode_hold(os, object, FTAG, &dn);
    884  1.1      haad 	if (err)
    885  1.1      haad 		return (err);
    886  1.1      haad 
    887  1.1      haad 	/*
    888  1.1      haad 	 * Deal with odd block sizes, where there can't be data past the first
    889  1.1      haad 	 * block.  If we ever do the tail block optimization, we will need to
    890  1.1      haad 	 * handle that here as well.
    891  1.1      haad 	 */
    892  1.3      haad 	if (dn->dn_maxblkid == 0) {
    893  1.1      haad 		int newsz = offset > dn->dn_datablksz ? 0 :
    894  1.1      haad 		    MIN(size, dn->dn_datablksz - offset);
    895  1.1      haad 		bzero((char *)buf + newsz, size - newsz);
    896  1.1      haad 		size = newsz;
    897  1.1      haad 	}
    898  1.1      haad 
    899  1.1      haad 	while (size > 0) {
    900  1.1      haad 		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
    901  1.3      haad 		int i;
    902  1.1      haad 
    903  1.1      haad 		/*
    904  1.1      haad 		 * NB: we could do this block-at-a-time, but it's nice
    905  1.1      haad 		 * to be reading in parallel.
    906  1.1      haad 		 */
    907  1.1      haad 		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
    908  1.3      haad 		    TRUE, FTAG, &numbufs, &dbp, flags);
    909  1.1      haad 		if (err)
    910  1.1      haad 			break;
    911  1.1      haad 
    912  1.1      haad 		for (i = 0; i < numbufs; i++) {
    913  1.1      haad 			int tocpy;
    914  1.1      haad 			int bufoff;
    915  1.1      haad 			dmu_buf_t *db = dbp[i];
    916  1.1      haad 
    917  1.1      haad 			ASSERT(size > 0);
    918  1.1      haad 
    919  1.1      haad 			bufoff = offset - db->db_offset;
    920  1.1      haad 			tocpy = (int)MIN(db->db_size - bufoff, size);
    921  1.1      haad 
    922  1.1      haad 			bcopy((char *)db->db_data + bufoff, buf, tocpy);
    923  1.1      haad 
    924  1.1      haad 			offset += tocpy;
    925  1.1      haad 			size -= tocpy;
    926  1.1      haad 			buf = (char *)buf + tocpy;
    927  1.1      haad 		}
    928  1.1      haad 		dmu_buf_rele_array(dbp, numbufs, FTAG);
    929  1.1      haad 	}
    930  1.1      haad 	dnode_rele(dn, FTAG);
    931  1.1      haad 	return (err);
    932  1.1      haad }
    933  1.1      haad 
    934  1.1      haad void
    935  1.1      haad dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    936  1.1      haad     const void *buf, dmu_tx_t *tx)
    937  1.1      haad {
    938  1.1      haad 	dmu_buf_t **dbp;
    939  1.1      haad 	int numbufs, i;
    940  1.1      haad 
    941  1.1      haad 	if (size == 0)
    942  1.1      haad 		return;
    943  1.1      haad 
    944  1.1      haad 	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
    945  1.1      haad 	    FALSE, FTAG, &numbufs, &dbp));
    946  1.1      haad 
    947  1.1      haad 	for (i = 0; i < numbufs; i++) {
    948  1.1      haad 		int tocpy;
    949  1.1      haad 		int bufoff;
    950  1.1      haad 		dmu_buf_t *db = dbp[i];
    951  1.1      haad 
    952  1.1      haad 		ASSERT(size > 0);
    953  1.1      haad 
    954  1.1      haad 		bufoff = offset - db->db_offset;
    955  1.1      haad 		tocpy = (int)MIN(db->db_size - bufoff, size);
    956  1.1      haad 
    957  1.1      haad 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
    958  1.1      haad 
    959  1.1      haad 		if (tocpy == db->db_size)
    960  1.1      haad 			dmu_buf_will_fill(db, tx);
    961  1.1      haad 		else
    962  1.1      haad 			dmu_buf_will_dirty(db, tx);
    963  1.1      haad 
    964  1.1      haad 		bcopy(buf, (char *)db->db_data + bufoff, tocpy);
    965  1.1      haad 
    966  1.1      haad 		if (tocpy == db->db_size)
    967  1.1      haad 			dmu_buf_fill_done(db, tx);
    968  1.1      haad 
    969  1.1      haad 		offset += tocpy;
    970  1.1      haad 		size -= tocpy;
    971  1.1      haad 		buf = (char *)buf + tocpy;
    972  1.1      haad 	}
    973  1.1      haad 	dmu_buf_rele_array(dbp, numbufs, FTAG);
    974  1.1      haad }
    975  1.1      haad 
    976  1.1      haad void
    977  1.1      haad dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    978  1.1      haad     dmu_tx_t *tx)
    979  1.1      haad {
    980  1.1      haad 	dmu_buf_t **dbp;
    981  1.1      haad 	int numbufs, i;
    982  1.1      haad 
    983  1.1      haad 	if (size == 0)
    984  1.1      haad 		return;
    985  1.1      haad 
    986  1.1      haad 	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
    987  1.1      haad 	    FALSE, FTAG, &numbufs, &dbp));
    988  1.1      haad 
    989  1.1      haad 	for (i = 0; i < numbufs; i++) {
    990  1.1      haad 		dmu_buf_t *db = dbp[i];
    991  1.1      haad 
    992  1.1      haad 		dmu_buf_will_not_fill(db, tx);
    993  1.1      haad 	}
    994  1.1      haad 	dmu_buf_rele_array(dbp, numbufs, FTAG);
    995  1.1      haad }
    996  1.1      haad 
    997  1.6       chs void
    998  1.6       chs dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
    999  1.6       chs     void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
   1000  1.6       chs     int compressed_size, int byteorder, dmu_tx_t *tx)
   1001  1.6       chs {
   1002  1.6       chs 	dmu_buf_t *db;
   1003  1.6       chs 
   1004  1.6       chs 	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
   1005  1.6       chs 	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
   1006  1.6       chs 	VERIFY0(dmu_buf_hold_noread(os, object, offset,
   1007  1.6       chs 	    FTAG, &db));
   1008  1.6       chs 
   1009  1.6       chs 	dmu_buf_write_embedded(db,
   1010  1.6       chs 	    data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
   1011  1.6       chs 	    uncompressed_size, compressed_size, byteorder, tx);
   1012  1.6       chs 
   1013  1.6       chs 	dmu_buf_rele(db, FTAG);
   1014  1.6       chs }
   1015  1.6       chs 
   1016  1.3      haad /*
   1017  1.3      haad  * DMU support for xuio
   1018  1.3      haad  */
   1019  1.3      haad kstat_t *xuio_ksp = NULL;
   1020  1.3      haad 
   1021  1.3      haad int
   1022  1.3      haad dmu_xuio_init(xuio_t *xuio, int nblk)
   1023  1.3      haad {
   1024  1.3      haad 	dmu_xuio_t *priv;
   1025  1.3      haad 	uio_t *uio = &xuio->xu_uio;
   1026  1.3      haad 
   1027  1.3      haad 	uio->uio_iovcnt = nblk;
   1028  1.3      haad 	uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
   1029  1.3      haad 
   1030  1.3      haad 	priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
   1031  1.3      haad 	priv->cnt = nblk;
   1032  1.3      haad 	priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
   1033  1.3      haad 	priv->iovp = uio->uio_iov;
   1034  1.3      haad 	XUIO_XUZC_PRIV(xuio) = priv;
   1035  1.3      haad 
   1036  1.3      haad 	if (XUIO_XUZC_RW(xuio) == UIO_READ)
   1037  1.3      haad 		XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
   1038  1.3      haad 	else
   1039  1.3      haad 		XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
   1040  1.4  riastrad 
   1041  1.3      haad 	return (0);
   1042  1.3      haad }
   1043  1.3      haad 
   1044  1.3      haad void
   1045  1.3      haad dmu_xuio_fini(xuio_t *xuio)
   1046  1.3      haad {
   1047  1.3      haad 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
   1048  1.3      haad 	int nblk = priv->cnt;
   1049  1.3      haad 
   1050  1.3      haad 	kmem_free(priv->iovp, nblk * sizeof (iovec_t));
   1051  1.3      haad 	kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
   1052  1.3      haad 	kmem_free(priv, sizeof (dmu_xuio_t));
   1053  1.4  riastrad 
   1054  1.3      haad 	if (XUIO_XUZC_RW(xuio) == UIO_READ)
   1055  1.3      haad 		XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
   1056  1.3      haad 	else
   1057  1.3      haad 		XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
   1058  1.3      haad }
   1059  1.3      haad 
   1060  1.3      haad /*
   1061  1.3      haad  * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
   1062  1.3      haad  * and increase priv->next by 1.
   1063  1.3      haad  */
   1064  1.3      haad int
   1065  1.3      haad dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
   1066  1.3      haad {
   1067  1.3      haad 	struct iovec *iov;
   1068  1.3      haad 	uio_t *uio = &xuio->xu_uio;
   1069  1.3      haad 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
   1070  1.3      haad 	int i = priv->next++;
   1071  1.3      haad 
   1072  1.3      haad 	ASSERT(i < priv->cnt);
   1073  1.3      haad 	ASSERT(off + n <= arc_buf_size(abuf));
   1074  1.3      haad 	iov = uio->uio_iov + i;
   1075  1.3      haad 	iov->iov_base = (char *)abuf->b_data + off;
   1076  1.3      haad 	iov->iov_len = n;
   1077  1.3      haad 	priv->bufs[i] = abuf;
   1078  1.3      haad 	return (0);
   1079  1.3      haad }
   1080  1.3      haad 
   1081  1.3      haad int
   1082  1.3      haad dmu_xuio_cnt(xuio_t *xuio)
   1083  1.3      haad {
   1084  1.3      haad 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
   1085  1.3      haad 	return (priv->cnt);
   1086  1.3      haad }
   1087  1.3      haad 
   1088  1.3      haad arc_buf_t *
   1089  1.3      haad dmu_xuio_arcbuf(xuio_t *xuio, int i)
   1090  1.3      haad {
   1091  1.3      haad 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
   1092  1.3      haad 
   1093  1.3      haad 	ASSERT(i < priv->cnt);
   1094  1.3      haad 	return (priv->bufs[i]);
   1095  1.3      haad }
   1096  1.3      haad 
   1097  1.3      haad void
   1098  1.3      haad dmu_xuio_clear(xuio_t *xuio, int i)
   1099  1.3      haad {
   1100  1.3      haad 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
   1101  1.3      haad 
   1102  1.3      haad 	ASSERT(i < priv->cnt);
   1103  1.3      haad 	priv->bufs[i] = NULL;
   1104  1.3      haad }
   1105  1.3      haad 
   1106  1.3      haad static void
   1107  1.3      haad xuio_stat_init(void)
   1108  1.3      haad {
   1109  1.3      haad 	xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
   1110  1.3      haad 	    KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
   1111  1.3      haad 	    KSTAT_FLAG_VIRTUAL);
   1112  1.3      haad 	if (xuio_ksp != NULL) {
   1113  1.3      haad 		xuio_ksp->ks_data = &xuio_stats;
   1114  1.3      haad 		kstat_install(xuio_ksp);
   1115  1.3      haad 	}
   1116  1.3      haad }
   1117  1.3      haad 
   1118  1.3      haad static void
   1119  1.3      haad xuio_stat_fini(void)
   1120  1.3      haad {
   1121  1.3      haad 	if (xuio_ksp != NULL) {
   1122  1.3      haad 		kstat_delete(xuio_ksp);
   1123  1.3      haad 		xuio_ksp = NULL;
   1124  1.3      haad 	}
   1125  1.3      haad }
   1126  1.3      haad 
   1127  1.3      haad void
   1128  1.3      haad xuio_stat_wbuf_copied()
   1129  1.3      haad {
   1130  1.3      haad 	XUIOSTAT_BUMP(xuiostat_wbuf_copied);
   1131  1.3      haad }
   1132  1.3      haad 
   1133  1.3      haad void
   1134  1.3      haad xuio_stat_wbuf_nocopy()
   1135  1.3      haad {
   1136  1.3      haad 	XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
   1137  1.3      haad }
   1138  1.3      haad 
   1139  1.1      haad #ifdef _KERNEL
   1140  1.6       chs static int
   1141  1.6       chs dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
   1142  1.1      haad {
   1143  1.1      haad 	dmu_buf_t **dbp;
   1144  1.1      haad 	int numbufs, i, err;
   1145  1.3      haad 	xuio_t *xuio = NULL;
   1146  1.1      haad 
   1147  1.1      haad 	/*
   1148  1.1      haad 	 * NB: we could do this block-at-a-time, but it's nice
   1149  1.1      haad 	 * to be reading in parallel.
   1150  1.1      haad 	 */
   1151  1.6       chs 	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
   1152  1.6       chs 	    TRUE, FTAG, &numbufs, &dbp, 0);
   1153  1.1      haad 	if (err)
   1154  1.1      haad 		return (err);
   1155  1.1      haad 
   1156  1.6       chs #ifdef UIO_XUIO
   1157  1.3      haad 	if (uio->uio_extflg == UIO_XUIO)
   1158  1.3      haad 		xuio = (xuio_t *)uio;
   1159  1.5  riastrad #endif
   1160  1.3      haad 
   1161  1.1      haad 	for (i = 0; i < numbufs; i++) {
   1162  1.1      haad 		int tocpy;
   1163  1.1      haad 		int bufoff;
   1164  1.1      haad 		dmu_buf_t *db = dbp[i];
   1165  1.1      haad 
   1166  1.1      haad 		ASSERT(size > 0);
   1167  1.1      haad 
   1168  1.1      haad 		bufoff = uio->uio_loffset - db->db_offset;
   1169  1.1      haad 		tocpy = (int)MIN(db->db_size - bufoff, size);
   1170  1.1      haad 
   1171  1.3      haad 		if (xuio) {
   1172  1.3      haad 			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
   1173  1.3      haad 			arc_buf_t *dbuf_abuf = dbi->db_buf;
   1174  1.3      haad 			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
   1175  1.3      haad 			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
   1176  1.3      haad 			if (!err) {
   1177  1.3      haad 				uio->uio_resid -= tocpy;
   1178  1.3      haad 				uio->uio_loffset += tocpy;
   1179  1.3      haad 			}
   1180  1.3      haad 
   1181  1.3      haad 			if (abuf == dbuf_abuf)
   1182  1.3      haad 				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
   1183  1.3      haad 			else
   1184  1.3      haad 				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
   1185  1.3      haad 		} else {
   1186  1.6       chs #ifdef illumos
   1187  1.6       chs 			err = uiomove((char *)db->db_data + bufoff, tocpy,
   1188  1.6       chs 			    UIO_READ, uio);
   1189  1.6       chs #endif
   1190  1.6       chs #ifdef __FreeBSD__
   1191  1.6       chs 			err = vn_io_fault_uiomove((char *)db->db_data + bufoff,
   1192  1.6       chs 			    tocpy, uio);
   1193  1.6       chs #endif
   1194  1.6       chs #ifdef __NetBSD__
   1195  1.3      haad 			err = uiomove((char *)db->db_data + bufoff, tocpy,
   1196  1.3      haad 			    UIO_READ, uio);
   1197  1.6       chs #endif
   1198  1.3      haad 		}
   1199  1.1      haad 		if (err)
   1200  1.1      haad 			break;
   1201  1.1      haad 
   1202  1.1      haad 		size -= tocpy;
   1203  1.1      haad 	}
   1204  1.1      haad 	dmu_buf_rele_array(dbp, numbufs, FTAG);
   1205  1.1      haad 
   1206  1.1      haad 	return (err);
   1207  1.1      haad }
   1208  1.1      haad 
   1209  1.6       chs /*
   1210  1.6       chs  * Read 'size' bytes into the uio buffer.
   1211  1.6       chs  * From object zdb->db_object.
   1212  1.6       chs  * Starting at offset uio->uio_loffset.
   1213  1.6       chs  *
   1214  1.6       chs  * If the caller already has a dbuf in the target object
   1215  1.6       chs  * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
   1216  1.6       chs  * because we don't have to find the dnode_t for the object.
   1217  1.6       chs  */
   1218  1.6       chs int
   1219  1.6       chs dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
   1220  1.6       chs {
   1221  1.6       chs 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
   1222  1.6       chs 	dnode_t *dn;
   1223  1.6       chs 	int err;
   1224  1.6       chs 
   1225  1.6       chs 	if (size == 0)
   1226  1.6       chs 		return (0);
   1227  1.6       chs 
   1228  1.6       chs 	DB_DNODE_ENTER(db);
   1229  1.6       chs 	dn = DB_DNODE(db);
   1230  1.6       chs 	err = dmu_read_uio_dnode(dn, uio, size);
   1231  1.6       chs 	DB_DNODE_EXIT(db);
   1232  1.6       chs 
   1233  1.6       chs 	return (err);
   1234  1.6       chs }
   1235  1.6       chs 
   1236  1.6       chs /*
   1237  1.6       chs  * Read 'size' bytes into the uio buffer.
   1238  1.6       chs  * From the specified object
   1239  1.6       chs  * Starting at offset uio->uio_loffset.
   1240  1.6       chs  */
   1241  1.1      haad int
   1242  1.6       chs dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
   1243  1.1      haad {
   1244  1.6       chs 	dnode_t *dn;
   1245  1.6       chs 	int err;
   1246  1.1      haad 
   1247  1.1      haad 	if (size == 0)
   1248  1.1      haad 		return (0);
   1249  1.1      haad 
   1250  1.6       chs 	err = dnode_hold(os, object, FTAG, &dn);
   1251  1.1      haad 	if (err)
   1252  1.1      haad 		return (err);
   1253  1.1      haad 
   1254  1.6       chs 	err = dmu_read_uio_dnode(dn, uio, size);
   1255  1.1      haad 
   1256  1.6       chs 	dnode_rele(dn, FTAG);
   1257  1.6       chs 
   1258  1.6       chs 	return (err);
   1259  1.6       chs }
   1260  1.6       chs 
   1261  1.6       chs static int
   1262  1.6       chs dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
   1263  1.6       chs {
   1264  1.6       chs 	dmu_buf_t **dbp;
   1265  1.6       chs 	int numbufs;
   1266  1.6       chs 	int err = 0;
   1267  1.6       chs 	int i;
   1268  1.6       chs 
   1269  1.6       chs 	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
   1270  1.6       chs 	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
   1271  1.6       chs 	if (err)
   1272  1.6       chs 		return (err);
   1273  1.6       chs 
   1274  1.6       chs 	for (i = 0; i < numbufs; i++) {
   1275  1.6       chs 		int tocpy;
   1276  1.6       chs 		int bufoff;
   1277  1.6       chs 		dmu_buf_t *db = dbp[i];
   1278  1.6       chs 
   1279  1.6       chs 		ASSERT(size > 0);
   1280  1.1      haad 
   1281  1.1      haad 		bufoff = uio->uio_loffset - db->db_offset;
   1282  1.1      haad 		tocpy = (int)MIN(db->db_size - bufoff, size);
   1283  1.1      haad 
   1284  1.1      haad 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
   1285  1.1      haad 
   1286  1.1      haad 		if (tocpy == db->db_size)
   1287  1.1      haad 			dmu_buf_will_fill(db, tx);
   1288  1.1      haad 		else
   1289  1.1      haad 			dmu_buf_will_dirty(db, tx);
   1290  1.1      haad 
   1291  1.6       chs #ifdef illumos
   1292  1.1      haad 		/*
   1293  1.1      haad 		 * XXX uiomove could block forever (eg. nfs-backed
   1294  1.1      haad 		 * pages).  There needs to be a uiolockdown() function
   1295  1.1      haad 		 * to lock the pages in memory, so that uiomove won't
   1296  1.1      haad 		 * block.
   1297  1.1      haad 		 */
   1298  1.1      haad 		err = uiomove((char *)db->db_data + bufoff, tocpy,
   1299  1.1      haad 		    UIO_WRITE, uio);
   1300  1.6       chs #endif
   1301  1.6       chs #ifdef __FreeBSD__
   1302  1.6       chs 		err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy,
   1303  1.6       chs 		    uio);
   1304  1.6       chs #endif
   1305  1.6       chs #ifdef __NetBSD__
   1306  1.6       chs 		err = uiomove((char *)db->db_data + bufoff, tocpy,
   1307  1.6       chs 		    UIO_WRITE, uio);
   1308  1.6       chs #endif
   1309  1.1      haad 
   1310  1.1      haad 		if (tocpy == db->db_size)
   1311  1.1      haad 			dmu_buf_fill_done(db, tx);
   1312  1.1      haad 
   1313  1.1      haad 		if (err)
   1314  1.1      haad 			break;
   1315  1.1      haad 
   1316  1.1      haad 		size -= tocpy;
   1317  1.1      haad 	}
   1318  1.6       chs 
   1319  1.1      haad 	dmu_buf_rele_array(dbp, numbufs, FTAG);
   1320  1.1      haad 	return (err);
   1321  1.1      haad }
   1322  1.1      haad 
   1323  1.6       chs /*
   1324  1.6       chs  * Write 'size' bytes from the uio buffer.
   1325  1.6       chs  * To object zdb->db_object.
   1326  1.6       chs  * Starting at offset uio->uio_loffset.
   1327  1.6       chs  *
   1328  1.6       chs  * If the caller already has a dbuf in the target object
   1329  1.6       chs  * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
   1330  1.6       chs  * because we don't have to find the dnode_t for the object.
   1331  1.6       chs  */
   1332  1.6       chs int
   1333  1.6       chs dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
   1334  1.6       chs     dmu_tx_t *tx)
   1335  1.6       chs {
   1336  1.6       chs 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
   1337  1.6       chs 	dnode_t *dn;
   1338  1.6       chs 	int err;
   1339  1.6       chs 
   1340  1.6       chs 	if (size == 0)
   1341  1.6       chs 		return (0);
   1342  1.6       chs 
   1343  1.6       chs 	DB_DNODE_ENTER(db);
   1344  1.6       chs 	dn = DB_DNODE(db);
   1345  1.6       chs 	err = dmu_write_uio_dnode(dn, uio, size, tx);
   1346  1.6       chs 	DB_DNODE_EXIT(db);
   1347  1.6       chs 
   1348  1.6       chs 	return (err);
   1349  1.6       chs }
   1350  1.6       chs 
   1351  1.6       chs /*
   1352  1.6       chs  * Write 'size' bytes from the uio buffer.
   1353  1.6       chs  * To the specified object.
   1354  1.6       chs  * Starting at offset uio->uio_loffset.
   1355  1.6       chs  */
   1356  1.6       chs int
   1357  1.6       chs dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
   1358  1.6       chs     dmu_tx_t *tx)
   1359  1.6       chs {
   1360  1.6       chs 	dnode_t *dn;
   1361  1.6       chs 	int err;
   1362  1.6       chs 
   1363  1.6       chs 	if (size == 0)
   1364  1.6       chs 		return (0);
   1365  1.6       chs 
   1366  1.6       chs 	err = dnode_hold(os, object, FTAG, &dn);
   1367  1.6       chs 	if (err)
   1368  1.6       chs 		return (err);
   1369  1.6       chs 
   1370  1.6       chs 	err = dmu_write_uio_dnode(dn, uio, size, tx);
   1371  1.6       chs 
   1372  1.6       chs 	dnode_rele(dn, FTAG);
   1373  1.6       chs 
   1374  1.6       chs 	return (err);
   1375  1.6       chs }
   1376  1.6       chs 
   1377  1.6       chs #ifdef illumos
   1378  1.1      haad int
   1379  1.1      haad dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
   1380  1.1      haad     page_t *pp, dmu_tx_t *tx)
   1381  1.1      haad {
   1382  1.1      haad 	dmu_buf_t **dbp;
   1383  1.1      haad 	int numbufs, i;
   1384  1.1      haad 	int err;
   1385  1.1      haad 
   1386  1.1      haad 	if (size == 0)
   1387  1.1      haad 		return (0);
   1388  1.1      haad 
   1389  1.1      haad 	err = dmu_buf_hold_array(os, object, offset, size,
   1390  1.1      haad 	    FALSE, FTAG, &numbufs, &dbp);
   1391  1.1      haad 	if (err)
   1392  1.1      haad 		return (err);
   1393  1.1      haad 
   1394  1.1      haad 	for (i = 0; i < numbufs; i++) {
   1395  1.1      haad 		int tocpy, copied, thiscpy;
   1396  1.1      haad 		int bufoff;
   1397  1.1      haad 		dmu_buf_t *db = dbp[i];
   1398  1.1      haad 		caddr_t va;
   1399  1.1      haad 
   1400  1.1      haad 		ASSERT(size > 0);
   1401  1.1      haad 		ASSERT3U(db->db_size, >=, PAGESIZE);
   1402  1.1      haad 
   1403  1.1      haad 		bufoff = offset - db->db_offset;
   1404  1.1      haad 		tocpy = (int)MIN(db->db_size - bufoff, size);
   1405  1.1      haad 
   1406  1.1      haad 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
   1407  1.1      haad 
   1408  1.1      haad 		if (tocpy == db->db_size)
   1409  1.1      haad 			dmu_buf_will_fill(db, tx);
   1410  1.1      haad 		else
   1411  1.1      haad 			dmu_buf_will_dirty(db, tx);
   1412  1.1      haad 
   1413  1.1      haad 		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
   1414  1.1      haad 			ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
   1415  1.1      haad 			thiscpy = MIN(PAGESIZE, tocpy - copied);
   1416  1.1      haad 			va = zfs_map_page(pp, S_READ);
   1417  1.1      haad 			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
   1418  1.1      haad 			zfs_unmap_page(pp, va);
   1419  1.1      haad 			pp = pp->p_next;
   1420  1.1      haad 			bufoff += PAGESIZE;
   1421  1.1      haad 		}
   1422  1.1      haad 
   1423  1.1      haad 		if (tocpy == db->db_size)
   1424  1.1      haad 			dmu_buf_fill_done(db, tx);
   1425  1.1      haad 
   1426  1.1      haad 		offset += tocpy;
   1427  1.1      haad 		size -= tocpy;
   1428  1.1      haad 	}
   1429  1.1      haad 	dmu_buf_rele_array(dbp, numbufs, FTAG);
   1430  1.1      haad 	return (err);
   1431  1.1      haad }
   1432  1.6       chs #endif /* illumos */
   1433  1.6       chs 
   1434  1.6       chs #ifdef __FreeBSD__
   1435  1.6       chs int
   1436  1.6       chs dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
   1437  1.6       chs     vm_page_t *ma, dmu_tx_t *tx)
   1438  1.6       chs {
   1439  1.6       chs 	dmu_buf_t **dbp;
   1440  1.6       chs 	struct sf_buf *sf;
   1441  1.6       chs 	int numbufs, i;
   1442  1.6       chs 	int err;
   1443  1.6       chs 
   1444  1.6       chs 	if (size == 0)
   1445  1.6       chs 		return (0);
   1446  1.6       chs 
   1447  1.6       chs 	err = dmu_buf_hold_array(os, object, offset, size,
   1448  1.6       chs 	    FALSE, FTAG, &numbufs, &dbp);
   1449  1.6       chs 	if (err)
   1450  1.6       chs 		return (err);
   1451  1.6       chs 
   1452  1.6       chs 	for (i = 0; i < numbufs; i++) {
   1453  1.6       chs 		int tocpy, copied, thiscpy;
   1454  1.6       chs 		int bufoff;
   1455  1.6       chs 		dmu_buf_t *db = dbp[i];
   1456  1.6       chs 		caddr_t va;
   1457  1.6       chs 
   1458  1.6       chs 		ASSERT(size > 0);
   1459  1.6       chs 		ASSERT3U(db->db_size, >=, PAGESIZE);
   1460  1.6       chs 
   1461  1.6       chs 		bufoff = offset - db->db_offset;
   1462  1.6       chs 		tocpy = (int)MIN(db->db_size - bufoff, size);
   1463  1.6       chs 
   1464  1.6       chs 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
   1465  1.6       chs 
   1466  1.6       chs 		if (tocpy == db->db_size)
   1467  1.6       chs 			dmu_buf_will_fill(db, tx);
   1468  1.6       chs 		else
   1469  1.6       chs 			dmu_buf_will_dirty(db, tx);
   1470  1.6       chs 
   1471  1.6       chs 		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
   1472  1.6       chs 			ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff);
   1473  1.6       chs 			thiscpy = MIN(PAGESIZE, tocpy - copied);
   1474  1.6       chs 			va = zfs_map_page(*ma, &sf);
   1475  1.6       chs 			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
   1476  1.6       chs 			zfs_unmap_page(sf);
   1477  1.6       chs 			ma += 1;
   1478  1.6       chs 			bufoff += PAGESIZE;
   1479  1.6       chs 		}
   1480  1.6       chs 
   1481  1.6       chs 		if (tocpy == db->db_size)
   1482  1.6       chs 			dmu_buf_fill_done(db, tx);
   1483  1.6       chs 
   1484  1.6       chs 		offset += tocpy;
   1485  1.6       chs 		size -= tocpy;
   1486  1.6       chs 	}
   1487  1.6       chs 	dmu_buf_rele_array(dbp, numbufs, FTAG);
   1488  1.6       chs 	return (err);
   1489  1.6       chs }
   1490  1.6       chs #endif	/* __FreeBSD__ */
   1491  1.6       chs 
   1492  1.6       chs #ifdef __NetBSD__
   1493  1.6       chs int
   1494  1.6       chs dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
   1495  1.6       chs     struct vm_page **pgs, dmu_tx_t *tx)
   1496  1.6       chs {
   1497  1.6       chs 	dmu_buf_t **dbp;
   1498  1.6       chs 	int numbufs, i;
   1499  1.6       chs 	int err;
   1500  1.6       chs 
   1501  1.6       chs 	if (size == 0)
   1502  1.6       chs 		return (0);
   1503  1.6       chs 
   1504  1.6       chs 	err = dmu_buf_hold_array(os, object, offset, size,
   1505  1.6       chs 	    FALSE, FTAG, &numbufs, &dbp);
   1506  1.6       chs 	if (err)
   1507  1.6       chs 		return (err);
   1508  1.6       chs 
   1509  1.6       chs 	for (i = 0; i < numbufs; i++) {
   1510  1.6       chs 		int tocpy, copied, thiscpy;
   1511  1.6       chs 		int bufoff;
   1512  1.6       chs 		dmu_buf_t *db = dbp[i];
   1513  1.6       chs 		caddr_t va;
   1514  1.6       chs 
   1515  1.6       chs 		ASSERT(size > 0);
   1516  1.6       chs 		ASSERT3U(db->db_size, >=, PAGESIZE);
   1517  1.6       chs 
   1518  1.6       chs 		bufoff = offset - db->db_offset;
   1519  1.6       chs 		tocpy = (int)MIN(db->db_size - bufoff, size);
   1520  1.6       chs 
   1521  1.6       chs 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
   1522  1.6       chs 
   1523  1.6       chs 		if (tocpy == db->db_size)
   1524  1.6       chs 			dmu_buf_will_fill(db, tx);
   1525  1.6       chs 		else
   1526  1.6       chs 			dmu_buf_will_dirty(db, tx);
   1527  1.6       chs 
   1528  1.6       chs 		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
   1529  1.6       chs 			ASSERT3U((*pgs)->offset, ==, db->db_offset + bufoff);
   1530  1.6       chs 			thiscpy = MIN(PAGESIZE, tocpy - copied);
   1531  1.6       chs 			va = zfs_map_page(*pgs, S_READ);
   1532  1.6       chs 			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
   1533  1.6       chs 			zfs_unmap_page(*pgs, va);
   1534  1.6       chs 			pgs++;
   1535  1.6       chs 			bufoff += PAGESIZE;
   1536  1.6       chs 		}
   1537  1.6       chs 
   1538  1.6       chs 		if (tocpy == db->db_size)
   1539  1.6       chs 			dmu_buf_fill_done(db, tx);
   1540  1.6       chs 
   1541  1.6       chs 		offset += tocpy;
   1542  1.6       chs 		size -= tocpy;
   1543  1.6       chs 	}
   1544  1.6       chs 	dmu_buf_rele_array(dbp, numbufs, FTAG);
   1545  1.6       chs 	return (err);
   1546  1.6       chs }
   1547  1.1      haad #endif
   1548  1.6       chs #endif	/* _KERNEL */
   1549  1.1      haad 
   1550  1.3      haad /*
   1551  1.3      haad  * Allocate a loaned anonymous arc buffer.
   1552  1.3      haad  */
   1553  1.3      haad arc_buf_t *
   1554  1.3      haad dmu_request_arcbuf(dmu_buf_t *handle, int size)
   1555  1.3      haad {
   1556  1.6       chs 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
   1557  1.3      haad 
   1558  1.6       chs 	return (arc_loan_buf(db->db_objset->os_spa, size));
   1559  1.3      haad }
   1560  1.3      haad 
   1561  1.3      haad /*
   1562  1.3      haad  * Free a loaned arc buffer.
   1563  1.3      haad  */
   1564  1.3      haad void
   1565  1.3      haad dmu_return_arcbuf(arc_buf_t *buf)
   1566  1.3      haad {
   1567  1.3      haad 	arc_return_buf(buf, FTAG);
   1568  1.6       chs 	arc_buf_destroy(buf, FTAG);
   1569  1.3      haad }
   1570  1.3      haad 
   1571  1.3      haad /*
   1572  1.3      haad  * When possible directly assign passed loaned arc buffer to a dbuf.
   1573  1.3      haad  * If this is not possible copy the contents of passed arc buf via
   1574  1.3      haad  * dmu_write().
   1575  1.3      haad  */
   1576  1.3      haad void
   1577  1.3      haad dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
   1578  1.3      haad     dmu_tx_t *tx)
   1579  1.3      haad {
   1580  1.6       chs 	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
   1581  1.6       chs 	dnode_t *dn;
   1582  1.3      haad 	dmu_buf_impl_t *db;
   1583  1.3      haad 	uint32_t blksz = (uint32_t)arc_buf_size(buf);
   1584  1.3      haad 	uint64_t blkid;
   1585  1.3      haad 
   1586  1.6       chs 	DB_DNODE_ENTER(dbuf);
   1587  1.6       chs 	dn = DB_DNODE(dbuf);
   1588  1.3      haad 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
   1589  1.6       chs 	blkid = dbuf_whichblock(dn, 0, offset);
   1590  1.3      haad 	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
   1591  1.3      haad 	rw_exit(&dn->dn_struct_rwlock);
   1592  1.6       chs 	DB_DNODE_EXIT(dbuf);
   1593  1.3      haad 
   1594  1.6       chs 	/*
   1595  1.6       chs 	 * We can only assign if the offset is aligned, the arc buf is the
   1596  1.6       chs 	 * same size as the dbuf, and the dbuf is not metadata.  It
   1597  1.6       chs 	 * can't be metadata because the loaned arc buf comes from the
   1598  1.6       chs 	 * user-data kmem arena.
   1599  1.6       chs 	 */
   1600  1.6       chs 	if (offset == db->db.db_offset && blksz == db->db.db_size &&
   1601  1.6       chs 	    DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) {
   1602  1.6       chs #ifdef _KERNEL
   1603  1.6       chs 		curthread->td_ru.ru_oublock++;
   1604  1.6       chs #ifdef RACCT
   1605  1.6       chs 		if (racct_enable) {
   1606  1.6       chs 			PROC_LOCK(curproc);
   1607  1.6       chs 			racct_add_force(curproc, RACCT_WRITEBPS, blksz);
   1608  1.6       chs 			racct_add_force(curproc, RACCT_WRITEIOPS, 1);
   1609  1.6       chs 			PROC_UNLOCK(curproc);
   1610  1.6       chs 		}
   1611  1.6       chs #endif /* RACCT */
   1612  1.6       chs #endif /* _KERNEL */
   1613  1.3      haad 		dbuf_assign_arcbuf(db, buf, tx);
   1614  1.3      haad 		dbuf_rele(db, FTAG);
   1615  1.3      haad 	} else {
   1616  1.6       chs 		objset_t *os;
   1617  1.6       chs 		uint64_t object;
   1618  1.6       chs 
   1619  1.6       chs 		DB_DNODE_ENTER(dbuf);
   1620  1.6       chs 		dn = DB_DNODE(dbuf);
   1621  1.6       chs 		os = dn->dn_objset;
   1622  1.6       chs 		object = dn->dn_object;
   1623  1.6       chs 		DB_DNODE_EXIT(dbuf);
   1624  1.6       chs 
   1625  1.3      haad 		dbuf_rele(db, FTAG);
   1626  1.6       chs 		dmu_write(os, object, offset, blksz, buf->b_data, tx);
   1627  1.3      haad 		dmu_return_arcbuf(buf);
   1628  1.3      haad 		XUIOSTAT_BUMP(xuiostat_wbuf_copied);
   1629  1.3      haad 	}
   1630  1.3      haad }
   1631  1.3      haad 
   1632  1.1      haad typedef struct {
   1633  1.3      haad 	dbuf_dirty_record_t	*dsa_dr;
   1634  1.3      haad 	dmu_sync_cb_t		*dsa_done;
   1635  1.3      haad 	zgd_t			*dsa_zgd;
   1636  1.3      haad 	dmu_tx_t		*dsa_tx;
   1637  1.1      haad } dmu_sync_arg_t;
   1638  1.1      haad 
   1639  1.1      haad /* ARGSUSED */
   1640  1.1      haad static void
   1641  1.1      haad dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
   1642  1.1      haad {
   1643  1.3      haad 	dmu_sync_arg_t *dsa = varg;
   1644  1.3      haad 	dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
   1645  1.1      haad 	blkptr_t *bp = zio->io_bp;
   1646  1.1      haad 
   1647  1.3      haad 	if (zio->io_error == 0) {
   1648  1.3      haad 		if (BP_IS_HOLE(bp)) {
   1649  1.3      haad 			/*
   1650  1.3      haad 			 * A block of zeros may compress to a hole, but the
   1651  1.3      haad 			 * block size still needs to be known for replay.
   1652  1.3      haad 			 */
   1653  1.3      haad 			BP_SET_LSIZE(bp, db->db_size);
   1654  1.6       chs 		} else if (!BP_IS_EMBEDDED(bp)) {
   1655  1.3      haad 			ASSERT(BP_GET_LEVEL(bp) == 0);
   1656  1.3      haad 			bp->blk_fill = 1;
   1657  1.3      haad 		}
   1658  1.1      haad 	}
   1659  1.1      haad }
   1660  1.1      haad 
   1661  1.3      haad static void
   1662  1.3      haad dmu_sync_late_arrival_ready(zio_t *zio)
   1663  1.3      haad {
   1664  1.3      haad 	dmu_sync_ready(zio, NULL, zio->io_private);
   1665  1.3      haad }
   1666  1.3      haad 
   1667  1.1      haad /* ARGSUSED */
   1668  1.1      haad static void
   1669  1.1      haad dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
   1670  1.1      haad {
   1671  1.3      haad 	dmu_sync_arg_t *dsa = varg;
   1672  1.3      haad 	dbuf_dirty_record_t *dr = dsa->dsa_dr;
   1673  1.1      haad 	dmu_buf_impl_t *db = dr->dr_dbuf;
   1674  1.1      haad 
   1675  1.1      haad 	mutex_enter(&db->db_mtx);
   1676  1.1      haad 	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
   1677  1.3      haad 	if (zio->io_error == 0) {
   1678  1.6       chs 		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
   1679  1.6       chs 		if (dr->dt.dl.dr_nopwrite) {
   1680  1.6       chs 			blkptr_t *bp = zio->io_bp;
   1681  1.6       chs 			blkptr_t *bp_orig = &zio->io_bp_orig;
   1682  1.6       chs 			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
   1683  1.6       chs 
   1684  1.6       chs 			ASSERT(BP_EQUAL(bp, bp_orig));
   1685  1.6       chs 			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
   1686  1.6       chs 			ASSERT(zio_checksum_table[chksum].ci_flags &
   1687  1.6       chs 			    ZCHECKSUM_FLAG_NOPWRITE);
   1688  1.6       chs 		}
   1689  1.3      haad 		dr->dt.dl.dr_overridden_by = *zio->io_bp;
   1690  1.3      haad 		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
   1691  1.3      haad 		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
   1692  1.6       chs 
   1693  1.6       chs 		/*
   1694  1.6       chs 		 * Old style holes are filled with all zeros, whereas
   1695  1.6       chs 		 * new-style holes maintain their lsize, type, level,
   1696  1.6       chs 		 * and birth time (see zio_write_compress). While we
   1697  1.6       chs 		 * need to reset the BP_SET_LSIZE() call that happened
   1698  1.6       chs 		 * in dmu_sync_ready for old style holes, we do *not*
   1699  1.6       chs 		 * want to wipe out the information contained in new
   1700  1.6       chs 		 * style holes. Thus, only zero out the block pointer if
   1701  1.6       chs 		 * it's an old style hole.
   1702  1.6       chs 		 */
   1703  1.6       chs 		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
   1704  1.6       chs 		    dr->dt.dl.dr_overridden_by.blk_birth == 0)
   1705  1.3      haad 			BP_ZERO(&dr->dt.dl.dr_overridden_by);
   1706  1.3      haad 	} else {
   1707  1.3      haad 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
   1708  1.3      haad 	}
   1709  1.1      haad 	cv_broadcast(&db->db_changed);
   1710  1.1      haad 	mutex_exit(&db->db_mtx);
   1711  1.1      haad 
   1712  1.3      haad 	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
   1713  1.3      haad 
   1714  1.3      haad 	kmem_free(dsa, sizeof (*dsa));
   1715  1.3      haad }
   1716  1.3      haad 
   1717  1.3      haad static void
   1718  1.3      haad dmu_sync_late_arrival_done(zio_t *zio)
   1719  1.3      haad {
   1720  1.3      haad 	blkptr_t *bp = zio->io_bp;
   1721  1.3      haad 	dmu_sync_arg_t *dsa = zio->io_private;
   1722  1.6       chs 	blkptr_t *bp_orig = &zio->io_bp_orig;
   1723  1.3      haad 
   1724  1.3      haad 	if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
   1725  1.6       chs 		/*
   1726  1.6       chs 		 * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE)
   1727  1.6       chs 		 * then there is nothing to do here. Otherwise, free the
   1728  1.6       chs 		 * newly allocated block in this txg.
   1729  1.6       chs 		 */
   1730  1.6       chs 		if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
   1731  1.6       chs 			ASSERT(BP_EQUAL(bp, bp_orig));
   1732  1.6       chs 		} else {
   1733  1.6       chs 			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
   1734  1.6       chs 			ASSERT(zio->io_bp->blk_birth == zio->io_txg);
   1735  1.6       chs 			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
   1736  1.6       chs 			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
   1737  1.6       chs 		}
   1738  1.3      haad 	}
   1739  1.3      haad 
   1740  1.3      haad 	dmu_tx_commit(dsa->dsa_tx);
   1741  1.1      haad 
   1742  1.3      haad 	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
   1743  1.3      haad 
   1744  1.3      haad 	kmem_free(dsa, sizeof (*dsa));
   1745  1.3      haad }
   1746  1.3      haad 
   1747  1.3      haad static int
   1748  1.3      haad dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
   1749  1.6       chs     zio_prop_t *zp, zbookmark_phys_t *zb)
   1750  1.3      haad {
   1751  1.3      haad 	dmu_sync_arg_t *dsa;
   1752  1.3      haad 	dmu_tx_t *tx;
   1753  1.3      haad 
   1754  1.3      haad 	tx = dmu_tx_create(os);
   1755  1.3      haad 	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
   1756  1.3      haad 	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
   1757  1.3      haad 		dmu_tx_abort(tx);
   1758  1.6       chs 		/* Make zl_get_data do txg_waited_synced() */
   1759  1.6       chs 		return (SET_ERROR(EIO));
   1760  1.3      haad 	}
   1761  1.3      haad 
   1762  1.3      haad 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
   1763  1.3      haad 	dsa->dsa_dr = NULL;
   1764  1.3      haad 	dsa->dsa_done = done;
   1765  1.3      haad 	dsa->dsa_zgd = zgd;
   1766  1.3      haad 	dsa->dsa_tx = tx;
   1767  1.3      haad 
   1768  1.6       chs 	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx),
   1769  1.6       chs 	    zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size,
   1770  1.6       chs 	    zp, dmu_sync_late_arrival_ready, NULL,
   1771  1.6       chs 	    NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
   1772  1.6       chs 	    ZIO_FLAG_CANFAIL, zb));
   1773  1.3      haad 
   1774  1.3      haad 	return (0);
   1775  1.1      haad }
   1776  1.1      haad 
   1777  1.1      haad /*
   1778  1.1      haad  * Intent log support: sync the block associated with db to disk.
   1779  1.1      haad  * N.B. and XXX: the caller is responsible for making sure that the
   1780  1.1      haad  * data isn't changing while dmu_sync() is writing it.
   1781  1.1      haad  *
   1782  1.1      haad  * Return values:
   1783  1.1      haad  *
   1784  1.6       chs  *	EEXIST: this txg has already been synced, so there's nothing to do.
   1785  1.1      haad  *		The caller should not log the write.
   1786  1.1      haad  *
   1787  1.1      haad  *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
   1788  1.1      haad  *		The caller should not log the write.
   1789  1.1      haad  *
   1790  1.1      haad  *	EALREADY: this block is already in the process of being synced.
   1791  1.1      haad  *		The caller should track its progress (somehow).
   1792  1.1      haad  *
   1793  1.3      haad  *	EIO: could not do the I/O.
   1794  1.3      haad  *		The caller should do a txg_wait_synced().
   1795  1.1      haad  *
   1796  1.3      haad  *	0: the I/O has been initiated.
   1797  1.3      haad  *		The caller should log this blkptr in the done callback.
   1798  1.3      haad  *		It is possible that the I/O will fail, in which case
   1799  1.3      haad  *		the error will be reported to the done callback and
   1800  1.3      haad  *		propagated to pio from zio_done().
   1801  1.1      haad  */
   1802  1.1      haad int
   1803  1.3      haad dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
   1804  1.1      haad {
   1805  1.3      haad 	blkptr_t *bp = zgd->zgd_bp;
   1806  1.3      haad 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
   1807  1.3      haad 	objset_t *os = db->db_objset;
   1808  1.3      haad 	dsl_dataset_t *ds = os->os_dsl_dataset;
   1809  1.1      haad 	dbuf_dirty_record_t *dr;
   1810  1.3      haad 	dmu_sync_arg_t *dsa;
   1811  1.6       chs 	zbookmark_phys_t zb;
   1812  1.3      haad 	zio_prop_t zp;
   1813  1.6       chs 	dnode_t *dn;
   1814  1.1      haad 
   1815  1.3      haad 	ASSERT(pio != NULL);
   1816  1.1      haad 	ASSERT(txg != 0);
   1817  1.1      haad 
   1818  1.3      haad 	SET_BOOKMARK(&zb, ds->ds_object,
   1819  1.3      haad 	    db->db.db_object, db->db_level, db->db_blkid);
   1820  1.3      haad 
   1821  1.6       chs 	DB_DNODE_ENTER(db);
   1822  1.6       chs 	dn = DB_DNODE(db);
   1823  1.6       chs 	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
   1824  1.6       chs 	DB_DNODE_EXIT(db);
   1825  1.1      haad 
   1826  1.1      haad 	/*
   1827  1.3      haad 	 * If we're frozen (running ziltest), we always need to generate a bp.
   1828  1.1      haad 	 */
   1829  1.3      haad 	if (txg > spa_freeze_txg(os->os_spa))
   1830  1.3      haad 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
   1831  1.1      haad 
   1832  1.1      haad 	/*
   1833  1.3      haad 	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
   1834  1.3      haad 	 * and us.  If we determine that this txg is not yet syncing,
   1835  1.3      haad 	 * but it begins to sync a moment later, that's OK because the
   1836  1.3      haad 	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
   1837  1.1      haad 	 */
   1838  1.3      haad 	mutex_enter(&db->db_mtx);
   1839  1.3      haad 
   1840  1.3      haad 	if (txg <= spa_last_synced_txg(os->os_spa)) {
   1841  1.1      haad 		/*
   1842  1.3      haad 		 * This txg has already synced.  There's nothing to do.
   1843  1.1      haad 		 */
   1844  1.3      haad 		mutex_exit(&db->db_mtx);
   1845  1.6       chs 		return (SET_ERROR(EEXIST));
   1846  1.1      haad 	}
   1847  1.1      haad 
   1848  1.3      haad 	if (txg <= spa_syncing_txg(os->os_spa)) {
   1849  1.3      haad 		/*
   1850  1.3      haad 		 * This txg is currently syncing, so we can't mess with
   1851  1.3      haad 		 * the dirty record anymore; just write a new log block.
   1852  1.3      haad 		 */
   1853  1.3      haad 		mutex_exit(&db->db_mtx);
   1854  1.3      haad 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
   1855  1.1      haad 	}
   1856  1.1      haad 
   1857  1.1      haad 	dr = db->db_last_dirty;
   1858  1.3      haad 	while (dr && dr->dr_txg != txg)
   1859  1.1      haad 		dr = dr->dr_next;
   1860  1.3      haad 
   1861  1.3      haad 	if (dr == NULL) {
   1862  1.1      haad 		/*
   1863  1.3      haad 		 * There's no dr for this dbuf, so it must have been freed.
   1864  1.1      haad 		 * There's no need to log writes to freed blocks, so we're done.
   1865  1.1      haad 		 */
   1866  1.1      haad 		mutex_exit(&db->db_mtx);
   1867  1.6       chs 		return (SET_ERROR(ENOENT));
   1868  1.1      haad 	}
   1869  1.1      haad 
   1870  1.6       chs 	ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
   1871  1.6       chs 
   1872  1.6       chs 	/*
   1873  1.6       chs 	 * Assume the on-disk data is X, the current syncing data (in
   1874  1.6       chs 	 * txg - 1) is Y, and the current in-memory data is Z (currently
   1875  1.6       chs 	 * in dmu_sync).
   1876  1.6       chs 	 *
   1877  1.6       chs 	 * We usually want to perform a nopwrite if X and Z are the
   1878  1.6       chs 	 * same.  However, if Y is different (i.e. the BP is going to
   1879  1.6       chs 	 * change before this write takes effect), then a nopwrite will
   1880  1.6       chs 	 * be incorrect - we would override with X, which could have
   1881  1.6       chs 	 * been freed when Y was written.
   1882  1.6       chs 	 *
   1883  1.6       chs 	 * (Note that this is not a concern when we are nop-writing from
   1884  1.6       chs 	 * syncing context, because X and Y must be identical, because
   1885  1.6       chs 	 * all previous txgs have been synced.)
   1886  1.6       chs 	 *
   1887  1.6       chs 	 * Therefore, we disable nopwrite if the current BP could change
   1888  1.6       chs 	 * before this TXG.  There are two ways it could change: by
   1889  1.6       chs 	 * being dirty (dr_next is non-NULL), or by being freed
   1890  1.6       chs 	 * (dnode_block_freed()).  This behavior is verified by
   1891  1.6       chs 	 * zio_done(), which VERIFYs that the override BP is identical
   1892  1.6       chs 	 * to the on-disk BP.
   1893  1.6       chs 	 */
   1894  1.6       chs 	DB_DNODE_ENTER(db);
   1895  1.6       chs 	dn = DB_DNODE(db);
   1896  1.6       chs 	if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
   1897  1.6       chs 		zp.zp_nopwrite = B_FALSE;
   1898  1.6       chs 	DB_DNODE_EXIT(db);
   1899  1.6       chs 
   1900  1.1      haad 	ASSERT(dr->dr_txg == txg);
   1901  1.3      haad 	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
   1902  1.3      haad 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
   1903  1.1      haad 		/*
   1904  1.3      haad 		 * We have already issued a sync write for this buffer,
   1905  1.3      haad 		 * or this buffer has already been synced.  It could not
   1906  1.3      haad 		 * have been dirtied since, or we would have cleared the state.
   1907  1.1      haad 		 */
   1908  1.1      haad 		mutex_exit(&db->db_mtx);
   1909  1.6       chs 		return (SET_ERROR(EALREADY));
   1910  1.1      haad 	}
   1911  1.1      haad 
   1912  1.3      haad 	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
   1913  1.1      haad 	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
   1914  1.1      haad 	mutex_exit(&db->db_mtx);
   1915  1.1      haad 
   1916  1.3      haad 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
   1917  1.3      haad 	dsa->dsa_dr = dr;
   1918  1.3      haad 	dsa->dsa_done = done;
   1919  1.3      haad 	dsa->dsa_zgd = zgd;
   1920  1.3      haad 	dsa->dsa_tx = NULL;
   1921  1.3      haad 
   1922  1.3      haad 	zio_nowait(arc_write(pio, os->os_spa, txg,
   1923  1.6       chs 	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
   1924  1.6       chs 	    &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
   1925  1.3      haad 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
   1926  1.1      haad 
   1927  1.3      haad 	return (0);
   1928  1.1      haad }
   1929  1.1      haad 
   1930  1.1      haad int
   1931  1.1      haad dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
   1932  1.6       chs     dmu_tx_t *tx)
   1933  1.1      haad {
   1934  1.1      haad 	dnode_t *dn;
   1935  1.1      haad 	int err;
   1936  1.1      haad 
   1937  1.3      haad 	err = dnode_hold(os, object, FTAG, &dn);
   1938  1.1      haad 	if (err)
   1939  1.1      haad 		return (err);
   1940  1.1      haad 	err = dnode_set_blksz(dn, size, ibs, tx);
   1941  1.1      haad 	dnode_rele(dn, FTAG);
   1942  1.1      haad 	return (err);
   1943  1.1      haad }
   1944  1.1      haad 
   1945  1.1      haad void
   1946  1.1      haad dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
   1947  1.6       chs     dmu_tx_t *tx)
   1948  1.1      haad {
   1949  1.1      haad 	dnode_t *dn;
   1950  1.1      haad 
   1951  1.6       chs 	/*
   1952  1.6       chs 	 * Send streams include each object's checksum function.  This
   1953  1.6       chs 	 * check ensures that the receiving system can understand the
   1954  1.6       chs 	 * checksum function transmitted.
   1955  1.6       chs 	 */
   1956  1.6       chs 	ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
   1957  1.6       chs 
   1958  1.6       chs 	VERIFY0(dnode_hold(os, object, FTAG, &dn));
   1959  1.6       chs 	ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
   1960  1.1      haad 	dn->dn_checksum = checksum;
   1961  1.1      haad 	dnode_setdirty(dn, tx);
   1962  1.1      haad 	dnode_rele(dn, FTAG);
   1963  1.1      haad }
   1964  1.1      haad 
   1965  1.1      haad void
   1966  1.1      haad dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
   1967  1.6       chs     dmu_tx_t *tx)
   1968  1.1      haad {
   1969  1.1      haad 	dnode_t *dn;
   1970  1.1      haad 
   1971  1.6       chs 	/*
   1972  1.6       chs 	 * Send streams include each object's compression function.  This
   1973  1.6       chs 	 * check ensures that the receiving system can understand the
   1974  1.6       chs 	 * compression function transmitted.
   1975  1.6       chs 	 */
   1976  1.6       chs 	ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
   1977  1.6       chs 
   1978  1.6       chs 	VERIFY0(dnode_hold(os, object, FTAG, &dn));
   1979  1.1      haad 	dn->dn_compress = compress;
   1980  1.1      haad 	dnode_setdirty(dn, tx);
   1981  1.1      haad 	dnode_rele(dn, FTAG);
   1982  1.1      haad }
   1983  1.1      haad 
   1984  1.3      haad int zfs_mdcomp_disable = 0;
   1985  1.6       chs SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN,
   1986  1.6       chs     &zfs_mdcomp_disable, 0, "Disable metadata compression");
   1987  1.6       chs 
   1988  1.6       chs /*
   1989  1.6       chs  * When the "redundant_metadata" property is set to "most", only indirect
   1990  1.6       chs  * blocks of this level and higher will have an additional ditto block.
   1991  1.6       chs  */
   1992  1.6       chs int zfs_redundant_metadata_most_ditto_level = 2;
   1993  1.3      haad 
   1994  1.3      haad void
   1995  1.3      haad dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
   1996  1.3      haad {
   1997  1.3      haad 	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
   1998  1.6       chs 	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
   1999  1.6       chs 	    (wp & WP_SPILL));
   2000  1.3      haad 	enum zio_checksum checksum = os->os_checksum;
   2001  1.3      haad 	enum zio_compress compress = os->os_compress;
   2002  1.3      haad 	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
   2003  1.6       chs 	boolean_t dedup = B_FALSE;
   2004  1.6       chs 	boolean_t nopwrite = B_FALSE;
   2005  1.3      haad 	boolean_t dedup_verify = os->os_dedup_verify;
   2006  1.3      haad 	int copies = os->os_copies;
   2007  1.3      haad 
   2008  1.3      haad 	/*
   2009  1.6       chs 	 * We maintain different write policies for each of the following
   2010  1.6       chs 	 * types of data:
   2011  1.6       chs 	 *	 1. metadata
   2012  1.6       chs 	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
   2013  1.6       chs 	 *	 3. all other level 0 blocks
   2014  1.3      haad 	 */
   2015  1.3      haad 	if (ismd) {
   2016  1.6       chs 		if (zfs_mdcomp_disable) {
   2017  1.6       chs 			compress = ZIO_COMPRESS_EMPTY;
   2018  1.6       chs 		} else {
   2019  1.6       chs 			/*
   2020  1.6       chs 			 * XXX -- we should design a compression algorithm
   2021  1.6       chs 			 * that specializes in arrays of bps.
   2022  1.6       chs 			 */
   2023  1.6       chs 			compress = zio_compress_select(os->os_spa,
   2024  1.6       chs 			    ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
   2025  1.6       chs 		}
   2026  1.6       chs 
   2027  1.3      haad 		/*
   2028  1.3      haad 		 * Metadata always gets checksummed.  If the data
   2029  1.3      haad 		 * checksum is multi-bit correctable, and it's not a
   2030  1.3      haad 		 * ZBT-style checksum, then it's suitable for metadata
   2031  1.3      haad 		 * as well.  Otherwise, the metadata checksum defaults
   2032  1.3      haad 		 * to fletcher4.
   2033  1.3      haad 		 */
   2034  1.6       chs 		if (!(zio_checksum_table[checksum].ci_flags &
   2035  1.6       chs 		    ZCHECKSUM_FLAG_METADATA) ||
   2036  1.6       chs 		    (zio_checksum_table[checksum].ci_flags &
   2037  1.6       chs 		    ZCHECKSUM_FLAG_EMBEDDED))
   2038  1.3      haad 			checksum = ZIO_CHECKSUM_FLETCHER_4;
   2039  1.3      haad 
   2040  1.6       chs 		if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
   2041  1.6       chs 		    (os->os_redundant_metadata ==
   2042  1.6       chs 		    ZFS_REDUNDANT_METADATA_MOST &&
   2043  1.6       chs 		    (level >= zfs_redundant_metadata_most_ditto_level ||
   2044  1.6       chs 		    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
   2045  1.6       chs 			copies++;
   2046  1.6       chs 	} else if (wp & WP_NOFILL) {
   2047  1.6       chs 		ASSERT(level == 0);
   2048  1.6       chs 
   2049  1.3      haad 		/*
   2050  1.6       chs 		 * If we're writing preallocated blocks, we aren't actually
   2051  1.6       chs 		 * writing them so don't set any policy properties.  These
   2052  1.6       chs 		 * blocks are currently only used by an external subsystem
   2053  1.6       chs 		 * outside of zfs (i.e. dump) and not written by the zio
   2054  1.6       chs 		 * pipeline.
   2055  1.3      haad 		 */
   2056  1.6       chs 		compress = ZIO_COMPRESS_OFF;
   2057  1.6       chs 		checksum = ZIO_CHECKSUM_NOPARITY;
   2058  1.3      haad 	} else {
   2059  1.6       chs 		compress = zio_compress_select(os->os_spa, dn->dn_compress,
   2060  1.6       chs 		    compress);
   2061  1.6       chs 
   2062  1.6       chs 		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
   2063  1.6       chs 		    zio_checksum_select(dn->dn_checksum, checksum) :
   2064  1.6       chs 		    dedup_checksum;
   2065  1.3      haad 
   2066  1.6       chs 		/*
   2067  1.6       chs 		 * Determine dedup setting.  If we are in dmu_sync(),
   2068  1.6       chs 		 * we won't actually dedup now because that's all
   2069  1.6       chs 		 * done in syncing context; but we do want to use the
   2070  1.6       chs 		 * dedup checkum.  If the checksum is not strong
   2071  1.6       chs 		 * enough to ensure unique signatures, force
   2072  1.6       chs 		 * dedup_verify.
   2073  1.6       chs 		 */
   2074  1.6       chs 		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
   2075  1.6       chs 			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
   2076  1.6       chs 			if (!(zio_checksum_table[checksum].ci_flags &
   2077  1.6       chs 			    ZCHECKSUM_FLAG_DEDUP))
   2078  1.6       chs 				dedup_verify = B_TRUE;
   2079  1.6       chs 		}
   2080  1.3      haad 
   2081  1.6       chs 		/*
   2082  1.6       chs 		 * Enable nopwrite if we have secure enough checksum
   2083  1.6       chs 		 * algorithm (see comment in zio_nop_write) and
   2084  1.6       chs 		 * compression is enabled.  We don't enable nopwrite if
   2085  1.6       chs 		 * dedup is enabled as the two features are mutually
   2086  1.6       chs 		 * exclusive.
   2087  1.6       chs 		 */
   2088  1.6       chs 		nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
   2089  1.6       chs 		    ZCHECKSUM_FLAG_NOPWRITE) &&
   2090  1.6       chs 		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
   2091  1.3      haad 	}
   2092  1.3      haad 
   2093  1.3      haad 	zp->zp_checksum = checksum;
   2094  1.3      haad 	zp->zp_compress = compress;
   2095  1.6       chs 	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
   2096  1.3      haad 	zp->zp_level = level;
   2097  1.6       chs 	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
   2098  1.3      haad 	zp->zp_dedup = dedup;
   2099  1.3      haad 	zp->zp_dedup_verify = dedup && dedup_verify;
   2100  1.6       chs 	zp->zp_nopwrite = nopwrite;
   2101  1.3      haad }
   2102  1.3      haad 
   2103  1.1      haad int
   2104  1.1      haad dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
   2105  1.1      haad {
   2106  1.1      haad 	dnode_t *dn;
   2107  1.6       chs 	int err;
   2108  1.1      haad 
   2109  1.1      haad 	/*
   2110  1.1      haad 	 * Sync any current changes before
   2111  1.1      haad 	 * we go trundling through the block pointers.
   2112  1.1      haad 	 */
   2113  1.6       chs 	err = dmu_object_wait_synced(os, object);
   2114  1.6       chs 	if (err) {
   2115  1.6       chs 		return (err);
   2116  1.6       chs 	}
   2117  1.6       chs 
   2118  1.6       chs 	err = dnode_hold(os, object, FTAG, &dn);
   2119  1.6       chs 	if (err) {
   2120  1.6       chs 		return (err);
   2121  1.6       chs 	}
   2122  1.6       chs 
   2123  1.6       chs 	err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
   2124  1.6       chs 	dnode_rele(dn, FTAG);
   2125  1.6       chs 
   2126  1.6       chs 	return (err);
   2127  1.6       chs }
   2128  1.6       chs 
   2129  1.6       chs /*
   2130  1.6       chs  * Given the ZFS object, if it contains any dirty nodes
   2131  1.6       chs  * this function flushes all dirty blocks to disk. This
   2132  1.6       chs  * ensures the DMU object info is updated. A more efficient
   2133  1.6       chs  * future version might just find the TXG with the maximum
   2134  1.6       chs  * ID and wait for that to be synced.
   2135  1.6       chs  */
   2136  1.6       chs int
   2137  1.6       chs dmu_object_wait_synced(objset_t *os, uint64_t object)
   2138  1.6       chs {
   2139  1.6       chs 	dnode_t *dn;
   2140  1.6       chs 	int error, i;
   2141  1.6       chs 
   2142  1.6       chs 	error = dnode_hold(os, object, FTAG, &dn);
   2143  1.6       chs 	if (error) {
   2144  1.6       chs 		return (error);
   2145  1.6       chs 	}
   2146  1.6       chs 
   2147  1.1      haad 	for (i = 0; i < TXG_SIZE; i++) {
   2148  1.7    simonb 		if (list_link_active(&dn->dn_dirty_link[i]) ||
   2149  1.7    simonb 		    !list_is_empty(&dn->dn_dirty_records[i])) {
   2150  1.1      haad 			break;
   2151  1.6       chs 		}
   2152  1.1      haad 	}
   2153  1.6       chs 	dnode_rele(dn, FTAG);
   2154  1.1      haad 	if (i != TXG_SIZE) {
   2155  1.1      haad 		txg_wait_synced(dmu_objset_pool(os), 0);
   2156  1.1      haad 	}
   2157  1.1      haad 
   2158  1.6       chs 	return (0);
   2159  1.1      haad }
   2160  1.1      haad 
   2161  1.1      haad void
   2162  1.1      haad dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
   2163  1.1      haad {
   2164  1.3      haad 	dnode_phys_t *dnp;
   2165  1.3      haad 
   2166  1.1      haad 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
   2167  1.1      haad 	mutex_enter(&dn->dn_mtx);
   2168  1.1      haad 
   2169  1.3      haad 	dnp = dn->dn_phys;
   2170  1.3      haad 
   2171  1.1      haad 	doi->doi_data_block_size = dn->dn_datablksz;
   2172  1.1      haad 	doi->doi_metadata_block_size = dn->dn_indblkshift ?
   2173  1.1      haad 	    1ULL << dn->dn_indblkshift : 0;
   2174  1.3      haad 	doi->doi_type = dn->dn_type;
   2175  1.3      haad 	doi->doi_bonus_type = dn->dn_bonustype;
   2176  1.3      haad 	doi->doi_bonus_size = dn->dn_bonuslen;
   2177  1.1      haad 	doi->doi_indirection = dn->dn_nlevels;
   2178  1.1      haad 	doi->doi_checksum = dn->dn_checksum;
   2179  1.1      haad 	doi->doi_compress = dn->dn_compress;
   2180  1.6       chs 	doi->doi_nblkptr = dn->dn_nblkptr;
   2181  1.3      haad 	doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
   2182  1.6       chs 	doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
   2183  1.3      haad 	doi->doi_fill_count = 0;
   2184  1.3      haad 	for (int i = 0; i < dnp->dn_nblkptr; i++)
   2185  1.6       chs 		doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
   2186  1.1      haad 
   2187  1.1      haad 	mutex_exit(&dn->dn_mtx);
   2188  1.1      haad 	rw_exit(&dn->dn_struct_rwlock);
   2189  1.1      haad }
   2190  1.1      haad 
   2191  1.1      haad /*
   2192  1.1      haad  * Get information on a DMU object.
   2193  1.1      haad  * If doi is NULL, just indicates whether the object exists.
   2194  1.1      haad  */
   2195  1.1      haad int
   2196  1.1      haad dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
   2197  1.1      haad {
   2198  1.1      haad 	dnode_t *dn;
   2199  1.3      haad 	int err = dnode_hold(os, object, FTAG, &dn);
   2200  1.1      haad 
   2201  1.1      haad 	if (err)
   2202  1.1      haad 		return (err);
   2203  1.1      haad 
   2204  1.1      haad 	if (doi != NULL)
   2205  1.1      haad 		dmu_object_info_from_dnode(dn, doi);
   2206  1.1      haad 
   2207  1.1      haad 	dnode_rele(dn, FTAG);
   2208  1.1      haad 	return (0);
   2209  1.1      haad }
   2210  1.1      haad 
   2211  1.1      haad /*
   2212  1.1      haad  * As above, but faster; can be used when you have a held dbuf in hand.
   2213  1.1      haad  */
   2214  1.1      haad void
   2215  1.6       chs dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
   2216  1.1      haad {
   2217  1.6       chs 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
   2218  1.6       chs 
   2219  1.6       chs 	DB_DNODE_ENTER(db);
   2220  1.6       chs 	dmu_object_info_from_dnode(DB_DNODE(db), doi);
   2221  1.6       chs 	DB_DNODE_EXIT(db);
   2222  1.1      haad }
   2223  1.1      haad 
   2224  1.1      haad /*
   2225  1.1      haad  * Faster still when you only care about the size.
   2226  1.1      haad  * This is specifically optimized for zfs_getattr().
   2227  1.1      haad  */
   2228  1.1      haad void
   2229  1.6       chs dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
   2230  1.6       chs     u_longlong_t *nblk512)
   2231  1.1      haad {
   2232  1.6       chs 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
   2233  1.6       chs 	dnode_t *dn;
   2234  1.6       chs 
   2235  1.6       chs 	DB_DNODE_ENTER(db);
   2236  1.6       chs 	dn = DB_DNODE(db);
   2237  1.1      haad 
   2238  1.1      haad 	*blksize = dn->dn_datablksz;
   2239  1.1      haad 	/* add 1 for dnode space */
   2240  1.1      haad 	*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
   2241  1.1      haad 	    SPA_MINBLOCKSHIFT) + 1;
   2242  1.6       chs 	DB_DNODE_EXIT(db);
   2243  1.1      haad }
   2244  1.1      haad 
   2245  1.1      haad void
   2246  1.1      haad byteswap_uint64_array(void *vbuf, size_t size)
   2247  1.1      haad {
   2248  1.1      haad 	uint64_t *buf = vbuf;
   2249  1.1      haad 	size_t count = size >> 3;
   2250  1.1      haad 	int i;
   2251  1.1      haad 
   2252  1.1      haad 	ASSERT((size & 7) == 0);
   2253  1.1      haad 
   2254  1.1      haad 	for (i = 0; i < count; i++)
   2255  1.1      haad 		buf[i] = BSWAP_64(buf[i]);
   2256  1.1      haad }
   2257  1.1      haad 
   2258  1.1      haad void
   2259  1.1      haad byteswap_uint32_array(void *vbuf, size_t size)
   2260  1.1      haad {
   2261  1.1      haad 	uint32_t *buf = vbuf;
   2262  1.1      haad 	size_t count = size >> 2;
   2263  1.1      haad 	int i;
   2264  1.1      haad 
   2265  1.1      haad 	ASSERT((size & 3) == 0);
   2266  1.1      haad 
   2267  1.1      haad 	for (i = 0; i < count; i++)
   2268  1.1      haad 		buf[i] = BSWAP_32(buf[i]);
   2269  1.1      haad }
   2270  1.1      haad 
   2271  1.1      haad void
   2272  1.1      haad byteswap_uint16_array(void *vbuf, size_t size)
   2273  1.1      haad {
   2274  1.1      haad 	uint16_t *buf = vbuf;
   2275  1.1      haad 	size_t count = size >> 1;
   2276  1.1      haad 	int i;
   2277  1.1      haad 
   2278  1.1      haad 	ASSERT((size & 1) == 0);
   2279  1.1      haad 
   2280  1.1      haad 	for (i = 0; i < count; i++)
   2281  1.1      haad 		buf[i] = BSWAP_16(buf[i]);
   2282  1.1      haad }
   2283  1.1      haad 
   2284  1.1      haad /* ARGSUSED */
   2285  1.1      haad void
   2286  1.1      haad byteswap_uint8_array(void *vbuf, size_t size)
   2287  1.1      haad {
   2288  1.1      haad }
   2289  1.1      haad 
   2290  1.1      haad void
   2291  1.1      haad dmu_init(void)
   2292  1.1      haad {
                       	/*
                       	 * Call the init routine of each subsystem listed below, in this
                       	 * order.  dmu_fini() calls the matching *_fini() routines.
                       	 */
   2293  1.6       chs 	zfs_dbgmsg_init();
   2294  1.6       chs 	sa_cache_init();
   2295  1.6       chs 	xuio_stat_init();
   2296  1.6       chs 	dmu_objset_init();
   2297  1.1      haad 	dnode_init();
   2298  1.3      haad 	zfetch_init();
   2299  1.6       chs 	zio_compress_init();
   2300  1.6       chs 	l2arc_init();
   2301  1.1      haad 	arc_init();
   2302  1.6       chs 	dbuf_init();
   2303  1.1      haad }
   2304  1.1      haad 
   2305  1.1      haad void
   2306  1.1      haad dmu_fini(void)
   2307  1.1      haad {
                       	/*
                       	 * Tear down what dmu_init() set up.  The sequence is close to, but
                       	 * not exactly, the reverse of the init order (dbuf, zfetch and
                       	 * zio_compress sit in different relative positions).
                       	 * NOTE(review): presumably the ordering is deliberate -- confirm
                       	 * the dependencies before moving any call.
                       	 */
   2308  1.6       chs 	arc_fini(); /* arc depends on l2arc, so arc must go first */
   2309  1.6       chs 	l2arc_fini();
   2310  1.3      haad 	zfetch_fini();
   2311  1.6       chs 	zio_compress_fini();
   2312  1.6       chs 	dbuf_fini();
   2313  1.1      haad 	dnode_fini();
   2314  1.6       chs 	dmu_objset_fini();
   2315  1.3      haad 	xuio_stat_fini();
   2316  1.6       chs 	sa_cache_fini();
   2317  1.6       chs 	zfs_dbgmsg_fini();
   2318  1.1      haad }
   2319