1 1.1 haad /* 2 1.1 haad * CDDL HEADER START 3 1.1 haad * 4 1.1 haad * The contents of this file are subject to the terms of the 5 1.1 haad * Common Development and Distribution License (the "License"). 6 1.1 haad * You may not use this file except in compliance with the License. 7 1.1 haad * 8 1.1 haad * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 1.1 haad * or http://www.opensolaris.org/os/licensing. 10 1.1 haad * See the License for the specific language governing permissions 11 1.1 haad * and limitations under the License. 12 1.1 haad * 13 1.1 haad * When distributing Covered Code, include this CDDL HEADER in each 14 1.1 haad * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 1.1 haad * If applicable, add the following below this CDDL HEADER, with the 16 1.1 haad * fields enclosed by brackets "[]" replaced with your own identifying 17 1.1 haad * information: Portions Copyright [yyyy] [name of copyright owner] 18 1.1 haad * 19 1.1 haad * CDDL HEADER END 20 1.1 haad */ 21 1.1 haad /* 22 1.6 chs * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 1.6 chs * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 1.6 chs * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 25 1.6 chs * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 26 1.6 chs * Copyright (c) 2013, Joyent, Inc. All rights reserved. 27 1.6 chs * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
28 1.6 chs * Copyright (c) 2014 Integros [integros.com] 29 1.1 haad */ 30 1.1 haad 31 1.1 haad #include <sys/zfs_context.h> 32 1.1 haad #include <sys/dmu.h> 33 1.6 chs #include <sys/dmu_send.h> 34 1.1 haad #include <sys/dmu_impl.h> 35 1.1 haad #include <sys/dbuf.h> 36 1.1 haad #include <sys/dmu_objset.h> 37 1.1 haad #include <sys/dsl_dataset.h> 38 1.1 haad #include <sys/dsl_dir.h> 39 1.1 haad #include <sys/dmu_tx.h> 40 1.1 haad #include <sys/spa.h> 41 1.1 haad #include <sys/zio.h> 42 1.1 haad #include <sys/dmu_zfetch.h> 43 1.6 chs #include <sys/sa.h> 44 1.6 chs #include <sys/sa_impl.h> 45 1.6 chs #include <sys/zfeature.h> 46 1.6 chs #include <sys/blkptr.h> 47 1.6 chs #include <sys/range_tree.h> 48 1.6 chs #include <sys/callb.h> 49 1.1 haad 50 1.6 chs uint_t zfs_dbuf_evict_key; 51 1.6 chs 52 1.6 chs static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 53 1.1 haad static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 54 1.1 haad 55 1.6 chs #ifndef __lint 56 1.6 chs extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, 57 1.6 chs dmu_buf_evict_func_t *evict_func_sync, 58 1.6 chs dmu_buf_evict_func_t *evict_func_async, 59 1.6 chs dmu_buf_t **clear_on_evict_dbufp); 60 1.6 chs #endif /* ! __lint */ 61 1.6 chs 62 1.1 haad /* 63 1.1 haad * Global data structures and functions for the dbuf cache. 64 1.1 haad */ 65 1.6 chs static kmem_cache_t *dbuf_kmem_cache; 66 1.6 chs static taskq_t *dbu_evict_taskq; 67 1.6 chs 68 1.6 chs static kthread_t *dbuf_cache_evict_thread; 69 1.6 chs static kmutex_t dbuf_evict_lock; 70 1.6 chs static kcondvar_t dbuf_evict_cv; 71 1.6 chs static boolean_t dbuf_evict_thread_exit; 72 1.6 chs 73 1.6 chs /* 74 1.6 chs * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that 75 1.6 chs * are not currently held but have been recently released. These dbufs 76 1.6 chs * are not eligible for arc eviction until they are aged out of the cache. 
77 1.6 chs * Dbufs are added to the dbuf cache once the last hold is released. If a 78 1.6 chs * dbuf is later accessed and still exists in the dbuf cache, then it will 79 1.6 chs * be removed from the cache and later re-added to the head of the cache. 80 1.6 chs * Dbufs that are aged out of the cache will be immediately destroyed and 81 1.6 chs * become eligible for arc eviction. 82 1.6 chs */ 83 1.6 chs static multilist_t dbuf_cache; 84 1.6 chs static refcount_t dbuf_cache_size; 85 1.6 chs uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024; 86 1.6 chs 87 1.6 chs /* Cap the size of the dbuf cache to log2 fraction of arc size. */ 88 1.6 chs int dbuf_cache_max_shift = 5; 89 1.6 chs 90 1.6 chs /* 91 1.6 chs * The dbuf cache uses a three-stage eviction policy: 92 1.6 chs * - A low water marker designates when the dbuf eviction thread 93 1.6 chs * should stop evicting from the dbuf cache. 94 1.6 chs * - When we reach the maximum size (aka mid water mark), we 95 1.6 chs * signal the eviction thread to run. 96 1.6 chs * - The high water mark indicates when the eviction thread 97 1.6 chs * is unable to keep up with the incoming load and eviction must 98 1.6 chs * happen in the context of the calling thread. 99 1.6 chs * 100 1.6 chs * The dbuf cache: 101 1.6 chs * (max size) 102 1.6 chs * low water mid water hi water 103 1.6 chs * +----------------------------------------+----------+----------+ 104 1.6 chs * | | | | 105 1.6 chs * | | | | 106 1.6 chs * | | | | 107 1.6 chs * | | | | 108 1.6 chs * +----------------------------------------+----------+----------+ 109 1.6 chs * stop signal evict 110 1.6 chs * evicting eviction directly 111 1.6 chs * thread 112 1.6 chs * 113 1.6 chs * The high and low water marks indicate the operating range for the eviction 114 1.6 chs * thread. 
The low water mark is, by default, 90% of the total size of the 115 1.6 chs * cache and the high water mark is at 110% (both of these percentages can be 116 1.6 chs * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, 117 1.6 chs * respectively). The eviction thread will try to ensure that the cache remains 118 1.6 chs * within this range by waking up every second and checking if the cache is 119 1.6 chs * above the low water mark. The thread can also be woken up by callers adding 120 1.6 chs * elements into the cache if the cache is larger than the mid water (i.e. max 121 1.6 chs * cache size). Once the eviction thread is woken up and eviction is required, 122 1.6 chs * it will continue evicting buffers until it's able to reduce the cache size 123 1.6 chs * to the low water mark. If the cache size continues to grow and hits the high 124 1.6 chs * water mark, then callers adding elements to the cache will begin to evict 125 1.6 chs * directly from the cache until the cache is no longer above the high water 126 1.6 chs * mark. 127 1.6 chs */ 128 1.6 chs 129 1.6 chs /* 130 1.6 chs * The percentage above and below the maximum cache size. 
131 1.6 chs */ 132 1.6 chs uint_t dbuf_cache_hiwater_pct = 10; 133 1.6 chs uint_t dbuf_cache_lowater_pct = 10; 134 1.1 haad 135 1.1 haad /* ARGSUSED */ 136 1.1 haad static int 137 1.1 haad dbuf_cons(void *vdb, void *unused, int kmflag) 138 1.1 haad { 139 1.6 chs dmu_buf_impl_t *db = vdb; 140 1.6 chs 141 1.1 haad bzero(db, sizeof (dmu_buf_impl_t)); 142 1.1 haad mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 143 1.1 haad cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 144 1.6 chs multilist_link_init(&db->db_cache_link); 145 1.1 haad refcount_create(&db->db_holds); 146 1.6 chs 147 1.1 haad return (0); 148 1.1 haad } 149 1.1 haad 150 1.1 haad /* ARGSUSED */ 151 1.1 haad static void 152 1.1 haad dbuf_dest(void *vdb, void *unused) 153 1.1 haad { 154 1.6 chs dmu_buf_impl_t *db = vdb; 155 1.6 chs 156 1.1 haad mutex_destroy(&db->db_mtx); 157 1.1 haad cv_destroy(&db->db_changed); 158 1.6 chs ASSERT(!multilist_link_active(&db->db_cache_link)); 159 1.1 haad refcount_destroy(&db->db_holds); 160 1.1 haad } 161 1.1 haad 162 1.1 haad /* 163 1.1 haad * dbuf hash table routines 164 1.1 haad */ 165 1.1 haad static dbuf_hash_table_t dbuf_hash_table; 166 1.1 haad 167 1.1 haad static uint64_t dbuf_hash_count; 168 1.1 haad 169 1.1 haad static uint64_t 170 1.1 haad dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 171 1.1 haad { 172 1.1 haad uintptr_t osv = (uintptr_t)os; 173 1.1 haad uint64_t crc = -1ULL; 174 1.1 haad 175 1.1 haad ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 176 1.1 haad crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 177 1.1 haad crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 178 1.1 haad crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 179 1.1 haad crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 180 1.1 haad crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 181 1.1 haad crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 182 1.1 haad 183 1.1 haad crc ^= (osv>>14) ^ 
(obj>>16) ^ (blkid>>16); 184 1.1 haad 185 1.1 haad return (crc); 186 1.1 haad } 187 1.1 haad 188 1.1 haad #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 189 1.1 haad ((dbuf)->db.db_object == (obj) && \ 190 1.1 haad (dbuf)->db_objset == (os) && \ 191 1.1 haad (dbuf)->db_level == (level) && \ 192 1.1 haad (dbuf)->db_blkid == (blkid)) 193 1.1 haad 194 1.1 haad dmu_buf_impl_t * 195 1.6 chs dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) 196 1.1 haad { 197 1.1 haad dbuf_hash_table_t *h = &dbuf_hash_table; 198 1.6 chs uint64_t hv = dbuf_hash(os, obj, level, blkid); 199 1.1 haad uint64_t idx = hv & h->hash_table_mask; 200 1.1 haad dmu_buf_impl_t *db; 201 1.1 haad 202 1.1 haad mutex_enter(DBUF_HASH_MUTEX(h, idx)); 203 1.1 haad for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 204 1.1 haad if (DBUF_EQUAL(db, os, obj, level, blkid)) { 205 1.1 haad mutex_enter(&db->db_mtx); 206 1.1 haad if (db->db_state != DB_EVICTING) { 207 1.1 haad mutex_exit(DBUF_HASH_MUTEX(h, idx)); 208 1.1 haad return (db); 209 1.1 haad } 210 1.1 haad mutex_exit(&db->db_mtx); 211 1.1 haad } 212 1.1 haad } 213 1.1 haad mutex_exit(DBUF_HASH_MUTEX(h, idx)); 214 1.1 haad return (NULL); 215 1.1 haad } 216 1.1 haad 217 1.6 chs static dmu_buf_impl_t * 218 1.6 chs dbuf_find_bonus(objset_t *os, uint64_t object) 219 1.6 chs { 220 1.6 chs dnode_t *dn; 221 1.6 chs dmu_buf_impl_t *db = NULL; 222 1.6 chs 223 1.6 chs if (dnode_hold(os, object, FTAG, &dn) == 0) { 224 1.6 chs rw_enter(&dn->dn_struct_rwlock, RW_READER); 225 1.6 chs if (dn->dn_bonus != NULL) { 226 1.6 chs db = dn->dn_bonus; 227 1.6 chs mutex_enter(&db->db_mtx); 228 1.6 chs } 229 1.6 chs rw_exit(&dn->dn_struct_rwlock); 230 1.6 chs dnode_rele(dn, FTAG); 231 1.6 chs } 232 1.6 chs return (db); 233 1.6 chs } 234 1.6 chs 235 1.1 haad /* 236 1.1 haad * Insert an entry into the hash table. 
If there is already an element 237 1.1 haad * equal to elem in the hash table, then the already existing element 238 1.1 haad * will be returned and the new element will not be inserted. 239 1.1 haad * Otherwise returns NULL. 240 1.1 haad */ 241 1.1 haad static dmu_buf_impl_t * 242 1.1 haad dbuf_hash_insert(dmu_buf_impl_t *db) 243 1.1 haad { 244 1.1 haad dbuf_hash_table_t *h = &dbuf_hash_table; 245 1.4 haad objset_t *os = db->db_objset; 246 1.1 haad uint64_t obj = db->db.db_object; 247 1.1 haad int level = db->db_level; 248 1.1 haad uint64_t blkid = db->db_blkid; 249 1.6 chs uint64_t hv = dbuf_hash(os, obj, level, blkid); 250 1.1 haad uint64_t idx = hv & h->hash_table_mask; 251 1.1 haad dmu_buf_impl_t *dbf; 252 1.1 haad 253 1.1 haad mutex_enter(DBUF_HASH_MUTEX(h, idx)); 254 1.1 haad for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 255 1.1 haad if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 256 1.1 haad mutex_enter(&dbf->db_mtx); 257 1.1 haad if (dbf->db_state != DB_EVICTING) { 258 1.1 haad mutex_exit(DBUF_HASH_MUTEX(h, idx)); 259 1.1 haad return (dbf); 260 1.1 haad } 261 1.1 haad mutex_exit(&dbf->db_mtx); 262 1.1 haad } 263 1.1 haad } 264 1.1 haad 265 1.1 haad mutex_enter(&db->db_mtx); 266 1.1 haad db->db_hash_next = h->hash_table[idx]; 267 1.1 haad h->hash_table[idx] = db; 268 1.1 haad mutex_exit(DBUF_HASH_MUTEX(h, idx)); 269 1.6 chs atomic_inc_64(&dbuf_hash_count); 270 1.1 haad 271 1.1 haad return (NULL); 272 1.1 haad } 273 1.1 haad 274 1.1 haad /* 275 1.6 chs * Remove an entry from the hash table. It must be in the EVICTING state. 
276 1.1 haad */ 277 1.1 haad static void 278 1.1 haad dbuf_hash_remove(dmu_buf_impl_t *db) 279 1.1 haad { 280 1.1 haad dbuf_hash_table_t *h = &dbuf_hash_table; 281 1.6 chs uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object, 282 1.1 haad db->db_level, db->db_blkid); 283 1.1 haad uint64_t idx = hv & h->hash_table_mask; 284 1.1 haad dmu_buf_impl_t *dbf, **dbp; 285 1.1 haad 286 1.1 haad /* 287 1.6 chs * We musn't hold db_mtx to maintain lock ordering: 288 1.1 haad * DBUF_HASH_MUTEX > db_mtx. 289 1.1 haad */ 290 1.1 haad ASSERT(refcount_is_zero(&db->db_holds)); 291 1.1 haad ASSERT(db->db_state == DB_EVICTING); 292 1.1 haad ASSERT(!MUTEX_HELD(&db->db_mtx)); 293 1.1 haad 294 1.1 haad mutex_enter(DBUF_HASH_MUTEX(h, idx)); 295 1.1 haad dbp = &h->hash_table[idx]; 296 1.1 haad while ((dbf = *dbp) != db) { 297 1.1 haad dbp = &dbf->db_hash_next; 298 1.1 haad ASSERT(dbf != NULL); 299 1.1 haad } 300 1.1 haad *dbp = db->db_hash_next; 301 1.1 haad db->db_hash_next = NULL; 302 1.1 haad mutex_exit(DBUF_HASH_MUTEX(h, idx)); 303 1.6 chs atomic_dec_64(&dbuf_hash_count); 304 1.1 haad } 305 1.1 haad 306 1.6 chs typedef enum { 307 1.6 chs DBVU_EVICTING, 308 1.6 chs DBVU_NOT_EVICTING 309 1.6 chs } dbvu_verify_type_t; 310 1.6 chs 311 1.6 chs static void 312 1.6 chs dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) 313 1.6 chs { 314 1.6 chs #ifdef ZFS_DEBUG 315 1.6 chs int64_t holds; 316 1.6 chs 317 1.6 chs if (db->db_user == NULL) 318 1.6 chs return; 319 1.6 chs 320 1.6 chs /* Only data blocks support the attachment of user data. */ 321 1.6 chs ASSERT(db->db_level == 0); 322 1.6 chs 323 1.6 chs /* Clients must resolve a dbuf before attaching user data. */ 324 1.6 chs ASSERT(db->db.db_data != NULL); 325 1.6 chs ASSERT3U(db->db_state, ==, DB_CACHED); 326 1.6 chs 327 1.6 chs holds = refcount_count(&db->db_holds); 328 1.6 chs if (verify_type == DBVU_EVICTING) { 329 1.6 chs /* 330 1.6 chs * Immediate eviction occurs when holds == dirtycnt. 
331 1.6 chs * For normal eviction buffers, holds is zero on 332 1.6 chs * eviction, except when dbuf_fix_old_data() calls 333 1.6 chs * dbuf_clear_data(). However, the hold count can grow 334 1.6 chs * during eviction even though db_mtx is held (see 335 1.6 chs * dmu_bonus_hold() for an example), so we can only 336 1.6 chs * test the generic invariant that holds >= dirtycnt. 337 1.6 chs */ 338 1.6 chs ASSERT3U(holds, >=, db->db_dirtycnt); 339 1.6 chs } else { 340 1.6 chs if (db->db_user_immediate_evict == TRUE) 341 1.6 chs ASSERT3U(holds, >=, db->db_dirtycnt); 342 1.6 chs else 343 1.6 chs ASSERT3U(holds, >, 0); 344 1.6 chs } 345 1.6 chs #endif 346 1.6 chs } 347 1.1 haad 348 1.1 haad static void 349 1.1 haad dbuf_evict_user(dmu_buf_impl_t *db) 350 1.1 haad { 351 1.6 chs dmu_buf_user_t *dbu = db->db_user; 352 1.6 chs 353 1.1 haad ASSERT(MUTEX_HELD(&db->db_mtx)); 354 1.1 haad 355 1.6 chs if (dbu == NULL) 356 1.1 haad return; 357 1.1 haad 358 1.6 chs dbuf_verify_user(db, DBVU_EVICTING); 359 1.6 chs db->db_user = NULL; 360 1.6 chs 361 1.6 chs #ifdef ZFS_DEBUG 362 1.6 chs if (dbu->dbu_clear_on_evict_dbufp != NULL) 363 1.6 chs *dbu->dbu_clear_on_evict_dbufp = NULL; 364 1.6 chs #endif 365 1.6 chs 366 1.6 chs /* 367 1.6 chs * There are two eviction callbacks - one that we call synchronously 368 1.6 chs * and one that we invoke via a taskq. The async one is useful for 369 1.6 chs * avoiding lock order reversals and limiting stack depth. 370 1.6 chs * 371 1.6 chs * Note that if we have a sync callback but no async callback, 372 1.6 chs * it's likely that the sync callback will free the structure 373 1.6 chs * containing the dbu. In that case we need to take care to not 374 1.6 chs * dereference dbu after calling the sync evict func. 
375 1.6 chs */ 376 1.6 chs boolean_t has_async = (dbu->dbu_evict_func_async != NULL); 377 1.6 chs 378 1.6 chs if (dbu->dbu_evict_func_sync != NULL) 379 1.6 chs dbu->dbu_evict_func_sync(dbu); 380 1.6 chs 381 1.6 chs if (has_async) { 382 1.6 chs taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async, 383 1.6 chs dbu, 0, &dbu->dbu_tqent); 384 1.6 chs } 385 1.6 chs } 386 1.6 chs 387 1.6 chs boolean_t 388 1.6 chs dbuf_is_metadata(dmu_buf_impl_t *db) 389 1.6 chs { 390 1.6 chs if (db->db_level > 0) { 391 1.6 chs return (B_TRUE); 392 1.6 chs } else { 393 1.6 chs boolean_t is_metadata; 394 1.6 chs 395 1.6 chs DB_DNODE_ENTER(db); 396 1.6 chs is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); 397 1.6 chs DB_DNODE_EXIT(db); 398 1.6 chs 399 1.6 chs return (is_metadata); 400 1.6 chs } 401 1.6 chs } 402 1.6 chs 403 1.6 chs /* 404 1.6 chs * This function *must* return indices evenly distributed between all 405 1.6 chs * sublists of the multilist. This is needed due to how the dbuf eviction 406 1.6 chs * code is laid out; dbuf_evict_thread() assumes dbufs are evenly 407 1.6 chs * distributed between all sublists and uses this assumption when 408 1.6 chs * deciding which sublist to evict from and how much to evict from it. 409 1.6 chs */ 410 1.6 chs unsigned int 411 1.6 chs dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) 412 1.6 chs { 413 1.6 chs dmu_buf_impl_t *db = obj; 414 1.6 chs 415 1.6 chs /* 416 1.6 chs * The assumption here, is the hash value for a given 417 1.6 chs * dmu_buf_impl_t will remain constant throughout it's lifetime 418 1.6 chs * (i.e. it's objset, object, level and blkid fields don't change). 419 1.6 chs * Thus, we don't need to store the dbuf's sublist index 420 1.6 chs * on insertion, as this index can be recalculated on removal. 421 1.6 chs * 422 1.6 chs * Also, the low order bits of the hash value are thought to be 423 1.6 chs * distributed evenly. 
Otherwise, in the case that the multilist 424 1.6 chs * has a power of two number of sublists, each sublists' usage 425 1.6 chs * would not be evenly distributed. 426 1.6 chs */ 427 1.6 chs return (dbuf_hash(db->db_objset, db->db.db_object, 428 1.6 chs db->db_level, db->db_blkid) % 429 1.6 chs multilist_get_num_sublists(ml)); 430 1.6 chs } 431 1.6 chs 432 1.6 chs static inline boolean_t 433 1.6 chs dbuf_cache_above_hiwater(void) 434 1.6 chs { 435 1.6 chs uint64_t dbuf_cache_hiwater_bytes = 436 1.6 chs (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100; 437 1.6 chs 438 1.6 chs return (refcount_count(&dbuf_cache_size) > 439 1.6 chs dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes); 440 1.6 chs } 441 1.6 chs 442 1.6 chs static inline boolean_t 443 1.6 chs dbuf_cache_above_lowater(void) 444 1.6 chs { 445 1.6 chs uint64_t dbuf_cache_lowater_bytes = 446 1.6 chs (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100; 447 1.6 chs 448 1.6 chs return (refcount_count(&dbuf_cache_size) > 449 1.6 chs dbuf_cache_max_bytes - dbuf_cache_lowater_bytes); 450 1.6 chs } 451 1.6 chs 452 1.6 chs /* 453 1.6 chs * Evict the oldest eligible dbuf from the dbuf cache. 454 1.6 chs */ 455 1.6 chs static void 456 1.6 chs dbuf_evict_one(void) 457 1.6 chs { 458 1.6 chs int idx = multilist_get_random_index(&dbuf_cache); 459 1.6 chs multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx); 460 1.6 chs 461 1.6 chs ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); 462 1.6 chs 463 1.6 chs /* 464 1.6 chs * Set the thread's tsd to indicate that it's processing evictions. 465 1.6 chs * Once a thread stops evicting from the dbuf cache it will 466 1.6 chs * reset its tsd to NULL. 
467 1.6 chs */ 468 1.6 chs ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL); 469 1.6 chs (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE); 470 1.6 chs 471 1.6 chs dmu_buf_impl_t *db = multilist_sublist_tail(mls); 472 1.6 chs while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { 473 1.6 chs db = multilist_sublist_prev(mls, db); 474 1.6 chs } 475 1.6 chs 476 1.6 chs DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, 477 1.6 chs multilist_sublist_t *, mls); 478 1.6 chs 479 1.6 chs if (db != NULL) { 480 1.6 chs multilist_sublist_remove(mls, db); 481 1.6 chs multilist_sublist_unlock(mls); 482 1.6 chs (void) refcount_remove_many(&dbuf_cache_size, 483 1.6 chs db->db.db_size, db); 484 1.6 chs dbuf_destroy(db); 485 1.6 chs } else { 486 1.6 chs multilist_sublist_unlock(mls); 487 1.6 chs } 488 1.6 chs (void) tsd_set(zfs_dbuf_evict_key, NULL); 489 1.6 chs } 490 1.6 chs 491 1.6 chs /* 492 1.6 chs * The dbuf evict thread is responsible for aging out dbufs from the 493 1.6 chs * cache. Once the cache has reached it's maximum size, dbufs are removed 494 1.6 chs * and destroyed. The eviction thread will continue running until the size 495 1.6 chs * of the dbuf cache is at or below the maximum size. Once the dbuf is aged 496 1.6 chs * out of the cache it is destroyed and becomes eligible for arc eviction. 
 */
static void
dbuf_evict_thread(void *dummy __unused)
{
	callb_cpr_t cpr;

	CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);

	mutex_enter(&dbuf_evict_lock);
	while (!dbuf_evict_thread_exit) {
		/*
		 * Sleep (waking at least once per second) until the cache
		 * rises above the low water mark or we are asked to exit.
		 */
		while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
			CALLB_CPR_SAFE_BEGIN(&cpr);
			(void) cv_timedwait_hires(&dbuf_evict_cv,
			    &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
			CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
		}
		mutex_exit(&dbuf_evict_lock);

		/*
		 * Keep evicting as long as we're above the low water mark
		 * for the cache. We do this without holding the locks to
		 * minimize lock contention.
		 */
		while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
			dbuf_evict_one();
		}

		mutex_enter(&dbuf_evict_lock);
	}

	/*
	 * Acknowledge the exit request: dbuf_fini() waits on dbuf_evict_cv
	 * until it sees the flag cleared.
	 */
	dbuf_evict_thread_exit = B_FALSE;
	cv_broadcast(&dbuf_evict_cv);
	CALLB_CPR_EXIT(&cpr);	/* drops dbuf_evict_lock */
	thread_exit();
}

/*
 * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
 * If the dbuf cache is at its high water mark, then evict a dbuf from the
 * dbuf cache using the caller's context.
 */
static void
dbuf_evict_notify(void)
{

	/*
	 * We use thread specific data to track when a thread has
	 * started processing evictions. This allows us to avoid deeply
	 * nested stacks that would have a call flow similar to this:
	 *
	 * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
	 *	^						|
	 *	|						|
	 *	+-----dbuf_destroy()<--dbuf_evict_one()<--------+
	 *
	 * The dbuf_eviction_thread will always have its tsd set until
	 * that thread exits. All other threads will only set their tsd
	 * if they are participating in the eviction process. This only
	 * happens if the eviction thread is unable to process evictions
	 * fast enough. To keep the dbuf cache size in check, other threads
	 * can evict from the dbuf cache directly. Those threads will set
	 * their tsd values so that we ensure that they only evict one dbuf
	 * from the dbuf cache.
	 *
	 * NOTE(review): in the code visible in this file the tsd is set and
	 * cleared only inside dbuf_evict_one(); dbuf_evict_thread() does not
	 * set it itself.  Confirm the statement above about the eviction
	 * thread's tsd.
	 */
	if (tsd_get(zfs_dbuf_evict_key) != NULL)
		return;

	/*
	 * Unlocked size check first; it is repeated under dbuf_evict_lock
	 * before signalling.
	 */
	if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
		boolean_t evict_now = B_FALSE;

		mutex_enter(&dbuf_evict_lock);
		if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
			evict_now = dbuf_cache_above_hiwater();
			cv_signal(&dbuf_evict_cv);
		}
		mutex_exit(&dbuf_evict_lock);

		/* Evict outside dbuf_evict_lock; dbuf_evict_one() asserts so. */
		if (evict_now) {
			dbuf_evict_one();
		}
	}
}

/*
 * Set up the dbuf subsystem's global state: the hash table and its mutexes,
 * the dmu_buf_impl_t kmem cache, the user-eviction taskq, the dbuf cache
 * multilist and its size refcount, and the eviction thread together with
 * its tsd key, lock and condition variable.
 */
void
dbuf_init(void)
{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4K block size.  The table will take up
	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
	 */
	while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
		hsize <<= 1;

	/* On allocation failure, halve the table size and try again. */
retry:
	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));
		hsize >>= 1;
		goto retry;
	}

	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Setup the parameters for the dbuf cache. We cap the size of the
	 * dbuf cache to 1/32nd (default) of the size of the ARC.
	 */
	dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes,
	    arc_max_bytes() >> dbuf_cache_max_shift);

	/*
	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
	 * configuration is not required.
	 */
	dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);

	multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t),
	    offsetof(dmu_buf_impl_t, db_cache_link),
	    zfs_arc_num_sublists_per_state,
	    dbuf_cache_multilist_index_func);
	refcount_create(&dbuf_cache_size);

	tsd_create(&zfs_dbuf_evict_key, NULL);
	dbuf_evict_thread_exit = B_FALSE;
	mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
	dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
	    NULL, 0, &p0, TS_RUN, minclsyspri);
}

/*
 * Tear down everything dbuf_init() created.
 *
 * NOTE(review): the kmem cache and taskq are destroyed before the eviction
 * thread is asked to stop; presumably the dbuf cache is already empty by
 * the time this runs -- confirm against callers.
 */
void
dbuf_fini(void)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_kmem_cache);
	taskq_destroy(dbu_evict_taskq);

	/*
	 * Ask the eviction thread to exit and wait until it acknowledges
	 * by resetting dbuf_evict_thread_exit to B_FALSE.
	 */
	mutex_enter(&dbuf_evict_lock);
	dbuf_evict_thread_exit = B_TRUE;
	while (dbuf_evict_thread_exit) {
		cv_signal(&dbuf_evict_cv);
		cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
	}
	mutex_exit(&dbuf_evict_lock);
	tsd_destroy(&zfs_dbuf_evict_key);

	mutex_destroy(&dbuf_evict_lock);
	cv_destroy(&dbuf_evict_cv);

	refcount_destroy(&dbuf_cache_size);
	multilist_destroy(&dbuf_cache);
}

/*
 * Other stuff.
 */

#ifdef ZFS_DEBUG
/*
 * Debug-only consistency checks on a dbuf.  The caller must hold db_mtx
 * (asserted).  Nothing is checked unless ZFS_DEBUG_DBUF_VERIFY is set in
 * zfs_flags.  Every check is an assertion; the function has no other
 * effect besides entering and exiting the dnode handle.
 */
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* The dbuf's identity must agree with its dnode, if it has one. */
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !avl_is_empty(&dn->dn_dbufs));
	}
	/* Check db_offset (and size, for bonus/spill) against the blkid. */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	/* Every dirty record reachable from this dbuf must point back to it. */
	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 *
		 * There is an exception to this rule for indirect blocks; in
		 * this case, if the indirect block is a hole, we fill in a few
		 * fields on each of the child blocks (importantly, birth time)
		 * to prevent hole birth times from being lost when you
		 * partially fill in a hole.
		 */
		if (db->db_dirtycnt == 0) {
			if (db->db_level == 0) {
				/* Clean level-0 data: every 64-bit word is 0. */
				uint64_t *buf = db->db.db_data;
				int i;

				for (i = 0; i < db->db.db_size >> 3; i++) {
					ASSERT(buf[i] == 0);
				}
			} else {
				blkptr_t *bps = db->db.db_data;
				ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
				    db->db.db_size);
				/*
				 * We want to verify that all the blkptrs in the
				 * indirect block are holes, but we may have
				 * automatically set up a few fields for them.
				 * We iterate through each blkptr and verify
				 * they only have those fields set.
				 */
				for (int i = 0;
				    i < db->db.db_size / sizeof (blkptr_t);
				    i++) {
					blkptr_t *bp = &bps[i];
					ASSERT(ZIO_CHECKSUM_IS_ZERO(
					    &bp->blk_cksum));
					ASSERT(
					    DVA_IS_EMPTY(&bp->blk_dva[0]) &&
					    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
					    DVA_IS_EMPTY(&bp->blk_dva[2]));
					ASSERT0(bp->blk_fill);
					ASSERT0(bp->blk_pad[0]);
					ASSERT0(bp->blk_pad[1]);
					ASSERT(!BP_IS_EMBEDDED(bp));
					ASSERT(BP_IS_HOLE(bp));
					ASSERT0(bp->blk_phys_birth);
				}
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif

/*
 * Drop the dbuf's data pointer after evicting any attached user data.
 * The caller must hold db_mtx and must already have cleared db_buf (both
 * asserted).  The state becomes DB_UNCACHED unless it is DB_NOFILL.
 */
static void
dbuf_clear_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	dbuf_evict_user(db);
	ASSERT3P(db->db_buf, ==, NULL);
	db->db.db_data = NULL;
	if (db->db_state != DB_NOFILL)
		db->db_state = DB_UNCACHED;
}

/*
 * Attach an ARC buffer to the dbuf: db_buf is set to buf and db.db_data to
 * buf's data.  The caller must hold db_mtx; buf and buf->b_data must be
 * non-NULL (all asserted).
 */
static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(buf != NULL);

	db->db_buf = buf;
	ASSERT(buf->b_data != NULL);
	db->db.db_data = buf->b_data;
}

/*
 * Loan out an arc_buf for read.  Return the loaned arc_buf.
841 1.4 haad */ 842 1.4 haad arc_buf_t * 843 1.4 haad dbuf_loan_arcbuf(dmu_buf_impl_t *db) 844 1.4 haad { 845 1.4 haad arc_buf_t *abuf; 846 1.4 haad 847 1.6 chs ASSERT(db->db_blkid != DMU_BONUS_BLKID); 848 1.4 haad mutex_enter(&db->db_mtx); 849 1.4 haad if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { 850 1.4 haad int blksz = db->db.db_size; 851 1.6 chs spa_t *spa = db->db_objset->os_spa; 852 1.6 chs 853 1.4 haad mutex_exit(&db->db_mtx); 854 1.6 chs abuf = arc_loan_buf(spa, blksz); 855 1.4 haad bcopy(db->db.db_data, abuf->b_data, blksz); 856 1.4 haad } else { 857 1.4 haad abuf = db->db_buf; 858 1.4 haad arc_loan_inuse_buf(abuf, db); 859 1.6 chs db->db_buf = NULL; 860 1.6 chs dbuf_clear_data(db); 861 1.4 haad mutex_exit(&db->db_mtx); 862 1.4 haad } 863 1.4 haad return (abuf); 864 1.4 haad } 865 1.4 haad 866 1.6 chs /* 867 1.6 chs * Calculate which level n block references the data at the level 0 offset 868 1.6 chs * provided. 869 1.6 chs */ 870 1.1 haad uint64_t 871 1.6 chs dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) 872 1.1 haad { 873 1.6 chs if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { 874 1.6 chs /* 875 1.6 chs * The level n blkid is equal to the level 0 blkid divided by 876 1.6 chs * the number of level 0s in a level n block. 877 1.6 chs * 878 1.6 chs * The level 0 blkid is offset >> datablkshift = 879 1.6 chs * offset / 2^datablkshift. 880 1.6 chs * 881 1.6 chs * The number of level 0s in a level n is the number of block 882 1.6 chs * pointers in an indirect block, raised to the power of level. 883 1.6 chs * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = 884 1.6 chs * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). 
885 1.6 chs * 886 1.6 chs * Thus, the level n blkid is: offset / 887 1.6 chs * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))) 888 1.6 chs * = offset / 2^(datablkshift + level * 889 1.6 chs * (indblkshift - SPA_BLKPTRSHIFT)) 890 1.6 chs * = offset >> (datablkshift + level * 891 1.6 chs * (indblkshift - SPA_BLKPTRSHIFT)) 892 1.6 chs */ 893 1.6 chs return (offset >> (dn->dn_datablkshift + level * 894 1.6 chs (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); 895 1.1 haad } else { 896 1.1 haad ASSERT3U(offset, <, dn->dn_datablksz); 897 1.1 haad return (0); 898 1.1 haad } 899 1.1 haad } 900 1.1 haad 901 1.1 haad static void 902 1.1 haad dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 903 1.1 haad { 904 1.1 haad dmu_buf_impl_t *db = vdb; 905 1.1 haad 906 1.1 haad mutex_enter(&db->db_mtx); 907 1.1 haad ASSERT3U(db->db_state, ==, DB_READ); 908 1.1 haad /* 909 1.1 haad * All reads are synchronous, so we must have a hold on the dbuf 910 1.1 haad */ 911 1.1 haad ASSERT(refcount_count(&db->db_holds) > 0); 912 1.1 haad ASSERT(db->db_buf == NULL); 913 1.1 haad ASSERT(db->db.db_data == NULL); 914 1.1 haad if (db->db_level == 0 && db->db_freed_in_flight) { 915 1.1 haad /* we were freed in flight; disregard any error */ 916 1.1 haad arc_release(buf, db); 917 1.1 haad bzero(buf->b_data, db->db.db_size); 918 1.1 haad arc_buf_freeze(buf); 919 1.1 haad db->db_freed_in_flight = FALSE; 920 1.1 haad dbuf_set_data(db, buf); 921 1.1 haad db->db_state = DB_CACHED; 922 1.1 haad } else if (zio == NULL || zio->io_error == 0) { 923 1.1 haad dbuf_set_data(db, buf); 924 1.1 haad db->db_state = DB_CACHED; 925 1.1 haad } else { 926 1.6 chs ASSERT(db->db_blkid != DMU_BONUS_BLKID); 927 1.1 haad ASSERT3P(db->db_buf, ==, NULL); 928 1.6 chs arc_buf_destroy(buf, db); 929 1.1 haad db->db_state = DB_UNCACHED; 930 1.1 haad } 931 1.1 haad cv_broadcast(&db->db_changed); 932 1.6 chs dbuf_rele_and_unlock(db, NULL); 933 1.1 haad } 934 1.1 haad 935 1.1 haad static void 936 1.6 chs 
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 937 1.1 haad { 938 1.6 chs dnode_t *dn; 939 1.6 chs zbookmark_phys_t zb; 940 1.6 chs arc_flags_t aflags = ARC_FLAG_NOWAIT; 941 1.1 haad 942 1.6 chs DB_DNODE_ENTER(db); 943 1.6 chs dn = DB_DNODE(db); 944 1.1 haad ASSERT(!refcount_is_zero(&db->db_holds)); 945 1.1 haad /* We need the struct_rwlock to prevent db_blkptr from changing. */ 946 1.1 haad ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 947 1.1 haad ASSERT(MUTEX_HELD(&db->db_mtx)); 948 1.1 haad ASSERT(db->db_state == DB_UNCACHED); 949 1.1 haad ASSERT(db->db_buf == NULL); 950 1.1 haad 951 1.6 chs if (db->db_blkid == DMU_BONUS_BLKID) { 952 1.4 haad int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); 953 1.1 haad 954 1.1 haad ASSERT3U(bonuslen, <=, db->db.db_size); 955 1.1 haad db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 956 1.4 haad arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 957 1.1 haad if (bonuslen < DN_MAX_BONUSLEN) 958 1.1 haad bzero(db->db.db_data, DN_MAX_BONUSLEN); 959 1.4 haad if (bonuslen) 960 1.4 haad bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); 961 1.6 chs DB_DNODE_EXIT(db); 962 1.1 haad db->db_state = DB_CACHED; 963 1.1 haad mutex_exit(&db->db_mtx); 964 1.1 haad return; 965 1.1 haad } 966 1.1 haad 967 1.1 haad /* 968 1.1 haad * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() 969 1.1 haad * processes the delete record and clears the bp while we are waiting 970 1.1 haad * for the dn_mtx (resulting in a "no" from block_freed). 
971 1.1 haad */ 972 1.1 haad if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || 973 1.1 haad (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || 974 1.1 haad BP_IS_HOLE(db->db_blkptr)))) { 975 1.1 haad arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 976 1.1 haad 977 1.6 chs dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, 978 1.1 haad db->db.db_size, db, type)); 979 1.1 haad bzero(db->db.db_data, db->db.db_size); 980 1.6 chs 981 1.6 chs if (db->db_blkptr != NULL && db->db_level > 0 && 982 1.6 chs BP_IS_HOLE(db->db_blkptr) && 983 1.6 chs db->db_blkptr->blk_birth != 0) { 984 1.6 chs blkptr_t *bps = db->db.db_data; 985 1.6 chs for (int i = 0; i < ((1 << 986 1.6 chs DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t)); 987 1.6 chs i++) { 988 1.6 chs blkptr_t *bp = &bps[i]; 989 1.6 chs ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 990 1.6 chs 1 << dn->dn_indblkshift); 991 1.6 chs BP_SET_LSIZE(bp, 992 1.6 chs BP_GET_LEVEL(db->db_blkptr) == 1 ? 993 1.6 chs dn->dn_datablksz : 994 1.6 chs BP_GET_LSIZE(db->db_blkptr)); 995 1.6 chs BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); 996 1.6 chs BP_SET_LEVEL(bp, 997 1.6 chs BP_GET_LEVEL(db->db_blkptr) - 1); 998 1.6 chs BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); 999 1.6 chs } 1000 1.6 chs } 1001 1.6 chs DB_DNODE_EXIT(db); 1002 1.1 haad db->db_state = DB_CACHED; 1003 1.1 haad mutex_exit(&db->db_mtx); 1004 1.1 haad return; 1005 1.1 haad } 1006 1.1 haad 1007 1.6 chs DB_DNODE_EXIT(db); 1008 1.6 chs 1009 1.1 haad db->db_state = DB_READ; 1010 1.1 haad mutex_exit(&db->db_mtx); 1011 1.1 haad 1012 1.1 haad if (DBUF_IS_L2CACHEABLE(db)) 1013 1.6 chs aflags |= ARC_FLAG_L2CACHE; 1014 1.1 haad 1015 1.4 haad SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 
1016 1.4 haad db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, 1017 1.4 haad db->db.db_object, db->db_level, db->db_blkid); 1018 1.1 haad 1019 1.1 haad dbuf_add_ref(db, NULL); 1020 1.1 haad 1021 1.6 chs (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, 1022 1.1 haad dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 1023 1.6 chs (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 1024 1.1 haad &aflags, &zb); 1025 1.1 haad } 1026 1.1 haad 1027 1.1 haad int 1028 1.1 haad dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 1029 1.1 haad { 1030 1.1 haad int err = 0; 1031 1.6 chs boolean_t havepzio = (zio != NULL); 1032 1.6 chs boolean_t prefetch; 1033 1.6 chs dnode_t *dn; 1034 1.1 haad 1035 1.1 haad /* 1036 1.1 haad * We don't have to hold the mutex to check db_state because it 1037 1.1 haad * can't be freed while we have a hold on the buffer. 1038 1.1 haad */ 1039 1.1 haad ASSERT(!refcount_is_zero(&db->db_holds)); 1040 1.1 haad 1041 1.1 haad if (db->db_state == DB_NOFILL) 1042 1.6 chs return (SET_ERROR(EIO)); 1043 1.1 haad 1044 1.6 chs DB_DNODE_ENTER(db); 1045 1.6 chs dn = DB_DNODE(db); 1046 1.1 haad if ((flags & DB_RF_HAVESTRUCT) == 0) 1047 1.6 chs rw_enter(&dn->dn_struct_rwlock, RW_READER); 1048 1.1 haad 1049 1.6 chs prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1050 1.6 chs (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && 1051 1.1 haad DBUF_IS_CACHEABLE(db); 1052 1.1 haad 1053 1.1 haad mutex_enter(&db->db_mtx); 1054 1.1 haad if (db->db_state == DB_CACHED) { 1055 1.1 haad mutex_exit(&db->db_mtx); 1056 1.1 haad if (prefetch) 1057 1.6 chs dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); 1058 1.1 haad if ((flags & DB_RF_HAVESTRUCT) == 0) 1059 1.6 chs rw_exit(&dn->dn_struct_rwlock); 1060 1.6 chs DB_DNODE_EXIT(db); 1061 1.1 haad } else if (db->db_state == DB_UNCACHED) { 1062 1.6 chs spa_t *spa = dn->dn_objset->os_spa; 1063 1.6 chs 1064 1.6 chs if (zio == NULL) 1065 1.6 chs zio = zio_root(spa, NULL, NULL, 
ZIO_FLAG_CANFAIL); 1066 1.6 chs dbuf_read_impl(db, zio, flags); 1067 1.1 haad 1068 1.1 haad /* dbuf_read_impl has dropped db_mtx for us */ 1069 1.1 haad 1070 1.1 haad if (prefetch) 1071 1.6 chs dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); 1072 1.1 haad 1073 1.1 haad if ((flags & DB_RF_HAVESTRUCT) == 0) 1074 1.6 chs rw_exit(&dn->dn_struct_rwlock); 1075 1.6 chs DB_DNODE_EXIT(db); 1076 1.1 haad 1077 1.1 haad if (!havepzio) 1078 1.1 haad err = zio_wait(zio); 1079 1.1 haad } else { 1080 1.6 chs /* 1081 1.6 chs * Another reader came in while the dbuf was in flight 1082 1.6 chs * between UNCACHED and CACHED. Either a writer will finish 1083 1.6 chs * writing the buffer (sending the dbuf to CACHED) or the 1084 1.6 chs * first reader's request will reach the read_done callback 1085 1.6 chs * and send the dbuf to CACHED. Otherwise, a failure 1086 1.6 chs * occurred and the dbuf went to UNCACHED. 1087 1.6 chs */ 1088 1.1 haad mutex_exit(&db->db_mtx); 1089 1.1 haad if (prefetch) 1090 1.6 chs dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); 1091 1.1 haad if ((flags & DB_RF_HAVESTRUCT) == 0) 1092 1.6 chs rw_exit(&dn->dn_struct_rwlock); 1093 1.6 chs DB_DNODE_EXIT(db); 1094 1.1 haad 1095 1.6 chs /* Skip the wait per the caller's request. 
*/ 1096 1.1 haad mutex_enter(&db->db_mtx); 1097 1.1 haad if ((flags & DB_RF_NEVERWAIT) == 0) { 1098 1.1 haad while (db->db_state == DB_READ || 1099 1.1 haad db->db_state == DB_FILL) { 1100 1.1 haad ASSERT(db->db_state == DB_READ || 1101 1.1 haad (flags & DB_RF_HAVESTRUCT) == 0); 1102 1.6 chs DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, 1103 1.6 chs db, zio_t *, zio); 1104 1.1 haad cv_wait(&db->db_changed, &db->db_mtx); 1105 1.1 haad } 1106 1.1 haad if (db->db_state == DB_UNCACHED) 1107 1.6 chs err = SET_ERROR(EIO); 1108 1.1 haad } 1109 1.1 haad mutex_exit(&db->db_mtx); 1110 1.1 haad } 1111 1.1 haad 1112 1.1 haad ASSERT(err || havepzio || db->db_state == DB_CACHED); 1113 1.1 haad return (err); 1114 1.1 haad } 1115 1.1 haad 1116 1.1 haad static void 1117 1.1 haad dbuf_noread(dmu_buf_impl_t *db) 1118 1.1 haad { 1119 1.1 haad ASSERT(!refcount_is_zero(&db->db_holds)); 1120 1.6 chs ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1121 1.1 haad mutex_enter(&db->db_mtx); 1122 1.1 haad while (db->db_state == DB_READ || db->db_state == DB_FILL) 1123 1.1 haad cv_wait(&db->db_changed, &db->db_mtx); 1124 1.1 haad if (db->db_state == DB_UNCACHED) { 1125 1.1 haad arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1126 1.6 chs spa_t *spa = db->db_objset->os_spa; 1127 1.1 haad 1128 1.1 haad ASSERT(db->db_buf == NULL); 1129 1.1 haad ASSERT(db->db.db_data == NULL); 1130 1.6 chs dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type)); 1131 1.1 haad db->db_state = DB_FILL; 1132 1.1 haad } else if (db->db_state == DB_NOFILL) { 1133 1.6 chs dbuf_clear_data(db); 1134 1.1 haad } else { 1135 1.1 haad ASSERT3U(db->db_state, ==, DB_CACHED); 1136 1.1 haad } 1137 1.1 haad mutex_exit(&db->db_mtx); 1138 1.1 haad } 1139 1.1 haad 1140 1.1 haad /* 1141 1.1 haad * This is our just-in-time copy function. It makes a copy of 1142 1.1 haad * buffers, that have been modified in a previous transaction 1143 1.1 haad * group, before we modify them in the current active group. 
1144 1.1 haad * 1145 1.1 haad * This function is used in two places: when we are dirtying a 1146 1.1 haad * buffer for the first time in a txg, and when we are freeing 1147 1.1 haad * a range in a dnode that includes this buffer. 1148 1.1 haad * 1149 1.1 haad * Note that when we are called from dbuf_free_range() we do 1150 1.1 haad * not put a hold on the buffer, we just traverse the active 1151 1.1 haad * dbuf list for the dnode. 1152 1.1 haad */ 1153 1.1 haad static void 1154 1.1 haad dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) 1155 1.1 haad { 1156 1.1 haad dbuf_dirty_record_t *dr = db->db_last_dirty; 1157 1.1 haad 1158 1.1 haad ASSERT(MUTEX_HELD(&db->db_mtx)); 1159 1.1 haad ASSERT(db->db.db_data != NULL); 1160 1.1 haad ASSERT(db->db_level == 0); 1161 1.1 haad ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); 1162 1.1 haad 1163 1.1 haad if (dr == NULL || 1164 1.1 haad (dr->dt.dl.dr_data != 1165 1.6 chs ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) 1166 1.1 haad return; 1167 1.1 haad 1168 1.1 haad /* 1169 1.1 haad * If the last dirty record for this dbuf has not yet synced 1170 1.1 haad * and its referencing the dbuf data, either: 1171 1.6 chs * reset the reference to point to a new copy, 1172 1.1 haad * or (if there a no active holders) 1173 1.1 haad * just null out the current db_data pointer. 
1174 1.1 haad */ 1175 1.1 haad ASSERT(dr->dr_txg >= txg - 2); 1176 1.6 chs if (db->db_blkid == DMU_BONUS_BLKID) { 1177 1.1 haad /* Note that the data bufs here are zio_bufs */ 1178 1.1 haad dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); 1179 1.4 haad arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 1180 1.1 haad bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); 1181 1.1 haad } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 1182 1.1 haad int size = db->db.db_size; 1183 1.1 haad arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1184 1.6 chs spa_t *spa = db->db_objset->os_spa; 1185 1.6 chs 1186 1.6 chs dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type); 1187 1.1 haad bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); 1188 1.1 haad } else { 1189 1.6 chs db->db_buf = NULL; 1190 1.6 chs dbuf_clear_data(db); 1191 1.1 haad } 1192 1.1 haad } 1193 1.1 haad 1194 1.1 haad void 1195 1.1 haad dbuf_unoverride(dbuf_dirty_record_t *dr) 1196 1.1 haad { 1197 1.1 haad dmu_buf_impl_t *db = dr->dr_dbuf; 1198 1.4 haad blkptr_t *bp = &dr->dt.dl.dr_overridden_by; 1199 1.1 haad uint64_t txg = dr->dr_txg; 1200 1.1 haad 1201 1.1 haad ASSERT(MUTEX_HELD(&db->db_mtx)); 1202 1.1 haad ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); 1203 1.1 haad ASSERT(db->db_level == 0); 1204 1.1 haad 1205 1.6 chs if (db->db_blkid == DMU_BONUS_BLKID || 1206 1.1 haad dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) 1207 1.1 haad return; 1208 1.1 haad 1209 1.4 haad ASSERT(db->db_data_pending != dr); 1210 1.4 haad 1211 1.1 haad /* free this block */ 1212 1.6 chs if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) 1213 1.6 chs zio_free(db->db_objset->os_spa, txg, bp); 1214 1.4 haad 1215 1.1 haad dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 1216 1.6 chs dr->dt.dl.dr_nopwrite = B_FALSE; 1217 1.6 chs 1218 1.1 haad /* 1219 1.1 haad * Release the already-written buffer, so we leave it in 1220 1.1 haad * a consistent dirty state. 
Note that all callers are 1221 1.1 haad * modifying the buffer, so they will immediately do 1222 1.1 haad * another (redundant) arc_release(). Therefore, leave 1223 1.1 haad * the buf thawed to save the effort of freezing & 1224 1.1 haad * immediately re-thawing it. 1225 1.1 haad */ 1226 1.1 haad arc_release(dr->dt.dl.dr_data, db); 1227 1.1 haad } 1228 1.1 haad 1229 1.1 haad /* 1230 1.1 haad * Evict (if its unreferenced) or clear (if its referenced) any level-0 1231 1.1 haad * data blocks in the free range, so that any future readers will find 1232 1.6 chs * empty blocks. 1233 1.1 haad */ 1234 1.1 haad void 1235 1.6 chs dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, 1236 1.6 chs dmu_tx_t *tx) 1237 1.1 haad { 1238 1.6 chs dmu_buf_impl_t db_search; 1239 1.1 haad dmu_buf_impl_t *db, *db_next; 1240 1.1 haad uint64_t txg = tx->tx_txg; 1241 1.6 chs avl_index_t where; 1242 1.6 chs 1243 1.6 chs if (end_blkid > dn->dn_maxblkid && 1244 1.6 chs !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID)) 1245 1.6 chs end_blkid = dn->dn_maxblkid; 1246 1.6 chs dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); 1247 1.6 chs 1248 1.6 chs db_search.db_level = 0; 1249 1.6 chs db_search.db_blkid = start_blkid; 1250 1.6 chs db_search.db_state = DB_SEARCH; 1251 1.6 chs 1252 1.1 haad mutex_enter(&dn->dn_dbufs_mtx); 1253 1.6 chs db = avl_find(&dn->dn_dbufs, &db_search, &where); 1254 1.6 chs ASSERT3P(db, ==, NULL); 1255 1.6 chs 1256 1.6 chs db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); 1257 1.1 haad 1258 1.6 chs for (; db != NULL; db = db_next) { 1259 1.6 chs db_next = AVL_NEXT(&dn->dn_dbufs, db); 1260 1.6 chs ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1261 1.6 chs 1262 1.6 chs if (db->db_level != 0 || db->db_blkid > end_blkid) { 1263 1.6 chs break; 1264 1.1 haad } 1265 1.6 chs ASSERT3U(db->db_blkid, >=, start_blkid); 1266 1.1 haad 1267 1.1 haad /* found a level 0 buffer in the range */ 1268 1.6 chs mutex_enter(&db->db_mtx); 1269 1.6 chs 
if (dbuf_undirty(db, tx)) { 1270 1.6 chs /* mutex has been dropped and dbuf destroyed */ 1271 1.1 haad continue; 1272 1.6 chs } 1273 1.1 haad 1274 1.1 haad if (db->db_state == DB_UNCACHED || 1275 1.1 haad db->db_state == DB_NOFILL || 1276 1.1 haad db->db_state == DB_EVICTING) { 1277 1.1 haad ASSERT(db->db.db_data == NULL); 1278 1.1 haad mutex_exit(&db->db_mtx); 1279 1.1 haad continue; 1280 1.1 haad } 1281 1.1 haad if (db->db_state == DB_READ || db->db_state == DB_FILL) { 1282 1.1 haad /* will be handled in dbuf_read_done or dbuf_rele */ 1283 1.1 haad db->db_freed_in_flight = TRUE; 1284 1.1 haad mutex_exit(&db->db_mtx); 1285 1.1 haad continue; 1286 1.1 haad } 1287 1.1 haad if (refcount_count(&db->db_holds) == 0) { 1288 1.1 haad ASSERT(db->db_buf); 1289 1.6 chs dbuf_destroy(db); 1290 1.1 haad continue; 1291 1.1 haad } 1292 1.1 haad /* The dbuf is referenced */ 1293 1.1 haad 1294 1.1 haad if (db->db_last_dirty != NULL) { 1295 1.1 haad dbuf_dirty_record_t *dr = db->db_last_dirty; 1296 1.1 haad 1297 1.1 haad if (dr->dr_txg == txg) { 1298 1.1 haad /* 1299 1.1 haad * This buffer is "in-use", re-adjust the file 1300 1.1 haad * size to reflect that this buffer may 1301 1.1 haad * contain new data when we sync. 1302 1.1 haad */ 1303 1.6 chs if (db->db_blkid != DMU_SPILL_BLKID && 1304 1.6 chs db->db_blkid > dn->dn_maxblkid) 1305 1.1 haad dn->dn_maxblkid = db->db_blkid; 1306 1.1 haad dbuf_unoverride(dr); 1307 1.1 haad } else { 1308 1.1 haad /* 1309 1.1 haad * This dbuf is not dirty in the open context. 1310 1.1 haad * Either uncache it (if its not referenced in 1311 1.1 haad * the open context) or reset its contents to 1312 1.1 haad * empty. 
1313 1.1 haad */ 1314 1.1 haad dbuf_fix_old_data(db, txg); 1315 1.1 haad } 1316 1.1 haad } 1317 1.1 haad /* clear the contents if its cached */ 1318 1.1 haad if (db->db_state == DB_CACHED) { 1319 1.1 haad ASSERT(db->db.db_data != NULL); 1320 1.1 haad arc_release(db->db_buf, db); 1321 1.1 haad bzero(db->db.db_data, db->db.db_size); 1322 1.1 haad arc_buf_freeze(db->db_buf); 1323 1.1 haad } 1324 1.1 haad 1325 1.1 haad mutex_exit(&db->db_mtx); 1326 1.1 haad } 1327 1.1 haad mutex_exit(&dn->dn_dbufs_mtx); 1328 1.1 haad } 1329 1.1 haad 1330 1.1 haad static int 1331 1.1 haad dbuf_block_freeable(dmu_buf_impl_t *db) 1332 1.1 haad { 1333 1.1 haad dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; 1334 1.1 haad uint64_t birth_txg = 0; 1335 1.1 haad 1336 1.1 haad /* 1337 1.1 haad * We don't need any locking to protect db_blkptr: 1338 1.1 haad * If it's syncing, then db_last_dirty will be set 1339 1.1 haad * so we'll ignore db_blkptr. 1340 1.6 chs * 1341 1.6 chs * This logic ensures that only block births for 1342 1.6 chs * filled blocks are considered. 1343 1.1 haad */ 1344 1.1 haad ASSERT(MUTEX_HELD(&db->db_mtx)); 1345 1.6 chs if (db->db_last_dirty && (db->db_blkptr == NULL || 1346 1.6 chs !BP_IS_HOLE(db->db_blkptr))) { 1347 1.1 haad birth_txg = db->db_last_dirty->dr_txg; 1348 1.6 chs } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { 1349 1.1 haad birth_txg = db->db_blkptr->blk_birth; 1350 1.6 chs } 1351 1.1 haad 1352 1.6 chs /* 1353 1.6 chs * If this block don't exist or is in a snapshot, it can't be freed. 1354 1.6 chs * Don't pass the bp to dsl_dataset_block_freeable() since we 1355 1.6 chs * are holding the db_mtx lock and might deadlock if we are 1356 1.6 chs * prefetching a dedup-ed block. 
1357 1.6 chs */ 1358 1.6 chs if (birth_txg != 0) 1359 1.1 haad return (ds == NULL || 1360 1.6 chs dsl_dataset_block_freeable(ds, NULL, birth_txg)); 1361 1.1 haad else 1362 1.6 chs return (B_FALSE); 1363 1.1 haad } 1364 1.1 haad 1365 1.1 haad void 1366 1.1 haad dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) 1367 1.1 haad { 1368 1.1 haad arc_buf_t *buf, *obuf; 1369 1.1 haad int osize = db->db.db_size; 1370 1.1 haad arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1371 1.6 chs dnode_t *dn; 1372 1.6 chs 1373 1.6 chs ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1374 1.1 haad 1375 1.6 chs DB_DNODE_ENTER(db); 1376 1.6 chs dn = DB_DNODE(db); 1377 1.1 haad 1378 1.1 haad /* XXX does *this* func really need the lock? */ 1379 1.6 chs ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1380 1.1 haad 1381 1.1 haad /* 1382 1.6 chs * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held 1383 1.1 haad * is OK, because there can be no other references to the db 1384 1.1 haad * when we are changing its size, so no concurrent DB_FILL can 1385 1.1 haad * be happening. 
1386 1.1 haad */ 1387 1.1 haad /* 1388 1.1 haad * XXX we should be doing a dbuf_read, checking the return 1389 1.1 haad * value and returning that up to our callers 1390 1.1 haad */ 1391 1.6 chs dmu_buf_will_dirty(&db->db, tx); 1392 1.1 haad 1393 1.1 haad /* create the data buffer for the new block */ 1394 1.6 chs buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type); 1395 1.1 haad 1396 1.1 haad /* copy old block data to the new block */ 1397 1.1 haad obuf = db->db_buf; 1398 1.1 haad bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); 1399 1.1 haad /* zero the remainder */ 1400 1.1 haad if (size > osize) 1401 1.1 haad bzero((uint8_t *)buf->b_data + osize, size - osize); 1402 1.1 haad 1403 1.1 haad mutex_enter(&db->db_mtx); 1404 1.1 haad dbuf_set_data(db, buf); 1405 1.6 chs arc_buf_destroy(obuf, db); 1406 1.1 haad db->db.db_size = size; 1407 1.1 haad 1408 1.1 haad if (db->db_level == 0) { 1409 1.1 haad ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 1410 1.1 haad db->db_last_dirty->dt.dl.dr_data = buf; 1411 1.1 haad } 1412 1.1 haad mutex_exit(&db->db_mtx); 1413 1.1 haad 1414 1.6 chs dnode_willuse_space(dn, size-osize, tx); 1415 1.6 chs DB_DNODE_EXIT(db); 1416 1.6 chs } 1417 1.6 chs 1418 1.6 chs void 1419 1.6 chs dbuf_release_bp(dmu_buf_impl_t *db) 1420 1.6 chs { 1421 1.6 chs objset_t *os = db->db_objset; 1422 1.6 chs 1423 1.6 chs ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); 1424 1.6 chs ASSERT(arc_released(os->os_phys_buf) || 1425 1.6 chs list_link_active(&os->os_dsl_dataset->ds_synced_link)); 1426 1.6 chs ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); 1427 1.6 chs 1428 1.6 chs (void) arc_release(db->db_buf, db); 1429 1.6 chs } 1430 1.6 chs 1431 1.6 chs /* 1432 1.6 chs * We already have a dirty record for this TXG, and we are being 1433 1.6 chs * dirtied again. 
1434 1.6 chs */ 1435 1.6 chs static void 1436 1.6 chs dbuf_redirty(dbuf_dirty_record_t *dr) 1437 1.6 chs { 1438 1.6 chs dmu_buf_impl_t *db = dr->dr_dbuf; 1439 1.6 chs 1440 1.6 chs ASSERT(MUTEX_HELD(&db->db_mtx)); 1441 1.6 chs 1442 1.6 chs if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { 1443 1.6 chs /* 1444 1.6 chs * If this buffer has already been written out, 1445 1.6 chs * we now need to reset its state. 1446 1.6 chs */ 1447 1.6 chs dbuf_unoverride(dr); 1448 1.6 chs if (db->db.db_object != DMU_META_DNODE_OBJECT && 1449 1.6 chs db->db_state != DB_NOFILL) { 1450 1.6 chs /* Already released on initial dirty, so just thaw. */ 1451 1.6 chs ASSERT(arc_released(db->db_buf)); 1452 1.6 chs arc_buf_thaw(db->db_buf); 1453 1.6 chs } 1454 1.6 chs } 1455 1.1 haad } 1456 1.1 haad 1457 1.1 haad dbuf_dirty_record_t * 1458 1.1 haad dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1459 1.1 haad { 1460 1.6 chs dnode_t *dn; 1461 1.6 chs objset_t *os; 1462 1.1 haad dbuf_dirty_record_t **drp, *dr; 1463 1.1 haad int drop_struct_lock = FALSE; 1464 1.1 haad boolean_t do_free_accounting = B_FALSE; 1465 1.1 haad int txgoff = tx->tx_txg & TXG_MASK; 1466 1.1 haad 1467 1.1 haad ASSERT(tx->tx_txg != 0); 1468 1.1 haad ASSERT(!refcount_is_zero(&db->db_holds)); 1469 1.1 haad DMU_TX_DIRTY_BUF(tx, db); 1470 1.1 haad 1471 1.6 chs DB_DNODE_ENTER(db); 1472 1.6 chs dn = DB_DNODE(db); 1473 1.1 haad /* 1474 1.1 haad * Shouldn't dirty a regular buffer in syncing context. Private 1475 1.1 haad * objects may be dirtied in syncing context, but only if they 1476 1.1 haad * were already pre-dirtied in open context. 
1477 1.1 haad */ 1478 1.6 chs #ifdef DEBUG 1479 1.6 chs if (dn->dn_objset->os_dsl_dataset != NULL) { 1480 1.6 chs rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, 1481 1.6 chs RW_READER, FTAG); 1482 1.6 chs } 1483 1.1 haad ASSERT(!dmu_tx_is_syncing(tx) || 1484 1.1 haad BP_IS_HOLE(dn->dn_objset->os_rootbp) || 1485 1.4 haad DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1486 1.4 haad dn->dn_objset->os_dsl_dataset == NULL); 1487 1.6 chs if (dn->dn_objset->os_dsl_dataset != NULL) 1488 1.6 chs rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); 1489 1.6 chs #endif 1490 1.1 haad /* 1491 1.1 haad * We make this assert for private objects as well, but after we 1492 1.1 haad * check if we're already dirty. They are allowed to re-dirty 1493 1.1 haad * in syncing context. 1494 1.1 haad */ 1495 1.1 haad ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1496 1.1 haad dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1497 1.1 haad (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1498 1.1 haad 1499 1.1 haad mutex_enter(&db->db_mtx); 1500 1.1 haad /* 1501 1.1 haad * XXX make this true for indirects too? The problem is that 1502 1.1 haad * transactions created with dmu_tx_create_assigned() from 1503 1.1 haad * syncing context don't bother holding ahead. 1504 1.1 haad */ 1505 1.1 haad ASSERT(db->db_level != 0 || 1506 1.1 haad db->db_state == DB_CACHED || db->db_state == DB_FILL || 1507 1.1 haad db->db_state == DB_NOFILL); 1508 1.1 haad 1509 1.1 haad mutex_enter(&dn->dn_mtx); 1510 1.1 haad /* 1511 1.1 haad * Don't set dirtyctx to SYNC if we're just modifying this as we 1512 1.1 haad * initialize the objset. 1513 1.1 haad */ 1514 1.6 chs if (dn->dn_dirtyctx == DN_UNDIRTIED) { 1515 1.6 chs if (dn->dn_objset->os_dsl_dataset != NULL) { 1516 1.6 chs rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, 1517 1.6 chs RW_READER, FTAG); 1518 1.6 chs } 1519 1.6 chs if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { 1520 1.6 chs dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? 
1521 1.6 chs DN_DIRTY_SYNC : DN_DIRTY_OPEN); 1522 1.6 chs ASSERT(dn->dn_dirtyctx_firstset == NULL); 1523 1.6 chs dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 1524 1.6 chs } 1525 1.6 chs if (dn->dn_objset->os_dsl_dataset != NULL) { 1526 1.6 chs rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, 1527 1.6 chs FTAG); 1528 1.6 chs } 1529 1.1 haad } 1530 1.1 haad mutex_exit(&dn->dn_mtx); 1531 1.1 haad 1532 1.6 chs if (db->db_blkid == DMU_SPILL_BLKID) 1533 1.6 chs dn->dn_have_spill = B_TRUE; 1534 1.6 chs 1535 1.1 haad /* 1536 1.1 haad * If this buffer is already dirty, we're done. 1537 1.1 haad */ 1538 1.1 haad drp = &db->db_last_dirty; 1539 1.1 haad ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || 1540 1.1 haad db->db.db_object == DMU_META_DNODE_OBJECT); 1541 1.1 haad while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) 1542 1.1 haad drp = &dr->dr_next; 1543 1.1 haad if (dr && dr->dr_txg == tx->tx_txg) { 1544 1.6 chs DB_DNODE_EXIT(db); 1545 1.6 chs 1546 1.6 chs dbuf_redirty(dr); 1547 1.1 haad mutex_exit(&db->db_mtx); 1548 1.1 haad return (dr); 1549 1.1 haad } 1550 1.1 haad 1551 1.1 haad /* 1552 1.1 haad * Only valid if not already dirty. 1553 1.1 haad */ 1554 1.4 haad ASSERT(dn->dn_object == 0 || 1555 1.4 haad dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1556 1.1 haad (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1557 1.1 haad 1558 1.1 haad ASSERT3U(dn->dn_nlevels, >, db->db_level); 1559 1.1 haad ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 1560 1.1 haad dn->dn_phys->dn_nlevels > db->db_level || 1561 1.1 haad dn->dn_next_nlevels[txgoff] > db->db_level || 1562 1.1 haad dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 1563 1.1 haad dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 1564 1.1 haad 1565 1.1 haad /* 1566 1.1 haad * We should only be dirtying in syncing context if it's the 1567 1.4 haad * mos or we're initializing the os or it's a special object. 
1568 1.4 haad * However, we are allowed to dirty in syncing context provided 1569 1.4 haad * we already dirtied it in open context. Hence we must make 1570 1.4 haad * this assertion only if we're not already dirty. 1571 1.1 haad */ 1572 1.6 chs os = dn->dn_objset; 1573 1.6 chs #ifdef DEBUG 1574 1.6 chs if (dn->dn_objset->os_dsl_dataset != NULL) 1575 1.6 chs rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); 1576 1.4 haad ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1577 1.4 haad os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); 1578 1.6 chs if (dn->dn_objset->os_dsl_dataset != NULL) 1579 1.6 chs rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); 1580 1.6 chs #endif 1581 1.1 haad ASSERT(db->db.db_size != 0); 1582 1.1 haad 1583 1.1 haad dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1584 1.1 haad 1585 1.6 chs if (db->db_blkid != DMU_BONUS_BLKID) { 1586 1.1 haad /* 1587 1.1 haad * Update the accounting. 1588 1.1 haad * Note: we delay "free accounting" until after we drop 1589 1.1 haad * the db_mtx. This keeps us from grabbing other locks 1590 1.4 haad * (and possibly deadlocking) in bp_get_dsize() while 1591 1.1 haad * also holding the db_mtx. 1592 1.1 haad */ 1593 1.1 haad dnode_willuse_space(dn, db->db.db_size, tx); 1594 1.1 haad do_free_accounting = dbuf_block_freeable(db); 1595 1.1 haad } 1596 1.1 haad 1597 1.1 haad /* 1598 1.1 haad * If this buffer is dirty in an old transaction group we need 1599 1.1 haad * to make a copy of it so that the changes we make in this 1600 1.1 haad * transaction group won't leak out when we sync the older txg. 
1601 1.1 haad */ 1602 1.1 haad dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); 1603 1.1 haad if (db->db_level == 0) { 1604 1.1 haad void *data_old = db->db_buf; 1605 1.1 haad 1606 1.1 haad if (db->db_state != DB_NOFILL) { 1607 1.6 chs if (db->db_blkid == DMU_BONUS_BLKID) { 1608 1.1 haad dbuf_fix_old_data(db, tx->tx_txg); 1609 1.1 haad data_old = db->db.db_data; 1610 1.1 haad } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1611 1.1 haad /* 1612 1.1 haad * Release the data buffer from the cache so 1613 1.1 haad * that we can modify it without impacting 1614 1.1 haad * possible other users of this cached data 1615 1.1 haad * block. Note that indirect blocks and 1616 1.1 haad * private objects are not released until the 1617 1.1 haad * syncing state (since they are only modified 1618 1.1 haad * then). 1619 1.1 haad */ 1620 1.1 haad arc_release(db->db_buf, db); 1621 1.1 haad dbuf_fix_old_data(db, tx->tx_txg); 1622 1.1 haad data_old = db->db_buf; 1623 1.1 haad } 1624 1.1 haad ASSERT(data_old != NULL); 1625 1.1 haad } 1626 1.1 haad dr->dt.dl.dr_data = data_old; 1627 1.1 haad } else { 1628 1.1 haad mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); 1629 1.1 haad list_create(&dr->dt.di.dr_children, 1630 1.1 haad sizeof (dbuf_dirty_record_t), 1631 1.1 haad offsetof(dbuf_dirty_record_t, dr_dirty_node)); 1632 1.1 haad } 1633 1.6 chs if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) 1634 1.6 chs dr->dr_accounted = db->db.db_size; 1635 1.1 haad dr->dr_dbuf = db; 1636 1.1 haad dr->dr_txg = tx->tx_txg; 1637 1.1 haad dr->dr_next = *drp; 1638 1.1 haad *drp = dr; 1639 1.1 haad 1640 1.1 haad /* 1641 1.1 haad * We could have been freed_in_flight between the dbuf_noread 1642 1.1 haad * and dbuf_dirty. We win, as though the dbuf_noread() had 1643 1.1 haad * happened after the free. 
1644 1.1 haad */ 1645 1.6 chs if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1646 1.6 chs db->db_blkid != DMU_SPILL_BLKID) { 1647 1.1 haad mutex_enter(&dn->dn_mtx); 1648 1.6 chs if (dn->dn_free_ranges[txgoff] != NULL) { 1649 1.6 chs range_tree_clear(dn->dn_free_ranges[txgoff], 1650 1.6 chs db->db_blkid, 1); 1651 1.6 chs } 1652 1.1 haad mutex_exit(&dn->dn_mtx); 1653 1.1 haad db->db_freed_in_flight = FALSE; 1654 1.1 haad } 1655 1.1 haad 1656 1.1 haad /* 1657 1.1 haad * This buffer is now part of this txg 1658 1.1 haad */ 1659 1.1 haad dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1660 1.1 haad db->db_dirtycnt += 1; 1661 1.1 haad ASSERT3U(db->db_dirtycnt, <=, 3); 1662 1.1 haad 1663 1.1 haad mutex_exit(&db->db_mtx); 1664 1.1 haad 1665 1.6 chs if (db->db_blkid == DMU_BONUS_BLKID || 1666 1.6 chs db->db_blkid == DMU_SPILL_BLKID) { 1667 1.1 haad mutex_enter(&dn->dn_mtx); 1668 1.1 haad ASSERT(!list_link_active(&dr->dr_dirty_node)); 1669 1.1 haad list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1670 1.1 haad mutex_exit(&dn->dn_mtx); 1671 1.1 haad dnode_setdirty(dn, tx); 1672 1.6 chs DB_DNODE_EXIT(db); 1673 1.1 haad return (dr); 1674 1.6 chs } 1675 1.6 chs 1676 1.6 chs /* 1677 1.6 chs * The dn_struct_rwlock prevents db_blkptr from changing 1678 1.6 chs * due to a write from syncing context completing 1679 1.6 chs * while we are running, so we want to acquire it before 1680 1.6 chs * looking at db_blkptr. 1681 1.6 chs */ 1682 1.6 chs if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1683 1.6 chs rw_enter(&dn->dn_struct_rwlock, RW_READER); 1684 1.6 chs drop_struct_lock = TRUE; 1685 1.6 chs } 1686 1.6 chs 1687 1.6 chs if (do_free_accounting) { 1688 1.1 haad blkptr_t *bp = db->db_blkptr; 1689 1.1 haad int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 
1690 1.4 haad bp_get_dsize(os->os_spa, bp) : db->db.db_size; 1691 1.1 haad /* 1692 1.1 haad * This is only a guess -- if the dbuf is dirty 1693 1.1 haad * in a previous txg, we don't know how much 1694 1.1 haad * space it will use on disk yet. We should 1695 1.1 haad * really have the struct_rwlock to access 1696 1.1 haad * db_blkptr, but since this is just a guess, 1697 1.1 haad * it's OK if we get an odd answer. 1698 1.1 haad */ 1699 1.6 chs ddt_prefetch(os->os_spa, bp); 1700 1.1 haad dnode_willuse_space(dn, -willfree, tx); 1701 1.1 haad } 1702 1.1 haad 1703 1.1 haad if (db->db_level == 0) { 1704 1.1 haad dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); 1705 1.1 haad ASSERT(dn->dn_maxblkid >= db->db_blkid); 1706 1.1 haad } 1707 1.1 haad 1708 1.1 haad if (db->db_level+1 < dn->dn_nlevels) { 1709 1.1 haad dmu_buf_impl_t *parent = db->db_parent; 1710 1.1 haad dbuf_dirty_record_t *di; 1711 1.1 haad int parent_held = FALSE; 1712 1.1 haad 1713 1.1 haad if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { 1714 1.1 haad int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1715 1.1 haad 1716 1.1 haad parent = dbuf_hold_level(dn, db->db_level+1, 1717 1.1 haad db->db_blkid >> epbs, FTAG); 1718 1.6 chs ASSERT(parent != NULL); 1719 1.1 haad parent_held = TRUE; 1720 1.1 haad } 1721 1.1 haad if (drop_struct_lock) 1722 1.1 haad rw_exit(&dn->dn_struct_rwlock); 1723 1.1 haad ASSERT3U(db->db_level+1, ==, parent->db_level); 1724 1.1 haad di = dbuf_dirty(parent, tx); 1725 1.1 haad if (parent_held) 1726 1.1 haad dbuf_rele(parent, FTAG); 1727 1.1 haad 1728 1.1 haad mutex_enter(&db->db_mtx); 1729 1.6 chs /* 1730 1.6 chs * Since we've dropped the mutex, it's possible that 1731 1.6 chs * dbuf_undirty() might have changed this out from under us. 
1732 1.6 chs */ 1733 1.1 haad if (db->db_last_dirty == dr || 1734 1.1 haad dn->dn_object == DMU_META_DNODE_OBJECT) { 1735 1.1 haad mutex_enter(&di->dt.di.dr_mtx); 1736 1.1 haad ASSERT3U(di->dr_txg, ==, tx->tx_txg); 1737 1.1 haad ASSERT(!list_link_active(&dr->dr_dirty_node)); 1738 1.1 haad list_insert_tail(&di->dt.di.dr_children, dr); 1739 1.1 haad mutex_exit(&di->dt.di.dr_mtx); 1740 1.1 haad dr->dr_parent = di; 1741 1.1 haad } 1742 1.1 haad mutex_exit(&db->db_mtx); 1743 1.1 haad } else { 1744 1.1 haad ASSERT(db->db_level+1 == dn->dn_nlevels); 1745 1.1 haad ASSERT(db->db_blkid < dn->dn_nblkptr); 1746 1.6 chs ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); 1747 1.1 haad mutex_enter(&dn->dn_mtx); 1748 1.1 haad ASSERT(!list_link_active(&dr->dr_dirty_node)); 1749 1.1 haad list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1750 1.1 haad mutex_exit(&dn->dn_mtx); 1751 1.1 haad if (drop_struct_lock) 1752 1.1 haad rw_exit(&dn->dn_struct_rwlock); 1753 1.1 haad } 1754 1.1 haad 1755 1.1 haad dnode_setdirty(dn, tx); 1756 1.6 chs DB_DNODE_EXIT(db); 1757 1.1 haad return (dr); 1758 1.1 haad } 1759 1.1 haad 1760 1.6 chs /* 1761 1.6 chs * Undirty a buffer in the transaction group referenced by the given 1762 1.6 chs * transaction. Return whether this evicted the dbuf. * * The caller must hold db->db_mtx, and db must be a level-0, non-bonus * dbuf (both are asserted below). B_FALSE is also returned when db has * no dirty record for this txg. B_TRUE means the last hold was dropped * and dbuf_destroy() was called, which releases db_mtx and frees db, so * db must not be touched afterwards. 1763 1.6 chs */ 1764 1.6 chs static boolean_t 1765 1.1 haad dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1766 1.1 haad { 1767 1.6 chs dnode_t *dn; 1768 1.1 haad uint64_t txg = tx->tx_txg; 1769 1.1 haad dbuf_dirty_record_t *dr, **drp; 1770 1.1 haad 1771 1.1 haad ASSERT(txg != 0); 1772 1.1 haad 1773 1.6 chs /* 1774 1.6 chs * Due to our use of dn_nlevels below, this can only be called 1775 1.6 chs * in open context, unless we are operating on the MOS. 1776 1.6 chs * From syncing context, dn_nlevels may be different from the 1777 1.6 chs * dn_nlevels used when dbuf was dirtied.
1778 1.6 chs */ 1779 1.6 chs ASSERT(db->db_objset == 1780 1.6 chs dmu_objset_pool(db->db_objset)->dp_meta_objset || 1781 1.6 chs txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); 1782 1.6 chs ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1783 1.6 chs ASSERT0(db->db_level); 1784 1.6 chs ASSERT(MUTEX_HELD(&db->db_mtx)); 1785 1.6 chs 1786 1.1 haad /* 1787 1.1 haad * If this buffer is not dirty, we're done. * Walk the dirty-record chain until the first record whose txg is * not newer than ours; drp is left pointing at the link that refers * to that record so it can be unlinked below. 1788 1.1 haad */ 1789 1.1 haad for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1790 1.1 haad if (dr->dr_txg <= txg) 1791 1.1 haad break; 1792 1.6 chs if (dr == NULL || dr->dr_txg < txg) 1793 1.6 chs return (B_FALSE); 1794 1.1 haad ASSERT(dr->dr_txg == txg); 1795 1.4 haad ASSERT(dr->dr_dbuf == db); 1796 1.1 haad 1797 1.6 chs DB_DNODE_ENTER(db); 1798 1.6 chs dn = DB_DNODE(db); 1799 1.1 haad 1800 1.1 haad dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1801 1.1 haad 1802 1.1 haad ASSERT(db->db.db_size != 0); 1803 1.1 haad 1804 1.6 chs /* Give back the space that was accounted to this dirty record. */ dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), 1805 1.6 chs dr->dr_accounted, txg); 1806 1.1 haad 1807 1.1 haad /* Unlink dr from the dirty-record chain of db. */ *drp = dr->dr_next; 1808 1.1 haad 1809 1.6 chs /* 1810 1.6 chs * Note that there are three places in dbuf_dirty() 1811 1.6 chs * where this dirty record may be put on a list. 1812 1.6 chs * Make sure to do a list_remove corresponding to 1813 1.6 chs * every one of those list_insert calls.
1814 1.6 chs */ 1815 1.1 haad if (dr->dr_parent) { 1816 1.1 haad mutex_enter(&dr->dr_parent->dt.di.dr_mtx); 1817 1.1 haad list_remove(&dr->dr_parent->dt.di.dr_children, dr); 1818 1.1 haad mutex_exit(&dr->dr_parent->dt.di.dr_mtx); 1819 1.6 chs } else if (db->db_blkid == DMU_SPILL_BLKID || 1820 1.6 chs db->db_level + 1 == dn->dn_nlevels) { 1821 1.1 haad ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); 1822 1.1 haad mutex_enter(&dn->dn_mtx); 1823 1.1 haad list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); 1824 1.1 haad mutex_exit(&dn->dn_mtx); 1825 1.1 haad } 1826 1.6 chs DB_DNODE_EXIT(db); 1827 1.1 haad 1828 1.6 chs if (db->db_state != DB_NOFILL) { 1829 1.6 chs dbuf_unoverride(dr); 1830 1.1 haad 1831 1.1 haad ASSERT(db->db_buf != NULL); 1832 1.6 chs ASSERT(dr->dt.dl.dr_data != NULL); 1833 1.6 chs /* Destroy the data held by the dirty record unless it is the buffer that db itself still points at. */ if (dr->dt.dl.dr_data != db->db_buf) 1834 1.6 chs arc_buf_destroy(dr->dt.dl.dr_data, db); 1835 1.1 haad } 1836 1.6 chs 1837 1.1 haad kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1838 1.1 haad 1839 1.1 haad ASSERT(db->db_dirtycnt > 0); 1840 1.1 haad db->db_dirtycnt -= 1; 1841 1.1 haad 1842 1.1 haad /* Drop the hold that dbuf_dirty() added with the txg as its tag; if no holds remain the dbuf is destroyed. */ if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { 1843 1.6 chs ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); 1844 1.6 chs dbuf_destroy(db); 1845 1.6 chs return (B_TRUE); 1846 1.1 haad } 1847 1.1 haad 1848 1.6 chs return (B_FALSE); 1849 1.1 haad } 1850 1.1 haad 1851 1.6 chs /* Prepare a held dbuf for modification in tx. If the dbuf already has a dirty record for this txg and is in DB_CACHED, the record is simply redirtied. Otherwise the dbuf is read in with DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH and then dirtied through dbuf_dirty(). */ void 1852 1.5 joerg dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) 1853 1.5 joerg { 1854 1.5 joerg dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1855 1.1 haad int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; 1856 1.1 haad 1857 1.1 haad ASSERT(tx->tx_txg != 0); 1858 1.1 haad ASSERT(!refcount_is_zero(&db->db_holds)); 1859 1.1 haad 1860 1.6 chs /* 1861 1.6 chs * Quick check for dirtyness. For already dirty blocks, this 1862 1.6 chs * reduces runtime of this function by >90%, and overall performance 1863 1.6 chs * by 50% for some workloads (e.g.
file deletion with indirect blocks 1864 1.6 chs * cached). 1865 1.6 chs */ 1866 1.6 chs mutex_enter(&db->db_mtx); 1867 1.6 chs dbuf_dirty_record_t *dr; 1868 1.6 chs /* Only records with a txg at or after ours are examined. */ for (dr = db->db_last_dirty; 1869 1.6 chs dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { 1870 1.6 chs /* 1871 1.6 chs * It's possible that it is already dirty but not cached, 1872 1.6 chs * because there are some calls to dbuf_dirty() that don't 1873 1.6 chs * go through dmu_buf_will_dirty(). 1874 1.6 chs */ 1875 1.6 chs if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) { 1876 1.6 chs /* This dbuf is already dirty and cached. */ 1877 1.6 chs dbuf_redirty(dr); 1878 1.6 chs mutex_exit(&db->db_mtx); 1879 1.6 chs return; 1880 1.6 chs } 1881 1.6 chs } 1882 1.6 chs mutex_exit(&db->db_mtx); 1883 1.6 chs 1884 1.6 chs /* Slow path: pass DB_RF_HAVESTRUCT to dbuf_read() when dn_struct_rwlock is already write-held, then read and dirty. */ DB_DNODE_ENTER(db); 1885 1.6 chs if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) 1886 1.1 haad rf |= DB_RF_HAVESTRUCT; 1887 1.6 chs DB_DNODE_EXIT(db); 1888 1.1 haad (void) dbuf_read(db, NULL, rf); 1889 1.1 haad (void) dbuf_dirty(db, tx); 1890 1.1 haad } 1891 1.1 haad 1892 1.1 haad /* Set the dbuf state to DB_NOFILL and then dirty it via dmu_buf_will_fill(). NOTE(review): db_state is written here without taking db_mtx -- confirm that callers make this safe. */ void 1893 1.1 haad dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1894 1.1 haad { 1895 1.1 haad dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1896 1.1 haad 1897 1.1 haad db->db_state = DB_NOFILL; 1898 1.1 haad 1899 1.1 haad dmu_buf_will_fill(db_fake, tx); 1900 1.1 haad } 1901 1.1 haad 1902 1.1 haad /* Dirty a held, level-0, non-bonus dbuf in tx by calling dbuf_noread() followed by dbuf_dirty(). For the meta-dnode object dmu_tx_private_ok(tx) must also hold. All of these preconditions are checked by assertions only. */ void 1903 1.1 haad dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1904 1.1 haad { 1905 1.1 haad dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1906 1.1 haad 1907 1.6 chs ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1908 1.1 haad ASSERT(tx->tx_txg != 0); 1909 1.1 haad ASSERT(db->db_level == 0); 1910 1.1 haad ASSERT(!refcount_is_zero(&db->db_holds)); 1911 1.1 haad 1912 1.1 haad ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || 1913 1.1 haad dmu_tx_private_ok(tx)); 1914 1.1 haad 1915 1.1 haad dbuf_noread(db); 1916 1.1 haad (void) dbuf_dirty(db, tx); 1917 1.1 haad } 1918 1.1 haad 1919 1.6
chs #pragma weak dmu_buf_fill_done = dbuf_fill_done 1920 1.1 haad /* ARGSUSED */ /* Complete a fill on db. Under db_mtx, a dbuf in DB_FILL moves to DB_CACHED and waiters on db_changed are woken; in any other state nothing is changed. tx is not used. The pragma above makes dmu_buf_fill_done a weak alias of this function. */ 1921 1.1 haad void 1922 1.1 haad dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1923 1.1 haad { 1924 1.1 haad mutex_enter(&db->db_mtx); 1925 1.1 haad DBUF_VERIFY(db); 1926 1.1 haad 1927 1.1 haad if (db->db_state == DB_FILL) { 1928 1.1 haad if (db->db_level == 0 && db->db_freed_in_flight) { 1929 1.6 chs ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1930 1.1 haad /* we were freed while filling */ 1931 1.1 haad /* XXX dbuf_undirty? */ 1932 1.1 haad /* The freed block ends up cached as all zeroes. */ bzero(db->db.db_data, db->db.db_size); 1933 1.1 haad db->db_freed_in_flight = FALSE; 1934 1.1 haad } 1935 1.1 haad db->db_state = DB_CACHED; 1936 1.1 haad cv_broadcast(&db->db_changed); 1937 1.1 haad } 1938 1.1 haad mutex_exit(&db->db_mtx); 1939 1.1 haad } 1940 1.1 haad 1941 1.6 chs /* Dirty dbuf in tx through dmu_buf_will_not_fill() and fill in the override block pointer of its newest dirty record as an embedded block pointer of type etype, encoded from data with the given compression and sizes. Only level-0, non-bonus dbufs are accepted (asserted). */ void 1942 1.6 chs dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, 1943 1.6 chs bp_embedded_type_t etype, enum zio_compress comp, 1944 1.6 chs int uncompressed_size, int compressed_size, int byteorder, 1945 1.6 chs dmu_tx_t *tx) 1946 1.6 chs { 1947 1.6 chs dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 1948 1.6 chs struct dirty_leaf *dl; 1949 1.6 chs dmu_object_type_t type; 1950 1.6 chs 1951 1.6 chs if (etype == BP_EMBEDDED_TYPE_DATA) { 1952 1.6 chs ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), 1953 1.6 chs SPA_FEATURE_EMBEDDED_DATA)); 1954 1.6 chs } 1955 1.6 chs 1956 1.6 chs /* Fetch the dnode object type; it is stored in the block pointer below. */ DB_DNODE_ENTER(db); 1957 1.6 chs type = DB_DNODE(db)->dn_type; 1958 1.6 chs DB_DNODE_EXIT(db); 1959 1.6 chs 1960 1.6 chs ASSERT0(db->db_level); 1961 1.6 chs ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1962 1.6 chs 1963 1.6 chs dmu_buf_will_not_fill(dbuf, tx); 1964 1.6 chs 1965 1.6 chs /* The call above must have left a dirty record for this txg at the head of the chain. */ ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 1966 1.6 chs dl = &db->db_last_dirty->dt.dl; 1967 1.6 chs encode_embedded_bp_compressed(&dl->dr_overridden_by, 1968 1.6 chs data, comp, uncompressed_size, compressed_size); 1969 1.6 chs BPE_SET_ETYPE(&dl->dr_overridden_by, etype); 1970 1.6 chs
BP_SET_TYPE(&dl->dr_overridden_by, type); 1971 1.6 chs BP_SET_LEVEL(&dl->dr_overridden_by, 0); 1972 1.6 chs BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); 1973 1.6 chs 1974 1.6 chs /* Mark the record as overridden and stamp the block pointer with the txg of the dirty record. */ dl->dr_override_state = DR_OVERRIDDEN; 1975 1.6 chs dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; 1976 1.6 chs } 1977 1.6 chs 1978 1.1 haad /* 1979 1.4 haad * Directly assign a provided arc buf to a given dbuf if it's not referenced 1980 1.4 haad * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. * * In both cases the dbuf is dirtied in tx. In the copy case the provided * buf is destroyed after the copy; in the assign case buf becomes the data * of the dbuf and the dbuf goes through DB_FILL and dmu_buf_fill_done(). * buf must be non-NULL and exactly db.db_size bytes, and db must be a held, * level-0, non-bonus dbuf (all asserted). 1981 1.4 haad */ 1982 1.4 haad void 1983 1.4 haad dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) 1984 1.4 haad { 1985 1.4 haad ASSERT(!refcount_is_zero(&db->db_holds)); 1986 1.6 chs ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1987 1.4 haad ASSERT(db->db_level == 0); 1988 1.4 haad ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); 1989 1.4 haad ASSERT(buf != NULL); 1990 1.4 haad ASSERT(arc_buf_size(buf) == db->db.db_size); 1991 1.4 haad ASSERT(tx->tx_txg != 0); 1992 1.4 haad 1993 1.4 haad arc_return_buf(buf, db); 1994 1.4 haad ASSERT(arc_released(buf)); 1995 1.4 haad 1996 1.4 haad mutex_enter(&db->db_mtx); 1997 1.4 haad 1998 1.4 haad /* Wait for any in-progress read or fill of this dbuf to finish. */ while (db->db_state == DB_READ || db->db_state == DB_FILL) 1999 1.4 haad cv_wait(&db->db_changed, &db->db_mtx); 2000 1.4 haad 2001 1.4 haad ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); 2002 1.4 haad 2003 1.4 haad /* Copy path: the dbuf is cached and has more holds than one (presumably the hold of our caller) plus its dirty count. */ if (db->db_state == DB_CACHED && 2004 1.4 haad refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { 2005 1.4 haad mutex_exit(&db->db_mtx); 2006 1.4 haad (void) dbuf_dirty(db, tx); 2007 1.4 haad bcopy(buf->b_data, db->db.db_data, db->db.db_size); 2008 1.6 chs arc_buf_destroy(buf, db); 2009 1.4 haad xuio_stat_wbuf_copied(); 2010 1.4 haad return; 2011 1.4 haad } 2012 1.4 haad 2013 1.4 haad xuio_stat_wbuf_nocopy(); 2014 1.4 haad /* Assign path: first detach the buffer the dbuf currently has, if any. */ if (db->db_state == DB_CACHED) { 2015 1.4 haad dbuf_dirty_record_t *dr = db->db_last_dirty; 2016 1.4 haad 2017 1.4 haad ASSERT(db->db_buf != NULL); 2018
1.4 haad if (dr != NULL && dr->dr_txg == tx->tx_txg) { 2019 1.4 haad /* Already dirty in this txg: point the dirty record at the new buf and destroy the old one. */ ASSERT(dr->dt.dl.dr_data == db->db_buf); 2020 1.4 haad if (!arc_released(db->db_buf)) { 2021 1.4 haad ASSERT(dr->dt.dl.dr_override_state == 2022 1.4 haad DR_OVERRIDDEN); 2023 1.4 haad arc_release(db->db_buf, db); 2024 1.4 haad } 2025 1.4 haad dr->dt.dl.dr_data = buf; 2026 1.6 chs arc_buf_destroy(db->db_buf, db); 2027 1.4 haad } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { 2028 1.4 haad /* No dirty record refers to the current buffer, so it can be released and destroyed. */ arc_release(db->db_buf, db); 2029 1.6 chs arc_buf_destroy(db->db_buf, db); 2030 1.4 haad } 2031 1.4 haad db->db_buf = NULL; 2032 1.4 haad } 2033 1.4 haad ASSERT(db->db_buf == NULL); 2034 1.4 haad /* Install buf and run the usual fill sequence: DB_FILL, dirty, fill done. */ dbuf_set_data(db, buf); 2035 1.4 haad db->db_state = DB_FILL; 2036 1.4 haad mutex_exit(&db->db_mtx); 2037 1.4 haad (void) dbuf_dirty(db, tx); 2038 1.6 chs dmu_buf_fill_done(&db->db, tx); 2039 1.4 haad } 2040 1.4 haad 2041 1.1 haad /* Tear down a dbuf that has no holds left. The caller must hold db_mtx (asserted); the mutex is dropped inside this function and db is returned to dbuf_kmem_cache before it returns, so db must not be used afterwards. */ void 2042 1.6 chs dbuf_destroy(dmu_buf_impl_t *db) 2043 1.1 haad { 2044 1.6 chs dnode_t *dn; 2045 1.1 haad dmu_buf_impl_t *parent = db->db_parent; 2046 1.6 chs dmu_buf_impl_t *dndb; 2047 1.1 haad 2048 1.1 haad ASSERT(MUTEX_HELD(&db->db_mtx)); 2049 1.1 haad ASSERT(refcount_is_zero(&db->db_holds)); 2050 1.1 haad 2051 1.6 chs if (db->db_buf != NULL) { 2052 1.6 chs arc_buf_destroy(db->db_buf, db); 2053 1.6 chs db->db_buf = NULL; 2054 1.6 chs } 2055 1.1 haad 2056 1.6 chs /* Bonus data is freed as a DN_MAX_BONUSLEN zio buffer rather than an arc buf. */ if (db->db_blkid == DMU_BONUS_BLKID) { 2057 1.1 haad ASSERT(db->db.db_data != NULL); 2058 1.6 chs zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 2059 1.6 chs arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 2060 1.1 haad db->db_state = DB_UNCACHED; 2061 1.1 haad } 2062 1.1 haad 2063 1.6 chs dbuf_clear_data(db); 2064 1.6 chs 2065 1.6 chs /* If the dbuf is on the dbuf cache list, remove it and subtract its size from dbuf_cache_size. */ if (multilist_link_active(&db->db_cache_link)) { 2066 1.6 chs multilist_remove(&dbuf_cache, db); 2067 1.6 chs (void) refcount_remove_many(&dbuf_cache_size, 2068 1.6 chs db->db.db_size, db); 2069 1.6 chs } 2070 1.6 chs 2071 1.1 haad ASSERT(db->db_state == DB_UNCACHED || db->db_state ==
DB_NOFILL); 2072 1.1 haad ASSERT(db->db_data_pending == NULL); 2073 1.1 haad 2074 1.1 haad db->db_state = DB_EVICTING; 2075 1.1 haad db->db_blkptr = NULL; 2076 1.1 haad 2077 1.6 chs /* 2078 1.6 chs * Now that db_state is DB_EVICTING, nobody else can find this via 2079 1.6 chs * the hash table. We can now drop db_mtx, which allows us to 2080 1.6 chs * acquire the dn_dbufs_mtx. 2081 1.6 chs */ 2082 1.6 chs mutex_exit(&db->db_mtx); 2083 1.6 chs 2084 1.6 chs DB_DNODE_ENTER(db); 2085 1.6 chs dn = DB_DNODE(db); 2086 1.6 chs /* Remember the dnode dbuf now; it is compared against parent at the end, after dn may no longer be held. */ dndb = dn->dn_dbuf; 2087 1.6 chs if (db->db_blkid != DMU_BONUS_BLKID) { 2088 1.6 chs /* Take dn_dbufs_mtx only if this thread does not already hold it. */ boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); 2089 1.6 chs if (needlock) 2090 1.6 chs mutex_enter(&dn->dn_dbufs_mtx); 2091 1.6 chs avl_remove(&dn->dn_dbufs, db); 2092 1.6 chs atomic_dec_32(&dn->dn_dbufs_count); 2093 1.6 chs membar_producer(); 2094 1.6 chs DB_DNODE_EXIT(db); 2095 1.6 chs if (needlock) 2096 1.6 chs mutex_exit(&dn->dn_dbufs_mtx); 2097 1.6 chs /* 2098 1.6 chs * Decrementing the dbuf count means that the hold corresponding 2099 1.6 chs * to the removed dbuf is no longer discounted in dnode_move(), 2100 1.6 chs * so the dnode cannot be moved until after we release the hold. 2101 1.6 chs * The membar_producer() ensures visibility of the decremented 2102 1.6 chs * value in dnode_move(), since DB_DNODE_EXIT doesn't actually 2103 1.6 chs * release any lock.
2104 1.6 chs */ 2105 1.1 haad /* Release the dnode hold that dbuf_create() added with db as its tag. */ dnode_rele(dn, db); 2106 1.6 chs db->db_dnode_handle = NULL; 2107 1.6 chs 2108 1.6 chs dbuf_hash_remove(db); 2109 1.6 chs } else { 2110 1.6 chs /* Bonus dbufs are neither on dn_dbufs nor in the hash table, so there is nothing to remove. */ DB_DNODE_EXIT(db); 2111 1.1 haad } 2112 1.1 haad 2113 1.6 chs ASSERT(refcount_is_zero(&db->db_holds)); 2114 1.1 haad 2115 1.6 chs db->db_parent = NULL; 2116 1.6 chs 2117 1.6 chs ASSERT(db->db_buf == NULL); 2118 1.6 chs ASSERT(db->db.db_data == NULL); 2119 1.6 chs ASSERT(db->db_hash_next == NULL); 2120 1.6 chs ASSERT(db->db_blkptr == NULL); 2121 1.6 chs ASSERT(db->db_data_pending == NULL); 2122 1.6 chs ASSERT(!multilist_link_active(&db->db_cache_link)); 2123 1.6 chs 2124 1.6 chs kmem_cache_free(dbuf_kmem_cache, db); 2125 1.6 chs arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 2126 1.1 haad 2127 1.1 haad /* 2128 1.6 chs * If this dbuf is referenced from an indirect dbuf, 2129 1.1 haad * decrement the ref count on the indirect dbuf. * db itself was freed just above; it is passed here only as the * tag of the hold being released and is not dereferenced by this * function. 2130 1.1 haad */ 2131 1.1 haad if (parent && parent != dndb) 2132 1.1 haad dbuf_rele(parent, db); 2133 1.1 haad } 2134 1.1 haad 2135 1.6 chs /* 2136 1.6 chs * Note: While bpp will always be updated if the function returns success, 2137 1.6 chs * parentp will not be updated if the dnode does not have dn_dbuf filled in; 2138 1.6 chs * this happens when the dnode is the meta-dnode, or a userused or groupused 2139 1.6 chs * object.
2140 1.6 chs */ /* Find the block pointer for block blkid at the given level of dn. Returns 0 on success, ENOENT when the block is not covered by the on-disk dnode (level or blkid out of range), or the error from holding or reading the parent indirect block. Both outputs are reset to NULL on entry. On success *bpp points into the dnode or into the parent indirect block data; for the spill block *bpp may still be NULL. When *parentp is set, a hold with a NULL tag has been added to it for the caller. */ 2141 1.1 haad static int 2142 1.1 haad dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 2143 1.1 haad dmu_buf_impl_t **parentp, blkptr_t **bpp) 2144 1.1 haad { 2145 1.1 haad int nlevels, epbs; 2146 1.1 haad 2147 1.1 haad *parentp = NULL; 2148 1.1 haad *bpp = NULL; 2149 1.1 haad 2150 1.6 chs ASSERT(blkid != DMU_BONUS_BLKID); 2151 1.6 chs 2152 1.6 chs /* The spill block pointer lives in the dnode itself, so the dnode dbuf is the parent. */ if (blkid == DMU_SPILL_BLKID) { 2153 1.6 chs mutex_enter(&dn->dn_mtx); 2154 1.6 chs if (dn->dn_have_spill && 2155 1.6 chs (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) 2156 1.6 chs *bpp = &dn->dn_phys->dn_spill; 2157 1.6 chs else 2158 1.6 chs *bpp = NULL; 2159 1.6 chs dbuf_add_ref(dn->dn_dbuf, NULL); 2160 1.6 chs *parentp = dn->dn_dbuf; 2161 1.6 chs mutex_exit(&dn->dn_mtx); 2162 1.6 chs return (0); 2163 1.6 chs } 2164 1.1 haad 2165 1.1 haad /* An on-disk dn_nlevels of 0 is treated as a single level. */ if (dn->dn_phys->dn_nlevels == 0) 2166 1.1 haad nlevels = 1; 2167 1.1 haad else 2168 1.1 haad nlevels = dn->dn_phys->dn_nlevels; 2169 1.1 haad 2170 1.1 haad /* epbs: log2 of the number of block pointers per indirect block. */ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 2171 1.1 haad 2172 1.1 haad ASSERT3U(level * epbs, <, 64); 2173 1.1 haad ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2174 1.1 haad if (level >= nlevels || 2175 1.1 haad (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 2176 1.1 haad /* the buffer has no parent yet */ 2177 1.6 chs return (SET_ERROR(ENOENT)); 2178 1.1 haad } else if (level < nlevels-1) { 2179 1.1 haad /* this block is referenced from an indirect block */ 2180 1.1 haad int err = dbuf_hold_impl(dn, level+1, 2181 1.6 chs blkid >> epbs, fail_sparse, FALSE, NULL, parentp); 2182 1.1 haad if (err) 2183 1.1 haad return (err); 2184 1.1 haad err = dbuf_read(*parentp, NULL, 2185 1.1 haad (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); 2186 1.1 haad if (err) { 2187 1.1 haad /* Drop the hold taken by dbuf_hold_impl() above before failing. */ dbuf_rele(*parentp, NULL); 2188 1.1 haad *parentp = NULL; 2189 1.1 haad return (err); 2190 1.1 haad } 2191 1.1 haad /* Index into the parent with the low epbs bits of blkid. */ *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 2192 1.1 haad (blkid & ((1ULL << epbs) -
1)); 2193 1.1 haad return (0); 2194 1.1 haad } else { 2195 1.1 haad /* the block is referenced from the dnode */ 2196 1.1 haad ASSERT3U(level, ==, nlevels-1); 2197 1.1 haad ASSERT(dn->dn_phys->dn_nblkptr == 0 || 2198 1.1 haad blkid < dn->dn_phys->dn_nblkptr); 2199 1.1 haad if (dn->dn_dbuf) { 2200 1.1 haad dbuf_add_ref(dn->dn_dbuf, NULL); 2201 1.1 haad *parentp = dn->dn_dbuf; 2202 1.1 haad } 2203 1.1 haad *bpp = &dn->dn_phys->dn_blkptr[blkid]; 2204 1.1 haad return (0); 2205 1.1 haad } 2206 1.1 haad } 2207 1.1 haad 2208 1.1 haad /* Allocate and initialize a dbuf for block blkid at the given level of dn, with the given parent dbuf and block pointer. The caller must hold dn_struct_rwlock (asserted). A bonus dbuf is returned right after initialization and is not entered in the hash table. For any other dbuf, if an equal dbuf is already in the hash table the new allocation is freed and the existing dbuf is returned instead. */ static dmu_buf_impl_t * 2209 1.1 haad dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 2210 1.1 haad dmu_buf_impl_t *parent, blkptr_t *blkptr) 2211 1.1 haad { 2212 1.4 haad objset_t *os = dn->dn_objset; 2213 1.1 haad dmu_buf_impl_t *db, *odb; 2214 1.1 haad 2215 1.1 haad ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2216 1.1 haad ASSERT(dn->dn_type != DMU_OT_NONE); 2217 1.1 haad 2218 1.6 chs db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); 2219 1.1 haad 2220 1.1 haad db->db_objset = os; 2221 1.1 haad db->db.db_object = dn->dn_object; 2222 1.1 haad db->db_level = level; 2223 1.1 haad db->db_blkid = blkid; 2224 1.1 haad db->db_last_dirty = NULL; 2225 1.1 haad db->db_dirtycnt = 0; 2226 1.6 chs db->db_dnode_handle = dn->dn_handle; 2227 1.1 haad db->db_parent = parent; 2228 1.1 haad db->db_blkptr = blkptr; 2229 1.1 haad 2230 1.6 chs db->db_user = NULL; 2231 1.6 chs db->db_user_immediate_evict = FALSE; 2232 1.6 chs db->db_freed_in_flight = FALSE; 2233 1.6 chs db->db_pending_evict = FALSE; 2234 1.1 haad 2235 1.6 chs if (blkid == DMU_BONUS_BLKID) { 2236 1.1 haad ASSERT3P(parent, ==, dn->dn_dbuf); 2237 1.1 haad /* Each block pointer beyond the first takes sizeof (blkptr_t) away from the maximum bonus length. */ db->db.db_size = DN_MAX_BONUSLEN - 2238 1.1 haad (dn->dn_nblkptr-1) * sizeof (blkptr_t); 2239 1.1 haad ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 2240 1.6 chs db->db.db_offset = DMU_BONUS_BLKID; 2241 1.1 haad db->db_state = DB_UNCACHED; 2242 1.1 haad /* the bonus dbuf is not placed in the hash table */ 2243 1.4 haad
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 2244 1.1 haad return (db); 2245 1.6 chs } else if (blkid == DMU_SPILL_BLKID) { 2246 1.6 chs /* Spill block: size comes from the block pointer when there is one, otherwise SPA_MINBLOCKSIZE. */ db->db.db_size = (blkptr != NULL) ? 2247 1.6 chs BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; 2248 1.6 chs db->db.db_offset = 0; 2249 1.1 haad } else { 2250 1.1 haad /* Indirect levels use the indirect block size, level 0 uses the data block size. */ int blocksize = 2251 1.6 chs db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; 2252 1.1 haad db->db.db_size = blocksize; 2253 1.1 haad db->db.db_offset = db->db_blkid * blocksize; 2254 1.1 haad } 2255 1.1 haad 2256 1.1 haad /* 2257 1.1 haad * Hold the dn_dbufs_mtx while we get the new dbuf 2258 1.1 haad * in the hash table *and* added to the dbufs list. 2259 1.1 haad * This prevents a possible deadlock with someone 2260 1.1 haad * trying to look up this dbuf before it is added to the 2261 1.1 haad * dn_dbufs list. 2262 1.1 haad */ 2263 1.1 haad mutex_enter(&dn->dn_dbufs_mtx); 2264 1.1 haad /* The dbuf stays in DB_EVICTING until it is on dn_dbufs as well; presumably this keeps hash lookups from returning it early -- TODO confirm against dbuf_find(). */ db->db_state = DB_EVICTING; 2265 1.1 haad if ((odb = dbuf_hash_insert(db)) != NULL) { 2266 1.1 haad /* someone else inserted it first */ 2267 1.6 chs kmem_cache_free(dbuf_kmem_cache, db); 2268 1.1 haad mutex_exit(&dn->dn_dbufs_mtx); 2269 1.1 haad return (odb); 2270 1.1 haad } 2271 1.6 chs avl_add(&dn->dn_dbufs, db); 2272 1.6 chs 2273 1.1 haad db->db_state = DB_UNCACHED; 2274 1.1 haad mutex_exit(&dn->dn_dbufs_mtx); 2275 1.4 haad arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 2276 1.1 haad 2277 1.1 haad /* A parent other than the dnode dbuf gets a hold tagged with db; dbuf_destroy() releases it. */ if (parent && parent != dn->dn_dbuf) 2278 1.1 haad dbuf_add_ref(parent, db); 2279 1.1 haad 2280 1.1 haad ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 2281 1.1 haad refcount_count(&dn->dn_holds) > 0); 2282 1.1 haad /* The dbuf holds its dnode, tagged with db; dbuf_destroy() drops this hold with dnode_rele(). */ (void) refcount_add(&dn->dn_holds, db); 2283 1.6 chs atomic_inc_32(&dn->dn_dbufs_count); 2284 1.1 haad 2285 1.1 haad dprintf_dbuf(db, "db=%p\n", db); 2286 1.1 haad 2287 1.1 haad return (db); 2288 1.1 haad } 2289 1.1 haad 2290 1.6 chs /* State handed from dbuf_prefetch() to dbuf_prefetch_indirect_done() as the private argument of each arc_read() in the prefetch chain. */ typedef struct dbuf_prefetch_arg { 2291 1.6 chs spa_t *dpa_spa; /* The spa to issue the prefetch in.
*/ 2292 1.6 chs zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ 2293 1.6 chs int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ 2294 1.6 chs int dpa_curlevel; /* The current level that we're reading */ 2295 1.6 chs dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ /* dpa_dnode is reset to NULL by dbuf_prefetch_indirect_done() once a callback arrives with a non-NULL zio. */ 2296 1.6 chs zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ 2297 1.6 chs zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ 2298 1.6 chs arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ 2299 1.6 chs } dbuf_prefetch_arg_t; 2300 1.6 chs 2301 1.6 chs /* 2302 1.6 chs * Actually issue the prefetch read for the block given. * * Nothing is issued when bp is a hole or an embedded block pointer. * Otherwise bp is read with arc_read() under dpa_zio, with no done * callback, at priority dpa_prio, using the caller-supplied dpa_aflags * plus ARC_FLAG_NOWAIT and ARC_FLAG_PREFETCH. dpa is not freed here. 2303 1.6 chs */ 2304 1.6 chs static void 2305 1.6 chs dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) 2306 1.1 haad { 2307 1.6 chs if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) 2308 1.6 chs return; 2309 1.1 haad 2310 1.6 chs arc_flags_t aflags = 2311 1.6 chs dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; 2312 1.1 haad 2313 1.6 chs /* By now the chain must have descended to the level of the target block. */ ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); 2314 1.6 chs ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); 2315 1.6 chs ASSERT(dpa->dpa_zio != NULL); 2316 1.6 chs (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, 2317 1.6 chs dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2318 1.6 chs &aflags, &dpa->dpa_zb); 2319 1.1 haad } 2320 1.1 haad 2321 1.6 chs /* 2322 1.6 chs * Called when an indirect block above our prefetch target is read in. This 2323 1.6 chs * will either read in the next indirect block down the tree or issue the actual 2324 1.6 chs * prefetch if the next block down is our target.
2325 1.6 chs */ /* arc_read() done callback: zio is the physical read, or NULL when none was issued; abuf holds the indirect block just read; private is the dbuf_prefetch_arg_t for this chain. dpa is freed here on every path except the one that passes it on to the next arc_read(). abuf is always destroyed before returning. */ 2326 1.1 haad static void 2327 1.6 chs dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) 2328 1.1 haad { 2329 1.6 chs dbuf_prefetch_arg_t *dpa = private; 2330 1.1 haad 2331 1.6 chs ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); 2332 1.6 chs ASSERT3S(dpa->dpa_curlevel, >, 0); 2333 1.1 haad 2334 1.6 chs /* 2335 1.6 chs * The dpa_dnode is only valid if we are called with a NULL 2336 1.6 chs * zio. This indicates that the arc_read() returned without 2337 1.6 chs * first calling zio_read() to issue a physical read. Once 2338 1.6 chs * a physical read is made the dpa_dnode must be invalidated 2339 1.6 chs * as the locks guarding it may have been dropped. If the 2340 1.6 chs * dpa_dnode is still valid, then we want to add it to the dbuf 2341 1.6 chs * cache. To do so, we must hold the dbuf associated with the block 2342 1.6 chs * we just prefetched, read its contents so that we associate it 2343 1.6 chs * with an arc_buf_t, and then release it. 2344 1.6 chs */ 2345 1.6 chs if (zio != NULL) { 2346 1.6 chs ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); 2347 1.6 chs if (zio->io_flags & ZIO_FLAG_RAW) { 2348 1.6 chs ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); 2349 1.6 chs } else { 2350 1.6 chs ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); 2351 1.6 chs } 2352 1.6 chs ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); 2353 1.1 haad 2354 1.6 chs dpa->dpa_dnode = NULL; 2355 1.6 chs } else if (dpa->dpa_dnode != NULL) { 2356 1.6 chs /* Block id, at the current level, of the indirect block that was just read. */ uint64_t curblkid = dpa->dpa_zb.zb_blkid >> 2357 1.6 chs (dpa->dpa_epbs * (dpa->dpa_curlevel - 2358 1.6 chs dpa->dpa_zb.zb_level)); 2359 1.6 chs /* NOTE(review): the result of dbuf_hold_level() is passed to dbuf_read() and dbuf_rele() without a NULL check -- confirm it cannot fail on this path. */ dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, 2360 1.6 chs dpa->dpa_curlevel, curblkid, FTAG); 2361 1.6 chs (void) dbuf_read(db, NULL, 2362 1.6 chs DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); 2363 1.6 chs dbuf_rele(db, FTAG); 2364 1.6 chs } 2365 1.6 chs 2366 1.6 chs /* Step one level down toward the target block. */ dpa->dpa_curlevel--; 2367 1.6 chs 2368 1.6 chs uint64_t
nextblkid = dpa->dpa_zb.zb_blkid >> 2369 1.6 chs (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); 2370 1.6 chs /* Pick the block pointer for nextblkid out of the indirect block in abuf. */ blkptr_t *bp = ((blkptr_t *)abuf->b_data) + 2371 1.6 chs P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); 2372 1.6 chs /* NOTE(review): BP_IS_HOLE(bp) reads abuf before zio->io_error is tested -- confirm the buffer contents are meaningful when the read failed. */ if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) { 2373 1.6 chs /* Nothing further to prefetch: end the chain. */ kmem_free(dpa, sizeof (*dpa)); 2374 1.6 chs } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { 2375 1.6 chs /* The next block down is the target itself. */ ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); 2376 1.6 chs dbuf_issue_final_prefetch(dpa, bp); 2377 1.6 chs kmem_free(dpa, sizeof (*dpa)); 2378 1.6 chs } else { 2379 1.6 chs /* Another indirect level remains: read it with this function as the callback, passing dpa along. */ arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; 2380 1.6 chs zbookmark_phys_t zb; 2381 1.6 chs 2382 1.6 chs ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); 2383 1.6 chs 2384 1.6 chs SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, 2385 1.6 chs dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); 2386 1.6 chs 2387 1.6 chs (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, 2388 1.6 chs bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, 2389 1.6 chs ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2390 1.6 chs &iter_aflags, &zb); 2391 1.1 haad } 2392 1.1 haad 2393 1.6 chs /* NOTE(review): private may point at the dpa freed above; presumably it is only used as a tag here and never dereferenced -- confirm. */ arc_buf_destroy(abuf, private); 2394 1.1 haad } 2395 1.1 haad 2396 1.6 chs /* 2397 1.6 chs * Issue prefetch reads for the given block on the given level. If the indirect 2398 1.6 chs * blocks above that block are not in memory, we will read them in 2399 1.6 chs * asynchronously. As a result, this call never blocks waiting for a read to 2400 1.6 chs * complete.
2401 1.6 chs */ 2402 1.1 haad void 2403 1.6 chs dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, 2404 1.6 chs arc_flags_t aflags) 2405 1.1 haad { 2406 1.6 chs blkptr_t bp; 2407 1.6 chs int epbs, nlevels, curlevel; 2408 1.6 chs uint64_t curblkid; 2409 1.1 haad 2410 1.6 chs ASSERT(blkid != DMU_BONUS_BLKID); 2411 1.1 haad ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2412 1.1 haad 2413 1.6 chs if (blkid > dn->dn_maxblkid) 2414 1.6 chs return; 2415 1.6 chs 2416 1.1 haad if (dnode_block_freed(dn, blkid)) 2417 1.1 haad return; 2418 1.1 haad 2419 1.6 chs /* 2420 1.6 chs * This dnode hasn't been written to disk yet, so there's nothing to 2421 1.6 chs * prefetch. 2422 1.6 chs */ 2423 1.6 chs nlevels = dn->dn_phys->dn_nlevels; 2424 1.6 chs if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) 2425 1.6 chs return; 2426 1.6 chs 2427 1.6 chs epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2428 1.6 chs if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) 2429 1.6 chs return; 2430 1.6 chs 2431 1.6 chs dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, 2432 1.6 chs level, blkid); 2433 1.6 chs if (db != NULL) { 2434 1.6 chs mutex_exit(&db->db_mtx); 2435 1.6 chs /* 2436 1.6 chs * This dbuf already exists. It is either CACHED, or 2437 1.6 chs * (we assume) about to be read or filled. 2438 1.6 chs */ 2439 1.6 chs return; 2440 1.6 chs } 2441 1.6 chs 2442 1.6 chs /* 2443 1.6 chs * Find the closest ancestor (indirect block) of the target block 2444 1.6 chs * that is present in the cache. In this indirect block, we will 2445 1.6 chs * find the bp that is at curlevel, curblkid. 
2446 1.6 chs */ 2447 1.6 chs curlevel = level; 2448 1.6 chs curblkid = blkid; 2449 1.6 chs while (curlevel < nlevels - 1) { 2450 1.6 chs int parent_level = curlevel + 1; 2451 1.6 chs uint64_t parent_blkid = curblkid >> epbs; 2452 1.6 chs dmu_buf_impl_t *db; 2453 1.6 chs 2454 1.6 chs if (dbuf_hold_impl(dn, parent_level, parent_blkid, 2455 1.6 chs FALSE, TRUE, FTAG, &db) == 0) { 2456 1.6 chs blkptr_t *bpp = db->db_buf->b_data; 2457 1.6 chs bp = bpp[P2PHASE(curblkid, 1 << epbs)]; 2458 1.6 chs dbuf_rele(db, FTAG); 2459 1.6 chs break; 2460 1.1 haad } 2461 1.6 chs 2462 1.6 chs curlevel = parent_level; 2463 1.6 chs curblkid = parent_blkid; 2464 1.1 haad } 2465 1.1 haad 2466 1.6 chs if (curlevel == nlevels - 1) { 2467 1.6 chs /* No cached indirect blocks found. */ 2468 1.6 chs ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); 2469 1.6 chs bp = dn->dn_phys->dn_blkptr[curblkid]; 2470 1.6 chs } 2471 1.6 chs if (BP_IS_HOLE(&bp)) 2472 1.6 chs return; 2473 1.4 haad 2474 1.6 chs ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); 2475 1.1 haad 2476 1.6 chs zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, 2477 1.6 chs ZIO_FLAG_CANFAIL); 2478 1.1 haad 2479 1.6 chs dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); 2480 1.6 chs dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 2481 1.6 chs SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, 2482 1.6 chs dn->dn_object, level, blkid); 2483 1.6 chs dpa->dpa_curlevel = curlevel; 2484 1.6 chs dpa->dpa_prio = prio; 2485 1.6 chs dpa->dpa_aflags = aflags; 2486 1.6 chs dpa->dpa_spa = dn->dn_objset->os_spa; 2487 1.6 chs dpa->dpa_dnode = dn; 2488 1.6 chs dpa->dpa_epbs = epbs; 2489 1.6 chs dpa->dpa_zio = pio; 2490 1.6 chs 2491 1.6 chs /* 2492 1.6 chs * If we have the indirect just above us, no need to do the asynchronous 2493 1.6 chs * prefetch chain; we'll just run the last step ourselves. 
If we're at 2494 1.6 chs * a higher level, though, we want to issue the prefetches for all the 2495 1.6 chs * indirect blocks asynchronously, so we can go on with whatever we were 2496 1.6 chs * doing. 2497 1.6 chs */ 2498 1.6 chs if (curlevel == level) { 2499 1.6 chs ASSERT3U(curblkid, ==, blkid); 2500 1.6 chs dbuf_issue_final_prefetch(dpa, &bp); 2501 1.6 chs kmem_free(dpa, sizeof (*dpa)); 2502 1.6 chs } else { 2503 1.6 chs arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; 2504 1.6 chs zbookmark_phys_t zb; 2505 1.6 chs 2506 1.6 chs SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, 2507 1.6 chs dn->dn_object, curlevel, curblkid); 2508 1.6 chs (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, 2509 1.6 chs &bp, dbuf_prefetch_indirect_done, dpa, prio, 2510 1.6 chs ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2511 1.6 chs &iter_aflags, &zb); 2512 1.1 haad } 2513 1.6 chs /* 2514 1.6 chs * We use pio here instead of dpa_zio since it's possible that 2515 1.6 chs * dpa may have already been freed. 2516 1.6 chs */ 2517 1.6 chs zio_nowait(pio); 2518 1.1 haad } 2519 1.1 haad 2520 1.1 haad /* 2521 1.1 haad * Returns with db_holds incremented, and db_mtx not held. 2522 1.1 haad * Note: dn_struct_rwlock must be held. 
2523 1.1 haad */ 2524 1.1 haad int 2525 1.6 chs dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, 2526 1.6 chs boolean_t fail_sparse, boolean_t fail_uncached, 2527 1.1 haad void *tag, dmu_buf_impl_t **dbp) 2528 1.1 haad { 2529 1.1 haad dmu_buf_impl_t *db, *parent = NULL; 2530 1.1 haad 2531 1.6 chs ASSERT(blkid != DMU_BONUS_BLKID); 2532 1.1 haad ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2533 1.1 haad ASSERT3U(dn->dn_nlevels, >, level); 2534 1.1 haad 2535 1.1 haad *dbp = NULL; 2536 1.1 haad top: 2537 1.1 haad /* dbuf_find() returns with db_mtx held */ 2538 1.6 chs db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); 2539 1.1 haad 2540 1.1 haad if (db == NULL) { 2541 1.1 haad blkptr_t *bp = NULL; 2542 1.1 haad int err; 2543 1.1 haad 2544 1.6 chs if (fail_uncached) 2545 1.6 chs return (SET_ERROR(ENOENT)); 2546 1.6 chs 2547 1.1 haad ASSERT3P(parent, ==, NULL); 2548 1.1 haad err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 2549 1.1 haad if (fail_sparse) { 2550 1.1 haad if (err == 0 && bp && BP_IS_HOLE(bp)) 2551 1.6 chs err = SET_ERROR(ENOENT); 2552 1.1 haad if (err) { 2553 1.1 haad if (parent) 2554 1.1 haad dbuf_rele(parent, NULL); 2555 1.1 haad return (err); 2556 1.1 haad } 2557 1.1 haad } 2558 1.1 haad if (err && err != ENOENT) 2559 1.1 haad return (err); 2560 1.1 haad db = dbuf_create(dn, level, blkid, parent, bp); 2561 1.1 haad } 2562 1.1 haad 2563 1.6 chs if (fail_uncached && db->db_state != DB_CACHED) { 2564 1.6 chs mutex_exit(&db->db_mtx); 2565 1.6 chs return (SET_ERROR(ENOENT)); 2566 1.6 chs } 2567 1.6 chs 2568 1.6 chs if (db->db_buf != NULL) 2569 1.1 haad ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 2570 1.1 haad 2571 1.1 haad ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 2572 1.1 haad 2573 1.1 haad /* 2574 1.1 haad * If this buffer is currently syncing out, and we are are 2575 1.1 haad * still referencing it from db_data, we need to make a copy 2576 1.1 haad * of it in case we decide we want to dirty it again 
in this txg. 2577 1.1 haad */ 2578 1.6 chs if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 2579 1.1 haad dn->dn_object != DMU_META_DNODE_OBJECT && 2580 1.1 haad db->db_state == DB_CACHED && db->db_data_pending) { 2581 1.1 haad dbuf_dirty_record_t *dr = db->db_data_pending; 2582 1.1 haad 2583 1.1 haad if (dr->dt.dl.dr_data == db->db_buf) { 2584 1.1 haad arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2585 1.1 haad 2586 1.1 haad dbuf_set_data(db, 2587 1.6 chs arc_alloc_buf(dn->dn_objset->os_spa, 2588 1.1 haad db->db.db_size, db, type)); 2589 1.1 haad bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 2590 1.1 haad db->db.db_size); 2591 1.1 haad } 2592 1.1 haad } 2593 1.1 haad 2594 1.6 chs if (multilist_link_active(&db->db_cache_link)) { 2595 1.6 chs ASSERT(refcount_is_zero(&db->db_holds)); 2596 1.6 chs multilist_remove(&dbuf_cache, db); 2597 1.6 chs (void) refcount_remove_many(&dbuf_cache_size, 2598 1.6 chs db->db.db_size, db); 2599 1.6 chs } 2600 1.1 haad (void) refcount_add(&db->db_holds, tag); 2601 1.1 haad DBUF_VERIFY(db); 2602 1.1 haad mutex_exit(&db->db_mtx); 2603 1.1 haad 2604 1.1 haad /* NOTE: we can't rele the parent until after we drop the db_mtx */ 2605 1.1 haad if (parent) 2606 1.1 haad dbuf_rele(parent, NULL); 2607 1.1 haad 2608 1.6 chs ASSERT3P(DB_DNODE(db), ==, dn); 2609 1.1 haad ASSERT3U(db->db_blkid, ==, blkid); 2610 1.1 haad ASSERT3U(db->db_level, ==, level); 2611 1.1 haad *dbp = db; 2612 1.1 haad 2613 1.1 haad return (0); 2614 1.1 haad } 2615 1.1 haad 2616 1.1 haad dmu_buf_impl_t * 2617 1.1 haad dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 2618 1.1 haad { 2619 1.6 chs return (dbuf_hold_level(dn, 0, blkid, tag)); 2620 1.1 haad } 2621 1.1 haad 2622 1.1 haad dmu_buf_impl_t * 2623 1.1 haad dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 2624 1.1 haad { 2625 1.1 haad dmu_buf_impl_t *db; 2626 1.6 chs int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); 2627 1.1 haad return (err ? 
NULL : db); 2628 1.1 haad } 2629 1.1 haad 2630 1.1 haad void 2631 1.1 haad dbuf_create_bonus(dnode_t *dn) 2632 1.1 haad { 2633 1.1 haad ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 2634 1.1 haad 2635 1.1 haad ASSERT(dn->dn_bonus == NULL); 2636 1.6 chs dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); 2637 1.1 haad } 2638 1.1 haad 2639 1.6 chs int 2640 1.6 chs dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) 2641 1.5 joerg { 2642 1.5 joerg dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2643 1.6 chs dnode_t *dn; 2644 1.6 chs 2645 1.6 chs if (db->db_blkid != DMU_SPILL_BLKID) 2646 1.6 chs return (SET_ERROR(ENOTSUP)); 2647 1.6 chs if (blksz == 0) 2648 1.6 chs blksz = SPA_MINBLOCKSIZE; 2649 1.6 chs ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); 2650 1.6 chs blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); 2651 1.6 chs 2652 1.6 chs DB_DNODE_ENTER(db); 2653 1.6 chs dn = DB_DNODE(db); 2654 1.6 chs rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 2655 1.6 chs dbuf_new_size(db, blksz, tx); 2656 1.6 chs rw_exit(&dn->dn_struct_rwlock); 2657 1.6 chs DB_DNODE_EXIT(db); 2658 1.6 chs 2659 1.6 chs return (0); 2660 1.5 joerg } 2661 1.5 joerg 2662 1.1 haad void 2663 1.6 chs dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) 2664 1.6 chs { 2665 1.6 chs dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); 2666 1.6 chs } 2667 1.6 chs 2668 1.6 chs #pragma weak dmu_buf_add_ref = dbuf_add_ref 2669 1.6 chs void 2670 1.1 haad dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 2671 1.1 haad { 2672 1.1 haad int64_t holds = refcount_add(&db->db_holds, tag); 2673 1.6 chs ASSERT3S(holds, >, 1); 2674 1.1 haad } 2675 1.1 haad 2676 1.6 chs #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref 2677 1.6 chs boolean_t 2678 1.6 chs dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, 2679 1.6 chs void *tag) 2680 1.5 joerg { 2681 1.5 joerg dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2682 1.6 chs dmu_buf_impl_t *found_db; 2683 1.6 
chs boolean_t result = B_FALSE; 2684 1.6 chs 2685 1.6 chs if (db->db_blkid == DMU_BONUS_BLKID) 2686 1.6 chs found_db = dbuf_find_bonus(os, obj); 2687 1.6 chs else 2688 1.6 chs found_db = dbuf_find(os, obj, 0, blkid); 2689 1.6 chs 2690 1.6 chs if (found_db != NULL) { 2691 1.6 chs if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { 2692 1.6 chs (void) refcount_add(&db->db_holds, tag); 2693 1.6 chs result = B_TRUE; 2694 1.6 chs } 2695 1.6 chs mutex_exit(&db->db_mtx); 2696 1.6 chs } 2697 1.6 chs return (result); 2698 1.5 joerg } 2699 1.5 joerg 2700 1.6 chs /* 2701 1.6 chs * If you call dbuf_rele() you had better not be referencing the dnode handle 2702 1.6 chs * unless you have some other direct or indirect hold on the dnode. (An indirect 2703 1.6 chs * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 2704 1.6 chs * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the 2705 1.6 chs * dnode's parent dbuf evicting its dnode handles. 2706 1.6 chs */ 2707 1.1 haad void 2708 1.1 haad dbuf_rele(dmu_buf_impl_t *db, void *tag) 2709 1.1 haad { 2710 1.4 haad mutex_enter(&db->db_mtx); 2711 1.4 haad dbuf_rele_and_unlock(db, tag); 2712 1.4 haad } 2713 1.4 haad 2714 1.6 chs void 2715 1.6 chs dmu_buf_rele(dmu_buf_t *db, void *tag) 2716 1.6 chs { 2717 1.6 chs dbuf_rele((dmu_buf_impl_t *)db, tag); 2718 1.6 chs } 2719 1.6 chs 2720 1.4 haad /* 2721 1.4 haad * dbuf_rele() for an already-locked dbuf. This is necessary to allow 2722 1.4 haad * db_dirtycnt and db_holds to be updated atomically. 
2723 1.4 haad */ 2724 1.4 haad void 2725 1.4 haad dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 2726 1.4 haad { 2727 1.1 haad int64_t holds; 2728 1.1 haad 2729 1.4 haad ASSERT(MUTEX_HELD(&db->db_mtx)); 2730 1.1 haad DBUF_VERIFY(db); 2731 1.1 haad 2732 1.6 chs /* 2733 1.6 chs * Remove the reference to the dbuf before removing its hold on the 2734 1.6 chs * dnode so we can guarantee in dnode_move() that a referenced bonus 2735 1.6 chs * buffer has a corresponding dnode hold. 2736 1.6 chs */ 2737 1.1 haad holds = refcount_remove(&db->db_holds, tag); 2738 1.1 haad ASSERT(holds >= 0); 2739 1.1 haad 2740 1.1 haad /* 2741 1.1 haad * We can't freeze indirects if there is a possibility that they 2742 1.1 haad * may be modified in the current syncing context. 2743 1.1 haad */ 2744 1.6 chs if (db->db_buf != NULL && 2745 1.6 chs holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { 2746 1.1 haad arc_buf_freeze(db->db_buf); 2747 1.6 chs } 2748 1.1 haad 2749 1.1 haad if (holds == db->db_dirtycnt && 2750 1.6 chs db->db_level == 0 && db->db_user_immediate_evict) 2751 1.1 haad dbuf_evict_user(db); 2752 1.1 haad 2753 1.1 haad if (holds == 0) { 2754 1.6 chs if (db->db_blkid == DMU_BONUS_BLKID) { 2755 1.6 chs dnode_t *dn; 2756 1.6 chs boolean_t evict_dbuf = db->db_pending_evict; 2757 1.6 chs 2758 1.6 chs /* 2759 1.6 chs * If the dnode moves here, we cannot cross this 2760 1.6 chs * barrier until the move completes. 2761 1.6 chs */ 2762 1.6 chs DB_DNODE_ENTER(db); 2763 1.6 chs 2764 1.6 chs dn = DB_DNODE(db); 2765 1.6 chs atomic_dec_32(&dn->dn_dbufs_count); 2766 1.6 chs 2767 1.6 chs /* 2768 1.6 chs * Decrementing the dbuf count means that the bonus 2769 1.6 chs * buffer's dnode hold is no longer discounted in 2770 1.6 chs * dnode_move(). The dnode cannot move until after 2771 1.6 chs * the dnode_rele() below. 2772 1.6 chs */ 2773 1.6 chs DB_DNODE_EXIT(db); 2774 1.6 chs 2775 1.6 chs /* 2776 1.6 chs * Do not reference db after its lock is dropped. 
2777 1.6 chs * Another thread may evict it. 2778 1.6 chs */ 2779 1.1 haad mutex_exit(&db->db_mtx); 2780 1.6 chs 2781 1.6 chs if (evict_dbuf) 2782 1.6 chs dnode_evict_bonus(dn); 2783 1.6 chs 2784 1.6 chs dnode_rele(dn, db); 2785 1.1 haad } else if (db->db_buf == NULL) { 2786 1.1 haad /* 2787 1.1 haad * This is a special case: we never associated this 2788 1.1 haad * dbuf with any data allocated from the ARC. 2789 1.1 haad */ 2790 1.1 haad ASSERT(db->db_state == DB_UNCACHED || 2791 1.1 haad db->db_state == DB_NOFILL); 2792 1.6 chs dbuf_destroy(db); 2793 1.1 haad } else if (arc_released(db->db_buf)) { 2794 1.1 haad /* 2795 1.1 haad * This dbuf has anonymous data associated with it. 2796 1.1 haad */ 2797 1.6 chs dbuf_destroy(db); 2798 1.1 haad } else { 2799 1.6 chs boolean_t do_arc_evict = B_FALSE; 2800 1.6 chs blkptr_t bp; 2801 1.6 chs spa_t *spa = dmu_objset_spa(db->db_objset); 2802 1.6 chs 2803 1.6 chs if (!DBUF_IS_CACHEABLE(db) && 2804 1.6 chs db->db_blkptr != NULL && 2805 1.6 chs !BP_IS_HOLE(db->db_blkptr) && 2806 1.6 chs !BP_IS_EMBEDDED(db->db_blkptr)) { 2807 1.6 chs do_arc_evict = B_TRUE; 2808 1.6 chs bp = *db->db_blkptr; 2809 1.6 chs } 2810 1.6 chs 2811 1.6 chs if (!DBUF_IS_CACHEABLE(db) || 2812 1.6 chs db->db_pending_evict) { 2813 1.6 chs dbuf_destroy(db); 2814 1.6 chs } else if (!multilist_link_active(&db->db_cache_link)) { 2815 1.6 chs multilist_insert(&dbuf_cache, db); 2816 1.6 chs (void) refcount_add_many(&dbuf_cache_size, 2817 1.6 chs db->db.db_size, db); 2818 1.1 haad mutex_exit(&db->db_mtx); 2819 1.6 chs 2820 1.6 chs dbuf_evict_notify(); 2821 1.6 chs } 2822 1.6 chs 2823 1.6 chs if (do_arc_evict) 2824 1.6 chs arc_freed(spa, &bp); 2825 1.1 haad } 2826 1.1 haad } else { 2827 1.1 haad mutex_exit(&db->db_mtx); 2828 1.1 haad } 2829 1.1 haad 2830 1.5 joerg } 2831 1.5 joerg 2832 1.6 chs #pragma weak dmu_buf_refcount = dbuf_refcount 2833 1.1 haad uint64_t 2834 1.1 haad dbuf_refcount(dmu_buf_impl_t *db) 2835 1.1 haad { 2836 1.1 haad return 
(refcount_count(&db->db_holds)); 2837 1.1 haad } 2838 1.1 haad 2839 1.1 haad void * 2840 1.6 chs dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, 2841 1.6 chs dmu_buf_user_t *new_user) 2842 1.1 haad { 2843 1.6 chs dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2844 1.6 chs 2845 1.6 chs mutex_enter(&db->db_mtx); 2846 1.6 chs dbuf_verify_user(db, DBVU_NOT_EVICTING); 2847 1.6 chs if (db->db_user == old_user) 2848 1.6 chs db->db_user = new_user; 2849 1.6 chs else 2850 1.6 chs old_user = db->db_user; 2851 1.6 chs dbuf_verify_user(db, DBVU_NOT_EVICTING); 2852 1.6 chs mutex_exit(&db->db_mtx); 2853 1.6 chs 2854 1.6 chs return (old_user); 2855 1.1 haad } 2856 1.1 haad 2857 1.1 haad void * 2858 1.6 chs dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) 2859 1.1 haad { 2860 1.6 chs return (dmu_buf_replace_user(db_fake, NULL, user)); 2861 1.1 haad } 2862 1.1 haad 2863 1.1 haad void * 2864 1.6 chs dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) 2865 1.1 haad { 2866 1.1 haad dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2867 1.1 haad 2868 1.6 chs db->db_user_immediate_evict = TRUE; 2869 1.6 chs return (dmu_buf_set_user(db_fake, user)); 2870 1.6 chs } 2871 1.1 haad 2872 1.6 chs void * 2873 1.6 chs dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) 2874 1.6 chs { 2875 1.6 chs return (dmu_buf_replace_user(db_fake, user, NULL)); 2876 1.1 haad } 2877 1.1 haad 2878 1.1 haad void * 2879 1.1 haad dmu_buf_get_user(dmu_buf_t *db_fake) 2880 1.1 haad { 2881 1.1 haad dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2882 1.1 haad 2883 1.6 chs dbuf_verify_user(db, DBVU_NOT_EVICTING); 2884 1.6 chs return (db->db_user); 2885 1.6 chs } 2886 1.6 chs 2887 1.6 chs void 2888 1.6 chs dmu_buf_user_evict_wait() 2889 1.6 chs { 2890 1.6 chs taskq_wait(dbu_evict_taskq); 2891 1.1 haad } 2892 1.1 haad 2893 1.4 haad boolean_t 2894 1.4 haad dmu_buf_freeable(dmu_buf_t *dbuf) 2895 1.4 haad { 2896 1.4 haad boolean_t res = B_FALSE; 2897 1.4 haad dmu_buf_impl_t 
*db = (dmu_buf_impl_t *)dbuf; 2898 1.4 haad 2899 1.4 haad if (db->db_blkptr) 2900 1.4 haad res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, 2901 1.6 chs db->db_blkptr, db->db_blkptr->blk_birth); 2902 1.4 haad 2903 1.4 haad return (res); 2904 1.4 haad } 2905 1.4 haad 2906 1.6 chs blkptr_t * 2907 1.6 chs dmu_buf_get_blkptr(dmu_buf_t *db) 2908 1.6 chs { 2909 1.6 chs dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 2910 1.6 chs return (dbi->db_blkptr); 2911 1.6 chs } 2912 1.6 chs 2913 1.6 chs objset_t * 2914 1.6 chs dmu_buf_get_objset(dmu_buf_t *db) 2915 1.6 chs { 2916 1.6 chs dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 2917 1.6 chs return (dbi->db_objset); 2918 1.6 chs } 2919 1.6 chs 2920 1.6 chs dnode_t * 2921 1.6 chs dmu_buf_dnode_enter(dmu_buf_t *db) 2922 1.6 chs { 2923 1.6 chs dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 2924 1.6 chs DB_DNODE_ENTER(dbi); 2925 1.6 chs return (DB_DNODE(dbi)); 2926 1.6 chs } 2927 1.6 chs 2928 1.6 chs void 2929 1.6 chs dmu_buf_dnode_exit(dmu_buf_t *db) 2930 1.6 chs { 2931 1.6 chs dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 2932 1.6 chs DB_DNODE_EXIT(dbi); 2933 1.6 chs } 2934 1.6 chs 2935 1.1 haad static void 2936 1.1 haad dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) 2937 1.1 haad { 2938 1.1 haad /* ASSERT(dmu_tx_is_syncing(tx) */ 2939 1.1 haad ASSERT(MUTEX_HELD(&db->db_mtx)); 2940 1.1 haad 2941 1.1 haad if (db->db_blkptr != NULL) 2942 1.1 haad return; 2943 1.1 haad 2944 1.6 chs if (db->db_blkid == DMU_SPILL_BLKID) { 2945 1.6 chs db->db_blkptr = &dn->dn_phys->dn_spill; 2946 1.6 chs BP_ZERO(db->db_blkptr); 2947 1.6 chs return; 2948 1.6 chs } 2949 1.1 haad if (db->db_level == dn->dn_phys->dn_nlevels-1) { 2950 1.1 haad /* 2951 1.1 haad * This buffer was allocated at a time when there was 2952 1.1 haad * no available blkptrs from the dnode, or it was 2953 1.1 haad * inappropriate to hook it in (i.e., nlevels mis-match). 
2954 1.1 haad */ 2955 1.1 haad ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 2956 1.1 haad ASSERT(db->db_parent == NULL); 2957 1.1 haad db->db_parent = dn->dn_dbuf; 2958 1.1 haad db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 2959 1.1 haad DBUF_VERIFY(db); 2960 1.1 haad } else { 2961 1.1 haad dmu_buf_impl_t *parent = db->db_parent; 2962 1.1 haad int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2963 1.1 haad 2964 1.1 haad ASSERT(dn->dn_phys->dn_nlevels > 1); 2965 1.1 haad if (parent == NULL) { 2966 1.1 haad mutex_exit(&db->db_mtx); 2967 1.1 haad rw_enter(&dn->dn_struct_rwlock, RW_READER); 2968 1.6 chs parent = dbuf_hold_level(dn, db->db_level + 1, 2969 1.6 chs db->db_blkid >> epbs, db); 2970 1.1 haad rw_exit(&dn->dn_struct_rwlock); 2971 1.1 haad mutex_enter(&db->db_mtx); 2972 1.1 haad db->db_parent = parent; 2973 1.1 haad } 2974 1.1 haad db->db_blkptr = (blkptr_t *)parent->db.db_data + 2975 1.1 haad (db->db_blkid & ((1ULL << epbs) - 1)); 2976 1.1 haad DBUF_VERIFY(db); 2977 1.1 haad } 2978 1.1 haad } 2979 1.1 haad 2980 1.1 haad static void 2981 1.1 haad dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2982 1.1 haad { 2983 1.1 haad dmu_buf_impl_t *db = dr->dr_dbuf; 2984 1.6 chs dnode_t *dn; 2985 1.1 haad zio_t *zio; 2986 1.1 haad 2987 1.1 haad ASSERT(dmu_tx_is_syncing(tx)); 2988 1.1 haad 2989 1.1 haad dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2990 1.1 haad 2991 1.1 haad mutex_enter(&db->db_mtx); 2992 1.1 haad 2993 1.1 haad ASSERT(db->db_level > 0); 2994 1.1 haad DBUF_VERIFY(db); 2995 1.1 haad 2996 1.6 chs /* Read the block if it hasn't been read yet. 
*/ 2997 1.1 haad if (db->db_buf == NULL) { 2998 1.1 haad mutex_exit(&db->db_mtx); 2999 1.1 haad (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 3000 1.1 haad mutex_enter(&db->db_mtx); 3001 1.1 haad } 3002 1.1 haad ASSERT3U(db->db_state, ==, DB_CACHED); 3003 1.1 haad ASSERT(db->db_buf != NULL); 3004 1.1 haad 3005 1.6 chs DB_DNODE_ENTER(db); 3006 1.6 chs dn = DB_DNODE(db); 3007 1.6 chs /* Indirect block size must match what the dnode thinks it is. */ 3008 1.6 chs ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 3009 1.1 haad dbuf_check_blkptr(dn, db); 3010 1.6 chs DB_DNODE_EXIT(db); 3011 1.1 haad 3012 1.6 chs /* Provide the pending dirty record to child dbufs */ 3013 1.1 haad db->db_data_pending = dr; 3014 1.1 haad 3015 1.1 haad mutex_exit(&db->db_mtx); 3016 1.1 haad dbuf_write(dr, db->db_buf, tx); 3017 1.1 haad 3018 1.1 haad zio = dr->dr_zio; 3019 1.1 haad mutex_enter(&dr->dt.di.dr_mtx); 3020 1.6 chs dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); 3021 1.1 haad ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 3022 1.1 haad mutex_exit(&dr->dt.di.dr_mtx); 3023 1.1 haad zio_nowait(zio); 3024 1.1 haad } 3025 1.1 haad 3026 1.1 haad static void 3027 1.1 haad dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 3028 1.1 haad { 3029 1.1 haad arc_buf_t **datap = &dr->dt.dl.dr_data; 3030 1.1 haad dmu_buf_impl_t *db = dr->dr_dbuf; 3031 1.6 chs dnode_t *dn; 3032 1.6 chs objset_t *os; 3033 1.1 haad uint64_t txg = tx->tx_txg; 3034 1.1 haad 3035 1.1 haad ASSERT(dmu_tx_is_syncing(tx)); 3036 1.1 haad 3037 1.1 haad dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 3038 1.1 haad 3039 1.1 haad mutex_enter(&db->db_mtx); 3040 1.1 haad /* 3041 1.1 haad * To be synced, we must be dirtied. But we 3042 1.1 haad * might have been freed after the dirty. 
3043 1.1 haad */ 3044 1.1 haad if (db->db_state == DB_UNCACHED) { 3045 1.1 haad /* This buffer has been freed since it was dirtied */ 3046 1.1 haad ASSERT(db->db.db_data == NULL); 3047 1.1 haad } else if (db->db_state == DB_FILL) { 3048 1.1 haad /* This buffer was freed and is now being re-filled */ 3049 1.1 haad ASSERT(db->db.db_data != dr->dt.dl.dr_data); 3050 1.1 haad } else { 3051 1.1 haad ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); 3052 1.1 haad } 3053 1.1 haad DBUF_VERIFY(db); 3054 1.1 haad 3055 1.6 chs DB_DNODE_ENTER(db); 3056 1.6 chs dn = DB_DNODE(db); 3057 1.6 chs 3058 1.6 chs if (db->db_blkid == DMU_SPILL_BLKID) { 3059 1.6 chs mutex_enter(&dn->dn_mtx); 3060 1.6 chs dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; 3061 1.6 chs mutex_exit(&dn->dn_mtx); 3062 1.6 chs } 3063 1.6 chs 3064 1.1 haad /* 3065 1.1 haad * If this is a bonus buffer, simply copy the bonus data into the 3066 1.1 haad * dnode. It will be written out when the dnode is synced (and it 3067 1.1 haad * will be synced, since it must have been dirty for dbuf_sync to 3068 1.1 haad * be called). 
3069 1.1 haad */ 3070 1.6 chs if (db->db_blkid == DMU_BONUS_BLKID) { 3071 1.1 haad dbuf_dirty_record_t **drp; 3072 1.1 haad 3073 1.1 haad ASSERT(*datap != NULL); 3074 1.6 chs ASSERT0(db->db_level); 3075 1.1 haad ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 3076 1.1 haad bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 3077 1.6 chs DB_DNODE_EXIT(db); 3078 1.6 chs 3079 1.1 haad if (*datap != db->db.db_data) { 3080 1.1 haad zio_buf_free(*datap, DN_MAX_BONUSLEN); 3081 1.4 haad arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 3082 1.1 haad } 3083 1.1 haad db->db_data_pending = NULL; 3084 1.1 haad drp = &db->db_last_dirty; 3085 1.1 haad while (*drp != dr) 3086 1.1 haad drp = &(*drp)->dr_next; 3087 1.1 haad ASSERT(dr->dr_next == NULL); 3088 1.4 haad ASSERT(dr->dr_dbuf == db); 3089 1.1 haad *drp = dr->dr_next; 3090 1.6 chs if (dr->dr_dbuf->db_level != 0) { 3091 1.6 chs list_destroy(&dr->dt.di.dr_children); 3092 1.6 chs mutex_destroy(&dr->dt.di.dr_mtx); 3093 1.6 chs } 3094 1.1 haad kmem_free(dr, sizeof (dbuf_dirty_record_t)); 3095 1.1 haad ASSERT(db->db_dirtycnt > 0); 3096 1.1 haad db->db_dirtycnt -= 1; 3097 1.4 haad dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); 3098 1.1 haad return; 3099 1.1 haad } 3100 1.1 haad 3101 1.6 chs os = dn->dn_objset; 3102 1.6 chs 3103 1.1 haad /* 3104 1.1 haad * This function may have dropped the db_mtx lock allowing a dmu_sync 3105 1.1 haad * operation to sneak in. As a result, we need to ensure that we 3106 1.1 haad * don't check the dr_override_state until we have returned from 3107 1.1 haad * dbuf_check_blkptr. 3108 1.1 haad */ 3109 1.1 haad dbuf_check_blkptr(dn, db); 3110 1.1 haad 3111 1.1 haad /* 3112 1.6 chs * If this buffer is in the middle of an immediate write, 3113 1.1 haad * wait for the synchronous IO to complete. 
3114 1.1 haad */ 3115 1.1 haad while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 3116 1.1 haad ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 3117 1.1 haad cv_wait(&db->db_changed, &db->db_mtx); 3118 1.1 haad ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); 3119 1.1 haad } 3120 1.1 haad 3121 1.4 haad if (db->db_state != DB_NOFILL && 3122 1.4 haad dn->dn_object != DMU_META_DNODE_OBJECT && 3123 1.4 haad refcount_count(&db->db_holds) > 1 && 3124 1.4 haad dr->dt.dl.dr_override_state != DR_OVERRIDDEN && 3125 1.4 haad *datap == db->db_buf) { 3126 1.4 haad /* 3127 1.4 haad * If this buffer is currently "in use" (i.e., there 3128 1.4 haad * are active holds and db_data still references it), 3129 1.4 haad * then make a copy before we start the write so that 3130 1.4 haad * any modifications from the open txg will not leak 3131 1.4 haad * into this write. 3132 1.4 haad * 3133 1.4 haad * NOTE: this copy does not need to be made for 3134 1.4 haad * objects only modified in the syncing context (e.g. 3135 1.4 haad * DNONE_DNODE blocks). 3136 1.4 haad */ 3137 1.4 haad int blksz = arc_buf_size(*datap); 3138 1.4 haad arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 3139 1.6 chs *datap = arc_alloc_buf(os->os_spa, blksz, db, type); 3140 1.4 haad bcopy(db->db.db_data, (*datap)->b_data, blksz); 3141 1.1 haad } 3142 1.1 haad db->db_data_pending = dr; 3143 1.1 haad 3144 1.1 haad mutex_exit(&db->db_mtx); 3145 1.1 haad 3146 1.1 haad dbuf_write(dr, *datap, tx); 3147 1.1 haad 3148 1.1 haad ASSERT(!list_link_active(&dr->dr_dirty_node)); 3149 1.6 chs if (dn->dn_object == DMU_META_DNODE_OBJECT) { 3150 1.1 haad list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 3151 1.6 chs DB_DNODE_EXIT(db); 3152 1.6 chs } else { 3153 1.6 chs /* 3154 1.6 chs * Although zio_nowait() does not "wait for an IO", it does 3155 1.6 chs * initiate the IO. If this is an empty write it seems plausible 3156 1.6 chs * that the IO could actually be completed before the nowait 3157 1.6 chs * returns. 
We need to DB_DNODE_EXIT() first in case 3158 1.6 chs * zio_nowait() invalidates the dbuf. 3159 1.6 chs */ 3160 1.6 chs DB_DNODE_EXIT(db); 3161 1.1 haad zio_nowait(dr->dr_zio); 3162 1.6 chs } 3163 1.1 haad } 3164 1.1 haad 3165 1.1 haad void 3166 1.6 chs dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) 3167 1.1 haad { 3168 1.1 haad dbuf_dirty_record_t *dr; 3169 1.1 haad 3170 1.1 haad while (dr = list_head(list)) { 3171 1.1 haad if (dr->dr_zio != NULL) { 3172 1.1 haad /* 3173 1.1 haad * If we find an already initialized zio then we 3174 1.1 haad * are processing the meta-dnode, and we have finished. 3175 1.1 haad * The dbufs for all dnodes are put back on the list 3176 1.1 haad * during processing, so that we can zio_wait() 3177 1.1 haad * these IOs after initiating all child IOs. 3178 1.1 haad */ 3179 1.1 haad ASSERT3U(dr->dr_dbuf->db.db_object, ==, 3180 1.1 haad DMU_META_DNODE_OBJECT); 3181 1.1 haad break; 3182 1.1 haad } 3183 1.6 chs if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && 3184 1.6 chs dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { 3185 1.6 chs VERIFY3U(dr->dr_dbuf->db_level, ==, level); 3186 1.6 chs } 3187 1.1 haad list_remove(list, dr); 3188 1.1 haad if (dr->dr_dbuf->db_level > 0) 3189 1.1 haad dbuf_sync_indirect(dr, tx); 3190 1.1 haad else 3191 1.1 haad dbuf_sync_leaf(dr, tx); 3192 1.1 haad } 3193 1.1 haad } 3194 1.1 haad 3195 1.1 haad /* ARGSUSED */ 3196 1.1 haad static void 3197 1.1 haad dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 3198 1.1 haad { 3199 1.1 haad dmu_buf_impl_t *db = vdb; 3200 1.6 chs dnode_t *dn; 3201 1.1 haad blkptr_t *bp = zio->io_bp; 3202 1.1 haad blkptr_t *bp_orig = &zio->io_bp_orig; 3203 1.4 haad spa_t *spa = zio->io_spa; 3204 1.4 haad int64_t delta; 3205 1.1 haad uint64_t fill = 0; 3206 1.4 haad int i; 3207 1.1 haad 3208 1.6 chs ASSERT3P(db->db_blkptr, !=, NULL); 3209 1.6 chs ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); 3210 1.1 haad 3211 1.6 chs DB_DNODE_ENTER(db); 3212 1.6 chs dn = DB_DNODE(db); 3213 1.4 
haad delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); 3214 1.4 haad dnode_diduse_space(dn, delta - zio->io_prev_space_delta); 3215 1.4 haad zio->io_prev_space_delta = delta; 3216 1.1 haad 3217 1.6 chs if (bp->blk_birth != 0) { 3218 1.6 chs ASSERT((db->db_blkid != DMU_SPILL_BLKID && 3219 1.6 chs BP_GET_TYPE(bp) == dn->dn_type) || 3220 1.6 chs (db->db_blkid == DMU_SPILL_BLKID && 3221 1.6 chs BP_GET_TYPE(bp) == dn->dn_bonustype) || 3222 1.6 chs BP_IS_EMBEDDED(bp)); 3223 1.6 chs ASSERT(BP_GET_LEVEL(bp) == db->db_level); 3224 1.1 haad } 3225 1.1 haad 3226 1.6 chs mutex_enter(&db->db_mtx); 3227 1.1 haad 3228 1.6 chs #ifdef ZFS_DEBUG 3229 1.6 chs if (db->db_blkid == DMU_SPILL_BLKID) { 3230 1.6 chs ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 3231 1.6 chs ASSERT(!(BP_IS_HOLE(bp)) && 3232 1.6 chs db->db_blkptr == &dn->dn_phys->dn_spill); 3233 1.6 chs } 3234 1.6 chs #endif 3235 1.1 haad 3236 1.1 haad if (db->db_level == 0) { 3237 1.1 haad mutex_enter(&dn->dn_mtx); 3238 1.6 chs if (db->db_blkid > dn->dn_phys->dn_maxblkid && 3239 1.6 chs db->db_blkid != DMU_SPILL_BLKID) 3240 1.1 haad dn->dn_phys->dn_maxblkid = db->db_blkid; 3241 1.1 haad mutex_exit(&dn->dn_mtx); 3242 1.1 haad 3243 1.1 haad if (dn->dn_type == DMU_OT_DNODE) { 3244 1.1 haad dnode_phys_t *dnp = db->db.db_data; 3245 1.1 haad for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 3246 1.1 haad i--, dnp++) { 3247 1.1 haad if (dnp->dn_type != DMU_OT_NONE) 3248 1.1 haad fill++; 3249 1.1 haad } 3250 1.1 haad } else { 3251 1.6 chs if (BP_IS_HOLE(bp)) { 3252 1.6 chs fill = 0; 3253 1.6 chs } else { 3254 1.6 chs fill = 1; 3255 1.6 chs } 3256 1.1 haad } 3257 1.1 haad } else { 3258 1.1 haad blkptr_t *ibp = db->db.db_data; 3259 1.1 haad ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 3260 1.1 haad for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { 3261 1.1 haad if (BP_IS_HOLE(ibp)) 3262 1.1 haad continue; 3263 1.6 chs fill += BP_GET_FILL(ibp); 3264 1.1 haad } 3265 1.1 haad } 
3266 1.6 chs DB_DNODE_EXIT(db); 3267 1.1 haad 3268 1.6 chs if (!BP_IS_EMBEDDED(bp)) 3269 1.6 chs bp->blk_fill = fill; 3270 1.1 haad 3271 1.1 haad mutex_exit(&db->db_mtx); 3272 1.6 chs 3273 1.6 chs rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 3274 1.6 chs *db->db_blkptr = *bp; 3275 1.6 chs rw_exit(&dn->dn_struct_rwlock); 3276 1.6 chs } 3277 1.6 chs 3278 1.6 chs /* ARGSUSED */ 3279 1.6 chs /* 3280 1.6 chs * This function gets called just prior to running through the compression 3281 1.6 chs * stage of the zio pipeline. If we're an indirect block comprised of only 3282 1.6 chs * holes, then we want this indirect to be compressed away to a hole. In 3283 1.6 chs * order to do that we must zero out any information about the holes that 3284 1.6 chs * this indirect points to prior to before we try to compress it. 3285 1.6 chs */ 3286 1.6 chs static void 3287 1.6 chs dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 3288 1.6 chs { 3289 1.6 chs dmu_buf_impl_t *db = vdb; 3290 1.6 chs dnode_t *dn; 3291 1.6 chs blkptr_t *bp; 3292 1.6 chs uint64_t i; 3293 1.6 chs int epbs; 3294 1.6 chs 3295 1.6 chs ASSERT3U(db->db_level, >, 0); 3296 1.6 chs DB_DNODE_ENTER(db); 3297 1.6 chs dn = DB_DNODE(db); 3298 1.6 chs epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 3299 1.6 chs 3300 1.6 chs /* Determine if all our children are holes */ 3301 1.6 chs for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { 3302 1.6 chs if (!BP_IS_HOLE(bp)) 3303 1.6 chs break; 3304 1.6 chs } 3305 1.6 chs 3306 1.6 chs /* 3307 1.6 chs * If all the children are holes, then zero them all out so that 3308 1.6 chs * we may get compressed away. 
3309 1.6 chs */ 3310 1.6 chs if (i == 1 << epbs) { 3311 1.6 chs /* didn't find any non-holes */ 3312 1.6 chs bzero(db->db.db_data, db->db.db_size); 3313 1.6 chs } 3314 1.6 chs DB_DNODE_EXIT(db); 3315 1.6 chs } 3316 1.6 chs 3317 1.6 chs /* 3318 1.6 chs * The SPA will call this callback several times for each zio - once 3319 1.6 chs * for every physical child i/o (zio->io_phys_children times). This 3320 1.6 chs * allows the DMU to monitor the progress of each logical i/o. For example, 3321 1.6 chs * there may be 2 copies of an indirect block, or many fragments of a RAID-Z 3322 1.6 chs * block. There may be a long delay before all copies/fragments are completed, 3323 1.6 chs * so this callback allows us to retire dirty space gradually, as the physical 3324 1.6 chs * i/os complete. 3325 1.6 chs */ 3326 1.6 chs /* ARGSUSED */ 3327 1.6 chs static void 3328 1.6 chs dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) 3329 1.6 chs { 3330 1.6 chs dmu_buf_impl_t *db = arg; 3331 1.6 chs objset_t *os = db->db_objset; 3332 1.6 chs dsl_pool_t *dp = dmu_objset_pool(os); 3333 1.6 chs dbuf_dirty_record_t *dr; 3334 1.6 chs int delta = 0; 3335 1.6 chs 3336 1.6 chs dr = db->db_data_pending; 3337 1.6 chs ASSERT3U(dr->dr_txg, ==, zio->io_txg); 3338 1.6 chs 3339 1.6 chs /* 3340 1.6 chs * The callback will be called io_phys_children times. Retire one 3341 1.6 chs * portion of our dirty space each time we are called. Any rounding 3342 1.6 chs * error will be cleaned up by dsl_pool_sync()'s call to 3343 1.6 chs * dsl_pool_undirty_space(). 
3344 1.6 chs */ 3345 1.6 chs delta = dr->dr_accounted / zio->io_phys_children; 3346 1.6 chs dsl_pool_undirty_space(dp, delta, zio->io_txg); 3347 1.1 haad } 3348 1.1 haad 3349 1.1 haad /* ARGSUSED */ 3350 1.1 haad static void 3351 1.1 haad dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 3352 1.1 haad { 3353 1.1 haad dmu_buf_impl_t *db = vdb; 3354 1.4 haad blkptr_t *bp_orig = &zio->io_bp_orig; 3355 1.6 chs blkptr_t *bp = db->db_blkptr; 3356 1.6 chs objset_t *os = db->db_objset; 3357 1.6 chs dmu_tx_t *tx = os->os_synctx; 3358 1.1 haad dbuf_dirty_record_t **drp, *dr; 3359 1.1 haad 3360 1.6 chs ASSERT0(zio->io_error); 3361 1.4 haad ASSERT(db->db_blkptr == bp); 3362 1.4 haad 3363 1.6 chs /* 3364 1.6 chs * For nopwrites and rewrites we ensure that the bp matches our 3365 1.6 chs * original and bypass all the accounting. 3366 1.6 chs */ 3367 1.6 chs if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { 3368 1.4 haad ASSERT(BP_EQUAL(bp, bp_orig)); 3369 1.4 haad } else { 3370 1.4 haad dsl_dataset_t *ds = os->os_dsl_dataset; 3371 1.4 haad (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 3372 1.4 haad dsl_dataset_block_born(ds, bp, tx); 3373 1.4 haad } 3374 1.1 haad 3375 1.1 haad mutex_enter(&db->db_mtx); 3376 1.1 haad 3377 1.4 haad DBUF_VERIFY(db); 3378 1.4 haad 3379 1.1 haad drp = &db->db_last_dirty; 3380 1.1 haad while ((dr = *drp) != db->db_data_pending) 3381 1.1 haad drp = &dr->dr_next; 3382 1.1 haad ASSERT(!list_link_active(&dr->dr_dirty_node)); 3383 1.4 haad ASSERT(dr->dr_dbuf == db); 3384 1.1 haad ASSERT(dr->dr_next == NULL); 3385 1.1 haad *drp = dr->dr_next; 3386 1.1 haad 3387 1.6 chs #ifdef ZFS_DEBUG 3388 1.6 chs if (db->db_blkid == DMU_SPILL_BLKID) { 3389 1.6 chs dnode_t *dn; 3390 1.6 chs 3391 1.6 chs DB_DNODE_ENTER(db); 3392 1.6 chs dn = DB_DNODE(db); 3393 1.6 chs ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 3394 1.6 chs ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 3395 1.6 chs db->db_blkptr == &dn->dn_phys->dn_spill); 3396 1.6 chs 
DB_DNODE_EXIT(db); 3397 1.6 chs } 3398 1.6 chs #endif 3399 1.6 chs 3400 1.1 haad if (db->db_level == 0) { 3401 1.6 chs ASSERT(db->db_blkid != DMU_BONUS_BLKID); 3402 1.1 haad ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 3403 1.1 haad if (db->db_state != DB_NOFILL) { 3404 1.1 haad if (dr->dt.dl.dr_data != db->db_buf) 3405 1.6 chs arc_buf_destroy(dr->dt.dl.dr_data, db); 3406 1.1 haad } 3407 1.1 haad } else { 3408 1.6 chs dnode_t *dn; 3409 1.6 chs 3410 1.6 chs DB_DNODE_ENTER(db); 3411 1.6 chs dn = DB_DNODE(db); 3412 1.1 haad ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 3413 1.6 chs ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); 3414 1.1 haad if (!BP_IS_HOLE(db->db_blkptr)) { 3415 1.1 haad int epbs = 3416 1.1 haad dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 3417 1.6 chs ASSERT3U(db->db_blkid, <=, 3418 1.6 chs dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); 3419 1.1 haad ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 3420 1.1 haad db->db.db_size); 3421 1.1 haad } 3422 1.6 chs DB_DNODE_EXIT(db); 3423 1.1 haad mutex_destroy(&dr->dt.di.dr_mtx); 3424 1.1 haad list_destroy(&dr->dt.di.dr_children); 3425 1.1 haad } 3426 1.1 haad kmem_free(dr, sizeof (dbuf_dirty_record_t)); 3427 1.1 haad 3428 1.1 haad cv_broadcast(&db->db_changed); 3429 1.1 haad ASSERT(db->db_dirtycnt > 0); 3430 1.1 haad db->db_dirtycnt -= 1; 3431 1.1 haad db->db_data_pending = NULL; 3432 1.6 chs dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); 3433 1.4 haad } 3434 1.4 haad 3435 1.4 haad static void 3436 1.4 haad dbuf_write_nofill_ready(zio_t *zio) 3437 1.4 haad { 3438 1.4 haad dbuf_write_ready(zio, NULL, zio->io_private); 3439 1.4 haad } 3440 1.4 haad 3441 1.4 haad static void 3442 1.4 haad dbuf_write_nofill_done(zio_t *zio) 3443 1.4 haad { 3444 1.4 haad dbuf_write_done(zio, NULL, zio->io_private); 3445 1.4 haad } 3446 1.4 haad 3447 1.4 haad static void 3448 1.4 haad dbuf_write_override_ready(zio_t *zio) 3449 1.4 haad { 3450 1.4 haad dbuf_dirty_record_t *dr = 
zio->io_private; 3451 1.4 haad dmu_buf_impl_t *db = dr->dr_dbuf; 3452 1.4 haad 3453 1.4 haad dbuf_write_ready(zio, NULL, db); 3454 1.4 haad } 3455 1.4 haad 3456 1.4 haad static void 3457 1.4 haad dbuf_write_override_done(zio_t *zio) 3458 1.4 haad { 3459 1.4 haad dbuf_dirty_record_t *dr = zio->io_private; 3460 1.4 haad dmu_buf_impl_t *db = dr->dr_dbuf; 3461 1.4 haad blkptr_t *obp = &dr->dt.dl.dr_overridden_by; 3462 1.4 haad 3463 1.4 haad mutex_enter(&db->db_mtx); 3464 1.4 haad if (!BP_EQUAL(zio->io_bp, obp)) { 3465 1.4 haad if (!BP_IS_HOLE(obp)) 3466 1.4 haad dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 3467 1.4 haad arc_release(dr->dt.dl.dr_data, db); 3468 1.4 haad } 3469 1.1 haad mutex_exit(&db->db_mtx); 3470 1.1 haad 3471 1.4 haad dbuf_write_done(zio, NULL, db); 3472 1.4 haad } 3473 1.1 haad 3474 1.6 chs /* Issue I/O to commit a dirty buffer to disk. */ 3475 1.4 haad static void 3476 1.4 haad dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) 3477 1.4 haad { 3478 1.4 haad dmu_buf_impl_t *db = dr->dr_dbuf; 3479 1.6 chs dnode_t *dn; 3480 1.6 chs objset_t *os; 3481 1.4 haad dmu_buf_impl_t *parent = db->db_parent; 3482 1.4 haad uint64_t txg = tx->tx_txg; 3483 1.6 chs zbookmark_phys_t zb; 3484 1.4 haad zio_prop_t zp; 3485 1.4 haad zio_t *zio; 3486 1.6 chs int wp_flag = 0; 3487 1.6 chs 3488 1.6 chs ASSERT(dmu_tx_is_syncing(tx)); 3489 1.6 chs 3490 1.6 chs DB_DNODE_ENTER(db); 3491 1.6 chs dn = DB_DNODE(db); 3492 1.6 chs os = dn->dn_objset; 3493 1.4 haad 3494 1.4 haad if (db->db_state != DB_NOFILL) { 3495 1.4 haad if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { 3496 1.4 haad /* 3497 1.4 haad * Private object buffers are released here rather 3498 1.4 haad * than in dbuf_dirty() since they are only modified 3499 1.4 haad * in the syncing context and we don't want the 3500 1.4 haad * overhead of making multiple copies of the data. 
3501 1.4 haad */ 3502 1.4 haad if (BP_IS_HOLE(db->db_blkptr)) { 3503 1.4 haad arc_buf_thaw(data); 3504 1.4 haad } else { 3505 1.6 chs dbuf_release_bp(db); 3506 1.4 haad } 3507 1.4 haad } 3508 1.4 haad } 3509 1.4 haad 3510 1.4 haad if (parent != dn->dn_dbuf) { 3511 1.6 chs /* Our parent is an indirect block. */ 3512 1.6 chs /* We have a dirty parent that has been scheduled for write. */ 3513 1.4 haad ASSERT(parent && parent->db_data_pending); 3514 1.6 chs /* Our parent's buffer is one level closer to the dnode. */ 3515 1.4 haad ASSERT(db->db_level == parent->db_level-1); 3516 1.6 chs /* 3517 1.6 chs * We're about to modify our parent's db_data by modifying 3518 1.6 chs * our block pointer, so the parent must be released. 3519 1.6 chs */ 3520 1.4 haad ASSERT(arc_released(parent->db_buf)); 3521 1.4 haad zio = parent->db_data_pending->dr_zio; 3522 1.4 haad } else { 3523 1.6 chs /* Our parent is the dnode itself. */ 3524 1.6 chs ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && 3525 1.6 chs db->db_blkid != DMU_SPILL_BLKID) || 3526 1.6 chs (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); 3527 1.6 chs if (db->db_blkid != DMU_SPILL_BLKID) 3528 1.6 chs ASSERT3P(db->db_blkptr, ==, 3529 1.6 chs &dn->dn_phys->dn_blkptr[db->db_blkid]); 3530 1.4 haad zio = dn->dn_zio; 3531 1.4 haad } 3532 1.4 haad 3533 1.4 haad ASSERT(db->db_level == 0 || data == db->db_buf); 3534 1.4 haad ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 3535 1.4 haad ASSERT(zio); 3536 1.4 haad 3537 1.4 haad SET_BOOKMARK(&zb, os->os_dsl_dataset ? 3538 1.4 haad os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 3539 1.4 haad db->db.db_object, db->db_level, db->db_blkid); 3540 1.4 haad 3541 1.6 chs if (db->db_blkid == DMU_SPILL_BLKID) 3542 1.6 chs wp_flag = WP_SPILL; 3543 1.6 chs wp_flag |= (db->db_state == DB_NOFILL) ? 
WP_NOFILL : 0; 3544 1.6 chs 3545 1.6 chs dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); 3546 1.6 chs DB_DNODE_EXIT(db); 3547 1.6 chs 3548 1.6 chs /* 3549 1.6 chs * We copy the blkptr now (rather than when we instantiate the dirty 3550 1.6 chs * record), because its value can change between open context and 3551 1.6 chs * syncing context. We do not need to hold dn_struct_rwlock to read 3552 1.6 chs * db_blkptr because we are in syncing context. 3553 1.6 chs */ 3554 1.6 chs dr->dr_bp_copy = *db->db_blkptr; 3555 1.6 chs 3556 1.6 chs if (db->db_level == 0 && 3557 1.6 chs dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 3558 1.6 chs /* 3559 1.6 chs * The BP for this block has been provided by open context 3560 1.6 chs * (by dmu_sync() or dmu_buf_write_embedded()). 3561 1.6 chs */ 3562 1.6 chs void *contents = (data != NULL) ? data->b_data : NULL; 3563 1.4 haad 3564 1.4 haad dr->dr_zio = zio_write(zio, os->os_spa, txg, 3565 1.6 chs &dr->dr_bp_copy, contents, db->db.db_size, &zp, 3566 1.6 chs dbuf_write_override_ready, NULL, NULL, 3567 1.6 chs dbuf_write_override_done, 3568 1.6 chs dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 3569 1.4 haad mutex_enter(&db->db_mtx); 3570 1.4 haad dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 3571 1.4 haad zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, 3572 1.6 chs dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); 3573 1.4 haad mutex_exit(&db->db_mtx); 3574 1.4 haad } else if (db->db_state == DB_NOFILL) { 3575 1.6 chs ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || 3576 1.6 chs zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); 3577 1.4 haad dr->dr_zio = zio_write(zio, os->os_spa, txg, 3578 1.6 chs &dr->dr_bp_copy, NULL, db->db.db_size, &zp, 3579 1.6 chs dbuf_write_nofill_ready, NULL, NULL, 3580 1.6 chs dbuf_write_nofill_done, db, 3581 1.4 haad ZIO_PRIORITY_ASYNC_WRITE, 3582 1.4 haad ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); 3583 1.4 haad } else { 3584 1.4 haad ASSERT(arc_released(data)); 3585 1.6 chs 3586 1.6 chs /* 
3587 1.6 chs * For indirect blocks, we want to setup the children 3588 1.6 chs * ready callback so that we can properly handle an indirect 3589 1.6 chs * block that only contains holes. 3590 1.6 chs */ 3591 1.6 chs arc_done_func_t *children_ready_cb = NULL; 3592 1.6 chs if (db->db_level != 0) 3593 1.6 chs children_ready_cb = dbuf_write_children_ready; 3594 1.6 chs 3595 1.4 haad dr->dr_zio = arc_write(zio, os->os_spa, txg, 3596 1.6 chs &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), 3597 1.6 chs &zp, dbuf_write_ready, children_ready_cb, 3598 1.6 chs dbuf_write_physdone, dbuf_write_done, db, 3599 1.4 haad ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 3600 1.4 haad } 3601 1.1 haad } 3602