1 1.1 chs /* 2 1.1 chs * CDDL HEADER START 3 1.1 chs * 4 1.1 chs * The contents of this file are subject to the terms of the 5 1.1 chs * Common Development and Distribution License (the "License"). 6 1.1 chs * You may not use this file except in compliance with the License. 7 1.1 chs * 8 1.1 chs * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 1.1 chs * or http://www.opensolaris.org/os/licensing. 10 1.1 chs * See the License for the specific language governing permissions 11 1.1 chs * and limitations under the License. 12 1.1 chs * 13 1.1 chs * When distributing Covered Code, include this CDDL HEADER in each 14 1.1 chs * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 1.1 chs * If applicable, add the following below this CDDL HEADER, with the 16 1.1 chs * fields enclosed by brackets "[]" replaced with your own identifying 17 1.1 chs * information: Portions Copyright [yyyy] [name of copyright owner] 18 1.1 chs * 19 1.1 chs * CDDL HEADER END 20 1.1 chs */ 21 1.1 chs /* 22 1.1 chs * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 23 1.1 chs * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 
24 1.1 chs * Copyright 2016 Gary Mills 25 1.1 chs */ 26 1.1 chs 27 1.1 chs #include <sys/dsl_scan.h> 28 1.1 chs #include <sys/dsl_pool.h> 29 1.1 chs #include <sys/dsl_dataset.h> 30 1.1 chs #include <sys/dsl_prop.h> 31 1.1 chs #include <sys/dsl_dir.h> 32 1.1 chs #include <sys/dsl_synctask.h> 33 1.1 chs #include <sys/dnode.h> 34 1.1 chs #include <sys/dmu_tx.h> 35 1.1 chs #include <sys/dmu_objset.h> 36 1.1 chs #include <sys/arc.h> 37 1.1 chs #include <sys/zap.h> 38 1.1 chs #include <sys/zio.h> 39 1.1 chs #include <sys/zfs_context.h> 40 1.1 chs #include <sys/fs/zfs.h> 41 1.1 chs #include <sys/zfs_znode.h> 42 1.1 chs #include <sys/spa_impl.h> 43 1.1 chs #include <sys/vdev_impl.h> 44 1.1 chs #include <sys/zil_impl.h> 45 1.1 chs #include <sys/zio_checksum.h> 46 1.1 chs #include <sys/ddt.h> 47 1.1 chs #include <sys/sa.h> 48 1.1 chs #include <sys/sa_impl.h> 49 1.1 chs #include <sys/zfeature.h> 50 1.1 chs #ifdef _KERNEL 51 1.1 chs #include <sys/zfs_vfsops.h> 52 1.1 chs #endif 53 1.1 chs /* Signature of a per-block scan callback: (pool, block pointer, bookmark of that block). */ 54 1.1 chs typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, 55 1.1 chs const zbookmark_phys_t *); 56 1.1 chs 57 1.1 chs static scan_cb_t dsl_scan_scrub_cb; 58 1.1 chs static void dsl_scan_cancel_sync(void *, dmu_tx_t *); 59 1.1 chs static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *); 60 1.1 chs static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *); 61 1.1 chs 62 1.1 chs unsigned int zfs_top_maxinflight = 32; /* maximum I/Os per top-level vdev */ 63 1.1 chs unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ 64 1.1 chs unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ 65 1.1 chs unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */ 66 1.1 chs 67 1.1 chs unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ 68 1.1 chs unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ 69 1.1 chs unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver 70 1.1 chs per 
txg */ 71 1.1 chs boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ 72 1.1 chs boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ 73 1.1 chs 74 1.1 chs SYSCTL_DECL(_vfs_zfs); 75 1.1 chs SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, 76 1.1 chs &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev"); 77 1.1 chs SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN, 78 1.1 chs &zfs_resilver_delay, 0, "Number of ticks to delay resilver"); 79 1.1 chs SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN, 80 1.1 chs &zfs_scrub_delay, 0, "Number of ticks to delay scrub"); 81 1.1 chs SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN, 82 1.1 chs &zfs_scan_idle, 0, "Idle scan window in clock ticks"); 83 1.1 chs SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN, 84 1.1 chs &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg"); 85 1.1 chs SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN, 86 1.1 chs &zfs_free_min_time_ms, 0, "Min millisecs to free per txg"); 87 1.1 chs SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN, 88 1.1 chs &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg"); 89 1.1 chs SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN, 90 1.1 chs &zfs_no_scrub_io, 0, "Disable scrub I/O"); 91 1.1 chs SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN, 92 1.1 chs &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching"); 93 1.1 chs /* DDT class limit that dsl_scan_setup_sync() copies into scn_ddt_class_max for scrub and resilver. */ 94 1.1 chs enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; 95 1.1 chs /* max number of blocks to free in a single TXG */ 96 1.1 chs uint64_t zfs_free_max_blocks = UINT64_MAX; 97 1.1 chs SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN, 98 1.1 chs &zfs_free_max_blocks, 0, "Maximum number of blocks to free in one TXG"); 99 1.1 chs 100 1.1 chs 101 1.1 chs #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ 102 1.1 chs ((scn)->scn_phys.scn_func == 
POOL_SCAN_SCRUB || \ 103 1.1 chs (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) 104 1.1 chs 105 1.1 chs extern int zfs_txg_timeout; 106 1.1 chs 107 1.1 chs /* 108 1.1 chs * Enable/disable the processing of the free_bpobj object. 109 1.1 chs */ 110 1.1 chs boolean_t zfs_free_bpobj_enabled = B_TRUE; 111 1.1 chs 112 1.1 chs SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN, 113 1.1 chs &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing"); 114 1.1 chs 115 1.1 chs /* the order has to match pool_scan_type */ 116 1.1 chs static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { 117 1.1 chs NULL, 118 1.1 chs dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ 119 1.1 chs dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ 120 1.1 chs }; 121 1.1 chs /* Allocate dp->dp_scan and load any persisted scan state from the MOS pool directory. A legacy "scrub_func" entry, or a DSS_SCANNING state on a pool whose previous software version is below SPA_VERSION_SCAN, schedules a restart in txg. Returns 0 on success, including when no DMU_POOL_SCAN entry exists (ENOENT); any other zap_lookup() error is returned as is. */ 122 1.1 chs int 123 1.1 chs dsl_scan_init(dsl_pool_t *dp, uint64_t txg) 124 1.1 chs { 125 1.1 chs int err; 126 1.1 chs dsl_scan_t *scn; 127 1.1 chs spa_t *spa = dp->dp_spa; 128 1.1 chs uint64_t f; 129 1.1 chs 130 1.1 chs scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); 131 1.1 chs scn->scn_dp = dp; 132 1.1 chs 133 1.1 chs /* 134 1.1 chs * It's possible that we're resuming a scan after a reboot so 135 1.1 chs * make sure that the scan_async_destroying flag is initialized 136 1.1 chs * appropriately. 137 1.1 chs */ 138 1.1 chs ASSERT(!scn->scn_async_destroying); 139 1.1 chs scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, 140 1.1 chs SPA_FEATURE_ASYNC_DESTROY); 141 1.1 chs 142 1.1 chs err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 143 1.1 chs "scrub_func", sizeof (uint64_t), 1, &f); 144 1.1 chs if (err == 0) { 145 1.1 chs /* 146 1.1 chs * There was an old-style scrub in progress. Restart a 147 1.1 chs * new-style scrub from the beginning. 
148 1.1 chs */ 149 1.1 chs scn->scn_restart_txg = txg; 150 1.1 chs zfs_dbgmsg("old-style scrub was in progress; " 151 1.1 chs "restarting new-style scrub in txg %llu", 152 1.1 chs scn->scn_restart_txg); 153 1.1 chs 154 1.1 chs /* 155 1.1 chs * Load the queue obj from the old location so that it 156 1.1 chs * can be freed by dsl_scan_done(). 157 1.1 chs */ 158 1.1 chs (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 159 1.1 chs "scrub_queue", sizeof (uint64_t), 1, 160 1.1 chs &scn->scn_phys.scn_queue_obj); 161 1.1 chs } else { 162 1.1 chs err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 163 1.1 chs DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, 164 1.1 chs &scn->scn_phys); 165 1.1 chs if (err == ENOENT) 166 1.1 chs return (0); 167 1.1 chs else if (err) 168 1.1 chs return (err); 169 1.1 chs 170 1.1 chs if (scn->scn_phys.scn_state == DSS_SCANNING && 171 1.1 chs spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { 172 1.1 chs /* 173 1.1 chs * A new-type scrub was in progress on an old 174 1.1 chs * pool, and the pool was accessed by old 175 1.1 chs * software. Restart from the beginning, since 176 1.1 chs * the old software may have changed the pool in 177 1.1 chs * the meantime. 
178 1.1 chs */ 179 1.1 chs scn->scn_restart_txg = txg; 180 1.1 chs zfs_dbgmsg("new-style scrub was modified " 181 1.1 chs "by old software; restarting in txg %llu", 182 1.1 chs scn->scn_restart_txg); 183 1.1 chs } 184 1.1 chs } 185 1.1 chs 186 1.1 chs spa_scan_stat_init(spa); 187 1.1 chs return (0); 188 1.1 chs } 189 1.1 chs /* Free the dsl_scan_t attached to dp, if any, and clear dp->dp_scan. */ 190 1.1 chs void 191 1.1 chs dsl_scan_fini(dsl_pool_t *dp) 192 1.1 chs { 193 1.1 chs if (dp->dp_scan) { 194 1.1 chs kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); 195 1.1 chs dp->dp_scan = NULL; 196 1.1 chs } 197 1.1 chs } 198 1.1 chs /* Returns EBUSY if a scan is already in DSS_SCANNING state, otherwise 0. */ 199 1.1 chs /* ARGSUSED */ 200 1.1 chs static int 201 1.1 chs dsl_scan_setup_check(void *arg, dmu_tx_t *tx) 202 1.1 chs { 203 1.1 chs dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; 204 1.1 chs 205 1.1 chs if (scn->scn_phys.scn_state == DSS_SCANNING) 206 1.1 chs return (SET_ERROR(EBUSY)); 207 1.1 chs 208 1.1 chs return (0); 209 1.1 chs } 210 1.1 chs /* Begin a new scan of the type pointed to by arg (a pool_scan_func_t): reset scn_phys, mark it DSS_SCANNING with scn_max_txg set to this tx's txg, create the queue ZAP object, persist the state and log "scan setup" to the pool history. */ 211 1.1 chs static void 212 1.1 chs dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) 213 1.1 chs { 214 1.1 chs dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; 215 1.1 chs pool_scan_func_t *funcp = arg; 216 1.1 chs dmu_object_type_t ot = 0; 217 1.1 chs dsl_pool_t *dp = scn->scn_dp; 218 1.1 chs spa_t *spa = dp->dp_spa; 219 1.1 chs 220 1.1 chs ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); 221 1.1 chs ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); 222 1.1 chs bzero(&scn->scn_phys, sizeof (scn->scn_phys)); 223 1.1 chs scn->scn_phys.scn_func = *funcp; 224 1.1 chs scn->scn_phys.scn_state = DSS_SCANNING; 225 1.1 chs scn->scn_phys.scn_min_txg = 0; 226 1.1 chs scn->scn_phys.scn_max_txg = tx->tx_txg; 227 1.1 chs scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ 228 1.1 chs scn->scn_phys.scn_start_time = gethrestime_sec(); 229 1.1 chs scn->scn_phys.scn_errors = 0; 230 1.1 chs scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; 231 1.1 chs scn->scn_restart_txg = 0; 232 1.1 chs scn->scn_done_txg = 0; 233 1.1 chs spa_scan_stat_init(spa); 
234 1.1 chs 235 1.1 chs if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { 236 1.1 chs scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; 237 1.1 chs 238 1.1 chs /* rewrite all disk labels */ 239 1.1 chs vdev_config_dirty(spa->spa_root_vdev); 240 1.1 chs 241 1.1 chs if (vdev_resilver_needed(spa->spa_root_vdev, 242 1.1 chs &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { 243 1.1 chs spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); 244 1.1 chs } else { 245 1.1 chs spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START); 246 1.1 chs } 247 1.1 chs 248 1.1 chs spa->spa_scrub_started = B_TRUE; 249 1.1 chs /* 250 1.1 chs * If this is an incremental scrub, limit the DDT scrub phase 251 1.1 chs * to just the auto-ditto class (for correctness); the rest 252 1.1 chs * of the scrub should go faster using top-down pruning. 253 1.1 chs */ 254 1.1 chs if (scn->scn_phys.scn_min_txg > TXG_INITIAL) 255 1.1 chs scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; 256 1.1 chs 257 1.1 chs } 258 1.1 chs 259 1.1 chs /* back to the generic stuff */ 260 1.1 chs 261 1.1 chs if (dp->dp_blkstats == NULL) { 262 1.1 chs dp->dp_blkstats = 263 1.1 chs kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); 264 1.1 chs } 265 1.1 chs bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); 266 1.1 chs /* pools older than SPA_VERSION_DSL_SCRUB get a DMU_OT_ZAP_OTHER queue object instead of DMU_OT_SCAN_QUEUE */ 267 1.1 chs if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) 268 1.1 chs ot = DMU_OT_ZAP_OTHER; 269 1.1 chs 270 1.1 chs scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, 271 1.1 chs ot ? 
ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); 272 1.1 chs 273 1.1 chs dsl_scan_sync_state(scn, tx); 274 1.1 chs 275 1.1 chs spa_history_log_internal(spa, "scan setup", tx, 276 1.1 chs "func=%u mintxg=%llu maxtxg=%llu", 277 1.1 chs *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); 278 1.1 chs } 279 1.1 chs /* Finish the current scan. Always removes leftover old-style scrub entries and frees the queue object; if the scan was DSS_SCANNING it then becomes DSS_FINISHED (complete) or DSS_CANCELED, the outcome is logged to the pool history, and for scrub/resilver in-flight scrub I/O is drained and DTLs are reassessed. scn_phys is only updated in core here; dsl_scan_cancel_sync() persists it with dsl_scan_sync_state() afterwards. */ 280 1.1 chs /* ARGSUSED */ 281 1.1 chs static void 282 1.1 chs dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) 283 1.1 chs { 284 1.1 chs static const char *old_names[] = { 285 1.1 chs "scrub_bookmark", 286 1.1 chs "scrub_ddt_bookmark", 287 1.1 chs "scrub_ddt_class_max", 288 1.1 chs "scrub_queue", 289 1.1 chs "scrub_min_txg", 290 1.1 chs "scrub_max_txg", 291 1.1 chs "scrub_func", 292 1.1 chs "scrub_errors", 293 1.1 chs NULL 294 1.1 chs }; 295 1.1 chs 296 1.1 chs dsl_pool_t *dp = scn->scn_dp; 297 1.1 chs spa_t *spa = dp->dp_spa; 298 1.1 chs int i; 299 1.1 chs 300 1.1 chs /* Remove any remnants of an old-style scrub. */ 301 1.1 chs for (i = 0; old_names[i]; i++) { 302 1.1 chs (void) zap_remove(dp->dp_meta_objset, 303 1.1 chs DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); 304 1.1 chs } 305 1.1 chs 306 1.1 chs if (scn->scn_phys.scn_queue_obj != 0) { 307 1.1 chs VERIFY(0 == dmu_object_free(dp->dp_meta_objset, 308 1.1 chs scn->scn_phys.scn_queue_obj, tx)); 309 1.1 chs scn->scn_phys.scn_queue_obj = 0; 310 1.1 chs } 311 1.1 chs 312 1.1 chs /* 313 1.1 chs * If we were "restarted" from a stopped state, don't bother 314 1.1 chs * with anything else. 
315 1.1 chs */ 316 1.1 chs if (scn->scn_phys.scn_state != DSS_SCANNING) 317 1.1 chs return; 318 1.1 chs 319 1.1 chs if (complete) 320 1.1 chs scn->scn_phys.scn_state = DSS_FINISHED; 321 1.1 chs else 322 1.1 chs scn->scn_phys.scn_state = DSS_CANCELED; 323 1.1 chs 324 1.1 chs if (dsl_scan_restarting(scn, tx)) 325 1.1 chs spa_history_log_internal(spa, "scan aborted, restarting", tx, 326 1.1 chs "errors=%llu", spa_get_errlog_size(spa)); 327 1.1 chs else if (!complete) 328 1.1 chs spa_history_log_internal(spa, "scan cancelled", tx, 329 1.1 chs "errors=%llu", spa_get_errlog_size(spa)); 330 1.1 chs else 331 1.1 chs spa_history_log_internal(spa, "scan done", tx, 332 1.1 chs "errors=%llu", spa_get_errlog_size(spa)); 333 1.1 chs 334 1.1 chs if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { 335 1.1 chs mutex_enter(&spa->spa_scrub_lock); /* wait for outstanding scrub I/O to drain */ 336 1.1 chs while (spa->spa_scrub_inflight > 0) { 337 1.1 chs cv_wait(&spa->spa_scrub_io_cv, 338 1.1 chs &spa->spa_scrub_lock); 339 1.1 chs } 340 1.1 chs mutex_exit(&spa->spa_scrub_lock); 341 1.1 chs spa->spa_scrub_started = B_FALSE; 342 1.1 chs spa->spa_scrub_active = B_FALSE; 343 1.1 chs 344 1.1 chs /* 345 1.1 chs * If the scrub/resilver completed, update all DTLs to 346 1.1 chs * reflect this. Whether it succeeded or not, vacate 347 1.1 chs * all temporary scrub DTLs. 348 1.1 chs */ 349 1.1 chs vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, 350 1.1 chs complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE); 351 1.1 chs if (complete) { 352 1.1 chs spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ? 353 1.1 chs ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); 354 1.1 chs } 355 1.1 chs spa_errlog_rotate(spa); 356 1.1 chs 357 1.1 chs /* 358 1.1 chs * We may have finished replacing a device. 359 1.1 chs * Let the async thread assess this and handle the detach. 
360 1.1 chs */ 361 1.1 chs spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 362 1.1 chs } 363 1.1 chs 364 1.1 chs scn->scn_phys.scn_end_time = gethrestime_sec(); 365 1.1 chs } 366 1.1 chs /* Returns ENOENT unless a scan is currently in DSS_SCANNING state. */ 367 1.1 chs /* ARGSUSED */ 368 1.1 chs static int 369 1.1 chs dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) 370 1.1 chs { 371 1.1 chs dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; 372 1.1 chs 373 1.1 chs if (scn->scn_phys.scn_state != DSS_SCANNING) 374 1.1 chs return (SET_ERROR(ENOENT)); 375 1.1 chs return (0); 376 1.1 chs } 377 1.1 chs /* Mark the running scan as canceled (dsl_scan_done() with complete == B_FALSE) and persist the new state. */ 378 1.1 chs /* ARGSUSED */ 379 1.1 chs static void 380 1.1 chs dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) 381 1.1 chs { 382 1.1 chs dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; 383 1.1 chs 384 1.1 chs dsl_scan_done(scn, B_FALSE, tx); 385 1.1 chs dsl_scan_sync_state(scn, tx); 386 1.1 chs } 387 1.1 chs /* Cancel the scan in progress on dp by running dsl_scan_cancel_check()/dsl_scan_cancel_sync() as a sync task; returns the dsl_sync_task() result. */ 388 1.1 chs int 389 1.1 chs dsl_scan_cancel(dsl_pool_t *dp) 390 1.1 chs { 391 1.1 chs return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, 392 1.1 chs dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); 393 1.1 chs } 394 1.1 chs 395 1.1 chs static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, 396 1.1 chs dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, 397 1.1 chs dmu_objset_type_t ostype, dmu_tx_t *tx); 398 1.1 chs static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds, 399 1.1 chs dmu_objset_type_t ostype, 400 1.3 jdolecek dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx, zbookmark_phys_t *); 401 1.1 chs /* Free block bp in txg; thin wrapper around zio_free(). */ 402 1.1 chs void 403 1.1 chs dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) 404 1.1 chs { 405 1.1 chs zio_free(dp->dp_spa, txg, bp); 406 1.1 chs } 407 1.1 chs /* Issue zio_free_sync() for bpp with pio's flags and do not wait for it; asserts that the caller is in pool sync context. */ 408 1.1 chs void 409 1.1 chs dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) 410 1.1 chs { 411 1.1 chs ASSERT(dsl_pool_sync_context(dp)); 412 1.1 chs zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp), 413 1.1 chs pio->io_flags)); 414 1.1 chs } 415 1.1 chs /* Upper txg bound when scanning ds: the scan's scn_max_txg, clamped to ds_creation_txg for snapshots. */ 416 1.1 chs static 
uint64_t 417 1.1 chs dsl_scan_ds_maxtxg(dsl_dataset_t *ds) 418 1.1 chs { 419 1.1 chs uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; 420 1.1 chs if (ds->ds_is_snapshot) 421 1.1 chs return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); 422 1.1 chs return (smt); 423 1.1 chs } 424 1.1 chs /* Store scn_phys under DMU_POOL_SCAN in the MOS pool directory; a failure trips VERIFY0. */ 425 1.1 chs static void 426 1.1 chs dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) 427 1.1 chs { 428 1.1 chs VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, 429 1.1 chs DMU_POOL_DIRECTORY_OBJECT, 430 1.1 chs DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, 431 1.1 chs &scn->scn_phys, tx)); 432 1.1 chs } 433 1.1 chs 434 1.1 chs extern int zfs_vdev_async_write_active_min_dirty_percent; 435 1.1 chs /* Decide whether the scan should stop visiting blocks for this txg. Returns B_TRUE when pausing; on the transition it sets scn_pausing and, if zb is non-NULL, records *zb as the bookmark to resume from. zb may be NULL. */ 436 1.1 chs static boolean_t 437 1.1 chs dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb) 438 1.1 chs { 439 1.1 chs /* we never skip user/group accounting objects */ 440 1.1 chs if (zb && (int64_t)zb->zb_object < 0) 441 1.1 chs return (B_FALSE); 442 1.1 chs 443 1.1 chs if (scn->scn_pausing) 444 1.1 chs return (B_TRUE); /* we're already pausing */ 445 1.1 chs 446 1.1 chs if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) 447 1.1 chs return (B_FALSE); /* we're resuming */ 448 1.1 chs 449 1.1 chs /* We only know how to resume from level-0 blocks. */ 450 1.1 chs if (zb && zb->zb_level != 0) 451 1.1 chs return (B_FALSE); 452 1.1 chs 453 1.1 chs /* 454 1.1 chs * We pause if: 455 1.1 chs * - we have scanned for the maximum time: an entire txg 456 1.1 chs * timeout (default 5 sec) 457 1.1 chs * or 458 1.1 chs * - we have scanned for at least the minimum time (default 1 sec 459 1.1 chs * for scrub, 3 sec for resilver), and either we have sufficient 460 1.1 chs * dirty data that we are starting to write more quickly 461 1.1 chs * (default 30%), or someone is explicitly waiting for this txg 462 1.1 chs * to complete. 463 1.1 chs * or 464 1.1 chs * - the spa is shutting down because this pool is being exported 465 1.1 chs * or the machine is rebooting. 
466 1.1 chs */ 467 1.1 chs int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? 468 1.1 chs zfs_resilver_min_time_ms : zfs_scan_min_time_ms; 469 1.1 chs uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; 470 1.1 chs int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; 471 1.1 chs if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout || 472 1.1 chs (NSEC2MSEC(elapsed_nanosecs) > mintime && 473 1.1 chs (txg_sync_waiting(scn->scn_dp) || 474 1.1 chs dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) || 475 1.1 chs spa_shutting_down(scn->scn_dp->dp_spa)) { 476 1.1 chs if (zb) { 477 1.1 chs dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", 478 1.1 chs (longlong_t)zb->zb_objset, 479 1.1 chs (longlong_t)zb->zb_object, 480 1.1 chs (longlong_t)zb->zb_level, 481 1.1 chs (longlong_t)zb->zb_blkid); 482 1.1 chs scn->scn_phys.scn_bookmark = *zb; 483 1.1 chs } 484 1.1 chs dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", 485 1.1 chs (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, 486 1.1 chs (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, 487 1.1 chs (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, 488 1.1 chs (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); 489 1.1 chs scn->scn_pausing = B_TRUE; 490 1.1 chs return (B_TRUE); 491 1.1 chs } 492 1.1 chs return (B_FALSE); 493 1.1 chs } 494 1.1 chs /* Argument bundle passed through zil_parse() to dsl_scan_zil_block() and dsl_scan_zil_record(). */ 495 1.1 chs typedef struct zil_scan_arg { 496 1.1 chs dsl_pool_t *zsa_dp; 497 1.1 chs zil_header_t *zsa_zh; 498 1.1 chs } zil_scan_arg_t; 499 1.1 chs /* zil_parse() block callback: hand a ZIL log block to the active scan callback unless it is a hole, was born at or before scn_cur_min_txg, or is an unclaimed block born in or after spa_first_txg(). Always returns 0. */ 500 1.1 chs /* ARGSUSED */ 501 1.1 chs static int 502 1.1 chs dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 503 1.1 chs { 504 1.1 chs zil_scan_arg_t *zsa = arg; 505 1.1 chs dsl_pool_t *dp = zsa->zsa_dp; 506 1.1 chs dsl_scan_t *scn = dp->dp_scan; 507 1.1 chs zil_header_t *zh = zsa->zsa_zh; 508 1.1 chs zbookmark_phys_t zb; 509 1.1 chs 510 1.1 chs if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) 511 1.1 chs 
return (0); 512 1.1 chs 513 1.1 chs /* 514 1.1 chs * One block ("stubby") can be allocated a long time ago; we 515 1.1 chs * want to visit that one because it has been allocated 516 1.1 chs * (on-disk) even if it hasn't been claimed (even though for 517 1.1 chs * scrub there's nothing to do to it). 518 1.1 chs */ 519 1.1 chs if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) 520 1.1 chs return (0); 521 1.1 chs 522 1.1 chs SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], 523 1.1 chs ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); 524 1.1 chs 525 1.1 chs VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); 526 1.1 chs return (0); 527 1.1 chs } 528 1.1 chs /* zil_parse() record callback: for TX_WRITE records, hand the embedded block pointer to the active scan callback unless it is a hole, was born at or before scn_cur_min_txg, the log is unclaimed, or the block was born before claim_txg. Always returns 0. */ 529 1.1 chs /* ARGSUSED */ 530 1.1 chs static int 531 1.1 chs dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) 532 1.1 chs { 533 1.1 chs if (lrc->lrc_txtype == TX_WRITE) { 534 1.1 chs zil_scan_arg_t *zsa = arg; 535 1.1 chs dsl_pool_t *dp = zsa->zsa_dp; 536 1.1 chs dsl_scan_t *scn = dp->dp_scan; 537 1.1 chs zil_header_t *zh = zsa->zsa_zh; 538 1.1 chs lr_write_t *lr = (lr_write_t *)lrc; 539 1.1 chs blkptr_t *bp = &lr->lr_blkptr; 540 1.1 chs zbookmark_phys_t zb; 541 1.1 chs 542 1.1 chs if (BP_IS_HOLE(bp) || 543 1.1 chs bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) 544 1.1 chs return (0); 545 1.1 chs 546 1.1 chs /* 547 1.1 chs * birth can be < claim_txg if this record's txg is 548 1.1 chs * already txg sync'ed (but this log block contains 549 1.1 chs * other records that are not synced) 550 1.1 chs */ 551 1.1 chs if (claim_txg == 0 || bp->blk_birth < claim_txg) 552 1.1 chs return (0); 553 1.1 chs 554 1.1 chs SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], 555 1.1 chs lr->lr_foid, ZB_ZIL_LEVEL, 556 1.1 chs lr->lr_offset / BP_GET_LSIZE(bp)); 557 1.1 chs 558 1.1 chs VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); 559 1.1 chs } 560 1.1 chs return (0); 561 1.1 chs } 562 1.1 chs /* Scan the intent log described by zh by parsing it with dsl_scan_zil_block() and dsl_scan_zil_record(). Does nothing for an unclaimed log on a writeable pool. */ 563 1.1 chs static void 564 1.1 
chs dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) 565 1.1 chs { 566 1.1 chs uint64_t claim_txg = zh->zh_claim_txg; 567 1.1 chs zil_scan_arg_t zsa = { dp, zh }; 568 1.1 chs zilog_t *zilog; 569 1.1 chs 570 1.1 chs /* 571 1.1 chs * We only want to visit blocks that have been claimed but not yet 572 1.1 chs * replayed (or, in read-only mode, blocks that *would* be claimed). 573 1.1 chs */ 574 1.1 chs if (claim_txg == 0 && spa_writeable(dp->dp_spa)) 575 1.1 chs return; 576 1.1 chs 577 1.1 chs zilog = zil_alloc(dp->dp_meta_objset, zh); 578 1.1 chs 579 1.1 chs (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, 580 1.1 chs claim_txg); 581 1.1 chs 582 1.1 chs zil_free(zilog); 583 1.1 chs } 584 1.1 chs /* Start a no-wait ARC prefetch of bp. Skipped when zfs_no_scrub_prefetch is set, for holes, for blocks born at or before scn_min_txg, and for level-0 blocks that are not dnode blocks. czb is caller-provided storage that is overwritten with the child's bookmark. */ 585 1.1 chs /* ARGSUSED */ 586 1.1 chs static void 587 1.1 chs dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, 588 1.3 jdolecek uint64_t objset, uint64_t object, uint64_t blkid, zbookmark_phys_t *czb) 589 1.1 chs { 590 1.1 chs arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; 591 1.1 chs 592 1.1 chs if (zfs_no_scrub_prefetch) 593 1.1 chs return; 594 1.1 chs 595 1.1 chs if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg || 596 1.1 chs (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) 597 1.1 chs return; 598 1.1 chs 599 1.3 jdolecek SET_BOOKMARK(czb, objset, object, BP_GET_LEVEL(bp), blkid); 600 1.1 chs 601 1.1 chs (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, 602 1.1 chs NULL, NULL, ZIO_PRIORITY_ASYNC_READ, 603 1.3 jdolecek ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, czb); 604 1.1 chs } 605 1.1 chs /* Returns B_TRUE if zb's subtree was already completed before the saved resume bookmark, so the caller can skip it. Clears the saved bookmark once the resume point has been reached or passed. */ 606 1.1 chs static boolean_t 607 1.1 chs dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, 608 1.1 chs const zbookmark_phys_t *zb) 609 1.1 chs { 610 1.1 chs /* 611 1.1 chs * We never skip over user/group accounting objects (obj<0) 612 1.1 chs */ 613 1.1 chs if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && 614 1.1 chs (int64_t)zb->zb_object >= 0) { 615 1.1 chs /* 616 1.1 chs * If we 
already visited this bp & everything below (in 617 1.1 chs * a prior txg sync), don't bother doing it again. 618 1.1 chs */ 619 1.1 chs if (zbookmark_subtree_completed(dnp, zb, 620 1.1 chs &scn->scn_phys.scn_bookmark)) 621 1.1 chs return (B_TRUE); 622 1.1 chs 623 1.1 chs /* 624 1.1 chs * If we found the block we're trying to resume from, or 625 1.1 chs * we went past it to a different object, zero it out to 626 1.1 chs * indicate that it's OK to start checking for pausing 627 1.1 chs * again. 628 1.1 chs */ 629 1.1 chs if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || 630 1.1 chs zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { 631 1.1 chs dprintf("resuming at %llx/%llx/%llx/%llx\n", 632 1.1 chs (longlong_t)zb->zb_objset, 633 1.1 chs (longlong_t)zb->zb_object, 634 1.1 chs (longlong_t)zb->zb_level, 635 1.1 chs (longlong_t)zb->zb_blkid); 636 1.1 chs bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); 637 1.1 chs } 638 1.1 chs } 639 1.1 chs return (B_FALSE); 640 1.1 chs } 641 1.1 chs 642 1.1 chs /* 643 1.1 chs * Read bp and visit what it points to; only indirect, dnode and objset blocks are read. 644 1.1 chs * Return nonzero on i/o error, 0 otherwise. 
645 1.1 chs */ 646 1.1 chs static int 647 1.1 chs dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, 648 1.1 chs dnode_phys_t *dnp, const blkptr_t *bp, 649 1.1 chs const zbookmark_phys_t *zb, dmu_tx_t *tx) 650 1.1 chs { 651 1.1 chs dsl_pool_t *dp = scn->scn_dp; 652 1.1 chs int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; 653 1.1 chs int err; 654 1.1 chs 655 1.1 chs if (BP_GET_LEVEL(bp) > 0) { 656 1.1 chs arc_flags_t flags = ARC_FLAG_WAIT; 657 1.1 chs int i; 658 1.1 chs blkptr_t *cbp; 659 1.1 chs int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; 660 1.1 chs arc_buf_t *buf; 661 1.3 jdolecek zbookmark_phys_t *czb; 662 1.1 chs 663 1.1 chs err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, 664 1.1 chs ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); 665 1.1 chs if (err) { 666 1.1 chs scn->scn_phys.scn_errors++; 667 1.1 chs return (err); 668 1.1 chs } 669 1.3 jdolecek czb = kmem_alloc(sizeof (*czb), KM_SLEEP); 670 1.1 chs for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { 671 1.1 chs dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, 672 1.3 jdolecek zb->zb_object, zb->zb_blkid * epb + i, czb); 673 1.1 chs } 674 1.1 chs for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { 675 1.3 jdolecek SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, 676 1.1 chs zb->zb_level - 1, 677 1.1 chs zb->zb_blkid * epb + i); 678 1.3 jdolecek dsl_scan_visitbp(cbp, czb, dnp, 679 1.1 chs ds, scn, ostype, tx); 680 1.1 chs } 681 1.3 jdolecek kmem_free(czb, sizeof (*czb)); 682 1.1 chs arc_buf_destroy(buf, &buf); 683 1.1 chs } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { 684 1.1 chs arc_flags_t flags = ARC_FLAG_WAIT; 685 1.1 chs dnode_phys_t *cdnp; 686 1.1 chs int i, j; 687 1.1 chs int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; 688 1.1 chs arc_buf_t *buf; 689 1.3 jdolecek zbookmark_phys_t *czb; 690 1.1 chs 691 1.1 chs err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, 692 1.1 chs ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); 693 1.1 chs if (err) { 694 
1.1 chs scn->scn_phys.scn_errors++; 695 1.1 chs return (err); 696 1.1 chs } 697 1.3 jdolecek czb = kmem_alloc(sizeof (*czb), KM_SLEEP); 698 1.1 chs for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { 699 1.1 chs for (j = 0; j < cdnp->dn_nblkptr; j++) { 700 1.1 chs blkptr_t *cbp = &cdnp->dn_blkptr[j]; 701 1.1 chs dsl_scan_prefetch(scn, buf, cbp, 702 1.3 jdolecek zb->zb_objset, zb->zb_blkid * epb + i, j, 703 1.3 jdolecek czb); 704 1.1 chs } 705 1.1 chs } 706 1.1 chs for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { 707 1.1 chs dsl_scan_visitdnode(scn, ds, ostype, 708 1.3 jdolecek cdnp, zb->zb_blkid * epb + i, tx, czb); 709 1.1 chs } 710 1.3 jdolecek kmem_free(czb, sizeof (*czb)); 711 1.1 chs 712 1.1 chs arc_buf_destroy(buf, &buf); 713 1.1 chs } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { 714 1.1 chs arc_flags_t flags = ARC_FLAG_WAIT; 715 1.1 chs objset_phys_t *osp; 716 1.1 chs arc_buf_t *buf; 717 1.3 jdolecek zbookmark_phys_t *czb; 718 1.1 chs 719 1.1 chs err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, 720 1.1 chs ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); 721 1.1 chs if (err) { 722 1.1 chs scn->scn_phys.scn_errors++; 723 1.1 chs return (err); 724 1.1 chs } 725 1.1 chs 726 1.1 chs osp = buf->b_data; 727 1.1 chs 728 1.3 jdolecek czb = kmem_alloc(sizeof (*czb), KM_SLEEP); 729 1.1 chs dsl_scan_visitdnode(scn, ds, osp->os_type, 730 1.3 jdolecek &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx, czb); 731 1.1 chs 732 1.1 chs if (OBJSET_BUF_HAS_USERUSED(buf)) { 733 1.1 chs /* 734 1.1 chs * We also always visit user/group accounting 735 1.1 chs * objects, and never skip them, even if we are 736 1.1 chs * pausing. This is necessary so that the space 737 1.1 chs * deltas from this txg get integrated. 
738 1.1 chs */ 739 1.1 chs dsl_scan_visitdnode(scn, ds, osp->os_type, 740 1.1 chs &osp->os_groupused_dnode, 741 1.3 jdolecek DMU_GROUPUSED_OBJECT, tx, czb); 742 1.1 chs dsl_scan_visitdnode(scn, ds, osp->os_type, 743 1.1 chs &osp->os_userused_dnode, 744 1.3 jdolecek DMU_USERUSED_OBJECT, tx, czb); 745 1.1 chs } 746 1.3 jdolecek kmem_free(czb, sizeof (*czb)); 747 1.1 chs arc_buf_destroy(buf, &buf); 748 1.1 chs } 749 1.1 chs 750 1.1 chs return (0); 751 1.1 chs } 752 1.1 chs /* Visit every block pointer of dnode dnp, plus its spill block pointer when DNODE_FLAG_SPILL_BLKPTR is set. czb is caller-provided scratch space for each child's bookmark. */ 753 1.1 chs static void 754 1.1 chs dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, 755 1.1 chs dmu_objset_type_t ostype, dnode_phys_t *dnp, 756 1.3 jdolecek uint64_t object, dmu_tx_t *tx, zbookmark_phys_t *czb) 757 1.1 chs { 758 1.1 chs int j; 759 1.1 chs 760 1.1 chs for (j = 0; j < dnp->dn_nblkptr; j++) { 761 1.3 jdolecek SET_BOOKMARK(czb, ds ? ds->ds_object : 0, object, 762 1.1 chs dnp->dn_nlevels - 1, j); 763 1.1 chs dsl_scan_visitbp(&dnp->dn_blkptr[j], 764 1.3 jdolecek czb, dnp, ds, scn, ostype, tx); 765 1.1 chs } 766 1.1 chs 767 1.1 chs if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { 768 1.3 jdolecek SET_BOOKMARK(czb, ds ? ds->ds_object : 0, object, 769 1.1 chs 0, DMU_SPILL_BLKID); 770 1.1 chs dsl_scan_visitbp(&dnp->dn_spill, 771 1.3 jdolecek czb, dnp, ds, scn, ostype, tx); 772 1.1 chs } 773 1.1 chs } 774 1.1 chs 775 1.1 chs /* 776 1.1 chs * The arguments are in this order because mdb can only print the 777 1.1 chs * first 5; we want them to be useful. 
778 1.1 chs */ 779 1.1 chs static void 780 1.1 chs dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, 781 1.1 chs dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, 782 1.1 chs dmu_objset_type_t ostype, dmu_tx_t *tx) 783 1.1 chs { 784 1.1 chs dsl_pool_t *dp = scn->scn_dp; 785 1.2 jdolecek blkptr_t *bp_toread = NULL; 786 1.1 chs 787 1.1 chs if (dsl_scan_check_pause(scn, zb)) 788 1.1 chs return; 789 1.1 chs 790 1.1 chs if (dsl_scan_check_resume(scn, dnp, zb)) 791 1.1 chs return; 792 1.1 chs 793 1.1 chs if (BP_IS_HOLE(bp)) 794 1.1 chs return; 795 1.1 chs 796 1.1 chs scn->scn_visited_this_txg++; 797 1.1 chs 798 1.1 chs dprintf_bp(bp, 799 1.1 chs "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", 800 1.1 chs ds, ds ? ds->ds_object : 0, 801 1.1 chs zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, 802 1.1 chs bp); 803 1.1 chs 804 1.1 chs if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) 805 1.1 chs return; 806 1.1 chs /* recurse on a private copy of *bp, freed at out: (NOTE(review): presumably heap-allocated to keep this recursive function's stack frame small - confirm) */ 807 1.2 jdolecek bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); 808 1.2 jdolecek *bp_toread = *bp; 809 1.2 jdolecek 810 1.2 jdolecek if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0) 811 1.2 jdolecek goto out; 812 1.1 chs 813 1.1 chs /* 814 1.1 chs * If dsl_scan_ddt() has already visited this block, it will have 815 1.1 chs * already done any translations or scrubbing, so don't call the 816 1.1 chs * callback again. 817 1.1 chs */ 818 1.1 chs if (ddt_class_contains(dp->dp_spa, 819 1.1 chs scn->scn_phys.scn_ddt_class_max, bp)) { 820 1.2 jdolecek goto out; 821 1.1 chs } 822 1.1 chs 823 1.1 chs /* 824 1.1 chs * If this block is from the future (after cur_max_txg), then we 825 1.1 chs * are doing this on behalf of a deleted snapshot, and we will 826 1.1 chs * revisit the future block on the next pass of this dataset. 827 1.1 chs * Don't scan it now unless we need to because something 828 1.1 chs * under it was modified. 
829 1.1 chs */ 830 1.1 chs if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) { 831 1.1 chs scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); 832 1.1 chs } 833 1.2 jdolecek 834 1.2 jdolecek out: 835 1.2 jdolecek kmem_free(bp_toread, sizeof (blkptr_t)); 836 1.1 chs } 837 1.1 chs /* Visit the tree rooted at bp using a root bookmark for ds, or for DMU_META_OBJSET when ds is NULL. */ 838 1.1 chs static void 839 1.1 chs dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, 840 1.1 chs dmu_tx_t *tx) 841 1.1 chs { 842 1.1 chs zbookmark_phys_t zb; 843 1.1 chs 844 1.1 chs SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 845 1.1 chs ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 846 1.1 chs dsl_scan_visitbp(bp, &zb, NULL, 847 1.1 chs ds, scn, DMU_OST_NONE, tx); 848 1.1 chs 849 1.1 chs dprintf_ds(ds, "finished scan%s", ""); 850 1.1 chs } 851 1.1 chs 852 1.1 chs void 853 1.1 chs dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) 854 1.1 chs { 855 1.1 chs dsl_pool_t *dp = ds->ds_dir->dd_pool; 856 1.1 chs dsl_scan_t *scn = dp->dp_scan; 857 1.1 chs uint64_t mintxg; 858 1.1 chs 859 1.1 chs if (scn->scn_phys.scn_state != DSS_SCANNING) 860 1.1 chs return; 861 1.1 chs 862 1.1 chs if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { 863 1.1 chs if (ds->ds_is_snapshot) { 864 1.1 chs /* 865 1.1 chs * Note: 866 1.1 chs * - scn_cur_{min,max}_txg stays the same. 867 1.1 chs * - Setting the flag is not really necessary if 868 1.1 chs * scn_cur_max_txg == scn_max_txg, because there 869 1.1 chs * is nothing after this snapshot that we care 870 1.1 chs * about. However, we set it anyway and then 871 1.1 chs * ignore it when we retraverse it in 872 1.1 chs * dsl_scan_visitds(). 
873 1.1 chs */ 874 1.1 chs scn->scn_phys.scn_bookmark.zb_objset = 875 1.1 chs dsl_dataset_phys(ds)->ds_next_snap_obj; 876 1.1 chs zfs_dbgmsg("destroying ds %llu; currently traversing; " 877 1.1 chs "reset zb_objset to %llu", 878 1.1 chs (u_longlong_t)ds->ds_object, 879 1.1 chs (u_longlong_t)dsl_dataset_phys(ds)-> 880 1.1 chs ds_next_snap_obj); 881 1.1 chs scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN; 882 1.1 chs } else { 883 1.1 chs SET_BOOKMARK(&scn->scn_phys.scn_bookmark, 884 1.1 chs ZB_DESTROYED_OBJSET, 0, 0, 0); 885 1.1 chs zfs_dbgmsg("destroying ds %llu; currently traversing; " 886 1.1 chs "reset bookmark to -1,0,0,0", 887 1.1 chs (u_longlong_t)ds->ds_object); 888 1.1 chs } 889 1.1 chs } else if (zap_lookup_int_key(dp->dp_meta_objset, 890 1.1 chs scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { 891 1.1 chs ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); 892 1.1 chs VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 893 1.1 chs scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); 894 1.1 chs if (ds->ds_is_snapshot) { 895 1.1 chs /* 896 1.1 chs * We keep the same mintxg; it could be > 897 1.1 chs * ds_creation_txg if the previous snapshot was 898 1.1 chs * deleted too. 899 1.1 chs */ 900 1.1 chs VERIFY(zap_add_int_key(dp->dp_meta_objset, 901 1.1 chs scn->scn_phys.scn_queue_obj, 902 1.1 chs dsl_dataset_phys(ds)->ds_next_snap_obj, 903 1.1 chs mintxg, tx) == 0); 904 1.1 chs zfs_dbgmsg("destroying ds %llu; in queue; " 905 1.1 chs "replacing with %llu", 906 1.1 chs (u_longlong_t)ds->ds_object, 907 1.1 chs (u_longlong_t)dsl_dataset_phys(ds)-> 908 1.1 chs ds_next_snap_obj); 909 1.1 chs } else { 910 1.1 chs zfs_dbgmsg("destroying ds %llu; in queue; removing", 911 1.1 chs (u_longlong_t)ds->ds_object); 912 1.1 chs } 913 1.1 chs } 914 1.1 chs 915 1.1 chs /* 916 1.1 chs * dsl_scan_sync() should be called after this, and should sync 917 1.1 chs * out our changed state, but just to be safe, do it here. 
918 1.1 chs */ 919 1.1 chs dsl_scan_sync_state(scn, tx); 920 1.1 chs } 921 1.1 chs 922 1.1 chs void 923 1.1 chs dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) 924 1.1 chs { 925 1.1 chs dsl_pool_t *dp = ds->ds_dir->dd_pool; 926 1.1 chs dsl_scan_t *scn = dp->dp_scan; 927 1.1 chs uint64_t mintxg; 928 1.1 chs 929 1.1 chs if (scn->scn_phys.scn_state != DSS_SCANNING) 930 1.1 chs return; 931 1.1 chs 932 1.1 chs ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); 933 1.1 chs 934 1.1 chs if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { 935 1.1 chs scn->scn_phys.scn_bookmark.zb_objset = 936 1.1 chs dsl_dataset_phys(ds)->ds_prev_snap_obj; 937 1.1 chs zfs_dbgmsg("snapshotting ds %llu; currently traversing; " 938 1.1 chs "reset zb_objset to %llu", 939 1.1 chs (u_longlong_t)ds->ds_object, 940 1.1 chs (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); 941 1.1 chs } else if (zap_lookup_int_key(dp->dp_meta_objset, 942 1.1 chs scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { 943 1.1 chs VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 944 1.1 chs scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); 945 1.1 chs VERIFY(zap_add_int_key(dp->dp_meta_objset, 946 1.1 chs scn->scn_phys.scn_queue_obj, 947 1.1 chs dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); 948 1.1 chs zfs_dbgmsg("snapshotting ds %llu; in queue; " 949 1.1 chs "replacing with %llu", 950 1.1 chs (u_longlong_t)ds->ds_object, 951 1.1 chs (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); 952 1.1 chs } 953 1.1 chs dsl_scan_sync_state(scn, tx); 954 1.1 chs } 955 1.1 chs 956 1.1 chs void 957 1.1 chs dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) 958 1.1 chs { 959 1.1 chs dsl_pool_t *dp = ds1->ds_dir->dd_pool; 960 1.1 chs dsl_scan_t *scn = dp->dp_scan; 961 1.1 chs uint64_t mintxg; 962 1.1 chs 963 1.1 chs if (scn->scn_phys.scn_state != DSS_SCANNING) 964 1.1 chs return; 965 1.1 chs 966 1.1 chs if (scn->scn_phys.scn_bookmark.zb_objset == 
ds1->ds_object) { 967 1.1 chs scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object; 968 1.1 chs zfs_dbgmsg("clone_swap ds %llu; currently traversing; " 969 1.1 chs "reset zb_objset to %llu", 970 1.1 chs (u_longlong_t)ds1->ds_object, 971 1.1 chs (u_longlong_t)ds2->ds_object); 972 1.1 chs } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) { 973 1.1 chs scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object; 974 1.1 chs zfs_dbgmsg("clone_swap ds %llu; currently traversing; " 975 1.1 chs "reset zb_objset to %llu", 976 1.1 chs (u_longlong_t)ds2->ds_object, 977 1.1 chs (u_longlong_t)ds1->ds_object); 978 1.1 chs } 979 1.1 chs 980 1.1 chs if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, 981 1.1 chs ds1->ds_object, &mintxg) == 0) { 982 1.1 chs int err; 983 1.1 chs 984 1.1 chs ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); 985 1.1 chs ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); 986 1.1 chs VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 987 1.1 chs scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); 988 1.1 chs err = zap_add_int_key(dp->dp_meta_objset, 989 1.1 chs scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx); 990 1.1 chs VERIFY(err == 0 || err == EEXIST); 991 1.1 chs if (err == EEXIST) { 992 1.1 chs /* Both were there to begin with */ 993 1.1 chs VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, 994 1.1 chs scn->scn_phys.scn_queue_obj, 995 1.1 chs ds1->ds_object, mintxg, tx)); 996 1.1 chs } 997 1.1 chs zfs_dbgmsg("clone_swap ds %llu; in queue; " 998 1.1 chs "replacing with %llu", 999 1.1 chs (u_longlong_t)ds1->ds_object, 1000 1.1 chs (u_longlong_t)ds2->ds_object); 1001 1.1 chs } else if (zap_lookup_int_key(dp->dp_meta_objset, 1002 1.1 chs scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) { 1003 1.1 chs ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); 1004 1.1 chs ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); 1005 1.1 chs VERIFY3U(0, ==, 
zap_remove_int(dp->dp_meta_objset, 1006 1.1 chs scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); 1007 1.1 chs VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, 1008 1.1 chs scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); 1009 1.1 chs zfs_dbgmsg("clone_swap ds %llu; in queue; " 1010 1.1 chs "replacing with %llu", 1011 1.1 chs (u_longlong_t)ds2->ds_object, 1012 1.1 chs (u_longlong_t)ds1->ds_object); 1013 1.1 chs } 1014 1.1 chs 1015 1.1 chs dsl_scan_sync_state(scn, tx); 1016 1.1 chs } 1017 1.1 chs 1018 1.1 chs struct enqueue_clones_arg { 1019 1.1 chs dmu_tx_t *tx; 1020 1.1 chs uint64_t originobj; 1021 1.1 chs }; 1022 1.1 chs 1023 1.1 chs /* ARGSUSED */ 1024 1.1 chs static int 1025 1.1 chs enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) 1026 1.1 chs { 1027 1.1 chs struct enqueue_clones_arg *eca = arg; 1028 1.1 chs dsl_dataset_t *ds; 1029 1.1 chs int err; 1030 1.1 chs dsl_scan_t *scn = dp->dp_scan; 1031 1.1 chs 1032 1.1 chs if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj) 1033 1.1 chs return (0); 1034 1.1 chs 1035 1.1 chs err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); 1036 1.1 chs if (err) 1037 1.1 chs return (err); 1038 1.1 chs 1039 1.1 chs while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) { 1040 1.1 chs dsl_dataset_t *prev; 1041 1.1 chs err = dsl_dataset_hold_obj(dp, 1042 1.1 chs dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); 1043 1.1 chs 1044 1.1 chs dsl_dataset_rele(ds, FTAG); 1045 1.1 chs if (err) 1046 1.1 chs return (err); 1047 1.1 chs ds = prev; 1048 1.1 chs } 1049 1.1 chs VERIFY(zap_add_int_key(dp->dp_meta_objset, 1050 1.1 chs scn->scn_phys.scn_queue_obj, ds->ds_object, 1051 1.1 chs dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0); 1052 1.1 chs dsl_dataset_rele(ds, FTAG); 1053 1.1 chs return (0); 1054 1.1 chs } 1055 1.1 chs 1056 1.1 chs static void 1057 1.1 chs dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) 1058 1.1 chs { 1059 1.1 chs dsl_pool_t *dp = scn->scn_dp; 
1060 1.1 chs dsl_dataset_t *ds; 1061 1.1 chs objset_t *os; 1062 1.1 chs 1063 1.1 chs VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 1064 1.1 chs 1065 1.1 chs if (scn->scn_phys.scn_cur_min_txg >= 1066 1.1 chs scn->scn_phys.scn_max_txg) { 1067 1.1 chs /* 1068 1.1 chs * This can happen if this snapshot was created after the 1069 1.1 chs * scan started, and we already completed a previous snapshot 1070 1.1 chs * that was created after the scan started. This snapshot 1071 1.1 chs * only references blocks with: 1072 1.1 chs * 1073 1.1 chs * birth < our ds_creation_txg 1074 1.1 chs * cur_min_txg is no less than ds_creation_txg. 1075 1.1 chs * We have already visited these blocks. 1076 1.1 chs * or 1077 1.1 chs * birth > scn_max_txg 1078 1.1 chs * The scan requested not to visit these blocks. 1079 1.1 chs * 1080 1.1 chs * Subsequent snapshots (and clones) can reference our 1081 1.1 chs * blocks, or blocks with even higher birth times. 1082 1.1 chs * Therefore we do not need to visit them either, 1083 1.1 chs * so we do not add them to the work queue. 1084 1.1 chs * 1085 1.1 chs * Note that checking for cur_min_txg >= cur_max_txg 1086 1.1 chs * is not sufficient, because in that case we may need to 1087 1.1 chs * visit subsequent snapshots. This happens when min_txg > 0, 1088 1.1 chs * which raises cur_min_txg. In this case we will visit 1089 1.1 chs * this dataset but skip all of its blocks, because the 1090 1.1 chs * rootbp's birth time is < cur_min_txg. Then we will 1091 1.1 chs * add the next snapshots/clones to the work queue. 
1092 1.1 chs */ 1093 1.1 chs char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP); 1094 1.1 chs dsl_dataset_name(ds, dsname); 1095 1.1 chs zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because " 1096 1.1 chs "cur_min_txg (%llu) >= max_txg (%llu)", 1097 1.1 chs dsobj, dsname, 1098 1.1 chs scn->scn_phys.scn_cur_min_txg, 1099 1.1 chs scn->scn_phys.scn_max_txg); 1100 1.1 chs kmem_free(dsname, MAXNAMELEN); 1101 1.1 chs 1102 1.1 chs goto out; 1103 1.1 chs } 1104 1.1 chs 1105 1.1 chs if (dmu_objset_from_ds(ds, &os)) 1106 1.1 chs goto out; 1107 1.1 chs 1108 1.1 chs /* 1109 1.1 chs * Only the ZIL in the head (non-snapshot) is valid. Even though 1110 1.1 chs * snapshots can have ZIL block pointers (which may be the same 1111 1.1 chs * BP as in the head), they must be ignored. So we traverse the 1112 1.1 chs * ZIL here, rather than in scan_recurse(), because the regular 1113 1.1 chs * snapshot block-sharing rules don't apply to it. 1114 1.1 chs */ 1115 1.1 chs if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot) 1116 1.1 chs dsl_scan_zil(dp, &os->os_zil_header); 1117 1.1 chs 1118 1.1 chs /* 1119 1.1 chs * Iterate over the bps in this ds. 
1120 1.1 chs */ 1121 1.1 chs dmu_buf_will_dirty(ds->ds_dbuf, tx); 1122 1.1 chs rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); 1123 1.1 chs dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx); 1124 1.1 chs rrw_exit(&ds->ds_bp_rwlock, FTAG); 1125 1.1 chs 1126 1.1 chs char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); 1127 1.1 chs dsl_dataset_name(ds, dsname); 1128 1.1 chs zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " 1129 1.1 chs "pausing=%u", 1130 1.1 chs (longlong_t)dsobj, dsname, 1131 1.1 chs (longlong_t)scn->scn_phys.scn_cur_min_txg, 1132 1.1 chs (longlong_t)scn->scn_phys.scn_cur_max_txg, 1133 1.1 chs (int)scn->scn_pausing); 1134 1.1 chs kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); 1135 1.1 chs 1136 1.1 chs if (scn->scn_pausing) 1137 1.1 chs goto out; 1138 1.1 chs 1139 1.1 chs /* 1140 1.1 chs * We've finished this pass over this dataset. 1141 1.1 chs */ 1142 1.1 chs 1143 1.1 chs /* 1144 1.1 chs * If we did not completely visit this dataset, do another pass. 1145 1.1 chs */ 1146 1.1 chs if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { 1147 1.1 chs zfs_dbgmsg("incomplete pass; visiting again"); 1148 1.1 chs scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; 1149 1.1 chs VERIFY(zap_add_int_key(dp->dp_meta_objset, 1150 1.1 chs scn->scn_phys.scn_queue_obj, ds->ds_object, 1151 1.1 chs scn->scn_phys.scn_cur_max_txg, tx) == 0); 1152 1.1 chs goto out; 1153 1.1 chs } 1154 1.1 chs 1155 1.1 chs /* 1156 1.1 chs * Add descendent datasets to work queue. 
1157 1.1 chs */ 1158 1.1 chs if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { 1159 1.1 chs VERIFY(zap_add_int_key(dp->dp_meta_objset, 1160 1.1 chs scn->scn_phys.scn_queue_obj, 1161 1.1 chs dsl_dataset_phys(ds)->ds_next_snap_obj, 1162 1.1 chs dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0); 1163 1.1 chs } 1164 1.1 chs if (dsl_dataset_phys(ds)->ds_num_children > 1) { 1165 1.1 chs boolean_t usenext = B_FALSE; 1166 1.1 chs if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { 1167 1.1 chs uint64_t count; 1168 1.1 chs /* 1169 1.1 chs * A bug in a previous version of the code could 1170 1.1 chs * cause upgrade_clones_cb() to not set 1171 1.1 chs * ds_next_snap_obj when it should, leading to a 1172 1.1 chs * missing entry. Therefore we can only use the 1173 1.1 chs * next_clones_obj when its count is correct. 1174 1.1 chs */ 1175 1.1 chs int err = zap_count(dp->dp_meta_objset, 1176 1.1 chs dsl_dataset_phys(ds)->ds_next_clones_obj, &count); 1177 1.1 chs if (err == 0 && 1178 1.1 chs count == dsl_dataset_phys(ds)->ds_num_children - 1) 1179 1.1 chs usenext = B_TRUE; 1180 1.1 chs } 1181 1.1 chs 1182 1.1 chs if (usenext) { 1183 1.1 chs VERIFY0(zap_join_key(dp->dp_meta_objset, 1184 1.1 chs dsl_dataset_phys(ds)->ds_next_clones_obj, 1185 1.1 chs scn->scn_phys.scn_queue_obj, 1186 1.1 chs dsl_dataset_phys(ds)->ds_creation_txg, tx)); 1187 1.1 chs } else { 1188 1.1 chs struct enqueue_clones_arg eca; 1189 1.1 chs eca.tx = tx; 1190 1.1 chs eca.originobj = ds->ds_object; 1191 1.1 chs 1192 1.1 chs VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 1193 1.1 chs enqueue_clones_cb, &eca, DS_FIND_CHILDREN)); 1194 1.1 chs } 1195 1.1 chs } 1196 1.1 chs 1197 1.1 chs out: 1198 1.1 chs dsl_dataset_rele(ds, FTAG); 1199 1.1 chs } 1200 1.1 chs 1201 1.1 chs /* ARGSUSED */ 1202 1.1 chs static int 1203 1.1 chs enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) 1204 1.1 chs { 1205 1.1 chs dmu_tx_t *tx = arg; 1206 1.1 chs dsl_dataset_t *ds; 1207 1.1 chs int err; 1208 1.1 chs dsl_scan_t *scn = 
dp->dp_scan; 1209 1.1 chs 1210 1.1 chs err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); 1211 1.1 chs if (err) 1212 1.1 chs return (err); 1213 1.1 chs 1214 1.1 chs while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { 1215 1.1 chs dsl_dataset_t *prev; 1216 1.1 chs err = dsl_dataset_hold_obj(dp, 1217 1.1 chs dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); 1218 1.1 chs if (err) { 1219 1.1 chs dsl_dataset_rele(ds, FTAG); 1220 1.1 chs return (err); 1221 1.1 chs } 1222 1.1 chs 1223 1.1 chs /* 1224 1.1 chs * If this is a clone, we don't need to worry about it for now. 1225 1.1 chs */ 1226 1.1 chs if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { 1227 1.1 chs dsl_dataset_rele(ds, FTAG); 1228 1.1 chs dsl_dataset_rele(prev, FTAG); 1229 1.1 chs return (0); 1230 1.1 chs } 1231 1.1 chs dsl_dataset_rele(ds, FTAG); 1232 1.1 chs ds = prev; 1233 1.1 chs } 1234 1.1 chs 1235 1.1 chs VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, 1236 1.1 chs ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0); 1237 1.1 chs dsl_dataset_rele(ds, FTAG); 1238 1.1 chs return (0); 1239 1.1 chs } 1240 1.1 chs 1241 1.1 chs /* 1242 1.1 chs * Scrub/dedup interaction. 1243 1.1 chs * 1244 1.1 chs * If there are N references to a deduped block, we don't want to scrub it 1245 1.1 chs * N times -- ideally, we should scrub it exactly once. 1246 1.1 chs * 1247 1.1 chs * We leverage the fact that the dde's replication class (enum ddt_class) 1248 1.1 chs * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest 1249 1.1 chs * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. 1250 1.1 chs * 1251 1.1 chs * To prevent excess scrubbing, the scrub begins by walking the DDT 1252 1.1 chs * to find all blocks with refcnt > 1, and scrubs each of these once. 1253 1.1 chs * Since there are two replication classes which contain blocks with 1254 1.1 chs * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. 
 * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
 *
 * There would be nothing more to say if a block's refcnt couldn't change
 * during a scrub, but of course it can, so we must account for changes
 * in a block's replication class.
 *
 * Here's an example of what can occur:
 *
 * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
 * when visited during the top-down scrub phase, it will be scrubbed twice.
 * This negates our scrub optimization, but is otherwise harmless.
 *
 * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
 * on each visit during the top-down scrub phase, it will never be scrubbed.
 * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
 * reference class transitions to a higher level (e.g. DDT_CLASS_UNIQUE to
 * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
 * while a scrub is in progress, it scrubs the block right then.
1273 1.1 chs */ 1274 1.1 chs static void 1275 1.1 chs dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) 1276 1.1 chs { 1277 1.1 chs ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; 1278 1.1 chs ddt_entry_t dde = { 0 }; 1279 1.1 chs int error; 1280 1.1 chs uint64_t n = 0; 1281 1.1 chs 1282 1.1 chs while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { 1283 1.1 chs ddt_t *ddt; 1284 1.1 chs 1285 1.1 chs if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) 1286 1.1 chs break; 1287 1.1 chs dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", 1288 1.1 chs (longlong_t)ddb->ddb_class, 1289 1.1 chs (longlong_t)ddb->ddb_type, 1290 1.1 chs (longlong_t)ddb->ddb_checksum, 1291 1.1 chs (longlong_t)ddb->ddb_cursor); 1292 1.1 chs 1293 1.1 chs /* There should be no pending changes to the dedup table */ 1294 1.1 chs ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; 1295 1.1 chs ASSERT(avl_first(&ddt->ddt_tree) == NULL); 1296 1.1 chs 1297 1.1 chs dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); 1298 1.1 chs n++; 1299 1.1 chs 1300 1.1 chs if (dsl_scan_check_pause(scn, NULL)) 1301 1.1 chs break; 1302 1.1 chs } 1303 1.1 chs 1304 1.1 chs zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u", 1305 1.1 chs (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max, 1306 1.1 chs (int)scn->scn_pausing); 1307 1.1 chs 1308 1.1 chs ASSERT(error == 0 || error == ENOENT); 1309 1.1 chs ASSERT(error != ENOENT || 1310 1.1 chs ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); 1311 1.1 chs } 1312 1.1 chs 1313 1.1 chs /* ARGSUSED */ 1314 1.1 chs void 1315 1.1 chs dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, 1316 1.1 chs ddt_entry_t *dde, dmu_tx_t *tx) 1317 1.1 chs { 1318 1.1 chs const ddt_key_t *ddk = &dde->dde_key; 1319 1.1 chs ddt_phys_t *ddp = dde->dde_phys; 1320 1.1 chs blkptr_t bp; 1321 1.1 chs zbookmark_phys_t zb = { 0 }; 1322 1.1 chs 1323 1.1 chs if (scn->scn_phys.scn_state != DSS_SCANNING) 1324 1.1 chs return; 1325 1.1 chs 1326 1.1 chs for (int p = 0; 
p < DDT_PHYS_TYPES; p++, ddp++) { 1327 1.1 chs if (ddp->ddp_phys_birth == 0 || 1328 1.1 chs ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) 1329 1.1 chs continue; 1330 1.1 chs ddt_bp_create(checksum, ddk, ddp, &bp); 1331 1.1 chs 1332 1.1 chs scn->scn_visited_this_txg++; 1333 1.1 chs scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); 1334 1.1 chs } 1335 1.1 chs } 1336 1.1 chs 1337 1.1 chs static void 1338 1.1 chs dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) 1339 1.1 chs { 1340 1.1 chs dsl_pool_t *dp = scn->scn_dp; 1341 1.1 chs zap_cursor_t zc; 1342 1.1 chs zap_attribute_t za; 1343 1.1 chs 1344 1.1 chs if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= 1345 1.1 chs scn->scn_phys.scn_ddt_class_max) { 1346 1.1 chs scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; 1347 1.1 chs scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; 1348 1.1 chs dsl_scan_ddt(scn, tx); 1349 1.1 chs if (scn->scn_pausing) 1350 1.1 chs return; 1351 1.1 chs } 1352 1.1 chs 1353 1.1 chs if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { 1354 1.1 chs /* First do the MOS & ORIGIN */ 1355 1.1 chs 1356 1.1 chs scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; 1357 1.1 chs scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; 1358 1.1 chs dsl_scan_visit_rootbp(scn, NULL, 1359 1.1 chs &dp->dp_meta_rootbp, tx); 1360 1.1 chs spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); 1361 1.1 chs if (scn->scn_pausing) 1362 1.1 chs return; 1363 1.1 chs 1364 1.1 chs if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { 1365 1.1 chs VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 1366 1.1 chs enqueue_cb, tx, DS_FIND_CHILDREN)); 1367 1.1 chs } else { 1368 1.1 chs dsl_scan_visitds(scn, 1369 1.1 chs dp->dp_origin_snap->ds_object, tx); 1370 1.1 chs } 1371 1.1 chs ASSERT(!scn->scn_pausing); 1372 1.1 chs } else if (scn->scn_phys.scn_bookmark.zb_objset != 1373 1.1 chs ZB_DESTROYED_OBJSET) { 1374 1.1 chs /* 1375 1.1 chs * If we were paused, continue from here. 
Note if the 1376 1.1 chs * ds we were paused on was deleted, the zb_objset may 1377 1.1 chs * be -1, so we will skip this and find a new objset 1378 1.1 chs * below. 1379 1.1 chs */ 1380 1.1 chs dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx); 1381 1.1 chs if (scn->scn_pausing) 1382 1.1 chs return; 1383 1.1 chs } 1384 1.1 chs 1385 1.1 chs /* 1386 1.1 chs * In case we were paused right at the end of the ds, zero the 1387 1.1 chs * bookmark so we don't think that we're still trying to resume. 1388 1.1 chs */ 1389 1.1 chs bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); 1390 1.1 chs 1391 1.1 chs /* keep pulling things out of the zap-object-as-queue */ 1392 1.1 chs while (zap_cursor_init(&zc, dp->dp_meta_objset, 1393 1.1 chs scn->scn_phys.scn_queue_obj), 1394 1.1 chs zap_cursor_retrieve(&zc, &za) == 0) { 1395 1.1 chs dsl_dataset_t *ds; 1396 1.1 chs uint64_t dsobj; 1397 1.1 chs 1398 1.1 chs dsobj = strtonum(za.za_name, NULL); 1399 1.1 chs VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 1400 1.1 chs scn->scn_phys.scn_queue_obj, dsobj, tx)); 1401 1.1 chs 1402 1.1 chs /* Set up min/max txg */ 1403 1.1 chs VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 1404 1.1 chs if (za.za_first_integer != 0) { 1405 1.1 chs scn->scn_phys.scn_cur_min_txg = 1406 1.1 chs MAX(scn->scn_phys.scn_min_txg, 1407 1.1 chs za.za_first_integer); 1408 1.1 chs } else { 1409 1.1 chs scn->scn_phys.scn_cur_min_txg = 1410 1.1 chs MAX(scn->scn_phys.scn_min_txg, 1411 1.1 chs dsl_dataset_phys(ds)->ds_prev_snap_txg); 1412 1.1 chs } 1413 1.1 chs scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); 1414 1.1 chs dsl_dataset_rele(ds, FTAG); 1415 1.1 chs 1416 1.1 chs dsl_scan_visitds(scn, dsobj, tx); 1417 1.1 chs zap_cursor_fini(&zc); 1418 1.1 chs if (scn->scn_pausing) 1419 1.1 chs return; 1420 1.1 chs } 1421 1.1 chs zap_cursor_fini(&zc); 1422 1.1 chs } 1423 1.1 chs 1424 1.1 chs static boolean_t 1425 1.1 chs dsl_scan_free_should_pause(dsl_scan_t *scn) 1426 1.1 chs { 
1427 1.1 chs uint64_t elapsed_nanosecs; 1428 1.1 chs 1429 1.1 chs if (zfs_recover) 1430 1.1 chs return (B_FALSE); 1431 1.1 chs 1432 1.1 chs if (scn->scn_visited_this_txg >= zfs_free_max_blocks) 1433 1.1 chs return (B_TRUE); 1434 1.1 chs 1435 1.1 chs elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; 1436 1.1 chs return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || 1437 1.1 chs (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms && 1438 1.1 chs txg_sync_waiting(scn->scn_dp)) || 1439 1.1 chs spa_shutting_down(scn->scn_dp->dp_spa)); 1440 1.1 chs } 1441 1.1 chs 1442 1.1 chs static int 1443 1.1 chs dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 1444 1.1 chs { 1445 1.1 chs dsl_scan_t *scn = arg; 1446 1.1 chs 1447 1.1 chs if (!scn->scn_is_bptree || 1448 1.1 chs (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { 1449 1.1 chs if (dsl_scan_free_should_pause(scn)) 1450 1.1 chs return (SET_ERROR(ERESTART)); 1451 1.1 chs } 1452 1.1 chs 1453 1.1 chs zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, 1454 1.1 chs dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0)); 1455 1.1 chs dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 1456 1.1 chs -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), 1457 1.1 chs -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 1458 1.1 chs scn->scn_visited_this_txg++; 1459 1.1 chs return (0); 1460 1.1 chs } 1461 1.1 chs 1462 1.1 chs boolean_t 1463 1.1 chs dsl_scan_active(dsl_scan_t *scn) 1464 1.1 chs { 1465 1.1 chs spa_t *spa = scn->scn_dp->dp_spa; 1466 1.1 chs uint64_t used = 0, comp, uncomp; 1467 1.1 chs 1468 1.1 chs if (spa->spa_load_state != SPA_LOAD_NONE) 1469 1.1 chs return (B_FALSE); 1470 1.1 chs if (spa_shutting_down(spa)) 1471 1.1 chs return (B_FALSE); 1472 1.1 chs if (scn->scn_phys.scn_state == DSS_SCANNING || 1473 1.1 chs (scn->scn_async_destroying && !scn->scn_async_stalled)) 1474 1.1 chs return (B_TRUE); 1475 1.1 chs 1476 1.1 chs if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) 
{ 1477 1.1 chs (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, 1478 1.1 chs &used, &comp, &uncomp); 1479 1.1 chs } 1480 1.1 chs return (used != 0); 1481 1.1 chs } 1482 1.1 chs 1483 1.1 chs void 1484 1.1 chs dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) 1485 1.1 chs { 1486 1.1 chs dsl_scan_t *scn = dp->dp_scan; 1487 1.1 chs spa_t *spa = dp->dp_spa; 1488 1.1 chs int err = 0; 1489 1.1 chs 1490 1.1 chs /* 1491 1.1 chs * Check for scn_restart_txg before checking spa_load_state, so 1492 1.1 chs * that we can restart an old-style scan while the pool is being 1493 1.1 chs * imported (see dsl_scan_init). 1494 1.1 chs */ 1495 1.1 chs if (dsl_scan_restarting(scn, tx)) { 1496 1.1 chs pool_scan_func_t func = POOL_SCAN_SCRUB; 1497 1.1 chs dsl_scan_done(scn, B_FALSE, tx); 1498 1.1 chs if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) 1499 1.1 chs func = POOL_SCAN_RESILVER; 1500 1.1 chs zfs_dbgmsg("restarting scan func=%u txg=%llu", 1501 1.1 chs func, tx->tx_txg); 1502 1.1 chs dsl_scan_setup_sync(&func, tx); 1503 1.1 chs } 1504 1.1 chs 1505 1.1 chs /* 1506 1.1 chs * Only process scans in sync pass 1. 1507 1.1 chs */ 1508 1.1 chs if (spa_sync_pass(dp->dp_spa) > 1) 1509 1.1 chs return; 1510 1.1 chs 1511 1.1 chs /* 1512 1.1 chs * If the spa is shutting down, then stop scanning. This will 1513 1.1 chs * ensure that the scan does not dirty any new data during the 1514 1.1 chs * shutdown phase. 1515 1.1 chs */ 1516 1.1 chs if (spa_shutting_down(spa)) 1517 1.1 chs return; 1518 1.1 chs 1519 1.1 chs /* 1520 1.1 chs * If the scan is inactive due to a stalled async destroy, try again. 1521 1.1 chs */ 1522 1.1 chs if (!scn->scn_async_stalled && !dsl_scan_active(scn)) 1523 1.1 chs return; 1524 1.1 chs 1525 1.1 chs scn->scn_visited_this_txg = 0; 1526 1.1 chs scn->scn_pausing = B_FALSE; 1527 1.1 chs scn->scn_sync_start_time = gethrtime(); 1528 1.1 chs spa->spa_scrub_active = B_TRUE; 1529 1.1 chs 1530 1.1 chs /* 1531 1.1 chs * First process the async destroys. 
If we pause, don't do 1532 1.1 chs * any scrubbing or resilvering. This ensures that there are no 1533 1.1 chs * async destroys while we are scanning, so the scan code doesn't 1534 1.1 chs * have to worry about traversing it. It is also faster to free the 1535 1.1 chs * blocks than to scrub them. 1536 1.1 chs */ 1537 1.1 chs if (zfs_free_bpobj_enabled && 1538 1.1 chs spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { 1539 1.1 chs scn->scn_is_bptree = B_FALSE; 1540 1.1 chs scn->scn_zio_root = zio_root(dp->dp_spa, NULL, 1541 1.1 chs NULL, ZIO_FLAG_MUSTSUCCEED); 1542 1.1 chs err = bpobj_iterate(&dp->dp_free_bpobj, 1543 1.1 chs dsl_scan_free_block_cb, scn, tx); 1544 1.1 chs VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); 1545 1.1 chs 1546 1.1 chs if (err != 0 && err != ERESTART) 1547 1.1 chs zfs_panic_recover("error %u from bpobj_iterate()", err); 1548 1.1 chs } 1549 1.1 chs 1550 1.1 chs if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { 1551 1.1 chs ASSERT(scn->scn_async_destroying); 1552 1.1 chs scn->scn_is_bptree = B_TRUE; 1553 1.1 chs scn->scn_zio_root = zio_root(dp->dp_spa, NULL, 1554 1.1 chs NULL, ZIO_FLAG_MUSTSUCCEED); 1555 1.1 chs err = bptree_iterate(dp->dp_meta_objset, 1556 1.1 chs dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); 1557 1.1 chs VERIFY0(zio_wait(scn->scn_zio_root)); 1558 1.1 chs 1559 1.1 chs if (err == EIO || err == ECKSUM) { 1560 1.1 chs err = 0; 1561 1.1 chs } else if (err != 0 && err != ERESTART) { 1562 1.1 chs zfs_panic_recover("error %u from " 1563 1.1 chs "traverse_dataset_destroyed()", err); 1564 1.1 chs } 1565 1.1 chs 1566 1.1 chs if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { 1567 1.1 chs /* finished; deactivate async destroy feature */ 1568 1.1 chs spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); 1569 1.1 chs ASSERT(!spa_feature_is_active(spa, 1570 1.1 chs SPA_FEATURE_ASYNC_DESTROY)); 1571 1.1 chs VERIFY0(zap_remove(dp->dp_meta_objset, 1572 1.1 chs DMU_POOL_DIRECTORY_OBJECT, 1573 1.1 
chs DMU_POOL_BPTREE_OBJ, tx)); 1574 1.1 chs VERIFY0(bptree_free(dp->dp_meta_objset, 1575 1.1 chs dp->dp_bptree_obj, tx)); 1576 1.1 chs dp->dp_bptree_obj = 0; 1577 1.1 chs scn->scn_async_destroying = B_FALSE; 1578 1.1 chs scn->scn_async_stalled = B_FALSE; 1579 1.1 chs } else { 1580 1.1 chs /* 1581 1.1 chs * If we didn't make progress, mark the async 1582 1.1 chs * destroy as stalled, so that we will not initiate 1583 1.1 chs * a spa_sync() on its behalf. Note that we only 1584 1.1 chs * check this if we are not finished, because if the 1585 1.1 chs * bptree had no blocks for us to visit, we can 1586 1.1 chs * finish without "making progress". 1587 1.1 chs */ 1588 1.1 chs scn->scn_async_stalled = 1589 1.1 chs (scn->scn_visited_this_txg == 0); 1590 1.1 chs } 1591 1.1 chs } 1592 1.1 chs if (scn->scn_visited_this_txg) { 1593 1.1 chs zfs_dbgmsg("freed %llu blocks in %llums from " 1594 1.1 chs "free_bpobj/bptree txg %llu; err=%d", 1595 1.1 chs (longlong_t)scn->scn_visited_this_txg, 1596 1.1 chs (longlong_t) 1597 1.1 chs NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), 1598 1.1 chs (longlong_t)tx->tx_txg, err); 1599 1.1 chs scn->scn_visited_this_txg = 0; 1600 1.1 chs 1601 1.1 chs /* 1602 1.1 chs * Write out changes to the DDT that may be required as a 1603 1.1 chs * result of the blocks freed. This ensures that the DDT 1604 1.1 chs * is clean when a scrub/resilver runs. 1605 1.1 chs */ 1606 1.1 chs ddt_sync(spa, tx->tx_txg); 1607 1.1 chs } 1608 1.1 chs if (err != 0) 1609 1.1 chs return; 1610 1.1 chs if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && 1611 1.1 chs zfs_free_leak_on_eio && 1612 1.1 chs (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || 1613 1.1 chs dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 || 1614 1.1 chs dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) { 1615 1.1 chs /* 1616 1.1 chs * We have finished background destroying, but there is still 1617 1.1 chs * some space left in the dp_free_dir. 
Transfer this leaked 1618 1.1 chs * space to the dp_leak_dir. 1619 1.1 chs */ 1620 1.1 chs if (dp->dp_leak_dir == NULL) { 1621 1.1 chs rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 1622 1.1 chs (void) dsl_dir_create_sync(dp, dp->dp_root_dir, 1623 1.1 chs LEAK_DIR_NAME, tx); 1624 1.1 chs VERIFY0(dsl_pool_open_special_dir(dp, 1625 1.1 chs LEAK_DIR_NAME, &dp->dp_leak_dir)); 1626 1.1 chs rrw_exit(&dp->dp_config_rwlock, FTAG); 1627 1.1 chs } 1628 1.1 chs dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, 1629 1.1 chs dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, 1630 1.1 chs dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, 1631 1.1 chs dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); 1632 1.1 chs dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, 1633 1.1 chs -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, 1634 1.1 chs -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, 1635 1.1 chs -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); 1636 1.1 chs } 1637 1.1 chs if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) { 1638 1.1 chs /* finished; verify that space accounting went to zero */ 1639 1.1 chs ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); 1640 1.1 chs ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); 1641 1.1 chs ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); 1642 1.1 chs } 1643 1.1 chs 1644 1.1 chs if (scn->scn_phys.scn_state != DSS_SCANNING) 1645 1.1 chs return; 1646 1.1 chs 1647 1.1 chs if (scn->scn_done_txg == tx->tx_txg) { 1648 1.1 chs ASSERT(!scn->scn_pausing); 1649 1.1 chs /* finished with scan. 
*/ 1650 1.1 chs zfs_dbgmsg("txg %llu scan complete", tx->tx_txg); 1651 1.1 chs dsl_scan_done(scn, B_TRUE, tx); 1652 1.1 chs ASSERT3U(spa->spa_scrub_inflight, ==, 0); 1653 1.1 chs dsl_scan_sync_state(scn, tx); 1654 1.1 chs return; 1655 1.1 chs } 1656 1.1 chs 1657 1.1 chs if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= 1658 1.1 chs scn->scn_phys.scn_ddt_class_max) { 1659 1.1 chs zfs_dbgmsg("doing scan sync txg %llu; " 1660 1.1 chs "ddt bm=%llu/%llu/%llu/%llx", 1661 1.1 chs (longlong_t)tx->tx_txg, 1662 1.1 chs (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, 1663 1.1 chs (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, 1664 1.1 chs (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, 1665 1.1 chs (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); 1666 1.1 chs ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0); 1667 1.1 chs ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0); 1668 1.1 chs ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0); 1669 1.1 chs ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0); 1670 1.1 chs } else { 1671 1.1 chs zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu", 1672 1.1 chs (longlong_t)tx->tx_txg, 1673 1.1 chs (longlong_t)scn->scn_phys.scn_bookmark.zb_objset, 1674 1.1 chs (longlong_t)scn->scn_phys.scn_bookmark.zb_object, 1675 1.1 chs (longlong_t)scn->scn_phys.scn_bookmark.zb_level, 1676 1.1 chs (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid); 1677 1.1 chs } 1678 1.1 chs 1679 1.1 chs scn->scn_zio_root = zio_root(dp->dp_spa, NULL, 1680 1.1 chs NULL, ZIO_FLAG_CANFAIL); 1681 1.1 chs dsl_pool_config_enter(dp, FTAG); 1682 1.1 chs dsl_scan_visit(scn, tx); 1683 1.1 chs dsl_pool_config_exit(dp, FTAG); 1684 1.1 chs (void) zio_wait(scn->scn_zio_root); 1685 1.1 chs scn->scn_zio_root = NULL; 1686 1.1 chs 1687 1.1 chs zfs_dbgmsg("visited %llu blocks in %llums", 1688 1.1 chs (longlong_t)scn->scn_visited_this_txg, 1689 1.1 chs (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time)); 1690 1.1 chs 1691 1.1 chs if 
(!scn->scn_pausing) { 1692 1.1 chs scn->scn_done_txg = tx->tx_txg + 1; 1693 1.1 chs zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu", 1694 1.1 chs tx->tx_txg, scn->scn_done_txg); 1695 1.1 chs } 1696 1.1 chs 1697 1.1 chs if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { 1698 1.1 chs mutex_enter(&spa->spa_scrub_lock); 1699 1.1 chs while (spa->spa_scrub_inflight > 0) { 1700 1.1 chs cv_wait(&spa->spa_scrub_io_cv, 1701 1.1 chs &spa->spa_scrub_lock); 1702 1.1 chs } 1703 1.1 chs mutex_exit(&spa->spa_scrub_lock); 1704 1.1 chs } 1705 1.1 chs 1706 1.1 chs dsl_scan_sync_state(scn, tx); 1707 1.1 chs } 1708 1.1 chs 1709 1.1 chs /* 1710 1.1 chs * This will start a new scan, or restart an existing one. 1711 1.1 chs */ 1712 1.1 chs void 1713 1.1 chs dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) 1714 1.1 chs { 1715 1.1 chs if (txg == 0) { 1716 1.1 chs dmu_tx_t *tx; 1717 1.1 chs tx = dmu_tx_create_dd(dp->dp_mos_dir); 1718 1.1 chs VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); 1719 1.1 chs 1720 1.1 chs txg = dmu_tx_get_txg(tx); 1721 1.1 chs dp->dp_scan->scn_restart_txg = txg; 1722 1.1 chs dmu_tx_commit(tx); 1723 1.1 chs } else { 1724 1.1 chs dp->dp_scan->scn_restart_txg = txg; 1725 1.1 chs } 1726 1.1 chs zfs_dbgmsg("restarting resilver txg=%llu", txg); 1727 1.1 chs } 1728 1.1 chs 1729 1.1 chs boolean_t 1730 1.1 chs dsl_scan_resilvering(dsl_pool_t *dp) 1731 1.1 chs { 1732 1.1 chs return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING && 1733 1.1 chs dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); 1734 1.1 chs } 1735 1.1 chs 1736 1.1 chs /* 1737 1.1 chs * scrub consumers 1738 1.1 chs */ 1739 1.1 chs 1740 1.1 chs static void 1741 1.1 chs count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) 1742 1.1 chs { 1743 1.1 chs int i; 1744 1.1 chs 1745 1.1 chs /* 1746 1.1 chs * If we resume after a reboot, zab will be NULL; don't record 1747 1.1 chs * incomplete stats in that case. 
1748 1.1 chs */ 1749 1.1 chs if (zab == NULL) 1750 1.1 chs return; 1751 1.1 chs 1752 1.1 chs for (i = 0; i < 4; i++) { 1753 1.1 chs int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; 1754 1.1 chs int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; 1755 1.1 chs if (t & DMU_OT_NEWTYPE) 1756 1.1 chs t = DMU_OT_OTHER; 1757 1.1 chs zfs_blkstat_t *zb = &zab->zab_type[l][t]; 1758 1.1 chs int equal; 1759 1.1 chs 1760 1.1 chs zb->zb_count++; 1761 1.1 chs zb->zb_asize += BP_GET_ASIZE(bp); 1762 1.1 chs zb->zb_lsize += BP_GET_LSIZE(bp); 1763 1.1 chs zb->zb_psize += BP_GET_PSIZE(bp); 1764 1.1 chs zb->zb_gangs += BP_COUNT_GANG(bp); 1765 1.1 chs 1766 1.1 chs switch (BP_GET_NDVAS(bp)) { 1767 1.1 chs case 2: 1768 1.1 chs if (DVA_GET_VDEV(&bp->blk_dva[0]) == 1769 1.1 chs DVA_GET_VDEV(&bp->blk_dva[1])) 1770 1.1 chs zb->zb_ditto_2_of_2_samevdev++; 1771 1.1 chs break; 1772 1.1 chs case 3: 1773 1.1 chs equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == 1774 1.1 chs DVA_GET_VDEV(&bp->blk_dva[1])) + 1775 1.1 chs (DVA_GET_VDEV(&bp->blk_dva[0]) == 1776 1.1 chs DVA_GET_VDEV(&bp->blk_dva[2])) + 1777 1.1 chs (DVA_GET_VDEV(&bp->blk_dva[1]) == 1778 1.1 chs DVA_GET_VDEV(&bp->blk_dva[2])); 1779 1.1 chs if (equal == 1) 1780 1.1 chs zb->zb_ditto_2_of_3_samevdev++; 1781 1.1 chs else if (equal == 3) 1782 1.1 chs zb->zb_ditto_3_of_3_samevdev++; 1783 1.1 chs break; 1784 1.1 chs } 1785 1.1 chs } 1786 1.1 chs } 1787 1.1 chs 1788 1.1 chs static void 1789 1.1 chs dsl_scan_scrub_done(zio_t *zio) 1790 1.1 chs { 1791 1.1 chs spa_t *spa = zio->io_spa; 1792 1.1 chs 1793 1.1 chs zio_data_buf_free(zio->io_data, zio->io_size); 1794 1.1 chs 1795 1.1 chs mutex_enter(&spa->spa_scrub_lock); 1796 1.1 chs spa->spa_scrub_inflight--; 1797 1.1 chs cv_broadcast(&spa->spa_scrub_io_cv); 1798 1.1 chs 1799 1.1 chs if (zio->io_error && (zio->io_error != ECKSUM || 1800 1.1 chs !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { 1801 1.1 chs spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; 1802 1.1 chs } 1803 1.1 chs 
mutex_exit(&spa->spa_scrub_lock); 1804 1.1 chs } 1805 1.1 chs 1806 1.1 chs static int 1807 1.1 chs dsl_scan_scrub_cb(dsl_pool_t *dp, 1808 1.1 chs const blkptr_t *bp, const zbookmark_phys_t *zb) 1809 1.1 chs { 1810 1.1 chs dsl_scan_t *scn = dp->dp_scan; 1811 1.1 chs size_t size = BP_GET_PSIZE(bp); 1812 1.1 chs spa_t *spa = dp->dp_spa; 1813 1.1 chs uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); 1814 1.1 chs boolean_t needs_io; 1815 1.1 chs int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; 1816 1.1 chs unsigned int scan_delay = 0; 1817 1.1 chs 1818 1.1 chs if (phys_birth <= scn->scn_phys.scn_min_txg || 1819 1.1 chs phys_birth >= scn->scn_phys.scn_max_txg) 1820 1.1 chs return (0); 1821 1.1 chs 1822 1.1 chs count_block(dp->dp_blkstats, bp); 1823 1.1 chs 1824 1.1 chs if (BP_IS_EMBEDDED(bp)) 1825 1.1 chs return (0); 1826 1.1 chs 1827 1.1 chs ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); 1828 1.1 chs if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { 1829 1.1 chs zio_flags |= ZIO_FLAG_SCRUB; 1830 1.1 chs needs_io = B_TRUE; 1831 1.1 chs scan_delay = zfs_scrub_delay; 1832 1.1 chs } else { 1833 1.1 chs ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); 1834 1.1 chs zio_flags |= ZIO_FLAG_RESILVER; 1835 1.1 chs needs_io = B_FALSE; 1836 1.1 chs scan_delay = zfs_resilver_delay; 1837 1.1 chs } 1838 1.1 chs 1839 1.1 chs /* If it's an intent log block, failure is expected. */ 1840 1.1 chs if (zb->zb_level == ZB_ZIL_LEVEL) 1841 1.1 chs zio_flags |= ZIO_FLAG_SPECULATIVE; 1842 1.1 chs 1843 1.1 chs for (int d = 0; d < BP_GET_NDVAS(bp); d++) { 1844 1.1 chs vdev_t *vd = vdev_lookup_top(spa, 1845 1.1 chs DVA_GET_VDEV(&bp->blk_dva[d])); 1846 1.1 chs 1847 1.1 chs /* 1848 1.1 chs * Keep track of how much data we've examined so that 1849 1.1 chs * zpool(1M) status can make useful progress reports. 
1850 1.1 chs */ 1851 1.1 chs scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); 1852 1.1 chs spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); 1853 1.1 chs 1854 1.1 chs /* if it's a resilver, this may not be in the target range */ 1855 1.1 chs if (!needs_io) { 1856 1.1 chs if (DVA_GET_GANG(&bp->blk_dva[d])) { 1857 1.1 chs /* 1858 1.1 chs * Gang members may be spread across multiple 1859 1.1 chs * vdevs, so the best estimate we have is the 1860 1.1 chs * scrub range, which has already been checked. 1861 1.1 chs * XXX -- it would be better to change our 1862 1.1 chs * allocation policy to ensure that all 1863 1.1 chs * gang members reside on the same vdev. 1864 1.1 chs */ 1865 1.1 chs needs_io = B_TRUE; 1866 1.1 chs } else { 1867 1.1 chs needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, 1868 1.1 chs phys_birth, 1); 1869 1.1 chs } 1870 1.1 chs } 1871 1.1 chs } 1872 1.1 chs 1873 1.1 chs if (needs_io && !zfs_no_scrub_io) { 1874 1.1 chs vdev_t *rvd = spa->spa_root_vdev; 1875 1.1 chs uint64_t maxinflight = rvd->vdev_children * 1876 1.1 chs MAX(zfs_top_maxinflight, 1); 1877 1.1 chs void *data = zio_data_buf_alloc(size); 1878 1.1 chs 1879 1.1 chs mutex_enter(&spa->spa_scrub_lock); 1880 1.1 chs while (spa->spa_scrub_inflight >= maxinflight) 1881 1.1 chs cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1882 1.1 chs spa->spa_scrub_inflight++; 1883 1.1 chs mutex_exit(&spa->spa_scrub_lock); 1884 1.1 chs 1885 1.1 chs /* 1886 1.1 chs * If we're seeing recent (zfs_scan_idle) "important" I/Os 1887 1.1 chs * then throttle our workload to limit the impact of a scan. 
1888 1.1 chs */ 1889 1.1 chs if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) 1890 1.1 chs delay(MAX((int)scan_delay, 0)); 1891 1.1 chs 1892 1.1 chs zio_nowait(zio_read(NULL, spa, bp, data, size, 1893 1.1 chs dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB, 1894 1.1 chs zio_flags, zb)); 1895 1.1 chs } 1896 1.1 chs 1897 1.1 chs /* do not relocate this block */ 1898 1.1 chs return (0); 1899 1.1 chs } 1900 1.1 chs 1901 1.1 chs int 1902 1.1 chs dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) 1903 1.1 chs { 1904 1.1 chs spa_t *spa = dp->dp_spa; 1905 1.1 chs 1906 1.1 chs /* 1907 1.1 chs * Purge all vdev caches and probe all devices. We do this here 1908 1.1 chs * rather than in sync context because this requires a writer lock 1909 1.1 chs * on the spa_config lock, which we can't do from sync context. The 1910 1.1 chs * spa_scrub_reopen flag indicates that vdev_open() should not 1911 1.1 chs * attempt to start another scrub. 1912 1.1 chs */ 1913 1.1 chs spa_vdev_state_enter(spa, SCL_NONE); 1914 1.1 chs spa->spa_scrub_reopen = B_TRUE; 1915 1.1 chs vdev_reopen(spa->spa_root_vdev); 1916 1.1 chs spa->spa_scrub_reopen = B_FALSE; 1917 1.1 chs (void) spa_vdev_state_exit(spa, NULL, 0); 1918 1.1 chs 1919 1.1 chs return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, 1920 1.1 chs dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE)); 1921 1.1 chs } 1922 1.1 chs 1923 1.1 chs static boolean_t 1924 1.1 chs dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) 1925 1.1 chs { 1926 1.1 chs return (scn->scn_restart_txg != 0 && 1927 1.1 chs scn->scn_restart_txg <= tx->tx_txg); 1928 1.1 chs } 1929