author     Andriy Gapon <avg@FreeBSD.org>  2019-11-21 14:10:53 +0000
committer  Andriy Gapon <avg@FreeBSD.org>  2019-11-21 14:10:53 +0000
commit     8a408c7a3a4606de36ef379ed7bdd9f2766fcdcf (patch)
tree       0c83be986d68d7d5852965b1c5112dd41ebb99e4
parent     1328f69913138b48fb85296bcee6d08dc1a9f268 (diff)
download   src-8a408c7a3a4606de36ef379ed7bdd9f2766fcdcf.tar.gz
           src-8a408c7a3a4606de36ef379ed7bdd9f2766fcdcf.zip
10952 defer new resilvers and misc. resilver-related fixes
illumos/illumos-gate@e4c795beb33bf59dd4ad2e3f88f493111484b890
https://github.com/illumos/illumos-gate/commit/e4c795beb33bf59dd4ad2e3f88f493111484b890
https://www.illumos.org/issues/10952
  From ZoL
  612c4930dd2 Fix the spelling of deferred
  cef48f14da6 Remove races from scrub / resilver tests
  4021ba4cfaa Make vdev_set_deferred_resilver() recursive
  8cb119e3dc0 Fix 2 small bugs with cached dsl_scan_phys_t
  5e0bd0ae056 Fix issue with scanning dedup blocks as scan ends
  b3d7725c943 Remove zfs_gitrev.h (this shouldn't be part of 80a91e74696)
  80a91e74696 Defer new resilvers until the current one ends

Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Portions contributed by: Brian Behlendorf <behlendorf1@llnl.gov>
Portions contributed by: Arkadiusz Bubała <arkadiusz.bubala@open-e.com>
Author: Tom Caputi <tcaputi@datto.com>
Notes:
    svn path=/vendor-sys/illumos/dist/; revision=354961
-rw-r--r--  cmd/zpool/zpool_main.c             |  57
-rw-r--r--  common/zfs/zfeature_common.c       |   7
-rw-r--r--  common/zfs/zfeature_common.h       |   1
-rw-r--r--  lib/libzfs/common/libzfs.h         |   1
-rw-r--r--  lib/libzfs/common/libzfs_pool.c    |   6
-rw-r--r--  lib/libzfs/common/libzfs_util.c    |   3
-rw-r--r--  man/man1m/zpool.1m                 |  18
-rw-r--r--  man/man5/zpool-features.5          |  23
-rw-r--r--  uts/common/fs/zfs/dsl_scan.c       | 142
-rw-r--r--  uts/common/fs/zfs/spa.c            |  20
-rw-r--r--  uts/common/fs/zfs/sys/spa_impl.h   |   7
-rw-r--r--  uts/common/fs/zfs/sys/vdev.h       |   2
-rw-r--r--  uts/common/fs/zfs/sys/vdev_impl.h  |   1
-rw-r--r--  uts/common/fs/zfs/vdev.c           |  42
-rw-r--r--  uts/common/fs/zfs/vdev_indirect.c  |  23
-rw-r--r--  uts/common/fs/zfs/vdev_label.c     |   6
-rw-r--r--  uts/common/fs/zfs/vdev_removal.c   |   6
-rw-r--r--  uts/common/fs/zfs/zil.c            |   2
-rw-r--r--  uts/common/sys/fs/zfs.h            |   2
19 files changed, 336 insertions, 33 deletions
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 9713aaac9bac..4e903406c217 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -88,6 +88,7 @@ static int zpool_do_split(int, char **);
static int zpool_do_initialize(int, char **);
static int zpool_do_scrub(int, char **);
+static int zpool_do_resilver(int, char **);
static int zpool_do_import(int, char **);
static int zpool_do_export(int, char **);
@@ -140,6 +141,7 @@ typedef enum {
HELP_REMOVE,
HELP_INITIALIZE,
HELP_SCRUB,
+ HELP_RESILVER,
HELP_STATUS,
HELP_UPGRADE,
HELP_GET,
@@ -192,6 +194,7 @@ static zpool_command_t command_table[] = {
{ "split", zpool_do_split, HELP_SPLIT },
{ NULL },
{ "initialize", zpool_do_initialize, HELP_INITIALIZE },
+ { "resilver", zpool_do_resilver, HELP_RESILVER },
{ "scrub", zpool_do_scrub, HELP_SCRUB },
{ NULL },
{ "import", zpool_do_import, HELP_IMPORT },
@@ -273,6 +276,8 @@ get_usage(zpool_help_t idx)
return (gettext("\tinitialize [-cs] <pool> [<device> ...]\n"));
case HELP_SCRUB:
return (gettext("\tscrub [-s | -p] <pool> ...\n"));
+ case HELP_RESILVER:
+ return (gettext("\tresilver <pool> ...\n"));
case HELP_STATUS:
return (gettext("\tstatus [-DgLPvx] [-T d|u] [pool] ... "
"[interval [count]]\n"));
@@ -1688,11 +1693,14 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
(void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS,
(uint64_t **)&ps, &c);
- if (ps != NULL && ps->pss_state == DSS_SCANNING &&
- vs->vs_scan_processed != 0 && children == 0) {
- (void) printf(gettext(" (%s)"),
- (ps->pss_func == POOL_SCAN_RESILVER) ?
- "resilvering" : "repairing");
+ if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0) {
+ if (vs->vs_scan_processed != 0) {
+ (void) printf(gettext(" (%s)"),
+ (ps->pss_func == POOL_SCAN_RESILVER) ?
+ "resilvering" : "repairing");
+ } else if (vs->vs_resilver_deferred) {
+ (void) printf(gettext(" (awaiting resilver)"));
+ }
}
if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE ||
@@ -4519,7 +4527,7 @@ scrub_callback(zpool_handle_t *zhp, void *data)
* Ignore faulted pools.
*/
if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
- (void) fprintf(stderr, gettext("cannot scrub '%s': pool is "
+ (void) fprintf(stderr, gettext("cannot scan '%s': pool is "
"currently unavailable\n"), zpool_get_name(zhp));
return (1);
}
@@ -4587,6 +4595,43 @@ zpool_do_scrub(int argc, char **argv)
return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb));
}
+/*
+ * zpool resilver <pool> ...
+ *
+ * Restarts any in-progress resilver
+ */
+int
+zpool_do_resilver(int argc, char **argv)
+{
+ int c;
+ scrub_cbdata_t cb;
+
+ cb.cb_type = POOL_SCAN_RESILVER;
+ cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
+ cb.cb_argc = argc;
+ cb.cb_argv = argv;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "")) != -1) {
+ switch (c) {
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name argument\n"));
+ usage(B_FALSE);
+ }
+
+ return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb));
+}
+
static void
zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res)
{
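With the zpool_main.c change above, zpool status annotates leaf vdevs that are waiting on a deferred resilver. A hypothetical excerpt (pool and device names are illustrative, not from this commit):

    # zpool status tank
      ...
        mirror-0    ONLINE       0     0     0
          c1t0d0    ONLINE       0     0     0
          c1t1d0    ONLINE       0     0     0  (awaiting resilver)
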
diff --git a/common/zfs/zfeature_common.c b/common/zfs/zfeature_common.c
index b4e80758eee6..1be115dfe61c 100644
--- a/common/zfs/zfeature_common.c
+++ b/common/zfs/zfeature_common.c
@@ -300,10 +300,13 @@ zpool_feature_init(void)
"freed or remapped.",
ZFEATURE_FLAG_READONLY_COMPAT, obsolete_counts_deps);
- {
zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES,
"org.zfsonlinux:allocation_classes", "allocation_classes",
"Support for separate allocation classes.",
ZFEATURE_FLAG_READONLY_COMPAT, NULL);
- }
+
+ zfeature_register(SPA_FEATURE_RESILVER_DEFER,
+ "com.datto:resilver_defer", "resilver_defer",
+ "Support for defering new resilvers when one is already running.",
+ ZFEATURE_FLAG_READONLY_COMPAT, NULL);
}
diff --git a/common/zfs/zfeature_common.h b/common/zfs/zfeature_common.h
index af29560ae902..ebe9626caf91 100644
--- a/common/zfs/zfeature_common.h
+++ b/common/zfs/zfeature_common.h
@@ -63,6 +63,7 @@ typedef enum spa_feature {
SPA_FEATURE_POOL_CHECKPOINT,
SPA_FEATURE_SPACEMAP_V2,
SPA_FEATURE_ALLOCATION_CLASSES,
+ SPA_FEATURE_RESILVER_DEFER,
SPA_FEATURES
} spa_feature_t;
diff --git a/lib/libzfs/common/libzfs.h b/lib/libzfs/common/libzfs.h
index fa005259c228..3dc5454c48d5 100644
--- a/lib/libzfs/common/libzfs.h
+++ b/lib/libzfs/common/libzfs.h
@@ -140,6 +140,7 @@ typedef enum zfs_error {
EZFS_TOOMANY, /* argument list too long */
EZFS_INITIALIZING, /* currently initializing */
EZFS_NO_INITIALIZE, /* no active initialize */
+ EZFS_NO_RESILVER_DEFER, /* pool doesn't support resilver_defer */
EZFS_UNKNOWN
} zfs_error_t;
diff --git a/lib/libzfs/common/libzfs_pool.c b/lib/libzfs/common/libzfs_pool.c
index 0596fa62ee57..699b7133ef8e 100644
--- a/lib/libzfs/common/libzfs_pool.c
+++ b/lib/libzfs/common/libzfs_pool.c
@@ -2034,6 +2034,10 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot scrub %s"), zc.zc_name);
}
+ } else if (func == POOL_SCAN_RESILVER) {
+ assert(cmd == POOL_SCRUB_NORMAL);
+ (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+ "cannot restart resilver on %s"), zc.zc_name);
} else if (func == POOL_SCAN_NONE) {
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"),
@@ -2061,6 +2065,8 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
}
} else if (err == ENOENT) {
return (zfs_error(hdl, EZFS_NO_SCRUB, msg));
+ } else if (err == ENOTSUP && func == POOL_SCAN_RESILVER) {
+ return (zfs_error(hdl, EZFS_NO_RESILVER_DEFER, msg));
} else {
return (zpool_standard_error(hdl, err, msg));
}
diff --git a/lib/libzfs/common/libzfs_util.c b/lib/libzfs/common/libzfs_util.c
index 3ea967fbedbb..246874b9f9af 100644
--- a/lib/libzfs/common/libzfs_util.c
+++ b/lib/libzfs/common/libzfs_util.c
@@ -260,6 +260,9 @@ libzfs_error_description(libzfs_handle_t *hdl)
case EZFS_NO_INITIALIZE:
return (dgettext(TEXT_DOMAIN, "there is no active "
"initialization"));
+ case EZFS_NO_RESILVER_DEFER:
+ return (dgettext(TEXT_DOMAIN, "this action requires the "
+ "resilver_defer feature"));
case EZFS_UNKNOWN:
return (dgettext(TEXT_DOMAIN, "unknown error"));
default:
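The libzfs changes above map the kernel's ENOTSUP for POOL_SCAN_RESILVER to the new EZFS_NO_RESILVER_DEFER error, so running the new subcommand against a pool without the feature fails with a self-explanatory message. A hypothetical session (pool name is illustrative):

    # zpool resilver tank
    cannot restart resilver on tank: this action requires the resilver_defer feature
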
diff --git a/man/man1m/zpool.1m b/man/man1m/zpool.1m
index f07bf81bd03a..636af46e24e4 100644
--- a/man/man1m/zpool.1m
+++ b/man/man1m/zpool.1m
@@ -26,7 +26,7 @@
.\" Copyright (c) 2017 George Melikov. All Rights Reserved.
.\" Copyright 2019 Joyent, Inc.
.\"
-.Dd April 27, 2018
+.Dd May 15, 2019
.Dt ZPOOL 1M
.Os
.Sh NAME
@@ -156,6 +156,9 @@
.Op Fl f
.Ar pool Ar device Op Ar new_device
.Nm
+.Cm resilver
+.Ar pool Ns ...
+.Nm
.Cm scrub
.Op Fl s | Fl p
.Ar pool Ns ...
@@ -1776,6 +1779,19 @@ Not all devices can be overridden in this manner.
.El
.It Xo
.Nm
+.Cm resilver
+.Ar pool Ns ...
+.Xc
+Starts a resilver.
+If an existing resilver is already running it will be restarted from the
+beginning.
+Any drives that were scheduled for a deferred resilver will be added to the
+new one.
+This requires the
+.Sy resilver_defer
+feature.
+.It Xo
+.Nm
.Cm scrub
.Op Fl s | Fl p
.Ar pool Ns ...
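A hypothetical invocation of the subcommand documented above (pool name is illustrative):

    # zpool resilver tank

Any in-progress resilver restarts from the beginning, and drives whose resilvers were deferred are folded into the new one.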
diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5
index 9601932b26f9..2f3786fbb4c9 100644
--- a/man/man5/zpool-features.5
+++ b/man/man5/zpool-features.5
@@ -15,7 +15,7 @@
.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
-.TH ZPOOL-FEATURES 5 "Apr 19, 2019"
+.TH ZPOOL-FEATURES 5 "May 15, 2019"
.SH NAME
zpool\-features \- ZFS pool feature descriptions
.SH DESCRIPTION
@@ -683,5 +683,26 @@ This feature becomes \fBactive\fR when a dedicated allocation class vdev
removal, it can be returned to the \fBenabled\fR state if all the top-level
vdevs from an allocation class are removed.
+.RE
+.sp
+.ne 2
+.na
+\fB\fBresilver_defer\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID com.datto:resilver_defer
+READ\-ONLY COMPATIBLE yes
+DEPENDENCIES none
+.TE
+
+This feature allows ZFS to postpone new resilvers if an existing one is already
+in progress. Without this feature, any new resilvers will cause the currently
+running one to be immediately restarted from the beginning.
+
+This feature becomes \fBactive\fR once a resilver has been deferred, and
+returns to being \fBenabled\fR when the deferred resilver begins.
+
.SH "SEE ALSO"
\fBzpool\fR(1M)
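Like any pool feature, the state of resilver_defer can be inspected through the feature@ property namespace. A hypothetical query (pool name and output values are illustrative):

    # zpool get feature@resilver_defer tank
    NAME  PROPERTY                VALUE    SOURCE
    tank  feature@resilver_defer  enabled  local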
diff --git a/uts/common/fs/zfs/dsl_scan.c b/uts/common/fs/zfs/dsl_scan.c
index 78c8c2a5818e..b5ef5a89e9db 100644
--- a/uts/common/fs/zfs/dsl_scan.c
+++ b/uts/common/fs/zfs/dsl_scan.c
@@ -183,12 +183,15 @@ unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
unsigned int zfs_obsolete_min_time_ms = 500;
/* min millisecs to resilver per txg */
unsigned int zfs_resilver_min_time_ms = 3000;
+int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */
boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
/* max number of blocks to free in a single TXG */
uint64_t zfs_async_block_max_blocks = UINT64_MAX;
+int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */
+
/*
* We wait a few txgs after importing a pool to begin scanning so that
* the import / mounting code isn't held up by scrub / resilver IO.
@@ -455,7 +458,6 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
SPA_FEATURE_ASYNC_DESTROY);
- bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
offsetof(scan_ds_t, sds_node));
avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
@@ -513,6 +515,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
}
}
+ bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+
/* reload the queue into the in-core state */
if (scn->scn_phys.scn_queue_obj != 0) {
zap_cursor_t zc;
@@ -751,6 +755,11 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
spa->spa_scrub_reopen = B_FALSE;
(void) spa_vdev_state_exit(spa, NULL, 0);
+ if (func == POOL_SCAN_RESILVER) {
+ dsl_resilver_restart(spa->spa_dsl_pool, 0);
+ return (0);
+ }
+
if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
/* got scrub start cmd, resume paused scrub */
int err = dsl_scrub_set_pause_resume(scn->scn_dp,
@@ -766,6 +775,41 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
+/*
+ * Sets the resilver defer flag to B_FALSE on all leaf devs under vd. Returns
+ * B_TRUE if we have devices that need to be resilvered and are available to
+ * accept resilver I/Os.
+ */
+static boolean_t
+dsl_scan_clear_deferred(vdev_t *vd, dmu_tx_t *tx)
+{
+ boolean_t resilver_needed = B_FALSE;
+ spa_t *spa = vd->vdev_spa;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ resilver_needed |=
+ dsl_scan_clear_deferred(vd->vdev_child[c], tx);
+ }
+
+ if (vd == spa->spa_root_vdev &&
+ spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
+ spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
+ vdev_config_dirty(vd);
+ spa->spa_resilver_deferred = B_FALSE;
+ return (resilver_needed);
+ }
+
+ if (!vdev_is_concrete(vd) || vd->vdev_aux ||
+ !vd->vdev_ops->vdev_op_leaf)
+ return (resilver_needed);
+
+ if (vd->vdev_resilver_deferred)
+ vd->vdev_resilver_deferred = B_FALSE;
+
+ return (!vdev_is_dead(vd) && !vd->vdev_offline &&
+ vdev_resilver_needed(vd, NULL, NULL));
+}
+
/* ARGSUSED */
static void
dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
@@ -865,6 +909,25 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
* Let the async thread assess this and handle the detach.
*/
spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+
+ /*
+ * Clear any deferred_resilver flags in the config.
+ * If there are drives that need resilvering, kick
+ * off an asynchronous request to start resilver.
+ * dsl_scan_clear_deferred() may update the config
+ * before the resilver can restart. In the event of
+ * a crash during this period, the spa loading code
+ * will find the drives that need to be resilvered
+ * when the machine reboots and start the resilver then.
+ */
+ boolean_t resilver_needed =
+ dsl_scan_clear_deferred(spa->spa_root_vdev, tx);
+ if (resilver_needed) {
+ spa_history_log_internal(spa,
+ "starting deferred resilver", tx,
+ "errors=%llu", spa_get_errlog_size(spa));
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+ }
}
scn->scn_phys.scn_end_time = gethrestime_sec();
@@ -935,6 +998,7 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
/* can't pause a scrub when there is no in-progress scrub */
spa->spa_scan_pass_scrub_pause = gethrestime_sec();
scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
+ scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED;
dsl_scan_sync_state(scn, tx, SYNC_CACHED);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
} else {
@@ -949,6 +1013,7 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
spa->spa_scan_pass_scrub_pause = 0;
scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
+ scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED;
dsl_scan_sync_state(scn, tx, SYNC_CACHED);
}
}
@@ -2335,6 +2400,20 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
if (scn->scn_phys.scn_state != DSS_SCANNING)
return;
+ /*
+ * This function is special because it is the only thing
+ * that can add scan_io_t's to the vdev scan queues from
+ * outside dsl_scan_sync(). For the most part this is ok
+ * as long as it is called from within syncing context.
+ * However, dsl_scan_sync() expects that no new sio's will
+ * be added between when all the work for a scan is done
+ * and the next txg when the scan is actually marked as
+ * completed. This check ensures we do not issue new sio's
+ * during this period.
+ */
+ if (scn->scn_done_txg != 0)
+ return;
+
for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0 ||
ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
@@ -2986,6 +3065,26 @@ dsl_scan_active(dsl_scan_t *scn)
}
static boolean_t
+dsl_scan_check_deferred(vdev_t *vd)
+{
+ boolean_t need_resilver = B_FALSE;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ need_resilver |=
+ dsl_scan_check_deferred(vd->vdev_child[c]);
+ }
+
+ if (!vdev_is_concrete(vd) || vd->vdev_aux ||
+ !vd->vdev_ops->vdev_op_leaf)
+ return (need_resilver);
+
+ if (!vd->vdev_resilver_deferred)
+ need_resilver = B_TRUE;
+
+ return (need_resilver);
+}
+
+static boolean_t
dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
uint64_t phys_birth)
{
@@ -3032,6 +3131,13 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
return (B_FALSE);
+ /*
+ * Check that this top-level vdev has a device under it which
+ * is resilvering and is not deferred.
+ */
+ if (!dsl_scan_check_deferred(vd))
+ return (B_FALSE);
+
return (B_TRUE);
}
@@ -3193,12 +3299,19 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
int err = 0;
state_sync_type_t sync_type = SYNC_OPTIONAL;
+ if (spa->spa_resilver_deferred &&
+ !spa_feature_is_active(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))
+ spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
+
/*
* Check for scn_restart_txg before checking spa_load_state, so
* that we can restart an old-style scan while the pool is being
- * imported (see dsl_scan_init).
+ * imported (see dsl_scan_init). We also restart scans if there
+ * is a deferred resilver and the user has manually disabled
+ * deferred resilvers via the tunable.
*/
- if (dsl_scan_restarting(scn, tx)) {
+ if (dsl_scan_restarting(scn, tx) ||
+ (spa->spa_resilver_deferred && zfs_resilver_disable_defer)) {
pool_scan_func_t func = POOL_SCAN_SCRUB;
dsl_scan_done(scn, B_FALSE, tx);
if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
@@ -3265,6 +3378,27 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
return;
/*
+ * zfs_scan_suspend_progress can be set to disable scan progress.
+ * We don't want to spin the txg_sync thread, so we add a delay
+ * here to simulate the time spent doing a scan. This is mostly
+ * useful for testing and debugging.
+ */
+ if (zfs_scan_suspend_progress) {
+ uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
+ int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
+
+ while (zfs_scan_suspend_progress &&
+ !txg_sync_waiting(scn->scn_dp) &&
+ !spa_shutting_down(scn->scn_dp->dp_spa) &&
+ NSEC2MSEC(scan_time_ns) < mintime) {
+ delay(hz);
+ scan_time_ns = gethrtime() - scn->scn_sync_start_time;
+ }
+ return;
+ }
+
+ /*
* It is possible to switch from unsorted to sorted at any time,
* but afterwards the scan will remain sorted unless reloaded from
* a checkpoint after a reboot.
@@ -3393,6 +3527,8 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
(longlong_t)tx->tx_txg);
}
} else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) {
+ ASSERT(scn->scn_clearing);
+
/* need to issue scrubbing IOs from per-vdev queues */
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
NULL, ZIO_FLAG_CANFAIL);
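The zfs_scan_suspend_progress and zfs_resilver_disable_defer variables introduced in dsl_scan.c are plain global ints, so on a live illumos system they can be toggled at run time with mdb. A debugging sketch, not part of this commit:

    # echo 'zfs_scan_suspend_progress/W 1' | mdb -kw
    # echo 'zfs_scan_suspend_progress/W 0' | mdb -kw
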
diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c
index d3f6b4792050..f701fd044d59 100644
--- a/uts/common/fs/zfs/spa.c
+++ b/uts/common/fs/zfs/spa.c
@@ -6176,9 +6176,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
/*
* Schedule the resilver to restart in the future. We do this to
* ensure that dmu_sync-ed blocks have been stitched into the
- * respective datasets.
+ * respective datasets. We do not do this if resilvers have been
+ * deferred.
*/
- dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+ if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
+ vdev_set_deferred_resilver(spa, newvd);
+ else
+ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
if (spa->spa_bootfs)
spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
@@ -7069,6 +7074,10 @@ spa_scan(spa_t *spa, pool_scan_func_t func)
if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
return (SET_ERROR(ENOTSUP));
+ if (func == POOL_SCAN_RESILVER &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
+ return (SET_ERROR(ENOTSUP));
+
/*
* If a resilver was requested, but there is no DTL on a
* writeable leaf device, we have nothing to do.
@@ -7160,6 +7169,7 @@ static void
spa_async_thread(void *arg)
{
spa_t *spa = (spa_t *)arg;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
int tasks;
ASSERT(spa->spa_sync_on);
@@ -7235,8 +7245,10 @@ spa_async_thread(void *arg)
/*
* Kick off a resilver.
*/
- if (tasks & SPA_ASYNC_RESILVER)
- dsl_resilver_restart(spa->spa_dsl_pool, 0);
+ if (tasks & SPA_ASYNC_RESILVER &&
+ (!dsl_scan_resilvering(dp) ||
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
+ dsl_resilver_restart(dp, 0);
if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
mutex_enter(&spa_namespace_lock);
diff --git a/uts/common/fs/zfs/sys/spa_impl.h b/uts/common/fs/zfs/sys/spa_impl.h
index b3840a98a338..19516a1a1bde 100644
--- a/uts/common/fs/zfs/sys/spa_impl.h
+++ b/uts/common/fs/zfs/sys/spa_impl.h
@@ -279,6 +279,13 @@ struct spa {
uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */
uint64_t spa_scan_pass_exam; /* examined bytes per pass */
uint64_t spa_scan_pass_issued; /* issued bytes per pass */
+
+ /*
+ * We are in the middle of a resilver, and another resilver
+ * is needed once this one completes. This is set iff any
+ * vdev_resilver_deferred is set.
+ */
+ boolean_t spa_resilver_deferred;
kmutex_t spa_async_lock; /* protect async state */
kthread_t *spa_async_thread; /* thread doing async task */
int spa_async_suspended; /* async tasks suspended */
diff --git a/uts/common/fs/zfs/sys/vdev.h b/uts/common/fs/zfs/sys/vdev.h
index e21989641b91..3a3662c612da 100644
--- a/uts/common/fs/zfs/sys/vdev.h
+++ b/uts/common/fs/zfs/sys/vdev.h
@@ -149,6 +149,8 @@ extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd);
+extern void vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd);
+
typedef enum vdev_config_flag {
VDEV_CONFIG_SPARE = 1 << 0,
VDEV_CONFIG_L2CACHE = 1 << 1,
diff --git a/uts/common/fs/zfs/sys/vdev_impl.h b/uts/common/fs/zfs/sys/vdev_impl.h
index a91927dbb6a5..b5116447eaca 100644
--- a/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/uts/common/fs/zfs/sys/vdev_impl.h
@@ -346,6 +346,7 @@ struct vdev {
boolean_t vdev_cant_write; /* vdev is failing all writes */
boolean_t vdev_isspare; /* was a hot spare */
boolean_t vdev_isl2cache; /* was a l2cache device */
+ boolean_t vdev_resilver_deferred; /* resilver deferred */
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
vdev_cache_t vdev_cache; /* physical block cache */
spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c
index 1fd81bcf7b43..219be1901578 100644
--- a/uts/common/fs/zfs/vdev.c
+++ b/uts/common/fs/zfs/vdev.c
@@ -760,6 +760,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
&vd->vdev_resilver_txg);
+ if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
+ vdev_set_deferred_resilver(spa, vd);
+
/*
* When importing a pool, we want to ignore the persistent fault
* state, as the diagnosis made on another system may not be
@@ -1733,8 +1736,13 @@ vdev_open(vdev_t *vd)
* since this would just restart the scrub we are already doing.
*/
if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
- vdev_resilver_needed(vd, NULL, NULL))
- spa_async_request(spa, SPA_ASYNC_RESILVER);
+ vdev_resilver_needed(vd, NULL, NULL)) {
+ if (dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
+ vdev_set_deferred_resilver(spa, vd);
+ else
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+ }
return (0);
}
@@ -2441,6 +2449,9 @@ vdev_dtl_should_excise(vdev_t *vd)
if (vd->vdev_state < VDEV_STATE_DEGRADED)
return (B_FALSE);
+ if (vd->vdev_resilver_deferred)
+ return (B_FALSE);
+
if (vd->vdev_resilver_txg == 0 ||
range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
return (B_TRUE);
@@ -3474,8 +3485,14 @@ vdev_clear(spa_t *spa, vdev_t *vd)
if (vd != rvd && vdev_writeable(vd->vdev_top))
vdev_state_dirty(vd->vdev_top);
- if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
- spa_async_request(spa, SPA_ASYNC_RESILVER);
+ if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) {
+ if (dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ spa_feature_is_enabled(spa,
+ SPA_FEATURE_RESILVER_DEFER))
+ vdev_set_deferred_resilver(spa, vd);
+ else
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+ }
spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
}
@@ -3618,6 +3635,8 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
vd->vdev_mg->mg_fragmentation : 0;
}
+ if (vd->vdev_ops->vdev_op_leaf)
+ vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
/*
* If we're getting stats on the root vdev, aggregate the I/O counts
@@ -4331,3 +4350,18 @@ vdev_deadman(vdev_t *vd)
mutex_exit(&vq->vq_lock);
}
}
+
+void
+vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd)
+{
+ for (uint64_t i = 0; i < vd->vdev_children; i++)
+ vdev_set_deferred_resilver(spa, vd->vdev_child[i]);
+
+ if (!vd->vdev_ops->vdev_op_leaf || !vdev_writeable(vd) ||
+ range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
+ return;
+ }
+
+ vd->vdev_resilver_deferred = B_TRUE;
+ spa->spa_resilver_deferred = B_TRUE;
+}
diff --git a/uts/common/fs/zfs/vdev_indirect.c b/uts/common/fs/zfs/vdev_indirect.c
index 958e8cd8581e..062c4073a81b 100644
--- a/uts/common/fs/zfs/vdev_indirect.c
+++ b/uts/common/fs/zfs/vdev_indirect.c
@@ -1239,6 +1239,8 @@ vdev_indirect_read_all(zio_t *zio)
{
indirect_vsd_t *iv = zio->io_vsd;
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
for (indirect_split_t *is = list_head(&iv->iv_splits);
is != NULL; is = list_next(&iv->iv_splits, is)) {
for (int i = 0; i < is->is_children; i++) {
@@ -1321,7 +1323,8 @@ vdev_indirect_io_start(zio_t *zio)
vdev_indirect_child_io_done, zio));
} else {
iv->iv_split_block = B_TRUE;
- if (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
+ if (zio->io_type == ZIO_TYPE_READ &&
+ zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
/*
* Read all copies. Note that for simplicity,
* we don't bother consulting the DTL in the
@@ -1330,13 +1333,17 @@ vdev_indirect_io_start(zio_t *zio)
vdev_indirect_read_all(zio);
} else {
/*
- * Read one copy of each split segment, from the
- * top-level vdev. Since we don't know the
- * checksum of each split individually, the child
- * zio can't ensure that we get the right data.
- * E.g. if it's a mirror, it will just read from a
- * random (healthy) leaf vdev. We have to verify
- * the checksum in vdev_indirect_io_done().
+ * If this is a read zio, we read one copy of each
+ * split segment, from the top-level vdev. Since
+ * we don't know the checksum of each split
+ * individually, the child zio can't ensure that
+ * we get the right data. E.g. if it's a mirror,
+ * it will just read from a random (healthy) leaf
+ * vdev. We have to verify the checksum in
+ * vdev_indirect_io_done().
+ *
+ * For write zios, the vdev code will ensure we write
+ * to all children.
*/
for (indirect_split_t *is = list_head(&iv->iv_splits);
is != NULL; is = list_next(&iv->iv_splits, is)) {
diff --git a/uts/common/fs/zfs/vdev_label.c b/uts/common/fs/zfs/vdev_label.c
index 31e05f7b1780..0ba8a948237a 100644
--- a/uts/common/fs/zfs/vdev_label.c
+++ b/uts/common/fs/zfs/vdev_label.c
@@ -377,6 +377,12 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
vd->vdev_top_zap);
}
+
+ if (vd->vdev_resilver_deferred) {
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(spa->spa_resilver_deferred);
+ fnvlist_add_boolean(nv, ZPOOL_CONFIG_RESILVER_DEFER);
+ }
}
if (getstats) {
diff --git a/uts/common/fs/zfs/vdev_removal.c b/uts/common/fs/zfs/vdev_removal.c
index c6874ae06a3f..aaaece7c4d4d 100644
--- a/uts/common/fs/zfs/vdev_removal.c
+++ b/uts/common/fs/zfs/vdev_removal.c
@@ -127,7 +127,7 @@ int vdev_removal_max_span = 32 * 1024;
* This is used by the test suite so that it can ensure that certain
* actions happen while in the middle of a removal.
*/
-uint64_t zfs_remove_max_bytes_pause = UINT64_MAX;
+int zfs_removal_suspend_progress = 0;
#define VDEV_REMOVAL_ZAP_OBJS "lzap"
@@ -1433,14 +1433,14 @@ spa_vdev_remove_thread(void *arg)
/*
* This delay will pause the removal around the point
- * specified by zfs_remove_max_bytes_pause. We do this
+ * specified by zfs_removal_suspend_progress. We do this
* solely from the test suite or during debugging.
*/
uint64_t bytes_copied =
spa->spa_removing_phys.sr_copied;
for (int i = 0; i < TXG_SIZE; i++)
bytes_copied += svr->svr_bytes_done[i];
- while (zfs_remove_max_bytes_pause <= bytes_copied &&
+ while (zfs_removal_suspend_progress &&
!svr->svr_thread_exit)
delay(hz);
diff --git a/uts/common/fs/zfs/zil.c b/uts/common/fs/zfs/zil.c
index 546e4f3d1e12..7e88c51a0b39 100644
--- a/uts/common/fs/zfs/zil.c
+++ b/uts/common/fs/zfs/zil.c
@@ -1252,7 +1252,7 @@ zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
* root zios). This is required because of how we can
* defer the DKIOCFLUSHWRITECACHE commands for each lwb.
*
- * When the DKIOCFLUSHWRITECACHE commands are defered,
+ * When the DKIOCFLUSHWRITECACHE commands are deferred,
* the previous lwb will rely on this lwb to flush the
* vdevs written to by that previous lwb. Thus, we need
* to ensure this lwb doesn't issue the flush until
diff --git a/uts/common/sys/fs/zfs.h b/uts/common/sys/fs/zfs.h
index 4cb8a4351344..831f9c1e3277 100644
--- a/uts/common/sys/fs/zfs.h
+++ b/uts/common/sys/fs/zfs.h
@@ -597,6 +597,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top"
#define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf"
#define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps"
+#define ZPOOL_CONFIG_RESILVER_DEFER "com.datto:resilver_defer"
#define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */
#define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */
#define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */
@@ -896,6 +897,7 @@ typedef struct vdev_stat {
uint64_t vs_initialize_state; /* vdev_initialzing_state_t */
uint64_t vs_initialize_action_time; /* time_t */
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
+ uint64_t vs_resilver_deferred; /* resilver deferred */
} vdev_stat_t;
/*