-rw-r--r--  uts/common/fs/zfs/arc.c            16
-rw-r--r--  uts/common/fs/zfs/spa.c           360
-rw-r--r--  uts/common/fs/zfs/vdev.c            6
-rw-r--r--  uts/common/fs/zfs/vdev_removal.c    3
4 files changed, 214 insertions, 171 deletions
diff --git a/uts/common/fs/zfs/arc.c b/uts/common/fs/zfs/arc.c
index e64d8945973f..2db6cc557046 100644
--- a/uts/common/fs/zfs/arc.c
+++ b/uts/common/fs/zfs/arc.c
@@ -3124,7 +3124,8 @@ arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
{
l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
l2arc_dev_t *dev = l2hdr->b_dev;
- uint64_t psize = arc_hdr_size(hdr);
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
ASSERT(HDR_HAS_L2HDR(hdr));
@@ -3134,9 +3135,10 @@ arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
ARCSTAT_INCR(arcstat_l2_psize, -psize);
ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
- vdev_space_update(dev->l2ad_vdev, -psize, 0, 0);
+ vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
- (void) zfs_refcount_remove_many(&dev->l2ad_alloc, psize, hdr);
+ (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
+ hdr);
arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
}
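
Both hunks above correct the units used for L2ARC device space accounting: the header's physical size (HDR_GET_PSIZE()) must be converted to an allocated size with vdev_psize_to_asize() before being passed to vdev_space_update(), because the device allocates in multiples of its sector size. A minimal standalone sketch of the rounding involved, assuming the usual power-of-two alignment (the helper below is an illustrative stand-in, not the kernel function):

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Illustrative stand-in for vdev_psize_to_asize(): round a physical
     * size up to the device's allocation unit (1 << ashift).
     */
    static uint64_t
    psize_to_asize_sketch(uint64_t psize, uint64_t ashift)
    {
        uint64_t align = 1ULL << ashift;

        return ((psize + align - 1) & ~(align - 1));
    }

    int
    main(void)
    {
        /*
         * A 2560-byte compressed buffer on a 4K-sector (ashift=12)
         * cache device occupies a 4096-byte allocation, so charging
         * psize to vdev_space_update() under-counts by 1536 bytes.
         */
        printf("asize = %llu\n",
            (unsigned long long)psize_to_asize_sketch(2560, 12));
        return (0);
    }

Note that the refcount in l2ad_alloc still uses arc_hdr_size(hdr): it tracks the in-core charge, while vdev_space_update() tracks on-device allocations, and the fix keeps the two units separate.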
@@ -6759,10 +6761,12 @@ top:
list_remove(buflist, hdr);
arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
- ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr));
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ ARCSTAT_INCR(arcstat_l2_psize, -psize);
ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
- bytes_dropped += arc_hdr_size(hdr);
+ bytes_dropped +=
+ vdev_psize_to_asize(dev->l2ad_vdev, psize);
(void) zfs_refcount_remove_many(&dev->l2ad_alloc,
arc_hdr_size(hdr), hdr);
}
@@ -7213,6 +7217,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
write_psize += psize;
write_asize += asize;
dev->l2ad_hand += asize;
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
mutex_exit(hash_lock);
@@ -7238,7 +7243,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
ARCSTAT_INCR(arcstat_l2_psize, write_psize);
- vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
/*
* Bump device hand to the device start if it is approaching the end.
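
The last two arc.c hunks make the same correction inside l2arc_write_buffers(): the old code charged one aggregate write_psize at the end, while the new code charges each buffer's asize as it is written. The order matters because rounding happens per allocation. A hedged sketch with made-up buffer sizes, reusing the rounding helper idea from above:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t
    round_up(uint64_t v, uint64_t ashift)
    {
        uint64_t a = 1ULL << ashift;

        return ((v + a - 1) & ~(a - 1));
    }

    int
    main(void)
    {
        uint64_t psize[] = { 2560, 512, 4608 };  /* hypothetical buffers */
        uint64_t asize_sum = 0, psize_sum = 0;

        for (int i = 0; i < 3; i++) {
            asize_sum += round_up(psize[i], 12); /* what the vdev allocates */
            psize_sum += psize[i];               /* what the old code charged */
        }
        /* prints 16384 vs 7680: the device allocated over 2x the raw psize */
        printf("per-buffer asize %llu vs summed psize %llu\n",
            (unsigned long long)asize_sum, (unsigned long long)psize_sum);
        return (0);
    }
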
diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c
index 3b6821828853..d3f6b4792050 100644
--- a/uts/common/fs/zfs/spa.c
+++ b/uts/common/fs/zfs/spa.c
@@ -7379,6 +7379,9 @@ spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
static void
spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
{
+ if (spa_sync_pass(spa) != 1)
+ return;
+
zio_t *zio = zio_root(spa, NULL, NULL, 0);
VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
spa_free_sync_cb, zio, tx), ==, 0);
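
This guard makes spa_sync_deferred_frees(), like spa_sync_upgrades() and svr_sync() later in the diff, responsible for its own pass check, so spa_sync() can call it unconditionally on every convergence pass. A tiny runnable illustration of the pattern, with hypothetical names:

    #include <stdio.h>

    static int sync_pass;            /* stands in for spa->spa_sync_pass */

    /*
     * Hypothetical per-pass step: it guards itself, the way the diff
     * moves the pass-1 checks into spa_sync_deferred_frees() and
     * spa_sync_upgrades().
     */
    static void
    pass1_only_step(void)
    {
        if (sync_pass != 1)
            return;
        printf("ran in pass %d\n", sync_pass);
    }

    int
    main(void)
    {
        /* The caller no longer tracks which steps are pass-1 only. */
        for (sync_pass = 1; sync_pass <= 3; sync_pass++)
            pass1_only_step();       /* prints once, for pass 1 */
        return (0);
    }
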
@@ -7774,10 +7777,10 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
- dsl_pool_t *dp = spa->spa_dsl_pool;
-
- ASSERT(spa->spa_sync_pass == 1);
+ if (spa_sync_pass(spa) != 1)
+ return;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
@@ -7872,116 +7875,31 @@ vdev_indirect_state_sync_verify(vdev_t *vd)
}
/*
- * Sync the specified transaction group. New blocks may be dirtied as
- * part of the process, so we iterate until it converges.
+ * Set the top-level vdev's max queue depth. Evaluate each top-level's
+ * async write queue depth in case it changed. The max queue depth will
+ * not change in the middle of syncing out this txg.
*/
-void
-spa_sync(spa_t *spa, uint64_t txg)
+static void
+spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
{
- dsl_pool_t *dp = spa->spa_dsl_pool;
- objset_t *mos = spa->spa_meta_objset;
- bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
- metaslab_class_t *normal = spa_normal_class(spa);
- metaslab_class_t *special = spa_special_class(spa);
- metaslab_class_t *dedup = spa_dedup_class(spa);
+ ASSERT(spa_writeable(spa));
+
vdev_t *rvd = spa->spa_root_vdev;
- vdev_t *vd;
- dmu_tx_t *tx;
- int error;
uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
zfs_vdev_queue_depth_pct / 100;
+ metaslab_class_t *normal = spa_normal_class(spa);
+ metaslab_class_t *special = spa_special_class(spa);
+ metaslab_class_t *dedup = spa_dedup_class(spa);
- VERIFY(spa_writeable(spa));
-
- /*
- * Wait for i/os issued in open context that need to complete
- * before this txg syncs.
- */
- (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
- spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL);
-
- /*
- * Lock out configuration changes.
- */
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
- spa->spa_syncing_txg = txg;
- spa->spa_sync_pass = 0;
-
- for (int i = 0; i < spa->spa_alloc_count; i++) {
- mutex_enter(&spa->spa_alloc_locks[i]);
- VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
- mutex_exit(&spa->spa_alloc_locks[i]);
- }
-
- /*
- * If there are any pending vdev state changes, convert them
- * into config changes that go out with this transaction group.
- */
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
- while (list_head(&spa->spa_state_dirty_list) != NULL) {
- /*
- * We need the write lock here because, for aux vdevs,
- * calling vdev_config_dirty() modifies sav_config.
- * This is ugly and will become unnecessary when we
- * eliminate the aux vdev wart by integrating all vdevs
- * into the root vdev tree.
- */
- spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
- spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
- while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
- vdev_state_clean(vd);
- vdev_config_dirty(vd);
- }
- spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
- spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
- }
- spa_config_exit(spa, SCL_STATE, FTAG);
-
- tx = dmu_tx_create_assigned(dp, txg);
-
- spa->spa_sync_starttime = gethrtime();
- VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
- spa->spa_sync_starttime + spa->spa_deadman_synctime));
-
- /*
- * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
- * set spa_deflate if we have no raid-z vdevs.
- */
- if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
- spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
- int i;
-
- for (i = 0; i < rvd->vdev_children; i++) {
- vd = rvd->vdev_child[i];
- if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
- break;
- }
- if (i == rvd->vdev_children) {
- spa->spa_deflate = TRUE;
- VERIFY(0 == zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
- sizeof (uint64_t), 1, &spa->spa_deflate, tx));
- }
- }
-
- /*
- * Set the top-level vdev's max queue depth. Evaluate each
- * top-level's async write queue depth in case it changed.
- * The max queue depth will not change in the middle of syncing
- * out this txg.
- */
uint64_t slots_per_allocator = 0;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
- metaslab_group_t *mg = tvd->vdev_mg;
- metaslab_class_t *mc;
+ metaslab_group_t *mg = tvd->vdev_mg;
if (mg == NULL || !metaslab_group_initialized(mg))
continue;
- mc = mg->mg_class;
+ metaslab_class_t *mc = mg->mg_class;
if (mc != normal && mc != special && mc != dedup)
continue;
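
For scale, the max_queue_depth computed at the top of spa_sync_adjust_vdev_max_queue_depth() can be worked through with the default tunables of this era (the values below are assumed, not part of the diff):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* Assumed defaults; the diff itself does not set these. */
        uint32_t zfs_vdev_async_write_max_active = 10;
        uint32_t zfs_vdev_queue_depth_pct = 1000;

        uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
            zfs_vdev_queue_depth_pct / 100;

        /* 10 * 1000 / 100 = 100 queued async writes per top-level vdev */
        printf("max_queue_depth = %u\n", max_queue_depth);
        return (0);
    }
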
@@ -8013,7 +7931,14 @@ spa_sync(spa_t *spa, uint64_t txg)
normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+}
+
+static void
+spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
+{
+ ASSERT(spa_writeable(spa));
+ vdev_t *rvd = spa->spa_root_vdev;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
vdev_indirect_state_sync_verify(vd);
@@ -8023,10 +7948,16 @@ spa_sync(spa_t *spa, uint64_t txg)
break;
}
}
+}
+
+static void
+spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ uint64_t txg = tx->tx_txg;
+ bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
- /*
- * Iterate to convergence.
- */
do {
int pass = ++spa->spa_sync_pass;
@@ -8052,79 +7983,60 @@ spa_sync(spa_t *spa, uint64_t txg)
ddt_sync(spa, txg);
dsl_scan_sync(dp, tx);
+ svr_sync(spa, tx);
+ spa_sync_upgrades(spa, tx);
- if (spa->spa_vdev_removal != NULL)
- svr_sync(spa, tx);
-
+ vdev_t *vd = NULL;
while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
!= NULL)
vdev_sync(vd, txg);
- if (pass == 1) {
- spa_sync_upgrades(spa, tx);
- ASSERT3U(txg, >=,
- spa->spa_uberblock.ub_rootbp.blk_birth);
+ /*
+ * Note: We need to check if the MOS is dirty because we could
+ * have marked the MOS dirty without updating the uberblock
+ * (e.g. if we have sync tasks but no dirty user data). We need
+ * to check the uberblock's rootbp because it is updated if we
+ * have synced out dirty data (though in this case the MOS will
+ * most likely also be dirty due to second order effects, we
+ * don't want to rely on that here).
+ */
+ if (pass == 1 &&
+ spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+ !dmu_objset_is_dirty(mos, txg)) {
/*
- * Note: We need to check if the MOS is dirty
- * because we could have marked the MOS dirty
- * without updating the uberblock (e.g. if we
- * have sync tasks but no dirty user data). We
- * need to check the uberblock's rootbp because
- * it is updated if we have synced out dirty
- * data (though in this case the MOS will most
- * likely also be dirty due to second order
- * effects, we don't want to rely on that here).
+ * Nothing changed on the first pass, therefore this
+ * TXG is a no-op. Avoid syncing deferred frees, so
+ * that we can keep this TXG as a no-op.
*/
- if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
- !dmu_objset_is_dirty(mos, txg)) {
- /*
- * Nothing changed on the first pass,
- * therefore this TXG is a no-op. Avoid
- * syncing deferred frees, so that we
- * can keep this TXG as a no-op.
- */
- ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
- txg));
- ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
- ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
- ASSERT(txg_list_empty(&dp->dp_early_sync_tasks,
- txg));
- break;
- }
- spa_sync_deferred_frees(spa, tx);
+ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
+ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+ ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
+ ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
+ break;
}
+ spa_sync_deferred_frees(spa, tx);
} while (dmu_objset_is_dirty(mos, txg));
+}
- if (!list_is_empty(&spa->spa_config_dirty_list)) {
- /*
- * Make sure that the number of ZAPs for all the vdevs matches
- * the number of ZAPs in the per-vdev ZAP list. This only gets
- * called if the config is dirty; otherwise there may be
- * outstanding AVZ operations that weren't completed in
- * spa_sync_config_object.
- */
- uint64_t all_vdev_zap_entry_count;
- ASSERT0(zap_count(spa->spa_meta_objset,
- spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
- ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
- all_vdev_zap_entry_count);
- }
-
- if (spa->spa_vdev_removal != NULL) {
- ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
- }
+/*
+ * Rewrite the vdev configuration (which includes the uberblock) to
+ * commit the transaction group.
+ *
+ * If there are no dirty vdevs, we sync the uberblock to a few random
+ * top-level vdevs that are known to be visible in the config cache
+ * (see spa_vdev_add() for a complete description). If there *are* dirty
+ * vdevs, sync the uberblock to all vdevs.
+ */
+static void
+spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t txg = tx->tx_txg;
- /*
- * Rewrite the vdev configuration (which includes the uberblock)
- * to commit the transaction group.
- *
- * If there are no dirty vdevs, we sync the uberblock to a few
- * random top-level vdevs that are known to be visible in the
- * config cache (see spa_vdev_add() for a complete description).
- * If there *are* dirty vdevs, sync the uberblock to all vdevs.
- */
for (;;) {
+ int error = 0;
+
/*
* We hold SCL_STATE to prevent vdev open/close/etc.
* while we're attempting to write the vdev labels.
@@ -8138,13 +8050,15 @@ spa_sync(spa_t *spa, uint64_t txg)
int c0 = spa_get_random(children);
for (int c = 0; c < children; c++) {
- vd = rvd->vdev_child[(c0 + c) % children];
+ vdev_t *vd =
+ rvd->vdev_child[(c0 + c) % children];
/* Stop when revisiting the first vdev */
if (c > 0 && svd[0] == vd)
break;
- if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
+ if (vd->vdev_ms_array == 0 ||
+ vd->vdev_islog ||
!vdev_is_concrete(vd))
continue;
@@ -8168,6 +8082,122 @@ spa_sync(spa_t *spa, uint64_t txg)
zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
zio_resume_wait(spa);
}
+}
+
+/*
+ * Sync the specified transaction group. New blocks may be dirtied as
+ * part of the process, so we iterate until it converges.
+ */
+void
+spa_sync(spa_t *spa, uint64_t txg)
+{
+ vdev_t *vd = NULL;
+
+ VERIFY(spa_writeable(spa));
+
+ /*
+ * Wait for i/os issued in open context that need to complete
+ * before this txg syncs.
+ */
+ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
+ spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+
+ /*
+ * Lock out configuration changes.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ spa->spa_syncing_txg = txg;
+ spa->spa_sync_pass = 0;
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_enter(&spa->spa_alloc_locks[i]);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+ mutex_exit(&spa->spa_alloc_locks[i]);
+ }
+
+ /*
+ * If there are any pending vdev state changes, convert them
+ * into config changes that go out with this transaction group.
+ */
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ while (list_head(&spa->spa_state_dirty_list) != NULL) {
+ /*
+ * We need the write lock here because, for aux vdevs,
+ * calling vdev_config_dirty() modifies sav_config.
+ * This is ugly and will become unnecessary when we
+ * eliminate the aux vdev wart by integrating all vdevs
+ * into the root vdev tree.
+ */
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
+ while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
+ vdev_state_clean(vd);
+ vdev_config_dirty(vd);
+ }
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+ }
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+
+ spa->spa_sync_starttime = gethrtime();
+ VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
+ spa->spa_sync_starttime + spa->spa_deadman_synctime));
+
+ /*
+ * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
+ * set spa_deflate if we have no raid-z vdevs.
+ */
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ int i;
+ for (i = 0; i < rvd->vdev_children; i++) {
+ vd = rvd->vdev_child[i];
+ if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
+ break;
+ }
+ if (i == rvd->vdev_children) {
+ spa->spa_deflate = TRUE;
+ VERIFY0(zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate, tx));
+ }
+ }
+
+ spa_sync_adjust_vdev_max_queue_depth(spa);
+
+ spa_sync_condense_indirect(spa, tx);
+
+ spa_sync_iterate_to_convergence(spa, tx);
+
+#ifdef ZFS_DEBUG
+ if (!list_is_empty(&spa->spa_config_dirty_list)) {
+ /*
+ * Make sure that the number of ZAPs for all the vdevs matches
+ * the number of ZAPs in the per-vdev ZAP list. This only gets
+ * called if the config is dirty; otherwise there may be
+ * outstanding AVZ operations that weren't completed in
+ * spa_sync_config_object.
+ */
+ uint64_t all_vdev_zap_entry_count;
+ ASSERT0(zap_count(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
+ ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
+ all_vdev_zap_entry_count);
+ }
+#endif
+
+ if (spa->spa_vdev_removal != NULL) {
+ ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
+ }
+
+ spa_sync_rewrite_vdev_config(spa, tx);
dmu_tx_commit(tx);
VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
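
The rewrite loop extracted into spa_sync_rewrite_vdev_config() keeps the old behavior of starting at a random top-level vdev and wrapping around, so uberblock-only txgs spread label writes across children. A runnable sketch of that traversal order, with spa_get_random() replaced by a stand-in:

    #include <stdio.h>
    #include <stdlib.h>

    int
    main(void)
    {
        int children = 5;                /* hypothetical top-level vdevs */
        int c0 = rand() % children;      /* stand-in for spa_get_random() */

        /*
         * Same wrap-around walk as spa_sync_rewrite_vdev_config():
         * start at a random child and visit each top-level vdev once,
         * stopping when the first one comes back around.
         */
        for (int c = 0; c < children; c++)
            printf("sync uberblock to vdev %d\n", (c0 + c) % children);
        return (0);
    }
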
diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c
index 7ca3429b46de..1fd81bcf7b43 100644
--- a/uts/common/fs/zfs/vdev.c
+++ b/uts/common/fs/zfs/vdev.c
@@ -3833,6 +3833,11 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
dspace_delta = vdev_deflated_space(vd, space_delta);
mutex_enter(&vd->vdev_stat_lock);
+ /* ensure we won't underflow */
+ if (alloc_delta < 0) {
+ ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta);
+ }
+
vd->vdev_stat.vs_alloc += alloc_delta;
vd->vdev_stat.vs_space += space_delta;
vd->vdev_stat.vs_dspace += dspace_delta;
@@ -3840,6 +3845,7 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
/* every class but log contributes to root space stats */
if (vd->vdev_mg != NULL && !vd->vdev_islog) {
+ ASSERT(!vd->vdev_isl2cache);
mutex_enter(&rvd->vdev_stat_lock);
rvd->vdev_stat.vs_alloc += alloc_delta;
rvd->vdev_stat.vs_space += space_delta;
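
The new assert fires before vs_alloc, an unsigned counter, can wrap past zero; freeing more space than was ever charged, which is exactly what the L2ARC psize/asize mismatch fixed above could cause, is the failure mode it catches in debug builds. A minimal demonstration of the wraparound it guards against:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t vs_alloc = 4096;     /* stands in for vdev_stat.vs_alloc */
        int64_t alloc_delta = -8192;  /* freeing more than was charged */

        /* The diff's check, made before the unsigned add can wrap: */
        if (alloc_delta < 0 && vs_alloc < (uint64_t)-alloc_delta)
            printf("underflow: this is where ASSERT3U would fire\n");

        vs_alloc += alloc_delta;      /* wraps to ~2^64 without the check */
        printf("vs_alloc = %llu\n", (unsigned long long)vs_alloc);
        return (0);
    }
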
diff --git a/uts/common/fs/zfs/vdev_removal.c b/uts/common/fs/zfs/vdev_removal.c
index e4d0224333ff..c6874ae06a3f 100644
--- a/uts/common/fs/zfs/vdev_removal.c
+++ b/uts/common/fs/zfs/vdev_removal.c
@@ -1688,6 +1688,9 @@ svr_sync(spa_t *spa, dmu_tx_t *tx)
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+ if (svr == NULL)
+ return;
+
/*
* This check is necessary so that we do not dirty the
* DIRECTORY_OBJECT via spa_sync_removing_state() when there