aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndriy Gapon <avg@FreeBSD.org>2019-10-16 06:18:37 +0000
committerAndriy Gapon <avg@FreeBSD.org>2019-10-16 06:18:37 +0000
commitb7cab79de23a8bfb689bfb5bddb4deb2e09b360d (patch)
tree7eb20525e426f32635fc498419104fe029641a25
parentc4dff1c2563b495ebe757f50c160425a576d9abf (diff)
downloadsrc-b7cab79de23a8bfb689bfb5bddb4deb2e09b360d.tar.gz
src-b7cab79de23a8bfb689bfb5bddb4deb2e09b360d.zip
10330 merge recent ZoL vdev and metaslab changes
illumos/illumos-gate@a0b03b161c4df3cfc54fbc741db09b3bdc23ffba https://github.com/illumos/illumos-gate/commit/a0b03b161c4df3cfc54fbc741db09b3bdc23ffba https://www.illumos.org/issues/10330 3 recent ZoL changes in the vdev and metaslab code which we can pull over: PR 8324 c853f382db 8324 Change target size of metaslabs from 256GB to 16GB PR 8290 b194fab0fb 8290 Factor metaslab_load_wait() in metaslab_load() PR 8286 419ba59145 8286 Update vdev_is_spacemap_addressable() for new spacemap encoding Author: Serapheim Dimitropoulos <serapheimd@gmail.com>
Notes
Notes: svn path=/vendor-sys/illumos/dist/; revision=353611
-rw-r--r--cmd/zdb/zdb.c7
-rw-r--r--uts/common/fs/zfs/metaslab.c82
-rw-r--r--uts/common/fs/zfs/sys/metaslab.h1
-rw-r--r--uts/common/fs/zfs/sys/metaslab_impl.h4
-rw-r--r--uts/common/fs/zfs/vdev.c88
-rw-r--r--uts/common/fs/zfs/vdev_initialize.c18
6 files changed, 108 insertions, 92 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 3b56a34fb707..65ef35f171a8 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -901,11 +901,8 @@ dump_metaslab(metaslab_t *msp)
if (dump_opt['m'] > 2 && !dump_opt['L']) {
mutex_enter(&msp->ms_lock);
- metaslab_load_wait(msp);
- if (!msp->ms_loaded) {
- VERIFY0(metaslab_load(msp));
- range_tree_stat_verify(msp->ms_allocatable);
- }
+ VERIFY0(metaslab_load(msp));
+ range_tree_stat_verify(msp->ms_allocatable);
dump_metaslab_stats(msp);
metaslab_unload(msp);
mutex_exit(&msp->ms_lock);
diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c
index 1839bf79483c..44fc2b0e8604 100644
--- a/uts/common/fs/zfs/metaslab.c
+++ b/uts/common/fs/zfs/metaslab.c
@@ -1456,7 +1456,7 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
/*
* Wait for any in-progress metaslab loads to complete.
*/
-void
+static void
metaslab_load_wait(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -1467,20 +1467,17 @@ metaslab_load_wait(metaslab_t *msp)
}
}
-int
-metaslab_load(metaslab_t *msp)
+static int
+metaslab_load_impl(metaslab_t *msp)
{
int error = 0;
- boolean_t success = B_FALSE;
ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT(!msp->ms_loaded);
- ASSERT(!msp->ms_loading);
+ ASSERT(msp->ms_loading);
- msp->ms_loading = B_TRUE;
/*
* Nobody else can manipulate a loading metaslab, so it's now safe
- * to drop the lock. This way we don't have to hold the lock while
+ * to drop the lock. This way we don't have to hold the lock while
* reading the spacemap from disk.
*/
mutex_exit(&msp->ms_lock);
@@ -1497,29 +1494,49 @@ metaslab_load(metaslab_t *msp)
msp->ms_start, msp->ms_size);
}
- success = (error == 0);
-
mutex_enter(&msp->ms_lock);
- msp->ms_loading = B_FALSE;
- if (success) {
- ASSERT3P(msp->ms_group, !=, NULL);
- msp->ms_loaded = B_TRUE;
+ if (error != 0)
+ return (error);
- /*
- * If the metaslab already has a spacemap, then we need to
- * remove all segments from the defer tree; otherwise, the
- * metaslab is completely empty and we can skip this.
- */
- if (msp->ms_sm != NULL) {
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- range_tree_walk(msp->ms_defer[t],
- range_tree_remove, msp->ms_allocatable);
- }
+ ASSERT3P(msp->ms_group, !=, NULL);
+ msp->ms_loaded = B_TRUE;
+
+ /*
+ * If the metaslab already has a spacemap, then we need to
+ * remove all segments from the defer tree; otherwise, the
+ * metaslab is completely empty and we can skip this.
+ */
+ if (msp->ms_sm != NULL) {
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_remove, msp->ms_allocatable);
}
- msp->ms_max_size = metaslab_block_maxsize(msp);
}
+ msp->ms_max_size = metaslab_block_maxsize(msp);
+
+ return (0);
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * There may be another thread loading the same metaslab, if that's
+ * the case just wait until the other thread is done and return.
+ */
+ metaslab_load_wait(msp);
+ if (msp->ms_loaded)
+ return (0);
+ VERIFY(!msp->ms_loading);
+
+ msp->ms_loading = B_TRUE;
+ int error = metaslab_load_impl(msp);
+ msp->ms_loading = B_FALSE;
cv_broadcast(&msp->ms_load_cv);
+
return (error);
}
@@ -2078,13 +2095,10 @@ metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- int error = 0;
- metaslab_load_wait(msp);
- if (!msp->ms_loaded) {
- if ((error = metaslab_load(msp)) != 0) {
- metaslab_group_sort(msp->ms_group, msp, 0);
- return (error);
- }
+ int error = metaslab_load(msp);
+ if (error != 0) {
+ metaslab_group_sort(msp->ms_group, msp, 0);
+ return (error);
}
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
/*
@@ -2196,9 +2210,7 @@ metaslab_preload(void *arg)
ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
mutex_enter(&msp->ms_lock);
- metaslab_load_wait(msp);
- if (!msp->ms_loaded)
- (void) metaslab_load(msp);
+ (void) metaslab_load(msp);
msp->ms_selected_txg = spa_syncing_txg(spa);
mutex_exit(&msp->ms_lock);
}
diff --git a/uts/common/fs/zfs/sys/metaslab.h b/uts/common/fs/zfs/sys/metaslab.h
index 32a19ca64584..f0c68c77fc06 100644
--- a/uts/common/fs/zfs/sys/metaslab.h
+++ b/uts/common/fs/zfs/sys/metaslab.h
@@ -48,7 +48,6 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
metaslab_t **);
void metaslab_fini(metaslab_t *);
-void metaslab_load_wait(metaslab_t *);
int metaslab_load(metaslab_t *);
void metaslab_unload(metaslab_t *);
diff --git a/uts/common/fs/zfs/sys/metaslab_impl.h b/uts/common/fs/zfs/sys/metaslab_impl.h
index fe09c274a958..a2c8e6051772 100644
--- a/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -369,8 +369,8 @@ struct metaslab {
uint64_t ms_initializing; /* leaves initializing this ms */
/*
- * We must hold both ms_lock and ms_group->mg_lock in order to
- * modify ms_loaded.
+ * We must always hold the ms_lock when modifying ms_loaded
+ * and ms_loading.
*/
boolean_t ms_loaded;
boolean_t ms_loading;
diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c
index 27d1dcf97f5b..5441c9059d28 100644
--- a/uts/common/fs/zfs/vdev.c
+++ b/uts/common/fs/zfs/vdev.c
@@ -72,20 +72,20 @@ static vdev_ops_t *vdev_ops_table[] = {
/* maximum scrub/resilver I/O queue per leaf vdev */
int zfs_scrub_limit = 10;
-/* target number of metaslabs per top-level vdev */
-int vdev_max_ms_count = 200;
+/* default target for number of metaslabs per top-level vdev */
+int zfs_vdev_default_ms_count = 200;
/* minimum number of metaslabs per top-level vdev */
-int vdev_min_ms_count = 16;
+int zfs_vdev_min_ms_count = 16;
/* practical upper limit of total metaslabs per top-level vdev */
-int vdev_ms_count_limit = 1ULL << 17;
+int zfs_vdev_ms_count_limit = 1ULL << 17;
/* lower limit for metaslab size (512M) */
-int vdev_default_ms_shift = 29;
+int zfs_vdev_default_ms_shift = 29;
-/* upper limit for metaslab size (256G) */
-int vdev_max_ms_shift = 38;
+/* upper limit for metaslab size (16G) */
+int zfs_vdev_max_ms_shift = 34;
boolean_t vdev_validate_skip = B_FALSE;
@@ -2034,16 +2034,24 @@ void
vdev_metaslab_set_size(vdev_t *vd)
{
uint64_t asize = vd->vdev_asize;
- uint64_t ms_count = asize >> vdev_default_ms_shift;
+ uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
uint64_t ms_shift;
/*
* There are two dimensions to the metaslab sizing calculation:
* the size of the metaslab and the count of metaslabs per vdev.
- * In general, we aim for vdev_max_ms_count (200) metaslabs. The
- * range of the dimensions are as follows:
*
- * 2^29 <= ms_size <= 2^38
+ * The default values used below are a good balance between memory
+ * usage (larger metaslab size means more memory needed for loaded
+ * metaslabs; more metaslabs means more memory needed for the
+ * metaslab_t structs), metaslab load time (larger metaslabs take
+ * longer to load), and metaslab sync time (more metaslabs means
+ * more time spent syncing all of them).
+ *
+ * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
+ * The range of the dimensions are as follows:
+ *
+ * 2^29 <= ms_size <= 2^34
* 16 <= ms_count <= 131,072
*
* On the lower end of vdev sizes, we aim for metaslabs sizes of
@@ -2052,35 +2060,41 @@ vdev_metaslab_set_size(vdev_t *vd)
* of at least 16 metaslabs will override this minimum size goal.
*
* On the upper end of vdev sizes, we aim for a maximum metaslab
- * size of 256GB. However, we will cap the total count to 2^17
- * metaslabs to keep our memory footprint in check.
+ * size of 16GB. However, we will cap the total count to 2^17
+ * metaslabs to keep our memory footprint in check and let the
+ * metaslab size grow from there if that limit is hit.
*
* The net effect of applying above constrains is summarized below.
*
- * vdev size metaslab count
- * -------------|-----------------
- * < 8GB ~16
- * 8GB - 100GB one per 512MB
- * 100GB - 50TB ~200
- * 50TB - 32PB one per 256GB
- * > 32PB ~131,072
- * -------------------------------
+ * vdev size metaslab count
+ * --------------|-----------------
+ * < 8GB ~16
+ * 8GB - 100GB one per 512MB
+ * 100GB - 3TB ~200
+ * 3TB - 2PB one per 16GB
+ * > 2PB ~131,072
+ * --------------------------------
+ *
+ * Finally, note that all of the above calculate the initial
+ * number of metaslabs. Expanding a top-level vdev will result
+ * in additional metaslabs being allocated making it possible
+ * to exceed the zfs_vdev_ms_count_limit.
*/
- if (ms_count < vdev_min_ms_count)
- ms_shift = highbit64(asize / vdev_min_ms_count);
- else if (ms_count > vdev_max_ms_count)
- ms_shift = highbit64(asize / vdev_max_ms_count);
+ if (ms_count < zfs_vdev_min_ms_count)
+ ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
+ else if (ms_count > zfs_vdev_default_ms_count)
+ ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
else
- ms_shift = vdev_default_ms_shift;
+ ms_shift = zfs_vdev_default_ms_shift;
if (ms_shift < SPA_MAXBLOCKSHIFT) {
ms_shift = SPA_MAXBLOCKSHIFT;
- } else if (ms_shift > vdev_max_ms_shift) {
- ms_shift = vdev_max_ms_shift;
+ } else if (ms_shift > zfs_vdev_max_ms_shift) {
+ ms_shift = zfs_vdev_max_ms_shift;
/* cap the total count to constrain memory footprint */
- if ((asize >> ms_shift) > vdev_ms_count_limit)
- ms_shift = highbit64(asize / vdev_ms_count_limit);
+ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
+ ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
}
vd->vdev_ms_shift = ms_shift;
@@ -3386,13 +3400,17 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
boolean_t
vdev_is_spacemap_addressable(vdev_t *vd)
{
+ if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
+ return (B_TRUE);
+
/*
- * Assuming 47 bits of the space map entry dedicated for the entry's
- * offset (see description in space_map.h), we calculate the maximum
- * address that can be described by a space map entry for the given
- * device.
+ * If double-word space map entries are not enabled we assume
+ * 47 bits of the space map entry are dedicated to the entry's
+ * offset (see SM_OFFSET_BITS in space_map.h). We then use that
+ * to calculate the maximum address that can be described by a
+ * space map entry for the given device.
*/
- uint64_t shift = vd->vdev_ashift + 47;
+ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
if (shift >= 63) /* detect potential overflow */
return (B_TRUE);
diff --git a/uts/common/fs/zfs/vdev_initialize.c b/uts/common/fs/zfs/vdev_initialize.c
index 559c0153d6cc..bf246cd8ddcf 100644
--- a/uts/common/fs/zfs/vdev_initialize.c
+++ b/uts/common/fs/zfs/vdev_initialize.c
@@ -353,16 +353,6 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
}
static void
-vdev_initialize_ms_load(metaslab_t *msp)
-{
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- metaslab_load_wait(msp);
- if (!msp->ms_loaded)
- VERIFY0(metaslab_load(msp));
-}
-
-static void
vdev_initialize_mg_wait(metaslab_group_t *mg)
{
ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
@@ -484,10 +474,10 @@ vdev_initialize_calculate_progress(vdev_t *vd)
* metaslab. Load it and walk the free tree for more accurate
* progress estimation.
*/
- vdev_initialize_ms_load(msp);
+ VERIFY0(metaslab_load(msp));
- for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs;
- rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
+ for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
+ rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
logical_rs.rs_start = rs->rs_start;
logical_rs.rs_end = rs->rs_end;
vdev_xlate(vd, &logical_rs, &physical_rs);
@@ -615,7 +605,7 @@ vdev_initialize_thread(void *arg)
vdev_initialize_ms_mark(msp);
mutex_enter(&msp->ms_lock);
- vdev_initialize_ms_load(msp);
+ VERIFY0(metaslab_load(msp));
range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
vd);