aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Andriy Gapon <avg@FreeBSD.org> 2019-10-16 07:02:02 +0000
committer Andriy Gapon <avg@FreeBSD.org> 2019-10-16 07:02:02 +0000
commit f4902c2a07172325115df5f6dad0519d4a63244f (patch)
tree 3a0c64e4975de2883418354520da0ff1aad28be3
parent 35b885b060938f1f618e83e93a871548e64005c4 (diff)
download src-f4902c2a07172325115df5f6dad0519d4a63244f.tar.gz
src-f4902c2a07172325115df5f6dad0519d4a63244f.zip
9691 fat zap should prefetch when iterating
illumos/illumos-gate@52abb70e073c2a88808c0d66fd810ba8c5080572
https://github.com/illumos/illumos-gate/commit/52abb70e073c2a88808c0d66fd810ba8c5080572
https://www.illumos.org/issues/9691

When iterating over a ZAP object, we're almost always certain to iterate over the entire object. If there are multiple leaf blocks, we can realize a performance win by issuing reads for all the leaf blocks in parallel when the iteration begins.

For example, if we have 10,000 snapshots, "zfs destroy -nv pool/fs@1%9999" can take 30 minutes when the cache is cold. This change provides a >3x performance improvement, by issuing the reads for all ~64 blocks of each ZAP object in parallel.

Author: Matthew Ahrens <mahrens@delphix.com>
Notes
Notes: svn path=/vendor-sys/illumos/dist/; revision=353619
-rw-r--r-- uts/common/fs/zfs/ddt_zap.c 14
-rw-r--r-- uts/common/fs/zfs/dmu.c 12
-rw-r--r-- uts/common/fs/zfs/sys/zap.h 5
-rw-r--r-- uts/common/fs/zfs/zap.c 46
-rw-r--r-- uts/common/fs/zfs/zap_micro.c 31
5 files changed, 100 insertions, 8 deletions
diff --git a/uts/common/fs/zfs/ddt_zap.c b/uts/common/fs/zfs/ddt_zap.c
index d6a991c7c19e..661813743335 100644
--- a/uts/common/fs/zfs/ddt_zap.c
+++ b/uts/common/fs/zfs/ddt_zap.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -114,7 +115,18 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
zap_attribute_t za;
int error;
- zap_cursor_init_serialized(&zc, os, object, *walk);
+ if (*walk == 0) {
+ /*
+ * We don't want to prefetch the entire ZAP object, because
+ * it can be enormous. Also the primary use of DDT iteration
+ * is for scrubbing, in which case we will be issuing many
+ * scrub i/os for each ZAP block that we read in, so
+ * reading the ZAP is unlikely to be the bottleneck.
+ */
+ zap_cursor_init_noprefetch(&zc, os, object);
+ } else {
+ zap_cursor_init_serialized(&zc, os, object, *walk);
+ }
if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
uchar_t cbuf[sizeof (dde->dde_phys) + 1];
uint64_t csize = za.za_num_integers;
diff --git a/uts/common/fs/zfs/dmu.c b/uts/common/fs/zfs/dmu.c
index 022f376291c3..24786ce9cdfc 100644
--- a/uts/common/fs/zfs/dmu.c
+++ b/uts/common/fs/zfs/dmu.c
@@ -88,6 +88,13 @@ uint32_t zfs_per_txg_dirty_frees_percent = 30;
*/
int zfs_object_remap_one_indirect_delay_ticks = 0;
+/*
+ * Limit the amount of data that a single dmu_prefetch() call can prefetch.
+ * This bounds the amount of memory that can be used by prefetching.
+ * Larger objects should be prefetched a bit at a time.
+ */
+uint64_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
+
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" },
{ DMU_BSWAP_ZAP, TRUE, TRUE, "object directory" },
@@ -638,6 +645,11 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
}
/*
+ * See comment before the definition of dmu_prefetch_max.
+ */
+ len = MIN(len, dmu_prefetch_max);
+
+ /*
* XXX - Note, if the dnode for the requested object is not
* already cached, we will do a *synchronous* read in the
* dnode_hold() call. The same is true for any indirects.
diff --git a/uts/common/fs/zfs/sys/zap.h b/uts/common/fs/zfs/sys/zap.h
index e7f535814027..15e912d1c323 100644
--- a/uts/common/fs/zfs/sys/zap.h
+++ b/uts/common/fs/zfs/sys/zap.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
*/
@@ -347,6 +347,7 @@ typedef struct zap_cursor {
uint64_t zc_serialized;
uint64_t zc_hash;
uint32_t zc_cd;
+ boolean_t zc_prefetch;
} zap_cursor_t;
typedef struct {
@@ -373,6 +374,8 @@ typedef struct {
* zapobj. You must _fini the cursor when you are done with it.
*/
void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
+void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
+ uint64_t zapobj);
void zap_cursor_fini(zap_cursor_t *zc);
/*
diff --git a/uts/common/fs/zfs/zap.c b/uts/common/fs/zfs/zap.c
index 7a1994f603c1..de8e2120776f 100644
--- a/uts/common/fs/zfs/zap.c
+++ b/uts/common/fs/zfs/zap.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@@ -49,6 +49,36 @@
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
+/*
+ * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object
+ * (all leaf blocks) when we start iterating over it.
+ *
+ * For zap_cursor_init(), the callers all intend to iterate through all the
+ * entries. There are a few cases where an error (typically i/o error) could
+ * cause it to bail out early.
+ *
+ * For zap_cursor_init_serialized(), there are callers that do the iteration
+ * outside of ZFS. Typically they would iterate over everything, but we
+ * don't have control of that. E.g. zfs_ioc_snapshot_list_next(),
+ * zcp_snapshots_iter(), and other iterators over things in the MOS - these
+ * are called by /sbin/zfs and channel programs. The other example is
+ * zfs_readdir() which iterates over directory entries for the getdents()
+ * syscall. /sbin/ls iterates to the end (unless it receives a signal), but
+ * userland doesn't have to.
+ *
+ * Given that the ZAP entries aren't returned in a specific order, the only
+ * legitimate use cases for partial iteration would be:
+ *
+ * 1. Pagination: e.g. you only want to display 100 entries at a time, so you
+ * get the first 100 and then wait for the user to hit "next page", which
+ * they may never do.
+ *
+ * 2. You want to know if there are more than X entries, without relying on
+ * the zfs-specific implementation of the directory's st_size (which is
+ * the number of entries).
+ */
+boolean_t zap_iterate_prefetch = B_TRUE;
+
int fzap_default_block_shift = 14; /* 16k blocksize */
extern inline zap_phys_t *zap_f_phys(zap_t *zap);
@@ -1169,6 +1199,20 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
/* retrieve the next entry at or after zc_hash/zc_cd */
/* if no entry, return ENOENT */
+ /*
+ * If we are reading from the beginning, we're almost
+ * certain to iterate over the entire ZAP object. If there are
+ * multiple leaf blocks (freeblk > 2), prefetch the whole
+ * object, so that we read the leaf blocks concurrently.
+ * (Unless noprefetch was requested via zap_cursor_init_noprefetch()).
+ */
+ if (zc->zc_hash == 0 && zap_iterate_prefetch &&
+ zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
+ dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0,
+ zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
+ ZIO_PRIORITY_ASYNC_READ);
+ }
+
if (zc->zc_leaf &&
(ZAP_HASH_IDX(zc->zc_hash,
zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
diff --git a/uts/common/fs/zfs/zap_micro.c b/uts/common/fs/zfs/zap_micro.c
index d093fe1e736e..bb5da13ba152 100644
--- a/uts/common/fs/zfs/zap_micro.c
+++ b/uts/common/fs/zfs/zap_micro.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Nexenta Systems, Inc.
@@ -1386,9 +1386,9 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
* Routines for iterating over the attributes.
*/
-void
-zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
- uint64_t serialized)
+static void
+zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+ uint64_t serialized, boolean_t prefetch)
{
zc->zc_objset = os;
zc->zc_zap = NULL;
@@ -1397,12 +1397,33 @@ zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
zc->zc_serialized = serialized;
zc->zc_hash = 0;
zc->zc_cd = 0;
+ zc->zc_prefetch = prefetch;
+}
+void
+zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+ uint64_t serialized)
+{
+ zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
}
+/*
+ * Initialize a cursor at the beginning of the ZAP object. The entire
+ * ZAP object will be prefetched.
+ */
void
zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
- zap_cursor_init_serialized(zc, os, zapobj, 0);
+ zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
+}
+
+/*
+ * Initialize a cursor at the beginning, but request that we not prefetch
+ * the entire ZAP object.
+ */
+void
+zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+ zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
}
void