aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorAlexander Motin <mav@FreeBSD.org>2018-08-01 18:28:17 +0000
committerAlexander Motin <mav@FreeBSD.org>2018-08-01 18:28:17 +0000
commit2395a7f52dc8a224e322d6d10ce1fd59c6d971c9 (patch)
treec0a3299d224b5f92f059bfb538d5e82e556f554f /lib
parent2a44a23db2023b975e979c690e4453b5d39431b2 (diff)
downloadsrc-2395a7f52dc8a224e322d6d10ce1fd59c6d971c9.tar.gz
src-2395a7f52dc8a224e322d6d10ce1fd59c6d971c9.zip
8115 parallel zfs mount
Mounting of filesystems in "filesystem/local" is done using `zfs mount -a`, which mounts each filesystem serially. The bottleneck for each mount is the I/O done to load metadata for each filesystem. As such, mounting filesystems using a parallel algorithm should be a big win, and bring down the runtime of "filesystem/local"'s start method. illumos/illumos-gate@591e0e133f9980083db5d64ac33a30bcc3382ff7 Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com> Reviewed by: Brad Lewis <brad.lewis@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Paul Dagnelie <pcd@delphix.com> Reviewed by: Prashanth Sreenivasa <pks@delphix.com> Approved by: Matt Ahrens <mahrens@delphix.com> Author: Sebastien Roy <seb@delphix.com>
Notes
Notes: svn path=/vendor-sys/illumos/dist/; revision=337047
Diffstat (limited to 'lib')
-rw-r--r--lib/libzfs/common/libzfs.h5
-rw-r--r--lib/libzfs/common/libzfs_dataset.c31
-rw-r--r--lib/libzfs/common/libzfs_impl.h10
-rw-r--r--lib/libzfs/common/libzfs_mount.c409
-rw-r--r--lib/libzfs/common/libzfs_taskq.c297
-rw-r--r--lib/libzfs/common/libzfs_taskq.h63
6 files changed, 725 insertions, 90 deletions
diff --git a/lib/libzfs/common/libzfs.h b/lib/libzfs/common/libzfs.h
index 9dc2b02e147f..d296fed59f01 100644
--- a/lib/libzfs/common/libzfs.h
+++ b/lib/libzfs/common/libzfs.h
@@ -576,12 +576,11 @@ typedef struct get_all_cb {
zfs_handle_t **cb_handles;
size_t cb_alloc;
size_t cb_used;
- boolean_t cb_verbose;
- int (*cb_getone)(zfs_handle_t *, void *);
} get_all_cb_t;
+void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, size_t,
+ zfs_iter_f, void *, boolean_t);
void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *);
-int libzfs_dataset_cmp(const void *, const void *);
/*
* Functions to create and destroy datasets.
diff --git a/lib/libzfs/common/libzfs_dataset.c b/lib/libzfs/common/libzfs_dataset.c
index 011c2653a152..556538a4402b 100644
--- a/lib/libzfs/common/libzfs_dataset.c
+++ b/lib/libzfs/common/libzfs_dataset.c
@@ -54,6 +54,7 @@
#include <idmap.h>
#include <aclutils.h>
#include <directory.h>
+#include <time.h>
#include <sys/dnode.h>
#include <sys/spa.h>
@@ -785,6 +786,8 @@ libzfs_mnttab_cache_compare(const void *arg1, const void *arg2)
void
libzfs_mnttab_init(libzfs_handle_t *hdl)
{
+ (void) mutex_init(&hdl->libzfs_mnttab_cache_lock,
+ LOCK_NORMAL | LOCK_ERRORCHECK, NULL);
assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0);
avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare,
sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node));
@@ -825,6 +828,7 @@ libzfs_mnttab_fini(libzfs_handle_t *hdl)
free(mtn);
}
avl_destroy(&hdl->libzfs_mnttab_cache);
+ (void) mutex_destroy(&hdl->libzfs_mnttab_cache_lock);
}
void
@@ -839,6 +843,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
{
mnttab_node_t find;
mnttab_node_t *mtn;
+ int ret = ENOENT;
if (!hdl->libzfs_mnttab_enable) {
struct mnttab srch = { 0 };
@@ -854,6 +859,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
return (ENOENT);
}
+ mutex_enter(&hdl->libzfs_mnttab_cache_lock);
if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
libzfs_mnttab_update(hdl);
@@ -861,9 +867,10 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL);
if (mtn) {
*entry = mtn->mtn_mt;
- return (0);
+ ret = 0;
}
- return (ENOENT);
+ mutex_exit(&hdl->libzfs_mnttab_cache_lock);
+ return (ret);
}
void
@@ -872,14 +879,16 @@ libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special,
{
mnttab_node_t *mtn;
- if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
- return;
- mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
- mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
- mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
- mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
- mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
- avl_add(&hdl->libzfs_mnttab_cache, mtn);
+ mutex_enter(&hdl->libzfs_mnttab_cache_lock);
+ if (avl_numnodes(&hdl->libzfs_mnttab_cache) != 0) {
+ mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
+ mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
+ mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
+ mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
+ mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
+ avl_add(&hdl->libzfs_mnttab_cache, mtn);
+ }
+ mutex_exit(&hdl->libzfs_mnttab_cache_lock);
}
void
@@ -888,6 +897,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
mnttab_node_t find;
mnttab_node_t *ret;
+ mutex_enter(&hdl->libzfs_mnttab_cache_lock);
find.mtn_mt.mnt_special = (char *)fsname;
if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL))
!= NULL) {
@@ -898,6 +908,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
free(ret->mtn_mt.mnt_mntopts);
free(ret);
}
+ mutex_exit(&hdl->libzfs_mnttab_cache_lock);
}
int
diff --git a/lib/libzfs/common/libzfs_impl.h b/lib/libzfs/common/libzfs_impl.h
index 50f48fd7932d..cd9a53d91fa8 100644
--- a/lib/libzfs/common/libzfs_impl.h
+++ b/lib/libzfs/common/libzfs_impl.h
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
*/
#ifndef _LIBZFS_IMPL_H
@@ -33,6 +33,7 @@
#include <sys/nvpair.h>
#include <sys/dmu.h>
#include <sys/zfs_ioctl.h>
+#include <synch.h>
#include <libuutil.h>
#include <libzfs.h>
@@ -73,6 +74,13 @@ struct libzfs_handle {
int libzfs_storeerr; /* stuff error messages into buffer */
void *libzfs_sharehdl; /* libshare handle */
boolean_t libzfs_mnttab_enable;
+ /*
+ * We need a lock to handle the case where parallel mount
+ * threads are populating the mnttab cache simultaneously. The
+ * lock only protects the integrity of the avl tree, and does
+ * not protect the contents of the mnttab entries themselves.
+ */
+ mutex_t libzfs_mnttab_cache_lock;
avl_tree_t libzfs_mnttab_cache;
int libzfs_pool_iter;
topo_hdl_t *libzfs_topo_hdl;
diff --git a/lib/libzfs/common/libzfs_mount.c b/lib/libzfs/common/libzfs_mount.c
index 9fd37825a31c..cf15735f3f9c 100644
--- a/lib/libzfs/common/libzfs_mount.c
+++ b/lib/libzfs/common/libzfs_mount.c
@@ -22,7 +22,7 @@
/*
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
* Copyright 2017 Joyent, Inc.
* Copyright 2017 RackTop Systems.
@@ -34,25 +34,25 @@
* they are used by mount and unmount and when changing a filesystem's
* mountpoint.
*
- * zfs_is_mounted()
- * zfs_mount()
- * zfs_unmount()
- * zfs_unmountall()
+ * zfs_is_mounted()
+ * zfs_mount()
+ * zfs_unmount()
+ * zfs_unmountall()
*
* This file also contains the functions used to manage sharing filesystems via
* NFS and iSCSI:
*
- * zfs_is_shared()
- * zfs_share()
- * zfs_unshare()
+ * zfs_is_shared()
+ * zfs_share()
+ * zfs_unshare()
*
- * zfs_is_shared_nfs()
- * zfs_is_shared_smb()
- * zfs_share_proto()
- * zfs_shareall();
- * zfs_unshare_nfs()
- * zfs_unshare_smb()
- * zfs_unshareall_nfs()
+ * zfs_is_shared_nfs()
+ * zfs_is_shared_smb()
+ * zfs_share_proto()
+ * zfs_shareall();
+ * zfs_unshare_nfs()
+ * zfs_unshare_smb()
+ * zfs_unshareall_nfs()
* zfs_unshareall_smb()
* zfs_unshareall()
* zfs_unshareall_bypath()
@@ -60,8 +60,8 @@
* The following functions are available for pool consumers, and will
* mount/unmount and share/unshare all datasets within pool:
*
- * zpool_enable_datasets()
- * zpool_disable_datasets()
+ * zpool_enable_datasets()
+ * zpool_disable_datasets()
*/
#include <dirent.h>
@@ -83,11 +83,15 @@
#include <libzfs.h>
#include "libzfs_impl.h"
+#include "libzfs_taskq.h"
#include <libshare.h>
#include <sys/systeminfo.h>
#define MAXISALEN 257 /* based on sysinfo(2) man page */
+static int mount_tq_nthr = 512; /* taskq threads for multi-threaded mounting */
+
+static void zfs_mount_task(void *);
static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *);
zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **,
zfs_share_proto_t);
@@ -1077,25 +1081,32 @@ remove_mountpoint(zfs_handle_t *zhp)
}
}
+/*
+ * Add the given zfs handle to the cb_handles array, dynamically reallocating
+ * the array if it is out of space.
+ */
void
libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp)
{
if (cbp->cb_alloc == cbp->cb_used) {
size_t newsz;
- void *ptr;
+ zfs_handle_t **newhandles;
- newsz = cbp->cb_alloc ? cbp->cb_alloc * 2 : 64;
- ptr = zfs_realloc(zhp->zfs_hdl,
- cbp->cb_handles, cbp->cb_alloc * sizeof (void *),
- newsz * sizeof (void *));
- cbp->cb_handles = ptr;
+ newsz = cbp->cb_alloc != 0 ? cbp->cb_alloc * 2 : 64;
+ newhandles = zfs_realloc(zhp->zfs_hdl,
+ cbp->cb_handles, cbp->cb_alloc * sizeof (zfs_handle_t *),
+ newsz * sizeof (zfs_handle_t *));
+ cbp->cb_handles = newhandles;
cbp->cb_alloc = newsz;
}
cbp->cb_handles[cbp->cb_used++] = zhp;
}
+/*
+ * Recursive helper function used during file system enumeration
+ */
static int
-mount_cb(zfs_handle_t *zhp, void *data)
+zfs_iter_cb(zfs_handle_t *zhp, void *data)
{
get_all_cb_t *cbp = data;
@@ -1121,104 +1132,350 @@ mount_cb(zfs_handle_t *zhp, void *data)
}
libzfs_add_handle(cbp, zhp);
- if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) {
+ if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) != 0) {
zfs_close(zhp);
return (-1);
}
return (0);
}
+/*
+ * Sort comparator that compares two mountpoint paths. We sort these paths so
+ * that subdirectories immediately follow their parents. This means that we
+ * effectively treat the '/' character as the lowest value non-nul char. An
+ * example sorted list using this comparator would look like:
+ *
+ * /foo
+ * /foo/bar
+ * /foo/bar/baz
+ * /foo/baz
+ * /foo.bar
+ *
+ * The mounting code depends on this ordering to deterministically iterate
+ * over filesystems in order to spawn parallel mount tasks.
+ */
int
-libzfs_dataset_cmp(const void *a, const void *b)
+mountpoint_cmp(const void *arga, const void *argb)
{
- zfs_handle_t **za = (zfs_handle_t **)a;
- zfs_handle_t **zb = (zfs_handle_t **)b;
+ zfs_handle_t *const *zap = arga;
+ zfs_handle_t *za = *zap;
+ zfs_handle_t *const *zbp = argb;
+ zfs_handle_t *zb = *zbp;
char mounta[MAXPATHLEN];
char mountb[MAXPATHLEN];
+ const char *a = mounta;
+ const char *b = mountb;
boolean_t gota, gotb;
- if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0)
- verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta,
+ gota = (zfs_get_type(za) == ZFS_TYPE_FILESYSTEM);
+ if (gota) {
+ verify(zfs_prop_get(za, ZFS_PROP_MOUNTPOINT, mounta,
sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
- if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0)
- verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb,
+ }
+ gotb = (zfs_get_type(zb) == ZFS_TYPE_FILESYSTEM);
+ if (gotb) {
+ verify(zfs_prop_get(zb, ZFS_PROP_MOUNTPOINT, mountb,
sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
+ }
- if (gota && gotb)
- return (strcmp(mounta, mountb));
+ if (gota && gotb) {
+ while (*a != '\0' && (*a == *b)) {
+ a++;
+ b++;
+ }
+ if (*a == *b)
+ return (0);
+ if (*a == '\0')
+ return (-1);
+ if (*b == '\0')
+ return (1);
+ if (*a == '/')
+ return (-1);
+ if (*b == '/')
+ return (1);
+ return (*a < *b ? -1 : *a > *b);
+ }
if (gota)
return (-1);
if (gotb)
return (1);
- return (strcmp(zfs_get_name(a), zfs_get_name(b)));
+ /*
+ * If neither filesystem has a mountpoint, revert to sorting by
+ * dataset name.
+ */
+ return (strcmp(zfs_get_name(za), zfs_get_name(zb)));
+}
+
+/*
+ * Return true if path2 is a child of path1, i.e. path2 begins with path1
+ * and the character immediately following that prefix is a '/' separator.
+ *
+ * The root directory needs its own case: when path1 is "/" the separator is
+ * the last character of the prefix rather than the character following it,
+ * so the generic test would reject "/foo" as a child of "/". Every path that
+ * starts with '/' and is longer than "/" itself is a child of the root.
+ *
+ * A path is never considered a child of itself.
+ */
+static boolean_t
+libzfs_path_contains(const char *path1, const char *path2)
+{
+	size_t len = strlen(path1);
+
+	/* path2 must start with path1 (strncmp stops at path2's NUL). */
+	if (strncmp(path2, path1, len) != 0)
+		return (B_FALSE);
+
+	/* Root mountpoint: anything beyond the lone "/" lives beneath it. */
+	if (len == 1 && path1[0] == '/')
+		return (path2[1] != '\0');
+
+	/*
+	 * The prefix matched, so path2 has at least len characters and
+	 * path2[len] is either its terminator or the next path character.
+	 */
+	return (path2[len] == '/');
+}
+
+/*
+ * Starting from the mountpoint at handles[idx], scan forward through the
+ * handles array and return the index of the first entry whose mountpoint is
+ * not a descendant of it (or the index one past the last entry scanned if
+ * every following entry is a descendant). A descendant's path begins with
+ * its ancestor's path. This function relies on the ordering enforced by
+ * mountpoint_cmp(), which places descendants directly after their ancestor.
+ */
+static int
+non_descendant_idx(zfs_handle_t **handles, size_t num_handles, int idx)
+{
+	char ancestor[ZFS_MAXPROPLEN];
+	char candidate[ZFS_MAXPROPLEN];
+	int next = idx + 1;
+
+	verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, ancestor,
+	    sizeof (ancestor), NULL, NULL, 0, B_FALSE) == 0);
+
+	while (next < num_handles) {
+		verify(zfs_prop_get(handles[next], ZFS_PROP_MOUNTPOINT,
+		    candidate, sizeof (candidate), NULL, NULL, 0,
+		    B_FALSE) == 0);
+		if (!libzfs_path_contains(ancestor, candidate))
+			return (next);
+		next++;
+	}
+	return (next);
+}
+
+/*
+ * Argument bundle for one mount task. An instance is allocated and filled
+ * in by zfs_dispatch_mount() and handed to zfs_mount_task() as its sole
+ * argument through the taskq.
+ */
+typedef struct mnt_param {
+ /* library handle; used when dispatching further tasks */
+ libzfs_handle_t *mnt_hdl;
+ /* taskq on which this task and any tasks it spawns are dispatched */
+ zfs_taskq_t *mnt_tq;
+ zfs_handle_t **mnt_zhps; /* filesystems to mount */
+ /* number of entries in mnt_zhps */
+ size_t mnt_num_handles;
+ int mnt_idx; /* Index of selected entry to mount */
+ /* callback invoked on mnt_zhps[mnt_idx] */
+ zfs_iter_f mnt_func;
+ /* opaque second argument passed to mnt_func */
+ void *mnt_data;
+} mnt_param_t;
+
+/*
+ * Build a mnt_param_t describing the entry at handles[idx] and queue a
+ * zfs_mount_task() for it on the given taskq. The parameter block is
+ * allocated here and passed to the task as its argument.
+ */
+static void
+zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles,
+    size_t num_handles, int idx, zfs_iter_f func, void *data, zfs_taskq_t *tq)
+{
+	mnt_param_t *param;
+
+	param = zfs_alloc(hdl, sizeof (mnt_param_t));
+
+	/* What to run, and on which entry. */
+	param->mnt_func = func;
+	param->mnt_data = data;
+	param->mnt_idx = idx;
+
+	/* The full sorted array, so the task can locate its children. */
+	param->mnt_zhps = handles;
+	param->mnt_num_handles = num_handles;
+
+	/* Context needed by the task to dispatch follow-on tasks. */
+	param->mnt_hdl = hdl;
+	param->mnt_tq = tq;
+
+	(void) zfs_taskq_dispatch(tq, zfs_mount_task, param, ZFS_TQ_SLEEP);
+}
+
+/*
+ * This is the structure used to keep state of mounting or sharing operations
+ * during a call to zpool_enable_datasets(). A pointer to it is passed as the
+ * opaque callback argument to zfs_mount_one() and zfs_share_one().
+ */
+typedef struct mount_state {
+ /*
+ * ms_mntstatus is set to -1 if any mount fails. While multiple threads
+ * could update this variable concurrently, no synchronization is
+ * needed as it's only ever set to -1.
+ */
+ int ms_mntstatus;
+ /* flags argument forwarded to zfs_mount() by zfs_mount_one() */
+ int ms_mntflags;
+ /* mount options string forwarded to zfs_mount() by zfs_mount_one() */
+ const char *ms_mntopts;
+} mount_state_t;
+
+/*
+ * Callback that mounts a single filesystem using the options and flags held
+ * in the mount_state_t passed as arg. Returns 0 on success; on failure it
+ * records -1 in ms_mntstatus and returns -1.
+ */
+static int
+zfs_mount_one(zfs_handle_t *zhp, void *arg)
+{
+	mount_state_t *state = arg;
+
+	if (zfs_mount(zhp, state->ms_mntopts, state->ms_mntflags) == 0)
+		return (0);
+
+	state->ms_mntstatus = -1;
+	return (-1);
+}
+
+/*
+ * Callback that shares a single filesystem. Returns 0 on success; on failure
+ * it records -1 in the ms_mntstatus field of the mount_state_t passed as arg
+ * and returns -1.
+ */
+static int
+zfs_share_one(zfs_handle_t *zhp, void *arg)
+{
+	mount_state_t *state = arg;
+
+	if (zfs_share(zhp) == 0)
+		return (0);
+
+	state->ms_mntstatus = -1;
+	return (-1);
+}
+
+/*
+ * Task queue function to mount one file system. On completion, it finds and
+ * schedules its children to be mounted. This depends on the sorting done in
+ * zfs_foreach_mountpoint(). Note that the degenerate case (chain of entries
+ * each descending from the previous) will have no parallelism since we always
+ * have to wait for the parent to finish mounting before we can schedule
+ * its children.
+ *
+ * arg is the mnt_param_t allocated by zfs_dispatch_mount(). This function
+ * owns it and frees it on every exit path, including when the callback
+ * fails. If the callback fails, none of the descendants are dispatched.
+ */
+static void
+zfs_mount_task(void *arg)
+{
+	mnt_param_t *mp = arg;
+	int idx = mp->mnt_idx;
+	zfs_handle_t **handles = mp->mnt_zhps;
+	size_t num_handles = mp->mnt_num_handles;
+	char mountpoint[ZFS_MAXPROPLEN];
+
+	verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, mountpoint,
+	    sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
+
+	/*
+	 * If the parent could not be handled, skip its descendants, but
+	 * still release the parameter block rather than returning early.
+	 */
+	if (mp->mnt_func(handles[idx], mp->mnt_data) != 0)
+		goto out;
+
+	/*
+	 * We dispatch tasks to mount filesystems with mountpoints underneath
+	 * this one. We do this by dispatching the next filesystem with a
+	 * descendant mountpoint of the one we just mounted, then skip all of
+	 * its descendants, dispatch the next descendant mountpoint, and so on.
+	 * The non_descendant_idx() function skips over filesystems that are
+	 * descendants of the filesystem we just dispatched.
+	 */
+	for (int i = idx + 1; i < num_handles;
+	    i = non_descendant_idx(handles, num_handles, i)) {
+		char child[ZFS_MAXPROPLEN];
+		verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT,
+		    child, sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
+
+		if (!libzfs_path_contains(mountpoint, child))
+			break; /* not a descendant, return */
+		zfs_dispatch_mount(mp->mnt_hdl, handles, num_handles, i,
+		    mp->mnt_func, mp->mnt_data, mp->mnt_tq);
+	}
+
+out:
+	free(mp);
+}
+
+/*
+ * Invoke func on every ZFS handle in the handles array. The array is first
+ * sorted in place by mountpoint (see mountpoint_cmp()), because this
+ * function is used to mount all datasets and must therefore call func for
+ * a parent mountpoint before it is called for any of its children.
+ *
+ * The callbacks are issued in one of two ways:
+ *
+ * 1. Sequentially: if the parallel argument is B_FALSE, or the
+ *    ZFS_SERIAL_MOUNT environment variable is set, func is called on each
+ *    handle in sorted order from the calling thread. Its return value is
+ *    ignored.
+ *
+ * 2. In parallel: if the parallel argument is B_TRUE and ZFS_SERIAL_MOUNT
+ *    is not set, a taskq is used to dispatch threads that mount filesystems
+ *    in parallel. This function dispatches one task per top-level
+ *    mountpoint, and those tasks are in turn responsible for recursively
+ *    dispatching the filesystems mounted beneath them. The function returns
+ *    only after every dispatched task has completed.
+ */
+void
+zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles,
+    size_t num_handles, zfs_iter_f func, void *data, boolean_t parallel)
+{
+	zfs_taskq_t *tq;
+	int cur;
+
+	/*
+	 * ZFS_SERIAL_MOUNT is an undocumented environment variable offered
+	 * as a convenience for a/b comparison of serial vs. parallel
+	 * mounting. It is only consulted when parallel mounting was asked
+	 * for.
+	 */
+	boolean_t use_taskq = parallel &&
+	    (getenv("ZFS_SERIAL_MOUNT") == NULL);
+
+	/* Order the datasets by mountpoint; see mountpoint_cmp. */
+	qsort(handles, num_handles, sizeof (zfs_handle_t *), mountpoint_cmp);
+
+	if (!use_taskq) {
+		cur = 0;
+		while (cur < num_handles) {
+			(void) func(handles[cur], data);
+			cur++;
+		}
+		return;
+	}
+
+	/*
+	 * Parallel case: a taskq manages the worker threads that run the
+	 * callback for each dataset.
+	 */
+	tq = zfs_taskq_create("mount_taskq", mount_tq_nthr, 0,
+	    mount_tq_nthr, mount_tq_nthr, ZFS_TASKQ_PREPOPULATE);
+
+	/*
+	 * There may be several "top level" mountpoints outside of the pool's
+	 * root mountpoint, e.g.: /foo /bar. Dispatch a mount task for each
+	 * one, hopping over its descendants to reach the next.
+	 */
+	cur = 0;
+	while (cur < num_handles) {
+		zfs_dispatch_mount(hdl, handles, num_handles, cur, func, data,
+		    tq);
+		cur = non_descendant_idx(handles, num_handles, cur);
+	}
+
+	zfs_taskq_wait(tq);	/* wait for all scheduled mounts to complete */
+	zfs_taskq_destroy(tq);
+}
/*
* Mount and share all datasets within the given pool. This assumes that no
- * datasets within the pool are currently mounted. Because users can create
- * complicated nested hierarchies of mountpoints, we first gather all the
- * datasets and mountpoints within the pool, and sort them by mountpoint. Once
- * we have the list of all filesystems, we iterate over them in order and mount
- * and/or share each one.
+ * datasets within the pool are currently mounted.
*/
#pragma weak zpool_mount_datasets = zpool_enable_datasets
int
zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
{
get_all_cb_t cb = { 0 };
- libzfs_handle_t *hdl = zhp->zpool_hdl;
+ mount_state_t ms = { 0 };
zfs_handle_t *zfsp;
- int i, ret = -1;
- int *good;
+ sa_init_selective_arg_t sharearg;
+ int ret = 0;
- /*
- * Gather all non-snap datasets within the pool.
- */
- if ((zfsp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) == NULL)
+ if ((zfsp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
+ ZFS_TYPE_DATASET)) == NULL)
goto out;
- libzfs_add_handle(&cb, zfsp);
- if (zfs_iter_filesystems(zfsp, mount_cb, &cb) != 0)
- goto out;
- /*
- * Sort the datasets by mountpoint.
- */
- qsort(cb.cb_handles, cb.cb_used, sizeof (void *),
- libzfs_dataset_cmp);
/*
- * And mount all the datasets, keeping track of which ones
- * succeeded or failed.
+ * Gather all non-snapshot datasets within the pool. Start by adding
+ * the root filesystem for this pool to the list, and then iterate
+ * over all child filesystems.
*/
- if ((good = zfs_alloc(zhp->zpool_hdl,
- cb.cb_used * sizeof (int))) == NULL)
+ libzfs_add_handle(&cb, zfsp);
+ if (zfs_iter_filesystems(zfsp, zfs_iter_cb, &cb) != 0)
goto out;
- ret = 0;
- for (i = 0; i < cb.cb_used; i++) {
- if (zfs_mount(cb.cb_handles[i], mntopts, flags) != 0)
- ret = -1;
- else
- good[i] = 1;
- }
+ ms.ms_mntopts = mntopts;
+ ms.ms_mntflags = flags;
+ zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
+ zfs_mount_one, &ms, B_TRUE);
+ if (ms.ms_mntstatus != 0)
+ ret = ms.ms_mntstatus;
/*
- * Then share all the ones that need to be shared. This needs
- * to be a separate pass in order to avoid excessive reloading
- * of the configuration. Good should never be NULL since
- * zfs_alloc is supposed to exit if memory isn't available.
+ * Share all filesystems that need to be shared. This needs to be
+ * a separate pass because libshare is not mt-safe, and so we need
+ * to share serially.
*/
- for (i = 0; i < cb.cb_used; i++) {
- if (good[i] && zfs_share(cb.cb_handles[i]) != 0)
- ret = -1;
- }
+ sharearg.zhandle_arr = cb.cb_handles;
+ sharearg.zhandle_len = cb.cb_used;
+ if ((ret = zfs_init_libshare_arg(zhp->zpool_hdl,
+ SA_INIT_SHARE_API_SELECTIVE, &sharearg)) != 0)
+ goto out;
- free(good);
+ ms.ms_mntstatus = 0;
+ zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
+ zfs_share_one, &ms, B_FALSE);
+ if (ms.ms_mntstatus != 0)
+ ret = ms.ms_mntstatus;
out:
- for (i = 0; i < cb.cb_used; i++)
+ for (int i = 0; i < cb.cb_used; i++)
zfs_close(cb.cb_handles[i]);
free(cb.cb_handles);
diff --git a/lib/libzfs/common/libzfs_taskq.c b/lib/libzfs/common/libzfs_taskq.c
new file mode 100644
index 000000000000..28bf64971084
--- /dev/null
+++ b/lib/libzfs/common/libzfs_taskq.c
@@ -0,0 +1,297 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
+ * Copyright (c) 2014, 2018 by Delphix. All rights reserved.
+ */
+
+#include <thread.h>
+#include <synch.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+
+#include "libzfs_taskq.h"
+
+/*
+ * Internal ztq_flags bit: set by zfs_taskq_create(), cleared by
+ * zfs_taskq_destroy(). Worker threads keep running only while it is set.
+ */
+#define ZFS_TASKQ_ACTIVE 0x00010000
+/* Maximum taskq name length, excluding the terminating NUL. */
+#define ZFS_TASKQ_NAMELEN 31
+
+/*
+ * One queued unit of work. Pending entries are kept on a circular
+ * doubly-linked list whose head is the ztq_task member of the taskq; free
+ * entries are cached on ztq_freelist, chained through ztqent_next only.
+ */
+typedef struct zfs_taskq_ent {
+ /* next entry on the pending list, or next free entry on the freelist */
+ struct zfs_taskq_ent *ztqent_next;
+ /* previous entry on the pending list */
+ struct zfs_taskq_ent *ztqent_prev;
+ /* function to run */
+ ztask_func_t *ztqent_func;
+ /* argument passed to ztqent_func */
+ void *ztqent_arg;
+ /* ZFS_TQENT_FLAG_* bits; zfs_taskq_dispatch() sets this to 0 */
+ uintptr_t ztqent_flags;
+} zfs_taskq_ent_t;
+
+/*
+ * State of one task queue. Apart from initialisation and final teardown,
+ * the queue, counters and freelist are accessed with ztq_lock held.
+ */
+struct zfs_taskq {
+ /* copy of the name given to zfs_taskq_create() */
+ char ztq_name[ZFS_TASKQ_NAMELEN + 1];
+ /* protects the pending list, freelist, flags and counters below */
+ mutex_t ztq_lock;
+ /*
+ * Taken as a reader by a worker around each task function call.
+ * NOTE(review): no writer is visible in this file.
+ */
+ rwlock_t ztq_threadlock;
+ /* signalled when a task is queued; idle workers wait on it */
+ cond_t ztq_dispatch_cv;
+ /*
+ * Broadcast when ztq_active drops to zero and when a worker exits;
+ * zfs_taskq_wait() and zfs_taskq_destroy() wait on it.
+ */
+ cond_t ztq_wait_cv;
+ /* thread ids filled in by thr_create(), joined at destroy time */
+ thread_t *ztq_threadlist;
+ /* creation flags plus ZFS_TASKQ_ACTIVE while the queue is live */
+ int ztq_flags;
+ /*
+ * Number of workers not currently blocked waiting for work. Starts
+ * at nthreads; a worker decrements it before waiting on
+ * ztq_dispatch_cv and increments it again when it wakes.
+ */
+ int ztq_active;
+ /* number of worker threads that have not yet exited */
+ int ztq_nthreads;
+ /* number of entries currently allocated from umem */
+ int ztq_nalloc;
+ /* ztask_free() caches entries while ztq_nalloc <= ztq_minalloc */
+ int ztq_minalloc;
+ /* ztask_alloc() throttles or fails once ztq_nalloc reaches this */
+ int ztq_maxalloc;
+ /* signalled by ztask_free() when allocators are waiting */
+ cond_t ztq_maxalloc_cv;
+ /* number of threads waiting on ztq_maxalloc_cv */
+ int ztq_maxalloc_wait;
+ /* singly-linked cache of free entries (via ztqent_next) */
+ zfs_taskq_ent_t *ztq_freelist;
+ /* sentinel head of the circular list of pending tasks */
+ zfs_taskq_ent_t ztq_task;
+};
+
+/*
+ * Obtain a task entry for the given taskq, either from the freelist or from
+ * umem_alloc(). ztqflags is passed through to umem_alloc() and also
+ * tested for UMEM_NOFAIL to decide whether this call may wait.
+ *
+ * Must be called with ztq_lock held. The lock is dropped around the
+ * umem_alloc() call and released while waiting on ztq_maxalloc_cv, and is
+ * held again on return.
+ *
+ * Returns NULL if ztq_maxalloc has been reached and UMEM_NOFAIL is not set,
+ * or if umem_alloc() itself returns NULL.
+ */
+static zfs_taskq_ent_t *
+ztask_alloc(zfs_taskq_t *ztq, int ztqflags)
+{
+ zfs_taskq_ent_t *t;
+ timestruc_t ts;
+ int err;
+
+ /*
+ * The freelist is only used once ztq_nalloc has reached ztq_minalloc;
+ * below that, a fresh entry is allocated instead.
+ */
+again: if ((t = ztq->ztq_freelist) != NULL &&
+ ztq->ztq_nalloc >= ztq->ztq_minalloc) {
+ ztq->ztq_freelist = t->ztqent_next;
+ } else {
+ if (ztq->ztq_nalloc >= ztq->ztq_maxalloc) {
+ if (!(ztqflags & UMEM_NOFAIL))
+ return (NULL);
+
+ /*
+ * We don't want to exceed ztq_maxalloc, but we can't
+ * wait for other tasks to complete (and thus free up
+ * task structures) without risking deadlock with
+ * the caller. So, we just delay for one second
+ * to throttle the allocation rate. If we have tasks
+ * complete before one second timeout expires then
+ * ztask_free will signal us and we will
+ * immediately retry the allocation.
+ */
+ ztq->ztq_maxalloc_wait++;
+
+ ts.tv_sec = 1;
+ ts.tv_nsec = 0;
+ err = cond_reltimedwait(&ztq->ztq_maxalloc_cv,
+ &ztq->ztq_lock, &ts);
+
+ ztq->ztq_maxalloc_wait--;
+ if (err == 0)
+ goto again; /* signaled */
+ /*
+ * Any other return (e.g. the timeout expiring) falls
+ * through and allocates anyway, going over
+ * ztq_maxalloc.
+ */
+ }
+ /* Do not hold the taskq lock across the allocator call. */
+ mutex_exit(&ztq->ztq_lock);
+
+ t = umem_alloc(sizeof (zfs_taskq_ent_t), ztqflags);
+
+ mutex_enter(&ztq->ztq_lock);
+ if (t != NULL)
+ ztq->ztq_nalloc++;
+ }
+ return (t);
+}
+
+/*
+ * Give back a task entry. While the number of allocated entries is at or
+ * below ztq_minalloc the entry is cached on the freelist; otherwise it is
+ * returned to umem, with ztq_lock dropped around the umem_free() call.
+ * Afterwards, one thread throttled in ztask_alloc() is woken if any are
+ * waiting.
+ *
+ * Must be called with ztq_lock held; the lock is held again on return.
+ */
+static void
+ztask_free(zfs_taskq_t *ztq, zfs_taskq_ent_t *t)
+{
+	if (ztq->ztq_nalloc > ztq->ztq_minalloc) {
+		/* Above the cache threshold: really release the entry. */
+		ztq->ztq_nalloc--;
+		mutex_exit(&ztq->ztq_lock);
+		umem_free(t, sizeof (zfs_taskq_ent_t));
+		mutex_enter(&ztq->ztq_lock);
+	} else {
+		/* Push onto the head of the freelist for reuse. */
+		t->ztqent_next = ztq->ztq_freelist;
+		ztq->ztq_freelist = t;
+	}
+
+	if (ztq->ztq_maxalloc_wait != 0)
+		VERIFY0(cond_signal(&ztq->ztq_maxalloc_cv));
+}
+
+/*
+ * Queue func(arg) for execution by one of the taskq's worker threads.
+ * ztqflags is handed to the entry allocator (ZFS_TQ_SLEEP / ZFS_TQ_NOSLEEP)
+ * and may include ZFS_TQ_FRONT to insert the task at the head of the pending
+ * list instead of the tail.
+ *
+ * Returns 1 if the task was queued, or 0 if no task entry could be
+ * allocated.
+ */
+zfs_taskqid_t
+zfs_taskq_dispatch(zfs_taskq_t *ztq, ztask_func_t func, void *arg,
+    uint_t ztqflags)
+{
+	zfs_taskq_ent_t *head = &ztq->ztq_task;
+	zfs_taskq_ent_t *ent;
+
+	mutex_enter(&ztq->ztq_lock);
+	ASSERT(ztq->ztq_flags & ZFS_TASKQ_ACTIVE);
+
+	ent = ztask_alloc(ztq, ztqflags);
+	if (ent == NULL) {
+		mutex_exit(&ztq->ztq_lock);
+		return (0);
+	}
+
+	/* Choose the neighbours: just after the head, or just before it. */
+	if ((ztqflags & ZFS_TQ_FRONT) != 0) {
+		ent->ztqent_prev = head;
+		ent->ztqent_next = head->ztqent_next;
+	} else {
+		ent->ztqent_prev = head->ztqent_prev;
+		ent->ztqent_next = head;
+	}
+
+	ent->ztqent_func = func;
+	ent->ztqent_arg = arg;
+	ent->ztqent_flags = 0;
+
+	/* Splice the entry in between its two neighbours. */
+	ent->ztqent_next->ztqent_prev = ent;
+	ent->ztqent_prev->ztqent_next = ent;
+
+	VERIFY0(cond_signal(&ztq->ztq_dispatch_cv));
+	mutex_exit(&ztq->ztq_lock);
+	return (1);
+}
+
+/*
+ * Block until the taskq has no pending tasks and no worker thread is
+ * counted as active.
+ */
+void
+zfs_taskq_wait(zfs_taskq_t *ztq)
+{
+	zfs_taskq_ent_t *head = &ztq->ztq_task;
+
+	mutex_enter(&ztq->ztq_lock);
+	for (;;) {
+		int rv;
+
+		/* Done once the pending list is empty and all are idle. */
+		if (head->ztqent_next == head && ztq->ztq_active == 0)
+			break;
+
+		rv = cond_wait(&ztq->ztq_wait_cv, &ztq->ztq_lock);
+		VERIFY(rv == 0 || rv == EINTR);
+	}
+	mutex_exit(&ztq->ztq_lock);
+}
+
+/*
+ * Body of each worker thread; arg is the taskq the thread serves. While the
+ * taskq is marked ZFS_TASKQ_ACTIVE the thread repeatedly removes the entry
+ * at the head of the pending list and runs it with ztq_lock dropped, or
+ * sleeps on ztq_dispatch_cv when the list is empty. When the flag is
+ * cleared, the thread decrements ztq_nthreads, wakes anyone waiting on
+ * ztq_wait_cv and returns NULL.
+ */
+static void *
+zfs_taskq_thread(void *arg)
+{
+ zfs_taskq_t *ztq = arg;
+ zfs_taskq_ent_t *t;
+ boolean_t prealloc;
+
+ mutex_enter(&ztq->ztq_lock);
+ while (ztq->ztq_flags & ZFS_TASKQ_ACTIVE) {
+ /* The list is empty when the sentinel points back at itself. */
+ if ((t = ztq->ztq_task.ztqent_next) == &ztq->ztq_task) {
+ int ret;
+ /*
+ * Going idle: if this was the last active worker, wake
+ * threads blocked in zfs_taskq_wait().
+ */
+ if (--ztq->ztq_active == 0)
+ VERIFY0(cond_broadcast(&ztq->ztq_wait_cv));
+ ret = cond_wait(&ztq->ztq_dispatch_cv, &ztq->ztq_lock);
+ VERIFY(ret == 0 || ret == EINTR);
+ ztq->ztq_active++;
+ /* Re-check both the ACTIVE flag and the list. */
+ continue;
+ }
+ /* Unlink the entry from the pending list. */
+ t->ztqent_prev->ztqent_next = t->ztqent_next;
+ t->ztqent_next->ztqent_prev = t->ztqent_prev;
+ t->ztqent_next = NULL;
+ t->ztqent_prev = NULL;
+ /* Sample the flag now, before the lock is dropped. */
+ prealloc = t->ztqent_flags & ZFS_TQENT_FLAG_PREALLOC;
+ mutex_exit(&ztq->ztq_lock);
+
+ /* Run the task without holding ztq_lock. */
+ VERIFY0(rw_rdlock(&ztq->ztq_threadlock));
+ t->ztqent_func(t->ztqent_arg);
+ VERIFY0(rw_unlock(&ztq->ztq_threadlock));
+
+ mutex_enter(&ztq->ztq_lock);
+ /* Entries flagged as preallocated are not freed here. */
+ if (!prealloc)
+ ztask_free(ztq, t);
+ }
+ /* Shutting down: let zfs_taskq_destroy() see this thread leave. */
+ ztq->ztq_nthreads--;
+ VERIFY0(cond_broadcast(&ztq->ztq_wait_cv));
+ mutex_exit(&ztq->ztq_lock);
+ return (NULL);
+}
+
+/*
+ * Create a task queue served by nthreads worker threads.
+ *
+ *   name      descriptive name; at most ZFS_TASKQ_NAMELEN characters are
+ *             kept, and the stored copy is always NUL-terminated
+ *   nthreads  number of worker threads to start (must be >= 1)
+ *   pri       unused
+ *   minalloc  number of task entries kept cached on the freelist
+ *   maxalloc  number of allocated entries at which further allocation is
+ *             throttled or refused (see ztask_alloc())
+ *   flags     ZFS_TASKQ_PREPOPULATE to fill the freelist with minalloc
+ *             entries up front
+ *
+ * Allocation uses UMEM_NOFAIL. Failure to initialise a synchronisation
+ * object or to start a worker thread is fatal (VERIFY): the ztq_active and
+ * ztq_nthreads counters assume every one of the nthreads workers exists, so
+ * a missing worker would leave zfs_taskq_wait() and zfs_taskq_destroy()
+ * waiting forever.
+ */
+/*ARGSUSED*/
+zfs_taskq_t *
+zfs_taskq_create(const char *name, int nthreads, pri_t pri, int minalloc,
+    int maxalloc, uint_t flags)
+{
+	zfs_taskq_t *ztq = umem_zalloc(sizeof (zfs_taskq_t), UMEM_NOFAIL);
+	int t;
+
+	ASSERT3S(nthreads, >=, 1);
+
+	VERIFY0(rwlock_init(&ztq->ztq_threadlock, USYNC_THREAD, NULL));
+	VERIFY0(cond_init(&ztq->ztq_dispatch_cv, USYNC_THREAD, NULL));
+	VERIFY0(cond_init(&ztq->ztq_wait_cv, USYNC_THREAD, NULL));
+	VERIFY0(cond_init(&ztq->ztq_maxalloc_cv, USYNC_THREAD, NULL));
+	VERIFY0(mutex_init(
+	    &ztq->ztq_lock, LOCK_NORMAL | LOCK_ERRORCHECK, NULL));
+
+	/*
+	 * strncpy() does not terminate the destination when the source is
+	 * at least as long as the count, so copy at most ZFS_TASKQ_NAMELEN
+	 * characters and terminate explicitly.
+	 */
+	(void) strncpy(ztq->ztq_name, name, ZFS_TASKQ_NAMELEN);
+	ztq->ztq_name[ZFS_TASKQ_NAMELEN] = '\0';
+
+	ztq->ztq_flags = flags | ZFS_TASKQ_ACTIVE;
+	ztq->ztq_active = nthreads;
+	ztq->ztq_nthreads = nthreads;
+	ztq->ztq_minalloc = minalloc;
+	ztq->ztq_maxalloc = maxalloc;
+	/* An empty pending list is a sentinel linked to itself. */
+	ztq->ztq_task.ztqent_next = &ztq->ztq_task;
+	ztq->ztq_task.ztqent_prev = &ztq->ztq_task;
+	ztq->ztq_threadlist =
+	    umem_alloc(nthreads * sizeof (thread_t), UMEM_NOFAIL);
+
+	if (flags & ZFS_TASKQ_PREPOPULATE) {
+		/*
+		 * Each allocate-then-free round leaves one more entry on
+		 * the freelist, because ztask_free() caches entries while
+		 * ztq_nalloc <= ztq_minalloc.
+		 */
+		mutex_enter(&ztq->ztq_lock);
+		while (minalloc-- > 0)
+			ztask_free(ztq, ztask_alloc(ztq, UMEM_NOFAIL));
+		mutex_exit(&ztq->ztq_lock);
+	}
+
+	for (t = 0; t < nthreads; t++) {
+		VERIFY0(thr_create(0, 0, zfs_taskq_thread,
+		    ztq, THR_BOUND, &ztq->ztq_threadlist[t]));
+	}
+
+	return (ztq);
+}
+
+/*
+ * Tear down a taskq: wait for all queued work to finish, stop and join the
+ * worker threads, release every cached task entry, destroy the
+ * synchronisation objects and free the taskq itself. The taskq must not be
+ * used after this returns.
+ */
+void
+zfs_taskq_destroy(zfs_taskq_t *ztq)
+{
+ int t;
+ /*
+ * Remember the thread count now; ztq_nthreads is counted down to zero
+ * by the exiting workers below.
+ */
+ int nthreads = ztq->ztq_nthreads;
+
+ zfs_taskq_wait(ztq);
+
+ mutex_enter(&ztq->ztq_lock);
+
+ /* Tell the workers to exit and wake those sleeping for work. */
+ ztq->ztq_flags &= ~ZFS_TASKQ_ACTIVE;
+ VERIFY0(cond_broadcast(&ztq->ztq_dispatch_cv));
+
+ /* Each worker broadcasts ztq_wait_cv after decrementing the count. */
+ while (ztq->ztq_nthreads != 0) {
+ int ret = cond_wait(&ztq->ztq_wait_cv, &ztq->ztq_lock);
+ VERIFY(ret == 0 || ret == EINTR);
+ }
+
+ /*
+ * With ztq_minalloc at zero, ztask_free() no longer caches entries,
+ * so each alloc/free round pops one entry off the freelist and
+ * returns it to umem until none remain allocated.
+ */
+ ztq->ztq_minalloc = 0;
+ while (ztq->ztq_nalloc != 0) {
+ ASSERT(ztq->ztq_freelist != NULL);
+ ztask_free(ztq, ztask_alloc(ztq, UMEM_NOFAIL));
+ }
+
+ mutex_exit(&ztq->ztq_lock);
+
+ for (t = 0; t < nthreads; t++)
+ (void) thr_join(ztq->ztq_threadlist[t], NULL, NULL);
+
+ umem_free(ztq->ztq_threadlist, nthreads * sizeof (thread_t));
+
+ VERIFY0(rwlock_destroy(&ztq->ztq_threadlock));
+ VERIFY0(cond_destroy(&ztq->ztq_dispatch_cv));
+ VERIFY0(cond_destroy(&ztq->ztq_wait_cv));
+ VERIFY0(cond_destroy(&ztq->ztq_maxalloc_cv));
+ VERIFY0(mutex_destroy(&ztq->ztq_lock));
+
+ umem_free(ztq, sizeof (zfs_taskq_t));
+}
diff --git a/lib/libzfs/common/libzfs_taskq.h b/lib/libzfs/common/libzfs_taskq.h
new file mode 100644
index 000000000000..7ac045738c36
--- /dev/null
+++ b/lib/libzfs/common/libzfs_taskq.h
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Interface to the task queue implemented in libzfs_taskq.c: a pool of
+ * worker threads that run queued function calls.
+ */
+#ifndef _ZFS_TASKQ_H
+#define _ZFS_TASKQ_H
+
+#include <stdint.h>
+#include <umem.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Opaque task queue handle. */
+typedef struct zfs_taskq zfs_taskq_t;
+/* Value returned by zfs_taskq_dispatch(); 0 means the dispatch failed. */
+typedef uintptr_t zfs_taskqid_t;
+/* Signature of a task function: takes the argument given at dispatch. */
+typedef void (ztask_func_t)(void *);
+
+#define ZFS_TQENT_FLAG_PREALLOC 0x1 /* taskq_dispatch_ent used */
+
+/* zfs_taskq_create() flag: fill the entry cache with minalloc entries. */
+#define ZFS_TASKQ_PREPOPULATE 0x0001
+
+/* Flags for zfs_taskq_dispatch(). */
+#define ZFS_TQ_SLEEP UMEM_NOFAIL /* Can block for memory */
+#define ZFS_TQ_NOSLEEP UMEM_DEFAULT /* cannot block for memory; may fail */
+#define ZFS_TQ_FRONT 0x08 /* Queue in front */
+
+/*
+ * Arguments, in order: name, number of threads, priority (unused by the
+ * implementation), minalloc, maxalloc, ZFS_TASKQ_* flags.
+ */
+extern zfs_taskq_t *zfs_taskq_create(const char *, int, pri_t, int,
+ int, uint_t);
+/* Waits for outstanding tasks, stops the workers and frees the taskq. */
+extern void zfs_taskq_destroy(zfs_taskq_t *);
+
+/*
+ * Queue a call of the given function with the given argument; the last
+ * argument is a combination of ZFS_TQ_* flags. Returns 1 on success and 0
+ * if no task entry could be allocated.
+ */
+extern zfs_taskqid_t zfs_taskq_dispatch(zfs_taskq_t *, ztask_func_t,
+ void *, uint_t);
+
+/* Block until the queue is empty and no worker is active. */
+extern void zfs_taskq_wait(zfs_taskq_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_TASKQ_H */