commit    2395a7f52dc8a224e322d6d10ce1fd59c6d971c9 (patch)
author    Alexander Motin <mav@FreeBSD.org>  2018-08-01 18:28:17 +0000
committer Alexander Motin <mav@FreeBSD.org>  2018-08-01 18:28:17 +0000
tree      c0a3299d224b5f92f059bfb538d5e82e556f554f /lib
parent    2a44a23db2023b975e979c690e4453b5d39431b2 (diff)
8115 parallel zfs mount
Mounting of filesystems in "filesystem/local" is done using `zfs mount -a`,
which mounts each filesystem serially. The bottleneck for each mount is
the I/O done to load metadata for each filesystem. As such, mounting
filesystems using a parallel algorithm should be a big win and bring down
the runtime of "filesystem/local"'s start method.
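
To illustrate the ordering that makes the parallel algorithm work, here is a small stand-alone C sketch of the mountpoint comparator this change introduces as mountpoint_cmp() in the diff below. It is a hypothetical demo, not the libzfs code itself: it sorts plain path strings so that every parent mountpoint immediately precedes all of its descendants, by treating '/' as the lowest non-nul character.

/*
 * Demo of the mountpoint ordering used by the parallel mount code.
 * Hypothetical stand-alone sketch, not the libzfs implementation.
 */
#include <stdio.h>
#include <stdlib.h>

static int
mountpoint_cmp(const void *arga, const void *argb)
{
	const char *a = *(const char *const *)arga;
	const char *b = *(const char *const *)argb;

	/* Skip the common prefix of the two paths. */
	while (*a != '\0' && *a == *b) {
		a++;
		b++;
	}
	if (*a == *b)
		return (0);
	if (*a == '\0')		/* a is a prefix of b: parent sorts first */
		return (-1);
	if (*b == '\0')
		return (1);
	if (*a == '/')		/* treat '/' as the lowest non-nul char */
		return (-1);
	if (*b == '/')
		return (1);
	return (*a < *b ? -1 : 1);
}

int
main(void)
{
	const char *mounts[] = {
		"/foo.bar", "/foo/bar/baz", "/foo", "/foo/baz", "/foo/bar",
	};
	size_t n = sizeof (mounts) / sizeof (mounts[0]);

	qsort(mounts, n, sizeof (char *), mountpoint_cmp);

	/* Prints: /foo /foo/bar /foo/bar/baz /foo/baz /foo.bar */
	for (size_t i = 0; i < n; i++)
		printf("%s\n", mounts[i]);
	return (0);
}

With this ordering, a task that finishes mounting a filesystem can scan forward in the array and dispatch one task per child mountpoint, giving parallelism across independent subtrees.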
illumos/illumos-gate@591e0e133f9980083db5d64ac33a30bcc3382ff7
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Prashanth Sreenivasa <pks@delphix.com>
Approved by: Matt Ahrens <mahrens@delphix.com>
Author: Sebastien Roy <seb@delphix.com>
Notes:
svn path=/vendor-sys/illumos/dist/; revision=337047
Diffstat (limited to 'lib')
-rw-r--r--  lib/libzfs/common/libzfs.h            5
-rw-r--r--  lib/libzfs/common/libzfs_dataset.c   31
-rw-r--r--  lib/libzfs/common/libzfs_impl.h      10
-rw-r--r--  lib/libzfs/common/libzfs_mount.c    409
-rw-r--r--  lib/libzfs/common/libzfs_taskq.c    297
-rw-r--r--  lib/libzfs/common/libzfs_taskq.h     63
6 files changed, 725 insertions(+), 90 deletions(-)
diff --git a/lib/libzfs/common/libzfs.h b/lib/libzfs/common/libzfs.h
index 9dc2b02e147f..d296fed59f01 100644
--- a/lib/libzfs/common/libzfs.h
+++ b/lib/libzfs/common/libzfs.h
@@ -576,12 +576,11 @@ typedef struct get_all_cb {
 	zfs_handle_t **cb_handles;
 	size_t cb_alloc;
 	size_t cb_used;
-	boolean_t cb_verbose;
-	int (*cb_getone)(zfs_handle_t *, void *);
 } get_all_cb_t;
 
+void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, size_t,
+    zfs_iter_f, void *, boolean_t);
 void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *);
-int libzfs_dataset_cmp(const void *, const void *);
 
 /*
  * Functions to create and destroy datasets.
diff --git a/lib/libzfs/common/libzfs_dataset.c b/lib/libzfs/common/libzfs_dataset.c
index 011c2653a152..556538a4402b 100644
--- a/lib/libzfs/common/libzfs_dataset.c
+++ b/lib/libzfs/common/libzfs_dataset.c
@@ -54,6 +54,7 @@
 #include <idmap.h>
 #include <aclutils.h>
 #include <directory.h>
+#include <time.h>
 
 #include <sys/dnode.h>
 #include <sys/spa.h>
@@ -785,6 +786,8 @@ libzfs_mnttab_cache_compare(const void *arg1, const void *arg2)
 void
 libzfs_mnttab_init(libzfs_handle_t *hdl)
 {
+	(void) mutex_init(&hdl->libzfs_mnttab_cache_lock,
+	    LOCK_NORMAL | LOCK_ERRORCHECK, NULL);
 	assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0);
 	avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare,
 	    sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node));
@@ -825,6 +828,7 @@ libzfs_mnttab_fini(libzfs_handle_t *hdl)
 		free(mtn);
 	}
 	avl_destroy(&hdl->libzfs_mnttab_cache);
+	(void) mutex_destroy(&hdl->libzfs_mnttab_cache_lock);
 }
 
 void
@@ -839,6 +843,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
 {
 	mnttab_node_t find;
 	mnttab_node_t *mtn;
+	int ret = ENOENT;
 
 	if (!hdl->libzfs_mnttab_enable) {
 		struct mnttab srch = { 0 };
@@ -854,6 +859,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
 		return (ENOENT);
 	}
 
+	mutex_enter(&hdl->libzfs_mnttab_cache_lock);
 	if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
 		libzfs_mnttab_update(hdl);
 
@@ -861,9 +867,10 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
 	mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL);
 	if (mtn) {
 		*entry = mtn->mtn_mt;
-		return (0);
+		ret = 0;
 	}
-	return (ENOENT);
+	mutex_exit(&hdl->libzfs_mnttab_cache_lock);
+	return (ret);
 }
 
 void
@@ -872,14 +879,16 @@ libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special,
 {
 	mnttab_node_t *mtn;
 
-	if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
-		return;
-	mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
-	mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
-	mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
-	mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
-	mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
-	avl_add(&hdl->libzfs_mnttab_cache, mtn);
+	mutex_enter(&hdl->libzfs_mnttab_cache_lock);
+	if (avl_numnodes(&hdl->libzfs_mnttab_cache) != 0) {
+		mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
+		mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
+		mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
+		mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
+		mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
+		avl_add(&hdl->libzfs_mnttab_cache, mtn);
+	}
+	mutex_exit(&hdl->libzfs_mnttab_cache_lock);
 }
 
 void
@@ -888,6 +897,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
 	mnttab_node_t find;
 	mnttab_node_t *ret;
 
+	mutex_enter(&hdl->libzfs_mnttab_cache_lock);
 	find.mtn_mt.mnt_special = (char *)fsname;
 	if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL))
 	    != NULL) {
@@ -898,6 +908,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
 		free(ret->mtn_mt.mnt_mntopts);
 		free(ret);
 	}
+	mutex_exit(&hdl->libzfs_mnttab_cache_lock);
 }
 
 int
diff --git a/lib/libzfs/common/libzfs_impl.h b/lib/libzfs/common/libzfs_impl.h
index 50f48fd7932d..cd9a53d91fa8 100644
--- a/lib/libzfs/common/libzfs_impl.h
+++ b/lib/libzfs/common/libzfs_impl.h
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  */
 
 #ifndef	_LIBZFS_IMPL_H
@@ -33,6 +33,7 @@
 #include <sys/nvpair.h>
 #include <sys/dmu.h>
 #include <sys/zfs_ioctl.h>
+#include <synch.h>
 
 #include <libuutil.h>
 #include <libzfs.h>
@@ -73,6 +74,13 @@ struct libzfs_handle {
 	int libzfs_storeerr; /* stuff error messages into buffer */
 	void *libzfs_sharehdl; /* libshare handle */
 	boolean_t libzfs_mnttab_enable;
+	/*
+	 * We need a lock to handle the case where parallel mount
+	 * threads are populating the mnttab cache simultaneously. The
+	 * lock only protects the integrity of the avl tree, and does
+	 * not protect the contents of the mnttab entries themselves.
+	 */
+	mutex_t libzfs_mnttab_cache_lock;
 	avl_tree_t libzfs_mnttab_cache;
 	int libzfs_pool_iter;
 	topo_hdl_t *libzfs_topo_hdl;
diff --git a/lib/libzfs/common/libzfs_mount.c b/lib/libzfs/common/libzfs_mount.c
index 9fd37825a31c..cf15735f3f9c 100644
--- a/lib/libzfs/common/libzfs_mount.c
+++ b/lib/libzfs/common/libzfs_mount.c
@@ -22,7 +22,7 @@
 /*
  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
  * Copyright 2017 Joyent, Inc.
  * Copyright 2017 RackTop Systems.
@@ -34,25 +34,25 @@
  * they are used by mount and unmount and when changing a filesystem's
  * mountpoint.
  *
- * zfs_is_mounted()
- * zfs_mount()
- * zfs_unmount()
- * zfs_unmountall()
+ *	zfs_is_mounted()
+ *	zfs_mount()
+ *	zfs_unmount()
+ *	zfs_unmountall()
  *
  * This file also contains the functions used to manage sharing filesystems via
  * NFS and iSCSI:
  *
- * zfs_is_shared()
- * zfs_share()
- * zfs_unshare()
+ *	zfs_is_shared()
+ *	zfs_share()
+ *	zfs_unshare()
  *
- * zfs_is_shared_nfs()
- * zfs_is_shared_smb()
- * zfs_share_proto()
- * zfs_shareall();
- * zfs_unshare_nfs()
- * zfs_unshare_smb()
- * zfs_unshareall_nfs()
+ *	zfs_is_shared_nfs()
+ *	zfs_is_shared_smb()
+ *	zfs_share_proto()
+ *	zfs_shareall();
+ *	zfs_unshare_nfs()
+ *	zfs_unshare_smb()
+ *	zfs_unshareall_nfs()
 *	zfs_unshareall_smb()
 *	zfs_unshareall()
 *	zfs_unshareall_bypath()
@@ -60,8 +60,8 @@
 * The following functions are available for pool consumers, and will
 * mount/unmount and share/unshare all datasets within pool:
 *
- * zpool_enable_datasets()
- * zpool_disable_datasets()
+ *	zpool_enable_datasets()
+ *	zpool_disable_datasets()
 */
 
 #include <dirent.h>
@@ -83,11 +83,15 @@
 #include <libzfs.h>
 
 #include "libzfs_impl.h"
+#include "libzfs_taskq.h"
 #include <libshare.h>
 
 #include <sys/systeminfo.h>
 #define	MAXISALEN	257	/* based on sysinfo(2) man page */
 
+static int mount_tq_nthr = 512;	/* taskq threads for multi-threaded mounting */
+
+static void zfs_mount_task(void *);
 static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *);
 zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **,
     zfs_share_proto_t);
@@ -1077,25 +1081,32 @@ remove_mountpoint(zfs_handle_t *zhp)
 	}
 }
 
+/*
+ * Add the given zfs handle to the cb_handles array, dynamically reallocating
+ * the array if it is out of space.
+ */
 void
 libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp)
 {
 	if (cbp->cb_alloc == cbp->cb_used) {
 		size_t newsz;
-		void *ptr;
+		zfs_handle_t **newhandles;
 
-		newsz = cbp->cb_alloc ? cbp->cb_alloc * 2 : 64;
-		ptr = zfs_realloc(zhp->zfs_hdl,
-		    cbp->cb_handles, cbp->cb_alloc * sizeof (void *),
-		    newsz * sizeof (void *));
-		cbp->cb_handles = ptr;
+		newsz = cbp->cb_alloc != 0 ? cbp->cb_alloc * 2 : 64;
+		newhandles = zfs_realloc(zhp->zfs_hdl,
+		    cbp->cb_handles, cbp->cb_alloc * sizeof (zfs_handle_t *),
+		    newsz * sizeof (zfs_handle_t *));
+		cbp->cb_handles = newhandles;
 		cbp->cb_alloc = newsz;
 	}
 	cbp->cb_handles[cbp->cb_used++] = zhp;
 }
 
+/*
+ * Recursive helper function used during file system enumeration
+ */
 static int
-mount_cb(zfs_handle_t *zhp, void *data)
+zfs_iter_cb(zfs_handle_t *zhp, void *data)
 {
 	get_all_cb_t *cbp = data;
 
@@ -1121,104 +1132,350 @@ mount_cb(zfs_handle_t *zhp, void *data)
 	}
 
 	libzfs_add_handle(cbp, zhp);
-	if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) {
+	if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) != 0) {
 		zfs_close(zhp);
 		return (-1);
 	}
 	return (0);
 }
 
+/*
+ * Sort comparator that compares two mountpoint paths. We sort these paths so
+ * that subdirectories immediately follow their parents. This means that we
+ * effectively treat the '/' character as the lowest value non-nul char. An
+ * example sorted list using this comparator would look like:
+ *
+ * /foo
+ * /foo/bar
+ * /foo/bar/baz
+ * /foo/baz
+ * /foo.bar
+ *
+ * The mounting code depends on this ordering to deterministically iterate
+ * over filesystems in order to spawn parallel mount tasks.
+ */
 int
-libzfs_dataset_cmp(const void *a, const void *b)
+mountpoint_cmp(const void *arga, const void *argb)
 {
-	zfs_handle_t **za = (zfs_handle_t **)a;
-	zfs_handle_t **zb = (zfs_handle_t **)b;
+	zfs_handle_t *const *zap = arga;
+	zfs_handle_t *za = *zap;
+	zfs_handle_t *const *zbp = argb;
+	zfs_handle_t *zb = *zbp;
 	char mounta[MAXPATHLEN];
 	char mountb[MAXPATHLEN];
+	const char *a = mounta;
+	const char *b = mountb;
 	boolean_t gota, gotb;
 
-	if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0)
-		verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta,
+	gota = (zfs_get_type(za) == ZFS_TYPE_FILESYSTEM);
+	if (gota) {
+		verify(zfs_prop_get(za, ZFS_PROP_MOUNTPOINT, mounta,
 		    sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
-	if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0)
-		verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb,
+	}
+	gotb = (zfs_get_type(zb) == ZFS_TYPE_FILESYSTEM);
+	if (gotb) {
+		verify(zfs_prop_get(zb, ZFS_PROP_MOUNTPOINT, mountb,
 		    sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
+	}
 
-	if (gota && gotb)
-		return (strcmp(mounta, mountb));
+	if (gota && gotb) {
+		while (*a != '\0' && (*a == *b)) {
+			a++;
+			b++;
+		}
+		if (*a == *b)
+			return (0);
+		if (*a == '\0')
+			return (-1);
+		if (*b == '\0')
+			return (1);
+		if (*a == '/')
+			return (-1);
+		if (*b == '/')
+			return (1);
+		return (*a < *b ? -1 : *a > *b);
+	}
 
 	if (gota)
 		return (-1);
 	if (gotb)
 		return (1);
 
-	return (strcmp(zfs_get_name(a), zfs_get_name(b)));
+	/*
+	 * If neither filesystem has a mountpoint, revert to sorting by
+	 * dataset name.
+	 */
+	return (strcmp(zfs_get_name(za), zfs_get_name(zb)));
+}
+
+/*
+ * Return true if path2 is a child of path1.
+ */
+static boolean_t
+libzfs_path_contains(const char *path1, const char *path2)
+{
+	return (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/');
+}
+
+/*
+ * Given a mountpoint specified by idx in the handles array, find the first
+ * non-descendent of that mountpoint and return its index. Descendant paths
+ * start with the parent's path. This function relies on the ordering
+ * enforced by mountpoint_cmp().
+ */
+static int
+non_descendant_idx(zfs_handle_t **handles, size_t num_handles, int idx)
+{
+	char parent[ZFS_MAXPROPLEN];
+	char child[ZFS_MAXPROPLEN];
+	int i;
+
+	verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, parent,
+	    sizeof (parent), NULL, NULL, 0, B_FALSE) == 0);
+
+	for (i = idx + 1; i < num_handles; i++) {
+		verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, child,
+		    sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
+		if (!libzfs_path_contains(parent, child))
+			break;
+	}
+	return (i);
+}
+
+typedef struct mnt_param {
+	libzfs_handle_t	*mnt_hdl;
+	zfs_taskq_t	*mnt_tq;
+	zfs_handle_t	**mnt_zhps; /* filesystems to mount */
+	size_t		mnt_num_handles;
+	int		mnt_idx;	/* Index of selected entry to mount */
+	zfs_iter_f	mnt_func;
+	void		*mnt_data;
+} mnt_param_t;
+
+/*
+ * Allocate and populate the parameter struct for mount function, and
+ * schedule mounting of the entry selected by idx.
+ */
+static void
+zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles,
+    size_t num_handles, int idx, zfs_iter_f func, void *data, zfs_taskq_t *tq)
+{
+	mnt_param_t *mnt_param = zfs_alloc(hdl, sizeof (mnt_param_t));
+
+	mnt_param->mnt_hdl = hdl;
+	mnt_param->mnt_tq = tq;
+	mnt_param->mnt_zhps = handles;
+	mnt_param->mnt_num_handles = num_handles;
+	mnt_param->mnt_idx = idx;
+	mnt_param->mnt_func = func;
+	mnt_param->mnt_data = data;
+
+	(void) zfs_taskq_dispatch(tq, zfs_mount_task, (void*)mnt_param,
+	    ZFS_TQ_SLEEP);
+}
+
+/*
+ * This is the structure used to keep state of mounting or sharing operations
+ * during a call to zpool_enable_datasets().
+ */
+typedef struct mount_state {
+	/*
+	 * ms_mntstatus is set to -1 if any mount fails. While multiple threads
+	 * could update this variable concurrently, no synchronization is
+	 * needed as it's only ever set to -1.
+	 */
+	int		ms_mntstatus;
+	int		ms_mntflags;
+	const char	*ms_mntopts;
+} mount_state_t;
+
+static int
+zfs_mount_one(zfs_handle_t *zhp, void *arg)
+{
+	mount_state_t *ms = arg;
+	int ret = 0;
+
+	if (zfs_mount(zhp, ms->ms_mntopts, ms->ms_mntflags) != 0)
+		ret = ms->ms_mntstatus = -1;
+	return (ret);
+}
+
+static int
+zfs_share_one(zfs_handle_t *zhp, void *arg)
+{
+	mount_state_t *ms = arg;
+	int ret = 0;
+
+	if (zfs_share(zhp) != 0)
+		ret = ms->ms_mntstatus = -1;
+	return (ret);
+}
+
+/*
+ * Task queue function to mount one file system. On completion, it finds and
+ * schedules its children to be mounted. This depends on the sorting done in
+ * zfs_foreach_mountpoint(). Note that the degenerate case (chain of entries
+ * each descending from the previous) will have no parallelism since we always
+ * have to wait for the parent to finish mounting before we can schedule
+ * its children.
+ */
+static void
+zfs_mount_task(void *arg)
+{
+	mnt_param_t *mp = arg;
+	int idx = mp->mnt_idx;
+	zfs_handle_t **handles = mp->mnt_zhps;
+	size_t num_handles = mp->mnt_num_handles;
+	char mountpoint[ZFS_MAXPROPLEN];
+
+	verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, mountpoint,
+	    sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
+
+	if (mp->mnt_func(handles[idx], mp->mnt_data) != 0)
+		return;
+
+	/*
+	 * We dispatch tasks to mount filesystems with mountpoints underneath
+	 * this one. We do this by dispatching the next filesystem with a
+	 * descendant mountpoint of the one we just mounted, then skip all of
+	 * its descendants, dispatch the next descendant mountpoint, and so on.
+	 * The non_descendant_idx() function skips over filesystems that are
+	 * descendants of the filesystem we just dispatched.
+	 */
+	for (int i = idx + 1; i < num_handles;
+	    i = non_descendant_idx(handles, num_handles, i)) {
+		char child[ZFS_MAXPROPLEN];
+		verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT,
+		    child, sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
+
+		if (!libzfs_path_contains(mountpoint, child))
+			break; /* not a descendant, return */
+		zfs_dispatch_mount(mp->mnt_hdl, handles, num_handles, i,
+		    mp->mnt_func, mp->mnt_data, mp->mnt_tq);
+	}
+	free(mp);
+}
+
+/*
+ * Issue the func callback for each ZFS handle contained in the handles
+ * array. This function is used to mount all datasets, and so this function
+ * guarantees that filesystems for parent mountpoints are called before their
+ * children. As such, before issuing any callbacks, we first sort the array
+ * of handles by mountpoint.
+ *
+ * Callbacks are issued in one of two ways:
+ *
+ * 1. Sequentially: If the parallel argument is B_FALSE or the ZFS_SERIAL_MOUNT
+ *    environment variable is set, then we issue callbacks sequentially.
+ *
+ * 2. In parallel: If the parallel argument is B_TRUE and the ZFS_SERIAL_MOUNT
+ *    environment variable is not set, then we use a taskq to dispatch threads
+ *    to mount filesystems is parallel. This function dispatches tasks to mount
+ *    the filesystems at the top-level mountpoints, and these tasks in turn
+ *    are responsible for recursively mounting filesystems in their children
+ *    mountpoints.
+ */
+void
+zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles,
+    size_t num_handles, zfs_iter_f func, void *data, boolean_t parallel)
+{
+	/*
+	 * The ZFS_SERIAL_MOUNT environment variable is an undocumented
+	 * variable that can be used as a convenience to do a/b comparison
+	 * of serial vs. parallel mounting.
+	 */
+	boolean_t serial_mount = !parallel ||
+	    (getenv("ZFS_SERIAL_MOUNT") != NULL);
+
+	/*
+	 * Sort the datasets by mountpoint. See mountpoint_cmp for details
+	 * of how these are sorted.
+	 */
+	qsort(handles, num_handles, sizeof (zfs_handle_t *), mountpoint_cmp);
+
+	if (serial_mount) {
+		for (int i = 0; i < num_handles; i++) {
+			func(handles[i], data);
+		}
+		return;
+	}
+
+	/*
+	 * Issue the callback function for each dataset using a parallel
+	 * algorithm that uses a taskq to manage threads.
+	 */
+	zfs_taskq_t *tq = zfs_taskq_create("mount_taskq", mount_tq_nthr, 0,
+	    mount_tq_nthr, mount_tq_nthr, ZFS_TASKQ_PREPOPULATE);
+
+	/*
+	 * There may be multiple "top level" mountpoints outside of the pool's
+	 * root mountpoint, e.g.: /foo /bar. Dispatch a mount task for each of
+	 * these.
+	 */
+	for (int i = 0; i < num_handles;
+	    i = non_descendant_idx(handles, num_handles, i)) {
+		zfs_dispatch_mount(hdl, handles, num_handles, i, func, data,
+		    tq);
+	}
+
+	zfs_taskq_wait(tq); /* wait for all scheduled mounts to complete */
+	zfs_taskq_destroy(tq);
 }
 
 /*
  * Mount and share all datasets within the given pool. This assumes that no
- * datasets within the pool are currently mounted. Because users can create
- * complicated nested hierarchies of mountpoints, we first gather all the
- * datasets and mountpoints within the pool, and sort them by mountpoint. Once
- * we have the list of all filesystems, we iterate over them in order and mount
- * and/or share each one.
+ * datasets within the pool are currently mounted.
  */
 #pragma weak zpool_mount_datasets = zpool_enable_datasets
 int
 zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
 {
 	get_all_cb_t cb = { 0 };
-	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	mount_state_t ms = { 0 };
 	zfs_handle_t *zfsp;
-	int i, ret = -1;
-	int *good;
+	sa_init_selective_arg_t sharearg;
+	int ret = 0;
 
-	/*
-	 * Gather all non-snap datasets within the pool.
-	 */
-	if ((zfsp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) == NULL)
+	if ((zfsp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
+	    ZFS_TYPE_DATASET)) == NULL)
 		goto out;
 
-	libzfs_add_handle(&cb, zfsp);
-	if (zfs_iter_filesystems(zfsp, mount_cb, &cb) != 0)
-		goto out;
-	/*
-	 * Sort the datasets by mountpoint.
-	 */
-	qsort(cb.cb_handles, cb.cb_used, sizeof (void *),
-	    libzfs_dataset_cmp);
 
 	/*
-	 * And mount all the datasets, keeping track of which ones
-	 * succeeded or failed.
+	 * Gather all non-snapshot datasets within the pool. Start by adding
+	 * the root filesystem for this pool to the list, and then iterate
+	 * over all child filesystems.
 	 */
-	if ((good = zfs_alloc(zhp->zpool_hdl,
-	    cb.cb_used * sizeof (int))) == NULL)
+	libzfs_add_handle(&cb, zfsp);
+	if (zfs_iter_filesystems(zfsp, zfs_iter_cb, &cb) != 0)
 		goto out;
 
-	ret = 0;
-	for (i = 0; i < cb.cb_used; i++) {
-		if (zfs_mount(cb.cb_handles[i], mntopts, flags) != 0)
-			ret = -1;
-		else
-			good[i] = 1;
-	}
+	ms.ms_mntopts = mntopts;
+	ms.ms_mntflags = flags;
+	zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
+	    zfs_mount_one, &ms, B_TRUE);
+	if (ms.ms_mntstatus != 0)
+		ret = ms.ms_mntstatus;
 
 	/*
-	 * Then share all the ones that need to be shared. This needs
-	 * to be a separate pass in order to avoid excessive reloading
-	 * of the configuration. Good should never be NULL since
-	 * zfs_alloc is supposed to exit if memory isn't available.
+	 * Share all filesystems that need to be shared. This needs to be
+	 * a separate pass because libshare is not mt-safe, and so we need
+	 * to share serially.
 	 */
-	for (i = 0; i < cb.cb_used; i++) {
-		if (good[i] && zfs_share(cb.cb_handles[i]) != 0)
-			ret = -1;
-	}
+	sharearg.zhandle_arr = cb.cb_handles;
+	sharearg.zhandle_len = cb.cb_used;
+	if ((ret = zfs_init_libshare_arg(zhp->zpool_hdl,
+	    SA_INIT_SHARE_API_SELECTIVE, &sharearg)) != 0)
+		goto out;
 
-	free(good);
+	ms.ms_mntstatus = 0;
+	zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
+	    zfs_share_one, &ms, B_FALSE);
+	if (ms.ms_mntstatus != 0)
+		ret = ms.ms_mntstatus;
 
 out:
-	for (i = 0; i < cb.cb_used; i++)
+	for (int i = 0; i < cb.cb_used; i++)
 		zfs_close(cb.cb_handles[i]);
 	free(cb.cb_handles);
diff --git a/lib/libzfs/common/libzfs_taskq.c b/lib/libzfs/common/libzfs_taskq.c
new file mode 100644
index 000000000000..28bf64971084
--- /dev/null
+++ b/lib/libzfs/common/libzfs_taskq.c
@@ -0,0 +1,297 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
+ * Copyright (c) 2014, 2018 by Delphix. All rights reserved.
+ */
+
+#include <thread.h>
+#include <synch.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+
+#include "libzfs_taskq.h"
+
+#define	ZFS_TASKQ_ACTIVE	0x00010000
+#define	ZFS_TASKQ_NAMELEN	31
+
+typedef struct zfs_taskq_ent {
+	struct zfs_taskq_ent	*ztqent_next;
+	struct zfs_taskq_ent	*ztqent_prev;
+	ztask_func_t		*ztqent_func;
+	void			*ztqent_arg;
+	uintptr_t		ztqent_flags;
+} zfs_taskq_ent_t;
+
+struct zfs_taskq {
+	char		ztq_name[ZFS_TASKQ_NAMELEN + 1];
+	mutex_t		ztq_lock;
+	rwlock_t	ztq_threadlock;
+	cond_t		ztq_dispatch_cv;
+	cond_t		ztq_wait_cv;
+	thread_t	*ztq_threadlist;
+	int		ztq_flags;
+	int		ztq_active;
+	int		ztq_nthreads;
+	int		ztq_nalloc;
+	int		ztq_minalloc;
+	int		ztq_maxalloc;
+	cond_t		ztq_maxalloc_cv;
+	int		ztq_maxalloc_wait;
+	zfs_taskq_ent_t	*ztq_freelist;
+	zfs_taskq_ent_t	ztq_task;
+};
+
+static zfs_taskq_ent_t *
+ztask_alloc(zfs_taskq_t *ztq, int ztqflags)
+{
+	zfs_taskq_ent_t *t;
+	timestruc_t ts;
+	int err;
+
+again:	if ((t = ztq->ztq_freelist) != NULL &&
+	    ztq->ztq_nalloc >= ztq->ztq_minalloc) {
+		ztq->ztq_freelist = t->ztqent_next;
+	} else {
+		if (ztq->ztq_nalloc >= ztq->ztq_maxalloc) {
+			if (!(ztqflags & UMEM_NOFAIL))
+				return (NULL);
+
+			/*
+			 * We don't want to exceed ztq_maxalloc, but we can't
+			 * wait for other tasks to complete (and thus free up
+			 * task structures) without risking deadlock with
+			 * the caller. So, we just delay for one second
+			 * to throttle the allocation rate. If we have tasks
+			 * complete before one second timeout expires then
+			 * zfs_taskq_ent_free will signal us and we will
+			 * immediately retry the allocation.
+			 */
+			ztq->ztq_maxalloc_wait++;
+
+			ts.tv_sec = 1;
+			ts.tv_nsec = 0;
+			err = cond_reltimedwait(&ztq->ztq_maxalloc_cv,
+			    &ztq->ztq_lock, &ts);
+
+			ztq->ztq_maxalloc_wait--;
+			if (err == 0)
+				goto again;		/* signaled */
+		}
+		mutex_exit(&ztq->ztq_lock);
+
+		t = umem_alloc(sizeof (zfs_taskq_ent_t), ztqflags);
+
+		mutex_enter(&ztq->ztq_lock);
+		if (t != NULL)
+			ztq->ztq_nalloc++;
+	}
+	return (t);
+}
+
+static void
+ztask_free(zfs_taskq_t *ztq, zfs_taskq_ent_t *t)
+{
+	if (ztq->ztq_nalloc <= ztq->ztq_minalloc) {
+		t->ztqent_next = ztq->ztq_freelist;
+		ztq->ztq_freelist = t;
+	} else {
+		ztq->ztq_nalloc--;
+		mutex_exit(&ztq->ztq_lock);
+		umem_free(t, sizeof (zfs_taskq_ent_t));
+		mutex_enter(&ztq->ztq_lock);
+	}
+
+	if (ztq->ztq_maxalloc_wait)
+		VERIFY0(cond_signal(&ztq->ztq_maxalloc_cv));
+}
+
+zfs_taskqid_t
+zfs_taskq_dispatch(zfs_taskq_t *ztq, ztask_func_t func, void *arg,
+    uint_t ztqflags)
+{
+	zfs_taskq_ent_t *t;
+
+	mutex_enter(&ztq->ztq_lock);
+	ASSERT(ztq->ztq_flags & ZFS_TASKQ_ACTIVE);
+	if ((t = ztask_alloc(ztq, ztqflags)) == NULL) {
+		mutex_exit(&ztq->ztq_lock);
+		return (0);
+	}
+	if (ztqflags & ZFS_TQ_FRONT) {
+		t->ztqent_next = ztq->ztq_task.ztqent_next;
+		t->ztqent_prev = &ztq->ztq_task;
+	} else {
+		t->ztqent_next = &ztq->ztq_task;
+		t->ztqent_prev = ztq->ztq_task.ztqent_prev;
+	}
+	t->ztqent_next->ztqent_prev = t;
+	t->ztqent_prev->ztqent_next = t;
+	t->ztqent_func = func;
+	t->ztqent_arg = arg;
+	t->ztqent_flags = 0;
+	VERIFY0(cond_signal(&ztq->ztq_dispatch_cv));
+	mutex_exit(&ztq->ztq_lock);
+	return (1);
+}
+
+void
+zfs_taskq_wait(zfs_taskq_t *ztq)
+{
+	mutex_enter(&ztq->ztq_lock);
+	while (ztq->ztq_task.ztqent_next != &ztq->ztq_task ||
+	    ztq->ztq_active != 0) {
+		int ret = cond_wait(&ztq->ztq_wait_cv, &ztq->ztq_lock);
+		VERIFY(ret == 0 || ret == EINTR);
+	}
+	mutex_exit(&ztq->ztq_lock);
+}
+
+static void *
+zfs_taskq_thread(void *arg)
+{
+	zfs_taskq_t *ztq = arg;
+	zfs_taskq_ent_t *t;
+	boolean_t prealloc;
+
+	mutex_enter(&ztq->ztq_lock);
+	while (ztq->ztq_flags & ZFS_TASKQ_ACTIVE) {
+		if ((t = ztq->ztq_task.ztqent_next) == &ztq->ztq_task) {
+			int ret;
+			if (--ztq->ztq_active == 0)
+				VERIFY0(cond_broadcast(&ztq->ztq_wait_cv));
+			ret = cond_wait(&ztq->ztq_dispatch_cv, &ztq->ztq_lock);
+			VERIFY(ret == 0 || ret == EINTR);
+			ztq->ztq_active++;
+			continue;
+		}
+		t->ztqent_prev->ztqent_next = t->ztqent_next;
+		t->ztqent_next->ztqent_prev = t->ztqent_prev;
+		t->ztqent_next = NULL;
+		t->ztqent_prev = NULL;
+		prealloc = t->ztqent_flags & ZFS_TQENT_FLAG_PREALLOC;
+		mutex_exit(&ztq->ztq_lock);
+
+		VERIFY0(rw_rdlock(&ztq->ztq_threadlock));
+		t->ztqent_func(t->ztqent_arg);
+		VERIFY0(rw_unlock(&ztq->ztq_threadlock));
+
+		mutex_enter(&ztq->ztq_lock);
+		if (!prealloc)
+			ztask_free(ztq, t);
+	}
+	ztq->ztq_nthreads--;
+	VERIFY0(cond_broadcast(&ztq->ztq_wait_cv));
+	mutex_exit(&ztq->ztq_lock);
+	return (NULL);
+}
+
+/*ARGSUSED*/
+zfs_taskq_t *
+zfs_taskq_create(const char *name, int nthreads, pri_t pri, int minalloc,
+    int maxalloc, uint_t flags)
+{
+	zfs_taskq_t *ztq = umem_zalloc(sizeof (zfs_taskq_t), UMEM_NOFAIL);
+	int t;
+
+	ASSERT3S(nthreads, >=, 1);
+
+	VERIFY0(rwlock_init(&ztq->ztq_threadlock, USYNC_THREAD, NULL));
+	VERIFY0(cond_init(&ztq->ztq_dispatch_cv, USYNC_THREAD, NULL));
+	VERIFY0(cond_init(&ztq->ztq_wait_cv, USYNC_THREAD, NULL));
+	VERIFY0(cond_init(&ztq->ztq_maxalloc_cv, USYNC_THREAD, NULL));
+	VERIFY0(mutex_init(
+	    &ztq->ztq_lock, LOCK_NORMAL | LOCK_ERRORCHECK, NULL));
+
+	(void) strncpy(ztq->ztq_name, name, ZFS_TASKQ_NAMELEN + 1);
+
+	ztq->ztq_flags = flags | ZFS_TASKQ_ACTIVE;
+	ztq->ztq_active = nthreads;
+	ztq->ztq_nthreads = nthreads;
+	ztq->ztq_minalloc = minalloc;
+	ztq->ztq_maxalloc = maxalloc;
+	ztq->ztq_task.ztqent_next = &ztq->ztq_task;
+	ztq->ztq_task.ztqent_prev = &ztq->ztq_task;
+	ztq->ztq_threadlist =
+	    umem_alloc(nthreads * sizeof (thread_t), UMEM_NOFAIL);
+
+	if (flags & ZFS_TASKQ_PREPOPULATE) {
+		mutex_enter(&ztq->ztq_lock);
+		while (minalloc-- > 0)
+			ztask_free(ztq, ztask_alloc(ztq, UMEM_NOFAIL));
+		mutex_exit(&ztq->ztq_lock);
+	}
+
+	for (t = 0; t < nthreads; t++) {
+		(void) thr_create(0, 0, zfs_taskq_thread,
+		    ztq, THR_BOUND, &ztq->ztq_threadlist[t]);
+	}
+
+	return (ztq);
+}
+
+void
+zfs_taskq_destroy(zfs_taskq_t *ztq)
+{
+	int t;
+	int nthreads = ztq->ztq_nthreads;
+
+	zfs_taskq_wait(ztq);
+
+	mutex_enter(&ztq->ztq_lock);
+
+	ztq->ztq_flags &= ~ZFS_TASKQ_ACTIVE;
+	VERIFY0(cond_broadcast(&ztq->ztq_dispatch_cv));
+
+	while (ztq->ztq_nthreads != 0) {
+		int ret = cond_wait(&ztq->ztq_wait_cv, &ztq->ztq_lock);
+		VERIFY(ret == 0 || ret == EINTR);
+	}
+
+	ztq->ztq_minalloc = 0;
+	while (ztq->ztq_nalloc != 0) {
+		ASSERT(ztq->ztq_freelist != NULL);
+		ztask_free(ztq, ztask_alloc(ztq, UMEM_NOFAIL));
+	}
+
+	mutex_exit(&ztq->ztq_lock);
+
+	for (t = 0; t < nthreads; t++)
+		(void) thr_join(ztq->ztq_threadlist[t], NULL, NULL);
+
+	umem_free(ztq->ztq_threadlist, nthreads * sizeof (thread_t));
+
+	VERIFY0(rwlock_destroy(&ztq->ztq_threadlock));
+	VERIFY0(cond_destroy(&ztq->ztq_dispatch_cv));
+	VERIFY0(cond_destroy(&ztq->ztq_wait_cv));
+	VERIFY0(cond_destroy(&ztq->ztq_maxalloc_cv));
+	VERIFY0(mutex_destroy(&ztq->ztq_lock));
+
+	umem_free(ztq, sizeof (zfs_taskq_t));
+}
diff --git a/lib/libzfs/common/libzfs_taskq.h b/lib/libzfs/common/libzfs_taskq.h
new file mode 100644
index 000000000000..7ac045738c36
--- /dev/null
+++ b/lib/libzfs/common/libzfs_taskq.h
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef	_ZFS_TASKQ_H
+#define	_ZFS_TASKQ_H
+
+#include <stdint.h>
+#include <umem.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct zfs_taskq zfs_taskq_t;
+typedef uintptr_t zfs_taskqid_t;
+typedef void (ztask_func_t)(void *);
+
+#define	ZFS_TQENT_FLAG_PREALLOC	0x1	/* taskq_dispatch_ent used */
+
+#define	ZFS_TASKQ_PREPOPULATE	0x0001
+
+#define	ZFS_TQ_SLEEP	UMEM_NOFAIL	/* Can block for memory */
+#define	ZFS_TQ_NOSLEEP	UMEM_DEFAULT	/* cannot block for memory; may fail */
+#define	ZFS_TQ_FRONT	0x08		/* Queue in front */
+
+extern zfs_taskq_t *zfs_taskq_create(const char *, int, pri_t, int,
+    int, uint_t);
+extern void zfs_taskq_destroy(zfs_taskq_t *);
+
+extern zfs_taskqid_t zfs_taskq_dispatch(zfs_taskq_t *, ztask_func_t,
+    void *, uint_t);
+
+extern void zfs_taskq_wait(zfs_taskq_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZFS_TASKQ_H */
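
For reference, here is a minimal, hypothetical consumer of the libzfs_taskq interface declared above; it is a sketch, not code from this commit. The task function is a stand-in for real work such as mounting one filesystem, and the thread and preallocation counts are arbitrary.

/*
 * Hypothetical demo driver for the libzfs_taskq API
 * (zfs_taskq_create/dispatch/wait/destroy).
 */
#include <stdio.h>
#include "libzfs_taskq.h"

static void
do_task(void *arg)
{
	/* Stand-in for real work, e.g. mounting one filesystem. */
	printf("task %d\n", *(int *)arg);
}

int
main(void)
{
	int ids[8];
	/* 4 worker threads; min/max of 8 preallocated task entries. */
	zfs_taskq_t *tq = zfs_taskq_create("demo_taskq", 4, 0, 8, 8,
	    ZFS_TASKQ_PREPOPULATE);

	for (int i = 0; i < 8; i++) {
		ids[i] = i;
		(void) zfs_taskq_dispatch(tq, do_task, &ids[i],
		    ZFS_TQ_SLEEP);
	}

	zfs_taskq_wait(tq);	/* block until the queue drains */
	zfs_taskq_destroy(tq);
	return (0);
}

This mirrors how zfs_foreach_mountpoint() in libzfs_mount.c drives the queue: create, dispatch one task per top-level mountpoint, wait for the queue to drain, then destroy.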