aboutsummaryrefslogtreecommitdiffstats
path: root/cmd/zed
diff options
context:
space:
mode:
authorMatt Macy <mmacy@FreeBSD.org>2020-08-24 22:48:19 +0000
committerMatt Macy <mmacy@FreeBSD.org>2020-08-24 22:48:19 +0000
commit3b0ce0e28db46d0403929aba45c682285e1ac217 (patch)
tree91721e6e5518bd0d8113dee535898f2225443411 /cmd/zed
downloadsrc-3b0ce0e28db46d0403929aba45c682285e1ac217.tar.gz
src-3b0ce0e28db46d0403929aba45c682285e1ac217.zip
Vendor import of openzfs master @ 184df27eef0abdc7ab2105b21257f753834b936bvendor/openzfs/2.0-rc0-g184df27
Sponsored by: iX Systems, Inc.
Notes
Notes: svn path=/vendor-sys/openzfs/dist/; revision=364736 svn path=/vendor-sys/openzfs/2.0-rc0-g184df27/; revision=364741; tag=vendor/openzfs/2.0-rc0-g184df27
Diffstat (limited to 'cmd/zed')
-rw-r--r--cmd/zed/.gitignore1
-rw-r--r--cmd/zed/Makefile.am49
-rw-r--r--cmd/zed/agents/README.md112
-rw-r--r--cmd/zed/agents/fmd_api.c760
-rw-r--r--cmd/zed/agents/fmd_api.h246
-rw-r--r--cmd/zed/agents/fmd_serd.c316
-rw-r--r--cmd/zed/agents/fmd_serd.h86
-rw-r--r--cmd/zed/agents/zfs_agents.c422
-rw-r--r--cmd/zed/agents/zfs_agents.h46
-rw-r--r--cmd/zed/agents/zfs_diagnosis.c981
-rw-r--r--cmd/zed/agents/zfs_mod.c956
-rw-r--r--cmd/zed/agents/zfs_retire.c557
-rw-r--r--cmd/zed/zed.c306
-rw-r--r--cmd/zed/zed.d/.gitignore1
-rw-r--r--cmd/zed/zed.d/Makefile.am53
-rw-r--r--cmd/zed/zed.d/README30
-rwxr-xr-xcmd/zed/zed.d/all-debug.sh26
-rwxr-xr-xcmd/zed/zed.d/all-syslog.sh14
-rwxr-xr-xcmd/zed/zed.d/data-notify.sh43
-rwxr-xr-xcmd/zed/zed.d/generic-notify.sh54
-rwxr-xr-xcmd/zed/zed.d/history_event-zfs-list-cacher.sh.in85
l---------cmd/zed/zed.d/pool_import-led.sh1
l---------cmd/zed/zed.d/resilver_finish-notify.sh1
-rwxr-xr-xcmd/zed/zed.d/resilver_finish-start-scrub.sh19
-rwxr-xr-xcmd/zed/zed.d/scrub_finish-notify.sh59
-rwxr-xr-xcmd/zed/zed.d/statechange-led.sh177
-rwxr-xr-xcmd/zed/zed.d/statechange-notify.sh74
-rwxr-xr-xcmd/zed/zed.d/trim_finish-notify.sh37
l---------cmd/zed/zed.d/vdev_attach-led.sh1
l---------cmd/zed/zed.d/vdev_clear-led.sh1
-rwxr-xr-xcmd/zed/zed.d/zed-functions.sh538
-rw-r--r--cmd/zed/zed.d/zed.rc122
-rw-r--r--cmd/zed/zed.h58
-rw-r--r--cmd/zed/zed_conf.c735
-rw-r--r--cmd/zed/zed_conf.h62
-rw-r--r--cmd/zed/zed_disk_event.c416
-rw-r--r--cmd/zed/zed_disk_event.h31
-rw-r--r--cmd/zed/zed_event.c965
-rw-r--r--cmd/zed/zed_event.h29
-rw-r--r--cmd/zed/zed_exec.c232
-rw-r--r--cmd/zed/zed_exec.h25
-rw-r--r--cmd/zed/zed_file.c217
-rw-r--r--cmd/zed/zed_file.h35
-rw-r--r--cmd/zed/zed_log.c256
-rw-r--r--cmd/zed/zed_log.h44
-rw-r--r--cmd/zed/zed_strings.c247
-rw-r--r--cmd/zed/zed_strings.h27
47 files changed, 9553 insertions, 0 deletions
diff --git a/cmd/zed/.gitignore b/cmd/zed/.gitignore
new file mode 100644
index 000000000000..76557bb6bb3a
--- /dev/null
+++ b/cmd/zed/.gitignore
@@ -0,0 +1 @@
+/zed
diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am
new file mode 100644
index 000000000000..4bd8ac4a53e6
--- /dev/null
+++ b/cmd/zed/Makefile.am
@@ -0,0 +1,49 @@
+# Automake fragment for the ZFS Event Daemon (zed).
+include $(top_srcdir)/config/Rules.am
+
+AM_CFLAGS += $(LIBUDEV_CFLAGS) $(LIBUUID_CFLAGS)
+
+SUBDIRS = zed.d
+
+sbin_PROGRAMS = zed
+
+# Core daemon sources.
+ZED_SRC = \
+ zed.c \
+ zed.h \
+ zed_conf.c \
+ zed_conf.h \
+ zed_disk_event.c \
+ zed_disk_event.h \
+ zed_event.c \
+ zed_event.h \
+ zed_exec.c \
+ zed_exec.h \
+ zed_file.c \
+ zed_file.h \
+ zed_log.c \
+ zed_log.h \
+ zed_strings.c \
+ zed_strings.h
+
+# Fault-management (FMD) agent modules ported from illumos;
+# see agents/README.md for background.
+FMA_SRC = \
+ agents/zfs_agents.c \
+ agents/zfs_agents.h \
+ agents/zfs_diagnosis.c \
+ agents/zfs_mod.c \
+ agents/zfs_retire.c \
+ agents/fmd_api.c \
+ agents/fmd_api.h \
+ agents/fmd_serd.c \
+ agents/fmd_serd.h
+
+zed_SOURCES = $(ZED_SRC) $(FMA_SRC)
+
+zed_LDADD = \
+ $(abs_top_builddir)/lib/libzfs/libzfs.la \
+ $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+ $(abs_top_builddir)/lib/libnvpair/libnvpair.la \
+ $(abs_top_builddir)/lib/libuutil/libuutil.la
+
+# librt for POSIX timers (timer_create); udev/uuid for disk events and case ids.
+zed_LDADD += -lrt $(LIBUDEV_LIBS) $(LIBUUID_LIBS)
+zed_LDFLAGS = -pthread
+
+EXTRA_DIST = agents/README.md
diff --git a/cmd/zed/agents/README.md b/cmd/zed/agents/README.md
new file mode 100644
index 000000000000..e35b97668a9d
--- /dev/null
+++ b/cmd/zed/agents/README.md
@@ -0,0 +1,112 @@
+## Fault Management Logic for ZED ##
+
+The integration of Fault Management Daemon (FMD) logic from illumos
+is being deployed in three phases. This logic is encapsulated in
+several software modules inside ZED.
+
+### ZED+FM Phase 1 ###
+
+All the phase 1 work is in the current master branch. Phase 1 work includes:
+
+* Add new paths to the persistent VDEV label for device matching.
+* Add a disk monitor for generating _disk-add_ and _disk-change_ events.
+* Add support for automated VDEV auto-online, auto-replace and auto-expand.
+* Expand the statechange event to include all VDEV state transitions.
+
+### ZED+FM Phase 2 (WIP) ###
+
+The phase 2 work primarily entails the _Diagnosis Engine_ and the
+_Retire Agent_ modules. It also includes infrastructure to support a
+crude FMD environment to host these modules. For additional
+information see the **FMD Components in ZED** and **Implementation
+Notes** sections below.
+
+### ZED+FM Phase 3 ###
+
+Future work will add additional functionality and will likely include:
+
+* Add FMD module garbage collection (periodically call `fmd_module_gc()`).
+* Add real module property retrieval (currently hard-coded in accessors).
+* Additional diagnosis telemetry (like latency outliers and SMART data).
+* Export FMD module statistics.
+* Zedlet parallel execution and resiliency (add watchdog).
+
+### ZFS Fault Management Overview ###
+
+The primary purpose of ZFS fault management is the automated diagnosis
+and isolation of VDEV faults. A fault is something we can associate
+with an impact (e.g. loss of data redundancy) and a corrective action
+(e.g. offline or replace a disk). A typical ZFS fault management stack
+is comprised of _error detectors_ (e.g. `zfs_ereport_post()`), a _disk
+monitor_, a _diagnosis engine_ and _response agents_.
+
+After detecting a software error, the ZFS kernel module sends error
+events to the ZED user daemon which in turn routes the events to its
+internal FMA modules based on their event subscriptions. Likewise, if
+a disk is added or changed in the system, the disk monitor sends disk
+events which are consumed by a response agent.
+
+### FMD Components in ZED ###
+
+There are three FMD modules (aka agents) that are now built into ZED.
+
+ 1. A _Diagnosis Engine_ module (`agents/zfs_diagnosis.c`)
+ 2. A _Retire Agent_ module (`agents/zfs_retire.c`)
+ 3. A _Disk Add Agent_ module (`agents/zfs_mod.c`)
+
+To begin with, a **Diagnosis Engine** consumes per-vdev I/O and checksum
+ereports and feeds them into a Soft Error Rate Discrimination (SERD)
+algorithm which will generate a corresponding fault diagnosis when the
+tracked VDEV encounters **N** events in a given **T** time window. The
+initial N and T values for the SERD algorithm are estimates inherited
+from illumos (10 errors in 10 minutes).
+
+In turn, a **Retire Agent** responds to diagnosed faults by isolating
+the faulty VDEV. It will notify the ZFS kernel module of the new VDEV
+state (degraded or faulted). The retire agent is also responsible for
+managing hot spares across all pools. When it encounters a device fault
+or a device removal it will replace the device with an appropriate
+spare if available.
+
+Finally, a **Disk Add Agent** responds to events from a libudev disk
+monitor (`EC_DEV_ADD` or `EC_DEV_STATUS`) and will online, replace or
+expand the associated VDEV. This agent is also known as the `zfs_mod`
+or Sysevent Loadable Module (SLM) on the illumos platform. The added
+disk is matched to a specific VDEV using its device id, physical path
+or VDEV GUID.
+
+Note that the _auto-replace_ feature (aka hot plug) is opt-in and you
+must set the pool's `autoreplace` property to enable it. The new disk
+will be matched to the corresponding leaf VDEV by physical location
+and labeled with a GPT partition before replacing the original VDEV
+in the pool.
+
+### Implementation Notes ###
+
+* The FMD module API required for logic modules is emulated and implemented
+ in the `fmd_api.c` and `fmd_serd.c` source files. This support includes
+ module registration, memory allocation, module property accessors, basic
+ case management, one-shot timers and SERD engines.
+ For detailed information on the FMD module API, see the document --
+ _"Fault Management Daemon Programmer's Reference Manual"_.
+
+* The event subscriptions for the modules (located in a module specific
+ configuration file on illumos) are currently hard-coded into the ZED
+ `zfs_agent_dispatch()` function.
+
+* The FMD modules are called one at a time from a single thread that
+ consumes events queued to the modules. These events are sourced from
+ the normal ZED events and also include events posted from the diagnosis
+ engine and the libudev disk event monitor.
+
+* The FMD code modules have minimal changes and were intentionally left
+ as similar as possible to their upstream source files.
+
+* The sysevent namespace in ZED differs from illumos. For example:
+ * illumos uses `"resource.sysevent.EC_zfs.ESC_ZFS_vdev_remove"`
+ * Linux uses `"sysevent.fs.zfs.vdev_remove"`
+
+* The FMD Modules port was produced by Intel Federal, LLC under award
+ number B609815 between the U.S. Department of Energy (DOE) and Intel
+ Federal, LLC.
+
diff --git a/cmd/zed/agents/fmd_api.c b/cmd/zed/agents/fmd_api.c
new file mode 100644
index 000000000000..607b387ca3a8
--- /dev/null
+++ b/cmd/zed/agents/fmd_api.c
@@ -0,0 +1,760 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+/*
+ * This file implements the minimal FMD module API required to support the
+ * fault logic modules in ZED. This support includes module registration,
+ * memory allocation, module property accessors, basic case management,
+ * one-shot timers and SERD engines.
+ *
+ * In the ZED runtime, the modules are called from a single thread so no
+ * locking is required in this emulated FMD environment.
+ */
+
+#include <sys/types.h>
+#include <sys/fm/protocol.h>
+#include <uuid/uuid.h>
+#include <signal.h>
+#include <strings.h>
+#include <time.h>
+
+#include "fmd_api.h"
+#include "fmd_serd.h"
+
+#include "zfs_agents.h"
+#include "../zed_log.h"
+
+/* Built-in statistics kept for every emulated FMD module. */
+typedef struct fmd_modstat {
+ fmd_stat_t ms_accepted; /* total events accepted by module */
+ fmd_stat_t ms_caseopen; /* cases currently open */
+ fmd_stat_t ms_casesolved; /* total cases solved by module */
+ fmd_stat_t ms_caseclosed; /* total cases closed by module */
+} fmd_modstat_t;
+
+/*
+ * Per-module state. The opaque fmd_hdl_t handles passed through this API
+ * are cast to this type by every accessor below.
+ */
+typedef struct fmd_module {
+ const char *mod_name; /* basename of module (ro) */
+ const fmd_hdl_info_t *mod_info; /* module info registered with handle */
+ void *mod_spec; /* fmd_hdl_get/setspecific data value */
+ fmd_stat_t *mod_ustat; /* module specific custom stats */
+ uint_t mod_ustat_cnt; /* count of ustat stats */
+ fmd_modstat_t mod_stats; /* fmd built-in per-module statistics */
+ fmd_serd_hash_t mod_serds; /* hash of serd engs owned by module */
+ char *mod_vers; /* a copy of module version string */
+} fmd_module_t;
+
+/*
+ * ZED has two FMD hardwired module instances
+ */
+fmd_module_t zfs_retire_module;
+fmd_module_t zfs_diagnosis_module;
+
+/*
+ * Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
+ * NOTE(review): presumably these hooks are picked up by libumem at startup
+ * in place of the $UMEM_DEBUG / $UMEM_LOGGING environment settings --
+ * confirm against the libumem documentation.
+ */
+
+#ifdef DEBUG
+const char *
+_umem_debug_init(void)
+{
+ return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+ return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+#endif
+
+/*
+ * Register a module with fmd and finish module initialization.
+ * Returns an integer indicating whether it succeeded (zero) or
+ * failed (non-zero). In this emulation registration always succeeds
+ * and zero is returned; the 'version' argument is accepted but not
+ * checked here.
+ */
+int
+fmd_hdl_register(fmd_hdl_t *hdl, int version, const fmd_hdl_info_t *mip)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ mp->mod_info = mip;
+ mp->mod_name = mip->fmdi_desc + 4; /* drop 'ZFS ' prefix */
+ mp->mod_spec = NULL;
+
+ /* bare minimum module stats */
+ /* NOTE(review): strcpy assumes fmds_name can hold these literals --
+  * confirm the fmd_stat_t field size (declared outside this chunk). */
+ (void) strcpy(mp->mod_stats.ms_accepted.fmds_name, "fmd.accepted");
+ (void) strcpy(mp->mod_stats.ms_caseopen.fmds_name, "fmd.caseopen");
+ (void) strcpy(mp->mod_stats.ms_casesolved.fmds_name, "fmd.casesolved");
+ (void) strcpy(mp->mod_stats.ms_caseclosed.fmds_name, "fmd.caseclosed");
+
+ fmd_serd_hash_create(&mp->mod_serds);
+
+ fmd_hdl_debug(hdl, "register module");
+
+ return (0);
+}
+
+/*
+ * Tear down a module: dump its statistics to the ZED log and destroy its
+ * SERD engines. Case-related stats are only dumped for modules that
+ * implement an fmdo_close entry point.
+ */
+void
+fmd_hdl_unregister(fmd_hdl_t *hdl)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ fmd_modstat_t *msp = &mp->mod_stats;
+ const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
+
+ /* dump generic module stats */
+ fmd_hdl_debug(hdl, "%s: %llu", msp->ms_accepted.fmds_name,
+ msp->ms_accepted.fmds_value.ui64);
+ if (ops->fmdo_close != NULL) {
+ fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseopen.fmds_name,
+ msp->ms_caseopen.fmds_value.ui64);
+ fmd_hdl_debug(hdl, "%s: %llu", msp->ms_casesolved.fmds_name,
+ msp->ms_casesolved.fmds_value.ui64);
+ fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseclosed.fmds_name,
+ msp->ms_caseclosed.fmds_value.ui64);
+ }
+
+ /* dump module specific stats */
+ if (mp->mod_ustat != NULL) {
+ int i;
+
+ for (i = 0; i < mp->mod_ustat_cnt; i++) {
+ fmd_hdl_debug(hdl, "%s: %llu",
+ mp->mod_ustat[i].fmds_name,
+ mp->mod_ustat[i].fmds_value.ui64);
+ }
+ }
+
+ fmd_serd_hash_destroy(&mp->mod_serds);
+
+ fmd_hdl_debug(hdl, "unregister module");
+}
+
+/*
+ * fmd_hdl_setspecific() is used to associate a data pointer with
+ * the specified handle for the duration of the module's lifetime.
+ * This pointer can be retrieved using fmd_hdl_getspecific().
+ */
+void
+fmd_hdl_setspecific(fmd_hdl_t *hdl, void *spec)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ mp->mod_spec = spec;
+}
+
+/*
+ * Return the module-specific data pointer previously associated
+ * with the handle using fmd_hdl_setspecific().
+ */
+void *
+fmd_hdl_getspecific(fmd_hdl_t *hdl)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ return (mp->mod_spec);
+}
+
+/* Allocate uninitialized memory; thin wrapper over umem_alloc(). */
+void *
+fmd_hdl_alloc(fmd_hdl_t *hdl, size_t size, int flags)
+{
+ return (umem_alloc(size, flags));
+}
+
+/* Allocate zero-filled memory via umem_zalloc(). */
+void *
+fmd_hdl_zalloc(fmd_hdl_t *hdl, size_t size, int flags)
+{
+ return (umem_zalloc(size, flags));
+}
+
+/*
+ * Free memory obtained from fmd_hdl_alloc()/fmd_hdl_zalloc(); the
+ * original allocation size must be passed (umem requirement).
+ */
+void
+fmd_hdl_free(fmd_hdl_t *hdl, void *data, size_t size)
+{
+ umem_free(data, size);
+}
+
+/*
+ * Record a module debug message using the specified format.
+ * The formatted message is truncated to sizeof (message) - 1 bytes by
+ * vsnprintf() and is logged at LOG_INFO prefixed with the module name.
+ */
+void
+fmd_hdl_debug(fmd_hdl_t *hdl, const char *format, ...)
+{
+ char message[256];
+ va_list vargs;
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ va_start(vargs, format);
+ (void) vsnprintf(message, sizeof (message), format, vargs);
+ va_end(vargs);
+
+ /* prefix message with module name */
+ zed_log_msg(LOG_INFO, "%s: %s", mp->mod_name, message);
+}
+
+/* Property Retrieval */
+
+int32_t
+fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name)
+{
+ /*
+ * These can be looked up in mp->modinfo->fmdi_props
+ * For now we just hard code for phase 2. In the
+ * future, there can be a ZED based override.
+ * Unknown property names yield 0.
+ */
+ if (strcmp(name, "spare_on_remove") == 0)
+ return (1);
+
+ if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0)
+ return (10); /* N = 10 events */
+
+ return (0);
+}
+
+int64_t
+fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name)
+{
+ /*
+ * These can be looked up in mp->modinfo->fmdi_props
+ * For now we just hard code for phase 2. In the
+ * future, there can be a ZED based override.
+ * Time values are in nanoseconds; unknown names yield 0.
+ */
+ if (strcmp(name, "remove_timeout") == 0)
+ return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */
+
+ if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0)
+ return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */
+
+ return (0);
+}
+
+/* FMD Statistics */
+
+/*
+ * Register module-specific statistics. Only FMD_STAT_NOALLOC is honored:
+ * the caller's statv array is recorded as-is (no copy is made), so it must
+ * stay valid for the module's lifetime.
+ */
+fmd_stat_t *
+fmd_stat_create(fmd_hdl_t *hdl, uint_t flags, uint_t nstats, fmd_stat_t *statv)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ if (flags == FMD_STAT_NOALLOC) {
+ mp->mod_ustat = statv;
+ mp->mod_ustat_cnt = nstats;
+ }
+
+ return (statv);
+}
+
+/* Case Management */
+
+/*
+ * Open a new case for this module. A fresh uuid string is generated into
+ * ci_uuid and the caller's data pointer is stored in ci_data (also
+ * settable later via fmd_case_setspecific()).
+ */
+fmd_case_t *
+fmd_case_open(fmd_hdl_t *hdl, void *data)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ uuid_t uuid;
+
+ fmd_case_t *cp;
+
+ cp = fmd_hdl_zalloc(hdl, sizeof (fmd_case_t), FMD_SLEEP);
+ cp->ci_mod = hdl;
+ cp->ci_state = FMD_CASE_UNSOLVED;
+ cp->ci_flags = FMD_CF_DIRTY;
+ cp->ci_data = data;
+ cp->ci_bufptr = NULL;
+ cp->ci_bufsiz = 0;
+
+ uuid_generate(uuid);
+ uuid_unparse(uuid, cp->ci_uuid);
+
+ fmd_hdl_debug(hdl, "case opened (%s)", cp->ci_uuid);
+ mp->mod_stats.ms_caseopen.fmds_value.ui64++;
+
+ return (cp);
+}
+
+/*
+ * Mark a case solved. For ZED the suspect list was already posted by
+ * fmd_case_add_suspect(), so this only updates the case state and stats.
+ */
+void
+fmd_case_solve(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ /*
+ * For ZED, the event was already sent from fmd_case_add_suspect()
+ */
+
+ if (cp->ci_state >= FMD_CASE_SOLVED)
+ fmd_hdl_debug(hdl, "case is already solved or closed");
+
+ cp->ci_state = FMD_CASE_SOLVED;
+
+ fmd_hdl_debug(hdl, "case solved (%s)", cp->ci_uuid);
+ mp->mod_stats.ms_casesolved.fmds_value.ui64++;
+}
+
+/*
+ * Close and free a case. The module's fmdo_close entry point (if any) is
+ * invoked first; then the case serialization buffer and the case itself
+ * are released. The caseopen counter is only decremented here.
+ */
+void
+fmd_case_close(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
+
+ fmd_hdl_debug(hdl, "case closed (%s)", cp->ci_uuid);
+
+ if (ops->fmdo_close != NULL)
+ ops->fmdo_close(hdl, cp);
+
+ mp->mod_stats.ms_caseopen.fmds_value.ui64--;
+ mp->mod_stats.ms_caseclosed.fmds_value.ui64++;
+
+ if (cp->ci_bufptr != NULL && cp->ci_bufsiz > 0)
+ fmd_hdl_free(hdl, cp->ci_bufptr, cp->ci_bufsiz);
+
+ fmd_hdl_free(hdl, cp, sizeof (fmd_case_t));
+}
+
+/* Log-only stub: case resolution by uuid is not tracked in this emulation. */
+void
+fmd_case_uuresolved(fmd_hdl_t *hdl, const char *uuid)
+{
+ fmd_hdl_debug(hdl, "case resolved by uuid (%s)", uuid);
+}
+
+/* Return FMD_B_TRUE once the case has reached at least the SOLVED state. */
+int
+fmd_case_solved(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+ return ((cp->ci_state >= FMD_CASE_SOLVED) ? FMD_B_TRUE : FMD_B_FALSE);
+}
+
+/* No-op: ereports are not attached to cases in this emulation. */
+void
+fmd_case_add_ereport(fmd_hdl_t *hdl, fmd_case_t *cp, fmd_event_t *ep)
+{
+}
+
+/*
+ * Pretty-print the interesting fields of a fault nvlist to the ZED log.
+ * NOTE(review): the %llu conversions below receive uint8_t and uint64_t
+ * arguments; varargs promotes the uint8_t to int, and uint64_t is not
+ * necessarily unsigned long long, so these format specifiers are
+ * mismatched (undefined behavior per C99 7.19.6.1). Consider casting the
+ * arguments to (unsigned long long).
+ */
+static void
+zed_log_fault(nvlist_t *nvl, const char *uuid, const char *code)
+{
+ nvlist_t *rsrc;
+ char *strval;
+ uint64_t guid;
+ uint8_t byte;
+
+ zed_log_msg(LOG_INFO, "\nzed_fault_event:");
+
+ if (uuid != NULL)
+ zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_UUID, uuid);
+ if (nvlist_lookup_string(nvl, FM_CLASS, &strval) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %s", FM_CLASS, strval);
+ if (code != NULL)
+ zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_DIAG_CODE, code);
+ if (nvlist_lookup_uint8(nvl, FM_FAULT_CERTAINTY, &byte) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FAULT_CERTAINTY, byte);
+ if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) {
+ if (nvlist_lookup_string(rsrc, FM_FMRI_SCHEME, &strval) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %s", FM_FMRI_SCHEME,
+ strval);
+ if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_POOL, &guid) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FMRI_ZFS_POOL,
+ guid);
+ if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_VDEV, &guid) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %llu \n", FM_FMRI_ZFS_VDEV,
+ guid);
+ }
+}
+
+/* Map a fault class string to its ZFS diagnosis message code; "-" if unknown. */
+static const char *
+fmd_fault_mkcode(nvlist_t *fault)
+{
+ char *class, *code = "-";
+
+ /*
+ * Note: message codes come from: openzfs/usr/src/cmd/fm/dicts/ZFS.po
+ */
+ if (nvlist_lookup_string(fault, FM_CLASS, &class) == 0) {
+ if (strcmp(class, "fault.fs.zfs.vdev.io") == 0)
+ code = "ZFS-8000-FD";
+ else if (strcmp(class, "fault.fs.zfs.vdev.checksum") == 0)
+ code = "ZFS-8000-GH";
+ else if (strcmp(class, "fault.fs.zfs.io_failure_wait") == 0)
+ code = "ZFS-8000-HC";
+ else if (strcmp(class, "fault.fs.zfs.io_failure_continue") == 0)
+ code = "ZFS-8000-JQ";
+ else if (strcmp(class, "fault.fs.zfs.log_replay") == 0)
+ code = "ZFS-8000-K4";
+ else if (strcmp(class, "fault.fs.zfs.pool") == 0)
+ code = "ZFS-8000-CS";
+ else if (strcmp(class, "fault.fs.zfs.device") == 0)
+ code = "ZFS-8000-D3";
+
+ }
+ return (code);
+}
+
+/*
+ * Build a list.suspect event for this case and post it to the agent event
+ * queue. Stamps ci_tv with the diagnosis time. Consumes (frees) both the
+ * constructed nvlist and the caller's fault nvlist on return.
+ */
+void
+fmd_case_add_suspect(fmd_hdl_t *hdl, fmd_case_t *cp, nvlist_t *fault)
+{
+ nvlist_t *nvl;
+ const char *code = fmd_fault_mkcode(fault);
+ int64_t tod[2];
+ int err = 0;
+
+ /*
+ * payload derived from fmd_protocol_list()
+ */
+
+ (void) gettimeofday(&cp->ci_tv, NULL);
+ tod[0] = cp->ci_tv.tv_sec;
+ tod[1] = cp->ci_tv.tv_usec;
+
+ nvl = fmd_nvl_alloc(hdl, FMD_SLEEP);
+
+ err |= nvlist_add_uint8(nvl, FM_VERSION, FM_SUSPECT_VERSION);
+ err |= nvlist_add_string(nvl, FM_CLASS, FM_LIST_SUSPECT_CLASS);
+ err |= nvlist_add_string(nvl, FM_SUSPECT_UUID, cp->ci_uuid);
+ err |= nvlist_add_string(nvl, FM_SUSPECT_DIAG_CODE, code);
+ err |= nvlist_add_int64_array(nvl, FM_SUSPECT_DIAG_TIME, tod, 2);
+ err |= nvlist_add_uint32(nvl, FM_SUSPECT_FAULT_SZ, 1);
+ err |= nvlist_add_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, &fault, 1);
+
+ if (err)
+ zed_log_die("failed to populate nvlist");
+
+ zed_log_fault(fault, cp->ci_uuid, code);
+ zfs_agent_post_event(FM_LIST_SUSPECT_CLASS, NULL, nvl);
+
+ nvlist_free(nvl);
+ nvlist_free(fault);
+}
+
+/* Attach an arbitrary data pointer to the case (see fmd_case_getspecific). */
+void
+fmd_case_setspecific(fmd_hdl_t *hdl, fmd_case_t *cp, void *data)
+{
+ cp->ci_data = data;
+}
+
+/* Retrieve the pointer stored by fmd_case_setspecific()/fmd_case_open(). */
+void *
+fmd_case_getspecific(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+ return (cp->ci_data);
+}
+
+/*
+ * Create the per-case serialization buffer. Only a single buffer, named
+ * "data" and smaller than 1 MiB, is supported (enforced by the asserts).
+ * The buffer contents are uninitialized until fmd_buf_write().
+ */
+void
+fmd_buf_create(fmd_hdl_t *hdl, fmd_case_t *cp, const char *name, size_t size)
+{
+ assert(strcmp(name, "data") == 0);
+ assert(cp->ci_bufptr == NULL);
+ assert(size < (1024 * 1024));
+
+ cp->ci_bufptr = fmd_hdl_alloc(hdl, size, FMD_SLEEP);
+ cp->ci_bufsiz = size;
+}
+
+/* Copy up to 'size' bytes out of the case buffer into 'buf'. */
+void
+fmd_buf_read(fmd_hdl_t *hdl, fmd_case_t *cp,
+ const char *name, void *buf, size_t size)
+{
+ assert(strcmp(name, "data") == 0);
+ assert(cp->ci_bufptr != NULL);
+ assert(size <= cp->ci_bufsiz);
+
+ bcopy(cp->ci_bufptr, buf, size);
+}
+
+/* Copy 'size' bytes from 'buf' into the case buffer (bounds asserted). */
+void
+fmd_buf_write(fmd_hdl_t *hdl, fmd_case_t *cp,
+ const char *name, const void *buf, size_t size)
+{
+ assert(strcmp(name, "data") == 0);
+ assert(cp->ci_bufptr != NULL);
+ assert(cp->ci_bufsiz >= size);
+
+ bcopy(buf, cp->ci_bufptr, size);
+}
+
+/* SERD Engines */
+
+/*
+ * Create a named SERD engine (N events within a T-nanosecond window) owned
+ * by this module. If the name already exists an error is logged and the
+ * existing engine is left untouched.
+ */
+void
+fmd_serd_create(fmd_hdl_t *hdl, const char *name, uint_t n, hrtime_t t)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ if (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL) {
+ zed_log_msg(LOG_ERR, "failed to create SERD engine '%s': "
+ " name already exists", name);
+ return;
+ }
+
+ (void) fmd_serd_eng_insert(&mp->mod_serds, name, n, t);
+}
+
+/* Remove and destroy the named SERD engine. */
+void
+fmd_serd_destroy(fmd_hdl_t *hdl, const char *name)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ fmd_serd_eng_delete(&mp->mod_serds, name);
+
+ fmd_hdl_debug(hdl, "serd_destroy %s", name);
+}
+
+/* Return nonzero when a SERD engine with this name exists for the module. */
+int
+fmd_serd_exists(fmd_hdl_t *hdl, const char *name)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL);
+}
+
+/* Reset the named engine's event history; logs an error if it is missing. */
+void
+fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ fmd_serd_eng_t *sgp;
+
+ if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
+ zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
+ return;
+ }
+
+ fmd_serd_eng_reset(sgp);
+
+ fmd_hdl_debug(hdl, "serd_reset %s", name);
+}
+
+/*
+ * Feed one event (timestamped by ep->ev_hrt) into the named SERD engine.
+ * Returns FMD_B_FALSE if no such engine exists; otherwise returns the
+ * result of fmd_serd_eng_record() (presumably nonzero when the engine
+ * fires -- confirm in fmd_serd.c).
+ */
+int
+fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ fmd_serd_eng_t *sgp;
+ int err;
+
+ if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
+ zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'",
+ name);
+ return (FMD_B_FALSE);
+ }
+ err = fmd_serd_eng_record(sgp, ep->ev_hrt);
+
+ return (err);
+}
+
+/* FMD Timers */
+
+/*
+ * SIGEV_THREAD notification callback: runs on a timer notification thread.
+ * The timer is disarmed here before the module's fmdo_timeout entry point
+ * is invoked, making the timer effectively one-shot even though
+ * fmd_timer_install() also sets it_interval.
+ * NOTE(review): ft_tid is printed with %p, but timer_t is not required by
+ * POSIX to be a pointer type -- confirm portability.
+ */
+static void
+_timer_notify(union sigval sv)
+{
+ fmd_timer_t *ftp = sv.sival_ptr;
+ fmd_hdl_t *hdl = ftp->ft_hdl;
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
+ struct itimerspec its;
+
+ fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid);
+
+ /* disarm the timer */
+ bzero(&its, sizeof (struct itimerspec));
+ timer_settime(ftp->ft_tid, 0, &its, NULL);
+
+ /* Note that the fmdo_timeout can remove this timer */
+ if (ops->fmdo_timeout != NULL)
+ ops->fmdo_timeout(hdl, ftp, ftp->ft_arg);
+}
+
+/*
+ * Install a new timer which will fire at least delta nanoseconds after the
+ * current time. After the timeout has expired, the module's fmdo_timeout
+ * entry point is called.
+ * NOTE(review): the return values of timer_create() and timer_settime()
+ * are not checked; a failed create leaves ft_tid indeterminate.
+ */
+fmd_timer_t *
+fmd_timer_install(fmd_hdl_t *hdl, void *arg, fmd_event_t *ep, hrtime_t delta)
+{
+ struct sigevent sev;
+ struct itimerspec its;
+ fmd_timer_t *ftp;
+
+ ftp = fmd_hdl_alloc(hdl, sizeof (fmd_timer_t), FMD_SLEEP);
+ ftp->ft_arg = arg;
+ ftp->ft_hdl = hdl;
+
+ its.it_value.tv_sec = delta / 1000000000;
+ its.it_value.tv_nsec = delta % 1000000000;
+ its.it_interval.tv_sec = its.it_value.tv_sec;
+ its.it_interval.tv_nsec = its.it_value.tv_nsec;
+
+ sev.sigev_notify = SIGEV_THREAD;
+ sev.sigev_notify_function = _timer_notify;
+ sev.sigev_notify_attributes = NULL;
+ sev.sigev_value.sival_ptr = ftp;
+
+ timer_create(CLOCK_REALTIME, &sev, &ftp->ft_tid);
+ timer_settime(ftp->ft_tid, 0, &its, NULL);
+
+ fmd_hdl_debug(hdl, "installing timer for %d secs (%p)",
+ (int)its.it_value.tv_sec, ftp->ft_tid);
+
+ return (ftp);
+}
+
+/* Delete the underlying POSIX timer and free the fmd_timer_t. */
+void
+fmd_timer_remove(fmd_hdl_t *hdl, fmd_timer_t *ftp)
+{
+ fmd_hdl_debug(hdl, "removing timer (%p)", ftp->ft_tid);
+
+ timer_delete(ftp->ft_tid);
+
+ fmd_hdl_free(hdl, ftp, sizeof (fmd_timer_t));
+}
+
+/* Name-Value Pair Lists */
+
+/*
+ * Allocate and populate a fault nvlist: class, certainty and the optional
+ * asru/fru/resource sub-lists. Aborts via zed_log_die() on any failure.
+ * NOTE(review): err is a bitwise OR of nvlist_add_*() return codes, so
+ * strerror(err) may not correspond to any single real error value.
+ */
+nvlist_t *
+fmd_nvl_create_fault(fmd_hdl_t *hdl, const char *class, uint8_t certainty,
+ nvlist_t *asru, nvlist_t *fru, nvlist_t *resource)
+{
+ nvlist_t *nvl;
+ int err = 0;
+
+ if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+ zed_log_die("failed to xalloc fault nvlist");
+
+ err |= nvlist_add_uint8(nvl, FM_VERSION, FM_FAULT_VERSION);
+ err |= nvlist_add_string(nvl, FM_CLASS, class);
+ err |= nvlist_add_uint8(nvl, FM_FAULT_CERTAINTY, certainty);
+
+ if (asru != NULL)
+ err |= nvlist_add_nvlist(nvl, FM_FAULT_ASRU, asru);
+ if (fru != NULL)
+ err |= nvlist_add_nvlist(nvl, FM_FAULT_FRU, fru);
+ if (resource != NULL)
+ err |= nvlist_add_nvlist(nvl, FM_FAULT_RESOURCE, resource);
+
+ if (err)
+ zed_log_die("failed to populate nvlist: %s\n", strerror(err));
+
+ return (nvl);
+}
+
+/*
+ * sourced from fmd_string.c
+ *
+ * Simple glob matcher: '*' matches any run of characters; every other
+ * character (including '?') matches only itself. A NULL pattern never
+ * matches; a NULL string is treated as "". Returns nonzero on match.
+ */
+static int
+fmd_strmatch(const char *s, const char *p)
+{
+ char c;
+
+ if (p == NULL)
+ return (0);
+
+ if (s == NULL)
+ s = ""; /* treat NULL string as the empty string */
+
+ do {
+ if ((c = *p++) == '\0')
+ return (*s == '\0');
+
+ if (c == '*') {
+ while (*p == '*')
+ p++; /* consecutive *'s can be collapsed */
+
+ if (*p == '\0')
+ return (1);
+
+ while (*s != '\0') {
+ if (fmd_strmatch(s++, p) != 0)
+ return (1);
+ }
+
+ return (0);
+ }
+ } while (c == *s++);
+
+ return (0);
+}
+
+/* Return nonzero when the event's FM_CLASS matches the glob pattern. */
+int
+fmd_nvl_class_match(fmd_hdl_t *hdl, nvlist_t *nvl, const char *pattern)
+{
+ char *class;
+
+ return (nvl != NULL &&
+ nvlist_lookup_string(nvl, FM_CLASS, &class) == 0 &&
+ fmd_strmatch(class, pattern));
+}
+
+/* Allocate an empty NV_UNIQUE_NAME nvlist; returns NULL on failure (the flags argument is ignored). */
+nvlist_t *
+fmd_nvl_alloc(fmd_hdl_t *hdl, int flags)
+{
+ nvlist_t *nvl = NULL;
+
+ if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+ return (NULL);
+
+ return (nvl);
+}
+
+
+/*
+ * ZED Agent specific APIs
+ */
+
+/* Look up one of the two hardwired module handles by name; NULL if unknown. */
+fmd_hdl_t *
+fmd_module_hdl(const char *name)
+{
+ if (strcmp(name, "zfs-retire") == 0)
+ return ((fmd_hdl_t *)&zfs_retire_module);
+ if (strcmp(name, "zfs-diagnosis") == 0)
+ return ((fmd_hdl_t *)&zfs_diagnosis_module);
+
+ return (NULL);
+}
+
+/* A module counts as initialized once fmd_hdl_register() stored mod_info. */
+boolean_t
+fmd_module_initialized(fmd_hdl_t *hdl)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ return (mp->mod_info != NULL);
+}
+
+/*
+ * fmd_module_recv is called for each event that is received by
+ * the fault manager that has a class that matches one of the
+ * module's subscriptions. A faux fmd_event_t is synthesized so the
+ * SERD engines see the ereport's original timestamp (0 when absent).
+ */
+void
+fmd_module_recv(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
+ fmd_event_t faux_event = {0};
+ int64_t *tv;
+ uint_t n;
+
+ /*
+ * Will need to normalize this if we persistently store the case data
+ */
+ if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0)
+ faux_event.ev_hrt = tv[0] * NANOSEC + tv[1];
+ else
+ faux_event.ev_hrt = 0;
+
+ ops->fmdo_recv(hdl, &faux_event, nvl, class);
+
+ mp->mod_stats.ms_accepted.fmds_value.ui64++;
+
+ /* TBD - should we initiate fm_module_gc() periodically? */
+}
diff --git a/cmd/zed/agents/fmd_api.h b/cmd/zed/agents/fmd_api.h
new file mode 100644
index 000000000000..4f06fb244b7b
--- /dev/null
+++ b/cmd/zed/agents/fmd_api.h
@@ -0,0 +1,246 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#ifndef _FMD_API_H
+#define _FMD_API_H
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <time.h>
+#include <libnvpair.h>
+#include <stdarg.h>
+#include <umem.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Fault Management Daemon Client Interfaces
+ */
+
+#define FMD_API_VERSION 5
+
+typedef struct fmd_hdl fmd_hdl_t;
+
/*
 * Simulated fmd timer: a POSIX timer plus the opaque callback argument
 * and the module handle that installed it (see fmd_timer_install).
 */
typedef struct fmd_timer {
	timer_t	ft_tid;		/* underlying POSIX timer id */
	void	*ft_arg;	/* caller-supplied argument for fmdo_timeout */
	fmd_hdl_t *ft_hdl;	/* module that owns this timer */
} fmd_timer_t;

/*
 * NOTE(review): this macro rebinds the *system* type name id_t to a
 * pointer type so existing fmd module code compiles unchanged.  Any
 * header included after this point that uses the real id_t will
 * silently get the wrong type -- confirm include ordering in callers.
 */
#define	id_t	fmd_timer_t *
+
+
+typedef struct fmd_event {
+ hrtime_t ev_hrt; /* event time used by SERD engines */
+} fmd_event_t;
+
/*
 * In-memory case state tracked on behalf of a diagnosis module.  The
 * trailing buffer fields back the fmd_buf_* serialization interfaces.
 */
typedef struct fmd_case {
	char		ci_uuid[48];	/* uuid string for this case */
	fmd_hdl_t	*ci_mod;	/* module that owns this case */
	void		*ci_data;	/* data from fmd_case_setspecific() */
	ushort_t	ci_state;	/* case state (see below) */
	ushort_t	ci_flags;	/* case flags (see below) */
	struct timeval	ci_tv;		/* time of original diagnosis */
	void		*ci_bufptr;	/* case data serialization buffer */
	size_t		ci_bufsiz;	/* size in bytes of ci_bufptr buffer */
} fmd_case_t;
+
+
+#define FMD_B_FALSE 0 /* false value for booleans as int */
+#define FMD_B_TRUE 1 /* true value for booleans as int */
+
+
+#define FMD_CASE_UNSOLVED 0 /* case is not yet solved (waiting) */
+#define FMD_CASE_SOLVED 1 /* case is solved (suspects added) */
+#define FMD_CASE_CLOSE_WAIT 2 /* case is executing fmdo_close() */
+#define FMD_CASE_CLOSED 3 /* case is closed (reconfig done) */
+#define FMD_CASE_REPAIRED 4 /* case is repaired */
+#define FMD_CASE_RESOLVED 5 /* case is resolved (can be freed) */
+
+#define FMD_CF_DIRTY 0x01 /* case is in need of checkpoint */
+#define FMD_CF_SOLVED 0x02 /* case has been solved */
+#define FMD_CF_ISOLATED 0x04 /* case has been isolated */
+#define FMD_CF_REPAIRED 0x08 /* case has been repaired */
+#define FMD_CF_RESOLVED 0x10 /* case has been resolved */
+
+
+#define FMD_TYPE_BOOL 0 /* int */
+#define FMD_TYPE_INT32 1 /* int32_t */
+#define FMD_TYPE_UINT32 2 /* uint32_t */
+#define FMD_TYPE_INT64 3 /* int64_t */
+#define FMD_TYPE_UINT64 4 /* uint64_t */
+#define FMD_TYPE_TIME 5 /* uint64_t */
+#define FMD_TYPE_SIZE 6 /* uint64_t */
+
+typedef struct fmd_prop {
+ const char *fmdp_name; /* property name */
+ uint_t fmdp_type; /* property type (see above) */
+ const char *fmdp_defv; /* default value */
+} fmd_prop_t;
+
+typedef struct fmd_stat {
+ char fmds_name[32]; /* statistic name */
+ uint_t fmds_type; /* statistic type (see above) */
+ char fmds_desc[64]; /* statistic description */
+ union {
+ int bool; /* FMD_TYPE_BOOL */
+ int32_t i32; /* FMD_TYPE_INT32 */
+ uint32_t ui32; /* FMD_TYPE_UINT32 */
+ int64_t i64; /* FMD_TYPE_INT64 */
+ uint64_t ui64; /* FMD_TYPE_UINT64 */
+ } fmds_value;
+} fmd_stat_t;
+
+typedef struct fmd_hdl_ops {
+ void (*fmdo_recv)(fmd_hdl_t *, fmd_event_t *, nvlist_t *, const char *);
+ void (*fmdo_timeout)(fmd_hdl_t *, id_t, void *);
+ void (*fmdo_close)(fmd_hdl_t *, fmd_case_t *);
+ void (*fmdo_stats)(fmd_hdl_t *);
+ void (*fmdo_gc)(fmd_hdl_t *);
+} fmd_hdl_ops_t;
+
+#define FMD_SEND_SUCCESS 0 /* fmdo_send queued event */
+#define FMD_SEND_FAILED 1 /* fmdo_send unrecoverable error */
+#define FMD_SEND_RETRY 2 /* fmdo_send requests retry */
+
+typedef struct fmd_hdl_info {
+ const char *fmdi_desc; /* fmd client description string */
+ const char *fmdi_vers; /* fmd client version string */
+ const fmd_hdl_ops_t *fmdi_ops; /* ops vector for client */
+ const fmd_prop_t *fmdi_props; /* array of configuration props */
+} fmd_hdl_info_t;
+
+extern int fmd_hdl_register(fmd_hdl_t *, int, const fmd_hdl_info_t *);
+extern void fmd_hdl_unregister(fmd_hdl_t *);
+
+extern void fmd_hdl_setspecific(fmd_hdl_t *, void *);
+extern void *fmd_hdl_getspecific(fmd_hdl_t *);
+
+#define FMD_SLEEP UMEM_NOFAIL
+
+extern void *fmd_hdl_alloc(fmd_hdl_t *, size_t, int);
+extern void *fmd_hdl_zalloc(fmd_hdl_t *, size_t, int);
+extern void fmd_hdl_free(fmd_hdl_t *, void *, size_t);
+
+extern char *fmd_hdl_strdup(fmd_hdl_t *, const char *, int);
+extern void fmd_hdl_strfree(fmd_hdl_t *, char *);
+
+extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list);
+extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...);
+
+extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *);
+extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *);
+
+#define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */
+#define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */
+
+extern fmd_stat_t *fmd_stat_create(fmd_hdl_t *, uint_t, uint_t, fmd_stat_t *);
+extern void fmd_stat_destroy(fmd_hdl_t *, uint_t, fmd_stat_t *);
+extern void fmd_stat_setstr(fmd_hdl_t *, fmd_stat_t *, const char *);
+
+extern fmd_case_t *fmd_case_open(fmd_hdl_t *, void *);
+extern void fmd_case_reset(fmd_hdl_t *, fmd_case_t *);
+extern void fmd_case_solve(fmd_hdl_t *, fmd_case_t *);
+extern void fmd_case_close(fmd_hdl_t *, fmd_case_t *);
+
+extern const char *fmd_case_uuid(fmd_hdl_t *, fmd_case_t *);
+extern fmd_case_t *fmd_case_uulookup(fmd_hdl_t *, const char *);
+extern void fmd_case_uuclose(fmd_hdl_t *, const char *);
+extern int fmd_case_uuclosed(fmd_hdl_t *, const char *);
+extern int fmd_case_uuisresolved(fmd_hdl_t *, const char *);
+extern void fmd_case_uuresolved(fmd_hdl_t *, const char *);
+
+extern int fmd_case_solved(fmd_hdl_t *, fmd_case_t *);
+extern int fmd_case_closed(fmd_hdl_t *, fmd_case_t *);
+
+extern void fmd_case_add_ereport(fmd_hdl_t *, fmd_case_t *, fmd_event_t *);
+extern void fmd_case_add_serd(fmd_hdl_t *, fmd_case_t *, const char *);
+extern void fmd_case_add_suspect(fmd_hdl_t *, fmd_case_t *, nvlist_t *);
+
+extern void fmd_case_setspecific(fmd_hdl_t *, fmd_case_t *, void *);
+extern void *fmd_case_getspecific(fmd_hdl_t *, fmd_case_t *);
+
+extern fmd_case_t *fmd_case_next(fmd_hdl_t *, fmd_case_t *);
+extern fmd_case_t *fmd_case_prev(fmd_hdl_t *, fmd_case_t *);
+
+extern void fmd_buf_create(fmd_hdl_t *, fmd_case_t *, const char *, size_t);
+extern void fmd_buf_destroy(fmd_hdl_t *, fmd_case_t *, const char *);
+extern void fmd_buf_read(fmd_hdl_t *, fmd_case_t *,
+ const char *, void *, size_t);
+extern void fmd_buf_write(fmd_hdl_t *, fmd_case_t *,
+ const char *, const void *, size_t);
+extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *);
+
+extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t);
+extern void fmd_serd_destroy(fmd_hdl_t *, const char *);
+extern int fmd_serd_exists(fmd_hdl_t *, const char *);
+extern void fmd_serd_reset(fmd_hdl_t *, const char *);
+extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *);
+extern int fmd_serd_fired(fmd_hdl_t *, const char *);
+extern int fmd_serd_empty(fmd_hdl_t *, const char *);
+
+extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t);
+extern void fmd_timer_remove(fmd_hdl_t *, id_t);
+
+extern nvlist_t *fmd_nvl_create_fault(fmd_hdl_t *,
+ const char *, uint8_t, nvlist_t *, nvlist_t *, nvlist_t *);
+
+extern int fmd_nvl_class_match(fmd_hdl_t *, nvlist_t *, const char *);
+
+#define FMD_HAS_FAULT_FRU 0
+#define FMD_HAS_FAULT_ASRU 1
+#define FMD_HAS_FAULT_RESOURCE 2
+
+extern void fmd_repair_fru(fmd_hdl_t *, const char *);
+extern int fmd_repair_asru(fmd_hdl_t *, const char *);
+
+extern nvlist_t *fmd_nvl_alloc(fmd_hdl_t *, int);
+extern nvlist_t *fmd_nvl_dup(fmd_hdl_t *, nvlist_t *, int);
+
+/*
+ * ZED Specific Interfaces
+ */
+
+extern fmd_hdl_t *fmd_module_hdl(const char *);
+extern boolean_t fmd_module_initialized(fmd_hdl_t *);
+extern void fmd_module_recv(fmd_hdl_t *, nvlist_t *, const char *);
+
+/* ZFS FMA Retire Agent */
+extern void _zfs_retire_init(fmd_hdl_t *);
+extern void _zfs_retire_fini(fmd_hdl_t *);
+
+/* ZFS FMA Diagnosis Engine */
+extern void _zfs_diagnosis_init(fmd_hdl_t *);
+extern void _zfs_diagnosis_fini(fmd_hdl_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FMD_API_H */
diff --git a/cmd/zed/agents/fmd_serd.c b/cmd/zed/agents/fmd_serd.c
new file mode 100644
index 000000000000..d4ec37fb7691
--- /dev/null
+++ b/cmd/zed/agents/fmd_serd.c
@@ -0,0 +1,316 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <sys/list.h>
+#include <sys/time.h>
+
+#include "fmd_api.h"
+#include "fmd_serd.h"
+#include "../zed_log.h"
+
+
+#define FMD_STR_BUCKETS 211
+
+
+#ifdef SERD_ENG_DEBUG
+#define serd_log_msg(fmt, ...) \
+ zed_log_msg(LOG_INFO, fmt, __VA_ARGS__)
+#else
+#define serd_log_msg(fmt, ...)
+#endif
+
+
+/*
+ * SERD Engine Backend
+ */
+
+/*
+ * Compute the delta between events in nanoseconds. To account for very old
+ * events which are replayed, we must handle the case where time is negative.
+ * We convert the hrtime_t's to unsigned 64-bit integers and then handle the
+ * case where 'old' is greater than 'new' (i.e. high-res time has wrapped).
+ */
+static hrtime_t
+fmd_event_delta(hrtime_t t1, hrtime_t t2)
+{
+ uint64_t old = t1;
+ uint64_t new = t2;
+
+ return (new >= old ? new - old : (UINT64_MAX - old) + new + 1);
+}
+
+static fmd_serd_eng_t *
+fmd_serd_eng_alloc(const char *name, uint64_t n, hrtime_t t)
+{
+ fmd_serd_eng_t *sgp;
+
+ sgp = malloc(sizeof (fmd_serd_eng_t));
+ bzero(sgp, sizeof (fmd_serd_eng_t));
+
+ sgp->sg_name = strdup(name);
+ sgp->sg_flags = FMD_SERD_DIRTY;
+ sgp->sg_n = n;
+ sgp->sg_t = t;
+
+ list_create(&sgp->sg_list, sizeof (fmd_serd_elem_t),
+ offsetof(fmd_serd_elem_t, se_list));
+
+ return (sgp);
+}
+
/*
 * Tear down a SERD engine.  Order matters: the reset drains (and may
 * log against) sg_name and sg_list, so it must run before the name is
 * freed and the list is destroyed.
 */
static void
fmd_serd_eng_free(fmd_serd_eng_t *sgp)
{
	fmd_serd_eng_reset(sgp);	/* discard all recorded events */
	free(sgp->sg_name);
	list_destroy(&sgp->sg_list);
	free(sgp);
}
+
+/*
+ * sourced from fmd_string.c
+ */
+static ulong_t
+fmd_strhash(const char *key)
+{
+ ulong_t g, h = 0;
+ const char *p;
+
+ for (p = key; *p != '\0'; p++) {
+ h = (h << 4) + *p;
+
+ if ((g = (h & 0xf0000000)) != 0) {
+ h ^= (g >> 24);
+ h ^= g;
+ }
+ }
+
+ return (h);
+}
+
/*
 * Initialize a SERD engine hash table with a fixed number of buckets.
 * NOTE(review): the calloc() result is not checked; a failed allocation
 * will fault on first insert/lookup -- confirm whether the callers
 * treat OOM as fatal.
 */
void
fmd_serd_hash_create(fmd_serd_hash_t *shp)
{
	shp->sh_hashlen = FMD_STR_BUCKETS;
	shp->sh_hash = calloc(shp->sh_hashlen, sizeof (void *));
	shp->sh_count = 0;
}
+
+void
+fmd_serd_hash_destroy(fmd_serd_hash_t *shp)
+{
+ fmd_serd_eng_t *sgp, *ngp;
+ uint_t i;
+
+ for (i = 0; i < shp->sh_hashlen; i++) {
+ for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = ngp) {
+ ngp = sgp->sg_next;
+ fmd_serd_eng_free(sgp);
+ }
+ }
+
+ free(shp->sh_hash);
+ bzero(shp, sizeof (fmd_serd_hash_t));
+}
+
+void
+fmd_serd_hash_apply(fmd_serd_hash_t *shp, fmd_serd_eng_f *func, void *arg)
+{
+ fmd_serd_eng_t *sgp;
+ uint_t i;
+
+ for (i = 0; i < shp->sh_hashlen; i++) {
+ for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = sgp->sg_next)
+ func(sgp, arg);
+ }
+}
+
+fmd_serd_eng_t *
+fmd_serd_eng_insert(fmd_serd_hash_t *shp, const char *name,
+ uint_t n, hrtime_t t)
+{
+ uint_t h = fmd_strhash(name) % shp->sh_hashlen;
+ fmd_serd_eng_t *sgp = fmd_serd_eng_alloc(name, n, t);
+
+ serd_log_msg(" SERD Engine: inserting %s N %d T %llu",
+ name, (int)n, (long long unsigned)t);
+
+ sgp->sg_next = shp->sh_hash[h];
+ shp->sh_hash[h] = sgp;
+ shp->sh_count++;
+
+ return (sgp);
+}
+
+fmd_serd_eng_t *
+fmd_serd_eng_lookup(fmd_serd_hash_t *shp, const char *name)
+{
+ uint_t h = fmd_strhash(name) % shp->sh_hashlen;
+ fmd_serd_eng_t *sgp;
+
+ for (sgp = shp->sh_hash[h]; sgp != NULL; sgp = sgp->sg_next) {
+ if (strcmp(name, sgp->sg_name) == 0)
+ return (sgp);
+ }
+
+ return (NULL);
+}
+
/*
 * Remove the named engine from the hash (if present) and free it.
 * The walk keeps `pp` pointing at the link that references the current
 * engine so the unlink is a single store, with no head-of-chain special
 * case.
 */
void
fmd_serd_eng_delete(fmd_serd_hash_t *shp, const char *name)
{
	uint_t h = fmd_strhash(name) % shp->sh_hashlen;
	fmd_serd_eng_t *sgp, **pp = &shp->sh_hash[h];

	serd_log_msg("  SERD Engine: deleting  %s", name);

	/* advance pp past every non-matching engine; stop on a match */
	for (sgp = *pp; sgp != NULL; sgp = sgp->sg_next) {
		if (strcmp(sgp->sg_name, name) != 0)
			pp = &sgp->sg_next;
		else
			break;
	}

	if (sgp != NULL) {
		*pp = sgp->sg_next;	/* unlink before freeing */
		fmd_serd_eng_free(sgp);
		assert(shp->sh_count != 0);
		shp->sh_count--;
	}
}
+
/*
 * Unlink a single recorded event from the engine's list, decrement the
 * event count, and free the element.  The caller chooses which element
 * (head or tail) to discard.
 */
static void
fmd_serd_eng_discard(fmd_serd_eng_t *sgp, fmd_serd_elem_t *sep)
{
	list_remove(&sgp->sg_list, sep);
	sgp->sg_count--;

	serd_log_msg("  SERD Engine: discarding %s, %d remaining",
	    sgp->sg_name, (int)sgp->sg_count);

	free(sep);
}
+
+int
+fmd_serd_eng_record(fmd_serd_eng_t *sgp, hrtime_t hrt)
+{
+ fmd_serd_elem_t *sep, *oep;
+
+ /*
+ * If the fired flag is already set, return false and discard the
+ * event. This means that the caller will only see the engine "fire"
+ * once until fmd_serd_eng_reset() is called. The fmd_serd_eng_fired()
+ * function can also be used in combination with fmd_serd_eng_record().
+ */
+ if (sgp->sg_flags & FMD_SERD_FIRED) {
+ serd_log_msg(" SERD Engine: record %s already fired!",
+ sgp->sg_name);
+ return (FMD_B_FALSE);
+ }
+
+ while (sgp->sg_count >= sgp->sg_n)
+ fmd_serd_eng_discard(sgp, list_tail(&sgp->sg_list));
+
+ sep = malloc(sizeof (fmd_serd_elem_t));
+ sep->se_hrt = hrt;
+
+ list_insert_head(&sgp->sg_list, sep);
+ sgp->sg_count++;
+
+ serd_log_msg(" SERD Engine: recording %s of %d (%llu)",
+ sgp->sg_name, (int)sgp->sg_count, (long long unsigned)hrt);
+
+ /*
+ * Pick up the oldest element pointer for comparison to 'sep'. We must
+ * do this after adding 'sep' because 'oep' and 'sep' can be the same.
+ */
+ oep = list_tail(&sgp->sg_list);
+
+ if (sgp->sg_count >= sgp->sg_n &&
+ fmd_event_delta(oep->se_hrt, sep->se_hrt) <= sgp->sg_t) {
+ sgp->sg_flags |= FMD_SERD_FIRED | FMD_SERD_DIRTY;
+ serd_log_msg(" SERD Engine: fired %s", sgp->sg_name);
+ return (FMD_B_TRUE);
+ }
+
+ sgp->sg_flags |= FMD_SERD_DIRTY;
+ return (FMD_B_FALSE);
+}
+
+int
+fmd_serd_eng_fired(fmd_serd_eng_t *sgp)
+{
+ return (sgp->sg_flags & FMD_SERD_FIRED);
+}
+
+int
+fmd_serd_eng_empty(fmd_serd_eng_t *sgp)
+{
+ return (sgp->sg_count == 0);
+}
+
/*
 * Discard every recorded event and clear the fired flag so the engine
 * can fire again; the engine is marked dirty so the change is
 * checkpointed.
 */
void
fmd_serd_eng_reset(fmd_serd_eng_t *sgp)
{
	serd_log_msg("  SERD Engine: resetting %s", sgp->sg_name);

	/* drain from the head until no events remain */
	while (sgp->sg_count != 0)
		fmd_serd_eng_discard(sgp, list_head(&sgp->sg_list));

	sgp->sg_flags &= ~FMD_SERD_FIRED;
	sgp->sg_flags |= FMD_SERD_DIRTY;
}
+
+void
+fmd_serd_eng_gc(fmd_serd_eng_t *sgp)
+{
+ fmd_serd_elem_t *sep, *nep;
+ hrtime_t hrt;
+
+ if (sgp->sg_count == 0 || (sgp->sg_flags & FMD_SERD_FIRED))
+ return; /* no garbage collection needed if empty or fired */
+
+ sep = list_head(&sgp->sg_list);
+ if (sep == NULL)
+ return;
+
+ hrt = sep->se_hrt - sgp->sg_t;
+
+ for (sep = list_head(&sgp->sg_list); sep != NULL; sep = nep) {
+ if (sep->se_hrt >= hrt)
+ break; /* sep and subsequent events are all within T */
+
+ nep = list_next(&sgp->sg_list, sep);
+ fmd_serd_eng_discard(sgp, sep);
+ sgp->sg_flags |= FMD_SERD_DIRTY;
+ }
+}
diff --git a/cmd/zed/agents/fmd_serd.h b/cmd/zed/agents/fmd_serd.h
new file mode 100644
index 000000000000..c35c9acc7785
--- /dev/null
+++ b/cmd/zed/agents/fmd_serd.h
@@ -0,0 +1,86 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#ifndef _FMD_SERD_H
+#define _FMD_SERD_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/list.h>
+#include <sys/time.h>
+
typedef struct fmd_serd_elem {
	list_node_t	se_list;	/* linked list forward/back pointers */
	hrtime_t	se_hrt;		/* upper bound on event hrtime */
} fmd_serd_elem_t;

/*
 * A SERD (Soft Error Rate Discrimination) engine fires once N events
 * are observed within a rolling window of T nanoseconds.  sg_list is
 * kept newest-first: fmd_serd_eng_record() inserts at the head and
 * evicts overflow from the tail.
 */
typedef struct fmd_serd_eng {
	char		*sg_name;	/* string name for this engine */
	struct fmd_serd_eng *sg_next;	/* next engine on hash chain */
	list_t		sg_list;	/* list of fmd_serd_elem_t's */
	uint_t		sg_count;	/* count of events in sg_list */
	uint_t		sg_flags;	/* engine flags (see below) */
	uint_t		sg_n;		/* engine N parameter (event count) */
	hrtime_t	sg_t;		/* engine T parameter (nanoseconds) */
} fmd_serd_eng_t;
+
+#define FMD_SERD_FIRED 0x1 /* error rate has exceeded threshold */
+#define FMD_SERD_DIRTY 0x2 /* engine needs to be checkpointed */
+
+typedef void fmd_serd_eng_f(fmd_serd_eng_t *, void *);
+
+typedef struct fmd_serd_hash {
+ fmd_serd_eng_t **sh_hash; /* hash bucket array for buffers */
+ uint_t sh_hashlen; /* length of hash bucket array */
+ uint_t sh_count; /* count of engines in hash */
+} fmd_serd_hash_t;
+
/* hash-table lifecycle and iteration */
extern void fmd_serd_hash_create(fmd_serd_hash_t *);
extern void fmd_serd_hash_destroy(fmd_serd_hash_t *);
extern void fmd_serd_hash_apply(fmd_serd_hash_t *, fmd_serd_eng_f *, void *);

/*
 * NOTE(review): the N parameter is declared uint32_t here but the
 * definition in fmd_serd.c takes uint_t; the widths agree on supported
 * ABIs, but the declaration and definition should use the same type.
 */
extern fmd_serd_eng_t *fmd_serd_eng_insert(fmd_serd_hash_t *,
    const char *, uint32_t, hrtime_t);

extern fmd_serd_eng_t *fmd_serd_eng_lookup(fmd_serd_hash_t *, const char *);
extern void fmd_serd_eng_delete(fmd_serd_hash_t *, const char *);

/* per-engine event recording and state queries */
extern int fmd_serd_eng_record(fmd_serd_eng_t *, hrtime_t);
extern int fmd_serd_eng_fired(fmd_serd_eng_t *);
extern int fmd_serd_eng_empty(fmd_serd_eng_t *);

extern void fmd_serd_eng_reset(fmd_serd_eng_t *);
extern void fmd_serd_eng_gc(fmd_serd_eng_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FMD_SERD_H */
diff --git a/cmd/zed/agents/zfs_agents.c b/cmd/zed/agents/zfs_agents.c
new file mode 100644
index 000000000000..006e0ab99f47
--- /dev/null
+++ b/cmd/zed/agents/zfs_agents.c
@@ -0,0 +1,422 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
+ */
+
+#include <libnvpair.h>
+#include <libzfs.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/list.h>
+#include <sys/time.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/dev.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/fs/zfs.h>
+#include <pthread.h>
+#include <unistd.h>
+
+#include "zfs_agents.h"
+#include "fmd_api.h"
+#include "../zed_log.h"
+
+/*
+ * agent dispatch code
+ */
+
+static pthread_mutex_t agent_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t agent_cond = PTHREAD_COND_INITIALIZER;
+static list_t agent_events; /* list of pending events */
+static int agent_exiting;
+
+typedef struct agent_event {
+ char ae_class[64];
+ char ae_subclass[32];
+ nvlist_t *ae_nvl;
+ list_node_t ae_node;
+} agent_event_t;
+
+pthread_t g_agents_tid;
+
+libzfs_handle_t *g_zfs_hdl;
+
+/* guid search data */
+typedef enum device_type {
+ DEVICE_TYPE_L2ARC, /* l2arc device */
+ DEVICE_TYPE_SPARE, /* spare device */
+ DEVICE_TYPE_PRIMARY /* any primary pool storage device */
+} device_type_t;
+
+typedef struct guid_search {
+ uint64_t gs_pool_guid;
+ uint64_t gs_vdev_guid;
+ char *gs_devid;
+ device_type_t gs_vdev_type;
+ uint64_t gs_vdev_expandtime; /* vdev expansion time */
+} guid_search_t;
+
+/*
+ * Walks the vdev tree recursively looking for a matching devid.
+ * Returns B_TRUE as soon as a matching device is found, B_FALSE otherwise.
+ */
+static boolean_t
+zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
+{
+ guid_search_t *gsp = arg;
+ char *path = NULL;
+ uint_t c, children;
+ nvlist_t **child;
+
+ /*
+ * First iterate over any children.
+ */
+ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++) {
+ if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
+ gsp->gs_vdev_type = DEVICE_TYPE_PRIMARY;
+ return (B_TRUE);
+ }
+ }
+ }
+ /*
+ * Iterate over any spares and cache devices
+ */
+ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++) {
+ if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
+ gsp->gs_vdev_type = DEVICE_TYPE_L2ARC;
+ return (B_TRUE);
+ }
+ }
+ }
+ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++) {
+ if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
+ gsp->gs_vdev_type = DEVICE_TYPE_SPARE;
+ return (B_TRUE);
+ }
+ }
+ }
+ /*
+ * On a devid match, grab the vdev guid and expansion time, if any.
+ */
+ if (gsp->gs_devid != NULL &&
+ (nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) &&
+ (strcmp(gsp->gs_devid, path) == 0)) {
+ (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
+ &gsp->gs_vdev_guid);
+ (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,
+ &gsp->gs_vdev_expandtime);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
/*
 * zpool_iter() callback: search one pool's vdev tree for the devid in
 * *arg.  Returns nonzero (stop iterating) once a vdev guid has been
 * found.
 */
static int
zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg)
{
	guid_search_t *gsp = arg;
	nvlist_t *config, *nvl;

	/*
	 * For each vdev in this pool, look for a match by devid
	 */
	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
		    &nvl) == 0) {
			(void) zfs_agent_iter_vdev(zhp, nvl, gsp);
		}
	}
	/*
	 * if a match was found then grab the pool guid
	 *
	 * NOTE(review): this relies on gs_vdev_guid being nonzero only
	 * when it was set in this invocation (so config != NULL here);
	 * zpool_iter() stops on the first nonzero return, which makes
	 * that hold -- confirm if the search struct is ever reused.
	 */
	if (gsp->gs_vdev_guid) {
		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &gsp->gs_pool_guid);
	}

	/* the iterator owns the pool handle; release it before returning */
	zpool_close(zhp);
	return (gsp->gs_vdev_guid != 0);
}
+
+void
+zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)
+{
+ agent_event_t *event;
+
+ if (subclass == NULL)
+ subclass = "";
+
+ event = malloc(sizeof (agent_event_t));
+ if (event == NULL || nvlist_dup(nvl, &event->ae_nvl, 0) != 0) {
+ if (event)
+ free(event);
+ return;
+ }
+
+ if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) {
+ class = EC_ZFS;
+ subclass = ESC_ZFS_VDEV_CHECK;
+ }
+
+ /*
+ * On ZFS on Linux, we don't get the expected FM_RESOURCE_REMOVED
+ * ereport from vdev_disk layer after a hot unplug. Fortunately we
+ * get a EC_DEV_REMOVE from our disk monitor and it is a suitable
+ * proxy so we remap it here for the benefit of the diagnosis engine.
+ */
+ if ((strcmp(class, EC_DEV_REMOVE) == 0) &&
+ (strcmp(subclass, ESC_DISK) == 0) &&
+ (nvlist_exists(nvl, ZFS_EV_VDEV_GUID) ||
+ nvlist_exists(nvl, DEV_IDENTIFIER))) {
+ nvlist_t *payload = event->ae_nvl;
+ struct timeval tv;
+ int64_t tod[2];
+ uint64_t pool_guid = 0, vdev_guid = 0;
+ guid_search_t search = { 0 };
+ device_type_t devtype = DEVICE_TYPE_PRIMARY;
+
+ class = "resource.fs.zfs.removed";
+ subclass = "";
+
+ (void) nvlist_add_string(payload, FM_CLASS, class);
+ (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
+ (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid);
+
+ (void) gettimeofday(&tv, NULL);
+ tod[0] = tv.tv_sec;
+ tod[1] = tv.tv_usec;
+ (void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2);
+
+ /*
+ * For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or
+ * ZFS_EV_POOL_GUID may be missing so find them.
+ */
+ (void) nvlist_lookup_string(nvl, DEV_IDENTIFIER,
+ &search.gs_devid);
+ (void) zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
+ pool_guid = search.gs_pool_guid;
+ vdev_guid = search.gs_vdev_guid;
+ devtype = search.gs_vdev_type;
+
+ /*
+ * We want to avoid reporting "remove" events coming from
+ * libudev for VDEVs which were expanded recently (10s) and
+ * avoid activating spares in response to partitions being
+ * deleted and created in rapid succession.
+ */
+ if (search.gs_vdev_expandtime != 0 &&
+ search.gs_vdev_expandtime + 10 > tv.tv_sec) {
+ zed_log_msg(LOG_INFO, "agent post event: ignoring '%s' "
+ "for recently expanded device '%s'", EC_DEV_REMOVE,
+ search.gs_devid);
+ goto out;
+ }
+
+ (void) nvlist_add_uint64(payload,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, pool_guid);
+ (void) nvlist_add_uint64(payload,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vdev_guid);
+ switch (devtype) {
+ case DEVICE_TYPE_L2ARC:
+ (void) nvlist_add_string(payload,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+ VDEV_TYPE_L2CACHE);
+ break;
+ case DEVICE_TYPE_SPARE:
+ (void) nvlist_add_string(payload,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_SPARE);
+ break;
+ case DEVICE_TYPE_PRIMARY:
+ (void) nvlist_add_string(payload,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_DISK);
+ break;
+ }
+
+ zed_log_msg(LOG_INFO, "agent post event: mapping '%s' to '%s'",
+ EC_DEV_REMOVE, class);
+ }
+
+ (void) strlcpy(event->ae_class, class, sizeof (event->ae_class));
+ (void) strlcpy(event->ae_subclass, subclass,
+ sizeof (event->ae_subclass));
+
+ (void) pthread_mutex_lock(&agent_lock);
+ list_insert_tail(&agent_events, event);
+ (void) pthread_mutex_unlock(&agent_lock);
+
+out:
+ (void) pthread_cond_signal(&agent_cond);
+}
+
/*
 * Fan an event out to every interested subscriber: the diagnosis
 * engine, the retire agent, and the sysevent (SLM) module.  An event
 * may be delivered to more than one of them.
 */
static void
zfs_agent_dispatch(const char *class, const char *subclass, nvlist_t *nvl)
{
	/*
	 * The diagnosis engine subscribes to the following events.
	 * On illumos these subscriptions reside in:
	 * 	/usr/lib/fm/fmd/plugins/zfs-diagnosis.conf
	 *
	 * Note that strstr() matches the prefix anywhere in the class
	 * string, not only at the start.
	 */
	if (strstr(class, "ereport.fs.zfs.") != NULL ||
	    strstr(class, "resource.fs.zfs.") != NULL ||
	    strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0 ||
	    strcmp(class, "sysevent.fs.zfs.vdev_remove_dev") == 0 ||
	    strcmp(class, "sysevent.fs.zfs.pool_destroy") == 0) {
		fmd_module_recv(fmd_module_hdl("zfs-diagnosis"), nvl, class);
	}

	/*
	 * The retire agent subscribes to the following events.
	 * On illumos these subscriptions reside in:
	 * 	/usr/lib/fm/fmd/plugins/zfs-retire.conf
	 *
	 * NOTE: faults events come directly from our diagnosis engine
	 * and will not pass through the zfs kernel module.
	 */
	if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 ||
	    strcmp(class, "resource.fs.zfs.removed") == 0 ||
	    strcmp(class, "resource.fs.zfs.statechange") == 0 ||
	    strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
		fmd_module_recv(fmd_module_hdl("zfs-retire"), nvl, class);
	}

	/*
	 * The SLM module only consumes disk events and vdev check events
	 *
	 * NOTE: disk events come directly from disk monitor and will
	 * not pass through the zfs kernel module.
	 */
	if (strstr(class, "EC_dev_") != NULL ||
	    strcmp(class, EC_ZFS) == 0) {
		(void) zfs_slm_event(class, subclass, nvl);
	}
}
+
/*
 * Events are consumed and dispatched from this thread
 * An agent can also post an event so event list lock
 * is not held when calling an agent.
 * One event is consumed at a time.
 */
static void *
zfs_agent_consumer_thread(void *arg)
{
	for (;;) {
		agent_event_t *event;

		(void) pthread_mutex_lock(&agent_lock);

		/* wait for an event to show up */
		while (!agent_exiting && list_is_empty(&agent_events))
			(void) pthread_cond_wait(&agent_cond, &agent_lock);

		if (agent_exiting) {
			(void) pthread_mutex_unlock(&agent_lock);
			zed_log_msg(LOG_INFO, "zfs_agent_consumer_thread: "
			    "exiting");
			return (NULL);
		}

		if ((event = (list_head(&agent_events))) != NULL) {
			list_remove(&agent_events, event);

			/*
			 * Drop the lock before dispatching: agents may
			 * call zfs_agent_post_event(), which takes
			 * agent_lock itself.
			 */
			(void) pthread_mutex_unlock(&agent_lock);

			/* dispatch to all event subscribers */
			zfs_agent_dispatch(event->ae_class, event->ae_subclass,
			    event->ae_nvl);

			nvlist_free(event->ae_nvl);
			free(event);
			continue;
		}

		/* spurious wakeup with no event queued; go back to waiting */
		(void) pthread_mutex_unlock(&agent_lock);
	}

	return (NULL);
}
+
/*
 * Bring up the agent subsystem: the SLM module, the diagnosis and
 * retire fmd modules, the pending-event list, and finally the consumer
 * thread.  Order matters -- the modules must be registered before the
 * consumer thread can dispatch to them.  Any failure is fatal via
 * zed_log_die().
 */
void
zfs_agent_init(libzfs_handle_t *zfs_hdl)
{
	fmd_hdl_t *hdl;

	g_zfs_hdl = zfs_hdl;

	if (zfs_slm_init() != 0)
		zed_log_die("Failed to initialize zfs slm");
	zed_log_msg(LOG_INFO, "Add Agent: init");

	hdl = fmd_module_hdl("zfs-diagnosis");
	_zfs_diagnosis_init(hdl);
	if (!fmd_module_initialized(hdl))
		zed_log_die("Failed to initialize zfs diagnosis");

	hdl = fmd_module_hdl("zfs-retire");
	_zfs_retire_init(hdl);
	if (!fmd_module_initialized(hdl))
		zed_log_die("Failed to initialize zfs retire");

	list_create(&agent_events, sizeof (agent_event_t),
	    offsetof(struct agent_event, ae_node));

	if (pthread_create(&g_agents_tid, NULL, zfs_agent_consumer_thread,
	    NULL) != 0) {
		list_destroy(&agent_events);
		zed_log_die("Failed to initialize agents");
	}
}
+
+void
+zfs_agent_fini(void)
+{
+ fmd_hdl_t *hdl;
+ agent_event_t *event;
+
+ agent_exiting = 1;
+ (void) pthread_cond_signal(&agent_cond);
+
+ /* wait for zfs_enum_pools thread to complete */
+ (void) pthread_join(g_agents_tid, NULL);
+
+ /* drain any pending events */
+ while ((event = (list_head(&agent_events))) != NULL) {
+ list_remove(&agent_events, event);
+ nvlist_free(event->ae_nvl);
+ free(event);
+ }
+
+ list_destroy(&agent_events);
+
+ if ((hdl = fmd_module_hdl("zfs-retire")) != NULL) {
+ _zfs_retire_fini(hdl);
+ fmd_hdl_unregister(hdl);
+ }
+ if ((hdl = fmd_module_hdl("zfs-diagnosis")) != NULL) {
+ _zfs_diagnosis_fini(hdl);
+ fmd_hdl_unregister(hdl);
+ }
+
+ zed_log_msg(LOG_INFO, "Add Agent: fini");
+ zfs_slm_fini();
+
+ g_zfs_hdl = NULL;
+}
diff --git a/cmd/zed/agents/zfs_agents.h b/cmd/zed/agents/zfs_agents.h
new file mode 100644
index 000000000000..d1a459139b1e
--- /dev/null
+++ b/cmd/zed/agents/zfs_agents.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#ifndef ZFS_AGENTS_H
+#define ZFS_AGENTS_H
+
+#include <libzfs.h>
+#include <libnvpair.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Agent abstraction presented to ZED
+ */
+extern void zfs_agent_init(libzfs_handle_t *);
+extern void zfs_agent_fini(void);
+extern void zfs_agent_post_event(const char *, const char *, nvlist_t *);
+
+/*
+ * ZFS Sysevent Linkable Module (SLM)
+ */
+extern int zfs_slm_init(void);
+extern void zfs_slm_fini(void);
+extern void zfs_slm_event(const char *, const char *, nvlist_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !ZFS_AGENTS_H */
diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c
new file mode 100644
index 000000000000..0b27f6702ee8
--- /dev/null
+++ b/cmd/zed/agents/zfs_diagnosis.c
@@ -0,0 +1,981 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <strings.h>
+#include <libuutil.h>
+#include <libzfs.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/fs/zfs.h>
+
+#include "zfs_agents.h"
+#include "fmd_api.h"
+
+/*
+ * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'. This
+ * #define reserves enough space for two 64-bit hex values plus the length of
+ * the longest string.
+ */
+#define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum"))
+
+/*
+ * On-disk case structure. This must maintain backwards compatibility with
+ * previous versions of the DE. By default, any members appended to the end
+ * will be filled with zeros if they don't exist in a previous version.
+ */
+typedef struct zfs_case_data {
+	uint64_t	zc_version;	/* CASE_DATA_VERSION_* of the writer */
+	uint64_t	zc_ena;		/* ENA of the first associated ereport */
+	uint64_t	zc_pool_guid;
+	uint64_t	zc_vdev_guid;	/* 0 for pool-level cases */
+	int		zc_pool_state;	/* SPA_LOAD_* context of first ereport */
+	char		zc_serd_checksum[MAX_SERDLEN];	/* "" if none created */
+	char		zc_serd_io[MAX_SERDLEN];	/* "" if none created */
+	int		zc_has_remove_timer;	/* removal timer was pending */
+} zfs_case_data_t;
+
+/*
+ * Time-of-day
+ */
+typedef struct er_timeval {
+	uint64_t	ertv_sec;
+	uint64_t	ertv_nsec;
+} er_timeval_t;
+
+/*
+ * In-core case structure.
+ */
+typedef struct zfs_case {
+	boolean_t	zc_present;	/* scratch flag used by zfs_purge_cases */
+	uint32_t	zc_version;
+	zfs_case_data_t	zc_data;	/* persisted portion (case buffer) */
+	fmd_case_t	*zc_case;
+	uu_list_node_t	zc_node;	/* linkage on the global zfs_cases list */
+	id_t		zc_remove_timer;
+	char		*zc_fru;
+	er_timeval_t	zc_when;	/* load time of the associated pool */
+} zfs_case_t;
+
+#define CASE_DATA		"data"
+#define CASE_FRU		"fru"
+#define CASE_DATA_VERSION_INITIAL	1
+#define CASE_DATA_VERSION_SERD		2
+
+/* Drop/diagnosis counters exported through the fmd stats interface. */
+typedef struct zfs_de_stats {
+	fmd_stat_t	old_drops;
+	fmd_stat_t	dev_drops;
+	fmd_stat_t	vdev_drops;
+	fmd_stat_t	import_drops;
+	fmd_stat_t	resource_drops;
+} zfs_de_stats_t;
+
+zfs_de_stats_t zfs_stats = {
+	{ "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" },
+	{ "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)"},
+	{ "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)"},
+	{ "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" },
+	{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
+};
+
+/* Delay before an undiagnosed I/O fault is solved; see "remove_timeout" prop */
+static hrtime_t zfs_remove_timeout;
+
+/* Global list of in-core cases, one per (pool, vdev) pair. */
+uu_list_pool_t *zfs_case_pool;
+uu_list_t *zfs_cases;
+
+#define ZFS_MAKE_RSRC(type)	\
+    FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type
+#define ZFS_MAKE_EREPORT(type)	\
+    FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
+
+/*
+ * Write out the persistent representation of an active case.
+ */
+static void
+zfs_case_serialize(fmd_hdl_t *hdl, zfs_case_t *zcp)
+{
+	/*
+	 * NOTE(review): only the in-core version stamp is updated here;
+	 * presumably the fmd framework persists the case buffer on the
+	 * caller's behalf -- confirm against fmd_api.c.
+	 */
+	zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD;
+}
+
+/*
+ * Read back the persistent representation of an active case.
+ */
+static zfs_case_t *
+zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+	zfs_case_t *zcp;
+
+	zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
+	zcp->zc_case = cp;
+
+	/* Pull the persisted zfs_case_data_t out of the case buffer. */
+	fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data,
+	    sizeof (zcp->zc_data));
+
+	/* Reject buffers written by a newer, unknown on-disk version. */
+	if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) {
+		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
+		return (NULL);
+	}
+
+	/*
+	 * fmd_buf_read() will have already zeroed out the remainder of the
+	 * buffer, so we don't have to do anything special if the version
+	 * doesn't include the SERD engine name.
+	 */
+
+	/* Re-arm the device-removal timer if one was pending at save time. */
+	if (zcp->zc_data.zc_has_remove_timer)
+		zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
+		    NULL, zfs_remove_timeout);
+
+	/* Link the case onto the global list and attach it to the fmd case. */
+	uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool);
+	(void) uu_list_insert_before(zfs_cases, NULL, zcp);
+
+	fmd_case_setspecific(hdl, cp, zcp);
+
+	return (zcp);
+}
+
+/*
+ * Iterate over any active cases. If any cases are associated with a pool or
+ * vdev which is no longer present on the system, close the associated case.
+ */
+static void
+zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded)
+{
+	uint64_t vdev_guid = 0;
+	uint_t c, children;
+	nvlist_t **child;
+	zfs_case_t *zcp;
+
+	/* vdev_guid stays 0 (pool-level match) if the config has no GUID */
+	(void) nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid);
+
+	/*
+	 * Mark any cases associated with this (pool, vdev) pair.
+	 */
+	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+	    zcp = uu_list_next(zfs_cases, zcp)) {
+		if (zcp->zc_data.zc_pool_guid == pool_guid &&
+		    zcp->zc_data.zc_vdev_guid == vdev_guid) {
+			zcp->zc_present = B_TRUE;
+			zcp->zc_when = *loaded;
+		}
+	}
+
+	/*
+	 * Iterate over all children.
+	 */
+	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child,
+	    &children) == 0) {
+		for (c = 0; c < children; c++)
+			zfs_mark_vdev(pool_guid, child[c], loaded);
+	}
+
+	/* L2ARC devices are tracked separately from the main vdev tree. */
+	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child,
+	    &children) == 0) {
+		for (c = 0; c < children; c++)
+			zfs_mark_vdev(pool_guid, child[c], loaded);
+	}
+
+	/* ... as are hot spares. */
+	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child,
+	    &children) == 0) {
+		for (c = 0; c < children; c++)
+			zfs_mark_vdev(pool_guid, child[c], loaded);
+	}
+}
+
+/*ARGSUSED*/
+static int
+zfs_mark_pool(zpool_handle_t *zhp, void *unused)
+{
+	zfs_case_t *zcp;
+	uint64_t pool_guid;
+	uint64_t *tod;
+	er_timeval_t loaded = { 0 };
+	nvlist_t *config, *vd;
+	uint_t nelem = 0;
+	int ret;
+
+	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
+	/*
+	 * Mark any cases associated with just this pool.
+	 */
+	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+	    zcp = uu_list_next(zfs_cases, zcp)) {
+		if (zcp->zc_data.zc_pool_guid == pool_guid &&
+		    zcp->zc_data.zc_vdev_guid == 0)
+			zcp->zc_present = B_TRUE;
+	}
+
+	/* zhp is closed on every exit path; the iterator hands us ownership. */
+	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
+		zpool_close(zhp);
+		return (-1);
+	}
+
+	/* Record the pool's load time on matching pool-level cases. */
+	(void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
+	    &tod, &nelem);
+	if (nelem == 2) {
+		loaded.ertv_sec = tod[0];
+		loaded.ertv_nsec = tod[1];
+		for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+		    zcp = uu_list_next(zfs_cases, zcp)) {
+			if (zcp->zc_data.zc_pool_guid == pool_guid &&
+			    zcp->zc_data.zc_vdev_guid == 0) {
+				zcp->zc_when = loaded;
+			}
+		}
+	}
+
+	ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd);
+	if (ret) {
+		zpool_close(zhp);
+		return (-1);
+	}
+
+	/* Recursively mark every vdev (children, l2cache, spares). */
+	zfs_mark_vdev(pool_guid, vd, &loaded);
+
+	zpool_close(zhp);
+
+	return (0);
+}
+
+/* Callback state for zpool_find_load_time(). */
+struct load_time_arg {
+	uint64_t lt_guid;	/* pool GUID being searched for */
+	er_timeval_t *lt_time;	/* out: pool load time, if found */
+	boolean_t lt_found;	/* set once the matching pool is seen */
+};
+
+/*
+ * zpool_iter() callback: locate the pool with GUID lta->lt_guid and report
+ * its ZPOOL_CONFIG_LOADED_TIME through the arg structure.
+ */
+static int
+zpool_find_load_time(zpool_handle_t *zhp, void *arg)
+{
+	struct load_time_arg *lta = arg;
+	uint64_t pool_guid;
+	uint64_t *tod;
+	nvlist_t *config;
+	uint_t nelem;
+
+	/* Already found on an earlier iteration; just release the handle. */
+	if (lta->lt_found) {
+		zpool_close(zhp);
+		return (0);
+	}
+
+	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
+	if (pool_guid != lta->lt_guid) {
+		zpool_close(zhp);
+		return (0);
+	}
+
+	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
+		zpool_close(zhp);
+		return (-1);
+	}
+
+	/* Load time is stored as a { sec, nsec } uint64 pair. */
+	if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
+	    &tod, &nelem) == 0 && nelem == 2) {
+		lta->lt_found = B_TRUE;
+		lta->lt_time->ertv_sec = tod[0];
+		lta->lt_time->ertv_nsec = tod[1];
+	}
+
+	zpool_close(zhp);
+
+	return (0);
+}
+
+static void
+zfs_purge_cases(fmd_hdl_t *hdl)
+{
+	zfs_case_t *zcp;
+	uu_list_walk_t *walk;
+	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
+
+	/*
+	 * There is no way to open a pool by GUID, or lookup a vdev by GUID. No
+	 * matter what we do, we're going to have to stomach an O(vdevs * cases)
+	 * algorithm. In reality, both quantities are likely so small that
+	 * neither will matter. Given that iterating over pools is more
+	 * expensive than iterating over the in-memory case list, we opt for a
+	 * 'present' flag in each case that starts off cleared. We then iterate
+	 * over all pools, marking those that are still present, and removing
+	 * those that aren't found.
+	 *
+	 * Note that we could also construct an FMRI and rely on
+	 * fmd_nvl_fmri_present(), but this would end up doing the same search.
+	 */
+
+	/*
+	 * Mark the cases as not present.
+	 */
+	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+	    zcp = uu_list_next(zfs_cases, zcp))
+		zcp->zc_present = B_FALSE;
+
+	/*
+	 * Iterate over all pools and mark the pools and vdevs found. If this
+	 * fails (most probably because we're out of memory), then don't close
+	 * any of the cases and we cannot be sure they are accurate.
+	 */
+	if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0)
+		return;
+
+	/*
+	 * Remove those cases which were not found.
+	 */
+	/* ROBUST walk: fmd_case_close() ends up removing entries mid-walk. */
+	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
+	while ((zcp = uu_list_walk_next(walk)) != NULL) {
+		if (!zcp->zc_present)
+			fmd_case_close(hdl, zcp->zc_case);
+	}
+	uu_list_walk_end(walk);
+}
+
+/*
+ * Construct the name of a serd engine given the pool/vdev GUID and type (io or
+ * checksum).
+ */
+static void
+zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
+    const char *type)
+{
+	/* buf must hold MAX_SERDLEN bytes; sized for two 64-bit hex GUIDs. */
+	(void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s",
+	    (long long unsigned int)pool_guid,
+	    (long long unsigned int)vdev_guid, type);
+}
+
+/*
+ * Solve a given ZFS case. This first checks to make sure the diagnosis is
+ * still valid, as well as cleaning up any pending timer associated with the
+ * case.
+ */
+static void
+zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname,
+    boolean_t checkunusable)
+{
+	nvlist_t *detector, *fault;
+	boolean_t serialize;
+	/* No FRU is ever attached on this code path; kept for API symmetry. */
+	nvlist_t *fru = NULL;
+	fmd_hdl_debug(hdl, "solving fault '%s'", faultname);
+
+	/*
+	 * Construct the detector from the case data. The detector is in the
+	 * ZFS scheme, and is either the pool or the vdev, depending on whether
+	 * this is a vdev or pool fault.
+	 */
+	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);
+
+	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
+	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
+	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
+	    zcp->zc_data.zc_pool_guid);
+	if (zcp->zc_data.zc_vdev_guid != 0) {
+		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
+		    zcp->zc_data.zc_vdev_guid);
+	}
+
+	/* 100% certainty: this DE never emits partial-confidence suspects. */
+	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
+	    fru, detector);
+	fmd_case_add_suspect(hdl, zcp->zc_case, fault);
+
+	nvlist_free(fru);
+
+	fmd_case_solve(hdl, zcp->zc_case);
+
+	/* Cancel any pending removal timer; the diagnosis is now final. */
+	serialize = B_FALSE;
+	if (zcp->zc_data.zc_has_remove_timer) {
+		fmd_timer_remove(hdl, zcp->zc_remove_timer);
+		zcp->zc_data.zc_has_remove_timer = 0;
+		serialize = B_TRUE;
+	}
+	if (serialize)
+		zfs_case_serialize(hdl, zcp);
+
+	nvlist_free(detector);
+}
+
+/* Return B_TRUE iff timestamp *a is strictly earlier than *b. */
+static boolean_t
+timeval_earlier(er_timeval_t *a, er_timeval_t *b)
+{
+	return (a->ertv_sec < b->ertv_sec ||
+	    (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec));
+}
+
+/*ARGSUSED*/
+static void
+zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when)
+{
+	int64_t *tod;
+	uint_t nelem;
+
+	/* Extract the ereport's { sec, nsec } timestamp pair, if present. */
+	if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tod,
+	    &nelem) == 0 && nelem == 2) {
+		when->ertv_sec = tod[0];
+		when->ertv_nsec = tod[1];
+	} else {
+		/*
+		 * No timestamp: treat the ereport as "newest possible" so it
+		 * is never discarded by the pre-load-time comparison.
+		 */
+		when->ertv_sec = when->ertv_nsec = UINT64_MAX;
+	}
+}
+
+/*
+ * Main fmd entry point.
+ */
+/*ARGSUSED*/
+static void
+zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
+{
+	zfs_case_t *zcp, *dcp;
+	int32_t pool_state;
+	uint64_t ena, pool_guid, vdev_guid;
+	er_timeval_t pool_load;
+	er_timeval_t er_when;
+	nvlist_t *detector;
+	boolean_t pool_found = B_FALSE;
+	boolean_t isresource;
+	char *type;
+
+	/*
+	 * We subscribe to notifications for vdev or pool removal. In these
+	 * cases, there may be cases that no longer apply. Purge any cases
+	 * that no longer apply.
+	 */
+	if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) {
+		fmd_hdl_debug(hdl, "purging orphaned cases from %s",
+		    strrchr(class, '.') + 1);
+		zfs_purge_cases(hdl);
+		zfs_stats.resource_drops.fmds_value.ui64++;
+		return;
+	}
+
+	isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*");
+
+	if (isresource) {
+		/*
+		 * For resources, we don't have a normal payload.
+		 */
+		/*
+		 * NOTE(review): a resource without a vdev GUID is treated as
+		 * pool-open context (SPA_LOAD_OPEN) -- confirm intent.
+		 */
+		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+		    &vdev_guid) != 0)
+			pool_state = SPA_LOAD_OPEN;
+		else
+			pool_state = SPA_LOAD_NONE;
+		detector = NULL;
+	} else {
+		(void) nvlist_lookup_nvlist(nvl,
+		    FM_EREPORT_DETECTOR, &detector);
+		(void) nvlist_lookup_int32(nvl,
+		    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state);
+	}
+
+	/*
+	 * We also ignore all ereports generated during an import of a pool,
+	 * since the only possible fault (.pool) would result in import failure,
+	 * and hence no persistent fault. Some day we may want to do something
+	 * with these ereports, so we continue generating them internally.
+	 */
+	if (pool_state == SPA_LOAD_IMPORT) {
+		zfs_stats.import_drops.fmds_value.ui64++;
+		fmd_hdl_debug(hdl, "ignoring '%s' during import", class);
+		return;
+	}
+
+	/*
+	 * Device I/O errors are ignored during pool open.
+	 */
+	if (pool_state == SPA_LOAD_OPEN &&
+	    (fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
+	    fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
+	    fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) {
+		fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class);
+		zfs_stats.dev_drops.fmds_value.ui64++;
+		return;
+	}
+
+	/*
+	 * We ignore ereports for anything except disks and files.
+	 */
+	if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+	    &type) == 0) {
+		if (strcmp(type, VDEV_TYPE_DISK) != 0 &&
+		    strcmp(type, VDEV_TYPE_FILE) != 0) {
+			zfs_stats.vdev_drops.fmds_value.ui64++;
+			return;
+		}
+	}
+
+	/*
+	 * Determine if this ereport corresponds to an open case.
+	 * Each vdev or pool can have a single case.
+	 */
+	/*
+	 * NOTE(review): pool_guid is left unchanged if the payload lacks a
+	 * pool GUID (return value deliberately ignored) -- the local is not
+	 * pre-initialized; verify all subscribed classes carry this member.
+	 */
+	(void) nvlist_lookup_uint64(nvl,
+	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
+	if (nvlist_lookup_uint64(nvl,
+	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
+		vdev_guid = 0;
+	if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
+		ena = 0;
+
+	zfs_ereport_when(hdl, nvl, &er_when);
+
+	/* zcp != NULL after this loop means an open case matches this vdev. */
+	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+	    zcp = uu_list_next(zfs_cases, zcp)) {
+		if (zcp->zc_data.zc_pool_guid == pool_guid) {
+			pool_found = B_TRUE;
+			pool_load = zcp->zc_when;
+		}
+		if (zcp->zc_data.zc_vdev_guid == vdev_guid)
+			break;
+	}
+
+	/*
+	 * Avoid falsely accusing a pool of being faulty. Do so by
+	 * not replaying ereports that were generated prior to the
+	 * current import. If the failure that generated them was
+	 * transient because the device was actually removed but we
+	 * didn't receive the normal asynchronous notification, we
+	 * don't want to mark it as faulted and potentially panic. If
+	 * there is still a problem we'd expect not to be able to
+	 * import the pool, or that new ereports will be generated
+	 * once the pool is used.
+	 */
+	if (pool_found && timeval_earlier(&er_when, &pool_load)) {
+		fmd_hdl_debug(hdl, "ignoring pool %llx, "
+		    "ereport time %lld.%lld, pool load time = %lld.%lld",
+		    pool_guid, er_when.ertv_sec, er_when.ertv_nsec,
+		    pool_load.ertv_sec, pool_load.ertv_nsec);
+		zfs_stats.old_drops.fmds_value.ui64++;
+		return;
+	}
+
+	if (!pool_found) {
+		/*
+		 * Haven't yet seen this pool, but same situation
+		 * may apply.
+		 */
+		libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
+		struct load_time_arg la;
+
+		la.lt_guid = pool_guid;
+		la.lt_time = &pool_load;
+		la.lt_found = B_FALSE;
+
+		if (zhdl != NULL &&
+		    zpool_iter(zhdl, zpool_find_load_time, &la) == 0 &&
+		    la.lt_found == B_TRUE) {
+			pool_found = B_TRUE;
+
+			if (timeval_earlier(&er_when, &pool_load)) {
+				fmd_hdl_debug(hdl, "ignoring pool %llx, "
+				    "ereport time %lld.%lld, "
+				    "pool load time = %lld.%lld",
+				    pool_guid, er_when.ertv_sec,
+				    er_when.ertv_nsec, pool_load.ertv_sec,
+				    pool_load.ertv_nsec);
+				zfs_stats.old_drops.fmds_value.ui64++;
+				return;
+			}
+		}
+	}
+
+	if (zcp == NULL) {
+		fmd_case_t *cs;
+		zfs_case_data_t data = { 0 };
+
+		/*
+		 * If this is one of our 'fake' resource ereports, and there is
+		 * no case open, simply discard it.
+		 */
+		if (isresource) {
+			zfs_stats.resource_drops.fmds_value.ui64++;
+			fmd_hdl_debug(hdl, "discarding '%s for vdev %llu",
+			    class, vdev_guid);
+			return;
+		}
+
+		/*
+		 * Skip tracking some ereports
+		 */
+		if (strcmp(class,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
+		    strcmp(class,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 ||
+		    strcmp(class,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) {
+			zfs_stats.resource_drops.fmds_value.ui64++;
+			return;
+		}
+
+		/*
+		 * Open a new case.
+		 */
+		cs = fmd_case_open(hdl, NULL);
+
+		fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'",
+		    vdev_guid, class);
+
+		/*
+		 * Initialize the case buffer. To commonize code, we actually
+		 * create the buffer with existing data, and then call
+		 * zfs_case_unserialize() to instantiate the in-core structure.
+		 */
+		fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t));
+
+		data.zc_version = CASE_DATA_VERSION_SERD;
+		data.zc_ena = ena;
+		data.zc_pool_guid = pool_guid;
+		data.zc_vdev_guid = vdev_guid;
+		data.zc_pool_state = (int)pool_state;
+
+		fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));
+
+		zcp = zfs_case_unserialize(hdl, cs);
+		assert(zcp != NULL);
+		if (pool_found)
+			zcp->zc_when = pool_load;
+	}
+
+	if (isresource) {
+		fmd_hdl_debug(hdl, "resource event '%s'", class);
+
+		if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) {
+			/*
+			 * The 'resource.fs.zfs.autoreplace' event indicates
+			 * that the pool was loaded with the 'autoreplace'
+			 * property set. In this case, any pending device
+			 * failures should be ignored, as the asynchronous
+			 * autoreplace handling will take care of them.
+			 */
+			fmd_case_close(hdl, zcp->zc_case);
+		} else if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) {
+			/*
+			 * The 'resource.fs.zfs.removed' event indicates that
+			 * device removal was detected, and the device was
+			 * closed asynchronously. If this is the case, we
+			 * assume that any recent I/O errors were due to the
+			 * device removal, not any fault of the device itself.
+			 * We reset the SERD engine, and cancel any pending
+			 * timers.
+			 */
+			if (zcp->zc_data.zc_has_remove_timer) {
+				fmd_timer_remove(hdl, zcp->zc_remove_timer);
+				zcp->zc_data.zc_has_remove_timer = 0;
+				zfs_case_serialize(hdl, zcp);
+			}
+			if (zcp->zc_data.zc_serd_io[0] != '\0')
+				fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io);
+			if (zcp->zc_data.zc_serd_checksum[0] != '\0')
+				fmd_serd_reset(hdl,
+				    zcp->zc_data.zc_serd_checksum);
+		} else if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
+			uint64_t state = 0;
+
+			/*
+			 * NOTE(review): zcp cannot be NULL here (it was
+			 * dereferenced above); the check is redundant but
+			 * harmless.
+			 */
+			if (zcp != NULL &&
+			    nvlist_lookup_uint64(nvl,
+			    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 &&
+			    state == VDEV_STATE_HEALTHY) {
+				fmd_hdl_debug(hdl, "closing case after a "
+				    "device statechange to healthy");
+				fmd_case_close(hdl, zcp->zc_case);
+			}
+		}
+		zfs_stats.resource_drops.fmds_value.ui64++;
+		return;
+	}
+
+	/*
+	 * Associate the ereport with this case.
+	 */
+	fmd_case_add_ereport(hdl, zcp->zc_case, ep);
+
+	/*
+	 * Don't do anything else if this case is already solved.
+	 */
+	if (fmd_case_solved(hdl, zcp->zc_case))
+		return;
+
+	fmd_hdl_debug(hdl, "error event '%s'", class);
+
+	/*
+	 * Determine if we should solve the case and generate a fault. We solve
+	 * a case if:
+	 *
+	 * 	a. A pool failed to open (ereport.fs.zfs.pool)
+	 * 	b. A device failed to open (ereport.fs.zfs.pool) while a pool
+	 * 	   was up and running.
+	 *
+	 * We may see a series of ereports associated with a pool open, all
+	 * chained together by the same ENA. If the pool open succeeds, then
+	 * we'll see no further ereports. To detect when a pool open has
+	 * succeeded, we associate a timer with the event. When it expires, we
+	 * close the case.
+	 */
+	if (fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) {
+		/*
+		 * Pool level fault. Before solving the case, go through and
+		 * close any open device cases that may be pending.
+		 */
+		for (dcp = uu_list_first(zfs_cases); dcp != NULL;
+		    dcp = uu_list_next(zfs_cases, dcp)) {
+			if (dcp->zc_data.zc_pool_guid ==
+			    zcp->zc_data.zc_pool_guid &&
+			    dcp->zc_data.zc_vdev_guid != 0)
+				fmd_case_close(hdl, dcp->zc_case);
+		}
+
+		zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool", B_TRUE);
+	} else if (fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) {
+		/*
+		 * Pool level fault for reading the intent logs.
+		 */
+		zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay", B_TRUE);
+	} else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) {
+		/*
+		 * Device fault.
+		 */
+		zfs_case_solve(hdl, zcp, "fault.fs.zfs.device", B_TRUE);
+	} else if (fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
+	    fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
+	    fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
+	    fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
+		char *failmode = NULL;
+		boolean_t checkremove = B_FALSE;
+
+		/*
+		 * If this is a checksum or I/O error, then toss it into the
+		 * appropriate SERD engine and check to see if it has fired.
+		 * Ideally, we want to do something more sophisticated,
+		 * (persistent errors for a single data block, etc). For now,
+		 * a single SERD engine is sufficient.
+		 */
+		if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) {
+			/* Lazily create the io SERD engine on first error. */
+			if (zcp->zc_data.zc_serd_io[0] == '\0') {
+				zfs_serd_name(zcp->zc_data.zc_serd_io,
+				    pool_guid, vdev_guid, "io");
+				fmd_serd_create(hdl, zcp->zc_data.zc_serd_io,
+				    fmd_prop_get_int32(hdl, "io_N"),
+				    fmd_prop_get_int64(hdl, "io_T"));
+				zfs_case_serialize(hdl, zcp);
+			}
+			if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
+				checkremove = B_TRUE;
+		} else if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
+			if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
+				zfs_serd_name(zcp->zc_data.zc_serd_checksum,
+				    pool_guid, vdev_guid, "checksum");
+				fmd_serd_create(hdl,
+				    zcp->zc_data.zc_serd_checksum,
+				    fmd_prop_get_int32(hdl, "checksum_N"),
+				    fmd_prop_get_int64(hdl, "checksum_T"));
+				zfs_case_serialize(hdl, zcp);
+			}
+			if (fmd_serd_record(hdl,
+			    zcp->zc_data.zc_serd_checksum, ep)) {
+				zfs_case_solve(hdl, zcp,
+				    "fault.fs.zfs.vdev.checksum", B_FALSE);
+			}
+		} else if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) &&
+		    (nvlist_lookup_string(nvl,
+		    FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) &&
+		    failmode != NULL) {
+			if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE,
+			    strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) {
+				zfs_case_solve(hdl, zcp,
+				    "fault.fs.zfs.io_failure_continue",
+				    B_FALSE);
+			} else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT,
+			    strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) {
+				zfs_case_solve(hdl, zcp,
+				    "fault.fs.zfs.io_failure_wait", B_FALSE);
+			}
+		} else if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
+#ifndef __linux__
+			/* This causes an unexpected fault diagnosis on linux */
+			checkremove = B_TRUE;
+#endif
+		}
+
+		/*
+		 * Because I/O errors may be due to device removal, we postpone
+		 * any diagnosis until we're sure that we aren't about to
+		 * receive a 'resource.fs.zfs.removed' event.
+		 */
+		if (checkremove) {
+			if (zcp->zc_data.zc_has_remove_timer)
+				fmd_timer_remove(hdl, zcp->zc_remove_timer);
+			zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL,
+			    zfs_remove_timeout);
+			if (!zcp->zc_data.zc_has_remove_timer) {
+				zcp->zc_data.zc_has_remove_timer = 1;
+				zfs_case_serialize(hdl, zcp);
+			}
+		}
+	}
+}
+
+/*
+ * The timeout is fired when we diagnosed an I/O error, and it was not due to
+ * device removal (which would cause the timeout to be cancelled).
+ */
+/* ARGSUSED */
+static void
+zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data)
+{
+	zfs_case_t *zcp = data;
+
+	/* Only act on the timer we armed; stale ids are ignored. */
+	if (id == zcp->zc_remove_timer)
+		zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io", B_FALSE);
+}
+
+/*
+ * The specified case has been closed and any case-specific
+ * data structures should be deallocated.
+ */
+static void
+zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
+{
+	zfs_case_t *zcp = fmd_case_getspecific(hdl, cs);
+
+	/* Destroy any SERD engines the case created ("" means none). */
+	if (zcp->zc_data.zc_serd_checksum[0] != '\0')
+		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
+	if (zcp->zc_data.zc_serd_io[0] != '\0')
+		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
+	if (zcp->zc_data.zc_has_remove_timer)
+		fmd_timer_remove(hdl, zcp->zc_remove_timer);
+
+	/* Unlink from the global case list and release the in-core state. */
+	uu_list_remove(zfs_cases, zcp);
+	uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
+	fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
+}
+
+/*
+ * We use the fmd gc entry point to look for old cases that no longer apply.
+ * This allows us to keep our set of case data small in a long running system.
+ */
+static void
+zfs_fm_gc(fmd_hdl_t *hdl)
+{
+	/* Garbage collection == close cases whose pool/vdev disappeared. */
+	zfs_purge_cases(hdl);
+}
+
+static const fmd_hdl_ops_t fmd_ops = {
+	zfs_fm_recv,	/* fmdo_recv */
+	zfs_fm_timeout,	/* fmdo_timeout */
+	zfs_fm_close,	/* fmdo_close */
+	NULL,		/* fmdo_stats */
+	zfs_fm_gc,	/* fmdo_gc */
+};
+
+/*
+ * Tunables: N errors within time T trip the corresponding SERD engine;
+ * remove_timeout is the grace period before an I/O fault is diagnosed.
+ */
+static const fmd_prop_t fmd_props[] = {
+	{ "checksum_N", FMD_TYPE_UINT32, "10" },
+	{ "checksum_T", FMD_TYPE_TIME, "10min" },
+	{ "io_N", FMD_TYPE_UINT32, "10" },
+	{ "io_T", FMD_TYPE_TIME, "10min" },
+	{ "remove_timeout", FMD_TYPE_TIME, "15sec" },
+	{ NULL, 0, NULL }
+};
+
+static const fmd_hdl_info_t fmd_info = {
+	"ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props
+};
+
+void
+_zfs_diagnosis_init(fmd_hdl_t *hdl)
+{
+	libzfs_handle_t *zhdl;
+
+	/*
+	 * Any failure below unwinds what was built so far and returns
+	 * silently, leaving the module unregistered (no error reported
+	 * to the caller).
+	 */
+	if ((zhdl = libzfs_init()) == NULL)
+		return;
+
+	if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool",
+	    sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node),
+	    NULL, UU_LIST_POOL_DEBUG)) == NULL) {
+		libzfs_fini(zhdl);
+		return;
+	}
+
+	if ((zfs_cases = uu_list_create(zfs_case_pool, NULL,
+	    UU_LIST_DEBUG)) == NULL) {
+		uu_list_pool_destroy(zfs_case_pool);
+		libzfs_fini(zhdl);
+		return;
+	}
+
+	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
+		uu_list_destroy(zfs_cases);
+		uu_list_pool_destroy(zfs_case_pool);
+		libzfs_fini(zhdl);
+		return;
+	}
+
+	/* Stash the libzfs handle; retrieved via fmd_hdl_getspecific(). */
+	fmd_hdl_setspecific(hdl, zhdl);
+
+	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
+	    sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);
+
+	zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
+}
+
+void
+_zfs_diagnosis_fini(fmd_hdl_t *hdl)
+{
+	zfs_case_t *zcp;
+	uu_list_walk_t *walk;
+	libzfs_handle_t *zhdl;
+
+	/*
+	 * Remove all active cases.
+	 */
+	/* ROBUST walk is required because entries are removed mid-iteration. */
+	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
+	while ((zcp = uu_list_walk_next(walk)) != NULL) {
+		fmd_hdl_debug(hdl, "removing case ena %llu",
+		    (long long unsigned)zcp->zc_data.zc_ena);
+		uu_list_remove(zfs_cases, zcp);
+		uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
+		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
+	}
+	uu_list_walk_end(walk);
+
+	uu_list_destroy(zfs_cases);
+	uu_list_pool_destroy(zfs_case_pool);
+
+	/* Release the libzfs handle acquired in _zfs_diagnosis_init(). */
+	zhdl = fmd_hdl_getspecific(hdl);
+	libzfs_fini(zhdl);
+}
diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c
new file mode 100644
index 000000000000..8d0a3b420086
--- /dev/null
+++ b/cmd/zed/agents/zfs_mod.c
@@ -0,0 +1,956 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2016, 2017, Intel Corporation.
+ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ */
+
+/*
+ * ZFS syseventd module.
+ *
+ * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c
+ *
+ * The purpose of this module is to identify when devices are added to the
+ * system, and appropriately online or replace the affected vdevs.
+ *
+ * When a device is added to the system:
+ *
+ * 1. Search for any vdevs whose devid matches that of the newly added
+ * device.
+ *
+ * 2. If no vdevs are found, then search for any vdevs whose udev path
+ * matches that of the new device.
+ *
+ * 3. If no vdevs match by either method, then ignore the event.
+ *
+ * 4. Attempt to online the device with a flag to indicate that it should
+ * be unspared when resilvering completes. If this succeeds, then the
+ * same device was inserted and we should continue normally.
+ *
+ * 5. If the pool does not have the 'autoreplace' property set, attempt to
+ * online the device again without the unspare flag, which will
+ * generate a FMA fault.
+ *
+ * 6. If the pool has the 'autoreplace' property set, and the matching vdev
+ * is a whole disk, then label the new disk and attempt a 'zpool
+ * replace'.
+ *
+ * The module responds to EC_DEV_ADD events. The special ESC_ZFS_VDEV_CHECK
+ * event indicates that a device failed to open during pool load, but the
+ * autoreplace property was set. In this case, we deferred the associated
+ * FMA fault until our module had a chance to process the autoreplace logic.
+ * If the device could not be replaced, then the second online attempt will
+ * trigger the FMA fault that we skipped earlier.
+ *
+ * ZFS on Linux porting notes:
+ * Linux udev provides a disk insert for both the disk and the partition
+ *
+ */
+
+#include <ctype.h>
+#include <fcntl.h>
+#include <libnvpair.h>
+#include <libzfs.h>
+#include <libzutil.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>
+#include <sys/list.h>
+#include <sys/sunddi.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/dev.h>
+#include <thread_pool.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <errno.h>
+#include "zfs_agents.h"
+#include "../zed_log.h"
+
+#define DEV_BYID_PATH "/dev/disk/by-id/"
+#define DEV_BYPATH_PATH "/dev/disk/by-path/"
+#define DEV_BYVDEV_PATH "/dev/disk/by-vdev/"
+
+/* Callback applied to each (pool, vdev) pair matched by the iterators below */
+typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);
+
+/*
+ * Module-global state shared by the event handlers in this file.  Event
+ * delivery is single threaded (see zfs_slm_deliver_event()); the only other
+ * threads are the startup zfs_enum_pools() thread and the dataset-enable
+ * thread pool, which touch g_pool_list only.
+ */
+libzfs_handle_t *g_zfshdl;
+list_t g_pool_list;	/* list of unavailable pools at initialization */
+list_t g_device_list;	/* list of disks with asynchronous label request */
+tpool_t *g_tpool;
+boolean_t g_enumeration_done;
+pthread_t g_zfs_tid;	/* zfs_enum_pools() thread */
+
+/* A pool found unavailable at startup; datasets enabled once it recovers */
+typedef struct unavailpool {
+	zpool_handle_t *uap_zhp;
+	list_node_t uap_node;
+} unavailpool_t;
+
+/* A whole disk whose asynchronous labeling/partitioning is in flight */
+typedef struct pendingdev {
+	char pd_physpath[128];	/* key matched against the partition-add event */
+	list_node_t pd_node;
+} pendingdev_t;
+
+/*
+ * Return the state (a vdev_state_t value) of the pool's root vdev, taken
+ * from the vdev stats in the pool's current config.
+ */
+static int
+zfs_toplevel_state(zpool_handle_t *zhp)
+{
+	nvlist_t *config, *vdev_root;
+	vdev_stat_t *stats;
+	unsigned int count;
+
+	config = zpool_get_config(zhp, NULL);
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &vdev_root) == 0);
+	verify(nvlist_lookup_uint64_array(vdev_root, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&stats, &count) == 0);
+	return (stats->vs_state);
+}
+
+/*
+ * zpool_iter() callback used at startup: collect pools whose top-level vdev
+ * is below DEGRADED (i.e. unavailable) onto the caller-supplied list so
+ * their datasets can be enabled later; close the handle of healthy pools.
+ * Always returns 0 so iteration continues over every pool.
+ */
+static int
+zfs_unavail_pool(zpool_handle_t *zhp, void *data)
+{
+	zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)",
+	    zpool_get_name(zhp), (int)zfs_toplevel_state(zhp));
+
+	if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) {
+		unavailpool_t *uap;
+		uap = malloc(sizeof (unavailpool_t));
+		if (uap == NULL) {
+			/*
+			 * Best effort: on allocation failure skip tracking
+			 * this pool rather than dereferencing NULL; the
+			 * handle must still be released.
+			 */
+			zpool_close(zhp);
+			return (0);
+		}
+		uap->uap_zhp = zhp;
+		list_insert_tail((list_t *)data, uap);
+	} else {
+		zpool_close(zhp);
+	}
+	return (0);
+}
+
+/*
+ * Two stage replace on Linux
+ * since we get disk notifications
+ * we can wait for partitioned disk slice to show up!
+ *
+ * First stage tags the disk, initiates async partitioning, and returns
+ * Second stage finds the tag and proceeds to ZFS labeling/replace
+ *
+ * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach
+ *
+ * 1. physical match with no fs, no partition
+ * tag it top, partition disk
+ *
+ * 2. physical match again, see partition and tag
+ *
+ */
+
+/*
+ * The device associated with the given vdev (either by devid or physical path)
+ * has been added to the system. If 'isdisk' is set, then we only attempt a
+ * replacement if it's a whole disk. This also implies that we should label the
+ * disk first.
+ *
+ * First, we attempt to online the device (making sure to undo any spare
+ * operation when finished). If this succeeds, then we're done. If it fails,
+ * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
+ * but that the label was not what we expected. If the 'autoreplace' property
+ * is enabled, then we relabel the disk (if specified), and attempt a 'zpool
+ * replace'. If the online is successful, but the new state is something else
+ * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
+ * race, and we should avoid attempting to relabel the disk.
+ *
+ * Also can arrive here from a ESC_ZFS_VDEV_CHECK event
+ */
+static void
+zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
+{
+	char *path;
+	vdev_state_t newstate;
+	nvlist_t *nvroot, *newvd;
+	pendingdev_t *device;
+	uint64_t wholedisk = 0ULL;
+	uint64_t offline = 0ULL;
+	uint64_t guid = 0ULL;
+	char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
+	char rawpath[PATH_MAX], fullpath[PATH_MAX];
+	char devpath[PATH_MAX];
+	int ret;
+	boolean_t is_dm = B_FALSE;
+	boolean_t is_sd = B_FALSE;
+	uint_t c;
+	vdev_stat_t *vs;
+
+	/* A vdev without a path cannot be matched to a device; bail out */
+	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
+		return;
+
+	/* Skip healthy disks */
+	verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &c) == 0);
+	if (vs->vs_state == VDEV_STATE_HEALTHY) {
+		zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.",
+		    __func__, path);
+		return;
+	}
+
+	/* All of these are optional; absent values keep their defaults */
+	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
+	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
+	    &enc_sysfs_path);
+	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
+	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
+	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid);
+
+	if (offline)
+		return; /* don't intervene if it was taken offline */
+
+	is_dm = zfs_dev_is_dm(path);
+	zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
+	    " wholedisk %d, %s dm (guid %llu)", zpool_get_name(zhp), path,
+	    physpath ? physpath : "NULL", wholedisk, is_dm ? "is" : "not",
+	    (long long unsigned int)guid);
+
+	/*
+	 * The VDEV guid is preferred for identification (gets passed in path)
+	 */
+	if (guid != 0) {
+		(void) snprintf(fullpath, sizeof (fullpath), "%llu",
+		    (long long unsigned int)guid);
+	} else {
+		/*
+		 * otherwise use path sans partition suffix for whole disks
+		 */
+		(void) strlcpy(fullpath, path, sizeof (fullpath));
+		if (wholedisk) {
+			char *spath = zfs_strip_partition(fullpath);
+			if (!spath) {
+				zed_log_msg(LOG_INFO, "%s: Can't alloc",
+				    __func__);
+				return;
+			}
+
+			(void) strlcpy(fullpath, spath, sizeof (fullpath));
+			free(spath);
+		}
+	}
+
+	/*
+	 * Attempt to online the device.  If it comes back HEALTHY or
+	 * DEGRADED the same device was re-inserted and we are done.
+	 */
+	if (zpool_vdev_online(zhp, fullpath,
+	    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
+	    (newstate == VDEV_STATE_HEALTHY ||
+	    newstate == VDEV_STATE_DEGRADED)) {
+		zed_log_msg(LOG_INFO, "  zpool_vdev_online: vdev %s is %s",
+		    fullpath, (newstate == VDEV_STATE_HEALTHY) ?
+		    "HEALTHY" : "DEGRADED");
+		return;
+	}
+
+	/*
+	 * vdev_id alias rule for using scsi_debug devices (FMA automated
+	 * testing)
+	 */
+	if (physpath != NULL && strcmp("scsidebug", physpath) == 0)
+		is_sd = B_TRUE;
+
+	/*
+	 * If the pool doesn't have the autoreplace property set, then use
+	 * vdev online to trigger a FMA fault by posting an ereport.
+	 */
+	if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
+	    !(wholedisk || is_dm) || (physpath == NULL)) {
+		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
+		    &newstate);
+		zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or "
+		    "not a whole disk for '%s'", fullpath);
+		return;
+	}
+
+	/*
+	 * Convert physical path into its current device node.  Rawpath
+	 * needs to be /dev/disk/by-vdev for a scsi_debug device since
+	 * /dev/disk/by-path will not be present.
+	 */
+	(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
+	    is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);
+
+	if (realpath(rawpath, devpath) == NULL && !is_dm) {
+		zed_log_msg(LOG_INFO, "  realpath: %s failed (%s)",
+		    rawpath, strerror(errno));
+
+		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
+		    &newstate);
+
+		zed_log_msg(LOG_INFO, "  zpool_vdev_online: %s FORCEFAULT (%s)",
+		    fullpath, libzfs_error_description(g_zfshdl));
+		return;
+	}
+
+	/* Only autoreplace bad disks */
+	if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
+	    (vs->vs_state != VDEV_STATE_FAULTED) &&
+	    (vs->vs_state != VDEV_STATE_CANT_OPEN)) {
+		return;
+	}
+
+	/* "new_devid" is stashed on the vdev nvlist by zfs_iter_vdev() */
+	nvlist_lookup_string(vdev, "new_devid", &new_devid);
+
+	if (is_dm) {
+		/* Don't label device mapper or multipath disks. */
+	} else if (!labeled) {
+		/*
+		 * we're auto-replacing a raw disk, so label it first
+		 */
+		char *leafname;
+
+		/*
+		 * If this is a request to label a whole disk, then attempt to
+		 * write out the label.  Before we can label the disk, we need
+		 * to map the physical string that was matched on to the under
+		 * lying device node.
+		 *
+		 * If any part of this process fails, then do a force online
+		 * to trigger a ZFS fault for the device (and any hot spare
+		 * replacement).
+		 */
+		/* devpath came from a successful realpath(), so has a '/' */
+		leafname = strrchr(devpath, '/') + 1;
+
+		/*
+		 * If this is a request to label a whole disk, then attempt to
+		 * write out the label.
+		 */
+		if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) {
+			zed_log_msg(LOG_INFO, "  zpool_label_disk: could not "
+			    "label '%s' (%s)", leafname,
+			    libzfs_error_description(g_zfshdl));
+
+			(void) zpool_vdev_online(zhp, fullpath,
+			    ZFS_ONLINE_FORCEFAULT, &newstate);
+			return;
+		}
+
+		/*
+		 * The disk labeling is asynchronous on Linux.  Just record
+		 * this label request and return as there will be another
+		 * disk add event for the partition after the labeling is
+		 * completed.
+		 */
+		/* NOTE(review): malloc result is used unchecked here */
+		device = malloc(sizeof (pendingdev_t));
+		(void) strlcpy(device->pd_physpath, physpath,
+		    sizeof (device->pd_physpath));
+		list_insert_tail(&g_device_list, device);
+
+		zed_log_msg(LOG_INFO, "  zpool_label_disk: async '%s' (%llu)",
+		    leafname, (u_longlong_t)guid);
+
+		return;	/* resumes at EC_DEV_ADD.ESC_DISK for partition */
+
+	} else /* labeled */ {
+		boolean_t found = B_FALSE;
+		/*
+		 * match up with request above to label the disk
+		 */
+		for (device = list_head(&g_device_list); device != NULL;
+		    device = list_next(&g_device_list, device)) {
+			if (strcmp(physpath, device->pd_physpath) == 0) {
+				list_remove(&g_device_list, device);
+				free(device);
+				found = B_TRUE;
+				break;
+			}
+			zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s",
+			    physpath, device->pd_physpath);
+		}
+		if (!found) {
+			/* unexpected partition slice encountered */
+			zed_log_msg(LOG_INFO, "labeled disk %s unexpected here",
+			    fullpath);
+			(void) zpool_vdev_online(zhp, fullpath,
+			    ZFS_ONLINE_FORCEFAULT, &newstate);
+			return;
+		}
+
+		zed_log_msg(LOG_INFO, "  zpool_label_disk: resume '%s' (%llu)",
+		    physpath, (u_longlong_t)guid);
+
+		/*
+		 * Replacement target is addressed by its devid from here on;
+		 * NOTE(review): assumes new_devid was found above — confirm.
+		 */
+		(void) snprintf(devpath, sizeof (devpath), "%s%s",
+		    DEV_BYID_PATH, new_devid);
+	}
+
+	/*
+	 * Construct the root vdev to pass to zpool_vdev_attach().  While adding
+	 * the entire vdev structure is harmless, we construct a reduced set of
+	 * path/physpath/wholedisk to keep it simple.
+	 */
+	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) {
+		zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
+		return;
+	}
+	if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
+		zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
+		nvlist_free(nvroot);
+		return;
+	}
+
+	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
+	    nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
+	    nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 ||
+	    (physpath != NULL && nvlist_add_string(newvd,
+	    ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
+	    (enc_sysfs_path != NULL && nvlist_add_string(newvd,
+	    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) ||
+	    nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
+	    nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
+	    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
+	    1) != 0) {
+		zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs");
+		nvlist_free(newvd);
+		nvlist_free(nvroot);
+		return;
+	}
+
+	/* newvd was copied into nvroot by nvlist_add_nvlist_array() */
+	nvlist_free(newvd);
+
+	/*
+	 * Wait for udev to verify the links exist, then auto-replace
+	 * the leaf disk at same physical location.
+	 */
+	if (zpool_label_disk_wait(path, 3000) != 0) {
+		zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement "
+		    "disk %s is missing", path);
+		nvlist_free(nvroot);
+		return;
+	}
+
+	ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE);
+
+	zed_log_msg(LOG_INFO, "  zpool_vdev_replace: %s with %s (%s)",
+	    fullpath, path, (ret == 0) ? "no errors" :
+	    libzfs_error_description(g_zfshdl));
+
+	nvlist_free(nvroot);
+}
+
+/*
+ * Utility functions to find a vdev matching given criteria.
+ */
+typedef struct dev_data {
+ const char *dd_compare;
+ const char *dd_prop;
+ zfs_process_func_t dd_func;
+ boolean_t dd_found;
+ boolean_t dd_islabeled;
+ uint64_t dd_pool_guid;
+ uint64_t dd_vdev_guid;
+ const char *dd_new_devid;
+} dev_data_t;
+
+/*
+ * Recursively walk a vdev tree (including spares and L2 cache devices),
+ * looking for the vdev identified by 'data' (a dev_data_t): by guid when
+ * dd_vdev_guid is set, otherwise by comparing the nvlist property named
+ * dd_prop against dd_compare.  On a match, dd_found is set, the new devid
+ * (if any) is stashed on the vdev nvlist, and dd_func is applied.
+ */
+static void
+zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
+{
+	dev_data_t *dp = data;
+	char *path = NULL;
+	uint_t c, children;
+	nvlist_t **child;
+
+	/*
+	 * First iterate over any children.
+	 */
+	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++)
+			zfs_iter_vdev(zhp, child[c], data);
+	}
+
+	/*
+	 * Iterate over any spares and cache devices
+	 */
+	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++)
+			zfs_iter_vdev(zhp, child[c], data);
+	}
+	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++)
+			zfs_iter_vdev(zhp, child[c], data);
+	}
+
+	/* once a vdev was matched and processed there is nothing left to do */
+	if (dp->dd_found)
+		return;
+
+	/*
+	 * Match by GUID if available otherwise fallback to devid or physical
+	 */
+	if (dp->dd_vdev_guid != 0) {
+		uint64_t guid;
+
+		if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
+		    &guid) != 0 || guid != dp->dd_vdev_guid) {
+			return;
+		}
+		zed_log_msg(LOG_INFO, "  zfs_iter_vdev: matched on %llu", guid);
+		dp->dd_found = B_TRUE;
+
+	} else if (dp->dd_compare != NULL) {
+		/*
+		 * NOTE: On Linux there is an event for partition, so unlike
+		 * illumos, substring matching is not required to accommodate
+		 * the partition suffix. An exact match will be present in
+		 * the dp->dd_compare value.
+		 */
+		if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
+		    strcmp(dp->dd_compare, path) != 0)
+			return;
+
+		zed_log_msg(LOG_INFO, "  zfs_iter_vdev: matched %s on %s",
+		    dp->dd_prop, path);
+		dp->dd_found = B_TRUE;
+
+		/* pass the new devid for use by replacing code */
+		if (dp->dd_new_devid != NULL) {
+			(void) nvlist_add_string(nvl, "new_devid",
+			    dp->dd_new_devid);
+		}
+	}
+
+	/* reached only when this vdev matched (or no criteria were given) */
+	(dp->dd_func)(zhp, nvl, dp->dd_islabeled);
+}
+
+/*
+ * Thread-pool worker: mount and share the datasets of a pool that has just
+ * transitioned to available, then release the pool handle and list entry.
+ */
+static void
+zfs_enable_ds(void *arg)
+{
+	unavailpool_t *uap = arg;
+
+	(void) zpool_enable_datasets(uap->uap_zhp, NULL, 0);
+	zpool_close(uap->uap_zhp);
+	free(uap);
+}
+
+/*
+ * zpool_iter() callback: apply dd_func to any vdev of this pool matching
+ * the criteria in 'data' (see zfs_iter_vdev()), restricted to the pool
+ * with dd_pool_guid when that is set.  After initial enumeration has
+ * completed, also dispatch zfs_enable_ds() for a tracked pool that has
+ * become available.  Returns dd_found so iteration stops after a match.
+ */
+static int
+zfs_iter_pool(zpool_handle_t *zhp, void *data)
+{
+	nvlist_t *config, *nvl;
+	dev_data_t *dp = data;
+	uint64_t pool_guid;
+	unavailpool_t *pool;
+
+	zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)",
+	    zpool_get_name(zhp), dp->dd_vdev_guid ? "GUID" : dp->dd_prop);
+
+	/*
+	 * For each vdev in this pool, look for a match to apply dd_func
+	 */
+	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
+		if (dp->dd_pool_guid == 0 ||
+		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+		    &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
+			(void) nvlist_lookup_nvlist(config,
+			    ZPOOL_CONFIG_VDEV_TREE, &nvl);
+			zfs_iter_vdev(zhp, nvl, data);
+		}
+	}
+
+	/*
+	 * if this pool was originally unavailable,
+	 * then enable its datasets asynchronously
+	 */
+	if (g_enumeration_done)  {
+		for (pool = list_head(&g_pool_list); pool != NULL;
+		    pool = list_next(&g_pool_list, pool)) {
+
+			if (strcmp(zpool_get_name(zhp),
+			    zpool_get_name(pool->uap_zhp)))
+				continue;
+			if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) {
+				list_remove(&g_pool_list, pool);
+				(void) tpool_dispatch(g_tpool, zfs_enable_ds,
+				    pool);
+				break;
+			}
+		}
+	}
+
+	zpool_close(zhp);
+	return (dp->dd_found);	/* cease iteration after a match */
+}
+
+/*
+ * Iterate over every (pool, vdev) pair whose ZPOOL_CONFIG_PHYS_PATH matches
+ * the given physical location, applying 'func' to each.  Returns whether a
+ * matching vdev was found.
+ */
+static boolean_t
+devphys_iter(const char *physical, const char *devid, zfs_process_func_t func,
+    boolean_t is_slice)
+{
+	dev_data_t data = {
+		.dd_compare = physical,
+		.dd_func = func,
+		.dd_prop = ZPOOL_CONFIG_PHYS_PATH,
+		.dd_found = B_FALSE,
+		.dd_islabeled = is_slice,
+		.dd_new_devid = devid,	/* used by auto replace code */
+	};
+
+	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
+
+	return (data.dd_found);
+}
+
+/*
+ * Iterate over every (pool, vdev) pair whose ZPOOL_CONFIG_DEVID matches the
+ * given device identifier, applying 'func' to each.  On Linux a devid match
+ * is always a whole disk.  Returns whether a matching vdev was found.
+ */
+static boolean_t
+devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice)
+{
+	dev_data_t data = {
+		.dd_compare = devid,
+		.dd_func = func,
+		.dd_prop = ZPOOL_CONFIG_DEVID,
+		.dd_found = B_FALSE,
+		.dd_islabeled = is_slice,
+		.dd_new_devid = devid,
+	};
+
+	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
+
+	return (data.dd_found);
+}
+
+/*
+ * Handle a EC_DEV_ADD.ESC_DISK event.
+ *
+ * illumos
+ * Expects: DEV_PHYS_PATH string in schema
+ * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
+ *
+ * path: '/dev/dsk/c0t1d0s0' (persistent)
+ * devid: 'id1,sd@SATA_____Hitachi_HDS72101______JP2940HZ3H74MC/a'
+ * phys_path: '/pci@0,0/pci103c,1609@11/disk@1,0:a'
+ *
+ * linux
+ * provides: DEV_PHYS_PATH and DEV_IDENTIFIER strings in schema
+ * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
+ *
+ * path: '/dev/sdc1' (not persistent)
+ * devid: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1'
+ * phys_path: 'pci-0000:04:00.0-sas-0x4433221106000000-lun-0'
+ */
+static int
+zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
+{
+	char *devpath = NULL, *devid;
+	boolean_t is_slice;
+
+	/*
+	 * Expecting a devid string and an optional physical location
+	 */
+	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0)
+		return (-1);
+
+	(void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath);
+
+	/* DEV_IS_PART is a boolean flag: present means a partition event */
+	is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0);
+
+	zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)",
+	    devid, devpath ? devpath : "NULL", is_slice);
+
+	/*
+	 * Iterate over all vdevs looking for a match in the following order:
+	 * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk)
+	 * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location).
+	 *
+	 * For disks, we only want to pay attention to vdevs marked as whole
+	 * disks or are a multipath device.
+	 */
+	if (!devid_iter(devid, zfs_process_add, is_slice) && devpath != NULL)
+		(void) devphys_iter(devpath, devid, zfs_process_add, is_slice);
+
+	/* NOTE(review): is_lofi is currently unused on Linux */
+	return (0);
+}
+
+/*
+ * Called when we receive a VDEV_CHECK event, which indicates a device could not
+ * be opened during initial pool open, but the autoreplace property was set on
+ * the pool. In this case, we treat it as if it were an add event.
+ */
+static int
+zfs_deliver_check(nvlist_t *nvl)
+{
+	dev_data_t data = { 0 };
+
+	/* Both guids are required; a zero vdev guid cannot be matched */
+	if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID,
+	    &data.dd_pool_guid) != 0 ||
+	    nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID,
+	    &data.dd_vdev_guid) != 0 ||
+	    data.dd_vdev_guid == 0)
+		return (0);
+
+	/*
+	 * Cast to u_longlong_t for %llu: passing a bare uint64_t is
+	 * undefined behavior on LP64 where uint64_t is unsigned long.
+	 */
+	zed_log_msg(LOG_INFO, "zfs_deliver_check: pool '%llu', vdev %llu",
+	    (u_longlong_t)data.dd_pool_guid,
+	    (u_longlong_t)data.dd_vdev_guid);
+
+	data.dd_func = zfs_process_add;
+
+	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
+
+	return (0);
+}
+
+/*
+ * zpool_iter() callback for zfs_deliver_dle(): look up 'data' (a vdev guid
+ * string or a physical path, per zfs_deliver_dle()) in this pool.  If found
+ * and the vdev is a whole disk, reopen the pool so the kernel picks up the
+ * expanded size; with autoexpand set, also online the vdev.  Returns 1 to
+ * stop iteration when the device belongs to this pool, 0 otherwise.
+ */
+static int
+zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
+{
+	char *devname = data;
+	boolean_t avail_spare, l2cache;
+	nvlist_t *tgt;
+	int error;
+
+	zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'",
+	    devname, zpool_get_name(zhp));
+
+	if ((tgt = zpool_find_vdev_by_physpath(zhp, devname,
+	    &avail_spare, &l2cache, NULL)) != NULL) {
+		char *path, fullpath[MAXPATHLEN];
+		uint64_t wholedisk;
+
+		error = nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &path);
+		if (error) {
+			zpool_close(zhp);
+			return (0);
+		}
+
+		error = nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
+		    &wholedisk);
+		if (error)
+			wholedisk = 0;
+
+		if (wholedisk) {
+			/* strip any partition suffix off the leaf name */
+			path = strrchr(path, '/');
+			if (path != NULL) {
+				path = zfs_strip_partition(path + 1);
+				if (path == NULL) {
+					zpool_close(zhp);
+					return (0);
+				}
+			} else {
+				zpool_close(zhp);
+				return (0);
+			}
+
+			(void) strlcpy(fullpath, path, sizeof (fullpath));
+			free(path);
+
+			/*
+			 * We need to reopen the pool associated with this
+			 * device so that the kernel can update the size of
+			 * the expanded device.  When expanding there is no
+			 * need to restart the scrub from the beginning.
+			 */
+			boolean_t scrub_restart = B_FALSE;
+			(void) zpool_reopen_one(zhp, &scrub_restart);
+		} else {
+			(void) strlcpy(fullpath, path, sizeof (fullpath));
+		}
+
+		if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
+			vdev_state_t newstate;
+
+			if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) {
+				error = zpool_vdev_online(zhp, fullpath, 0,
+				    &newstate);
+				zed_log_msg(LOG_INFO, "zfsdle_vdev_online: "
+				    "setting device '%s' to ONLINE state "
+				    "in pool '%s': %d", fullpath,
+				    zpool_get_name(zhp), error);
+			}
+		}
+		zpool_close(zhp);
+		return (1);
+	}
+	zpool_close(zhp);
+	return (0);
+}
+
+/*
+ * This function handles the ESC_DEV_DLE device change event. Use the
+ * provided vdev guid when looking up a disk or partition, when the guid
+ * is not present assume the entire disk is owned by ZFS and append the
+ * expected -part1 partition information then lookup by physical path.
+ */
+static int
+zfs_deliver_dle(nvlist_t *nvl)
+{
+	char *devname, name[MAXPATHLEN];
+	uint64_t guid;
+
+	if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &guid) == 0) {
+		/* bounded formatting; sprintf() here was unbounded */
+		(void) snprintf(name, sizeof (name), "%llu",
+		    (u_longlong_t)guid);
+	} else if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) == 0) {
+		strlcpy(name, devname, MAXPATHLEN);
+		zfs_append_partition(name, MAXPATHLEN);
+	} else {
+		/*
+		 * Neither identifier is present: bail out instead of
+		 * falling through and reading the uninitialized 'name'.
+		 */
+		zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath");
+		return (-1);
+	}
+
+	if (zpool_iter(g_zfshdl, zfsdle_vdev_online, name) != 1) {
+		zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not "
+		    "found", name);
+		return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * syseventd daemon module event handler
+ *
+ * Handles syseventd daemon zfs device related events:
+ *
+ * EC_DEV_ADD.ESC_DISK
+ * EC_DEV_STATUS.ESC_DEV_DLE
+ * EC_ZFS.ESC_ZFS_VDEV_CHECK
+ *
+ * Note: assumes only one thread active at a time (not thread safe)
+ */
+static int
+zfs_slm_deliver_event(const char *class, const char *subclass, nvlist_t *nvl)
+{
+	boolean_t is_lofi;
+
+	if (strcmp(class, EC_DEV_ADD) == 0) {
+		/*
+		 * We're mainly interested in disk additions, but we also
+		 * listen for new loop devices, to allow for simplified
+		 * testing.  Any other subclass is ignored.
+		 */
+		if (strcmp(subclass, ESC_DISK) == 0)
+			is_lofi = B_FALSE;
+		else if (strcmp(subclass, ESC_LOFI) == 0)
+			is_lofi = B_TRUE;
+		else
+			return (0);
+
+		return (zfs_deliver_add(nvl, is_lofi));
+	}
+
+	if (strcmp(class, EC_ZFS) == 0 &&
+	    strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) {
+		/*
+		 * This event signifies that a device failed to open
+		 * during pool load, but the 'autoreplace' property was
+		 * set, so we should pretend it's just been added.
+		 */
+		return (zfs_deliver_check(nvl));
+	}
+
+	if (strcmp(class, EC_DEV_STATUS) == 0 &&
+	    strcmp(subclass, ESC_DEV_DLE) == 0)
+		return (zfs_deliver_dle(nvl));
+
+	return (0);
+}
+
+/*ARGSUSED*/
+/*
+ * Startup thread: enumerate all pools once, collecting the unavailable
+ * ones onto g_pool_list, then mark enumeration complete so zfs_iter_pool()
+ * can start enabling datasets for pools that recover.
+ */
+static void *
+zfs_enum_pools(void *arg)
+{
+	(void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list);
+	/*
+	 * Linux - instead of using a thread pool, each list entry
+	 * will spawn a thread when an unavailable pool transitions
+	 * to available. zfs_slm_fini will wait for these threads.
+	 */
+	g_enumeration_done = B_TRUE;
+	return (NULL);
+}
+
+/*
+ * called from zed daemon at startup
+ *
+ * sent messages from zevents or udev monitor
+ *
+ * For now, each agent has its own libzfs instance
+ */
+int
+zfs_slm_init()
+{
+	if ((g_zfshdl = libzfs_init()) == NULL)
+		return (-1);
+
+	/*
+	 * collect a list of unavailable pools (asynchronously,
+	 * since this can take a while)
+	 */
+	list_create(&g_pool_list, sizeof (struct unavailpool),
+	    offsetof(struct unavailpool, uap_node));
+
+	if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) {
+		list_destroy(&g_pool_list);
+		libzfs_fini(g_zfshdl);
+		return (-1);
+	}
+
+	/* safe after thread start: zfs_enum_pools() touches g_pool_list only */
+	list_create(&g_device_list, sizeof (struct pendingdev),
+	    offsetof(struct pendingdev, pd_node));
+
+	return (0);
+}
+
+/*
+ * Shut down the module: join the enumeration thread, drain the dataset-
+ * enable thread pool, free any still-tracked pools and pending devices,
+ * and release the libzfs handle.
+ */
+void
+zfs_slm_fini()
+{
+	unavailpool_t *pool;
+	pendingdev_t *device;
+
+	/* wait for zfs_enum_pools thread to complete */
+	(void) pthread_join(g_zfs_tid, NULL);
+	/* destroy the thread pool */
+	if (g_tpool != NULL) {
+		tpool_wait(g_tpool);
+		tpool_destroy(g_tpool);
+	}
+
+	while ((pool = (list_head(&g_pool_list))) != NULL) {
+		list_remove(&g_pool_list, pool);
+		zpool_close(pool->uap_zhp);
+		free(pool);
+	}
+	list_destroy(&g_pool_list);
+
+	while ((device = (list_head(&g_device_list))) != NULL) {
+		list_remove(&g_device_list, device);
+		free(device);
+	}
+	list_destroy(&g_device_list);
+
+	libzfs_fini(g_zfshdl);
+}
+
+/*
+ * Public entry point called by the agent event loop: log the event and
+ * hand it to the internal dispatcher (return value intentionally ignored).
+ */
+void
+zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl)
+{
+	zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass);
+	(void) zfs_slm_deliver_event(class, subclass, nvl);
+}
diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
new file mode 100644
index 000000000000..9e95e20d5683
--- /dev/null
+++ b/cmd/zed/agents/zfs_retire.c
@@ -0,0 +1,557 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
+ */
+
+/*
+ * The ZFS retire agent is responsible for managing hot spares across all pools.
+ * When we see a device fault or a device removal, we try to open the associated
+ * pool and look for any hot spares. We iterate over any available hot spares
+ * and attempt a 'zpool replace' for each one.
+ *
+ * For vdevs diagnosed as faulty, the agent is also responsible for proactively
+ * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors).
+ */
+
+#include <sys/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/fs/zfs.h>
+#include <libzfs.h>
+#include <string.h>
+
+#include "zfs_agents.h"
+#include "fmd_api.h"
+
+
+/* Singly-linked record of a (pool, vdev) pair we already tried to repair. */
+typedef struct zfs_retire_repaired {
+	struct zfs_retire_repaired *zrr_next;	/* next entry, or NULL */
+	uint64_t zrr_pool;	/* pool GUID */
+	uint64_t zrr_vdev;	/* vdev GUID */
+} zfs_retire_repaired_t;
+
+/* Per-module state stored via fmd_hdl_setspecific(). */
+typedef struct zfs_retire_data {
+	libzfs_handle_t *zrd_hdl;	/* module-private libzfs handle */
+	zfs_retire_repaired_t *zrd_repaired;	/* repair-attempt list head */
+} zfs_retire_data_t;
+
+/*
+ * Free the entire list of repaired-vdev records hanging off zdp,
+ * leaving zrd_repaired NULL.
+ */
+static void
+zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp)
+{
+	zfs_retire_repaired_t *zrp;
+
+	while ((zrp = zdp->zrd_repaired) != NULL) {
+		zdp->zrd_repaired = zrp->zrr_next;
+		fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t));
+	}
+}
+
+/*
+ * Find a pool with a matching GUID.
+ */
+typedef struct find_cbdata {
+ uint64_t cb_guid;
+ zpool_handle_t *cb_zhp;
+ nvlist_t *cb_vdev;
+} find_cbdata_t;
+
+/*
+ * zpool_iter() callback: keep the handle (and stop iteration) when the
+ * pool GUID matches cb_guid; otherwise close the handle and continue.
+ * Ownership of the matched handle passes to the caller via cb_zhp.
+ */
+static int
+find_pool(zpool_handle_t *zhp, void *data)
+{
+	find_cbdata_t *cbp = data;
+
+	if (cbp->cb_guid ==
+	    zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) {
+		cbp->cb_zhp = zhp;
+		return (1);
+	}
+
+	zpool_close(zhp);
+	return (0);
+}
+
+/*
+ * Find a vdev within a tree with a matching GUID.
+ */
+/*
+ * Recursively search a vdev config tree for the nvlist whose
+ * ZPOOL_CONFIG_GUID equals search_guid.  The search covers regular
+ * children first, then l2cache devices, then spares.  Returns a pointer
+ * into the existing config (not a copy), or NULL if not found.
+ */
+static nvlist_t *
+find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid)
+{
+	uint64_t guid;
+	nvlist_t **child;
+	uint_t c, children;
+	nvlist_t *ret;
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
+	    guid == search_guid) {
+		fmd_hdl_debug(fmd_module_hdl("zfs-retire"),
+		    "matched vdev %llu", guid);
+		return (nv);
+	}
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0)
+		return (NULL);
+
+	for (c = 0; c < children; c++) {
+		if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL)
+			return (ret);
+	}
+
+	/* Not among the data vdevs; check the l2arc devices. */
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+	    &child, &children) != 0)
+		return (NULL);
+
+	for (c = 0; c < children; c++) {
+		if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL)
+			return (ret);
+	}
+
+	/* Finally, check the hot spares. */
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+	    &child, &children) != 0)
+		return (NULL);
+
+	for (c = 0; c < children; c++) {
+		if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL)
+			return (ret);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Given a (pool, vdev) GUID pair, find the matching pool and vdev.
+ */
+/*
+ * Resolve a (pool_guid, vdev_guid) pair to an open zpool handle and,
+ * when vdev_guid is nonzero, the matching vdev nvlist via *vdevp.
+ * Returns NULL on any lookup failure; on success the caller owns the
+ * returned handle and must zpool_close() it.  A vdev_guid of 0 skips
+ * the vdev lookup entirely (pool-level operations).
+ */
+static zpool_handle_t *
+find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid,
+    nvlist_t **vdevp)
+{
+	find_cbdata_t cb;
+	zpool_handle_t *zhp;
+	nvlist_t *config, *nvroot;
+
+	/*
+	 * Find the corresponding pool and make sure the vdev still exists.
+	 */
+	cb.cb_guid = pool_guid;
+	if (zpool_iter(zhdl, find_pool, &cb) != 1)
+		return (NULL);
+
+	zhp = cb.cb_zhp;
+	config = zpool_get_config(zhp, NULL);
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) != 0) {
+		zpool_close(zhp);
+		return (NULL);
+	}
+
+	if (vdev_guid != 0) {
+		if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid)) == NULL) {
+			zpool_close(zhp);
+			return (NULL);
+		}
+	}
+
+	return (zhp);
+}
+
+/*
+ * Given a vdev, attempt to replace it with every known spare until one
+ * succeeds or we run out of devices to try.
+ * Return whether we were successful or not in replacing the device.
+ */
+static boolean_t
+replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
+{
+	nvlist_t *config, *nvroot, *replacement;
+	nvlist_t **spares;
+	uint_t s, nspares;
+	char *dev_name;
+	zprop_source_t source;
+	int ashift;
+
+	config = zpool_get_config(zhp, NULL);
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) != 0)
+		return (B_FALSE);
+
+	/*
+	 * Find out if there are any hot spares available in the pool.
+	 */
+	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+	    &spares, &nspares) != 0)
+		return (B_FALSE);
+
+	/*
+	 * lookup "ashift" pool property, we may need it for the replacement
+	 */
+	ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &source);
+
+	/* Root nvlist wrapping the single spare child passed to attach. */
+	replacement = fmd_nvl_alloc(hdl, FMD_SLEEP);
+
+	(void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE,
+	    VDEV_TYPE_ROOT);
+
+	dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);
+
+	/*
+	 * Try to replace each spare, ending when we successfully
+	 * replace it.
+	 */
+	for (s = 0; s < nspares; s++) {
+		char *spare_name;
+
+		if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
+		    &spare_name) != 0)
+			continue;
+
+		/* if set, add the "ashift" pool property to the spare nvlist */
+		if (source != ZPROP_SRC_DEFAULT)
+			(void) nvlist_add_uint64(spares[s],
+			    ZPOOL_CONFIG_ASHIFT, ashift);
+
+		/* Re-point the replacement's child at this spare each pass. */
+		(void) nvlist_add_nvlist_array(replacement,
+		    ZPOOL_CONFIG_CHILDREN, &spares[s], 1);
+
+		fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'",
+		    dev_name, basename(spare_name));
+
+		if (zpool_vdev_attach(zhp, dev_name, spare_name,
+		    replacement, B_TRUE, B_FALSE) == 0) {
+			free(dev_name);
+			nvlist_free(replacement);
+			return (B_TRUE);
+		}
+	}
+
+	free(dev_name);
+	nvlist_free(replacement);
+
+	return (B_FALSE);
+}
+
+/*
+ * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and
+ * ASRU is now usable. ZFS has found the device to be present and
+ * functioning.
+ */
+/*ARGSUSED*/
+static void
+zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl)
+{
+	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
+	zfs_retire_repaired_t *zrp;
+	uint64_t pool_guid, vdev_guid;
+	/* Both GUIDs are required to identify the vdev; bail otherwise. */
+	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+	    &pool_guid) != 0 || nvlist_lookup_uint64(nvl,
+	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
+		return;
+
+	/*
+	 * Before checking the state of the ASRU, go through and see if we've
+	 * already made an attempt to repair this ASRU.  This list is cleared
+	 * whenever we receive any kind of list event, and is designed to
+	 * prevent us from generating a feedback loop when we attempt repairs
+	 * against a faulted pool.  The problem is that checking the unusable
+	 * state of the ASRU can involve opening the pool, which can post
+	 * statechange events but otherwise leave the pool in the faulted
+	 * state.  This list allows us to detect when a statechange event is
+	 * due to our own request.
+	 */
+	for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) {
+		if (zrp->zrr_pool == pool_guid &&
+		    zrp->zrr_vdev == vdev_guid)
+			return;
+	}
+
+	/* First attempt for this (pool, vdev): record it at the list head. */
+	zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP);
+	zrp->zrr_next = zdp->zrd_repaired;
+	zrp->zrr_pool = pool_guid;
+	zrp->zrr_vdev = vdev_guid;
+	zdp->zrd_repaired = zrp;
+
+	fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu",
+	    vdev_guid, pool_guid);
+}
+
+/*ARGSUSED*/
+static void
+zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
+    const char *class)
+{
+	uint64_t pool_guid, vdev_guid;
+	zpool_handle_t *zhp;
+	nvlist_t *resource, *fault;
+	nvlist_t **faults;
+	uint_t f, nfaults;
+	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
+	libzfs_handle_t *zhdl = zdp->zrd_hdl;
+	boolean_t fault_device, degrade_device;
+	boolean_t is_repair;
+	char *scheme;
+	nvlist_t *vdev = NULL;
+	char *uuid;
+	int repair_done = 0;
+	boolean_t retire;
+	boolean_t is_disk;
+	vdev_aux_t aux;
+	uint64_t state = 0;
+
+	fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class);
+
+	/* Best-effort: state stays 0 if the payload lacks a vdev state. */
+	nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state);
+
+	/*
+	 * If this is a resource notifying us of device removal then simply
+	 * check for an available spare and continue unless the device is a
+	 * l2arc vdev, in which case we just offline it.
+	 */
+	if (strcmp(class, "resource.fs.zfs.removed") == 0 ||
+	    (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
+	    state == VDEV_STATE_REMOVED)) {
+		char *devtype;
+		char *devname;
+
+		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+		    &pool_guid) != 0 ||
+		    nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+		    &vdev_guid) != 0)
+			return;
+
+		if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
+		    &vdev)) == NULL)
+			return;
+
+		devname = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);
+
+		/* Can't replace l2arc with a spare: offline the device */
+		if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+		    &devtype) == 0 && strcmp(devtype, VDEV_TYPE_L2CACHE) == 0) {
+			fmd_hdl_debug(hdl, "zpool_vdev_offline '%s'", devname);
+			zpool_vdev_offline(zhp, devname, B_TRUE);
+		} else if (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
+		    replace_with_spare(hdl, zhp, vdev) == B_FALSE) {
+			/* Could not handle with spare */
+			fmd_hdl_debug(hdl, "no spare for '%s'", devname);
+		}
+
+		free(devname);
+		zpool_close(zhp);
+		return;
+	}
+
+	if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
+		return;
+
+	/*
+	 * Note: on zfsonlinux statechange events are more than just
+	 * healthy ones so we need to confirm the actual state value.
+	 */
+	if (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
+	    state == VDEV_STATE_HEALTHY) {
+		zfs_vdev_repair(hdl, nvl);
+		return;
+	}
+	if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
+		zfs_vdev_repair(hdl, nvl);
+		return;
+	}
+
+	/* Any list event invalidates the repaired-vdev feedback-loop guard. */
+	zfs_retire_clear_data(hdl, zdp);
+
+	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
+		is_repair = B_TRUE;
+	else
+		is_repair = B_FALSE;
+
+	/*
+	 * We subscribe to zfs faults as well as all repair events.
+	 */
+	if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
+	    &faults, &nfaults) != 0)
+		return;
+
+	for (f = 0; f < nfaults; f++) {
+		fault = faults[f];
+
+		fault_device = B_FALSE;
+		degrade_device = B_FALSE;
+		is_disk = B_FALSE;
+
+		/* Honor a suspect explicitly marked as not-to-be-retired. */
+		if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE,
+		    &retire) == 0 && retire == 0)
+			continue;
+
+		/*
+		 * While we subscribe to fault.fs.zfs.*, we only take action
+		 * for faults targeting a specific vdev (open failure or SERD
+		 * failure).  We also subscribe to fault.io.* events, so that
+		 * faulty disks will be faulted in the ZFS configuration.
+		 */
+		if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) {
+			fault_device = B_TRUE;
+		} else if (fmd_nvl_class_match(hdl, fault,
+		    "fault.fs.zfs.vdev.checksum")) {
+			degrade_device = B_TRUE;
+		} else if (fmd_nvl_class_match(hdl, fault,
+		    "fault.fs.zfs.device")) {
+			fault_device = B_FALSE;
+		} else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) {
+			is_disk = B_TRUE;
+			fault_device = B_TRUE;
+		} else {
+			continue;
+		}
+
+		if (is_disk) {
+			/* Disk faults carry no zfs-scheme resource; skip. */
+			continue;
+		} else {
+			/*
+			 * This is a ZFS fault.  Lookup the resource, and
+			 * attempt to find the matching vdev.
+			 */
+			if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE,
+			    &resource) != 0 ||
+			    nvlist_lookup_string(resource, FM_FMRI_SCHEME,
+			    &scheme) != 0)
+				continue;
+
+			if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0)
+				continue;
+
+			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL,
+			    &pool_guid) != 0)
+				continue;
+
+			/*
+			 * A repair event without a vdev targets the whole
+			 * pool (vdev_guid == 0); anything else needs a vdev.
+			 */
+			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV,
+			    &vdev_guid) != 0) {
+				if (is_repair)
+					vdev_guid = 0;
+				else
+					continue;
+			}
+
+			if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
+			    &vdev)) == NULL)
+				continue;
+
+			aux = VDEV_AUX_ERR_EXCEEDED;
+		}
+
+		if (vdev_guid == 0) {
+			/*
+			 * For pool-level repair events, clear the entire pool.
+			 */
+			fmd_hdl_debug(hdl, "zpool_clear of pool '%s'",
+			    zpool_get_name(zhp));
+			(void) zpool_clear(zhp, NULL, NULL);
+			zpool_close(zhp);
+			continue;
+		}
+
+		/*
+		 * If this is a repair event, then mark the vdev as repaired and
+		 * continue.
+		 */
+		if (is_repair) {
+			repair_done = 1;
+			fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu",
+			    zpool_get_name(zhp), vdev_guid);
+			(void) zpool_vdev_clear(zhp, vdev_guid);
+			zpool_close(zhp);
+			continue;
+		}
+
+		/*
+		 * Actively fault the device if needed.
+		 */
+		if (fault_device)
+			(void) zpool_vdev_fault(zhp, vdev_guid, aux);
+		if (degrade_device)
+			(void) zpool_vdev_degrade(zhp, vdev_guid, aux);
+
+		if (fault_device || degrade_device)
+			fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'",
+			    fault_device ? "fault" : "degrade", vdev_guid,
+			    zpool_get_name(zhp));
+
+		/*
+		 * Attempt to substitute a hot spare.
+		 */
+		(void) replace_with_spare(hdl, zhp, vdev);
+		zpool_close(zhp);
+	}
+
+	/* Resolve the case only after at least one vdev repair succeeded. */
+	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done &&
+	    nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0)
+		fmd_case_uuresolved(hdl, uuid);
+}
+
+static const fmd_hdl_ops_t fmd_ops = {
+ zfs_retire_recv, /* fmdo_recv */
+ NULL, /* fmdo_timeout */
+ NULL, /* fmdo_close */
+ NULL, /* fmdo_stats */
+ NULL, /* fmdo_gc */
+};
+
+static const fmd_prop_t fmd_props[] = {
+ { "spare_on_remove", FMD_TYPE_BOOL, "true" },
+ { NULL, 0, NULL }
+};
+
+static const fmd_hdl_info_t fmd_info = {
+ "ZFS Retire Agent", "1.0", &fmd_ops, fmd_props
+};
+
+/*
+ * Module entry point: open a private libzfs handle, register with the
+ * FMD framework, and stash per-module state on the handle.  Silently
+ * does nothing if libzfs or registration fails.
+ */
+void
+_zfs_retire_init(fmd_hdl_t *hdl)
+{
+	zfs_retire_data_t *zdp;
+	libzfs_handle_t *zhdl;
+
+	if ((zhdl = libzfs_init()) == NULL)
+		return;
+
+	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
+		libzfs_fini(zhdl);
+		return;
+	}
+
+	zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP);
+	zdp->zrd_hdl = zhdl;
+
+	fmd_hdl_setspecific(hdl, zdp);
+}
+
+/*
+ * Module exit point: release the repaired-vdev list, the libzfs handle,
+ * and the per-module state.  zdp is NULL if _zfs_retire_init() failed.
+ */
+void
+_zfs_retire_fini(fmd_hdl_t *hdl)
+{
+	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
+
+	if (zdp != NULL) {
+		zfs_retire_clear_data(hdl, zdp);
+		libzfs_fini(zdp->zrd_hdl);
+		fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t));
+	}
+}
diff --git a/cmd/zed/zed.c b/cmd/zed/zed.c
new file mode 100644
index 000000000000..0784e3834733
--- /dev/null
+++ b/cmd/zed/zed.c
@@ -0,0 +1,306 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "zed.h"
+#include "zed_conf.h"
+#include "zed_event.h"
+#include "zed_file.h"
+#include "zed_log.h"
+
+/* Async-signal-safe flags set by the handlers below; polled in main(). */
+static volatile sig_atomic_t _got_exit = 0;
+static volatile sig_atomic_t _got_hup = 0;
+
+/*
+ * Signal handler for SIGINT & SIGTERM.
+ */
+static void
+_exit_handler(int signum)
+{
+	/* Only set a flag here: anything more is not async-signal-safe. */
+	_got_exit = 1;
+}
+
+/*
+ * Signal handler for SIGHUP.
+ */
+static void
+_hup_handler(int signum)
+{
+	/* Flag a config rescan; serviced in the main event loop. */
+	_got_hup = 1;
+}
+
+/*
+ * Register signal handlers.
+ */
+static void
+_setup_sig_handlers(void)
+{
+	struct sigaction sa;
+
+	if (sigemptyset(&sa.sa_mask) < 0)
+		zed_log_die("Failed to initialize sigset");
+
+	/* SA_RESTART: don't let handled signals abort blocking syscalls. */
+	sa.sa_flags = SA_RESTART;
+	sa.sa_handler = SIG_IGN;
+
+	if (sigaction(SIGPIPE, &sa, NULL) < 0)
+		zed_log_die("Failed to ignore SIGPIPE");
+
+	sa.sa_handler = _exit_handler;
+	if (sigaction(SIGINT, &sa, NULL) < 0)
+		zed_log_die("Failed to register SIGINT handler");
+
+	if (sigaction(SIGTERM, &sa, NULL) < 0)
+		zed_log_die("Failed to register SIGTERM handler");
+
+	sa.sa_handler = _hup_handler;
+	if (sigaction(SIGHUP, &sa, NULL) < 0)
+		zed_log_die("Failed to register SIGHUP handler");
+}
+
+/*
+ * Lock all current and future pages in the virtual memory address space.
+ * Access to locked pages will never be delayed by a page fault.
+ *
+ * EAGAIN is tested up to max_tries in case this is a transient error.
+ *
+ * Note that memory locks are not inherited by a child created via fork()
+ * and are automatically removed during an execve(). As such, this must
+ * be called after the daemon fork()s (when running in the background).
+ */
+static void
+_lock_memory(void)
+{
+#if HAVE_MLOCKALL
+	int i = 0;
+	const int max_tries = 10;
+
+	/* Retry only on EAGAIN; any other errno is treated as fatal. */
+	for (i = 0; i < max_tries; i++) {
+		if (mlockall(MCL_CURRENT | MCL_FUTURE) == 0) {
+			zed_log_msg(LOG_INFO, "Locked all pages in memory");
+			return;
+		}
+		if (errno != EAGAIN)
+			break;
+	}
+	zed_log_die("Failed to lock memory pages: %s", strerror(errno));
+
+#else /* HAVE_MLOCKALL */
+	zed_log_die("Failed to lock memory pages: mlockall() not supported");
+#endif /* HAVE_MLOCKALL */
+}
+
+/*
+ * Start daemonization of the process including the double fork().
+ *
+ * The parent process will block here until _finish_daemonize() is called
+ * (in the grandchild process), at which point the parent process will exit.
+ * This prevents the parent process from exiting until initialization is
+ * complete.
+ */
+static void
+_start_daemonize(void)
+{
+	pid_t pid;
+	struct sigaction sa;
+
+	/* Create pipe for communicating with child during daemonization. */
+	zed_log_pipe_open();
+
+	/* Background process and ensure child is not process group leader. */
+	pid = fork();
+	if (pid < 0) {
+		zed_log_die("Failed to create child process: %s",
+		    strerror(errno));
+	} else if (pid > 0) {
+		/* Parent: block until the grandchild signals readiness. */
+
+		/* Close writes since parent will only read from pipe. */
+		zed_log_pipe_close_writes();
+
+		/* Wait for notification that daemonization is complete. */
+		zed_log_pipe_wait();
+
+		zed_log_pipe_close_reads();
+		_exit(EXIT_SUCCESS);
+	}
+
+	/* Close reads since child will only write to pipe. */
+	zed_log_pipe_close_reads();
+
+	/* Create independent session and detach from terminal. */
+	if (setsid() < 0)
+		zed_log_die("Failed to create new session: %s",
+		    strerror(errno));
+
+	/* Prevent child from terminating on HUP when session leader exits. */
+	if (sigemptyset(&sa.sa_mask) < 0)
+		zed_log_die("Failed to initialize sigset");
+
+	sa.sa_flags = 0;
+	sa.sa_handler = SIG_IGN;
+
+	if (sigaction(SIGHUP, &sa, NULL) < 0)
+		zed_log_die("Failed to ignore SIGHUP");
+
+	/* Ensure process cannot re-acquire terminal. */
+	pid = fork();
+	if (pid < 0) {
+		zed_log_die("Failed to create grandchild process: %s",
+		    strerror(errno));
+	} else if (pid > 0) {
+		/* First child exits; grandchild continues as the daemon. */
+		_exit(EXIT_SUCCESS);
+	}
+}
+
+/*
+ * Finish daemonization of the process by closing stdin/stdout/stderr.
+ *
+ * This must be called at the end of initialization after all external
+ * communication channels are established and accessible.
+ */
+static void
+_finish_daemonize(void)
+{
+	int devnull;
+
+	/* Preserve fd 0/1/2, but discard data to/from stdin/stdout/stderr. */
+	devnull = open("/dev/null", O_RDWR);
+	if (devnull < 0)
+		zed_log_die("Failed to open /dev/null: %s", strerror(errno));
+
+	if (dup2(devnull, STDIN_FILENO) < 0)
+		zed_log_die("Failed to dup /dev/null onto stdin: %s",
+		    strerror(errno));
+
+	if (dup2(devnull, STDOUT_FILENO) < 0)
+		zed_log_die("Failed to dup /dev/null onto stdout: %s",
+		    strerror(errno));
+
+	if (dup2(devnull, STDERR_FILENO) < 0)
+		zed_log_die("Failed to dup /dev/null onto stderr: %s",
+		    strerror(errno));
+
+	/* Close the original fd unless it landed on 0/1/2 already. */
+	if ((devnull > STDERR_FILENO) && (close(devnull) < 0))
+		zed_log_die("Failed to close /dev/null: %s", strerror(errno));
+
+	/* Notify parent that daemonization is complete. */
+	zed_log_pipe_close_writes();
+}
+
+/*
+ * ZFS Event Daemon (ZED).
+ */
+int
+main(int argc, char *argv[])
+{
+	struct zed_conf *zcp;
+	uint64_t saved_eid;
+	int64_t saved_etime[2];
+
+	zed_log_init(argv[0]);
+	zed_log_stderr_open(LOG_NOTICE);
+	zcp = zed_conf_create();
+	zed_conf_parse_opts(zcp, argc, argv);
+	if (zcp->do_verbose)
+		zed_log_stderr_open(LOG_INFO);
+
+	if (geteuid() != 0)
+		zed_log_die("Must be run as root");
+
+	zed_conf_parse_file(zcp);
+
+	/* Drop inherited descriptors above stderr before daemonizing. */
+	zed_file_close_from(STDERR_FILENO + 1);
+
+	(void) umask(0);
+
+	if (chdir("/") < 0)
+		zed_log_die("Failed to change to root directory");
+
+	if (zed_conf_scan_dir(zcp) < 0)
+		exit(EXIT_FAILURE);
+
+	if (!zcp->do_foreground) {
+		_start_daemonize();
+		zed_log_syslog_open(LOG_DAEMON);
+	}
+	_setup_sig_handlers();
+
+	if (zcp->do_memlock)
+		_lock_memory();
+
+	if ((zed_conf_write_pid(zcp) < 0) && (!zcp->do_force))
+		exit(EXIT_FAILURE);
+
+	if (!zcp->do_foreground)
+		_finish_daemonize();
+
+	zed_log_msg(LOG_NOTICE,
+	    "ZFS Event Daemon %s-%s (PID %d)",
+	    ZFS_META_VERSION, ZFS_META_RELEASE, (int)getpid());
+
+	if (zed_conf_open_state(zcp) < 0)
+		exit(EXIT_FAILURE);
+
+	/* Resume from the last event processed before a prior shutdown. */
+	if (zed_conf_read_state(zcp, &saved_eid, saved_etime) < 0)
+		exit(EXIT_FAILURE);
+
+idle:
+	/*
+	 * If -I is specified, attempt to open /dev/zfs repeatedly until
+	 * successful.
+	 */
+	do {
+		if (!zed_event_init(zcp))
+			break;
+		/* Wait for some time and try again. tunable? */
+		sleep(30);
+	} while (!_got_exit && zcp->do_idle);
+
+	if (_got_exit)
+		goto out;
+
+	zed_event_seek(zcp, saved_eid, saved_etime);
+
+	/* Main event loop: service zevents until told to exit. */
+	while (!_got_exit) {
+		int rv;
+		if (_got_hup) {
+			/* SIGHUP: re-scan the zedlet directory. */
+			_got_hup = 0;
+			(void) zed_conf_scan_dir(zcp);
+		}
+		rv = zed_event_service(zcp);
+
+		/* ENODEV: When kernel module is unloaded (osx) */
+		if (rv == ENODEV)
+			break;
+	}
+
+	zed_log_msg(LOG_NOTICE, "Exiting");
+	zed_event_fini(zcp);
+
+	/* In idle mode, loop back and wait for the module to return. */
+	if (zcp->do_idle && !_got_exit)
+		goto idle;
+
+out:
+	zed_conf_destroy(zcp);
+	zed_log_fini();
+	exit(EXIT_SUCCESS);
+}
diff --git a/cmd/zed/zed.d/.gitignore b/cmd/zed/zed.d/.gitignore
new file mode 100644
index 000000000000..46a00945aa7c
--- /dev/null
+++ b/cmd/zed/zed.d/.gitignore
@@ -0,0 +1 @@
+history_event-zfs-list-cacher.sh
diff --git a/cmd/zed/zed.d/Makefile.am b/cmd/zed/zed.d/Makefile.am
new file mode 100644
index 000000000000..8b2d0c200286
--- /dev/null
+++ b/cmd/zed/zed.d/Makefile.am
@@ -0,0 +1,53 @@
+include $(top_srcdir)/config/Rules.am
+include $(top_srcdir)/config/Substfiles.am
+
+EXTRA_DIST += README
+
+# Configuration (zed.rc, shared functions) lives under sysconfdir.
+zedconfdir = $(sysconfdir)/zfs/zed.d
+
+dist_zedconf_DATA = \
+	zed-functions.sh \
+	zed.rc
+
+# All shipped ZEDLET scripts are installed under zfsexecdir.
+zedexecdir = $(zfsexecdir)/zed.d
+
+dist_zedexec_SCRIPTS = \
+	all-debug.sh \
+	all-syslog.sh \
+	data-notify.sh \
+	generic-notify.sh \
+	resilver_finish-notify.sh \
+	scrub_finish-notify.sh \
+	statechange-led.sh \
+	statechange-notify.sh \
+	vdev_clear-led.sh \
+	vdev_attach-led.sh \
+	pool_import-led.sh \
+	resilver_finish-start-scrub.sh \
+	trim_finish-notify.sh
+
+# Generated from the .sh.in template by Substfiles.am substitution.
+nodist_zedexec_SCRIPTS = history_event-zfs-list-cacher.sh
+
+SUBSTFILES += $(nodist_zedexec_SCRIPTS)
+
+# ZEDLETs enabled by default via symlinks created in install-data-hook.
+zedconfdefaults = \
+	all-syslog.sh \
+	data-notify.sh \
+	history_event-zfs-list-cacher.sh \
+	resilver_finish-notify.sh \
+	scrub_finish-notify.sh \
+	statechange-led.sh \
+	statechange-notify.sh \
+	vdev_clear-led.sh \
+	vdev_attach-led.sh \
+	pool_import-led.sh \
+	resilver_finish-start-scrub.sh
+
+# Symlink default ZEDLETs into zedconfdir unless already present;
+# zed.rc is restricted to 0600 (it may hold notification credentials).
+install-data-hook:
+	$(MKDIR_P) "$(DESTDIR)$(zedconfdir)"
+	for f in $(zedconfdefaults); do \
+	  test -f "$(DESTDIR)$(zedconfdir)/$${f}" -o \
+	       -L "$(DESTDIR)$(zedconfdir)/$${f}" || \
+	    ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \
+	done
+	chmod 0600 "$(DESTDIR)$(zedconfdir)/zed.rc"
diff --git a/cmd/zed/zed.d/README b/cmd/zed/zed.d/README
new file mode 100644
index 000000000000..7279b93704e2
--- /dev/null
+++ b/cmd/zed/zed.d/README
@@ -0,0 +1,30 @@
+Shell scripts are the recommended choice for ZEDLETs that mostly call
+other utilities and do relatively little data manipulation.
+
+Shell scripts MUST work on both bash and dash.
+
+Shell scripts MUST run cleanly through ShellCheck:
+ http://www.shellcheck.net/
+
+General functions reside in "zed-functions.sh". Use them where applicable.
+
+Additional references that may be of use:
+
+ Google Shell Style Guide
+ https://github.com/google/styleguide/blob/gh-pages/shell.xml
+
+ Dash as /bin/sh
+ https://wiki.ubuntu.com/DashAsBinSh
+
+ Common shell script mistakes
+ http://www.pixelbeat.org/programming/shell_script_mistakes.html
+
+ Filenames and Pathnames in Shell: How to do it Correctly
+ http://www.dwheeler.com/essays/filenames-in-shell.html
+
+ Autoconf: Portable Shell Programming
+ https://www.gnu.org/software/autoconf/manual/autoconf.html#Portable-Shell
+
+Please BE CONSISTENT with the existing style, check for errors,
+minimize dependencies where possible, try to be portable,
+and comment anything non-obvious. Festina lente.
diff --git a/cmd/zed/zed.d/all-debug.sh b/cmd/zed/zed.d/all-debug.sh
new file mode 100755
index 000000000000..14b39caacd9d
--- /dev/null
+++ b/cmd/zed/zed.d/all-debug.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+#
+# Log all environment variables to ZED_DEBUG_LOG.
+#
+# This can be a useful aid when developing/debugging ZEDLETs since it shows the
+# environment variables defined for each zevent.
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+: "${ZED_DEBUG_LOG:="${TMPDIR:="/tmp"}/zed.debug.log"}"
+
+zed_exit_if_ignoring_this_event
+
+lockfile="$(basename -- "${ZED_DEBUG_LOG}").lock"
+
+umask 077
+zed_lock "${lockfile}"
+exec >> "${ZED_DEBUG_LOG}"
+
+printenv | sort
+echo
+
+exec >&-
+zed_unlock "${lockfile}"
+exit 0
diff --git a/cmd/zed/zed.d/all-syslog.sh b/cmd/zed/zed.d/all-syslog.sh
new file mode 100755
index 000000000000..cb9286500136
--- /dev/null
+++ b/cmd/zed/zed.d/all-syslog.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+#
+# Log the zevent via syslog.
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+zed_exit_if_ignoring_this_event
+
+zed_log_msg "eid=${ZEVENT_EID}" "class=${ZEVENT_SUBCLASS}" \
+ "${ZEVENT_POOL_GUID:+"pool_guid=${ZEVENT_POOL_GUID}"}" \
+ "${ZEVENT_VDEV_PATH:+"vdev_path=${ZEVENT_VDEV_PATH}"}" \
+ "${ZEVENT_VDEV_STATE_STR:+"vdev_state=${ZEVENT_VDEV_STATE_STR}"}"
+exit 0
diff --git a/cmd/zed/zed.d/data-notify.sh b/cmd/zed/zed.d/data-notify.sh
new file mode 100755
index 000000000000..639b459bdd3b
--- /dev/null
+++ b/cmd/zed/zed.d/data-notify.sh
@@ -0,0 +1,43 @@
+#!/bin/sh
+#
+# Send notification in response to a DATA error.
+#
+# Only one notification per ZED_NOTIFY_INTERVAL_SECS will be sent for a given
+# class/pool/[vdev] combination. This protects against spamming the recipient
+# should multiple events occur together in time for the same pool/[vdev].
+#
+# Exit codes:
+# 0: notification sent
+# 1: notification failed
+# 2: notification not configured
+# 3: notification suppressed
+# 9: internal error
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+[ -n "${ZEVENT_POOL}" ] || exit 9
+[ -n "${ZEVENT_SUBCLASS}" ] || exit 9
+[ -n "${ZED_NOTIFY_DATA}" ] || exit 3
+
+rate_limit_tag="${ZEVENT_POOL};${ZEVENT_VDEV_GUID:-0};${ZEVENT_SUBCLASS};notify"
+zed_rate_limit "${rate_limit_tag}" || exit 3
+
+umask 077
+note_subject="ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on $(hostname)"
+note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+{
+ echo "ZFS has detected a data error:"
+ echo
+ echo " eid: ${ZEVENT_EID}"
+ echo " class: ${ZEVENT_SUBCLASS}"
+ echo " host: $(hostname)"
+ echo " time: ${ZEVENT_TIME_STRING}"
+ echo " error: ${ZEVENT_ZIO_ERR}"
+ echo " objid: ${ZEVENT_ZIO_OBJSET}:${ZEVENT_ZIO_OBJECT}"
+ echo " pool: ${ZEVENT_POOL}"
+} > "${note_pathname}"
+
+zed_notify "${note_subject}" "${note_pathname}"; rv=$?
+rm -f "${note_pathname}"
+exit "${rv}"
diff --git a/cmd/zed/zed.d/generic-notify.sh b/cmd/zed/zed.d/generic-notify.sh
new file mode 100755
index 000000000000..e438031a088a
--- /dev/null
+++ b/cmd/zed/zed.d/generic-notify.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+#
+# Send notification in response to a given zevent.
+#
+# This is a generic script than can be symlinked to a file in the
+# enabled-zedlets directory to have a notification sent when a particular
+# class of zevents occurs.  The symlink filename must begin with the zevent
+# (sub)class string (e.g., "probe_failure-notify.sh" for the "probe_failure"
+# subclass).  Refer to the zed(8) manpage for details.
+#
+# Only one notification per ZED_NOTIFY_INTERVAL_SECS will be sent for a given
+# class/pool combination.  This protects against spamming the recipient
+# should multiple events occur together in time for the same pool.
+#
+# Exit codes:
+#   0: notification sent
+#   1: notification failed
+#   2: notification not configured
+#   3: notification suppressed
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+# Rate-limit the notification based in part on the filename.
+# (Each symlinked name gets its own rate-limit bucket.)
+#
+rate_limit_tag="${ZEVENT_POOL};${ZEVENT_SUBCLASS};$(basename -- "$0")"
+rate_limit_interval="${ZED_NOTIFY_INTERVAL_SECS}"
+zed_rate_limit "${rate_limit_tag}" "${rate_limit_interval}" || exit 3
+
+umask 077
+pool_str="${ZEVENT_POOL:+" for ${ZEVENT_POOL}"}"
+host_str=" on $(hostname)"
+note_subject="ZFS ${ZEVENT_SUBCLASS} event${pool_str}${host_str}"
+note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+{
+    echo "ZFS has posted the following event:"
+    echo
+    echo "   eid: ${ZEVENT_EID}"
+    echo " class: ${ZEVENT_SUBCLASS}"
+    echo "  host: $(hostname)"
+    echo "  time: ${ZEVENT_TIME_STRING}"
+
+    # Vdev details are present only for vdev-related zevents.
+    [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}"
+    [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}"
+    [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}"
+
+    # Append full pool status when both the pool name and zpool are known.
+    [ -n "${ZEVENT_POOL}" ] && [ -x "${ZPOOL}" ] \
+        && "${ZPOOL}" status "${ZEVENT_POOL}"
+
+} > "${note_pathname}"
+
+zed_notify "${note_subject}" "${note_pathname}"; rv=$?
+rm -f "${note_pathname}"
+exit "${rv}"
diff --git a/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in b/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in
new file mode 100755
index 000000000000..053b4414a768
--- /dev/null
+++ b/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in
@@ -0,0 +1,85 @@
+#!/bin/sh
+#
+# Track changes to enumerated pools for use in early-boot
+# Exit on error (-e) and disable globbing (-f).
+set -ef
+
+FSLIST_DIR="@sysconfdir@/zfs/zfs-list.cache"
+FSLIST_TMP="@runstatedir@/zfs-list.cache.new"
+FSLIST="${FSLIST_DIR}/${ZEVENT_POOL}"
+
+# If the pool specific cache file is not writeable, abort
+[ -w "${FSLIST}" ] || exit 0
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+zed_exit_if_ignoring_this_event
+zed_check_cmd "${ZFS}" sort diff grep
+
+# If we are acting on a snapshot, we have nothing to do
+printf '%s' "${ZEVENT_HISTORY_DSNAME}" | grep '@' && exit 0
+
+# We obtain a lock on zfs-list to avoid any simultaneous writes.
+# If we run into trouble, log and drop the lock
+abort_alter() {
+    zed_log_msg "Error updating zfs-list.cache!"
+    zed_unlock zfs-list
+}
+
+# Normal-exit path: release the lock and clear the EXIT trap.
+finished() {
+    zed_unlock zfs-list
+    trap - EXIT
+    exit 0
+}
+
+# Decide whether this history event warrants rebuilding the cache.
+case "${ZEVENT_HISTORY_INTERNAL_NAME}" in
+    create|"finish receiving"|import|destroy|rename)
+        ;;
+
+    export)
+        # Exported pools get an emptied (not removed) cache file.
+        zed_lock zfs-list
+        trap abort_alter EXIT
+        echo > "${FSLIST}"
+        finished
+        ;;
+
+    set|inherit)
+        # Only act if one of the tracked properties is altered.
+        case "${ZEVENT_HISTORY_INTERNAL_STR%%=*}" in
+            canmount|mountpoint|atime|relatime|devices|exec|readonly| \
+            setuid|nbmand|encroot|keylocation|org.openzfs.systemd:requires| \
+            org.openzfs.systemd:requires-mounts-for| \
+            org.openzfs.systemd:before|org.openzfs.systemd:after| \
+            org.openzfs.systemd:wanted-by|org.openzfs.systemd:required-by| \
+            org.openzfs.systemd:nofail|org.openzfs.systemd:ignore \
+            ) ;;
+            *) exit 0 ;;
+        esac
+        ;;
+
+    *)
+        # Ignore all other events.
+        exit 0
+        ;;
+esac
+
+zed_lock zfs-list
+trap abort_alter EXIT
+
+# Property columns consumed by the zfs-mount-generator at boot.
+PROPS="name,mountpoint,canmount,atime,relatime,devices,exec\
+,readonly,setuid,nbmand,encroot,keylocation\
+,org.openzfs.systemd:requires,org.openzfs.systemd:requires-mounts-for\
+,org.openzfs.systemd:before,org.openzfs.systemd:after\
+,org.openzfs.systemd:wanted-by,org.openzfs.systemd:required-by\
+,org.openzfs.systemd:nofail,org.openzfs.systemd:ignore"
+
+"${ZFS}" list -H -t filesystem -o $PROPS -r "${ZEVENT_POOL}" > "${FSLIST_TMP}"
+
+# Sort the output so that it is stable
+sort "${FSLIST_TMP}" -o "${FSLIST_TMP}"
+
+# Don't modify the file if it hasn't changed
+diff -q "${FSLIST_TMP}" "${FSLIST}" || mv "${FSLIST_TMP}" "${FSLIST}"
+rm -f "${FSLIST_TMP}"
+
+finished
diff --git a/cmd/zed/zed.d/pool_import-led.sh b/cmd/zed/zed.d/pool_import-led.sh
new file mode 120000
index 000000000000..7d7404398a4a
--- /dev/null
+++ b/cmd/zed/zed.d/pool_import-led.sh
@@ -0,0 +1 @@
+statechange-led.sh \ No newline at end of file
diff --git a/cmd/zed/zed.d/resilver_finish-notify.sh b/cmd/zed/zed.d/resilver_finish-notify.sh
new file mode 120000
index 000000000000..e4c56bc5f816
--- /dev/null
+++ b/cmd/zed/zed.d/resilver_finish-notify.sh
@@ -0,0 +1 @@
+scrub_finish-notify.sh \ No newline at end of file
diff --git a/cmd/zed/zed.d/resilver_finish-start-scrub.sh b/cmd/zed/zed.d/resilver_finish-start-scrub.sh
new file mode 100755
index 000000000000..c7cfd1ddba80
--- /dev/null
+++ b/cmd/zed/zed.d/resilver_finish-start-scrub.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+# resilver_finish-start-scrub.sh
+# Run a scrub after a resilver
+#
+# Exit codes:
+# 1: Internal error
+# 2: Script wasn't enabled in zed.rc
+# 3: Scrubs are automatically started for sequential resilvers
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+# Opt-in only: requires ZED_SCRUB_AFTER_RESILVER=1 in zed.rc.
+[ "${ZED_SCRUB_AFTER_RESILVER}" = "1" ] || exit 2
+# Sequential resilvers start their own scrub automatically (see above).
+[ "${ZEVENT_RESILVER_TYPE}" != "sequential" ] || exit 3
+[ -n "${ZEVENT_POOL}" ] || exit 1
+[ -n "${ZEVENT_SUBCLASS}" ] || exit 1
+zed_check_cmd "${ZPOOL}" || exit 1
+
+zed_log_msg "Starting scrub after resilver on ${ZEVENT_POOL}"
+"${ZPOOL}" scrub "${ZEVENT_POOL}"
diff --git a/cmd/zed/zed.d/scrub_finish-notify.sh b/cmd/zed/zed.d/scrub_finish-notify.sh
new file mode 100755
index 000000000000..2145a100a3fa
--- /dev/null
+++ b/cmd/zed/zed.d/scrub_finish-notify.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+#
+# Send notification in response to a RESILVER_FINISH or SCRUB_FINISH.
+#
+# By default, "zpool status" output will only be included for a scrub_finish
+# zevent if the pool is not healthy; to always include its output, set
+# ZED_NOTIFY_VERBOSE=1.
+#
+# Exit codes:
+# 0: notification sent
+# 1: notification failed
+# 2: notification not configured
+# 3: notification suppressed
+# 9: internal error
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+[ -n "${ZEVENT_POOL}" ] || exit 9
+[ -n "${ZEVENT_SUBCLASS}" ] || exit 9
+
+# Map the event subclass to a human-readable action for the message body.
+if [ "${ZEVENT_SUBCLASS}" = "resilver_finish" ]; then
+ action="resilver"
+elif [ "${ZEVENT_SUBCLASS}" = "scrub_finish" ]; then
+ action="scrub"
+else
+ zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\""
+ exit 9
+fi
+
+zed_check_cmd "${ZPOOL}" || exit 9
+
+# For scrub, suppress notification if the pool is healthy
+# and verbosity is not enabled.
+#
+# (ZED_NOTIFY_VERBOSE defaults to 0 in zed-functions.sh, so -eq is safe.)
+if [ "${ZEVENT_SUBCLASS}" = "scrub_finish" ]; then
+ healthy="$("${ZPOOL}" status -x "${ZEVENT_POOL}" \
+ | grep "'${ZEVENT_POOL}' is healthy")"
+ [ -n "${healthy}" ] && [ "${ZED_NOTIFY_VERBOSE}" -eq 0 ] && exit 3
+fi
+
+# umask 077 keeps the temporary message file private (0600).
+umask 077
+note_subject="ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on $(hostname)"
+note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+{
+ echo "ZFS has finished a ${action}:"
+ echo
+ echo " eid: ${ZEVENT_EID}"
+ echo " class: ${ZEVENT_SUBCLASS}"
+ echo " host: $(hostname)"
+ echo " time: ${ZEVENT_TIME_STRING}"
+
+ "${ZPOOL}" status "${ZEVENT_POOL}"
+
+} > "${note_pathname}"
+
+zed_notify "${note_subject}" "${note_pathname}"; rv=$?
+rm -f "${note_pathname}"
+exit "${rv}"
diff --git a/cmd/zed/zed.d/statechange-led.sh b/cmd/zed/zed.d/statechange-led.sh
new file mode 100755
index 000000000000..e656e125d378
--- /dev/null
+++ b/cmd/zed/zed.d/statechange-led.sh
@@ -0,0 +1,177 @@
+#!/bin/sh
+#
+# Turn off/on the VDEV's enclosure fault LEDs when the pool's state changes.
+#
+# Turn the VDEV's fault LED on if it becomes FAULTED, DEGRADED or UNAVAIL.
+# Turn the LED off when it's back ONLINE again.
+#
+# This script run in two basic modes:
+#
+# 1. If $ZEVENT_VDEV_ENC_SYSFS_PATH and $ZEVENT_VDEV_STATE_STR are set, then
+# only set the LED for that particular VDEV. This is the case for statechange
+# events and some vdev_* events.
+#
+# 2. If those vars are not set, then check the state of all VDEVs in the pool
+# and set the LEDs accordingly. This is the case for pool_import events.
+#
+# Note that this script requires that your enclosure be supported by the
+# Linux SCSI enclosure services (ses) driver. The script will do nothing
+# if you have no enclosure, or if your enclosure isn't supported.
+#
+# Exit codes:
+# 0: enclosure led successfully set
+# 1: enclosure leds not available
+# 2: enclosure leds administratively disabled
+# 3: The led sysfs path passed from ZFS does not exist
+# 4: $ZPOOL not set
+# 5: awk is not installed
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+# Bail out quietly when no ses-backed enclosure is present.
+if [ ! -d /sys/class/enclosure ] ; then
+ exit 1
+fi
+
+# LED control must be enabled via ZED_USE_ENCLOSURE_LEDS=1 in zed.rc.
+if [ "${ZED_USE_ENCLOSURE_LEDS}" != "1" ] ; then
+ exit 2
+fi
+
+zed_check_cmd "$ZPOOL" || exit 4
+zed_check_cmd awk || exit 5
+
+# Global used in set_led debug print
+vdev=""
+
+# check_and_set_led (file, val)
+#
+# Read an enclosure sysfs file, and write it if it's not already set to 'val'
+#
+# Arguments
+# file: sysfs file to set (like /sys/class/enclosure/0:0:1:0/SLOT 10/fault)
+# val: value to set it to
+#
+# Return
+# 0 on success, 3 on missing sysfs path
+#
+check_and_set_led()
+{
+ file="$1"
+ val="$2"
+
+ if [ ! -e "$file" ] ; then
+ return 3
+ fi
+
+ # If another process is accessing the LED when we attempt to update it,
+ # the update will be lost so retry until the LED actually changes or we
+ # timeout.
+ # (Up to 5 attempts; a read-back equal to $val breaks out early.)
+ for _ in $(seq 1 5); do
+ # We want to check the current state first, since writing to the
+ # 'fault' entry always causes a SES command, even if the
+ # current state is already what you want.
+ current=$(cat "${file}")
+
+ # On some enclosures if you write 1 to fault, and read it back,
+ # it will return 2. Treat all non-zero values as 1 for
+ # simplicity.
+ if [ "$current" != "0" ] ; then
+ current=1
+ fi
+
+ if [ "$current" != "$val" ] ; then
+ echo "$val" > "$file"
+ zed_log_msg "vdev $vdev set '$file' LED to $val"
+ else
+ break
+ fi
+ done
+}
+
+state_to_val()
+{
+ state="$1"
+ if [ "$state" = "FAULTED" ] || [ "$state" = "DEGRADED" ] || \
+ [ "$state" = "UNAVAIL" ] ; then
+ echo 1
+ elif [ "$state" = "ONLINE" ] ; then
+ echo 0
+ fi
+}
+
+# process_pool ([pool])
+#
+# Iterate through a pool (or pools) and set the VDEV's enclosure slot LEDs to
+# the VDEV's state.
+#
+# Arguments
+# pool: Optional pool name. If not specified, iterate though all pools.
+#
+# Return
+# 0 on success, 3 on missing sysfs path
+#
+process_pool()
+{
+ pool="$1"
+ rc=0
+
+ # Lookup all the current LED values and paths in parallel
+ #shellcheck disable=SC2016
+ cmd='echo led_token=$(cat "$VDEV_ENC_SYSFS_PATH/fault"),"$VDEV_ENC_SYSFS_PATH",'
+ out=$($ZPOOL status -vc "$cmd" "$pool" | grep 'led_token=')
+
+ # Iterate with a here-document rather than "echo | while" so that the
+ # loop runs in the current shell and updates to $rc are not lost in a
+ # pipeline subshell (which previously made the 3-return unreachable).
+ #shellcheck disable=SC2034
+ while read -r vdev state read write chksum therest; do
+ # Read out current LED value and path
+ tmp=$(echo "$therest" | sed 's/^.*led_token=//g')
+ vdev_enc_sysfs_path=$(echo "$tmp" | awk -F ',' '{print $2}')
+ current_val=$(echo "$tmp" | awk -F ',' '{print $1}')
+
+ # Normalize: some enclosures report 2 after 1 is written to fault.
+ if [ "$current_val" != "0" ] ; then
+ current_val=1
+ fi
+
+ if [ -z "$vdev_enc_sysfs_path" ] ; then
+ # Skip anything with no sysfs LED entries
+ continue
+ fi
+
+ if [ ! -e "$vdev_enc_sysfs_path/fault" ] ; then
+ rc=1
+ # (was "$file", a variable belonging to check_and_set_led)
+ zed_log_msg "vdev $vdev '$vdev_enc_sysfs_path/fault' doesn't exist"
+ continue
+ fi
+
+ val=$(state_to_val "$state")
+
+ if [ "$current_val" = "$val" ] ; then
+ # LED is already set correctly
+ continue
+ fi
+
+ if ! check_and_set_led "$vdev_enc_sysfs_path/fault" "$val"; then
+ rc=1
+ fi
+
+ done <<EOF
+$out
+EOF
+
+ if [ "$rc" = "0" ] ; then
+ return 0
+ else
+ # We didn't see a sysfs entry that we wanted to set
+ return 3
+ fi
+}
+
+# Mode 1: a single-vdev event carries both the sysfs path and the state;
+# Mode 2: otherwise sweep every vdev in the pool named by the GUID.
+if [ -n "$ZEVENT_VDEV_ENC_SYSFS_PATH" ] && [ -n "$ZEVENT_VDEV_STATE_STR" ] ; then
+ # Got a statechange for an individual VDEV
+ val=$(state_to_val "$ZEVENT_VDEV_STATE_STR")
+ vdev=$(basename "$ZEVENT_VDEV_PATH")
+ check_and_set_led "$ZEVENT_VDEV_ENC_SYSFS_PATH/fault" "$val"
+else
+ # Process the entire pool
+ poolname=$(zed_guid_to_pool "$ZEVENT_POOL_GUID")
+ process_pool "$poolname"
+fi
diff --git a/cmd/zed/zed.d/statechange-notify.sh b/cmd/zed/zed.d/statechange-notify.sh
new file mode 100755
index 000000000000..f46080a03239
--- /dev/null
+++ b/cmd/zed/zed.d/statechange-notify.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+# You may not use this file except in compliance with the license.
+#
+# CDDL HEADER END
+#
+
+#
+# Send notification in response to a fault induced statechange
+#
+# ZEVENT_SUBCLASS: 'statechange'
+# ZEVENT_VDEV_STATE_STR: 'DEGRADED', 'FAULTED' or 'REMOVED'
+#
+# Exit codes:
+# 0: notification sent
+# 1: notification failed
+# 2: notification not configured
+# 3: statechange not relevant
+# 4: statechange string missing (unexpected)
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+[ -n "${ZEVENT_VDEV_STATE_STR}" ] || exit 4
+
+# Only FAULTED, DEGRADED, and REMOVED transitions warrant a notification.
+if [ "${ZEVENT_VDEV_STATE_STR}" != "FAULTED" ] \
+ && [ "${ZEVENT_VDEV_STATE_STR}" != "DEGRADED" ] \
+ && [ "${ZEVENT_VDEV_STATE_STR}" != "REMOVED" ]; then
+ exit 3
+fi
+
+# umask 077 keeps the temporary message file private (0600).
+umask 077
+note_subject="ZFS device fault for pool ${ZEVENT_POOL_GUID} on $(hostname)"
+note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+{
+ if [ "${ZEVENT_VDEV_STATE_STR}" = "FAULTED" ] ; then
+ echo "The number of I/O errors associated with a ZFS device exceeded"
+ echo "acceptable levels. ZFS has marked the device as faulted."
+ elif [ "${ZEVENT_VDEV_STATE_STR}" = "DEGRADED" ] ; then
+ echo "The number of checksum errors associated with a ZFS device"
+ echo "exceeded acceptable levels. ZFS has marked the device as"
+ echo "degraded."
+ else
+ echo "ZFS has detected that a device was removed."
+ fi
+
+ echo
+ echo " impact: Fault tolerance of the pool may be compromised."
+ echo " eid: ${ZEVENT_EID}"
+ echo " class: ${ZEVENT_SUBCLASS}"
+ echo " state: ${ZEVENT_VDEV_STATE_STR}"
+ echo " host: $(hostname)"
+ echo " time: ${ZEVENT_TIME_STRING}"
+
+ # Optional vdev details, included only when the event supplies them.
+ [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}"
+ [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}"
+ [ -n "${ZEVENT_VDEV_PHYSPATH}" ] && echo " vphys: ${ZEVENT_VDEV_PHYSPATH}"
+ [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}"
+ [ -n "${ZEVENT_VDEV_DEVID}" ] && echo " devid: ${ZEVENT_VDEV_DEVID}"
+
+ echo " pool: ${ZEVENT_POOL_GUID}"
+
+} > "${note_pathname}"
+
+zed_notify "${note_subject}" "${note_pathname}"; rv=$?
+
+rm -f "${note_pathname}"
+exit "${rv}"
diff --git a/cmd/zed/zed.d/trim_finish-notify.sh b/cmd/zed/zed.d/trim_finish-notify.sh
new file mode 100755
index 000000000000..5075302997e3
--- /dev/null
+++ b/cmd/zed/zed.d/trim_finish-notify.sh
@@ -0,0 +1,37 @@
+#!/bin/sh
+#
+# Send notification in response to a TRIM_FINISH. The event
+# will be received for each vdev in the pool which was trimmed.
+#
+# Exit codes:
+# 0: notification sent
+# 1: notification failed
+# 2: notification not configured
+# 9: internal error
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+[ -n "${ZEVENT_POOL}" ] || exit 9
+[ -n "${ZEVENT_SUBCLASS}" ] || exit 9
+
+zed_check_cmd "${ZPOOL}" || exit 9
+
+umask 077
+note_subject="ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on $(hostname)"
+note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+{
+ echo "ZFS has finished a trim:"
+ echo
+ echo " eid: ${ZEVENT_EID}"
+ echo " class: ${ZEVENT_SUBCLASS}"
+ echo " host: $(hostname)"
+ echo " time: ${ZEVENT_TIME_STRING}"
+
+ "${ZPOOL}" status -t "${ZEVENT_POOL}"
+
+} > "${note_pathname}"
+
+zed_notify "${note_subject}" "${note_pathname}"; rv=$?
+rm -f "${note_pathname}"
+exit "${rv}"
diff --git a/cmd/zed/zed.d/vdev_attach-led.sh b/cmd/zed/zed.d/vdev_attach-led.sh
new file mode 120000
index 000000000000..7d7404398a4a
--- /dev/null
+++ b/cmd/zed/zed.d/vdev_attach-led.sh
@@ -0,0 +1 @@
+statechange-led.sh \ No newline at end of file
diff --git a/cmd/zed/zed.d/vdev_clear-led.sh b/cmd/zed/zed.d/vdev_clear-led.sh
new file mode 120000
index 000000000000..7d7404398a4a
--- /dev/null
+++ b/cmd/zed/zed.d/vdev_clear-led.sh
@@ -0,0 +1 @@
+statechange-led.sh \ No newline at end of file
diff --git a/cmd/zed/zed.d/zed-functions.sh b/cmd/zed/zed.d/zed-functions.sh
new file mode 100755
index 000000000000..44a9b8d23303
--- /dev/null
+++ b/cmd/zed/zed.d/zed-functions.sh
@@ -0,0 +1,538 @@
+#!/bin/sh
+# shellcheck disable=SC2039
+# zed-functions.sh
+#
+# ZED helper functions for use in ZEDLETs
+
+
+# Variable Defaults
+#
+: "${ZED_LOCKDIR:="/var/lock"}"
+: "${ZED_NOTIFY_INTERVAL_SECS:=3600}"
+: "${ZED_NOTIFY_VERBOSE:=0}"
+: "${ZED_RUNDIR:="/var/run"}"
+: "${ZED_SYSLOG_PRIORITY:="daemon.notice"}"
+: "${ZED_SYSLOG_TAG:="zed"}"
+
+ZED_FLOCK_FD=8
+
+
+# zed_check_cmd (cmd, ...)
+#
+# Verify that each named command can be resolved via PATH, logging an
+# error for every one that cannot.
+#
+# Arguments
+# cmd: name of executable command for which to search
+#
+# Return
+# 0 if all commands are found in PATH and are executable
+# n for a count of the command executables that are not found
+#
+zed_check_cmd()
+{
+ local name
+ local missing=0
+
+ for name in "$@"; do
+ command -v "${name}" >/dev/null 2>&1 && continue
+ zed_log_err "\"${name}\" not installed"
+ missing=$((missing + 1))
+ done
+ return "${missing}"
+}
+
+
+# zed_log_msg (msg, ...)
+#
+# Write all argument strings to the system log.
+#
+# Globals
+# ZED_SYSLOG_PRIORITY
+# ZED_SYSLOG_TAG
+#
+# Return
+# nothing
+#
+zed_log_msg()
+{
+ # "--" ends option parsing so messages beginning with "-" log intact.
+ logger -p "${ZED_SYSLOG_PRIORITY}" -t "${ZED_SYSLOG_TAG}" -- "$@"
+}
+
+
+# zed_log_err (msg, ...)
+#
+# Write an error message to the system log.  This message will contain the
+# script name, EID, and all argument strings.
+#
+# Globals
+# ZED_SYSLOG_PRIORITY
+# ZED_SYSLOG_TAG
+# ZEVENT_EID
+#
+# Return
+# nothing
+#
+zed_log_err()
+{
+ # The "${ZEVENT_EID:+...}" expansion yields " eid=N:" when the event
+ # ID is set and nothing otherwise, so the prefix stays compact.
+ logger -p "${ZED_SYSLOG_PRIORITY}" -t "${ZED_SYSLOG_TAG}" -- "error:" \
+ "$(basename -- "$0"):""${ZEVENT_EID:+" eid=${ZEVENT_EID}:"}" "$@"
+}
+
+
+# zed_lock (lockfile, [fd])
+#
+# Obtain an exclusive (write) lock on [lockfile].  If the lock cannot be
+# immediately acquired, wait until it becomes available.
+#
+# Every zed_lock() must be paired with a corresponding zed_unlock().
+#
+# By default, flock-style locks associate the lockfile with file descriptor 8.
+# The bash manpage warns that file descriptors >9 should be used with care as
+# they may conflict with file descriptors used internally by the shell.  File
+# descriptor 9 is reserved for zed_rate_limit().  If concurrent locks are held
+# within the same process, they must use different file descriptors (preferably
+# decrementing from 8); otherwise, obtaining a new lock with a given file
+# descriptor will release the previous lock associated with that descriptor.
+#
+# Arguments
+# lockfile: pathname of the lock file; the lock will be stored in
+# ZED_LOCKDIR unless the pathname contains a "/".
+# fd: integer for the file descriptor used by flock (OPTIONAL unless holding
+# concurrent locks)
+#
+# Globals
+# ZED_FLOCK_FD
+# ZED_LOCKDIR
+#
+# Return
+# nothing
+#
+zed_lock()
+{
+ local lockfile="$1"
+ local fd="${2:-${ZED_FLOCK_FD}}"
+ local umask_bak
+ local err
+
+ [ -n "${lockfile}" ] || return
+ # A bare name (no "/") is placed under ZED_LOCKDIR.
+ if ! expr "${lockfile}" : '.*/' >/dev/null 2>&1; then
+ lockfile="${ZED_LOCKDIR}/${lockfile}"
+ fi
+
+ # Create the lockfile with restrictive permissions.
+ umask_bak="$(umask)"
+ umask 077
+
+ # Obtain a lock on the file bound to the given file descriptor.
+ # (eval is required because the fd number is held in a variable.)
+ eval "exec ${fd}> '${lockfile}'"
+ err="$(flock --exclusive "${fd}" 2>&1)"
+ # shellcheck disable=SC2181
+ if [ $? -ne 0 ]; then
+ # NOTE(review): execution continues after a failed flock —
+ # presumably best-effort by design; confirm.
+ zed_log_err "failed to lock \"${lockfile}\": ${err}"
+ fi
+
+ umask "${umask_bak}"
+}
+
+
+# zed_unlock (lockfile, [fd])
+#
+# Release the lock on [lockfile].
+#
+# Arguments
+# lockfile: pathname of the lock file
+# fd: integer for the file descriptor used by flock (must match the file
+# descriptor passed to the zed_lock function call)
+#
+# Globals
+# ZED_FLOCK_FD
+# ZED_LOCKDIR
+#
+# Return
+# nothing
+#
+zed_unlock()
+{
+ local lockfile="$1"
+ local fd="${2:-${ZED_FLOCK_FD}}"
+ local err
+
+ [ -n "${lockfile}" ] || return
+ # Resolve a bare name under ZED_LOCKDIR, mirroring zed_lock.
+ if ! expr "${lockfile}" : '.*/' >/dev/null 2>&1; then
+ lockfile="${ZED_LOCKDIR}/${lockfile}"
+ fi
+
+ # Release the lock and close the file descriptor.
+ err="$(flock --unlock "${fd}" 2>&1)"
+ # shellcheck disable=SC2181
+ if [ $? -ne 0 ]; then
+ zed_log_err "failed to unlock \"${lockfile}\": ${err}"
+ fi
+ # (eval is required because the fd number is held in a variable.)
+ eval "exec ${fd}>&-"
+}
+
+
+# zed_notify (subject, pathname)
+#
+# Send a notification via all available methods.
+#
+# Arguments
+# subject: notification subject
+# pathname: pathname containing the notification message (OPTIONAL)
+#
+# Return
+# 0: notification succeeded via at least one method
+# 1: notification failed
+# 2: no notification methods configured
+#
+zed_notify()
+{
+ local subject="$1"
+ local pathname="$2"
+ local num_success=0
+ local num_failure=0
+ local rv
+
+ # Each method returns 0 (sent), 1 (failed), or 2 (not configured);
+ # unconfigured methods count toward neither total.
+ zed_notify_email "${subject}" "${pathname}"; rv=$?
+ [ "${rv}" -eq 0 ] && num_success=$((num_success + 1))
+ [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1))
+
+ zed_notify_pushbullet "${subject}" "${pathname}"; rv=$?
+ [ "${rv}" -eq 0 ] && num_success=$((num_success + 1))
+ [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1))
+
+ zed_notify_slack_webhook "${subject}" "${pathname}"; rv=$?
+ [ "${rv}" -eq 0 ] && num_success=$((num_success + 1))
+ [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1))
+
+ [ "${num_success}" -gt 0 ] && return 0
+ [ "${num_failure}" -gt 0 ] && return 1
+ return 2
+}
+
+
+# zed_notify_email (subject, pathname)
+#
+# Send a notification via email to the address specified by ZED_EMAIL_ADDR.
+#
+# Requires the mail executable to be installed in the standard PATH, or
+# ZED_EMAIL_PROG to be defined with the pathname of an executable capable of
+# reading a message body from stdin.
+#
+# Command-line options to the mail executable can be specified in
+# ZED_EMAIL_OPTS.  This undergoes the following keyword substitutions:
+# - @ADDRESS@ is replaced with the space-delimited recipient email address(es)
+# - @SUBJECT@ is replaced with the notification subject
+#
+# Arguments
+# subject: notification subject
+# pathname: pathname containing the notification message (OPTIONAL)
+#
+# Globals
+# ZED_EMAIL_PROG
+# ZED_EMAIL_OPTS
+# ZED_EMAIL_ADDR
+#
+# Return
+# 0: notification sent
+# 1: notification failed
+# 2: not configured
+#
+zed_notify_email()
+{
+ local subject="$1"
+ local pathname="${2:-"/dev/null"}"
+
+ : "${ZED_EMAIL_PROG:="mail"}"
+ : "${ZED_EMAIL_OPTS:="-s '@SUBJECT@' @ADDRESS@"}"
+
+ # For backward compatibility with ZED_EMAIL.
+ if [ -n "${ZED_EMAIL}" ] && [ -z "${ZED_EMAIL_ADDR}" ]; then
+ ZED_EMAIL_ADDR="${ZED_EMAIL}"
+ fi
+ [ -n "${ZED_EMAIL_ADDR}" ] || return 2
+
+ zed_check_cmd "${ZED_EMAIL_PROG}" || return 1
+
+ [ -n "${subject}" ] || return 1
+ if [ ! -r "${pathname}" ]; then
+ zed_log_err \
+ "$(basename "${ZED_EMAIL_PROG}") cannot read \"${pathname}\""
+ return 1
+ fi
+
+ # NOTE(review): the sed substitutions will misbehave if the address or
+ # subject contains "/", "&", or "\" — confirm inputs are benign.
+ ZED_EMAIL_OPTS="$(echo "${ZED_EMAIL_OPTS}" \
+ | sed -e "s/@ADDRESS@/${ZED_EMAIL_ADDR}/g" \
+ -e "s/@SUBJECT@/${subject}/g")"
+
+ # eval lets ZED_EMAIL_OPTS carry its own quoting (e.g. '@SUBJECT@').
+ # shellcheck disable=SC2086
+ eval "${ZED_EMAIL_PROG}" ${ZED_EMAIL_OPTS} < "${pathname}" >/dev/null 2>&1
+ rv=$?
+ if [ "${rv}" -ne 0 ]; then
+ zed_log_err "$(basename "${ZED_EMAIL_PROG}") exit=${rv}"
+ return 1
+ fi
+ return 0
+}
+
+
+# zed_notify_pushbullet (subject, pathname)
+#
+# Send a notification via Pushbullet <https://www.pushbullet.com/>.
+# The access token (ZED_PUSHBULLET_ACCESS_TOKEN) identifies this client to the
+# Pushbullet server.  The optional channel tag (ZED_PUSHBULLET_CHANNEL_TAG) is
+# for pushing to notification feeds that can be subscribed to; if a channel is
+# not defined, push notifications will instead be sent to all devices
+# associated with the account specified by the access token.
+#
+# Requires awk, curl, and sed executables to be installed in the standard PATH.
+#
+# References
+# https://docs.pushbullet.com/
+# https://www.pushbullet.com/security
+#
+# Arguments
+# subject: notification subject
+# pathname: pathname containing the notification message (OPTIONAL)
+#
+# Globals
+# ZED_PUSHBULLET_ACCESS_TOKEN
+# ZED_PUSHBULLET_CHANNEL_TAG
+#
+# Return
+# 0: notification sent
+# 1: notification failed
+# 2: not configured
+#
+zed_notify_pushbullet()
+{
+ local subject="$1"
+ local pathname="${2:-"/dev/null"}"
+ local msg_body
+ local msg_tag
+ local msg_json
+ local msg_out
+ local msg_err
+ local rv
+ local url="https://api.pushbullet.com/v2/pushes"
+
+ [ -n "${ZED_PUSHBULLET_ACCESS_TOKEN}" ] || return 2
+
+ [ -n "${subject}" ] || return 1
+ if [ ! -r "${pathname}" ]; then
+ zed_log_err "pushbullet cannot read \"${pathname}\""
+ return 1
+ fi
+
+ zed_check_cmd "awk" "curl" "sed" || return 1
+
+ # Escape the following characters in the message body for JSON:
+ # newline, backslash, double quote, horizontal tab, vertical tab,
+ # and carriage return.
+ #
+ msg_body="$(awk '{ ORS="\\n" } { gsub(/\\/, "\\\\"); gsub(/"/, "\\\"");
+ gsub(/\t/, "\\t"); gsub(/\f/, "\\f"); gsub(/\r/, "\\r"); print }' \
+ "${pathname}")"
+
+ # Push to a channel if one is configured.
+ #
+ [ -n "${ZED_PUSHBULLET_CHANNEL_TAG}" ] && msg_tag="$(printf \
+ '"channel_tag": "%s", ' "${ZED_PUSHBULLET_CHANNEL_TAG}")"
+
+ # Construct the JSON message for pushing a note.
+ #
+ msg_json="$(printf '{%s"type": "note", "title": "%s", "body": "%s"}' \
+ "${msg_tag}" "${subject}" "${msg_body}")"
+
+ # Send the POST request and check for errors.
+ #
+ msg_out="$(curl -u "${ZED_PUSHBULLET_ACCESS_TOKEN}:" -X POST "${url}" \
+ --header "Content-Type: application/json" --data-binary "${msg_json}" \
+ 2>/dev/null)"; rv=$?
+ if [ "${rv}" -ne 0 ]; then
+ zed_log_err "curl exit=${rv}"
+ return 1
+ fi
+ msg_err="$(echo "${msg_out}" \
+ | sed -n -e 's/.*"error" *:.*"message" *: *"\([^"]*\)".*/\1/p')"
+ if [ -n "${msg_err}" ]; then
+ # (quoting normalized: the closing escape was outside the string)
+ zed_log_err "pushbullet \"${msg_err}\""
+ return 1
+ fi
+ return 0
+}
+
+
+# zed_notify_slack_webhook (subject, pathname)
+#
+# Notification via Slack Webhook <https://api.slack.com/incoming-webhooks>.
+# The Webhook URL (ZED_SLACK_WEBHOOK_URL) identifies this client to the
+# Slack channel.
+#
+# Requires awk, curl, and sed executables to be installed in the standard PATH.
+#
+# References
+# https://api.slack.com/incoming-webhooks
+#
+# Arguments
+# subject: notification subject
+# pathname: pathname containing the notification message (OPTIONAL)
+#
+# Globals
+# ZED_SLACK_WEBHOOK_URL
+#
+# Return
+# 0: notification sent
+# 1: notification failed
+# 2: not configured
+#
+zed_notify_slack_webhook()
+{
+ [ -n "${ZED_SLACK_WEBHOOK_URL}" ] || return 2
+
+ local subject="$1"
+ local pathname="${2:-"/dev/null"}"
+ local msg_body
+ local msg_tag
+ local msg_json
+ local msg_out
+ local msg_err
+ local rv
+ local url="${ZED_SLACK_WEBHOOK_URL}"
+
+ [ -n "${subject}" ] || return 1
+ if [ ! -r "${pathname}" ]; then
+ zed_log_err "slack webhook cannot read \"${pathname}\""
+ return 1
+ fi
+
+ zed_check_cmd "awk" "curl" "sed" || return 1
+
+ # Escape the following characters in the message body for JSON:
+ # newline, backslash, double quote, horizontal tab, vertical tab,
+ # and carriage return.
+ #
+ msg_body="$(awk '{ ORS="\\n" } { gsub(/\\/, "\\\\"); gsub(/"/, "\\\"");
+ gsub(/\t/, "\\t"); gsub(/\f/, "\\f"); gsub(/\r/, "\\r"); print }' \
+ "${pathname}")"
+
+ # Construct the JSON message for posting.
+ #
+ msg_json="$(printf '{"text": "*%s*\n%s"}' "${subject}" "${msg_body}" )"
+
+ # Send the POST request and check for errors.
+ #
+ msg_out="$(curl -X POST "${url}" \
+ --header "Content-Type: application/json" --data-binary "${msg_json}" \
+ 2>/dev/null)"; rv=$?
+ if [ "${rv}" -ne 0 ]; then
+ zed_log_err "curl exit=${rv}"
+ return 1
+ fi
+ # NOTE(review): this regex matches a pushbullet-style
+ # {"error":...,"message":...} body; Slack webhooks typically answer
+ # with plain "ok" — confirm the error format actually applies here.
+ msg_err="$(echo "${msg_out}" \
+ | sed -n -e 's/.*"error" *:.*"message" *: *"\([^"]*\)".*/\1/p')"
+ if [ -n "${msg_err}" ]; then
+ # (quoting normalized: the closing escape was outside the string)
+ zed_log_err "slack webhook \"${msg_err}\""
+ return 1
+ fi
+ return 0
+}
+
+# zed_rate_limit (tag, [interval])
+#
+# Check whether an event of a given type [tag] has already occurred within the
+# last [interval] seconds.
+#
+# This function obtains a lock on the statefile using file descriptor 9.
+#
+# Arguments
+# tag: arbitrary string for grouping related events to rate-limit
+# interval: time interval in seconds (OPTIONAL)
+#
+# Globals
+# ZED_NOTIFY_INTERVAL_SECS
+# ZED_RUNDIR
+#
+# Return
+# 0 if the event should be processed
+# 1 if the event should be dropped
+#
+# State File Format
+# time;tag
+#
+zed_rate_limit()
+{
+ local tag="$1"
+ local interval="${2:-${ZED_NOTIFY_INTERVAL_SECS}}"
+ local lockfile="zed.zedlet.state.lock"
+ local lockfile_fd=9
+ local statefile="${ZED_RUNDIR}/zed.zedlet.state"
+ local time_now
+ local time_prev
+ local umask_bak
+ local rv=0
+
+ # Without a tag there is nothing to rate-limit; always process.
+ [ -n "${tag}" ] || return 0
+
+ zed_lock "${lockfile}" "${lockfile_fd}"
+ time_now="$(date +%s)"
+ # Most recent timestamp recorded for this tag, if any (empty when the
+ # statefile is missing or holds no entry for the tag).
+ time_prev="$(grep -E "^[0-9]+;${tag}\$" "${statefile}" 2>/dev/null \
+ | tail -1 | cut -d\; -f1)"
+
+ if [ -n "${time_prev}" ] \
+ && [ "$((time_now - time_prev))" -lt "${interval}" ]; then
+ rv=1
+ else
+ # Rewrite the statefile via a temp file: drop any older entry
+ # for this tag, append the fresh timestamp, then rename.
+ umask_bak="$(umask)"
+ umask 077
+ grep -E -v "^[0-9]+;${tag}\$" "${statefile}" 2>/dev/null \
+ > "${statefile}.$$"
+ echo "${time_now};${tag}" >> "${statefile}.$$"
+ mv -f "${statefile}.$$" "${statefile}"
+ umask "${umask_bak}"
+ fi
+
+ zed_unlock "${lockfile}" "${lockfile_fd}"
+ return "${rv}"
+}
+
+
+# zed_guid_to_pool (guid)
+#
+# Convert a pool GUID into its pool name (like "tank")
+# Arguments
+# guid: pool GUID (decimal or hex)
+#
+# Return
+# Pool name
+#
+zed_guid_to_pool()
+{
+ if [ -z "$1" ] ; then
+ return
+ fi
+
+ # printf normalizes a decimal or hex ("0x...") GUID to decimal so it
+ # can be compared against zpool's decimal output below.
+ guid=$(printf "%llu" "$1")
+ if [ -n "$guid" ] ; then
+ # With no pool argument, "zpool get" reports the guid of every
+ # imported pool; awk selects the name of the matching pool.
+ $ZPOOL get -H -ovalue,name guid | awk '$1=='"$guid"' {print $2}'
+ fi
+}
+
+# zed_exit_if_ignoring_this_event
+#
+# Exit the script if we should ignore this event, as determined by
+# $ZED_SYSLOG_SUBCLASS_INCLUDE and $ZED_SYSLOG_SUBCLASS_EXCLUDE in zed.rc.
+# This function assumes you've imported the normal zed variables.
+zed_exit_if_ignoring_this_event()
+{
+ # eval is used so that user-supplied patterns (which may contain "|"
+ # alternation and shell wildcards) are expanded as case patterns.
+ # INCLUDE takes precedence over EXCLUDE when both are set.
+ if [ -n "${ZED_SYSLOG_SUBCLASS_INCLUDE}" ]; then
+ eval "case ${ZEVENT_SUBCLASS} in
+ ${ZED_SYSLOG_SUBCLASS_INCLUDE});;
+ *) exit 0;;
+ esac"
+ elif [ -n "${ZED_SYSLOG_SUBCLASS_EXCLUDE}" ]; then
+ eval "case ${ZEVENT_SUBCLASS} in
+ ${ZED_SYSLOG_SUBCLASS_EXCLUDE}) exit 0;;
+ *);;
+ esac"
+ fi
+}
diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc
new file mode 100644
index 000000000000..1b220d28db20
--- /dev/null
+++ b/cmd/zed/zed.d/zed.rc
@@ -0,0 +1,122 @@
+##
+# zed.rc
+#
+# This file should be owned by root and have its permissions set to 0600.
+##
+
+##
+# Absolute path to the debug output file.
+#
+#ZED_DEBUG_LOG="/tmp/zed.debug.log"
+
+##
+# Email address of the zpool administrator for receipt of notifications;
+# multiple addresses can be specified if they are delimited by whitespace.
+# Email will only be sent if ZED_EMAIL_ADDR is defined.
+# Disabled by default; uncomment to enable.
+#
+#ZED_EMAIL_ADDR="root"
+
+##
+# Name or path of executable responsible for sending notifications via email;
+# the mail program must be capable of reading a message body from stdin.
+# Email will only be sent if ZED_EMAIL_ADDR is defined.
+#
+#ZED_EMAIL_PROG="mail"
+
+##
+# Command-line options for ZED_EMAIL_PROG.
+# The string @ADDRESS@ will be replaced with the recipient email address(es).
+# The string @SUBJECT@ will be replaced with the notification subject;
+# this should be protected with quotes to prevent word-splitting.
+# Email will only be sent if ZED_EMAIL_ADDR is defined.
+#
+#ZED_EMAIL_OPTS="-s '@SUBJECT@' @ADDRESS@"
+
+##
+# Default directory for zed lock files.
+#
+#ZED_LOCKDIR="/var/lock"
+
+##
+# Minimum number of seconds between notifications for a similar event.
+#
+#ZED_NOTIFY_INTERVAL_SECS=3600
+
+##
+# Notification verbosity.
+# If set to 0, suppress notification if the pool is healthy.
+# If set to 1, send notification regardless of pool health.
+#
+#ZED_NOTIFY_VERBOSE=0
+
+##
+# Send notifications for 'ereport.fs.zfs.data' events.
+# Disabled by default, any non-empty value will enable the feature.
+#
+#ZED_NOTIFY_DATA=
+
+##
+# Pushbullet access token.
+# This grants full access to your account -- protect it accordingly!
+# <https://www.pushbullet.com/get-started>
+# <https://www.pushbullet.com/account>
+# Disabled by default; uncomment to enable.
+#
+#ZED_PUSHBULLET_ACCESS_TOKEN=""
+
+##
+# Pushbullet channel tag for push notification feeds that can be subscribed to.
+# <https://www.pushbullet.com/my-channel>
+# If not defined, push notifications will instead be sent to all devices
+# associated with the account specified by the access token.
+# Disabled by default; uncomment to enable.
+#
+#ZED_PUSHBULLET_CHANNEL_TAG=""
+
+##
+# Slack Webhook URL.
+# This allows posting to the given channel and includes an access token.
+# <https://api.slack.com/incoming-webhooks>
+# Disabled by default; uncomment to enable.
+#
+#ZED_SLACK_WEBHOOK_URL=""
+
+##
+# Default directory for zed state files.
+#
+#ZED_RUNDIR="/var/run"
+
+##
+# Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED. This works for
+# device mapper and multipath devices as well. Your enclosure must be
+# supported by the Linux SES driver for this to work.
+#
+ZED_USE_ENCLOSURE_LEDS=1
+
+##
+# Run a scrub after every resilver
+# Disabled by default, 1 to enable and 0 to disable.
+#ZED_SCRUB_AFTER_RESILVER=0
+
+##
+# The syslog priority (e.g., specified as a "facility.level" pair).
+#
+#ZED_SYSLOG_PRIORITY="daemon.notice"
+
+##
+# The syslog tag for marking zed events.
+#
+#ZED_SYSLOG_TAG="zed"
+
+##
+# Which set of event subclasses to log
+# By default, events from all subclasses are logged.
+# If ZED_SYSLOG_SUBCLASS_INCLUDE is set, only subclasses
+# matching the pattern are logged. Use the pipe symbol (|)
+# or shell wildcards (*, ?) to match multiple subclasses.
+# Otherwise, if ZED_SYSLOG_SUBCLASS_EXCLUDE is set, the
+# matching subclasses are excluded from logging.
+#ZED_SYSLOG_SUBCLASS_INCLUDE="checksum|scrub_*|vdev.*"
+#ZED_SYSLOG_SUBCLASS_EXCLUDE="statechange|config_*|history_event"
+
diff --git a/cmd/zed/zed.h b/cmd/zed/zed.h
new file mode 100644
index 000000000000..3ac0e63141e8
--- /dev/null
+++ b/cmd/zed/zed.h
@@ -0,0 +1,58 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef ZED_H
+#define ZED_H
+
+/*
+ * Absolute path for the default zed configuration file.
+ */
+#define ZED_CONF_FILE SYSCONFDIR "/zfs/zed.conf"
+
+/*
+ * Absolute path for the default zed pid file.
+ */
+#define ZED_PID_FILE RUNSTATEDIR "/zed.pid"
+
+/*
+ * Absolute path for the default zed state file.
+ */
+#define ZED_STATE_FILE RUNSTATEDIR "/zed.state"
+
+/*
+ * Absolute path for the default zed zedlet directory.
+ */
+#define ZED_ZEDLET_DIR SYSCONFDIR "/zfs/zed.d"
+
+/*
+ * Reserved for future use.
+ */
+#define ZED_MAX_EVENTS 0
+
+/*
+ * Reserved for future use.
+ */
+#define ZED_MIN_EVENTS 0
+
+/*
+ * String prefix for ZED variables passed via environment variables.
+ */
+#define ZED_VAR_PREFIX "ZED_"
+
+/*
+ * String prefix for ZFS event names passed via environment variables.
+ */
+#define ZEVENT_VAR_PREFIX "ZEVENT_"
+
+#endif /* !ZED_H */
diff --git a/cmd/zed/zed_conf.c b/cmd/zed/zed_conf.c
new file mode 100644
index 000000000000..52370eb87b29
--- /dev/null
+++ b/cmd/zed/zed_conf.c
@@ -0,0 +1,735 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <unistd.h>
+#include "zed.h"
+#include "zed_conf.h"
+#include "zed_file.h"
+#include "zed_log.h"
+#include "zed_strings.h"
+
+/*
+ * Return a new configuration with default values.
+ */
+struct zed_conf *
+zed_conf_create(void)
+{
+	struct zed_conf *zcp;
+
+	/* calloc() zero-fills: all bitflags start 0 and all ptrs start NULL */
+	zcp = calloc(1, sizeof (*zcp));
+	if (!zcp)
+		goto nomem;
+
+	zcp->syslog_facility = LOG_DAEMON;
+	zcp->min_events = ZED_MIN_EVENTS;
+	zcp->max_events = ZED_MAX_EVENTS;
+	zcp->pid_fd = -1;
+	zcp->zedlets = NULL;		/* created via zed_conf_scan_dir() */
+	zcp->state_fd = -1;		/* opened via zed_conf_open_state() */
+	zcp->zfs_hdl = NULL;		/* opened via zed_event_init() */
+	zcp->zevent_fd = -1;		/* opened via zed_event_init() */
+
+	/* Default paths; each may be overridden by zed_conf_parse_opts(). */
+	if (!(zcp->conf_file = strdup(ZED_CONF_FILE)))
+		goto nomem;
+
+	if (!(zcp->pid_file = strdup(ZED_PID_FILE)))
+		goto nomem;
+
+	if (!(zcp->zedlet_dir = strdup(ZED_ZEDLET_DIR)))
+		goto nomem;
+
+	if (!(zcp->state_file = strdup(ZED_STATE_FILE)))
+		goto nomem;
+
+	return (zcp);
+
+nomem:
+	/*
+	 * zed_log_die() presumably terminates the daemon (partial allocations
+	 * above are reclaimed by process exit) -- TODO confirm; the trailing
+	 * return only placates the compiler.
+	 */
+	zed_log_die("Failed to create conf: %s", strerror(errno));
+	return (NULL);
+}
+
+/*
+ * Destroy the configuration [zcp].
+ *
+ * Note: zfs_hdl & zevent_fd are destroyed via zed_event_fini().
+ */
+void
+zed_conf_destroy(struct zed_conf *zcp)
+{
+	if (!zcp)
+		return;
+
+	/* Closing state_fd also releases the advisory lock taken on it. */
+	if (zcp->state_fd >= 0) {
+		if (close(zcp->state_fd) < 0)
+			zed_log_msg(LOG_WARNING,
+			    "Failed to close state file \"%s\": %s",
+			    zcp->state_file, strerror(errno));
+		zcp->state_fd = -1;
+	}
+	/*
+	 * Unlink the PID file before pid_fd is closed below, i.e., while the
+	 * lock on it is still held.  A missing file (ENOENT) is not an error.
+	 */
+	if (zcp->pid_file) {
+		if ((unlink(zcp->pid_file) < 0) && (errno != ENOENT))
+			zed_log_msg(LOG_WARNING,
+			    "Failed to remove PID file \"%s\": %s",
+			    zcp->pid_file, strerror(errno));
+	}
+	if (zcp->pid_fd >= 0) {
+		if (close(zcp->pid_fd) < 0)
+			zed_log_msg(LOG_WARNING,
+			    "Failed to close PID file \"%s\": %s",
+			    zcp->pid_file, strerror(errno));
+		zcp->pid_fd = -1;
+	}
+	/* Free all strdup'd paths; NULL them to guard against reuse. */
+	if (zcp->conf_file) {
+		free(zcp->conf_file);
+		zcp->conf_file = NULL;
+	}
+	if (zcp->pid_file) {
+		free(zcp->pid_file);
+		zcp->pid_file = NULL;
+	}
+	if (zcp->zedlet_dir) {
+		free(zcp->zedlet_dir);
+		zcp->zedlet_dir = NULL;
+	}
+	if (zcp->state_file) {
+		free(zcp->state_file);
+		zcp->state_file = NULL;
+	}
+	if (zcp->zedlets) {
+		zed_strings_destroy(zcp->zedlets);
+		zcp->zedlets = NULL;
+	}
+	free(zcp);
+}
+
+/*
+ * Display command-line help and exit.
+ *
+ * If [got_err] is 0, output to stdout and exit normally;
+ * otherwise, output to stderr and exit with a failure status.
+ */
+static void
+_zed_conf_display_help(const char *prog, int got_err)
+{
+	FILE *fp = got_err ? stderr : stdout;
+	int w1 = 4;			/* width of leading whitespace */
+	int w2 = 8;			/* width of L-justified option field */
+
+	/*
+	 * 0x20 is the ASCII space character, consumed by "%*c" to emit the
+	 * w1-wide leading indent; "-w2" left-justifies the option name.
+	 */
+	fprintf(fp, "Usage: %s [OPTION]...\n", (prog ? prog : "zed"));
+	fprintf(fp, "\n");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-h",
+	    "Display help.");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-L",
+	    "Display license information.");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-V",
+	    "Display version information.");
+	fprintf(fp, "\n");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-v",
+	    "Be verbose.");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-f",
+	    "Force daemon to run.");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-F",
+	    "Run daemon in the foreground.");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-I",
+	    "Idle daemon until kernel module is (re)loaded.");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-M",
+	    "Lock all pages in memory.");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-P",
+	    "$PATH for ZED to use (only used by ZTS).");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-Z",
+	    "Zero state file.");
+	fprintf(fp, "\n");
+#if 0
+	/* "-c FILE" is hidden until zed_conf_parse_file() is implemented. */
+	fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-c FILE",
+	    "Read configuration from FILE.", ZED_CONF_FILE);
+#endif
+	fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-d DIR",
+	    "Read enabled ZEDLETs from DIR.", ZED_ZEDLET_DIR);
+	fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-p FILE",
+	    "Write daemon's PID to FILE.", ZED_PID_FILE);
+	fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-s FILE",
+	    "Write daemon's state to FILE.", ZED_STATE_FILE);
+	fprintf(fp, "\n");
+
+	exit(got_err ? EXIT_FAILURE : EXIT_SUCCESS);
+}
+
+/*
+ * Display license information to stdout and exit.
+ */
+static void
+_zed_conf_display_license(void)
+{
+	const char **pp;
+	/* NULL sentinel terminates the iteration below. */
+	const char *text[] = {
+		"The ZFS Event Daemon (ZED) is distributed under the terms of the",
+		"  Common Development and Distribution License (CDDL-1.0)",
+		"  <http://opensource.org/licenses/CDDL-1.0>.",
+		"",
+		"Developed at Lawrence Livermore National Laboratory"
+		" (LLNL-CODE-403049).",
+		"",
+		NULL
+	};
+
+	for (pp = text; *pp; pp++)
+		printf("%s\n", *pp);
+
+	exit(EXIT_SUCCESS);
+}
+
+/*
+ * Display version information to stdout and exit.
+ */
+static void
+_zed_conf_display_version(void)
+{
+	/* ZFS_META_* come from the build system's generated metadata. */
+	printf("%s-%s-%s\n",
+	    ZFS_META_NAME, ZFS_META_VERSION, ZFS_META_RELEASE);
+
+	exit(EXIT_SUCCESS);
+}
+
+/*
+ * Copy the [path] string to the [resultp] ptr.
+ * If [path] is not an absolute path, prefix it with the current working dir.
+ * If [resultp] is non-null, free its existing string before assignment.
+ */
+static void
+_zed_conf_parse_path(char **resultp, const char *path)
+{
+	char buf[PATH_MAX];
+
+	assert(resultp != NULL);
+	assert(path != NULL);
+
+	/* Replace (not leak) any previously-assigned value. */
+	if (*resultp)
+		free(*resultp);
+
+	if (path[0] == '/') {
+		*resultp = strdup(path);
+	} else if (!getcwd(buf, sizeof (buf))) {
+		zed_log_die("Failed to get current working dir: %s",
+		    strerror(errno));
+	} else if (strlcat(buf, "/", sizeof (buf)) >= sizeof (buf)) {
+		zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG));
+	} else if (strlcat(buf, path, sizeof (buf)) >= sizeof (buf)) {
+		zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG));
+	} else {
+		*resultp = strdup(buf);
+	}
+	/* Catches strdup() failure from either assignment branch above. */
+	if (!*resultp)
+		zed_log_die("Failed to copy path: %s", strerror(ENOMEM));
+}
+
+/*
+ * Parse the command-line options into the configuration [zcp].
+ */
+void
+zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv)
+{
+	/*
+	 * Leading ':' makes getopt() return ':' (not '?') for a missing
+	 * option argument; that value is not matched by a case label here,
+	 * so it falls into the default "Invalid option" path -- the message
+	 * then names the option rather than the missing argument.
+	 */
+	const char * const opts = ":hLVc:d:p:P:s:vfFMZI";
+	int opt;
+
+	if (!zcp || !argv || !argv[0])
+		zed_log_die("Failed to parse options: Internal error");
+
+	opterr = 0;		/* suppress default getopt err msgs */
+
+	while ((opt = getopt(argc, argv, opts)) != -1) {
+		switch (opt) {
+		case 'h':
+			_zed_conf_display_help(argv[0], EXIT_SUCCESS);
+			break;
+		case 'L':
+			_zed_conf_display_license();
+			break;
+		case 'V':
+			_zed_conf_display_version();
+			break;
+		case 'c':
+			_zed_conf_parse_path(&zcp->conf_file, optarg);
+			break;
+		case 'd':
+			_zed_conf_parse_path(&zcp->zedlet_dir, optarg);
+			break;
+		case 'I':
+			zcp->do_idle = 1;
+			break;
+		case 'p':
+			_zed_conf_parse_path(&zcp->pid_file, optarg);
+			break;
+		case 'P':
+			_zed_conf_parse_path(&zcp->path, optarg);
+			break;
+		case 's':
+			_zed_conf_parse_path(&zcp->state_file, optarg);
+			break;
+		case 'v':
+			zcp->do_verbose = 1;
+			break;
+		case 'f':
+			zcp->do_force = 1;
+			break;
+		case 'F':
+			zcp->do_foreground = 1;
+			break;
+		case 'M':
+			zcp->do_memlock = 1;
+			break;
+		case 'Z':
+			zcp->do_zero = 1;
+			break;
+		case '?':
+		default:
+			/* Treat a literal "-?" as a request for help. */
+			if (optopt == '?')
+				_zed_conf_display_help(argv[0], EXIT_SUCCESS);
+
+			fprintf(stderr, "%s: %s '-%c'\n\n", argv[0],
+			    "Invalid option", optopt);
+			_zed_conf_display_help(argv[0], EXIT_FAILURE);
+			break;
+		}
+	}
+}
+
+/*
+ * Parse the configuration file into the configuration [zcp].
+ *
+ * FIXME: Not yet implemented.
+ */
+void
+zed_conf_parse_file(struct zed_conf *zcp)
+{
+	/* Stub: only validates its argument until parsing is implemented. */
+	if (!zcp)
+		zed_log_die("Failed to parse config: %s", strerror(EINVAL));
+}
+
+/*
+ * Scan the [zcp] zedlet_dir for files to exec based on the event class.
+ * Files must be executable by user, but not writable by group or other.
+ * Dotfiles are ignored.
+ *
+ * Return 0 on success with an updated set of zedlets,
+ * or -1 on error with errno set.
+ *
+ * FIXME: Check if zedlet_dir and all parent dirs are secure.
+ */
+int
+zed_conf_scan_dir(struct zed_conf *zcp)
+{
+	zed_strings_t *zedlets;
+	DIR *dirp;
+	struct dirent *direntp;
+	char pathname[PATH_MAX];
+	struct stat st;
+	int n;
+
+	if (!zcp) {
+		errno = EINVAL;
+		zed_log_msg(LOG_ERR, "Failed to scan zedlet dir: %s",
+		    strerror(errno));
+		return (-1);
+	}
+	/* Build a fresh set; the old one is only replaced on full success. */
+	zedlets = zed_strings_create();
+	if (!zedlets) {
+		errno = ENOMEM;
+		zed_log_msg(LOG_WARNING, "Failed to scan dir \"%s\": %s",
+		    zcp->zedlet_dir, strerror(errno));
+		return (-1);
+	}
+	dirp = opendir(zcp->zedlet_dir);
+	if (!dirp) {
+		/* Preserve opendir's errno across the logging calls. */
+		int errno_bak = errno;
+		zed_log_msg(LOG_WARNING, "Failed to open dir \"%s\": %s",
+		    zcp->zedlet_dir, strerror(errno));
+		zed_strings_destroy(zedlets);
+		errno = errno_bak;
+		return (-1);
+	}
+	while ((direntp = readdir(dirp))) {
+		/* Skip dotfiles (including "." and ".."). */
+		if (direntp->d_name[0] == '.')
+			continue;
+
+		/*
+		 * The signed/unsigned compare below is safe only because
+		 * (n < 0) is tested first.
+		 * NOTE(review): the "Failed to stat" wording is misleading
+		 * here -- the name was too long to even build the path.
+		 */
+		n = snprintf(pathname, sizeof (pathname),
+		    "%s/%s", zcp->zedlet_dir, direntp->d_name);
+		if ((n < 0) || (n >= sizeof (pathname))) {
+			zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s",
+			    direntp->d_name, strerror(ENAMETOOLONG));
+			continue;
+		}
+		if (stat(pathname, &st) < 0) {
+			zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s",
+			    pathname, strerror(errno));
+			continue;
+		}
+		if (!S_ISREG(st.st_mode)) {
+			zed_log_msg(LOG_INFO,
+			    "Ignoring \"%s\": not a regular file",
+			    direntp->d_name);
+			continue;
+		}
+		/* Ownership/permission checks may be bypassed with -f. */
+		if ((st.st_uid != 0) && !zcp->do_force) {
+			zed_log_msg(LOG_NOTICE,
+			    "Ignoring \"%s\": not owned by root",
+			    direntp->d_name);
+			continue;
+		}
+		if (!(st.st_mode & S_IXUSR)) {
+			zed_log_msg(LOG_INFO,
+			    "Ignoring \"%s\": not executable by user",
+			    direntp->d_name);
+			continue;
+		}
+		if ((st.st_mode & S_IWGRP) && !zcp->do_force) {
+			zed_log_msg(LOG_NOTICE,
+			    "Ignoring \"%s\": writable by group",
+			    direntp->d_name);
+			continue;
+		}
+		if ((st.st_mode & S_IWOTH) && !zcp->do_force) {
+			zed_log_msg(LOG_NOTICE,
+			    "Ignoring \"%s\": writable by other",
+			    direntp->d_name);
+			continue;
+		}
+		if (zed_strings_add(zedlets, NULL, direntp->d_name) < 0) {
+			zed_log_msg(LOG_WARNING,
+			    "Failed to register \"%s\": %s",
+			    direntp->d_name, strerror(errno));
+			continue;
+		}
+		if (zcp->do_verbose)
+			zed_log_msg(LOG_INFO,
+			    "Registered zedlet \"%s\"", direntp->d_name);
+	}
+	if (closedir(dirp) < 0) {
+		int errno_bak = errno;
+		zed_log_msg(LOG_WARNING, "Failed to close dir \"%s\": %s",
+		    zcp->zedlet_dir, strerror(errno));
+		zed_strings_destroy(zedlets);
+		errno = errno_bak;
+		return (-1);
+	}
+	/* Atomically swap in the new set, dropping the previous scan. */
+	if (zcp->zedlets)
+		zed_strings_destroy(zcp->zedlets);
+
+	zcp->zedlets = zedlets;
+	return (0);
+}
+
+/*
+ * Write the PID file specified in [zcp].
+ * Return 0 on success, -1 on error.
+ *
+ * This must be called after fork()ing to become a daemon (so the correct PID
+ * is recorded), but before daemonization is complete and the parent process
+ * exits (for synchronization with systemd).
+ */
+int
+zed_conf_write_pid(struct zed_conf *zcp)
+{
+	const mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
+	const mode_t filemode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+	char buf[PATH_MAX];
+	int n;
+	char *p;
+	mode_t mask;
+	int rv;
+
+	if (!zcp || !zcp->pid_file) {
+		errno = EINVAL;
+		zed_log_msg(LOG_ERR, "Failed to create PID file: %s",
+		    strerror(errno));
+		return (-1);
+	}
+	assert(zcp->pid_fd == -1);
+	/*
+	 * Create PID file directory if needed.
+	 * (Truncate the pathname at its final '/' to get the parent dir.)
+	 */
+	n = strlcpy(buf, zcp->pid_file, sizeof (buf));
+	if (n >= sizeof (buf)) {
+		errno = ENAMETOOLONG;
+		zed_log_msg(LOG_ERR, "Failed to create PID file: %s",
+		    strerror(errno));
+		goto err;
+	}
+	p = strrchr(buf, '/');
+	if (p)
+		*p = '\0';
+
+	if ((mkdirp(buf, dirmode) < 0) && (errno != EEXIST)) {
+		zed_log_msg(LOG_ERR, "Failed to create directory \"%s\": %s",
+		    buf, strerror(errno));
+		goto err;
+	}
+	/*
+	 * Obtain PID file lock.
+	 * The umask is temporarily widened to at least 022 so the PID file
+	 * is never created group- or other-writable, then restored.
+	 */
+	mask = umask(0);
+	umask(mask | 022);
+	zcp->pid_fd = open(zcp->pid_file, (O_RDWR | O_CREAT), filemode);
+	umask(mask);
+	if (zcp->pid_fd < 0) {
+		zed_log_msg(LOG_ERR, "Failed to open PID file \"%s\": %s",
+		    zcp->pid_file, strerror(errno));
+		goto err;
+	}
+	/* rv > 0 means another process already holds the lock. */
+	rv = zed_file_lock(zcp->pid_fd);
+	if (rv < 0) {
+		zed_log_msg(LOG_ERR, "Failed to lock PID file \"%s\": %s",
+		    zcp->pid_file, strerror(errno));
+		goto err;
+	} else if (rv > 0) {
+		pid_t pid = zed_file_is_locked(zcp->pid_fd);
+		if (pid < 0) {
+			zed_log_msg(LOG_ERR,
+			    "Failed to test lock on PID file \"%s\"",
+			    zcp->pid_file);
+		} else if (pid > 0) {
+			zed_log_msg(LOG_ERR,
+			    "Found PID %d bound to PID file \"%s\"",
+			    pid, zcp->pid_file);
+		} else {
+			zed_log_msg(LOG_ERR,
+			    "Inconsistent lock state on PID file \"%s\"",
+			    zcp->pid_file);
+		}
+		goto err;
+	}
+	/*
+	 * Write PID file.
+	 * fdatasync() ensures the PID is on disk before the parent exits
+	 * (systemd synchronization; see the function header comment).
+	 */
+	n = snprintf(buf, sizeof (buf), "%d\n", (int)getpid());
+	if ((n < 0) || (n >= sizeof (buf))) {
+		errno = ERANGE;
+		zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s",
+		    zcp->pid_file, strerror(errno));
+	} else if (zed_file_write_n(zcp->pid_fd, buf, n) != n) {
+		zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s",
+		    zcp->pid_file, strerror(errno));
+	} else if (fdatasync(zcp->pid_fd) < 0) {
+		zed_log_msg(LOG_ERR, "Failed to sync PID file \"%s\": %s",
+		    zcp->pid_file, strerror(errno));
+	} else {
+		return (0);
+	}
+
+err:
+	if (zcp->pid_fd >= 0) {
+		(void) close(zcp->pid_fd);
+		zcp->pid_fd = -1;
+	}
+	return (-1);
+}
+
+/*
+ * Open and lock the [zcp] state_file.
+ * Return 0 on success, -1 on error.
+ *
+ * FIXME: Move state information into kernel.
+ */
+int
+zed_conf_open_state(struct zed_conf *zcp)
+{
+	char dirbuf[PATH_MAX];
+	mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
+	int n;
+	char *p;
+	int rv;
+
+	if (!zcp || !zcp->state_file) {
+		errno = EINVAL;
+		zed_log_msg(LOG_ERR, "Failed to open state file: %s",
+		    strerror(errno));
+		return (-1);
+	}
+	/* Derive the parent directory by truncating at the final '/'. */
+	n = strlcpy(dirbuf, zcp->state_file, sizeof (dirbuf));
+	if (n >= sizeof (dirbuf)) {
+		errno = ENAMETOOLONG;
+		zed_log_msg(LOG_WARNING, "Failed to open state file: %s",
+		    strerror(errno));
+		return (-1);
+	}
+	p = strrchr(dirbuf, '/');
+	if (p)
+		*p = '\0';
+
+	if ((mkdirp(dirbuf, dirmode) < 0) && (errno != EEXIST)) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to create directory \"%s\": %s",
+		    dirbuf, strerror(errno));
+		return (-1);
+	}
+	/* Re-entrant: close (and unlock) any previously-opened state fd. */
+	if (zcp->state_fd >= 0) {
+		if (close(zcp->state_fd) < 0) {
+			zed_log_msg(LOG_WARNING,
+			    "Failed to close state file \"%s\": %s",
+			    zcp->state_file, strerror(errno));
+			return (-1);
+		}
+	}
+	/* "-Z" requests a fresh state file; O_CREAT below recreates it. */
+	if (zcp->do_zero)
+		(void) unlink(zcp->state_file);
+
+	zcp->state_fd = open(zcp->state_file,
+	    (O_RDWR | O_CREAT), (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH));
+	if (zcp->state_fd < 0) {
+		zed_log_msg(LOG_WARNING, "Failed to open state file \"%s\": %s",
+		    zcp->state_file, strerror(errno));
+		return (-1);
+	}
+	/* rv > 0 means another process already holds the lock. */
+	rv = zed_file_lock(zcp->state_fd);
+	if (rv < 0) {
+		zed_log_msg(LOG_WARNING, "Failed to lock state file \"%s\": %s",
+		    zcp->state_file, strerror(errno));
+		return (-1);
+	}
+	if (rv > 0) {
+		pid_t pid = zed_file_is_locked(zcp->state_fd);
+		if (pid < 0) {
+			zed_log_msg(LOG_WARNING,
+			    "Failed to test lock on state file \"%s\"",
+			    zcp->state_file);
+		} else if (pid > 0) {
+			zed_log_msg(LOG_WARNING,
+			    "Found PID %d bound to state file \"%s\"",
+			    pid, zcp->state_file);
+		} else {
+			zed_log_msg(LOG_WARNING,
+			    "Inconsistent lock state on state file \"%s\"",
+			    zcp->state_file);
+		}
+		return (-1);
+	}
+	return (0);
+}
+
+/*
+ * Read the opened [zcp] state_file to obtain the eid & etime of the last event
+ * processed. Write the state from the last event to the [eidp] & [etime] args
+ * passed by reference. Note that etime[] is an array of size 2.
+ * Return 0 on success, -1 on error.
+ */
+int
+zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[])
+{
+	ssize_t len;
+	struct iovec iov[3];
+	ssize_t n;
+
+	if (!zcp || !eidp || !etime) {
+		errno = EINVAL;
+		zed_log_msg(LOG_ERR,
+		    "Failed to read state file: %s", strerror(errno));
+		return (-1);
+	}
+	if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t)-1) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to reposition state file offset: %s",
+		    strerror(errno));
+		return (-1);
+	}
+	/* Fixed on-disk layout: eid (u64) followed by etime[0..1] (i64). */
+	len = 0;
+	iov[0].iov_base = eidp;
+	len += iov[0].iov_len = sizeof (*eidp);
+	iov[1].iov_base = &etime[0];
+	len += iov[1].iov_len = sizeof (etime[0]);
+	iov[2].iov_base = &etime[1];
+	len += iov[2].iov_len = sizeof (etime[1]);
+
+	n = readv(zcp->state_fd, iov, 3);
+	if (n == 0) {
+		/* Empty (freshly-created) state file: no last event. */
+		*eidp = 0;
+	} else if (n < 0) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to read state file \"%s\": %s",
+		    zcp->state_file, strerror(errno));
+		return (-1);
+	} else if (n != len) {
+		/*
+		 * NOTE(review): n and len are ssize_t, so "%d" is the wrong
+		 * conversion on LP64 platforms ("%zd" would be correct).
+		 */
+		errno = EIO;
+		zed_log_msg(LOG_WARNING,
+		    "Failed to read state file \"%s\": Read %d of %d bytes",
+		    zcp->state_file, n, len);
+		return (-1);
+	}
+	return (0);
+}
+
+/*
+ * Write the [eid] & [etime] of the last processed event to the opened
+ * [zcp] state_file. Note that etime[] is an array of size 2.
+ * Return 0 on success, -1 on error.
+ */
+int
+zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[])
+{
+	ssize_t len;
+	struct iovec iov[3];
+	ssize_t n;
+
+	if (!zcp) {
+		errno = EINVAL;
+		zed_log_msg(LOG_ERR,
+		    "Failed to write state file: %s", strerror(errno));
+		return (-1);
+	}
+	if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t)-1) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to reposition state file offset: %s",
+		    strerror(errno));
+		return (-1);
+	}
+	/* Layout must mirror zed_conf_read_state(): eid, etime[0], etime[1]. */
+	len = 0;
+	iov[0].iov_base = &eid;
+	len += iov[0].iov_len = sizeof (eid);
+	iov[1].iov_base = &etime[0];
+	len += iov[1].iov_len = sizeof (etime[0]);
+	iov[2].iov_base = &etime[1];
+	len += iov[2].iov_len = sizeof (etime[1]);
+
+	n = writev(zcp->state_fd, iov, 3);
+	if (n < 0) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to write state file \"%s\": %s",
+		    zcp->state_file, strerror(errno));
+		return (-1);
+	}
+	if (n != len) {
+		/*
+		 * Short write treated as an error (not retried).
+		 * NOTE(review): "%d" is the wrong conversion for the
+		 * ssize_t args n/len on LP64 ("%zd" would be correct).
+		 */
+		errno = EIO;
+		zed_log_msg(LOG_WARNING,
+		    "Failed to write state file \"%s\": Wrote %d of %d bytes",
+		    zcp->state_file, n, len);
+		return (-1);
+	}
+	/* Persist before returning so a crash cannot lose the event mark. */
+	if (fdatasync(zcp->state_fd) < 0) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to sync state file \"%s\": %s",
+		    zcp->state_file, strerror(errno));
+		return (-1);
+	}
+	return (0);
+}
diff --git a/cmd/zed/zed_conf.h b/cmd/zed/zed_conf.h
new file mode 100644
index 000000000000..424cb2c01c8c
--- /dev/null
+++ b/cmd/zed/zed_conf.h
@@ -0,0 +1,62 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef ZED_CONF_H
+#define ZED_CONF_H
+
+#include <libzfs.h>
+#include <stdint.h>
+#include "zed_strings.h"
+
+struct zed_conf {
+ unsigned do_force:1; /* true if force enabled */
+ unsigned do_foreground:1; /* true if run in foreground */
+ unsigned do_memlock:1; /* true if locking memory */
+ unsigned do_verbose:1; /* true if verbosity enabled */
+ unsigned do_zero:1; /* true if zeroing state */
+ unsigned do_idle:1; /* true if idle enabled */
+ int syslog_facility; /* syslog facility value */
+ int min_events; /* RESERVED FOR FUTURE USE */
+ int max_events; /* RESERVED FOR FUTURE USE */
+ char *conf_file; /* abs path to config file */
+ char *pid_file; /* abs path to pid file */
+ int pid_fd; /* fd to pid file for lock */
+ char *zedlet_dir; /* abs path to zedlet dir */
+ zed_strings_t *zedlets; /* names of enabled zedlets */
+ char *state_file; /* abs path to state file */
+ int state_fd; /* fd to state file */
+ libzfs_handle_t *zfs_hdl; /* handle to libzfs */
+ int zevent_fd; /* fd for access to zevents */
+ char *path; /* custom $PATH for zedlets to use */
+};
+
+struct zed_conf *zed_conf_create(void);
+
+void zed_conf_destroy(struct zed_conf *zcp);
+
+void zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv);
+
+void zed_conf_parse_file(struct zed_conf *zcp);
+
+int zed_conf_scan_dir(struct zed_conf *zcp);
+
+int zed_conf_write_pid(struct zed_conf *zcp);
+
+int zed_conf_open_state(struct zed_conf *zcp);
+
+int zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[]);
+
+int zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[]);
+
+#endif /* !ZED_CONF_H */
diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c
new file mode 100644
index 000000000000..174d24523253
--- /dev/null
+++ b/cmd/zed/zed_disk_event.c
@@ -0,0 +1,416 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2017, Intel Corporation.
+ */
+
+#ifdef HAVE_LIBUDEV
+
+#include <errno.h>
+#include <fcntl.h>
+#include <libnvpair.h>
+#include <libudev.h>
+#include <libzfs.h>
+#include <libzutil.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/dev.h>
+
+#include "zed_log.h"
+#include "zed_disk_event.h"
+#include "agents/zfs_agents.h"
+
+/*
+ * Portions of ZED need to see disk events for disks belonging to ZFS pools.
+ * A libudev monitor is established to monitor block device actions and pass
+ * them on to internal ZED logic modules. Initially, zfs_mod.c is the only
+ * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM
+ * module responsible for handling disk events for ZFS.
+ */
+
+pthread_t g_mon_tid;
+struct udev *g_udev;
+struct udev_monitor *g_mon;
+
+
+#define DEV_BYID_PATH "/dev/disk/by-id/"
+
+/* 64MB is minimum usable disk for ZFS */
+#define MINIMUM_SECTORS 131072
+
+
+/*
+ * Post disk event to SLM module
+ *
+ * occurs in the context of monitor thread
+ */
+static void
+zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl)
+{
+	char *strval;
+	uint64_t numval;
+
+	/* Log whichever of the optional nvpairs are present, then forward. */
+	zed_log_msg(LOG_INFO, "zed_disk_event:");
+	zed_log_msg(LOG_INFO, "\tclass: %s", class);
+	zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass);
+	if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0)
+		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval);
+	if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0)
+		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval);
+	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0)
+		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval);
+	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0)
+		zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval);
+	if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0)
+		zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval);
+	if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0)
+		zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval);
+	if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0)
+		zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval);
+
+	/* Hand off to the agent framework (zfs_mod.c et al.). */
+	(void) zfs_agent_post_event(class, subclass, nvl);
+}
+
+/*
+ * dev_event_nvlist: place event schema into an nv pair list
+ *
+ * NAME VALUE (example)
+ * -------------- --------------------------------------------------------
+ * DEV_NAME /dev/sdl
+ * DEV_PATH /devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/...
+ * DEV_IDENTIFIER ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC
+ * DEV_PHYS_PATH pci-0000:04:00.0-sas-0x4433221101000000-lun-0
+ * DEV_IS_PART ---
+ * DEV_SIZE 500107862016
+ * ZFS_EV_POOL_GUID 17523635698032189180
+ * ZFS_EV_VDEV_GUID 14663607734290803088
+ */
+static nvlist_t *
+dev_event_nvlist(struct udev_device *dev)
+{
+	nvlist_t *nvl;
+	char strval[128];
+	const char *value, *path;
+	uint64_t guid;
+
+	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+		return (NULL);
+
+	/* All adds are best-effort; missing attributes are simply omitted. */
+	if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0)
+		(void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval);
+	if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0)
+		(void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval);
+	if ((path = udev_device_get_devnode(dev)) != NULL)
+		(void) nvlist_add_string(nvl, DEV_NAME, path);
+	if ((value = udev_device_get_devpath(dev)) != NULL)
+		(void) nvlist_add_string(nvl, DEV_PATH, value);
+	value = udev_device_get_devtype(dev);
+	if ((value != NULL && strcmp("partition", value) == 0) ||
+	    (udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER")
+	    != NULL)) {
+		(void) nvlist_add_boolean(nvl, DEV_IS_PART);
+	}
+	/* sysfs "size" is in 512-byte sectors; convert to bytes. */
+	if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) {
+		uint64_t numval = DEV_BSIZE;
+
+		numval *= strtoull(value, NULL, 10);
+		(void) nvlist_add_uint64(nvl, DEV_SIZE, numval);
+	}
+
+	/*
+	 * Grab the pool and vdev guids from blkid cache
+	 * (exported as decimal strings; 0 / unparsable values are skipped).
+	 */
+	value = udev_device_get_property_value(dev, "ID_FS_UUID");
+	if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
+		(void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid);
+
+	value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB");
+	if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
+		(void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid);
+
+	/*
+	 * Either a vdev guid or a devid must be present for matching
+	 */
+	if (!nvlist_exists(nvl, DEV_IDENTIFIER) &&
+	    !nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) {
+		nvlist_free(nvl);
+		return (NULL);
+	}
+
+	return (nvl);
+}
+
+/*
+ * Listen for block device uevents
+ */
+static void *
+zed_udev_monitor(void *arg)
+{
+	struct udev_monitor *mon = arg;
+	char *tmp, *tmp2;
+
+	zed_log_msg(LOG_INFO, "Waiting for new udev disk events...");
+
+	/* Runs until zed_disk_event_fini() cancels this thread. */
+	while (1) {
+		struct udev_device *dev;
+		const char *action, *type, *part, *sectors;
+		const char *bus, *uuid;
+		const char *class, *subclass;
+		nvlist_t *nvl;
+		boolean_t is_zfs = B_FALSE;
+
+		/* allow a cancellation while blocked (recvmsg) */
+		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
+
+		/* blocks at recvmsg until an event occurs */
+		if ((dev = udev_monitor_receive_device(mon)) == NULL) {
+			zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive "
+			    "device error %d", errno);
+			continue;
+		}
+
+		/* allow all steps to complete before a cancellation */
+		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
+
+		/*
+		 * Strongly typed device is the preferred filter
+		 */
+		type = udev_device_get_property_value(dev, "ID_FS_TYPE");
+		if (type != NULL && type[0] != '\0') {
+			if (strcmp(type, "zfs_member") == 0) {
+				is_zfs = B_TRUE;
+			} else {
+				/* not ours, so skip */
+				zed_log_msg(LOG_INFO, "zed_udev_monitor: skip "
+				    "%s (in use by %s)",
+				    udev_device_get_devnode(dev), type);
+				udev_device_unref(dev);
+				continue;
+			}
+		}
+
+		/*
+		 * if this is a disk and it is partitioned, then the
+		 * zfs label will reside in a DEVTYPE=partition and
+		 * we can skip passing this event
+		 */
+		type = udev_device_get_property_value(dev, "DEVTYPE");
+		part = udev_device_get_property_value(dev,
+		    "ID_PART_TABLE_TYPE");
+		if (type != NULL && type[0] != '\0' &&
+		    strcmp(type, "disk") == 0 &&
+		    part != NULL && part[0] != '\0') {
+			/* skip and wait for partition event */
+			udev_device_unref(dev);
+			continue;
+		}
+
+		/*
+		 * ignore small partitions
+		 * (less than MINIMUM_SECTORS, i.e. under 64MB)
+		 */
+		sectors = udev_device_get_property_value(dev,
+		    "ID_PART_ENTRY_SIZE");
+		if (sectors == NULL)
+			sectors = udev_device_get_sysattr_value(dev, "size");
+		if (sectors != NULL &&
+		    strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) {
+			udev_device_unref(dev);
+			continue;
+		}
+
+		/*
+		 * If the blkid probe didn't find ZFS, then a persistent
+		 * device id string is required in the message schema
+		 * for matching with vdevs. Preflight here for expected
+		 * udev information.
+		 */
+		bus = udev_device_get_property_value(dev, "ID_BUS");
+		uuid = udev_device_get_property_value(dev, "DM_UUID");
+		if (!is_zfs && (bus == NULL && uuid == NULL)) {
+			zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid "
+			    "source", udev_device_get_devnode(dev));
+			udev_device_unref(dev);
+			continue;
+		}
+
+		/*
+		 * NOTE(review): udev_device_get_action() can return NULL;
+		 * strcmp() would then crash -- TODO confirm libudev
+		 * guarantees a non-NULL action for monitored events.
+		 */
+		action = udev_device_get_action(dev);
+		if (strcmp(action, "add") == 0) {
+			class = EC_DEV_ADD;
+			subclass = ESC_DISK;
+		} else if (strcmp(action, "remove") == 0) {
+			class = EC_DEV_REMOVE;
+			subclass = ESC_DISK;
+		} else if (strcmp(action, "change") == 0) {
+			class = EC_DEV_STATUS;
+			subclass = ESC_DEV_DLE;
+		} else {
+			zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown",
+			    action);
+			udev_device_unref(dev);
+			continue;
+		}
+
+		/*
+		 * Special case an EC_DEV_ADD for multipath devices
+		 *
+		 * When a multipath device is created, udev reports the
+		 * following:
+		 *
+		 * 1.	"add" event of the dm device for the multipath device
+		 *	(like /dev/dm-3).
+		 * 2.	"change" event to create the actual multipath device
+		 *	symlink (like /dev/mapper/mpatha).  The event also
+		 *	passes back the relevant DM vars we care about, like
+		 *	DM_UUID.
+		 * 3.	Another "change" event identical to #2 (that we ignore).
+		 *
+		 * To get the behavior we want, we treat the "change" event
+		 * in #2 as a "add" event; as if "/dev/mapper/mpatha" was
+		 * a new disk being added.
+		 */
+		if (strcmp(class, EC_DEV_STATUS) == 0 &&
+		    udev_device_get_property_value(dev, "DM_UUID") &&
+		    udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) {
+			tmp = (char *)udev_device_get_devnode(dev);
+			/* tmp2 is allocated by libzfs; freed below */
+			tmp2 = zfs_get_underlying_path(tmp);
+			if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) {
+				/*
+				 * We have a real underlying device, which
+				 * means that this multipath "change" event is
+				 * an "add" event.
+				 *
+				 * If the multipath device and the underlying
+				 * dev are the same name (i.e. /dev/dm-5), then
+				 * there is no real underlying disk for this
+				 * multipath device, and so this "change" event
+				 * really is a multipath removal.
+				 */
+				class = EC_DEV_ADD;
+				subclass = ESC_DISK;
+			} else {
+				tmp = (char *)
+				    udev_device_get_property_value(dev,
+				    "DM_NR_VALID_PATHS");
+				/* treat as a multipath remove */
+				if (tmp != NULL && strcmp(tmp, "0") == 0) {
+					class = EC_DEV_REMOVE;
+					subclass = ESC_DISK;
+				}
+			}
+			free(tmp2);
+		}
+
+		/*
+		 * Special case an EC_DEV_ADD for scsi_debug devices
+		 *
+		 * These devices require a udevadm trigger command after
+		 * creation in order to register the vdev_id scsidebug alias
+		 * rule (adds a persistent path (phys_path) used for fault
+		 * management automated tests in the ZFS test suite.
+		 *
+		 * After udevadm trigger command, event registers as a "change"
+		 * event but needs to instead be handled as another "add" event
+		 * to allow for disk labeling and partitioning to occur.
+		 */
+		if (strcmp(class, EC_DEV_STATUS) == 0 &&
+		    udev_device_get_property_value(dev, "ID_VDEV") &&
+		    udev_device_get_property_value(dev, "ID_MODEL")) {
+			const char *id_model, *id_model_sd = "scsi_debug";
+
+			id_model = udev_device_get_property_value(dev,
+			    "ID_MODEL");
+			if (strcmp(id_model, id_model_sd) == 0) {
+				class = EC_DEV_ADD;
+				subclass = ESC_DISK;
+			}
+		}
+
+		/* NULL nvl means no devid/vdev-guid to match on; drop event */
+		if ((nvl = dev_event_nvlist(dev)) != NULL) {
+			zed_udev_event(class, subclass, nvl);
+			nvlist_free(nvl);
+		}
+
+		udev_device_unref(dev);
+	}
+
+	return (NULL);
+}
+
+int
+zed_disk_event_init()
+{
+	int fd, fflags;
+
+	if ((g_udev = udev_new()) == NULL) {
+		zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno);
+		return (-1);
+	}
+
+	/*
+	 * Set up a udev monitor for block devices.
+	 * NOTE(review): udev_monitor_new_from_netlink() can return NULL,
+	 * which is not checked before the filter/enable calls below.
+	 */
+	g_mon = udev_monitor_new_from_netlink(g_udev, "udev");
+	udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk");
+	udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block",
+	    "partition");
+	udev_monitor_enable_receiving(g_mon);
+
+	/* Make sure monitoring socket is blocking */
+	fd = udev_monitor_get_fd(g_mon);
+	if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK)
+		(void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK);
+
+	/* spawn a thread to monitor events */
+	if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) {
+		udev_monitor_unref(g_mon);
+		udev_unref(g_udev);
+		zed_log_msg(LOG_WARNING, "pthread_create failed");
+		return (-1);
+	}
+
+	zed_log_msg(LOG_INFO, "zed_disk_event_init");
+
+	return (0);
+}
+
+void
+zed_disk_event_fini()
+{
+	/* cancel monitor thread at recvmsg() */
+	(void) pthread_cancel(g_mon_tid);
+	(void) pthread_join(g_mon_tid, NULL);
+
+	/* cleanup udev resources */
+	udev_monitor_unref(g_mon);
+	udev_unref(g_udev);
+
+	zed_log_msg(LOG_INFO, "zed_disk_event_fini");
+}
+
+#else
+
+#include "zed_disk_event.h"
+
+/* No-op stubs used when ZED is built without libudev support. */
+int
+zed_disk_event_init()
+{
+	return (0);
+}
+
+void
+zed_disk_event_fini()
+{
+}
+
+#endif /* HAVE_LIBUDEV */
diff --git a/cmd/zed/zed_disk_event.h b/cmd/zed/zed_disk_event.h
new file mode 100644
index 000000000000..ea9813d0a595
--- /dev/null
+++ b/cmd/zed/zed_disk_event.h
@@ -0,0 +1,31 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#ifndef ZED_DISK_EVENT_H
+#define ZED_DISK_EVENT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int zed_disk_event_init(void);
+extern void zed_disk_event_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !ZED_DISK_EVENT_H */
diff --git a/cmd/zed/zed_event.c b/cmd/zed/zed_event.c
new file mode 100644
index 000000000000..1c5d00e297ff
--- /dev/null
+++ b/cmd/zed/zed_event.c
@@ -0,0 +1,965 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libzfs.h> /* FIXME: Replace with libzfs_core. */
+#include <paths.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/zfs_ioctl.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/fm/fs/zfs.h>
+#include "zed.h"
+#include "zed_conf.h"
+#include "zed_disk_event.h"
+#include "zed_event.h"
+#include "zed_exec.h"
+#include "zed_file.h"
+#include "zed_log.h"
+#include "zed_strings.h"
+
+#include "agents/zfs_agents.h"
+
+#define MAXBUF 4096
+
/*
 * Open the libzfs interface and the zevent device, initialize the
 * internal agent modules, and begin disk-event monitoring.
 *
 * Returns 0 on success.  On failure, returns -1 when [zcp]->do_idle is
 * set (so the daemon can idle and retry); otherwise the process
 * terminates via zed_log_die().
 */
int
zed_event_init(struct zed_conf *zcp)
{
	if (!zcp)
		zed_log_die("Failed zed_event_init: %s", strerror(EINVAL));

	zcp->zfs_hdl = libzfs_init();
	if (!zcp->zfs_hdl) {
		if (zcp->do_idle)
			return (-1);
		zed_log_die("Failed to initialize libzfs");
	}

	/* fd is later used to track the zevent cursor (see zed_exec.c) */
	zcp->zevent_fd = open(ZFS_DEV, O_RDWR);
	if (zcp->zevent_fd < 0) {
		if (zcp->do_idle)
			return (-1);
		zed_log_die("Failed to open \"%s\": %s",
		    ZFS_DEV, strerror(errno));
	}

	zfs_agent_init(zcp->zfs_hdl);

	if (zed_disk_event_init() != 0) {
		if (zcp->do_idle)
			return (-1);
		zed_log_die("Failed to initialize disk events");
	}

	return (0);
}
+
/*
 * Close the libzfs interface.
 *
 * Tears down in the reverse order of zed_event_init(): disk events,
 * agents, the zevent fd, and finally the libzfs handle.  Partially
 * initialized state is tolerated (fd and handle are guarded), so this
 * is safe to call after a failed init.
 */
void
zed_event_fini(struct zed_conf *zcp)
{
	if (!zcp)
		zed_log_die("Failed zed_event_fini: %s", strerror(EINVAL));

	zed_disk_event_fini();
	zfs_agent_fini();

	if (zcp->zevent_fd >= 0) {
		if (close(zcp->zevent_fd) < 0)
			zed_log_msg(LOG_WARNING, "Failed to close \"%s\": %s",
			    ZFS_DEV, strerror(errno));

		zcp->zevent_fd = -1;
	}
	if (zcp->zfs_hdl) {
		libzfs_fini(zcp->zfs_hdl);
		zcp->zfs_hdl = NULL;
	}
}
+
+/*
+ * Seek to the event specified by [saved_eid] and [saved_etime].
+ * This protects against processing a given event more than once.
+ * Return 0 upon a successful seek to the specified event, or -1 otherwise.
+ *
+ * A zevent is considered to be uniquely specified by its (eid,time) tuple.
+ * The unsigned 64b eid is set to 1 when the kernel module is loaded, and
+ * incremented by 1 for each new event. Since the state file can persist
+ * across a kernel module reload, the time must be checked to ensure a match.
+ */
+int
+zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid, int64_t saved_etime[])
+{
+ uint64_t eid;
+ int found;
+ nvlist_t *nvl;
+ int n_dropped;
+ int64_t *etime;
+ uint_t nelem;
+ int rv;
+
+ if (!zcp) {
+ errno = EINVAL;
+ zed_log_msg(LOG_ERR, "Failed to seek zevent: %s",
+ strerror(errno));
+ return (-1);
+ }
+ eid = 0;
+ found = 0;
+ while ((eid < saved_eid) && !found) {
+ rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped,
+ ZEVENT_NONBLOCK, zcp->zevent_fd);
+
+ if ((rv != 0) || !nvl)
+ break;
+
+ if (n_dropped > 0) {
+ zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped);
+ /*
+ * FIXME: Increase max size of event nvlist in
+ * /sys/module/zfs/parameters/zfs_zevent_len_max ?
+ */
+ }
+ if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) {
+ zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid");
+ } else if (nvlist_lookup_int64_array(nvl, "time",
+ &etime, &nelem) != 0) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to lookup zevent time (eid=%llu)", eid);
+ } else if (nelem != 2) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to lookup zevent time (eid=%llu, nelem=%u)",
+ eid, nelem);
+ } else if ((eid != saved_eid) ||
+ (etime[0] != saved_etime[0]) ||
+ (etime[1] != saved_etime[1])) {
+ /* no-op */
+ } else {
+ found = 1;
+ }
+ free(nvl);
+ }
+ if (!found && (saved_eid > 0)) {
+ if (zpool_events_seek(zcp->zfs_hdl, ZEVENT_SEEK_START,
+ zcp->zevent_fd) < 0)
+ zed_log_msg(LOG_WARNING, "Failed to seek to eid=0");
+ else
+ eid = 0;
+ }
+ zed_log_msg(LOG_NOTICE, "Processing events since eid=%llu", eid);
+ return (found ? 0 : -1);
+}
+
/*
 * Return non-zero if nvpair [name] should be formatted in hex; o/w, return 0.
 *
 * A name qualifies when it ends with one of the suffixes below.  The
 * previous strstr()-based check only examined the FIRST occurrence of a
 * suffix, so a name containing the suffix more than once (e.g.
 * "x_guid_list_guid") was not recognized; compare against the tail of
 * the string instead.
 */
static int
_zed_event_value_is_hex(const char *name)
{
	const char *hex_suffix[] = {
		"_guid",
		"_guids",
		NULL
	};
	const char **pp;
	size_t name_len, suffix_len;

	if (!name)
		return (0);

	name_len = strlen(name);
	for (pp = hex_suffix; *pp; pp++) {
		suffix_len = strlen(*pp);
		if ((name_len >= suffix_len) &&
		    (strcmp(name + name_len - suffix_len, *pp) == 0))
			return (1);
	}
	return (0);
}
+
/*
 * Add an environment variable for [eid] to the container [zsp].
 *
 * The variable name is the concatenation of [prefix] and [name] converted to
 * uppercase with non-alphanumeric characters converted to underscores;
 * [prefix] is optional, and [name] must begin with an alphabetic character.
 * If the converted variable name already exists within the container [zsp],
 * its existing value will be replaced with the new value.
 *
 * The variable value is specified by the format string [fmt].
 *
 * Returns 0 on success, and -1 on error (with errno set).
 *
 * All environment variables in [zsp] should be added through this function.
 */
static int
_zed_event_add_var(uint64_t eid, zed_strings_t *zsp,
    const char *prefix, const char *name, const char *fmt, ...)
{
	char keybuf[MAXBUF];
	char valbuf[MAXBUF];
	char *dstp;
	const char *srcp;
	const char *lastp;
	int n;
	int buflen;
	va_list vargs;

	assert(zsp != NULL);
	assert(fmt != NULL);

	if (!name) {
		errno = EINVAL;
		zed_log_msg(LOG_WARNING,
		    "Failed to add variable for eid=%llu: Name is empty", eid);
		return (-1);
	} else if (!isalpha(name[0])) {
		errno = EINVAL;
		zed_log_msg(LOG_WARNING,
		    "Failed to add variable for eid=%llu: "
		    "Name \"%s\" is invalid", eid, name);
		return (-1);
	}
	/*
	 * Construct the string key by converting PREFIX (if present) and NAME.
	 *
	 * NOTE(review): isalnum()/toupper() are given plain chars; for
	 * non-ASCII bytes these should be cast to unsigned char -- confirm
	 * whether names can contain such bytes.
	 */
	dstp = keybuf;
	lastp = keybuf + sizeof (keybuf);	/* one past the end of keybuf */
	if (prefix) {
		for (srcp = prefix; *srcp && (dstp < lastp); srcp++)
			*dstp++ = isalnum(*srcp) ? toupper(*srcp) : '_';
	}
	for (srcp = name; *srcp && (dstp < lastp); srcp++)
		*dstp++ = isalnum(*srcp) ? toupper(*srcp) : '_';

	/* key consumed the entire buffer, leaving no room for '\0' */
	if (dstp == lastp) {
		errno = ENAMETOOLONG;
		zed_log_msg(LOG_WARNING,
		    "Failed to add variable for eid=%llu: Name too long", eid);
		return (-1);
	}
	*dstp = '\0';
	/*
	 * Construct the string specified by "[PREFIX][NAME]=[FMT]".
	 */
	dstp = valbuf;
	buflen = sizeof (valbuf);
	/*
	 * NOTE(review): strlcpy() returns size_t, stored here in an int and
	 * compared against an unsigned sizeof -- harmless at MAXBUF sizes,
	 * but worth tightening to size_t.
	 */
	n = strlcpy(dstp, keybuf, buflen);
	if (n >= sizeof (valbuf)) {
		errno = EMSGSIZE;
		zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s",
		    keybuf, eid, "Exceeded buffer size");
		return (-1);
	}
	dstp += n;
	buflen -= n;

	*dstp++ = '=';
	buflen--;

	if (buflen <= 0) {
		errno = EMSGSIZE;
		zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s",
		    keybuf, eid, "Exceeded buffer size");
		return (-1);
	}

	va_start(vargs, fmt);
	n = vsnprintf(dstp, buflen, fmt, vargs);
	va_end(vargs);

	if ((n < 0) || (n >= buflen)) {
		errno = EMSGSIZE;
		zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s",
		    keybuf, eid, "Exceeded buffer size");
		return (-1);
	} else if (zed_strings_add(zsp, keybuf, valbuf) < 0) {
		zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s",
		    keybuf, eid, strerror(errno));
		return (-1);
	}
	return (0);
}
+
+static int
+_zed_event_add_array_err(uint64_t eid, const char *name)
+{
+ errno = EMSGSIZE;
+ zed_log_msg(LOG_WARNING,
+ "Failed to convert nvpair \"%s\" for eid=%llu: "
+ "Exceeded buffer size", name, eid);
+ return (-1);
+}
+
+static int
+_zed_event_add_int8_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ int8_t *i8p;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT8_ARRAY));
+
+ name = nvpair_name(nvp);
+ (void) nvpair_value_int8_array(nvp, &i8p, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, "%d ", i8p[i]);
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+static int
+_zed_event_add_uint8_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ uint8_t *u8p;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT8_ARRAY));
+
+ name = nvpair_name(nvp);
+ (void) nvpair_value_uint8_array(nvp, &u8p, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, "%u ", u8p[i]);
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+static int
+_zed_event_add_int16_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ int16_t *i16p;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT16_ARRAY));
+
+ name = nvpair_name(nvp);
+ (void) nvpair_value_int16_array(nvp, &i16p, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, "%d ", i16p[i]);
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+static int
+_zed_event_add_uint16_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ uint16_t *u16p;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT16_ARRAY));
+
+ name = nvpair_name(nvp);
+ (void) nvpair_value_uint16_array(nvp, &u16p, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, "%u ", u16p[i]);
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+static int
+_zed_event_add_int32_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ int32_t *i32p;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT32_ARRAY));
+
+ name = nvpair_name(nvp);
+ (void) nvpair_value_int32_array(nvp, &i32p, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, "%d ", i32p[i]);
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+static int
+_zed_event_add_uint32_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ uint32_t *u32p;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY));
+
+ name = nvpair_name(nvp);
+ (void) nvpair_value_uint32_array(nvp, &u32p, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, "%u ", u32p[i]);
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+static int
+_zed_event_add_int64_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ int64_t *i64p;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT64_ARRAY));
+
+ name = nvpair_name(nvp);
+ (void) nvpair_value_int64_array(nvp, &i64p, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, "%lld ", (u_longlong_t)i64p[i]);
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+static int
+_zed_event_add_uint64_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ const char *fmt;
+ uint64_t *u64p;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT64_ARRAY));
+
+ name = nvpair_name(nvp);
+ fmt = _zed_event_value_is_hex(name) ? "0x%.16llX " : "%llu ";
+ (void) nvpair_value_uint64_array(nvp, &u64p, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, fmt, (u_longlong_t)u64p[i]);
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+static int
+_zed_event_add_string_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ char **strp;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_STRING_ARRAY));
+
+ name = nvpair_name(nvp);
+ (void) nvpair_value_string_array(nvp, &strp, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, "%s ", strp[i] ? strp[i] : "<NULL>");
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
/*
 * Convert the nvpair [nvp] to a NAME=VALUE environment string and add it
 * to the container [zsp] for the child process.
 *
 * (The historical comment claimed "Return 0 on success, -1 on error",
 * but this function returns nothing: conversion failures are logged by
 * the helpers and the pair is simply skipped.)
 *
 * FIXME: Refactor with cmd/zpool/zpool_main.c:zpool_do_events_nvprint()?
 */
static void
_zed_event_add_nvpair(uint64_t eid, zed_strings_t *zsp, nvpair_t *nvp)
{
	const char *name;
	data_type_t type;
	const char *prefix = ZEVENT_VAR_PREFIX;
	boolean_t b;
	double d;
	/*
	 * NOTE(review): the unsigned locals double as the signed lookup
	 * targets via casts below; a negative INT8/INT16 therefore prints
	 * as its unsigned promotion (e.g. -1 -> 255) -- confirm intended.
	 */
	uint8_t i8;
	uint16_t i16;
	uint32_t i32;
	uint64_t i64;
	char *str;

	assert(zsp != NULL);
	assert(nvp != NULL);

	name = nvpair_name(nvp);
	type = nvpair_type(nvp);

	switch (type) {
	case DATA_TYPE_BOOLEAN:
		_zed_event_add_var(eid, zsp, prefix, name, "%s", "1");
		break;
	case DATA_TYPE_BOOLEAN_VALUE:
		(void) nvpair_value_boolean_value(nvp, &b);
		_zed_event_add_var(eid, zsp, prefix, name, "%s", b ? "1" : "0");
		break;
	case DATA_TYPE_BYTE:
		(void) nvpair_value_byte(nvp, &i8);
		_zed_event_add_var(eid, zsp, prefix, name, "%d", i8);
		break;
	case DATA_TYPE_INT8:
		(void) nvpair_value_int8(nvp, (int8_t *)&i8);
		_zed_event_add_var(eid, zsp, prefix, name, "%d", i8);
		break;
	case DATA_TYPE_UINT8:
		(void) nvpair_value_uint8(nvp, &i8);
		_zed_event_add_var(eid, zsp, prefix, name, "%u", i8);
		break;
	case DATA_TYPE_INT16:
		(void) nvpair_value_int16(nvp, (int16_t *)&i16);
		_zed_event_add_var(eid, zsp, prefix, name, "%d", i16);
		break;
	case DATA_TYPE_UINT16:
		(void) nvpair_value_uint16(nvp, &i16);
		_zed_event_add_var(eid, zsp, prefix, name, "%u", i16);
		break;
	case DATA_TYPE_INT32:
		(void) nvpair_value_int32(nvp, (int32_t *)&i32);
		_zed_event_add_var(eid, zsp, prefix, name, "%d", i32);
		break;
	case DATA_TYPE_UINT32:
		(void) nvpair_value_uint32(nvp, &i32);
		_zed_event_add_var(eid, zsp, prefix, name, "%u", i32);
		break;
	case DATA_TYPE_INT64:
		(void) nvpair_value_int64(nvp, (int64_t *)&i64);
		_zed_event_add_var(eid, zsp, prefix, name,
		    "%lld", (longlong_t)i64);
		break;
	case DATA_TYPE_UINT64:
		(void) nvpair_value_uint64(nvp, &i64);
		_zed_event_add_var(eid, zsp, prefix, name,
		    (_zed_event_value_is_hex(name) ? "0x%.16llX" : "%llu"),
		    (u_longlong_t)i64);
		/*
		 * shadow readable strings for vdev state pairs
		 */
		if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 ||
		    strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) {
			char alt[32];

			(void) snprintf(alt, sizeof (alt), "%s_str", name);
			_zed_event_add_var(eid, zsp, prefix, alt, "%s",
			    zpool_state_to_name(i64, VDEV_AUX_NONE));
		} else
		/*
		 * shadow readable strings for pool state
		 */
		if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_POOL_STATE) == 0) {
			char alt[32];

			(void) snprintf(alt, sizeof (alt), "%s_str", name);
			_zed_event_add_var(eid, zsp, prefix, alt, "%s",
			    zpool_pool_state_to_name(i64));
		}
		break;
	case DATA_TYPE_DOUBLE:
		(void) nvpair_value_double(nvp, &d);
		_zed_event_add_var(eid, zsp, prefix, name, "%g", d);
		break;
	case DATA_TYPE_HRTIME:
		(void) nvpair_value_hrtime(nvp, (hrtime_t *)&i64);
		_zed_event_add_var(eid, zsp, prefix, name,
		    "%llu", (u_longlong_t)i64);
		break;
	case DATA_TYPE_NVLIST:
		_zed_event_add_var(eid, zsp, prefix, name,
		    "%s", "_NOT_IMPLEMENTED_"); /* FIXME */
		break;
	case DATA_TYPE_STRING:
		(void) nvpair_value_string(nvp, &str);
		_zed_event_add_var(eid, zsp, prefix, name,
		    "%s", (str ? str : "<NULL>"));
		break;
	case DATA_TYPE_BOOLEAN_ARRAY:
		_zed_event_add_var(eid, zsp, prefix, name,
		    "%s", "_NOT_IMPLEMENTED_"); /* FIXME */
		break;
	case DATA_TYPE_BYTE_ARRAY:
		_zed_event_add_var(eid, zsp, prefix, name,
		    "%s", "_NOT_IMPLEMENTED_"); /* FIXME */
		break;
	case DATA_TYPE_INT8_ARRAY:
		_zed_event_add_int8_array(eid, zsp, prefix, nvp);
		break;
	case DATA_TYPE_UINT8_ARRAY:
		_zed_event_add_uint8_array(eid, zsp, prefix, nvp);
		break;
	case DATA_TYPE_INT16_ARRAY:
		_zed_event_add_int16_array(eid, zsp, prefix, nvp);
		break;
	case DATA_TYPE_UINT16_ARRAY:
		_zed_event_add_uint16_array(eid, zsp, prefix, nvp);
		break;
	case DATA_TYPE_INT32_ARRAY:
		_zed_event_add_int32_array(eid, zsp, prefix, nvp);
		break;
	case DATA_TYPE_UINT32_ARRAY:
		_zed_event_add_uint32_array(eid, zsp, prefix, nvp);
		break;
	case DATA_TYPE_INT64_ARRAY:
		_zed_event_add_int64_array(eid, zsp, prefix, nvp);
		break;
	case DATA_TYPE_UINT64_ARRAY:
		_zed_event_add_uint64_array(eid, zsp, prefix, nvp);
		break;
	case DATA_TYPE_STRING_ARRAY:
		_zed_event_add_string_array(eid, zsp, prefix, nvp);
		break;
	case DATA_TYPE_NVLIST_ARRAY:
		_zed_event_add_var(eid, zsp, prefix, name,
		    "%s", "_NOT_IMPLEMENTED_"); /* FIXME */
		break;
	default:
		errno = EINVAL;
		zed_log_msg(LOG_WARNING,
		    "Failed to convert nvpair \"%s\" for eid=%llu: "
		    "Unrecognized type=%u", name, eid, (unsigned int) type);
		break;
	}
}
+
+/*
+ * Restrict various environment variables to safe and sane values
+ * when constructing the environment for the child process, unless
+ * we're running with a custom $PATH (like under the ZFS test suite).
+ *
+ * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1.
+ */
+static void
+_zed_event_add_env_restrict(uint64_t eid, zed_strings_t *zsp,
+ const char *path)
+{
+ const char *env_restrict[][2] = {
+ { "IFS", " \t\n" },
+ { "PATH", _PATH_STDPATH },
+ { "ZDB", SBINDIR "/zdb" },
+ { "ZED", SBINDIR "/zed" },
+ { "ZFS", SBINDIR "/zfs" },
+ { "ZINJECT", SBINDIR "/zinject" },
+ { "ZPOOL", SBINDIR "/zpool" },
+ { "ZFS_ALIAS", ZFS_META_ALIAS },
+ { "ZFS_VERSION", ZFS_META_VERSION },
+ { "ZFS_RELEASE", ZFS_META_RELEASE },
+ { NULL, NULL }
+ };
+
+ /*
+ * If we have a custom $PATH, use the default ZFS binary locations
+ * instead of the hard-coded ones.
+ */
+ const char *env_path[][2] = {
+ { "IFS", " \t\n" },
+ { "PATH", NULL }, /* $PATH copied in later on */
+ { "ZDB", "zdb" },
+ { "ZED", "zed" },
+ { "ZFS", "zfs" },
+ { "ZINJECT", "zinject" },
+ { "ZPOOL", "zpool" },
+ { "ZFS_ALIAS", ZFS_META_ALIAS },
+ { "ZFS_VERSION", ZFS_META_VERSION },
+ { "ZFS_RELEASE", ZFS_META_RELEASE },
+ { NULL, NULL }
+ };
+ const char *(*pa)[2];
+
+ assert(zsp != NULL);
+
+ pa = path != NULL ? env_path : env_restrict;
+
+ for (; *(*pa); pa++) {
+ /* Use our custom $PATH if we have one */
+ if (path != NULL && strcmp((*pa)[0], "PATH") == 0)
+ (*pa)[1] = path;
+
+ _zed_event_add_var(eid, zsp, NULL, (*pa)[0], "%s", (*pa)[1]);
+ }
+}
+
+/*
+ * Preserve specified variables from the parent environment
+ * when constructing the environment for the child process.
+ *
+ * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1.
+ */
+static void
+_zed_event_add_env_preserve(uint64_t eid, zed_strings_t *zsp)
+{
+ const char *env_preserve[] = {
+ "TZ",
+ NULL
+ };
+ const char **keyp;
+ const char *val;
+
+ assert(zsp != NULL);
+
+ for (keyp = env_preserve; *keyp; keyp++) {
+ if ((val = getenv(*keyp)))
+ _zed_event_add_var(eid, zsp, NULL, *keyp, "%s", val);
+ }
+}
+
/*
 * Compute the "subclass" by skipping the first 3 dot-separated components
 * of [class] (which will always be of the form "*.fs.zfs").  Return a
 * pointer inside the string [class], or NULL if insufficient components
 * exist.
 */
static const char *
_zed_event_get_subclass(const char *class)
{
	const char *cp;
	int dots;

	if (class == NULL)
		return (NULL);

	cp = class;
	for (dots = 0; dots < 3; dots++) {
		cp = strchr(cp, '.');
		if (cp == NULL)
			return (NULL);
		cp++;
	}
	return (cp);
}
+
+/*
+ * Convert the zevent time from a 2-element array of 64b integers
+ * into a more convenient form:
+ * - TIME_SECS is the second component of the time.
+ * - TIME_NSECS is the nanosecond component of the time.
+ * - TIME_STRING is an almost-RFC3339-compliant string representation.
+ */
+static void
+_zed_event_add_time_strings(uint64_t eid, zed_strings_t *zsp, int64_t etime[])
+{
+ struct tm *stp;
+ char buf[32];
+
+ assert(zsp != NULL);
+ assert(etime != NULL);
+
+ _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_SECS",
+ "%lld", (long long int) etime[0]);
+ _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_NSECS",
+ "%lld", (long long int) etime[1]);
+
+ if (!(stp = localtime((const time_t *) &etime[0]))) {
+ zed_log_msg(LOG_WARNING, "Failed to add %s%s for eid=%llu: %s",
+ ZEVENT_VAR_PREFIX, "TIME_STRING", eid, "localtime error");
+ } else if (!strftime(buf, sizeof (buf), "%Y-%m-%d %H:%M:%S%z", stp)) {
+ zed_log_msg(LOG_WARNING, "Failed to add %s%s for eid=%llu: %s",
+ ZEVENT_VAR_PREFIX, "TIME_STRING", eid, "strftime error");
+ } else {
+ _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_STRING",
+ "%s", buf);
+ }
+}
+
/*
 * Service the next zevent, blocking until one is available.
 *
 * Returns 0 after one event has been handled (or skipped following a
 * failed lookup), EINVAL for a NULL [zcp], or errno when
 * zpool_events_next() fails.  NOTE(review): in the failure path errno is
 * returned as-is and may be stale if the library did not set it --
 * confirm against libzfs.
 */
int
zed_event_service(struct zed_conf *zcp)
{
	nvlist_t *nvl;
	nvpair_t *nvp;
	int n_dropped;
	zed_strings_t *zsp;
	uint64_t eid;
	int64_t *etime;
	uint_t nelem;
	char *class;
	const char *subclass;
	int rv;

	if (!zcp) {
		errno = EINVAL;
		zed_log_msg(LOG_ERR, "Failed to service zevent: %s",
		    strerror(errno));
		return (EINVAL);
	}
	rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped, ZEVENT_NONE,
	    zcp->zevent_fd);

	if ((rv != 0) || !nvl)
		return (errno);

	if (n_dropped > 0) {
		zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped);
		/*
		 * FIXME: Increase max size of event nvlist in
		 * /sys/module/zfs/parameters/zfs_zevent_len_max ?
		 */
	}
	if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) {
		zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid");
	} else if (nvlist_lookup_int64_array(
	    nvl, "time", &etime, &nelem) != 0) {
		zed_log_msg(LOG_WARNING,
		    "Failed to lookup zevent time (eid=%llu)", eid);
	} else if (nelem != 2) {
		zed_log_msg(LOG_WARNING,
		    "Failed to lookup zevent time (eid=%llu, nelem=%u)",
		    eid, nelem);
	} else if (nvlist_lookup_string(nvl, "class", &class) != 0) {
		zed_log_msg(LOG_WARNING,
		    "Failed to lookup zevent class (eid=%llu)", eid);
	} else {
		/* let internal modules see this event first */
		zfs_agent_post_event(class, NULL, nvl);

		zsp = zed_strings_create();

		nvp = NULL;
		/* export every nvpair as a ZEVENT_* environment variable */
		while ((nvp = nvlist_next_nvpair(nvl, nvp)))
			_zed_event_add_nvpair(eid, zsp, nvp);

		_zed_event_add_env_restrict(eid, zsp, zcp->path);
		_zed_event_add_env_preserve(eid, zsp);

		_zed_event_add_var(eid, zsp, ZED_VAR_PREFIX, "PID",
		    "%d", (int)getpid());
		_zed_event_add_var(eid, zsp, ZED_VAR_PREFIX, "ZEDLET_DIR",
		    "%s", zcp->zedlet_dir);
		subclass = _zed_event_get_subclass(class);
		_zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "SUBCLASS",
		    "%s", (subclass ? subclass : class));

		_zed_event_add_time_strings(eid, zsp, etime);

		zed_exec_process(eid, class, subclass,
		    zcp->zedlet_dir, zcp->zedlets, zsp, zcp->zevent_fd);

		/* persist (eid,time) so this event is not processed again */
		zed_conf_write_state(zcp, eid, etime);

		zed_strings_destroy(zsp);
	}
	nvlist_free(nvl);
	return (0);
}
diff --git a/cmd/zed/zed_event.h b/cmd/zed/zed_event.h
new file mode 100644
index 000000000000..c1455c3a0629
--- /dev/null
+++ b/cmd/zed/zed_event.h
@@ -0,0 +1,29 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef ZED_EVENT_H
+#define ZED_EVENT_H
+
+#include <stdint.h>
+
+int zed_event_init(struct zed_conf *zcp);
+
+void zed_event_fini(struct zed_conf *zcp);
+
+int zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid,
+ int64_t saved_etime[]);
+
+int zed_event_service(struct zed_conf *zcp);
+
+#endif /* !ZED_EVENT_H */
diff --git a/cmd/zed/zed_exec.c b/cmd/zed/zed_exec.c
new file mode 100644
index 000000000000..08b7b5568362
--- /dev/null
+++ b/cmd/zed/zed_exec.c
@@ -0,0 +1,232 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+#include "zed_exec.h"
+#include "zed_file.h"
+#include "zed_log.h"
+#include "zed_strings.h"
+
+#define ZEVENT_FILENO 3
+
/*
 * Create an environment string array for passing to execve() using the
 * NAME=VALUE strings in container [zsp].
 * Return a newly-allocated environment, or NULL on error.
 *
 * The result is a single allocation: an argv-style NULL-terminated
 * pointer vector immediately followed by the packed string data it
 * points into, so the caller releases everything with one free().
 */
static char **
_zed_exec_create_env(zed_strings_t *zsp)
{
	int num_ptrs;
	int buflen;
	char *buf;
	char **pp;
	char *p;
	const char *q;
	int i;
	int len;

	/* one extra pointer slot for the NULL terminator */
	num_ptrs = zed_strings_count(zsp) + 1;
	buflen = num_ptrs * sizeof (char *);
	for (q = zed_strings_first(zsp); q; q = zed_strings_next(zsp))
		buflen += strlen(q) + 1;

	buf = calloc(1, buflen);
	if (!buf)
		return (NULL);

	pp = (char **)buf;
	/* string data begins immediately after the pointer vector */
	p = buf + (num_ptrs * sizeof (char *));
	i = 0;
	for (q = zed_strings_first(zsp); q; q = zed_strings_next(zsp)) {
		pp[i] = p;
		len = strlen(q) + 1;
		memcpy(p, q, len);
		p += len;
		i++;
	}
	pp[i] = NULL;
	assert(buf + buflen == p);
	return ((char **)buf);
}
+
+/*
+ * Fork a child process to handle event [eid]. The program [prog]
+ * in directory [dir] is executed with the environment [env].
+ *
+ * The file descriptor [zfd] is the zevent_fd used to track the
+ * current cursor location within the zevent nvlist.
+ */
+static void
+_zed_exec_fork_child(uint64_t eid, const char *dir, const char *prog,
+ char *env[], int zfd)
+{
+ char path[PATH_MAX];
+ int n;
+ pid_t pid;
+ int fd;
+ pid_t wpid;
+ int status;
+
+ assert(dir != NULL);
+ assert(prog != NULL);
+ assert(env != NULL);
+ assert(zfd >= 0);
+
+ n = snprintf(path, sizeof (path), "%s/%s", dir, prog);
+ if ((n < 0) || (n >= sizeof (path))) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to fork \"%s\" for eid=%llu: %s",
+ prog, eid, strerror(ENAMETOOLONG));
+ return;
+ }
+ pid = fork();
+ if (pid < 0) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to fork \"%s\" for eid=%llu: %s",
+ prog, eid, strerror(errno));
+ return;
+ } else if (pid == 0) {
+ (void) umask(022);
+ if ((fd = open("/dev/null", O_RDWR)) != -1) {
+ (void) dup2(fd, STDIN_FILENO);
+ (void) dup2(fd, STDOUT_FILENO);
+ (void) dup2(fd, STDERR_FILENO);
+ }
+ (void) dup2(zfd, ZEVENT_FILENO);
+ zed_file_close_from(ZEVENT_FILENO + 1);
+ execle(path, prog, NULL, env);
+ _exit(127);
+ }
+
+ /* parent process */
+
+ zed_log_msg(LOG_INFO, "Invoking \"%s\" eid=%llu pid=%d",
+ prog, eid, pid);
+
+ /* FIXME: Timeout rogue child processes with sigalarm? */
+
+ /*
+ * Wait for child process using WNOHANG to limit
+ * the time spent waiting to 10 seconds (10,000ms).
+ */
+ for (n = 0; n < 1000; n++) {
+ wpid = waitpid(pid, &status, WNOHANG);
+ if (wpid == (pid_t)-1) {
+ if (errno == EINTR)
+ continue;
+ zed_log_msg(LOG_WARNING,
+ "Failed to wait for \"%s\" eid=%llu pid=%d",
+ prog, eid, pid);
+ break;
+ } else if (wpid == 0) {
+ struct timespec t;
+
+ /* child still running */
+ t.tv_sec = 0;
+ t.tv_nsec = 10000000; /* 10ms */
+ (void) nanosleep(&t, NULL);
+ continue;
+ }
+
+ if (WIFEXITED(status)) {
+ zed_log_msg(LOG_INFO,
+ "Finished \"%s\" eid=%llu pid=%d exit=%d",
+ prog, eid, pid, WEXITSTATUS(status));
+ } else if (WIFSIGNALED(status)) {
+ zed_log_msg(LOG_INFO,
+ "Finished \"%s\" eid=%llu pid=%d sig=%d/%s",
+ prog, eid, pid, WTERMSIG(status),
+ strsignal(WTERMSIG(status)));
+ } else {
+ zed_log_msg(LOG_INFO,
+ "Finished \"%s\" eid=%llu pid=%d status=0x%X",
+ prog, eid, (unsigned int) status);
+ }
+ break;
+ }
+
+ /*
+ * kill child process after 10 seconds
+ */
+ if (wpid == 0) {
+ zed_log_msg(LOG_WARNING, "Killing hung \"%s\" pid=%d",
+ prog, pid);
+ (void) kill(pid, SIGKILL);
+ }
+}
+
+/*
+ * Process the event [eid] by synchronously invoking all zedlets with a
+ * matching class prefix.
+ *
+ * Each executable in [zedlets] from the directory [dir] is matched against
+ * the event's [class], [subclass], and the "all" class (which matches
+ * all events). Every zedlet with a matching class prefix is invoked.
+ * The NAME=VALUE strings in [envs] will be passed to the zedlet as
+ * environment variables.
+ *
+ * The file descriptor [zfd] is the zevent_fd used to track the
+ * current cursor location within the zevent nvlist.
+ *
+ * Return 0 on success, -1 on error.
+ */
+int
+zed_exec_process(uint64_t eid, const char *class, const char *subclass,
+ const char *dir, zed_strings_t *zedlets, zed_strings_t *envs, int zfd)
+{
+ const char *class_strings[4];
+ const char *allclass = "all";
+ const char **csp;
+ const char *z;
+ char **e;
+ int n;
+
+ if (!dir || !zedlets || !envs || zfd < 0)
+ return (-1);
+
+ csp = class_strings;
+
+ if (class)
+ *csp++ = class;
+
+ if (subclass)
+ *csp++ = subclass;
+
+ if (allclass)
+ *csp++ = allclass;
+
+ *csp = NULL;
+
+ e = _zed_exec_create_env(envs);
+
+ for (z = zed_strings_first(zedlets); z; z = zed_strings_next(zedlets)) {
+ for (csp = class_strings; *csp; csp++) {
+ n = strlen(*csp);
+ if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n]))
+ _zed_exec_fork_child(eid, dir, z, e, zfd);
+ }
+ }
+ free(e);
+ return (0);
+}
diff --git a/cmd/zed/zed_exec.h b/cmd/zed/zed_exec.h
new file mode 100644
index 000000000000..4153e5519a46
--- /dev/null
+++ b/cmd/zed/zed_exec.h
@@ -0,0 +1,25 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
#ifndef ZED_EXEC_H
#define ZED_EXEC_H

#include <stdint.h>
#include "zed_strings.h"

/*
 * Synchronously invoke all zedlets in [dir] whose name prefix matches the
 * event's [class], [subclass], or the "all" class; [envs] supplies
 * NAME=VALUE environment strings and [zevent_fd] tracks the zevent cursor.
 * Returns 0 on success, -1 on error.
 */
int zed_exec_process(uint64_t eid, const char *class, const char *subclass,
    const char *dir, zed_strings_t *zedlets, zed_strings_t *envs,
    int zevent_fd);

#endif /* !ZED_EXEC_H */
diff --git a/cmd/zed/zed_file.c b/cmd/zed/zed_file.c
new file mode 100644
index 000000000000..c3cf3d421c6f
--- /dev/null
+++ b/cmd/zed/zed_file.c
@@ -0,0 +1,217 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "zed_file.h"
+#include "zed_log.h"
+
/*
 * Read up to [n] bytes from [fd] into [buf].
 * Return the number of bytes read, 0 on EOF, or -1 on error.
 */
ssize_t
zed_file_read_n(int fd, void *buf, size_t n)
{
	unsigned char *dst = buf;
	size_t remaining = n;

	while (remaining > 0) {
		ssize_t rc = read(fd, dst, remaining);

		if (rc < 0) {
			/* Retry reads interrupted by a signal. */
			if (errno == EINTR)
				continue;
			return (-1);
		}
		if (rc == 0)
			break;	/* EOF */

		dst += rc;
		remaining -= rc;
	}
	return (n - remaining);
}
+
/*
 * Write [n] bytes from [buf] out to [fd].
 * Return the number of bytes written, or -1 on error.
 */
ssize_t
zed_file_write_n(int fd, void *buf, size_t n)
{
	const unsigned char *src = buf;
	size_t remaining = n;

	while (remaining > 0) {
		ssize_t rc = write(fd, src, remaining);

		if (rc < 0) {
			/* Retry writes interrupted by a signal. */
			if (errno == EINTR)
				continue;
			return (-1);
		}
		src += rc;
		remaining -= rc;
	}
	return (n);
}
+
/*
 * Set an exclusive advisory lock on the open file descriptor [fd].
 * Return 0 on success, 1 if a conflicting lock is held by another process,
 * or -1 on error (with errno set).
 */
int
zed_file_lock(int fd)
{
	struct flock fl = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 0,	/* zero length locks the whole file */
	};

	if (fd < 0) {
		errno = EBADF;
		return (-1);
	}
	if (fcntl(fd, F_SETLK, &fl) == 0)
		return (0);

	/* EACCES/EAGAIN indicate a conflicting lock held elsewhere. */
	return ((errno == EACCES || errno == EAGAIN) ? 1 : -1);
}
+
/*
 * Release an advisory lock held on the open file descriptor [fd].
 * Return 0 on success, or -1 on error (with errno set).
 */
int
zed_file_unlock(int fd)
{
	struct flock fl = {
		.l_type = F_UNLCK,
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 0,	/* zero length covers the whole file */
	};

	if (fd < 0) {
		errno = EBADF;
		return (-1);
	}
	return ((fcntl(fd, F_SETLK, &fl) < 0) ? -1 : 0);
}
+
/*
 * Test whether an exclusive advisory lock could be obtained for the open
 * file descriptor [fd].
 * Return 0 if the file is not locked, >0 for the PID of another process
 * holding a conflicting lock, or -1 on error (with errno set).
 */
pid_t
zed_file_is_locked(int fd)
{
	struct flock fl = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 0,	/* zero length queries the whole file */
	};

	if (fd < 0) {
		errno = EBADF;
		return (-1);
	}
	if (fcntl(fd, F_GETLK, &fl) < 0)
		return (-1);

	/* F_GETLK rewrites l_type to F_UNLCK when no conflict exists. */
	return ((fl.l_type == F_UNLCK) ? 0 : fl.l_pid);
}
+
/*
 * Close all open file descriptors greater than or equal to [lowfd].
 * Any errors encountered while closing file descriptors are ignored.
 */
void
zed_file_close_from(int lowfd)
{
	const int maxfd_def = 256;
	int errno_bak;
	struct rlimit rl;
	int maxfd;
	int fd;

	/* Preserve errno since close() failures are deliberately ignored. */
	errno_bak = errno;

	if (getrlimit(RLIMIT_NOFILE, &rl) < 0) {
		maxfd = maxfd_def;
	} else if (rl.rlim_max == RLIM_INFINITY) {
		maxfd = maxfd_def;
	} else if (rl.rlim_max > INT_MAX) {
		/*
		 * Clamp before narrowing rlim_t to int; an unchecked
		 * conversion could overflow and skip the loop entirely.
		 */
		maxfd = INT_MAX;
	} else {
		maxfd = rl.rlim_max;
	}
	for (fd = lowfd; fd < maxfd; fd++)
		(void) close(fd);

	errno = errno_bak;
}
+
/*
 * Set the CLOEXEC flag on file descriptor [fd] so it will be automatically
 * closed upon successful execution of one of the exec functions.
 * Return 0 on success, or -1 on error.
 *
 * FIXME: No longer needed?
 */
int
zed_file_close_on_exec(int fd)
{
	int fdflags;

	if (fd < 0) {
		errno = EBADF;
		return (-1);
	}
	fdflags = fcntl(fd, F_GETFD);
	if (fdflags < 0)
		return (-1);

	if (fcntl(fd, F_SETFD, fdflags | FD_CLOEXEC) < 0)
		return (-1);

	return (0);
}
diff --git a/cmd/zed/zed_file.h b/cmd/zed/zed_file.h
new file mode 100644
index 000000000000..05f360d20efd
--- /dev/null
+++ b/cmd/zed/zed_file.h
@@ -0,0 +1,35 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
#ifndef ZED_FILE_H
#define ZED_FILE_H

#include <sys/types.h>
#include <unistd.h>

/* Read up to [n] bytes from [fd] into [buf]; returns bytes read, or -1. */
ssize_t zed_file_read_n(int fd, void *buf, size_t n);

/* Write [n] bytes from [buf] to [fd]; returns bytes written, or -1. */
ssize_t zed_file_write_n(int fd, void *buf, size_t n);

/* Set an exclusive advisory lock on [fd]; 0 ok, 1 if held elsewhere, -1 err. */
int zed_file_lock(int fd);

/* Release an advisory lock on [fd]; 0 on success, -1 on error. */
int zed_file_unlock(int fd);

/* Return 0 if [fd] is unlocked, the locking PID if locked, or -1 on error. */
pid_t zed_file_is_locked(int fd);

/* Close all file descriptors >= [fd], ignoring errors. */
void zed_file_close_from(int fd);

/* Set FD_CLOEXEC on [fd]; 0 on success, -1 on error. */
int zed_file_close_on_exec(int fd);

#endif /* !ZED_FILE_H */
diff --git a/cmd/zed/zed_log.c b/cmd/zed/zed_log.c
new file mode 100644
index 000000000000..5a3f2dbdb832
--- /dev/null
+++ b/cmd/zed/zed_log.c
@@ -0,0 +1,256 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <syslog.h>
+#include <unistd.h>
+#include "zed_log.h"
+
/* Maximum length (including NUL) of a single formatted log message. */
#define ZED_LOG_MAX_LOG_LEN 1024

/* State shared by all zed_log functions within this file. */
static struct {
	unsigned do_stderr:1;	/* nonzero when logging to stderr */
	unsigned do_syslog:1;	/* nonzero when logging to syslog */
	const char *identity;	/* syslog identity (basename), or NULL */
	int priority;		/* max priority echoed to stderr */
	int pipe_fd[2];		/* daemonize pipe; -1 when closed */
} _ctx;
+
+/*
+ * Initialize the logging subsystem.
+ */
+void
+zed_log_init(const char *identity)
+{
+ if (identity) {
+ const char *p = strrchr(identity, '/');
+ _ctx.identity = (p != NULL) ? p + 1 : identity;
+ } else {
+ _ctx.identity = NULL;
+ }
+ _ctx.pipe_fd[0] = -1;
+ _ctx.pipe_fd[1] = -1;
+}
+
/*
 * Shut down the logging subsystem by closing the stderr and syslog sinks.
 */
void
zed_log_fini(void)
{
	zed_log_stderr_close();
	zed_log_syslog_close();
}
+
/*
 * Create pipe for communicating daemonization status between the parent and
 * child processes across the double-fork().
 * Dies if the pipe already exists or cannot be created.
 */
void
zed_log_pipe_open(void)
{
	if ((_ctx.pipe_fd[0] != -1) || (_ctx.pipe_fd[1] != -1))
		zed_log_die("Invalid use of zed_log_pipe_open in PID %d",
		    (int)getpid());

	if (pipe(_ctx.pipe_fd) < 0)
		zed_log_die("Failed to create daemonize pipe in PID %d: %s",
		    (int)getpid(), strerror(errno));
}
+
/*
 * Close the read-half of the daemonize pipe.
 *
 * This should be called by the child after fork()ing from the parent since
 * the child will never read from this pipe.
 * Dies if the read-half is already closed or close() fails.
 */
void
zed_log_pipe_close_reads(void)
{
	if (_ctx.pipe_fd[0] < 0)
		zed_log_die(
		    "Invalid use of zed_log_pipe_close_reads in PID %d",
		    (int)getpid());

	if (close(_ctx.pipe_fd[0]) < 0)
		zed_log_die(
		    "Failed to close reads on daemonize pipe in PID %d: %s",
		    (int)getpid(), strerror(errno));

	/* Mark the descriptor as closed. */
	_ctx.pipe_fd[0] = -1;
}
+
/*
 * Close the write-half of the daemonize pipe.
 *
 * This should be called by the parent after fork()ing its child since the
 * parent will never write to this pipe.
 *
 * This should also be called by the child once initialization is complete
 * in order to signal the parent that it can safely exit.
 * Dies if the write-half is already closed or close() fails.
 */
void
zed_log_pipe_close_writes(void)
{
	if (_ctx.pipe_fd[1] < 0)
		zed_log_die(
		    "Invalid use of zed_log_pipe_close_writes in PID %d",
		    (int)getpid());

	if (close(_ctx.pipe_fd[1]) < 0)
		zed_log_die(
		    "Failed to close writes on daemonize pipe in PID %d: %s",
		    (int)getpid(), strerror(errno));

	/* Mark the descriptor as closed. */
	_ctx.pipe_fd[1] = -1;
}
+
+/*
+ * Block on reading from the daemonize pipe until signaled by the child
+ * (via zed_log_pipe_close_writes()) that initialization is complete.
+ *
+ * This should only be called by the parent while waiting to exit after
+ * fork()ing the child.
+ */
+void
+zed_log_pipe_wait(void)
+{
+ ssize_t n;
+ char c;
+
+ if (_ctx.pipe_fd[0] < 0)
+ zed_log_die("Invalid use of zed_log_pipe_wait in PID %d",
+ (int)getpid());
+
+ for (;;) {
+ n = read(_ctx.pipe_fd[0], &c, sizeof (c));
+ if (n < 0) {
+ if (errno == EINTR)
+ continue;
+ zed_log_die(
+ "Failed to read from daemonize pipe in PID %d: %s",
+ (int)getpid(), strerror(errno));
+ }
+ if (n == 0) {
+ break;
+ }
+ }
+}
+
+/*
+ * Start logging messages at the syslog [priority] level or higher to stderr.
+ * Refer to syslog(3) for valid priority values.
+ */
+void
+zed_log_stderr_open(int priority)
+{
+ _ctx.do_stderr = 1;
+ _ctx.priority = priority;
+}
+
+/*
+ * Stop logging messages to stderr.
+ */
+void
+zed_log_stderr_close(void)
+{
+ if (_ctx.do_stderr)
+ _ctx.do_stderr = 0;
+}
+
+/*
+ * Start logging messages to syslog.
+ * Refer to syslog(3) for valid option/facility values.
+ */
+void
+zed_log_syslog_open(int facility)
+{
+ _ctx.do_syslog = 1;
+ openlog(_ctx.identity, LOG_NDELAY | LOG_PID, facility);
+}
+
+/*
+ * Stop logging messages to syslog.
+ */
+void
+zed_log_syslog_close(void)
+{
+ if (_ctx.do_syslog) {
+ _ctx.do_syslog = 0;
+ closelog();
+ }
+}
+
+/*
+ * Auxiliary function to log a message to syslog and/or stderr.
+ */
+static void
+_zed_log_aux(int priority, const char *fmt, va_list vargs)
+{
+ char buf[ZED_LOG_MAX_LOG_LEN];
+ int n;
+
+ if (!fmt)
+ return;
+
+ n = vsnprintf(buf, sizeof (buf), fmt, vargs);
+ if ((n < 0) || (n >= sizeof (buf))) {
+ buf[sizeof (buf) - 2] = '+';
+ buf[sizeof (buf) - 1] = '\0';
+ }
+
+ if (_ctx.do_syslog)
+ syslog(priority, "%s", buf);
+
+ if (_ctx.do_stderr && (priority <= _ctx.priority))
+ fprintf(stderr, "%s\n", buf);
+}
+
+/*
+ * Log a message at the given [priority] level specified by the printf-style
+ * format string [fmt].
+ */
+void
+zed_log_msg(int priority, const char *fmt, ...)
+{
+ va_list vargs;
+
+ if (fmt) {
+ va_start(vargs, fmt);
+ _zed_log_aux(priority, fmt, vargs);
+ va_end(vargs);
+ }
+}
+
/*
 * Log a fatal error message specified by the printf-style format string
 * [fmt], then terminate the process with EXIT_FAILURE.
 */
void
zed_log_die(const char *fmt, ...)
{
	va_list vargs;

	if (fmt != NULL) {
		va_start(vargs, fmt);
		_zed_log_aux(LOG_ERR, fmt, vargs);
		va_end(vargs);
	}
	exit(EXIT_FAILURE);
}
diff --git a/cmd/zed/zed_log.h b/cmd/zed/zed_log.h
new file mode 100644
index 000000000000..a03a4f53967c
--- /dev/null
+++ b/cmd/zed/zed_log.h
@@ -0,0 +1,44 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
#ifndef ZED_LOG_H
#define ZED_LOG_H

#include <syslog.h>

/* Initialize logging; the basename of [identity] becomes the syslog ident. */
void zed_log_init(const char *identity);

/* Shut down logging, closing the stderr and syslog sinks. */
void zed_log_fini(void);

/* Create the daemonize pipe used across the double-fork(). */
void zed_log_pipe_open(void);

/* Close the read-half of the daemonize pipe (called by the child). */
void zed_log_pipe_close_reads(void);

/* Close the write-half (parent after fork; child when init is complete). */
void zed_log_pipe_close_writes(void);

/* Block until the child closes the pipe's write-half (called by parent). */
void zed_log_pipe_wait(void);

/* Echo messages at syslog level [priority] or higher to stderr. */
void zed_log_stderr_open(int priority);

/* Stop logging to stderr. */
void zed_log_stderr_close(void);

/* Start logging to syslog with the given facility. */
void zed_log_syslog_open(int facility);

/* Stop logging to syslog. */
void zed_log_syslog_close(void);

/* Log a printf-style message at the given syslog [priority]. */
void zed_log_msg(int priority, const char *fmt, ...);

/* Log a fatal printf-style message and exit(EXIT_FAILURE). */
void zed_log_die(const char *fmt, ...);

#endif /* !ZED_LOG_H */
diff --git a/cmd/zed/zed_strings.c b/cmd/zed/zed_strings.c
new file mode 100644
index 000000000000..6b1c669d71f4
--- /dev/null
+++ b/cmd/zed/zed_strings.c
@@ -0,0 +1,247 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/avl.h>
+#include <sys/sysmacros.h>
+#include "zed_strings.h"
+
/* Container of strings, stored in an AVL tree ordered by key. */
struct zed_strings {
	avl_tree_t tree;	/* tree of zed_strings_node_t, keyed on key */
	avl_node_t *iteratorp;	/* cursor for first()/next() traversal */
};

/* A single string entry within the container. */
struct zed_strings_node {
	avl_node_t node;	/* AVL tree linkage */
	char *key;		/* index key; may alias val when no key given */
	char *val;		/* stored string value */
};

typedef struct zed_strings_node zed_strings_node_t;
+
+/*
+ * Compare zed_strings_node_t nodes [x1] and [x2].
+ * As required for the AVL tree, return -1 for <, 0 for ==, and +1 for >.
+ */
+static int
+_zed_strings_node_compare(const void *x1, const void *x2)
+{
+ const char *s1;
+ const char *s2;
+ int rv;
+
+ assert(x1 != NULL);
+ assert(x2 != NULL);
+
+ s1 = ((const zed_strings_node_t *) x1)->key;
+ assert(s1 != NULL);
+ s2 = ((const zed_strings_node_t *) x2)->key;
+ assert(s2 != NULL);
+ rv = strcmp(s1, s2);
+
+ if (rv < 0)
+ return (-1);
+
+ if (rv > 0)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Return a new string container, or NULL on error.
+ */
+zed_strings_t *
+zed_strings_create(void)
+{
+ zed_strings_t *zsp;
+
+ zsp = calloc(1, sizeof (*zsp));
+ if (!zsp)
+ return (NULL);
+
+ avl_create(&zsp->tree, _zed_strings_node_compare,
+ sizeof (zed_strings_node_t), offsetof(zed_strings_node_t, node));
+
+ zsp->iteratorp = NULL;
+ return (zsp);
+}
+
+/*
+ * Destroy the string node [np].
+ */
+static void
+_zed_strings_node_destroy(zed_strings_node_t *np)
+{
+ if (!np)
+ return;
+
+ if (np->key) {
+ if (np->key != np->val)
+ free(np->key);
+ np->key = NULL;
+ }
+ if (np->val) {
+ free(np->val);
+ np->val = NULL;
+ }
+ free(np);
+}
+
+/*
+ * Return a new string node for storing the string [val], or NULL on error.
+ * If [key] is specified, it will be used to index the node; otherwise,
+ * the string [val] will be used.
+ */
+static zed_strings_node_t *
+_zed_strings_node_create(const char *key, const char *val)
+{
+ zed_strings_node_t *np;
+
+ assert(val != NULL);
+
+ np = calloc(1, sizeof (*np));
+ if (!np)
+ return (NULL);
+
+ np->val = strdup(val);
+ if (!np->val)
+ goto nomem;
+
+ if (key) {
+ np->key = strdup(key);
+ if (!np->key)
+ goto nomem;
+ } else {
+ np->key = np->val;
+ }
+ return (np);
+
+nomem:
+ _zed_strings_node_destroy(np);
+ return (NULL);
+}
+
+/*
+ * Destroy the string container [zsp] and all nodes within.
+ */
+void
+zed_strings_destroy(zed_strings_t *zsp)
+{
+ void *cookie;
+ zed_strings_node_t *np;
+
+ if (!zsp)
+ return;
+
+ cookie = NULL;
+ while ((np = avl_destroy_nodes(&zsp->tree, &cookie)))
+ _zed_strings_node_destroy(np);
+
+ avl_destroy(&zsp->tree);
+ free(zsp);
+}
+
+/*
+ * Add a copy of the string [s] indexed by [key] to the container [zsp].
+ * If [key] already exists within the container [zsp], it will be replaced
+ * with the new string [s].
+ * If [key] is NULL, the string [s] will be used as the key.
+ * Return 0 on success, or -1 on error.
+ */
+int
+zed_strings_add(zed_strings_t *zsp, const char *key, const char *s)
+{
+ zed_strings_node_t *newp, *oldp;
+
+ if (!zsp || !s) {
+ errno = EINVAL;
+ return (-1);
+ }
+ if (key == s)
+ key = NULL;
+
+ newp = _zed_strings_node_create(key, s);
+ if (!newp)
+ return (-1);
+
+ oldp = avl_find(&zsp->tree, newp, NULL);
+ if (oldp) {
+ avl_remove(&zsp->tree, oldp);
+ _zed_strings_node_destroy(oldp);
+ }
+ avl_add(&zsp->tree, newp);
+ return (0);
+}
+
+/*
+ * Return the first string in container [zsp].
+ * Return NULL if there are no strings, or on error.
+ * This can be called multiple times to re-traverse [zsp].
+ * XXX: Not thread-safe.
+ */
+const char *
+zed_strings_first(zed_strings_t *zsp)
+{
+ if (!zsp) {
+ errno = EINVAL;
+ return (NULL);
+ }
+ zsp->iteratorp = avl_first(&zsp->tree);
+ if (!zsp->iteratorp)
+ return (NULL);
+
+ return (((zed_strings_node_t *)zsp->iteratorp)->val);
+
+}
+
+/*
+ * Return the next string in container [zsp].
+ * Return NULL after the last string, or on error.
+ * This must be called after zed_strings_first().
+ * XXX: Not thread-safe.
+ */
+const char *
+zed_strings_next(zed_strings_t *zsp)
+{
+ if (!zsp) {
+ errno = EINVAL;
+ return (NULL);
+ }
+ if (!zsp->iteratorp)
+ return (NULL);
+
+ zsp->iteratorp = AVL_NEXT(&zsp->tree, zsp->iteratorp);
+ if (!zsp->iteratorp)
+ return (NULL);
+
+ return (((zed_strings_node_t *)zsp->iteratorp)->val);
+}
+
/*
 * Return the number of strings in container [zsp], or -1 on error.
 */
int
zed_strings_count(zed_strings_t *zsp)
{
	if (!zsp) {
		errno = EINVAL;
		return (-1);
	}
	return (avl_numnodes(&zsp->tree));
}
diff --git a/cmd/zed/zed_strings.h b/cmd/zed/zed_strings.h
new file mode 100644
index 000000000000..37a84cad7ffc
--- /dev/null
+++ b/cmd/zed/zed_strings.h
@@ -0,0 +1,27 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
#ifndef ZED_STRINGS_H
#define ZED_STRINGS_H

/* Opaque ordered container of strings (AVL-backed; see zed_strings.c). */
typedef struct zed_strings zed_strings_t;

/* Create/destroy a container. */
zed_strings_t *zed_strings_create(void);
void zed_strings_destroy(zed_strings_t *zsp);
/* Add a copy of [s] under [key] (NULL key: [s] keys itself); 0 or -1. */
int zed_strings_add(zed_strings_t *zsp, const char *key, const char *s);
/* Begin/continue iteration; returns the string value, or NULL when done. */
const char *zed_strings_first(zed_strings_t *zsp);
const char *zed_strings_next(zed_strings_t *zsp);
/* Return the number of strings, or -1 on error. */
int zed_strings_count(zed_strings_t *zsp);

#endif /* !ZED_STRINGS_H */