diff options
author | Matt Macy <mmacy@FreeBSD.org> | 2020-08-24 22:48:19 +0000 |
---|---|---|
committer | Matt Macy <mmacy@FreeBSD.org> | 2020-08-24 22:48:19 +0000 |
commit | 3b0ce0e28db46d0403929aba45c682285e1ac217 (patch) | |
tree | 91721e6e5518bd0d8113dee535898f2225443411 /cmd/zed | |
download | src-3b0ce0e28db46d0403929aba45c682285e1ac217.tar.gz src-3b0ce0e28db46d0403929aba45c682285e1ac217.zip |
Vendor import of openzfs master @ 184df27eef0abdc7ab2105b21257f753834b936b
tag: vendor/openzfs/2.0-rc0-g184df27
Sponsored by: iX Systems, Inc.
Notes
Notes:
svn path=/vendor-sys/openzfs/dist/; revision=364736
svn path=/vendor-sys/openzfs/2.0-rc0-g184df27/; revision=364741; tag=vendor/openzfs/2.0-rc0-g184df27
Diffstat (limited to 'cmd/zed')
47 files changed, 9553 insertions, 0 deletions
diff --git a/cmd/zed/.gitignore b/cmd/zed/.gitignore new file mode 100644 index 000000000000..76557bb6bb3a --- /dev/null +++ b/cmd/zed/.gitignore @@ -0,0 +1 @@ +/zed diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am new file mode 100644 index 000000000000..4bd8ac4a53e6 --- /dev/null +++ b/cmd/zed/Makefile.am @@ -0,0 +1,49 @@ +include $(top_srcdir)/config/Rules.am + +AM_CFLAGS += $(LIBUDEV_CFLAGS) $(LIBUUID_CFLAGS) + +SUBDIRS = zed.d + +sbin_PROGRAMS = zed + +ZED_SRC = \ + zed.c \ + zed.h \ + zed_conf.c \ + zed_conf.h \ + zed_disk_event.c \ + zed_disk_event.h \ + zed_event.c \ + zed_event.h \ + zed_exec.c \ + zed_exec.h \ + zed_file.c \ + zed_file.h \ + zed_log.c \ + zed_log.h \ + zed_strings.c \ + zed_strings.h + +FMA_SRC = \ + agents/zfs_agents.c \ + agents/zfs_agents.h \ + agents/zfs_diagnosis.c \ + agents/zfs_mod.c \ + agents/zfs_retire.c \ + agents/fmd_api.c \ + agents/fmd_api.h \ + agents/fmd_serd.c \ + agents/fmd_serd.h + +zed_SOURCES = $(ZED_SRC) $(FMA_SRC) + +zed_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libuutil/libuutil.la + +zed_LDADD += -lrt $(LIBUDEV_LIBS) $(LIBUUID_LIBS) +zed_LDFLAGS = -pthread + +EXTRA_DIST = agents/README.md diff --git a/cmd/zed/agents/README.md b/cmd/zed/agents/README.md new file mode 100644 index 000000000000..e35b97668a9d --- /dev/null +++ b/cmd/zed/agents/README.md @@ -0,0 +1,112 @@ +## Fault Management Logic for ZED ## + +The integration of Fault Management Daemon (FMD) logic from illumos +is being deployed in three phases. This logic is encapsulated in +several software modules inside ZED. + +### ZED+FM Phase 1 ### + +All the phase 1 work is in current Master branch. Phase I work includes: + +* Add new paths to the persistent VDEV label for device matching. +* Add a disk monitor for generating _disk-add_ and _disk-change_ events. 
+* Add support for automated VDEV auto-online, auto-replace and auto-expand. +* Expand the statechange event to include all VDEV state transitions. + +### ZED+FM Phase 2 (WIP) ### + +The phase 2 work primarily entails the _Diagnosis Engine_ and the +_Retire Agent_ modules. It also includes infrastructure to support a +crude FMD environment to host these modules. For additional +information see the **FMD Components in ZED** and **Implementation +Notes** sections below. + +### ZED+FM Phase 3 ### + +Future work will add additional functionality and will likely include: + +* Add FMD module garbage collection (periodically call `fmd_module_gc()`). +* Add real module property retrieval (currently hard-coded in accessors). +* Additional diagnosis telemetry (like latency outliers and SMART data). +* Export FMD module statistics. +* Zedlet parallel execution and resiliency (add watchdog). + +### ZFS Fault Management Overview ### + +The primary purpose with ZFS fault management is automated diagnosis +and isolation of VDEV faults. A fault is something we can associate +with an impact (e.g. loss of data redundancy) and a corrective action +(e.g. offline or replace a disk). A typical ZFS fault management stack +is comprised of _error detectors_ (e.g. `zfs_ereport_post()`), a _disk +monitor_, a _diagnosis engine_ and _response agents_. + +After detecting a software error, the ZFS kernel module sends error +events to the ZED user daemon which in turn routes the events to its +internal FMA modules based on their event subscriptions. Likewise, if +a disk is added or changed in the system, the disk monitor sends disk +events which are consumed by a response agent. + +### FMD Components in ZED ### + +There are three FMD modules (aka agents) that are now built into ZED. + + 1. A _Diagnosis Engine_ module (`agents/zfs_diagnosis.c`) + 2. A _Retire Agent_ module (`agents/zfs_retire.c`) + 3. 
A _Disk Add Agent_ module (`agents/zfs_mod.c`) + +To begin with, a **Diagnosis Engine** consumes per-vdev I/O and checksum +ereports and feeds them into a Soft Error Rate Discrimination (SERD) +algorithm which will generate a corresponding fault diagnosis when the +tracked VDEV encounters **N** events in a given **T** time window. The +initial N and T values for the SERD algorithm are estimates inherited +from illumos (10 errors in 10 minutes). + +In turn, a **Retire Agent** responds to diagnosed faults by isolating +the faulty VDEV. It will notify the ZFS kernel module of the new VDEV +state (degraded or faulted). The retire agent is also responsible for +managing hot spares across all pools. When it encounters a device fault +or a device removal it will replace the device with an appropriate +spare if available. + +Finally, a **Disk Add Agent** responds to events from a libudev disk +monitor (`EC_DEV_ADD` or `EC_DEV_STATUS`) and will online, replace or +expand the associated VDEV. This agent is also known as the `zfs_mod` +or Sysevent Loadable Module (SLM) on the illumos platform. The added +disk is matched to a specific VDEV using its device id, physical path +or VDEV GUID. + +Note that the _auto-replace_ feature (aka hot plug) is opt-in and you +must set the pool's `autoreplace` property to enable it. The new disk +will be matched to the corresponding leaf VDEV by physical location +and labeled with a GPT partition before replacing the original VDEV +in the pool. + +### Implementation Notes ### + +* The FMD module API required for logic modules is emulated and implemented + in the `fmd_api.c` and `fmd_serd.c` source files. This support includes + module registration, memory allocation, module property accessors, basic + case management, one-shot timers and SERD engines. + For detailed information on the FMD module API, see the document -- + _"Fault Management Daemon Programmer's Reference Manual"_. 
+ +* The event subscriptions for the modules (located in a module specific + configuration file on illumos) are currently hard-coded into the ZED + `zfs_agent_dispatch()` function. + +* The FMD modules are called one at a time from a single thread that + consumes events queued to the modules. These events are sourced from + the normal ZED events and also include events posted from the diagnosis + engine and the libudev disk event monitor. + +* The FMD code modules have minimal changes and were intentionally left + as similar as possible to their upstream source files. + +* The sysevent namespace in ZED differs from illumos. For example: + * illumos uses `"resource.sysevent.EC_zfs.ESC_ZFS_vdev_remove"` + * Linux uses `"sysevent.fs.zfs.vdev_remove"` + +* The FMD Modules port was produced by Intel Federal, LLC under award + number B609815 between the U.S. Department of Energy (DOE) and Intel + Federal, LLC. + diff --git a/cmd/zed/agents/fmd_api.c b/cmd/zed/agents/fmd_api.c new file mode 100644 index 000000000000..607b387ca3a8 --- /dev/null +++ b/cmd/zed/agents/fmd_api.c @@ -0,0 +1,760 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2016, Intel Corporation. + */ + +/* + * This file implements the minimal FMD module API required to support the + * fault logic modules in ZED. This support includes module registration, + * memory allocation, module property accessors, basic case management, + * one-shot timers and SERD engines. + * + * In the ZED runtime, the modules are called from a single thread so no + * locking is required in this emulated FMD environment. + */ + +#include <sys/types.h> +#include <sys/fm/protocol.h> +#include <uuid/uuid.h> +#include <signal.h> +#include <strings.h> +#include <time.h> + +#include "fmd_api.h" +#include "fmd_serd.h" + +#include "zfs_agents.h" +#include "../zed_log.h" + +typedef struct fmd_modstat { + fmd_stat_t ms_accepted; /* total events accepted by module */ + fmd_stat_t ms_caseopen; /* cases currently open */ + fmd_stat_t ms_casesolved; /* total cases solved by module */ + fmd_stat_t ms_caseclosed; /* total cases closed by module */ +} fmd_modstat_t; + +typedef struct fmd_module { + const char *mod_name; /* basename of module (ro) */ + const fmd_hdl_info_t *mod_info; /* module info registered with handle */ + void *mod_spec; /* fmd_hdl_get/setspecific data value */ + fmd_stat_t *mod_ustat; /* module specific custom stats */ + uint_t mod_ustat_cnt; /* count of ustat stats */ + fmd_modstat_t mod_stats; /* fmd built-in per-module statistics */ + fmd_serd_hash_t mod_serds; /* hash of serd engs owned by module */ + char *mod_vers; /* a copy of module version string */ +} fmd_module_t; + +/* + * ZED has two FMD hardwired module instances + */ +fmd_module_t zfs_retire_module; +fmd_module_t zfs_diagnosis_module; + +/* + * 
Enable a reasonable set of defaults for libumem debugging on DEBUG builds. + */ + +#ifdef DEBUG +const char * +_umem_debug_init(void) +{ + return ("default,verbose"); /* $UMEM_DEBUG setting */ +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); /* $UMEM_LOGGING setting */ +} +#endif + +/* + * Register a module with fmd and finish module initialization. + * Returns an integer indicating whether it succeeded (zero) or + * failed (non-zero). + */ +int +fmd_hdl_register(fmd_hdl_t *hdl, int version, const fmd_hdl_info_t *mip) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + mp->mod_info = mip; + mp->mod_name = mip->fmdi_desc + 4; /* drop 'ZFS ' prefix */ + mp->mod_spec = NULL; + + /* bare minimum module stats */ + (void) strcpy(mp->mod_stats.ms_accepted.fmds_name, "fmd.accepted"); + (void) strcpy(mp->mod_stats.ms_caseopen.fmds_name, "fmd.caseopen"); + (void) strcpy(mp->mod_stats.ms_casesolved.fmds_name, "fmd.casesolved"); + (void) strcpy(mp->mod_stats.ms_caseclosed.fmds_name, "fmd.caseclosed"); + + fmd_serd_hash_create(&mp->mod_serds); + + fmd_hdl_debug(hdl, "register module"); + + return (0); +} + +void +fmd_hdl_unregister(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_modstat_t *msp = &mp->mod_stats; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + + /* dump generic module stats */ + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_accepted.fmds_name, + msp->ms_accepted.fmds_value.ui64); + if (ops->fmdo_close != NULL) { + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseopen.fmds_name, + msp->ms_caseopen.fmds_value.ui64); + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_casesolved.fmds_name, + msp->ms_casesolved.fmds_value.ui64); + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseclosed.fmds_name, + msp->ms_caseclosed.fmds_value.ui64); + } + + /* dump module specific stats */ + if (mp->mod_ustat != NULL) { + int i; + + for (i = 0; i < mp->mod_ustat_cnt; i++) { + fmd_hdl_debug(hdl, "%s: %llu", + mp->mod_ustat[i].fmds_name, + 
mp->mod_ustat[i].fmds_value.ui64); + } + } + + fmd_serd_hash_destroy(&mp->mod_serds); + + fmd_hdl_debug(hdl, "unregister module"); +} + +/* + * fmd_hdl_setspecific() is used to associate a data pointer with + * the specified handle for the duration of the module's lifetime. + * This pointer can be retrieved using fmd_hdl_getspecific(). + */ +void +fmd_hdl_setspecific(fmd_hdl_t *hdl, void *spec) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + mp->mod_spec = spec; +} + +/* + * Return the module-specific data pointer previously associated + * with the handle using fmd_hdl_setspecific(). + */ +void * +fmd_hdl_getspecific(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (mp->mod_spec); +} + +void * +fmd_hdl_alloc(fmd_hdl_t *hdl, size_t size, int flags) +{ + return (umem_alloc(size, flags)); +} + +void * +fmd_hdl_zalloc(fmd_hdl_t *hdl, size_t size, int flags) +{ + return (umem_zalloc(size, flags)); +} + +void +fmd_hdl_free(fmd_hdl_t *hdl, void *data, size_t size) +{ + umem_free(data, size); +} + +/* + * Record a module debug message using the specified format. + */ +void +fmd_hdl_debug(fmd_hdl_t *hdl, const char *format, ...) +{ + char message[256]; + va_list vargs; + fmd_module_t *mp = (fmd_module_t *)hdl; + + va_start(vargs, format); + (void) vsnprintf(message, sizeof (message), format, vargs); + va_end(vargs); + + /* prefix message with module name */ + zed_log_msg(LOG_INFO, "%s: %s", mp->mod_name, message); +} + +/* Property Retrieval */ + +int32_t +fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name) +{ + /* + * These can be looked up in mp->modinfo->fmdi_props + * For now we just hard code for phase 2. In the + * future, there can be a ZED based override. 
+ */ + if (strcmp(name, "spare_on_remove") == 0) + return (1); + + if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0) + return (10); /* N = 10 events */ + + return (0); +} + +int64_t +fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name) +{ + /* + * These can be looked up in mp->modinfo->fmdi_props + * For now we just hard code for phase 2. In the + * future, there can be a ZED based override. + */ + if (strcmp(name, "remove_timeout") == 0) + return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */ + + if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0) + return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */ + + return (0); +} + +/* FMD Statistics */ + +fmd_stat_t * +fmd_stat_create(fmd_hdl_t *hdl, uint_t flags, uint_t nstats, fmd_stat_t *statv) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + if (flags == FMD_STAT_NOALLOC) { + mp->mod_ustat = statv; + mp->mod_ustat_cnt = nstats; + } + + return (statv); +} + +/* Case Management */ + +fmd_case_t * +fmd_case_open(fmd_hdl_t *hdl, void *data) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + uuid_t uuid; + + fmd_case_t *cp; + + cp = fmd_hdl_zalloc(hdl, sizeof (fmd_case_t), FMD_SLEEP); + cp->ci_mod = hdl; + cp->ci_state = FMD_CASE_UNSOLVED; + cp->ci_flags = FMD_CF_DIRTY; + cp->ci_data = data; + cp->ci_bufptr = NULL; + cp->ci_bufsiz = 0; + + uuid_generate(uuid); + uuid_unparse(uuid, cp->ci_uuid); + + fmd_hdl_debug(hdl, "case opened (%s)", cp->ci_uuid); + mp->mod_stats.ms_caseopen.fmds_value.ui64++; + + return (cp); +} + +void +fmd_case_solve(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + /* + * For ZED, the event was already sent from fmd_case_add_suspect() + */ + + if (cp->ci_state >= FMD_CASE_SOLVED) + fmd_hdl_debug(hdl, "case is already solved or closed"); + + cp->ci_state = FMD_CASE_SOLVED; + + fmd_hdl_debug(hdl, "case solved (%s)", cp->ci_uuid); + mp->mod_stats.ms_casesolved.fmds_value.ui64++; +} + +void +fmd_case_close(fmd_hdl_t *hdl, fmd_case_t 
*cp) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + + fmd_hdl_debug(hdl, "case closed (%s)", cp->ci_uuid); + + if (ops->fmdo_close != NULL) + ops->fmdo_close(hdl, cp); + + mp->mod_stats.ms_caseopen.fmds_value.ui64--; + mp->mod_stats.ms_caseclosed.fmds_value.ui64++; + + if (cp->ci_bufptr != NULL && cp->ci_bufsiz > 0) + fmd_hdl_free(hdl, cp->ci_bufptr, cp->ci_bufsiz); + + fmd_hdl_free(hdl, cp, sizeof (fmd_case_t)); +} + +void +fmd_case_uuresolved(fmd_hdl_t *hdl, const char *uuid) +{ + fmd_hdl_debug(hdl, "case resolved by uuid (%s)", uuid); +} + +int +fmd_case_solved(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + return ((cp->ci_state >= FMD_CASE_SOLVED) ? FMD_B_TRUE : FMD_B_FALSE); +} + +void +fmd_case_add_ereport(fmd_hdl_t *hdl, fmd_case_t *cp, fmd_event_t *ep) +{ +} + +static void +zed_log_fault(nvlist_t *nvl, const char *uuid, const char *code) +{ + nvlist_t *rsrc; + char *strval; + uint64_t guid; + uint8_t byte; + + zed_log_msg(LOG_INFO, "\nzed_fault_event:"); + + if (uuid != NULL) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_UUID, uuid); + if (nvlist_lookup_string(nvl, FM_CLASS, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_CLASS, strval); + if (code != NULL) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_DIAG_CODE, code); + if (nvlist_lookup_uint8(nvl, FM_FAULT_CERTAINTY, &byte) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FAULT_CERTAINTY, byte); + if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) { + if (nvlist_lookup_string(rsrc, FM_FMRI_SCHEME, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_FMRI_SCHEME, + strval); + if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_POOL, &guid) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FMRI_ZFS_POOL, + guid); + if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_VDEV, &guid) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu \n", FM_FMRI_ZFS_VDEV, + guid); + } +} + +static const char * +fmd_fault_mkcode(nvlist_t *fault) +{ + char *class, *code = "-"; + + /* + * Note: 
message codes come from: openzfs/usr/src/cmd/fm/dicts/ZFS.po + */ + if (nvlist_lookup_string(fault, FM_CLASS, &class) == 0) { + if (strcmp(class, "fault.fs.zfs.vdev.io") == 0) + code = "ZFS-8000-FD"; + else if (strcmp(class, "fault.fs.zfs.vdev.checksum") == 0) + code = "ZFS-8000-GH"; + else if (strcmp(class, "fault.fs.zfs.io_failure_wait") == 0) + code = "ZFS-8000-HC"; + else if (strcmp(class, "fault.fs.zfs.io_failure_continue") == 0) + code = "ZFS-8000-JQ"; + else if (strcmp(class, "fault.fs.zfs.log_replay") == 0) + code = "ZFS-8000-K4"; + else if (strcmp(class, "fault.fs.zfs.pool") == 0) + code = "ZFS-8000-CS"; + else if (strcmp(class, "fault.fs.zfs.device") == 0) + code = "ZFS-8000-D3"; + + } + return (code); +} + +void +fmd_case_add_suspect(fmd_hdl_t *hdl, fmd_case_t *cp, nvlist_t *fault) +{ + nvlist_t *nvl; + const char *code = fmd_fault_mkcode(fault); + int64_t tod[2]; + int err = 0; + + /* + * payload derived from fmd_protocol_list() + */ + + (void) gettimeofday(&cp->ci_tv, NULL); + tod[0] = cp->ci_tv.tv_sec; + tod[1] = cp->ci_tv.tv_usec; + + nvl = fmd_nvl_alloc(hdl, FMD_SLEEP); + + err |= nvlist_add_uint8(nvl, FM_VERSION, FM_SUSPECT_VERSION); + err |= nvlist_add_string(nvl, FM_CLASS, FM_LIST_SUSPECT_CLASS); + err |= nvlist_add_string(nvl, FM_SUSPECT_UUID, cp->ci_uuid); + err |= nvlist_add_string(nvl, FM_SUSPECT_DIAG_CODE, code); + err |= nvlist_add_int64_array(nvl, FM_SUSPECT_DIAG_TIME, tod, 2); + err |= nvlist_add_uint32(nvl, FM_SUSPECT_FAULT_SZ, 1); + err |= nvlist_add_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, &fault, 1); + + if (err) + zed_log_die("failed to populate nvlist"); + + zed_log_fault(fault, cp->ci_uuid, code); + zfs_agent_post_event(FM_LIST_SUSPECT_CLASS, NULL, nvl); + + nvlist_free(nvl); + nvlist_free(fault); +} + +void +fmd_case_setspecific(fmd_hdl_t *hdl, fmd_case_t *cp, void *data) +{ + cp->ci_data = data; +} + +void * +fmd_case_getspecific(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + return (cp->ci_data); +} + +void +fmd_buf_create(fmd_hdl_t *hdl, 
fmd_case_t *cp, const char *name, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr == NULL); + assert(size < (1024 * 1024)); + + cp->ci_bufptr = fmd_hdl_alloc(hdl, size, FMD_SLEEP); + cp->ci_bufsiz = size; +} + +void +fmd_buf_read(fmd_hdl_t *hdl, fmd_case_t *cp, + const char *name, void *buf, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr != NULL); + assert(size <= cp->ci_bufsiz); + + bcopy(cp->ci_bufptr, buf, size); +} + +void +fmd_buf_write(fmd_hdl_t *hdl, fmd_case_t *cp, + const char *name, const void *buf, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr != NULL); + assert(cp->ci_bufsiz >= size); + + bcopy(buf, cp->ci_bufptr, size); +} + +/* SERD Engines */ + +void +fmd_serd_create(fmd_hdl_t *hdl, const char *name, uint_t n, hrtime_t t) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + if (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL) { + zed_log_msg(LOG_ERR, "failed to create SERD engine '%s': " + " name already exists", name); + return; + } + + (void) fmd_serd_eng_insert(&mp->mod_serds, name, n, t); +} + +void +fmd_serd_destroy(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + fmd_serd_eng_delete(&mp->mod_serds, name); + + fmd_hdl_debug(hdl, "serd_destroy %s", name); +} + +int +fmd_serd_exists(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL); +} + +void +fmd_serd_reset(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_serd_eng_t *sgp; + + if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { + zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name); + return; + } + + fmd_serd_eng_reset(sgp); + + fmd_hdl_debug(hdl, "serd_reset %s", name); +} + +int +fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_serd_eng_t *sgp; + int err; + + if 
((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { + zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'", + name); + return (FMD_B_FALSE); + } + err = fmd_serd_eng_record(sgp, ep->ev_hrt); + + return (err); +} + +/* FMD Timers */ + +static void +_timer_notify(union sigval sv) +{ + fmd_timer_t *ftp = sv.sival_ptr; + fmd_hdl_t *hdl = ftp->ft_hdl; + fmd_module_t *mp = (fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + struct itimerspec its; + + fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid); + + /* disarm the timer */ + bzero(&its, sizeof (struct itimerspec)); + timer_settime(ftp->ft_tid, 0, &its, NULL); + + /* Note that the fmdo_timeout can remove this timer */ + if (ops->fmdo_timeout != NULL) + ops->fmdo_timeout(hdl, ftp, ftp->ft_arg); +} + +/* + * Install a new timer which will fire at least delta nanoseconds after the + * current time. After the timeout has expired, the module's fmdo_timeout + * entry point is called. + */ +fmd_timer_t * +fmd_timer_install(fmd_hdl_t *hdl, void *arg, fmd_event_t *ep, hrtime_t delta) +{ + struct sigevent sev; + struct itimerspec its; + fmd_timer_t *ftp; + + ftp = fmd_hdl_alloc(hdl, sizeof (fmd_timer_t), FMD_SLEEP); + ftp->ft_arg = arg; + ftp->ft_hdl = hdl; + + its.it_value.tv_sec = delta / 1000000000; + its.it_value.tv_nsec = delta % 1000000000; + its.it_interval.tv_sec = its.it_value.tv_sec; + its.it_interval.tv_nsec = its.it_value.tv_nsec; + + sev.sigev_notify = SIGEV_THREAD; + sev.sigev_notify_function = _timer_notify; + sev.sigev_notify_attributes = NULL; + sev.sigev_value.sival_ptr = ftp; + + timer_create(CLOCK_REALTIME, &sev, &ftp->ft_tid); + timer_settime(ftp->ft_tid, 0, &its, NULL); + + fmd_hdl_debug(hdl, "installing timer for %d secs (%p)", + (int)its.it_value.tv_sec, ftp->ft_tid); + + return (ftp); +} + +void +fmd_timer_remove(fmd_hdl_t *hdl, fmd_timer_t *ftp) +{ + fmd_hdl_debug(hdl, "removing timer (%p)", ftp->ft_tid); + + timer_delete(ftp->ft_tid); + + 
fmd_hdl_free(hdl, ftp, sizeof (fmd_timer_t)); +} + +/* Name-Value Pair Lists */ + +nvlist_t * +fmd_nvl_create_fault(fmd_hdl_t *hdl, const char *class, uint8_t certainty, + nvlist_t *asru, nvlist_t *fru, nvlist_t *resource) +{ + nvlist_t *nvl; + int err = 0; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + zed_log_die("failed to xalloc fault nvlist"); + + err |= nvlist_add_uint8(nvl, FM_VERSION, FM_FAULT_VERSION); + err |= nvlist_add_string(nvl, FM_CLASS, class); + err |= nvlist_add_uint8(nvl, FM_FAULT_CERTAINTY, certainty); + + if (asru != NULL) + err |= nvlist_add_nvlist(nvl, FM_FAULT_ASRU, asru); + if (fru != NULL) + err |= nvlist_add_nvlist(nvl, FM_FAULT_FRU, fru); + if (resource != NULL) + err |= nvlist_add_nvlist(nvl, FM_FAULT_RESOURCE, resource); + + if (err) + zed_log_die("failed to populate nvlist: %s\n", strerror(err)); + + return (nvl); +} + +/* + * sourced from fmd_string.c + */ +static int +fmd_strmatch(const char *s, const char *p) +{ + char c; + + if (p == NULL) + return (0); + + if (s == NULL) + s = ""; /* treat NULL string as the empty string */ + + do { + if ((c = *p++) == '\0') + return (*s == '\0'); + + if (c == '*') { + while (*p == '*') + p++; /* consecutive *'s can be collapsed */ + + if (*p == '\0') + return (1); + + while (*s != '\0') { + if (fmd_strmatch(s++, p) != 0) + return (1); + } + + return (0); + } + } while (c == *s++); + + return (0); +} + +int +fmd_nvl_class_match(fmd_hdl_t *hdl, nvlist_t *nvl, const char *pattern) +{ + char *class; + + return (nvl != NULL && + nvlist_lookup_string(nvl, FM_CLASS, &class) == 0 && + fmd_strmatch(class, pattern)); +} + +nvlist_t * +fmd_nvl_alloc(fmd_hdl_t *hdl, int flags) +{ + nvlist_t *nvl = NULL; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + return (NULL); + + return (nvl); +} + + +/* + * ZED Agent specific APIs + */ + +fmd_hdl_t * +fmd_module_hdl(const char *name) +{ + if (strcmp(name, "zfs-retire") == 0) + return ((fmd_hdl_t *)&zfs_retire_module); + if (strcmp(name, "zfs-diagnosis") 
== 0) + return ((fmd_hdl_t *)&zfs_diagnosis_module); + + return (NULL); +} + +boolean_t +fmd_module_initialized(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (mp->mod_info != NULL); +} + +/* + * fmd_module_recv is called for each event that is received by + * the fault manager that has a class that matches one of the + * module's subscriptions. + */ +void +fmd_module_recv(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + fmd_event_t faux_event = {0}; + int64_t *tv; + uint_t n; + + /* + * Will need to normalized this if we persistently store the case data + */ + if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0) + faux_event.ev_hrt = tv[0] * NANOSEC + tv[1]; + else + faux_event.ev_hrt = 0; + + ops->fmdo_recv(hdl, &faux_event, nvl, class); + + mp->mod_stats.ms_accepted.fmds_value.ui64++; + + /* TBD - should we initiate fm_module_gc() periodically? */ +} diff --git a/cmd/zed/agents/fmd_api.h b/cmd/zed/agents/fmd_api.h new file mode 100644 index 000000000000..4f06fb244b7b --- /dev/null +++ b/cmd/zed/agents/fmd_api.h @@ -0,0 +1,246 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2016, Intel Corporation. + */ + +#ifndef _FMD_API_H +#define _FMD_API_H + +#include <sys/types.h> +#include <sys/time.h> +#include <time.h> +#include <libnvpair.h> +#include <stdarg.h> +#include <umem.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Fault Management Daemon Client Interfaces + */ + +#define FMD_API_VERSION 5 + +typedef struct fmd_hdl fmd_hdl_t; + +typedef struct fmd_timer { + timer_t ft_tid; + void *ft_arg; + fmd_hdl_t *ft_hdl; +} fmd_timer_t; + +#define id_t fmd_timer_t * + + +typedef struct fmd_event { + hrtime_t ev_hrt; /* event time used by SERD engines */ +} fmd_event_t; + +typedef struct fmd_case { + char ci_uuid[48]; /* uuid string for this case */ + fmd_hdl_t *ci_mod; /* module that owns this case */ + void *ci_data; /* data from fmd_case_setspecific() */ + ushort_t ci_state; /* case state (see below) */ + ushort_t ci_flags; /* case flags (see below) */ + struct timeval ci_tv; /* time of original diagnosis */ + void *ci_bufptr; /* case data serialization buffer */ + size_t ci_bufsiz; +} fmd_case_t; + + +#define FMD_B_FALSE 0 /* false value for booleans as int */ +#define FMD_B_TRUE 1 /* true value for booleans as int */ + + +#define FMD_CASE_UNSOLVED 0 /* case is not yet solved (waiting) */ +#define FMD_CASE_SOLVED 1 /* case is solved (suspects added) */ +#define FMD_CASE_CLOSE_WAIT 2 /* case is executing fmdo_close() */ +#define FMD_CASE_CLOSED 3 /* case is closed (reconfig done) */ +#define FMD_CASE_REPAIRED 4 /* case is repaired */ +#define FMD_CASE_RESOLVED 5 /* case is resolved (can be freed) */ + +#define FMD_CF_DIRTY 0x01 /* case is in need of checkpoint */ +#define FMD_CF_SOLVED 0x02 
/* case has been solved */
#define	FMD_CF_ISOLATED	0x04	/* case has been isolated */
#define	FMD_CF_REPAIRED	0x08	/* case has been repaired */
#define	FMD_CF_RESOLVED	0x10	/* case has been resolved */


/*
 * Value types shared by configuration properties (fmd_prop_t) and module
 * statistics (fmd_stat_t).  The comment on each constant is the C type used
 * to store a value of that kind.
 */
#define	FMD_TYPE_BOOL	0	/* int */
#define	FMD_TYPE_INT32	1	/* int32_t */
#define	FMD_TYPE_UINT32	2	/* uint32_t */
#define	FMD_TYPE_INT64	3	/* int64_t */
#define	FMD_TYPE_UINT64	4	/* uint64_t */
#define	FMD_TYPE_TIME	5	/* uint64_t */
#define	FMD_TYPE_SIZE	6	/* uint64_t */

/* Module configuration property: name, value type, and default (as string). */
typedef struct fmd_prop {
	const char *fmdp_name;		/* property name */
	uint_t fmdp_type;		/* property type (see above) */
	const char *fmdp_defv;		/* default value */
} fmd_prop_t;

/* Named module statistic; fmds_value is interpreted according to fmds_type. */
typedef struct fmd_stat {
	char fmds_name[32];		/* statistic name */
	uint_t fmds_type;		/* statistic type (see above) */
	char fmds_desc[64];		/* statistic description */
	union {
		int bool;		/* FMD_TYPE_BOOL */
		int32_t i32;		/* FMD_TYPE_INT32 */
		uint32_t ui32;		/* FMD_TYPE_UINT32 */
		int64_t i64;		/* FMD_TYPE_INT64 */
		uint64_t ui64;		/* FMD_TYPE_UINT64 */
	} fmds_value;
} fmd_stat_t;

/*
 * Client entry points invoked by the framework: event receipt, timer expiry,
 * case close, statistics snapshot, and garbage collection.
 */
typedef struct fmd_hdl_ops {
	void (*fmdo_recv)(fmd_hdl_t *, fmd_event_t *, nvlist_t *, const char *);
	void (*fmdo_timeout)(fmd_hdl_t *, id_t, void *);
	void (*fmdo_close)(fmd_hdl_t *, fmd_case_t *);
	void (*fmdo_stats)(fmd_hdl_t *);
	void (*fmdo_gc)(fmd_hdl_t *);
} fmd_hdl_ops_t;

/* Return codes for a module's fmdo_send entry point. */
#define	FMD_SEND_SUCCESS	0	/* fmdo_send queued event */
#define	FMD_SEND_FAILED		1	/* fmdo_send unrecoverable error */
#define	FMD_SEND_RETRY		2	/* fmdo_send requests retry */

/* Module registration record passed to fmd_hdl_register(). */
typedef struct fmd_hdl_info {
	const char *fmdi_desc;		/* fmd client description string */
	const char *fmdi_vers;		/* fmd client version string */
	const fmd_hdl_ops_t *fmdi_ops;	/* ops vector for client */
	const fmd_prop_t *fmdi_props;	/* array of configuration props */
} fmd_hdl_info_t;

extern int fmd_hdl_register(fmd_hdl_t *, int, const fmd_hdl_info_t *);
extern void fmd_hdl_unregister(fmd_hdl_t *);

/* Per-module private data, stored on the handle. */
extern void fmd_hdl_setspecific(fmd_hdl_t *, void *);
extern void *fmd_hdl_getspecific(fmd_hdl_t *);

#define	FMD_SLEEP	UMEM_NOFAIL

/* Module-scoped memory allocation. */
extern void *fmd_hdl_alloc(fmd_hdl_t *, size_t, int);
extern void *fmd_hdl_zalloc(fmd_hdl_t *, size_t, int);
extern void fmd_hdl_free(fmd_hdl_t *, void *, size_t);

extern char *fmd_hdl_strdup(fmd_hdl_t *, const char *, int);
extern void fmd_hdl_strfree(fmd_hdl_t *, char *);

extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list);
extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...);

/* Configuration property lookup (see fmd_prop_t above). */
extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *);
extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *);

#define	FMD_STAT_NOALLOC	0x0	/* fmd should use caller's memory */
#define	FMD_STAT_ALLOC		0x1	/* fmd should allocate stats memory */

extern fmd_stat_t *fmd_stat_create(fmd_hdl_t *, uint_t, uint_t, fmd_stat_t *);
extern void fmd_stat_destroy(fmd_hdl_t *, uint_t, fmd_stat_t *);
extern void fmd_stat_setstr(fmd_hdl_t *, fmd_stat_t *, const char *);

/* Case lifecycle: open -> (solve | close), with optional reset. */
extern fmd_case_t *fmd_case_open(fmd_hdl_t *, void *);
extern void fmd_case_reset(fmd_hdl_t *, fmd_case_t *);
extern void fmd_case_solve(fmd_hdl_t *, fmd_case_t *);
extern void fmd_case_close(fmd_hdl_t *, fmd_case_t *);

/* Case lookup and state transitions keyed by UUID string. */
extern const char *fmd_case_uuid(fmd_hdl_t *, fmd_case_t *);
extern fmd_case_t *fmd_case_uulookup(fmd_hdl_t *, const char *);
extern void fmd_case_uuclose(fmd_hdl_t *, const char *);
extern int fmd_case_uuclosed(fmd_hdl_t *, const char *);
extern int fmd_case_uuisresolved(fmd_hdl_t *, const char *);
extern void fmd_case_uuresolved(fmd_hdl_t *, const char *);

extern int fmd_case_solved(fmd_hdl_t *, fmd_case_t *);
extern int fmd_case_closed(fmd_hdl_t *, fmd_case_t *);

/* Attach ereports, SERD engines, and suspect faults to a case. */
extern void fmd_case_add_ereport(fmd_hdl_t *, fmd_case_t *, fmd_event_t *);
extern void fmd_case_add_serd(fmd_hdl_t *, fmd_case_t *, const char *);
extern void fmd_case_add_suspect(fmd_hdl_t *, fmd_case_t *, nvlist_t *);

/* Per-case private data. */
extern void fmd_case_setspecific(fmd_hdl_t *, fmd_case_t *, void *);
extern void *fmd_case_getspecific(fmd_hdl_t *, fmd_case_t *);

extern fmd_case_t *fmd_case_next(fmd_hdl_t *, fmd_case_t *);
extern fmd_case_t *fmd_case_prev(fmd_hdl_t *, fmd_case_t *);

/* Named persistent buffers attached to a case. */
extern void fmd_buf_create(fmd_hdl_t *, fmd_case_t *, const char *, size_t);
extern void fmd_buf_destroy(fmd_hdl_t *, fmd_case_t *, const char *);
extern void fmd_buf_read(fmd_hdl_t *, fmd_case_t *,
    const char *, void *, size_t);
extern void fmd_buf_write(fmd_hdl_t *, fmd_case_t *,
    const char *, const void *, size_t);
extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *);

/* SERD (Soft Error Rate Discrimination) engines, addressed by name. */
extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t);
extern void fmd_serd_destroy(fmd_hdl_t *, const char *);
extern int fmd_serd_exists(fmd_hdl_t *, const char *);
extern void fmd_serd_reset(fmd_hdl_t *, const char *);
extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *);
extern int fmd_serd_fired(fmd_hdl_t *, const char *);
extern int fmd_serd_empty(fmd_hdl_t *, const char *);

extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t);
extern void fmd_timer_remove(fmd_hdl_t *, id_t);

extern nvlist_t *fmd_nvl_create_fault(fmd_hdl_t *,
    const char *, uint8_t, nvlist_t *, nvlist_t *, nvlist_t *);

extern int fmd_nvl_class_match(fmd_hdl_t *, nvlist_t *, const char *);

#define	FMD_HAS_FAULT_FRU	0
#define	FMD_HAS_FAULT_ASRU	1
#define	FMD_HAS_FAULT_RESOURCE	2

extern void fmd_repair_fru(fmd_hdl_t *, const char *);
extern int fmd_repair_asru(fmd_hdl_t *, const char *);

extern nvlist_t *fmd_nvl_alloc(fmd_hdl_t *, int);
extern nvlist_t *fmd_nvl_dup(fmd_hdl_t *, nvlist_t *, int);

/*
 * ZED Specific Interfaces
 */

extern fmd_hdl_t *fmd_module_hdl(const char *);
extern boolean_t fmd_module_initialized(fmd_hdl_t *);
extern void fmd_module_recv(fmd_hdl_t *, nvlist_t *, const char *);

/* ZFS FMA Retire Agent */
extern void _zfs_retire_init(fmd_hdl_t *);
extern void
_zfs_retire_fini(fmd_hdl_t *); + +/* ZFS FMA Diagnosis Engine */ +extern void _zfs_diagnosis_init(fmd_hdl_t *); +extern void _zfs_diagnosis_fini(fmd_hdl_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _FMD_API_H */ diff --git a/cmd/zed/agents/fmd_serd.c b/cmd/zed/agents/fmd_serd.c new file mode 100644 index 000000000000..d4ec37fb7691 --- /dev/null +++ b/cmd/zed/agents/fmd_serd.c @@ -0,0 +1,316 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, Intel Corporation. + */ + +#include <assert.h> +#include <stddef.h> +#include <stdlib.h> +#include <strings.h> +#include <sys/list.h> +#include <sys/time.h> + +#include "fmd_api.h" +#include "fmd_serd.h" +#include "../zed_log.h" + + +#define FMD_STR_BUCKETS 211 + + +#ifdef SERD_ENG_DEBUG +#define serd_log_msg(fmt, ...) \ + zed_log_msg(LOG_INFO, fmt, __VA_ARGS__) +#else +#define serd_log_msg(fmt, ...) +#endif + + +/* + * SERD Engine Backend + */ + +/* + * Compute the delta between events in nanoseconds. 
To account for very old + * events which are replayed, we must handle the case where time is negative. + * We convert the hrtime_t's to unsigned 64-bit integers and then handle the + * case where 'old' is greater than 'new' (i.e. high-res time has wrapped). + */ +static hrtime_t +fmd_event_delta(hrtime_t t1, hrtime_t t2) +{ + uint64_t old = t1; + uint64_t new = t2; + + return (new >= old ? new - old : (UINT64_MAX - old) + new + 1); +} + +static fmd_serd_eng_t * +fmd_serd_eng_alloc(const char *name, uint64_t n, hrtime_t t) +{ + fmd_serd_eng_t *sgp; + + sgp = malloc(sizeof (fmd_serd_eng_t)); + bzero(sgp, sizeof (fmd_serd_eng_t)); + + sgp->sg_name = strdup(name); + sgp->sg_flags = FMD_SERD_DIRTY; + sgp->sg_n = n; + sgp->sg_t = t; + + list_create(&sgp->sg_list, sizeof (fmd_serd_elem_t), + offsetof(fmd_serd_elem_t, se_list)); + + return (sgp); +} + +static void +fmd_serd_eng_free(fmd_serd_eng_t *sgp) +{ + fmd_serd_eng_reset(sgp); + free(sgp->sg_name); + list_destroy(&sgp->sg_list); + free(sgp); +} + +/* + * sourced from fmd_string.c + */ +static ulong_t +fmd_strhash(const char *key) +{ + ulong_t g, h = 0; + const char *p; + + for (p = key; *p != '\0'; p++) { + h = (h << 4) + *p; + + if ((g = (h & 0xf0000000)) != 0) { + h ^= (g >> 24); + h ^= g; + } + } + + return (h); +} + +void +fmd_serd_hash_create(fmd_serd_hash_t *shp) +{ + shp->sh_hashlen = FMD_STR_BUCKETS; + shp->sh_hash = calloc(shp->sh_hashlen, sizeof (void *)); + shp->sh_count = 0; +} + +void +fmd_serd_hash_destroy(fmd_serd_hash_t *shp) +{ + fmd_serd_eng_t *sgp, *ngp; + uint_t i; + + for (i = 0; i < shp->sh_hashlen; i++) { + for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = ngp) { + ngp = sgp->sg_next; + fmd_serd_eng_free(sgp); + } + } + + free(shp->sh_hash); + bzero(shp, sizeof (fmd_serd_hash_t)); +} + +void +fmd_serd_hash_apply(fmd_serd_hash_t *shp, fmd_serd_eng_f *func, void *arg) +{ + fmd_serd_eng_t *sgp; + uint_t i; + + for (i = 0; i < shp->sh_hashlen; i++) { + for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = 
sgp->sg_next) + func(sgp, arg); + } +} + +fmd_serd_eng_t * +fmd_serd_eng_insert(fmd_serd_hash_t *shp, const char *name, + uint_t n, hrtime_t t) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp = fmd_serd_eng_alloc(name, n, t); + + serd_log_msg(" SERD Engine: inserting %s N %d T %llu", + name, (int)n, (long long unsigned)t); + + sgp->sg_next = shp->sh_hash[h]; + shp->sh_hash[h] = sgp; + shp->sh_count++; + + return (sgp); +} + +fmd_serd_eng_t * +fmd_serd_eng_lookup(fmd_serd_hash_t *shp, const char *name) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp; + + for (sgp = shp->sh_hash[h]; sgp != NULL; sgp = sgp->sg_next) { + if (strcmp(name, sgp->sg_name) == 0) + return (sgp); + } + + return (NULL); +} + +void +fmd_serd_eng_delete(fmd_serd_hash_t *shp, const char *name) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp, **pp = &shp->sh_hash[h]; + + serd_log_msg(" SERD Engine: deleting %s", name); + + for (sgp = *pp; sgp != NULL; sgp = sgp->sg_next) { + if (strcmp(sgp->sg_name, name) != 0) + pp = &sgp->sg_next; + else + break; + } + + if (sgp != NULL) { + *pp = sgp->sg_next; + fmd_serd_eng_free(sgp); + assert(shp->sh_count != 0); + shp->sh_count--; + } +} + +static void +fmd_serd_eng_discard(fmd_serd_eng_t *sgp, fmd_serd_elem_t *sep) +{ + list_remove(&sgp->sg_list, sep); + sgp->sg_count--; + + serd_log_msg(" SERD Engine: discarding %s, %d remaining", + sgp->sg_name, (int)sgp->sg_count); + + free(sep); +} + +int +fmd_serd_eng_record(fmd_serd_eng_t *sgp, hrtime_t hrt) +{ + fmd_serd_elem_t *sep, *oep; + + /* + * If the fired flag is already set, return false and discard the + * event. This means that the caller will only see the engine "fire" + * once until fmd_serd_eng_reset() is called. The fmd_serd_eng_fired() + * function can also be used in combination with fmd_serd_eng_record(). 
+ */ + if (sgp->sg_flags & FMD_SERD_FIRED) { + serd_log_msg(" SERD Engine: record %s already fired!", + sgp->sg_name); + return (FMD_B_FALSE); + } + + while (sgp->sg_count >= sgp->sg_n) + fmd_serd_eng_discard(sgp, list_tail(&sgp->sg_list)); + + sep = malloc(sizeof (fmd_serd_elem_t)); + sep->se_hrt = hrt; + + list_insert_head(&sgp->sg_list, sep); + sgp->sg_count++; + + serd_log_msg(" SERD Engine: recording %s of %d (%llu)", + sgp->sg_name, (int)sgp->sg_count, (long long unsigned)hrt); + + /* + * Pick up the oldest element pointer for comparison to 'sep'. We must + * do this after adding 'sep' because 'oep' and 'sep' can be the same. + */ + oep = list_tail(&sgp->sg_list); + + if (sgp->sg_count >= sgp->sg_n && + fmd_event_delta(oep->se_hrt, sep->se_hrt) <= sgp->sg_t) { + sgp->sg_flags |= FMD_SERD_FIRED | FMD_SERD_DIRTY; + serd_log_msg(" SERD Engine: fired %s", sgp->sg_name); + return (FMD_B_TRUE); + } + + sgp->sg_flags |= FMD_SERD_DIRTY; + return (FMD_B_FALSE); +} + +int +fmd_serd_eng_fired(fmd_serd_eng_t *sgp) +{ + return (sgp->sg_flags & FMD_SERD_FIRED); +} + +int +fmd_serd_eng_empty(fmd_serd_eng_t *sgp) +{ + return (sgp->sg_count == 0); +} + +void +fmd_serd_eng_reset(fmd_serd_eng_t *sgp) +{ + serd_log_msg(" SERD Engine: resetting %s", sgp->sg_name); + + while (sgp->sg_count != 0) + fmd_serd_eng_discard(sgp, list_head(&sgp->sg_list)); + + sgp->sg_flags &= ~FMD_SERD_FIRED; + sgp->sg_flags |= FMD_SERD_DIRTY; +} + +void +fmd_serd_eng_gc(fmd_serd_eng_t *sgp) +{ + fmd_serd_elem_t *sep, *nep; + hrtime_t hrt; + + if (sgp->sg_count == 0 || (sgp->sg_flags & FMD_SERD_FIRED)) + return; /* no garbage collection needed if empty or fired */ + + sep = list_head(&sgp->sg_list); + if (sep == NULL) + return; + + hrt = sep->se_hrt - sgp->sg_t; + + for (sep = list_head(&sgp->sg_list); sep != NULL; sep = nep) { + if (sep->se_hrt >= hrt) + break; /* sep and subsequent events are all within T */ + + nep = list_next(&sgp->sg_list, sep); + fmd_serd_eng_discard(sgp, sep); + sgp->sg_flags |= 
FMD_SERD_DIRTY; + } +} diff --git a/cmd/zed/agents/fmd_serd.h b/cmd/zed/agents/fmd_serd.h new file mode 100644 index 000000000000..c35c9acc7785 --- /dev/null +++ b/cmd/zed/agents/fmd_serd.h @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, Intel Corporation. 
 */

#ifndef	_FMD_SERD_H
#define	_FMD_SERD_H

#ifdef	__cplusplus
extern "C" {
#endif

#include <sys/list.h>
#include <sys/time.h>

/* One recorded event inside a SERD engine's sliding time window. */
typedef struct fmd_serd_elem {
	list_node_t	se_list;	/* linked list forward/back pointers */
	hrtime_t	se_hrt;		/* upper bound on event hrtime */
} fmd_serd_elem_t;

/*
 * A SERD engine "fires" once N or more events are observed within a rolling
 * window of T nanoseconds (see fmd_serd_eng_record() in fmd_serd.c).
 */
typedef struct fmd_serd_eng {
	char		*sg_name;	/* string name for this engine */
	struct fmd_serd_eng *sg_next;	/* next engine on hash chain */
	list_t		sg_list;	/* list of fmd_serd_elem_t's */
	uint_t		sg_count;	/* count of events in sg_list */
	uint_t		sg_flags;	/* engine flags (see below) */
	uint_t		sg_n;		/* engine N parameter (event count) */
	hrtime_t	sg_t;		/* engine T parameter (nanoseconds) */
} fmd_serd_eng_t;

#define	FMD_SERD_FIRED	0x1	/* error rate has exceeded threshold */
#define	FMD_SERD_DIRTY	0x2	/* engine needs to be checkpointed */

/* Callback type for fmd_serd_hash_apply(). */
typedef void fmd_serd_eng_f(fmd_serd_eng_t *, void *);

/* Chained hash table of engines, keyed by engine name. */
typedef struct fmd_serd_hash {
	fmd_serd_eng_t	**sh_hash;	/* hash bucket array for buffers */
	uint_t		sh_hashlen;	/* length of hash bucket array */
	uint_t		sh_count;	/* count of engines in hash */
} fmd_serd_hash_t;

extern void fmd_serd_hash_create(fmd_serd_hash_t *);
extern void fmd_serd_hash_destroy(fmd_serd_hash_t *);
extern void fmd_serd_hash_apply(fmd_serd_hash_t *, fmd_serd_eng_f *, void *);

/*
 * NOTE(review): the N parameter is declared uint32_t here but the
 * definition in fmd_serd.c uses uint_t -- identical width on supported
 * platforms, but the prototype should be reconciled with the definition.
 */
extern fmd_serd_eng_t *fmd_serd_eng_insert(fmd_serd_hash_t *,
    const char *, uint32_t, hrtime_t);

extern fmd_serd_eng_t *fmd_serd_eng_lookup(fmd_serd_hash_t *, const char *);
extern void fmd_serd_eng_delete(fmd_serd_hash_t *, const char *);

extern int fmd_serd_eng_record(fmd_serd_eng_t *, hrtime_t);
extern int fmd_serd_eng_fired(fmd_serd_eng_t *);
extern int fmd_serd_eng_empty(fmd_serd_eng_t *);

extern void fmd_serd_eng_reset(fmd_serd_eng_t *);
extern void fmd_serd_eng_gc(fmd_serd_eng_t *);

#ifdef	__cplusplus
}
#endif

#endif	/* _FMD_SERD_H */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
 * You can obtain a copy of the license from the top-level file
 * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
 * You may not use this file except in compliance with the license.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016, Intel Corporation.
 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
 */

#include <libnvpair.h>
#include <libzfs.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <sys/list.h>
#include <sys/time.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/dev.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>
#include <pthread.h>
#include <unistd.h>

#include "zfs_agents.h"
#include "fmd_api.h"
#include "../zed_log.h"

/*
 * agent dispatch code
 *
 * Events posted via zfs_agent_post_event() are queued on agent_events and
 * consumed by a single dispatcher thread; agent_lock protects the list and
 * agent_exiting, and agent_cond signals arrival of new work.
 */

static pthread_mutex_t agent_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t agent_cond = PTHREAD_COND_INITIALIZER;
static list_t agent_events; /* list of pending events */
static int agent_exiting;   /* set by zfs_agent_fini() to stop the consumer */

/* One queued event: class/subclass strings plus a private copy of the nvl. */
typedef struct agent_event {
	char ae_class[64];
	char ae_subclass[32];
	nvlist_t *ae_nvl;
	list_node_t ae_node;
} agent_event_t;

pthread_t g_agents_tid;		/* consumer thread id, set in zfs_agent_init */

libzfs_handle_t *g_zfs_hdl;	/* shared libzfs handle, set in zfs_agent_init */

/* guid search data */
typedef enum device_type {
	DEVICE_TYPE_L2ARC,	/* l2arc device */
	DEVICE_TYPE_SPARE,	/* spare device */
	DEVICE_TYPE_PRIMARY	/* any primary pool storage device */
} device_type_t;

/* Search context used when walking pools/vdevs to map a devid to GUIDs. */
typedef struct guid_search {
	uint64_t gs_pool_guid;
	uint64_t gs_vdev_guid;
	char *gs_devid;
	device_type_t gs_vdev_type;
	uint64_t gs_vdev_expandtime;	/* vdev expansion time */
} guid_search_t;

/*
 * Walks the vdev tree recursively looking for a matching devid.
+ * Returns B_TRUE as soon as a matching device is found, B_FALSE otherwise. + */ +static boolean_t +zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg) +{ + guid_search_t *gsp = arg; + char *path = NULL; + uint_t c, children; + nvlist_t **child; + + /* + * First iterate over any children. + */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if (zfs_agent_iter_vdev(zhp, child[c], gsp)) { + gsp->gs_vdev_type = DEVICE_TYPE_PRIMARY; + return (B_TRUE); + } + } + } + /* + * Iterate over any spares and cache devices + */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if (zfs_agent_iter_vdev(zhp, child[c], gsp)) { + gsp->gs_vdev_type = DEVICE_TYPE_L2ARC; + return (B_TRUE); + } + } + } + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if (zfs_agent_iter_vdev(zhp, child[c], gsp)) { + gsp->gs_vdev_type = DEVICE_TYPE_SPARE; + return (B_TRUE); + } + } + } + /* + * On a devid match, grab the vdev guid and expansion time, if any. 
+ */ + if (gsp->gs_devid != NULL && + (nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) && + (strcmp(gsp->gs_devid, path) == 0)) { + (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, + &gsp->gs_vdev_guid); + (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME, + &gsp->gs_vdev_expandtime); + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg) +{ + guid_search_t *gsp = arg; + nvlist_t *config, *nvl; + + /* + * For each vdev in this pool, look for a match by devid + */ + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvl) == 0) { + (void) zfs_agent_iter_vdev(zhp, nvl, gsp); + } + } + /* + * if a match was found then grab the pool guid + */ + if (gsp->gs_vdev_guid) { + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &gsp->gs_pool_guid); + } + + zpool_close(zhp); + return (gsp->gs_vdev_guid != 0); +} + +void +zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + agent_event_t *event; + + if (subclass == NULL) + subclass = ""; + + event = malloc(sizeof (agent_event_t)); + if (event == NULL || nvlist_dup(nvl, &event->ae_nvl, 0) != 0) { + if (event) + free(event); + return; + } + + if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) { + class = EC_ZFS; + subclass = ESC_ZFS_VDEV_CHECK; + } + + /* + * On ZFS on Linux, we don't get the expected FM_RESOURCE_REMOVED + * ereport from vdev_disk layer after a hot unplug. Fortunately we + * get a EC_DEV_REMOVE from our disk monitor and it is a suitable + * proxy so we remap it here for the benefit of the diagnosis engine. 
+ */ + if ((strcmp(class, EC_DEV_REMOVE) == 0) && + (strcmp(subclass, ESC_DISK) == 0) && + (nvlist_exists(nvl, ZFS_EV_VDEV_GUID) || + nvlist_exists(nvl, DEV_IDENTIFIER))) { + nvlist_t *payload = event->ae_nvl; + struct timeval tv; + int64_t tod[2]; + uint64_t pool_guid = 0, vdev_guid = 0; + guid_search_t search = { 0 }; + device_type_t devtype = DEVICE_TYPE_PRIMARY; + + class = "resource.fs.zfs.removed"; + subclass = ""; + + (void) nvlist_add_string(payload, FM_CLASS, class); + (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid); + (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid); + + (void) gettimeofday(&tv, NULL); + tod[0] = tv.tv_sec; + tod[1] = tv.tv_usec; + (void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2); + + /* + * For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or + * ZFS_EV_POOL_GUID may be missing so find them. + */ + (void) nvlist_lookup_string(nvl, DEV_IDENTIFIER, + &search.gs_devid); + (void) zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search); + pool_guid = search.gs_pool_guid; + vdev_guid = search.gs_vdev_guid; + devtype = search.gs_vdev_type; + + /* + * We want to avoid reporting "remove" events coming from + * libudev for VDEVs which were expanded recently (10s) and + * avoid activating spares in response to partitions being + * deleted and created in rapid succession. 
+ */ + if (search.gs_vdev_expandtime != 0 && + search.gs_vdev_expandtime + 10 > tv.tv_sec) { + zed_log_msg(LOG_INFO, "agent post event: ignoring '%s' " + "for recently expanded device '%s'", EC_DEV_REMOVE, + search.gs_devid); + goto out; + } + + (void) nvlist_add_uint64(payload, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, pool_guid); + (void) nvlist_add_uint64(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vdev_guid); + switch (devtype) { + case DEVICE_TYPE_L2ARC: + (void) nvlist_add_string(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + VDEV_TYPE_L2CACHE); + break; + case DEVICE_TYPE_SPARE: + (void) nvlist_add_string(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_SPARE); + break; + case DEVICE_TYPE_PRIMARY: + (void) nvlist_add_string(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_DISK); + break; + } + + zed_log_msg(LOG_INFO, "agent post event: mapping '%s' to '%s'", + EC_DEV_REMOVE, class); + } + + (void) strlcpy(event->ae_class, class, sizeof (event->ae_class)); + (void) strlcpy(event->ae_subclass, subclass, + sizeof (event->ae_subclass)); + + (void) pthread_mutex_lock(&agent_lock); + list_insert_tail(&agent_events, event); + (void) pthread_mutex_unlock(&agent_lock); + +out: + (void) pthread_cond_signal(&agent_cond); +} + +static void +zfs_agent_dispatch(const char *class, const char *subclass, nvlist_t *nvl) +{ + /* + * The diagnosis engine subscribes to the following events. + * On illumos these subscriptions reside in: + * /usr/lib/fm/fmd/plugins/zfs-diagnosis.conf + */ + if (strstr(class, "ereport.fs.zfs.") != NULL || + strstr(class, "resource.fs.zfs.") != NULL || + strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0 || + strcmp(class, "sysevent.fs.zfs.vdev_remove_dev") == 0 || + strcmp(class, "sysevent.fs.zfs.pool_destroy") == 0) { + fmd_module_recv(fmd_module_hdl("zfs-diagnosis"), nvl, class); + } + + /* + * The retire agent subscribes to the following events. 
+ * On illumos these subscriptions reside in: + * /usr/lib/fm/fmd/plugins/zfs-retire.conf + * + * NOTE: faults events come directly from our diagnosis engine + * and will not pass through the zfs kernel module. + */ + if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 || + strcmp(class, "resource.fs.zfs.removed") == 0 || + strcmp(class, "resource.fs.zfs.statechange") == 0 || + strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) { + fmd_module_recv(fmd_module_hdl("zfs-retire"), nvl, class); + } + + /* + * The SLM module only consumes disk events and vdev check events + * + * NOTE: disk events come directly from disk monitor and will + * not pass through the zfs kernel module. + */ + if (strstr(class, "EC_dev_") != NULL || + strcmp(class, EC_ZFS) == 0) { + (void) zfs_slm_event(class, subclass, nvl); + } +} + +/* + * Events are consumed and dispatched from this thread + * An agent can also post an event so event list lock + * is not held when calling an agent. + * One event is consumed at a time. 
 */
static void *
zfs_agent_consumer_thread(void *arg)
{
	for (;;) {
		agent_event_t *event;

		(void) pthread_mutex_lock(&agent_lock);

		/* wait for an event to show up */
		while (!agent_exiting && list_is_empty(&agent_events))
			(void) pthread_cond_wait(&agent_cond, &agent_lock);

		if (agent_exiting) {
			(void) pthread_mutex_unlock(&agent_lock);
			zed_log_msg(LOG_INFO, "zfs_agent_consumer_thread: "
			    "exiting");
			return (NULL);
		}

		if ((event = (list_head(&agent_events))) != NULL) {
			list_remove(&agent_events, event);

			/*
			 * Drop the lock before dispatching so agents may
			 * post follow-up events without deadlocking (see
			 * the function comment above).
			 */
			(void) pthread_mutex_unlock(&agent_lock);

			/* dispatch to all event subscribers */
			zfs_agent_dispatch(event->ae_class, event->ae_subclass,
			    event->ae_nvl);

			nvlist_free(event->ae_nvl);
			free(event);
			continue;
		}

		(void) pthread_mutex_unlock(&agent_lock);
	}

	return (NULL);
}

/*
 * Start the agent subsystem: the SLM module, the diagnosis and retire fmd
 * modules, the pending-event list, and the consumer thread.  Terminates
 * the daemon via zed_log_die() on any initialization failure.
 */
void
zfs_agent_init(libzfs_handle_t *zfs_hdl)
{
	fmd_hdl_t *hdl;

	g_zfs_hdl = zfs_hdl;

	if (zfs_slm_init() != 0)
		zed_log_die("Failed to initialize zfs slm");
	zed_log_msg(LOG_INFO, "Add Agent: init");

	hdl = fmd_module_hdl("zfs-diagnosis");
	_zfs_diagnosis_init(hdl);
	if (!fmd_module_initialized(hdl))
		zed_log_die("Failed to initialize zfs diagnosis");

	hdl = fmd_module_hdl("zfs-retire");
	_zfs_retire_init(hdl);
	if (!fmd_module_initialized(hdl))
		zed_log_die("Failed to initialize zfs retire");

	list_create(&agent_events, sizeof (agent_event_t),
	    offsetof(struct agent_event, ae_node));

	if (pthread_create(&g_agents_tid, NULL, zfs_agent_consumer_thread,
	    NULL) != 0) {
		list_destroy(&agent_events);
		zed_log_die("Failed to initialize agents");
	}
}

/*
 * Tear down the agent subsystem: stop the consumer thread, drain any
 * undelivered events, and unregister the fmd modules.
 */
void
zfs_agent_fini(void)
{
	fmd_hdl_t *hdl;
	agent_event_t *event;

	/*
	 * NOTE(review): agent_exiting is stored and agent_cond signaled
	 * without holding agent_lock; if the consumer checks its predicate
	 * just before this store, the wakeup could in principle be lost --
	 * confirm whether the signal should be issued under the mutex.
	 */
	agent_exiting = 1;
	(void) pthread_cond_signal(&agent_cond);

	/* wait for the event consumer thread to complete */
	(void) pthread_join(g_agents_tid, NULL);

	/* drain any pending events */
	while ((event = (list_head(&agent_events))) != NULL) {
		list_remove(&agent_events, event);
		nvlist_free(event->ae_nvl);
		free(event);
	}

	list_destroy(&agent_events);

	if ((hdl = fmd_module_hdl("zfs-retire")) != NULL) {
		_zfs_retire_fini(hdl);
		fmd_hdl_unregister(hdl);
	}
	if ((hdl = fmd_module_hdl("zfs-diagnosis")) != NULL) {
		_zfs_diagnosis_fini(hdl);
		fmd_hdl_unregister(hdl);
	}

	zed_log_msg(LOG_INFO, "Add Agent: fini");
	zfs_slm_fini();

	g_zfs_hdl = NULL;
}

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License Version 1.0 (CDDL-1.0).
 * You can obtain a copy of the license from the top-level file
 * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
 * You may not use this file except in compliance with the license.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016, Intel Corporation.
+ */ + +#ifndef ZFS_AGENTS_H +#define ZFS_AGENTS_H + +#include <libzfs.h> +#include <libnvpair.h> + + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Agent abstraction presented to ZED + */ +extern void zfs_agent_init(libzfs_handle_t *); +extern void zfs_agent_fini(void); +extern void zfs_agent_post_event(const char *, const char *, nvlist_t *); + +/* + * ZFS Sysevent Linkable Module (SLM) + */ +extern int zfs_slm_init(void); +extern void zfs_slm_fini(void); +extern void zfs_slm_event(const char *, const char *, nvlist_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* !ZFS_AGENTS_H */ diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c new file mode 100644 index 000000000000..0b27f6702ee8 --- /dev/null +++ b/cmd/zed/agents/zfs_diagnosis.c @@ -0,0 +1,981 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2016, Intel Corporation. 
 */

#include <stddef.h>
#include <string.h>
#include <strings.h>
#include <libuutil.h>
#include <libzfs.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>

#include "zfs_agents.h"
#include "fmd_api.h"

/*
 * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'.  This
 * #define reserves enough space for two 64-bit hex values plus the length of
 * the longest string.
 */
#define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum"))

/*
 * On-disk case structure.  This must maintain backwards compatibility with
 * previous versions of the DE.  By default, any members appended to the end
 * will be filled with zeros if they don't exist in a previous version.
 */
typedef struct zfs_case_data {
	uint64_t	zc_version;
	uint64_t	zc_ena;
	uint64_t	zc_pool_guid;
	uint64_t	zc_vdev_guid;
	int		zc_pool_state;
	char		zc_serd_checksum[MAX_SERDLEN];
	char		zc_serd_io[MAX_SERDLEN];
	int		zc_has_remove_timer;
} zfs_case_data_t;

/*
 * Time-of-day
 */
typedef struct er_timeval {
	uint64_t	ertv_sec;
	uint64_t	ertv_nsec;
} er_timeval_t;

/*
 * In-core case structure.
 */
typedef struct zfs_case {
	boolean_t	zc_present;	/* pool/vdev still exists; see zfs_purge_cases() */
	uint32_t	zc_version;
	zfs_case_data_t	zc_data;	/* persisted portion of the case */
	fmd_case_t	*zc_case;
	uu_list_node_t	zc_node;	/* linkage on the global zfs_cases list */
	id_t		zc_remove_timer;
	char		*zc_fru;
	er_timeval_t	zc_when;	/* load time of the owning pool */
} zfs_case_t;

#define	CASE_DATA			"data"
#define	CASE_FRU			"fru"
#define	CASE_DATA_VERSION_INITIAL	1
#define	CASE_DATA_VERSION_SERD		2

/* Diagnosis-engine drop counters, exposed through the fmd stats mechanism. */
typedef struct zfs_de_stats {
	fmd_stat_t	old_drops;
	fmd_stat_t	dev_drops;
	fmd_stat_t	vdev_drops;
	fmd_stat_t	import_drops;
	fmd_stat_t	resource_drops;
} zfs_de_stats_t;

zfs_de_stats_t zfs_stats = {
	{ "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" },
	{ "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)"},
	{ "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)"},
	{ "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" },
	{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
};

static hrtime_t zfs_remove_timeout;

uu_list_pool_t *zfs_case_pool;	/* allocator for case list nodes */
uu_list_t *zfs_cases;		/* global list of active cases */

#define	ZFS_MAKE_RSRC(type)	\
	FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type
#define	ZFS_MAKE_EREPORT(type)	\
	FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type

/*
 * Write out the persistent representation of an active case.
 */
static void
zfs_case_serialize(fmd_hdl_t *hdl, zfs_case_t *zcp)
{
	/*
	 * NOTE(review): only the version stamp is updated here -- no
	 * fmd_buf_write() occurs, presumably because ZED does not
	 * checkpoint case buffers.  Confirm before relying on case
	 * persistence across restarts.
	 */
	zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD;
}

/*
 * Read back the persistent representation of an active case.
 */
static zfs_case_t *
zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
{
	zfs_case_t *zcp;

	zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
	zcp->zc_case = cp;

	fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data,
	    sizeof (zcp->zc_data));

	/* reject buffers written by a newer, unknown on-disk format */
	if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) {
		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
		return (NULL);
	}

	/*
	 * fmd_buf_read() will have already zeroed out the remainder of the
	 * buffer, so we don't have to do anything special if the version
	 * doesn't include the SERD engine name.
	 */

	if (zcp->zc_data.zc_has_remove_timer)
		zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
		    NULL, zfs_remove_timeout);

	uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool);
	(void) uu_list_insert_before(zfs_cases, NULL, zcp);

	fmd_case_setspecific(hdl, cp, zcp);

	return (zcp);
}

/*
 * Iterate over any active cases.  If any cases are associated with a pool or
 * vdev which is no longer present on the system, close the associated case.
 */
static void
zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded)
{
	uint64_t vdev_guid = 0;
	uint_t c, children;
	nvlist_t **child;
	zfs_case_t *zcp;

	(void) nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid);

	/*
	 * Mark any cases associated with this (pool, vdev) pair.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == vdev_guid) {
			zcp->zc_present = B_TRUE;
			zcp->zc_when = *loaded;
		}
	}

	/*
	 * Iterate over all children.
	 */
	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}

	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}

	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}
}

/*
 * zpool_iter() callback: mark all cases belonging to this pool (and, via
 * zfs_mark_vdev(), to each of its vdevs) as still present, stamping them
 * with the pool's load time when available.
 */
/*ARGSUSED*/
static int
zfs_mark_pool(zpool_handle_t *zhp, void *unused)
{
	zfs_case_t *zcp;
	uint64_t pool_guid;
	uint64_t *tod;
	er_timeval_t loaded = { 0 };
	nvlist_t *config, *vd;
	uint_t nelem = 0;
	int ret;

	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
	/*
	 * Mark any cases associated with just this pool.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == 0)
			zcp->zc_present = B_TRUE;
	}

	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
		zpool_close(zhp);
		return (-1);
	}

	/* stamp the pool's load time onto every pool-level case */
	(void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
	    &tod, &nelem);
	if (nelem == 2) {
		loaded.ertv_sec = tod[0];
		loaded.ertv_nsec = tod[1];
		for (zcp = uu_list_first(zfs_cases); zcp != NULL;
		    zcp = uu_list_next(zfs_cases, zcp)) {
			if (zcp->zc_data.zc_pool_guid == pool_guid &&
			    zcp->zc_data.zc_vdev_guid == 0) {
				zcp->zc_when = loaded;
			}
		}
	}

	ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd);
	if (ret) {
		zpool_close(zhp);
		return (-1);
	}

	zfs_mark_vdev(pool_guid, vd, &loaded);

	zpool_close(zhp);

	return (0);
}

/* Context for zpool_find_load_time(): which pool, and where to store. */
struct load_time_arg {
	uint64_t lt_guid;
	er_timeval_t *lt_time;
	boolean_t lt_found;
};

/* zpool_iter() callback: record the load time of the pool matching lt_guid. */
static int
zpool_find_load_time(zpool_handle_t *zhp, void *arg)
{
	struct load_time_arg *lta = arg;
	uint64_t pool_guid;
	uint64_t *tod;
	nvlist_t *config;
	uint_t nelem;

	if (lta->lt_found) {
		zpool_close(zhp);
		return (0);
	}

	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
	if (pool_guid != lta->lt_guid) {
		zpool_close(zhp);
		return (0);
	}

	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
		zpool_close(zhp);
		return (-1);
	}

	if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
	    &tod, &nelem) == 0 && nelem == 2) {
		lta->lt_found = B_TRUE;
		lta->lt_time->ertv_sec = tod[0];
		lta->lt_time->ertv_nsec = tod[1];
	}

	zpool_close(zhp);

	return (0);
}

/*
 * Close every case whose pool or vdev no longer exists on the system.
 */
static void
zfs_purge_cases(fmd_hdl_t *hdl)
{
	zfs_case_t *zcp;
	uu_list_walk_t *walk;
	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);

	/*
	 * There is no way to open a pool by GUID, or lookup a vdev by GUID.  No
	 * matter what we do, we're going to have to stomach an O(vdevs * cases)
	 * algorithm.  In reality, both quantities are likely so small that
	 * neither will matter.  Given that iterating over pools is more
	 * expensive than iterating over the in-memory case list, we opt for a
	 * 'present' flag in each case that starts off cleared.  We then iterate
	 * over all pools, marking those that are still present, and removing
	 * those that aren't found.
	 *
	 * Note that we could also construct an FMRI and rely on
	 * fmd_nvl_fmri_present(), but this would end up doing the same search.
	 */

	/*
	 * Mark the cases as not present.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp))
		zcp->zc_present = B_FALSE;

	/*
	 * Iterate over all pools and mark the pools and vdevs found.  If this
	 * fails (most probably because we're out of memory), then don't close
	 * any of the cases and we cannot be sure they are accurate.
	 */
	if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0)
		return;

	/*
	 * Remove those cases which were not found.
	 */
	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
	while ((zcp = uu_list_walk_next(walk)) != NULL) {
		if (!zcp->zc_present)
			fmd_case_close(hdl, zcp->zc_case);
	}
	uu_list_walk_end(walk);
}

/*
 * Construct the name of a serd engine given the pool/vdev GUID and type (io or
 * checksum).
 */
static void
zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
    const char *type)
{
	(void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s",
	    (long long unsigned int)pool_guid,
	    (long long unsigned int)vdev_guid, type);
}

/*
 * Solve a given ZFS case.  This first checks to make sure the diagnosis is
 * still valid, as well as cleaning up any pending timer associated with the
 * case.
 */
static void
zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname,
    boolean_t checkunusable)
{
	nvlist_t *detector, *fault;
	boolean_t serialize;
	nvlist_t *fru = NULL;
	fmd_hdl_debug(hdl, "solving fault '%s'", faultname);

	/*
	 * Construct the detector from the case data.  The detector is in the
	 * ZFS scheme, and is either the pool or the vdev, depending on whether
	 * this is a vdev or pool fault.
	 */
	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
	    zcp->zc_data.zc_pool_guid);
	if (zcp->zc_data.zc_vdev_guid != 0) {
		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
		    zcp->zc_data.zc_vdev_guid);
	}

	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
	    fru, detector);
	fmd_case_add_suspect(hdl, zcp->zc_case, fault);

	nvlist_free(fru);

	fmd_case_solve(hdl, zcp->zc_case);

	/*
	 * Cancel any pending removal timer; zc_has_remove_timer is part of
	 * the persisted case data, so re-serialize when it changes.
	 */
	serialize = B_FALSE;
	if (zcp->zc_data.zc_has_remove_timer) {
		fmd_timer_remove(hdl, zcp->zc_remove_timer);
		zcp->zc_data.zc_has_remove_timer = 0;
		serialize = B_TRUE;
	}
	if (serialize)
		zfs_case_serialize(hdl, zcp);

	nvlist_free(detector);
}

/*
 * Return B_TRUE when timestamp 'a' is strictly earlier than 'b'.
 */
static boolean_t
timeval_earlier(er_timeval_t *a, er_timeval_t *b)
{
	return (a->ertv_sec < b->ertv_sec ||
	    (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec));
}

/*
 * Extract the time an ereport was posted (FM_EREPORT_TIME, a [sec, nsec]
 * pair).  When the payload is missing, default to the latest possible
 * time so the report is never mistaken for one predating the pool load.
 */
/*ARGSUSED*/
static void
zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when)
{
	int64_t *tod;
	uint_t nelem;

	if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tod,
	    &nelem) == 0 && nelem == 2) {
		when->ertv_sec = tod[0];
		when->ertv_nsec = tod[1];
	} else {
		when->ertv_sec = when->ertv_nsec = UINT64_MAX;
	}
}

/*
 * Main fmd entry point.
 */
/*ARGSUSED*/
static void
zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
{
	zfs_case_t *zcp, *dcp;
	int32_t pool_state;
	uint64_t ena, pool_guid, vdev_guid;
	er_timeval_t pool_load;
	er_timeval_t er_when;
	nvlist_t *detector;
	boolean_t pool_found = B_FALSE;
	boolean_t isresource;
	char *type;

	/*
	 * We subscribe to notifications for vdev or pool removal.  In these
	 * cases, there may be cases that no longer apply.  Purge any cases
	 * that no longer apply.
	 */
	if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) {
		fmd_hdl_debug(hdl, "purging orphaned cases from %s",
		    strrchr(class, '.') + 1);
		zfs_purge_cases(hdl);
		zfs_stats.resource_drops.fmds_value.ui64++;
		return;
	}

	isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*");

	if (isresource) {
		/*
		 * For resources, we don't have a normal payload.
		 */
		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    &vdev_guid) != 0)
			pool_state = SPA_LOAD_OPEN;
		else
			pool_state = SPA_LOAD_NONE;
		detector = NULL;
	} else {
		(void) nvlist_lookup_nvlist(nvl,
		    FM_EREPORT_DETECTOR, &detector);
		(void) nvlist_lookup_int32(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state);
	}

	/*
	 * We also ignore all ereports generated during an import of a pool,
	 * since the only possible fault (.pool) would result in import failure,
	 * and hence no persistent fault.  Some day we may want to do something
	 * with these ereports, so we continue generating them internally.
	 */
	if (pool_state == SPA_LOAD_IMPORT) {
		zfs_stats.import_drops.fmds_value.ui64++;
		fmd_hdl_debug(hdl, "ignoring '%s' during import", class);
		return;
	}

	/*
	 * Device I/O errors are ignored during pool open.
	 */
	if (pool_state == SPA_LOAD_OPEN &&
	    (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) {
		fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class);
		zfs_stats.dev_drops.fmds_value.ui64++;
		return;
	}

	/*
	 * We ignore ereports for anything except disks and files.
	 */
	if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
	    &type) == 0) {
		if (strcmp(type, VDEV_TYPE_DISK) != 0 &&
		    strcmp(type, VDEV_TYPE_FILE) != 0) {
			zfs_stats.vdev_drops.fmds_value.ui64++;
			return;
		}
	}

	/*
	 * Determine if this ereport corresponds to an open case.
	 * Each vdev or pool can have a single case.
	 *
	 * NOTE(review): pool_guid is left uninitialized if this lookup
	 * fails; presumably every subscribed event carries a pool GUID --
	 * confirm against the event generators.
	 */
	(void) nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
	if (nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
		vdev_guid = 0;
	if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
		ena = 0;

	zfs_ereport_when(hdl, nvl, &er_when);

	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid) {
			pool_found = B_TRUE;
			pool_load = zcp->zc_when;
		}
		if (zcp->zc_data.zc_vdev_guid == vdev_guid)
			break;
	}

	/*
	 * Avoid falsely accusing a pool of being faulty.  Do so by
	 * not replaying ereports that were generated prior to the
	 * current import.  If the failure that generated them was
	 * transient because the device was actually removed but we
	 * didn't receive the normal asynchronous notification, we
	 * don't want to mark it as faulted and potentially panic. If
	 * there is still a problem we'd expect not to be able to
	 * import the pool, or that new ereports will be generated
	 * once the pool is used.
	 */
	if (pool_found && timeval_earlier(&er_when, &pool_load)) {
		fmd_hdl_debug(hdl, "ignoring pool %llx, "
		    "ereport time %lld.%lld, pool load time = %lld.%lld",
		    pool_guid, er_when.ertv_sec, er_when.ertv_nsec,
		    pool_load.ertv_sec, pool_load.ertv_nsec);
		zfs_stats.old_drops.fmds_value.ui64++;
		return;
	}

	if (!pool_found) {
		/*
		 * Haven't yet seen this pool, but same situation
		 * may apply.
		 */
		libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
		struct load_time_arg la;

		la.lt_guid = pool_guid;
		la.lt_time = &pool_load;
		la.lt_found = B_FALSE;

		if (zhdl != NULL &&
		    zpool_iter(zhdl, zpool_find_load_time, &la) == 0 &&
		    la.lt_found == B_TRUE) {
			pool_found = B_TRUE;

			if (timeval_earlier(&er_when, &pool_load)) {
				fmd_hdl_debug(hdl, "ignoring pool %llx, "
				    "ereport time %lld.%lld, "
				    "pool load time = %lld.%lld",
				    pool_guid, er_when.ertv_sec,
				    er_when.ertv_nsec, pool_load.ertv_sec,
				    pool_load.ertv_nsec);
				zfs_stats.old_drops.fmds_value.ui64++;
				return;
			}
		}
	}

	if (zcp == NULL) {
		fmd_case_t *cs;
		zfs_case_data_t data = { 0 };

		/*
		 * If this is one of our 'fake' resource ereports, and there is
		 * no case open, simply discard it.
		 */
		if (isresource) {
			zfs_stats.resource_drops.fmds_value.ui64++;
			/* NOTE(review): unbalanced quote in debug message */
			fmd_hdl_debug(hdl, "discarding '%s for vdev %llu",
			    class, vdev_guid);
			return;
		}

		/*
		 * Skip tracking some ereports
		 */
		if (strcmp(class,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
		    strcmp(class,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 ||
		    strcmp(class,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) {
			zfs_stats.resource_drops.fmds_value.ui64++;
			return;
		}

		/*
		 * Open a new case.
		 */
		cs = fmd_case_open(hdl, NULL);

		fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'",
		    vdev_guid, class);

		/*
		 * Initialize the case buffer.  To commonize code, we actually
		 * create the buffer with existing data, and then call
		 * zfs_case_unserialize() to instantiate the in-core structure.
		 */
		fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t));

		data.zc_version = CASE_DATA_VERSION_SERD;
		data.zc_ena = ena;
		data.zc_pool_guid = pool_guid;
		data.zc_vdev_guid = vdev_guid;
		data.zc_pool_state = (int)pool_state;

		fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));

		zcp = zfs_case_unserialize(hdl, cs);
		assert(zcp != NULL);
		if (pool_found)
			zcp->zc_when = pool_load;
	}

	if (isresource) {
		fmd_hdl_debug(hdl, "resource event '%s'", class);

		if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) {
			/*
			 * The 'resource.fs.zfs.autoreplace' event indicates
			 * that the pool was loaded with the 'autoreplace'
			 * property set.  In this case, any pending device
			 * failures should be ignored, as the asynchronous
			 * autoreplace handling will take care of them.
			 */
			fmd_case_close(hdl, zcp->zc_case);
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) {
			/*
			 * The 'resource.fs.zfs.removed' event indicates that
			 * device removal was detected, and the device was
			 * closed asynchronously.  If this is the case, we
			 * assume that any recent I/O errors were due to the
			 * device removal, not any fault of the device itself.
			 * We reset the SERD engine, and cancel any pending
			 * timers.
			 */
			if (zcp->zc_data.zc_has_remove_timer) {
				fmd_timer_remove(hdl, zcp->zc_remove_timer);
				zcp->zc_data.zc_has_remove_timer = 0;
				zfs_case_serialize(hdl, zcp);
			}
			if (zcp->zc_data.zc_serd_io[0] != '\0')
				fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io);
			if (zcp->zc_data.zc_serd_checksum[0] != '\0')
				fmd_serd_reset(hdl,
				    zcp->zc_data.zc_serd_checksum);
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
			uint64_t state = 0;

			if (zcp != NULL &&
			    nvlist_lookup_uint64(nvl,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 &&
			    state == VDEV_STATE_HEALTHY) {
				fmd_hdl_debug(hdl, "closing case after a "
				    "device statechange to healthy");
				fmd_case_close(hdl, zcp->zc_case);
			}
		}
		zfs_stats.resource_drops.fmds_value.ui64++;
		return;
	}

	/*
	 * Associate the ereport with this case.
	 */
	fmd_case_add_ereport(hdl, zcp->zc_case, ep);

	/*
	 * Don't do anything else if this case is already solved.
	 */
	if (fmd_case_solved(hdl, zcp->zc_case))
		return;

	fmd_hdl_debug(hdl, "error event '%s'", class);

	/*
	 * Determine if we should solve the case and generate a fault.  We solve
	 * a case if:
	 *
	 *	a. A pool failed to open (ereport.fs.zfs.pool)
	 *	b. A device failed to open (ereport.fs.zfs.pool) while a pool
	 *	   was up and running.
	 *
	 * We may see a series of ereports associated with a pool open, all
	 * chained together by the same ENA.  If the pool open succeeds, then
	 * we'll see no further ereports.  To detect when a pool open has
	 * succeeded, we associate a timer with the event.  When it expires, we
	 * close the case.
	 */
	if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) {
		/*
		 * Pool level fault.  Before solving the case, go through and
		 * close any open device cases that may be pending.
		 */
		for (dcp = uu_list_first(zfs_cases); dcp != NULL;
		    dcp = uu_list_next(zfs_cases, dcp)) {
			if (dcp->zc_data.zc_pool_guid ==
			    zcp->zc_data.zc_pool_guid &&
			    dcp->zc_data.zc_vdev_guid != 0)
				fmd_case_close(hdl, dcp->zc_case);
		}

		zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool", B_TRUE);
	} else if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) {
		/*
		 * Pool level fault for reading the intent logs.
		 */
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay", B_TRUE);
	} else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) {
		/*
		 * Device fault.
		 */
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.device", B_TRUE);
	} else if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
		char *failmode = NULL;
		boolean_t checkremove = B_FALSE;

		/*
		 * If this is a checksum or I/O error, then toss it into the
		 * appropriate SERD engine and check to see if it has fired.
		 * Ideally, we want to do something more sophisticated,
		 * (persistent errors for a single data block, etc).  For now,
		 * a single SERD engine is sufficient.
		 */
		if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) {
			if (zcp->zc_data.zc_serd_io[0] == '\0') {
				zfs_serd_name(zcp->zc_data.zc_serd_io,
				    pool_guid, vdev_guid, "io");
				fmd_serd_create(hdl, zcp->zc_data.zc_serd_io,
				    fmd_prop_get_int32(hdl, "io_N"),
				    fmd_prop_get_int64(hdl, "io_T"));
				zfs_case_serialize(hdl, zcp);
			}
			if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
				checkremove = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
			if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
				zfs_serd_name(zcp->zc_data.zc_serd_checksum,
				    pool_guid, vdev_guid, "checksum");
				fmd_serd_create(hdl,
				    zcp->zc_data.zc_serd_checksum,
				    fmd_prop_get_int32(hdl, "checksum_N"),
				    fmd_prop_get_int64(hdl, "checksum_T"));
				zfs_case_serialize(hdl, zcp);
			}
			if (fmd_serd_record(hdl,
			    zcp->zc_data.zc_serd_checksum, ep)) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.vdev.checksum", B_FALSE);
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) &&
		    (nvlist_lookup_string(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) &&
		    failmode != NULL) {
			if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE,
			    strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.io_failure_continue",
				    B_FALSE);
			} else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT,
			    strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.io_failure_wait", B_FALSE);
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
#ifndef __linux__
			/* This causes an unexpected fault diagnosis on linux */
			checkremove = B_TRUE;
#endif
		}

		/*
		 * Because I/O errors may be due to device removal, we postpone
		 * any diagnosis until we're sure that we aren't about to
		 * receive a 'resource.fs.zfs.removed' event.
		 */
		if (checkremove) {
			if (zcp->zc_data.zc_has_remove_timer)
				fmd_timer_remove(hdl, zcp->zc_remove_timer);
			zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL,
			    zfs_remove_timeout);
			if (!zcp->zc_data.zc_has_remove_timer) {
				zcp->zc_data.zc_has_remove_timer = 1;
				zfs_case_serialize(hdl, zcp);
			}
		}
	}
}

/*
 * The timeout is fired when we diagnosed an I/O error, and it was not due to
 * device removal (which would cause the timeout to be cancelled).
 */
/* ARGSUSED */
static void
zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data)
{
	zfs_case_t *zcp = data;

	if (id == zcp->zc_remove_timer)
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io", B_FALSE);
}

/*
 * The specified case has been closed and any case-specific
 * data structures should be deallocated.
 */
static void
zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
{
	zfs_case_t *zcp = fmd_case_getspecific(hdl, cs);

	if (zcp->zc_data.zc_serd_checksum[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
	if (zcp->zc_data.zc_serd_io[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
	if (zcp->zc_data.zc_has_remove_timer)
		fmd_timer_remove(hdl, zcp->zc_remove_timer);

	uu_list_remove(zfs_cases, zcp);
	uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
	fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
}

/*
 * We use the fmd gc entry point to look for old cases that no longer apply.
 * This allows us to keep our set of case data small in a long running system.
 */
static void
zfs_fm_gc(fmd_hdl_t *hdl)
{
	zfs_purge_cases(hdl);
}

static const fmd_hdl_ops_t fmd_ops = {
	zfs_fm_recv,	/* fmdo_recv */
	zfs_fm_timeout,	/* fmdo_timeout */
	zfs_fm_close,	/* fmdo_close */
	NULL,		/* fmdo_stats */
	zfs_fm_gc,	/* fmdo_gc */
};

/* Tunables consumed via fmd_prop_get_int32()/fmd_prop_get_int64() above. */
static const fmd_prop_t fmd_props[] = {
	{ "checksum_N", FMD_TYPE_UINT32, "10" },
	{ "checksum_T", FMD_TYPE_TIME, "10min" },
	{ "io_N", FMD_TYPE_UINT32, "10" },
	{ "io_T", FMD_TYPE_TIME, "10min" },
	{ "remove_timeout", FMD_TYPE_TIME, "15sec" },
	{ NULL, 0, NULL }
};

static const fmd_hdl_info_t fmd_info = {
	"ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props
};

/*
 * Module load entry point: create the case list pool and register with
 * fmd.  Any allocation or registration failure unwinds the partially
 * constructed state and returns without registering the module.
 */
void
_zfs_diagnosis_init(fmd_hdl_t *hdl)
{
	libzfs_handle_t *zhdl;

	if ((zhdl = libzfs_init()) == NULL)
		return;

	if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool",
	    sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node),
	    NULL, UU_LIST_POOL_DEBUG)) == NULL) {
		libzfs_fini(zhdl);
		return;
	}

	if ((zfs_cases = uu_list_create(zfs_case_pool, NULL,
	    UU_LIST_DEBUG)) == NULL) {
		uu_list_pool_destroy(zfs_case_pool);
		libzfs_fini(zhdl);
		return;
	}

	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
		uu_list_destroy(zfs_cases);
		uu_list_pool_destroy(zfs_case_pool);
		libzfs_fini(zhdl);
		return;
	}

	fmd_hdl_setspecific(hdl, zhdl);

	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
	    sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);

	zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
}

/*
 * Module unload entry point: free every active case and tear down the
 * uulist machinery and libzfs handle.
 */
void
_zfs_diagnosis_fini(fmd_hdl_t *hdl)
{
	zfs_case_t *zcp;
	uu_list_walk_t *walk;
	libzfs_handle_t *zhdl;

	/*
	 * Remove all active cases.
+ */ + walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST); + while ((zcp = uu_list_walk_next(walk)) != NULL) { + fmd_hdl_debug(hdl, "removing case ena %llu", + (long long unsigned)zcp->zc_data.zc_ena); + uu_list_remove(zfs_cases, zcp); + uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool); + fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); + } + uu_list_walk_end(walk); + + uu_list_destroy(zfs_cases); + uu_list_pool_destroy(zfs_case_pool); + + zhdl = fmd_hdl_getspecific(hdl); + libzfs_fini(zhdl); +} diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c new file mode 100644 index 000000000000..8d0a3b420086 --- /dev/null +++ b/cmd/zed/agents/zfs_mod.c @@ -0,0 +1,956 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2016, 2017, Intel Corporation. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + */ + +/* + * ZFS syseventd module. 
+ * + * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c + * + * The purpose of this module is to identify when devices are added to the + * system, and appropriately online or replace the affected vdevs. + * + * When a device is added to the system: + * + * 1. Search for any vdevs whose devid matches that of the newly added + * device. + * + * 2. If no vdevs are found, then search for any vdevs whose udev path + * matches that of the new device. + * + * 3. If no vdevs match by either method, then ignore the event. + * + * 4. Attempt to online the device with a flag to indicate that it should + * be unspared when resilvering completes. If this succeeds, then the + * same device was inserted and we should continue normally. + * + * 5. If the pool does not have the 'autoreplace' property set, attempt to + * online the device again without the unspare flag, which will + * generate a FMA fault. + * + * 6. If the pool has the 'autoreplace' property set, and the matching vdev + * is a whole disk, then label the new disk and attempt a 'zpool + * replace'. + * + * The module responds to EC_DEV_ADD events. The special ESC_ZFS_VDEV_CHECK + * event indicates that a device failed to open during pool load, but the + * autoreplace property was set. In this case, we deferred the associated + * FMA fault until our module had a chance to process the autoreplace logic. + * If the device could not be replaced, then the second online attempt will + * trigger the FMA fault that we skipped earlier. 
+ * + * ZFS on Linux porting notes: + * Linux udev provides a disk insert for both the disk and the partition + * + */ + +#include <ctype.h> +#include <fcntl.h> +#include <libnvpair.h> +#include <libzfs.h> +#include <libzutil.h> +#include <limits.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> +#include <sys/list.h> +#include <sys/sunddi.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dev.h> +#include <thread_pool.h> +#include <pthread.h> +#include <unistd.h> +#include <errno.h> +#include "zfs_agents.h" +#include "../zed_log.h" + +#define DEV_BYID_PATH "/dev/disk/by-id/" +#define DEV_BYPATH_PATH "/dev/disk/by-path/" +#define DEV_BYVDEV_PATH "/dev/disk/by-vdev/" + +typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t); + +libzfs_handle_t *g_zfshdl; +list_t g_pool_list; /* list of unavailable pools at initialization */ +list_t g_device_list; /* list of disks with asynchronous label request */ +tpool_t *g_tpool; +boolean_t g_enumeration_done; +pthread_t g_zfs_tid; /* zfs_enum_pools() thread */ + +typedef struct unavailpool { + zpool_handle_t *uap_zhp; + list_node_t uap_node; +} unavailpool_t; + +typedef struct pendingdev { + char pd_physpath[128]; + list_node_t pd_node; +} pendingdev_t; + +static int +zfs_toplevel_state(zpool_handle_t *zhp) +{ + nvlist_t *nvroot; + vdev_stat_t *vs; + unsigned int c; + + verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + return (vs->vs_state); +} + +static int +zfs_unavail_pool(zpool_handle_t *zhp, void *data) +{ + zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)", + zpool_get_name(zhp), (int)zfs_toplevel_state(zhp)); + + if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) { + unavailpool_t *uap; + uap = malloc(sizeof (unavailpool_t)); + uap->uap_zhp = zhp; + list_insert_tail((list_t *)data, 
uap); + } else { + zpool_close(zhp); + } + return (0); +} + +/* + * Two stage replace on Linux + * since we get disk notifications + * we can wait for partitioned disk slice to show up! + * + * First stage tags the disk, initiates async partitioning, and returns + * Second stage finds the tag and proceeds to ZFS labeling/replace + * + * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach + * + * 1. physical match with no fs, no partition + * tag it top, partition disk + * + * 2. physical match again, see partition and tag + * + */ + +/* + * The device associated with the given vdev (either by devid or physical path) + * has been added to the system. If 'isdisk' is set, then we only attempt a + * replacement if it's a whole disk. This also implies that we should label the + * disk first. + * + * First, we attempt to online the device (making sure to undo any spare + * operation when finished). If this succeeds, then we're done. If it fails, + * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened, + * but that the label was not what we expected. If the 'autoreplace' property + * is enabled, then we relabel the disk (if specified), and attempt a 'zpool + * replace'. If the online is successful, but the new state is something else + * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of + * race, and we should avoid attempting to relabel the disk. 
+ * + * Also can arrive here from a ESC_ZFS_VDEV_CHECK event + */ +static void +zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) +{ + char *path; + vdev_state_t newstate; + nvlist_t *nvroot, *newvd; + pendingdev_t *device; + uint64_t wholedisk = 0ULL; + uint64_t offline = 0ULL; + uint64_t guid = 0ULL; + char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL; + char rawpath[PATH_MAX], fullpath[PATH_MAX]; + char devpath[PATH_MAX]; + int ret; + boolean_t is_dm = B_FALSE; + boolean_t is_sd = B_FALSE; + uint_t c; + vdev_stat_t *vs; + + if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) + return; + + /* Skip healthy disks */ + verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + if (vs->vs_state == VDEV_STATE_HEALTHY) { + zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.", + __func__, path); + return; + } + + (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath); + (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + &enc_sysfs_path); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid); + + if (offline) + return; /* don't intervene if it was taken offline */ + + is_dm = zfs_dev_is_dm(path); + zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'" + " wholedisk %d, %s dm (guid %llu)", zpool_get_name(zhp), path, + physpath ? physpath : "NULL", wholedisk, is_dm ? 
"is" : "not", + (long long unsigned int)guid); + + /* + * The VDEV guid is preferred for identification (gets passed in path) + */ + if (guid != 0) { + (void) snprintf(fullpath, sizeof (fullpath), "%llu", + (long long unsigned int)guid); + } else { + /* + * otherwise use path sans partition suffix for whole disks + */ + (void) strlcpy(fullpath, path, sizeof (fullpath)); + if (wholedisk) { + char *spath = zfs_strip_partition(fullpath); + if (!spath) { + zed_log_msg(LOG_INFO, "%s: Can't alloc", + __func__); + return; + } + + (void) strlcpy(fullpath, spath, sizeof (fullpath)); + free(spath); + } + } + + /* + * Attempt to online the device. + */ + if (zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 && + (newstate == VDEV_STATE_HEALTHY || + newstate == VDEV_STATE_DEGRADED)) { + zed_log_msg(LOG_INFO, " zpool_vdev_online: vdev %s is %s", + fullpath, (newstate == VDEV_STATE_HEALTHY) ? + "HEALTHY" : "DEGRADED"); + return; + } + + /* + * vdev_id alias rule for using scsi_debug devices (FMA automated + * testing) + */ + if (physpath != NULL && strcmp("scsidebug", physpath) == 0) + is_sd = B_TRUE; + + /* + * If the pool doesn't have the autoreplace property set, then use + * vdev online to trigger a FMA fault by posting an ereport. + */ + if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) || + !(wholedisk || is_dm) || (physpath == NULL)) { + (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, + &newstate); + zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or " + "not a whole disk for '%s'", fullpath); + return; + } + + /* + * Convert physical path into its current device node. Rawpath + * needs to be /dev/disk/by-vdev for a scsi_debug device since + * /dev/disk/by-path will not be present. + */ + (void) snprintf(rawpath, sizeof (rawpath), "%s%s", + is_sd ? 
DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath); + + if (realpath(rawpath, devpath) == NULL && !is_dm) { + zed_log_msg(LOG_INFO, " realpath: %s failed (%s)", + rawpath, strerror(errno)); + + (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, + &newstate); + + zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)", + fullpath, libzfs_error_description(g_zfshdl)); + return; + } + + /* Only autoreplace bad disks */ + if ((vs->vs_state != VDEV_STATE_DEGRADED) && + (vs->vs_state != VDEV_STATE_FAULTED) && + (vs->vs_state != VDEV_STATE_CANT_OPEN)) { + return; + } + + nvlist_lookup_string(vdev, "new_devid", &new_devid); + + if (is_dm) { + /* Don't label device mapper or multipath disks. */ + } else if (!labeled) { + /* + * we're auto-replacing a raw disk, so label it first + */ + char *leafname; + + /* + * If this is a request to label a whole disk, then attempt to + * write out the label. Before we can label the disk, we need + * to map the physical string that was matched on to the under + * lying device node. + * + * If any part of this process fails, then do a force online + * to trigger a ZFS fault for the device (and any hot spare + * replacement). + */ + leafname = strrchr(devpath, '/') + 1; + + /* + * If this is a request to label a whole disk, then attempt to + * write out the label. + */ + if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) { + zed_log_msg(LOG_INFO, " zpool_label_disk: could not " + "label '%s' (%s)", leafname, + libzfs_error_description(g_zfshdl)); + + (void) zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_FORCEFAULT, &newstate); + return; + } + + /* + * The disk labeling is asynchronous on Linux. Just record + * this label request and return as there will be another + * disk add event for the partition after the labeling is + * completed. 
+ */ + device = malloc(sizeof (pendingdev_t)); + (void) strlcpy(device->pd_physpath, physpath, + sizeof (device->pd_physpath)); + list_insert_tail(&g_device_list, device); + + zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)", + leafname, (u_longlong_t)guid); + + return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */ + + } else /* labeled */ { + boolean_t found = B_FALSE; + /* + * match up with request above to label the disk + */ + for (device = list_head(&g_device_list); device != NULL; + device = list_next(&g_device_list, device)) { + if (strcmp(physpath, device->pd_physpath) == 0) { + list_remove(&g_device_list, device); + free(device); + found = B_TRUE; + break; + } + zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s", + physpath, device->pd_physpath); + } + if (!found) { + /* unexpected partition slice encountered */ + zed_log_msg(LOG_INFO, "labeled disk %s unexpected here", + fullpath); + (void) zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_FORCEFAULT, &newstate); + return; + } + + zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)", + physpath, (u_longlong_t)guid); + + (void) snprintf(devpath, sizeof (devpath), "%s%s", + DEV_BYID_PATH, new_devid); + } + + /* + * Construct the root vdev to pass to zpool_vdev_attach(). While adding + * the entire vdev structure is harmless, we construct a reduced set of + * path/physpath/wholedisk to keep it simple. 
+ */ + if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory"); + return; + } + if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory"); + nvlist_free(nvroot); + return; + } + + if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 || + nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 || + nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 || + (physpath != NULL && nvlist_add_string(newvd, + ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) || + (enc_sysfs_path != NULL && nvlist_add_string(newvd, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) || + nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 || + nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || + nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd, + 1) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs"); + nvlist_free(newvd); + nvlist_free(nvroot); + return; + } + + nvlist_free(newvd); + + /* + * Wait for udev to verify the links exist, then auto-replace + * the leaf disk at same physical location. + */ + if (zpool_label_disk_wait(path, 3000) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement " + "disk %s is missing", path); + nvlist_free(nvroot); + return; + } + + ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE); + + zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)", + fullpath, path, (ret == 0) ? "no errors" : + libzfs_error_description(g_zfshdl)); + + nvlist_free(nvroot); +} + +/* + * Utility functions to find a vdev matching given criteria. 
+ */ +typedef struct dev_data { + const char *dd_compare; + const char *dd_prop; + zfs_process_func_t dd_func; + boolean_t dd_found; + boolean_t dd_islabeled; + uint64_t dd_pool_guid; + uint64_t dd_vdev_guid; + const char *dd_new_devid; +} dev_data_t; + +static void +zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) +{ + dev_data_t *dp = data; + char *path = NULL; + uint_t c, children; + nvlist_t **child; + + /* + * First iterate over any children. + */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + zfs_iter_vdev(zhp, child[c], data); + } + + /* + * Iterate over any spares and cache devices + */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) { + for (c = 0; c < children; c++) + zfs_iter_vdev(zhp, child[c], data); + } + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) { + for (c = 0; c < children; c++) + zfs_iter_vdev(zhp, child[c], data); + } + + /* once a vdev was matched and processed there is nothing left to do */ + if (dp->dd_found) + return; + + /* + * Match by GUID if available otherwise fallback to devid or physical + */ + if (dp->dd_vdev_guid != 0) { + uint64_t guid; + + if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, + &guid) != 0 || guid != dp->dd_vdev_guid) { + return; + } + zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched on %llu", guid); + dp->dd_found = B_TRUE; + + } else if (dp->dd_compare != NULL) { + /* + * NOTE: On Linux there is an event for partition, so unlike + * illumos, substring matching is not required to accommodate + * the partition suffix. An exact match will be present in + * the dp->dd_compare value. 
+ */ + if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 || + strcmp(dp->dd_compare, path) != 0) + return; + + zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched %s on %s", + dp->dd_prop, path); + dp->dd_found = B_TRUE; + + /* pass the new devid for use by replacing code */ + if (dp->dd_new_devid != NULL) { + (void) nvlist_add_string(nvl, "new_devid", + dp->dd_new_devid); + } + } + + (dp->dd_func)(zhp, nvl, dp->dd_islabeled); +} + +static void +zfs_enable_ds(void *arg) +{ + unavailpool_t *pool = (unavailpool_t *)arg; + + (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0); + zpool_close(pool->uap_zhp); + free(pool); +} + +static int +zfs_iter_pool(zpool_handle_t *zhp, void *data) +{ + nvlist_t *config, *nvl; + dev_data_t *dp = data; + uint64_t pool_guid; + unavailpool_t *pool; + + zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)", + zpool_get_name(zhp), dp->dd_vdev_guid ? "GUID" : dp->dd_prop); + + /* + * For each vdev in this pool, look for a match to apply dd_func + */ + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + if (dp->dd_pool_guid == 0 || + (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) { + (void) nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &nvl); + zfs_iter_vdev(zhp, nvl, data); + } + } + + /* + * if this pool was originally unavailable, + * then enable its datasets asynchronously + */ + if (g_enumeration_done) { + for (pool = list_head(&g_pool_list); pool != NULL; + pool = list_next(&g_pool_list, pool)) { + + if (strcmp(zpool_get_name(zhp), + zpool_get_name(pool->uap_zhp))) + continue; + if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) { + list_remove(&g_pool_list, pool); + (void) tpool_dispatch(g_tpool, zfs_enable_ds, + pool); + break; + } + } + } + + zpool_close(zhp); + return (dp->dd_found); /* cease iteration after a match */ +} + +/* + * Given a physical device location, iterate over all + * (pool, vdev) pairs which correspond to that location. 
+ */ +static boolean_t +devphys_iter(const char *physical, const char *devid, zfs_process_func_t func, + boolean_t is_slice) +{ + dev_data_t data = { 0 }; + + data.dd_compare = physical; + data.dd_func = func; + data.dd_prop = ZPOOL_CONFIG_PHYS_PATH; + data.dd_found = B_FALSE; + data.dd_islabeled = is_slice; + data.dd_new_devid = devid; /* used by auto replace code */ + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (data.dd_found); +} + +/* + * Given a device identifier, find any vdevs with a matching devid. + * On Linux we can match devid directly which is always a whole disk. + */ +static boolean_t +devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice) +{ + dev_data_t data = { 0 }; + + data.dd_compare = devid; + data.dd_func = func; + data.dd_prop = ZPOOL_CONFIG_DEVID; + data.dd_found = B_FALSE; + data.dd_islabeled = is_slice; + data.dd_new_devid = devid; + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (data.dd_found); +} + +/* + * Handle a EC_DEV_ADD.ESC_DISK event. 
+ * + * illumos + * Expects: DEV_PHYS_PATH string in schema + * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID + * + * path: '/dev/dsk/c0t1d0s0' (persistent) + * devid: 'id1,sd@SATA_____Hitachi_HDS72101______JP2940HZ3H74MC/a' + * phys_path: '/pci@0,0/pci103c,1609@11/disk@1,0:a' + * + * linux + * provides: DEV_PHYS_PATH and DEV_IDENTIFIER strings in schema + * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID + * + * path: '/dev/sdc1' (not persistent) + * devid: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1' + * phys_path: 'pci-0000:04:00.0-sas-0x4433221106000000-lun-0' + */ +static int +zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi) +{ + char *devpath = NULL, *devid; + boolean_t is_slice; + + /* + * Expecting a devid string and an optional physical location + */ + if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) + return (-1); + + (void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath); + + is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0); + + zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)", + devid, devpath ? devpath : "NULL", is_slice); + + /* + * Iterate over all vdevs looking for a match in the following order: + * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk) + * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location). + * + * For disks, we only want to pay attention to vdevs marked as whole + * disks or are a multipath device. + */ + if (!devid_iter(devid, zfs_process_add, is_slice) && devpath != NULL) + (void) devphys_iter(devpath, devid, zfs_process_add, is_slice); + + return (0); +} + +/* + * Called when we receive a VDEV_CHECK event, which indicates a device could not + * be opened during initial pool open, but the autoreplace property was set on + * the pool. In this case, we treat it as if it were an add event. 
+ */ +static int +zfs_deliver_check(nvlist_t *nvl) +{ + dev_data_t data = { 0 }; + + if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, + &data.dd_pool_guid) != 0 || + nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, + &data.dd_vdev_guid) != 0 || + data.dd_vdev_guid == 0) + return (0); + + zed_log_msg(LOG_INFO, "zfs_deliver_check: pool '%llu', vdev %llu", + data.dd_pool_guid, data.dd_vdev_guid); + + data.dd_func = zfs_process_add; + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (0); +} + +static int +zfsdle_vdev_online(zpool_handle_t *zhp, void *data) +{ + char *devname = data; + boolean_t avail_spare, l2cache; + nvlist_t *tgt; + int error; + + zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'", + devname, zpool_get_name(zhp)); + + if ((tgt = zpool_find_vdev_by_physpath(zhp, devname, + &avail_spare, &l2cache, NULL)) != NULL) { + char *path, fullpath[MAXPATHLEN]; + uint64_t wholedisk; + + error = nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &path); + if (error) { + zpool_close(zhp); + return (0); + } + + error = nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk); + if (error) + wholedisk = 0; + + if (wholedisk) { + path = strrchr(path, '/'); + if (path != NULL) { + path = zfs_strip_partition(path + 1); + if (path == NULL) { + zpool_close(zhp); + return (0); + } + } else { + zpool_close(zhp); + return (0); + } + + (void) strlcpy(fullpath, path, sizeof (fullpath)); + free(path); + + /* + * We need to reopen the pool associated with this + * device so that the kernel can update the size of + * the expanded device. When expanding there is no + * need to restart the scrub from the beginning. 
+ */ + boolean_t scrub_restart = B_FALSE; + (void) zpool_reopen_one(zhp, &scrub_restart); + } else { + (void) strlcpy(fullpath, path, sizeof (fullpath)); + } + + if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) { + vdev_state_t newstate; + + if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { + error = zpool_vdev_online(zhp, fullpath, 0, + &newstate); + zed_log_msg(LOG_INFO, "zfsdle_vdev_online: " + "setting device '%s' to ONLINE state " + "in pool '%s': %d", fullpath, + zpool_get_name(zhp), error); + } + } + zpool_close(zhp); + return (1); + } + zpool_close(zhp); + return (0); +} + +/* + * This function handles the ESC_DEV_DLE device change event. Use the + * provided vdev guid when looking up a disk or partition, when the guid + * is not present assume the entire disk is owned by ZFS and append the + * expected -part1 partition information then lookup by physical path. + */ +static int +zfs_deliver_dle(nvlist_t *nvl) +{ + char *devname, name[MAXPATHLEN]; + uint64_t guid; + + if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &guid) == 0) { + sprintf(name, "%llu", (u_longlong_t)guid); + } else if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) == 0) { + strlcpy(name, devname, MAXPATHLEN); + zfs_append_partition(name, MAXPATHLEN); + } else { + zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath"); + } + + if (zpool_iter(g_zfshdl, zfsdle_vdev_online, name) != 1) { + zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not " + "found", name); + return (1); + } + + return (0); +} + +/* + * syseventd daemon module event handler + * + * Handles syseventd daemon zfs device related events: + * + * EC_DEV_ADD.ESC_DISK + * EC_DEV_STATUS.ESC_DEV_DLE + * EC_ZFS.ESC_ZFS_VDEV_CHECK + * + * Note: assumes only one thread active at a time (not thread safe) + */ +static int +zfs_slm_deliver_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + int ret; + boolean_t is_lofi = B_FALSE, is_check = B_FALSE, is_dle = B_FALSE; + + if (strcmp(class, EC_DEV_ADD) 
== 0) { + /* + * We're mainly interested in disk additions, but we also listen + * for new loop devices, to allow for simplified testing. + */ + if (strcmp(subclass, ESC_DISK) == 0) + is_lofi = B_FALSE; + else if (strcmp(subclass, ESC_LOFI) == 0) + is_lofi = B_TRUE; + else + return (0); + + is_check = B_FALSE; + } else if (strcmp(class, EC_ZFS) == 0 && + strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) { + /* + * This event signifies that a device failed to open + * during pool load, but the 'autoreplace' property was + * set, so we should pretend it's just been added. + */ + is_check = B_TRUE; + } else if (strcmp(class, EC_DEV_STATUS) == 0 && + strcmp(subclass, ESC_DEV_DLE) == 0) { + is_dle = B_TRUE; + } else { + return (0); + } + + if (is_dle) + ret = zfs_deliver_dle(nvl); + else if (is_check) + ret = zfs_deliver_check(nvl); + else + ret = zfs_deliver_add(nvl, is_lofi); + + return (ret); +} + +/*ARGSUSED*/ +static void * +zfs_enum_pools(void *arg) +{ + (void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list); + /* + * Linux - instead of using a thread pool, each list entry + * will spawn a thread when an unavailable pool transitions + * to available. zfs_slm_fini will wait for these threads. 
+ */ + g_enumeration_done = B_TRUE; + return (NULL); +} + +/* + * called from zed daemon at startup + * + * sent messages from zevents or udev monitor + * + * For now, each agent has its own libzfs instance + */ +int +zfs_slm_init() +{ + if ((g_zfshdl = libzfs_init()) == NULL) + return (-1); + + /* + * collect a list of unavailable pools (asynchronously, + * since this can take a while) + */ + list_create(&g_pool_list, sizeof (struct unavailpool), + offsetof(struct unavailpool, uap_node)); + + if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) { + list_destroy(&g_pool_list); + libzfs_fini(g_zfshdl); + return (-1); + } + + list_create(&g_device_list, sizeof (struct pendingdev), + offsetof(struct pendingdev, pd_node)); + + return (0); +} + +void +zfs_slm_fini() +{ + unavailpool_t *pool; + pendingdev_t *device; + + /* wait for zfs_enum_pools thread to complete */ + (void) pthread_join(g_zfs_tid, NULL); + /* destroy the thread pool */ + if (g_tpool != NULL) { + tpool_wait(g_tpool); + tpool_destroy(g_tpool); + } + + while ((pool = (list_head(&g_pool_list))) != NULL) { + list_remove(&g_pool_list, pool); + zpool_close(pool->uap_zhp); + free(pool); + } + list_destroy(&g_pool_list); + + while ((device = (list_head(&g_device_list))) != NULL) { + list_remove(&g_device_list, device); + free(device); + } + list_destroy(&g_device_list); + + libzfs_fini(g_zfshdl); +} + +void +zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass); + (void) zfs_slm_deliver_event(class, subclass, nvl); +} diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c new file mode 100644 index 000000000000..9e95e20d5683 --- /dev/null +++ b/cmd/zed/agents/zfs_retire.c @@ -0,0 +1,557 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com> + */ + +/* + * The ZFS retire agent is responsible for managing hot spares across all pools. + * When we see a device fault or a device removal, we try to open the associated + * pool and look for any hot spares. We iterate over any available hot spares + * and attempt a 'zpool replace' for each one. + * + * For vdevs diagnosed as faulty, the agent is also responsible for proactively + * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors). 
+ */ + +#include <sys/fs/zfs.h> +#include <sys/fm/protocol.h> +#include <sys/fm/fs/zfs.h> +#include <libzfs.h> +#include <string.h> + +#include "zfs_agents.h" +#include "fmd_api.h" + + +typedef struct zfs_retire_repaired { + struct zfs_retire_repaired *zrr_next; + uint64_t zrr_pool; + uint64_t zrr_vdev; +} zfs_retire_repaired_t; + +typedef struct zfs_retire_data { + libzfs_handle_t *zrd_hdl; + zfs_retire_repaired_t *zrd_repaired; +} zfs_retire_data_t; + +static void +zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp) +{ + zfs_retire_repaired_t *zrp; + + while ((zrp = zdp->zrd_repaired) != NULL) { + zdp->zrd_repaired = zrp->zrr_next; + fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t)); + } +} + +/* + * Find a pool with a matching GUID. + */ +typedef struct find_cbdata { + uint64_t cb_guid; + zpool_handle_t *cb_zhp; + nvlist_t *cb_vdev; +} find_cbdata_t; + +static int +find_pool(zpool_handle_t *zhp, void *data) +{ + find_cbdata_t *cbp = data; + + if (cbp->cb_guid == + zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) { + cbp->cb_zhp = zhp; + return (1); + } + + zpool_close(zhp); + return (0); +} + +/* + * Find a vdev within a tree with a matching GUID. 
+ */ +static nvlist_t * +find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) +{ + uint64_t guid; + nvlist_t **child; + uint_t c, children; + nvlist_t *ret; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && + guid == search_guid) { + fmd_hdl_debug(fmd_module_hdl("zfs-retire"), + "matched vdev %llu", guid); + return (nv); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return (NULL); + + for (c = 0; c < children; c++) { + if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) + return (ret); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) != 0) + return (NULL); + + for (c = 0; c < children; c++) { + if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) + return (ret); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) != 0) + return (NULL); + + for (c = 0; c < children; c++) { + if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) + return (ret); + } + + return (NULL); +} + +/* + * Given a (pool, vdev) GUID pair, find the matching pool and vdev. + */ +static zpool_handle_t * +find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, + nvlist_t **vdevp) +{ + find_cbdata_t cb; + zpool_handle_t *zhp; + nvlist_t *config, *nvroot; + + /* + * Find the corresponding pool and make sure the vdev still exists. + */ + cb.cb_guid = pool_guid; + if (zpool_iter(zhdl, find_pool, &cb) != 1) + return (NULL); + + zhp = cb.cb_zhp; + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) != 0) { + zpool_close(zhp); + return (NULL); + } + + if (vdev_guid != 0) { + if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid)) == NULL) { + zpool_close(zhp); + return (NULL); + } + } + + return (zhp); +} + +/* + * Given a vdev, attempt to replace it with every known spare until one + * succeeds or we run out of devices to try. 
+ * Return whether we were successful or not in replacing the device. + */ +static boolean_t +replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) +{ + nvlist_t *config, *nvroot, *replacement; + nvlist_t **spares; + uint_t s, nspares; + char *dev_name; + zprop_source_t source; + int ashift; + + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) != 0) + return (B_FALSE); + + /* + * Find out if there are any hot spares available in the pool. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) != 0) + return (B_FALSE); + + /* + * lookup "ashift" pool property, we may need it for the replacement + */ + ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &source); + + replacement = fmd_nvl_alloc(hdl, FMD_SLEEP); + + (void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT); + + dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); + + /* + * Try to replace each spare, ending when we successfully + * replace it. + */ + for (s = 0; s < nspares; s++) { + char *spare_name; + + if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, + &spare_name) != 0) + continue; + + /* if set, add the "ashift" pool property to the spare nvlist */ + if (source != ZPROP_SRC_DEFAULT) + (void) nvlist_add_uint64(spares[s], + ZPOOL_CONFIG_ASHIFT, ashift); + + (void) nvlist_add_nvlist_array(replacement, + ZPOOL_CONFIG_CHILDREN, &spares[s], 1); + + fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'", + dev_name, basename(spare_name)); + + if (zpool_vdev_attach(zhp, dev_name, spare_name, + replacement, B_TRUE, B_FALSE) == 0) { + free(dev_name); + nvlist_free(replacement); + return (B_TRUE); + } + } + + free(dev_name); + nvlist_free(replacement); + + return (B_FALSE); +} + +/* + * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and + * ASRU is now usable. ZFS has found the device to be present and + * functioning. 
+ */ +/*ARGSUSED*/ +static void +zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl) +{ + zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); + zfs_retire_repaired_t *zrp; + uint64_t pool_guid, vdev_guid; + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + &pool_guid) != 0 || nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) + return; + + /* + * Before checking the state of the ASRU, go through and see if we've + * already made an attempt to repair this ASRU. This list is cleared + * whenever we receive any kind of list event, and is designed to + * prevent us from generating a feedback loop when we attempt repairs + * against a faulted pool. The problem is that checking the unusable + * state of the ASRU can involve opening the pool, which can post + * statechange events but otherwise leave the pool in the faulted + * state. This list allows us to detect when a statechange event is + * due to our own request. + */ + for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) { + if (zrp->zrr_pool == pool_guid && + zrp->zrr_vdev == vdev_guid) + return; + } + + zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP); + zrp->zrr_next = zdp->zrd_repaired; + zrp->zrr_pool = pool_guid; + zrp->zrr_vdev = vdev_guid; + zdp->zrd_repaired = zrp; + + fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu", + vdev_guid, pool_guid); +} + +/*ARGSUSED*/ +static void +zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, + const char *class) +{ + uint64_t pool_guid, vdev_guid; + zpool_handle_t *zhp; + nvlist_t *resource, *fault; + nvlist_t **faults; + uint_t f, nfaults; + zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); + libzfs_handle_t *zhdl = zdp->zrd_hdl; + boolean_t fault_device, degrade_device; + boolean_t is_repair; + char *scheme; + nvlist_t *vdev = NULL; + char *uuid; + int repair_done = 0; + boolean_t retire; + boolean_t is_disk; + vdev_aux_t aux; + uint64_t state = 0; + + fmd_hdl_debug(hdl, 
"zfs_retire_recv: '%s'", class); + + nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state); + + /* + * If this is a resource notifying us of device removal then simply + * check for an available spare and continue unless the device is a + * l2arc vdev, in which case we just offline it. + */ + if (strcmp(class, "resource.fs.zfs.removed") == 0 || + (strcmp(class, "resource.fs.zfs.statechange") == 0 && + state == VDEV_STATE_REMOVED)) { + char *devtype; + char *devname; + + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + &pool_guid) != 0 || + nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + &vdev_guid) != 0) + return; + + if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, + &vdev)) == NULL) + return; + + devname = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); + + /* Can't replace l2arc with a spare: offline the device */ + if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + &devtype) == 0 && strcmp(devtype, VDEV_TYPE_L2CACHE) == 0) { + fmd_hdl_debug(hdl, "zpool_vdev_offline '%s'", devname); + zpool_vdev_offline(zhp, devname, B_TRUE); + } else if (!fmd_prop_get_int32(hdl, "spare_on_remove") || + replace_with_spare(hdl, zhp, vdev) == B_FALSE) { + /* Could not handle with spare */ + fmd_hdl_debug(hdl, "no spare for '%s'", devname); + } + + free(devname); + zpool_close(zhp); + return; + } + + if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) + return; + + /* + * Note: on zfsonlinux statechange events are more than just + * healthy ones so we need to confirm the actual state value. 
+ */ + if (strcmp(class, "resource.fs.zfs.statechange") == 0 && + state == VDEV_STATE_HEALTHY) { + zfs_vdev_repair(hdl, nvl); + return; + } + if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) { + zfs_vdev_repair(hdl, nvl); + return; + } + + zfs_retire_clear_data(hdl, zdp); + + if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) + is_repair = B_TRUE; + else + is_repair = B_FALSE; + + /* + * We subscribe to zfs faults as well as all repair events. + */ + if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, + &faults, &nfaults) != 0) + return; + + for (f = 0; f < nfaults; f++) { + fault = faults[f]; + + fault_device = B_FALSE; + degrade_device = B_FALSE; + is_disk = B_FALSE; + + if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE, + &retire) == 0 && retire == 0) + continue; + + /* + * While we subscribe to fault.fs.zfs.*, we only take action + * for faults targeting a specific vdev (open failure or SERD + * failure). We also subscribe to fault.io.* events, so that + * faulty disks will be faulted in the ZFS configuration. + */ + if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) { + fault_device = B_TRUE; + } else if (fmd_nvl_class_match(hdl, fault, + "fault.fs.zfs.vdev.checksum")) { + degrade_device = B_TRUE; + } else if (fmd_nvl_class_match(hdl, fault, + "fault.fs.zfs.device")) { + fault_device = B_FALSE; + } else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) { + is_disk = B_TRUE; + fault_device = B_TRUE; + } else { + continue; + } + + if (is_disk) { + continue; + } else { + /* + * This is a ZFS fault. Lookup the resource, and + * attempt to find the matching vdev. 
+ */ + if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, + &resource) != 0 || + nvlist_lookup_string(resource, FM_FMRI_SCHEME, + &scheme) != 0) + continue; + + if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0) + continue; + + if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL, + &pool_guid) != 0) + continue; + + if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV, + &vdev_guid) != 0) { + if (is_repair) + vdev_guid = 0; + else + continue; + } + + if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, + &vdev)) == NULL) + continue; + + aux = VDEV_AUX_ERR_EXCEEDED; + } + + if (vdev_guid == 0) { + /* + * For pool-level repair events, clear the entire pool. + */ + fmd_hdl_debug(hdl, "zpool_clear of pool '%s'", + zpool_get_name(zhp)); + (void) zpool_clear(zhp, NULL, NULL); + zpool_close(zhp); + continue; + } + + /* + * If this is a repair event, then mark the vdev as repaired and + * continue. + */ + if (is_repair) { + repair_done = 1; + fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu", + zpool_get_name(zhp), vdev_guid); + (void) zpool_vdev_clear(zhp, vdev_guid); + zpool_close(zhp); + continue; + } + + /* + * Actively fault the device if needed. + */ + if (fault_device) + (void) zpool_vdev_fault(zhp, vdev_guid, aux); + if (degrade_device) + (void) zpool_vdev_degrade(zhp, vdev_guid, aux); + + if (fault_device || degrade_device) + fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'", + fault_device ? "fault" : "degrade", vdev_guid, + zpool_get_name(zhp)); + + /* + * Attempt to substitute a hot spare. 
+ */ + (void) replace_with_spare(hdl, zhp, vdev); + zpool_close(zhp); + } + + if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done && + nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0) + fmd_case_uuresolved(hdl, uuid); +} + +static const fmd_hdl_ops_t fmd_ops = { + zfs_retire_recv, /* fmdo_recv */ + NULL, /* fmdo_timeout */ + NULL, /* fmdo_close */ + NULL, /* fmdo_stats */ + NULL, /* fmdo_gc */ +}; + +static const fmd_prop_t fmd_props[] = { + { "spare_on_remove", FMD_TYPE_BOOL, "true" }, + { NULL, 0, NULL } +}; + +static const fmd_hdl_info_t fmd_info = { + "ZFS Retire Agent", "1.0", &fmd_ops, fmd_props +}; + +void +_zfs_retire_init(fmd_hdl_t *hdl) +{ + zfs_retire_data_t *zdp; + libzfs_handle_t *zhdl; + + if ((zhdl = libzfs_init()) == NULL) + return; + + if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { + libzfs_fini(zhdl); + return; + } + + zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP); + zdp->zrd_hdl = zhdl; + + fmd_hdl_setspecific(hdl, zdp); +} + +void +_zfs_retire_fini(fmd_hdl_t *hdl) +{ + zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); + + if (zdp != NULL) { + zfs_retire_clear_data(hdl, zdp); + libzfs_fini(zdp->zrd_hdl); + fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t)); + } +} diff --git a/cmd/zed/zed.c b/cmd/zed/zed.c new file mode 100644 index 000000000000..0784e3834733 --- /dev/null +++ b/cmd/zed/zed.c @@ -0,0 +1,306 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). 
+ * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + */ + +#include <errno.h> +#include <fcntl.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <unistd.h> +#include "zed.h" +#include "zed_conf.h" +#include "zed_event.h" +#include "zed_file.h" +#include "zed_log.h" + +static volatile sig_atomic_t _got_exit = 0; +static volatile sig_atomic_t _got_hup = 0; + +/* + * Signal handler for SIGINT & SIGTERM. + */ +static void +_exit_handler(int signum) +{ + _got_exit = 1; +} + +/* + * Signal handler for SIGHUP. + */ +static void +_hup_handler(int signum) +{ + _got_hup = 1; +} + +/* + * Register signal handlers. + */ +static void +_setup_sig_handlers(void) +{ + struct sigaction sa; + + if (sigemptyset(&sa.sa_mask) < 0) + zed_log_die("Failed to initialize sigset"); + + sa.sa_flags = SA_RESTART; + sa.sa_handler = SIG_IGN; + + if (sigaction(SIGPIPE, &sa, NULL) < 0) + zed_log_die("Failed to ignore SIGPIPE"); + + sa.sa_handler = _exit_handler; + if (sigaction(SIGINT, &sa, NULL) < 0) + zed_log_die("Failed to register SIGINT handler"); + + if (sigaction(SIGTERM, &sa, NULL) < 0) + zed_log_die("Failed to register SIGTERM handler"); + + sa.sa_handler = _hup_handler; + if (sigaction(SIGHUP, &sa, NULL) < 0) + zed_log_die("Failed to register SIGHUP handler"); +} + +/* + * Lock all current and future pages in the virtual memory address space. + * Access to locked pages will never be delayed by a page fault. + * + * EAGAIN is tested up to max_tries in case this is a transient error. + * + * Note that memory locks are not inherited by a child created via fork() + * and are automatically removed during an execve(). As such, this must + * be called after the daemon fork()s (when running in the background). 
+ */ +static void +_lock_memory(void) +{ +#if HAVE_MLOCKALL + int i = 0; + const int max_tries = 10; + + for (i = 0; i < max_tries; i++) { + if (mlockall(MCL_CURRENT | MCL_FUTURE) == 0) { + zed_log_msg(LOG_INFO, "Locked all pages in memory"); + return; + } + if (errno != EAGAIN) + break; + } + zed_log_die("Failed to lock memory pages: %s", strerror(errno)); + +#else /* HAVE_MLOCKALL */ + zed_log_die("Failed to lock memory pages: mlockall() not supported"); +#endif /* HAVE_MLOCKALL */ +} + +/* + * Start daemonization of the process including the double fork(). + * + * The parent process will block here until _finish_daemonize() is called + * (in the grandchild process), at which point the parent process will exit. + * This prevents the parent process from exiting until initialization is + * complete. + */ +static void +_start_daemonize(void) +{ + pid_t pid; + struct sigaction sa; + + /* Create pipe for communicating with child during daemonization. */ + zed_log_pipe_open(); + + /* Background process and ensure child is not process group leader. */ + pid = fork(); + if (pid < 0) { + zed_log_die("Failed to create child process: %s", + strerror(errno)); + } else if (pid > 0) { + + /* Close writes since parent will only read from pipe. */ + zed_log_pipe_close_writes(); + + /* Wait for notification that daemonization is complete. */ + zed_log_pipe_wait(); + + zed_log_pipe_close_reads(); + _exit(EXIT_SUCCESS); + } + + /* Close reads since child will only write to pipe. */ + zed_log_pipe_close_reads(); + + /* Create independent session and detach from terminal. */ + if (setsid() < 0) + zed_log_die("Failed to create new session: %s", + strerror(errno)); + + /* Prevent child from terminating on HUP when session leader exits. 
*/ + if (sigemptyset(&sa.sa_mask) < 0) + zed_log_die("Failed to initialize sigset"); + + sa.sa_flags = 0; + sa.sa_handler = SIG_IGN; + + if (sigaction(SIGHUP, &sa, NULL) < 0) + zed_log_die("Failed to ignore SIGHUP"); + + /* Ensure process cannot re-acquire terminal. */ + pid = fork(); + if (pid < 0) { + zed_log_die("Failed to create grandchild process: %s", + strerror(errno)); + } else if (pid > 0) { + _exit(EXIT_SUCCESS); + } +} + +/* + * Finish daemonization of the process by closing stdin/stdout/stderr. + * + * This must be called at the end of initialization after all external + * communication channels are established and accessible. + */ +static void +_finish_daemonize(void) +{ + int devnull; + + /* Preserve fd 0/1/2, but discard data to/from stdin/stdout/stderr. */ + devnull = open("/dev/null", O_RDWR); + if (devnull < 0) + zed_log_die("Failed to open /dev/null: %s", strerror(errno)); + + if (dup2(devnull, STDIN_FILENO) < 0) + zed_log_die("Failed to dup /dev/null onto stdin: %s", + strerror(errno)); + + if (dup2(devnull, STDOUT_FILENO) < 0) + zed_log_die("Failed to dup /dev/null onto stdout: %s", + strerror(errno)); + + if (dup2(devnull, STDERR_FILENO) < 0) + zed_log_die("Failed to dup /dev/null onto stderr: %s", + strerror(errno)); + + if ((devnull > STDERR_FILENO) && (close(devnull) < 0)) + zed_log_die("Failed to close /dev/null: %s", strerror(errno)); + + /* Notify parent that daemonization is complete. */ + zed_log_pipe_close_writes(); +} + +/* + * ZFS Event Daemon (ZED). 
+ */ +int +main(int argc, char *argv[]) +{ + struct zed_conf *zcp; + uint64_t saved_eid; + int64_t saved_etime[2]; + + zed_log_init(argv[0]); + zed_log_stderr_open(LOG_NOTICE); + zcp = zed_conf_create(); + zed_conf_parse_opts(zcp, argc, argv); + if (zcp->do_verbose) + zed_log_stderr_open(LOG_INFO); + + if (geteuid() != 0) + zed_log_die("Must be run as root"); + + zed_conf_parse_file(zcp); + + zed_file_close_from(STDERR_FILENO + 1); + + (void) umask(0); + + if (chdir("/") < 0) + zed_log_die("Failed to change to root directory"); + + if (zed_conf_scan_dir(zcp) < 0) + exit(EXIT_FAILURE); + + if (!zcp->do_foreground) { + _start_daemonize(); + zed_log_syslog_open(LOG_DAEMON); + } + _setup_sig_handlers(); + + if (zcp->do_memlock) + _lock_memory(); + + if ((zed_conf_write_pid(zcp) < 0) && (!zcp->do_force)) + exit(EXIT_FAILURE); + + if (!zcp->do_foreground) + _finish_daemonize(); + + zed_log_msg(LOG_NOTICE, + "ZFS Event Daemon %s-%s (PID %d)", + ZFS_META_VERSION, ZFS_META_RELEASE, (int)getpid()); + + if (zed_conf_open_state(zcp) < 0) + exit(EXIT_FAILURE); + + if (zed_conf_read_state(zcp, &saved_eid, saved_etime) < 0) + exit(EXIT_FAILURE); + +idle: + /* + * If -I is specified, attempt to open /dev/zfs repeatedly until + * successful. + */ + do { + if (!zed_event_init(zcp)) + break; + /* Wait for some time and try again. tunable? 
*/ + sleep(30); + } while (!_got_exit && zcp->do_idle); + + if (_got_exit) + goto out; + + zed_event_seek(zcp, saved_eid, saved_etime); + + while (!_got_exit) { + int rv; + if (_got_hup) { + _got_hup = 0; + (void) zed_conf_scan_dir(zcp); + } + rv = zed_event_service(zcp); + + /* ENODEV: When kernel module is unloaded (osx) */ + if (rv == ENODEV) + break; + } + + zed_log_msg(LOG_NOTICE, "Exiting"); + zed_event_fini(zcp); + + if (zcp->do_idle && !_got_exit) + goto idle; + +out: + zed_conf_destroy(zcp); + zed_log_fini(); + exit(EXIT_SUCCESS); +} diff --git a/cmd/zed/zed.d/.gitignore b/cmd/zed/zed.d/.gitignore new file mode 100644 index 000000000000..46a00945aa7c --- /dev/null +++ b/cmd/zed/zed.d/.gitignore @@ -0,0 +1 @@ +history_event-zfs-list-cacher.sh diff --git a/cmd/zed/zed.d/Makefile.am b/cmd/zed/zed.d/Makefile.am new file mode 100644 index 000000000000..8b2d0c200286 --- /dev/null +++ b/cmd/zed/zed.d/Makefile.am @@ -0,0 +1,53 @@ +include $(top_srcdir)/config/Rules.am +include $(top_srcdir)/config/Substfiles.am + +EXTRA_DIST += README + +zedconfdir = $(sysconfdir)/zfs/zed.d + +dist_zedconf_DATA = \ + zed-functions.sh \ + zed.rc + +zedexecdir = $(zfsexecdir)/zed.d + +dist_zedexec_SCRIPTS = \ + all-debug.sh \ + all-syslog.sh \ + data-notify.sh \ + generic-notify.sh \ + resilver_finish-notify.sh \ + scrub_finish-notify.sh \ + statechange-led.sh \ + statechange-notify.sh \ + vdev_clear-led.sh \ + vdev_attach-led.sh \ + pool_import-led.sh \ + resilver_finish-start-scrub.sh \ + trim_finish-notify.sh + +nodist_zedexec_SCRIPTS = history_event-zfs-list-cacher.sh + +SUBSTFILES += $(nodist_zedexec_SCRIPTS) + +zedconfdefaults = \ + all-syslog.sh \ + data-notify.sh \ + history_event-zfs-list-cacher.sh \ + resilver_finish-notify.sh \ + scrub_finish-notify.sh \ + statechange-led.sh \ + statechange-notify.sh \ + vdev_clear-led.sh \ + vdev_attach-led.sh \ + pool_import-led.sh \ + resilver_finish-start-scrub.sh + +install-data-hook: + $(MKDIR_P) "$(DESTDIR)$(zedconfdir)" + for f in 
$(zedconfdefaults); do \ + test -f "$(DESTDIR)$(zedconfdir)/$${f}" -o \ + -L "$(DESTDIR)$(zedconfdir)/$${f}" || \ + ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \ + done + chmod 0600 "$(DESTDIR)$(zedconfdir)/zed.rc" diff --git a/cmd/zed/zed.d/README b/cmd/zed/zed.d/README new file mode 100644 index 000000000000..7279b93704e2 --- /dev/null +++ b/cmd/zed/zed.d/README @@ -0,0 +1,30 @@ +Shell scripts are the recommended choice for ZEDLETs that mostly call +other utilities and do relatively little data manipulation. + +Shell scripts MUST work on both bash and dash. + +Shell scripts MUST run cleanly through ShellCheck: + http://www.shellcheck.net/ + +General functions reside in "zed-functions.sh". Use them where applicable. + +Additional references that may be of use: + + Google Shell Style Guide + https://github.com/google/styleguide/blob/gh-pages/shell.xml + + Dash as /bin/sh + https://wiki.ubuntu.com/DashAsBinSh + + Common shell script mistakes + http://www.pixelbeat.org/programming/shell_script_mistakes.html + + Filenames and Pathnames in Shell: How to do it Correctly + http://www.dwheeler.com/essays/filenames-in-shell.html + + Autoconf: Portable Shell Programming + https://www.gnu.org/software/autoconf/manual/autoconf.html#Portable-Shell + +Please BE CONSISTENT with the existing style, check for errors, +minimize dependencies where possible, try to be portable, +and comment anything non-obvious. Festina lente. diff --git a/cmd/zed/zed.d/all-debug.sh b/cmd/zed/zed.d/all-debug.sh new file mode 100755 index 000000000000..14b39caacd9d --- /dev/null +++ b/cmd/zed/zed.d/all-debug.sh @@ -0,0 +1,26 @@ +#!/bin/sh +# +# Log all environment variables to ZED_DEBUG_LOG. +# +# This can be a useful aid when developing/debugging ZEDLETs since it shows the +# environment variables defined for each zevent. + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed-functions.sh" + +: "${ZED_DEBUG_LOG:="${TMPDIR:="/tmp"}/zed.debug.log"}" + +zed_exit_if_ignoring_this_event + +lockfile="$(basename -- "${ZED_DEBUG_LOG}").lock" + +umask 077 +zed_lock "${lockfile}" +exec >> "${ZED_DEBUG_LOG}" + +printenv | sort +echo + +exec >&- +zed_unlock "${lockfile}" +exit 0 diff --git a/cmd/zed/zed.d/all-syslog.sh b/cmd/zed/zed.d/all-syslog.sh new file mode 100755 index 000000000000..cb9286500136 --- /dev/null +++ b/cmd/zed/zed.d/all-syslog.sh @@ -0,0 +1,14 @@ +#!/bin/sh +# +# Log the zevent via syslog. + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +zed_exit_if_ignoring_this_event + +zed_log_msg "eid=${ZEVENT_EID}" "class=${ZEVENT_SUBCLASS}" \ + "${ZEVENT_POOL_GUID:+"pool_guid=${ZEVENT_POOL_GUID}"}" \ + "${ZEVENT_VDEV_PATH:+"vdev_path=${ZEVENT_VDEV_PATH}"}" \ + "${ZEVENT_VDEV_STATE_STR:+"vdev_state=${ZEVENT_VDEV_STATE_STR}"}" +exit 0 diff --git a/cmd/zed/zed.d/data-notify.sh b/cmd/zed/zed.d/data-notify.sh new file mode 100755 index 000000000000..639b459bdd3b --- /dev/null +++ b/cmd/zed/zed.d/data-notify.sh @@ -0,0 +1,43 @@ +#!/bin/sh +# +# Send notification in response to a DATA error. +# +# Only one notification per ZED_NOTIFY_INTERVAL_SECS will be sent for a given +# class/pool/[vdev] combination. This protects against spamming the recipient +# should multiple events occur together in time for the same pool/[vdev]. +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 3: notification suppressed +# 9: internal error + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZEVENT_POOL}" ] || exit 9 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 +[ -n "${ZED_NOTIFY_DATA}" ] || exit 3 + +rate_limit_tag="${ZEVENT_POOL};${ZEVENT_VDEV_GUID:-0};${ZEVENT_SUBCLASS};notify" +zed_rate_limit "${rate_limit_tag}" || exit 3 + +umask 077 +note_subject="ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on $(hostname)" +note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + echo "ZFS has detected a data error:" + echo + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + echo " error: ${ZEVENT_ZIO_ERR}" + echo " objid: ${ZEVENT_ZIO_OBJSET}:${ZEVENT_ZIO_OBJECT}" + echo " pool: ${ZEVENT_POOL}" +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? +rm -f "${note_pathname}" +exit "${rv}" diff --git a/cmd/zed/zed.d/generic-notify.sh b/cmd/zed/zed.d/generic-notify.sh new file mode 100755 index 000000000000..e438031a088a --- /dev/null +++ b/cmd/zed/zed.d/generic-notify.sh @@ -0,0 +1,54 @@ +#!/bin/sh +# +# Send notification in response to a given zevent. +# +# This is a generic script than can be symlinked to a file in the +# enabled-zedlets directory to have a notification sent when a particular +# class of zevents occurs. The symlink filename must begin with the zevent +# (sub)class string (e.g., "probe_failure-notify.sh" for the "probe_failure" +# subclass). Refer to the zed(8) manpage for details. +# +# Only one notification per ZED_NOTIFY_INTERVAL_SECS will be sent for a given +# class/pool combination. This protects against spamming the recipient +# should multiple events occur together in time for the same pool. +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 3: notification suppressed + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed-functions.sh" + +# Rate-limit the notification based in part on the filename. +# +rate_limit_tag="${ZEVENT_POOL};${ZEVENT_SUBCLASS};$(basename -- "$0")" +rate_limit_interval="${ZED_NOTIFY_INTERVAL_SECS}" +zed_rate_limit "${rate_limit_tag}" "${rate_limit_interval}" || exit 3 + +umask 077 +pool_str="${ZEVENT_POOL:+" for ${ZEVENT_POOL}"}" +host_str=" on $(hostname)" +note_subject="ZFS ${ZEVENT_SUBCLASS} event${pool_str}${host_str}" +note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + echo "ZFS has posted the following event:" + echo + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + + [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}" + [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}" + [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}" + + [ -n "${ZEVENT_POOL}" ] && [ -x "${ZPOOL}" ] \ + && "${ZPOOL}" status "${ZEVENT_POOL}" + +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? +rm -f "${note_pathname}" +exit "${rv}" diff --git a/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in b/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in new file mode 100755 index 000000000000..053b4414a768 --- /dev/null +++ b/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in @@ -0,0 +1,85 @@ +#!/bin/sh +# +# Track changes to enumerated pools for use in early-boot +set -ef + +FSLIST_DIR="@sysconfdir@/zfs/zfs-list.cache" +FSLIST_TMP="@runstatedir@/zfs-list.cache.new" +FSLIST="${FSLIST_DIR}/${ZEVENT_POOL}" + +# If the pool specific cache file is not writeable, abort +[ -w "${FSLIST}" ] || exit 0 + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed-functions.sh" + +zed_exit_if_ignoring_this_event +zed_check_cmd "${ZFS}" sort diff grep + +# If we are acting on a snapshot, we have nothing to do +printf '%s' "${ZEVENT_HISTORY_DSNAME}" | grep '@' && exit 0 + +# We obtain a lock on zfs-list to avoid any simultaneous writes. +# If we run into trouble, log and drop the lock +abort_alter() { + zed_log_msg "Error updating zfs-list.cache!" + zed_unlock zfs-list +} + +finished() { + zed_unlock zfs-list + trap - EXIT + exit 0 +} + +case "${ZEVENT_HISTORY_INTERNAL_NAME}" in + create|"finish receiving"|import|destroy|rename) + ;; + + export) + zed_lock zfs-list + trap abort_alter EXIT + echo > "${FSLIST}" + finished + ;; + + set|inherit) + # Only act if one of the tracked properties is altered. + case "${ZEVENT_HISTORY_INTERNAL_STR%%=*}" in + canmount|mountpoint|atime|relatime|devices|exec|readonly| \ + setuid|nbmand|encroot|keylocation|org.openzfs.systemd:requires| \ + org.openzfs.systemd:requires-mounts-for| \ + org.openzfs.systemd:before|org.openzfs.systemd:after| \ + org.openzfs.systemd:wanted-by|org.openzfs.systemd:required-by| \ + org.openzfs.systemd:nofail|org.openzfs.systemd:ignore \ + ) ;; + *) exit 0 ;; + esac + ;; + + *) + # Ignore all other events. 
+ exit 0 + ;; +esac + +zed_lock zfs-list +trap abort_alter EXIT + +PROPS="name,mountpoint,canmount,atime,relatime,devices,exec\ +,readonly,setuid,nbmand,encroot,keylocation\ +,org.openzfs.systemd:requires,org.openzfs.systemd:requires-mounts-for\ +,org.openzfs.systemd:before,org.openzfs.systemd:after\ +,org.openzfs.systemd:wanted-by,org.openzfs.systemd:required-by\ +,org.openzfs.systemd:nofail,org.openzfs.systemd:ignore" + +"${ZFS}" list -H -t filesystem -o $PROPS -r "${ZEVENT_POOL}" > "${FSLIST_TMP}" + +# Sort the output so that it is stable +sort "${FSLIST_TMP}" -o "${FSLIST_TMP}" + +# Don't modify the file if it hasn't changed +diff -q "${FSLIST_TMP}" "${FSLIST}" || mv "${FSLIST_TMP}" "${FSLIST}" +rm -f "${FSLIST_TMP}" + +finished diff --git a/cmd/zed/zed.d/pool_import-led.sh b/cmd/zed/zed.d/pool_import-led.sh new file mode 120000 index 000000000000..7d7404398a4a --- /dev/null +++ b/cmd/zed/zed.d/pool_import-led.sh @@ -0,0 +1 @@ +statechange-led.sh
\ No newline at end of file diff --git a/cmd/zed/zed.d/resilver_finish-notify.sh b/cmd/zed/zed.d/resilver_finish-notify.sh new file mode 120000 index 000000000000..e4c56bc5f816 --- /dev/null +++ b/cmd/zed/zed.d/resilver_finish-notify.sh @@ -0,0 +1 @@ +scrub_finish-notify.sh
\ No newline at end of file diff --git a/cmd/zed/zed.d/resilver_finish-start-scrub.sh b/cmd/zed/zed.d/resilver_finish-start-scrub.sh new file mode 100755 index 000000000000..c7cfd1ddba80 --- /dev/null +++ b/cmd/zed/zed.d/resilver_finish-start-scrub.sh @@ -0,0 +1,19 @@ +#!/bin/sh +# resilver_finish-start-scrub.sh +# Run a scrub after a resilver +# +# Exit codes: +# 1: Internal error +# 2: Script wasn't enabled in zed.rc +# 3: Scrubs are automatically started for sequential resilvers +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ "${ZED_SCRUB_AFTER_RESILVER}" = "1" ] || exit 2 +[ "${ZEVENT_RESILVER_TYPE}" != "sequential" ] || exit 3 +[ -n "${ZEVENT_POOL}" ] || exit 1 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 1 +zed_check_cmd "${ZPOOL}" || exit 1 + +zed_log_msg "Starting scrub after resilver on ${ZEVENT_POOL}" +"${ZPOOL}" scrub "${ZEVENT_POOL}" diff --git a/cmd/zed/zed.d/scrub_finish-notify.sh b/cmd/zed/zed.d/scrub_finish-notify.sh new file mode 100755 index 000000000000..2145a100a3fa --- /dev/null +++ b/cmd/zed/zed.d/scrub_finish-notify.sh @@ -0,0 +1,59 @@ +#!/bin/sh +# +# Send notification in response to a RESILVER_FINISH or SCRUB_FINISH. +# +# By default, "zpool status" output will only be included for a scrub_finish +# zevent if the pool is not healthy; to always include its output, set +# ZED_NOTIFY_VERBOSE=1. +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 3: notification suppressed +# 9: internal error + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZEVENT_POOL}" ] || exit 9 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 + +if [ "${ZEVENT_SUBCLASS}" = "resilver_finish" ]; then + action="resilver" +elif [ "${ZEVENT_SUBCLASS}" = "scrub_finish" ]; then + action="scrub" +else + zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" + exit 9 +fi + +zed_check_cmd "${ZPOOL}" || exit 9 + +# For scrub, suppress notification if the pool is healthy +# and verbosity is not enabled. +# +if [ "${ZEVENT_SUBCLASS}" = "scrub_finish" ]; then + healthy="$("${ZPOOL}" status -x "${ZEVENT_POOL}" \ + | grep "'${ZEVENT_POOL}' is healthy")" + [ -n "${healthy}" ] && [ "${ZED_NOTIFY_VERBOSE}" -eq 0 ] && exit 3 +fi + +umask 077 +note_subject="ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on $(hostname)" +note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + echo "ZFS has finished a ${action}:" + echo + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + + "${ZPOOL}" status "${ZEVENT_POOL}" + +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? +rm -f "${note_pathname}" +exit "${rv}" diff --git a/cmd/zed/zed.d/statechange-led.sh b/cmd/zed/zed.d/statechange-led.sh new file mode 100755 index 000000000000..e656e125d378 --- /dev/null +++ b/cmd/zed/zed.d/statechange-led.sh @@ -0,0 +1,177 @@ +#!/bin/sh +# +# Turn off/on the VDEV's enclosure fault LEDs when the pool's state changes. +# +# Turn the VDEV's fault LED on if it becomes FAULTED, DEGRADED or UNAVAIL. +# Turn the LED off when it's back ONLINE again. +# +# This script run in two basic modes: +# +# 1. If $ZEVENT_VDEV_ENC_SYSFS_PATH and $ZEVENT_VDEV_STATE_STR are set, then +# only set the LED for that particular VDEV. This is the case for statechange +# events and some vdev_* events. +# +# 2. If those vars are not set, then check the state of all VDEVs in the pool +# and set the LEDs accordingly. 
This is the case for pool_import events. +# +# Note that this script requires that your enclosure be supported by the +# Linux SCSI enclosure services (ses) driver. The script will do nothing +# if you have no enclosure, or if your enclosure isn't supported. +# +# Exit codes: +# 0: enclosure led successfully set +# 1: enclosure leds not available +# 2: enclosure leds administratively disabled +# 3: The led sysfs path passed from ZFS does not exist +# 4: $ZPOOL not set +# 5: awk is not installed + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +if [ ! -d /sys/class/enclosure ] ; then + exit 1 +fi + +if [ "${ZED_USE_ENCLOSURE_LEDS}" != "1" ] ; then + exit 2 +fi + +zed_check_cmd "$ZPOOL" || exit 4 +zed_check_cmd awk || exit 5 + +# Global used in set_led debug print +vdev="" + +# check_and_set_led (file, val) +# +# Read an enclosure sysfs file, and write it if it's not already set to 'val' +# +# Arguments +# file: sysfs file to set (like /sys/class/enclosure/0:0:1:0/SLOT 10/fault) +# val: value to set it to +# +# Return +# 0 on success, 3 on missing sysfs path +# +check_and_set_led() +{ + file="$1" + val="$2" + + if [ ! -e "$file" ] ; then + return 3 + fi + + # If another process is accessing the LED when we attempt to update it, + # the update will be lost so retry until the LED actually changes or we + # timeout. + for _ in $(seq 1 5); do + # We want to check the current state first, since writing to the + # 'fault' entry always causes a SES command, even if the + # current state is already what you want. + current=$(cat "${file}") + + # On some enclosures if you write 1 to fault, and read it back, + # it will return 2. Treat all non-zero values as 1 for + # simplicity. 
+ if [ "$current" != "0" ] ; then + current=1 + fi + + if [ "$current" != "$val" ] ; then + echo "$val" > "$file" + zed_log_msg "vdev $vdev set '$file' LED to $val" + else + break + fi + done +} + +state_to_val() +{ + state="$1" + if [ "$state" = "FAULTED" ] || [ "$state" = "DEGRADED" ] || \ + [ "$state" = "UNAVAIL" ] ; then + echo 1 + elif [ "$state" = "ONLINE" ] ; then + echo 0 + fi +} + +# process_pool ([pool]) +# +# Iterate through a pool (or pools) and set the VDEV's enclosure slot LEDs to +# the VDEV's state. +# +# Arguments +# pool: Optional pool name. If not specified, iterate though all pools. +# +# Return +# 0 on success, 3 on missing sysfs path +# +process_pool() +{ + pool="$1" + rc=0 + + # Lookup all the current LED values and paths in parallel + #shellcheck disable=SC2016 + cmd='echo led_token=$(cat "$VDEV_ENC_SYSFS_PATH/fault"),"$VDEV_ENC_SYSFS_PATH",' + out=$($ZPOOL status -vc "$cmd" "$pool" | grep 'led_token=') + + #shellcheck disable=SC2034 + echo "$out" | while read -r vdev state read write chksum therest; do + # Read out current LED value and path + tmp=$(echo "$therest" | sed 's/^.*led_token=//g') + vdev_enc_sysfs_path=$(echo "$tmp" | awk -F ',' '{print $2}') + current_val=$(echo "$tmp" | awk -F ',' '{print $1}') + + if [ "$current_val" != "0" ] ; then + current_val=1 + fi + + if [ -z "$vdev_enc_sysfs_path" ] ; then + # Skip anything with no sysfs LED entries + continue + fi + + if [ ! -e "$vdev_enc_sysfs_path/fault" ] ; then + #shellcheck disable=SC2030 + rc=1 + zed_log_msg "vdev $vdev '$file/fault' doesn't exist" + continue; + fi + + val=$(state_to_val "$state") + + if [ "$current_val" = "$val" ] ; then + # LED is already set correctly + continue; + fi + + if ! 
check_and_set_led "$vdev_enc_sysfs_path/fault" "$val"; then + rc=1 + fi + + done + + #shellcheck disable=SC2031 + if [ "$rc" = "0" ] ; then + return 0 + else + # We didn't see a sysfs entry that we wanted to set + return 3 + fi +} + +if [ -n "$ZEVENT_VDEV_ENC_SYSFS_PATH" ] && [ -n "$ZEVENT_VDEV_STATE_STR" ] ; then + # Got a statechange for an individual VDEV + val=$(state_to_val "$ZEVENT_VDEV_STATE_STR") + vdev=$(basename "$ZEVENT_VDEV_PATH") + check_and_set_led "$ZEVENT_VDEV_ENC_SYSFS_PATH/fault" "$val" +else + # Process the entire pool + poolname=$(zed_guid_to_pool "$ZEVENT_POOL_GUID") + process_pool "$poolname" +fi diff --git a/cmd/zed/zed.d/statechange-notify.sh b/cmd/zed/zed.d/statechange-notify.sh new file mode 100755 index 000000000000..f46080a03239 --- /dev/null +++ b/cmd/zed/zed.d/statechange-notify.sh @@ -0,0 +1,74 @@ +#!/bin/sh +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License Version 1.0 (CDDL-1.0). +# You can obtain a copy of the license from the top-level file +# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. +# You may not use this file except in compliance with the license. +# +# CDDL HEADER END +# + +# +# Send notification in response to a fault induced statechange +# +# ZEVENT_SUBCLASS: 'statechange' +# ZEVENT_VDEV_STATE_STR: 'DEGRADED', 'FAULTED' or 'REMOVED' +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 3: statechange not relevant +# 4: statechange string missing (unexpected) + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZEVENT_VDEV_STATE_STR}" ] || exit 4 + +if [ "${ZEVENT_VDEV_STATE_STR}" != "FAULTED" ] \ + && [ "${ZEVENT_VDEV_STATE_STR}" != "DEGRADED" ] \ + && [ "${ZEVENT_VDEV_STATE_STR}" != "REMOVED" ]; then + exit 3 +fi + +umask 077 +note_subject="ZFS device fault for pool ${ZEVENT_POOL_GUID} on $(hostname)" +note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + if [ "${ZEVENT_VDEV_STATE_STR}" = "FAULTED" ] ; then + echo "The number of I/O errors associated with a ZFS device exceeded" + echo "acceptable levels. ZFS has marked the device as faulted." + elif [ "${ZEVENT_VDEV_STATE_STR}" = "DEGRADED" ] ; then + echo "The number of checksum errors associated with a ZFS device" + echo "exceeded acceptable levels. ZFS has marked the device as" + echo "degraded." + else + echo "ZFS has detected that a device was removed." + fi + + echo + echo " impact: Fault tolerance of the pool may be compromised." + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " state: ${ZEVENT_VDEV_STATE_STR}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + + [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}" + [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}" + [ -n "${ZEVENT_VDEV_PHYSPATH}" ] && echo " vphys: ${ZEVENT_VDEV_PHYSPATH}" + [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}" + [ -n "${ZEVENT_VDEV_DEVID}" ] && echo " devid: ${ZEVENT_VDEV_DEVID}" + + echo " pool: ${ZEVENT_POOL_GUID}" + +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? + +rm -f "${note_pathname}" +exit "${rv}" diff --git a/cmd/zed/zed.d/trim_finish-notify.sh b/cmd/zed/zed.d/trim_finish-notify.sh new file mode 100755 index 000000000000..5075302997e3 --- /dev/null +++ b/cmd/zed/zed.d/trim_finish-notify.sh @@ -0,0 +1,37 @@ +#!/bin/sh +# +# Send notification in response to a TRIM_FINISH. 
The event +# will be received for each vdev in the pool which was trimmed. +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 9: internal error + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZEVENT_POOL}" ] || exit 9 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 + +zed_check_cmd "${ZPOOL}" || exit 9 + +umask 077 +note_subject="ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on $(hostname)" +note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + echo "ZFS has finished a trim:" + echo + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + + "${ZPOOL}" status -t "${ZEVENT_POOL}" + +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? +rm -f "${note_pathname}" +exit "${rv}" diff --git a/cmd/zed/zed.d/vdev_attach-led.sh b/cmd/zed/zed.d/vdev_attach-led.sh new file mode 120000 index 000000000000..7d7404398a4a --- /dev/null +++ b/cmd/zed/zed.d/vdev_attach-led.sh @@ -0,0 +1 @@ +statechange-led.sh
\ No newline at end of file diff --git a/cmd/zed/zed.d/vdev_clear-led.sh b/cmd/zed/zed.d/vdev_clear-led.sh new file mode 120000 index 000000000000..7d7404398a4a --- /dev/null +++ b/cmd/zed/zed.d/vdev_clear-led.sh @@ -0,0 +1 @@ +statechange-led.sh
\ No newline at end of file diff --git a/cmd/zed/zed.d/zed-functions.sh b/cmd/zed/zed.d/zed-functions.sh new file mode 100755 index 000000000000..44a9b8d23303 --- /dev/null +++ b/cmd/zed/zed.d/zed-functions.sh @@ -0,0 +1,538 @@ +#!/bin/sh +# shellcheck disable=SC2039 +# zed-functions.sh +# +# ZED helper functions for use in ZEDLETs + + +# Variable Defaults +# +: "${ZED_LOCKDIR:="/var/lock"}" +: "${ZED_NOTIFY_INTERVAL_SECS:=3600}" +: "${ZED_NOTIFY_VERBOSE:=0}" +: "${ZED_RUNDIR:="/var/run"}" +: "${ZED_SYSLOG_PRIORITY:="daemon.notice"}" +: "${ZED_SYSLOG_TAG:="zed"}" + +ZED_FLOCK_FD=8 + + +# zed_check_cmd (cmd, ...) +# +# For each argument given, search PATH for the executable command [cmd]. +# Log a message if [cmd] is not found. +# +# Arguments +# cmd: name of executable command for which to search +# +# Return +# 0 if all commands are found in PATH and are executable +# n for a count of the command executables that are not found +# +zed_check_cmd() +{ + local cmd + local rv=0 + + for cmd; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + zed_log_err "\"${cmd}\" not installed" + rv=$((rv + 1)) + fi + done + return "${rv}" +} + + +# zed_log_msg (msg, ...) +# +# Write all argument strings to the system log. +# +# Globals +# ZED_SYSLOG_PRIORITY +# ZED_SYSLOG_TAG +# +# Return +# nothing +# +zed_log_msg() +{ + logger -p "${ZED_SYSLOG_PRIORITY}" -t "${ZED_SYSLOG_TAG}" -- "$@" +} + + +# zed_log_err (msg, ...) +# +# Write an error message to the system log. This message will contain the +# script name, EID, and all argument strings. +# +# Globals +# ZED_SYSLOG_PRIORITY +# ZED_SYSLOG_TAG +# ZEVENT_EID +# +# Return +# nothing +# +zed_log_err() +{ + logger -p "${ZED_SYSLOG_PRIORITY}" -t "${ZED_SYSLOG_TAG}" -- "error:" \ + "$(basename -- "$0"):""${ZEVENT_EID:+" eid=${ZEVENT_EID}:"}" "$@" +} + + +# zed_lock (lockfile, [fd]) +# +# Obtain an exclusive (write) lock on [lockfile]. If the lock cannot be +# immediately acquired, wait until it becomes available. 
+# +# Every zed_lock() must be paired with a corresponding zed_unlock(). +# +# By default, flock-style locks associate the lockfile with file descriptor 8. +# The bash manpage warns that file descriptors >9 should be used with care as +# they may conflict with file descriptors used internally by the shell. File +# descriptor 9 is reserved for zed_rate_limit(). If concurrent locks are held +# within the same process, they must use different file descriptors (preferably +# decrementing from 8); otherwise, obtaining a new lock with a given file +# descriptor will release the previous lock associated with that descriptor. +# +# Arguments +# lockfile: pathname of the lock file; the lock will be stored in +# ZED_LOCKDIR unless the pathname contains a "/". +# fd: integer for the file descriptor used by flock (OPTIONAL unless holding +# concurrent locks) +# +# Globals +# ZED_FLOCK_FD +# ZED_LOCKDIR +# +# Return +# nothing +# +zed_lock() +{ + local lockfile="$1" + local fd="${2:-${ZED_FLOCK_FD}}" + local umask_bak + local err + + [ -n "${lockfile}" ] || return + if ! expr "${lockfile}" : '.*/' >/dev/null 2>&1; then + lockfile="${ZED_LOCKDIR}/${lockfile}" + fi + + umask_bak="$(umask)" + umask 077 + + # Obtain a lock on the file bound to the given file descriptor. + # + eval "exec ${fd}> '${lockfile}'" + err="$(flock --exclusive "${fd}" 2>&1)" + # shellcheck disable=SC2181 + if [ $? -ne 0 ]; then + zed_log_err "failed to lock \"${lockfile}\": ${err}" + fi + + umask "${umask_bak}" +} + + +# zed_unlock (lockfile, [fd]) +# +# Release the lock on [lockfile]. +# +# Arguments +# lockfile: pathname of the lock file +# fd: integer for the file descriptor used by flock (must match the file +# descriptor passed to the zed_lock function call) +# +# Globals +# ZED_FLOCK_FD +# ZED_LOCKDIR +# +# Return +# nothing +# +zed_unlock() +{ + local lockfile="$1" + local fd="${2:-${ZED_FLOCK_FD}}" + local err + + [ -n "${lockfile}" ] || return + if ! 
expr "${lockfile}" : '.*/' >/dev/null 2>&1; then + lockfile="${ZED_LOCKDIR}/${lockfile}" + fi + + # Release the lock and close the file descriptor. + err="$(flock --unlock "${fd}" 2>&1)" + # shellcheck disable=SC2181 + if [ $? -ne 0 ]; then + zed_log_err "failed to unlock \"${lockfile}\": ${err}" + fi + eval "exec ${fd}>&-" +} + + +# zed_notify (subject, pathname) +# +# Send a notification via all available methods. +# +# Arguments +# subject: notification subject +# pathname: pathname containing the notification message (OPTIONAL) +# +# Return +# 0: notification succeeded via at least one method +# 1: notification failed +# 2: no notification methods configured +# +zed_notify() +{ + local subject="$1" + local pathname="$2" + local num_success=0 + local num_failure=0 + + zed_notify_email "${subject}" "${pathname}"; rv=$? + [ "${rv}" -eq 0 ] && num_success=$((num_success + 1)) + [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1)) + + zed_notify_pushbullet "${subject}" "${pathname}"; rv=$? + [ "${rv}" -eq 0 ] && num_success=$((num_success + 1)) + [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1)) + + zed_notify_slack_webhook "${subject}" "${pathname}"; rv=$? + [ "${rv}" -eq 0 ] && num_success=$((num_success + 1)) + [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1)) + + [ "${num_success}" -gt 0 ] && return 0 + [ "${num_failure}" -gt 0 ] && return 1 + return 2 +} + + +# zed_notify_email (subject, pathname) +# +# Send a notification via email to the address specified by ZED_EMAIL_ADDR. +# +# Requires the mail executable to be installed in the standard PATH, or +# ZED_EMAIL_PROG to be defined with the pathname of an executable capable of +# reading a message body from stdin. +# +# Command-line options to the mail executable can be specified in +# ZED_EMAIL_OPTS. 
This undergoes the following keyword substitutions: +# - @ADDRESS@ is replaced with the space-delimited recipient email address(es) +# - @SUBJECT@ is replaced with the notification subject +# +# Arguments +# subject: notification subject +# pathname: pathname containing the notification message (OPTIONAL) +# +# Globals +# ZED_EMAIL_PROG +# ZED_EMAIL_OPTS +# ZED_EMAIL_ADDR +# +# Return +# 0: notification sent +# 1: notification failed +# 2: not configured +# +zed_notify_email() +{ + local subject="$1" + local pathname="${2:-"/dev/null"}" + + : "${ZED_EMAIL_PROG:="mail"}" + : "${ZED_EMAIL_OPTS:="-s '@SUBJECT@' @ADDRESS@"}" + + # For backward compatibility with ZED_EMAIL. + if [ -n "${ZED_EMAIL}" ] && [ -z "${ZED_EMAIL_ADDR}" ]; then + ZED_EMAIL_ADDR="${ZED_EMAIL}" + fi + [ -n "${ZED_EMAIL_ADDR}" ] || return 2 + + zed_check_cmd "${ZED_EMAIL_PROG}" || return 1 + + [ -n "${subject}" ] || return 1 + if [ ! -r "${pathname}" ]; then + zed_log_err \ + "$(basename "${ZED_EMAIL_PROG}") cannot read \"${pathname}\"" + return 1 + fi + + ZED_EMAIL_OPTS="$(echo "${ZED_EMAIL_OPTS}" \ + | sed -e "s/@ADDRESS@/${ZED_EMAIL_ADDR}/g" \ + -e "s/@SUBJECT@/${subject}/g")" + + # shellcheck disable=SC2086 + eval "${ZED_EMAIL_PROG}" ${ZED_EMAIL_OPTS} < "${pathname}" >/dev/null 2>&1 + rv=$? + if [ "${rv}" -ne 0 ]; then + zed_log_err "$(basename "${ZED_EMAIL_PROG}") exit=${rv}" + return 1 + fi + return 0 +} + + +# zed_notify_pushbullet (subject, pathname) +# +# Send a notification via Pushbullet <https://www.pushbullet.com/>. +# The access token (ZED_PUSHBULLET_ACCESS_TOKEN) identifies this client to the +# Pushbullet server. The optional channel tag (ZED_PUSHBULLET_CHANNEL_TAG) is +# for pushing to notification feeds that can be subscribed to; if a channel is +# not defined, push notifications will instead be sent to all devices +# associated with the account specified by the access token. +# +# Requires awk, curl, and sed executables to be installed in the standard PATH. 
+# +# References +# https://docs.pushbullet.com/ +# https://www.pushbullet.com/security +# +# Arguments +# subject: notification subject +# pathname: pathname containing the notification message (OPTIONAL) +# +# Globals +# ZED_PUSHBULLET_ACCESS_TOKEN +# ZED_PUSHBULLET_CHANNEL_TAG +# +# Return +# 0: notification sent +# 1: notification failed +# 2: not configured +# +zed_notify_pushbullet() +{ + local subject="$1" + local pathname="${2:-"/dev/null"}" + local msg_body + local msg_tag + local msg_json + local msg_out + local msg_err + local url="https://api.pushbullet.com/v2/pushes" + + [ -n "${ZED_PUSHBULLET_ACCESS_TOKEN}" ] || return 2 + + [ -n "${subject}" ] || return 1 + if [ ! -r "${pathname}" ]; then + zed_log_err "pushbullet cannot read \"${pathname}\"" + return 1 + fi + + zed_check_cmd "awk" "curl" "sed" || return 1 + + # Escape the following characters in the message body for JSON: + # newline, backslash, double quote, horizontal tab, vertical tab, + # and carriage return. + # + msg_body="$(awk '{ ORS="\\n" } { gsub(/\\/, "\\\\"); gsub(/"/, "\\\""); + gsub(/\t/, "\\t"); gsub(/\f/, "\\f"); gsub(/\r/, "\\r"); print }' \ + "${pathname}")" + + # Push to a channel if one is configured. + # + [ -n "${ZED_PUSHBULLET_CHANNEL_TAG}" ] && msg_tag="$(printf \ + '"channel_tag": "%s", ' "${ZED_PUSHBULLET_CHANNEL_TAG}")" + + # Construct the JSON message for pushing a note. + # + msg_json="$(printf '{%s"type": "note", "title": "%s", "body": "%s"}' \ + "${msg_tag}" "${subject}" "${msg_body}")" + + # Send the POST request and check for errors. + # + msg_out="$(curl -u "${ZED_PUSHBULLET_ACCESS_TOKEN}:" -X POST "${url}" \ + --header "Content-Type: application/json" --data-binary "${msg_json}" \ + 2>/dev/null)"; rv=$? 
+ if [ "${rv}" -ne 0 ]; then + zed_log_err "curl exit=${rv}" + return 1 + fi + msg_err="$(echo "${msg_out}" \ + | sed -n -e 's/.*"error" *:.*"message" *: *"\([^"]*\)".*/\1/p')" + if [ -n "${msg_err}" ]; then + zed_log_err "pushbullet \"${msg_err}\"" + return 1 + fi + return 0 +} + + +# zed_notify_slack_webhook (subject, pathname) +# +# Notification via Slack Webhook <https://api.slack.com/incoming-webhooks>. +# The Webhook URL (ZED_SLACK_WEBHOOK_URL) identifies this client to the +# Slack channel. +# +# Requires awk, curl, and sed executables to be installed in the standard PATH. +# +# References +# https://api.slack.com/incoming-webhooks +# +# Arguments +# subject: notification subject +# pathname: pathname containing the notification message (OPTIONAL) +# +# Globals +# ZED_SLACK_WEBHOOK_URL +# +# Return +# 0: notification sent +# 1: notification failed +# 2: not configured +# +zed_notify_slack_webhook() +{ + [ -n "${ZED_SLACK_WEBHOOK_URL}" ] || return 2 + + local subject="$1" + local pathname="${2:-"/dev/null"}" + local msg_body + local msg_tag + local msg_json + local msg_out + local msg_err + local url="${ZED_SLACK_WEBHOOK_URL}" + + [ -n "${subject}" ] || return 1 + if [ ! -r "${pathname}" ]; then + zed_log_err "slack webhook cannot read \"${pathname}\"" + return 1 + fi + + zed_check_cmd "awk" "curl" "sed" || return 1 + + # Escape the following characters in the message body for JSON: + # newline, backslash, double quote, horizontal tab, vertical tab, + # and carriage return. + # + msg_body="$(awk '{ ORS="\\n" } { gsub(/\\/, "\\\\"); gsub(/"/, "\\\""); + gsub(/\t/, "\\t"); gsub(/\f/, "\\f"); gsub(/\r/, "\\r"); print }' \ + "${pathname}")" + + # Construct the JSON message for posting. + # + msg_json="$(printf '{"text": "*%s*\n%s"}' "${subject}" "${msg_body}" )" + + # Send the POST request and check for errors. + # + msg_out="$(curl -X POST "${url}" \ + --header "Content-Type: application/json" --data-binary "${msg_json}" \ + 2>/dev/null)"; rv=$? 
+ if [ "${rv}" -ne 0 ]; then + zed_log_err "curl exit=${rv}" + return 1 + fi + msg_err="$(echo "${msg_out}" \ + | sed -n -e 's/.*"error" *:.*"message" *: *"\([^"]*\)".*/\1/p')" + if [ -n "${msg_err}" ]; then + zed_log_err "slack webhook \"${msg_err}\"" + return 1 + fi + return 0 +} + +# zed_rate_limit (tag, [interval]) +# +# Check whether an event of a given type [tag] has already occurred within the +# last [interval] seconds. +# +# This function obtains a lock on the statefile using file descriptor 9. +# +# Arguments +# tag: arbitrary string for grouping related events to rate-limit +# interval: time interval in seconds (OPTIONAL) +# +# Globals +# ZED_NOTIFY_INTERVAL_SECS +# ZED_RUNDIR +# +# Return +# 0 if the event should be processed +# 1 if the event should be dropped +# +# State File Format +# time;tag +# +zed_rate_limit() +{ + local tag="$1" + local interval="${2:-${ZED_NOTIFY_INTERVAL_SECS}}" + local lockfile="zed.zedlet.state.lock" + local lockfile_fd=9 + local statefile="${ZED_RUNDIR}/zed.zedlet.state" + local time_now + local time_prev + local umask_bak + local rv=0 + + [ -n "${tag}" ] || return 0 + + zed_lock "${lockfile}" "${lockfile_fd}" + time_now="$(date +%s)" + time_prev="$(grep -E "^[0-9]+;${tag}\$" "${statefile}" 2>/dev/null \ + | tail -1 | cut -d\; -f1)" + + if [ -n "${time_prev}" ] \ + && [ "$((time_now - time_prev))" -lt "${interval}" ]; then + rv=1 + else + umask_bak="$(umask)" + umask 077 + grep -E -v "^[0-9]+;${tag}\$" "${statefile}" 2>/dev/null \ + > "${statefile}.$$" + echo "${time_now};${tag}" >> "${statefile}.$$" + mv -f "${statefile}.$$" "${statefile}" + umask "${umask_bak}" + fi + + zed_unlock "${lockfile}" "${lockfile_fd}" + return "${rv}" +} + + +# zed_guid_to_pool (guid) +# +# Convert a pool GUID into its pool name (like "tank") +# Arguments +# guid: pool GUID (decimal or hex) +# +# Return +# Pool name +# +zed_guid_to_pool() +{ + if [ -z "$1" ] ; then + return + fi + + guid=$(printf "%llu" "$1") + if [ -n "$guid" ] ; then + $ZPOOL 
get -H -ovalue,name guid | awk '$1=='"$guid"' {print $2}' + fi +} + +# zed_exit_if_ignoring_this_event +# +# Exit the script if we should ignore this event, as determined by +# $ZED_SYSLOG_SUBCLASS_INCLUDE and $ZED_SYSLOG_SUBCLASS_EXCLUDE in zed.rc. +# This function assumes you've imported the normal zed variables. +zed_exit_if_ignoring_this_event() +{ + if [ -n "${ZED_SYSLOG_SUBCLASS_INCLUDE}" ]; then + eval "case ${ZEVENT_SUBCLASS} in + ${ZED_SYSLOG_SUBCLASS_INCLUDE});; + *) exit 0;; + esac" + elif [ -n "${ZED_SYSLOG_SUBCLASS_EXCLUDE}" ]; then + eval "case ${ZEVENT_SUBCLASS} in + ${ZED_SYSLOG_SUBCLASS_EXCLUDE}) exit 0;; + *);; + esac" + fi +} diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc new file mode 100644 index 000000000000..1b220d28db20 --- /dev/null +++ b/cmd/zed/zed.d/zed.rc @@ -0,0 +1,122 @@ +## +# zed.rc +# +# This file should be owned by root and permissioned 0600. +## + +## +# Absolute path to the debug output file. +# +#ZED_DEBUG_LOG="/tmp/zed.debug.log" + +## +# Email address of the zpool administrator for receipt of notifications; +# multiple addresses can be specified if they are delimited by whitespace. +# Email will only be sent if ZED_EMAIL_ADDR is defined. +# Disabled by default; uncomment to enable. +# +#ZED_EMAIL_ADDR="root" + +## +# Name or path of executable responsible for sending notifications via email; +# the mail program must be capable of reading a message body from stdin. +# Email will only be sent if ZED_EMAIL_ADDR is defined. +# +#ZED_EMAIL_PROG="mail" + +## +# Command-line options for ZED_EMAIL_PROG. +# The string @ADDRESS@ will be replaced with the recipient email address(es). +# The string @SUBJECT@ will be replaced with the notification subject; +# this should be protected with quotes to prevent word-splitting. +# Email will only be sent if ZED_EMAIL_ADDR is defined. +# +#ZED_EMAIL_OPTS="-s '@SUBJECT@' @ADDRESS@" + +## +# Default directory for zed lock files. 
+# +#ZED_LOCKDIR="/var/lock" + +## +# Minimum number of seconds between notifications for a similar event. +# +#ZED_NOTIFY_INTERVAL_SECS=3600 + +## +# Notification verbosity. +# If set to 0, suppress notification if the pool is healthy. +# If set to 1, send notification regardless of pool health. +# +#ZED_NOTIFY_VERBOSE=0 + +## +# Send notifications for 'ereport.fs.zfs.data' events. +# Disabled by default, any non-empty value will enable the feature. +# +#ZED_NOTIFY_DATA= + +## +# Pushbullet access token. +# This grants full access to your account -- protect it accordingly! +# <https://www.pushbullet.com/get-started> +# <https://www.pushbullet.com/account> +# Disabled by default; uncomment to enable. +# +#ZED_PUSHBULLET_ACCESS_TOKEN="" + +## +# Pushbullet channel tag for push notification feeds that can be subscribed to. +# <https://www.pushbullet.com/my-channel> +# If not defined, push notifications will instead be sent to all devices +# associated with the account specified by the access token. +# Disabled by default; uncomment to enable. +# +#ZED_PUSHBULLET_CHANNEL_TAG="" + +## +# Slack Webhook URL. +# This allows posting to the given channel and includes an access token. +# <https://api.slack.com/incoming-webhooks> +# Disabled by default; uncomment to enable. +# +#ZED_SLACK_WEBHOOK_URL="" + +## +# Default directory for zed state files. +# +#ZED_RUNDIR="/var/run" + +## +# Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED. This works for +# device mapper and multipath devices as well. Your enclosure must be +# supported by the Linux SES driver for this to work. +# +ZED_USE_ENCLOSURE_LEDS=1 + +## +# Run a scrub after every resilver +# Disabled by default, 1 to enable and 0 to disable. +#ZED_SCRUB_AFTER_RESILVER=0 + +## +# The syslog priority (e.g., specified as a "facility.level" pair). +# +#ZED_SYSLOG_PRIORITY="daemon.notice" + +## +# The syslog tag for marking zed events. 
+# +#ZED_SYSLOG_TAG="zed" + +## +# Which set of event subclasses to log +# By default, events from all subclasses are logged. +# If ZED_SYSLOG_SUBCLASS_INCLUDE is set, only subclasses +# matching the pattern are logged. Use the pipe symbol (|) +# or shell wildcards (*, ?) to match multiple subclasses. +# Otherwise, if ZED_SYSLOG_SUBCLASS_EXCLUDE is set, the +# matching subclasses are excluded from logging. +#ZED_SYSLOG_SUBCLASS_INCLUDE="checksum|scrub_*|vdev.*" +#ZED_SYSLOG_SUBCLASS_EXCLUDE="statechange|config_*|history_event" + diff --git a/cmd/zed/zed.h b/cmd/zed/zed.h new file mode 100644 index 000000000000..3ac0e63141e8 --- /dev/null +++ b/cmd/zed/zed.h @@ -0,0 +1,58 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + */ + +#ifndef ZED_H +#define ZED_H + +/* + * Absolute path for the default zed configuration file. + */ +#define ZED_CONF_FILE SYSCONFDIR "/zfs/zed.conf" + +/* + * Absolute path for the default zed pid file. + */ +#define ZED_PID_FILE RUNSTATEDIR "/zed.pid" + +/* + * Absolute path for the default zed state file. + */ +#define ZED_STATE_FILE RUNSTATEDIR "/zed.state" + +/* + * Absolute path for the default zed zedlet directory. + */ +#define ZED_ZEDLET_DIR SYSCONFDIR "/zfs/zed.d" + +/* + * Reserved for future use. + */ +#define ZED_MAX_EVENTS 0 + +/* + * Reserved for future use. 
+ */ +#define ZED_MIN_EVENTS 0 + +/* + * String prefix for ZED variables passed via environment variables. + */ +#define ZED_VAR_PREFIX "ZED_" + +/* + * String prefix for ZFS event names passed via environment variables. + */ +#define ZEVENT_VAR_PREFIX "ZEVENT_" + +#endif /* !ZED_H */ diff --git a/cmd/zed/zed_conf.c b/cmd/zed/zed_conf.c new file mode 100644 index 000000000000..52370eb87b29 --- /dev/null +++ b/cmd/zed/zed_conf.c @@ -0,0 +1,735 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + */ + +#include <assert.h> +#include <ctype.h> +#include <dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <libgen.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <unistd.h> +#include "zed.h" +#include "zed_conf.h" +#include "zed_file.h" +#include "zed_log.h" +#include "zed_strings.h" + +/* + * Return a new configuration with default values. 
+ */ +struct zed_conf * +zed_conf_create(void) +{ + struct zed_conf *zcp; + + zcp = calloc(1, sizeof (*zcp)); + if (!zcp) + goto nomem; + + zcp->syslog_facility = LOG_DAEMON; + zcp->min_events = ZED_MIN_EVENTS; + zcp->max_events = ZED_MAX_EVENTS; + zcp->pid_fd = -1; + zcp->zedlets = NULL; /* created via zed_conf_scan_dir() */ + zcp->state_fd = -1; /* opened via zed_conf_open_state() */ + zcp->zfs_hdl = NULL; /* opened via zed_event_init() */ + zcp->zevent_fd = -1; /* opened via zed_event_init() */ + + if (!(zcp->conf_file = strdup(ZED_CONF_FILE))) + goto nomem; + + if (!(zcp->pid_file = strdup(ZED_PID_FILE))) + goto nomem; + + if (!(zcp->zedlet_dir = strdup(ZED_ZEDLET_DIR))) + goto nomem; + + if (!(zcp->state_file = strdup(ZED_STATE_FILE))) + goto nomem; + + return (zcp); + +nomem: + zed_log_die("Failed to create conf: %s", strerror(errno)); + return (NULL); +} + +/* + * Destroy the configuration [zcp]. + * + * Note: zfs_hdl & zevent_fd are destroyed via zed_event_fini(). + */ +void +zed_conf_destroy(struct zed_conf *zcp) +{ + if (!zcp) + return; + + if (zcp->state_fd >= 0) { + if (close(zcp->state_fd) < 0) + zed_log_msg(LOG_WARNING, + "Failed to close state file \"%s\": %s", + zcp->state_file, strerror(errno)); + zcp->state_fd = -1; + } + if (zcp->pid_file) { + if ((unlink(zcp->pid_file) < 0) && (errno != ENOENT)) + zed_log_msg(LOG_WARNING, + "Failed to remove PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + } + if (zcp->pid_fd >= 0) { + if (close(zcp->pid_fd) < 0) + zed_log_msg(LOG_WARNING, + "Failed to close PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + zcp->pid_fd = -1; + } + if (zcp->conf_file) { + free(zcp->conf_file); + zcp->conf_file = NULL; + } + if (zcp->pid_file) { + free(zcp->pid_file); + zcp->pid_file = NULL; + } + if (zcp->zedlet_dir) { + free(zcp->zedlet_dir); + zcp->zedlet_dir = NULL; + } + if (zcp->state_file) { + free(zcp->state_file); + zcp->state_file = NULL; + } + if (zcp->zedlets) { + zed_strings_destroy(zcp->zedlets); + 
zcp->zedlets = NULL; + } + free(zcp); +} + +/* + * Display command-line help and exit. + * + * If [got_err] is 0, output to stdout and exit normally; + * otherwise, output to stderr and exit with a failure status. + */ +static void +_zed_conf_display_help(const char *prog, int got_err) +{ + FILE *fp = got_err ? stderr : stdout; + int w1 = 4; /* width of leading whitespace */ + int w2 = 8; /* width of L-justified option field */ + + fprintf(fp, "Usage: %s [OPTION]...\n", (prog ? prog : "zed")); + fprintf(fp, "\n"); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-h", + "Display help."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-L", + "Display license information."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-V", + "Display version information."); + fprintf(fp, "\n"); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-v", + "Be verbose."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-f", + "Force daemon to run."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-F", + "Run daemon in the foreground."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-I", + "Idle daemon until kernel module is (re)loaded."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-M", + "Lock all pages in memory."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-P", + "$PATH for ZED to use (only used by ZTS)."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-Z", + "Zero state file."); + fprintf(fp, "\n"); +#if 0 + fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-c FILE", + "Read configuration from FILE.", ZED_CONF_FILE); +#endif + fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-d DIR", + "Read enabled ZEDLETs from DIR.", ZED_ZEDLET_DIR); + fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-p FILE", + "Write daemon's PID to FILE.", ZED_PID_FILE); + fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-s FILE", + "Write daemon's state to FILE.", ZED_STATE_FILE); + fprintf(fp, "\n"); + + exit(got_err ? EXIT_FAILURE : EXIT_SUCCESS); +} + +/* + * Display license information to stdout and exit. 
+ */ +static void +_zed_conf_display_license(void) +{ + const char **pp; + const char *text[] = { + "The ZFS Event Daemon (ZED) is distributed under the terms of the", + " Common Development and Distribution License (CDDL-1.0)", + " <http://opensource.org/licenses/CDDL-1.0>.", + "", + "Developed at Lawrence Livermore National Laboratory" + " (LLNL-CODE-403049).", + "", + NULL + }; + + for (pp = text; *pp; pp++) + printf("%s\n", *pp); + + exit(EXIT_SUCCESS); +} + +/* + * Display version information to stdout and exit. + */ +static void +_zed_conf_display_version(void) +{ + printf("%s-%s-%s\n", + ZFS_META_NAME, ZFS_META_VERSION, ZFS_META_RELEASE); + + exit(EXIT_SUCCESS); +} + +/* + * Copy the [path] string to the [resultp] ptr. + * If [path] is not an absolute path, prefix it with the current working dir. + * If [resultp] is non-null, free its existing string before assignment. + */ +static void +_zed_conf_parse_path(char **resultp, const char *path) +{ + char buf[PATH_MAX]; + + assert(resultp != NULL); + assert(path != NULL); + + if (*resultp) + free(*resultp); + + if (path[0] == '/') { + *resultp = strdup(path); + } else if (!getcwd(buf, sizeof (buf))) { + zed_log_die("Failed to get current working dir: %s", + strerror(errno)); + } else if (strlcat(buf, "/", sizeof (buf)) >= sizeof (buf)) { + zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG)); + } else if (strlcat(buf, path, sizeof (buf)) >= sizeof (buf)) { + zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG)); + } else { + *resultp = strdup(buf); + } + if (!*resultp) + zed_log_die("Failed to copy path: %s", strerror(ENOMEM)); +} + +/* + * Parse the command-line options into the configuration [zcp]. 
+ */ +void +zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv) +{ + const char * const opts = ":hLVc:d:p:P:s:vfFMZI"; + int opt; + + if (!zcp || !argv || !argv[0]) + zed_log_die("Failed to parse options: Internal error"); + + opterr = 0; /* suppress default getopt err msgs */ + + while ((opt = getopt(argc, argv, opts)) != -1) { + switch (opt) { + case 'h': + _zed_conf_display_help(argv[0], EXIT_SUCCESS); + break; + case 'L': + _zed_conf_display_license(); + break; + case 'V': + _zed_conf_display_version(); + break; + case 'c': + _zed_conf_parse_path(&zcp->conf_file, optarg); + break; + case 'd': + _zed_conf_parse_path(&zcp->zedlet_dir, optarg); + break; + case 'I': + zcp->do_idle = 1; + break; + case 'p': + _zed_conf_parse_path(&zcp->pid_file, optarg); + break; + case 'P': + _zed_conf_parse_path(&zcp->path, optarg); + break; + case 's': + _zed_conf_parse_path(&zcp->state_file, optarg); + break; + case 'v': + zcp->do_verbose = 1; + break; + case 'f': + zcp->do_force = 1; + break; + case 'F': + zcp->do_foreground = 1; + break; + case 'M': + zcp->do_memlock = 1; + break; + case 'Z': + zcp->do_zero = 1; + break; + case '?': + default: + if (optopt == '?') + _zed_conf_display_help(argv[0], EXIT_SUCCESS); + + fprintf(stderr, "%s: %s '-%c'\n\n", argv[0], + "Invalid option", optopt); + _zed_conf_display_help(argv[0], EXIT_FAILURE); + break; + } + } +} + +/* + * Parse the configuration file into the configuration [zcp]. + * + * FIXME: Not yet implemented. + */ +void +zed_conf_parse_file(struct zed_conf *zcp) +{ + if (!zcp) + zed_log_die("Failed to parse config: %s", strerror(EINVAL)); +} + +/* + * Scan the [zcp] zedlet_dir for files to exec based on the event class. + * Files must be executable by user, but not writable by group or other. + * Dotfiles are ignored. + * + * Return 0 on success with an updated set of zedlets, + * or -1 on error with errno set. + * + * FIXME: Check if zedlet_dir and all parent dirs are secure. 
+ */ +int +zed_conf_scan_dir(struct zed_conf *zcp) +{ + zed_strings_t *zedlets; + DIR *dirp; + struct dirent *direntp; + char pathname[PATH_MAX]; + struct stat st; + int n; + + if (!zcp) { + errno = EINVAL; + zed_log_msg(LOG_ERR, "Failed to scan zedlet dir: %s", + strerror(errno)); + return (-1); + } + zedlets = zed_strings_create(); + if (!zedlets) { + errno = ENOMEM; + zed_log_msg(LOG_WARNING, "Failed to scan dir \"%s\": %s", + zcp->zedlet_dir, strerror(errno)); + return (-1); + } + dirp = opendir(zcp->zedlet_dir); + if (!dirp) { + int errno_bak = errno; + zed_log_msg(LOG_WARNING, "Failed to open dir \"%s\": %s", + zcp->zedlet_dir, strerror(errno)); + zed_strings_destroy(zedlets); + errno = errno_bak; + return (-1); + } + while ((direntp = readdir(dirp))) { + if (direntp->d_name[0] == '.') + continue; + + n = snprintf(pathname, sizeof (pathname), + "%s/%s", zcp->zedlet_dir, direntp->d_name); + if ((n < 0) || (n >= sizeof (pathname))) { + zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s", + direntp->d_name, strerror(ENAMETOOLONG)); + continue; + } + if (stat(pathname, &st) < 0) { + zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s", + pathname, strerror(errno)); + continue; + } + if (!S_ISREG(st.st_mode)) { + zed_log_msg(LOG_INFO, + "Ignoring \"%s\": not a regular file", + direntp->d_name); + continue; + } + if ((st.st_uid != 0) && !zcp->do_force) { + zed_log_msg(LOG_NOTICE, + "Ignoring \"%s\": not owned by root", + direntp->d_name); + continue; + } + if (!(st.st_mode & S_IXUSR)) { + zed_log_msg(LOG_INFO, + "Ignoring \"%s\": not executable by user", + direntp->d_name); + continue; + } + if ((st.st_mode & S_IWGRP) && !zcp->do_force) { + zed_log_msg(LOG_NOTICE, + "Ignoring \"%s\": writable by group", + direntp->d_name); + continue; + } + if ((st.st_mode & S_IWOTH) && !zcp->do_force) { + zed_log_msg(LOG_NOTICE, + "Ignoring \"%s\": writable by other", + direntp->d_name); + continue; + } + if (zed_strings_add(zedlets, NULL, direntp->d_name) < 0) { + 
zed_log_msg(LOG_WARNING, + "Failed to register \"%s\": %s", + direntp->d_name, strerror(errno)); + continue; + } + if (zcp->do_verbose) + zed_log_msg(LOG_INFO, + "Registered zedlet \"%s\"", direntp->d_name); + } + if (closedir(dirp) < 0) { + int errno_bak = errno; + zed_log_msg(LOG_WARNING, "Failed to close dir \"%s\": %s", + zcp->zedlet_dir, strerror(errno)); + zed_strings_destroy(zedlets); + errno = errno_bak; + return (-1); + } + if (zcp->zedlets) + zed_strings_destroy(zcp->zedlets); + + zcp->zedlets = zedlets; + return (0); +} + +/* + * Write the PID file specified in [zcp]. + * Return 0 on success, -1 on error. + * + * This must be called after fork()ing to become a daemon (so the correct PID + * is recorded), but before daemonization is complete and the parent process + * exits (for synchronization with systemd). + */ +int +zed_conf_write_pid(struct zed_conf *zcp) +{ + const mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; + const mode_t filemode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + char buf[PATH_MAX]; + int n; + char *p; + mode_t mask; + int rv; + + if (!zcp || !zcp->pid_file) { + errno = EINVAL; + zed_log_msg(LOG_ERR, "Failed to create PID file: %s", + strerror(errno)); + return (-1); + } + assert(zcp->pid_fd == -1); + /* + * Create PID file directory if needed. + */ + n = strlcpy(buf, zcp->pid_file, sizeof (buf)); + if (n >= sizeof (buf)) { + errno = ENAMETOOLONG; + zed_log_msg(LOG_ERR, "Failed to create PID file: %s", + strerror(errno)); + goto err; + } + p = strrchr(buf, '/'); + if (p) + *p = '\0'; + + if ((mkdirp(buf, dirmode) < 0) && (errno != EEXIST)) { + zed_log_msg(LOG_ERR, "Failed to create directory \"%s\": %s", + buf, strerror(errno)); + goto err; + } + /* + * Obtain PID file lock. 
+ */ + mask = umask(0); + umask(mask | 022); + zcp->pid_fd = open(zcp->pid_file, (O_RDWR | O_CREAT), filemode); + umask(mask); + if (zcp->pid_fd < 0) { + zed_log_msg(LOG_ERR, "Failed to open PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + goto err; + } + rv = zed_file_lock(zcp->pid_fd); + if (rv < 0) { + zed_log_msg(LOG_ERR, "Failed to lock PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + goto err; + } else if (rv > 0) { + pid_t pid = zed_file_is_locked(zcp->pid_fd); + if (pid < 0) { + zed_log_msg(LOG_ERR, + "Failed to test lock on PID file \"%s\"", + zcp->pid_file); + } else if (pid > 0) { + zed_log_msg(LOG_ERR, + "Found PID %d bound to PID file \"%s\"", + pid, zcp->pid_file); + } else { + zed_log_msg(LOG_ERR, + "Inconsistent lock state on PID file \"%s\"", + zcp->pid_file); + } + goto err; + } + /* + * Write PID file. + */ + n = snprintf(buf, sizeof (buf), "%d\n", (int)getpid()); + if ((n < 0) || (n >= sizeof (buf))) { + errno = ERANGE; + zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + } else if (zed_file_write_n(zcp->pid_fd, buf, n) != n) { + zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + } else if (fdatasync(zcp->pid_fd) < 0) { + zed_log_msg(LOG_ERR, "Failed to sync PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + } else { + return (0); + } + +err: + if (zcp->pid_fd >= 0) { + (void) close(zcp->pid_fd); + zcp->pid_fd = -1; + } + return (-1); +} + +/* + * Open and lock the [zcp] state_file. + * Return 0 on success, -1 on error. + * + * FIXME: Move state information into kernel. 
+ */ +int +zed_conf_open_state(struct zed_conf *zcp) +{ + char dirbuf[PATH_MAX]; + mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; + int n; + char *p; + int rv; + + if (!zcp || !zcp->state_file) { + errno = EINVAL; + zed_log_msg(LOG_ERR, "Failed to open state file: %s", + strerror(errno)); + return (-1); + } + n = strlcpy(dirbuf, zcp->state_file, sizeof (dirbuf)); + if (n >= sizeof (dirbuf)) { + errno = ENAMETOOLONG; + zed_log_msg(LOG_WARNING, "Failed to open state file: %s", + strerror(errno)); + return (-1); + } + p = strrchr(dirbuf, '/'); + if (p) + *p = '\0'; + + if ((mkdirp(dirbuf, dirmode) < 0) && (errno != EEXIST)) { + zed_log_msg(LOG_WARNING, + "Failed to create directory \"%s\": %s", + dirbuf, strerror(errno)); + return (-1); + } + if (zcp->state_fd >= 0) { + if (close(zcp->state_fd) < 0) { + zed_log_msg(LOG_WARNING, + "Failed to close state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } + } + if (zcp->do_zero) + (void) unlink(zcp->state_file); + + zcp->state_fd = open(zcp->state_file, + (O_RDWR | O_CREAT), (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)); + if (zcp->state_fd < 0) { + zed_log_msg(LOG_WARNING, "Failed to open state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } + rv = zed_file_lock(zcp->state_fd); + if (rv < 0) { + zed_log_msg(LOG_WARNING, "Failed to lock state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } + if (rv > 0) { + pid_t pid = zed_file_is_locked(zcp->state_fd); + if (pid < 0) { + zed_log_msg(LOG_WARNING, + "Failed to test lock on state file \"%s\"", + zcp->state_file); + } else if (pid > 0) { + zed_log_msg(LOG_WARNING, + "Found PID %d bound to state file \"%s\"", + pid, zcp->state_file); + } else { + zed_log_msg(LOG_WARNING, + "Inconsistent lock state on state file \"%s\"", + zcp->state_file); + } + return (-1); + } + return (0); +} + +/* + * Read the opened [zcp] state_file to obtain the eid & etime of the last event + * processed. 
Write the state from the last event to the [eidp] & [etime] args + * passed by reference. Note that etime[] is an array of size 2. + * Return 0 on success, -1 on error. + */ +int +zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[]) +{ + ssize_t len; + struct iovec iov[3]; + ssize_t n; + + if (!zcp || !eidp || !etime) { + errno = EINVAL; + zed_log_msg(LOG_ERR, + "Failed to read state file: %s", strerror(errno)); + return (-1); + } + if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t)-1) { + zed_log_msg(LOG_WARNING, + "Failed to reposition state file offset: %s", + strerror(errno)); + return (-1); + } + len = 0; + iov[0].iov_base = eidp; + len += iov[0].iov_len = sizeof (*eidp); + iov[1].iov_base = &etime[0]; + len += iov[1].iov_len = sizeof (etime[0]); + iov[2].iov_base = &etime[1]; + len += iov[2].iov_len = sizeof (etime[1]); + + n = readv(zcp->state_fd, iov, 3); + if (n == 0) { + *eidp = 0; + } else if (n < 0) { + zed_log_msg(LOG_WARNING, + "Failed to read state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } else if (n != len) { + errno = EIO; + zed_log_msg(LOG_WARNING, + "Failed to read state file \"%s\": Read %d of %d bytes", + zcp->state_file, n, len); + return (-1); + } + return (0); +} + +/* + * Write the [eid] & [etime] of the last processed event to the opened + * [zcp] state_file. Note that etime[] is an array of size 2. + * Return 0 on success, -1 on error. 
+ */ +int +zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[]) +{ + ssize_t len; + struct iovec iov[3]; + ssize_t n; + + if (!zcp) { + errno = EINVAL; + zed_log_msg(LOG_ERR, + "Failed to write state file: %s", strerror(errno)); + return (-1); + } + if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t)-1) { + zed_log_msg(LOG_WARNING, + "Failed to reposition state file offset: %s", + strerror(errno)); + return (-1); + } + len = 0; + iov[0].iov_base = &eid; + len += iov[0].iov_len = sizeof (eid); + iov[1].iov_base = &etime[0]; + len += iov[1].iov_len = sizeof (etime[0]); + iov[2].iov_base = &etime[1]; + len += iov[2].iov_len = sizeof (etime[1]); + + n = writev(zcp->state_fd, iov, 3); + if (n < 0) { + zed_log_msg(LOG_WARNING, + "Failed to write state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } + if (n != len) { + errno = EIO; + zed_log_msg(LOG_WARNING, + "Failed to write state file \"%s\": Wrote %d of %d bytes", + zcp->state_file, n, len); + return (-1); + } + if (fdatasync(zcp->state_fd) < 0) { + zed_log_msg(LOG_WARNING, + "Failed to sync state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } + return (0); +} diff --git a/cmd/zed/zed_conf.h b/cmd/zed/zed_conf.h new file mode 100644 index 000000000000..424cb2c01c8c --- /dev/null +++ b/cmd/zed/zed_conf.h @@ -0,0 +1,62 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. 
+ * You may not use this file except in compliance with the license. + */ + +#ifndef ZED_CONF_H +#define ZED_CONF_H + +#include <libzfs.h> +#include <stdint.h> +#include "zed_strings.h" + +struct zed_conf { + unsigned do_force:1; /* true if force enabled */ + unsigned do_foreground:1; /* true if run in foreground */ + unsigned do_memlock:1; /* true if locking memory */ + unsigned do_verbose:1; /* true if verbosity enabled */ + unsigned do_zero:1; /* true if zeroing state */ + unsigned do_idle:1; /* true if idle enabled */ + int syslog_facility; /* syslog facility value */ + int min_events; /* RESERVED FOR FUTURE USE */ + int max_events; /* RESERVED FOR FUTURE USE */ + char *conf_file; /* abs path to config file */ + char *pid_file; /* abs path to pid file */ + int pid_fd; /* fd to pid file for lock */ + char *zedlet_dir; /* abs path to zedlet dir */ + zed_strings_t *zedlets; /* names of enabled zedlets */ + char *state_file; /* abs path to state file */ + int state_fd; /* fd to state file */ + libzfs_handle_t *zfs_hdl; /* handle to libzfs */ + int zevent_fd; /* fd for access to zevents */ + char *path; /* custom $PATH for zedlets to use */ +}; + +struct zed_conf *zed_conf_create(void); + +void zed_conf_destroy(struct zed_conf *zcp); + +void zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv); + +void zed_conf_parse_file(struct zed_conf *zcp); + +int zed_conf_scan_dir(struct zed_conf *zcp); + +int zed_conf_write_pid(struct zed_conf *zcp); + +int zed_conf_open_state(struct zed_conf *zcp); + +int zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[]); + +int zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[]); + +#endif /* !ZED_CONF_H */ diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c new file mode 100644 index 000000000000..174d24523253 --- /dev/null +++ b/cmd/zed/zed_disk_event.c @@ -0,0 +1,416 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * 
Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, 2017, Intel Corporation. + */ + +#ifdef HAVE_LIBUDEV + +#include <errno.h> +#include <fcntl.h> +#include <libnvpair.h> +#include <libudev.h> +#include <libzfs.h> +#include <libzutil.h> +#include <pthread.h> +#include <stdlib.h> +#include <string.h> + +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dev.h> + +#include "zed_log.h" +#include "zed_disk_event.h" +#include "agents/zfs_agents.h" + +/* + * Portions of ZED need to see disk events for disks belonging to ZFS pools. + * A libudev monitor is established to monitor block device actions and pass + * them on to internal ZED logic modules. Initially, zfs_mod.c is the only + * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM + * module responsible for handling disk events for ZFS. 
+ */ + +pthread_t g_mon_tid; +struct udev *g_udev; +struct udev_monitor *g_mon; + + +#define DEV_BYID_PATH "/dev/disk/by-id/" + +/* 64MB is minimum usable disk for ZFS */ +#define MINIMUM_SECTORS 131072 + + +/* + * Post disk event to SLM module + * + * occurs in the context of monitor thread + */ +static void +zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + char *strval; + uint64_t numval; + + zed_log_msg(LOG_INFO, "zed_disk_event:"); + zed_log_msg(LOG_INFO, "\tclass: %s", class); + zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass); + if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval); + if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval); + if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval); + if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval); + if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval); + if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval); + if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval); + + (void) zfs_agent_post_event(class, subclass, nvl); +} + +/* + * dev_event_nvlist: place event schema into an nv pair list + * + * NAME VALUE (example) + * -------------- -------------------------------------------------------- + * DEV_NAME /dev/sdl + * DEV_PATH /devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/... 
+ * DEV_IDENTIFIER ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC + * DEV_PHYS_PATH pci-0000:04:00.0-sas-0x4433221101000000-lun-0 + * DEV_IS_PART --- + * DEV_SIZE 500107862016 + * ZFS_EV_POOL_GUID 17523635698032189180 + * ZFS_EV_VDEV_GUID 14663607734290803088 + */ +static nvlist_t * +dev_event_nvlist(struct udev_device *dev) +{ + nvlist_t *nvl; + char strval[128]; + const char *value, *path; + uint64_t guid; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + return (NULL); + + if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0) + (void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval); + if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0) + (void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval); + if ((path = udev_device_get_devnode(dev)) != NULL) + (void) nvlist_add_string(nvl, DEV_NAME, path); + if ((value = udev_device_get_devpath(dev)) != NULL) + (void) nvlist_add_string(nvl, DEV_PATH, value); + value = udev_device_get_devtype(dev); + if ((value != NULL && strcmp("partition", value) == 0) || + (udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER") + != NULL)) { + (void) nvlist_add_boolean(nvl, DEV_IS_PART); + } + if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) { + uint64_t numval = DEV_BSIZE; + + numval *= strtoull(value, NULL, 10); + (void) nvlist_add_uint64(nvl, DEV_SIZE, numval); + } + + /* + * Grab the pool and vdev guids from blkid cache + */ + value = udev_device_get_property_value(dev, "ID_FS_UUID"); + if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0) + (void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid); + + value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB"); + if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0) + (void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid); + + /* + * Either a vdev guid or a devid must be present for matching + */ + if (!nvlist_exists(nvl, DEV_IDENTIFIER) && + !nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) { + nvlist_free(nvl); + return (NULL); + } + + 
return (nvl); +} + +/* + * Listen for block device uevents + */ +static void * +zed_udev_monitor(void *arg) +{ + struct udev_monitor *mon = arg; + char *tmp, *tmp2; + + zed_log_msg(LOG_INFO, "Waiting for new udev disk events..."); + + while (1) { + struct udev_device *dev; + const char *action, *type, *part, *sectors; + const char *bus, *uuid; + const char *class, *subclass; + nvlist_t *nvl; + boolean_t is_zfs = B_FALSE; + + /* allow a cancellation while blocked (recvmsg) */ + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + + /* blocks at recvmsg until an event occurs */ + if ((dev = udev_monitor_receive_device(mon)) == NULL) { + zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive " + "device error %d", errno); + continue; + } + + /* allow all steps to complete before a cancellation */ + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + + /* + * Strongly typed device is the preferred filter + */ + type = udev_device_get_property_value(dev, "ID_FS_TYPE"); + if (type != NULL && type[0] != '\0') { + if (strcmp(type, "zfs_member") == 0) { + is_zfs = B_TRUE; + } else { + /* not ours, so skip */ + zed_log_msg(LOG_INFO, "zed_udev_monitor: skip " + "%s (in use by %s)", + udev_device_get_devnode(dev), type); + udev_device_unref(dev); + continue; + } + } + + /* + * if this is a disk and it is partitioned, then the + * zfs label will reside in a DEVTYPE=partition and + * we can skip passing this event + */ + type = udev_device_get_property_value(dev, "DEVTYPE"); + part = udev_device_get_property_value(dev, + "ID_PART_TABLE_TYPE"); + if (type != NULL && type[0] != '\0' && + strcmp(type, "disk") == 0 && + part != NULL && part[0] != '\0') { + /* skip and wait for partition event */ + udev_device_unref(dev); + continue; + } + + /* + * ignore small partitions + */ + sectors = udev_device_get_property_value(dev, + "ID_PART_ENTRY_SIZE"); + if (sectors == NULL) + sectors = udev_device_get_sysattr_value(dev, "size"); + if (sectors != NULL && + strtoull(sectors, NULL, 10) 
< MINIMUM_SECTORS) { + udev_device_unref(dev); + continue; + } + + /* + * If the blkid probe didn't find ZFS, then a persistent + * device id string is required in the message schema + * for matching with vdevs. Preflight here for expected + * udev information. + */ + bus = udev_device_get_property_value(dev, "ID_BUS"); + uuid = udev_device_get_property_value(dev, "DM_UUID"); + if (!is_zfs && (bus == NULL && uuid == NULL)) { + zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid " + "source", udev_device_get_devnode(dev)); + udev_device_unref(dev); + continue; + } + + action = udev_device_get_action(dev); + if (strcmp(action, "add") == 0) { + class = EC_DEV_ADD; + subclass = ESC_DISK; + } else if (strcmp(action, "remove") == 0) { + class = EC_DEV_REMOVE; + subclass = ESC_DISK; + } else if (strcmp(action, "change") == 0) { + class = EC_DEV_STATUS; + subclass = ESC_DEV_DLE; + } else { + zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown", + action); + udev_device_unref(dev); + continue; + } + + /* + * Special case an EC_DEV_ADD for multipath devices + * + * When a multipath device is created, udev reports the + * following: + * + * 1. "add" event of the dm device for the multipath device + * (like /dev/dm-3). + * 2. "change" event to create the actual multipath device + * symlink (like /dev/mapper/mpatha). The event also + * passes back the relevant DM vars we care about, like + * DM_UUID. + * 3. Another "change" event identical to #2 (that we ignore). + * + * To get the behavior we want, we treat the "change" event + * in #2 as a "add" event; as if "/dev/mapper/mpatha" was + * a new disk being added. 
+ */ + if (strcmp(class, EC_DEV_STATUS) == 0 && + udev_device_get_property_value(dev, "DM_UUID") && + udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) { + tmp = (char *)udev_device_get_devnode(dev); + tmp2 = zfs_get_underlying_path(tmp); + if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) { + /* + * We have a real underlying device, which + * means that this multipath "change" event is + * an "add" event. + * + * If the multipath device and the underlying + * dev are the same name (i.e. /dev/dm-5), then + * there is no real underlying disk for this + * multipath device, and so this "change" event + * really is a multipath removal. + */ + class = EC_DEV_ADD; + subclass = ESC_DISK; + } else { + tmp = (char *) + udev_device_get_property_value(dev, + "DM_NR_VALID_PATHS"); + /* treat as a multipath remove */ + if (tmp != NULL && strcmp(tmp, "0") == 0) { + class = EC_DEV_REMOVE; + subclass = ESC_DISK; + } + } + free(tmp2); + } + + /* + * Special case an EC_DEV_ADD for scsi_debug devices + * + * These devices require a udevadm trigger command after + * creation in order to register the vdev_id scsidebug alias + * rule (adds a persistent path (phys_path) used for fault + * management automated tests in the ZFS test suite. + * + * After udevadm trigger command, event registers as a "change" + * event but needs to instead be handled as another "add" event + * to allow for disk labeling and partitioning to occur. 
+ */ + if (strcmp(class, EC_DEV_STATUS) == 0 && + udev_device_get_property_value(dev, "ID_VDEV") && + udev_device_get_property_value(dev, "ID_MODEL")) { + const char *id_model, *id_model_sd = "scsi_debug"; + + id_model = udev_device_get_property_value(dev, + "ID_MODEL"); + if (strcmp(id_model, id_model_sd) == 0) { + class = EC_DEV_ADD; + subclass = ESC_DISK; + } + } + + if ((nvl = dev_event_nvlist(dev)) != NULL) { + zed_udev_event(class, subclass, nvl); + nvlist_free(nvl); + } + + udev_device_unref(dev); + } + + return (NULL); +} + +int +zed_disk_event_init() +{ + int fd, fflags; + + if ((g_udev = udev_new()) == NULL) { + zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno); + return (-1); + } + + /* Set up a udev monitor for block devices */ + g_mon = udev_monitor_new_from_netlink(g_udev, "udev"); + udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk"); + udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", + "partition"); + udev_monitor_enable_receiving(g_mon); + + /* Make sure monitoring socket is blocking */ + fd = udev_monitor_get_fd(g_mon); + if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK) + (void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK); + + /* spawn a thread to monitor events */ + if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) { + udev_monitor_unref(g_mon); + udev_unref(g_udev); + zed_log_msg(LOG_WARNING, "pthread_create failed"); + return (-1); + } + + zed_log_msg(LOG_INFO, "zed_disk_event_init"); + + return (0); +} + +void +zed_disk_event_fini() +{ + /* cancel monitor thread at recvmsg() */ + (void) pthread_cancel(g_mon_tid); + (void) pthread_join(g_mon_tid, NULL); + + /* cleanup udev resources */ + udev_monitor_unref(g_mon); + udev_unref(g_udev); + + zed_log_msg(LOG_INFO, "zed_disk_event_fini"); +} + +#else + +#include "zed_disk_event.h" + +int +zed_disk_event_init() +{ + return (0); +} + +void +zed_disk_event_fini() +{ +} + +#endif /* HAVE_LIBUDEV */ diff --git a/cmd/zed/zed_disk_event.h 
b/cmd/zed/zed_disk_event.h new file mode 100644 index 000000000000..ea9813d0a595 --- /dev/null +++ b/cmd/zed/zed_disk_event.h @@ -0,0 +1,31 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, Intel Corporation. + */ + +#ifndef ZED_DISK_EVENT_H +#define ZED_DISK_EVENT_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern int zed_disk_event_init(void); +extern void zed_disk_event_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* !ZED_DISK_EVENT_H */ diff --git a/cmd/zed/zed_event.c b/cmd/zed/zed_event.c new file mode 100644 index 000000000000..1c5d00e297ff --- /dev/null +++ b/cmd/zed/zed_event.c @@ -0,0 +1,965 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + */ + +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <libzfs.h> /* FIXME: Replace with libzfs_core. 
*/ +#include <paths.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/zfs_ioctl.h> +#include <time.h> +#include <unistd.h> +#include <sys/fm/fs/zfs.h> +#include "zed.h" +#include "zed_conf.h" +#include "zed_disk_event.h" +#include "zed_event.h" +#include "zed_exec.h" +#include "zed_file.h" +#include "zed_log.h" +#include "zed_strings.h" + +#include "agents/zfs_agents.h" + +#define MAXBUF 4096 + +/* + * Open the libzfs interface. + */ +int +zed_event_init(struct zed_conf *zcp) +{ + if (!zcp) + zed_log_die("Failed zed_event_init: %s", strerror(EINVAL)); + + zcp->zfs_hdl = libzfs_init(); + if (!zcp->zfs_hdl) { + if (zcp->do_idle) + return (-1); + zed_log_die("Failed to initialize libzfs"); + } + + zcp->zevent_fd = open(ZFS_DEV, O_RDWR); + if (zcp->zevent_fd < 0) { + if (zcp->do_idle) + return (-1); + zed_log_die("Failed to open \"%s\": %s", + ZFS_DEV, strerror(errno)); + } + + zfs_agent_init(zcp->zfs_hdl); + + if (zed_disk_event_init() != 0) { + if (zcp->do_idle) + return (-1); + zed_log_die("Failed to initialize disk events"); + } + + return (0); +} + +/* + * Close the libzfs interface. + */ +void +zed_event_fini(struct zed_conf *zcp) +{ + if (!zcp) + zed_log_die("Failed zed_event_fini: %s", strerror(EINVAL)); + + zed_disk_event_fini(); + zfs_agent_fini(); + + if (zcp->zevent_fd >= 0) { + if (close(zcp->zevent_fd) < 0) + zed_log_msg(LOG_WARNING, "Failed to close \"%s\": %s", + ZFS_DEV, strerror(errno)); + + zcp->zevent_fd = -1; + } + if (zcp->zfs_hdl) { + libzfs_fini(zcp->zfs_hdl); + zcp->zfs_hdl = NULL; + } +} + +/* + * Seek to the event specified by [saved_eid] and [saved_etime]. + * This protects against processing a given event more than once. + * Return 0 upon a successful seek to the specified event, or -1 otherwise. + * + * A zevent is considered to be uniquely specified by its (eid,time) tuple. + * The unsigned 64b eid is set to 1 when the kernel module is loaded, and + * incremented by 1 for each new event. 
Since the state file can persist + * across a kernel module reload, the time must be checked to ensure a match. + */ +int +zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid, int64_t saved_etime[]) +{ + uint64_t eid; + int found; + nvlist_t *nvl; + int n_dropped; + int64_t *etime; + uint_t nelem; + int rv; + + if (!zcp) { + errno = EINVAL; + zed_log_msg(LOG_ERR, "Failed to seek zevent: %s", + strerror(errno)); + return (-1); + } + eid = 0; + found = 0; + while ((eid < saved_eid) && !found) { + rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped, + ZEVENT_NONBLOCK, zcp->zevent_fd); + + if ((rv != 0) || !nvl) + break; + + if (n_dropped > 0) { + zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped); + /* + * FIXME: Increase max size of event nvlist in + * /sys/module/zfs/parameters/zfs_zevent_len_max ? + */ + } + if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) { + zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid"); + } else if (nvlist_lookup_int64_array(nvl, "time", + &etime, &nelem) != 0) { + zed_log_msg(LOG_WARNING, + "Failed to lookup zevent time (eid=%llu)", eid); + } else if (nelem != 2) { + zed_log_msg(LOG_WARNING, + "Failed to lookup zevent time (eid=%llu, nelem=%u)", + eid, nelem); + } else if ((eid != saved_eid) || + (etime[0] != saved_etime[0]) || + (etime[1] != saved_etime[1])) { + /* no-op */ + } else { + found = 1; + } + free(nvl); + } + if (!found && (saved_eid > 0)) { + if (zpool_events_seek(zcp->zfs_hdl, ZEVENT_SEEK_START, + zcp->zevent_fd) < 0) + zed_log_msg(LOG_WARNING, "Failed to seek to eid=0"); + else + eid = 0; + } + zed_log_msg(LOG_NOTICE, "Processing events since eid=%llu", eid); + return (found ? 0 : -1); +} + +/* + * Return non-zero if nvpair [name] should be formatted in hex; o/w, return 0. 
+ */ +static int +_zed_event_value_is_hex(const char *name) +{ + const char *hex_suffix[] = { + "_guid", + "_guids", + NULL + }; + const char **pp; + char *p; + + if (!name) + return (0); + + for (pp = hex_suffix; *pp; pp++) { + p = strstr(name, *pp); + if (p && strlen(p) == strlen(*pp)) + return (1); + } + return (0); +} + +/* + * Add an environment variable for [eid] to the container [zsp]. + * + * The variable name is the concatenation of [prefix] and [name] converted to + * uppercase with non-alphanumeric characters converted to underscores; + * [prefix] is optional, and [name] must begin with an alphabetic character. + * If the converted variable name already exists within the container [zsp], + * its existing value will be replaced with the new value. + * + * The variable value is specified by the format string [fmt]. + * + * Returns 0 on success, and -1 on error (with errno set). + * + * All environment variables in [zsp] should be added through this function. + */ +static int +_zed_event_add_var(uint64_t eid, zed_strings_t *zsp, + const char *prefix, const char *name, const char *fmt, ...) +{ + char keybuf[MAXBUF]; + char valbuf[MAXBUF]; + char *dstp; + const char *srcp; + const char *lastp; + int n; + int buflen; + va_list vargs; + + assert(zsp != NULL); + assert(fmt != NULL); + + if (!name) { + errno = EINVAL; + zed_log_msg(LOG_WARNING, + "Failed to add variable for eid=%llu: Name is empty", eid); + return (-1); + } else if (!isalpha(name[0])) { + errno = EINVAL; + zed_log_msg(LOG_WARNING, + "Failed to add variable for eid=%llu: " + "Name \"%s\" is invalid", eid, name); + return (-1); + } + /* + * Construct the string key by converting PREFIX (if present) and NAME. + */ + dstp = keybuf; + lastp = keybuf + sizeof (keybuf); + if (prefix) { + for (srcp = prefix; *srcp && (dstp < lastp); srcp++) + *dstp++ = isalnum(*srcp) ? toupper(*srcp) : '_'; + } + for (srcp = name; *srcp && (dstp < lastp); srcp++) + *dstp++ = isalnum(*srcp) ? 
toupper(*srcp) : '_'; + + if (dstp == lastp) { + errno = ENAMETOOLONG; + zed_log_msg(LOG_WARNING, + "Failed to add variable for eid=%llu: Name too long", eid); + return (-1); + } + *dstp = '\0'; + /* + * Construct the string specified by "[PREFIX][NAME]=[FMT]". + */ + dstp = valbuf; + buflen = sizeof (valbuf); + n = strlcpy(dstp, keybuf, buflen); + if (n >= sizeof (valbuf)) { + errno = EMSGSIZE; + zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s", + keybuf, eid, "Exceeded buffer size"); + return (-1); + } + dstp += n; + buflen -= n; + + *dstp++ = '='; + buflen--; + + if (buflen <= 0) { + errno = EMSGSIZE; + zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s", + keybuf, eid, "Exceeded buffer size"); + return (-1); + } + + va_start(vargs, fmt); + n = vsnprintf(dstp, buflen, fmt, vargs); + va_end(vargs); + + if ((n < 0) || (n >= buflen)) { + errno = EMSGSIZE; + zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s", + keybuf, eid, "Exceeded buffer size"); + return (-1); + } else if (zed_strings_add(zsp, keybuf, valbuf) < 0) { + zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s", + keybuf, eid, strerror(errno)); + return (-1); + } + return (0); +} + +static int +_zed_event_add_array_err(uint64_t eid, const char *name) +{ + errno = EMSGSIZE; + zed_log_msg(LOG_WARNING, + "Failed to convert nvpair \"%s\" for eid=%llu: " + "Exceeded buffer size", name, eid); + return (-1); +} + +static int +_zed_event_add_int8_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + int8_t *i8p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT8_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_int8_array(nvp, &i8p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%d ", i8p[i]); + if ((n < 0) || (n >= buflen)) + return 
(_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_uint8_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + uint8_t *u8p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT8_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_uint8_array(nvp, &u8p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%u ", u8p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_int16_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + int16_t *i16p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT16_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_int16_array(nvp, &i16p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%d ", i16p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_uint16_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + uint16_t *u16p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT16_ARRAY)); + + name = nvpair_name(nvp); + (void) 
nvpair_value_uint16_array(nvp, &u16p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%u ", u16p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_int32_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + int32_t *i32p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT32_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_int32_array(nvp, &i32p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%d ", i32p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_uint32_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + uint32_t *u32p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_uint32_array(nvp, &u32p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%u ", u32p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_int64_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + 
int64_t *i64p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT64_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_int64_array(nvp, &i64p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%lld ", (u_longlong_t)i64p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_uint64_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + const char *fmt; + uint64_t *u64p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT64_ARRAY)); + + name = nvpair_name(nvp); + fmt = _zed_event_value_is_hex(name) ? "0x%.16llX " : "%llu "; + (void) nvpair_value_uint64_array(nvp, &u64p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, fmt, (u_longlong_t)u64p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_string_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + char **strp; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_STRING_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_string_array(nvp, &strp, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%s ", strp[i] ? 
strp[i] : "<NULL>"); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +/* + * Convert the nvpair [nvp] to a string which is added to the environment + * of the child process. + * Return 0 on success, -1 on error. + * + * FIXME: Refactor with cmd/zpool/zpool_main.c:zpool_do_events_nvprint()? + */ +static void +_zed_event_add_nvpair(uint64_t eid, zed_strings_t *zsp, nvpair_t *nvp) +{ + const char *name; + data_type_t type; + const char *prefix = ZEVENT_VAR_PREFIX; + boolean_t b; + double d; + uint8_t i8; + uint16_t i16; + uint32_t i32; + uint64_t i64; + char *str; + + assert(zsp != NULL); + assert(nvp != NULL); + + name = nvpair_name(nvp); + type = nvpair_type(nvp); + + switch (type) { + case DATA_TYPE_BOOLEAN: + _zed_event_add_var(eid, zsp, prefix, name, "%s", "1"); + break; + case DATA_TYPE_BOOLEAN_VALUE: + (void) nvpair_value_boolean_value(nvp, &b); + _zed_event_add_var(eid, zsp, prefix, name, "%s", b ? 
"1" : "0"); + break; + case DATA_TYPE_BYTE: + (void) nvpair_value_byte(nvp, &i8); + _zed_event_add_var(eid, zsp, prefix, name, "%d", i8); + break; + case DATA_TYPE_INT8: + (void) nvpair_value_int8(nvp, (int8_t *)&i8); + _zed_event_add_var(eid, zsp, prefix, name, "%d", i8); + break; + case DATA_TYPE_UINT8: + (void) nvpair_value_uint8(nvp, &i8); + _zed_event_add_var(eid, zsp, prefix, name, "%u", i8); + break; + case DATA_TYPE_INT16: + (void) nvpair_value_int16(nvp, (int16_t *)&i16); + _zed_event_add_var(eid, zsp, prefix, name, "%d", i16); + break; + case DATA_TYPE_UINT16: + (void) nvpair_value_uint16(nvp, &i16); + _zed_event_add_var(eid, zsp, prefix, name, "%u", i16); + break; + case DATA_TYPE_INT32: + (void) nvpair_value_int32(nvp, (int32_t *)&i32); + _zed_event_add_var(eid, zsp, prefix, name, "%d", i32); + break; + case DATA_TYPE_UINT32: + (void) nvpair_value_uint32(nvp, &i32); + _zed_event_add_var(eid, zsp, prefix, name, "%u", i32); + break; + case DATA_TYPE_INT64: + (void) nvpair_value_int64(nvp, (int64_t *)&i64); + _zed_event_add_var(eid, zsp, prefix, name, + "%lld", (longlong_t)i64); + break; + case DATA_TYPE_UINT64: + (void) nvpair_value_uint64(nvp, &i64); + _zed_event_add_var(eid, zsp, prefix, name, + (_zed_event_value_is_hex(name) ? 
"0x%.16llX" : "%llu"), + (u_longlong_t)i64); + /* + * shadow readable strings for vdev state pairs + */ + if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 || + strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) { + char alt[32]; + + (void) snprintf(alt, sizeof (alt), "%s_str", name); + _zed_event_add_var(eid, zsp, prefix, alt, "%s", + zpool_state_to_name(i64, VDEV_AUX_NONE)); + } else + /* + * shadow readable strings for pool state + */ + if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_POOL_STATE) == 0) { + char alt[32]; + + (void) snprintf(alt, sizeof (alt), "%s_str", name); + _zed_event_add_var(eid, zsp, prefix, alt, "%s", + zpool_pool_state_to_name(i64)); + } + break; + case DATA_TYPE_DOUBLE: + (void) nvpair_value_double(nvp, &d); + _zed_event_add_var(eid, zsp, prefix, name, "%g", d); + break; + case DATA_TYPE_HRTIME: + (void) nvpair_value_hrtime(nvp, (hrtime_t *)&i64); + _zed_event_add_var(eid, zsp, prefix, name, + "%llu", (u_longlong_t)i64); + break; + case DATA_TYPE_NVLIST: + _zed_event_add_var(eid, zsp, prefix, name, + "%s", "_NOT_IMPLEMENTED_"); /* FIXME */ + break; + case DATA_TYPE_STRING: + (void) nvpair_value_string(nvp, &str); + _zed_event_add_var(eid, zsp, prefix, name, + "%s", (str ? 
str : "<NULL>")); + break; + case DATA_TYPE_BOOLEAN_ARRAY: + _zed_event_add_var(eid, zsp, prefix, name, + "%s", "_NOT_IMPLEMENTED_"); /* FIXME */ + break; + case DATA_TYPE_BYTE_ARRAY: + _zed_event_add_var(eid, zsp, prefix, name, + "%s", "_NOT_IMPLEMENTED_"); /* FIXME */ + break; + case DATA_TYPE_INT8_ARRAY: + _zed_event_add_int8_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_UINT8_ARRAY: + _zed_event_add_uint8_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_INT16_ARRAY: + _zed_event_add_int16_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_UINT16_ARRAY: + _zed_event_add_uint16_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_INT32_ARRAY: + _zed_event_add_int32_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_UINT32_ARRAY: + _zed_event_add_uint32_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_INT64_ARRAY: + _zed_event_add_int64_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_UINT64_ARRAY: + _zed_event_add_uint64_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_STRING_ARRAY: + _zed_event_add_string_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_NVLIST_ARRAY: + _zed_event_add_var(eid, zsp, prefix, name, + "%s", "_NOT_IMPLEMENTED_"); /* FIXME */ + break; + default: + errno = EINVAL; + zed_log_msg(LOG_WARNING, + "Failed to convert nvpair \"%s\" for eid=%llu: " + "Unrecognized type=%u", name, eid, (unsigned int) type); + break; + } +} + +/* + * Restrict various environment variables to safe and sane values + * when constructing the environment for the child process, unless + * we're running with a custom $PATH (like under the ZFS test suite). + * + * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1. 
/*
 * Restrict various environment variables to safe and sane values
 * when constructing the environment for the child process, unless
 * we're running with a custom $PATH (like under the ZFS test suite).
 *
 * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1.
 */
static void
_zed_event_add_env_restrict(uint64_t eid, zed_strings_t *zsp,
    const char *path)
{
	/* Hard-coded safe values used when no custom $PATH is in effect. */
	const char *env_restrict[][2] = {
		{ "IFS", " \t\n" },
		{ "PATH", _PATH_STDPATH },
		{ "ZDB", SBINDIR "/zdb" },
		{ "ZED", SBINDIR "/zed" },
		{ "ZFS", SBINDIR "/zfs" },
		{ "ZINJECT", SBINDIR "/zinject" },
		{ "ZPOOL", SBINDIR "/zpool" },
		{ "ZFS_ALIAS", ZFS_META_ALIAS },
		{ "ZFS_VERSION", ZFS_META_VERSION },
		{ "ZFS_RELEASE", ZFS_META_RELEASE },
		{ NULL, NULL }
	};

	/*
	 * If we have a custom $PATH, use the default ZFS binary locations
	 * instead of the hard-coded ones.
	 */
	const char *env_path[][2] = {
		{ "IFS", " \t\n" },
		{ "PATH", NULL }, /* $PATH copied in later on */
		{ "ZDB", "zdb" },
		{ "ZED", "zed" },
		{ "ZFS", "zfs" },
		{ "ZINJECT", "zinject" },
		{ "ZPOOL", "zpool" },
		{ "ZFS_ALIAS", ZFS_META_ALIAS },
		{ "ZFS_VERSION", ZFS_META_VERSION },
		{ "ZFS_RELEASE", ZFS_META_RELEASE },
		{ NULL, NULL }
	};
	const char *(*pa)[2];	/* iterator over the chosen table's rows */

	assert(zsp != NULL);

	/* [path] selects which table governs the child's environment. */
	pa = path != NULL ? env_path : env_restrict;

	for (; *(*pa); pa++) {
		/* Use our custom $PATH if we have one */
		if (path != NULL && strcmp((*pa)[0], "PATH") == 0)
			(*pa)[1] = path;

		_zed_event_add_var(eid, zsp, NULL, (*pa)[0], "%s", (*pa)[1]);
	}
}

/*
 * Preserve specified variables from the parent environment
 * when constructing the environment for the child process.
 *
 * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1.
 */
static void
_zed_event_add_env_preserve(uint64_t eid, zed_strings_t *zsp)
{
	/* Whitelist of parent-environment variables passed through. */
	const char *env_preserve[] = {
		"TZ",
		NULL
	};
	const char **keyp;
	const char *val;

	assert(zsp != NULL);

	for (keyp = env_preserve; *keyp; keyp++) {
		if ((val = getenv(*keyp)))
			_zed_event_add_var(eid, zsp, NULL, *keyp, "%s", val);
	}
}
Return a pointer inside + * the string [class], or NULL if insufficient components exist. + */ +static const char * +_zed_event_get_subclass(const char *class) +{ + const char *p; + int i; + + if (!class) + return (NULL); + + p = class; + for (i = 0; i < 3; i++) { + p = strchr(p, '.'); + if (!p) + break; + p++; + } + return (p); +} + +/* + * Convert the zevent time from a 2-element array of 64b integers + * into a more convenient form: + * - TIME_SECS is the second component of the time. + * - TIME_NSECS is the nanosecond component of the time. + * - TIME_STRING is an almost-RFC3339-compliant string representation. + */ +static void +_zed_event_add_time_strings(uint64_t eid, zed_strings_t *zsp, int64_t etime[]) +{ + struct tm *stp; + char buf[32]; + + assert(zsp != NULL); + assert(etime != NULL); + + _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_SECS", + "%lld", (long long int) etime[0]); + _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_NSECS", + "%lld", (long long int) etime[1]); + + if (!(stp = localtime((const time_t *) &etime[0]))) { + zed_log_msg(LOG_WARNING, "Failed to add %s%s for eid=%llu: %s", + ZEVENT_VAR_PREFIX, "TIME_STRING", eid, "localtime error"); + } else if (!strftime(buf, sizeof (buf), "%Y-%m-%d %H:%M:%S%z", stp)) { + zed_log_msg(LOG_WARNING, "Failed to add %s%s for eid=%llu: %s", + ZEVENT_VAR_PREFIX, "TIME_STRING", eid, "strftime error"); + } else { + _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_STRING", + "%s", buf); + } +} + +/* + * Service the next zevent, blocking until one is available. 
/*
 * Service the next zevent, blocking until one is available.
 */
int
zed_event_service(struct zed_conf *zcp)
{
	nvlist_t *nvl;
	nvpair_t *nvp;
	int n_dropped;
	zed_strings_t *zsp;	/* NAME=VALUE env strings for the zedlets */
	uint64_t eid;
	int64_t *etime;
	uint_t nelem;
	char *class;
	const char *subclass;
	int rv;

	if (!zcp) {
		errno = EINVAL;
		zed_log_msg(LOG_ERR, "Failed to service zevent: %s",
		    strerror(errno));
		return (EINVAL);
	}
	rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped, ZEVENT_NONE,
	    zcp->zevent_fd);

	/*
	 * NOTE(review): on failure this returns whatever errno currently
	 * holds -- presumably zpool_events_next() sets it; confirm.
	 */
	if ((rv != 0) || !nvl)
		return (errno);

	if (n_dropped > 0) {
		zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped);
		/*
		 * FIXME: Increase max size of event nvlist in
		 *   /sys/module/zfs/parameters/zfs_zevent_len_max ?
		 */
	}
	/* An event is only processed if eid, time, and class all look sane. */
	if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) {
		zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid");
	} else if (nvlist_lookup_int64_array(
	    nvl, "time", &etime, &nelem) != 0) {
		zed_log_msg(LOG_WARNING,
		    "Failed to lookup zevent time (eid=%llu)", eid);
	} else if (nelem != 2) {
		/* "time" must be the [secs, nsecs] pair */
		zed_log_msg(LOG_WARNING,
		    "Failed to lookup zevent time (eid=%llu, nelem=%u)",
		    eid, nelem);
	} else if (nvlist_lookup_string(nvl, "class", &class) != 0) {
		zed_log_msg(LOG_WARNING,
		    "Failed to lookup zevent class (eid=%llu)", eid);
	} else {
		/* let internal modules see this event first */
		zfs_agent_post_event(class, NULL, nvl);

		zsp = zed_strings_create();

		/* Export every nvpair of the event as an env variable. */
		nvp = NULL;
		while ((nvp = nvlist_next_nvpair(nvl, nvp)))
			_zed_event_add_nvpair(eid, zsp, nvp);

		_zed_event_add_env_restrict(eid, zsp, zcp->path);
		_zed_event_add_env_preserve(eid, zsp);

		_zed_event_add_var(eid, zsp, ZED_VAR_PREFIX, "PID",
		    "%d", (int)getpid());
		_zed_event_add_var(eid, zsp, ZED_VAR_PREFIX, "ZEDLET_DIR",
		    "%s", zcp->zedlet_dir);
		subclass = _zed_event_get_subclass(class);
		/* fall back to the full class when it has no subclass */
		_zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "SUBCLASS",
		    "%s", (subclass ? subclass : class));

		_zed_event_add_time_strings(eid, zsp, etime);

		zed_exec_process(eid, class, subclass,
		    zcp->zedlet_dir, zcp->zedlets, zsp, zcp->zevent_fd);

		/* Persist the cursor only after the zedlets have run. */
		zed_conf_write_state(zcp, eid, etime);

		zed_strings_destroy(zsp);
	}
	nvlist_free(nvl);
	return (0);
}
+ * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + */ + +#include <assert.h> +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/wait.h> +#include <time.h> +#include <unistd.h> +#include "zed_exec.h" +#include "zed_file.h" +#include "zed_log.h" +#include "zed_strings.h" + +#define ZEVENT_FILENO 3 + +/* + * Create an environment string array for passing to execve() using the + * NAME=VALUE strings in container [zsp]. + * Return a newly-allocated environment, or NULL on error. + */ +static char ** +_zed_exec_create_env(zed_strings_t *zsp) +{ + int num_ptrs; + int buflen; + char *buf; + char **pp; + char *p; + const char *q; + int i; + int len; + + num_ptrs = zed_strings_count(zsp) + 1; + buflen = num_ptrs * sizeof (char *); + for (q = zed_strings_first(zsp); q; q = zed_strings_next(zsp)) + buflen += strlen(q) + 1; + + buf = calloc(1, buflen); + if (!buf) + return (NULL); + + pp = (char **)buf; + p = buf + (num_ptrs * sizeof (char *)); + i = 0; + for (q = zed_strings_first(zsp); q; q = zed_strings_next(zsp)) { + pp[i] = p; + len = strlen(q) + 1; + memcpy(p, q, len); + p += len; + i++; + } + pp[i] = NULL; + assert(buf + buflen == p); + return ((char **)buf); +} + +/* + * Fork a child process to handle event [eid]. The program [prog] + * in directory [dir] is executed with the environment [env]. + * + * The file descriptor [zfd] is the zevent_fd used to track the + * current cursor location within the zevent nvlist. 
+ */ +static void +_zed_exec_fork_child(uint64_t eid, const char *dir, const char *prog, + char *env[], int zfd) +{ + char path[PATH_MAX]; + int n; + pid_t pid; + int fd; + pid_t wpid; + int status; + + assert(dir != NULL); + assert(prog != NULL); + assert(env != NULL); + assert(zfd >= 0); + + n = snprintf(path, sizeof (path), "%s/%s", dir, prog); + if ((n < 0) || (n >= sizeof (path))) { + zed_log_msg(LOG_WARNING, + "Failed to fork \"%s\" for eid=%llu: %s", + prog, eid, strerror(ENAMETOOLONG)); + return; + } + pid = fork(); + if (pid < 0) { + zed_log_msg(LOG_WARNING, + "Failed to fork \"%s\" for eid=%llu: %s", + prog, eid, strerror(errno)); + return; + } else if (pid == 0) { + (void) umask(022); + if ((fd = open("/dev/null", O_RDWR)) != -1) { + (void) dup2(fd, STDIN_FILENO); + (void) dup2(fd, STDOUT_FILENO); + (void) dup2(fd, STDERR_FILENO); + } + (void) dup2(zfd, ZEVENT_FILENO); + zed_file_close_from(ZEVENT_FILENO + 1); + execle(path, prog, NULL, env); + _exit(127); + } + + /* parent process */ + + zed_log_msg(LOG_INFO, "Invoking \"%s\" eid=%llu pid=%d", + prog, eid, pid); + + /* FIXME: Timeout rogue child processes with sigalarm? */ + + /* + * Wait for child process using WNOHANG to limit + * the time spent waiting to 10 seconds (10,000ms). 
+ */ + for (n = 0; n < 1000; n++) { + wpid = waitpid(pid, &status, WNOHANG); + if (wpid == (pid_t)-1) { + if (errno == EINTR) + continue; + zed_log_msg(LOG_WARNING, + "Failed to wait for \"%s\" eid=%llu pid=%d", + prog, eid, pid); + break; + } else if (wpid == 0) { + struct timespec t; + + /* child still running */ + t.tv_sec = 0; + t.tv_nsec = 10000000; /* 10ms */ + (void) nanosleep(&t, NULL); + continue; + } + + if (WIFEXITED(status)) { + zed_log_msg(LOG_INFO, + "Finished \"%s\" eid=%llu pid=%d exit=%d", + prog, eid, pid, WEXITSTATUS(status)); + } else if (WIFSIGNALED(status)) { + zed_log_msg(LOG_INFO, + "Finished \"%s\" eid=%llu pid=%d sig=%d/%s", + prog, eid, pid, WTERMSIG(status), + strsignal(WTERMSIG(status))); + } else { + zed_log_msg(LOG_INFO, + "Finished \"%s\" eid=%llu pid=%d status=0x%X", + prog, eid, (unsigned int) status); + } + break; + } + + /* + * kill child process after 10 seconds + */ + if (wpid == 0) { + zed_log_msg(LOG_WARNING, "Killing hung \"%s\" pid=%d", + prog, pid); + (void) kill(pid, SIGKILL); + } +} + +/* + * Process the event [eid] by synchronously invoking all zedlets with a + * matching class prefix. + * + * Each executable in [zedlets] from the directory [dir] is matched against + * the event's [class], [subclass], and the "all" class (which matches + * all events). Every zedlet with a matching class prefix is invoked. + * The NAME=VALUE strings in [envs] will be passed to the zedlet as + * environment variables. + * + * The file descriptor [zfd] is the zevent_fd used to track the + * current cursor location within the zevent nvlist. + * + * Return 0 on success, -1 on error. 
+ */ +int +zed_exec_process(uint64_t eid, const char *class, const char *subclass, + const char *dir, zed_strings_t *zedlets, zed_strings_t *envs, int zfd) +{ + const char *class_strings[4]; + const char *allclass = "all"; + const char **csp; + const char *z; + char **e; + int n; + + if (!dir || !zedlets || !envs || zfd < 0) + return (-1); + + csp = class_strings; + + if (class) + *csp++ = class; + + if (subclass) + *csp++ = subclass; + + if (allclass) + *csp++ = allclass; + + *csp = NULL; + + e = _zed_exec_create_env(envs); + + for (z = zed_strings_first(zedlets); z; z = zed_strings_next(zedlets)) { + for (csp = class_strings; *csp; csp++) { + n = strlen(*csp); + if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n])) + _zed_exec_fork_child(eid, dir, z, e, zfd); + } + } + free(e); + return (0); +} diff --git a/cmd/zed/zed_exec.h b/cmd/zed/zed_exec.h new file mode 100644 index 000000000000..4153e5519a46 --- /dev/null +++ b/cmd/zed/zed_exec.h @@ -0,0 +1,25 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. 
/*
 * Read up to [n] bytes from [fd] into [buf], retrying on EINTR and
 * continuing across short reads.
 * Return the number of bytes read, 0 on EOF, or -1 on error.
 */
ssize_t
zed_file_read_n(int fd, void *buf, size_t n)
{
	unsigned char *dst = buf;
	size_t remaining = n;

	while (remaining > 0) {
		ssize_t rc = read(fd, dst, remaining);

		if (rc < 0) {
			if (errno == EINTR)
				continue;
			return (-1);
		}
		if (rc == 0)	/* EOF */
			break;
		dst += rc;
		remaining -= rc;
	}
	return (n - remaining);
}
/*
 * Write [n] bytes from [buf] out to [fd], retrying on EINTR and
 * continuing across short writes.
 * Return the number of bytes written, or -1 on error.
 */
ssize_t
zed_file_write_n(int fd, void *buf, size_t n)
{
	const unsigned char *src = buf;
	size_t remaining = n;

	while (remaining > 0) {
		ssize_t rc = write(fd, src, remaining);

		if (rc < 0) {
			if (errno == EINTR)
				continue;
			return (-1);
		}
		src += rc;
		remaining -= rc;
	}
	return (n);
}

/*
 * Set an exclusive advisory lock on the open file descriptor [fd].
 * Return 0 on success, 1 if a conflicting lock is held by another process,
 * or -1 on error (with errno set).
 */
int
zed_file_lock(int fd)
{
	struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET,
	    .l_start = 0, .l_len = 0 };

	if (fd < 0) {
		errno = EBADF;
		return (-1);
	}
	if (fcntl(fd, F_SETLK, &fl) < 0)
		return ((errno == EACCES || errno == EAGAIN) ? 1 : -1);

	return (0);
}

/*
 * Release an advisory lock held on the open file descriptor [fd].
 * Return 0 on success, or -1 on error (with errno set).
 */
int
zed_file_unlock(int fd)
{
	struct flock fl = { .l_type = F_UNLCK, .l_whence = SEEK_SET,
	    .l_start = 0, .l_len = 0 };

	if (fd < 0) {
		errno = EBADF;
		return (-1);
	}
	return ((fcntl(fd, F_SETLK, &fl) < 0) ? -1 : 0);
}

/*
 * Test whether an exclusive advisory lock could be obtained for the open
 * file descriptor [fd].
 * Return 0 if the file is not locked, >0 for the PID of another process
 * holding a conflicting lock, or -1 on error (with errno set).
 */
pid_t
zed_file_is_locked(int fd)
{
	struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET,
	    .l_start = 0, .l_len = 0 };

	if (fd < 0) {
		errno = EBADF;
		return (-1);
	}
	if (fcntl(fd, F_GETLK, &fl) < 0)
		return (-1);

	return ((fl.l_type == F_UNLCK) ? 0 : fl.l_pid);
}
/*
 * Close all open file descriptors greater than or equal to [lowfd].
 * Any errors encountered while closing file descriptors are ignored.
 */
void
zed_file_close_from(int lowfd)
{
	const int fallback_maxfd = 256;
	struct rlimit rl;
	int saved_errno;
	int limit;
	int fd;

	saved_errno = errno;	/* close() failures must not clobber errno */

	/* Fall back to a fixed bound when the limit is unknown or unbounded. */
	if ((getrlimit(RLIMIT_NOFILE, &rl) < 0) ||
	    (rl.rlim_max == RLIM_INFINITY))
		limit = fallback_maxfd;
	else
		limit = rl.rlim_max;

	for (fd = lowfd; fd < limit; fd++)
		(void) close(fd);

	errno = saved_errno;
}

/*
 * Set the CLOEXEC flag on file descriptor [fd] so it will be automatically
 * closed upon successful execution of one of the exec functions.
 * Return 0 on success, or -1 on error.
 *
 * FIXME: No longer needed?
 */
int
zed_file_close_on_exec(int fd)
{
	int flags;

	if (fd < 0) {
		errno = EBADF;
		return (-1);
	}
	flags = fcntl(fd, F_GETFD);
	if (flags == -1)
		return (-1);

	if (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) == -1)
		return (-1);

	return (0);
}
#define	ZED_LOG_MAX_LOG_LEN	1024

/*
 * Process-wide logging state shared by all zed_log_*() functions.
 */
static struct {
	unsigned do_stderr:1;	/* nonzero if messages are copied to stderr */
	unsigned do_syslog:1;	/* nonzero if messages are sent to syslog */
	const char *identity;	/* syslog identity (basename of argv[0]) */
	int priority;		/* max priority echoed to stderr */
	int pipe_fd[2];		/* daemonize pipe; -1 marks a closed end */
} _ctx;

/*
 * Initialize the logging subsystem.
 */
void
zed_log_init(const char *identity)
{
	if (identity) {
		/* Keep only the basename so syslog shows "zed", not a path. */
		const char *p = strrchr(identity, '/');
		_ctx.identity = (p != NULL) ? p + 1 : identity;
	} else {
		_ctx.identity = NULL;
	}
	_ctx.pipe_fd[0] = -1;
	_ctx.pipe_fd[1] = -1;
}
/*
 * Shutdown the logging subsystem.
 */
void
zed_log_fini(void)
{
	zed_log_stderr_close();
	zed_log_syslog_close();
}

/*
 * Create pipe for communicating daemonization status between the parent and
 * child processes across the double-fork().
 */
void
zed_log_pipe_open(void)
{
	/* Guard against double-open; zed_log_die() does not return. */
	if ((_ctx.pipe_fd[0] != -1) || (_ctx.pipe_fd[1] != -1))
		zed_log_die("Invalid use of zed_log_pipe_open in PID %d",
		    (int)getpid());

	if (pipe(_ctx.pipe_fd) < 0)
		zed_log_die("Failed to create daemonize pipe in PID %d: %s",
		    (int)getpid(), strerror(errno));
}

/*
 * Close the read-half of the daemonize pipe.
 *
 * This should be called by the child after fork()ing from the parent since
 * the child will never read from this pipe.
 */
void
zed_log_pipe_close_reads(void)
{
	if (_ctx.pipe_fd[0] < 0)
		zed_log_die(
		    "Invalid use of zed_log_pipe_close_reads in PID %d",
		    (int)getpid());

	if (close(_ctx.pipe_fd[0]) < 0)
		zed_log_die(
		    "Failed to close reads on daemonize pipe in PID %d: %s",
		    (int)getpid(), strerror(errno));

	/* Mark the end closed so later misuse trips the guard above. */
	_ctx.pipe_fd[0] = -1;
}

/*
 * Close the write-half of the daemonize pipe.
 *
 * This should be called by the parent after fork()ing its child since the
 * parent will never write to this pipe.
 *
 * This should also be called by the child once initialization is complete
 * in order to signal the parent that it can safely exit.
 */
void
zed_log_pipe_close_writes(void)
{
	if (_ctx.pipe_fd[1] < 0)
		zed_log_die(
		    "Invalid use of zed_log_pipe_close_writes in PID %d",
		    (int)getpid());

	if (close(_ctx.pipe_fd[1]) < 0)
		zed_log_die(
		    "Failed to close writes on daemonize pipe in PID %d: %s",
		    (int)getpid(), strerror(errno));

	_ctx.pipe_fd[1] = -1;
}
/*
 * Block on reading from the daemonize pipe until signaled by the child
 * (via zed_log_pipe_close_writes()) that initialization is complete.
 *
 * This should only be called by the parent while waiting to exit after
 * fork()ing the child.
 */
void
zed_log_pipe_wait(void)
{
	ssize_t n;
	char c;

	if (_ctx.pipe_fd[0] < 0)
		zed_log_die("Invalid use of zed_log_pipe_wait in PID %d",
		    (int)getpid());

	for (;;) {
		n = read(_ctx.pipe_fd[0], &c, sizeof (c));
		if (n < 0) {
			if (errno == EINTR)
				continue;
			zed_log_die(
			    "Failed to read from daemonize pipe in PID %d: %s",
			    (int)getpid(), strerror(errno));
		}
		if (n == 0) {
			/* EOF: the child closed its write end. */
			break;
		}
	}
}

/*
 * Start logging messages at the syslog [priority] level or higher to stderr.
 * Refer to syslog(3) for valid priority values.
 */
void
zed_log_stderr_open(int priority)
{
	_ctx.do_stderr = 1;
	_ctx.priority = priority;
}

/*
 * Stop logging messages to stderr.
 */
void
zed_log_stderr_close(void)
{
	if (_ctx.do_stderr)
		_ctx.do_stderr = 0;
}

/*
 * Start logging messages to syslog.
 * Refer to syslog(3) for valid option/facility values.
 */
void
zed_log_syslog_open(int facility)
{
	_ctx.do_syslog = 1;
	openlog(_ctx.identity, LOG_NDELAY | LOG_PID, facility);
}

/*
 * Stop logging messages to syslog.
 */
void
zed_log_syslog_close(void)
{
	if (_ctx.do_syslog) {
		_ctx.do_syslog = 0;
		closelog();
	}
}

/*
 * Auxiliary function to log a message to syslog and/or stderr.
 */
static void
_zed_log_aux(int priority, const char *fmt, va_list vargs)
{
	char buf[ZED_LOG_MAX_LOG_LEN];
	int n;

	if (!fmt)
		return;

	n = vsnprintf(buf, sizeof (buf), fmt, vargs);
	if ((n < 0) || (n >= sizeof (buf))) {
		/* Overwrite the tail with '+' to flag truncation. */
		buf[sizeof (buf) - 2] = '+';
		buf[sizeof (buf) - 1] = '\0';
	}

	if (_ctx.do_syslog)
		syslog(priority, "%s", buf);

	/* Echo to stderr only up to the configured priority threshold. */
	if (_ctx.do_stderr && (priority <= _ctx.priority))
		fprintf(stderr, "%s\n", buf);
}

/*
 * Log a message at the given [priority] level specified by the printf-style
 * format string [fmt].
 */
void
zed_log_msg(int priority, const char *fmt, ...)
{
	va_list vargs;

	if (fmt) {
		va_start(vargs, fmt);
		_zed_log_aux(priority, fmt, vargs);
		va_end(vargs);
	}
}

/*
 * Log a fatal error message specified by the printf-style format string
 * [fmt], then terminate the process.  This function does not return.
 */
void
zed_log_die(const char *fmt, ...)
{
	va_list vargs;

	if (fmt) {
		va_start(vargs, fmt);
		_zed_log_aux(LOG_ERR, fmt, vargs);
		va_end(vargs);
	}
	exit(EXIT_FAILURE);
}
/*
 * An ordered collection of NAME=VALUE strings, stored in an AVL tree
 * keyed by each node's [key], with a single embedded iterator cursor.
 */
struct zed_strings {
	avl_tree_t tree;	/* tree of zed_strings_node_t, keyed by key */
	avl_node_t *iteratorp;	/* cursor for first()/next() traversal */
};

/*
 * A single keyed string.  When no explicit key is supplied, [key] aliases
 * [val] (see _zed_strings_node_create/_destroy for ownership handling).
 */
struct zed_strings_node {
	avl_node_t node;	/* AVL tree linkage */
	char *key;		/* index key; may alias [val] */
	char *val;		/* heap-allocated string value */
};

typedef struct zed_strings_node zed_strings_node_t;
+ */ +static int +_zed_strings_node_compare(const void *x1, const void *x2) +{ + const char *s1; + const char *s2; + int rv; + + assert(x1 != NULL); + assert(x2 != NULL); + + s1 = ((const zed_strings_node_t *) x1)->key; + assert(s1 != NULL); + s2 = ((const zed_strings_node_t *) x2)->key; + assert(s2 != NULL); + rv = strcmp(s1, s2); + + if (rv < 0) + return (-1); + + if (rv > 0) + return (1); + + return (0); +} + +/* + * Return a new string container, or NULL on error. + */ +zed_strings_t * +zed_strings_create(void) +{ + zed_strings_t *zsp; + + zsp = calloc(1, sizeof (*zsp)); + if (!zsp) + return (NULL); + + avl_create(&zsp->tree, _zed_strings_node_compare, + sizeof (zed_strings_node_t), offsetof(zed_strings_node_t, node)); + + zsp->iteratorp = NULL; + return (zsp); +} + +/* + * Destroy the string node [np]. + */ +static void +_zed_strings_node_destroy(zed_strings_node_t *np) +{ + if (!np) + return; + + if (np->key) { + if (np->key != np->val) + free(np->key); + np->key = NULL; + } + if (np->val) { + free(np->val); + np->val = NULL; + } + free(np); +} + +/* + * Return a new string node for storing the string [val], or NULL on error. + * If [key] is specified, it will be used to index the node; otherwise, + * the string [val] will be used. + */ +static zed_strings_node_t * +_zed_strings_node_create(const char *key, const char *val) +{ + zed_strings_node_t *np; + + assert(val != NULL); + + np = calloc(1, sizeof (*np)); + if (!np) + return (NULL); + + np->val = strdup(val); + if (!np->val) + goto nomem; + + if (key) { + np->key = strdup(key); + if (!np->key) + goto nomem; + } else { + np->key = np->val; + } + return (np); + +nomem: + _zed_strings_node_destroy(np); + return (NULL); +} + +/* + * Destroy the string container [zsp] and all nodes within. 
+ */ +void +zed_strings_destroy(zed_strings_t *zsp) +{ + void *cookie; + zed_strings_node_t *np; + + if (!zsp) + return; + + cookie = NULL; + while ((np = avl_destroy_nodes(&zsp->tree, &cookie))) + _zed_strings_node_destroy(np); + + avl_destroy(&zsp->tree); + free(zsp); +} + +/* + * Add a copy of the string [s] indexed by [key] to the container [zsp]. + * If [key] already exists within the container [zsp], it will be replaced + * with the new string [s]. + * If [key] is NULL, the string [s] will be used as the key. + * Return 0 on success, or -1 on error. + */ +int +zed_strings_add(zed_strings_t *zsp, const char *key, const char *s) +{ + zed_strings_node_t *newp, *oldp; + + if (!zsp || !s) { + errno = EINVAL; + return (-1); + } + if (key == s) + key = NULL; + + newp = _zed_strings_node_create(key, s); + if (!newp) + return (-1); + + oldp = avl_find(&zsp->tree, newp, NULL); + if (oldp) { + avl_remove(&zsp->tree, oldp); + _zed_strings_node_destroy(oldp); + } + avl_add(&zsp->tree, newp); + return (0); +} + +/* + * Return the first string in container [zsp]. + * Return NULL if there are no strings, or on error. + * This can be called multiple times to re-traverse [zsp]. + * XXX: Not thread-safe. + */ +const char * +zed_strings_first(zed_strings_t *zsp) +{ + if (!zsp) { + errno = EINVAL; + return (NULL); + } + zsp->iteratorp = avl_first(&zsp->tree); + if (!zsp->iteratorp) + return (NULL); + + return (((zed_strings_node_t *)zsp->iteratorp)->val); + +} + +/* + * Return the next string in container [zsp]. + * Return NULL after the last string, or on error. + * This must be called after zed_strings_first(). + * XXX: Not thread-safe. 
+ */ +const char * +zed_strings_next(zed_strings_t *zsp) +{ + if (!zsp) { + errno = EINVAL; + return (NULL); + } + if (!zsp->iteratorp) + return (NULL); + + zsp->iteratorp = AVL_NEXT(&zsp->tree, zsp->iteratorp); + if (!zsp->iteratorp) + return (NULL); + + return (((zed_strings_node_t *)zsp->iteratorp)->val); +} + +/* + * Return the number of strings in container [zsp], or -1 on error. + */ +int +zed_strings_count(zed_strings_t *zsp) +{ + if (!zsp) { + errno = EINVAL; + return (-1); + } + return (avl_numnodes(&zsp->tree)); +} diff --git a/cmd/zed/zed_strings.h b/cmd/zed/zed_strings.h new file mode 100644 index 000000000000..37a84cad7ffc --- /dev/null +++ b/cmd/zed/zed_strings.h @@ -0,0 +1,27 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + */ + +#ifndef ZED_STRINGS_H +#define ZED_STRINGS_H + +typedef struct zed_strings zed_strings_t; + +zed_strings_t *zed_strings_create(void); +void zed_strings_destroy(zed_strings_t *zsp); +int zed_strings_add(zed_strings_t *zsp, const char *key, const char *s); +const char *zed_strings_first(zed_strings_t *zsp); +const char *zed_strings_next(zed_strings_t *zsp); +int zed_strings_count(zed_strings_t *zsp); + +#endif /* !ZED_STRINGS_H */ |