diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am
index 9f6869fed5d9..32fd9597d916 100644
--- a/cmd/zed/Makefile.am
+++ b/cmd/zed/Makefile.am
@@ -39,12 +39,16 @@ dist_zedexec_SCRIPTS = \
 	$(top_srcdir)/cmd/zed/zed.d/all-debug.sh \
 	$(top_srcdir)/cmd/zed/zed.d/all-syslog.sh \
 	$(top_srcdir)/cmd/zed/zed.d/checksum-email.sh \
+	$(top_srcdir)/cmd/zed/zed.d/checksum-spare.sh \
+	$(top_srcdir)/cmd/zed/zed.d/io-spare.sh \
 	$(top_srcdir)/cmd/zed/zed.d/resilver.finish-email.sh \
 	$(top_srcdir)/cmd/zed/zed.d/scrub.finish-email.sh
 
 zedconfdefaults = \
 	all-syslog.sh \
 	checksum-email.sh \
+	checksum-spare.sh \
+	io-spare.sh \
 	resilver.finish-email.sh \
 	scrub.finish-email.sh
 
diff --git a/cmd/zed/zed.d/checksum-spare.sh b/cmd/zed/zed.d/checksum-spare.sh
new file mode 120000
index 000000000000..f564f932283c
--- /dev/null
+++ b/cmd/zed/zed.d/checksum-spare.sh
@@ -0,0 +1 @@
+io-spare.sh
\ No newline at end of file
diff --git a/cmd/zed/zed.d/io-spare.sh b/cmd/zed/zed.d/io-spare.sh
new file mode 100755
index 000000000000..2c503551b0ec
--- /dev/null
+++ b/cmd/zed/zed.d/io-spare.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+#
+# Replace a device with a hot spare in response to I/O or checksum errors.
+# The following actions are performed when the number of errors exceeds
+# the limit set by ZED_SPARE_ON_IO_ERRORS or ZED_SPARE_ON_CHECKSUM_ERRORS:
+#
+# 1) FAULT or DEGRADE the offending device to prevent additional errors.
+#
+# 2) Set the fault beacon for the device if possible.
+#
+# 3) Replace the device with a hot spare if any are available.
+#
+# This script only provides the functionality for automatically kicking in
+# a hot spare.  It does not provide any of the autoreplace functionality.
+# This means that once the required repair is complete the hot spare must
+# be manually retired using the 'zpool detach' command.
+#
+# Full support for autoreplace is planned, but it requires that the full
+# ZFS Diagnosis Engine be ported.  In the meantime this script provides
+# the majority of the expected hot spare functionality.
+#
+test -f zed.rc && . ./zed.rc
+
+# Defaults to disabled; enable in the zed.rc file.
+ZED_SPARE_ON_IO_ERRORS=${ZED_SPARE_ON_IO_ERRORS:-0}
+ZED_SPARE_ON_CHECKSUM_ERRORS=${ZED_SPARE_ON_CHECKSUM_ERRORS:-0}
+
+if [ ${ZED_SPARE_ON_IO_ERRORS} -eq 0 -a \
+    ${ZED_SPARE_ON_CHECKSUM_ERRORS} -eq 0 ]; then
+	exit 0
+fi
+
+# A lock file is used to serialize execution.
+ZED_LOCKDIR=${ZED_LOCKDIR:-/var/lock}
+LOCKFILE="${ZED_LOCKDIR}/zed.spare.lock"
+
+exec 8> "${LOCKFILE}"
+flock -x 8
+
+# Given a <pool> and <device>, print the device's status
+# (ONLINE, FAULTED, etc...).
+vdev_status() {
+	local POOL=$1
+	local VDEV=`basename $2`
+
+	${ZPOOL} status ${POOL} | awk -v pat=${VDEV} '$0 ~ pat { print $2 }'
+	return 0
+}
+
+# Fault devices after N I/O errors.
+if [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.io" ]; then
+	ERRORS=`expr ${ZEVENT_VDEV_READ_ERRORS} + ${ZEVENT_VDEV_WRITE_ERRORS}`
+
+	if [ ${ZED_SPARE_ON_IO_ERRORS} -gt 0 -a \
+	    ${ERRORS} -ge ${ZED_SPARE_ON_IO_ERRORS} ]; then
+		ACTION="fault"
+	fi
+# Degrade devices after N checksum errors.
+elif [ "${ZEVENT_CLASS}" = "ereport.fs.zfs.checksum" ]; then
+	ERRORS=${ZEVENT_VDEV_CKSUM_ERRORS}
+
+	if [ ${ZED_SPARE_ON_CHECKSUM_ERRORS} -gt 0 -a \
+	    ${ERRORS} -ge ${ZED_SPARE_ON_CHECKSUM_ERRORS} ]; then
+		ACTION="degrade"
+	fi
+else
+	ACTION=
+fi
+
+if [ -n "${ACTION}" ]; then
+
+	# Exit if the device is already FAULTED or DEGRADED.
+	STATUS=`vdev_status ${ZEVENT_POOL} ${ZEVENT_VDEV_PATH}`
+	if [ "${STATUS}" = "FAULTED" -o "${STATUS}" = "DEGRADED" ]; then
+		exit 0
+	fi
+
+	# FAULT or DEGRADE the device.
+	${ZINJECT} -d ${ZEVENT_VDEV_GUID} -A ${ACTION} ${ZEVENT_POOL}
+
+	# FIXME: Set the 'fault' or 'ident' beacon for the device.  This can
+	# be done through the sg_ses utility; the only hard part is mapping
+	# the sd device to its corresponding enclosure and slot.  We may
+	# be able to leverage the existing vdev_id scripts for this.
+	#
+	# $ sg_ses --dev-slot-num=0 --set=ident /dev/sg3
+	# $ sg_ses --dev-slot-num=0 --clear=ident /dev/sg3
+
+	# Round-robin through the spares, selecting those which are available.
+	for SPARE in ${ZEVENT_VDEV_SPARE_PATHS}; do
+		STATUS=`vdev_status ${ZEVENT_POOL} ${SPARE}`
+		if [ "${STATUS}" = "AVAIL" ]; then
+			${ZPOOL} replace ${ZEVENT_POOL} \
+			    ${ZEVENT_VDEV_GUID} ${SPARE} && break
+		fi
+	done
+fi
+
+exit 0
diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc
index 57c969c89900..69989f95315b 100644
--- a/cmd/zed/zed.d/zed.rc
+++ b/cmd/zed/zed.d/zed.rc
@@ -26,3 +26,9 @@
 
 # The syslog tag for marking zed events.
 #ZED_SYSLOG_TAG="zed"
+
+# Replace a device with a hot spare after N I/O errors are detected.
+#ZED_SPARE_ON_IO_ERRORS=1
+
+# Replace a device with a hot spare after N checksum errors are detected.
+#ZED_SPARE_ON_CHECKSUM_ERRORS=10
diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h
index d9122ac5f7d3..d541b07a3729 100644
--- a/include/sys/fm/fs/zfs.h
+++ b/include/sys/fm/fs/zfs.h
@@ -75,6 +75,11 @@ extern "C" {
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT	"vdev_ashift"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS	"vdev_complete_ts"
 #define	FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS	"vdev_delta_ts"
+#define	FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS	"vdev_spare_paths"
+#define	FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS	"vdev_spare_guids"
+#define	FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS	"vdev_read_errors"
+#define	FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS	"vdev_write_errors"
+#define	FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS	"vdev_cksum_errors"
 #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID	"parent_guid"
 #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE	"parent_type"
 #define	FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH	"parent_path"
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 7052eec4abab..1aa2dd15235f 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1377,7 +1377,7 @@ spa_load_spares(spa_t *spa)
 	 * validate each vdev on the spare list.  If the vdev also exists in the
 	 * active configuration, then we also mark this vdev as an active spare.
 	 */
-	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
+	spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
 	    KM_PUSHPAGE);
 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
 		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index df47d99cfafa..05ee84c19e4d 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -251,6 +251,11 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 	if (vd != NULL) {
 		vdev_t *pvd = vd->vdev_parent;
 		vdev_queue_t *vq = &vd->vdev_queue;
+		vdev_stat_t *vs = &vd->vdev_stat;
+		vdev_t *spare_vd;
+		uint64_t *spare_guids;
+		char **spare_paths;
+		int i, spare_count;
 
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
 		    DATA_TYPE_UINT64, vd->vdev_guid,
@@ -282,6 +287,16 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 			    DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
 		}
 
+		if (vs != NULL) {
+			fm_payload_set(ereport,
+			    FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS,
+			    DATA_TYPE_UINT64, vs->vs_read_errors,
+			    FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
+			    DATA_TYPE_UINT64, vs->vs_write_errors,
+			    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
+			    DATA_TYPE_UINT64, vs->vs_checksum_errors, NULL);
+		}
+
 		if (pvd != NULL) {
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
@@ -298,6 +313,28 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 			    FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
 			    DATA_TYPE_STRING, pvd->vdev_devid, NULL);
 		}
+
+		spare_count = spa->spa_spares.sav_count;
+		spare_paths = kmem_zalloc(sizeof (char *) * spare_count,
+		    KM_PUSHPAGE);
+		spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count,
+		    KM_PUSHPAGE);
+
+		for (i = 0; i < spare_count; i++) {
+			spare_vd = spa->spa_spares.sav_vdevs[i];
+			if (spare_vd) {
+				spare_paths[i] = spare_vd->vdev_path;
+				spare_guids[i] = spare_vd->vdev_guid;
+			}
+		}
+
+		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS,
+		    DATA_TYPE_STRING_ARRAY, spare_count, spare_paths,
+		    FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS,
+		    DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL);
+
+		kmem_free(spare_guids, sizeof (uint64_t) * spare_count);
+		kmem_free(spare_paths, sizeof (char *) * spare_count);
 	}
 
 	if (zio != NULL) {
@@ -834,15 +871,18 @@ zfs_post_common(spa_t *spa, vdev_t *vd, const char *name)
 	(void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE,
 	    ZFS_ERROR_CLASS, name);
 
-	VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0);
-	VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0);
-	VERIFY(nvlist_add_uint64(resource,
-	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0);
+	VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION));
+	VERIFY0(nvlist_add_string(resource, FM_CLASS, class));
+	VERIFY0(nvlist_add_uint64(resource,
+	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)));
+	VERIFY0(nvlist_add_int32(resource,
+	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa)));
+
 	if (vd) {
-		VERIFY(nvlist_add_uint64(resource,
-		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0);
-		VERIFY(nvlist_add_uint64(resource,
-		    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state) == 0);
+		VERIFY0(nvlist_add_uint64(resource,
+		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid));
+		VERIFY0(nvlist_add_uint64(resource,
+		    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state));
 	}
 
 	zfs_zevent_post(resource, NULL, zfs_zevent_post_cb);
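
For reviewers who want to exercise the new handler, a minimal smoke test is
sketched below. It assumes a disposable pool named "tank" built from scratch
devices sdb/sdc with sdd attached as a hot spare (all names illustrative),
and a zed.rc edited in place where the zed.d scripts source it:

    # zed.rc: uncomment/enable the new settings (both default to disabled).
    ZED_SPARE_ON_IO_ERRORS=1
    ZED_SPARE_ON_CHECKSUM_ERRORS=10

    # Create a test pool with a hot spare and write some data.
    zpool create tank mirror sdb sdc spare sdd
    dd if=/dev/urandom of=/tank/file bs=1M count=64

    # Inject read errors on one side of the mirror, then scrub so the
    # reads actually hit the device and generate ereports for zed.
    zinject -d sdb -e io -T read tank
    zpool scrub tank

    # Once the error count crosses ZED_SPARE_ON_IO_ERRORS, the script
    # should FAULT sdb and 'zpool replace' it with sdd; 'zpool status'
    # should then report the spare as INUSE.  Since autoreplace is not
    # implemented, clear the injection and retire the spare by hand:
    zinject -c all
    zpool detach tank sdd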