diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 69001071dba9..1415be8643ae 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -4542,6 +4542,8 @@ main(int argc, char **argv) } } else { error = open_objset(target, DMU_OST_ANY, FTAG, &os); + if (error == 0) + spa = dmu_objset_spa(os); } } nvlist_free(policy); @@ -4549,6 +4551,14 @@ main(int argc, char **argv) if (error) fatal("can't open '%s': %s", target, strerror(error)); + /* + * Set the pool failure mode to panic in order to prevent the pool + * from suspending. A suspended I/O will have no way to resume and + * can prevent the zdb(8) command from terminating as expected. + */ + if (spa != NULL) + spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; + argv++; argc--; if (!dump_opt['R']) { diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index f07e11c8361b..a8b5418b60b2 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -197,6 +197,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = { extern uint64_t metaslab_gang_bang; extern uint64_t metaslab_df_alloc_threshold; +extern unsigned long zfs_deadman_synctime_ms; extern int metaslab_preload_limit; extern boolean_t zfs_compressed_arc_enabled; extern int zfs_abd_scatter_enabled; @@ -447,6 +448,7 @@ static kmutex_t ztest_vdev_lock; static rwlock_t ztest_name_lock; static boolean_t ztest_dump_core = B_TRUE; +static boolean_t ztest_dump_debug_buffer = B_FALSE; static boolean_t ztest_exiting; /* Global commit callback list */ @@ -495,6 +497,16 @@ _umem_logging_init(void) return ("fail,contents"); /* $UMEM_LOGGING setting */ } +static void +dump_debug_buffer(void) +{ + if (!ztest_dump_debug_buffer) + return; + + (void) printf("\n"); + zfs_dbgmsg_print("ztest"); +} + #define BACKTRACE_SZ 100 static void sig_handler(int signo) @@ -507,6 +519,7 @@ static void sig_handler(int signo) nptrs = backtrace(buffer, BACKTRACE_SZ); backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO); #endif + dump_debug_buffer(); /* * Restore default action and re-raise signal so SIGSEGV and @@ -544,6 +557,9 @@ fatal(int do_perror, char *message, ...) } (void) fprintf(stderr, "%s\n", buf); fatal_msg = buf; /* to ease debugging */ + + dump_debug_buffer(); + if (ztest_dump_core) abort(); exit(3); @@ -641,6 +657,7 @@ usage(boolean_t requested) "\t[-B alt_ztest (default: )] alternate ztest path\n" "\t[-o variable=value] ... set global variable to an unsigned\n" "\t 32-bit integer value\n" + "\t[-G dump zfs_dbgmsg buffer before exiting due to an error\n" "\t[-h] (print help)\n" "", zo->zo_pool, @@ -676,7 +693,7 @@ process_options(int argc, char **argv) bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); while ((opt = getopt(argc, argv, - "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:o:")) != EOF) { + "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:o:G")) != EOF) { value = 0; switch (opt) { case 'v': @@ -771,6 +788,9 @@ process_options(int argc, char **argv) if (set_global_var(optarg) != 0) usage(B_FALSE); break; + case 'G': + ztest_dump_debug_buffer = B_TRUE; + break; case 'h': usage(B_TRUE); break; @@ -6224,15 +6244,48 @@ ztest_resume_thread(void *arg) thread_exit(); } -#define GRACE 300 - -#if 0 static void -ztest_deadman_alarm(int sig) +ztest_deadman_thread(void *arg) { - fatal(0, "failed to complete within %d seconds of deadline", GRACE); + ztest_shared_t *zs = arg; + spa_t *spa = ztest_spa; + hrtime_t delta, overdue, total = 0; + + for (;;) { + delta = zs->zs_thread_stop - zs->zs_thread_start + + MSEC2NSEC(zfs_deadman_synctime_ms); + + (void) poll(NULL, 0, (int)NSEC2MSEC(delta)); + + /* + * If the pool is suspended then fail immediately. Otherwise, + * check to see if the pool is making any progress. If + * vdev_deadman() discovers that there hasn't been any recent + * I/Os then it will end up aborting the tests. + */ + if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { + fatal(0, "aborting test after %llu seconds because " + "pool has transitioned to a suspended state.", + zfs_deadman_synctime_ms / 1000); + } + vdev_deadman(spa->spa_root_vdev, FTAG); + + /* + * If the process doesn't complete within a grace period of + * zfs_deadman_synctime_ms over the expected finish time, + * then it may be hung and is terminated. + */ + overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); + total += zfs_deadman_synctime_ms / 1000; + if (gethrtime() > overdue) { + fatal(0, "aborting test after %llu seconds because " + "the process is overdue for termination.", total); + } + + (void) printf("ztest has been running for %lld seconds\n", + total); + } } -#endif static void ztest_execute(int test, ztest_info_t *zi, uint64_t id) @@ -6491,13 +6544,13 @@ ztest_run(ztest_shared_t *zs) resume_thread = thread_create(NULL, 0, ztest_resume_thread, spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); -#if 0 /* - * Set a deadman alarm to abort() if we hang. + * Create a deadman thread and set to panic if we hang. */ - signal(SIGALRM, ztest_deadman_alarm); - alarm((zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + GRACE); -#endif + (void) thread_create(NULL, 0, ztest_deadman_thread, + zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); + + spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; /* * Verify that we can safely inquire about about any object, @@ -7047,6 +7100,7 @@ main(int argc, char **argv) (void) setvbuf(stdout, NULL, _IOLBF, 0); dprintf_setup(&argc, argv); + zfs_deadman_synctime_ms = 300000; action.sa_handler = sig_handler; sigemptyset(&action.sa_mask); diff --git a/configure.ac b/configure.ac index 47e0a1b126b7..fe7093e239c6 100644 --- a/configure.ac +++ b/configure.ac @@ -242,6 +242,7 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/cli_user/zpool_list/Makefile tests/zfs-tests/tests/functional/compression/Makefile tests/zfs-tests/tests/functional/ctime/Makefile + tests/zfs-tests/tests/functional/deadman/Makefile tests/zfs-tests/tests/functional/delegate/Makefile tests/zfs-tests/tests/functional/devices/Makefile tests/zfs-tests/tests/functional/events/Makefile diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index 02b15b810c43..513d0cf5ec1c 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -37,6 +37,7 @@ extern "C" { #define FM_EREPORT_ZFS_IO "io" #define FM_EREPORT_ZFS_DATA "data" #define FM_EREPORT_ZFS_DELAY "delay" +#define FM_EREPORT_ZFS_DEADMAN "deadman" #define FM_EREPORT_ZFS_POOL "zpool" #define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" #define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed" diff --git a/include/sys/spa.h b/include/sys/spa.h index c291c508281a..7bc3b2197bce 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -955,12 +955,15 @@ extern void spa_evicting_os_deregister(spa_t *, objset_t *os); extern void spa_evicting_os_wait(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_prev_software_version(spa_t *spa); -extern uint8_t spa_get_failmode(spa_t *spa); +extern uint64_t spa_get_failmode(spa_t *spa); +extern uint64_t spa_get_deadman_failmode(spa_t *spa); +extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode); extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); extern uint64_t spa_deadman_synctime(spa_t *spa); +extern uint64_t spa_deadman_ziotime(spa_t *spa); /* Miscellaneous support routines */ extern void spa_activate_mos_feature(spa_t *spa, const char *feature, @@ -1074,6 +1077,10 @@ extern boolean_t spa_debug_enabled(spa_t *spa); } extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ +extern int zfs_deadman_enabled; +extern unsigned long zfs_deadman_synctime_ms; +extern unsigned long zfs_deadman_ziotime_ms; +extern unsigned long zfs_deadman_checktime_ms; #ifdef __cplusplus } diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 2fc598016306..66fcafd9c60f 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -228,6 +228,7 @@ struct spa { uint64_t spa_pool_props_object; /* object for properties */ uint64_t spa_bootfs; /* default boot filesystem */ uint64_t spa_failmode; /* failure mode for the pool */ + uint64_t spa_deadman_failmode; /* failure mode for deadman */ uint64_t spa_delegation; /* delegation on/off */ list_t spa_config_list; /* previous cache file(s) */ /* per-CPU array of root of async I/O: */ @@ -270,7 +271,8 @@ struct spa { taskqid_t spa_deadman_tqid; /* Task id */ uint64_t spa_deadman_calls; /* number of deadman calls */ hrtime_t spa_sync_starttime; /* starting time of spa_sync */ - uint64_t spa_deadman_synctime; /* deadman expiration timer */ + uint64_t spa_deadman_synctime; /* deadman sync expiration */ + uint64_t spa_deadman_ziotime; /* deadman zio expiration */ uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */ spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */ uint64_t spa_errata; /* errata issues detected */ diff --git a/include/sys/vdev.h b/include/sys/vdev.h index bc2f4f0ead88..022713096d63 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -84,7 +84,7 @@ extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); -extern void vdev_deadman(vdev_t *vd); +extern void vdev_deadman(vdev_t *vd, char *tag); extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 6f32b11c730c..d6da0d32b3aa 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -317,6 +317,7 @@ extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, extern void cv_signal(kcondvar_t *cv); extern void cv_broadcast(kcondvar_t *cv); +#define cv_timedwait_io(cv, mp, at) cv_timedwait(cv, mp, at) #define cv_timedwait_sig(cv, mp, at) cv_timedwait(cv, mp, at) #define cv_wait_sig(cv, mp) cv_wait(cv, mp) #define cv_wait_io(cv, mp) cv_wait(cv, mp) diff --git a/include/sys/zio.h b/include/sys/zio.h index bf4818e16cfe..1ed1f1ffb421 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -556,6 +556,7 @@ extern void zio_execute(zio_t *zio); extern void zio_interrupt(zio_t *zio); extern void zio_delay_init(zio_t *zio); extern void zio_delay_interrupt(zio_t *zio); +extern void zio_deadman(zio_t *zio, char *tag); extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **); extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); diff --git a/man/man1/ztest.1 b/man/man1/ztest.1 index 64f543d21cc8..b8cb0d45d92c 100644 --- a/man/man1/ztest.1 +++ b/man/man1/ztest.1 @@ -129,6 +129,10 @@ Total test run time. .BI "\-z" " zil_failure_rate" " (default: fail every 2^5 allocs) .IP Injected failure rate. +.HP +.BI "\-G" +.IP +Dump zfs_dbgmsg buffer before exiting. .SH "EXAMPLES" .LP To override /tmp as your location for block files, you can use the -f diff --git a/man/man5/zfs-events.5 b/man/man5/zfs-events.5 index 5cef4f53922f..4c60eecc5d5f 100644 --- a/man/man5/zfs-events.5 +++ b/man/man5/zfs-events.5 @@ -55,7 +55,7 @@ part here. \fBchecksum\fR .ad .RS 12n -Issued when a checksum error have been detected. +Issued when a checksum error has been detected. .RE .sp @@ -76,14 +76,27 @@ Issued when there is an I/O error in a vdev in the pool. Issued when there have been data errors in the pool. .RE +.sp +.ne 2 +.na +\fBdeadman\fR +.ad +.RS 12n +Issued when an I/O is determined to be "hung", this can be caused by lost +completion events due to flaky hardware or drivers. See the +\fBzfs_deadman_failmode\fR module option description for additional +information regarding "hung" I/O detection and configuration. +.RE + .sp .ne 2 .na \fBdelay\fR .ad .RS 12n -Issued when an I/O was slow to complete as defined by the zio_delay_max module -option. +Issued when a completed I/O exceeds the maximum allowed time specified +by the \fBzio_delay_max\fR module option. This can be an indicator of +problems with the underlying storage device. .RE .sp diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 5b7a29d3256b..4eed0a309a50 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -823,14 +823,36 @@ Default value: \fB0\fR. .ad .RS 12n When a pool sync operation takes longer than \fBzfs_deadman_synctime_ms\fR -milliseconds, a "slow spa_sync" message is logged to the debug log -(see \fBzfs_dbgmsg_enable\fR). If \fBzfs_deadman_enabled\fR is set, -all pending IO operations are also checked and if any haven't completed -within \fBzfs_deadman_synctime_ms\fR milliseconds, a "SLOW IO" message -is logged to the debug log and a "delay" system event with the details of -the hung IO is posted. +milliseconds, or when an individual I/O takes longer than +\fBzfs_deadman_ziotime_ms\fR milliseconds, then the operation is considered to +be "hung". If \fBzfs_deadman_enabled\fR is set then the deadman behavior is +invoked as described by the \fBzfs_deadman_failmode\fR module option. +By default the deadman is enabled and configured to \fBwait\fR which results +in "hung" I/Os only being logged. The deadman is automatically disabled +when a pool gets suspended. .sp -Use \fB1\fR (default) to enable the slow IO check and \fB0\fR to disable. +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_deadman_failmode\fR (charp) +.ad +.RS 12n +Controls the failure behavior when the deadman detects a "hung" I/O. Valid +values are \fBwait\fR, \fBcontinue\fR, and \fBpanic\fR. +.sp +\fBwait\fR - Wait for a "hung" I/O to complete. For each "hung" I/O a +"deadman" event will be posted describing that I/O. +.sp +\fBcontinue\fR - Attempt to recover from a "hung" I/O by re-dispatching it +to the I/O pipeline if possible. +.sp +\fBpanic\fR - Panic the system. This can be used to facilitate an automatic +fail-over to a properly configured fail-over partner. +.sp +Default value: \fBwait\fR. .RE .sp @@ -839,11 +861,10 @@ Use \fB1\fR (default) to enable the slow IO check and \fB0\fR to disable. \fBzfs_deadman_checktime_ms\fR (int) .ad .RS 12n -Once a pool sync operation has taken longer than -\fBzfs_deadman_synctime_ms\fR milliseconds, continue to check for slow -operations every \fBzfs_deadman_checktime_ms\fR milliseconds. +Check time in milliseconds. This defines the frequency at which we check +for hung I/O and invoke the \fBzfs_deadman_failmode\fR behavior. .sp -Default value: \fB5,000\fR. +Default value: \fB60,000\fR. .RE .sp @@ -853,12 +874,25 @@ Default value: \fB5,000\fR. .ad .RS 12n Interval in milliseconds after which the deadman is triggered and also -the interval after which an IO operation is considered to be "hung" -if \fBzfs_deadman_enabled\fR is set. +the interval after which a pool sync operation is considered to be "hung". +Once this limit is exceeded the deadman will be invoked every +\fBzfs_deadman_checktime_ms\fR milliseconds until the pool sync completes. +.sp +Default value: \fB600,000\fR. +.RE -See \fBzfs_deadman_enabled\fR. .sp -Default value: \fB1,000,000\fR. +.ne 2 +.na +\fBzfs_deadman_ziotime_ms\fR (ulong) +.ad +.RS 12n +Interval in milliseconds after which the deadman is triggered and an +individual IO operation is considered to be "hung". As long as the I/O +remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR +milliseconds until the I/O completes. +.sp +Default value: \fB300,000\fR. .RE .sp diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index dc0f8d5fe9d1..e742af2550f0 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -292,24 +292,41 @@ int zfs_free_leak_on_eio = B_FALSE; /* * Expiration time in milliseconds. This value has two meanings. First it is * used to determine when the spa_deadman() logic should fire. By default the - * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds. + * spa_deadman() will fire if spa_sync() has not completed in 600 seconds. * Secondly, the value determines if an I/O is considered "hung". Any I/O that * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting - * in a system panic. + * in one of three behaviors controlled by zfs_deadman_failmode. */ -unsigned long zfs_deadman_synctime_ms = 1000000ULL; +unsigned long zfs_deadman_synctime_ms = 600000ULL; + +/* + * This value controls the maximum amount of time zio_wait() will block for an + * outstanding IO. By default this is 300 seconds at which point the "hung" + * behavior will be applied as described for zfs_deadman_synctime_ms. + */ +unsigned long zfs_deadman_ziotime_ms = 300000ULL; /* * Check time in milliseconds. This defines the frequency at which we check * for hung I/O. */ -unsigned long zfs_deadman_checktime_ms = 5000ULL; +unsigned long zfs_deadman_checktime_ms = 60000ULL; /* * By default the deadman is enabled. */ int zfs_deadman_enabled = 1; +/* + * Controls the behavior of the deadman when it detects a "hung" I/O. + * Valid values are zfs_deadman_failmode=. + * + * wait - Wait for the "hung" I/O (default) + * continue - Attempt to recover from a "hung" I/O + * panic - Panic the system + */ +char *zfs_deadman_failmode = "wait"; + /* * The worst case is single-sector max-parity RAID-Z blocks, in which * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) @@ -536,7 +553,7 @@ spa_deadman(void *arg) (gethrtime() - spa->spa_sync_starttime) / NANOSEC, ++spa->spa_deadman_calls); if (zfs_deadman_enabled) - vdev_deadman(spa->spa_root_vdev); + vdev_deadman(spa->spa_root_vdev, FTAG); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + @@ -590,6 +607,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_proc_state = SPA_PROC_NONE; spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); + spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); + spa_set_deadman_failmode(spa, zfs_deadman_failmode); refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); @@ -1681,7 +1700,7 @@ spa_update_dspace(spa_t *spa) * Return the failure mode that has been set to this pool. The default * behavior will be to block all I/Os when a complete failure occurs. */ -uint8_t +uint64_t spa_get_failmode(spa_t *spa) { return (spa->spa_failmode); @@ -1770,6 +1789,31 @@ spa_deadman_synctime(spa_t *spa) return (spa->spa_deadman_synctime); } +uint64_t +spa_deadman_ziotime(spa_t *spa) +{ + return (spa->spa_deadman_ziotime); +} + +uint64_t +spa_get_deadman_failmode(spa_t *spa) +{ + return (spa->spa_deadman_failmode); +} + +void +spa_set_deadman_failmode(spa_t *spa, const char *failmode) +{ + if (strcmp(failmode, "wait") == 0) + spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; + else if (strcmp(failmode, "continue") == 0) + spa->spa_deadman_failmode = ZIO_FAILURE_MODE_CONTINUE; + else if (strcmp(failmode, "panic") == 0) + spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; + else + spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; +} + uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { @@ -2106,6 +2150,33 @@ spa_get_hostid(void) } #if defined(_KERNEL) && defined(HAVE_SPL) + +#include + +static int +param_set_deadman_failmode(const char *val, zfs_kernel_param_t *kp) +{ + spa_t *spa = NULL; + char *p; + + if (val == NULL) + return (SET_ERROR(-EINVAL)); + + if ((p = strchr(val, '\n')) != NULL) + *p = '\0'; + + if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 && + strcmp(val, "panic")) + return (SET_ERROR(-EINVAL)); + + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) + spa_set_deadman_failmode(spa, val); + mutex_exit(&spa_namespace_lock); + + return (param_set_charp(val, kp)); +} + /* Namespace manipulation */ EXPORT_SYMBOL(spa_lookup); EXPORT_SYMBOL(spa_add); @@ -2196,7 +2267,12 @@ MODULE_PARM_DESC(zfs_free_leak_on_eio, "Set to ignore IO errors during free and permanently leak the space"); module_param(zfs_deadman_synctime_ms, ulong, 0644); -MODULE_PARM_DESC(zfs_deadman_synctime_ms, "Expiration time in milliseconds"); +MODULE_PARM_DESC(zfs_deadman_synctime_ms, + "Pool sync expiration time in milliseconds"); + +module_param(zfs_deadman_ziotime_ms, ulong, 0644); +MODULE_PARM_DESC(zfs_deadman_ziotime_ms, + "IO expiration time in milliseconds"); module_param(zfs_deadman_checktime_ms, ulong, 0644); MODULE_PARM_DESC(zfs_deadman_checktime_ms, @@ -2205,6 +2281,10 @@ MODULE_PARM_DESC(zfs_deadman_checktime_ms, module_param(zfs_deadman_enabled, int, 0644); MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer"); +module_param_call(zfs_deadman_failmode, param_set_deadman_failmode, + param_get_charp, &zfs_deadman_failmode, 0644); +MODULE_PARM_DESC(zfs_deadman_failmode, "Failmode for deadman timer"); + module_param(spa_asize_inflation, int, 0644); MODULE_PARM_DESC(spa_asize_inflation, "SPA size estimate multiplication factor"); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 9edeaf52592a..7fd0bd1ac679 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -3707,12 +3707,12 @@ vdev_split(vdev_t *vd) } void -vdev_deadman(vdev_t *vd) +vdev_deadman(vdev_t *vd, char *tag) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; - vdev_deadman(cvd); + vdev_deadman(cvd, tag); } if (vd->vdev_ops->vdev_op_leaf) { @@ -3724,21 +3724,18 @@ vdev_deadman(vdev_t *vd) zio_t *fio; uint64_t delta; + zfs_dbgmsg("slow vdev: %s has %d active IOs", + vd->vdev_path, avl_numnodes(&vq->vq_active_tree)); + /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than - * the spa_deadman_synctime we log a zevent. + * the spa_deadman_synctime invoke the deadman logic. */ fio = avl_first(&vq->vq_active_tree); delta = gethrtime() - fio->io_timestamp; - if (delta > spa_deadman_synctime(spa)) { - zfs_dbgmsg("SLOW IO: zio timestamp %lluns, " - "delta %lluns, last io %lluns", - fio->io_timestamp, delta, - vq->vq_io_complete_ts); - zfs_ereport_post(FM_EREPORT_ZFS_DELAY, - spa, vd, &fio->io_bookmark, fio, 0, 0); - } + if (delta > spa_deadman_synctime(spa)) + zio_deadman(fio, tag); } mutex_exit(&vq->vq_lock); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 7d2370034239..263c77e4a2d3 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1748,6 +1748,80 @@ zio_delay_interrupt(zio_t *zio) zio_interrupt(zio); } +static void +zio_deadman_impl(zio_t *pio) +{ + zio_t *cio, *cio_next; + zio_link_t *zl = NULL; + vdev_t *vd = pio->io_vd; + + if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { + vdev_queue_t *vq = &vd->vdev_queue; + zbookmark_phys_t *zb = &pio->io_bookmark; + uint64_t delta = gethrtime() - pio->io_timestamp; + uint64_t failmode = spa_get_deadman_failmode(pio->io_spa); + + zfs_dbgmsg("slow zio: zio=%p timestamp=%llu " + "delta=%llu queued=%llu io=%llu " + "path=%s last=%llu " + "type=%d priority=%d flags=0x%x " + "stage=0x%x pipeline=0x%x pipeline-trace=0x%x " + "objset=%llu object=%llu level=%llu blkid=%llu " + "offset=%llu size=%llu error=%d", + pio, pio->io_timestamp, + delta, pio->io_delta, pio->io_delay, + vd->vdev_path, vq->vq_io_complete_ts, + pio->io_type, pio->io_priority, pio->io_flags, + pio->io_state, pio->io_pipeline, pio->io_pipeline_trace, + zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, + pio->io_offset, pio->io_size, pio->io_error); + zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, + pio->io_spa, vd, zb, pio, 0, 0); + + if (failmode == ZIO_FAILURE_MODE_CONTINUE && + taskq_empty_ent(&pio->io_tqent)) { + zio_interrupt(pio); + } + } + + mutex_enter(&pio->io_lock); + for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { + cio_next = zio_walk_children(pio, &zl); + zio_deadman_impl(cio); + } + mutex_exit(&pio->io_lock); +} + +/* + * Log the critical information describing this zio and all of its children + * using the zfs_dbgmsg() interface then post deadman event for the ZED. + */ +void +zio_deadman(zio_t *pio, char *tag) +{ + spa_t *spa = pio->io_spa; + char *name = spa_name(spa); + + if (!zfs_deadman_enabled || spa_suspended(spa)) + return; + + zio_deadman_impl(pio); + + switch (spa_get_deadman_failmode(spa)) { + case ZIO_FAILURE_MODE_WAIT: + zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name); + break; + + case ZIO_FAILURE_MODE_CONTINUE: + zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name); + break; + + case ZIO_FAILURE_MODE_PANIC: + fm_panic("%s determined I/O to pool '%s' is hung.", tag, name); + break; + } +} + /* * Execute the I/O pipeline until one of the following occurs: * (1) the I/O completes; (2) the pipeline stalls waiting for @@ -1877,6 +1951,7 @@ __zio_execute(zio_t *zio) int zio_wait(zio_t *zio) { + long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms); int error; ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN); @@ -1889,8 +1964,19 @@ zio_wait(zio_t *zio) __zio_execute(zio); mutex_enter(&zio->io_lock); - while (zio->io_executor != NULL) - cv_wait_io(&zio->io_cv, &zio->io_lock); + while (zio->io_executor != NULL) { + error = cv_timedwait_io(&zio->io_cv, &zio->io_lock, + ddi_get_lbolt() + timeout); + + if (zfs_deadman_enabled && error == -1 && + gethrtime() - zio->io_queued_timestamp > + spa_deadman_ziotime(zio->io_spa)) { + mutex_exit(&zio->io_lock); + timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms); + zio_deadman(zio, FTAG); + mutex_enter(&zio->io_lock); + } + } mutex_exit(&zio->io_lock); error = zio->io_error; diff --git a/scripts/zloop.sh b/scripts/zloop.sh index 4e0afac5b474..1f36f865b594 100755 --- a/scripts/zloop.sh +++ b/scripts/zloop.sh @@ -52,6 +52,8 @@ function usage " -s Size of vdev devices.\n" \ " -f Specify working directory for ztest vdev files.\n" \ " -c Specify a core dump directory to use.\n" \ + " -m Max number of core dumps to allow before exiting.\n" \ + " -l Create 'ztest.core.N' symlink to core directory.\n" \ " -h Print this help message.\n" \ "" >&2 } @@ -105,22 +107,35 @@ function store_core coreid=$(date "+zloop-%y%m%d-%H%M%S") foundcrashes=$((foundcrashes + 1)) + # zdb debugging + zdbcmd="$ZDB -U "$workdir/zpool.cache" -dddMmDDG ztest" + zdbdebug=$($zdbcmd 2>&1) + echo -e "$zdbcmd\n" >>ztest.zdb + echo "$zdbdebug" >>ztest.zdb + dest=$coredir/$coreid or_die mkdir -p "$dest" or_die mkdir -p "$dest/vdev" + if [[ $symlink -ne 0 ]]; then + or_die ln -sf "$dest" ztest.core.$foundcrashes + fi + echo "*** ztest crash found - moving logs to $dest" or_die mv ztest.history "$dest/" - or_die mv ztest.ddt "$dest/" + or_die mv ztest.zdb "$dest/" or_die mv ztest.out "$dest/" or_die mv "$workdir/ztest*" "$dest/vdev/" - or_die mv "$workdir/zpool.cache" "$dest/vdev/" + + if [[ -e "$workdir/zpool.cache" ]]; then + or_die mv "$workdir/zpool.cache" "$dest/vdev/" + fi # check for core if [[ -f "$core" ]]; then coreprog=$(core_prog "$core") - corestatus=$($GDB --batch --quiet \ + coredebug=$($GDB --batch --quiet \ -ex "set print thread-events off" \ -ex "printf \"*\n* Backtrace \n*\n\"" \ -ex "bt" \ @@ -132,19 +147,25 @@ function store_core -ex "thread apply all bt" \ -ex "printf \"*\n* Backtraces (full) \n*\n\"" \ -ex "thread apply all bt full" \ - -ex "quit" "$coreprog" "$core" | grep -v "New LWP") + -ex "quit" "$coreprog" "$core" 2>&1 | \ + grep -v "New LWP") # Dump core + logs to stored directory - echo "$corestatus" >>"$dest/status" + echo "$coredebug" >>"$dest/ztest.gdb" or_die mv "$core" "$dest/" # Record info in cores logfile echo "*** core @ $coredir/$coreid/$core:" | \ tee -a ztest.cores - echo "$corestatus" | tee -a ztest.cores - echo "" | tee -a ztest.cores fi - echo "continuing..." + + if [[ $coremax -gt 0 ]] && + [[ $foundcrashes -ge $coremax ]]; then + echo "exiting... max $coremax allowed cores" + exit 1 + else + echo "continuing..." + fi fi } @@ -155,12 +176,16 @@ basedir=$DEFAULTWORKDIR rundir="zloop-run" timeout=0 size="512m" -while getopts ":ht:s:c:f:" opt; do +coremax=0 +symlink=0 +while getopts ":ht:m:s:c:f:l" opt; do case $opt in t ) [[ $OPTARG -gt 0 ]] && timeout=$OPTARG ;; + m ) [[ $OPTARG -gt 0 ]] && coremax=$OPTARG ;; s ) [[ $OPTARG ]] && size=$OPTARG ;; c ) [[ $OPTARG ]] && coredir=$OPTARG ;; f ) [[ $OPTARG ]] && basedir=$(readlink -f "$OPTARG") ;; + l ) symlink=1 ;; h ) usage exit 2 ;; @@ -179,6 +204,7 @@ export ASAN_OPTIONS=abort_on_error=1:disable_coredump=0 if [[ -f "$(core_file)" ]]; then echo -n "There's a core dump here you might want to look at first... " core_file + echo exit 1 fi @@ -193,7 +219,7 @@ if [[ ! -w $coredir ]]; then fi or_die rm -f ztest.history -or_die rm -f ztest.ddt +or_die rm -f ztest.zdb or_die rm -f ztest.cores ztrc=0 # ztest return value @@ -203,7 +229,7 @@ curtime=$starttime # if no timeout was specified, loop forever. while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do - zopt="-VVVVV" + zopt="-G -VVVVV" # start each run with an empty directory workdir="$basedir/$rundir" @@ -244,7 +270,6 @@ while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do $cmd >>ztest.out 2>&1 ztrc=$? grep -E '===|WARNING' ztest.out >>ztest.history - $ZDB -U "$workdir/zpool.cache" -DD ztest >>ztest.ddt 2>&1 store_core diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 0cecdd673350..af96e6a64eb9 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -440,6 +440,12 @@ tags = ['functional', 'compression'] tests = ['ctime_001_pos' ] tags = ['functional', 'ctime'] +[tests/functional/deadman] +tests = ['deadman_sync', 'deadman_zio'] +pre = +post = +tags = ['functional', 'deadman'] + [tests/functional/delegate] tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos', diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am index dc4bf51b4f73..49add883ece8 100644 --- a/tests/zfs-tests/tests/functional/Makefile.am +++ b/tests/zfs-tests/tests/functional/Makefile.am @@ -12,6 +12,7 @@ SUBDIRS = \ cli_user \ compression \ ctime \ + deadman \ delegate \ devices \ events \ diff --git a/tests/zfs-tests/tests/functional/deadman/Makefile.am b/tests/zfs-tests/tests/functional/deadman/Makefile.am new file mode 100644 index 000000000000..9c4b0713cf5e --- /dev/null +++ b/tests/zfs-tests/tests/functional/deadman/Makefile.am @@ -0,0 +1,5 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/deadman +dist_pkgdata_SCRIPTS = \ + deadman.cfg \ + deadman_sync.ksh \ + deadman_zio.ksh diff --git a/tests/zfs-tests/tests/functional/deadman/deadman.cfg b/tests/zfs-tests/tests/functional/deadman/deadman.cfg new file mode 100644 index 000000000000..e767f3dd5a98 --- /dev/null +++ b/tests/zfs-tests/tests/functional/deadman/deadman.cfg @@ -0,0 +1,32 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# Use is subject to license terms. +# + +export DISK1=${DISKS%% *} + +export SYNCTIME_DEFAULT=600000 +export ZIOTIME_DEFAULT=300000 +export CHECKTIME_DEFAULT=60000 +export FAILMODE_DEFAULT="wait" diff --git a/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh b/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh new file mode 100755 index 000000000000..a5537c4355c6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Lawrence Livermore National Security, LLC. +# Use is subject to license terms. +# + +# DESCRIPTION: +# Verify spa deadman detects a hung txg +# +# STRATEGY: +# 1. Reduce the zfs_deadman_synctime_ms to 5s. +# 2. Reduce the zfs_deadman_checktime_ms to 1s. +# 3. Inject a 10s zio delay to force long IOs. +# 4. Write enough data to force a long txg sync time due to the delay. +# 5. Verify a "deadman" event is posted. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/deadman/deadman.cfg + +verify_runnable "both" + +function cleanup +{ + log_must zinject -c all + default_cleanup_noexit + + log_must set_tunable64 zfs_deadman_synctime_ms $SYNCTIME_DEFAULT + log_must set_tunable64 zfs_deadman_checktime_ms $CHECKTIME_DEFAULT + log_must set_tunable64 zfs_deadman_failmode $FAILMODE_DEFAULT +} + +log_assert "Verify spa deadman detects a hung txg" +log_onexit cleanup + +log_must set_tunable64 zfs_deadman_synctime_ms 5000 +log_must set_tunable64 zfs_deadman_checktime_ms 1000 +log_must set_tunable64 zfs_deadman_failmode "wait" + +# Create a new pool in order to use the updated deadman settings. +default_setup_noexit $DISK1 +log_must zpool events -c + +# Force each IO to take 10s by allow them to run concurrently. +log_must zinject -d $DISK1 -D10000:10 $TESTPOOL + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +log_must file_write -b 1048576 -c 8 -o create -d 0 -f $mntpnt/file +sleep 10 + +log_must zinject -c all +log_must zpool sync + +# Log txg sync times for reference and the zpool event summary. +log_must cat /proc/spl/kstat/zfs/$TESTPOOL/txgs +log_must zpool events + +# Verify at least 5 deadman events were logged. The first after 5 seconds, +# and another each second thereafter until the delay is clearer. +events=$(zpool events | grep -c ereport.fs.zfs.deadman) +if [ "$events" -lt 5 ]; then + log_fail "Expect >=5 deadman events, $events found" +fi + +log_pass "Verify spa deadman detected a hung txg and $events deadman events" diff --git a/tests/zfs-tests/tests/functional/deadman/deadman_zio.ksh b/tests/zfs-tests/tests/functional/deadman/deadman_zio.ksh new file mode 100755 index 000000000000..a61be995aeb0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/deadman/deadman_zio.ksh @@ -0,0 +1,113 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Lawrence Livermore National Security, LLC. +# Use is subject to license terms. +# + +# DESCRIPTION: +# Verify zio deadman detects a hung zio +# +# STRATEGY: +# 1. Reduce the zfs_deadman_ziotime_ms to 5s. +# 2. Reduce the zfs_deadman_checktime_ms to 1s. +# 3. Inject a 10s zio delay to force long IOs. +# 4. Read an uncached file in the background. +# 5. Verify a "deadman" event is posted. +# 6. Inject a 100ms zio delay which is under the 5s allowed. +# 7. Read an uncached file in the background. +# 8. Verify a "deadman" event is not posted. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/deadman/deadman.cfg + +verify_runnable "both" + +function cleanup +{ + log_must zinject -c all + default_cleanup_noexit + + log_must set_tunable64 zfs_deadman_ziotime_ms $ZIOTIME_DEFAULT + log_must set_tunable64 zfs_deadman_checktime_ms $CHECKTIME_DEFAULT + log_must set_tunable64 zfs_deadman_failmode $FAILMODE_DEFAULT +} + +log_assert "Verify zio deadman detects a hung zio" +log_onexit cleanup + +# 1. Reduce the zfs_deadman_ziotime_ms to 5s. +log_must set_tunable64 zfs_deadman_ziotime_ms 5000 +# 2. Reduce the zfs_deadman_checktime_ms to 1s. +log_must set_tunable64 zfs_deadman_checktime_ms 1000 +log_must set_tunable64 zfs_deadman_failmode "wait" + +# Create a new pool in order to use the updated deadman settings. +default_setup_noexit $DISK1 + +# Write a file and export/import the pool to clear the ARC. +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +log_must file_write -b 1048576 -c 8 -o create -d 0 -f $mntpnt/file1 +log_must file_write -b 1048576 -c 8 -o create -d 0 -f $mntpnt/file2 +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL +log_must stat -t /$mntpnt/file1 +log_must stat -t /$mntpnt/file2 + +# 3. Inject a 10s zio delay to force long IOs and serialize them.. +log_must zpool events -c +log_must zinject -d $DISK1 -D10000:1 $TESTPOOL + +# 4. Read an uncached file in the background, it's expected to hang. +log_must eval "dd if=/$mntpnt/file1 of=/dev/null bs=1048576 &" +sleep 10 +log_must zinject -c all +log_must zpool sync +wait + +# 5. Verify a "deadman" event is posted. The first appears after 5 +# seconds, and another each second thereafter until the delay is cleared. +events=$(zpool events | grep -c ereport.fs.zfs.deadman) +if [ "$events" -lt 1 ]; then + log_fail "Expect >=1 deadman events, $events found" +fi + +# 6. Inject a 100ms zio delay which is under the 5s allowed, allow them +# to run concurrently so they don't get starved in the queue. +log_must zpool events -c +log_must zinject -d $DISK1 -D100:10 $TESTPOOL + +# 7. Read an uncached file in the background. +log_must eval "dd if=/$mntpnt/file2 of=/dev/null bs=1048576 &" +sleep 10 +log_must zinject -c all +wait + +# 8. Verify a "deadman" event is not posted. +events=$(zpool events | grep -c ereport.fs.zfs.deadman) +if [ "$events" -ne 0 ]; then + log_fail "Expect 0 deadman events, $events found" +fi + +log_pass "Verify zio deadman detected a hung zio and $events deadman events"