From 6f2975f5f9e1d797a078141268650475891e5275 Mon Sep 17 00:00:00 2001 From: Tom Caputi Date: Wed, 10 Oct 2018 16:48:33 -0400 Subject: [PATCH] Fix random ztest_deadman_thread failures The zloop test has been failing in buildbot for the last few weeks with various failures in ztest_deadman_thread(). This is due to the fact that this thread is not stopped when performing pool import / export tests as it should be. This patch simply corrects this. Reviewed-by: Brian Behlendorf Reviewed-by: Serapheim Dimitropoulos Reviewed-by: Matthew Ahrens Signed-off-by: Tom Caputi Closes #8010 --- cmd/ztest/ztest.c | 36 +++++++++++++++++++++++++----------- module/zfs/spa_misc.c | 2 +- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index e0972fcdcb02..be2e247aff02 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -6496,13 +6496,20 @@ ztest_deadman_thread(void *arg) { ztest_shared_t *zs = arg; spa_t *spa = ztest_spa; - hrtime_t delta, overdue, total = 0; + hrtime_t delay, overdue, last_run = gethrtime(); - for (;;) { - delta = zs->zs_thread_stop - zs->zs_thread_start + - MSEC2NSEC(zfs_deadman_synctime_ms); + delay = (zs->zs_thread_stop - zs->zs_thread_start) + + MSEC2NSEC(zfs_deadman_synctime_ms); - (void) poll(NULL, 0, (int)NSEC2MSEC(delta)); + while (!ztest_exiting) { + /* + * Wait for the delay timer while checking occasionally + * if we should stop. + */ + if (gethrtime() < last_run + delay) { + (void) poll(NULL, 0, 1000); + continue; + } /* * If the pool is suspended then fail immediately. Otherwise, @@ -6523,15 +6530,20 @@ ztest_deadman_thread(void *arg) * then it may be hung and is terminated. */ overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); - total += zfs_deadman_synctime_ms / 1000; if (gethrtime() > overdue) { fatal(0, "aborting test after %llu seconds because " - "the process is overdue for termination.", total); + "the process is overdue for termination.", + (gethrtime() - zs->zs_proc_start) / NANOSEC); } (void) printf("ztest has been running for %lld seconds\n", - total); + (gethrtime() - zs->zs_proc_start) / NANOSEC); + + last_run = gethrtime(); + delay = MSEC2NSEC(zfs_deadman_checktime_ms); } + + thread_exit(); } static void @@ -6725,7 +6737,7 @@ ztest_run(ztest_shared_t *zs) { spa_t *spa; objset_t *os; - kthread_t *resume_thread; + kthread_t *resume_thread, *deadman_thread; kthread_t **run_threads; uint64_t object; int error; @@ -6783,7 +6795,7 @@ ztest_run(ztest_shared_t *zs) /* * Create a deadman thread and set to panic if we hang. */ - (void) thread_create(NULL, 0, ztest_deadman_thread, + deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; @@ -6850,9 +6862,10 @@ ztest_run(ztest_shared_t *zs) umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); - /* Kill the resume thread */ + /* Kill the resume and deadman threads */ ztest_exiting = B_TRUE; VERIFY0(thread_join(resume_thread)); + VERIFY0(thread_join(deadman_thread)); ztest_resume(spa); /* @@ -7352,6 +7365,7 @@ main(int argc, char **argv) dprintf_setup(&argc, argv); zfs_deadman_synctime_ms = 300000; + zfs_deadman_checktime_ms = 30000; /* * As two-word space map entries may not come up often (especially * if pool and vdev sizes are small) we want to force at least some diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 3dc720ac07a1..3461a37cf9c8 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -312,7 +312,7 @@ unsigned long zfs_deadman_ziotime_ms = 300000ULL; * Check time in milliseconds. This defines the frequency at which we check * for hung I/O. */ -unsigned long zfs_deadman_checktime_ms = 60000ULL; +unsigned long zfs_deadman_checktime_ms = 60000ULL; /* * By default the deadman is enabled.