From 6ce063bf529de242f3a245c8a655a83fe9cedde6 Mon Sep 17 00:00:00 2001
From: Tom Caputi
Date: Wed, 10 Oct 2018 16:48:33 -0400
Subject: [PATCH] Fix random ztest_deadman_thread failures

The zloop test has been failing in buildbot for the last few weeks
with various failures in ztest_deadman_thread(). This is due to the
fact that this thread is not stopped when performing pool import /
export tests as it should be. This patch simply corrects this.

TEST_ZTEST_TIMEOUT=3600

Signed-off-by: Tom Caputi
---
 cmd/ztest/ztest.c     | 34 ++++++++++++++++++++++++----------
 module/zfs/spa_misc.c |  2 +-
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index f277312c663b..09ffa6da409b 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -6495,13 +6495,20 @@ ztest_deadman_thread(void *arg)
 {
 	ztest_shared_t *zs = arg;
 	spa_t *spa = ztest_spa;
-	hrtime_t delta, overdue, total = 0;
+	hrtime_t delay, overdue, last_run = gethrtime();
 
-	for (;;) {
-		delta = zs->zs_thread_stop - zs->zs_thread_start +
-		    MSEC2NSEC(zfs_deadman_synctime_ms);
+	delay = (zs->zs_thread_stop - zs->zs_thread_start) +
+	    MSEC2NSEC(zfs_deadman_synctime_ms);
 
-		(void) poll(NULL, 0, (int)NSEC2MSEC(delta));
+	while (!ztest_exiting) {
+		/*
+		 * Wait for the delay timer while checking occasionally
+		 * if we should stop.
+		 */
+		if (gethrtime() < last_run + delay) {
+			(void) poll(NULL, 0, 1000);
+			continue;
+		}
 
 		/*
 		 * If the pool is suspended then fail immediately. Otherwise,
@@ -6522,15 +6529,20 @@ ztest_deadman_thread(void *arg)
 		 * then it may be hung and is terminated.
 		 */
 		overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms);
-		total += zfs_deadman_synctime_ms / 1000;
 		if (gethrtime() > overdue) {
 			fatal(0, "aborting test after %llu seconds because "
-			    "the process is overdue for termination.", total);
+			    "the process is overdue for termination.",
+			    (gethrtime() - zs->zs_proc_start) / NANOSEC);
 		}
 
 		(void) printf("ztest has been running for %lld seconds\n",
-		    total);
+		    (gethrtime() - zs->zs_proc_start) / NANOSEC);
+
+		last_run = gethrtime();
+		delay = MSEC2NSEC(zfs_deadman_checktime_ms);
 	}
+
+	thread_exit();
 }
 
 static void
@@ -6724,7 +6736,7 @@ ztest_run(ztest_shared_t *zs)
 {
 	spa_t *spa;
 	objset_t *os;
-	kthread_t *resume_thread;
+	kthread_t *resume_thread, *deadman_thread;
 	kthread_t **run_threads;
 	uint64_t object;
 	int error;
@@ -6782,7 +6794,7 @@ ztest_run(ztest_shared_t *zs)
 	/*
 	 * Create a deadman thread and set to panic if we hang.
 	 */
-	(void) thread_create(NULL, 0, ztest_deadman_thread,
+	deadman_thread = thread_create(NULL, 0, ztest_deadman_thread,
 	    zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);
 
 	spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;
@@ -6852,6 +6864,7 @@ ztest_run(ztest_shared_t *zs)
 	/* Kill the resume thread */
 	ztest_exiting = B_TRUE;
 	VERIFY0(thread_join(resume_thread));
+	VERIFY0(thread_join(deadman_thread));
 	ztest_resume(spa);
 
 	/*
@@ -7351,6 +7364,7 @@ main(int argc, char **argv)
 
 	dprintf_setup(&argc, argv);
 	zfs_deadman_synctime_ms = 300000;
+	zfs_deadman_checktime_ms = 30000;
 	/*
 	 * As two-word space map entries may not come up often (especially
 	 * if pool and vdev sizes are small) we want to force at least some
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index ae9eb4de7be7..a3ac70f07ae2 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -312,7 +312,7 @@ unsigned long zfs_deadman_ziotime_ms = 300000ULL;
  * Check time in milliseconds. This defines the frequency at which we check
  * for hung I/O.
  */
-unsigned long zfs_deadman_checktime_ms = 60000ULL;
+unsigned long zfs_deadman_checktime_ms = 60000ULL;
 
 /*
  * By default the deadman is enabled.