Skip to content

Commit

Permalink
Fix random ztest_deadman_thread failures
Browse files Browse the repository at this point in the history
The zloop test has been failing in buildbot for the last few weeks
with various failures in ztest_deadman_thread(). This happens because
the deadman thread is not stopped, as it should be, when performing
pool import / export tests. This patch corrects that.

Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Serapheim Dimitropoulos <[email protected]>
Reviewed-by: Matthew Ahrens <[email protected]>
Signed-off-by: Tom Caputi <[email protected]>
Closes openzfs#8010
  • Loading branch information
Tom Caputi authored and BrainSlayer committed Oct 28, 2018
1 parent 1cb5176 commit 6f2975f
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 12 deletions.
36 changes: 25 additions & 11 deletions cmd/ztest/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -6496,13 +6496,20 @@ ztest_deadman_thread(void *arg)
{
ztest_shared_t *zs = arg;
spa_t *spa = ztest_spa;
hrtime_t delta, overdue, total = 0;
hrtime_t delay, overdue, last_run = gethrtime();

for (;;) {
delta = zs->zs_thread_stop - zs->zs_thread_start +
MSEC2NSEC(zfs_deadman_synctime_ms);
delay = (zs->zs_thread_stop - zs->zs_thread_start) +
MSEC2NSEC(zfs_deadman_synctime_ms);

(void) poll(NULL, 0, (int)NSEC2MSEC(delta));
while (!ztest_exiting) {
/*
* Wait for the delay timer while checking occasionally
* if we should stop.
*/
if (gethrtime() < last_run + delay) {
(void) poll(NULL, 0, 1000);
continue;
}

/*
* If the pool is suspended then fail immediately. Otherwise,
Expand All @@ -6523,15 +6530,20 @@ ztest_deadman_thread(void *arg)
* then it may be hung and is terminated.
*/
overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms);
total += zfs_deadman_synctime_ms / 1000;
if (gethrtime() > overdue) {
fatal(0, "aborting test after %llu seconds because "
"the process is overdue for termination.", total);
"the process is overdue for termination.",
(gethrtime() - zs->zs_proc_start) / NANOSEC);
}

(void) printf("ztest has been running for %lld seconds\n",
total);
(gethrtime() - zs->zs_proc_start) / NANOSEC);

last_run = gethrtime();
delay = MSEC2NSEC(zfs_deadman_checktime_ms);
}

thread_exit();
}

static void
Expand Down Expand Up @@ -6725,7 +6737,7 @@ ztest_run(ztest_shared_t *zs)
{
spa_t *spa;
objset_t *os;
kthread_t *resume_thread;
kthread_t *resume_thread, *deadman_thread;
kthread_t **run_threads;
uint64_t object;
int error;
Expand Down Expand Up @@ -6783,7 +6795,7 @@ ztest_run(ztest_shared_t *zs)
/*
* Create a deadman thread and set to panic if we hang.
*/
(void) thread_create(NULL, 0, ztest_deadman_thread,
deadman_thread = thread_create(NULL, 0, ztest_deadman_thread,
zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);

spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;
Expand Down Expand Up @@ -6850,9 +6862,10 @@ ztest_run(ztest_shared_t *zs)

umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *));

/* Kill the resume thread */
/* Kill the resume and deadman threads */
ztest_exiting = B_TRUE;
VERIFY0(thread_join(resume_thread));
VERIFY0(thread_join(deadman_thread));
ztest_resume(spa);

/*
Expand Down Expand Up @@ -7352,6 +7365,7 @@ main(int argc, char **argv)

dprintf_setup(&argc, argv);
zfs_deadman_synctime_ms = 300000;
zfs_deadman_checktime_ms = 30000;
/*
* As two-word space map entries may not come up often (especially
* if pool and vdev sizes are small) we want to force at least some
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/spa_misc.c
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ unsigned long zfs_deadman_ziotime_ms = 300000ULL;
* Check time in milliseconds. This defines the frequency at which we check
* for hung I/O.
*/
unsigned long zfs_deadman_checktime_ms = 60000ULL;
unsigned long zfs_deadman_checktime_ms = 60000ULL;

/*
* By default the deadman is enabled.
Expand Down

0 comments on commit 6f2975f

Please sign in to comment.