From facbf94a9d78e5e12437522efc7e973657fde525 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Fri, 18 Oct 2024 17:54:32 -0400 Subject: [PATCH] i#7044 hang: Fix deadlock regression on drmemtrace replay Fixes a deadlock on replay introduced by PR #6985. If a recorded schedule has back-to-back entries with the same input (with a gap where another thread runs that input in between of course) and it has to wait for that other thread for its 2nd stint with that input, it will deadlock as it holds the input's lock while it tries to get the lock in set_cur_input(). That 2nd lock acquisition is what was added by PR #6985: that's the source of the regression. Tested on a 12-thread threadsig trace which hangs without the fix and succeeds with the fix. It is too large to turn into a convenient regression test. Issue: #7044 --- clients/drcachesim/scheduler/scheduler.cpp | 11 ++++++++--- clients/drcachesim/scheduler/scheduler.h | 5 ++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 18c1a88e6dc..30a9938c82e 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -2946,7 +2946,8 @@ scheduler_tmpl_t::syscall_incurs_switch(input_info_t *in template typename scheduler_tmpl_t::stream_status_t scheduler_tmpl_t::set_cur_input(output_ordinal_t output, - input_ordinal_t input) + input_ordinal_t input, + bool hold_cur_input_lock) { // XXX i#5843: Merge tracking of current inputs with ready_queue.queue to better // manage the possible 3 states of each input (a live cur_input for an output stream, @@ -2958,7 +2959,9 @@ scheduler_tmpl_t::set_cur_input(output_ordinal_t output, if (prev_input >= 0) { if (prev_input != input) { input_info_t &prev_info = inputs_[prev_input]; - std::lock_guard lock(*prev_info.lock); + auto scoped_lock = hold_cur_input_lock + ? std::unique_lock() + : std::unique_lock(*prev_info.lock); prev_info.cur_output = INVALID_OUTPUT_ORDINAL; prev_info.last_run_time = get_output_time(output); if (options_.schedule_record_ostream != nullptr) { @@ -3146,7 +3149,9 @@ scheduler_tmpl_t::pick_next_input_as_previously( output, index, segment.value.start_instruction); // Give up this input and go into a wait state. // We'll come back here on the next next_record() call. - set_cur_input(output, INVALID_INPUT_ORDINAL); + set_cur_input(output, INVALID_INPUT_ORDINAL, + // Avoid livelock if prev input == cur input. + index == outputs_[output].cur_input); outputs_[output].waiting = true; return sched_type_t::STATUS_WAIT; } diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index e752839c1f8..0244b52e1a5 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -1920,8 +1920,11 @@ template class scheduler_tmpl_t { input_reader_info_t &reader_info); // The caller cannot hold the output or input lock. + // The caller can hold the output's current input's lock but must pass + // true for the 3rd parameter in that case. stream_status_t - set_cur_input(output_ordinal_t output, input_ordinal_t input); + set_cur_input(output_ordinal_t output, input_ordinal_t input, + bool hold_cur_input_lock = false); // Finds the next input stream for the 'output_ordinal'-th output stream. // No input_info_t lock can be held on entry.