From 7673d429a9aa80c560a7c3656e00c4b6bc4d4f00 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Tue, 28 May 2024 20:13:27 -0400 Subject: [PATCH] i#6822 unscheduled: Add unscheduled-input drmemtrace support (#6826) Augments the drmemtrace scheduler with a new notion of an "unscheduled" input which, if it has no timeout, is not runnable indefinitely until another input explicitly wakes it up. Adds 3 new markers: + TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, which makes the caller "unscheduled". + TRACE_MARKER_TYPE_SYSCALL_SCHEDULE, which makes a target no longer "unscheduled". + TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT which adds a timeout parameter to TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE and TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH Adds handling for the new marker types. Changes TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH to make the source input unscheduled, unless a timeout is passed which is honored (previously it was ignored and the syscall latency was used as the block time if it was over the thresholds). Adds a fallback in case a state is reached with no schedulable inputs yet some number of unscheduled inputs who would otherwise hang forever. Adds unit tests. Also tested on a large trace with many real-world cases of these direct switches and unschedulable threads. Issue: #6822 --- clients/drcachesim/common/trace_entry.h | 64 ++- clients/drcachesim/scheduler/scheduler.cpp | 360 +++++++++++++--- clients/drcachesim/scheduler/scheduler.h | 60 ++- .../drcachesim/tests/scheduler_unit_tests.cpp | 401 +++++++++++++++++- 4 files changed, 808 insertions(+), 77 deletions(-) diff --git a/clients/drcachesim/common/trace_entry.h b/clients/drcachesim/common/trace_entry.h index 5b26180ba5c..6b1a66e2b13 100644 --- a/clients/drcachesim/common/trace_entry.h +++ b/clients/drcachesim/common/trace_entry.h @@ -567,10 +567,21 @@ typedef enum { * #TRACE_MARKER_TYPE_SYSCALL and #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL markers) * that causes an immediate switch to another thread on the same core (with the * current thread entering an unscheduled state), bypassing the kernel scheduler's - * normal dynamic switch code based on run queues. The marker value holds the - * thread id of the target thread. This should generally always be after a - * #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL marker as such a switch always - * has a chance of blocking if the target needs to be migrated. + * normal dynamic switch code based on run queues. The marker value holds the thread + * id of the target thread. The current thread will remain unschedulable + * indefinitely unless another thread resumes it with either + * #TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH or #TRACE_MARKER_TYPE_SYSCALL_SCHEDULE; + * or, if a #TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT marker is present, the thread will + * become schedulable when that timeout expires. This marker provides a mechanism to + * model these semantics while abstracting away whether the underlying system call is + * a custom kernel extension or a variant of "futex" or other selective wait-notify + * scheme. This marker should generally always be after a + * #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL marker as such a switch always has a + * chance of blocking the source thread. See also + * #TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, and + * #TRACE_MARKER_TYPE_SYSCALL_SCHEDULE. The scheduler only models this behavior when + * #dynamorio::drmemtrace::scheduler_tmpl_t::scheduler_options_t.honor_direct_switches + * is true. */ TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH, @@ -627,6 +638,51 @@ typedef enum { */ TRACE_MARKER_TYPE_VECTOR_LENGTH, + /** + * This marker is emitted prior to a system call (but after the system call's + * #TRACE_MARKER_TYPE_SYSCALL and #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL markers) + * that causes the current thread to become unschedulable (removed from all queues of + * runnable threads). The thread will remain unschedulable indefinitely unless + * another thread resumes it with either #TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH or + * #TRACE_MARKER_TYPE_SYSCALL_SCHEDULE; or, if a + * #TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT marker is present, the thread will become + * schedulable when that timeout expires. This marker provides a mechanism to model + * these semantics while abstracting away whether the underlying system call is a + * custom kernel extension or a variant of "futex" or other selective wait-notify + * scheme. This marker should generally always be after a + * #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL marker as becoming unschedulable is a + * form of blocking and results in a context switch. The scheduler only models this + * behavior when + * #dynamorio::drmemtrace::scheduler_tmpl_t::scheduler_options_t.honor_direct_switches + * is true. + */ + TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, + + /** + * This marker is emitted prior to a system call (but after the system call's + * #TRACE_MARKER_TYPE_SYSCALL marker) that causes a target thread identified in the + * marker value to become schedulable again if it were currently unschedulable or if + * it is not currently unschedulable to *not* become unschedulable on its next action + * that would otherwise do so. See also #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE and + * #TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH. This marker provides a mechanism to + * model these semantics while abstracting away whether the underlying system call is + * a custom kernel extension or a variant of "futex" or other selective wait-notify + * scheme. The scheduler only models this behavior when + * #dynamorio::drmemtrace::scheduler_tmpl_t::scheduler_options_t.honor_direct_switches + * is true. + */ + TRACE_MARKER_TYPE_SYSCALL_SCHEDULE, + + /** + * This marker is emitted prior to a system call (but after the system call's + * #TRACE_MARKER_TYPE_SYSCALL and #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL markers) + * which also has a #TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH or + * #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE marker. This indicates a timeout provided + * by the application after which the thread will become schedulable again. The + * marker value holds the timeout duration in microseconds. + */ + TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, + // ... // These values are reserved for future built-in marker types. // ... diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index bbdad729e59..b7bf1170d32 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -931,15 +931,22 @@ scheduler_tmpl_t::set_initial_schedule( set_cur_input(i, INVALID_INPUT_ORDINAL); } } else { - // Just take the 1st N inputs (even if all from the same workload). + // Just take the 1st #outputs of schedulable (i.e., not "unscheduled") + // inputs (even if all from the same workload). + input_ordinal_t input = 0; for (int i = 0; i < static_cast(outputs_.size()); ++i) { - if (i < static_cast(inputs_.size())) - set_cur_input(i, i); + while (input < static_cast(inputs_.size()) && + inputs_[input].unscheduled) { + add_to_ready_queue(&inputs_[input]); + ++input; + } + if (input < static_cast(inputs_.size())) + set_cur_input(i, input); else set_cur_input(i, INVALID_INPUT_ORDINAL); + ++input; } - for (int i = static_cast(outputs_.size()); - i < static_cast(inputs_.size()); ++i) { + for (int i = input; i < static_cast(inputs_.size()); ++i) { add_to_ready_queue(&inputs_[i]); } } @@ -1479,6 +1486,12 @@ bool scheduler_tmpl_t::process_next_initial_record( input_info_t &input, RecordType record, bool found_filetype, bool found_timestamp) { + // TODO i#6822: Always look ahead until the first instruction, looking + // for threads that start out with an exit from an UNSCHEDULE or DIRECT + // syscall so we can have them start out unscheduled: though we can't + // easily know whether there was a timeout unless we read way ahead past + // signal handlers until the syscall exits to look for -ETIMEDOUT. + // Should we have raw2trace look for that? return !(found_filetype && found_timestamp); } @@ -1512,7 +1525,8 @@ scheduler_tmpl_t::get_initial_input_content( if (record_type_is_marker(record, marker_type, marker_value) && marker_type == TRACE_MARKER_TYPE_FILETYPE) { found_filetype = true; - VPRINT(this, 2, "Input %zu filetype %zu\n", i, marker_value); + VPRINT(this, 2, "Input %zu filetype %zu from queue\n", i, + marker_value); } if (record_type_is_timestamp(record, input.next_timestamp)) found_timestamp = true; @@ -1561,7 +1575,7 @@ scheduler_tmpl_t::get_initial_input_content( // synthetic trace in a test) or we may have to read thousands of records // to find it if it were somehow missing, which we do not want to do. We // assume our queued records are few and do not include instructions when - // we skip (see skip_instrutions()). Thus, we abort with an error. + // we skip (see skip_instructions()). Thus, we abort with an error. if (record_type_is_instr(record)) break; input.queue.push_back(record); @@ -1833,7 +1847,7 @@ scheduler_tmpl_t::advance_region_of_interest( input.cur_region); if (input.cur_region >= static_cast(input.regions_of_interest.size())) { if (input.at_eof) - return eof_or_idle(output); + return eof_or_idle(output, /*hold_sched_lock=*/false); else { // We let the user know we're done. if (options_.schedule_record_ostream != nullptr) { @@ -2070,10 +2084,26 @@ scheduler_tmpl_t::ready_queue_empty() return ready_priority_.empty(); } +template +void +scheduler_tmpl_t::add_to_unscheduled_queue(input_info_t *input) +{ + assert(input->unscheduled && + input->blocked_time == 0); // Else should be in regular queue. + VPRINT(this, 4, "add_to_unscheduled_queue (pre-size %zu): input %d priority %d\n", + unscheduled_priority_.size(), input->index, input->priority); + input->queue_counter = ++unscheduled_counter_; + unscheduled_priority_.push(input); +} + template void scheduler_tmpl_t::add_to_ready_queue(input_info_t *input) { + if (input->unscheduled && input->blocked_time == 0) { + add_to_unscheduled_queue(input); + return; + } VPRINT( this, 4, "add_to_ready_queue (pre-size %zu): input %d priority %d timestamp delta %" PRIu64 @@ -2105,6 +2135,8 @@ scheduler_tmpl_t::pop_from_ready_queue( res = ready_priority_.top(); ready_priority_.pop(); } + assert(!res->unscheduled || + res->blocked_time > 0); // Should be in unscheduled_priority_. if (res->binding.empty() || res->binding.find(for_output) != res->binding.end()) { // For blocked inputs, as we don't have interrupts or other regular // control points we only check for being unblocked when an input @@ -2144,9 +2176,11 @@ scheduler_tmpl_t::pop_from_ready_queue( static int heartbeat; // We are ok with races as the cadence is approximate. if (++heartbeat % 500 == 0) { - VPRINT(this, 1, "heartbeat[%d] %zd in queue; %d blocked => %d %d\n", + VPRINT(this, 1, + "heartbeat[%d] %zd in queue; %d blocked; %zd unscheduled => %d %d\n", for_output, ready_priority_.size(), num_blocked_, - res == nullptr ? -1 : res->index, status); + unscheduled_priority_.size(), res == nullptr ? -1 : res->index, + status); } }); if (res != nullptr) { @@ -2156,11 +2190,29 @@ scheduler_tmpl_t::pop_from_ready_queue( for_output, ready_priority_.size(), res->index, res->priority, res->reader->get_last_timestamp() - res->base_timestamp); res->blocked_time = 0; + res->unscheduled = false; } new_input = res; return status; } +template +uint64_t +scheduler_tmpl_t::scale_blocked_time(uint64_t initial_time) const +{ + uint64_t scaled = static_cast(static_cast(initial_time) * + options_.block_time_scale); + if (scaled > options_.block_time_max) { + // We have a max to avoid outlier latencies that are already a second or + // more from scaling up to tens of minutes. We assume a cap is representative + // as the outliers likely were not part of key dependence chains. Without a + // cap the other threads all finish and the simulation waits for tens of + // minutes further for a couple of outliers. + scaled = options_.block_time_max; + } + return scaled; +} + template bool scheduler_tmpl_t::syscall_incurs_switch(input_info_t *input, @@ -2181,16 +2233,7 @@ scheduler_tmpl_t::syscall_incurs_switch(input_info_t *in uint64_t threshold = input->processing_maybe_blocking_syscall ? options_.blocking_switch_threshold : options_.syscall_switch_threshold; - blocked_time = - static_cast(static_cast(latency) * options_.block_time_scale); - if (blocked_time > options_.block_time_max) { - // We have a max to avoid outlier latencies that are already a second or - // more from scaling up to tens of minutes. We assume a cap is representative - // as the outliers likely were not part of key dependence chains. Without a - // cap the other threads all finish and the simulation waits for tens of - // minutes further for a couple of outliers. - blocked_time = options_.block_time_max; - } + blocked_time = scale_blocked_time(latency); VPRINT(this, 3, "input %d %ssyscall latency %" PRIu64 " * scale %5.1f => blocked time %" PRIu64 "\n", @@ -2214,8 +2257,9 @@ scheduler_tmpl_t::set_cur_input(output_ordinal_t output, int prev_input = outputs_[output].cur_input; if (prev_input >= 0) { if (options_.mapping == MAP_TO_ANY_OUTPUT && prev_input != input && - !inputs_[prev_input].at_eof) + !inputs_[prev_input].at_eof) { add_to_ready_queue(&inputs_[prev_input]); + } if (prev_input != input && options_.schedule_record_ostream != nullptr) { input_info_t &prev_info = inputs_[prev_input]; std::lock_guard lock(*prev_info.lock); @@ -2320,7 +2364,7 @@ scheduler_tmpl_t::pick_next_input_as_previously( outputs_[output].at_eof = true; live_replay_output_count_.fetch_add(-1, std::memory_order_release); } - return eof_or_idle(output); + return eof_or_idle(output, need_sched_lock()); } const schedule_record_t &segment = outputs_[output].record[outputs_[output].record_index + 1]; @@ -2441,22 +2485,41 @@ scheduler_tmpl_t::pick_next_input_as_previously( return sched_type_t::STATUS_OK; } +template +bool +scheduler_tmpl_t::need_sched_lock() +{ + return options_.mapping == MAP_TO_ANY_OUTPUT || options_.mapping == MAP_AS_PREVIOUSLY; +} + +template +std::unique_lock +scheduler_tmpl_t::acquire_scoped_sched_lock_if_necessary( + bool &need_lock) +{ + need_lock = need_sched_lock(); + auto scoped_lock = need_lock ? std::unique_lock(sched_lock_) + : std::unique_lock(); + return scoped_lock; +} + template typename scheduler_tmpl_t::stream_status_t scheduler_tmpl_t::pick_next_input(output_ordinal_t output, uint64_t blocked_time) { sched_type_t::stream_status_t res = sched_type_t::STATUS_OK; - bool need_lock = - options_.mapping == MAP_TO_ANY_OUTPUT || options_.mapping == MAP_AS_PREVIOUSLY; - auto scoped_lock = need_lock ? std::unique_lock(sched_lock_) - : std::unique_lock(); + bool need_lock; + auto scoped_lock = acquire_scoped_sched_lock_if_necessary(need_lock); input_ordinal_t prev_index = outputs_[output].cur_input; input_ordinal_t index = INVALID_INPUT_ORDINAL; int iters = 0; while (true) { ++iters; if (index < 0) { + // XXX i#6831: Refactor to use subclasses or templates to specialize + // scheduler code based on mapping options, to avoid these top-level + // conditionals in many functions? if (options_.mapping == MAP_AS_PREVIOUSLY) { res = pick_next_input_as_previously(output, index); VDO(this, 2, { @@ -2491,20 +2554,21 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu output, blocked_time); inputs_[prev_index].blocked_time = blocked_time; inputs_[prev_index].blocked_start_time = get_output_time(output); - } else { - // If we looped we could have the same prev_index. - assert(iters > 1); } } if (prev_index != INVALID_INPUT_ORDINAL && inputs_[prev_index].switch_to_input != INVALID_INPUT_ORDINAL) { input_info_t *target = &inputs_[inputs_[prev_index].switch_to_input]; inputs_[prev_index].switch_to_input = INVALID_INPUT_ORDINAL; + std::lock_guard lock(*target->lock); // XXX i#5843: Add an invariant check that the next timestamp of the // target is later than the pre-switch-syscall timestamp? if (ready_priority_.find(target)) { - VPRINT(this, 2, "next_record[%d]: direct switch to input %d\n", - output, target->index); + VPRINT(this, 2, + "next_record[%d]: direct switch from input %d to input %d " + "@%" PRIu64 "\n", + output, prev_index, target->index, + inputs_[prev_index].reader->get_last_timestamp()); ready_priority_.erase(target); index = target->index; // Erase any remaining wait time for the target. @@ -2515,29 +2579,46 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu output, target->index); --num_blocked_; target->blocked_time = 0; + target->unscheduled = false; } + } else if (unscheduled_priority_.find(target)) { + target->unscheduled = false; + unscheduled_priority_.erase(target); + index = target->index; + VPRINT(this, 2, + "next_record[%d]: direct switch from input %d to " + "was-unscheduled input %d " + "@%" PRIu64 "\n", + output, prev_index, target->index, + inputs_[prev_index].reader->get_last_timestamp()); } else { - // TODO i#5843: If the target is running on another output, we - // need to do a forced migration by setting a flag to force a - // preempt and presumably waiting (STATUS_WAIT or STATUS_IDLE?) - // here until the input is available. - // For now we print a message so we can notice when this - // happens, but we ignore the direct switch request. + // We assume that inter-input dependencies are captured in + // the _DIRECT_THREAD_SWITCH, _UNSCHEDULE, and _SCHEDULE markers + // and that if a switch request targets a thread running elsewhere + // that means there isn't a dependence and this is really a + // dynamic switch to whoever happens to be available (and + // different timing between tracing and analysis has caused this + // miss). VPRINT(this, 1, - "Direct switch target input #%d is running elsewhere and " - "forced migration is NYI\n", - target->index); + "Direct switch (from %d) target input #%d is running " + "elsewhere; picking a different target @%" PRIu64 "\n", + prev_index, target->index, + inputs_[prev_index].reader->get_last_timestamp()); + // We do ensure the missed target doesn't wait indefinitely. + // XXX i#6822: It's not clear this is always the right thing to + // do. + target->skip_next_unscheduled = true; } } if (index != INVALID_INPUT_ORDINAL) { // We found a direct switch target above. } else if (ready_queue_empty() && blocked_time == 0) { if (prev_index == INVALID_INPUT_ORDINAL) - return eof_or_idle(output); + return eof_or_idle(output, need_lock); auto lock = std::unique_lock(*inputs_[prev_index].lock); if (inputs_[prev_index].at_eof) { lock.unlock(); - return eof_or_idle(output); + return eof_or_idle(output, need_lock); } else index = prev_index; // Go back to prior. } else { @@ -2565,7 +2646,7 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu } if (queue_next == nullptr) { assert(blocked_time == 0 || prev_index == INVALID_INPUT_ORDINAL); - return eof_or_idle(output); + return eof_or_idle(output, need_lock); } index = queue_next->index; } @@ -2580,7 +2661,7 @@ scheduler_tmpl_t::pick_next_input(output_ordinal_t outpu } } if (index < 0) - return eof_or_idle(output); + return eof_or_idle(output, need_lock); VPRINT(this, 2, "next_record[%d]: advancing to timestamp %" PRIu64 " == input #%d\n", @@ -2669,6 +2750,105 @@ scheduler_tmpl_t::process_marker(input_info_t &input, } else { input.switch_to_input = it->second; } + // Trigger a switch either indefinitely or until timeout. + if (input.skip_next_unscheduled) { + // The underlying kernel mechanism being modeled only supports a single + // request: they cannot accumulate. Timing differences in the trace could + // perhaps result in multiple lining up when the didn't in the real app; + // but changing the scheme here could also push representatives in the + // other direction. + input.skip_next_unscheduled = false; + VPRINT(this, 3, + "input %d unschedule request ignored due to prior schedule request " + "@%" PRIu64 "\n", + input.index, input.reader->get_last_timestamp()); + break; + } + input.unscheduled = true; + if (input.syscall_timeout_arg > 0) { + input.blocked_time = scale_blocked_time(input.syscall_timeout_arg); + VPRINT(this, 3, "input %d unscheduled for %" PRIu64 " @%" PRIu64 "\n", + input.index, input.blocked_time, input.reader->get_last_timestamp()); + } else { + VPRINT(this, 3, "input %d unscheduled indefinitely @%" PRIu64 "\n", + input.index, input.reader->get_last_timestamp()); + } + break; + } + case TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT: + // This is cleared at the post-syscall instr. + input.syscall_timeout_arg = static_cast(marker_value); + break; + case TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE: + if (!options_.honor_direct_switches) + break; + if (input.skip_next_unscheduled) { + input.skip_next_unscheduled = false; + VPRINT(this, 3, + "input %d unschedule request ignored due to prior schedule request " + "@%" PRIu64 "\n", + input.index, input.reader->get_last_timestamp()); + break; + } + // Trigger a switch either indefinitely or until timeout. + input.unscheduled = true; + if (input.syscall_timeout_arg > 0) { + input.blocked_time = scale_blocked_time(input.syscall_timeout_arg); + VPRINT(this, 3, "input %d unscheduled for %" PRIu64 " @%" PRIu64 "\n", + input.index, input.blocked_time, input.reader->get_last_timestamp()); + } else { + VPRINT(this, 3, "input %d unscheduled indefinitely @%" PRIu64 "\n", + input.index, input.reader->get_last_timestamp()); + } + break; + case TRACE_MARKER_TYPE_SYSCALL_SCHEDULE: { + if (!options_.honor_direct_switches) + break; + memref_tid_t target_tid = marker_value; + auto it = tid2input_.find(workload_tid_t(input.workload, target_tid)); + if (it == tid2input_.end()) { + VPRINT(this, 1, + "Failed to find input for switchto::resume target tid %" PRId64 "\n", + target_tid); + return; + } + input_ordinal_t target_idx = it->second; + VPRINT(this, 3, "input %d re-scheduling input %d @%" PRIu64 "\n", input.index, + target_idx, input.reader->get_last_timestamp()); + // Release the input lock before acquiring sched_lock, to meet our lock + // ordering convention to avoid deadlocks. + input.lock->unlock(); + { + bool need_sched_lock; + auto scoped_sched_lock = + acquire_scoped_sched_lock_if_necessary(need_sched_lock); + input_info_t *target = &inputs_[target_idx]; + std::lock_guard lock(*target->lock); + if (target->unscheduled) { + target->unscheduled = false; + if (unscheduled_priority_.find(target)) { + add_to_ready_queue(target); + unscheduled_priority_.erase(target); + } else if (ready_priority_.find(target)) { + // We assume blocked_time is from _ARG_TIMEOUT and is not from + // regularly-blocking i/o. We assume i/o getting into the mix is + // rare enough or does not matter enough to try to have separate + // timeouts. + if (target->blocked_time > 0) { + VPRINT( + this, 3, + "switchto::resume erasing blocked time for target input %d\n", + target->index); + --num_blocked_; + target->blocked_time = 0; + } + } + } else { + VPRINT(this, 3, "input %d will skip next unschedule\n", target_idx); + target->skip_next_unscheduled = true; + } + } + input.lock->lock(); break; } default: // Nothing to do. @@ -2719,7 +2899,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, if (outputs_[output].cur_input < 0) { // This happens with more outputs than inputs. For non-empty outputs we // require cur_input to be set to >=0 during init(). - return eof_or_idle(output); + return eof_or_idle(output, /*hold_sched_lock=*/false); } input = &inputs_[outputs_[output].cur_input]; auto lock = std::unique_lock(*input->lock); @@ -2794,6 +2974,9 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, bool preempt = false; uint64_t blocked_time = 0; uint64_t prev_time_in_quantum = 0; + // XXX i#6831: Refactor to use subclasses or templates to specialize + // scheduler code based on mapping options, to avoid these top-level + // conditionals in many functions? if (options_.mapping == MAP_AS_PREVIOUSLY) { assert(outputs_[output].record_index >= 0); if (outputs_[output].record_index >= @@ -2833,34 +3016,48 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, } else if (options_.mapping == MAP_TO_ANY_OUTPUT) { trace_marker_type_t marker_type; uintptr_t marker_value; - // While regular traces typically always have a syscall marker when there's a - // maybe-blocking marker, some tests and synthetic traces have just the maybe - // so we check both. + // While regular traces typically always have a syscall marker when + // there's a maybe-blocking marker, some tests and synthetic traces have + // just the maybe so we check both. if (input->processing_syscall || input->processing_maybe_blocking_syscall) { // Wait until we're past all the markers associated with the syscall. - // XXX: We may prefer to stop before the return value marker for futex, - // or a kernel xfer marker, but our recorded format is on instr + // XXX: We may prefer to stop before the return value marker for + // futex, or a kernel xfer marker, but our recorded format is on instr // boundaries so we live with those being before the switch. // XXX: Once we insert kernel traces, we may have to try harder // to stop before the post-syscall records. if (record_type_is_instr_boundary(record, outputs_[output].last_record)) { - if (syscall_incurs_switch(input, blocked_time)) { - // Model as blocking and should switch to a different input. + if (input->switch_to_input != INVALID_INPUT_ORDINAL) { + // The switch request overrides any latency threshold. need_new_input = true; VPRINT(this, 3, - "next_record[%d]: hit blocking syscall in input %d\n", + "next_record[%d]: direct switch on low-latency " + "syscall in " + "input %d\n", output, input->index); - } else if (input->switch_to_input != INVALID_INPUT_ORDINAL) { - // The switch request overrides any latency threshold. + } else if (input->blocked_time > 0) { + // If we've found out another way that this input should + // block, use that time and do a switch. need_new_input = true; + blocked_time = input->blocked_time; VPRINT(this, 3, - "next_record[%d]: direct switch on low-latency syscall in " - "input %d\n", + "next_record[%d]: blocked time set for input %d\n", output, + input->index); + } else if (input->unscheduled) { + need_new_input = true; + VPRINT(this, 3, "next_record[%d]: input %d going unscheduled\n", + output, input->index); + } else if (syscall_incurs_switch(input, blocked_time)) { + // Model as blocking and should switch to a different input. + need_new_input = true; + VPRINT(this, 3, + "next_record[%d]: hit blocking syscall in input %d\n", output, input->index); } input->processing_syscall = false; input->processing_maybe_blocking_syscall = false; input->pre_syscall_timestamp = 0; + input->syscall_timeout_arg = 0; } } if (outputs_[output].hit_switch_code_end) { @@ -3099,8 +3296,12 @@ scheduler_tmpl_t::mark_input_eof(input_info_t &input) template typename scheduler_tmpl_t::stream_status_t -scheduler_tmpl_t::eof_or_idle(output_ordinal_t output) +scheduler_tmpl_t::eof_or_idle(output_ordinal_t output, + bool hold_sched_lock) { + // XXX i#6831: Refactor to use subclasses or templates to specialize + // scheduler code based on mapping options, to avoid these top-level + // conditionals in many functions? if (options_.mapping == MAP_TO_CONSISTENT_OUTPUT || live_input_count_.load(std::memory_order_acquire) == 0 || // While a full schedule recorded should have each input hit either its @@ -3111,6 +3312,49 @@ scheduler_tmpl_t::eof_or_idle(output_ordinal_t output) assert(options_.mapping != MAP_AS_PREVIOUSLY || outputs_[output].at_eof); return sched_type_t::STATUS_EOF; } else { + if (options_.mapping == MAP_TO_ANY_OUTPUT) { + // Workaround to avoid hangs when _SCHEDULE and/or _DIRECT_THREAD_SWITCH + // directives miss their targets (due to running with a subset of the + // original threads, or other scenarios) and we end up with no scheduled + // inputs but a set of unscheduled inputs who will never be scheduled. + auto scoped_lock = hold_sched_lock + ? std::unique_lock() + : std::unique_lock(sched_lock_); + VPRINT(this, 4, + "eof_or_idle output=%d live=%d unsched=%zu runq=%zu blocked=%d\n", + output, live_input_count_.load(std::memory_order_acquire), + unscheduled_priority_.size(), ready_priority_.size(), num_blocked_); + if (ready_priority_.empty() && !unscheduled_priority_.empty()) { + if (outputs_[output].wait_start_time == 0) { + outputs_[output].wait_start_time = get_output_time(output); + } else { + uint64_t now = get_output_time(output); + if (now - outputs_[output].wait_start_time > + options_.block_time_max) { + // XXX i#6822: We may want some other options here for what to + // do. We could release just one input at a time, which would be + // the same scheduling order (as we have FIFO in + // unscheduled_priority_) but may take a long time at + // block_time_max each; we could declare we're done and just + // exit, maybe under a flag or if we could see what % of total + // records we've processed. + VPRINT(this, 1, + "eof_or_idle moving entire unscheduled queue to ready " + "queue\n"); + while (!unscheduled_priority_.empty()) { + input_info_t *tomove = unscheduled_priority_.top(); + std::lock_guard lock(*tomove->lock); + tomove->unscheduled = false; + ready_priority_.push(tomove); + unscheduled_priority_.pop(); + } + outputs_[output].wait_start_time = 0; + } + } + } else { + outputs_[output].wait_start_time = 0; + } + } outputs_[output].waiting = true; set_cur_input(output, INVALID_INPUT_ORDINAL); return sched_type_t::STATUS_IDLE; diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index e0fe4401c25..5ca8207512f 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -578,6 +578,10 @@ template class scheduler_tmpl_t { * #QUANTUM_TIME simulator time or wall-clock microseconds for * #QUANTUM_INSTRUCTIONS), for an input to be considered blocked for any one * system call. This is applied after multiplying by #block_time_scale. + * This is also used as a fallback to avoid hangs when there are no scheduled + * inputs: if the only inputs left are "unscheduled" (see + * #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE), after this amount of time those + * inputs are all re-scheduled. */ uint64_t block_time_max = 25000000; // XXX: Should we share the file-to-reader code currently in the scheduler @@ -639,13 +643,11 @@ template class scheduler_tmpl_t { /** * If true, the scheduler will attempt to switch to the recorded targets of * #TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH system call metadata markers - * regardless of system call latency. If the target is not available, the - * current implementation will select the next available input in the regular - * scheduling queue, but in the future a forced migration may be applied for an - * input currently on another output. If false, the direct switch markers are - * ignored and only system call latency thresholds are used to determine - * switches (the #TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH markers remain: they - * are not removed from the trace). + * regardless of system call latency. Furthermore, the scheduler will model + * "unscheduled" semantics and honor the #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE + * and #TRACE_MARKER_TYPE_SYSCALL_SCHEDULE markers. If false, these markers are + * ignored and only system call latency thresholds are used to determine switches + * (these markers remain: they are not removed from the trace). */ bool honor_direct_switches = true; }; @@ -1205,7 +1207,9 @@ template class scheduler_tmpl_t { // with schedule_t "this" access for the comparator to compile: it is not // simple to do so, however.) bool order_by_timestamp = false; - // Global ready queue counter used to provide FIFO for same-priority inputs. + // Global queue counter used to provide FIFO for same-priority inputs. + // This value is only valid when this input is in a queue; it is set upon + // being added to a queue. uint64_t queue_counter = 0; // Used to switch on the instruction *after* a long-latency syscall. bool processing_syscall = false; @@ -1214,6 +1218,7 @@ template class scheduler_tmpl_t { // Use for special kernel features where one thread specifies a target // thread to replace it. input_ordinal_t switch_to_input = INVALID_INPUT_ORDINAL; + uint64_t syscall_timeout_arg = 0; // Used to switch before we've read the next instruction. bool switching_pre_instruction = false; // Used for time-based quanta. @@ -1223,6 +1228,15 @@ template class scheduler_tmpl_t { // The units are us for instr quanta and simuilation time for time quanta. uint64_t blocked_time = 0; uint64_t blocked_start_time = 0; + // An input can be "unscheduled" and not on the ready_priority_ run queue at all + // with an infinite timeout until directly targeted. Such inputs are stored + // in the unscheduled_priority_ queue. + // This field is also set to true for inputs that are "unscheduled" but with + // a timeout, even though that is implemented by storing them in ready_priority_ + // (because that is our mechanism for measuring timeouts). + bool unscheduled = false; + // Causes the next unscheduled entry to abort. + bool skip_next_unscheduled = false; }; // Format for recording a schedule to disk. A separate sequence of these records @@ -1628,8 +1642,10 @@ template class scheduler_tmpl_t { void mark_input_eof(input_info_t &input); + // Determines whether to exit or wait for other outputs when one output + // runs out of things to do. May end up scheduling new inputs. stream_status_t - eof_or_idle(output_ordinal_t output); + eof_or_idle(output_ordinal_t output, bool hold_sched_lock); // Returns whether the current record for the current input stream scheduled on // the 'output_ordinal'-th output stream is from a part of the trace corresponding @@ -1660,14 +1676,29 @@ template class scheduler_tmpl_t { } }; + bool + need_sched_lock(); + + std::unique_lock + acquire_scoped_sched_lock_if_necessary(bool &need_lock); + // sched_lock_ must be held by the caller. bool ready_queue_empty(); // sched_lock_ must be held by the caller. + // If input->unscheduled is true and input->blocked_time is 0, input + // is placed on the unscheduled_priority_ queue instead. void add_to_ready_queue(input_info_t *input); + // sched_lock_ must be held by the caller. + void + add_to_unscheduled_queue(input_info_t *input); + + uint64_t + scale_blocked_time(uint64_t blocked_time) const; + // The input's lock must be held by the caller. // Returns a multiplier for how long the input should be considered blocked. bool @@ -1694,18 +1725,23 @@ template class scheduler_tmpl_t { std::vector outputs_; // We use a central lock for global scheduling. We assume the synchronization // cost is outweighed by the simulator's overhead. This protects concurrent - // access to inputs_.size(), outputs_.size(), ready_priority_, and - // ready_counter_. + // access to inputs_.size(), outputs_.size(), ready_priority_, + // ready_counter_, unscheduled_priority_, and unscheduled_counter_. + // This cannot be acquired while holding an input lock: it must + // be acquired first, to avoid deadlocks. std::mutex sched_lock_; // Inputs ready to be scheduled, sorted by priority and then timestamp if timestamp // dependencies are requested. We use the timestamp delta from the first observed // timestamp in each workload in order to mix inputs from different workloads in the // same queue. FIFO ordering is used for same-priority entries. flexible_queue_t ready_priority_; + // Inputs that are unscheduled indefinitely unless directly targeted. + flexible_queue_t unscheduled_priority_; // Trackes the count of blocked inputs. Protected by sched_lock_. int num_blocked_ = 0; - // Global ready queue counter used to provide FIFO for same-priority inputs. + // Global queue counters used to provide FIFO for same-priority inputs. uint64_t ready_counter_ = 0; + uint64_t unscheduled_counter_ = 0; // Count of inputs not yet at eof. std::atomic live_input_count_; // In replay mode, count of outputs not yet at the end of the replay sequence. diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index e0ed0702f92..eeec09c073c 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -3734,10 +3734,13 @@ static void test_direct_switch() { std::cerr << "\n----------------\nTesting direct switches\n"; + // This tests just direct switches with no unscheduled inputs or related + // switch requests. // We have just 1 output to better control the order and avoid flakiness. static constexpr int NUM_OUTPUTS = 1; static constexpr int QUANTUM_DURATION = 100; // Never reached. static constexpr int BLOCK_LATENCY = 100; + static constexpr int SWITCH_TIMEOUT = 2000; static constexpr double BLOCK_SCALE = 1. / (BLOCK_LATENCY); static constexpr memref_tid_t TID_BASE = 100; static constexpr memref_tid_t TID_A = TID_BASE + 0; @@ -3756,6 +3759,9 @@ test_direct_switch() make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), make_marker(TRACE_MARKER_TYPE_SYSCALL, 999), make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + // This test focuses on direct only with nothing "unscheduled"; + // thus, we always provide a timeout to avoid going unscheduled. + make_marker(TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, SWITCH_TIMEOUT), make_marker(TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH, TID_C), make_timestamp(4001), make_instr(/*pc=*/401), @@ -3786,11 +3792,16 @@ test_direct_switch() make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), make_marker(TRACE_MARKER_TYPE_SYSCALL, 999), make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + // This test focuses on direct only with nothing "unscheduled"; + // thus, we always provide a timeout to avoid going unscheduled. + make_marker(TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, SWITCH_TIMEOUT), make_marker(TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH, TID_A), make_timestamp(5001), make_instr(/*pc=*/501), // Test a non-existent target: should be ignored, but not crash. make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + // This test focuses on direct only with nothing "unscheduled". + make_marker(TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, SWITCH_TIMEOUT), make_marker(TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH, TID_BASE + 3), make_exit(TID_C), }; @@ -3809,8 +3820,7 @@ test_direct_switch() // has significant blocked time left. But then after B is scheduled and finishes, // we still have to wait for C's block time so we see idle underscores: static const char *const CORE0_SCHED_STRING = - "...AA.........CC......A....BBBB.______________C..."; - + "...AA..........CC.......A....BBBB.__________C...."; std::vector sched_inputs; sched_inputs.emplace_back(std::move(readers)); scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, @@ -3846,7 +3856,389 @@ test_direct_switch() // We expect A's first switch to be to B with an earlier timestamp. // We expect C's direct switch to A to not happen until A's blocked time ends. static const char *const CORE0_SCHED_STRING = - "...AA.........BBBB....CC......___________________C...___A."; + "...AA..........BBBB....CC.......___________________C....___A."; + std::vector sched_inputs; + sched_inputs.emplace_back(std::move(readers)); + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_TIMESTAMPS, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/3); + sched_ops.quantum_duration = QUANTUM_DURATION; + // We use our mock's time==instruction count for a deterministic result. + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.honor_direct_switches = false; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + assert(sched_as_string[0] == CORE0_SCHED_STRING); + } +} + +static void +test_unscheduled() +{ + std::cerr << "\n----------------\nTesting unscheduled inputs\n"; + // We have just 1 output to better control the order and avoid flakiness. + static constexpr int NUM_OUTPUTS = 1; + static constexpr int QUANTUM_DURATION = 100; // Never reached. + static constexpr int BLOCK_LATENCY = 100; + static constexpr double BLOCK_SCALE = 1. / (BLOCK_LATENCY); + static constexpr int SWITCH_TIMEOUT = 2000; + static constexpr memref_tid_t TID_BASE = 100; + static constexpr memref_tid_t TID_A = TID_BASE + 0; + static constexpr memref_tid_t TID_B = TID_BASE + 1; + static constexpr memref_tid_t TID_C = TID_BASE + 2; + std::vector refs_A = { + make_thread(TID_A), + make_pid(1), + make_version(TRACE_ENTRY_VERSION), + // A has the earliest timestamp and starts. + make_timestamp(1001), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_instr(/*pc=*/101), + make_instr(/*pc=*/102), + make_timestamp(1002), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL, 999), + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + // Test going unscheduled with no timeout. + make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0), + make_timestamp(4202), + // B makes us scheduled again. + make_instr(/*pc=*/103), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL, 999), + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + // Switch to B to test a direct switch to unscheduled. + make_marker(TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, SWITCH_TIMEOUT), + make_marker(TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH, TID_B), + make_timestamp(4402), + make_instr(/*pc=*/401), + make_exit(TID_A), + }; + std::vector refs_B = { + make_thread(TID_B), + make_pid(1), + make_version(TRACE_ENTRY_VERSION), + // B runs next by timestamp. + make_timestamp(2001), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_instr(/*pc=*/200), + // B goes unscheduled with a timeout. + make_marker(TRACE_MARKER_TYPE_SYSCALL, 999), + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, SWITCH_TIMEOUT), + make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0), + // C will run at this point. + // Then, C blocks and our timeout lapses and we run again. + make_timestamp(4001), + make_instr(/*pc=*/201), + // B tells C to not go unscheduled later. + make_timestamp(4002), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL, 999), + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL_SCHEDULE, TID_C), + make_timestamp(4004), + make_instr(/*pc=*/202), + // B makes A no longer unscheduled. + make_timestamp(4006), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL, 999), + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL_SCHEDULE, TID_A), + make_timestamp(4011), + make_instr(/*pc=*/202), + // B now goes unscheduled with no timeout. + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL, 999), + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0), + // A switches to us. + make_instr(/*pc=*/203), + make_instr(/*pc=*/204), + make_instr(/*pc=*/205), + make_instr(/*pc=*/206), + make_exit(TID_B), + }; + std::vector refs_C = { + make_thread(TID_C), + make_pid(1), + make_version(TRACE_ENTRY_VERSION), + // C goes 3rd by timestamp. + make_timestamp(3001), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_instr(/*pc=*/301), + make_instr(/*pc=*/302), + make_timestamp(3002), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + // C makes a long-latency blocking syscall, testing whether + // A is still unscheduled. + // We also test _SCHEDULE avoiding a future unschedule when C + // unblocks. + make_marker(TRACE_MARKER_TYPE_SYSCALL, 999), + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + make_timestamp(7002), + make_instr(/*pc=*/501), + // C asks to go unscheduled with no timeout, but a prior _SCHEDULE + // means it just continues. + make_timestamp(7004), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL, 999), + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0), + make_timestamp(7008), + make_instr(/*pc=*/502), + make_exit(TID_C), + }; + { + // Test the defaults with direct switches enabled. + std::vector readers; + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_A)), + std::unique_ptr(new mock_reader_t()), TID_A); + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_B)), + std::unique_ptr(new mock_reader_t()), TID_B); + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_C)), + std::unique_ptr(new mock_reader_t()), TID_C); + // The string constructor writes "." for markers. + // Matching the comments above, we expect A to go unscheduled; + // Then B runs and goes unscheduled-with-timeout; C takes over and blocks. + // We then have _=idle confirming A is unscheduled and B blocked. + // B then runs and makes A schedulable before going unscheduled. + // A then runs and switches back to B with a timeout. B exits; A's timeout + // has lapsed so it runs; finally we wait idle for C's long block to finish, + // after which C runs and *does not unschedule* b/c of B's prior request. + static const char *const CORE0_SCHED_STRING = + "...AA.........B........CC.....______________B......B......B....A......BBBB." + "A._________________C......C."; + + std::vector sched_inputs; + sched_inputs.emplace_back(std::move(readers)); + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_TIMESTAMPS, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/3); + sched_ops.quantum_duration = QUANTUM_DURATION; + // We use our mock's time==instruction count for a deterministic result. + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + sched_ops.block_time_scale = BLOCK_SCALE; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + assert(sched_as_string[0] == CORE0_SCHED_STRING); + } + { + // Test disabling direct switches. + std::vector readers; + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_A)), + std::unique_ptr(new mock_reader_t()), TID_A); + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_B)), + std::unique_ptr(new mock_reader_t()), TID_B); + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_C)), + std::unique_ptr(new mock_reader_t()), TID_C); + // The syscall latencies make this schedule not all that different: we just + // finish B instead of switching to A toward the end. + static const char *const CORE0_SCHED_STRING = + "...AA.........B........CC.....__________________B......B......B....BBBB.____" + "A......__A._______C......C."; + + std::vector sched_inputs; + sched_inputs.emplace_back(std::move(readers)); + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_TIMESTAMPS, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/3); + sched_ops.quantum_duration = QUANTUM_DURATION; + // We use our mock's time==instruction count for a deterministic result. + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.honor_direct_switches = false; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + assert(sched_as_string[0] == CORE0_SCHED_STRING); + } +} + +static void +test_unscheduled_fallback() +{ + std::cerr << "\n----------------\nTesting unscheduled hang workarounds\n"; + // We have just 1 output to better control the order and avoid flakiness. + static constexpr int NUM_OUTPUTS = 1; + static constexpr int QUANTUM_DURATION = 100; // Never reached. + static constexpr int BLOCK_LATENCY = 100; + static constexpr double BLOCK_SCALE = 1. / (BLOCK_LATENCY); + static constexpr uint64_t BLOCK_TIME_MAX = 500; + static constexpr int SWITCH_TIMEOUT = 2000; + static constexpr memref_tid_t TID_BASE = 100; + static constexpr memref_tid_t TID_A = TID_BASE + 0; + static constexpr memref_tid_t TID_B = TID_BASE + 1; + static constexpr memref_tid_t TID_C = TID_BASE + 2; + std::vector refs_A = { + make_thread(TID_A), + make_pid(1), + make_version(TRACE_ENTRY_VERSION), + // A has the earliest timestamp and starts. + make_timestamp(1001), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_instr(/*pc=*/101), + make_instr(/*pc=*/102), + make_timestamp(1002), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL, 999), + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + // Test going unscheduled with no timeout. + make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0), + make_timestamp(4202), + // B makes us scheduled again. + make_instr(/*pc=*/102), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL, 999), + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + // Switch to a missing thread to leave us unscheduled; B also went + // unscheduled, leaving nothing scheduled, to test hang workarounds. + make_marker(TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH, TID_BASE + 4), + make_timestamp(4402), + // We won't get here until the no-scheduled-input hang workaround. + make_instr(/*pc=*/401), + make_exit(TID_A), + }; + std::vector refs_B = { + make_thread(TID_B), + make_pid(1), + make_version(TRACE_ENTRY_VERSION), + // B runs next by timestamp. + make_timestamp(2001), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_instr(/*pc=*/200), + // B goes unscheduled with a timeout. + make_marker(TRACE_MARKER_TYPE_SYSCALL, 999), + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, SWITCH_TIMEOUT), + make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0), + // C will run at this point. + // Then, C blocks and our timeout lapses and we run again. + make_timestamp(4001), + make_instr(/*pc=*/201), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + // B makes A no longer unscheduled. + make_marker(TRACE_MARKER_TYPE_SYSCALL, 999), + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL_ARG_TIMEOUT, SWITCH_TIMEOUT), + make_marker(TRACE_MARKER_TYPE_SYSCALL_SCHEDULE, TID_A), + make_timestamp(4011), + make_instr(/*pc=*/202), + // B now goes unscheduled with no timeout. + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL, 999), + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + make_marker(TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE, 0), + // We won't get here until the hang workaround. + make_instr(/*pc=*/203), + make_instr(/*pc=*/204), + make_instr(/*pc=*/205), + make_instr(/*pc=*/206), + make_exit(TID_B), + }; + std::vector refs_C = { + make_thread(TID_C), + make_pid(1), + make_version(TRACE_ENTRY_VERSION), + // C goes 3rd by timestamp. + make_timestamp(3001), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + make_instr(/*pc=*/301), + make_instr(/*pc=*/302), + make_timestamp(3002), + make_marker(TRACE_MARKER_TYPE_CPU_ID, 0), + // C makes a long-latency blocking syscall, testing whether + // A is still unscheduled. + make_marker(TRACE_MARKER_TYPE_SYSCALL, 999), + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0), + make_timestamp(7002), + make_instr(/*pc=*/501), + make_exit(TID_C), + }; + { + // Test the defaults with direct switches enabled. + std::vector readers; + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_A)), + std::unique_ptr(new mock_reader_t()), TID_A); + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_B)), + std::unique_ptr(new mock_reader_t()), TID_B); + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_C)), + std::unique_ptr(new mock_reader_t()), TID_C); + // This looks like the schedule in test_unscheduled() up until "..A.." when + // we have a BLOCK_TIME_MAX-long idle period: + static const char *const CORE0_SCHED_STRING = + "...AA.........B........CC.....______________B......B....A.....______________" + "_________C._________________________________________________________________" + "____________________________________________________________________________" + "____________________________________________________________________________" + "____________________________________________________________________________" + "____________________________________________________________________________" + "____________________________________________________________________________" + "_________________________________________________________BBBB.A."; + + std::vector sched_inputs; + sched_inputs.emplace_back(std::move(readers)); + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_TIMESTAMPS, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/3); + sched_ops.quantum_duration = QUANTUM_DURATION; + // We use our mock's time==instruction count for a deterministic result. + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.blocking_switch_threshold = BLOCK_LATENCY; + sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_max = BLOCK_TIME_MAX; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + assert(sched_as_string[0] == CORE0_SCHED_STRING); + } + { + // Test disabling direct switches. + std::vector readers; + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_A)), + std::unique_ptr(new mock_reader_t()), TID_A); + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_B)), + std::unique_ptr(new mock_reader_t()), TID_B); + readers.emplace_back(std::unique_ptr(new mock_reader_t(refs_C)), + std::unique_ptr(new mock_reader_t()), TID_C); + // This result is identical to the one in test_unscheduled(). + static const char *const CORE0_SCHED_STRING = + "...AA.........B........CC.....__________________B......B....BBBB._____A....." + "__A._______C."; std::vector sched_inputs; sched_inputs.emplace_back(std::move(readers)); @@ -3859,6 +4251,7 @@ test_direct_switch() sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_max = BLOCK_TIME_MAX; sched_ops.honor_direct_switches = false; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != @@ -4382,6 +4775,8 @@ test_main(int argc, const char *argv[]) test_replay_as_traced_sort(); test_inactive(); test_direct_switch(); + test_unscheduled(); + test_unscheduled_fallback(); test_kernel_switch_sequences(); test_random_schedule(); test_record_scheduler();