From 5eb838b0cb01262efb99af45a5ca7f4decb47f72 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Wed, 11 Sep 2024 17:41:14 -0400 Subject: [PATCH 01/11] i#6938 sched migrate: All options in microseconds + single scale Refactors the scheduler's time-oriented options to become based on simulated microseconds rather than being unitless and having to all be separately scaled depending on the simulator's clock. Deprecates these scheduler_options_t fields, replacing them with new versions: + quantum_duration => quantum_duration_{instrs,us} + block_time_scale => block_time_multiplier + block_time_max => block_time_max_us Adds a new "time_units_per_us" which is the single place where a simulator sets the relationship between the value passed to "cur_time" in next_record() and simulated microseconds. The aforementioned fields are all compared to cur_time multiplied by time_units_per_us. This is a prelude to adding yet more time-based options for the forthcoming scheduler additions with migration thresholds and rebalance periods. Adds legacy support for binary compatibility. Recompiling will result in error messages prompting an update to the new fields. Adds a unit test of legacy support. Issue: #6938 --- clients/drcachesim/analyzer_multi.cpp | 12 +- clients/drcachesim/common/options.cpp | 31 +- clients/drcachesim/common/options.h | 1 + clients/drcachesim/scheduler/scheduler.cpp | 90 +++- clients/drcachesim/scheduler/scheduler.h | 115 +++-- .../drcachesim/tests/scheduler_launcher.cpp | 9 +- .../drcachesim/tests/scheduler_unit_tests.cpp | 431 ++++++++++++------ 7 files changed, 484 insertions(+), 205 deletions(-) diff --git a/clients/drcachesim/analyzer_multi.cpp b/clients/drcachesim/analyzer_multi.cpp index 845ca642c36..509954f0d9d 100644 --- a/clients/drcachesim/analyzer_multi.cpp +++ b/clients/drcachesim/analyzer_multi.cpp @@ -553,13 +553,17 @@ analyzer_multi_tmpl_t::init_dynamic_schedule() op_sched_order_time.get_value() ? 
sched_type_t::DEPENDENCY_TIMESTAMPS : sched_type_t::DEPENDENCY_IGNORE, sched_type_t::SCHEDULER_DEFAULTS, op_verbose.get_value()); - sched_ops.quantum_duration = op_sched_quantum.get_value(); - if (op_sched_time.get_value()) + sched_ops.time_units_per_us = op_sched_time_per_us.get_value(); + if (op_sched_time.get_value()) { sched_ops.quantum_unit = sched_type_t::QUANTUM_TIME; + sched_ops.quantum_duration_us = op_sched_quantum.get_value(); + } else { + sched_ops.quantum_duration_instrs = op_sched_quantum.get_value(); + } sched_ops.syscall_switch_threshold = op_sched_syscall_switch_us.get_value(); sched_ops.blocking_switch_threshold = op_sched_blocking_switch_us.get_value(); - sched_ops.block_time_scale = op_sched_block_scale.get_value(); - sched_ops.block_time_max = op_sched_block_max_us.get_value(); + sched_ops.block_time_multiplier = op_sched_block_scale.get_value(); + sched_ops.block_time_max_us = op_sched_block_max_us.get_value(); sched_ops.randomize_next_input = op_sched_randomize.get_value(); sched_ops.honor_direct_switches = !op_sched_disable_direct_switches.get_value(); #ifdef HAS_ZIP diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index 4ce17d5a8f3..9e888670d32 100644 --- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -887,13 +887,19 @@ droption_t op_core_serial( "How the scheduling is performed is controlled by a set " "of options with the prefix \"sched_\" along with -cores."); +droption_t + op_sched_time_per_us(DROPTION_SCOPE_ALL, "sched_time_per_us", 1000., + "Wall-clock microseconds per simulated microsecond", + "Wall-clock microseconds per simulated microsecond."); + droption_t // We pick 6 million to match 2 instructions per nanosecond with a 3ms quantum. op_sched_quantum(DROPTION_SCOPE_ALL, "sched_quantum", 6 * 1000 * 1000, "Scheduling quantum", "Applies to -core_sharded and -core_serial. 
" - "Scheduling quantum: in microseconds of wall-clock " - "time if -sched_time is set; otherwise in instructions."); + "Scheduling quantum in instructions, unless -sched_time is set in " + "which case this value is multiplied by -sched_time_per_us to " + "produce a quantum in wall-clock microseconds."); droption_t op_sched_time(DROPTION_SCOPE_ALL, "sched_time", false, @@ -922,14 +928,15 @@ droption_t op_sched_blocking_switch_us( "-core_serial. "); droption_t op_sched_block_scale( - DROPTION_SCOPE_ALL, "sched_block_scale", 10., "Input block time scale factor", - "The scale applied to the microsecond latency of blocking system calls. A higher " - "value here results in blocking syscalls keeping inputs unscheduled for longer. " - "This should roughly equal the slowdown of instruction record processing versus the " - "original (untraced) application execution."); - -// We have a max to avoid outlier latencies that are already a second or more from -// scaling up to tens of minutes. We assume a cap is representative as the outliers + DROPTION_SCOPE_ALL, "sched_block_scale", 0.01, "Input block time scale factor", + "This value is multiplied by -sched_time_per_us to produce a scale which is applied " + "to the as-traced microsecond latency of blocking system calls to produce the block " + "time during simulation. A higher value here results in blocking syscalls keeping " + "inputs unscheduled for longer."); + +// We have a max to avoid outlier latencies from scaling up to extreme times. There is +// some inflation in the as-traced latencies and some can be inflated more than others. +// We assume a cap is representative as the outliers // likely were not part of key dependence chains. Without a cap the other threads all // finish and the simulation waits for tens of minutes further for a couple of outliers. 
// The cap remains a flag and not a constant as different length traces and different @@ -937,8 +944,8 @@ droption_t op_sched_block_scale( // to achieve desired cpu usage targets. The default value was selected to avoid unduly // long idle times with local analyzers; it may need to be increased with more // heavyweight analyzers/simulators. -droption_t op_sched_block_max_us(DROPTION_SCOPE_ALL, "sched_block_max_us", - 250000, +// TODO i#6959: Once we have -exit_if_all_unscheduled raise this. +droption_t op_sched_block_max_us(DROPTION_SCOPE_ALL, "sched_block_max_us", 250, "Maximum blocked input time, in microseconds", "The maximum blocked time, after scaling with " "-sched_block_scale."); diff --git a/clients/drcachesim/common/options.h b/clients/drcachesim/common/options.h index 572524ea888..664594e57f0 100644 --- a/clients/drcachesim/common/options.h +++ b/clients/drcachesim/common/options.h @@ -199,6 +199,7 @@ extern dynamorio::droption::droption_t op_kernel_trace_buffer_size_shift; #endif extern dynamorio::droption::droption_t op_core_sharded; extern dynamorio::droption::droption_t op_core_serial; +extern dynamorio::droption::droption_t op_sched_time_per_us; extern dynamorio::droption::droption_t op_sched_quantum; extern dynamorio::droption::droption_t op_sched_time; extern dynamorio::droption::droption_t op_sched_order_time; diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 294bd488287..b075ce627d4 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -842,6 +842,11 @@ scheduler_tmpl_t::init( } } + // Legacy field support. 
+ sched_type_t::scheduler_status_t res = legacy_field_support(); + if (res != sched_type_t::STATUS_SUCCESS) + return res; + if (TESTANY(sched_type_t::SCHEDULER_USE_SINGLE_INPUT_ORDINALS, options_.flags) && inputs_.size() == 1 && output_count == 1) { options_.flags = static_cast( @@ -881,13 +886,67 @@ scheduler_tmpl_t::init( VPRINT(this, 1, "%zu inputs\n", inputs_.size()); live_input_count_.store(static_cast(inputs_.size()), std::memory_order_release); - sched_type_t::scheduler_status_t res = read_switch_sequences(); + res = read_switch_sequences(); if (res != sched_type_t::STATUS_SUCCESS) return STATUS_ERROR_INVALID_PARAMETER; return set_initial_schedule(workload2inputs); } +template +typename scheduler_tmpl_t::scheduler_status_t +scheduler_tmpl_t::legacy_field_support() +{ + if (options_.quantum_duration > 0) { + if (options_.struct_size > offsetof(scheduler_options_t, quantum_duration_us)) { + error_string_ = "quantum_duration is deprecated; use quantum_duration_us and " + "time_units_per_us or quantum_duration_instrs"; + return STATUS_ERROR_INVALID_PARAMETER; + } + if (options_.quantum_unit == QUANTUM_INSTRUCTIONS) { + options_.quantum_duration_instrs = options_.quantum_duration; + } else { + options_.quantum_duration_us = + static_cast(options_.quantum_duration) / + options_.time_units_per_us; + VPRINT(this, 2, + "Legacy support: setting quantum_duration_us to %" PRIu64 "\n", + options_.quantum_duration_us); + } + } + if (options_.quantum_duration_us == 0) { + error_string_ = "quantum_duration_us must be > 0"; + return STATUS_ERROR_INVALID_PARAMETER; + } + if (options_.block_time_scale > 0) { + if (options_.struct_size > offsetof(scheduler_options_t, block_time_multiplier)) { + error_string_ = "block_time_scale is deprecated; use block_time_multiplier " + "and time_units_per_us"; + return STATUS_ERROR_INVALID_PARAMETER; + } + options_.block_time_multiplier = + static_cast(options_.block_time_scale) / options_.time_units_per_us; + VPRINT(this, 2, "Legacy 
support: setting block_time_multiplier to %6.3f\n", + options_.block_time_multiplier); + } + if (options_.block_time_max > 0) { + if (options_.struct_size > offsetof(scheduler_options_t, block_time_max_us)) { + error_string_ = "block_time_max is deprecated; use block_time_max_us " + "and time_units_per_us"; + return STATUS_ERROR_INVALID_PARAMETER; + } + options_.block_time_max_us = + static_cast(options_.block_time_max) / options_.time_units_per_us; + VPRINT(this, 2, "Legacy support: setting block_time_max_us to %" PRIu64 "\n", + options_.block_time_max_us); + } + if (options_.block_time_max_us == 0) { + error_string_ = "block_time_max_us must be > 0"; + return STATUS_ERROR_INVALID_PARAMETER; + } + return STATUS_SUCCESS; +} + template typename scheduler_tmpl_t::scheduler_status_t scheduler_tmpl_t::set_initial_schedule( @@ -2552,14 +2611,14 @@ uint64_t scheduler_tmpl_t::scale_blocked_time(uint64_t initial_time) const { uint64_t scaled = static_cast(static_cast(initial_time) * - options_.block_time_scale); - if (scaled > options_.block_time_max) { + options_.block_time_multiplier); + if (scaled > options_.block_time_max_us) { // We have a max to avoid outlier latencies that are already a second or // more from scaling up to tens of minutes. We assume a cap is representative // as the outliers likely were not part of key dependence chains. Without a // cap the other threads all finish and the simulation waits for tens of // minutes further for a couple of outliers. - scaled = options_.block_time_max; + scaled = options_.block_time_max_us; } return scaled; } @@ -2587,11 +2646,11 @@ scheduler_tmpl_t::syscall_incurs_switch(input_info_t *in : options_.syscall_switch_threshold; blocked_time = scale_blocked_time(latency); VPRINT(this, 3, - "input %d %ssyscall latency %" PRIu64 " * scale %5.1f => blocked time %" PRIu64 + "input %d %ssyscall latency %" PRIu64 " * scale %6.3f => blocked time %" PRIu64 "\n", input->index, input->processing_maybe_blocking_syscall ? 
"maybe-blocking " : "", latency, - options_.block_time_scale, blocked_time); + options_.block_time_multiplier, blocked_time); return latency >= threshold; } @@ -3279,6 +3338,8 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, // It's more efficient for QUANTUM_INSTRUCTIONS to get the time here instead of // in get_output_time(). This also makes the two more similarly behaved with // respect to blocking system calls. + // TODO i#6971: Use INSTRS_PER_US to replace .cur_time completely + // with a counter-based time, weighted appropriately for STATUS_IDLE. cur_time = get_time_micros(); } outputs_[output].cur_time = cur_time; // Invalid values are checked below. @@ -3492,7 +3553,7 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, record_type_is_instr_boundary(record, outputs_[output].last_record) && !outputs_[output].in_kernel_code) { ++input->instrs_in_quantum; - if (input->instrs_in_quantum > options_.quantum_duration) { + if (input->instrs_in_quantum > options_.quantum_duration_instrs) { // We again prefer to switch to another input even if the current // input has the oldest timestamp, prioritizing context switches // over timestamp ordering. @@ -3516,7 +3577,13 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, input->time_spent_in_quantum += cur_time - input->prev_time_in_quantum; prev_time_in_quantum = input->prev_time_in_quantum; input->prev_time_in_quantum = cur_time; - if (input->time_spent_in_quantum >= options_.quantum_duration && + double elapsed_micros = + input->time_spent_in_quantum * options_.time_units_per_us; + VPRINT(this, 4, + "next_record[%d]: input %d elapsed %6.1f vs quantum %" PRIu64 "\n", + output, input->index, elapsed_micros, + options_.quantum_duration_us); // NOCHECK + if (elapsed_micros >= options_.quantum_duration_us && // We only switch on instruction boundaries. We could possibly switch // in between (e.g., scatter/gather long sequence of reads/writes) by // setting input->switching_pre_instruction. 
@@ -3759,13 +3826,14 @@ scheduler_tmpl_t::eof_or_idle(output_ordinal_t output, outputs_[output].wait_start_time = get_output_time(output); } else { uint64_t now = get_output_time(output); - if (now - outputs_[output].wait_start_time > - options_.block_time_max) { + double elapsed_micros = (now - outputs_[output].wait_start_time) * + options_.time_units_per_us; + if (elapsed_micros > options_.block_time_max_us) { // XXX i#6822: We may want some other options here for what to // do. We could release just one input at a time, which would be // the same scheduling order (as we have FIFO in // unscheduled_priority_) but may take a long time at - // block_time_max each; we could declare we're done and just + // block_time_max_us each; we could declare we're done and just // exit, maybe under a flag or if we could see what % of total // records we've processed. VPRINT(this, 1, diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index 4184ffa1a86..a4dea1f1024 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -595,12 +595,15 @@ template class scheduler_tmpl_t { /** The unit of the schedule time quantum. */ quantum_unit_t quantum_unit = QUANTUM_INSTRUCTIONS; /** - * The scheduling quantum duration for preemption. The units are - * specified by - * #dynamorio::drmemtrace::scheduler_tmpl_t::scheduler_options_t::quantum_unit. + * Deprecated: use #quantum_duration_us and #time_units_per_us for #QUANTUM_TIME + * or #quantum_duration_instrs for #QUANTUM_INSTRUCTIONS instead. It + * is an error to set this to a non-zero value when #struct_size includes + * #quantum_duration_us. When #struct_size does not include + * #quantum_duration_us and this value is non-zero, the value in + * #quantum_duration_us is replaced with this value divided by the default + * value of #time_units_per_us. */ - // We pick 6 million to match 2 instructions per nanosecond with a 3ms quantum. 
- uint64_t quantum_duration = 6 * 1000 * 1000; + uint64_t quantum_duration = 0; /** * If > 0, diagnostic messages are printed to stderr. Higher values produce * more frequent diagnostics. @@ -643,37 +646,21 @@ template class scheduler_tmpl_t { */ uint64_t blocking_switch_threshold = 100; /** - * Controls the amount of time inputs are considered blocked at a syscall whose - * latency exceeds #syscall_switch_threshold or #blocking_switch_threshold. The - * syscall latency (in microseconds) is multiplied by this field to produce the - * blocked time. For #QUANTUM_TIME, that blocked time in the units reported by - * the time parameter to next_record() must pass before the input is no longer - * considered blocked. Since the system call latencies are in microseconds, this - * #block_time_scale should be set to the number of next_record() time units in - * one simulated microsecond. For #QUANTUM_INSTRUCTIONS, the blocked time in - * wall-clock microseconds must pass before the input is actually selected - * (wall-clock time is used as there is no reasonable alternative with no other - * uniform notion of time); thus, the #block_time_scale value here should equal - * the slowdown of the instruction record processing versus the original - * (untraced) application execution. The blocked time is clamped to a maximum - * value controlled by #block_time_max. - * - * The default value is meant to be reasonable for simple analyzers. It may - * result in too much or too little idle time depending on the analyzer or - * simulator and its speed; it is meant to be tuned and modified. - */ - double block_time_scale = 10.; - /** - * The maximum time, in the units explained by #block_time_scale (either - * #QUANTUM_TIME simulator time or wall-clock microseconds for - * #QUANTUM_INSTRUCTIONS), for an input to be considered blocked for any one - * system call. This is applied after multiplying by #block_time_scale. 
- * This is also used as a fallback to avoid hangs when there are no scheduled - * inputs: if the only inputs left are "unscheduled" (see - * #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE), after this amount of time those - * inputs are all re-scheduled. - */ - uint64_t block_time_max = 250000; + * Deprecated: use #block_time_multiplier instead. It is an error to set + * this to a non-zero value when #struct_size includes #block_time_multiplier. + * When #struct_size does not include #block_time_multiplier and this value is + * non-zero, the value in #block_time_multiplier is replaced with this value + * divided by the default value of #time_units_per_us. + */ + double block_time_scale = 0.; + /** + * Deprecated: use #block_time_max_us and #time_units_per_us instead. It is + * an error to set this to a non-zero value when #struct_size includes + * #block_time_max_us. When #struct_size does not include #block_time_max_us + * and this value is non-zero, the value in #block_time_max_us is replaced + * with this value divided by the default value of #time_units_per_us. + */ + uint64_t block_time_max = 0; // XXX: Should we share the file-to-reader code currently in the scheduler // with the analyzer and only then need reader interfaces and not pass paths // to the scheduler? @@ -740,6 +727,59 @@ template class scheduler_tmpl_t { * (these markers remain: they are not removed from the trace). */ bool honor_direct_switches = true; + /** + * How many time units for the "cur_time" value passed to next_record() are + * equivalent to one simulated microsecond. E.g., if the time units are in + * picoseconds, pass one million here. This is used to scale all of the + * other parameters that are in microseconds (they all end in "_us": e.g., + * #quantum_duration_us) so that they operate on the right time scale for the + * passed-in simulator time. 
+ */ + double time_units_per_us = 1000.; + /** + * The scheduling quantum duration for preemption, in simulated microseconds, + * for #QUANTUM_TIME. This value is multiplied by #time_units_per_us to + * produce a value that is compared to the "cur_time" parameter to + * next_record() to determine when to force a quantum switch. + */ + uint64_t quantum_duration_us = 5000; + /** + * The scheduling quantum duration for preemption, in instruction count, + * for #QUANTUM_INSTRUCTIONS. The time passed to next_record() is ignored + * for purposes of quantum preempts. + */ + // We pick 6 million to match 2 instructions per nanosecond with a 3ms quantum. + uint64_t quantum_duration_instrs = 6 * 1000 * 1000; + /** + * Controls the amount of time inputs are considered blocked at a syscall + * whose as-traced latency (recorded in timestamp records in the trace) + * exceeds #syscall_switch_threshold or #blocking_switch_threshold. The + * as-traced syscall latency (which is in traced microseconds) is multiplied + * by this field to produce the blocked time in simulated microseconds. Once + * that many simulated microseconds have passed according to the "cur_time" + * value passed to next_record() (multiplied by #time_units_per_us), the + * input will no longer be considered blocked. The blocked time is clamped + * to a maximum value controlled by #block_time_max_us. + * + * While there is no direct overhead during tracing, indirect overhead + * does result in some inflation of recorded system call latencies. + * Thus, a value below 1 is typically used here. This value, in combination + * with #block_time_max_us, can be tuned to achieve a desired idle rate. + * The default value errs on the side of less idle time. + */ + double block_time_multiplier = 0.01; + /** + * The maximum time in microseconds for an input to be considered blocked for + * any one system call. 
This value is multiplied by #time_units_per_us to + * produce a value that is compared to the "cur_time" parameter to + * next_record(). If any block time (see #block_time_multiplier) exceeds + * this value, it is capped to this value. This value is also used as a + * fallback to avoid hangs when there are no scheduled inputs: if the only + * inputs left are "unscheduled" (see #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE), + * after this amount of time those inputs are all re-scheduled. + */ + // TODO i#6959: Once we have -exit_if_all_unscheduled raise this. + uint64_t block_time_max_us = 250; }; /** @@ -1532,6 +1572,9 @@ template class scheduler_tmpl_t { process_next_initial_record(input_info_t &input, RecordType record, bool &found_filetype, bool &found_timestamp); + scheduler_status_t + legacy_field_support(); + // Opens readers for each file in 'path', subject to the constraints in // 'reader_info'. 'path' may be a directory. // Updates the ti2dinput, unfiltered_tids, and input_count fields of 'reader_info'. diff --git a/clients/drcachesim/tests/scheduler_launcher.cpp b/clients/drcachesim/tests/scheduler_launcher.cpp index 16554a9daa9..1944029c96d 100644 --- a/clients/drcachesim/tests/scheduler_launcher.cpp +++ b/clients/drcachesim/tests/scheduler_launcher.cpp @@ -307,10 +307,13 @@ _tmain(int argc, const TCHAR *targv[]) op_honor_stamps.get_value() ? 
scheduler_t::DEPENDENCY_TIMESTAMPS : scheduler_t::DEPENDENCY_IGNORE, scheduler_t::SCHEDULER_DEFAULTS, op_verbose.get_value()); - sched_ops.quantum_duration = op_sched_quantum.get_value(); - if (op_sched_time.get_value()) + if (op_sched_time.get_value()) { sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; - sched_ops.block_time_scale = op_block_time_scale.get_value(); + sched_ops.quantum_duration_us = op_sched_quantum.get_value(); + } else { + sched_ops.quantum_duration_instrs = op_sched_quantum.get_value(); + } + sched_ops.block_time_multiplier = op_block_time_scale.get_value(); #ifdef HAS_ZIP std::unique_ptr record_zip; std::unique_ptr replay_zip; diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index 26e4310d679..b8bf367b581 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -160,6 +160,101 @@ verify_scheduler_stats(scheduler_t::stream_t *stream, int64_t switch_input_to_in migrations); } +// Returns a string with one char per input. +// Assumes the input threads are all tid_base plus an offset < 26. +// When send_time=true, typically time_units_per_us should be set to 1 to then have +// instruction count for all timing measures. +static std::vector +run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t tid_base, + bool send_time = false, bool print_markers = true) +{ + // Walk the outputs in lockstep for crude but deterministic concurrency. + std::vector outputs(num_outputs, nullptr); + std::vector eof(num_outputs, false); + for (int i = 0; i < num_outputs; i++) + outputs[i] = scheduler.get_stream(i); + int num_eof = 0; + int64_t meta_records = 0; + // Record the threads, one char each. 
+ std::vector sched_as_string(num_outputs); + static constexpr char THREAD_LETTER_START = 'A'; + static constexpr char WAIT_SYMBOL = '-'; + static constexpr char IDLE_SYMBOL = '_'; + static constexpr char NON_INSTR_SYMBOL = '.'; + while (num_eof < num_outputs) { + for (int i = 0; i < num_outputs; i++) { + if (eof[i]) + continue; + memref_t memref; + scheduler_t::stream_status_t status; + if (send_time) { + // We assume IPC=1 and so send the instruction count (+1 to avoid an + // invalid time of 0) which allows apples-to-apples comparisons with + // instruction quanta. This is a per-output time which technically + // violates the globally-increasing requirement, so this will not work + // perfectly with i/o waits, but should work fine for basic tests. + // We add the wait and idle records to make progress with idle time. + status = outputs[i]->next_record( + memref, outputs[i]->get_instruction_ordinal() + 1 + meta_records); + } else { + status = outputs[i]->next_record(memref); + } + if (status == scheduler_t::STATUS_EOF) { + ++num_eof; + eof[i] = true; + continue; + } + if (status == scheduler_t::STATUS_WAIT) { + sched_as_string[i] += WAIT_SYMBOL; + ++meta_records; + continue; + } + if (status == scheduler_t::STATUS_IDLE) { + sched_as_string[i] += IDLE_SYMBOL; + ++meta_records; + continue; + } + assert(status == scheduler_t::STATUS_OK); + if (type_is_instr(memref.instr.type)) { + sched_as_string[i] += + THREAD_LETTER_START + static_cast(memref.instr.tid - tid_base); + } else { + // While this makes the string longer, it is just too confusing + // with the same letter seemingly on 2 cores at once without these + // fillers to line everything up in time. + sched_as_string[i] += NON_INSTR_SYMBOL; + } + assert(outputs[i]->get_shard_index() == + outputs[i]->get_output_stream_ordinal()); + } + } + // Ensure we never see the same output on multiple cores in the same timestep. 
+ size_t max_size = 0; + for (int i = 0; i < num_outputs; ++i) + max_size = std::max(max_size, sched_as_string[i].size()); + for (int step = 0; step < static_cast(max_size); ++step) { + std::set inputs; + for (int out = 0; out < num_outputs; ++out) { + if (static_cast(sched_as_string[out].size()) <= step) + continue; + if (sched_as_string[out][step] < 'A' || sched_as_string[out][step] > 'Z') + continue; + assert(inputs.find(sched_as_string[out][step]) == inputs.end()); + inputs.insert(sched_as_string[out][step]); + } + } + if (!print_markers) { + // We kept the dots internally for our same-timestep check above. + for (int i = 0; i < num_outputs; ++i) { + sched_as_string[i].erase(std::remove(sched_as_string[i].begin(), + sched_as_string[i].end(), + NON_INSTR_SYMBOL), + sched_as_string[i].end()); + } + } + return sched_as_string; +} + static void test_serial() { @@ -294,7 +389,7 @@ test_parallel() } static void -test_param_checks() +test_invalid_regions() { std::vector readers; readers.emplace_back(std::unique_ptr(new mock_reader_t()), @@ -333,6 +428,143 @@ test_param_checks() scheduler_t::STATUS_ERROR_INVALID_PARAMETER); } +static void +test_legacy_fields() +{ + std::cerr << "\n----------------\nTesting legacy fields\n"; + static constexpr int NUM_INPUTS = 7; + static constexpr int NUM_OUTPUTS = 2; + static constexpr int NUM_INSTRS = 9; + static constexpr int QUANTUM_DURATION = 3; + // We do not want to block for very long. 
+ static constexpr double BLOCK_LATENCY = 200; + static constexpr double BLOCK_SCALE = 0.01; + static constexpr double BLOCK_MAX = 50; + static constexpr memref_tid_t TID_BASE = 100; + static constexpr uint64_t START_TIME = 20; + std::vector inputs[NUM_INPUTS]; + for (int i = 0; i < NUM_INPUTS; i++) { + memref_tid_t tid = TID_BASE + i; + inputs[i].push_back(make_thread(tid)); + inputs[i].push_back(make_pid(1)); + inputs[i].push_back(make_version(TRACE_ENTRY_VERSION)); + inputs[i].push_back(make_timestamp(START_TIME)); // All the same time priority. + for (int j = 0; j < NUM_INSTRS; j++) { + inputs[i].push_back(make_instr(42 + j * 4)); + // Including blocking syscalls. + if ((i == 0 || i == 1) && j == 1) { + inputs[i].push_back(make_timestamp(START_TIME * 2)); + inputs[i].push_back(make_marker(TRACE_MARKER_TYPE_SYSCALL, 42)); + inputs[i].push_back( + make_marker(TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL, 0)); + inputs[i].push_back(make_timestamp(START_TIME * 2 + BLOCK_LATENCY)); + } + } + inputs[i].push_back(make_exit(tid)); + } + { + // Test invalid quantum. + std::vector sched_inputs; + std::vector readers; + readers.emplace_back(std::unique_ptr(new mock_reader_t(inputs[0])), + std::unique_ptr(new mock_reader_t()), + TID_BASE); + sched_inputs.emplace_back(std::move(readers)); + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS); + sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.quantum_duration = QUANTUM_DURATION; + scheduler_t scheduler; + assert(scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) == + scheduler_t::STATUS_ERROR_INVALID_PARAMETER); + } + { + // Test invalid block scale. 
+ std::vector sched_inputs; + std::vector readers; + readers.emplace_back(std::unique_ptr(new mock_reader_t(inputs[0])), + std::unique_ptr(new mock_reader_t()), + TID_BASE); + sched_inputs.emplace_back(std::move(readers)); + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS); + sched_ops.block_time_scale = BLOCK_SCALE; + scheduler_t scheduler; + assert(scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) == + scheduler_t::STATUS_ERROR_INVALID_PARAMETER); + } + { + // Test invalid block max. + std::vector sched_inputs; + std::vector readers; + readers.emplace_back(std::unique_ptr(new mock_reader_t(inputs[0])), + std::unique_ptr(new mock_reader_t()), + TID_BASE); + sched_inputs.emplace_back(std::move(readers)); + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS); + sched_ops.block_time_max = BLOCK_MAX; + scheduler_t scheduler; + assert(scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) == + scheduler_t::STATUS_ERROR_INVALID_PARAMETER); + } + { + // Test valid legacy fields. + std::vector sched_inputs; + for (int i = 0; i < NUM_INPUTS; i++) { + std::vector readers; + readers.emplace_back( + std::unique_ptr(new mock_reader_t(inputs[i])), + std::unique_ptr(new mock_reader_t()), TID_BASE + i); + sched_inputs.emplace_back(std::move(readers)); + } + scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT, + scheduler_t::DEPENDENCY_IGNORE, + scheduler_t::SCHEDULER_DEFAULTS, + /*verbosity=*/4); + // Simulate binary compatibility with a legacy struct. + sched_ops.struct_size = + offsetof(scheduler_t::scheduler_options_t, time_units_per_us); + sched_ops.quantum_duration_us = QUANTUM_DURATION; + sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_max = BLOCK_MAX; + // To do our test we use instrs-as-time for deterministic block times. 
+ sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; + scheduler_t scheduler; + if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != + scheduler_t::STATUS_SUCCESS) + assert(false); + std::vector sched_as_string = + run_lockstep_simulation(scheduler, NUM_OUTPUTS, TID_BASE, /*send_time=*/true); + // Hardcoding here for the 2 outputs and 7 inputs. + // We expect 3 letter sequences (our quantum) alternating every-other as each + // core alternates. The dots are markers and thread exits. + // A and B have a voluntary switch after their 1st 2 letters, but we expect + // the usage to persist to their next scheduling which should only have + // a single letter. + static const char *const CORE0_SCHED_STRING = + "..AA......CCC..EEE..GGGDDDFFFBBBCCC.EEE.AAA.GGG."; + static const char *const CORE1_SCHED_STRING = + "..BB......DDD..FFFABCCCEEEAAAGGGDDD.FFF.BBB.____"; + for (int i = 0; i < NUM_OUTPUTS; i++) { + std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n"; + } + assert(sched_as_string[0] == CORE0_SCHED_STRING); + assert(sched_as_string[1] == CORE1_SCHED_STRING); + } +} + +static void +test_param_checks() +{ + test_invalid_regions(); + test_legacy_fields(); +} + // Tests regions without timestamps for a simple, direct test. static void test_regions_bare() @@ -947,99 +1179,6 @@ test_real_file_queries_and_filters(const char *testdir) #endif } -// Returns a string with one char per input. -// Assumes the input threads are all tid_base plus an offset < 26. -static std::vector -run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t tid_base, - bool send_time = false, bool print_markers = true) -{ - // Walk the outputs in lockstep for crude but deterministic concurrency. 
- std::vector outputs(num_outputs, nullptr); - std::vector eof(num_outputs, false); - for (int i = 0; i < num_outputs; i++) - outputs[i] = scheduler.get_stream(i); - int num_eof = 0; - int64_t meta_records = 0; - // Record the threads, one char each. - std::vector sched_as_string(num_outputs); - static constexpr char THREAD_LETTER_START = 'A'; - static constexpr char WAIT_SYMBOL = '-'; - static constexpr char IDLE_SYMBOL = '_'; - static constexpr char NON_INSTR_SYMBOL = '.'; - while (num_eof < num_outputs) { - for (int i = 0; i < num_outputs; i++) { - if (eof[i]) - continue; - memref_t memref; - scheduler_t::stream_status_t status; - if (send_time) { - // We assume IPC=1 and so send the instruction count (+1 to avoid an - // invalid time of 0) which allows apples-to-apples comparisons with - // instruction quanta. This is a per-output time which technically - // violates the globally-increasing requirement, so this will not work - // perfectly with i/o waits, but should work fine for basic tests. - // We add the wait and idle records to make progress with idle time. - status = outputs[i]->next_record( - memref, outputs[i]->get_instruction_ordinal() + 1 + meta_records); - } else { - status = outputs[i]->next_record(memref); - } - if (status == scheduler_t::STATUS_EOF) { - ++num_eof; - eof[i] = true; - continue; - } - if (status == scheduler_t::STATUS_WAIT) { - sched_as_string[i] += WAIT_SYMBOL; - ++meta_records; - continue; - } - if (status == scheduler_t::STATUS_IDLE) { - sched_as_string[i] += IDLE_SYMBOL; - ++meta_records; - continue; - } - assert(status == scheduler_t::STATUS_OK); - if (type_is_instr(memref.instr.type)) { - sched_as_string[i] += - THREAD_LETTER_START + static_cast(memref.instr.tid - tid_base); - } else { - // While this makes the string longer, it is just too confusing - // with the same letter seemingly on 2 cores at once without these - // fillers to line everything up in time. 
- sched_as_string[i] += NON_INSTR_SYMBOL; - } - assert(outputs[i]->get_shard_index() == - outputs[i]->get_output_stream_ordinal()); - } - } - // Ensure we never see the same output on multiple cores in the same timestep. - size_t max_size = 0; - for (int i = 0; i < num_outputs; ++i) - max_size = std::max(max_size, sched_as_string[i].size()); - for (int step = 0; step < static_cast(max_size); ++step) { - std::set inputs; - for (int out = 0; out < num_outputs; ++out) { - if (static_cast(sched_as_string[out].size()) <= step) - continue; - if (sched_as_string[out][step] < 'A' || sched_as_string[out][step] > 'Z') - continue; - assert(inputs.find(sched_as_string[out][step]) == inputs.end()); - inputs.insert(sched_as_string[out][step]); - } - } - if (!print_markers) { - // We kept the dots internally for our same-timestep check above. - for (int i = 0; i < num_outputs; ++i) { - sched_as_string[i].erase(std::remove(sched_as_string[i].begin(), - sched_as_string[i].end(), - NON_INSTR_SYMBOL), - sched_as_string[i].end()); - } - } - return sched_as_string; -} - static void test_synthetic() { @@ -1095,8 +1234,8 @@ test_synthetic() scheduler_t::DEPENDENCY_IGNORE, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = QUANTUM_DURATION; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.quantum_duration_instrs = QUANTUM_DURATION; + sched_ops.block_time_multiplier = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1160,8 +1299,9 @@ test_synthetic() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; - sched_ops.quantum_duration = QUANTUM_DURATION; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.time_units_per_us = 1.; + sched_ops.quantum_duration_us = QUANTUM_DURATION; + sched_ops.block_time_multiplier = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, 
std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1223,9 +1363,10 @@ test_synthetic_time_quanta() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/4); sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; - sched_ops.quantum_duration = 3; + sched_ops.time_units_per_us = 1.; + sched_ops.quantum_duration_us = 3; // Ensure it waits 10 steps. - sched_ops.block_time_scale = 10. / (POST_BLOCK_TIME - PRE_BLOCK_TIME); + sched_ops.block_time_multiplier = 10. / (POST_BLOCK_TIME - PRE_BLOCK_TIME); zipfile_ostream_t outfile(record_fname); sched_ops.schedule_record_ostream = &outfile; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != @@ -1408,7 +1549,7 @@ test_synthetic_with_timestamps() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = 3; + sched_ops.quantum_duration_instrs = 3; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1506,7 +1647,7 @@ test_synthetic_with_priorities() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = 3; + sched_ops.quantum_duration_instrs = 3; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1592,7 +1733,7 @@ test_synthetic_with_bindings_time(bool time_deps) time_deps ? 
scheduler_t::DEPENDENCY_TIMESTAMPS : scheduler_t::DEPENDENCY_IGNORE, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = 3; + sched_ops.quantum_duration_instrs = 3; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1646,7 +1787,7 @@ test_synthetic_with_bindings_more_out() scheduler_t::DEPENDENCY_IGNORE, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = 3; + sched_ops.quantum_duration_instrs = 3; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1716,7 +1857,7 @@ test_synthetic_with_bindings_weighted() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = 3; + sched_ops.quantum_duration_instrs = 3; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1828,11 +1969,12 @@ test_synthetic_with_syscalls_multiple() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = 3; + sched_ops.quantum_duration_us = 3; // We use our mock's time==instruction count for a deterministic result. sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_multiplier = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -1936,11 +2078,12 @@ test_synthetic_with_syscalls_single() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/4); - sched_ops.quantum_duration = 3; + sched_ops.quantum_duration_us = 3; // We use our mock's time==instruction count for a deterministic result. 
sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_multiplier = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -2141,7 +2284,7 @@ test_synthetic_with_syscalls_latencies() // We use a mock time for a deterministic result. sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_multiplier = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, 1, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -2247,11 +2390,12 @@ test_synthetic_with_syscalls_idle() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = 3; + sched_ops.quantum_duration_us = 3; // We use a mock time for a deterministic result. 
sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_multiplier = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -2312,7 +2456,7 @@ test_synthetic_multi_threaded(const char *testdir) /*verbosity=*/2); static constexpr int NUM_OUTPUTS = 4; static constexpr int QUANTUM_DURATION = 2000; - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_instrs = QUANTUM_DURATION; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) assert(false); @@ -2544,7 +2688,7 @@ test_replay() scheduler_t::DEPENDENCY_IGNORE, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = QUANTUM_INSTRS; + sched_ops.quantum_duration_instrs = QUANTUM_INSTRS; zipfile_ostream_t outfile(record_fname); sched_ops.schedule_record_ostream = &outfile; @@ -2672,7 +2816,7 @@ test_replay_multi_threaded(const char *testdir) zipfile_ostream_t outfile(record_fname); sched_ops.schedule_record_ostream = &outfile; static constexpr int QUANTUM_DURATION = 2000; - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_instrs = QUANTUM_DURATION; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) assert(false); @@ -3300,7 +3444,7 @@ test_replay_limit() /*verbosity=*/2); zipfile_ostream_t outfile(record_fname); sched_ops.schedule_record_ostream = &outfile; - sched_ops.quantum_duration = NUM_INSTRS / 10; + sched_ops.quantum_duration_instrs = NUM_INSTRS / 10; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) assert(false); @@ -4029,7 +4173,7 @@ test_inactive() scheduler_t::DEPENDENCY_IGNORE, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/4); - sched_ops.quantum_duration = 
2; + sched_ops.quantum_duration_instrs = 2; zipfile_ostream_t outfile(record_fname); sched_ops.schedule_record_ostream = &outfile; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != @@ -4253,11 +4397,12 @@ test_direct_switch() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_us = QUANTUM_DURATION; // We use our mock's time==instruction count for a deterministic result. sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_multiplier = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -4293,11 +4438,12 @@ test_direct_switch() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_us = QUANTUM_DURATION; // We use our mock's time==instruction count for a deterministic result. sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_multiplier = BLOCK_SCALE; sched_ops.honor_direct_switches = false; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != @@ -4460,11 +4606,12 @@ test_unscheduled() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_us = QUANTUM_DURATION; // We use our mock's time==instruction count for a deterministic result. 
sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_multiplier = BLOCK_SCALE; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -4497,11 +4644,12 @@ test_unscheduled() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_us = QUANTUM_DURATION; // We use our mock's time==instruction count for a deterministic result. sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; + sched_ops.block_time_multiplier = BLOCK_SCALE; sched_ops.honor_direct_switches = false; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != @@ -4644,12 +4792,13 @@ test_unscheduled_fallback() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_us = QUANTUM_DURATION; // We use our mock's time==instruction count for a deterministic result. 
sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; - sched_ops.block_time_max = BLOCK_TIME_MAX; + sched_ops.block_time_multiplier = BLOCK_SCALE; + sched_ops.block_time_max_us = BLOCK_TIME_MAX; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -4681,12 +4830,13 @@ test_unscheduled_fallback() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_us = QUANTUM_DURATION; // We use our mock's time==instruction count for a deterministic result. sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; - sched_ops.block_time_max = BLOCK_TIME_MAX; + sched_ops.block_time_multiplier = BLOCK_SCALE; + sched_ops.block_time_max_us = BLOCK_TIME_MAX; sched_ops.honor_direct_switches = false; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != @@ -4768,9 +4918,10 @@ test_unscheduled_initially() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; - sched_ops.block_time_max = BLOCK_TIME_MAX; + sched_ops.block_time_multiplier = BLOCK_SCALE; + sched_ops.block_time_max_us = BLOCK_TIME_MAX; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -4801,9 +4952,10 @@ test_unscheduled_initially() /*verbosity=*/3); // We use our mock's time==instruction count for a deterministic result. 
sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; - sched_ops.block_time_scale = BLOCK_SCALE; - sched_ops.block_time_max = BLOCK_TIME_MAX; + sched_ops.block_time_multiplier = BLOCK_SCALE; + sched_ops.block_time_max_us = BLOCK_TIME_MAX; sched_ops.honor_direct_switches = false; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != @@ -5014,7 +5166,7 @@ test_kernel_switch_sequences() scheduler_t::DEPENDENCY_TIMESTAMPS, scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/4); - sched_ops.quantum_duration = INSTR_QUANTUM; + sched_ops.quantum_duration_instrs = INSTR_QUANTUM; sched_ops.kernel_switch_reader = std::move(switch_reader); sched_ops.kernel_switch_reader_end = std::move(switch_reader_end); scheduler_t scheduler; @@ -5242,7 +5394,7 @@ test_random_schedule() scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/3); sched_ops.randomize_next_input = true; - sched_ops.quantum_duration = QUANTUM_DURATION; + sched_ops.quantum_duration_instrs = QUANTUM_DURATION; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -5328,8 +5480,8 @@ test_record_scheduler() record_scheduler_t::MAP_TO_ANY_OUTPUT, record_scheduler_t::DEPENDENCY_IGNORE, record_scheduler_t::SCHEDULER_DEFAULTS, /*verbosity=*/4); - sched_ops.quantum_duration = 2; - sched_ops.block_time_scale = 0.001; // Do not stay blocked. + sched_ops.quantum_duration_instrs = 2; + sched_ops.block_time_multiplier = 0.001; // Do not stay blocked. 
if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != record_scheduler_t::STATUS_SUCCESS) assert(false); @@ -5457,6 +5609,7 @@ test_main(int argc, const char *argv[]) test_kernel_switch_sequences(); test_random_schedule(); test_record_scheduler(); + dr_standalone_exit(); return 0; } From bdec908b6f5531bf6815cacb70126d63f342316c Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Thu, 12 Sep 2024 10:44:37 -0400 Subject: [PATCH 02/11] Fix Windows build warning --- clients/drcachesim/scheduler/scheduler.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index b075ce627d4..5401c33c671 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -907,8 +907,8 @@ scheduler_tmpl_t::legacy_field_support() options_.quantum_duration_instrs = options_.quantum_duration; } else { options_.quantum_duration_us = - static_cast(options_.quantum_duration) / - options_.time_units_per_us; + static_cast(static_cast(options_.quantum_duration) / + options_.time_units_per_us); VPRINT(this, 2, "Legacy support: setting quantum_duration_us to %" PRIu64 "\n", options_.quantum_duration_us); @@ -929,14 +929,18 @@ scheduler_tmpl_t::legacy_field_support() VPRINT(this, 2, "Legacy support: setting block_time_multiplier to %6.3f\n", options_.block_time_multiplier); } + if (options_.block_time_multiplier == 0) { + error_string_ = "block_time_multiplier must != 0"; + return STATUS_ERROR_INVALID_PARAMETER; + } if (options_.block_time_max > 0) { if (options_.struct_size > offsetof(scheduler_options_t, block_time_max_us)) { error_string_ = "quantum_duration is deprecated; use block_time_max_us " "and time_units_per_us"; return STATUS_ERROR_INVALID_PARAMETER; } - options_.block_time_max_us = - static_cast(options_.block_time_max) / options_.time_units_per_us; + options_.block_time_max_us = static_cast( + 
static_cast(options_.block_time_max) / options_.time_units_per_us); VPRINT(this, 2, "Legacy support: setting block_time_max_us to %" PRIu64 "\n", options_.block_time_max_us); } @@ -3579,10 +3583,6 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, input->prev_time_in_quantum = cur_time; double elapsed_micros = input->time_spent_in_quantum * options_.time_units_per_us; - VPRINT(this, 4, - "next_record[%d]: input %d elapsed %6.1f vs quantum %" PRIu64 "\n", - output, input->index, elapsed_micros, - options_.quantum_duration_us); // NOCHECK if (elapsed_micros >= options_.quantum_duration_us && // We only switch on instruction boundaries. We could possibly switch // in between (e.g., scatter/gather long sequence of reads/writes) by From 4b40d2b8f35d1a493afd200db39c03745ebdda18 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Thu, 12 Sep 2024 11:53:40 -0400 Subject: [PATCH 03/11] Fix another Windows build warning --- clients/drcachesim/tests/scheduler_unit_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index b8bf367b581..8bb2b41ed71 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -437,7 +437,7 @@ test_legacy_fields() static constexpr int NUM_INSTRS = 9; static constexpr int QUANTUM_DURATION = 3; // We do not want to block for very long. 
- static constexpr double BLOCK_LATENCY = 200; + static constexpr uint64_t BLOCK_LATENCY = 200; static constexpr double BLOCK_SCALE = 0.01; static constexpr double BLOCK_MAX = 50; static constexpr memref_tid_t TID_BASE = 100; From 33b5bfd2d7fd095ad350f27f3d74d8d796cdf1d7 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Thu, 12 Sep 2024 12:07:02 -0400 Subject: [PATCH 04/11] Fix yet another Windows build warning --- clients/drcachesim/tests/scheduler_unit_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index 8bb2b41ed71..81a8d43391e 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -439,7 +439,7 @@ test_legacy_fields() // We do not want to block for very long. static constexpr uint64_t BLOCK_LATENCY = 200; static constexpr double BLOCK_SCALE = 0.01; - static constexpr double BLOCK_MAX = 50; + static constexpr uint64_t BLOCK_MAX = 50; static constexpr memref_tid_t TID_BASE = 100; static constexpr uint64_t START_TIME = 20; std::vector inputs[NUM_INPUTS]; From a462db1a142c656ebd06fc07a601027d3a2f04e5 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Thu, 12 Sep 2024 19:05:21 -0400 Subject: [PATCH 05/11] Add missing includes --- clients/drcachesim/scheduler/scheduler.cpp | 1 + clients/drcachesim/tests/scheduler_unit_tests.cpp | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 5401c33c671..4f2c580a0e1 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index 81a8d43391e..9420641611b 100644 --- 
a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -34,14 +34,20 @@ #undef NDEBUG #include #include +#include #include #include +#include +#include #include #include +#include #include "dr_api.h" #include "scheduler.h" #include "mock_reader.h" +#include "memref.h" +#include "trace_entry.h" #ifdef HAS_ZIP # include "zipfile_istream.h" # include "zipfile_ostream.h" From 5d7cef752f4c57d50aad7cd6089aa790b710d7ca Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Fri, 13 Sep 2024 15:34:11 -0400 Subject: [PATCH 06/11] Fix scaling bugs for blocked time and quantum; shift defaults to match local analyzer runs --- clients/drcachesim/common/options.cpp | 11 +++++++++-- clients/drcachesim/scheduler/scheduler.cpp | 17 +++++++++++------ clients/drcachesim/scheduler/scheduler.h | 16 ++++++++-------- 3 files changed, 28 insertions(+), 16 deletions(-) diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index 9e888670d32..c87d70bc3de 100644 --- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -928,7 +928,7 @@ droption_t op_sched_blocking_switch_us( "-core_serial. "); droption_t op_sched_block_scale( - DROPTION_SCOPE_ALL, "sched_block_scale", 0.01, "Input block time scale factor", + DROPTION_SCOPE_ALL, "sched_block_scale", 0.1, "Input block time scale factor", "This value is multiplied by -sched_time_per_us to produce a scale which is applied " "to the as-traced microsecond latency of blocking system calls to produce the block " "time during simulation. A higher value here results in blocking syscalls keeping " @@ -945,7 +945,7 @@ droption_t op_sched_block_scale( // long idle times with local analyzers; it may need to be increased with more // heavyweight analyzers/simulators. // TODO i#6959: Once we have -exit_if_all_unscheduled raise this. 
-droption_t op_sched_block_max_us(DROPTION_SCOPE_ALL, "sched_block_max_us", 250, +droption_t op_sched_block_max_us(DROPTION_SCOPE_ALL, "sched_block_max_us", 2500, "Maximum blocked input time, in microseconds", "The maximum blocked time, after scaling with " "-sched_block_scale."); @@ -992,6 +992,13 @@ droption_t op_sched_disable_direct_switches( "switch being determined by latency and the next input in the queue. The " "TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH markers are not removed from the trace."); +droption_t op_sched_time_units_per_us( + DROPTION_SCOPE_ALL, "sched_time_units_per_us", 100., + "Time units per simulated microsecond", + "Time units (currently wall-clock time) per simulated microsecond. This scales all " + "of the -sched_*_us values as it converts wall-clock time into the simulated " + "microseconds measured by those options."); + // Schedule_stats options. droption_t op_schedule_stats_print_every(DROPTION_SCOPE_ALL, "schedule_stats_print_every", diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 4f2c580a0e1..9f5354cb6e8 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -949,6 +949,10 @@ scheduler_tmpl_t::legacy_field_support() error_string_ = "block_time_max_us must be > 0"; return STATUS_ERROR_INVALID_PARAMETER; } + if (options_.time_units_per_us == 0) { + error_string_ = "time_units_per_us must be > 0"; + return STATUS_ERROR_INVALID_PARAMETER; + } return STATUS_SUCCESS; } @@ -2615,17 +2619,17 @@ template uint64_t scheduler_tmpl_t::scale_blocked_time(uint64_t initial_time) const { - uint64_t scaled = static_cast(static_cast(initial_time) * - options_.block_time_multiplier); - if (scaled > options_.block_time_max_us) { + uint64_t scaled_us = static_cast(static_cast(initial_time) * + options_.block_time_multiplier); + if (scaled_us > options_.block_time_max_us) { // We have a max to avoid outlier latencies that are already a second or //
more from scaling up to tens of minutes. We assume a cap is representative // as the outliers likely were not part of key dependence chains. Without a // cap the other threads all finish and the simulation waits for tens of // minutes further for a couple of outliers. - scaled = options_.block_time_max_us; + scaled_us = options_.block_time_max_us; } - return scaled; + return static_cast(scaled_us * options_.time_units_per_us); } template @@ -3583,7 +3587,8 @@ scheduler_tmpl_t::next_record(output_ordinal_t output, prev_time_in_quantum = input->prev_time_in_quantum; input->prev_time_in_quantum = cur_time; double elapsed_micros = - input->time_spent_in_quantum * options_.time_units_per_us; + static_cast(input->time_spent_in_quantum) / + options_.time_units_per_us; if (elapsed_micros >= options_.quantum_duration_us && // We only switch on instruction boundaries. We could possibly switch // in between (e.g., scatter/gather long sequence of reads/writes) by diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index a4dea1f1024..0688211e50e 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -733,9 +733,9 @@ template class scheduler_tmpl_t { * picoseconds, pass one million here. This is used to scale all of the * other parameters that are in microseconds (they all end in "_us": e.g., * #quantum_duration_us) so that they operate on the right time scale for the - * passed-in simulator time. + * passed-in simulator time (or wall-clock microseconds if no time is passed). */ - double time_units_per_us = 1000.; + double time_units_per_us = 100.; /** * The scheduling quantum duration for preemption, in simulated microseconds, * for #QUANTUM_TIME. This value is multiplied by #time_units_per_us to @@ -748,8 +748,8 @@ template class scheduler_tmpl_t { * for #QUANTUM_INSTRUCTIONS. The time passed to next_record() is ignored * for purposes of quantum preempts. 
*/ - // We pick 6 million to match 2 instructions per nanosecond with a 3ms quantum. - uint64_t quantum_duration_instrs = 6 * 1000 * 1000; + // We pick 10 million to match 2 instructions per nanosecond with a 5ms quantum. + uint64_t quantum_duration_instrs = 10 * 1000 * 1000; /** * Controls the amount of time inputs are considered blocked at a syscall * whose as-traced latency (recorded in timestamp records in the trace) @@ -767,7 +767,7 @@ template class scheduler_tmpl_t { * with #block_time_max_us, can be tuned to achieve a desired idle rate. * The default value errs on the side of less idle time. */ - double block_time_multiplier = 0.01; + double block_time_multiplier = 0.1; /** * The maximum time in microseconds for an input to be considered blocked for * any one system call. This value is multiplied by #time_units_per_us to @@ -779,7 +779,7 @@ template class scheduler_tmpl_t { * after this amount of time those inputs are all re-scheduled. */ // TODO i#6959: Once we have -exit_if_all_unscheduled raise this. - uint64_t block_time_max_us = 250; + uint64_t block_time_max_us = 2500; }; /** @@ -1364,11 +1364,11 @@ template class scheduler_tmpl_t { uint64_t syscall_timeout_arg = 0; // Used to switch before we've read the next instruction. bool switching_pre_instruction = false; - // Used for time-based quanta. + // Used for time-based quanta. The units are simulation time. uint64_t prev_time_in_quantum = 0; uint64_t time_spent_in_quantum = 0; // These fields model waiting at a blocking syscall. - // The units are us for instr quanta and simuilation time for time quanta. + // The units are in simulation time.
uint64_t blocked_time = 0; uint64_t blocked_start_time = 0; // An input can be "unscheduled" and not on the ready_priority_ run queue at all From f4197663cb7bbe32aafa07b7cd524f838de7de44 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Fri, 13 Sep 2024 16:09:52 -0400 Subject: [PATCH 07/11] Fix two unit tests missing time_units_per_us --- clients/drcachesim/scheduler/scheduler.cpp | 24 +++++++++++++++++++ .../drcachesim/tests/scheduler_unit_tests.cpp | 2 ++ 2 files changed, 26 insertions(+) diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 9f5354cb6e8..50b0c2839d6 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -3812,6 +3812,30 @@ scheduler_tmpl_t::eof_or_idle(output_ordinal_t output, (options_.mapping == MAP_AS_PREVIOUSLY && live_replay_output_count_.load(std::memory_order_acquire) == 0)) { assert(options_.mapping != MAP_AS_PREVIOUSLY || outputs_[output].at_eof); +#if 1 // NOCHECK + for (unsigned int i = 0; i < outputs_.size(); ++i) { + VPRINT(this, 1, "Stats for output #%d\n", i); + VPRINT( + this, 1, " %-25s: %9" PRId64 "\n", "Switch input->input", + outputs_[i].stats[memtrace_stream_t::SCHED_STAT_SWITCH_INPUT_TO_INPUT]); + VPRINT(this, 1, " %-25s: %9" PRId64 "\n", "Switch input->idle", + outputs_[i].stats[memtrace_stream_t::SCHED_STAT_SWITCH_INPUT_TO_IDLE]); + VPRINT(this, 1, " %-25s: %9" PRId64 "\n", "Switch idle->input", + outputs_[i].stats[memtrace_stream_t::SCHED_STAT_SWITCH_IDLE_TO_INPUT]); + VPRINT(this, 1, " %-25s: %9" PRId64 "\n", "Switch nop", + outputs_[i].stats[memtrace_stream_t::SCHED_STAT_SWITCH_NOP]); + VPRINT(this, 1, " %-25s: %9" PRId64 "\n", "Quantum preempts", + outputs_[i].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS]); + VPRINT( + this, 1, " %-25s: %9" PRId64 "\n", "Direct switch attempts", + outputs_[i].stats[memtrace_stream_t::SCHED_STAT_DIRECT_SWITCH_ATTEMPTS]); + VPRINT( + this, 1, " %-25s: %9" PRId64 "\n", 
"Direct switch successes", + outputs_[i].stats[memtrace_stream_t::SCHED_STAT_DIRECT_SWITCH_SUCCESSES]); + VPRINT(this, 1, " %-25s: %9" PRId64 "\n", "Migrations", + outputs_[i].stats[memtrace_stream_t::SCHED_STAT_MIGRATIONS]); + } +#endif return sched_type_t::STATUS_EOF; } else { bool need_lock; diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index 9420641611b..43b4d201c6a 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -1242,6 +1242,7 @@ test_synthetic() /*verbosity=*/3); sched_ops.quantum_duration_instrs = QUANTUM_DURATION; sched_ops.block_time_multiplier = BLOCK_SCALE; + sched_ops.time_units_per_us = 1.; scheduler_t scheduler; if (scheduler.init(sched_inputs, NUM_OUTPUTS, std::move(sched_ops)) != scheduler_t::STATUS_SUCCESS) @@ -2289,6 +2290,7 @@ test_synthetic_with_syscalls_latencies() /*verbosity=*/4); // We use a mock time for a deterministic result. 
sched_ops.quantum_unit = scheduler_t::QUANTUM_TIME; + sched_ops.time_units_per_us = 1.; sched_ops.blocking_switch_threshold = BLOCK_LATENCY; sched_ops.block_time_multiplier = BLOCK_SCALE; scheduler_t scheduler; From e908b0469ef422676347fad5821db1793e230907 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Fri, 13 Sep 2024 16:12:58 -0400 Subject: [PATCH 08/11] Remove diagnostics --- clients/drcachesim/scheduler/scheduler.cpp | 24 ---------------------- 1 file changed, 24 deletions(-) diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 50b0c2839d6..9f5354cb6e8 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -3812,30 +3812,6 @@ scheduler_tmpl_t::eof_or_idle(output_ordinal_t output, (options_.mapping == MAP_AS_PREVIOUSLY && live_replay_output_count_.load(std::memory_order_acquire) == 0)) { assert(options_.mapping != MAP_AS_PREVIOUSLY || outputs_[output].at_eof); -#if 1 // NOCHECK - for (unsigned int i = 0; i < outputs_.size(); ++i) { - VPRINT(this, 1, "Stats for output #%d\n", i); - VPRINT( - this, 1, " %-25s: %9" PRId64 "\n", "Switch input->input", - outputs_[i].stats[memtrace_stream_t::SCHED_STAT_SWITCH_INPUT_TO_INPUT]); - VPRINT(this, 1, " %-25s: %9" PRId64 "\n", "Switch input->idle", - outputs_[i].stats[memtrace_stream_t::SCHED_STAT_SWITCH_INPUT_TO_IDLE]); - VPRINT(this, 1, " %-25s: %9" PRId64 "\n", "Switch idle->input", - outputs_[i].stats[memtrace_stream_t::SCHED_STAT_SWITCH_IDLE_TO_INPUT]); - VPRINT(this, 1, " %-25s: %9" PRId64 "\n", "Switch nop", - outputs_[i].stats[memtrace_stream_t::SCHED_STAT_SWITCH_NOP]); - VPRINT(this, 1, " %-25s: %9" PRId64 "\n", "Quantum preempts", - outputs_[i].stats[memtrace_stream_t::SCHED_STAT_QUANTUM_PREEMPTS]); - VPRINT( - this, 1, " %-25s: %9" PRId64 "\n", "Direct switch attempts", - outputs_[i].stats[memtrace_stream_t::SCHED_STAT_DIRECT_SWITCH_ATTEMPTS]); - VPRINT( - this, 1, " %-25s: %9" PRId64 "\n", "Direct 
switch successes", - outputs_[i].stats[memtrace_stream_t::SCHED_STAT_DIRECT_SWITCH_SUCCESSES]); - VPRINT(this, 1, " %-25s: %9" PRId64 "\n", "Migrations", - outputs_[i].stats[memtrace_stream_t::SCHED_STAT_MIGRATIONS]); - } -#endif return sched_type_t::STATUS_EOF; } else { bool need_lock; From f0e1efb10727e192f71bf7cd8d581e3a7990f42a Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Mon, 16 Sep 2024 15:30:18 -0400 Subject: [PATCH 09/11] Review requests: make quantum defaults match; clarify a number of comments; fix typos --- clients/drcachesim/common/options.cpp | 17 ++++++++++------- clients/drcachesim/scheduler/scheduler.h | 6 +++--- .../drcachesim/tests/scheduler_unit_tests.cpp | 9 ++++++--- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index c87d70bc3de..1adad4168dd 100644 --- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -893,8 +893,8 @@ droption_t "Wall-clock microseconds per simulated microsecond."); droption_t - // We pick 6 million to match 2 instructions per nanosecond with a 3ms quantum. - op_sched_quantum(DROPTION_SCOPE_ALL, "sched_quantum", 6 * 1000 * 1000, + // We pick 10 million to match 2 instructions per nanosecond with a 5ms quantum. + op_sched_quantum(DROPTION_SCOPE_ALL, "sched_quantum", 10 * 1000 * 1000, "Scheduling quantum", "Applies to -core_sharded and -core_serial. " "Scheduling quantum in instructions, unless -sched_time is set in " @@ -929,10 +929,13 @@ droption_t op_sched_blocking_switch_us( droption_t op_sched_block_scale( DROPTION_SCOPE_ALL, "sched_block_scale", 0.1, "Input block time scale factor", - "This value is multiplied by -sched_time_per_us to produce a scale which is applied " - "to the as-traced microsecond latency of blocking system calls to produce the block " - "time during simulation. 
A higher value here results in blocking syscalls keeping " - "inputs unscheduled for longer."); + "A system call considered to block (see -sched_blocking_switch_us) will " + "block in the trace scheduler for an amount of simulator time equal to its " + "as-traced latency in trace-time microseconds multiplied by this parameter " + "and by -sched_time_per_us in simulated microseconds, subject to a " + "maximum of --sched_block_max_us. A higher value here results in blocking " + "syscalls keeping inputs unscheduled for longer. There is indirect " + "overhead inflating the as-traced times, so a value below 1 is typical."); // We have a max to avoid outlier latencies from scaling up to extreme times. There is // some inflation in the as-traced latencies and some can be inflated more than others. @@ -996,7 +999,7 @@ droption_t op_sched_time_units_per_us( DROPTION_SCOPE_ALL, "sched_time_units_per_us", 100., "Time units per simulated microsecond", "Time units (currently wall-clock time) per simulated microsecond. This scales all " - "of the -sched_*_us values as it concerts wall-clock time into the simulated " + "of the -sched_*_us values as it converts wall-clock time into the simulated " "microseconds measured by those options."); // Schedule_stats options. diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index 0688211e50e..821644b5c38 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -595,8 +595,8 @@ template class scheduler_tmpl_t { /** The unit of the schedule time quantum. */ quantum_unit_t quantum_unit = QUANTUM_INSTRUCTIONS; /** - * Deprecated: use #quantum_duration_us and #time_units_per_us for #QUANTUM_TIME - * or #quantum_duration_instrs for #QUANTUM_INSTRUCTIONS instead. It + * Deprecated: use #quantum_duration_us and #time_units_per_us for #QUANTUM_TIME, + * or #quantum_duration_instrs for #QUANTUM_INSTRUCTIONS, instead. 
It * is an error to set this to a non-zero value when #struct_size includes * #quantum_duration_us. When #struct_size does not include * #quantum_duration_us and this value is non-zero, the value in @@ -756,7 +756,7 @@ template class scheduler_tmpl_t { * exceeds #syscall_switch_threshold or #blocking_switch_threshold. The * as-traced syscall latency (which is in traced microseconds) is multiplied * by this field to produce the blocked time in simulated microseconds. Once - * that many simulated microseconds has passed according to the "cur_time" + * that many simulated microseconds have passed according to the "cur_time" * value passed to next_record() (multiplied by #time_units_per_us), the * input will be no longer considered blocked. The blocked time is clamped * to a maximum value controlled by #block_time_max. diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index 43b4d201c6a..bd077fd7878 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -166,10 +166,13 @@ verify_scheduler_stats(scheduler_t::stream_t *stream, int64_t switch_input_to_in migrations); } -// Returns a string with one char per input. +// Returns a vector of strings, one per ouput, where each string has one char per input +// showing the order of inputs scheduled onto that output. // Assumes the input threads are all tid_base plus an offset < 26. -// When send_time=true, typically time_units_per_us should be set to 1 to then have -// instruction count for all timing measures. +// When send_time=true, the record count is passed to the scheduler as the current +// time, to avoid relying on wall-clock time. For this use case of send_time=true, +// typically time_units_per_us should be set to 1 to avoid any scaling of the record +// count for simpler small tests. 
static std::vector run_lockstep_simulation(scheduler_t &scheduler, int num_outputs, memref_tid_t tid_base, bool send_time = false, bool print_markers = true) From 5e11733ca0c53e060ef7fd2ec44f5bcc017e1751 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Mon, 16 Sep 2024 16:12:54 -0400 Subject: [PATCH 10/11] Fix typo --- clients/drcachesim/tests/scheduler_unit_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp index bd077fd7878..d9515676db9 100644 --- a/clients/drcachesim/tests/scheduler_unit_tests.cpp +++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp @@ -166,7 +166,7 @@ verify_scheduler_stats(scheduler_t::stream_t *stream, int64_t switch_input_to_in migrations); } -// Returns a vector of strings, one per ouput, where each string has one char per input +// Returns a vector of strings, one per output, where each string has one char per input // showing the order of inputs scheduled onto that output. // Assumes the input threads are all tid_base plus an offset < 26. 
// When send_time=true, the record count is passed to the scheduler as the current From c05d4aa39ad3e69942b2b932473dc5e48e070526 Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Mon, 16 Sep 2024 16:34:36 -0400 Subject: [PATCH 11/11] Check time_units_per_us being 0 before doing divide --- clients/drcachesim/scheduler/scheduler.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 9f5354cb6e8..331df93b419 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -898,6 +898,10 @@ template typename scheduler_tmpl_t::scheduler_status_t scheduler_tmpl_t::legacy_field_support() { + if (options_.time_units_per_us == 0) { + error_string_ = "time_units_per_us must be > 0"; + return STATUS_ERROR_INVALID_PARAMETER; + } if (options_.quantum_duration > 0) { if (options_.struct_size > offsetof(scheduler_options_t, quantum_duration_us)) { error_string_ = "quantum_duration is deprecated; use quantum_duration_us and " @@ -949,10 +953,6 @@ scheduler_tmpl_t::legacy_field_support() error_string_ = "block_time_max_us must be > 0"; return STATUS_ERROR_INVALID_PARAMETER; } - if (options_.time_units_per_us == 0) { - error_string_ = "time_units_per_us must be > 0"; - return STATUS_ERROR_INVALID_PARAMETER; - } return STATUS_SUCCESS; }