i#6938 sched migrate: All options in microseconds + single scale #6980

Merged
merged 14 commits into from Sep 17, 2024
Changes from 9 commits
12 changes: 8 additions & 4 deletions clients/drcachesim/analyzer_multi.cpp
@@ -553,13 +553,17 @@ analyzer_multi_tmpl_t<RecordType, ReaderType>::init_dynamic_schedule()
op_sched_order_time.get_value() ? sched_type_t::DEPENDENCY_TIMESTAMPS
: sched_type_t::DEPENDENCY_IGNORE,
sched_type_t::SCHEDULER_DEFAULTS, op_verbose.get_value());
sched_ops.quantum_duration = op_sched_quantum.get_value();
if (op_sched_time.get_value())
sched_ops.time_units_per_us = op_sched_time_per_us.get_value();
if (op_sched_time.get_value()) {
sched_ops.quantum_unit = sched_type_t::QUANTUM_TIME;
sched_ops.quantum_duration_us = op_sched_quantum.get_value();
} else {
sched_ops.quantum_duration_instrs = op_sched_quantum.get_value();
}
sched_ops.syscall_switch_threshold = op_sched_syscall_switch_us.get_value();
sched_ops.blocking_switch_threshold = op_sched_blocking_switch_us.get_value();
sched_ops.block_time_scale = op_sched_block_scale.get_value();
sched_ops.block_time_max = op_sched_block_max_us.get_value();
sched_ops.block_time_multiplier = op_sched_block_scale.get_value();
sched_ops.block_time_max_us = op_sched_block_max_us.get_value();
sched_ops.randomize_next_input = op_sched_randomize.get_value();
sched_ops.honor_direct_switches = !op_sched_disable_direct_switches.get_value();
#ifdef HAS_ZIP
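For reference, here is a minimal sketch (not part of the diff) of how a consumer would wire up the renamed fields; the field and enum names come from this PR, while the header path, namespace usage, helper name, and concrete values are illustrative assumptions:

#include "scheduler.h"

using dynamorio::drmemtrace::scheduler_t;
using dynamorio::drmemtrace::sched_type_t;

// Hypothetical helper: choose between instruction- and time-based quanta.
static scheduler_t::scheduler_options_t
make_sched_options(bool time_based_quanta)
{
    scheduler_t::scheduler_options_t ops(sched_type_t::MAP_TO_ANY_OUTPUT,
                                         sched_type_t::DEPENDENCY_TIMESTAMPS,
                                         sched_type_t::SCHEDULER_DEFAULTS,
                                         /*verbosity=*/0);
    ops.time_units_per_us = 1000.;      // Wall-clock microseconds per simulated us.
    if (time_based_quanta) {
        ops.quantum_unit = sched_type_t::QUANTUM_TIME;
        ops.quantum_duration_us = 3000; // Quantum in simulated microseconds.
    } else {
        ops.quantum_duration_instrs = 6 * 1000 * 1000; // Quantum in instructions.
    }
    ops.block_time_multiplier = 0.1;    // Scales as-traced blocking-syscall latency.
    ops.block_time_max_us = 2500;       // Cap on the scaled block time.
    return ops;
}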
38 changes: 26 additions & 12 deletions clients/drcachesim/common/options.cpp
@@ -887,13 +887,19 @@ droption_t<bool> op_core_serial(
"How the scheduling is performed is controlled by a set "
"of options with the prefix \"sched_\" along with -cores.");

droption_t<double>
op_sched_time_per_us(DROPTION_SCOPE_ALL, "sched_time_per_us", 1000.,
"Wall-clock microseconds per simulated microsecond",
"Wall-clock microseconds per simulated microsecond.");

droption_t<int64_t>
// We pick 6 million to match 2 instructions per nanosecond with a 3ms quantum.
op_sched_quantum(DROPTION_SCOPE_ALL, "sched_quantum", 6 * 1000 * 1000,
"Scheduling quantum",
"Applies to -core_sharded and -core_serial. "
"Scheduling quantum: in microseconds of wall-clock "
"time if -sched_time is set; otherwise in instructions.");
"Scheduling quantum in instructions, unless -sched_time is set in "
"which case this value is multiplied by -sched_time_per_us to "
"produce a quantum in wall-clock microseconds.");

droption_t<bool>
op_sched_time(DROPTION_SCOPE_ALL, "sched_time", false,
@@ -922,23 +928,24 @@ droption_t<uint64_t> op_sched_blocking_switch_us(
"-core_serial. ");

droption_t<double> op_sched_block_scale(
DROPTION_SCOPE_ALL, "sched_block_scale", 10., "Input block time scale factor",
"The scale applied to the microsecond latency of blocking system calls. A higher "
"value here results in blocking syscalls keeping inputs unscheduled for longer. "
"This should roughly equal the slowdown of instruction record processing versus the "
"original (untraced) application execution.");

// We have a max to avoid outlier latencies that are already a second or more from
// scaling up to tens of minutes. We assume a cap is representative as the outliers
DROPTION_SCOPE_ALL, "sched_block_scale", 0.1, "Input block time scale factor",
"This value is multiplied by -sched_time_per_us to produce a scale which is applied "
"to the as-traced microsecond latency of blocking system calls to produce the block "
"time during simulation. A higher value here results in blocking syscalls keeping "
"inputs unscheduled for longer.");

// We have a max to avoid outlier latencies from scaling up to extreme times. There is
// some inflation in the as-traced latencies and some can be inflated more than others.
// We assume a cap is representative as the outliers
// likely were not part of key dependence chains. Without a cap the other threads all
// finish and the simulation waits for tens of minutes further for a couple of outliers.
// The cap remains a flag and not a constant as different length traces and different
// speed simulators need different idle time ranges, so we need to be able to tune this
// to achieve desired cpu usage targets. The default value was selected to avoid unduly
// long idle times with local analyzers; it may need to be increased with more
// heavyweight analyzers/simulators.
droption_t<uint64_t> op_sched_block_max_us(DROPTION_SCOPE_ALL, "sched_block_max_us",
250000,
// TODO i#6959: Once we have -exit_if_all_unscheduled raise this.
droption_t<uint64_t> op_sched_block_max_us(DROPTION_SCOPE_ALL, "sched_block_max_us", 2500,
"Maximum blocked input time, in microseconds",
"The maximum blocked time, after scaling with "
"-sched_block_scale.");
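To make the new defaults concrete, a rough worked example (this assumes the effective multiplier ends up equal to -sched_block_scale, i.e. 0.1; see scale_blocked_time() in scheduler.cpp below):

as-traced blocking-syscall latency : 1,000,000 us (a 1-second block)
scaled block time                  : 1,000,000 us * 0.1 = 100,000 us
after the -sched_block_max_us cap  : min(100,000 us, 2,500 us) = 2,500 us of simulated block time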
@@ -985,6 +992,13 @@ droption_t<bool> op_sched_disable_direct_switches(
"switch being determined by latency and the next input in the queue. The "
"TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH markers are not removed from the trace.");

droption_t<double> op_sched_time_units_per_us(
DROPTION_SCOPE_ALL, "sched_time_units_per_us", 100.,
"Time units per simulated microsecond",
"Time units (currently wall-clock time) per simulated microsecond. This scales all "
"of the -sched_*_us values as it concerts wall-clock time into the simulated "
"microseconds measured by those options.");

// Schedule_stats options.
droption_t<uint64_t>
op_schedule_stats_print_every(DROPTION_SCOPE_ALL, "schedule_stats_print_every",
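The 6-million-instruction default for -sched_quantum above follows directly from the comment's assumption of 2 instructions per nanosecond and a 3 ms quantum:

2 instructions/ns = 2,000 instructions/us
3 ms = 3,000 us
2,000 instructions/us * 3,000 us = 6,000,000 instructions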
1 change: 1 addition & 0 deletions clients/drcachesim/common/options.h
@@ -199,6 +199,7 @@ extern dynamorio::droption::droption_t<int> op_kernel_trace_buffer_size_shift;
#endif
extern dynamorio::droption::droption_t<bool> op_core_sharded;
extern dynamorio::droption::droption_t<bool> op_core_serial;
extern dynamorio::droption::droption_t<double> op_sched_time_per_us;
extern dynamorio::droption::droption_t<int64_t> op_sched_quantum;
extern dynamorio::droption::droption_t<bool> op_sched_time;
extern dynamorio::droption::droption_t<bool> op_sched_order_time;
100 changes: 87 additions & 13 deletions clients/drcachesim/scheduler/scheduler.cpp
@@ -38,6 +38,7 @@
#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <cstdio>
#include <iomanip>
#include <limits>
@@ -842,6 +843,11 @@ scheduler_tmpl_t<RecordType, ReaderType>::init(
}
}

// Legacy field support.
sched_type_t::scheduler_status_t res = legacy_field_support();
if (res != sched_type_t::STATUS_SUCCESS)
return res;

if (TESTANY(sched_type_t::SCHEDULER_USE_SINGLE_INPUT_ORDINALS, options_.flags) &&
inputs_.size() == 1 && output_count == 1) {
options_.flags = static_cast<scheduler_flags_t>(
@@ -881,13 +887,75 @@ scheduler_tmpl_t<RecordType, ReaderType>::init(
VPRINT(this, 1, "%zu inputs\n", inputs_.size());
live_input_count_.store(static_cast<int>(inputs_.size()), std::memory_order_release);

sched_type_t::scheduler_status_t res = read_switch_sequences();
res = read_switch_sequences();
if (res != sched_type_t::STATUS_SUCCESS)
return STATUS_ERROR_INVALID_PARAMETER;

return set_initial_schedule(workload2inputs);
}

template <typename RecordType, typename ReaderType>
typename scheduler_tmpl_t<RecordType, ReaderType>::scheduler_status_t
scheduler_tmpl_t<RecordType, ReaderType>::legacy_field_support()
{
if (options_.quantum_duration > 0) {
if (options_.struct_size > offsetof(scheduler_options_t, quantum_duration_us)) {
error_string_ = "quantum_duration is deprecated; use quantum_duration_us and "
"time_units_per_us or quantum_duration_instrs";
return STATUS_ERROR_INVALID_PARAMETER;
}
if (options_.quantum_unit == QUANTUM_INSTRUCTIONS) {
options_.quantum_duration_instrs = options_.quantum_duration;
} else {
options_.quantum_duration_us =
static_cast<uint64_t>(static_cast<double>(options_.quantum_duration) /
options_.time_units_per_us);
VPRINT(this, 2,
"Legacy support: setting quantum_duration_us to %" PRIu64 "\n",
options_.quantum_duration_us);
}
}
if (options_.quantum_duration_us == 0) {
error_string_ = "quantum_duration_us must be > 0";
return STATUS_ERROR_INVALID_PARAMETER;
}
if (options_.block_time_scale > 0) {
if (options_.struct_size > offsetof(scheduler_options_t, block_time_multiplier)) {
error_string_ = "quantum_duration is deprecated; use block_time_multiplier "
"and time_units_per_us";
return STATUS_ERROR_INVALID_PARAMETER;
}
options_.block_time_multiplier =
static_cast<double>(options_.block_time_scale) / options_.time_units_per_us;
VPRINT(this, 2, "Legacy support: setting block_time_multiplier to %6.3f\n",
options_.block_time_multiplier);
}
if (options_.block_time_multiplier == 0) {
error_string_ = "block_time_multiplier must != 0";
return STATUS_ERROR_INVALID_PARAMETER;
}
if (options_.block_time_max > 0) {
if (options_.struct_size > offsetof(scheduler_options_t, block_time_max_us)) {
error_string_ = "quantum_duration is deprecated; use block_time_max_us "
"and time_units_per_us";
return STATUS_ERROR_INVALID_PARAMETER;
}
options_.block_time_max_us = static_cast<uint64_t>(
static_cast<double>(options_.block_time_max) / options_.time_units_per_us);
VPRINT(this, 2, "Legacy support: setting block_time_max_us to %" PRIu64 "\n",
options_.block_time_max_us);
}
if (options_.block_time_max_us == 0) {
error_string_ = "block_time_max_us must be > 0";
return STATUS_ERROR_INVALID_PARAMETER;
}
if (options_.time_units_per_us == 0) {
error_string_ = "time_units_per_us must be > 0";
return STATUS_ERROR_INVALID_PARAMETER;
}
return STATUS_SUCCESS;
}
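As a worked example of the legacy path above (the numbers are illustrative): a caller built against the old struct that still sets quantum_duration to, say, 600,000 with QUANTUM_TIME, and time_units_per_us at the -sched_time_units_per_us default of 100, would have legacy_field_support() derive:

quantum_duration_us = 600,000 / 100 = 6,000 simulated microseconds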

template <typename RecordType, typename ReaderType>
typename scheduler_tmpl_t<RecordType, ReaderType>::scheduler_status_t
scheduler_tmpl_t<RecordType, ReaderType>::set_initial_schedule(
@@ -2551,17 +2619,17 @@ template <typename RecordType, typename ReaderType>
uint64_t
scheduler_tmpl_t<RecordType, ReaderType>::scale_blocked_time(uint64_t initial_time) const
{
uint64_t scaled = static_cast<uint64_t>(static_cast<double>(initial_time) *
options_.block_time_scale);
if (scaled > options_.block_time_max) {
uint64_t scaled_us = static_cast<uint64_t>(static_cast<double>(initial_time) *
options_.block_time_multiplier);
if (scaled_us > options_.block_time_max_us) {
// We have a max to avoid outlier latencies that are already a second or
// more from scaling up to tens of minutes. We assume a cap is representative
// as the outliers likely were not part of key dependence chains. Without a
// cap the other threads all finish and the simulation waits for tens of
// minutes further for a couple of outliers.
scaled = options_.block_time_max;
scaled_us = options_.block_time_max_us;
}
return scaled;
return static_cast<uint64_t>(scaled_us * options_.time_units_per_us);
}

template <typename RecordType, typename ReaderType>
@@ -2587,11 +2655,11 @@ scheduler_tmpl_t<RecordType, ReaderType>::syscall_incurs_switch(input_info_t *in
: options_.syscall_switch_threshold;
blocked_time = scale_blocked_time(latency);
VPRINT(this, 3,
"input %d %ssyscall latency %" PRIu64 " * scale %5.1f => blocked time %" PRIu64
"input %d %ssyscall latency %" PRIu64 " * scale %6.3f => blocked time %" PRIu64
"\n",
input->index,
input->processing_maybe_blocking_syscall ? "maybe-blocking " : "", latency,
options_.block_time_scale, blocked_time);
options_.block_time_multiplier, blocked_time);
return latency >= threshold;
}

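A compact restatement of the unit conversions above, as a hypothetical standalone helper (the function name and signature are illustrative, not part of the scheduler API):

#include <cstdint>

// Mirrors scale_blocked_time(): as-traced us -> scaled us -> capped us -> time units.
static uint64_t
scaled_block_time_units(uint64_t latency_us, double block_time_multiplier,
                        uint64_t block_time_max_us, double time_units_per_us)
{
    uint64_t scaled_us =
        static_cast<uint64_t>(static_cast<double>(latency_us) * block_time_multiplier);
    if (scaled_us > block_time_max_us)
        scaled_us = block_time_max_us; // Cap outlier latencies.
    // Convert simulated microseconds back into scheduler time units so the result is
    // comparable against cur_time deltas.
    return static_cast<uint64_t>(static_cast<double>(scaled_us) * time_units_per_us);
}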
@@ -3279,6 +3347,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
// It's more efficient for QUANTUM_INSTRUCTIONS to get the time here instead of
// in get_output_time(). This also makes the two more similarly behaved with
// respect to blocking system calls.
// TODO i#6971: Use INSTRS_PER_US to replace .cur_time completely
// with a counter-based time, weighted appropriately for STATUS_IDLE.
cur_time = get_time_micros();
}
outputs_[output].cur_time = cur_time; // Invalid values are checked below.
@@ -3492,7 +3562,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
record_type_is_instr_boundary(record, outputs_[output].last_record) &&
!outputs_[output].in_kernel_code) {
++input->instrs_in_quantum;
if (input->instrs_in_quantum > options_.quantum_duration) {
if (input->instrs_in_quantum > options_.quantum_duration_instrs) {
// We again prefer to switch to another input even if the current
// input has the oldest timestamp, prioritizing context switches
// over timestamp ordering.
@@ -3516,7 +3586,10 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
input->time_spent_in_quantum += cur_time - input->prev_time_in_quantum;
prev_time_in_quantum = input->prev_time_in_quantum;
input->prev_time_in_quantum = cur_time;
if (input->time_spent_in_quantum >= options_.quantum_duration &&
double elapsed_micros =
static_cast<double>(input->time_spent_in_quantum) /
options_.time_units_per_us;
if (elapsed_micros >= options_.quantum_duration_us &&
// We only switch on instruction boundaries. We could possibly switch
// in between (e.g., scatter/gather long sequence of reads/writes) by
// setting input->switching_pre_instruction.
@@ -3759,13 +3832,14 @@ scheduler_tmpl_t<RecordType, ReaderType>::eof_or_idle(output_ordinal_t output,
outputs_[output].wait_start_time = get_output_time(output);
} else {
uint64_t now = get_output_time(output);
if (now - outputs_[output].wait_start_time >
options_.block_time_max) {
double elapsed_micros = static_cast<double>(now - outputs_[output].wait_start_time) /
options_.time_units_per_us;
if (elapsed_micros > options_.block_time_max_us) {
// XXX i#6822: We may want some other options here for what to
// do. We could release just one input at a time, which would be
// the same scheduling order (as we have FIFO in
// unscheduled_priority_) but may take a long time at
// block_time_max each; we could declare we're done and just
// block_time_max_us each; we could declare we're done and just
// exit, maybe under a flag or if we could see what % of total
// records we've processed.
VPRINT(this, 1,
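For scale, and assuming the elapsed wait is converted from time units to simulated microseconds by dividing by time_units_per_us (as in the quantum accounting above), the -sched_block_max_us default of 2,500 with the -sched_time_units_per_us default of 100 corresponds to:

2,500 simulated us * 100 time units/us = 250,000 time units (wall-clock us) of idling
before the threshold check above triggers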