i#6938 sched migrate: All options in microseconds + single scale #6980

Merged
merged 14 commits into from Sep 17, 2024
Changes from 9 commits
12 changes: 8 additions & 4 deletions clients/drcachesim/analyzer_multi.cpp
@@ -553,13 +553,17 @@ analyzer_multi_tmpl_t<RecordType, ReaderType>::init_dynamic_schedule()
op_sched_order_time.get_value() ? sched_type_t::DEPENDENCY_TIMESTAMPS
: sched_type_t::DEPENDENCY_IGNORE,
sched_type_t::SCHEDULER_DEFAULTS, op_verbose.get_value());
sched_ops.quantum_duration = op_sched_quantum.get_value();
if (op_sched_time.get_value())
sched_ops.time_units_per_us = op_sched_time_per_us.get_value();
if (op_sched_time.get_value()) {
sched_ops.quantum_unit = sched_type_t::QUANTUM_TIME;
sched_ops.quantum_duration_us = op_sched_quantum.get_value();
} else {
sched_ops.quantum_duration_instrs = op_sched_quantum.get_value();
}
sched_ops.syscall_switch_threshold = op_sched_syscall_switch_us.get_value();
sched_ops.blocking_switch_threshold = op_sched_blocking_switch_us.get_value();
sched_ops.block_time_scale = op_sched_block_scale.get_value();
sched_ops.block_time_max = op_sched_block_max_us.get_value();
sched_ops.block_time_multiplier = op_sched_block_scale.get_value();
sched_ops.block_time_max_us = op_sched_block_max_us.get_value();
sched_ops.randomize_next_input = op_sched_randomize.get_value();
sched_ops.honor_direct_switches = !op_sched_disable_direct_switches.get_value();
#ifdef HAS_ZIP
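For reference, here is a minimal sketch (not part of the diff) of how a consumer would wire up the renamed fields; the field and enum names come from this PR, while the header path, namespace usage, helper name, and concrete values are illustrative assumptions:

#include "scheduler.h"

using dynamorio::drmemtrace::scheduler_t;
using dynamorio::drmemtrace::sched_type_t;

// Hypothetical helper: choose between instruction- and time-based quanta.
static scheduler_t::scheduler_options_t
make_sched_options(bool time_based_quanta)
{
    scheduler_t::scheduler_options_t ops(sched_type_t::MAP_TO_ANY_OUTPUT,
                                         sched_type_t::DEPENDENCY_TIMESTAMPS,
                                         sched_type_t::SCHEDULER_DEFAULTS,
                                         /*verbosity=*/0);
    ops.time_units_per_us = 1000.;      // Wall-clock microseconds per simulated us.
    if (time_based_quanta) {
        ops.quantum_unit = sched_type_t::QUANTUM_TIME;
        ops.quantum_duration_us = 3000; // Quantum in simulated microseconds.
    } else {
        ops.quantum_duration_instrs = 6 * 1000 * 1000; // Quantum in instructions.
    }
    ops.block_time_multiplier = 0.1;    // Scales as-traced blocking-syscall latency.
    ops.block_time_max_us = 2500;       // Cap on the scaled block time.
    return ops;
}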
38 changes: 26 additions & 12 deletions clients/drcachesim/common/options.cpp
@@ -887,13 +887,19 @@ droption_t<bool> op_core_serial(
"How the scheduling is performed is controlled by a set "
"of options with the prefix \"sched_\" along with -cores.");

droption_t<double>
op_sched_time_per_us(DROPTION_SCOPE_ALL, "sched_time_per_us", 1000.,
"Wall-clock microseconds per simulated microsecond",
"Wall-clock microseconds per simulated microsecond.");

droption_t<int64_t>
// We pick 6 million to match 2 instructions per nanosecond with a 3ms quantum.
op_sched_quantum(DROPTION_SCOPE_ALL, "sched_quantum", 6 * 1000 * 1000,
"Scheduling quantum",
"Applies to -core_sharded and -core_serial. "
"Scheduling quantum: in microseconds of wall-clock "
"time if -sched_time is set; otherwise in instructions.");
"Scheduling quantum in instructions, unless -sched_time is set in "
"which case this value is multiplied by -sched_time_per_us to "
"produce a quantum in wall-clock microseconds.");

droption_t<bool>
op_sched_time(DROPTION_SCOPE_ALL, "sched_time", false,
@@ -922,23 +928,24 @@ droption_t<uint64_t> op_sched_blocking_switch_us(
"-core_serial. ");

droption_t<double> op_sched_block_scale(
DROPTION_SCOPE_ALL, "sched_block_scale", 10., "Input block time scale factor",
"The scale applied to the microsecond latency of blocking system calls. A higher "
"value here results in blocking syscalls keeping inputs unscheduled for longer. "
"This should roughly equal the slowdown of instruction record processing versus the "
"original (untraced) application execution.");

// We have a max to avoid outlier latencies that are already a second or more from
// scaling up to tens of minutes. We assume a cap is representative as the outliers
DROPTION_SCOPE_ALL, "sched_block_scale", 0.1, "Input block time scale factor",
"This value is multiplied by -sched_time_per_us to produce a scale which is applied "
"to the as-traced microsecond latency of blocking system calls to produce the block "
"time during simulation. A higher value here results in blocking syscalls keeping "
"inputs unscheduled for longer.");

// We have a max to avoid outlier latencies from scaling up to extreme times. There is
// some inflation in the as-traced latencies and some can be inflated more than others.
// We assume a cap is representative as the outliers
// likely were not part of key dependence chains. Without a cap the other threads all
// finish and the simulation waits for tens of minutes further for a couple of outliers.
// The cap remains a flag and not a constant as different length traces and different
// speed simulators need different idle time ranges, so we need to be able to tune this
// to achieve desired cpu usage targets. The default value was selected to avoid unduly
// long idle times with local analyzers; it may need to be increased with more
// heavyweight analyzers/simulators.
droption_t<uint64_t> op_sched_block_max_us(DROPTION_SCOPE_ALL, "sched_block_max_us",
250000,
// TODO i#6959: Once we have -exit_if_all_unscheduled raise this.
droption_t<uint64_t> op_sched_block_max_us(DROPTION_SCOPE_ALL, "sched_block_max_us", 2500,
"Maximum blocked input time, in microseconds",
"The maximum blocked time, after scaling with "
"-sched_block_scale.");
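To make the new defaults concrete, a rough worked example (this assumes the effective multiplier ends up equal to -sched_block_scale, i.e. 0.1; see scale_blocked_time() in scheduler.cpp below):

as-traced blocking-syscall latency : 1,000,000 us (a 1-second block)
scaled block time                  : 1,000,000 us * 0.1 = 100,000 us
after the -sched_block_max_us cap  : min(100,000 us, 2,500 us) = 2,500 us of simulated block time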
@@ -985,6 +992,13 @@ droption_t<bool> op_sched_disable_direct_switches(
"switch being determined by latency and the next input in the queue. The "
"TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH markers are not removed from the trace.");

droption_t<double> op_sched_time_units_per_us(
DROPTION_SCOPE_ALL, "sched_time_units_per_us", 100.,
"Time units per simulated microsecond",
"Time units (currently wall-clock time) per simulated microsecond. This scales all "
"of the -sched_*_us values as it concerts wall-clock time into the simulated "
"microseconds measured by those options.");

// Schedule_stats options.
droption_t<uint64_t>
op_schedule_stats_print_every(DROPTION_SCOPE_ALL, "schedule_stats_print_every",
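The 6-million-instruction default for -sched_quantum above follows directly from the comment's assumption of 2 instructions per nanosecond and a 3 ms quantum:

2 instructions/ns = 2,000 instructions/us
3 ms = 3,000 us
2,000 instructions/us * 3,000 us = 6,000,000 instructions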
1 change: 1 addition & 0 deletions clients/drcachesim/common/options.h
@@ -199,6 +199,7 @@ extern dynamorio::droption::droption_t<int> op_kernel_trace_buffer_size_shift;
#endif
extern dynamorio::droption::droption_t<bool> op_core_sharded;
extern dynamorio::droption::droption_t<bool> op_core_serial;
extern dynamorio::droption::droption_t<double> op_sched_time_per_us;
extern dynamorio::droption::droption_t<int64_t> op_sched_quantum;
extern dynamorio::droption::droption_t<bool> op_sched_time;
extern dynamorio::droption::droption_t<bool> op_sched_order_time;
100 changes: 87 additions & 13 deletions clients/drcachesim/scheduler/scheduler.cpp
@@ -38,6 +38,7 @@
#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <cstdio>
#include <iomanip>
#include <limits>
@@ -842,6 +843,11 @@ scheduler_tmpl_t<RecordType, ReaderType>::init(
}
}

// Legacy field support.
sched_type_t::scheduler_status_t res = legacy_field_support();
if (res != sched_type_t::STATUS_SUCCESS)
return res;

if (TESTANY(sched_type_t::SCHEDULER_USE_SINGLE_INPUT_ORDINALS, options_.flags) &&
inputs_.size() == 1 && output_count == 1) {
options_.flags = static_cast<scheduler_flags_t>(
@@ -881,13 +887,75 @@ scheduler_tmpl_t<RecordType, ReaderType>::init(
VPRINT(this, 1, "%zu inputs\n", inputs_.size());
live_input_count_.store(static_cast<int>(inputs_.size()), std::memory_order_release);

sched_type_t::scheduler_status_t res = read_switch_sequences();
res = read_switch_sequences();
if (res != sched_type_t::STATUS_SUCCESS)
return STATUS_ERROR_INVALID_PARAMETER;

return set_initial_schedule(workload2inputs);
}

template <typename RecordType, typename ReaderType>
typename scheduler_tmpl_t<RecordType, ReaderType>::scheduler_status_t
scheduler_tmpl_t<RecordType, ReaderType>::legacy_field_support()
{
if (options_.quantum_duration > 0) {
if (options_.struct_size > offsetof(scheduler_options_t, quantum_duration_us)) {
error_string_ = "quantum_duration is deprecated; use quantum_duration_us and "
"time_units_per_us or quantum_duration_instrs";
return STATUS_ERROR_INVALID_PARAMETER;
}
if (options_.quantum_unit == QUANTUM_INSTRUCTIONS) {
options_.quantum_duration_instrs = options_.quantum_duration;
} else {
options_.quantum_duration_us =
static_cast<uint64_t>(static_cast<double>(options_.quantum_duration) /
options_.time_units_per_us);
VPRINT(this, 2,
"Legacy support: setting quantum_duration_us to %" PRIu64 "\n",
options_.quantum_duration_us);
}
}
if (options_.quantum_duration_us == 0) {
error_string_ = "quantum_duration_us must be > 0";
return STATUS_ERROR_INVALID_PARAMETER;
}
if (options_.block_time_scale > 0) {
if (options_.struct_size > offsetof(scheduler_options_t, block_time_multiplier)) {
error_string_ = "quantum_duration is deprecated; use block_time_multiplier "
"and time_units_per_us";
return STATUS_ERROR_INVALID_PARAMETER;
}
options_.block_time_multiplier =
static_cast<double>(options_.block_time_scale) / options_.time_units_per_us;
VPRINT(this, 2, "Legacy support: setting block_time_multiplier to %6.3f\n",
options_.block_time_multiplier);
}
if (options_.block_time_multiplier == 0) {
error_string_ = "block_time_multiplier must != 0";
return STATUS_ERROR_INVALID_PARAMETER;
}
if (options_.block_time_max > 0) {
if (options_.struct_size > offsetof(scheduler_options_t, block_time_max_us)) {
error_string_ = "quantum_duration is deprecated; use block_time_max_us "
"and time_units_per_us";
return STATUS_ERROR_INVALID_PARAMETER;
}
options_.block_time_max_us = static_cast<uint64_t>(
static_cast<double>(options_.block_time_max) / options_.time_units_per_us);
VPRINT(this, 2, "Legacy support: setting block_time_max_us to %" PRIu64 "\n",
options_.block_time_max_us);
}
if (options_.block_time_max_us == 0) {
error_string_ = "block_time_max_us must be > 0";
return STATUS_ERROR_INVALID_PARAMETER;
}
if (options_.time_units_per_us == 0) {
error_string_ = "time_units_per_us must be > 0";
return STATUS_ERROR_INVALID_PARAMETER;
}
return STATUS_SUCCESS;
}
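As a worked example of the legacy path above (the numbers are illustrative): a caller built against the old struct that still sets quantum_duration to, say, 600,000 with QUANTUM_TIME, and time_units_per_us at the -sched_time_units_per_us default of 100, would have legacy_field_support() derive:

quantum_duration_us = 600,000 / 100 = 6,000 simulated microseconds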

template <typename RecordType, typename ReaderType>
typename scheduler_tmpl_t<RecordType, ReaderType>::scheduler_status_t
scheduler_tmpl_t<RecordType, ReaderType>::set_initial_schedule(
@@ -2551,17 +2619,17 @@ template <typename RecordType, typename ReaderType>
uint64_t
scheduler_tmpl_t<RecordType, ReaderType>::scale_blocked_time(uint64_t initial_time) const
{
uint64_t scaled = static_cast<uint64_t>(static_cast<double>(initial_time) *
options_.block_time_scale);
if (scaled > options_.block_time_max) {
uint64_t scaled_us = static_cast<uint64_t>(static_cast<double>(initial_time) *
options_.block_time_multiplier);
if (scaled_us > options_.block_time_max_us) {
// We have a max to avoid outlier latencies that are already a second or
// more from scaling up to tens of minutes. We assume a cap is representative
// as the outliers likely were not part of key dependence chains. Without a
// cap the other threads all finish and the simulation waits for tens of
// minutes further for a couple of outliers.
scaled = options_.block_time_max;
scaled_us = options_.block_time_max_us;
}
return scaled;
return static_cast<uint64_t>(scaled_us * options_.time_units_per_us);
}

template <typename RecordType, typename ReaderType>
@@ -2587,11 +2655,11 @@ scheduler_tmpl_t<RecordType, ReaderType>::syscall_incurs_switch(input_info_t *in
: options_.syscall_switch_threshold;
blocked_time = scale_blocked_time(latency);
VPRINT(this, 3,
"input %d %ssyscall latency %" PRIu64 " * scale %5.1f => blocked time %" PRIu64
"input %d %ssyscall latency %" PRIu64 " * scale %6.3f => blocked time %" PRIu64
"\n",
input->index,
input->processing_maybe_blocking_syscall ? "maybe-blocking " : "", latency,
options_.block_time_scale, blocked_time);
options_.block_time_multiplier, blocked_time);
return latency >= threshold;
}

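A compact restatement of the unit conversions above, as a hypothetical standalone helper (the function name and signature are illustrative, not part of the scheduler API):

#include <cstdint>

// Mirrors scale_blocked_time(): as-traced us -> scaled us -> capped us -> time units.
static uint64_t
scaled_block_time_units(uint64_t latency_us, double block_time_multiplier,
                        uint64_t block_time_max_us, double time_units_per_us)
{
    uint64_t scaled_us =
        static_cast<uint64_t>(static_cast<double>(latency_us) * block_time_multiplier);
    if (scaled_us > block_time_max_us)
        scaled_us = block_time_max_us; // Cap outlier latencies.
    // Convert simulated microseconds back into scheduler time units so the result is
    // comparable against cur_time deltas.
    return static_cast<uint64_t>(static_cast<double>(scaled_us) * time_units_per_us);
}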
@@ -3279,6 +3347,8 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
// It's more efficient for QUANTUM_INSTRUCTIONS to get the time here instead of
// in get_output_time(). This also makes the two more similarly behaved with
// respect to blocking system calls.
// TODO i#6971: Use INSTRS_PER_US to replace .cur_time completely
// with a counter-based time, weighted appropriately for STATUS_IDLE.
cur_time = get_time_micros();
}
outputs_[output].cur_time = cur_time; // Invalid values are checked below.
@@ -3492,7 +3562,7 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
record_type_is_instr_boundary(record, outputs_[output].last_record) &&
!outputs_[output].in_kernel_code) {
++input->instrs_in_quantum;
if (input->instrs_in_quantum > options_.quantum_duration) {
if (input->instrs_in_quantum > options_.quantum_duration_instrs) {
// We again prefer to switch to another input even if the current
// input has the oldest timestamp, prioritizing context switches
// over timestamp ordering.
@@ -3516,7 +3586,10 @@ scheduler_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t output,
input->time_spent_in_quantum += cur_time - input->prev_time_in_quantum;
prev_time_in_quantum = input->prev_time_in_quantum;
input->prev_time_in_quantum = cur_time;
if (input->time_spent_in_quantum >= options_.quantum_duration &&
double elapsed_micros =
static_cast<double>(input->time_spent_in_quantum) /
options_.time_units_per_us;
if (elapsed_micros >= options_.quantum_duration_us &&
// We only switch on instruction boundaries. We could possibly switch
// in between (e.g., scatter/gather long sequence of reads/writes) by
// setting input->switching_pre_instruction.
@@ -3759,13 +3832,14 @@ scheduler_tmpl_t<RecordType, ReaderType>::eof_or_idle(output_ordinal_t output,
outputs_[output].wait_start_time = get_output_time(output);
} else {
uint64_t now = get_output_time(output);
if (now - outputs_[output].wait_start_time >
options_.block_time_max) {
double elapsed_micros = static_cast<double>(now - outputs_[output].wait_start_time) /
options_.time_units_per_us;
if (elapsed_micros > options_.block_time_max_us) {
// XXX i#6822: We may want some other options here for what to
// do. We could release just one input at a time, which would be
// the same scheduling order (as we have FIFO in
// unscheduled_priority_) but may take a long time at
// block_time_max each; we could declare we're done and just
// block_time_max_us each; we could declare we're done and just
// exit, maybe under a flag or if we could see what % of total
// records we've processed.
VPRINT(this, 1,
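For scale, and assuming the elapsed wait is converted from time units to simulated microseconds by dividing by time_units_per_us (as in the quantum accounting above), the -sched_block_max_us default of 2,500 with the -sched_time_units_per_us default of 100 corresponds to:

2,500 simulated us * 100 time units/us = 250,000 time units (wall-clock us) of idling
before the threshold check above triggers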