From 57c8e117f9753ea7a23d5354f0d8ccd45881759f Mon Sep 17 00:00:00 2001 From: Derek Bruening Date: Wed, 28 Aug 2024 10:51:53 -0400 Subject: [PATCH] i#6945: Reduce default block time (#6946) Reduces the scheduler and drmemtrace launcher default values for block_time_scale down to 10 and block_time_max down to 2.5s. This improves the scheduler behavior for small traces under fast analyzers. It seems better to err on the side of faster and let more heavyweight simulations tune the block times for more idle time; otherwise we can end up with local runs and especially new users trying things out and seeing the tool seem to just sit there doing nothing. This reduces the threadsig core-sharded time from a minute and a half down to 10 seconds in local runs (see #6945 for command lines); there is still some idle time in there so it seems a reasonable compromise. Fixes #6945 --- clients/drcachesim/common/options.cpp | 10 +++++----- clients/drcachesim/scheduler/scheduler.cpp | 2 +- clients/drcachesim/scheduler/scheduler.h | 8 ++++++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/clients/drcachesim/common/options.cpp b/clients/drcachesim/common/options.cpp index 64ef61bca38..7922eb3bd51 100644 --- a/clients/drcachesim/common/options.cpp +++ b/clients/drcachesim/common/options.cpp @@ -922,7 +922,7 @@ droption_t op_sched_blocking_switch_us( "-core_serial. "); droption_t<double> op_sched_block_scale( - DROPTION_SCOPE_ALL, "sched_block_scale", 1000., "Input block time scale factor", + DROPTION_SCOPE_ALL, "sched_block_scale", 10., "Input block time scale factor", "The scale applied to the microsecond latency of blocking system calls. A higher " "value here results in blocking syscalls keeping inputs unscheduled for longer. " "This should roughly equal the slowdown of instruction record processing versus the " @@ -934,11 +934,11 @@ droption_t<double> op_sched_block_scale( // finish and the simulation waits for tens of minutes further for a couple of outliers. 
// The cap remains a flag and not a constant as different length traces and different // speed simulators need different idle time ranges, so we need to be able to tune this -// to achieve desired cpu usage targets. The default value was selected while tuning -// a 1-minute-long schedule_stats run on a 112-core 500-thread large application -// to produce good cpu usage without unduly increasing tool runtime. +// to achieve desired cpu usage targets. The default value was selected to avoid unduly +// long idle times with local analyzers; it may need to be increased with more +// heavyweight analyzers/simulators. droption_t<uint64_t> op_sched_block_max_us(DROPTION_SCOPE_ALL, "sched_block_max_us", - 25000000, + 2500000, "Maximum blocked input time, in microseconds", "The maximum blocked time, after scaling with " "-sched_block_scale."); diff --git a/clients/drcachesim/scheduler/scheduler.cpp b/clients/drcachesim/scheduler/scheduler.cpp index 69963ae6e47..ffecb6356fc 100644 --- a/clients/drcachesim/scheduler/scheduler.cpp +++ b/clients/drcachesim/scheduler/scheduler.cpp @@ -2483,7 +2483,7 @@ scheduler_tmpl_t::pop_from_ready_queue( VDO(this, 1, { static int heartbeat; // We are ok with races as the cadence is approximate. - if (++heartbeat % 500 == 0) { + if (++heartbeat % 2000 == 0) { VPRINT(this, 1, "heartbeat[%d] %zd in queue; %d blocked; %zd unscheduled => %d %d\n", for_output, ready_priority_.size(), num_blocked_, diff --git a/clients/drcachesim/scheduler/scheduler.h b/clients/drcachesim/scheduler/scheduler.h index 5f3b2516461..64621c3e592 100644 --- a/clients/drcachesim/scheduler/scheduler.h +++ b/clients/drcachesim/scheduler/scheduler.h @@ -656,8 +656,12 @@ template class scheduler_tmpl_t { * the slowdown of the instruction record processing versus the original * (untraced) application execution. The blocked time is clamped to a maximum * value controlled by #block_time_max. + * + * The default value is meant to be reasonable for simple analyzers. 
It may + * result in too much or too little idle time depending on the analyzer or + * simulator and its speed; it is meant to be tuned and modified. */ - double block_time_scale = 1000.; + double block_time_scale = 10.; /** * The maximum time, in the units explained by #block_time_scale (either * #QUANTUM_TIME simulator time or wall-clock microseconds for @@ -668,7 +672,7 @@ template class scheduler_tmpl_t { * #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE), after this amount of time those * inputs are all re-scheduled. */ - uint64_t block_time_max = 25000000; + uint64_t block_time_max = 2500000; // XXX: Should we share the file-to-reader code currently in the scheduler // with the analyzer and only then need reader interfaces and not pass paths // to the scheduler?