i#6938 sched migrate: Add observed migrations to schedule stats (#7057)
While the schedule_stats tool already reports the migration count from
the scheduler, that count is only non-zero for a dynamic schedule, and
so is always 0 for core-sharded-on-disk traces. We add an "observed
migrations" statistic for which schedule_stats detects migrations on its
own. This requires acquiring a global lock on each context switch, but
experiments show no noticeable overhead on parallel analysis.
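
The detection scheme is simple: keep one global map from (workload, thread)
to the last core it ran on, and at each context switch check whether the
input's recorded core differs from the current shard's core. Below is a
minimal self-contained sketch of that bookkeeping, using the key and hash
shapes from the diff below; the migration_tracker_t wrapper and its names
are illustrative scaffolding, not the tool's actual structure:

#include <cstdint>
#include <functional>
#include <iostream>
#include <mutex>
#include <unordered_map>

// Key and hash mirroring workload_tid_t / workload_tid_hash_t in the diff below.
struct workload_tid_t {
    int64_t workload_id;
    int64_t tid;
    bool
    operator==(const workload_tid_t &rhs) const
    {
        return workload_id == rhs.workload_id && tid == rhs.tid;
    }
};
struct workload_tid_hash_t {
    std::size_t
    operator()(const workload_tid_t &wt) const
    {
        // The extra mix of tid alone keeps the hash asymmetric in its two fields.
        return std::hash<int64_t>()(wt.workload_id ^ wt.tid) ^
            std::hash<int64_t>()(wt.tid);
    }
};

class migration_tracker_t {
public:
    // Called once per context switch observed on core 'core'; returns true
    // if this input was last seen running on a different core.
    bool
    record_switch(int64_t workload_id, int64_t tid, int64_t core)
    {
        std::lock_guard<std::mutex> lock(mutex_);
        workload_tid_t key { workload_id, tid };
        auto it = prev_core_.find(key);
        bool migrated = it != prev_core_.end() && it->second != core;
        prev_core_[key] = core;
        return migrated;
    }

private:
    // Global map from each input to the last core that ran it.
    std::unordered_map<workload_tid_t, int64_t, workload_tid_hash_t> prev_core_;
    std::mutex mutex_;
};

int
main()
{
    migration_tracker_t tracker;
    int64_t migrations = 0;
    // Thread 100 of workload 0 runs on core 0, then on core 1: one migration.
    if (tracker.record_switch(0, 100, 0))
        ++migrations;
    if (tracker.record_switch(0, 100, 1))
        ++migrations;
    std::cout << migrations << " observed migrations\n"; // Prints 1.
    return 0;
}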

Adds some testing.

Issue: #6938
derekbruening authored Oct 28, 2024
1 parent d70ab44 commit d689927
Showing 5 changed files with 81 additions and 6 deletions.
1 change: 1 addition & 0 deletions clients/drcachesim/tests/core_serial.templatex
@@ -21,6 +21,7 @@ Total counts:
161 system calls
2 maybe-blocking system calls
0 direct switch requests
0 observed migrations
0 waits
345686 idles
64.89% cpu busy by record count
1 change: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ Total counts:
161 system calls
2 maybe-blocking system calls
0 direct switch requests
0 observed migrations
0 waits
*[0-9]* idles
*[0-9\.]*% cpu busy by record count
5 changes: 5 additions & 0 deletions clients/drcachesim/tests/schedule_stats_test.cpp
@@ -110,6 +110,7 @@ run_schedule_stats(const std::vector<std::vector<memref_t>> &memrefs)
std::vector<per_core_t> per_core(memrefs.size());
for (int cpu = 0; cpu < static_cast<int>(memrefs.size()); ++cpu) {
per_core[cpu].worker_data = tool.parallel_worker_init(cpu);
per_core[cpu].stream.set_output_cpuid(cpu);
per_core[cpu].shard_data = tool.parallel_shard_init_stream(
cpu, per_core[cpu].worker_data, &per_core[cpu].stream);
}
@@ -121,6 +122,7 @@ run_schedule_stats(const std::vector<std::vector<memref_t>> &memrefs)
continue;
memref_t memref = memrefs[cpu][per_core[cpu].memref_idx];
per_core[cpu].stream.set_tid(memref.instr.tid);
per_core[cpu].stream.set_output_cpuid(cpu);
bool res = tool.parallel_shard_memref(per_core[cpu].shard_data, memref);
assert(res);
++per_core[cpu].memref_idx;
@@ -206,6 +208,8 @@ test_basic_stats()
assert(result.syscalls == 4);
assert(result.maybe_blocking_syscalls == 3);
assert(result.direct_switch_requests == 2);
// 5 migrations: A 0->1; B 1->0; A 1->0; C 0->1; B 0->1.
assert(result.observed_migrations == 5);
assert(result.waits == 3);
assert(result.idle_microseconds == 0);
assert(result.cpu_microseconds > 20);
@@ -265,6 +269,7 @@ test_idle()
assert(result.syscalls == 0);
assert(result.maybe_blocking_syscalls == 0);
assert(result.direct_switch_requests == 0);
assert(result.observed_migrations == 0);
assert(result.waits == 3);
assert(result.idles == 6);
assert(result.idle_microseconds >= 6);
31 changes: 27 additions & 4 deletions clients/drcachesim/tools/schedule_stats.cpp
@@ -204,11 +204,14 @@ schedule_stats_t::update_state_time(per_shard_t *shard, state_t state)
}

void
schedule_stats_t::record_context_switch(per_shard_t *shard, int64_t tid, int64_t input_id,
int64_t letter_ord)
schedule_stats_t::record_context_switch(per_shard_t *shard, int64_t workload_id,
int64_t tid, int64_t input_id, int64_t letter_ord)
{
// We convert to letters which only works well for <=26 inputs.
if (!shard->thread_sequence.empty()) {
if (shard->thread_sequence.empty()) {
std::lock_guard<std::mutex> lock(prev_core_mutex_);
prev_core_[workload_tid_t(workload_id, tid)] = shard->core;
} else {
++shard->counters.total_switches;
if (shard->saw_syscall || shard->saw_exit)
++shard->counters.voluntary_switches;
@@ -217,6 +220,16 @@ schedule_stats_t::record_context_switch(per_shard_t *shard, int64_t tid, int64_t
uint64_t instr_delta = shard->counters.instrs - shard->switch_start_instrs;
shard->counters.instrs_per_switch->add(instr_delta);
shard->switch_start_instrs = shard->counters.instrs;

// Tracking migrations requires a global lock, but taking it just once per
// context switch seems to have negligible performance impact on parallel analysis.
std::lock_guard<std::mutex> lock(prev_core_mutex_);
workload_tid_t workload_tid(workload_id, tid);
auto it = prev_core_.find(workload_tid);
if (it != prev_core_.end() && it->second != shard->core) {
++shard->counters.observed_migrations;
}
prev_core_[workload_tid] = shard->core;
}
shard->thread_sequence +=
THREAD_LETTER_INITIAL_START + static_cast<char>(letter_ord % 26);
@@ -321,7 +334,7 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
if ((workload_id != prev_workload_id || tid != prev_tid) && tid != IDLE_THREAD_ID) {
// See XXX comment in get_scheduler_stats(): this measures swap-ins, while
// "perf" measures swap-outs.
record_context_switch(shard, tid, input_id, letter_ord);
record_context_switch(shard, workload_id, tid, input_id, letter_ord);
}
shard->prev_workload_id = workload_id;
shard->prev_tid = tid;
@@ -429,6 +442,8 @@ schedule_stats_t::print_counters(const counters_t &counters)
<< " maybe-blocking system calls\n";
std::cerr << std::setw(12) << counters.direct_switch_requests
<< " direct switch requests\n";
std::cerr << std::setw(12) << counters.observed_migrations
<< " observed migrations\n";
std::cerr << std::setw(12) << counters.waits << " waits\n";
std::cerr << std::setw(12) << counters.idles << " idles\n";
print_percentage(static_cast<double>(counters.instrs),
@@ -483,6 +498,14 @@ schedule_stats_t::aggregate_results(counters_t &total)
assert(shard.second->counters.direct_switches ==
shard.second->stream->get_schedule_statistic(
memtrace_stream_t::SCHED_STAT_DIRECT_SWITCH_SUCCESSES));
// If the scheduler is counting migrations, it may see more due to inputs
// not yet executed moving among runqueues.
#ifndef NDEBUG
double sched_migrations = shard.second->stream->get_schedule_statistic(
memtrace_stream_t::SCHED_STAT_MIGRATIONS);
#endif
assert(sched_migrations == 0. ||
sched_migrations >= shard.second->counters.observed_migrations);
}
}

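A side note on the #ifndef NDEBUG guard in aggregate_results() above:
sched_migrations is referenced only inside assert(), which compiles away
when NDEBUG is defined, so the guard prevents an unused-variable warning
(and a wasted query) in release builds. A minimal sketch of the same
pattern, with illustrative names:

#include <cassert>

static double
expensive_stat_query()
{
    return 42.;
}

static void
validate()
{
#ifndef NDEBUG
    // Declared (and computed) only in debug builds; assert() expands to
    // nothing under NDEBUG, so without the guard this would be an unused
    // variable in release builds.
    double stat = expensive_stat_query();
#endif
    assert(stat >= 0.);
}

int
main()
{
    validate();
    return 0;
}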
49 changes: 47 additions & 2 deletions clients/drcachesim/tools/schedule_stats.h
@@ -41,6 +41,7 @@
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
@@ -156,6 +157,7 @@ class schedule_stats_t : public analysis_tool_t {
syscalls += rhs.syscalls;
maybe_blocking_syscalls += rhs.maybe_blocking_syscalls;
direct_switch_requests += rhs.direct_switch_requests;
observed_migrations += rhs.observed_migrations;
waits += rhs.waits;
idles += rhs.idles;
idle_microseconds += rhs.idle_microseconds;
@@ -185,6 +187,12 @@ class schedule_stats_t : public analysis_tool_t {
int64_t syscalls = 0;
int64_t maybe_blocking_syscalls = 0;
int64_t direct_switch_requests = 0;
// Our observed migrations will be <= the scheduler's reported migrations
// for a dynamic schedule, as we don't know the initial runqueue allocation
// and so can't see the migration of an input that has not yet executed in
// the trace. For replayed schedules (including core-sharded-on-disk), the
// scheduler does not provide any data, so this stat is the only source there.
int64_t observed_migrations = 0;
int64_t waits = 0;
int64_t idles = 0;
uint64_t idle_microseconds = 0;
@@ -194,6 +202,39 @@
std::unordered_set<memref_tid_t> threads;
std::unique_ptr<histogram_interface_t> instrs_per_switch;
};

struct workload_tid_t {
workload_tid_t(int64_t workload, int64_t thread)
: workload_id(workload)
, tid(thread)
{
}
bool
operator==(const workload_tid_t &rhs) const
{
return workload_id == rhs.workload_id && tid == rhs.tid;
}

bool
operator!=(const workload_tid_t &rhs) const
{
return !(*this == rhs);
}
int64_t workload_id;
int64_t tid;
};

struct workload_tid_hash_t {
std::size_t
operator()(const workload_tid_t &wt) const
{
// Ensure {workload_id=X, tid=Y} doesn't always hash the same as
// {workload_id=Y, tid=X} by avoiding a simple symmetric wid^tid.
return std::hash<size_t>()(static_cast<size_t>(wt.workload_id ^ wt.tid)) ^
std::hash<memref_tid_t>()(wt.tid);
}
};

counters_t
get_total_counts();

@@ -244,8 +285,8 @@ class schedule_stats_t : public analysis_tool_t {
update_state_time(per_shard_t *shard, state_t state);

void
record_context_switch(per_shard_t *shard, int64_t tid, int64_t input_id,
int64_t letter_ord);
record_context_switch(per_shard_t *shard, int64_t workload_id, int64_t tid,
int64_t input_id, int64_t letter_ord);

virtual void
aggregate_results(counters_t &total);
Expand All @@ -263,6 +304,10 @@ class schedule_stats_t : public analysis_tool_t {
std::mutex shard_map_mutex_;
static const std::string TOOL_NAME;
memtrace_stream_t *serial_stream_ = nullptr;
// To track migrations we unfortunately need a global synchronized map to
// remember the last core for each input.
std::unordered_map<workload_tid_t, int64_t, workload_tid_hash_t> prev_core_;
std::mutex prev_core_mutex_;
};

} // namespace drmemtrace
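Regarding the workload_tid_hash_t comment above: a hash built solely from
workload_id ^ tid is symmetric in its two fields, so the distinct keys
{workload_id=X, tid=Y} and {workload_id=Y, tid=X} would always collide.
A small standalone check (not part of the commit) illustrating why the
extra ^ hash(tid) term helps:

#include <cstdint>
#include <functional>
#include <iostream>

int
main()
{
    int64_t x = 3, y = 7;
    // Symmetric combination: swapping the fields always yields the same hash.
    std::size_t sym_xy = std::hash<int64_t>()(x ^ y);
    std::size_t sym_yx = std::hash<int64_t>()(y ^ x);
    std::cout << (sym_xy == sym_yx) << "\n"; // Always prints 1: guaranteed collision.
    // Mixing in the tid by itself breaks the structural symmetry.
    std::size_t mix_xy = std::hash<int64_t>()(x ^ y) ^ std::hash<int64_t>()(y);
    std::size_t mix_yx = std::hash<int64_t>()(y ^ x) ^ std::hash<int64_t>()(x);
    // No longer identical by construction (though equality for some particular
    // pair remains possible, as with any hash).
    std::cout << (mix_xy == mix_yx) << "\n"; // Typically prints 0.
    return 0;
}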
