From f8f1efc8eaf75f944f160a42b926eac91e4f8ce5 Mon Sep 17 00:00:00 2001
From: Derek Bruening <bruening@google.com>
Date: Thu, 24 Oct 2024 23:51:39 -0400
Subject: [PATCH] i#6938 sched migrate: Add observed migrations to schedule
 stats

While the schedule_stats tool already reports the migration count from the
scheduler, that is only non-zero for a dynamic schedule, resulting in 0 for
core-sharded-on-disk traces.  We add "observed migrations" where
schedule_stats looks for migrations on its own.  This requires a global
lock on each context switch, but experimental results show that this does
not seem to cause noticeable overhead.

Adds some testing.

Issue: #6938
---
 .../drcachesim/tests/core_serial.templatex    |  1 +
 .../tests/schedule_stats_nopreempt.templatex  |  1 +
 .../drcachesim/tests/schedule_stats_test.cpp  |  5 ++
 clients/drcachesim/tools/schedule_stats.cpp   | 31 +++++++++++--
 clients/drcachesim/tools/schedule_stats.h     | 46 ++++++++++++++++++-
 5 files changed, 78 insertions(+), 6 deletions(-)

diff --git a/clients/drcachesim/tests/core_serial.templatex b/clients/drcachesim/tests/core_serial.templatex
index 8b8520b5225..4efffd16e98 100644
--- a/clients/drcachesim/tests/core_serial.templatex
+++ b/clients/drcachesim/tests/core_serial.templatex
@@ -19,6 +19,7 @@ Total counts:
          161 system calls
            2 maybe-blocking system calls
            0 direct switch requests
+           0 observed migrations
            0 waits
       345686 idles
       64.89% cpu busy by record count
diff --git a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex
index 29a7eb3291b..204dbf41756 100644
--- a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex
+++ b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex
@@ -19,6 +19,7 @@ Total counts:
          161 system calls
            2 maybe-blocking system calls
            0 direct switch requests
+           0 observed migrations
            0 waits
 *[0-9]* idles
 *[0-9\.]*% cpu busy by record count
diff --git a/clients/drcachesim/tests/schedule_stats_test.cpp b/clients/drcachesim/tests/schedule_stats_test.cpp
index 0f5f57cdc2f..75f7db93d9e 100644
--- a/clients/drcachesim/tests/schedule_stats_test.cpp
+++ b/clients/drcachesim/tests/schedule_stats_test.cpp
@@ -110,6 +110,7 @@ run_schedule_stats(const std::vector<std::vector<memref_t>> &memrefs)
     std::vector<per_core_t> per_core(memrefs.size());
     for (int cpu = 0; cpu < static_cast<int>(memrefs.size()); ++cpu) {
         per_core[cpu].worker_data = tool.parallel_worker_init(cpu);
+        per_core[cpu].stream.set_output_cpuid(cpu);
         per_core[cpu].shard_data = tool.parallel_shard_init_stream(
             cpu, per_core[cpu].worker_data, &per_core[cpu].stream);
     }
@@ -121,6 +122,7 @@ run_schedule_stats(const std::vector<std::vector<memref_t>> &memrefs)
             continue;
         memref_t memref = memrefs[cpu][per_core[cpu].memref_idx];
         per_core[cpu].stream.set_tid(memref.instr.tid);
+        per_core[cpu].stream.set_output_cpuid(cpu);
         bool res = tool.parallel_shard_memref(per_core[cpu].shard_data, memref);
         assert(res);
         ++per_core[cpu].memref_idx;
@@ -206,6 +208,8 @@ test_basic_stats()
     assert(result.syscalls == 4);
     assert(result.maybe_blocking_syscalls == 3);
     assert(result.direct_switch_requests == 2);
+    // 5 migrations: A 0->1; B 1->0; A 1->0; C 0->1; B 0->1.
+    assert(result.observed_migrations == 5);
     assert(result.waits == 3);
     assert(result.idle_microseconds == 0);
     assert(result.cpu_microseconds > 20);
@@ -265,6 +269,7 @@ test_idle()
     assert(result.syscalls == 0);
     assert(result.maybe_blocking_syscalls == 0);
     assert(result.direct_switch_requests == 0);
+    assert(result.observed_migrations == 0);
     assert(result.waits == 3);
     assert(result.idles == 6);
     assert(result.idle_microseconds >= 6);
diff --git a/clients/drcachesim/tools/schedule_stats.cpp b/clients/drcachesim/tools/schedule_stats.cpp
index 333ac926960..3a71a89f5fb 100644
--- a/clients/drcachesim/tools/schedule_stats.cpp
+++ b/clients/drcachesim/tools/schedule_stats.cpp
@@ -200,11 +200,14 @@ schedule_stats_t::update_state_time(per_shard_t *shard, state_t state)
 }
 
 void
-schedule_stats_t::record_context_switch(per_shard_t *shard, int64_t tid, int64_t input_id,
-                                        int64_t letter_ord)
+schedule_stats_t::record_context_switch(per_shard_t *shard, int64_t workload_id,
+                                        int64_t tid, int64_t input_id, int64_t letter_ord)
 {
     // We convert to letters which only works well for <=26 inputs.
-    if (!shard->thread_sequence.empty()) {
+    if (shard->thread_sequence.empty()) {
+        std::lock_guard<std::mutex> lock(prev_core_mutex_);
+        prev_core_[workload_tid_t(workload_id, tid)] = shard->core;
+    } else {
         ++shard->counters.total_switches;
         if (shard->saw_syscall || shard->saw_exit)
             ++shard->counters.voluntary_switches;
@@ -213,6 +216,16 @@ schedule_stats_t::record_context_switch(per_shard_t *shard, int64_t
         uint64_t instr_delta = shard->counters.instrs - shard->switch_start_instrs;
         shard->counters.instrs_per_switch->add(instr_delta);
         shard->switch_start_instrs = shard->counters.instrs;
+
+        // Tracking migrations requires a global lock but just once per context
+        // switch seems to have negligible performance impact on parallel analysis.
+        std::lock_guard<std::mutex> lock(prev_core_mutex_);
+        workload_tid_t workload_tid(workload_id, tid);
+        auto it = prev_core_.find(workload_tid);
+        if (it != prev_core_.end() && it->second != shard->core) {
+            ++shard->counters.observed_migrations;
+        }
+        prev_core_[workload_tid] = shard->core;
     }
     shard->thread_sequence +=
         THREAD_LETTER_INITIAL_START + static_cast<char>(letter_ord % 26);
@@ -317,7 +330,7 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
     if ((workload_id != prev_workload_id || tid != prev_tid) && tid != IDLE_THREAD_ID) {
         // See XXX comment in get_scheduler_stats(): this measures swap-ins, while
         // "perf" measures swap-outs.
-        record_context_switch(shard, tid, input_id, letter_ord);
+        record_context_switch(shard, workload_id, tid, input_id, letter_ord);
     }
     shard->prev_workload_id = workload_id;
     shard->prev_tid = tid;
@@ -423,6 +436,8 @@ schedule_stats_t::print_counters(const counters_t &counters)
               << " maybe-blocking system calls\n";
     std::cerr << std::setw(12) << counters.direct_switch_requests
               << " direct switch requests\n";
+    std::cerr << std::setw(12) << counters.observed_migrations
+              << " observed migrations\n";
     std::cerr << std::setw(12) << counters.waits << " waits\n";
     std::cerr << std::setw(12) << counters.idles << " idles\n";
     print_percentage(static_cast<double>(counters.instrs),
@@ -477,6 +492,14 @@ schedule_stats_t::aggregate_results(counters_t &total)
         assert(shard.second->counters.direct_switches ==
                shard.second->stream->get_schedule_statistic(
                    memtrace_stream_t::SCHED_STAT_DIRECT_SWITCH_SUCCESSES));
+        // If the scheduler is counting migrations, it may see more due to inputs
+        // not yet executed moving among runqueues.
+#ifndef NDEBUG
+        int64_t sched_migrations = shard.second->stream->get_schedule_statistic(
+            memtrace_stream_t::SCHED_STAT_MIGRATIONS);
+#endif
+        assert(sched_migrations == 0 ||
+               sched_migrations > shard.second->counters.observed_migrations);
     }
 }
 
diff --git a/clients/drcachesim/tools/schedule_stats.h b/clients/drcachesim/tools/schedule_stats.h
index 61e175a4c5e..f4eb8d71068 100644
--- a/clients/drcachesim/tools/schedule_stats.h
+++ b/clients/drcachesim/tools/schedule_stats.h
@@ -41,6 +41,7 @@
 #include <memory>
 #include <mutex>
 #include <string>
+#include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
@@ -154,6 +155,7 @@ class schedule_stats_t : public analysis_tool_t {
             syscalls += rhs.syscalls;
             maybe_blocking_syscalls += rhs.maybe_blocking_syscalls;
             direct_switch_requests += rhs.direct_switch_requests;
+            observed_migrations += rhs.observed_migrations;
             waits += rhs.waits;
             idles += rhs.idles;
             idle_microseconds += rhs.idle_microseconds;
@@ -181,6 +183,12 @@ class schedule_stats_t : public analysis_tool_t {
         int64_t syscalls = 0;
         int64_t maybe_blocking_syscalls = 0;
         int64_t direct_switch_requests = 0;
+        // Our observed migrations will be <= the scheduler's reported migrations
+        // for a dynamic schedule as we don't know the initial runqueue allocation
+        // and so can't see the migration of an input that didn't execute in the
+        // trace yet.  For replayed (including core-sharded-on-disk) the scheduler
+        // does not provide any data and so this stat is required there.
+        int64_t observed_migrations = 0;
         int64_t waits = 0;
         int64_t idles = 0;
         uint64_t idle_microseconds = 0;
@@ -190,6 +198,36 @@ class schedule_stats_t : public analysis_tool_t {
         std::unordered_set<memref_tid_t> threads;
        std::unique_ptr<histogram_interface_t> instrs_per_switch;
     };
+
+    struct workload_tid_t {
+        workload_tid_t(int64_t workload, int64_t thread)
+            : workload_id(workload)
+            , tid(thread)
+        {
+        }
+        bool
+        operator==(const workload_tid_t &rhs) const
+        {
+            return workload_id == rhs.workload_id && tid == rhs.tid;
+        }
+
+        bool
+        operator!=(const workload_tid_t &rhs) const
+        {
+            return !(*this == rhs);
+        }
+        int64_t workload_id;
+        int64_t tid;
+    };
+
+    struct workload_tid_hash_t {
+        std::size_t
+        operator()(const workload_tid_t &wt) const
+        {
+            return std::hash<int64_t>()(wt.workload_id) ^ std::hash<int64_t>()(wt.tid);
+        }
+    };
+
     counters_t
     get_total_counts();
@@ -240,8 +278,8 @@ class schedule_stats_t : public analysis_tool_t {
     update_state_time(per_shard_t *shard, state_t state);
 
     void
-    record_context_switch(per_shard_t *shard, int64_t tid, int64_t input_id,
-                          int64_t letter_ord);
+    record_context_switch(per_shard_t *shard, int64_t workload_id, int64_t tid,
+                          int64_t input_id, int64_t letter_ord);
 
     virtual void
     aggregate_results(counters_t &total);
@@ -259,6 +297,10 @@ class schedule_stats_t : public analysis_tool_t {
     std::mutex shard_map_mutex_;
     static const std::string TOOL_NAME;
     memtrace_stream_t *serial_stream_ = nullptr;
+    // To track migrations we unfortunately need a global synchronized map to
+    // remember the last core for each input.
+    std::unordered_map<workload_tid_t, int64_t, workload_tid_hash_t> prev_core_;
+    std::mutex prev_core_mutex_;
 };
 
 } // namespace drmemtrace