i#6938 sched migrate: Add observed migrations to schedule stats (#7057)
While the schedule_stats tool already reports the migration count from
the scheduler, that count is only non-zero for a dynamic schedule, and
so is always 0 for core-sharded-on-disk traces. We add an "observed
migrations" statistic for which schedule_stats detects migrations on its
own. This requires acquiring a global lock on each context switch, but
experiments show no noticeable overhead on parallel analysis.
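
The detection scheme is simple: keep one global map from (workload, thread)
to the last core it ran on, and at each context switch check whether the
input's recorded core differs from the current shard's core. Below is a
minimal self-contained sketch of that bookkeeping, using the key and hash
shapes from the diff below; the migration_tracker_t wrapper and its names
are illustrative scaffolding, not the tool's actual structure:

#include <cstdint>
#include <functional>
#include <iostream>
#include <mutex>
#include <unordered_map>

// Key and hash mirroring workload_tid_t / workload_tid_hash_t in the diff below.
struct workload_tid_t {
    int64_t workload_id;
    int64_t tid;
    bool
    operator==(const workload_tid_t &rhs) const
    {
        return workload_id == rhs.workload_id && tid == rhs.tid;
    }
};
struct workload_tid_hash_t {
    std::size_t
    operator()(const workload_tid_t &wt) const
    {
        // The extra mix of tid alone keeps the hash asymmetric in its two fields.
        return std::hash<int64_t>()(wt.workload_id ^ wt.tid) ^
            std::hash<int64_t>()(wt.tid);
    }
};

class migration_tracker_t {
public:
    // Called once per context switch observed on core 'core'; returns true
    // if this input was last seen running on a different core.
    bool
    record_switch(int64_t workload_id, int64_t tid, int64_t core)
    {
        std::lock_guard<std::mutex> lock(mutex_);
        workload_tid_t key { workload_id, tid };
        auto it = prev_core_.find(key);
        bool migrated = it != prev_core_.end() && it->second != core;
        prev_core_[key] = core;
        return migrated;
    }

private:
    // Global map from each input to the last core that ran it.
    std::unordered_map<workload_tid_t, int64_t, workload_tid_hash_t> prev_core_;
    std::mutex mutex_;
};

int
main()
{
    migration_tracker_t tracker;
    int64_t migrations = 0;
    // Thread 100 of workload 0 runs on core 0, then on core 1: one migration.
    if (tracker.record_switch(0, 100, 0))
        ++migrations;
    if (tracker.record_switch(0, 100, 1))
        ++migrations;
    std::cout << migrations << " observed migrations\n"; // Prints 1.
    return 0;
}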

Adds some testing.

Issue: #6938
derekbruening authored Oct 28, 2024
1 parent d70ab44 commit d689927
Showing 5 changed files with 81 additions and 6 deletions.
1 change: 1 addition & 0 deletions clients/drcachesim/tests/core_serial.templatex
@@ -21,6 +21,7 @@ Total counts:
161 system calls
2 maybe-blocking system calls
0 direct switch requests
0 observed migrations
0 waits
345686 idles
64.89% cpu busy by record count
1 change: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ Total counts:
161 system calls
2 maybe-blocking system calls
0 direct switch requests
0 observed migrations
0 waits
*[0-9]* idles
*[0-9\.]*% cpu busy by record count
5 changes: 5 additions & 0 deletions clients/drcachesim/tests/schedule_stats_test.cpp
@@ -110,6 +110,7 @@ run_schedule_stats(const std::vector<std::vector<memref_t>> &memrefs)
std::vector<per_core_t> per_core(memrefs.size());
for (int cpu = 0; cpu < static_cast<int>(memrefs.size()); ++cpu) {
per_core[cpu].worker_data = tool.parallel_worker_init(cpu);
per_core[cpu].stream.set_output_cpuid(cpu);
per_core[cpu].shard_data = tool.parallel_shard_init_stream(
cpu, per_core[cpu].worker_data, &per_core[cpu].stream);
}
@@ -121,6 +122,7 @@ run_schedule_stats(const std::vector<std::vector<memref_t>> &memrefs)
continue;
memref_t memref = memrefs[cpu][per_core[cpu].memref_idx];
per_core[cpu].stream.set_tid(memref.instr.tid);
per_core[cpu].stream.set_output_cpuid(cpu);
bool res = tool.parallel_shard_memref(per_core[cpu].shard_data, memref);
assert(res);
++per_core[cpu].memref_idx;
@@ -206,6 +208,8 @@ test_basic_stats()
assert(result.syscalls == 4);
assert(result.maybe_blocking_syscalls == 3);
assert(result.direct_switch_requests == 2);
// 5 migrations: A 0->1; B 1->0; A 1->0; C 0->1; B 0->1.
assert(result.observed_migrations == 5);
assert(result.waits == 3);
assert(result.idle_microseconds == 0);
assert(result.cpu_microseconds > 20);
@@ -265,6 +269,7 @@ test_idle()
assert(result.syscalls == 0);
assert(result.maybe_blocking_syscalls == 0);
assert(result.direct_switch_requests == 0);
assert(result.observed_migrations == 0);
assert(result.waits == 3);
assert(result.idles == 6);
assert(result.idle_microseconds >= 6);
31 changes: 27 additions & 4 deletions clients/drcachesim/tools/schedule_stats.cpp
@@ -204,11 +204,14 @@ schedule_stats_t::update_state_time(per_shard_t *shard, state_t state)
}

void
schedule_stats_t::record_context_switch(per_shard_t *shard, int64_t tid, int64_t input_id,
int64_t letter_ord)
schedule_stats_t::record_context_switch(per_shard_t *shard, int64_t workload_id,
int64_t tid, int64_t input_id, int64_t letter_ord)
{
// We convert to letters which only works well for <=26 inputs.
if (!shard->thread_sequence.empty()) {
if (shard->thread_sequence.empty()) {
std::lock_guard<std::mutex> lock(prev_core_mutex_);
prev_core_[workload_tid_t(workload_id, tid)] = shard->core;
} else {
++shard->counters.total_switches;
if (shard->saw_syscall || shard->saw_exit)
++shard->counters.voluntary_switches;
@@ -217,6 +220,16 @@ schedule_stats_t::record_context_switch(per_shard_t *shard, int64_t tid, int64_t
uint64_t instr_delta = shard->counters.instrs - shard->switch_start_instrs;
shard->counters.instrs_per_switch->add(instr_delta);
shard->switch_start_instrs = shard->counters.instrs;

// Tracking migrations requires a global lock, but taking it just once per
// context switch seems to have negligible performance impact on parallel analysis.
std::lock_guard<std::mutex> lock(prev_core_mutex_);
workload_tid_t workload_tid(workload_id, tid);
auto it = prev_core_.find(workload_tid);
if (it != prev_core_.end() && it->second != shard->core) {
++shard->counters.observed_migrations;
}
prev_core_[workload_tid] = shard->core;
}
shard->thread_sequence +=
THREAD_LETTER_INITIAL_START + static_cast<char>(letter_ord % 26);
@@ -321,7 +334,7 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
if ((workload_id != prev_workload_id || tid != prev_tid) && tid != IDLE_THREAD_ID) {
// See XXX comment in get_scheduler_stats(): this measures swap-ins, while
// "perf" measures swap-outs.
record_context_switch(shard, tid, input_id, letter_ord);
record_context_switch(shard, workload_id, tid, input_id, letter_ord);
}
shard->prev_workload_id = workload_id;
shard->prev_tid = tid;
@@ -429,6 +442,8 @@ schedule_stats_t::print_counters(const counters_t &counters)
<< " maybe-blocking system calls\n";
std::cerr << std::setw(12) << counters.direct_switch_requests
<< " direct switch requests\n";
std::cerr << std::setw(12) << counters.observed_migrations
<< " observed migrations\n";
std::cerr << std::setw(12) << counters.waits << " waits\n";
std::cerr << std::setw(12) << counters.idles << " idles\n";
print_percentage(static_cast<double>(counters.instrs),
@@ -483,6 +498,14 @@ schedule_stats_t::aggregate_results(counters_t &total)
assert(shard.second->counters.direct_switches ==
shard.second->stream->get_schedule_statistic(
memtrace_stream_t::SCHED_STAT_DIRECT_SWITCH_SUCCESSES));
// If the scheduler is counting migrations, it may see more due to inputs
// not yet executed moving among runqueues.
#ifndef NDEBUG
double sched_migrations = shard.second->stream->get_schedule_statistic(
memtrace_stream_t::SCHED_STAT_MIGRATIONS);
#endif
assert(sched_migrations == 0. ||
sched_migrations >= shard.second->counters.observed_migrations);
}
}

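A side note on the #ifndef NDEBUG guard in aggregate_results() above:
sched_migrations is referenced only inside assert(), which compiles away
when NDEBUG is defined, so the guard prevents an unused-variable warning
(and a wasted query) in release builds. A minimal sketch of the same
pattern, with illustrative names:

#include <cassert>

static double
expensive_stat_query()
{
    return 42.;
}

static void
validate()
{
#ifndef NDEBUG
    // Declared (and computed) only in debug builds; assert() expands to
    // nothing under NDEBUG, so without the guard this would be an unused
    // variable in release builds.
    double stat = expensive_stat_query();
#endif
    assert(stat >= 0.);
}

int
main()
{
    validate();
    return 0;
}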
49 changes: 47 additions & 2 deletions clients/drcachesim/tools/schedule_stats.h
@@ -41,6 +41,7 @@
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
@@ -156,6 +157,7 @@ class schedule_stats_t : public analysis_tool_t {
syscalls += rhs.syscalls;
maybe_blocking_syscalls += rhs.maybe_blocking_syscalls;
direct_switch_requests += rhs.direct_switch_requests;
observed_migrations += rhs.observed_migrations;
waits += rhs.waits;
idles += rhs.idles;
idle_microseconds += rhs.idle_microseconds;
@@ -185,6 +187,12 @@ class schedule_stats_t : public analysis_tool_t {
int64_t syscalls = 0;
int64_t maybe_blocking_syscalls = 0;
int64_t direct_switch_requests = 0;
// Our observed migrations will be <= the scheduler's reported migrations
// for a dynamic schedule, as we don't know the initial runqueue allocation
// and so can't see the migration of an input that has not yet executed in
// the trace. For replayed schedules (including core-sharded-on-disk), the
// scheduler does not provide any data, so this stat is the only source there.
int64_t observed_migrations = 0;
int64_t waits = 0;
int64_t idles = 0;
uint64_t idle_microseconds = 0;
@@ -194,6 +202,39 @@
std::unordered_set<memref_tid_t> threads;
std::unique_ptr<histogram_interface_t> instrs_per_switch;
};

struct workload_tid_t {
workload_tid_t(int64_t workload, int64_t thread)
: workload_id(workload)
, tid(thread)
{
}
bool
operator==(const workload_tid_t &rhs) const
{
return workload_id == rhs.workload_id && tid == rhs.tid;
}

bool
operator!=(const workload_tid_t &rhs) const
{
return !(*this == rhs);
}
int64_t workload_id;
int64_t tid;
};

struct workload_tid_hash_t {
std::size_t
operator()(const workload_tid_t &wt) const
{
// Ensure {workload_id=X, tid=Y} doesn't always hash the same as
// {workload_id=Y, tid=X} by avoiding a simple symmetric wid^tid.
return std::hash<size_t>()(static_cast<size_t>(wt.workload_id ^ wt.tid)) ^
std::hash<memref_tid_t>()(wt.tid);
}
};

counters_t
get_total_counts();

@@ -244,8 +285,8 @@ class schedule_stats_t : public analysis_tool_t {
update_state_time(per_shard_t *shard, state_t state);

void
record_context_switch(per_shard_t *shard, int64_t tid, int64_t input_id,
int64_t letter_ord);
record_context_switch(per_shard_t *shard, int64_t workload_id, int64_t tid,
int64_t input_id, int64_t letter_ord);

virtual void
aggregate_results(counters_t &total);
Expand All @@ -263,6 +304,10 @@ class schedule_stats_t : public analysis_tool_t {
std::mutex shard_map_mutex_;
static const std::string TOOL_NAME;
memtrace_stream_t *serial_stream_ = nullptr;
// To track migrations we unfortunately need a global synchronized map to
// remember the last core for each input.
std::unordered_map<workload_tid_t, int64_t, workload_tid_hash_t> prev_core_;
std::mutex prev_core_mutex_;
};

} // namespace drmemtrace
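Regarding the workload_tid_hash_t comment above: a hash built solely from
workload_id ^ tid is symmetric in its two fields, so the distinct keys
{workload_id=X, tid=Y} and {workload_id=Y, tid=X} would always collide.
A small standalone check (not part of the commit) illustrating why the
extra ^ hash(tid) term helps:

#include <cstdint>
#include <functional>
#include <iostream>

int
main()
{
    int64_t x = 3, y = 7;
    // Symmetric combination: swapping the fields always yields the same hash.
    std::size_t sym_xy = std::hash<int64_t>()(x ^ y);
    std::size_t sym_yx = std::hash<int64_t>()(y ^ x);
    std::cout << (sym_xy == sym_yx) << "\n"; // Always prints 1: guaranteed collision.
    // Mixing in the tid by itself breaks the structural symmetry.
    std::size_t mix_xy = std::hash<int64_t>()(x ^ y) ^ std::hash<int64_t>()(y);
    std::size_t mix_yx = std::hash<int64_t>()(y ^ x) ^ std::hash<int64_t>()(x);
    // No longer identical by construction (though equality for some particular
    // pair remains possible, as with any hash).
    std::cout << (mix_xy == mix_yx) << "\n"; // Typically prints 0.
    return 0;
}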
