diff --git a/clients/drcachesim/tests/core_serial.templatex b/clients/drcachesim/tests/core_serial.templatex index 74ca54ac8db..fbf81c013f9 100644 --- a/clients/drcachesim/tests/core_serial.templatex +++ b/clients/drcachesim/tests/core_serial.templatex @@ -22,6 +22,9 @@ Total counts: [0-9 ]* idle microseconds at last instr [0-9\. ]*% cpu busy by time [0-9\. ]*% cpu busy by time, ignoring idle past last instr + Instructions per context switch histogram: + 0.. 50000 2 + 50000.. 100000 4 Core #0 counts: .* Core #1 counts: diff --git a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex index 95a501f8218..3374497d90d 100644 --- a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex +++ b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex @@ -22,6 +22,9 @@ Total counts: *[0-9]* idle microseconds at last instr *[0-9\.]*% cpu busy by time *[0-9\.]*% cpu busy by time, ignoring idle past last instr + Instructions per context switch histogram: + 0.. 50000 2 + 50000.. 100000 4 Core #0 counts: . threads: 1257.* *[0-9]* instructions @@ -36,14 +39,7 @@ Core #0 counts: . maybe-blocking system calls 0 direct switch requests 0 waits - *[0-9]* idles - *[0-9\.]*% cpu busy by record count - *[0-9]* cpu microseconds - *[0-9]* wait microseconds - *[0-9]* idle microseconds - *[0-9]* idle microseconds at last instr - *[0-9\.]*% cpu busy by time - *[0-9\.]*% cpu busy by time, ignoring idle past last instr +.* Core #1 counts: . threads: 1257.* *[0-9]* instructions @@ -58,14 +54,7 @@ Core #1 counts: . maybe-blocking system calls 0 direct switch requests 0 waits - *[0-9]* idles - *[0-9\.]*% cpu busy by record count - *[0-9]* cpu microseconds - *[0-9]* wait microseconds - *[0-9]* idle microseconds - *[0-9]* idle microseconds at last instr - *[0-9\.]*% cpu busy by time - *[0-9\.]*% cpu busy by time, ignoring idle past last instr +.* Core #2 counts: . threads: 1257.* *[0-9]* instructions @@ -80,14 +69,7 @@ Core #2 counts: . maybe-blocking system calls 0 direct switch requests 0 waits - *[0-9]* idles - *[0-9\.]*% cpu busy by record count - *[0-9]* cpu microseconds - *[0-9]* wait microseconds - *[0-9]* idle microseconds - *[0-9]* idle microseconds at last instr - *[0-9\.]*% cpu busy by time - *[0-9\.]*% cpu busy by time, ignoring idle past last instr +.* Core #3 counts: . threads: 1257.* *[0-9]* instructions @@ -102,14 +84,7 @@ Core #3 counts: . maybe-blocking system calls 0 direct switch requests 0 waits - *[0-9]* idles - *[0-9\.]*% cpu busy by record count - *[0-9]* cpu microseconds - *[0-9]* wait microseconds - *[0-9]* idle microseconds - *[0-9]* idle microseconds at last instr - *[0-9\.]*% cpu busy by time - *[0-9\.]*% cpu busy by time, ignoring idle past last instr +.* Core #0 schedule: [A-Ha-h_]* Core #1 schedule: [A-Ha-h_]* Core #2 schedule: [A-Ha-h_]* diff --git a/clients/drcachesim/tools/schedule_stats.cpp b/clients/drcachesim/tools/schedule_stats.cpp index d858037842e..47637430bbe 100644 --- a/clients/drcachesim/tools/schedule_stats.cpp +++ b/clients/drcachesim/tools/schedule_stats.cpp @@ -174,13 +174,46 @@ schedule_stats_t::update_state_time(per_shard_t *shard, state_t state) return true; } +void +schedule_stats_t::record_context_switch(per_shard_t *shard, int64_t tid, int64_t input_id, + int64_t letter_ord) +{ + // We convert to letters which only works well for <=26 inputs. + if (!shard->thread_sequence.empty()) { + ++shard->counters.total_switches; + if (shard->saw_syscall || shard->saw_exit) + ++shard->counters.voluntary_switches; + if (shard->direct_switch_target == tid) + ++shard->counters.direct_switches; + uint64_t instr_delta = shard->counters.instrs - shard->switch_start_instrs; + shard->counters.instrs_per_switch->add(instr_delta); + shard->switch_start_instrs = shard->counters.instrs; + } + shard->thread_sequence += + THREAD_LETTER_INITIAL_START + static_cast(letter_ord % 26); + shard->cur_segment_instrs = 0; + if (knob_verbose_ >= 2) { + std::ostringstream line; + line << "Core #" << std::setw(2) << shard->core << " @" << std::setw(9) + << shard->stream->get_record_ordinal() << " refs, " << std::setw(9) + << shard->stream->get_instruction_ordinal() << " instrs: input " + << std::setw(4) << input_id << " @" << std::setw(9) + << shard->stream->get_input_interface()->get_record_ordinal() << " refs, " + << std::setw(9) + << shard->stream->get_input_interface()->get_instruction_ordinal() + << " instrs, time " + << std::setw(16) + // TODO i#5843: For time quanta, provide some way to get the + // latest time and print that here instead of the timestamp? + << shard->stream->get_input_interface()->get_last_timestamp() + << " == thread " << tid << "\n"; + std::cerr << line.str(); + } +} + bool schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref) { - static constexpr char THREAD_LETTER_INITIAL_START = 'A'; - static constexpr char THREAD_LETTER_SUBSEQUENT_START = 'a'; - static constexpr char WAIT_SYMBOL = '-'; - static constexpr char IDLE_SYMBOL = '_'; per_shard_t *shard = reinterpret_cast(shard_data); int64_t input_id = shard->stream->get_input_id(); if (knob_verbose_ >= 4) { @@ -257,34 +290,7 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref ? tid : input_id; if ((workload_id != prev_workload_id || tid != prev_tid) && tid != IDLE_THREAD_ID) { - // We convert to letters which only works well for <=26 inputs. - if (!shard->thread_sequence.empty()) { - ++shard->counters.total_switches; - if (shard->saw_syscall || shard->saw_exit) - ++shard->counters.voluntary_switches; - if (shard->direct_switch_target == memref.marker.tid) - ++shard->counters.direct_switches; - } - shard->thread_sequence += - THREAD_LETTER_INITIAL_START + static_cast(letter_ord % 26); - shard->cur_segment_instrs = 0; - if (knob_verbose_ >= 2) { - std::ostringstream line; - line << "Core #" << std::setw(2) << shard->core << " @" << std::setw(9) - << shard->stream->get_record_ordinal() << " refs, " << std::setw(9) - << shard->stream->get_instruction_ordinal() << " instrs: input " - << std::setw(4) << input_id << " @" << std::setw(9) - << shard->stream->get_input_interface()->get_record_ordinal() - << " refs, " << std::setw(9) - << shard->stream->get_input_interface()->get_instruction_ordinal() - << " instrs, time " - << std::setw(16) - // TODO i#5843: For time quanta, provide some way to get the - // latest time and print that here instead of the the timestamp? - << shard->stream->get_input_interface()->get_last_timestamp() - << " == thread " << memref.instr.tid << "\n"; - std::cerr << line.str(); - } + record_context_switch(shard, tid, input_id, letter_ord); } shard->prev_workload_id = workload_id; shard->prev_tid = tid; @@ -396,6 +402,16 @@ schedule_stats_t::print_counters(const counters_t &counters) static_cast(counters.cpu_microseconds + counters.idle_micros_at_last_instr), "% cpu busy by time, ignoring idle past last instr\n"); + std::cerr << " Instructions per context switch histogram:\n"; + counters.instrs_per_switch->print(); +} + +void +schedule_stats_t::aggregate_results(counters_t &total) +{ + for (const auto &shard : shard_map_) { + total += shard.second->counters; + } } bool @@ -404,9 +420,7 @@ schedule_stats_t::print_results() std::cerr << TOOL_NAME << " results:\n"; std::cerr << "Total counts:\n"; counters_t total; - for (const auto &shard : shard_map_) { - total += shard.second->counters; - } + aggregate_results(total); std::cerr << std::setw(12) << shard_map_.size() << " cores\n"; print_counters(total); for (const auto &shard : shard_map_) { diff --git a/clients/drcachesim/tools/schedule_stats.h b/clients/drcachesim/tools/schedule_stats.h index 00b68ef5587..5cde2be6f77 100644 --- a/clients/drcachesim/tools/schedule_stats.h +++ b/clients/drcachesim/tools/schedule_stats.h @@ -36,7 +36,9 @@ #include #include +#include #include +#include #include #include #include @@ -73,9 +75,63 @@ class schedule_stats_t : public analysis_tool_t { std::string parallel_shard_error(void *shard_data) override; + // Histogram interface for instrs-per-switch distribution. + class histogram_interface_t { + public: + virtual ~histogram_interface_t() = default; + virtual void + add(int64_t value) = 0; + virtual void + merge(const histogram_interface_t *rhs) = 0; + virtual void + print() const = 0; + }; + + // Simple binning histogram for instrs-per-switch distribution. + class histogram_t : public histogram_interface_t { + public: + histogram_t() = default; + + void + add(int64_t value) override + { + // XXX: Add dynamic bin size changing. + // For now with relatively known data ranges we just stick + // with unchanging bin sizes. + uint64_t bin = value - (value % kInitialBinSize); + ++bin2count_[bin]; + } + + void + merge(const histogram_interface_t *rhs) override + { + const histogram_t *rhs_hist = dynamic_cast(rhs); + for (const auto &keyval : rhs_hist->bin2count_) { + bin2count_[keyval.first] += keyval.second; + } + } + + void + print() const override + { + for (const auto &keyval : bin2count_) { + std::cerr << std::setw(12) << keyval.first << ".." << std::setw(8) + << keyval.first + kInitialBinSize << " " << std::setw(5) + << keyval.second << "\n"; + } + } + + protected: + static constexpr uint64_t kInitialBinSize = 50000; + + // Key is the inclusive lower bound of the bin. + std::map bin2count_; + }; + struct counters_t { counters_t() { + instrs_per_switch = std::unique_ptr(new histogram_t); } counters_t & operator+=(const counters_t &rhs) @@ -96,6 +152,7 @@ class schedule_stats_t : public analysis_tool_t { for (const memref_tid_t tid : rhs.threads) { threads.insert(tid); } + instrs_per_switch->merge(rhs.instrs_per_switch.get()); return *this; } int64_t instrs = 0; @@ -112,6 +169,7 @@ class schedule_stats_t : public analysis_tool_t { uint64_t cpu_microseconds = 0; uint64_t wait_microseconds = 0; std::unordered_set threads; + std::unique_ptr instrs_per_switch; }; counters_t get_total_counts(); @@ -120,6 +178,11 @@ class schedule_stats_t : public analysis_tool_t { // We're in one of 3 states. typedef enum { STATE_CPU, STATE_IDLE, STATE_WAIT } state_t; + static constexpr char THREAD_LETTER_INITIAL_START = 'A'; + static constexpr char THREAD_LETTER_SUBSEQUENT_START = 'a'; + static constexpr char WAIT_SYMBOL = '-'; + static constexpr char IDLE_SYMBOL = '_'; + struct per_shard_t { std::string error; memtrace_stream_t *stream = nullptr; @@ -133,11 +196,16 @@ class schedule_stats_t : public analysis_tool_t { bool saw_exit = false; // A representation of the thread interleavings. std::string thread_sequence; + // The instruction count for the current activity (an active input or a wait + // or idle state) on this shard, since the last context switch or reset due + // to knob_print_every_: the time period between switches or resets we call + // a "segment". uint64_t cur_segment_instrs = 0; state_t cur_state = STATE_CPU; // Computing %-idle. uint64_t segment_start_microseconds = 0; intptr_t filetype = 0; + uint64_t switch_start_instrs = 0; }; void @@ -152,6 +220,13 @@ class schedule_stats_t : public analysis_tool_t { bool update_state_time(per_shard_t *shard, state_t state); + void + record_context_switch(per_shard_t *shard, int64_t tid, int64_t input_id, + int64_t letter_ord); + + virtual void + aggregate_results(counters_t &total); + uint64_t knob_print_every_ = 0; unsigned int knob_verbose_ = 0; // We use an ordered map to get our output in order. This table is not