Skip to content

Commit

Permalink
i#6426 sched stats: Add histogram of instrs per switch (#6801)
Browse files Browse the repository at this point in the history
Adds a very simple histogram with fixed bin sizes recording the
distribution of instructions per context switch in the schedule_stats
tool.

Updates two tests to confirm the histogram is produced.

This feature is structured to allow replacing this simple histogram with
a more sophisticated version in subclasses.

Issue: #6426
  • Loading branch information
derekbruening authored and philramsey-arm committed May 14, 2024
1 parent 3e884c0 commit 2a1284c
Show file tree
Hide file tree
Showing 4 changed files with 134 additions and 67 deletions.
3 changes: 3 additions & 0 deletions clients/drcachesim/tests/core_serial.templatex
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ Total counts:
[0-9 ]* idle microseconds at last instr
[0-9\. ]*% cpu busy by time
[0-9\. ]*% cpu busy by time, ignoring idle past last instr
Instructions per context switch histogram:
0.. 50000 2
50000.. 100000 4
Core #0 counts:
.*
Core #1 counts:
Expand Down
39 changes: 7 additions & 32 deletions clients/drcachesim/tests/schedule_stats_nopreempt.templatex
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ Total counts:
*[0-9]* idle microseconds at last instr
*[0-9\.]*% cpu busy by time
*[0-9\.]*% cpu busy by time, ignoring idle past last instr
Instructions per context switch histogram:
0.. 50000 2
50000.. 100000 4
Core #0 counts:
. threads: 1257.*
*[0-9]* instructions
Expand All @@ -36,14 +39,7 @@ Core #0 counts:
. maybe-blocking system calls
0 direct switch requests
0 waits
*[0-9]* idles
*[0-9\.]*% cpu busy by record count
*[0-9]* cpu microseconds
*[0-9]* wait microseconds
*[0-9]* idle microseconds
*[0-9]* idle microseconds at last instr
*[0-9\.]*% cpu busy by time
*[0-9\.]*% cpu busy by time, ignoring idle past last instr
.*
Core #1 counts:
. threads: 1257.*
*[0-9]* instructions
Expand All @@ -58,14 +54,7 @@ Core #1 counts:
. maybe-blocking system calls
0 direct switch requests
0 waits
*[0-9]* idles
*[0-9\.]*% cpu busy by record count
*[0-9]* cpu microseconds
*[0-9]* wait microseconds
*[0-9]* idle microseconds
*[0-9]* idle microseconds at last instr
*[0-9\.]*% cpu busy by time
*[0-9\.]*% cpu busy by time, ignoring idle past last instr
.*
Core #2 counts:
. threads: 1257.*
*[0-9]* instructions
Expand All @@ -80,14 +69,7 @@ Core #2 counts:
. maybe-blocking system calls
0 direct switch requests
0 waits
*[0-9]* idles
*[0-9\.]*% cpu busy by record count
*[0-9]* cpu microseconds
*[0-9]* wait microseconds
*[0-9]* idle microseconds
*[0-9]* idle microseconds at last instr
*[0-9\.]*% cpu busy by time
*[0-9\.]*% cpu busy by time, ignoring idle past last instr
.*
Core #3 counts:
. threads: 1257.*
*[0-9]* instructions
Expand All @@ -102,14 +84,7 @@ Core #3 counts:
. maybe-blocking system calls
0 direct switch requests
0 waits
*[0-9]* idles
*[0-9\.]*% cpu busy by record count
*[0-9]* cpu microseconds
*[0-9]* wait microseconds
*[0-9]* idle microseconds
*[0-9]* idle microseconds at last instr
*[0-9\.]*% cpu busy by time
*[0-9\.]*% cpu busy by time, ignoring idle past last instr
.*
Core #0 schedule: [A-Ha-h_]*
Core #1 schedule: [A-Ha-h_]*
Core #2 schedule: [A-Ha-h_]*
Expand Down
84 changes: 49 additions & 35 deletions clients/drcachesim/tools/schedule_stats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,46 @@ schedule_stats_t::update_state_time(per_shard_t *shard, state_t state)
return true;
}

void
schedule_stats_t::record_context_switch(per_shard_t *shard, int64_t tid, int64_t input_id,
int64_t letter_ord)
{
// We convert to letters which only works well for <=26 inputs.
if (!shard->thread_sequence.empty()) {
++shard->counters.total_switches;
if (shard->saw_syscall || shard->saw_exit)
++shard->counters.voluntary_switches;
if (shard->direct_switch_target == tid)
++shard->counters.direct_switches;
uint64_t instr_delta = shard->counters.instrs - shard->switch_start_instrs;
shard->counters.instrs_per_switch->add(instr_delta);
shard->switch_start_instrs = shard->counters.instrs;
}
shard->thread_sequence +=
THREAD_LETTER_INITIAL_START + static_cast<char>(letter_ord % 26);
shard->cur_segment_instrs = 0;
if (knob_verbose_ >= 2) {
std::ostringstream line;
line << "Core #" << std::setw(2) << shard->core << " @" << std::setw(9)
<< shard->stream->get_record_ordinal() << " refs, " << std::setw(9)
<< shard->stream->get_instruction_ordinal() << " instrs: input "
<< std::setw(4) << input_id << " @" << std::setw(9)
<< shard->stream->get_input_interface()->get_record_ordinal() << " refs, "
<< std::setw(9)
<< shard->stream->get_input_interface()->get_instruction_ordinal()
<< " instrs, time "
<< std::setw(16)
// TODO i#5843: For time quanta, provide some way to get the
// latest time and print that here instead of the timestamp?
<< shard->stream->get_input_interface()->get_last_timestamp()
<< " == thread " << tid << "\n";
std::cerr << line.str();
}
}

bool
schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref)
{
static constexpr char THREAD_LETTER_INITIAL_START = 'A';
static constexpr char THREAD_LETTER_SUBSEQUENT_START = 'a';
static constexpr char WAIT_SYMBOL = '-';
static constexpr char IDLE_SYMBOL = '_';
per_shard_t *shard = reinterpret_cast<per_shard_t *>(shard_data);
int64_t input_id = shard->stream->get_input_id();
if (knob_verbose_ >= 4) {
Expand Down Expand Up @@ -257,34 +290,7 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
? tid
: input_id;
if ((workload_id != prev_workload_id || tid != prev_tid) && tid != IDLE_THREAD_ID) {
// We convert to letters which only works well for <=26 inputs.
if (!shard->thread_sequence.empty()) {
++shard->counters.total_switches;
if (shard->saw_syscall || shard->saw_exit)
++shard->counters.voluntary_switches;
if (shard->direct_switch_target == memref.marker.tid)
++shard->counters.direct_switches;
}
shard->thread_sequence +=
THREAD_LETTER_INITIAL_START + static_cast<char>(letter_ord % 26);
shard->cur_segment_instrs = 0;
if (knob_verbose_ >= 2) {
std::ostringstream line;
line << "Core #" << std::setw(2) << shard->core << " @" << std::setw(9)
<< shard->stream->get_record_ordinal() << " refs, " << std::setw(9)
<< shard->stream->get_instruction_ordinal() << " instrs: input "
<< std::setw(4) << input_id << " @" << std::setw(9)
<< shard->stream->get_input_interface()->get_record_ordinal()
<< " refs, " << std::setw(9)
<< shard->stream->get_input_interface()->get_instruction_ordinal()
<< " instrs, time "
<< std::setw(16)
// TODO i#5843: For time quanta, provide some way to get the
// latest time and print that here instead of the the timestamp?
<< shard->stream->get_input_interface()->get_last_timestamp()
<< " == thread " << memref.instr.tid << "\n";
std::cerr << line.str();
}
record_context_switch(shard, tid, input_id, letter_ord);
}
shard->prev_workload_id = workload_id;
shard->prev_tid = tid;
Expand Down Expand Up @@ -396,6 +402,16 @@ schedule_stats_t::print_counters(const counters_t &counters)
static_cast<double>(counters.cpu_microseconds +
counters.idle_micros_at_last_instr),
"% cpu busy by time, ignoring idle past last instr\n");
std::cerr << " Instructions per context switch histogram:\n";
counters.instrs_per_switch->print();
}

void
schedule_stats_t::aggregate_results(counters_t &total)
{
for (const auto &shard : shard_map_) {
total += shard.second->counters;
}
}

bool
Expand All @@ -404,9 +420,7 @@ schedule_stats_t::print_results()
std::cerr << TOOL_NAME << " results:\n";
std::cerr << "Total counts:\n";
counters_t total;
for (const auto &shard : shard_map_) {
total += shard.second->counters;
}
aggregate_results(total);
std::cerr << std::setw(12) << shard_map_.size() << " cores\n";
print_counters(total);
for (const auto &shard : shard_map_) {
Expand Down
75 changes: 75 additions & 0 deletions clients/drcachesim/tools/schedule_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@
#include <stdint.h>

#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_set>
Expand Down Expand Up @@ -73,9 +75,63 @@ class schedule_stats_t : public analysis_tool_t {
std::string
parallel_shard_error(void *shard_data) override;

// Histogram interface for instrs-per-switch distribution.
class histogram_interface_t {
public:
virtual ~histogram_interface_t() = default;
virtual void
add(int64_t value) = 0;
virtual void
merge(const histogram_interface_t *rhs) = 0;
virtual void
print() const = 0;
};

// Simple binning histogram for instrs-per-switch distribution.
class histogram_t : public histogram_interface_t {
public:
histogram_t() = default;

void
add(int64_t value) override
{
// XXX: Add dynamic bin size changing.
// For now with relatively known data ranges we just stick
// with unchanging bin sizes.
uint64_t bin = value - (value % kInitialBinSize);
++bin2count_[bin];
}

void
merge(const histogram_interface_t *rhs) override
{
const histogram_t *rhs_hist = dynamic_cast<const histogram_t *>(rhs);
for (const auto &keyval : rhs_hist->bin2count_) {
bin2count_[keyval.first] += keyval.second;
}
}

void
print() const override
{
for (const auto &keyval : bin2count_) {
std::cerr << std::setw(12) << keyval.first << ".." << std::setw(8)
<< keyval.first + kInitialBinSize << " " << std::setw(5)
<< keyval.second << "\n";
}
}

protected:
static constexpr uint64_t kInitialBinSize = 50000;

// Key is the inclusive lower bound of the bin.
std::map<uint64_t, uint64_t> bin2count_;
};

struct counters_t {
counters_t()
{
instrs_per_switch = std::unique_ptr<histogram_interface_t>(new histogram_t);
}
counters_t &
operator+=(const counters_t &rhs)
Expand All @@ -96,6 +152,7 @@ class schedule_stats_t : public analysis_tool_t {
for (const memref_tid_t tid : rhs.threads) {
threads.insert(tid);
}
instrs_per_switch->merge(rhs.instrs_per_switch.get());
return *this;
}
int64_t instrs = 0;
Expand All @@ -112,6 +169,7 @@ class schedule_stats_t : public analysis_tool_t {
uint64_t cpu_microseconds = 0;
uint64_t wait_microseconds = 0;
std::unordered_set<memref_tid_t> threads;
std::unique_ptr<histogram_interface_t> instrs_per_switch;
};
counters_t
get_total_counts();
Expand All @@ -120,6 +178,11 @@ class schedule_stats_t : public analysis_tool_t {
// We're in one of 3 states.
typedef enum { STATE_CPU, STATE_IDLE, STATE_WAIT } state_t;

static constexpr char THREAD_LETTER_INITIAL_START = 'A';
static constexpr char THREAD_LETTER_SUBSEQUENT_START = 'a';
static constexpr char WAIT_SYMBOL = '-';
static constexpr char IDLE_SYMBOL = '_';

struct per_shard_t {
std::string error;
memtrace_stream_t *stream = nullptr;
Expand All @@ -133,11 +196,16 @@ class schedule_stats_t : public analysis_tool_t {
bool saw_exit = false;
// A representation of the thread interleavings.
std::string thread_sequence;
// The instruction count for the current activity (an active input or a wait
// or idle state) on this shard, since the last context switch or reset due
// to knob_print_every_: the time period between switches or resets we call
// a "segment".
uint64_t cur_segment_instrs = 0;
state_t cur_state = STATE_CPU;
// Computing %-idle.
uint64_t segment_start_microseconds = 0;
intptr_t filetype = 0;
uint64_t switch_start_instrs = 0;
};

void
Expand All @@ -152,6 +220,13 @@ class schedule_stats_t : public analysis_tool_t {
bool
update_state_time(per_shard_t *shard, state_t state);

void
record_context_switch(per_shard_t *shard, int64_t tid, int64_t input_id,
int64_t letter_ord);

virtual void
aggregate_results(counters_t &total);

uint64_t knob_print_every_ = 0;
unsigned int knob_verbose_ = 0;
// We use an ordered map to get our output in order. This table is not
Expand Down

0 comments on commit 2a1284c

Please sign in to comment.