Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

i#6426 sched stats: Add histogram of instrs per switch #6801

Merged
merged 4 commits into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions clients/drcachesim/tests/core_serial.templatex
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ Total counts:
[0-9 ]* idle microseconds at last instr
[0-9\. ]*% cpu busy by time
[0-9\. ]*% cpu busy by time, ignoring idle past last instr
Instructions per context switch histogram:
0.. 50000 2
50000.. 100000 4
Core #0 counts:
.*
Core #1 counts:
Expand Down
39 changes: 7 additions & 32 deletions clients/drcachesim/tests/schedule_stats_nopreempt.templatex
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ Total counts:
*[0-9]* idle microseconds at last instr
*[0-9\.]*% cpu busy by time
*[0-9\.]*% cpu busy by time, ignoring idle past last instr
Instructions per context switch histogram:
0.. 50000 2
50000.. 100000 4
Core #0 counts:
. threads: 1257.*
*[0-9]* instructions
Expand All @@ -36,14 +39,7 @@ Core #0 counts:
. maybe-blocking system calls
0 direct switch requests
0 waits
*[0-9]* idles
*[0-9\.]*% cpu busy by record count
*[0-9]* cpu microseconds
*[0-9]* wait microseconds
*[0-9]* idle microseconds
*[0-9]* idle microseconds at last instr
*[0-9\.]*% cpu busy by time
*[0-9\.]*% cpu busy by time, ignoring idle past last instr
.*
Core #1 counts:
. threads: 1257.*
*[0-9]* instructions
Expand All @@ -58,14 +54,7 @@ Core #1 counts:
. maybe-blocking system calls
0 direct switch requests
0 waits
*[0-9]* idles
*[0-9\.]*% cpu busy by record count
*[0-9]* cpu microseconds
*[0-9]* wait microseconds
*[0-9]* idle microseconds
*[0-9]* idle microseconds at last instr
*[0-9\.]*% cpu busy by time
*[0-9\.]*% cpu busy by time, ignoring idle past last instr
.*
Core #2 counts:
. threads: 1257.*
*[0-9]* instructions
Expand All @@ -80,14 +69,7 @@ Core #2 counts:
. maybe-blocking system calls
0 direct switch requests
0 waits
*[0-9]* idles
*[0-9\.]*% cpu busy by record count
*[0-9]* cpu microseconds
*[0-9]* wait microseconds
*[0-9]* idle microseconds
*[0-9]* idle microseconds at last instr
*[0-9\.]*% cpu busy by time
*[0-9\.]*% cpu busy by time, ignoring idle past last instr
.*
Core #3 counts:
. threads: 1257.*
*[0-9]* instructions
Expand All @@ -102,14 +84,7 @@ Core #3 counts:
. maybe-blocking system calls
0 direct switch requests
0 waits
*[0-9]* idles
*[0-9\.]*% cpu busy by record count
*[0-9]* cpu microseconds
*[0-9]* wait microseconds
*[0-9]* idle microseconds
*[0-9]* idle microseconds at last instr
*[0-9\.]*% cpu busy by time
*[0-9\.]*% cpu busy by time, ignoring idle past last instr
.*
Core #0 schedule: [A-Ha-h_]*
Core #1 schedule: [A-Ha-h_]*
Core #2 schedule: [A-Ha-h_]*
Expand Down
84 changes: 49 additions & 35 deletions clients/drcachesim/tools/schedule_stats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,46 @@ schedule_stats_t::update_state_time(per_shard_t *shard, state_t state)
return true;
}

void
schedule_stats_t::record_context_switch(per_shard_t *shard, int64_t tid, int64_t input_id,
int64_t letter_ord)
{
// We convert to letters which only works well for <=26 inputs.
if (!shard->thread_sequence.empty()) {
++shard->counters.total_switches;
if (shard->saw_syscall || shard->saw_exit)
++shard->counters.voluntary_switches;
if (shard->direct_switch_target == tid)
++shard->counters.direct_switches;
uint64_t instr_delta = shard->counters.instrs - shard->switch_start_instrs;
shard->counters.instrs_per_switch->add(instr_delta);
shard->switch_start_instrs = shard->counters.instrs;
}
shard->thread_sequence +=
THREAD_LETTER_INITIAL_START + static_cast<char>(letter_ord % 26);
shard->cur_segment_instrs = 0;
derekbruening marked this conversation as resolved.
Show resolved Hide resolved
if (knob_verbose_ >= 2) {
std::ostringstream line;
line << "Core #" << std::setw(2) << shard->core << " @" << std::setw(9)
<< shard->stream->get_record_ordinal() << " refs, " << std::setw(9)
<< shard->stream->get_instruction_ordinal() << " instrs: input "
<< std::setw(4) << input_id << " @" << std::setw(9)
<< shard->stream->get_input_interface()->get_record_ordinal() << " refs, "
<< std::setw(9)
<< shard->stream->get_input_interface()->get_instruction_ordinal()
<< " instrs, time "
<< std::setw(16)
// TODO i#5843: For time quanta, provide some way to get the
// latest time and print that here instead of the timestamp?
<< shard->stream->get_input_interface()->get_last_timestamp()
<< " == thread " << tid << "\n";
std::cerr << line.str();
}
}

bool
schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref)
{
static constexpr char THREAD_LETTER_INITIAL_START = 'A';
static constexpr char THREAD_LETTER_SUBSEQUENT_START = 'a';
static constexpr char WAIT_SYMBOL = '-';
static constexpr char IDLE_SYMBOL = '_';
per_shard_t *shard = reinterpret_cast<per_shard_t *>(shard_data);
int64_t input_id = shard->stream->get_input_id();
if (knob_verbose_ >= 4) {
Expand Down Expand Up @@ -257,34 +290,7 @@ schedule_stats_t::parallel_shard_memref(void *shard_data, const memref_t &memref
? tid
: input_id;
if ((workload_id != prev_workload_id || tid != prev_tid) && tid != IDLE_THREAD_ID) {
// We convert to letters which only works well for <=26 inputs.
if (!shard->thread_sequence.empty()) {
++shard->counters.total_switches;
if (shard->saw_syscall || shard->saw_exit)
++shard->counters.voluntary_switches;
if (shard->direct_switch_target == memref.marker.tid)
++shard->counters.direct_switches;
}
shard->thread_sequence +=
THREAD_LETTER_INITIAL_START + static_cast<char>(letter_ord % 26);
shard->cur_segment_instrs = 0;
if (knob_verbose_ >= 2) {
std::ostringstream line;
line << "Core #" << std::setw(2) << shard->core << " @" << std::setw(9)
<< shard->stream->get_record_ordinal() << " refs, " << std::setw(9)
<< shard->stream->get_instruction_ordinal() << " instrs: input "
<< std::setw(4) << input_id << " @" << std::setw(9)
<< shard->stream->get_input_interface()->get_record_ordinal()
<< " refs, " << std::setw(9)
<< shard->stream->get_input_interface()->get_instruction_ordinal()
<< " instrs, time "
<< std::setw(16)
// TODO i#5843: For time quanta, provide some way to get the
// latest time and print that here instead of the the timestamp?
<< shard->stream->get_input_interface()->get_last_timestamp()
<< " == thread " << memref.instr.tid << "\n";
std::cerr << line.str();
}
record_context_switch(shard, tid, input_id, letter_ord);
}
shard->prev_workload_id = workload_id;
shard->prev_tid = tid;
Expand Down Expand Up @@ -396,6 +402,16 @@ schedule_stats_t::print_counters(const counters_t &counters)
static_cast<double>(counters.cpu_microseconds +
counters.idle_micros_at_last_instr),
"% cpu busy by time, ignoring idle past last instr\n");
std::cerr << " Instructions per context switch histogram:\n";
counters.instrs_per_switch->print();
}

void
schedule_stats_t::aggregate_results(counters_t &total)
{
for (const auto &shard : shard_map_) {
total += shard.second->counters;
}
}

bool
Expand All @@ -404,9 +420,7 @@ schedule_stats_t::print_results()
std::cerr << TOOL_NAME << " results:\n";
std::cerr << "Total counts:\n";
counters_t total;
for (const auto &shard : shard_map_) {
total += shard.second->counters;
}
aggregate_results(total);
std::cerr << std::setw(12) << shard_map_.size() << " cores\n";
print_counters(total);
for (const auto &shard : shard_map_) {
Expand Down
75 changes: 75 additions & 0 deletions clients/drcachesim/tools/schedule_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@
#include <stdint.h>

#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_set>
Expand Down Expand Up @@ -73,9 +75,63 @@ class schedule_stats_t : public analysis_tool_t {
std::string
parallel_shard_error(void *shard_data) override;

// Histogram interface for instrs-per-switch distribution.
class histogram_interface_t {
public:
virtual ~histogram_interface_t() = default;
virtual void
add(int64_t value) = 0;
virtual void
merge(const histogram_interface_t *rhs) = 0;
virtual void
print() const = 0;
};

// Simple binning histogram for instrs-per-switch distribution.
class histogram_t : public histogram_interface_t {
public:
histogram_t() = default;

void
add(int64_t value) override
{
// XXX: Add dynamic bin size changing.
// For now with relatively known data ranges we just stick
// with unchanging bin sizes.
uint64_t bin = value - (value % kInitialBinSize);
++bin2count_[bin];
}

void
merge(const histogram_interface_t *rhs) override
{
const histogram_t *rhs_hist = reinterpret_cast<const histogram_t *>(rhs);
derekbruening marked this conversation as resolved.
Show resolved Hide resolved
for (const auto &keyval : rhs_hist->bin2count_) {
bin2count_[keyval.first] += keyval.second;
}
}

void
print() const override
{
for (const auto &keyval : bin2count_) {
std::cerr << std::setw(12) << keyval.first << ".." << std::setw(8)
<< keyval.first + kInitialBinSize << " " << std::setw(5)
<< keyval.second << "\n";
}
}

protected:
static constexpr uint64_t kInitialBinSize = 50000;

// Key is the inclusive lower bound of the bin.
std::map<uint64_t, uint64_t> bin2count_;
};

struct counters_t {
counters_t()
{
instrs_per_switch = std::unique_ptr<histogram_interface_t>(new histogram_t);
}
counters_t &
operator+=(const counters_t &rhs)
Expand All @@ -96,6 +152,7 @@ class schedule_stats_t : public analysis_tool_t {
for (const memref_tid_t tid : rhs.threads) {
threads.insert(tid);
}
instrs_per_switch->merge(rhs.instrs_per_switch.get());
return *this;
}
int64_t instrs = 0;
Expand All @@ -112,6 +169,7 @@ class schedule_stats_t : public analysis_tool_t {
uint64_t cpu_microseconds = 0;
uint64_t wait_microseconds = 0;
std::unordered_set<memref_tid_t> threads;
std::unique_ptr<histogram_interface_t> instrs_per_switch;
};
counters_t
get_total_counts();
Expand All @@ -120,6 +178,11 @@ class schedule_stats_t : public analysis_tool_t {
// We're in one of 3 states.
typedef enum { STATE_CPU, STATE_IDLE, STATE_WAIT } state_t;

static constexpr char THREAD_LETTER_INITIAL_START = 'A';
static constexpr char THREAD_LETTER_SUBSEQUENT_START = 'a';
static constexpr char WAIT_SYMBOL = '-';
static constexpr char IDLE_SYMBOL = '_';

struct per_shard_t {
std::string error;
memtrace_stream_t *stream = nullptr;
Expand All @@ -133,11 +196,16 @@ class schedule_stats_t : public analysis_tool_t {
bool saw_exit = false;
// A representation of the thread interleavings.
std::string thread_sequence;
// The instruction count for the current activity (an active input or a wait
// or idle state) on this shard, since the last context switch or reset due
// to knob_print_every_: the time period between switches or resets we call
// a "segment".
uint64_t cur_segment_instrs = 0;
state_t cur_state = STATE_CPU;
// Computing %-idle.
uint64_t segment_start_microseconds = 0;
intptr_t filetype = 0;
uint64_t switch_start_instrs = 0;
};

void
Expand All @@ -152,6 +220,13 @@ class schedule_stats_t : public analysis_tool_t {
bool
update_state_time(per_shard_t *shard, state_t state);

void
record_context_switch(per_shard_t *shard, int64_t tid, int64_t input_id,
int64_t letter_ord);

virtual void
aggregate_results(counters_t &total);

uint64_t knob_print_every_ = 0;
unsigned int knob_verbose_ = 0;
// We use an ordered map to get our output in order. This table is not
Expand Down
Loading