[trace][intel pt] Introduce wall clock time for each trace item
- Decouple TSCs from trace items
- Turn TSCs into events, just like CPU changes. The new name is HW clock tick, which could be reused by other vendors.
- Add a GetWallClockTime API that returns the wall clock time the trace plug-in can infer for each trace item.
- For intel pt, we do the following interpolation: if the instructions between two TSCs take less than 1 TSC each on average, we spread them evenly across that interval; otherwise, we assume each instruction took 1 TSC. This helps us avoid having to handle context switches, changes to the kernel, idle times, decoding errors, etc. We are just trying to show an approximation and not the real data; for the real data, TSCs are the way to go. Besides that, we make sure that no two trace items get the same interpolated value. Finally, we use as time 0 the time at which tracing started. A sketch of this scheme is shown below.
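
The following is a minimal, standalone sketch of that interpolation. It is not LLDB's actual implementation: the constant ns-per-TSC rate and the `InterpolateRange` helper are illustrative assumptions (the real code uses the perf_event_open conversion parameters sent by the server).

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative only: a constant ns-per-TSC rate stands in for the real
// perf-based conversion.
static double ToNanos(uint64_t tsc, double ns_per_tsc) {
  return static_cast<double>(tsc) * ns_per_tsc;
}

// Approximate wall clock times (relative to the start of tracing) for the
// items that fall between two consecutive TSC values.
std::vector<double> InterpolateRange(uint64_t tsc, uint64_t next_tsc,
                                     uint64_t items_count,
                                     double begin_of_time_ns,
                                     double ns_per_tsc) {
  double range_start_ns = ToNanos(tsc, ns_per_tsc);
  // If each item took more than 1 TSC on average, assume something else
  // happened in between (context switch, kernel time, decoding error, ...)
  // and cap the duration at 1 TSC per item.
  double range_end_ns = std::min(ToNanos(next_tsc, ns_per_tsc),
                                 ToNanos(tsc + items_count, ns_per_tsc));
  // Guarantee that no two items get the same timestamp, even if the
  // conversion's resolution makes both ends of the range equal.
  if (range_end_ns <= range_start_ns)
    range_end_ns = range_start_ns + 1;
  double item_duration = (range_end_ns - range_start_ns) / items_count;

  std::vector<double> times;
  for (uint64_t i = 0; i < items_count; ++i)
    times.push_back((range_start_ns - begin_of_time_ns) + i * item_duration);
  return times;
}
```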

Sample output:

```
(lldb) r
Process 750047 launched: '/home/wallace/a.out' (x86_64)
Process 750047 stopped
* thread #1, name = 'a.out', stop reason = breakpoint 1.1
    frame #0: 0x0000000000402479 a.out`main at main.cpp:29:20
   26   };
   27
   28   int main() {
-> 29     std::vector<int> vvv;
   30     for (int i = 0; i < 100; i++)
   31       vvv.push_back(i);
   32
(lldb) process trace start -s 64kb -t --per-cpu
(lldb) b 60
Breakpoint 2: where = a.out`main + 1689 at main.cpp:60:23, address = 0x0000000000402afe
(lldb) c
Process 750047 resuming
Process 750047 stopped
* thread #1, name = 'a.out', stop reason = breakpoint 2.1
    frame #0: 0x0000000000402afe a.out`main at main.cpp:60:23
   57     map<int, int> m;
   58     m[3] = 4;
   59
-> 60     map<string, string> m2;
   61     m2["5"] = "6";
   62
   63     std::vector<std::string> vs = {"2", "3"};
(lldb) thread trace dump instructions -t -f -e
thread #1: tid = 750047
    0: [379567.000 ns] (event) HW clock tick [48599428476224707]
    1: [379569.000 ns] (event) CPU core changed [new CPU=2]
    2: [390487.000 ns] (event) HW clock tick [48599428476246495]
    3: [1602508.000 ns] (event) HW clock tick [48599428478664855]
    4: [1662745.000 ns] (event) HW clock tick [48599428478785046]
  libc.so.6`malloc
    5: [1662746.995 ns] 0x00007ffff7176660    endbr64
    6: [1662748.991 ns] 0x00007ffff7176664    movq   0x32387d(%rip), %rax      ;  + 408
    7: [1662750.986 ns] 0x00007ffff717666b    pushq  %r12
    8: [1662752.981 ns] 0x00007ffff717666d    pushq  %rbp
    9: [1662754.977 ns] 0x00007ffff717666e    pushq  %rbx
    10: [1662756.972 ns] 0x00007ffff717666f    movq   (%rax), %rax
    11: [1662758.967 ns] 0x00007ffff7176672    testq  %rax, %rax
    12: [1662760.963 ns] 0x00007ffff7176675    jne    0x9c7e0                   ; <+384>
    13: [1662762.958 ns] 0x00007ffff717667b    leaq   0x17(%rdi), %rax
    14: [1662764.953 ns] 0x00007ffff717667f    cmpq   $0x1f, %rax
    15: [1662766.949 ns] 0x00007ffff7176683    ja     0x9c730                   ; <+208>
    16: [1662768.944 ns] 0x00007ffff7176730    andq   $-0x10, %rax
    17: [1662770.939 ns] 0x00007ffff7176734    cmpq   $-0x41, %rax
    18: [1662772.935 ns] 0x00007ffff7176738    seta   %dl
    19: [1662774.930 ns] 0x00007ffff717673b    jmp    0x9c690                   ; <+48>
    20: [1662776.925 ns] 0x00007ffff7176690    cmpq   %rdi, %rax
    21: [1662778.921 ns] 0x00007ffff7176693    jb     0x9c7b0                   ; <+336>
    22: [1662780.916 ns] 0x00007ffff7176699    testb  %dl, %dl
    23: [1662782.911 ns] 0x00007ffff717669b    jne    0x9c7b0                   ; <+336>
    24: [1662784.906 ns] 0x00007ffff71766a1    movq   0x3236c0(%rip), %r12      ;  + 24
(lldb) thread trace dump instructions -t -f -e -J -c 4
[
  {
    "id": 0,
    "timestamp_ns": "379567.000000",
    "event": "HW clock tick",
    "hwClock": 48599428476224707
  },
  {
    "id": 1,
    "timestamp_ns": "379569.000000",
    "event": "CPU core changed",
    "cpuId": 2
  },
  {
    "id": 2,
    "timestamp_ns": "390487.000000",
    "event": "HW clock tick",
    "hwClock": 48599428476246495
  },
  {
    "id": 3,
    "timestamp_ns": "1602508.000000",
    "event": "HW clock tick",
    "hwClock": 48599428478664855
  },
  {
    "id": 4,
    "timestamp_ns": "1662745.000000",
    "event": "HW clock tick",
    "hwClock": 48599428478785046
  },
  {
    "id": 5,
    "timestamp_ns": "1662746.995324",
    "loadAddress": "0x7ffff7176660",
    "module": "libc.so.6",
    "symbol": "malloc",
    "mnemonic": "endbr64"
  },
  {
    "id": 6,
    "timestamp_ns": "1662748.990648",
    "loadAddress": "0x7ffff7176664",
    "module": "libc.so.6",
    "symbol": "malloc",
    "mnemonic": "movq"
  },
  {
    "id": 7,
    "timestamp_ns": "1662750.985972",
    "loadAddress": "0x7ffff717666b",
    "module": "libc.so.6",
    "symbol": "malloc",
    "mnemonic": "pushq"
  },
  {
    "id": 8,
    "timestamp_ns": "1662752.981296",
    "loadAddress": "0x7ffff717666d",
    "module": "libc.so.6",
    "symbol": "malloc",
    "mnemonic": "pushq"
  }
]
```

Differential Revision: https://reviews.llvm.org/D130054
walter-erquinigo authored and memfrob committed Oct 5, 2022
1 parent 8aca8ac commit 97702ef
Showing 23 changed files with 606 additions and 291 deletions.
36 changes: 22 additions & 14 deletions lldb/include/lldb/Target/TraceCursor.h
@@ -215,7 +215,7 @@ class TraceCursor {
/// of this cursor.
ExecutionContextRef &GetExecutionContextRef();

/// Instruction, event or error information
/// Trace item information (instructions, errors and events)
/// \{

/// \return
@@ -255,27 +255,35 @@
/// The load address of the instruction the cursor is pointing at.
virtual lldb::addr_t GetLoadAddress() const = 0;

/// Get the hardware counter of a given type associated with the current
/// instruction. Each architecture might support different counters. It might
/// happen that only some instructions of an entire trace have a given counter
/// associated with them.
///
/// \param[in] counter_type
/// The counter type.
/// \return
/// The value of the counter or \b llvm::None if not available.
virtual llvm::Optional<uint64_t>
GetCounter(lldb::TraceCounter counter_type) const = 0;

/// Get the CPU associated with the current trace item.
///
/// This call might not be O(1), so it's suggested to invoke this method
/// whenever a cpu change event is fired.
/// whenever an eTraceEventCPUChanged event is fired.
///
/// \return
/// The requested CPU id, or \a llvm::None if this information is
/// not available for the current item.
virtual llvm::Optional<lldb::cpu_id_t> GetCPU() const = 0;

/// Get the last hardware clock value that was emitted before the current
/// trace item.
///
/// This call might not be O(1), so it's suggested to invoke this method
/// whenever an eTraceEventHWClockTick event is fired.
///
/// \return
/// The requested HW clock value, or \a llvm::None if this information is
/// not available for the current item.
virtual llvm::Optional<uint64_t> GetHWClock() const = 0;

/// Get the approximate wall clock time in nanoseconds at which the current
/// trace item was executed. Each trace plug-in has a different definition for
/// what time 0 means.
///
/// \return
/// The approximate wall clock time for the trace item, or \a llvm::None
/// if not available.
virtual llvm::Optional<double> GetWallClockTime() const = 0;
/// \}

protected:
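
For context, a usage sketch of the accessors declared above. Only GetWallClockTime(), GetHWClock() and GetCPU() come from this hunk; HasValue() and Next() are assumed to be the cursor's iteration methods, and the snippet is only meant to compile inside the LLDB tree.

```cpp
#include "lldb/Target/TraceCursor.h"
#include <cstdio>

using namespace lldb_private;

// Walk a trace and print the timing data each item can expose.
static void DumpApproximateTimes(TraceCursor &cursor) {
  while (cursor.HasValue()) {
    if (llvm::Optional<double> wall_time = cursor.GetWallClockTime())
      std::printf("approx. wall clock: %.3f ns\n", *wall_time);
    if (llvm::Optional<uint64_t> hw_clock = cursor.GetHWClock())
      std::printf("last HW clock value: %llu\n",
                  static_cast<unsigned long long>(*hw_clock));
    if (llvm::Optional<lldb::cpu_id_t> cpu = cursor.GetCPU())
      std::printf("cpu: %u\n", static_cast<unsigned>(*cpu));
    cursor.Next();
  }
}
```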
7 changes: 4 additions & 3 deletions lldb/include/lldb/Target/TraceDumper.h
@@ -29,9 +29,9 @@ struct TraceDumperOptions {
bool json = false;
/// When dumping in JSON format, pretty print the output.
bool pretty_print_json = false;
/// For each instruction, print the corresponding timestamp counter if
/// For each trace item, print the corresponding timestamp in nanoseconds if
/// available.
bool show_tsc = false;
bool show_timestamps = false;
/// Dump the events that happened between instructions.
bool show_events = false;
/// For each instruction, print the instruction kind.
@@ -61,7 +61,8 @@ class TraceDumper {
struct TraceItem {
lldb::user_id_t id;
lldb::addr_t load_address;
llvm::Optional<uint64_t> tsc;
llvm::Optional<double> timestamp;
llvm::Optional<uint64_t> hw_clock;
llvm::Optional<llvm::StringRef> error;
llvm::Optional<lldb::TraceEvent> event;
llvm::Optional<SymbolInfo> symbol_info;
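
As a rough illustration (not code from this commit), the optional timestamp carried by a TraceItem could be rendered the way the text dump above shows it, e.g. `[1662746.995 ns]`; the helper name here is hypothetical and the real logic lives in TraceDumper.cpp.

```cpp
#include "llvm/ADT/Optional.h"
#include <cstdio>
#include <string>

// Hypothetical formatting helper for the optional per-item timestamp.
static std::string FormatTimestamp(llvm::Optional<double> timestamp_ns) {
  if (!timestamp_ns)
    return "[no timestamp]";
  char buf[64];
  std::snprintf(buf, sizeof(buf), "[%.3f ns]", *timestamp_ns);
  return std::string(buf);
}
```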
4 changes: 2 additions & 2 deletions lldb/include/lldb/Utility/TraceIntelPTGDBRemotePackets.h
@@ -91,8 +91,8 @@ struct LinuxPerfZeroTscConversion {
/// nanoseconds) is defined by the kernel at boot time and has no particularly
/// useful meaning. On the other hand, this value is constant for an entire
/// trace session.
// See 'time_zero' section of
// https://man7.org/linux/man-pages/man2/perf_event_open.2.html
/// See 'time_zero' section of
/// https://man7.org/linux/man-pages/man2/perf_event_open.2.html
///
/// \param[in] tsc
/// The TSC value to be converted.
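
The conversion referenced above follows the formula from the 'time_zero' section of the perf_event_open(2) man page. A standalone sketch is below; the field names mirror the kernel's perf_event_mmap_page, and the exact LLDB implementation may differ.

```cpp
#include <cstdint>

// TSC -> nanoseconds, as described in the 'time_zero' section of
// perf_event_open(2). time_mult, time_shift and time_zero come from the
// perf_event_mmap_page exposed by the kernel at trace-collection time.
struct TscConversionSketch {
  uint32_t time_mult;
  uint16_t time_shift;
  uint64_t time_zero;

  uint64_t ToNanos(uint64_t tsc) const {
    uint64_t quot = tsc >> time_shift;
    uint64_t rem = tsc & (((uint64_t)1 << time_shift) - 1);
    return time_zero + quot * time_mult + ((rem * time_mult) >> time_shift);
  }
};
```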
8 changes: 2 additions & 6 deletions lldb/include/lldb/lldb-enumerations.h
@@ -1159,12 +1159,6 @@ enum SaveCoreStyle {
eSaveCoreStackOnly = 3,
};

// Type of counter values associated with instructions in a trace.
enum TraceCounter {
// Timestamp counter, like the one offered by Intel CPUs (TSC).
eTraceCounterTSC = 0,
};

/// Events that might happen during a trace session.
enum TraceEvent {
/// Tracing was disabled for some time due to a software trigger
@@ -1174,6 +1168,8 @@ enum TraceEvent {
/// Event due to CPU change for a thread. This event is also fired when
/// suddenly it's not possible to identify the cpu of a given thread.
eTraceEventCPUChanged,
/// Event due to a CPU HW clock tick
eTraceEventHWClockTick,
};

// Enum used to identify which kind of item a \a TraceCursor is pointing at
2 changes: 1 addition & 1 deletion lldb/source/Commands/CommandObjectThread.cpp
@@ -2162,7 +2162,7 @@ class CommandObjectTraceDumpInstructions : public CommandObjectParsed {
break;
}
case 't': {
m_dumper_options.show_tsc = true;
m_dumper_options.show_timestamps = true;
break;
}
case 'e': {
10 changes: 6 additions & 4 deletions lldb/source/Commands/Options.td
@@ -1146,15 +1146,17 @@ let Command = "thread trace dump instructions" in {
Desc<"Dump in simple JSON format.">;
def thread_trace_dump_instructions_pretty_print: Option<"pretty-json", "J">,
Group<1>,
Desc<"Dump in JSON format but pretty printing the output for easier readability.">;
Desc<"Dump in JSON format but pretty printing the output for easier "
"readability.">;
def thread_trace_dump_instructions_show_kind : Option<"kind", "k">, Group<1>,
Desc<"Show instruction control flow kind. Refer to the enum "
"`InstructionControlFlowKind` for a list of control flow kind. "
"As an important note, far jumps, far calls and far returns often indicate "
"calls to and from kernel.">;
def thread_trace_dump_instructions_show_tsc : Option<"tsc", "t">, Group<1>,
Desc<"For each instruction, print the corresponding timestamp counter if "
"available.">;
def thread_trace_dump_instructions_show_timestamps: Option<"time", "t">,
Group<1>,
Desc<"For each trace item, print the corresponding wall clock timestamp "
"if available.">;
def thread_trace_dump_instructions_show_events : Option<"events", "e">,
Group<1>,
Desc<"Dump the events that happened during the execution of the target.">;
198 changes: 111 additions & 87 deletions lldb/source/Plugins/Trace/intel-pt/DecodedThread.cpp
@@ -44,11 +44,65 @@ void IntelPTError::log(llvm::raw_ostream &OS) const {
OS << formatv(": {0:x+16}", m_address);
}

int64_t DecodedThread::GetItemsCount() const {
return static_cast<int64_t>(m_item_kinds.size());
bool DecodedThread::TSCRange::InRange(uint64_t item_index) const {
return item_index >= first_item_index &&
item_index < first_item_index + items_count;
}

bool DecodedThread::NanosecondsRange::InRange(uint64_t item_index) const {
return item_index >= first_item_index &&
item_index < first_item_index + items_count;
}

double DecodedThread::NanosecondsRange::GetInterpolatedTime(
uint64_t item_index, uint64_t begin_of_time_nanos,
const LinuxPerfZeroTscConversion &tsc_conversion) const {
uint64_t items_since_last_tsc = item_index - first_item_index;

auto interpolate = [&](uint64_t next_range_start_ns) {
if (next_range_start_ns == nanos) {
// If the resolution of the conversion formula is bad enough to consider
// these two timestamps as equal, then we just increase the next one by 1
// for correction
next_range_start_ns++;
}
long double item_duration =
static_cast<long double>(next_range_start_ns - nanos) / items_count;
return (nanos - begin_of_time_nanos) + items_since_last_tsc * item_duration;
};

if (!next_range) {
// If this is the last TSC range, we have to extrapolate. In this case,
// we assume that each instruction took one TSC, which is what an
// instruction would take if no parallelism is achieved and the frequency
// multiplier is 1.
return interpolate(tsc_conversion.ToNanos(tsc + items_count));
}
if (items_count < (next_range->tsc - tsc)) {
// If the number of items in this range is less than the total TSC duration
// of this range, i.e. each instruction taking longer than 1 TSC, then we
// can assume that something else happened between these TSCs (e.g. a
// context switch, change to kernel, decoding errors, etc). In this case, we
// also assume that each instruction took 1 TSC. A proper way to improve
// this would be to analyze the next events in the trace looking for context
// switches or trace disablement events, but for now, as we only want an
// approximation, we keep it simple. We are also guaranteed that the time in
// nanos of the next range is different from the current one, just because of
// the definition of a NanosecondsRange.
return interpolate(
std::min(tsc_conversion.ToNanos(tsc + items_count), next_range->nanos));
}

// In this case, each item took less than 1 TSC, so some parallelism was
// achieved, which is an indication that we didn't suffer any kind of
// interruption.
return interpolate(next_range->nanos);
}

lldb::addr_t DecodedThread::GetInstructionLoadAddress(size_t item_index) const {
uint64_t DecodedThread::GetItemsCount() const { return m_item_kinds.size(); }

lldb::addr_t
DecodedThread::GetInstructionLoadAddress(uint64_t item_index) const {
return m_item_data[item_index].load_address;
}

@@ -58,33 +112,69 @@ DecodedThread::TraceItemStorage &
DecodedThread::CreateNewTraceItem(lldb::TraceItemKind kind) {
m_item_kinds.push_back(kind);
m_item_data.emplace_back();
if (m_last_tsc)
(*m_last_tsc)->second.items_count++;
if (m_last_nanoseconds)
(*m_last_nanoseconds)->second.items_count++;
return m_item_data.back();
}

void DecodedThread::NotifyTsc(uint64_t tsc) {
if (!m_last_tsc || *m_last_tsc != tsc) {
m_timestamps.emplace(m_item_kinds.size(), tsc);
m_last_tsc = tsc;
void DecodedThread::NotifyTsc(TSC tsc) {
if (m_last_tsc && (*m_last_tsc)->second.tsc == tsc)
return;

m_last_tsc =
m_tscs.emplace(GetItemsCount(), TSCRange{tsc, 0, GetItemsCount()}).first;

if (m_tsc_conversion) {
uint64_t nanos = m_tsc_conversion->ToNanos(tsc);
if (!m_last_nanoseconds || (*m_last_nanoseconds)->second.nanos != nanos) {
m_last_nanoseconds =
m_nanoseconds
.emplace(GetItemsCount(), NanosecondsRange{nanos, tsc, nullptr, 0,
GetItemsCount()})
.first;
if (*m_last_nanoseconds != m_nanoseconds.begin()) {
auto prev_range = prev(*m_last_nanoseconds);
prev_range->second.next_range = &(*m_last_nanoseconds)->second;
}
}
}
AppendEvent(lldb::eTraceEventHWClockTick);
}

void DecodedThread::NotifyCPU(lldb::cpu_id_t cpu_id) {
if (!m_last_cpu || *m_last_cpu != cpu_id) {
m_cpus.emplace(m_item_kinds.size(), cpu_id);
m_cpus.emplace(GetItemsCount(), cpu_id);
m_last_cpu = cpu_id;
AppendEvent(lldb::eTraceEventCPUChanged);
}
}

Optional<lldb::cpu_id_t>
DecodedThread::GetCPUByIndex(uint64_t insn_index) const {
// Could possibly optimize the search
auto it = m_cpus.upper_bound(insn_index);
DecodedThread::GetCPUByIndex(uint64_t item_index) const {
auto it = m_cpus.upper_bound(item_index);
if (it == m_cpus.begin())
return None;
return prev(it)->second;
}

Optional<DecodedThread::TSCRange>
DecodedThread::GetTSCRangeByIndex(uint64_t item_index) const {
auto next_it = m_tscs.upper_bound(item_index);
if (next_it == m_tscs.begin())
return None;
return prev(next_it)->second;
}

Optional<DecodedThread::NanosecondsRange>
DecodedThread::GetNanosecondsRangeByIndex(uint64_t item_index) {
auto next_it = m_nanoseconds.upper_bound(item_index);
if (next_it == m_nanoseconds.begin())
return None;
return prev(next_it)->second;
}

void DecodedThread::AppendEvent(lldb::TraceEvent event) {
CreateNewTraceItem(lldb::eTraceItemKindEvent).event = event;
m_events_stats.RecordEvent(event);
@@ -134,90 +224,24 @@ void DecodedThread::EventsStats::RecordEvent(lldb::TraceEvent event) {
total_count++;
}

Optional<DecodedThread::TscRange> DecodedThread::CalculateTscRange(
size_t insn_index,
const Optional<DecodedThread::TscRange> &hint_range) const {
// We first try to check the given hint range in case we are traversing the
// trace in short jumps. If that fails, then we do the more expensive
// arbitrary lookup.
if (hint_range) {
Optional<TscRange> candidate_range;
if (insn_index < hint_range->GetStartInstructionIndex())
candidate_range = hint_range->Prev();
else if (insn_index > hint_range->GetEndInstructionIndex())
candidate_range = hint_range->Next();
else
candidate_range = hint_range;

if (candidate_range && candidate_range->InRange(insn_index))
return candidate_range;
}
// Now we do a more expensive lookup
auto it = m_timestamps.upper_bound(insn_index);
if (it == m_timestamps.begin())
return None;

return TscRange(--it, *this);
}

lldb::TraceItemKind DecodedThread::GetItemKindByIndex(size_t item_index) const {
lldb::TraceItemKind
DecodedThread::GetItemKindByIndex(uint64_t item_index) const {
return static_cast<lldb::TraceItemKind>(m_item_kinds[item_index]);
}

const char *DecodedThread::GetErrorByIndex(size_t item_index) const {
const char *DecodedThread::GetErrorByIndex(uint64_t item_index) const {
return m_item_data[item_index].error;
}

DecodedThread::DecodedThread(ThreadSP thread_sp) : m_thread_sp(thread_sp) {}

lldb::TraceCursorUP DecodedThread::CreateNewCursor() {
return std::make_unique<TraceCursorIntelPT>(m_thread_sp, shared_from_this());
}
DecodedThread::DecodedThread(
ThreadSP thread_sp,
const llvm::Optional<LinuxPerfZeroTscConversion> &tsc_conversion)
: m_thread_sp(thread_sp), m_tsc_conversion(tsc_conversion) {}

size_t DecodedThread::CalculateApproximateMemoryUsage() const {
return sizeof(TraceItemStorage) * m_item_data.size() +
sizeof(uint8_t) * m_item_kinds.size() +
(sizeof(size_t) + sizeof(uint64_t)) * m_timestamps.size() +
(sizeof(size_t) + sizeof(lldb::cpu_id_t)) * m_cpus.size();
}

DecodedThread::TscRange::TscRange(std::map<size_t, uint64_t>::const_iterator it,
const DecodedThread &decoded_thread)
: m_it(it), m_decoded_thread(&decoded_thread) {
auto next_it = m_it;
++next_it;
m_end_index = (next_it == m_decoded_thread->m_timestamps.end())
? std::numeric_limits<uint64_t>::max()
: next_it->first - 1;
}

size_t DecodedThread::TscRange::GetTsc() const { return m_it->second; }

size_t DecodedThread::TscRange::GetStartInstructionIndex() const {
return m_it->first;
}

size_t DecodedThread::TscRange::GetEndInstructionIndex() const {
return m_end_index;
}

bool DecodedThread::TscRange::InRange(size_t insn_index) const {
return GetStartInstructionIndex() <= insn_index &&
insn_index <= GetEndInstructionIndex();
}

Optional<DecodedThread::TscRange> DecodedThread::TscRange::Next() const {
auto next_it = m_it;
++next_it;
if (next_it == m_decoded_thread->m_timestamps.end())
return None;
return TscRange(next_it, *m_decoded_thread);
}

Optional<DecodedThread::TscRange> DecodedThread::TscRange::Prev() const {
if (m_it == m_decoded_thread->m_timestamps.begin())
return None;
auto prev_it = m_it;
--prev_it;
return TscRange(prev_it, *m_decoded_thread);
(sizeof(uint64_t) + sizeof(TSC)) * m_tscs.size() +
(sizeof(uint64_t) + sizeof(uint64_t)) * m_nanoseconds.size() +
(sizeof(uint64_t) + sizeof(lldb::cpu_id_t)) * m_cpus.size();
}
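
A worked example of what GetInterpolatedTime computes, using made-up numbers (none of these values come from the trace above):

```cpp
// Hypothetical ranges, assuming a conversion rate of 0.5 ns per TSC:
//   current range: tsc = 1000, nanos = 500, items_count = 20
//   next range:    tsc = 1100, nanos = 550
//   begin_of_time_nanos = 0
// items_count (20) < next_range->tsc - tsc (100), so we assume 1 TSC per item
// and interpolate towards min(ToNanos(1000 + 20), 550) = min(510, 550) = 510:
//   item_duration = (510 - 500) / 20 = 0.5 ns
//   item i in the range is reported at (500 - 0) + i * 0.5 ns
```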