[PROF-10128] Refactor and speed up profiler stack sampling #3837

Merged: 17 commits, Aug 13, 2024

Changes from 1 commit
8 changes: 8 additions & 0 deletions ext/datadog_profiling_native_extension/collectors_stack.c
@@ -161,6 +161,14 @@ void sample_thread(
return;
}
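
// The commented-out block below is debug-only instrumentation: when enabled,
// it prints what percentage of captured frames hit the same_frame cache.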

// if (captured_frames > 0) {
// int cache_hits = 0;
// for (int i = 0; i < captured_frames; i++) {
// if (buffer->stack_buffer[i].same_frame) cache_hits++;
// }
// fprintf(stderr, "Sampling cache hits: %f\n", ((double) cache_hits / captured_frames) * 100);
// }

// Ruby does not give us path and line number for methods implemented using native code.
// The convention in Kernel#caller_locations is to instead use the path and line number of the first Ruby frame
// on the stack that is below (e.g. directly or indirectly has called) the native method.
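// (Illustrative example, not from the original comment: a backtrace entry for the
// C-implemented Integer#times can be reported as "example.rb:3:in 'times'",
// borrowing path and line from the Ruby frame that called into it.)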
ext/datadog_profiling_native_extension/collectors_thread_context.c
@@ -188,6 +188,7 @@ void update_metrics_and_sample(
VALUE thread_being_sampled,
VALUE stack_from_thread,
struct per_thread_context *thread_context,
sampling_buffer* sampling_buffer,
long current_cpu_time_ns,
long current_monotonic_wall_time_ns
);
@@ -196,6 +197,7 @@ static void trigger_sample_for_thread(
VALUE thread,
VALUE stack_from_thread,
struct per_thread_context *thread_context,
sampling_buffer* sampling_buffer,
sample_values values,
long current_monotonic_wall_time_ns,
ddog_CharSlice *ruby_vm_type,
@@ -476,6 +478,7 @@ void thread_context_collector_sample(VALUE self_instance, long current_monotonic
/* thread_being_sampled: */ thread,
/* stack_from_thread: */ thread,
thread_context,
thread_context->sampling_buffer,
current_cpu_time_ns,
current_monotonic_wall_time_ns
);
@@ -492,6 +495,8 @@
/* thread_being_sampled: */ current_thread,
/* stack_from_thread: */ profiler_overhead_stack_thread,
current_thread_context,
// Here we use the overhead thread's sampling buffer so as to not invalidate the cache in the buffer of the thread being sampled
get_or_create_context_for(profiler_overhead_stack_thread, state)->sampling_buffer,
cpu_time_now_ns(current_thread_context),
monotonic_wall_time_now_ns(RAISE_ON_FAILURE)
);
@@ -502,6 +507,7 @@ void update_metrics_and_sample(
VALUE thread_being_sampled,
VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack
struct per_thread_context *thread_context,
sampling_buffer* sampling_buffer,
long current_cpu_time_ns,
long current_monotonic_wall_time_ns
) {
@@ -527,6 +533,7 @@
thread_being_sampled,
stack_from_thread,
thread_context,
sampling_buffer,
(sample_values) {.cpu_time_ns = cpu_time_elapsed_ns, .cpu_or_wall_samples = 1, .wall_time_ns = wall_time_elapsed_ns},
current_monotonic_wall_time_ns,
NULL,
@@ -706,6 +713,7 @@ static void trigger_sample_for_thread(
VALUE thread,
VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack
struct per_thread_context *thread_context,
sampling_buffer* sampling_buffer,
sample_values values,
long current_monotonic_wall_time_ns,
// These two labels are only used for allocation profiling; @ivoanjo: may want to refactor this at some point?
@@ -826,7 +834,7 @@ static void trigger_sample_for_thread(

sample_thread(
stack_from_thread,
thread_context->sampling_buffer,
sampling_buffer,
state->recorder_instance,
values,
(sample_labels) {.labels = slice_labels, .state_label = state_label, .end_timestamp_ns = end_timestamp_ns}
@@ -1307,11 +1315,14 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned in

track_object(state->recorder_instance, new_object, sample_weight, optional_class_name);

struct per_thread_context *thread_context = get_or_create_context_for(current_thread, state);

trigger_sample_for_thread(
state,
/* thread: */ current_thread,
/* stack_from_thread: */ current_thread,
get_or_create_context_for(current_thread, state),
thread_context,
thread_context->sampling_buffer,
(sample_values) {.alloc_samples = sample_weight, .alloc_samples_unscaled = 1, .heap_sample = true},
INVALID_TIME, // For now we're not collecting timestamps for allocation events, as per profiling team internal discussions
&ruby_vm_type,
25 changes: 23 additions & 2 deletions ext/datadog_profiling_native_extension/private_vm_api_access.c
@@ -392,6 +392,7 @@ calc_lineno(const rb_iseq_t *iseq, const VALUE *pc)
// was called from.
// * Imported fix from https://github.com/ruby/ruby/pull/7116 to avoid sampling threads that are still being created
// * Imported fix from https://github.com/ruby/ruby/pull/8415 to avoid potential crash when using YJIT.
// * Add frame_info.same_frame and logic to skip redoing work if the buffer already contains the same data we're collecting
// * Skipped use of rb_callable_method_entry_t (cme) for Ruby frames as it doesn't impact us.
//
// What is rb_profile_frames?
@@ -466,7 +467,7 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *st
// See comment on `record_placeholder_stack_in_native_code` for a full explanation of what this means (and why we don't just return 0)
if (end_cfp <= cfp) return PLACEHOLDER_STACK_IN_NATIVE_CODE;

for (i=0; i<limit && cfp != end_cfp;) {
for (i=0; i<limit && cfp != end_cfp; cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp)) {
if (cfp->iseq && !cfp->pc) {
// Fix: Do nothing -- this frame should not be used
//
Expand All @@ -479,6 +480,16 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *st
continue;
}
@AlexJF (Contributor) commented on Aug 12, 2024:

So in the past, the same cfp would "eat" all of start, whereas now it'll skip just one? Not sure if this was ever something we relied on, but just highlighting this slight change in behaviour.

@ivoanjo (Member, PR author) replied:

This is a really great catch!

I completely glossed over it because a) we don't use start and b) I remembered some discussion somewhere mentioning it was actually broken.

It's actually fixed nowadays (https://bugs.ruby-lang.org/issues/14607 / ruby/ruby#8280). I had spotted that upstream had moved the cfp = ... into the for header as well, but I hadn't spotted that it was to fix start.

I did spot that upstream also added the start skip logic to the if (cme && cme->def->type == VM_METHOD_TYPE_CFUNC) branch below. I'll push a commit to add this one as well, so we keep closer to upstream in case we ever want to use start.
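
For reference, a minimal sketch of the behaviour change being discussed; the skip condition and loop bodies are simplified assumptions, not this PR's exact code:

// Before: `continue` jumps past the advance at the bottom of the loop body,
// so the very same cfp is revisited and decrements `start` again.
for (i = 0; i < limit && cfp != end_cfp;) {
    if (start > 0) { start--; continue; } // same cfp "eats" all of start
    /* ... record frame, i++ ... */
    cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp);
}

// After: the advance lives in the for header, so `continue` still moves to the
// previous control frame and `start` skips exactly one real frame per pass.
for (i = 0; i < limit && cfp != end_cfp; cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp)) {
    if (start > 0) { start--; continue; }
    /* ... record frame, i++ ... */
}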


stack_buffer[i].same_frame =
stack_buffer[i].is_ruby_frame &&
stack_buffer[i].as.ruby_frame.iseq == (VALUE) cfp->iseq &&
stack_buffer[i].as.ruby_frame.caching_pc == cfp->pc;

if (stack_buffer[i].same_frame) { // Nothing to do, buffer already contains this frame
i++;
continue;
}

// Upstream Ruby has code here to retrieve the rb_callable_method_entry_t (cme) and in some cases to use it
// instead of the iseq.
// In practice, they are usually the same; the difference is that when you have e.g. block, one gets you a
@@ -499,6 +510,7 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *st
// }
// else {
stack_buffer[i].as.ruby_frame.iseq = (VALUE)cfp->iseq;
stack_buffer[i].as.ruby_frame.caching_pc = (void *) cfp->pc;
// }
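// Note: caching_pc is stored purely so that the next sample can cheaply detect
// an unchanged frame via the same_frame comparison above.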

// The topmost frame may not have an updated PC because the JIT
@@ -521,12 +533,21 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *st
else {
cme = rb_vm_frame_method_entry(cfp);
if (cme && cme->def->type == VM_METHOD_TYPE_CFUNC) {
stack_buffer[i].same_frame =
!stack_buffer[i].is_ruby_frame &&
stack_buffer[i].as.native_frame.caching_cme == (VALUE) cme;

if (stack_buffer[i].same_frame) { // Nothing to do, buffer already contains this frame
i++;
continue;
}

stack_buffer[i].as.native_frame.caching_cme = (VALUE)cme;
stack_buffer[i].as.native_frame.method_id = cme->def->original_id;
stack_buffer[i].is_ruby_frame = false;
i++;
}
}
cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp);
}

return i;
ext/datadog_profiling_native_extension/private_vm_api_access.h
@@ -22,13 +22,16 @@ typedef struct frame_info {
union {
struct {
VALUE iseq;
void *caching_pc; // For caching only
int line;
} ruby_frame;
struct {
VALUE caching_cme; // For caching only
ID method_id;
} native_frame;
} as;
A contributor commented:

Such fluent! Much API! Wow!

The PR author replied:

Something something great artists copy.

bool is_ruby_frame : 1;
bool same_frame : 1;
} frame_info;

rb_nativethread_id_t pthread_id_for(VALUE thread);
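
As an illustration of what the caching fields enable, a hypothetical consumer loop; resolve_ruby_frame and resolve_native_frame are made-up helpers standing in for whatever name/path/line resolution the sampler performs:

for (int i = 0; i < captured_frames; i++) {
    if (stack_buffer[i].same_frame) continue; // reuse previously resolved data for this slot
    if (stack_buffer[i].is_ruby_frame) {
        resolve_ruby_frame(stack_buffer[i].as.ruby_frame.iseq, stack_buffer[i].as.ruby_frame.line);
    } else {
        resolve_native_frame(stack_buffer[i].as.native_frame.method_id);
    }
}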