From d5062c535bbeccbcbfb612f07dbb3e6245e29180 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 8 Aug 2024 11:56:20 +0100 Subject: [PATCH 01/17] Minor: Make argument name in header match actual function They were out-of-sync >_> --- .../collectors_thread_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index 5b4de6dbf28..cbef6c9e883 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -184,7 +184,7 @@ static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_ void update_metrics_and_sample( struct thread_context_collector_state *state, VALUE thread_being_sampled, - VALUE profiler_overhead_stack_thread, + VALUE stack_from_thread, struct per_thread_context *thread_context, long current_cpu_time_ns, long current_monotonic_wall_time_ns From 9c61d60f59d1eccb78b75910a06d63f2efb84f2d Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 8 Aug 2024 12:01:06 +0100 Subject: [PATCH 02/17] Add helper function to clear per_thread_context This will be more useful later once freeing a per_thread_context gets more complex. --- .../collectors_thread_context.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index cbef6c9e883..d1689914e5d 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -203,6 +203,7 @@ static VALUE _native_thread_list(VALUE self); static struct per_thread_context *get_or_create_context_for(VALUE thread, struct thread_context_collector_state *state); static struct per_thread_context *get_context_for(VALUE thread, struct thread_context_collector_state *state); static void initialize_context(VALUE thread, struct per_thread_context *thread_context, struct thread_context_collector_state *state); +static void free_context(struct per_thread_context* thread_context); static VALUE _native_inspect(VALUE self, VALUE collector_instance); static VALUE per_thread_context_st_table_as_ruby_hash(struct thread_context_collector_state *state); static int per_thread_context_as_ruby_hash(st_data_t key_thread, st_data_t value_context, st_data_t result_hash); @@ -329,8 +330,8 @@ static int hash_map_per_thread_context_mark(st_data_t key_thread, DDTRACE_UNUSED // Used to clear each of the per_thread_contexts inside the hash_map_per_thread_context static int hash_map_per_thread_context_free_values(DDTRACE_UNUSED st_data_t _thread, st_data_t value_per_thread_context, DDTRACE_UNUSED st_data_t _argument) { - struct per_thread_context *per_thread_context = (struct per_thread_context*) value_per_thread_context; - ruby_xfree(per_thread_context); + struct per_thread_context *thread_context = (struct per_thread_context*) value_per_thread_context; + free_context(thread_context); return ST_CONTINUE; } @@ -928,6 +929,10 @@ static void initialize_context(VALUE thread, struct per_thread_context *thread_c thread_context->gc_tracking.wall_time_at_start_ns = INVALID_TIME; } +static void free_context(struct per_thread_context* thread_context) { + ruby_xfree(thread_context); +} + static VALUE _native_inspect(DDTRACE_UNUSED VALUE _self, VALUE collector_instance) { struct thread_context_collector_state *state; TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); @@ -1019,7 +1024,7 @@ static int remove_if_dead_thread(st_data_t key_thread, st_data_t value_context, if (is_thread_alive(thread)) return ST_CONTINUE; - ruby_xfree(thread_context); + free_context(thread_context); return ST_DELETE; } From 3d4b7fcf30b529b191ca737ae13629eb27b8ab63 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 8 Aug 2024 12:11:27 +0100 Subject: [PATCH 03/17] Remove sampling_buffer argument from record_placeholder_stack Recording a placeholder stack does not involve any sampling, so it doesn't actually need a sampling_buffer. --- .../collectors_stack.c | 16 +++++++--------- .../collectors_stack.h | 1 - .../collectors_thread_context.c | 2 -- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_stack.c b/ext/datadog_profiling_native_extension/collectors_stack.c index 58ad5199cfb..aa7f9e55ed9 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.c +++ b/ext/datadog_profiling_native_extension/collectors_stack.c @@ -33,7 +33,7 @@ static VALUE _native_sample( VALUE in_gc ); static void maybe_add_placeholder_frames_omitted(VALUE thread, sampling_buffer* buffer, char *frames_omitted_message, int frames_omitted_message_size); -static void record_placeholder_stack_in_native_code(sampling_buffer* buffer, VALUE recorder_instance, sample_values values, sample_labels labels); +static void record_placeholder_stack_in_native_code(VALUE recorder_instance, sample_values values, sample_labels labels); static void maybe_trim_template_random_ids(ddog_CharSlice *name_slice, ddog_CharSlice *filename_slice); void collectors_stack_init(VALUE profiling_module) { @@ -111,7 +111,6 @@ static VALUE _native_sample( if (in_gc == Qtrue) { record_placeholder_stack( - buffer, recorder_instance, values, (sample_labels) {.labels = slice_labels, .state_label = state_label}, @@ -160,7 +159,7 @@ void sample_thread( ); if (captured_frames == PLACEHOLDER_STACK_IN_NATIVE_CODE) { - record_placeholder_stack_in_native_code(buffer, recorder_instance, values, labels); + record_placeholder_stack_in_native_code(recorder_instance, values, labels); return; } @@ -355,13 +354,11 @@ static void maybe_add_placeholder_frames_omitted(VALUE thread, sampling_buffer* // with one containing a placeholder frame, so that these threads are properly represented in the UX. static void record_placeholder_stack_in_native_code( - sampling_buffer* buffer, VALUE recorder_instance, sample_values values, sample_labels labels ) { record_placeholder_stack( - buffer, recorder_instance, values, labels, @@ -370,18 +367,19 @@ static void record_placeholder_stack_in_native_code( } void record_placeholder_stack( - sampling_buffer* buffer, VALUE recorder_instance, sample_values values, sample_labels labels, ddog_CharSlice placeholder_stack ) { - ddog_prof_Function placeholder = {.name = DDOG_CHARSLICE_C(""), .filename = placeholder_stack}; - buffer->locations[0] = (ddog_prof_Location) {.function = placeholder, .line = 0}; + ddog_prof_Location placeholder_location = { + .function = {.name = DDOG_CHARSLICE_C(""), .filename = placeholder_stack}, + .line = 0, + }; record_sample( recorder_instance, - (ddog_prof_Slice_Location) {.ptr = buffer->locations, .len = 1}, + (ddog_prof_Slice_Location) {.ptr = &placeholder_location, .len = 1}, values, labels ); diff --git a/ext/datadog_profiling_native_extension/collectors_stack.h b/ext/datadog_profiling_native_extension/collectors_stack.h index 816407ff8cc..86a88d7d874 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.h +++ b/ext/datadog_profiling_native_extension/collectors_stack.h @@ -17,7 +17,6 @@ void sample_thread( sample_labels labels ); void record_placeholder_stack( - sampling_buffer* buffer, VALUE recorder_instance, sample_values values, sample_labels labels, diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index d1689914e5d..0b963d81ab9 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -675,7 +675,6 @@ VALUE thread_context_collector_sample_after_gc(VALUE self_instance) { } record_placeholder_stack( - state->sampling_buffer, state->recorder_instance, (sample_values) { // This event gets both a regular cpu/wall-time duration, as a normal cpu/wall-time sample would, as well as a @@ -1432,7 +1431,6 @@ void thread_context_collector_sample_skipped_allocation_samples(VALUE self_insta ddog_prof_Slice_Label slice_labels = {.ptr = labels, .len = sizeof(labels) / sizeof(labels[0])}; record_placeholder_stack( - state->sampling_buffer, state->recorder_instance, (sample_values) {.alloc_samples = skipped_samples}, (sample_labels) { From 5fd0460605f1215894bfa6d397ea4d54247da552 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 8 Aug 2024 12:16:06 +0100 Subject: [PATCH 04/17] Extract checking max_frames into a separate function This will be used by the thread context collector to validate max frames during initialization, separately from creating new sampling buffers. --- ext/datadog_profiling_native_extension/collectors_stack.c | 7 ++++++- ext/datadog_profiling_native_extension/collectors_stack.h | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_stack.c b/ext/datadog_profiling_native_extension/collectors_stack.c index aa7f9e55ed9..ba4b71ef3ee 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.c +++ b/ext/datadog_profiling_native_extension/collectors_stack.c @@ -385,9 +385,14 @@ void record_placeholder_stack( ); } -sampling_buffer *sampling_buffer_new(unsigned int max_frames) { +uint16_t sampling_buffer_check_max_frames(int max_frames) { if (max_frames < 5) rb_raise(rb_eArgError, "Invalid max_frames: value must be >= 5"); if (max_frames > MAX_FRAMES_LIMIT) rb_raise(rb_eArgError, "Invalid max_frames: value must be <= " MAX_FRAMES_LIMIT_AS_STRING); + return max_frames; +} + +sampling_buffer *sampling_buffer_new(uint16_t max_frames) { + sampling_buffer_check_max_frames(max_frames); // Note: never returns NULL; if out of memory, it calls the Ruby out-of-memory handlers sampling_buffer* buffer = ruby_xcalloc(1, sizeof(sampling_buffer)); diff --git a/ext/datadog_profiling_native_extension/collectors_stack.h b/ext/datadog_profiling_native_extension/collectors_stack.h index 86a88d7d874..7b30adf169c 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.h +++ b/ext/datadog_profiling_native_extension/collectors_stack.h @@ -22,5 +22,6 @@ void record_placeholder_stack( sample_labels labels, ddog_CharSlice placeholder_stack ); -sampling_buffer *sampling_buffer_new(unsigned int max_frames); +uint16_t sampling_buffer_check_max_frames(int max_frames); +sampling_buffer *sampling_buffer_new(uint16_t max_frames); void sampling_buffer_free(sampling_buffer *buffer); From dc59c0ac47b2cfcedd6bf178bf09b6b879b40095 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 8 Aug 2024 12:18:03 +0100 Subject: [PATCH 05/17] Make sampling_buffers per-thread This doesn't yet change anything; this refactor will later allow us to reuse data from the sampling_buffers from sample to sample. This does slightly increase memory usage, but not by as much as may be expected. Ignoring the locations (these will get shared in a later commit), a sampling buffer has max_frames * (VALUE, int, bool) which is less than 10KiB per thread. --- .../collectors_thread_context.c | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index 0b963d81ab9..c2775a9d987 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -92,7 +92,7 @@ struct thread_context_collector_state { // "Update this when modifying state struct" // Required by Datadog::Profiling::Collectors::Stack as a scratch buffer during sampling - sampling_buffer *sampling_buffer; + uint16_t max_frames; // Hashmap st_table *hash_map_per_thread_context; // Datadog::Profiling::StackRecorder instance @@ -138,6 +138,7 @@ struct thread_context_collector_state { // Tracks per-thread state struct per_thread_context { + sampling_buffer *sampling_buffer; char thread_id[THREAD_ID_LIMIT_CHARS]; ddog_CharSlice thread_id_char_slice; char thread_invoke_location[THREAD_INVOKE_LOCATION_LIMIT_CHARS]; @@ -309,10 +310,6 @@ static void thread_context_collector_typed_data_free(void *state_ptr) { // Update this when modifying state struct - // Important: Remember that we're only guaranteed to see here what's been set in _native_new, aka - // pointers that have been set NULL there may still be NULL here. - if (state->sampling_buffer != NULL) sampling_buffer_free(state->sampling_buffer); - // Free each entry in the map st_foreach(state->hash_map_per_thread_context, hash_map_per_thread_context_free_values, 0 /* unused */); // ...and then the map @@ -342,7 +339,7 @@ static VALUE _native_new(VALUE klass) { // being leaked. // Update this when modifying state struct - state->sampling_buffer = NULL; + state->max_frames = 0; state->hash_map_per_thread_context = // "numtable" is an awful name, but TL;DR it's what should be used when keys are `VALUE`s. st_init_numtable(); @@ -389,11 +386,8 @@ static VALUE _native_initialize( struct thread_context_collector_state *state; TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); - int max_frames_requested = NUM2INT(max_frames); - if (max_frames_requested < 0) rb_raise(rb_eArgError, "Invalid max_frames: value must not be negative"); - // Update this when modifying state struct - state->sampling_buffer = sampling_buffer_new(max_frames_requested); + state->max_frames = sampling_buffer_check_max_frames(NUM2INT(max_frames)); // hash_map_per_thread_context is already initialized, nothing to do here state->recorder_instance = enforce_recorder_instance(recorder_instance); state->endpoint_collection_enabled = (endpoint_collection_enabled == Qtrue); @@ -825,7 +819,7 @@ static void trigger_sample_for_thread( sample_thread( stack_from_thread, - state->sampling_buffer, + thread_context->sampling_buffer, state->recorder_instance, values, (sample_labels) {.labels = slice_labels, .state_label = state_label, .end_timestamp_ns = end_timestamp_ns} @@ -888,6 +882,8 @@ static bool is_logging_gem_monkey_patch(VALUE invoke_file_location) { } static void initialize_context(VALUE thread, struct per_thread_context *thread_context, struct thread_context_collector_state *state) { + thread_context->sampling_buffer = sampling_buffer_new(state->max_frames); + snprintf(thread_context->thread_id, THREAD_ID_LIMIT_CHARS, "%"PRIu64" (%lu)", native_thread_id_for(thread), (unsigned long) thread_id_for(thread)); thread_context->thread_id_char_slice = (ddog_CharSlice) {.ptr = thread_context->thread_id, .len = strlen(thread_context->thread_id)}; @@ -929,6 +925,7 @@ static void initialize_context(VALUE thread, struct per_thread_context *thread_c } static void free_context(struct per_thread_context* thread_context) { + sampling_buffer_free(thread_context->sampling_buffer); ruby_xfree(thread_context); } @@ -939,6 +936,7 @@ static VALUE _native_inspect(DDTRACE_UNUSED VALUE _self, VALUE collector_instanc VALUE result = rb_str_new2(" (native state)"); // Update this when modifying state struct + rb_str_concat(result, rb_sprintf(" max_frames=%d", state->max_frames)); rb_str_concat(result, rb_sprintf(" hash_map_per_thread_context=%"PRIsVALUE, per_thread_context_st_table_as_ruby_hash(state))); rb_str_concat(result, rb_sprintf(" recorder_instance=%"PRIsVALUE, state->recorder_instance)); VALUE tracer_context_key = state->tracer_context_key == MISSING_TRACER_CONTEXT_KEY ? Qnil : ID2SYM(state->tracer_context_key); From 99d5915260f7937753d641de3707fa1423d8d072 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 8 Aug 2024 12:28:41 +0100 Subject: [PATCH 06/17] Avoid allocating locations for every sampling_buffer/thread The locations, unlike the rest of the sampling buffer, do not (yet?) need to be per-thread, so let's save some memory. --- .../collectors_stack.c | 13 +++++++------ .../collectors_stack.h | 2 +- .../collectors_thread_context.c | 9 ++++++++- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_stack.c b/ext/datadog_profiling_native_extension/collectors_stack.c index ba4b71ef3ee..24950caa81b 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.c +++ b/ext/datadog_profiling_native_extension/collectors_stack.c @@ -16,10 +16,10 @@ static VALUE missing_string = Qnil; // Used as scratch space during sampling struct sampling_buffer { unsigned int max_frames; + ddog_prof_Location *locations; VALUE *stack_buffer; int *lines_buffer; bool *is_ruby_frame; - ddog_prof_Location *locations; }; // Note: typedef'd in the header to sampling_buffer static VALUE _native_sample( @@ -105,7 +105,8 @@ static VALUE _native_sample( int max_frames_requested = NUM2INT(max_frames); if (max_frames_requested < 0) rb_raise(rb_eArgError, "Invalid max_frames: value must not be negative"); - sampling_buffer *buffer = sampling_buffer_new(max_frames_requested); + ddog_prof_Location *locations = ruby_xcalloc(max_frames_requested, sizeof(ddog_prof_Location)); + sampling_buffer *buffer = sampling_buffer_new(max_frames_requested, locations); ddog_prof_Slice_Label slice_labels = {.ptr = labels, .len = labels_count}; @@ -126,6 +127,7 @@ static VALUE _native_sample( ); } + ruby_xfree(locations); sampling_buffer_free(buffer); return Qtrue; @@ -391,18 +393,17 @@ uint16_t sampling_buffer_check_max_frames(int max_frames) { return max_frames; } -sampling_buffer *sampling_buffer_new(uint16_t max_frames) { +sampling_buffer *sampling_buffer_new(uint16_t max_frames, ddog_prof_Location *locations) { sampling_buffer_check_max_frames(max_frames); // Note: never returns NULL; if out of memory, it calls the Ruby out-of-memory handlers sampling_buffer* buffer = ruby_xcalloc(1, sizeof(sampling_buffer)); buffer->max_frames = max_frames; - + buffer->locations = locations; buffer->stack_buffer = ruby_xcalloc(max_frames, sizeof(VALUE)); buffer->lines_buffer = ruby_xcalloc(max_frames, sizeof(int)); buffer->is_ruby_frame = ruby_xcalloc(max_frames, sizeof(bool)); - buffer->locations = ruby_xcalloc(max_frames, sizeof(ddog_prof_Location)); return buffer; } @@ -410,10 +411,10 @@ sampling_buffer *sampling_buffer_new(uint16_t max_frames) { void sampling_buffer_free(sampling_buffer *buffer) { if (buffer == NULL) rb_raise(rb_eArgError, "sampling_buffer_free called with NULL buffer"); + // buffer->locations are owned by whoever called sampling_buffer_new, not us ruby_xfree(buffer->stack_buffer); ruby_xfree(buffer->lines_buffer); ruby_xfree(buffer->is_ruby_frame); - ruby_xfree(buffer->locations); ruby_xfree(buffer); } diff --git a/ext/datadog_profiling_native_extension/collectors_stack.h b/ext/datadog_profiling_native_extension/collectors_stack.h index 7b30adf169c..96d8b10355c 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.h +++ b/ext/datadog_profiling_native_extension/collectors_stack.h @@ -23,5 +23,5 @@ void record_placeholder_stack( ddog_CharSlice placeholder_stack ); uint16_t sampling_buffer_check_max_frames(int max_frames); -sampling_buffer *sampling_buffer_new(uint16_t max_frames); +sampling_buffer *sampling_buffer_new(uint16_t max_frames, ddog_prof_Location *locations); void sampling_buffer_free(sampling_buffer *buffer); diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index c2775a9d987..8f10900f3a9 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -92,6 +92,7 @@ struct thread_context_collector_state { // "Update this when modifying state struct" // Required by Datadog::Profiling::Collectors::Stack as a scratch buffer during sampling + ddog_prof_Location *locations; uint16_t max_frames; // Hashmap st_table *hash_map_per_thread_context; @@ -310,6 +311,10 @@ static void thread_context_collector_typed_data_free(void *state_ptr) { // Update this when modifying state struct + // Important: Remember that we're only guaranteed to see here what's been set in _native_new, aka + // pointers that have been set NULL there may still be NULL here. + if (state->locations != NULL) ruby_xfree(state->locations); + // Free each entry in the map st_foreach(state->hash_map_per_thread_context, hash_map_per_thread_context_free_values, 0 /* unused */); // ...and then the map @@ -339,6 +344,7 @@ static VALUE _native_new(VALUE klass) { // being leaked. // Update this when modifying state struct + state->locations = NULL; state->max_frames = 0; state->hash_map_per_thread_context = // "numtable" is an awful name, but TL;DR it's what should be used when keys are `VALUE`s. @@ -388,6 +394,7 @@ static VALUE _native_initialize( // Update this when modifying state struct state->max_frames = sampling_buffer_check_max_frames(NUM2INT(max_frames)); + state->locations = ruby_xcalloc(state->max_frames, sizeof(ddog_prof_Location)); // hash_map_per_thread_context is already initialized, nothing to do here state->recorder_instance = enforce_recorder_instance(recorder_instance); state->endpoint_collection_enabled = (endpoint_collection_enabled == Qtrue); @@ -882,7 +889,7 @@ static bool is_logging_gem_monkey_patch(VALUE invoke_file_location) { } static void initialize_context(VALUE thread, struct per_thread_context *thread_context, struct thread_context_collector_state *state) { - thread_context->sampling_buffer = sampling_buffer_new(state->max_frames); + thread_context->sampling_buffer = sampling_buffer_new(state->max_frames, state->locations); snprintf(thread_context->thread_id, THREAD_ID_LIMIT_CHARS, "%"PRIu64" (%lu)", native_thread_id_for(thread), (unsigned long) thread_id_for(thread)); thread_context->thread_id_char_slice = (ddog_CharSlice) {.ptr = thread_context->thread_id, .len = strlen(thread_context->thread_id)}; From 05e552ccf0d1ab9a02e8789e8be4485eceda0c0e Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 8 Aug 2024 13:57:10 +0100 Subject: [PATCH 07/17] Use smaller type for max_frames Not really to save memory, but semantically it's closer to what we want. --- ext/datadog_profiling_native_extension/collectors_stack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/datadog_profiling_native_extension/collectors_stack.c b/ext/datadog_profiling_native_extension/collectors_stack.c index 24950caa81b..7b7e94fdd58 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.c +++ b/ext/datadog_profiling_native_extension/collectors_stack.c @@ -15,7 +15,7 @@ static VALUE missing_string = Qnil; // Used as scratch space during sampling struct sampling_buffer { - unsigned int max_frames; + uint16_t max_frames; ddog_prof_Location *locations; VALUE *stack_buffer; int *lines_buffer; From 71839e03ee792cb8d90772d7b92475f89c81ac74 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 8 Aug 2024 14:00:39 +0100 Subject: [PATCH 08/17] Avoid repeating call to rb_profile_frame_path for cframes Instead of storing the `last_ruby_frame` just to later need to re-retrieve the filename, let's directly store the `last_ruby_frame_filename`. --- .../collectors_stack.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_stack.c b/ext/datadog_profiling_native_extension/collectors_stack.c index 7b7e94fdd58..6b13ed6b521 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.c +++ b/ext/datadog_profiling_native_extension/collectors_stack.c @@ -169,8 +169,8 @@ void sample_thread( // The convention in Kernel#caller_locations is to instead use the path and line number of the first Ruby frame // on the stack that is below (e.g. directly or indirectly has called) the native method. // Thus, we keep that frame here to able to replicate that behavior. - // (This is why we also iterate the sampling buffers backwards below -- so that it's easier to keep the last_ruby_frame) - VALUE last_ruby_frame = Qnil; + // (This is why we also iterate the sampling buffers backwards below -- so that it's easier to keep the last_ruby_frame_filename) + VALUE last_ruby_frame_filename = Qnil; int last_ruby_line = 0; ddog_prof_Label *state_label = labels.state_label; @@ -187,15 +187,15 @@ void sample_thread( int line; if (buffer->is_ruby_frame[i]) { - last_ruby_frame = buffer->stack_buffer[i]; - last_ruby_line = buffer->lines_buffer[i]; - name = rb_profile_frame_base_label(buffer->stack_buffer[i]); filename = rb_profile_frame_path(buffer->stack_buffer[i]); line = buffer->lines_buffer[i]; + + last_ruby_frame_filename = filename; + last_ruby_line = line; } else { name = ddtrace_rb_profile_frame_method_name(buffer->stack_buffer[i]); - filename = NIL_P(last_ruby_frame) ? Qnil : rb_profile_frame_path(last_ruby_frame); + filename = last_ruby_frame_filename; line = last_ruby_line; } From ca040900f8f2b051a417264fe7dcdf4e124c50f6 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 8 Aug 2024 14:13:57 +0100 Subject: [PATCH 09/17] Avoid retrieving rb_callable_method_entry_t in ddtrace_rb_profile_frames As explained in the added comment, for our use this is expected to be equivalent (and our integration tests that compare our output with the Ruby stack trace APIs still pass). This gets us a bit less of work during sampling (but not a lot), but more importantly it will make it easier to add caching in a later commit. --- .../private_vm_api_access.c | 49 +++++++++---------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.c b/ext/datadog_profiling_native_extension/private_vm_api_access.c index 52783619681..435e2a7bdb7 100644 --- a/ext/datadog_profiling_native_extension/private_vm_api_access.c +++ b/ext/datadog_profiling_native_extension/private_vm_api_access.c @@ -392,6 +392,7 @@ calc_lineno(const rb_iseq_t *iseq, const VALUE *pc) // was called from. // * Imported fix from https://github.com/ruby/ruby/pull/7116 to avoid sampling threads that are still being created // * Imported fix from https://github.com/ruby/ruby/pull/8415 to avoid potential crash when using YJIT. +// * Skipped use of rb_callable_method_entry_t (cme) for Ruby frames as it doesn't impact us. // // What is rb_profile_frames? // `rb_profile_frames` is a Ruby VM debug API added for use by profilers for sampling the stack trace of a Ruby thread. @@ -479,33 +480,27 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, VALUE *buff, i continue; } - /* record frame info */ - cme = rb_vm_frame_method_entry(cfp); - - if (cme && cme->def->type == VM_METHOD_TYPE_ISEQ && - // Fix: Do not use callable method entry when iseq is for an eval. - // TL;DR: This fix is needed for us to match the Ruby reference API information in the - // "when sampling an eval/instance eval inside an object" spec. - // - // Longer note: - // When a frame is a ruby frame (VM_FRAME_RUBYFRAME_P above), we can get information about it - // by introspecting both the callable method entry, as well as the iseq directly. - // Often they match... but sometimes they provide different info (as in the "iseq for an eval" situation - // here). - // If my reading of vm_backtrace.c is correct, the actual Ruby stack trace API **never** uses the - // callable method entry for Ruby frames, but only for VM_METHOD_TYPE_CFUNC (see `backtrace_each` method - // on that file). - // So... why does `rb_profile_frames` do something different? Is it a bug? Is it because it exposes - // more information than the Ruby stack frame API? - // As a final note, the `backtracie` gem (https://github.com/ivoanjo/backtracie) can be used to introspect - // the full metadata provided by both the callable method entry as well as the iseq, and is really useful - // to debug and learn more about these differences. - cfp->iseq->body->type != ISEQ_TYPE_EVAL) { - buff[i] = (VALUE)cme; - } - else { - buff[i] = (VALUE)cfp->iseq; - } + // Upstream Ruby has code here to retrieve the rb_callable_method_entry_t (cme) and in some cases to use it + // instead of the iseq. + // In practice, they are usually the same; the difference is that when you have e.g. block, one gets you a + // reference to the block, and the other to the method containing the block. + // This would be important if we used `rb_profile_frame_label` and wanted the "block in foo" label instead + // of just "foo". But we're currently using `rb_profile_frame_base_label` which I believe is always the same + // between the rb_callable_method_entry_t and the iseq. Thus, to simplify a bit our logic and reduce a bit + // the overhead, we always use the iseq here. + // + // cme = rb_vm_frame_method_entry(cfp); + + // if (cme && cme->def->type == VM_METHOD_TYPE_ISEQ && + // // Fix: Do not use callable method entry when iseq is for an eval. + // // TL;DR: This fix is needed for us to match the Ruby reference API information in the + // // "when sampling an eval/instance eval inside an object" spec. + // cfp->iseq->body->type != ISEQ_TYPE_EVAL) { + // buff[i] = (VALUE)cme; + // } + // else { + buff[i] = (VALUE)cfp->iseq; + // } // The topmost frame may not have an updated PC because the JIT // may not have set one. The JIT compiler will update the PC From 90395722fef4c1428b4c4a5357cf1a639dc458f7 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 8 Aug 2024 14:26:28 +0100 Subject: [PATCH 10/17] Introduce `frame_info` to contain sampled stack frame information Rather than having separate arrays for each kind of information we want to get from a stack (VALUE *stack_buffer + int *lines_buffer + bool *is_ruby_frame), introduce a new `frame_info` struct to contain this information. This makes it easy to then add or change members in this struct without having to think about allocating yet more arrays, and managing their lifetimes and passing them in as arguments. (Spoiler: We'll be using this in a commit very soon to introduce caching) As a side bonus of this change, I noticed that we could just expose the method name for native methods as an `ID`, rather than handing over the `rb_callable_method_entry_t` ("cme") as we did before. This has a few advantages: * It's effectively a way to inline the work done by the previous call to `ddtrace_rb_profile_frame_method_name`. Because we had to copy/paste some code from the Ruby sources to support `ddtrace_rb_profile_frame_method_name` on legacy Rubies, we no longer need that code and can delete it! * An `ID` is interned/immutable, so it's much easier to later cache if we want to. (Although no plans yet...) --- .../collectors_stack.c | 26 ++-- .../extconf.rb | 3 - .../private_vm_api_access.c | 120 ++---------------- .../private_vm_api_access.h | 25 ++-- 4 files changed, 33 insertions(+), 141 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_stack.c b/ext/datadog_profiling_native_extension/collectors_stack.c index 6b13ed6b521..4bcaaa768ac 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.c +++ b/ext/datadog_profiling_native_extension/collectors_stack.c @@ -17,9 +17,7 @@ static VALUE missing_string = Qnil; struct sampling_buffer { uint16_t max_frames; ddog_prof_Location *locations; - VALUE *stack_buffer; - int *lines_buffer; - bool *is_ruby_frame; + frame_info *stack_buffer; }; // Note: typedef'd in the header to sampling_buffer static VALUE _native_sample( @@ -155,9 +153,7 @@ void sample_thread( thread, 0 /* stack starting depth */, buffer->max_frames, - buffer->stack_buffer, - buffer->lines_buffer, - buffer->is_ruby_frame + buffer->stack_buffer ); if (captured_frames == PLACEHOLDER_STACK_IN_NATIVE_CODE) { @@ -186,15 +182,15 @@ void sample_thread( VALUE name, filename; int line; - if (buffer->is_ruby_frame[i]) { - name = rb_profile_frame_base_label(buffer->stack_buffer[i]); - filename = rb_profile_frame_path(buffer->stack_buffer[i]); - line = buffer->lines_buffer[i]; + if (buffer->stack_buffer[i].is_ruby_frame) { + name = rb_profile_frame_base_label(buffer->stack_buffer[i].as.ruby_frame.iseq); + filename = rb_profile_frame_path(buffer->stack_buffer[i].as.ruby_frame.iseq); + line = buffer->stack_buffer[i].as.ruby_frame.line; last_ruby_frame_filename = filename; last_ruby_line = line; } else { - name = ddtrace_rb_profile_frame_method_name(buffer->stack_buffer[i]); + name = rb_id2str(buffer->stack_buffer[i].as.native_frame.method_id); filename = last_ruby_frame_filename; line = last_ruby_line; } @@ -215,7 +211,7 @@ void sample_thread( // approximation, and in the future we hope to replace this with a more accurate approach (such as using the // GVL instrumentation API.) if (top_of_the_stack && only_wall_time) { - if (!buffer->is_ruby_frame[i]) { + if (!buffer->stack_buffer[i].is_ruby_frame) { // We know that known versions of Ruby implement these using native code; thus if we find a method with the // same name that is not native code, we ignore it, as it's probably a user method that coincidentally // has the same name. Thus, even though "matching just by method name" is kinda weak, @@ -401,9 +397,7 @@ sampling_buffer *sampling_buffer_new(uint16_t max_frames, ddog_prof_Location *lo buffer->max_frames = max_frames; buffer->locations = locations; - buffer->stack_buffer = ruby_xcalloc(max_frames, sizeof(VALUE)); - buffer->lines_buffer = ruby_xcalloc(max_frames, sizeof(int)); - buffer->is_ruby_frame = ruby_xcalloc(max_frames, sizeof(bool)); + buffer->stack_buffer = ruby_xcalloc(max_frames, sizeof(frame_info)); return buffer; } @@ -413,8 +407,6 @@ void sampling_buffer_free(sampling_buffer *buffer) { // buffer->locations are owned by whoever called sampling_buffer_new, not us ruby_xfree(buffer->stack_buffer); - ruby_xfree(buffer->lines_buffer); - ruby_xfree(buffer->is_ruby_frame); ruby_xfree(buffer); } diff --git a/ext/datadog_profiling_native_extension/extconf.rb b/ext/datadog_profiling_native_extension/extconf.rb index e2eebefd411..72ab79e167e 100644 --- a/ext/datadog_profiling_native_extension/extconf.rb +++ b/ext/datadog_profiling_native_extension/extconf.rb @@ -172,9 +172,6 @@ def add_compiler_flag(flag) # obj_to_id_tbl mappings. $defs << '-DHAVE_WORKING_RB_GC_FORCE_RECYCLE' if RUBY_VERSION < '3.1' -# On older Rubies, we need to use a backported version of this function. See private_vm_api_access.h for details. -$defs << '-DUSE_BACKPORTED_RB_PROFILE_FRAME_METHOD_NAME' if RUBY_VERSION < '3' - # On older Rubies, there are no Ractors $defs << '-DNO_RACTORS' if RUBY_VERSION < '3' diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.c b/ext/datadog_profiling_native_extension/private_vm_api_access.c index 435e2a7bdb7..9a060cbaf62 100644 --- a/ext/datadog_profiling_native_extension/private_vm_api_access.c +++ b/ext/datadog_profiling_native_extension/private_vm_api_access.c @@ -376,8 +376,8 @@ calc_lineno(const rb_iseq_t *iseq, const VALUE *pc) // Copyright (C) 1993-2012 Yukihiro Matsumoto // Modifications: // * Renamed rb_profile_frames => ddtrace_rb_profile_frames -// * Add thread argument -// * Add is_ruby_frame argument +// * Add thread argument (this is now upstream, actually!) +// * Add frame_flags.is_ruby_frame argument // * Removed `if (lines)` tests -- require/assume that like `buff`, `lines` is always specified // * Skip dummy frame that shows up in main thread // * Add `end_cfp == NULL` and `end_cfp <= cfp` safety checks. These are used in a bunch of places in @@ -422,8 +422,7 @@ calc_lineno(const rb_iseq_t *iseq, const VALUE *pc) // and friends). We've found quite a few situations where the data from rb_profile_frames and the reference APIs // disagree, and quite a few of them seem oversights/bugs (speculation from my part) rather than deliberate // decisions. -int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, VALUE *buff, int *lines, bool* is_ruby_frame) -{ +int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *stack_buffer) { int i; // Modified from upstream: Instead of using `GET_EC` to collect info from the current thread, // support sampling any thread (including the current) passed as an argument @@ -499,7 +498,7 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, VALUE *buff, i // buff[i] = (VALUE)cme; // } // else { - buff[i] = (VALUE)cfp->iseq; + stack_buffer[i].as.ruby_frame.iseq = (VALUE)cfp->iseq; // } // The topmost frame may not have an updated PC because the JIT @@ -508,23 +507,22 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, VALUE *buff, i // so only the topmost frame could possibly have an out of date PC #ifndef NO_JIT_RETURN if (cfp == top && cfp->jit_return) { - lines[i] = 0; + stack_buffer[i].as.ruby_frame.line = 0; } else { - lines[i] = calc_lineno(cfp->iseq, cfp->pc); + stack_buffer[i].as.ruby_frame.line = calc_lineno(cfp->iseq, cfp->pc); } #else // Ruby < 3.1 - lines[i] = calc_lineno(cfp->iseq, cfp->pc); + stack_buffer[i].as.ruby_frame.line = calc_lineno(cfp->iseq, cfp->pc); #endif - is_ruby_frame[i] = true; + stack_buffer[i].is_ruby_frame = true; i++; } else { cme = rb_vm_frame_method_entry(cfp); if (cme && cme->def->type == VM_METHOD_TYPE_CFUNC) { - buff[i] = (VALUE)cme; - lines[i] = 0; - is_ruby_frame[i] = false; + stack_buffer[i].as.native_frame.method_id = cme->def->original_id; + stack_buffer[i].is_ruby_frame = false; i++; } } @@ -534,104 +532,6 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, VALUE *buff, i return i; } -#ifdef USE_BACKPORTED_RB_PROFILE_FRAME_METHOD_NAME - -// Taken from upstream vm_backtrace.c at commit 5f10bd634fb6ae8f74a4ea730176233b0ca96954 (March 2022, Ruby 3.2 trunk) -// Copyright (C) 1993-2012 Yukihiro Matsumoto -// to support our custom rb_profile_frame_method_name (see below) -// Modifications: None -static VALUE -id2str(ID id) -{ - VALUE str = rb_id2str(id); - if (!str) return Qnil; - return str; -} -#define rb_id2str(id) id2str(id) - -// Taken from upstream vm_backtrace.c at commit 5f10bd634fb6ae8f74a4ea730176233b0ca96954 (March 2022, Ruby 3.2 trunk) -// Copyright (C) 1993-2012 Yukihiro Matsumoto -// to support our custom rb_profile_frame_method_name (see below) -// Modifications: None -static const rb_iseq_t * -frame2iseq(VALUE frame) -{ - if (NIL_P(frame)) return NULL; - - if (RB_TYPE_P(frame, T_IMEMO)) { - switch (imemo_type(frame)) { - case imemo_iseq: - return (const rb_iseq_t *)frame; - case imemo_ment: - { - const rb_callable_method_entry_t *cme = (rb_callable_method_entry_t *)frame; - switch (cme->def->type) { - case VM_METHOD_TYPE_ISEQ: - return cme->def->body.iseq.iseqptr; - default: - return NULL; - } - } - default: - break; - } - } - rb_bug("frame2iseq: unreachable"); -} - -// Taken from upstream vm_backtrace.c at commit 5f10bd634fb6ae8f74a4ea730176233b0ca96954 (March 2022, Ruby 3.2 trunk) -// Copyright (C) 1993-2012 Yukihiro Matsumoto -// to support our custom rb_profile_frame_method_name (see below) -// Modifications: None -static const rb_callable_method_entry_t * -cframe(VALUE frame) -{ - if (NIL_P(frame)) return NULL; - - if (RB_TYPE_P(frame, T_IMEMO)) { - switch (imemo_type(frame)) { - case imemo_ment: - { - const rb_callable_method_entry_t *cme = (rb_callable_method_entry_t *)frame; - switch (cme->def->type) { - case VM_METHOD_TYPE_CFUNC: - return cme; - default: - return NULL; - } - } - default: - return NULL; - } - } - - return NULL; -} - -// Taken from upstream vm_backtrace.c at commit 5f10bd634fb6ae8f74a4ea730176233b0ca96954 (March 2022, Ruby 3.2 trunk) -// Copyright (C) 1993-2012 Yukihiro Matsumoto -// -// Ruby 3.0 finally added support for showing CFUNC frames (frames for methods written using native code) -// in stack traces gathered via `rb_profile_frames` (https://github.com/ruby/ruby/pull/3299). -// To access this information on older Rubies, beyond using our custom `ddtrace_rb_profile_frames` above, we also need -// to backport the Ruby 3.0+ version of `rb_profile_frame_method_name`. -// -// Modifications: -// * Renamed rb_profile_frame_method_name => ddtrace_rb_profile_frame_method_name -VALUE -ddtrace_rb_profile_frame_method_name(VALUE frame) -{ - const rb_callable_method_entry_t *cme = cframe(frame); - if (cme) { - ID mid = cme->def->original_id; - return id2str(mid); - } - const rb_iseq_t *iseq = frame2iseq(frame); - return iseq ? rb_iseq_method_name(iseq) : Qnil; -} - -#endif // USE_BACKPORTED_RB_PROFILE_FRAME_METHOD_NAME - // Support code for older Rubies that cannot use the MJIT header #ifndef RUBY_MJIT_HEADER diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.h b/ext/datadog_profiling_native_extension/private_vm_api_access.h index 487ed84b69e..bd4fbfe92ba 100644 --- a/ext/datadog_profiling_native_extension/private_vm_api_access.h +++ b/ext/datadog_profiling_native_extension/private_vm_api_access.h @@ -18,6 +18,19 @@ typedef struct { rb_nativethread_id_t owner; } current_gvl_owner; +typedef struct frame_info { + union { + struct { + VALUE iseq; + int line; + } ruby_frame; + struct { + ID method_id; + } native_frame; + } as; + bool is_ruby_frame : 1; +} frame_info; + rb_nativethread_id_t pthread_id_for(VALUE thread); bool is_current_thread_holding_the_gvl(void); current_gvl_owner gvl_owner(void); @@ -27,20 +40,10 @@ void ddtrace_thread_list(VALUE result_array); bool is_thread_alive(VALUE thread); VALUE thread_name_for(VALUE thread); -int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, VALUE *buff, int *lines, bool* is_ruby_frame); +int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *stack_buffer); // Returns true if the current thread belongs to the main Ractor or if Ruby has no Ractor support bool ddtrace_rb_ractor_main_p(void); -// Ruby 3.0 finally added support for showing CFUNC frames (frames for methods written using native code) -// in stack traces gathered via `rb_profile_frames` (https://github.com/ruby/ruby/pull/3299). -// To access this information on older Rubies, beyond using our custom `ddtrace_rb_profile_frames` above, we also need -// to backport the Ruby 3.0+ version of `rb_profile_frame_method_name`. -#ifdef USE_BACKPORTED_RB_PROFILE_FRAME_METHOD_NAME - VALUE ddtrace_rb_profile_frame_method_name(VALUE frame); -#else // Ruby > 3.0, just use the stock functionality - #define ddtrace_rb_profile_frame_method_name rb_profile_frame_method_name -#endif - // See comment on `record_placeholder_stack_in_native_code` for a full explanation of what this means (and why we don't just return 0) #define PLACEHOLDER_STACK_IN_NATIVE_CODE -1 From 47044b0a50a0fb05ea5f94883d37c8e7843434f7 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 8 Aug 2024 14:47:19 +0100 Subject: [PATCH 11/17] Introduce caching for sampled stacks The intuition here is that in many apps, the same stack (or at least the bottom of the stack) won't change very often -- e.g. during an HTTP request we may take a bunch of samples, and all of them will have a common stack that's webserver + rails routing + endpoint doing work. While right now this mostly avoids us repeating calls to `calc_lineno`, (calculating the line number for Ruby frames) one of the most interesting changes in this PR is the new `same_frame` flag. This `same_frame` flag allows the caller to know when the stack has changed, and I'm hoping to later connect this to libdatadog improvements, e.g. if we store the ids for each stack entry in libdatadog, we can know if the stack changed or not, and thus if we can just re-feed the same ids to libdatadog or need to record new ones. --- .../collectors_stack.c | 8 ++++++ .../collectors_thread_context.c | 15 +++++++++-- .../private_vm_api_access.c | 25 +++++++++++++++++-- .../private_vm_api_access.h | 3 +++ 4 files changed, 47 insertions(+), 4 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_stack.c b/ext/datadog_profiling_native_extension/collectors_stack.c index 4bcaaa768ac..3e16036437f 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.c +++ b/ext/datadog_profiling_native_extension/collectors_stack.c @@ -161,6 +161,14 @@ void sample_thread( return; } + // if (captured_frames > 0) { + // int cache_hits = 0; + // for (int i = 0; i < captured_frames; i++) { + // if (buffer->stack_buffer[i].same_frame) cache_hits++; + // } + // fprintf(stderr, "Sampling cache hits: %f\n", ((double) cache_hits / captured_frames) * 100); + // } + // Ruby does not give us path and line number for methods implemented using native code. // The convention in Kernel#caller_locations is to instead use the path and line number of the first Ruby frame // on the stack that is below (e.g. directly or indirectly has called) the native method. diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index 8f10900f3a9..8a449dc7a28 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -188,6 +188,7 @@ void update_metrics_and_sample( VALUE thread_being_sampled, VALUE stack_from_thread, struct per_thread_context *thread_context, + sampling_buffer* sampling_buffer, long current_cpu_time_ns, long current_monotonic_wall_time_ns ); @@ -196,6 +197,7 @@ static void trigger_sample_for_thread( VALUE thread, VALUE stack_from_thread, struct per_thread_context *thread_context, + sampling_buffer* sampling_buffer, sample_values values, long current_monotonic_wall_time_ns, ddog_CharSlice *ruby_vm_type, @@ -476,6 +478,7 @@ void thread_context_collector_sample(VALUE self_instance, long current_monotonic /* thread_being_sampled: */ thread, /* stack_from_thread: */ thread, thread_context, + thread_context->sampling_buffer, current_cpu_time_ns, current_monotonic_wall_time_ns ); @@ -492,6 +495,8 @@ void thread_context_collector_sample(VALUE self_instance, long current_monotonic /* thread_being_sampled: */ current_thread, /* stack_from_thread: */ profiler_overhead_stack_thread, current_thread_context, + // Here we use the overhead thread's sampling buffer so as to not invalidate the cache in the buffer of the thread being sampled + get_or_create_context_for(profiler_overhead_stack_thread, state)->sampling_buffer, cpu_time_now_ns(current_thread_context), monotonic_wall_time_now_ns(RAISE_ON_FAILURE) ); @@ -502,6 +507,7 @@ void update_metrics_and_sample( VALUE thread_being_sampled, VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack struct per_thread_context *thread_context, + sampling_buffer* sampling_buffer, long current_cpu_time_ns, long current_monotonic_wall_time_ns ) { @@ -527,6 +533,7 @@ void update_metrics_and_sample( thread_being_sampled, stack_from_thread, thread_context, + sampling_buffer, (sample_values) {.cpu_time_ns = cpu_time_elapsed_ns, .cpu_or_wall_samples = 1, .wall_time_ns = wall_time_elapsed_ns}, current_monotonic_wall_time_ns, NULL, @@ -706,6 +713,7 @@ static void trigger_sample_for_thread( VALUE thread, VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack struct per_thread_context *thread_context, + sampling_buffer* sampling_buffer, sample_values values, long current_monotonic_wall_time_ns, // These two labels are only used for allocation profiling; @ivoanjo: may want to refactor this at some point? @@ -826,7 +834,7 @@ static void trigger_sample_for_thread( sample_thread( stack_from_thread, - thread_context->sampling_buffer, + sampling_buffer, state->recorder_instance, values, (sample_labels) {.labels = slice_labels, .state_label = state_label, .end_timestamp_ns = end_timestamp_ns} @@ -1307,11 +1315,14 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned in track_object(state->recorder_instance, new_object, sample_weight, optional_class_name); + struct per_thread_context *thread_context = get_or_create_context_for(current_thread, state); + trigger_sample_for_thread( state, /* thread: */ current_thread, /* stack_from_thread: */ current_thread, - get_or_create_context_for(current_thread, state), + thread_context, + thread_context->sampling_buffer, (sample_values) {.alloc_samples = sample_weight, .alloc_samples_unscaled = 1, .heap_sample = true}, INVALID_TIME, // For now we're not collecting timestamps for allocation events, as per profiling team internal discussions &ruby_vm_type, diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.c b/ext/datadog_profiling_native_extension/private_vm_api_access.c index 9a060cbaf62..15718934a18 100644 --- a/ext/datadog_profiling_native_extension/private_vm_api_access.c +++ b/ext/datadog_profiling_native_extension/private_vm_api_access.c @@ -392,6 +392,7 @@ calc_lineno(const rb_iseq_t *iseq, const VALUE *pc) // was called from. // * Imported fix from https://github.com/ruby/ruby/pull/7116 to avoid sampling threads that are still being created // * Imported fix from https://github.com/ruby/ruby/pull/8415 to avoid potential crash when using YJIT. +// * Add frame_flags.same_frame and logic to skip redoing work if the buffer already contains the same data we're collecting // * Skipped use of rb_callable_method_entry_t (cme) for Ruby frames as it doesn't impact us. // // What is rb_profile_frames? @@ -466,7 +467,7 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *st // See comment on `record_placeholder_stack_in_native_code` for a full explanation of what this means (and why we don't just return 0) if (end_cfp <= cfp) return PLACEHOLDER_STACK_IN_NATIVE_CODE; - for (i=0; iiseq && !cfp->pc) { // Fix: Do nothing -- this frame should not be used // @@ -479,6 +480,16 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *st continue; } + stack_buffer[i].same_frame = + stack_buffer[i].is_ruby_frame && + stack_buffer[i].as.ruby_frame.iseq == (VALUE) cfp->iseq && + stack_buffer[i].as.ruby_frame.caching_pc == cfp->pc; + + if (stack_buffer[i].same_frame) { // Nothing to do, buffer already contains this frame + i++; + continue; + } + // Upstream Ruby has code here to retrieve the rb_callable_method_entry_t (cme) and in some cases to use it // instead of the iseq. // In practice, they are usually the same; the difference is that when you have e.g. block, one gets you a @@ -499,6 +510,7 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *st // } // else { stack_buffer[i].as.ruby_frame.iseq = (VALUE)cfp->iseq; + stack_buffer[i].as.ruby_frame.caching_pc = (void *) cfp->pc; // } // The topmost frame may not have an updated PC because the JIT @@ -521,12 +533,21 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *st else { cme = rb_vm_frame_method_entry(cfp); if (cme && cme->def->type == VM_METHOD_TYPE_CFUNC) { + stack_buffer[i].same_frame = + !stack_buffer[i].is_ruby_frame && + stack_buffer[i].as.native_frame.caching_cme == (VALUE) cme; + + if (stack_buffer[i].same_frame) { // Nothing to do, buffer already contains this frame + i++; + continue; + } + + stack_buffer[i].as.native_frame.caching_cme = (VALUE)cme; stack_buffer[i].as.native_frame.method_id = cme->def->original_id; stack_buffer[i].is_ruby_frame = false; i++; } } - cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp); } return i; diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.h b/ext/datadog_profiling_native_extension/private_vm_api_access.h index bd4fbfe92ba..239465d040b 100644 --- a/ext/datadog_profiling_native_extension/private_vm_api_access.h +++ b/ext/datadog_profiling_native_extension/private_vm_api_access.h @@ -22,13 +22,16 @@ typedef struct frame_info { union { struct { VALUE iseq; + void *caching_pc; // For caching only int line; } ruby_frame; struct { + VALUE caching_cme; // For caching only ID method_id; } native_frame; } as; bool is_ruby_frame : 1; + bool same_frame : 1; } frame_info; rb_nativethread_id_t pthread_id_for(VALUE thread); From 360aba2a0992787a90aa19f6537253bef5d0674c Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 8 Aug 2024 15:24:54 +0100 Subject: [PATCH 12/17] Use cheaper `RSTRING_PTR` when converting string to char slice Looking at the implementation of `StringValuePtr`, it does a bunch of checks that we don't really need because we already have the `ENFORCE_TYPE` making sure we do have a string. So we can just use the string without further ado. --- ext/datadog_profiling_native_extension/datadog_ruby_common.h | 2 +- ext/libdatadog_api/datadog_ruby_common.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/datadog_profiling_native_extension/datadog_ruby_common.h b/ext/datadog_profiling_native_extension/datadog_ruby_common.h index 83fc1c821e4..655703a01a0 100644 --- a/ext/datadog_profiling_native_extension/datadog_ruby_common.h +++ b/ext/datadog_profiling_native_extension/datadog_ruby_common.h @@ -35,7 +35,7 @@ VALUE datadog_gem_version(void); inline static ddog_CharSlice char_slice_from_ruby_string(VALUE string) { ENFORCE_TYPE(string, T_STRING); - ddog_CharSlice char_slice = {.ptr = StringValuePtr(string), .len = RSTRING_LEN(string)}; + ddog_CharSlice char_slice = {.ptr = RSTRING_PTR(string), .len = RSTRING_LEN(string)}; return char_slice; } diff --git a/ext/libdatadog_api/datadog_ruby_common.h b/ext/libdatadog_api/datadog_ruby_common.h index 83fc1c821e4..655703a01a0 100644 --- a/ext/libdatadog_api/datadog_ruby_common.h +++ b/ext/libdatadog_api/datadog_ruby_common.h @@ -35,7 +35,7 @@ VALUE datadog_gem_version(void); inline static ddog_CharSlice char_slice_from_ruby_string(VALUE string) { ENFORCE_TYPE(string, T_STRING); - ddog_CharSlice char_slice = {.ptr = StringValuePtr(string), .len = RSTRING_LEN(string)}; + ddog_CharSlice char_slice = {.ptr = RSTRING_PTR(string), .len = RSTRING_LEN(string)}; return char_slice; } From be447a9c1211cfc2cc4de015bda665fda8c127b8 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 8 Aug 2024 16:12:17 +0100 Subject: [PATCH 13/17] Fix build/running with DDTRACE_DEBUG I don't often use this setting so it had bitrotten a bit :sweat_smile: * `sampler_readjust` no longer has the `readjustment_reason`. We can recompute for debugging if needed, but for now I just removed the reference * When building at `-O0` it looks like my version of gcc is refusing to inline `monotonic_wall_time_now_ns` which is a problem because there's no non-inline version of that function, and thus it breaks running the tests with > symbol lookup error: datadog_profiling_native_extension.3.1.4_x86_64-linux.so: undefined symbol: monotonic_wall_time_now_ns `-O1` is probably fine for development as well so let's go with it for now? --- .../collectors_discrete_dynamic_sampler.c | 4 +--- ext/libdatadog_api/extconf.rb | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c b/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c index 2db303e07f4..804c4844f95 100644 --- a/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c +++ b/ext/datadog_profiling_native_extension/collectors_discrete_dynamic_sampler.c @@ -269,9 +269,7 @@ void discrete_dynamic_sampler_readjust(discrete_dynamic_sampler *sampler, long n double num_this_windows_in_60s = 60 * 1e9 / this_window_time_ns; double real_total_sampling_time_in_60s = sampler->sampling_time_since_last_readjustment_ns * num_this_windows_in_60s / 1e9; - const char* readjustment_reason = should_readjust_based_on_time ? "time" : "samples"; - - fprintf(stderr, "[dds.%s] readjusting due to %s...\n", sampler->debug_name, readjustment_reason); + fprintf(stderr, "[dds.%s] readjusting...\n", sampler->debug_name); fprintf(stderr, "events_since_last_readjustment=%ld\n", sampler->events_since_last_readjustment); fprintf(stderr, "samples_since_last_readjustment=%ld\n", sampler->samples_since_last_readjustment); fprintf(stderr, "this_window_time=%ld\n", this_window_time_ns); diff --git a/ext/libdatadog_api/extconf.rb b/ext/libdatadog_api/extconf.rb index b519e47c3a2..60cc130ab76 100644 --- a/ext/libdatadog_api/extconf.rb +++ b/ext/libdatadog_api/extconf.rb @@ -67,7 +67,7 @@ def skip_building_extension!(reason) if ENV['DDTRACE_DEBUG'] == 'true' $defs << '-DDD_DEBUG' - CONFIG['optflags'] = '-O0' + CONFIG['optflags'] = '-O1' CONFIG['debugflags'] = '-ggdb3' end From c09aaae59443f33623c51fc6d1c18898bec733a8 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 8 Aug 2024 17:02:07 +0100 Subject: [PATCH 14/17] Directly use VM iseq query functions instead of going through rb_profile_ stuff Since we now always return an iseq from ddtrace_rb_profile_frames (it's in the name of the struct element even!), we can directly use the `rb_iseq_` functions. Looking at the `rb_profile_` functions what they do is check if they have an iseq or something else, and when they have an iseq, they call these functions. So this allows us to skip the extra tests because we know we have iseqs. (FYI an "iseq" is an "instruction sequence" -- it's the object that represents the YARV VM bytecode) --- .../collectors_stack.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_stack.c b/ext/datadog_profiling_native_extension/collectors_stack.c index 3e16036437f..b8c56bafb93 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.c +++ b/ext/datadog_profiling_native_extension/collectors_stack.c @@ -34,6 +34,11 @@ static void maybe_add_placeholder_frames_omitted(VALUE thread, sampling_buffer* static void record_placeholder_stack_in_native_code(VALUE recorder_instance, sample_values values, sample_labels labels); static void maybe_trim_template_random_ids(ddog_CharSlice *name_slice, ddog_CharSlice *filename_slice); +// These two functions are exposed as symbols by the VM but are not in any header. +// Their signatures actually take a `const rb_iseq_t *iseq` but it gets casted back and forth between VALUE. +extern VALUE rb_iseq_path(const VALUE); +extern VALUE rb_iseq_base_label(const VALUE); + void collectors_stack_init(VALUE profiling_module) { VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors"); VALUE collectors_stack_class = rb_define_class_under(collectors_module, "Stack", rb_cObject); @@ -191,8 +196,8 @@ void sample_thread( int line; if (buffer->stack_buffer[i].is_ruby_frame) { - name = rb_profile_frame_base_label(buffer->stack_buffer[i].as.ruby_frame.iseq); - filename = rb_profile_frame_path(buffer->stack_buffer[i].as.ruby_frame.iseq); + name = rb_iseq_base_label(buffer->stack_buffer[i].as.ruby_frame.iseq); + filename = rb_iseq_path(buffer->stack_buffer[i].as.ruby_frame.iseq); line = buffer->stack_buffer[i].as.ruby_frame.line; last_ruby_frame_filename = filename; From 2fc03d5ae5860d4e9a75ce3825fba95ed288a1a2 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Fri, 9 Aug 2024 11:56:51 +0100 Subject: [PATCH 15/17] Add workaround for spec that started flaking This was a really weird one. I don't like handwaving it away, but debugging any more of this would need me to go really deep into Ruby's GC to understand where exactly Ruby is deciding to keep this object alive and I'm not sure such a time investment is worth it for this bug, so I've put in a workaround for now. --- .../heap_recorder.c | 1 + spec/datadog/profiling/stack_recorder_spec.rb | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/ext/datadog_profiling_native_extension/heap_recorder.c b/ext/datadog_profiling_native_extension/heap_recorder.c index 9820f4c039a..ecba0e1cfe5 100644 --- a/ext/datadog_profiling_native_extension/heap_recorder.c +++ b/ext/datadog_profiling_native_extension/heap_recorder.c @@ -842,6 +842,7 @@ VALUE object_record_inspect(object_record *record) { if (!ruby_ref_from_id(LONG2NUM(record->obj_id), &ref)) { rb_str_catf(inspect, "object="); } else { + rb_str_catf(inspect, "value=%p ", (void *) ref); VALUE ruby_inspect = ruby_safe_inspect(ref); if (ruby_inspect != Qnil) { rb_str_catf(inspect, "object=%"PRIsVALUE, ruby_inspect); diff --git a/spec/datadog/profiling/stack_recorder_spec.rb b/spec/datadog/profiling/stack_recorder_spec.rb index 997a695b0c3..c2e3a8889a1 100644 --- a/spec/datadog/profiling/stack_recorder_spec.rb +++ b/spec/datadog/profiling/stack_recorder_spec.rb @@ -1027,6 +1027,17 @@ def sample_allocation(obj) end GC.start # All dead objects above will be GCed, all living strings will have age = 0 + # @ivoanjo: For some weird reason, the last object sampled in the "dead_heap_samples" does not always + # get collected the first time, leading to a flaky spec. + # I was able to reproduce it with `rspec spec/datadog/profiling --seed 48141 --fail-fast` and it's + # kinda bizarre since e.g. if you add one more `Object.new` it also stops flaking, so is it perhaps + # related to Ruby's conservative GC? + # I've bisected this and undoing 3d4b7fcf30b529b191ca737ae13629eb27b8ab63 also makes the flakiness + # go away, but again, that change doesn't seem to have anything to do with GC. + # As the weird behavior is transitory, e.g. it provably goes away on the next GC, I'll go with this + # workaround for now. + GC.start + begin # Allocate some extra objects in a block with GC disabled and ask for a serialization # to ensure these strings have age=0 by the time we try to serialize the profile From e6c912258c4e24e540ea23fea95545752126ecc1 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Mon, 12 Aug 2024 09:11:32 +0100 Subject: [PATCH 16/17] Clarify why code is being left commented out --- .../private_vm_api_access.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.c b/ext/datadog_profiling_native_extension/private_vm_api_access.c index 15718934a18..ec89ba5458a 100644 --- a/ext/datadog_profiling_native_extension/private_vm_api_access.c +++ b/ext/datadog_profiling_native_extension/private_vm_api_access.c @@ -490,6 +490,7 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *st continue; } + // dd-trace-rb NOTE: // Upstream Ruby has code here to retrieve the rb_callable_method_entry_t (cme) and in some cases to use it // instead of the iseq. // In practice, they are usually the same; the difference is that when you have e.g. block, one gets you a @@ -499,6 +500,9 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *st // between the rb_callable_method_entry_t and the iseq. Thus, to simplify a bit our logic and reduce a bit // the overhead, we always use the iseq here. // + // @ivoanjo: I've left the upstream Ruby code commented out below for reference, so it's more obvious that + // we're diverging, and we can easily compare and experiment with the upstream version in the future. + // // cme = rb_vm_frame_method_entry(cfp); // if (cme && cme->def->type == VM_METHOD_TYPE_ISEQ && From 607d5d5323059bc0b3712d87b47223a7ce50b359 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Tue, 13 Aug 2024 10:32:06 +0100 Subject: [PATCH 17/17] Pick up upstream fix for `start` parameter We haven't used the `start` parameter so far, but let's keep closer to upstream here. (We had already half-applied the change from upstream for another reason, so now we have the full fix). --- .../private_vm_api_access.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ext/datadog_profiling_native_extension/private_vm_api_access.c b/ext/datadog_profiling_native_extension/private_vm_api_access.c index ec89ba5458a..ca1e03b50c0 100644 --- a/ext/datadog_profiling_native_extension/private_vm_api_access.c +++ b/ext/datadog_profiling_native_extension/private_vm_api_access.c @@ -394,6 +394,7 @@ calc_lineno(const rb_iseq_t *iseq, const VALUE *pc) // * Imported fix from https://github.com/ruby/ruby/pull/8415 to avoid potential crash when using YJIT. // * Add frame_flags.same_frame and logic to skip redoing work if the buffer already contains the same data we're collecting // * Skipped use of rb_callable_method_entry_t (cme) for Ruby frames as it doesn't impact us. +// * Imported fix from https://github.com/ruby/ruby/pull/8280 to keep us closer to upstream // // What is rb_profile_frames? // `rb_profile_frames` is a Ruby VM debug API added for use by profilers for sampling the stack trace of a Ruby thread. @@ -537,6 +538,11 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *st else { cme = rb_vm_frame_method_entry(cfp); if (cme && cme->def->type == VM_METHOD_TYPE_CFUNC) { + if (start > 0) { + start--; + continue; + } + stack_buffer[i].same_frame = !stack_buffer[i].is_ruby_frame && stack_buffer[i].as.native_frame.caching_cme == (VALUE) cme;