[PROF-10128] Refactor and speed up profiler stack sampling #3837

Merged: 17 commits, Aug 13, 2024

Changes from 1 commit
8 changes: 8 additions & 0 deletions ext/datadog_profiling_native_extension/collectors_stack.c
@@ -161,6 +161,14 @@ void sample_thread(
return;
}
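
// The commented-out block below is debug-only instrumentation: when enabled,
// it prints what percentage of captured frames hit the same_frame cache.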

// if (captured_frames > 0) {
// int cache_hits = 0;
// for (int i = 0; i < captured_frames; i++) {
// if (buffer->stack_buffer[i].same_frame) cache_hits++;
// }
// fprintf(stderr, "Sampling cache hits: %f\n", ((double) cache_hits / captured_frames) * 100);
// }

// Ruby does not give us path and line number for methods implemented using native code.
// The convention in Kernel#caller_locations is to instead use the path and line number of the first Ruby frame
// on the stack that is below (e.g. directly or indirectly has called) the native method.
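// (Illustrative example, not from the original comment: a backtrace entry for the
// C-implemented Integer#times can be reported as "example.rb:3:in 'times'",
// borrowing path and line from the Ruby frame that called into it.)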
ext/datadog_profiling_native_extension/collectors_thread_context.c
@@ -188,6 +188,7 @@ void update_metrics_and_sample(
VALUE thread_being_sampled,
VALUE stack_from_thread,
struct per_thread_context *thread_context,
sampling_buffer* sampling_buffer,
long current_cpu_time_ns,
long current_monotonic_wall_time_ns
);
@@ -196,6 +197,7 @@ static void trigger_sample_for_thread(
VALUE thread,
VALUE stack_from_thread,
struct per_thread_context *thread_context,
sampling_buffer* sampling_buffer,
sample_values values,
long current_monotonic_wall_time_ns,
ddog_CharSlice *ruby_vm_type,
@@ -476,6 +478,7 @@ void thread_context_collector_sample(VALUE self_instance, long current_monotonic
/* thread_being_sampled: */ thread,
/* stack_from_thread: */ thread,
thread_context,
thread_context->sampling_buffer,
current_cpu_time_ns,
current_monotonic_wall_time_ns
);
@@ -492,6 +495,8 @@
/* thread_being_sampled: */ current_thread,
/* stack_from_thread: */ profiler_overhead_stack_thread,
current_thread_context,
// Here we use the overhead thread's sampling buffer so as to not invalidate the cache in the buffer of the thread being sampled
get_or_create_context_for(profiler_overhead_stack_thread, state)->sampling_buffer,
cpu_time_now_ns(current_thread_context),
monotonic_wall_time_now_ns(RAISE_ON_FAILURE)
);
@@ -502,6 +507,7 @@ void update_metrics_and_sample(
VALUE thread_being_sampled,
VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack
struct per_thread_context *thread_context,
sampling_buffer* sampling_buffer,
long current_cpu_time_ns,
long current_monotonic_wall_time_ns
) {
@@ -527,6 +533,7 @@
thread_being_sampled,
stack_from_thread,
thread_context,
sampling_buffer,
(sample_values) {.cpu_time_ns = cpu_time_elapsed_ns, .cpu_or_wall_samples = 1, .wall_time_ns = wall_time_elapsed_ns},
current_monotonic_wall_time_ns,
NULL,
@@ -706,6 +713,7 @@ static void trigger_sample_for_thread(
VALUE thread,
VALUE stack_from_thread, // This can be different when attributing profiler overhead using a different stack
struct per_thread_context *thread_context,
sampling_buffer* sampling_buffer,
sample_values values,
long current_monotonic_wall_time_ns,
// These two labels are only used for allocation profiling; @ivoanjo: may want to refactor this at some point?
@@ -826,7 +834,7 @@ static void trigger_sample_for_thread(

sample_thread(
stack_from_thread,
thread_context->sampling_buffer,
sampling_buffer,
state->recorder_instance,
values,
(sample_labels) {.labels = slice_labels, .state_label = state_label, .end_timestamp_ns = end_timestamp_ns}
@@ -1307,11 +1315,14 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned in

track_object(state->recorder_instance, new_object, sample_weight, optional_class_name);

struct per_thread_context *thread_context = get_or_create_context_for(current_thread, state);

trigger_sample_for_thread(
state,
/* thread: */ current_thread,
/* stack_from_thread: */ current_thread,
get_or_create_context_for(current_thread, state),
thread_context,
thread_context->sampling_buffer,
(sample_values) {.alloc_samples = sample_weight, .alloc_samples_unscaled = 1, .heap_sample = true},
INVALID_TIME, // For now we're not collecting timestamps for allocation events, as per profiling team internal discussions
&ruby_vm_type,
25 changes: 23 additions & 2 deletions ext/datadog_profiling_native_extension/private_vm_api_access.c
@@ -392,6 +392,7 @@ calc_lineno(const rb_iseq_t *iseq, const VALUE *pc)
// was called from.
// * Imported fix from https://github.com/ruby/ruby/pull/7116 to avoid sampling threads that are still being created
// * Imported fix from https://github.com/ruby/ruby/pull/8415 to avoid potential crash when using YJIT.
// * Add frame_info.same_frame and logic to skip redoing work if the buffer already contains the same data we're collecting
// * Skipped use of rb_callable_method_entry_t (cme) for Ruby frames as it doesn't impact us.
//
// What is rb_profile_frames?
@@ -466,7 +467,7 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *st
// See comment on `record_placeholder_stack_in_native_code` for a full explanation of what this means (and why we don't just return 0)
if (end_cfp <= cfp) return PLACEHOLDER_STACK_IN_NATIVE_CODE;

for (i=0; i<limit && cfp != end_cfp;) {
for (i=0; i<limit && cfp != end_cfp; cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp)) {
if (cfp->iseq && !cfp->pc) {
// Fix: Do nothing -- this frame should not be used
//
Expand All @@ -479,6 +480,16 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *st
continue;
}
@AlexJF (Contributor) commented on Aug 12, 2024:

So in the past, the same cfp would "eat" all of start, whereas now it'll skip just one? Not sure if this was ever something we relied on, but just highlighting this slight change in behaviour.

@ivoanjo (Member, PR author) replied:

This is a really great catch!

I completely glossed over it because a) we don't use start and b) I remembered some discussion somewhere mentioning it was actually broken.

It's actually fixed nowadays (https://bugs.ruby-lang.org/issues/14607 / ruby/ruby#8280). I had spotted that upstream had moved the cfp = ... into the for header as well, but I hadn't spotted that it was to fix start.

I did spot that upstream also added the start skip logic to the if (cme && cme->def->type == VM_METHOD_TYPE_CFUNC) branch below. I'll push a commit to add this one as well, so we keep closer to upstream in case we ever want to use start.
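
For reference, a minimal sketch of the behaviour change being discussed; the skip condition and loop bodies are simplified assumptions, not this PR's exact code:

// Before: `continue` jumps past the advance at the bottom of the loop body,
// so the very same cfp is revisited and decrements `start` again.
for (i = 0; i < limit && cfp != end_cfp;) {
    if (start > 0) { start--; continue; } // same cfp "eats" all of start
    /* ... record frame, i++ ... */
    cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp);
}

// After: the advance lives in the for header, so `continue` still moves to the
// previous control frame and `start` skips exactly one real frame per pass.
for (i = 0; i < limit && cfp != end_cfp; cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp)) {
    if (start > 0) { start--; continue; }
    /* ... record frame, i++ ... */
}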


stack_buffer[i].same_frame =
stack_buffer[i].is_ruby_frame &&
stack_buffer[i].as.ruby_frame.iseq == (VALUE) cfp->iseq &&
stack_buffer[i].as.ruby_frame.caching_pc == cfp->pc;

if (stack_buffer[i].same_frame) { // Nothing to do, buffer already contains this frame
i++;
continue;
}

// Upstream Ruby has code here to retrieve the rb_callable_method_entry_t (cme) and in some cases to use it
// instead of the iseq.
// In practice, they are usually the same; the difference is that when you have e.g. block, one gets you a
@@ -499,6 +510,7 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *st
// }
// else {
stack_buffer[i].as.ruby_frame.iseq = (VALUE)cfp->iseq;
stack_buffer[i].as.ruby_frame.caching_pc = (void *) cfp->pc;
// }
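// Note: caching_pc is stored purely so that the next sample can cheaply detect
// an unchanged frame via the same_frame comparison above.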

// The topmost frame may not have an updated PC because the JIT
@@ -521,12 +533,21 @@ int ddtrace_rb_profile_frames(VALUE thread, int start, int limit, frame_info *st
else {
cme = rb_vm_frame_method_entry(cfp);
if (cme && cme->def->type == VM_METHOD_TYPE_CFUNC) {
stack_buffer[i].same_frame =
!stack_buffer[i].is_ruby_frame &&
stack_buffer[i].as.native_frame.caching_cme == (VALUE) cme;

if (stack_buffer[i].same_frame) { // Nothing to do, buffer already contains this frame
i++;
continue;
}

stack_buffer[i].as.native_frame.caching_cme = (VALUE)cme;
stack_buffer[i].as.native_frame.method_id = cme->def->original_id;
stack_buffer[i].is_ruby_frame = false;
i++;
}
}
cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp);
}

return i;
ext/datadog_profiling_native_extension/private_vm_api_access.h
@@ -22,13 +22,16 @@ typedef struct frame_info {
union {
struct {
VALUE iseq;
void *caching_pc; // For caching only
int line;
} ruby_frame;
struct {
VALUE caching_cme; // For caching only
ID method_id;
} native_frame;
} as;
A contributor commented:

Such fluent! Much API! Wow!

The PR author replied:

Something something great artists copy.

bool is_ruby_frame : 1;
bool same_frame : 1;
} frame_info;

rb_nativethread_id_t pthread_id_for(VALUE thread);
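
As an illustration of what the caching fields enable, a hypothetical consumer loop; resolve_ruby_frame and resolve_native_frame are made-up helpers standing in for whatever name/path/line resolution the sampler performs:

for (int i = 0; i < captured_frames; i++) {
    if (stack_buffer[i].same_frame) continue; // reuse previously resolved data for this slot
    if (stack_buffer[i].is_ruby_frame) {
        resolve_ruby_frame(stack_buffer[i].as.ruby_frame.iseq, stack_buffer[i].as.ruby_frame.line);
    } else {
        resolve_native_frame(stack_buffer[i].as.native_frame.method_id);
    }
}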