From 9310ee74ff0319513f5006f5c2c5d2d6820c8c05 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Mon, 16 Dec 2024 13:11:22 -0500 Subject: [PATCH] DEBUG-3210 DI: change logging to be appropriate for customer inspection Failure to enable DI is now a warning (since enabling it must have been requested by the customer). Remaining logging calls are now emitted at debug level by default, making those entries not visible in customer logs. An internal DI setting is added to reinstate logging of internal issues at warn level, for developing/debugging DI. --- lib/datadog/di/code_tracker.rb | 2 +- lib/datadog/di/component.rb | 10 +++++--- lib/datadog/di/configuration/settings.rb | 9 +++++++ lib/datadog/di/instrumenter.rb | 11 +++++---- lib/datadog/di/logging.rb | 30 ++++++++++++++++++++++++ lib/datadog/di/probe_manager.rb | 11 +++++---- lib/datadog/di/probe_notifier_worker.rb | 11 +++++---- lib/datadog/di/remote.rb | 8 +++---- spec/datadog/di/instrumenter_spec.rb | 1 + spec/datadog/di/probe_manager_spec.rb | 1 + spec/datadog/di/remote_spec.rb | 25 +++++++------------- 11 files changed, 83 insertions(+), 36 deletions(-) create mode 100644 lib/datadog/di/logging.rb diff --git a/lib/datadog/di/code_tracker.rb b/lib/datadog/di/code_tracker.rb index 5ace92a17d2..010f04c7368 100644 --- a/lib/datadog/di/code_tracker.rb +++ b/lib/datadog/di/code_tracker.rb @@ -91,7 +91,7 @@ def start # before release. if component = DI.current_component # steep:ignore raise if component.settings.dynamic_instrumentation.internal.propagate_all_exceptions - component.logger.warn("Unhandled exception in script_compiled trace point: #{exc.class}: #{exc}") + component.log_warn_internal("Unhandled exception in script_compiled trace point: #{exc.class}: #{exc}") component.telemetry&.report(exc, description: "Unhandled exception in script_compiled trace point") # TODO test this path else diff --git a/lib/datadog/di/component.rb b/lib/datadog/di/component.rb index c109bd4082e..715f60e39dd 100644 --- a/lib/datadog/di/component.rb +++ b/lib/datadog/di/component.rb @@ -1,5 +1,7 @@ # frozen_string_literal: true +require_relative 'logging' + module Datadog module DI # Component for dynamic instrumentation. @@ -13,12 +15,14 @@ module DI # intalled tracepoints and so on. Component will clean up all # resources and installed tracepoints upon shutdown. class Component + include Logging + class << self def build(settings, agent_settings, telemetry: nil) return unless settings.respond_to?(:dynamic_instrumentation) && settings.dynamic_instrumentation.enabled unless settings.respond_to?(:remote) && settings.remote.enabled - Datadog.logger.debug("Dynamic Instrumentation could not be enabled because Remote Configuration Management is not available. To enable Remote Configuration, see https://docs.datadoghq.com/agent/remote_config") + Datadog.logger.warn("Dynamic Instrumentation could not be enabled because Remote Configuration Management is not available. To enable Remote Configuration, see https://docs.datadoghq.com/agent/remote_config") return end @@ -53,12 +57,12 @@ def environment_supported?(settings) # TODO add tests? unless settings.dynamic_instrumentation.internal.development if Datadog::Core::Environment::Execution.development? - Datadog.logger.debug("Not enabling dynamic instrumentation because we are in development environment") + Datadog.logger.warn("Not enabling dynamic instrumentation because we are in development environment") return false end end if RUBY_ENGINE != 'ruby' || RUBY_VERSION < '2.6' - Datadog.logger.debug("Not enabling dynamic instrumentation because of unsupported Ruby version") + Datadog.logger.warn("Not enabling dynamic instrumentation because of unsupported Ruby version") return false end true diff --git a/lib/datadog/di/configuration/settings.rb b/lib/datadog/di/configuration/settings.rb index 55413613494..1a12e518f45 100644 --- a/lib/datadog/di/configuration/settings.rb +++ b/lib/datadog/di/configuration/settings.rb @@ -157,6 +157,15 @@ def self.add_settings!(base) o.default false end + # Increases logging verbosity level from 'debug' to 'warn' + # for internal DI diagnostics, i.e. log messages that + # should not be written to customers' logs in production but + # that are useful when developing DI. + option :verbose_logging do |o| + o.type :bool + o.default false + end + # Minimum interval, in seconds, between probe status and # snapshot submissions to the agent. Probe notifier worker will # batch together payloads submitted during each interval. diff --git a/lib/datadog/di/instrumenter.rb b/lib/datadog/di/instrumenter.rb index b1f8f0f599a..77f382fde6e 100644 --- a/lib/datadog/di/instrumenter.rb +++ b/lib/datadog/di/instrumenter.rb @@ -3,6 +3,7 @@ # rubocop:disable Lint/AssignmentInCondition require 'benchmark' +require_relative 'logging' module Datadog module DI @@ -54,6 +55,8 @@ module DI # # @api private class Instrumenter + include Logging + def initialize(settings, serializer, logger, code_tracker: nil, telemetry: nil) @settings = settings @serializer = serializer @@ -305,13 +308,13 @@ def hook_line(probe, &block) end rescue => exc raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions - logger.warn("Unhandled exception in line trace point: #{exc.class}: #{exc}") + log_warn_internal("Unhandled exception in line trace point: #{exc.class}: #{exc}") telemetry&.report(exc, description: "Unhandled exception in line trace point") # TODO test this path end rescue => exc raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions - logger.warn("Unhandled exception in line trace point: #{exc.class}: #{exc}") + log_warn_internal("Unhandled exception in line trace point: #{exc.class}: #{exc}") telemetry&.report(exc, description: "Unhandled exception in line trace point") # TODO test this path end @@ -357,7 +360,7 @@ def hook(probe, &block) hook_line(probe, &block) else # TODO add test coverage for this path - logger.warn("Unknown probe type to hook: #{probe}") + log_warn_internal("Unknown probe type to hook: #{probe}") end end @@ -368,7 +371,7 @@ def unhook(probe) unhook_line(probe) else # TODO add test coverage for this path - logger.warn("Unknown probe type to unhook: #{probe}") + log_warn_internal("Unknown probe type to unhook: #{probe}") end end diff --git a/lib/datadog/di/logging.rb b/lib/datadog/di/logging.rb new file mode 100644 index 00000000000..24a26f53fae --- /dev/null +++ b/lib/datadog/di/logging.rb @@ -0,0 +1,30 @@ +# frozen_string_literal: true + +module Datadog + module DI + # Logging for DI + module Logging + # Logs an internal warning message. + # When verbose logging is enabled, the logging happens at warn level. + # When verbose logging is disabled, the logging happens at debug level + # (which is how the rest of the library is reporting its internal + # warnings/errors). + def log_warn_internal(msg) + if settings.dynamic_instrumentation.internal.verbose_logging + logger.warn(msg) + else + logger.debug(msg) + end + end + + # Logs an internal informational message. + # When verbose logging is enabled, the logging happens at info level. + # When verbose logging is disabled, nothing is logged. + def log_info_internal(msg) + if settings.dynamic_instrumentation.internal.verbose_logging + logger.info(msg) + end + end + end + end +end diff --git a/lib/datadog/di/probe_manager.rb b/lib/datadog/di/probe_manager.rb index 9dd74d85b5e..97ed41e5540 100644 --- a/lib/datadog/di/probe_manager.rb +++ b/lib/datadog/di/probe_manager.rb @@ -3,6 +3,7 @@ # rubocop:disable Lint/AssignmentInCondition require 'monitor' +require_relative 'logging' module Datadog module DI @@ -15,6 +16,8 @@ module DI # # @api private class ProbeManager + include Logging + def initialize(settings, instrumenter, probe_notification_builder, probe_notifier_worker, logger, telemetry: nil) @settings = settings @@ -32,7 +35,7 @@ def initialize(settings, instrumenter, probe_notification_builder, install_pending_method_probes(tp.self) rescue => exc raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions - logger.warn("Unhandled exception in definition trace point: #{exc.class}: #{exc}") + log_warn_internal("Unhandled exception in definition trace point: #{exc.class}: #{exc}") telemetry&.report(exc, description: "Unhandled exception in definition trace point") # TODO test this path end @@ -120,7 +123,7 @@ def add_probe(probe) # In "propagate all exceptions" mode we will try to instrument again. raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions - logger.warn("Error processing probe configuration: #{exc.class}: #{exc}") + log_warn_internal("Error processing probe configuration: #{exc.class}: #{exc}") telemetry&.report(exc, description: "Error processing probe configuration") # TODO report probe as failed to agent since we won't attempt to # install it again. @@ -160,7 +163,7 @@ def remove_other_probes(probe_ids) raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions # Silence all exceptions? # TODO should we propagate here and rescue upstream? - logger.warn("Error removing probe #{probe.id}: #{exc.class}: #{exc}") + log_warn_internal("Error removing probe #{probe.id}: #{exc.class}: #{exc}") telemetry&.report(exc, description: "Error removing probe") end end @@ -190,7 +193,7 @@ def remove_other_probes(probe_ids) rescue => exc raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions - logger.warn("Error installing probe after class is defined: #{exc.class}: #{exc}") + log_warn_internal("Error installing probe after class is defined: #{exc.class}: #{exc}") telemetry&.report(exc, description: "Error installing probe after class is defined") end end diff --git a/lib/datadog/di/probe_notifier_worker.rb b/lib/datadog/di/probe_notifier_worker.rb index 6ce2ccdf340..c999afce666 100644 --- a/lib/datadog/di/probe_notifier_worker.rb +++ b/lib/datadog/di/probe_notifier_worker.rb @@ -1,6 +1,7 @@ # frozen_string_literal: true require_relative '../core/semaphore' +require_relative 'logging' module Datadog module DI @@ -23,6 +24,8 @@ module DI # # @api private class ProbeNotifierWorker + include Logging + def initialize(settings, transport, logger, telemetry: nil) @settings = settings @telemetry = telemetry @@ -77,7 +80,7 @@ def start rescue => exc raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions - logger.warn("Error in probe notifier worker: #{exc.class}: #{exc} (at #{exc.backtrace.first})") + log_warn_internal("Error in probe notifier worker: #{exc.class}: #{exc} (at #{exc.backtrace.first})") telemetry&.report(exc, description: "Error in probe notifier worker") end @lock.synchronize do @@ -185,7 +188,7 @@ def io_in_progress? queue = send("#{event_type}_queue") # TODO determine a suitable limit via testing/benchmarking if queue.length > 100 - logger.warn("#{self.class.name}: dropping #{event_type} because queue is full") + log_warn_internal("#{self.class.name}: dropping #{event_type} because queue is full") else queue << event end @@ -242,7 +245,7 @@ def set_sleep_remaining end rescue => exc raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions - logger.warn("failed to send #{event_name}: #{exc.class}: #{exc} (at #{exc.backtrace.first})") + log_warn_internal("failed to send #{event_name}: #{exc.class}: #{exc} (at #{exc.backtrace.first})") # Should we report this error to telemetry? Most likely failure # to send is due to a network issue, and trying to send a # telemetry message would also fail. @@ -253,7 +256,7 @@ def set_sleep_remaining # Normally the queue should only be consumed in this method, # however if anyone consumes it elsewhere we don't want to block # while consuming it here. Rescue ThreadError and return. - logger.warn("Unexpected #{event_name} queue underflow - consumed elsewhere?") + log_warn_internal("Unexpected #{event_name} queue underflow - consumed elsewhere?") telemetry&.report(exc, description: "Unexpected #{event_name} queue underflow") ensure @lock.synchronize do diff --git a/lib/datadog/di/remote.rb b/lib/datadog/di/remote.rb index e0a5f3b90ed..eb76abe30e7 100644 --- a/lib/datadog/di/remote.rb +++ b/lib/datadog/di/remote.rb @@ -51,7 +51,7 @@ def receivers(telemetry) probe = ProbeBuilder.build_from_remote_config(probe_spec) payload = component.probe_notification_builder.build_received(probe) component.probe_notifier_worker.add_status(payload) - component.logger.info("Received probe from RC: #{probe.type} #{probe.location}") + component.log_info_internal("Received probe from RC: #{probe.type} #{probe.location}") begin # TODO test exception capture @@ -60,7 +60,7 @@ def receivers(telemetry) rescue => exc raise if component.settings.dynamic_instrumentation.internal.propagate_all_exceptions - component.logger.warn("Unhandled exception adding probe in DI remote receiver: #{exc.class}: #{exc}") + component.log_warn_internal("Unhandled exception adding probe in DI remote receiver: #{exc.class}: #{exc}") component.telemetry&.report(exc, description: "Unhandled exception adding probe in DI remote receiver") # If a probe fails to install, we will mark the content @@ -81,7 +81,7 @@ def receivers(telemetry) rescue => exc raise if component.settings.dynamic_instrumentation.internal.propagate_all_exceptions - component.logger.warn("Unhandled exception handling probe in DI remote receiver: #{exc.class}: #{exc}") + component.log_warn_internal("Unhandled exception handling probe in DI remote receiver: #{exc.class}: #{exc}") component.telemetry&.report(exc, description: "Unhandled exception handling probe in DI remote receiver") content.errored("Error applying dynamic instrumentation configuration: #{exc.class.name} #{exc.message}: #{Array(exc.backtrace).join("\n")}") @@ -95,7 +95,7 @@ def receivers(telemetry) rescue => exc raise if component.settings.dynamic_instrumentation.internal.propagate_all_exceptions - component.logger.warn("Unhandled exception removing probes in DI remote receiver: #{exc.class}: #{exc}") + component.log_warn_internal("Unhandled exception removing probes in DI remote receiver: #{exc.class}: #{exc}") component.telemetry&.report(exc, description: "Unhandled exception removing probes in DI remote receiver") end end diff --git a/spec/datadog/di/instrumenter_spec.rb b/spec/datadog/di/instrumenter_spec.rb index 695d8567386..276876dec66 100644 --- a/spec/datadog/di/instrumenter_spec.rb +++ b/spec/datadog/di/instrumenter_spec.rb @@ -11,6 +11,7 @@ mock_settings_for_di do |settings| allow(settings.dynamic_instrumentation).to receive(:enabled).and_return(true) allow(settings.dynamic_instrumentation.internal).to receive(:untargeted_trace_points).and_return(false) + allow(settings.dynamic_instrumentation.internal).to receive(:verbose_logging).and_return(true) allow(settings.dynamic_instrumentation).to receive(:max_capture_depth).and_return(2) allow(settings.dynamic_instrumentation).to receive(:max_capture_attribute_count).and_return(2) allow(settings.dynamic_instrumentation).to receive(:max_capture_string_length).and_return(100) diff --git a/spec/datadog/di/probe_manager_spec.rb b/spec/datadog/di/probe_manager_spec.rb index f32cb71f577..446a5ef9df2 100644 --- a/spec/datadog/di/probe_manager_spec.rb +++ b/spec/datadog/di/probe_manager_spec.rb @@ -9,6 +9,7 @@ class ProbeManagerSpecTestClass; end mock_settings_for_di do |settings| allow(settings.dynamic_instrumentation).to receive(:enabled).and_return(true) allow(settings.dynamic_instrumentation.internal).to receive(:propagate_all_exceptions).and_return(false) + allow(settings.dynamic_instrumentation.internal).to receive(:verbose_logging).and_return(true) end let(:instrumenter) do diff --git a/spec/datadog/di/remote_spec.rb b/spec/datadog/di/remote_spec.rb index f3a7d386b9e..4a6056d4989 100644 --- a/spec/datadog/di/remote_spec.rb +++ b/spec/datadog/di/remote_spec.rb @@ -92,6 +92,7 @@ mock_settings_for_di do |settings| allow(settings.dynamic_instrumentation).to receive(:enabled).and_return(true) allow(settings.dynamic_instrumentation.internal).to receive(:propagate_all_exceptions).and_return(false) + allow(settings.dynamic_instrumentation.internal).to receive(:verbose_logging).and_return(true) end let(:serializer) do @@ -144,8 +145,7 @@ end it 'calls probe manager to add a probe' do - expect(component).to receive(:logger).and_return(logger) - expect(logger).to receive(:info) do |message| + expect(component).to receive(:log_info_internal) do |message| expect(message).to match(/Received probe/) end @@ -162,15 +162,13 @@ context 'probe addition raises an exception' do it 'logs warning and consumes the exception' do expect(component).to receive(:telemetry).and_return(telemetry) - expect(component).to receive(:logger).and_return(logger) - expect(logger).to receive(:info) do |message| + expect(component).to receive(:log_info_internal) do |message| expect(message).to match(/Received probe/) end - expect(logger).to receive(:warn) do |msg| + expect(component).to receive(:log_warn_internal) do |msg| expect(msg).to match(/Unhandled exception.*Runtime error from test/) end - expect(component).to receive(:logger).and_return(logger) expect(telemetry).to receive(:report) expect(probe_manager).to receive(:add_probe).and_raise("Runtime error from test") @@ -186,17 +184,15 @@ it 'calls probe manager to remove stale probes' do allow(component).to receive(:telemetry) - expect(component).to receive(:logger).and_return(logger) - expect(logger).to receive(:info) do |message| + expect(component).to receive(:log_info_internal) do |message| expect(message).to match(/Received probe/) end - expect(logger).to receive(:warn) do |msg| + expect(component).to receive(:log_warn_internal) do |msg| expect(msg).to match(/Unhandled exception.*Runtime error from test/) end allow(probe_manager).to receive(:add_probe).and_raise("Runtime error from test") - expect(component).to receive(:logger).and_return(logger) expect(component).to receive(:probe_notification_builder).and_return(probe_notification_builder) expect(probe_notification_builder).to receive(:build_received) expect(component).to receive(:probe_notifier_worker).and_return(probe_notifier_worker) @@ -209,27 +205,24 @@ context 'probe removal raises an exception' do it 'logs warning and consumes the exception' do expect(component).to receive(:telemetry).and_return(telemetry).at_least(:once) - expect(component).to receive(:logger).and_return(logger) - expect(logger).to receive(:info) do |message| + expect(component).to receive(:log_info_internal) do |message| expect(message).to match(/Received probe/) end - expect(logger).to receive(:warn) do |msg| + expect(component).to receive(:log_warn_internal) do |msg| expect(msg).to match(/Unhandled exception.*Runtime error 1 from test/) end expect(telemetry).to receive(:report) allow(probe_manager).to receive(:add_probe).and_raise("Runtime error 1 from test") - expect(component).to receive(:logger).and_return(logger) expect(component).to receive(:probe_notification_builder).and_return(probe_notification_builder) expect(probe_notification_builder).to receive(:build_received) expect(component).to receive(:probe_notifier_worker).and_return(probe_notifier_worker) expect(probe_notifier_worker).to receive(:add_status) - expect(logger).to receive(:warn) do |msg| + expect(component).to receive(:log_warn_internal) do |msg| expect(msg).to match(/Unhandled exception.*Runtime error 2 from test/) end - expect(component).to receive(:logger).and_return(logger) expect(telemetry).to receive(:report) expect(probe_manager).to receive(:remove_other_probes).with(['11']).and_raise("Runtime error 2 from test")