diff --git a/Cargo.lock b/Cargo.lock index 73bda20..23939a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1416,7 +1416,7 @@ dependencies = [ [[package]] name = "lightswitch-object" -version = "0.1.0" +version = "0.1.1" dependencies = [ "anyhow", "data-encoding", diff --git a/lightswitch-object/Cargo.toml b/lightswitch-object/Cargo.toml index 67c938d..3ce2673 100644 --- a/lightswitch-object/Cargo.toml +++ b/lightswitch-object/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lightswitch-object" -version = "0.1.0" +version = "0.1.1" edition = "2021" description = "Deals with object files" license = "MIT" diff --git a/lightswitch-object/src/object.rs b/lightswitch-object/src/object.rs index 0be6d61..4146762 100644 --- a/lightswitch-object/src/object.rs +++ b/lightswitch-object/src/object.rs @@ -94,6 +94,11 @@ impl ObjectFile { Ok(BuildId::sha256_from_digest(&self.code_hash)) } + /// Returns whether the object has debug symbols. + pub fn has_debug_info(&self) -> bool { + self.object.has_debug_symbols() + } + pub fn is_dynamic(&self) -> bool { self.object.kind() == ObjectKind::Dynamic } diff --git a/src/cli/args.rs b/src/cli/args.rs index a3af4de..0980f04 100644 --- a/src/cli/args.rs +++ b/src/cli/args.rs @@ -42,6 +42,14 @@ pub(crate) enum Symbolizer { None, } +#[derive(PartialEq, clap::ValueEnum, Debug, Clone, Default)] +pub(crate) enum DebugInfoBackend { + #[default] + None, + Copy, + Remote, +} + #[derive(Parser, Debug)] pub(crate) struct CliArgs { /// Specific PIDs to profile @@ -126,4 +134,6 @@ pub(crate) struct CliArgs { pub(crate) exclude_self: bool, #[arg(long, default_value_t, value_enum)] pub(crate) symbolizer: Symbolizer, + #[arg(long, default_value_t, value_enum)] + pub(crate) debug_info_backend: DebugInfoBackend, } diff --git a/src/cli/main.rs b/src/cli/main.rs index a196d62..b58082b 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -6,11 +6,13 @@ use std::io::Write; use std::panic; use std::path::PathBuf; use std::sync::{Arc, Mutex}; +use std::time::Duration; use clap::Parser; use crossbeam_channel::bounded; use inferno::flamegraph; use lightswitch::collector::{AggregatorCollector, Collector, NullCollector, StreamingCollector}; +use lightswitch::debug_info::DebugInfoManager; use lightswitch_metadata::metadata_provider::GlobalMetadataProvider; use nix::unistd::Uid; use prost::Message; @@ -21,6 +23,9 @@ use tracing_subscriber::FmtSubscriber; use lightswitch_capabilities::system_info::SystemInfo; use lightswitch_metadata::metadata_provider::ThreadSafeGlobalMetadataProvider; +use lightswitch::debug_info::{ + DebugInfoBackendFilesystem, DebugInfoBackendNull, DebugInfoBackendRemote, +}; use lightswitch::profile::symbolize_profile; use lightswitch::profile::{fold_profile, to_pprof}; use lightswitch::profiler::{Profiler, ProfilerConfig}; @@ -32,12 +37,13 @@ mod args; mod validators; use crate::args::CliArgs; +use crate::args::DebugInfoBackend; use crate::args::LoggingLevel; use crate::args::ProfileFormat; use crate::args::ProfileSender; use crate::args::Symbolizer; -const DEFAULT_PPROF_INGEST_URL: &str = "http://localhost:4567/pprof/new"; +const DEFAULT_SERVER_URL: &str = "http://localhost:4567"; /// Exit the main thread if any thread panics. We prefer this behaviour because pretty much every /// thread is load bearing for the correct functioning. @@ -98,24 +104,34 @@ fn main() -> Result<(), Box> { } } + let server_url = args.server_url.unwrap_or(DEFAULT_SERVER_URL.into()); + let metadata_provider: ThreadSafeGlobalMetadataProvider = Arc::new(Mutex::new(GlobalMetadataProvider::default())); - let collector = Arc::new(Mutex::new(match args.sender { - ProfileSender::None => Box::new(NullCollector::new()) as Box, - ProfileSender::LocalDisk => { - Box::new(AggregatorCollector::new()) as Box - } - ProfileSender::Remote => Box::new(StreamingCollector::new( - args.symbolizer == Symbolizer::Local, - args.server_url - .as_ref() - .map_or(DEFAULT_PPROF_INGEST_URL, |v| v), - ProfilerConfig::default().session_duration, - args.sample_freq, - metadata_provider.clone(), - )) as Box, - })); + let collector: Arc>> = + Arc::new(Mutex::new(match args.sender { + ProfileSender::None => Box::new(NullCollector::new()), + ProfileSender::LocalDisk => Box::new(AggregatorCollector::new()), + ProfileSender::Remote => Box::new(StreamingCollector::new( + args.symbolizer == Symbolizer::Local, + &server_url, + ProfilerConfig::default().session_duration, + args.sample_freq, + metadata_provider.clone(), + )), + })); + + let debug_info_manager: Box = match args.debug_info_backend { + DebugInfoBackend::None => Box::new(DebugInfoBackendNull {}), + DebugInfoBackend::Copy => Box::new(DebugInfoBackendFilesystem { + path: PathBuf::from("/tmp"), + }), + DebugInfoBackend::Remote => Box::new(DebugInfoBackendRemote { + http_client_timeout: Duration::from_millis(500), + server_url, + }), + }; let profiler_config = ProfilerConfig { libbpf_debug: args.libbpf_debug, @@ -128,6 +144,7 @@ fn main() -> Result<(), Box> { mapsize_aggregated_stacks: args.mapsize_aggregated_stacks, mapsize_rate_limits: args.mapsize_rate_limits, exclude_self: args.exclude_self, + debug_info_manager, ..Default::default() }; @@ -260,7 +277,7 @@ mod tests { cmd.arg("--help"); cmd.assert().success(); let actual = String::from_utf8(cmd.unwrap().stdout).unwrap(); - insta::assert_yaml_snapshot!(actual, @r#""Usage: lightswitch [OPTIONS]\n\nOptions:\n --pids \n Specific PIDs to profile\n\n --tids \n Specific TIDs to profile (these can be outside the PIDs selected above)\n\n --show-unwind-info \n Show unwind info for given binary\n\n --show-info \n Show build ID for given binary\n\n -D, --duration \n How long this agent will run in seconds\n \n [default: 18446744073709551615]\n\n --libbpf-debug\n Enable libbpf logs. This includes the BPF verifier output\n\n --bpf-logging\n Enable BPF programs logging\n\n --logging \n Set lightswitch's logging level\n \n [default: info]\n [possible values: trace, debug, info, warn, error]\n\n --sample-freq \n Per-CPU Sampling Frequency in Hz\n \n [default: 19]\n\n --profile-format \n Output file for Flame Graph in SVG format\n \n [default: flame-graph]\n [possible values: none, flame-graph, pprof]\n\n --profile-path \n Path for the generated profile\n\n --profile-name \n Name for the generated profile\n\n --sender \n Where to write the profile\n \n [default: local-disk]\n\n Possible values:\n - none: Discard the profile. Used for kernel tests\n - local-disk\n - remote\n\n --server-url \n \n\n --perf-buffer-bytes \n Size of each profiler perf buffer, in bytes (must be a power of 2)\n \n [default: 524288]\n\n --mapsize-info\n Print eBPF map sizes after creation\n\n --mapsize-stacks \n max number of individual stacks to capture before aggregation\n \n [default: 100000]\n\n --mapsize-aggregated-stacks \n max number of unique stacks after aggregation\n \n [default: 10000]\n\n --mapsize-rate-limits \n max number of rate limit entries\n \n [default: 5000]\n\n --exclude-self\n Do not profile the profiler (myself)\n\n --symbolizer \n [default: local]\n [possible values: local, none]\n\n -h, --help\n Print help (see a summary with '-h')\n""#); + insta::assert_yaml_snapshot!(actual, @r#""Usage: lightswitch [OPTIONS]\n\nOptions:\n --pids \n Specific PIDs to profile\n\n --tids \n Specific TIDs to profile (these can be outside the PIDs selected above)\n\n --show-unwind-info \n Show unwind info for given binary\n\n --show-info \n Show build ID for given binary\n\n -D, --duration \n How long this agent will run in seconds\n \n [default: 18446744073709551615]\n\n --libbpf-debug\n Enable libbpf logs. This includes the BPF verifier output\n\n --bpf-logging\n Enable BPF programs logging\n\n --logging \n Set lightswitch's logging level\n \n [default: info]\n [possible values: trace, debug, info, warn, error]\n\n --sample-freq \n Per-CPU Sampling Frequency in Hz\n \n [default: 19]\n\n --profile-format \n Output file for Flame Graph in SVG format\n \n [default: flame-graph]\n [possible values: none, flame-graph, pprof]\n\n --profile-path \n Path for the generated profile\n\n --profile-name \n Name for the generated profile\n\n --sender \n Where to write the profile\n \n [default: local-disk]\n\n Possible values:\n - none: Discard the profile. Used for kernel tests\n - local-disk\n - remote\n\n --server-url \n \n\n --perf-buffer-bytes \n Size of each profiler perf buffer, in bytes (must be a power of 2)\n \n [default: 524288]\n\n --mapsize-info\n Print eBPF map sizes after creation\n\n --mapsize-stacks \n max number of individual stacks to capture before aggregation\n \n [default: 100000]\n\n --mapsize-aggregated-stacks \n max number of unique stacks after aggregation\n \n [default: 10000]\n\n --mapsize-rate-limits \n max number of rate limit entries\n \n [default: 5000]\n\n --exclude-self\n Do not profile the profiler (myself)\n\n --symbolizer \n [default: local]\n [possible values: local, none]\n\n --debug-info-backend \n [default: none]\n [possible values: none, copy, remote]\n\n -h, --help\n Print help (see a summary with '-h')\n""#); } #[rstest] diff --git a/src/collector.rs b/src/collector.rs index c04f62d..db8d206 100644 --- a/src/collector.rs +++ b/src/collector.rs @@ -88,7 +88,7 @@ impl StreamingCollector { ) -> Self { Self { local_symbolizer, - pprof_ingest_url: pprof_ingest_url.into(), + pprof_ingest_url: format!("{}/pprof/new", pprof_ingest_url), http_client_timeout: Duration::from_secs(30), profile_duration, profile_frequency_hz, diff --git a/src/debug_info.rs b/src/debug_info.rs new file mode 100644 index 0000000..ad972b5 --- /dev/null +++ b/src/debug_info.rs @@ -0,0 +1,165 @@ +use std::io::Read; +use std::path::PathBuf; +use std::time::Duration; + +use reqwest::StatusCode; +use tracing::instrument; + +use lightswitch_object::BuildId; +use lightswitch_object::ExecutableId; + +/// Handles with debug information. +/// +/// This currently experimental, not feature-complete and not used yet during +/// symbolization. The end goal would be to keep track of every debug info +/// that's either present locally or remotely (depending on configuration), while +/// minimizing the number of open FDs, file copies, and race condition windows. +pub trait DebugInfoManager { + fn add_if_not_present( + &self, + name: &str, + build_id: &BuildId, + executable_id: ExecutableId, + file: &mut std::fs::File, + ) -> anyhow::Result<()>; + fn debug_info_path(&self) -> Option; +} + +pub struct DebugInfoBackendNull {} +impl DebugInfoManager for DebugInfoBackendNull { + fn add_if_not_present( + &self, + _name: &str, + _build_id: &BuildId, + _executable_id: ExecutableId, + _file: &mut std::fs::File, + ) -> anyhow::Result<()> { + Ok(()) + } + + fn debug_info_path(&self) -> Option { + None + } +} + +#[derive(Debug)] +pub struct DebugInfoBackendFilesystem { + pub path: PathBuf, +} +impl DebugInfoManager for DebugInfoBackendFilesystem { + #[instrument] + fn add_if_not_present( + &self, + _name: &str, + build_id: &BuildId, + executable_id: ExecutableId, + file: &mut std::fs::File, + ) -> anyhow::Result<()> { + // try to find, else extract + if self.find_in_fs(build_id) { + return Ok(()); + } + + self.add_to_fs(build_id, executable_id, file) + } + + fn debug_info_path(&self) -> Option { + todo!() + } +} + +impl DebugInfoBackendFilesystem { + fn find_in_fs(&self, build_id: &BuildId) -> bool { + self.path.join(build_id.to_string()).exists() + } + + fn add_to_fs( + &self, + build_id: &BuildId, + _executable_id: ExecutableId, + file: &mut std::fs::File, + ) -> anyhow::Result<()> { + // TODO: add support for other methods beyond copying. For example + // hardlinks could be used and only fall back to copying if the src + // and dst filesystems differ. + let mut writer = std::fs::File::create(self.path.join(build_id.to_string()))?; + std::io::copy(file, &mut writer)?; + Ok(()) + } +} + +#[derive(Debug)] +pub struct DebugInfoBackendRemote { + pub http_client_timeout: Duration, + pub server_url: String, +} +impl DebugInfoManager for DebugInfoBackendRemote { + #[instrument(level = "debug")] + fn add_if_not_present( + &self, + name: &str, + build_id: &BuildId, + executable_id: ExecutableId, + file: &mut std::fs::File, + ) -> anyhow::Result<()> { + // TODO: add a local cache to not have to reach to the backend + // unnecessarily. + if self.find_in_backend(build_id)? { + return Ok(()); + } + + // TODO: do this in another thread. + self.upload_to_backend(name, build_id, executable_id, file)?; + Ok(()) + } + + fn debug_info_path(&self) -> Option { + None + } +} + +impl DebugInfoBackendRemote { + /// Whether the backend knows about some debug information. + #[instrument(level = "debug")] + fn find_in_backend(&self, build_id: &BuildId) -> anyhow::Result { + let client_builder = reqwest::blocking::Client::builder().timeout(self.http_client_timeout); + let client = client_builder.build()?; + let response = client + .get(format!( + "{}/debuginfo/{}", + self.server_url.clone(), + build_id + )) + .send(); + + Ok(response?.status() == StatusCode::OK) + } + + /// Send the debug information to the backend. + #[instrument] + fn upload_to_backend( + &self, + name: &str, + build_id: &BuildId, + executable_id: ExecutableId, + file: &mut std::fs::File, + ) -> anyhow::Result<()> { + let client_builder = reqwest::blocking::Client::builder().timeout(self.http_client_timeout); + let client = client_builder.build()?; + let mut debug_info = Vec::new(); + file.read_to_end(&mut debug_info)?; + + let response = client + .post(format!( + "{}/debuginfo/new/{}/{}/{}", + self.server_url.clone(), + name, + build_id, + executable_id + )) + .body(debug_info) + .send()?; + println!("wrote debug info to server {:?}", response); + Ok(()) + } +} diff --git a/src/lib.rs b/src/lib.rs index 9fc43c6..59d1bb0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ pub mod bpf; pub mod collector; +pub mod debug_info; pub mod ksym; pub mod perf_events; pub mod process; diff --git a/src/profiler.rs b/src/profiler.rs index 80f46e9..ab5833f 100644 --- a/src/profiler.rs +++ b/src/profiler.rs @@ -30,6 +30,8 @@ use crate::bpf::profiler_skel::{OpenProfilerSkel, ProfilerSkel, ProfilerSkelBuil use crate::bpf::tracers_bindings::*; use crate::bpf::tracers_skel::{TracersSkel, TracersSkelBuilder}; use crate::collector::*; +use crate::debug_info::DebugInfoBackendNull; +use crate::debug_info::DebugInfoManager; use crate::perf_events::setup_perf_event; use crate::process::{ ExecutableMapping, ExecutableMappingType, ExecutableMappings, ObjectFileInfo, Pid, ProcessInfo, @@ -101,6 +103,8 @@ pub struct Profiler { exclude_self: bool, /// Sizes for the unwind information buckets. native_unwind_info_bucket_sizes: Vec, + /// Deals with debug information + debug_info_manager: Box, } pub struct ProfilerConfig { @@ -116,6 +120,7 @@ pub struct ProfilerConfig { pub mapsize_rate_limits: u32, pub exclude_self: bool, pub native_unwind_info_bucket_sizes: Vec, + pub debug_info_manager: Box, } impl Default for ProfilerConfig { @@ -136,6 +141,7 @@ impl Default for ProfilerConfig { 1_000, 10_000, 20_000, 40_000, 80_000, 160_000, 320_000, 640_000, 1_280_000, 2_560_000, 3_840_000, 5_120_000, 7_680_000, ], + debug_info_manager: Box::new(DebugInfoBackendNull {}), } } } @@ -359,6 +365,7 @@ impl Profiler { session_duration: profiler_config.session_duration, exclude_self: profiler_config.exclude_self, native_unwind_info_bucket_sizes: profiler_config.native_unwind_info_bucket_sizes, + debug_info_manager: profiler_config.debug_info_manager, } } @@ -1117,7 +1124,6 @@ impl Profiler { }, }); - // This is not released (see note "deadlock") let object_files = self.object_files.read().unwrap(); let executable = object_files.get(&mapping.executable_id).unwrap(); let executable_path = executable.open_file_path(); @@ -1293,7 +1299,7 @@ impl Profiler { // We want to open the file as quickly as possible to minimise the chances of races // if the file is deleted. - let file = match fs::File::open(&abs_path) { + let mut file = match fs::File::open(&abs_path) { Ok(f) => f, Err(e) => { debug!("failed to open file {} due to {:?}", abs_path, e); @@ -1356,11 +1362,33 @@ impl Profiler { soft_delete: false, }); + let abs_path = PathBuf::from(abs_path); + + // If the object file has debug info, add it to our store. + if object_file.has_debug_info() { + let name = match abs_path.file_name() { + Some(os_name) => os_name.to_string_lossy().to_string(), + None => "error".to_string(), + }; + let res = self.debug_info_manager.add_if_not_present( + &name, + &build_id, + executable_id, + &mut file, + ); + debug!("debug info manager add result {:?}", res); + } else { + debug!( + "could not find debug information for {}", + abs_path.display() + ); + } + match object_files.entry(executable_id) { Entry::Vacant(entry) => match object_file.elf_load_segments() { Ok(elf_loads) => { entry.insert(ObjectFileInfo { - path: PathBuf::from(abs_path), + path: abs_path, file, elf_load_segments: elf_loads, is_dyn: object_file.is_dynamic(),