Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial implementation of debug information manager
Browse files Browse the repository at this point in the history
Currently, executables are opened as soon as possible to have a hold of
them to be able to symbolize later on. This approach is not ideal
because the number of opened file descriptors can balloon, and this is
done without first checking if the executables contain debug information
at all.

In the future all the local and remote use-cases will transition to
using the debug information manager. This first iteration is quite
simple and lacks a lot of features needed to make this really work. In
this commit it's still disabled by default, and just adds some basic
features to upload debug information to a server, if needed.

Left various TODOs on the planned changes.

Test Plan
=========

CI + manual checks.
javierhonduco committed Dec 13, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent 4429694 commit e0cbe41
Showing 9 changed files with 249 additions and 23 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion lightswitch-object/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "lightswitch-object"
version = "0.1.0"
version = "0.1.1"
edition = "2021"
description = "Deals with object files"
license = "MIT"
5 changes: 5 additions & 0 deletions lightswitch-object/src/object.rs
Original file line number Diff line number Diff line change
@@ -94,6 +94,11 @@ impl ObjectFile {
Ok(BuildId::sha256_from_digest(&self.code_hash))
}

/// Returns whether the object has debug symbols.
pub fn has_debug_info(&self) -> bool {
self.object.has_debug_symbols()
}

pub fn is_dynamic(&self) -> bool {
self.object.kind() == ObjectKind::Dynamic
}
10 changes: 10 additions & 0 deletions src/cli/args.rs
Original file line number Diff line number Diff line change
@@ -42,6 +42,14 @@ pub(crate) enum Symbolizer {
None,
}

#[derive(PartialEq, clap::ValueEnum, Debug, Clone, Default)]
pub(crate) enum DebugInfo {
#[default]
None,
Copy,
Backend,
}

#[derive(Parser, Debug)]
pub(crate) struct CliArgs {
/// Specific PIDs to profile
@@ -126,4 +134,6 @@ pub(crate) struct CliArgs {
pub(crate) exclude_self: bool,
#[arg(long, default_value_t, value_enum)]
pub(crate) symbolizer: Symbolizer,
#[arg(long, default_value_t, value_enum)]
pub(crate) debug_info: DebugInfo,
}
51 changes: 34 additions & 17 deletions src/cli/main.rs
Original file line number Diff line number Diff line change
@@ -6,11 +6,13 @@ use std::io::Write;
use std::panic;
use std::path::PathBuf;
use std::sync::{Arc, Mutex};
use std::time::Duration;

use clap::Parser;
use crossbeam_channel::bounded;
use inferno::flamegraph;
use lightswitch::collector::{AggregatorCollector, Collector, NullCollector, StreamingCollector};
use lightswitch::debug_info::DebugInfoManager;
use lightswitch_metadata::metadata_provider::GlobalMetadataProvider;
use nix::unistd::Uid;
use prost::Message;
@@ -21,6 +23,9 @@ use tracing_subscriber::FmtSubscriber;
use lightswitch_capabilities::system_info::SystemInfo;
use lightswitch_metadata::metadata_provider::ThreadSafeGlobalMetadataProvider;

use lightswitch::debug_info::{
DebugInfoFilesystemBackend, DebugInfoNullBackend, DebugInfoRemoteBackend,
};
use lightswitch::profile::symbolize_profile;
use lightswitch::profile::{fold_profile, to_pprof};
use lightswitch::profiler::{Profiler, ProfilerConfig};
@@ -32,12 +37,13 @@ mod args;
mod validators;

use crate::args::CliArgs;
use crate::args::DebugInfo;
use crate::args::LoggingLevel;
use crate::args::ProfileFormat;
use crate::args::ProfileSender;
use crate::args::Symbolizer;

const DEFAULT_PPROF_INGEST_URL: &str = "http://localhost:4567/pprof/new";
const DEFAULT_SERVER_URL: &str = "http://localhost:4567";

/// Exit the main thread if any thread panics. We prefer this behaviour because pretty much every
/// thread is load bearing for the correct functioning.
@@ -98,24 +104,34 @@ fn main() -> Result<(), Box<dyn Error>> {
}
}

let server_url = args.server_url.unwrap_or(DEFAULT_SERVER_URL.into());

let metadata_provider: ThreadSafeGlobalMetadataProvider =
Arc::new(Mutex::new(GlobalMetadataProvider::default()));

let collector = Arc::new(Mutex::new(match args.sender {
ProfileSender::None => Box::new(NullCollector::new()) as Box<dyn Collector + Send>,
ProfileSender::LocalDisk => {
Box::new(AggregatorCollector::new()) as Box<dyn Collector + Send>
}
ProfileSender::Remote => Box::new(StreamingCollector::new(
args.symbolizer == Symbolizer::Local,
args.server_url
.as_ref()
.map_or(DEFAULT_PPROF_INGEST_URL, |v| v),
ProfilerConfig::default().session_duration,
args.sample_freq,
metadata_provider.clone(),
)) as Box<dyn Collector + Send>,
}));
let collector: Arc<Mutex<Box<dyn Collector + Send>>> =
Arc::new(Mutex::new(match args.sender {
ProfileSender::None => Box::new(NullCollector::new()),
ProfileSender::LocalDisk => Box::new(AggregatorCollector::new()),
ProfileSender::Remote => Box::new(StreamingCollector::new(
args.symbolizer == Symbolizer::Local,
&server_url,
ProfilerConfig::default().session_duration,
args.sample_freq,
metadata_provider.clone(),
)),
}));

let debug_info_manager: Box<dyn DebugInfoManager> = match args.debug_info {
DebugInfo::None => Box::new(DebugInfoNullBackend {}),
DebugInfo::Copy => Box::new(DebugInfoFilesystemBackend {
path: PathBuf::from("/tmp"),
}),
DebugInfo::Backend => Box::new(DebugInfoRemoteBackend {
http_client_timeout: Duration::from_millis(500),
server_url,
}),
};

let profiler_config = ProfilerConfig {
libbpf_debug: args.libbpf_debug,
@@ -128,6 +144,7 @@ fn main() -> Result<(), Box<dyn Error>> {
mapsize_aggregated_stacks: args.mapsize_aggregated_stacks,
mapsize_rate_limits: args.mapsize_rate_limits,
exclude_self: args.exclude_self,
debug_info_manager,
..Default::default()
};

@@ -260,7 +277,7 @@ mod tests {
cmd.arg("--help");
cmd.assert().success();
let actual = String::from_utf8(cmd.unwrap().stdout).unwrap();
insta::assert_yaml_snapshot!(actual, @r#""Usage: lightswitch [OPTIONS]\n\nOptions:\n --pids <PIDS>\n Specific PIDs to profile\n\n --tids <TIDS>\n Specific TIDs to profile (these can be outside the PIDs selected above)\n\n --show-unwind-info <PATH_TO_BINARY>\n Show unwind info for given binary\n\n --show-info <PATH_TO_BINARY>\n Show build ID for given binary\n\n -D, --duration <DURATION>\n How long this agent will run in seconds\n \n [default: 18446744073709551615]\n\n --libbpf-debug\n Enable libbpf logs. This includes the BPF verifier output\n\n --bpf-logging\n Enable BPF programs logging\n\n --logging <LOGGING>\n Set lightswitch's logging level\n \n [default: info]\n [possible values: trace, debug, info, warn, error]\n\n --sample-freq <SAMPLE_FREQ_IN_HZ>\n Per-CPU Sampling Frequency in Hz\n \n [default: 19]\n\n --profile-format <PROFILE_FORMAT>\n Output file for Flame Graph in SVG format\n \n [default: flame-graph]\n [possible values: none, flame-graph, pprof]\n\n --profile-path <PROFILE_PATH>\n Path for the generated profile\n\n --profile-name <PROFILE_NAME>\n Name for the generated profile\n\n --sender <SENDER>\n Where to write the profile\n \n [default: local-disk]\n\n Possible values:\n - none: Discard the profile. Used for kernel tests\n - local-disk\n - remote\n\n --server-url <SERVER_URL>\n \n\n --perf-buffer-bytes <PERF_BUFFER_BYTES>\n Size of each profiler perf buffer, in bytes (must be a power of 2)\n \n [default: 524288]\n\n --mapsize-info\n Print eBPF map sizes after creation\n\n --mapsize-stacks <MAPSIZE_STACKS>\n max number of individual stacks to capture before aggregation\n \n [default: 100000]\n\n --mapsize-aggregated-stacks <MAPSIZE_AGGREGATED_STACKS>\n max number of unique stacks after aggregation\n \n [default: 10000]\n\n --mapsize-rate-limits <MAPSIZE_RATE_LIMITS>\n max number of rate limit entries\n \n [default: 5000]\n\n --exclude-self\n Do not profile the profiler (myself)\n\n --symbolizer <SYMBOLIZER>\n [default: local]\n [possible values: local, none]\n\n -h, --help\n Print help (see a summary with '-h')\n""#);
insta::assert_yaml_snapshot!(actual, @r#""Usage: lightswitch [OPTIONS]\n\nOptions:\n --pids <PIDS>\n Specific PIDs to profile\n\n --tids <TIDS>\n Specific TIDs to profile (these can be outside the PIDs selected above)\n\n --show-unwind-info <PATH_TO_BINARY>\n Show unwind info for given binary\n\n --show-info <PATH_TO_BINARY>\n Show build ID for given binary\n\n -D, --duration <DURATION>\n How long this agent will run in seconds\n \n [default: 18446744073709551615]\n\n --libbpf-debug\n Enable libbpf logs. This includes the BPF verifier output\n\n --bpf-logging\n Enable BPF programs logging\n\n --logging <LOGGING>\n Set lightswitch's logging level\n \n [default: info]\n [possible values: trace, debug, info, warn, error]\n\n --sample-freq <SAMPLE_FREQ_IN_HZ>\n Per-CPU Sampling Frequency in Hz\n \n [default: 19]\n\n --profile-format <PROFILE_FORMAT>\n Output file for Flame Graph in SVG format\n \n [default: flame-graph]\n [possible values: none, flame-graph, pprof]\n\n --profile-path <PROFILE_PATH>\n Path for the generated profile\n\n --profile-name <PROFILE_NAME>\n Name for the generated profile\n\n --sender <SENDER>\n Where to write the profile\n \n [default: local-disk]\n\n Possible values:\n - none: Discard the profile. Used for kernel tests\n - local-disk\n - remote\n\n --server-url <SERVER_URL>\n \n\n --perf-buffer-bytes <PERF_BUFFER_BYTES>\n Size of each profiler perf buffer, in bytes (must be a power of 2)\n \n [default: 524288]\n\n --mapsize-info\n Print eBPF map sizes after creation\n\n --mapsize-stacks <MAPSIZE_STACKS>\n max number of individual stacks to capture before aggregation\n \n [default: 100000]\n\n --mapsize-aggregated-stacks <MAPSIZE_AGGREGATED_STACKS>\n max number of unique stacks after aggregation\n \n [default: 10000]\n\n --mapsize-rate-limits <MAPSIZE_RATE_LIMITS>\n max number of rate limit entries\n \n [default: 5000]\n\n --exclude-self\n Do not profile the profiler (myself)\n\n --symbolizer <SYMBOLIZER>\n [default: local]\n [possible values: local, none]\n\n --debug-info <DEBUG_INFO>\n [default: none]\n [possible values: none, copy, backend]\n\n -h, --help\n Print help (see a summary with '-h')\n""#);
}

#[rstest]
2 changes: 1 addition & 1 deletion src/collector.rs
Original file line number Diff line number Diff line change
@@ -88,7 +88,7 @@ impl StreamingCollector {
) -> Self {
Self {
local_symbolizer,
pprof_ingest_url: pprof_ingest_url.into(),
pprof_ingest_url: format!("{}/pprof/new", pprof_ingest_url),
http_client_timeout: Duration::from_secs(30),
profile_duration,
profile_frequency_hz,
165 changes: 165 additions & 0 deletions src/debug_info.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
use std::io::Read;
use std::path::PathBuf;
use std::time::Duration;

use reqwest::StatusCode;
use tracing::instrument;

use lightswitch_object::BuildId;
use lightswitch_object::ExecutableId;

/// Handles with debug information.
///
/// This currently experimental, not feature-complete and not used yet during
/// symbolization. The end goal would be to keep track of every debug info
/// that's either present locally or remotely (depending on configuration), while
/// minimizing the number of open FDs, file copies, and race condition windows.
pub trait DebugInfoManager {
fn add_if_not_present(
&self,
name: &str,
build_id: &BuildId,
executable_id: ExecutableId,
file: &mut std::fs::File,
) -> anyhow::Result<()>;
fn debug_info_path(&self) -> Option<PathBuf>;
}

pub struct DebugInfoNullBackend {}
impl DebugInfoManager for DebugInfoNullBackend {
fn add_if_not_present(
&self,
_name: &str,
_build_id: &BuildId,
_executable_id: ExecutableId,
_file: &mut std::fs::File,
) -> anyhow::Result<()> {
Ok(())
}

fn debug_info_path(&self) -> Option<PathBuf> {
None
}
}

#[derive(Debug)]
pub struct DebugInfoFilesystemBackend {
pub path: PathBuf,
}
impl DebugInfoManager for DebugInfoFilesystemBackend {
#[instrument]
fn add_if_not_present(
&self,
_name: &str,
build_id: &BuildId,
executable_id: ExecutableId,
file: &mut std::fs::File,
) -> anyhow::Result<()> {
// try to find, else extract
if self.find_in_fs(build_id) {
return Ok(());
}

self.add_to_fs(build_id, executable_id, file)
}

fn debug_info_path(&self) -> Option<PathBuf> {
todo!()
}
}

impl DebugInfoFilesystemBackend {
fn find_in_fs(&self, build_id: &BuildId) -> bool {
self.path.join(build_id.to_string()).exists()
}

fn add_to_fs(
&self,
build_id: &BuildId,
_executable_id: ExecutableId,
file: &mut std::fs::File,
) -> anyhow::Result<()> {
// TODO: add support for other methods beyond copying. For example
// hardlinks could be used and only fall back to copying if the src
// and dst filesystems differ.
let mut writer = std::fs::File::create(self.path.join(build_id.to_string()))?;
std::io::copy(file, &mut writer)?;
Ok(())
}
}

#[derive(Debug)]
pub struct DebugInfoRemoteBackend {
pub http_client_timeout: Duration,
pub server_url: String,
}
impl DebugInfoManager for DebugInfoRemoteBackend {
#[instrument(level = "debug")]
fn add_if_not_present(
&self,
name: &str,
build_id: &BuildId,
executable_id: ExecutableId,
file: &mut std::fs::File,
) -> anyhow::Result<()> {
// TODO: add a local cache to not have to reach to the backend
// unnecessarily.
if self.find_in_backend(build_id)? {
return Ok(());
}

// TODO: do this in another thread.
self.upload_to_backend(name, build_id, executable_id, file)?;
Ok(())
}

fn debug_info_path(&self) -> Option<PathBuf> {
None
}
}

impl DebugInfoRemoteBackend {
/// Whether the backend knows about some debug information.
#[instrument(level = "debug")]
fn find_in_backend(&self, build_id: &BuildId) -> anyhow::Result<bool> {
let client_builder = reqwest::blocking::Client::builder().timeout(self.http_client_timeout);
let client = client_builder.build()?;
let response = client
.get(format!(
"{}/debuginfo/{}",
self.server_url.clone(),
build_id
))
.send();

Ok(response?.status() == StatusCode::OK)
}

/// Send the debug information to the backend.
#[instrument]
fn upload_to_backend(
&self,
name: &str,
build_id: &BuildId,
executable_id: ExecutableId,
file: &mut std::fs::File,
) -> anyhow::Result<()> {
let client_builder = reqwest::blocking::Client::builder().timeout(self.http_client_timeout);
let client = client_builder.build()?;
let mut debug_info = Vec::new();
file.read_to_end(&mut debug_info)?;

let response = client
.post(format!(
"{}/debuginfo/new/{}/{}/{}",
self.server_url.clone(),
name,
build_id,
executable_id
))
.body(debug_info)
.send()?;
println!("wrote debug info to server {:?}", response);
Ok(())
}
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
pub mod bpf;
pub mod collector;
pub mod debug_info;
pub mod ksym;
pub mod perf_events;
pub mod process;
34 changes: 31 additions & 3 deletions src/profiler.rs
Original file line number Diff line number Diff line change
@@ -30,6 +30,8 @@ use crate::bpf::profiler_skel::{OpenProfilerSkel, ProfilerSkel, ProfilerSkelBuil
use crate::bpf::tracers_bindings::*;
use crate::bpf::tracers_skel::{TracersSkel, TracersSkelBuilder};
use crate::collector::*;
use crate::debug_info::DebugInfoManager;
use crate::debug_info::DebugInfoNullBackend;
use crate::perf_events::setup_perf_event;
use crate::process::{
ExecutableMapping, ExecutableMappingType, ExecutableMappings, ObjectFileInfo, Pid, ProcessInfo,
@@ -101,6 +103,8 @@ pub struct Profiler {
exclude_self: bool,
/// Sizes for the unwind information buckets.
native_unwind_info_bucket_sizes: Vec<u32>,
/// Deals with debug information
debug_info_manager: Box<dyn DebugInfoManager>,
}

pub struct ProfilerConfig {
@@ -116,6 +120,7 @@ pub struct ProfilerConfig {
pub mapsize_rate_limits: u32,
pub exclude_self: bool,
pub native_unwind_info_bucket_sizes: Vec<u32>,
pub debug_info_manager: Box<dyn DebugInfoManager>,
}

impl Default for ProfilerConfig {
@@ -136,6 +141,7 @@ impl Default for ProfilerConfig {
1_000, 10_000, 20_000, 40_000, 80_000, 160_000, 320_000, 640_000, 1_280_000,
2_560_000, 3_840_000, 5_120_000, 7_680_000,
],
debug_info_manager: Box::new(DebugInfoNullBackend {}),
}
}
}
@@ -359,6 +365,7 @@ impl Profiler {
session_duration: profiler_config.session_duration,
exclude_self: profiler_config.exclude_self,
native_unwind_info_bucket_sizes: profiler_config.native_unwind_info_bucket_sizes,
debug_info_manager: profiler_config.debug_info_manager,
}
}

@@ -1117,7 +1124,6 @@ impl Profiler {
},
});

// This is not released (see note "deadlock")
let object_files = self.object_files.read().unwrap();
let executable = object_files.get(&mapping.executable_id).unwrap();
let executable_path = executable.open_file_path();
@@ -1293,7 +1299,7 @@ impl Profiler {

// We want to open the file as quickly as possible to minimise the chances of races
// if the file is deleted.
let file = match fs::File::open(&abs_path) {
let mut file = match fs::File::open(&abs_path) {
Ok(f) => f,
Err(e) => {
debug!("failed to open file {} due to {:?}", abs_path, e);
@@ -1356,11 +1362,33 @@ impl Profiler {
soft_delete: false,
});

let abs_path = PathBuf::from(abs_path);

// If the object file has debug info, add it to our store.
if object_file.has_debug_info() {
let name = match abs_path.file_name() {
Some(os_name) => os_name.to_string_lossy().to_string(),
None => "error".to_string(),
};
let res = self.debug_info_manager.add_if_not_present(
&name,
&build_id,
executable_id,
&mut file,
);
debug!("debug info manager add result {:?}", res);
} else {
debug!(
"could not find debug information for {}",
abs_path.display()
);
}

match object_files.entry(executable_id) {
Entry::Vacant(entry) => match object_file.elf_load_segments() {
Ok(elf_loads) => {
entry.insert(ObjectFileInfo {
path: PathBuf::from(abs_path),
path: abs_path,
file,
elf_load_segments: elf_loads,
is_dyn: object_file.is_dynamic(),

0 comments on commit e0cbe41

Please sign in to comment.