Skip to content

Commit

Permalink
Initial implementation of debug information manager
Browse files Browse the repository at this point in the history
Currently, executables are opened as soon as possible to have a hold of
them to be able to symbolize later on. This approach is not ideal
because the number of opened file descriptors can balloon, and this is
done without first checking if the executables contain debug information
at all.

In the future all the local and remote use-cases will transition to
using the debug information manager. This first iteration is quite
simple and lacks a lot of features needed to make this really work. In
this commit it's still disabled by default, and just adds some basic
features to upload debug information to a server, if needed.

Left various TODOs on the planned changes.

Test Plan
=========

CI + manual checks.
  • Loading branch information
javierhonduco committed Dec 2, 2024
1 parent 4429694 commit 9f16f5b
Show file tree
Hide file tree
Showing 9 changed files with 231 additions and 21 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion lightswitch-object/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "lightswitch-object"
version = "0.1.0"
version = "0.1.1"
edition = "2021"
description = "Deals with object files"
license = "MIT"
Expand Down
5 changes: 5 additions & 0 deletions lightswitch-object/src/object.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,11 @@ impl ObjectFile {
Ok(BuildId::sha256_from_digest(&self.code_hash))
}

/// Returns whether the object has debug symbols.
pub fn has_debug_info(&self) -> bool {
self.object.has_debug_symbols()
}

pub fn is_dynamic(&self) -> bool {
self.object.kind() == ObjectKind::Dynamic
}
Expand Down
10 changes: 10 additions & 0 deletions src/cli/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ pub(crate) enum Symbolizer {
None,
}

#[derive(PartialEq, clap::ValueEnum, Debug, Clone, Default)]
pub(crate) enum DebugInfo {
#[default]
None,
Copy,
Backend,
}

#[derive(Parser, Debug)]
pub(crate) struct CliArgs {
/// Specific PIDs to profile
Expand Down Expand Up @@ -126,4 +134,6 @@ pub(crate) struct CliArgs {
pub(crate) exclude_self: bool,
#[arg(long, default_value_t, value_enum)]
pub(crate) symbolizer: Symbolizer,
#[arg(long, default_value_t, value_enum)]
pub(crate) debug_info: DebugInfo,
}
49 changes: 33 additions & 16 deletions src/cli/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@ use std::io::Write;
use std::panic;
use std::path::PathBuf;
use std::sync::{Arc, Mutex};
use std::time::Duration;

use clap::Parser;
use crossbeam_channel::bounded;
use inferno::flamegraph;
use lightswitch::collector::{AggregatorCollector, Collector, NullCollector, StreamingCollector};
use lightswitch::debug_info::DebugInfoManager;
use lightswitch_metadata::metadata_provider::GlobalMetadataProvider;
use nix::unistd::Uid;
use prost::Message;
Expand All @@ -21,6 +23,9 @@ use tracing_subscriber::FmtSubscriber;
use lightswitch_capabilities::system_info::SystemInfo;
use lightswitch_metadata::metadata_provider::ThreadSafeGlobalMetadataProvider;

use lightswitch::debug_info::{
DebugInfoFilesystemBackend, DebugInfoNullBackend, DebugInfoRemoteBackend,
};
use lightswitch::profile::symbolize_profile;
use lightswitch::profile::{fold_profile, to_pprof};
use lightswitch::profiler::{Profiler, ProfilerConfig};
Expand All @@ -32,12 +37,13 @@ mod args;
mod validators;

use crate::args::CliArgs;
use crate::args::DebugInfo;
use crate::args::LoggingLevel;
use crate::args::ProfileFormat;
use crate::args::ProfileSender;
use crate::args::Symbolizer;

const DEFAULT_PPROF_INGEST_URL: &str = "http://localhost:4567/pprof/new";
const DEFAULT_SERVER_URL: &str = "http://localhost:4567";

/// Exit the main thread if any thread panics. We prefer this behaviour because pretty much every
/// thread is load bearing for the correct functioning.
Expand Down Expand Up @@ -98,24 +104,34 @@ fn main() -> Result<(), Box<dyn Error>> {
}
}

let server_url = args.server_url.unwrap_or(DEFAULT_SERVER_URL.into());

let metadata_provider: ThreadSafeGlobalMetadataProvider =
Arc::new(Mutex::new(GlobalMetadataProvider::default()));

let collector = Arc::new(Mutex::new(match args.sender {
ProfileSender::None => Box::new(NullCollector::new()) as Box<dyn Collector + Send>,
ProfileSender::LocalDisk => {
Box::new(AggregatorCollector::new()) as Box<dyn Collector + Send>
}
ProfileSender::Remote => Box::new(StreamingCollector::new(
args.symbolizer == Symbolizer::Local,
args.server_url
.as_ref()
.map_or(DEFAULT_PPROF_INGEST_URL, |v| v),
ProfilerConfig::default().session_duration,
args.sample_freq,
metadata_provider.clone(),
)) as Box<dyn Collector + Send>,
}));
let collector: Arc<Mutex<Box<dyn Collector + Send>>> =
Arc::new(Mutex::new(match args.sender {
ProfileSender::None => Box::new(NullCollector::new()),
ProfileSender::LocalDisk => Box::new(AggregatorCollector::new()),
ProfileSender::Remote => Box::new(StreamingCollector::new(
args.symbolizer == Symbolizer::Local,
&server_url,
ProfilerConfig::default().session_duration,
args.sample_freq,
metadata_provider.clone(),
)),
}));

let debug_info_manager: Box<dyn DebugInfoManager> = match args.debug_info {
DebugInfo::None => Box::new(DebugInfoNullBackend {}),
DebugInfo::Copy => Box::new(DebugInfoFilesystemBackend {
path: PathBuf::from("/tmp"),
}),
DebugInfo::Backend => Box::new(DebugInfoRemoteBackend {
http_client_timeout: Duration::from_millis(500),
server_url,
}),
};

let profiler_config = ProfilerConfig {
libbpf_debug: args.libbpf_debug,
Expand All @@ -128,6 +144,7 @@ fn main() -> Result<(), Box<dyn Error>> {
mapsize_aggregated_stacks: args.mapsize_aggregated_stacks,
mapsize_rate_limits: args.mapsize_rate_limits,
exclude_self: args.exclude_self,
debug_info_manager,
..Default::default()
};

Expand Down
2 changes: 1 addition & 1 deletion src/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ impl StreamingCollector {
) -> Self {
Self {
local_symbolizer,
pprof_ingest_url: pprof_ingest_url.into(),
pprof_ingest_url: format!("{}/pprof/new", pprof_ingest_url),
http_client_timeout: Duration::from_secs(30),
profile_duration,
profile_frequency_hz,
Expand Down
159 changes: 159 additions & 0 deletions src/debug_info.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
use std::io::Read;
use std::path::PathBuf;
use std::time::Duration;

use reqwest::StatusCode;
use tracing::instrument;

use lightswitch_object::BuildId;
use lightswitch_object::ExecutableId;

/// Handles with debug information.
///
/// This currently experimental, not feature-complete and not used yet during
/// symbolization. The end goal would be to keep track of every debug info
/// that's either present locally or remotely (depending on configuration), while
/// minimizing the number of open FDs, file copies, and race condition windows.
pub trait DebugInfoManager {
fn add_if_not_present(
&self,
build_id: &BuildId,
executable_id: ExecutableId,
file: &mut std::fs::File,
) -> anyhow::Result<()>;
fn debug_info_path(&self) -> Option<PathBuf>;
}

pub struct DebugInfoNullBackend {}
impl DebugInfoManager for DebugInfoNullBackend {
fn add_if_not_present(
&self,
_build_id: &BuildId,
_executable_id: ExecutableId,
_file: &mut std::fs::File,
) -> anyhow::Result<()> {
Ok(())
}

fn debug_info_path(&self) -> Option<PathBuf> {
None
}
}

#[derive(Debug)]
pub struct DebugInfoFilesystemBackend {
pub path: PathBuf,
}
impl DebugInfoManager for DebugInfoFilesystemBackend {
#[instrument]
fn add_if_not_present(
&self,
build_id: &BuildId,
executable_id: ExecutableId,
file: &mut std::fs::File,
) -> anyhow::Result<()> {
// try to find, else extract
if self.find_in_fs(build_id) {
return Ok(());
}

self.add_to_fs(build_id, executable_id, file)
}

fn debug_info_path(&self) -> Option<PathBuf> {
todo!()
}
}

impl DebugInfoFilesystemBackend {
fn find_in_fs(&self, build_id: &BuildId) -> bool {
self.path.join(build_id.to_string()).exists()
}

fn add_to_fs(
&self,
build_id: &BuildId,
_executable_id: ExecutableId,
file: &mut std::fs::File,
) -> anyhow::Result<()> {
// TODO: add support for other methods beyond copying. For example
// hardlinks could be used and only fall back to copying if the src
// and dst filesystems differ.
let mut writer = std::fs::File::create(self.path.join(build_id.to_string()))?;
std::io::copy(file, &mut writer)?;
Ok(())
}
}

#[derive(Debug)]
pub struct DebugInfoRemoteBackend {
pub http_client_timeout: Duration,
pub server_url: String,
}
impl DebugInfoManager for DebugInfoRemoteBackend {
#[instrument(level = "debug")]
fn add_if_not_present(
&self,
build_id: &BuildId,
executable_id: ExecutableId,
file: &mut std::fs::File,
) -> anyhow::Result<()> {
// TODO: add a local cache to not have to reach to the backend
// unnecessarily.
if self.find_in_backend(build_id)? {
return Ok(());
}

// TODO: do this in another thread.
self.upload_to_backend(build_id, executable_id, file)?;
Ok(())
}

fn debug_info_path(&self) -> Option<PathBuf> {
None
}
}

impl DebugInfoRemoteBackend {
/// Whether the backend knows about some debug information.
#[instrument(level = "debug")]
fn find_in_backend(&self, build_id: &BuildId) -> anyhow::Result<bool> {
let client_builder = reqwest::blocking::Client::builder().timeout(self.http_client_timeout);
let client = client_builder.build()?;
let response = client
.get(format!(
"{}/debuginfo/{}",
self.server_url.clone(),
build_id
))
.send();

Ok(response?.status() == StatusCode::OK)
}

/// Send the debug information to the backend.
#[instrument]
fn upload_to_backend(
&self,
build_id: &BuildId,
executable_id: ExecutableId,
file: &mut std::fs::File,
) -> anyhow::Result<()> {
let client_builder = reqwest::blocking::Client::builder().timeout(self.http_client_timeout);
let client = client_builder.build()?;
let mut debug_info = String::new();
file.read_to_string(&mut debug_info)?;

let response = client
.post(format!(
"{}/debug_info/new/{}/{}",
self.server_url.clone(),
build_id,
executable_id
))
.body(debug_info)
.send()?;
println!("wrote debug info to server {:?}", response);
Ok(())
}
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
pub mod bpf;
pub mod collector;
pub mod debug_info;
pub mod ksym;
pub mod perf_events;
pub mod process;
Expand Down
Loading

0 comments on commit 9f16f5b

Please sign in to comment.