diff --git a/Cargo.lock b/Cargo.lock index 11b414456c001..d28e8b371c3b7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2228,9 +2228,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd326812b3fd01da5bb1af7d340d0d555fd3d4b641e7f1dfcf5962a902952787" dependencies = [ "futures-core", - "prost", + "prost 0.12.1", "prost-types", - "tonic", + "tonic 0.10.0", "tracing-core", ] @@ -2252,7 +2252,7 @@ dependencies = [ "thread_local", "tokio", "tokio-stream", - "tonic", + "tonic 0.10.0", "tracing", "tracing-core", "tracing-subscriber 0.3.17", @@ -4728,9 +4728,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.61" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" +checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" dependencies = [ "wasm-bindgen", ] @@ -6203,7 +6203,7 @@ dependencies = [ "snap", "tokio", "tokio-stream", - "tonic", + "tonic 0.10.0", "tonic-health", "tower", "tower-http", @@ -6356,7 +6356,7 @@ dependencies = [ "tempfile", "thiserror", "tokio", - "tonic", + "tonic 0.10.0", "tracing", "typed-store", "workspace-hack", @@ -6538,7 +6538,7 @@ dependencies = [ "telemetry-subscribers", "tempfile", "tokio", - "tonic", + "tonic 0.10.0", "tracing", "typed-store", "workspace-hack", @@ -6572,7 +6572,7 @@ dependencies = [ "prometheus", "proptest", "proptest-derive", - "prost", + "prost 0.12.1", "prost-build", "protobuf-src", "rand 0.8.5", @@ -6584,7 +6584,7 @@ dependencies = [ "sui-protocol-config", "thiserror", "tokio", - "tonic", + "tonic 0.10.0", "tonic-build", "tracing", "typed-store", @@ -6627,7 +6627,7 @@ dependencies = [ "tempfile", "thiserror", "tokio", - "tonic", + "tonic 0.10.0", "tower", "tracing", "typed-store", @@ -7006,12 +7006,110 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "opentelemetry" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54" +dependencies = [ + "opentelemetry_api", + "opentelemetry_sdk", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275" +dependencies = [ + "async-trait", + "futures-core", + "http", + "opentelemetry-proto", + "opentelemetry-semantic-conventions", + "opentelemetry_api", + "opentelemetry_sdk", + "prost 0.11.9", + "thiserror", + "tokio", + "tonic 0.9.2", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb" +dependencies = [ + "opentelemetry_api", + "opentelemetry_sdk", + "prost 0.11.9", + "tonic 0.9.2", +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73c9f9340ad135068800e7f1b24e9e09ed9e7143f5bf8518ded3d3ec69789269" +dependencies = [ + "opentelemetry", +] + +[[package]] +name = "opentelemetry_api" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b" +dependencies = [ + "futures-channel", + "futures-util", + "indexmap 1.9.3", + "js-sys", + "once_cell", + "pin-project-lite", + "thiserror", + "urlencoding", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026" +dependencies = [ + "async-trait", + "crossbeam-channel", + 
"futures-channel", + "futures-executor", + "futures-util", + "once_cell", + "opentelemetry_api", + "ordered-float", + "percent-encoding", + "rand 0.8.5", + "regex", + "serde_json", + "thiserror", + "tokio", + "tokio-stream", +] + [[package]] name = "option-ext" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" +[[package]] +name = "ordered-float" +version = "3.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a54938017eacd63036332b4ae5c8a49fc8c0c1d6d629893057e4f13609edd06" +dependencies = [ + "num-traits", +] + [[package]] name = "ouroboros" version = "0.17.2" @@ -7802,6 +7900,16 @@ dependencies = [ "syn 0.15.44", ] +[[package]] +name = "prost" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd" +dependencies = [ + "bytes", + "prost-derive 0.11.9", +] + [[package]] name = "prost" version = "0.12.1" @@ -7809,7 +7917,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f4fdd22f3b9c31b53c060df4a0613a1c7f062d4115a2b984dd15b1858f7e340d" dependencies = [ "bytes", - "prost-derive", + "prost-derive 0.12.1", ] [[package]] @@ -7826,7 +7934,7 @@ dependencies = [ "once_cell", "petgraph 0.6.2", "prettyplease 0.2.6", - "prost", + "prost 0.12.1", "prost-types", "regex", "syn 2.0.32", @@ -7834,6 +7942,19 @@ dependencies = [ "which", ] +[[package]] +name = "prost-derive" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2 1.0.66", + "quote 1.0.33", + "syn 1.0.107", +] + [[package]] name = "prost-derive" version = "0.12.1" @@ -7853,7 +7974,7 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "e081b29f63d83a4bc75cfc9f3fe424f9156cf92d8a4f0c9407cce9a1b67327cf" dependencies = [ - "prost", + "prost 0.12.1", ] [[package]] @@ -10780,7 +10901,7 @@ dependencies = [ "telemetry-subscribers", "tempfile", "tokio", - "tonic", + "tonic 0.10.0", "tonic-build", "tower", "tracing", @@ -10802,6 +10923,7 @@ dependencies = [ "fastcrypto-zkp", "futures", "git-version", + "humantime", "mysten-common", "mysten-metrics", "mysten-network", @@ -10954,7 +11076,7 @@ dependencies = [ "mysten-metrics", "once_cell", "prometheus", - "prost", + "prost 0.12.1", "prost-build", "protobuf", "rand 0.8.5", @@ -11102,7 +11224,7 @@ dependencies = [ "telemetry-subscribers", "test-cluster", "tokio", - "tonic", + "tonic 0.10.0", "tracing", "workspace-hack", ] @@ -11634,7 +11756,7 @@ dependencies = [ "sui-protocol-config", "tap", "thiserror", - "tonic", + "tonic 0.10.0", "tracing", "typed-store", "workspace-hack", @@ -11862,9 +11984,13 @@ dependencies = [ "console-subscriber", "crossterm 0.25.0", "once_cell", + "opentelemetry", + "opentelemetry-otlp", "prometheus", + "tokio", "tracing", "tracing-appender", + "tracing-opentelemetry", "tracing-subscriber 0.3.17", "workspace-hack", ] @@ -12362,6 +12488,34 @@ dependencies = [ "winnow", ] +[[package]] +name = "tonic" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a" +dependencies = [ + "async-trait", + "axum", + "base64 0.21.2", + "bytes", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost 0.11.9", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "tonic" version = "0.10.0" @@ -12380,7 +12534,7 @@ dependencies = [ "hyper-timeout", "percent-encoding", "pin-project", - "prost", + "prost 0.12.1", "rustls-pemfile", "tokio", "tokio-rustls 0.24.0", @@ -12411,10 +12565,10 @@ 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f80db390246dfb46553481f6024f0082ba00178ea495dbb99e70ba9a4fafb5e1" dependencies = [ "async-stream", - "prost", + "prost 0.12.1", "tokio", "tokio-stream", - "tonic", + "tonic 0.10.0", ] [[package]] @@ -12559,6 +12713,22 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-opentelemetry" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8" +dependencies = [ + "once_cell", + "opentelemetry", + "opentelemetry_sdk", + "smallvec", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber 0.3.17", +] + [[package]] name = "tracing-serde" version = "0.1.3" @@ -13085,9 +13255,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" +checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -13095,16 +13265,16 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" +checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2 1.0.66", "quote 1.0.33", - "syn 1.0.107", + "syn 2.0.32", "wasm-bindgen-shared", ] @@ -13122,9 +13292,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" +checksum = 
"dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" dependencies = [ "quote 1.0.33", "wasm-bindgen-macro-support", @@ -13132,22 +13302,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" +checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2 1.0.66", "quote 1.0.33", - "syn 1.0.107", + "syn 2.0.32", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" +checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" [[package]] name = "wasm-streams" @@ -13967,7 +14137,14 @@ dependencies = [ "oorandom", "opaque-debug", "openssl-probe", + "opentelemetry", + "opentelemetry-otlp", + "opentelemetry-proto", + "opentelemetry-semantic-conventions", + "opentelemetry_api", + "opentelemetry_sdk", "option-ext", + "ordered-float", "ouroboros", "ouroboros_macro", "output_vt100", @@ -14045,9 +14222,11 @@ dependencies = [ "prometheus-parse", "proptest", "proptest-derive", - "prost", + "prost 0.11.9", + "prost 0.12.1", "prost-build", - "prost-derive", + "prost-derive 0.11.9", + "prost-derive 0.12.1", "prost-types", "protobuf", "protobuf-src", @@ -14249,7 +14428,8 @@ dependencies = [ "toml_edit 0.14.4", "toml_edit 0.15.0", "toml_edit 0.19.10", - "tonic", + "tonic 0.10.0", + "tonic 0.9.2", "tonic-build", "tonic-health", "toolchain_find", @@ -14262,6 +14442,8 @@ dependencies = [ "tracing-attributes", "tracing-core", "tracing-error", + "tracing-log", + "tracing-opentelemetry", "tracing-serde", "tracing-subscriber 0.2.25", "tracing-subscriber 0.3.17", diff --git 
a/crates/sui-core/src/authority/authority_per_epoch_store.rs b/crates/sui-core/src/authority/authority_per_epoch_store.rs index c7d1553bec35e..e75ad9b98f584 100644 --- a/crates/sui-core/src/authority/authority_per_epoch_store.rs +++ b/crates/sui-core/src/authority/authority_per_epoch_store.rs @@ -30,7 +30,7 @@ use sui_types::transaction::{ AuthenticatorStateUpdate, CertifiedTransaction, SenderSignedData, SharedInputObject, TransactionDataAPI, VerifiedCertificate, VerifiedSignedTransaction, }; -use tracing::{debug, error, error_span, info, trace, warn}; +use tracing::{debug, error, info, instrument, trace, warn}; use typed_store::rocks::{ default_db_options, DBBatch, DBMap, DBOptions, MetricConf, TypedStoreError, }; @@ -498,6 +498,7 @@ impl AuthorityEpochTables { pub(crate) const MUTEX_TABLE_SIZE: usize = 1024; impl AuthorityPerEpochStore { + #[instrument(name = "AuthorityPerEpochStore::new", level = "error", skip_all, fields(epoch = committee.epoch))] pub fn new( name: AuthorityName, committee: Arc, @@ -514,9 +515,6 @@ impl AuthorityPerEpochStore { let current_time = Instant::now(); let epoch_id = committee.epoch; - let span = error_span!("AuthorityPerEpochStore::new", ?epoch_id); - let _guard = span.enter(); - let tables = AuthorityEpochTables::open(epoch_id, parent_path, db_options.clone()); let end_of_publish = StakeAggregator::from_iter(committee.clone(), tables.end_of_publish.unbounded_iter()); @@ -798,6 +796,7 @@ impl AuthorityPerEpochStore { .map(|t| t.into())) } + #[instrument(level = "trace", skip_all)] pub fn insert_tx_cert_and_effects_signature( &self, tx_digest: &TransactionDigest, diff --git a/crates/sui-core/src/authority/authority_store.rs b/crates/sui-core/src/authority/authority_store.rs index 7fbbcc99db2b9..fc790e2b476f2 100644 --- a/crates/sui-core/src/authority/authority_store.rs +++ b/crates/sui-core/src/authority/authority_store.rs @@ -986,6 +986,7 @@ impl AuthorityStore { /// /// Internally it checks that all locks for active inputs are at 
the correct /// version, and then writes objects, certificates, parents and clean up locks atomically. + #[instrument(level = "debug", skip_all)] pub async fn update_state( &self, inner_temporary_store: InnerTemporaryStore, diff --git a/crates/sui-node/Cargo.toml b/crates/sui-node/Cargo.toml index 2dcb7161606d5..0c6d3a92898ff 100644 --- a/crates/sui-node/Cargo.toml +++ b/crates/sui-node/Cargo.toml @@ -25,6 +25,7 @@ snap.workspace = true git-version.workspace = true const-str.workspace = true url.workspace = true +humantime.workspace = true sui-archival.workspace = true sui-tls.workspace = true diff --git a/crates/sui-node/src/admin.rs b/crates/sui-node/src/admin.rs index 142f060b17189..6e88228cd97e2 100644 --- a/crates/sui-node/src/admin.rs +++ b/crates/sui-node/src/admin.rs @@ -8,11 +8,12 @@ use axum::{ routing::{get, post}, Router, }; +use humantime::parse_duration; use serde::Deserialize; use std::net::{IpAddr, Ipv4Addr, SocketAddr}; use std::sync::Arc; use sui_types::error::SuiError; -use telemetry_subscribers::FilterHandle; +use telemetry_subscribers::Filters; use tracing::info; // Example commands: @@ -37,8 +38,19 @@ use tracing::info; // View the node config (private keys will be masked): // // $ curl 'http://127.0.0.1:1337/node-config' +// +// Set a time-limited tracing config. After the duration expires, tracing will be disabled +// automatically. +// +// $ curl -X POST 'http://127.0.0.1:1337/enable-tracing?filter=info&duration=10s' +// +// Reset tracing to the TRACE_FILTER env var. 
+// +// $ curl -X POST 'http://127.0.0.1:1337/reset-tracing' const LOGGING_ROUTE: &str = "/logging"; +const TRACING_ROUTE: &str = "/enable-tracing"; +const TRACING_RESET_ROUTE: &str = "/reset-tracing"; const SET_BUFFER_STAKE_ROUTE: &str = "/set-override-buffer-stake"; const CLEAR_BUFFER_STAKE_ROUTE: &str = "/clear-override-buffer-stake"; const FORCE_CLOSE_EPOCH: &str = "/force-close-epoch"; @@ -47,16 +59,13 @@ const NODE_CONFIG: &str = "/node-config"; struct AppState { node: Arc, - filter_handle: FilterHandle, + filters: Filters, } -pub async fn run_admin_server(node: Arc, port: u16, filter_handle: FilterHandle) { - let filter = filter_handle.get().unwrap(); +pub async fn run_admin_server(node: Arc, port: u16, filters: Filters) { + let filter = filters.get_log().unwrap(); - let app_state = AppState { - node, - filter_handle, - }; + let app_state = AppState { node, filters }; let app = Router::new() .route(LOGGING_ROUTE, get(get_filter)) @@ -72,6 +81,8 @@ pub async fn run_admin_server(node: Arc, port: u16, filter_handle: Filt post(clear_override_protocol_upgrade_buffer_stake), ) .route(FORCE_CLOSE_EPOCH, post(force_close_epoch)) + .route(TRACING_ROUTE, post(enable_tracing)) + .route(TRACING_RESET_ROUTE, post(reset_tracing)) .with_state(Arc::new(app_state)); let socket_address = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), port); @@ -87,8 +98,41 @@ pub async fn run_admin_server(node: Arc, port: u16, filter_handle: Filt .unwrap() } +#[derive(Deserialize)] +struct EnableTracing { + filter: String, + duration: String, +} + +async fn enable_tracing( + State(state): State>, + query: Query, +) -> (StatusCode, String) { + let Query(EnableTracing { filter, duration }) = query; + + let Ok(duration) = parse_duration(&duration) else { + return (StatusCode::BAD_REQUEST, "invalid duration".into()); + }; + + match state.filters.update_trace(filter, duration) { + Ok(()) => ( + StatusCode::OK, + format!("tracing enabled for {:?}", duration), + ), + Err(err) => 
(StatusCode::INTERNAL_SERVER_ERROR, err.to_string()), + } +} + +async fn reset_tracing(State(state): State>) -> (StatusCode, String) { + state.filters.reset_trace(); + ( + StatusCode::OK, + "tracing filter reset to TRACE_FILTER env var".into(), + ) +} + async fn get_filter(State(state): State>) -> (StatusCode, String) { - match state.filter_handle.get() { + match state.filters.get_log() { Ok(filter) => (StatusCode::OK, filter), Err(err) => (StatusCode::INTERNAL_SERVER_ERROR, err.to_string()), } @@ -98,7 +142,7 @@ async fn set_filter( State(state): State>, new_filter: String, ) -> (StatusCode, String) { - match state.filter_handle.update(&new_filter) { + match state.filters.update_log(&new_filter) { Ok(()) => { info!(filter =% new_filter, "Log filter updated"); (StatusCode::OK, "".into()) diff --git a/crates/sui-node/src/main.rs b/crates/sui-node/src/main.rs index 9a93216f68718..52b78244d3d58 100644 --- a/crates/sui-node/src/main.rs +++ b/crates/sui-node/src/main.rs @@ -58,10 +58,8 @@ fn main() { config.supported_protocol_versions = Some(SupportedProtocolVersions::SYSTEM_DEFAULT); let runtimes = SuiRuntimes::new(&config); - let registry_service = { - let _enter = runtimes.metrics.enter(); - mysten_metrics::start_prometheus_server(config.metrics_address) - }; + let metrics_rt = runtimes.metrics.enter(); + let registry_service = mysten_metrics::start_prometheus_server(config.metrics_address); let prometheus_registry = registry_service.default_registry(); // Initialize logging @@ -73,6 +71,8 @@ fn main() { .with_prom_registry(&prometheus_registry) .init(); + drop(metrics_rt); + info!("Sui Node version: {VERSION}"); info!( "Supported protocol versions: {:?}", diff --git a/crates/sui-swarm/src/memory/container.rs b/crates/sui-swarm/src/memory/container.rs index cda55ff9acb18..3bc10bfc51a32 100644 --- a/crates/sui-swarm/src/memory/container.rs +++ b/crates/sui-swarm/src/memory/container.rs @@ -7,6 +7,7 @@ use std::thread; use sui_config::NodeConfig; use 
sui_node::{SuiNode, SuiNodeHandle}; use sui_types::crypto::{AuthorityPublicKeyBytes, KeypairTraits}; +use telemetry_subscribers::get_global_telemetry_config; use tracing::{info, trace}; use super::node::RuntimeType; @@ -44,12 +45,21 @@ impl Container { let (cancel_sender, cancel_receiver) = tokio::sync::oneshot::channel(); let thread = thread::spawn(move || { - let span = tracing::span!( - tracing::Level::INFO, - "node", - name =% AuthorityPublicKeyBytes::from(config.protocol_key_pair().public()).concise(), - ); - let _guard = span.enter(); + let span = if get_global_telemetry_config() + .map(|c| c.enable_otlp_tracing) + .unwrap_or(false) + { + // we cannot have long-lived root spans when exporting trace data to otlp + None + } else { + Some(tracing::span!( + tracing::Level::INFO, + "node", + name =% AuthorityPublicKeyBytes::from(config.protocol_key_pair().public()).concise(), + )) + }; + + let _guard = span.as_ref().map(|span| span.enter()); let mut builder = match runtime { RuntimeType::SingleThreaded => tokio::runtime::Builder::new_current_thread(), @@ -63,7 +73,9 @@ impl Container { builder .on_thread_start(move || { SPAN.with(|maybe_entered_span| { - *maybe_entered_span.borrow_mut() = Some(span.clone().entered()); + if let Some(span) = &span { + *maybe_entered_span.borrow_mut() = Some(span.clone().entered()); + } }); }) .on_thread_stop(|| { diff --git a/crates/telemetry-subscribers/Cargo.toml b/crates/telemetry-subscribers/Cargo.toml index 0280a834c68f7..cb2341d8be037 100644 --- a/crates/telemetry-subscribers/Cargo.toml +++ b/crates/telemetry-subscribers/Cargo.toml @@ -17,10 +17,15 @@ tracing.workspace = true tracing-appender.workspace = true tracing-subscriber.workspace = true workspace-hack.workspace = true +opentelemetry = { version = "0.20.0", features = ["rt-tokio"], optional = true } +opentelemetry-otlp = { version = "0.13.0", features = ["grpc-tonic"], optional = true } +tracing-opentelemetry = { version = "0.21.0", optional = true } +tokio.workspace = 
true [features] -default = [] +default = ["otlp"] tokio-console = ["console-subscriber"] +otlp = ["tracing-opentelemetry", "opentelemetry", "opentelemetry-otlp"] [dev-dependencies] camino.workspace = true diff --git a/crates/telemetry-subscribers/README.md b/crates/telemetry-subscribers/README.md index 3661c5b13bf8e..d3487671fee81 100644 --- a/crates/telemetry-subscribers/README.md +++ b/crates/telemetry-subscribers/README.md @@ -27,7 +27,7 @@ You can also run the example and see output in ANSI color: cargo run --example easy-init ## Features -- `jaeger` - this feature is enabled by default as it enables jaeger tracing +- `otlp` - this feature is enabled by default as it enables otlp tracing - `json` - Bunyan formatter - JSON log output, optional - `tokio-console` - [Tokio-console](https://github.com/tokio-rs/console) subscriber, optional @@ -45,21 +45,27 @@ This output can easily be fed to backends such as ElasticSearch for indexing, al NOTE: JSON output requires the `json` crate feature to be enabled. -### Jaeger (seeing distributed traces) +### OTLP -To see nested spans visualized with [Jaeger](https://www.jaegertracing.io), do the following: +#### Tracing locally: -1. Run this to get a local Jaeger container: `docker run -d -p6831:6831/udp -p6832:6832/udp -p16686:16686 jaegertracing/all-in-one:latest` -2. Set `enable_jaeger` config setting to true or set `TOKIO_JAEGER` env var -3. Run your app -4. Browse to `http://localhost:16686/` and select the service you configured using `service_name` +1. In `docker/grafana-local` run `docker compose up` to start a local grafana instance. +2. Set `TRACE_FILTER=` - for local use `TRACE_FILTER=sui=trace,info` is a good place to start. +3. Start the sui-node or other process. +4. 
Go to http://localhost:3000 (or [http://localhost:3000/ with traces already filtered to sui-node](http://localhost:3000/explore?panes=%7B%22iHz%22:%7B%22datasource%22:%22tempo%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22datasource%22:%7B%22type%22:%22tempo%22,%22uid%22:%22tempo%22%7D,%22queryType%22:%22traceqlSearch%22,%22limit%22:20,%22filters%22:%5B%7B%22id%22:%22service-name%22,%22tag%22:%22service.name%22,%22operator%22:%22%3D%22,%22scope%22:%22resource%22,%22value%22:%5B%22sui-node%22%5D,%22valueType%22:%22string%22%7D,%7B%22id%22:%22span-name%22,%22tag%22:%22name%22,%22operator%22:%22%3D%22,%22scope%22:%22span%22,%22value%22:%5B%5D,%22valueType%22:%22string%22%7D,%7B%22id%22:%224f3681c5%22,%22operator%22:%22%3D%22,%22scope%22:%22span%22%7D%5D%7D%5D,%22range%22:%7B%22from%22:%22now-5m%22,%22to%22:%22now%22%7D%7D%7D&schemaVersion=1&orgId=1)) +5. Select `Tempo` as the data source. -NOTE: separate spans (which are not nested) are not connected as a single trace for now. +#### Tracing in production: -Jaeger subscriber is enabled by default but is protected by the jaeger feature flag. If you'd like to leave -out the Jaeger dependencies, you can turn off the default-features in your dependency: +Because tracing is expensive, it is not enabled by default. To enable trace exporting on a production machine: - telemetry = { url = "...", default-features = false } +1. Ensure the process was started with `TRACE_FILTER=off` - this enables the OTLP system but filters out all spans. + +2. Using the filter expression and duration of your choice, run: + + $ curl -X POST 'http://127.0.0.1:1337/enable-tracing?filter=sui-node=trace,info&duration=10s' + +Tracing will automatically be disabled after the specified duration has elapsed, in order to avoid leaving tracing on unintentionally. ### Automatic Prometheus span latencies @@ -69,13 +75,6 @@ span performance in production apps.
Enabling this layer can only be done programmatically, by passing in a Prometheus registry to `TelemetryConfig`. -### Span levels vs log levels - -What spans are included for Jaeger output, automatic span latencies, etc.? These are controlled by -the `span_level` config attribute, or the `TS_SPAN_LEVEL` environment variable. Note that this is -separate from `RUST_LOG`, so that you can separately control the logging verbosity from the level of -spans that are to be recorded and traced. - ### Live async inspection / Tokio Console [Tokio-console](https://github.com/tokio-rs/console) is an awesome CLI tool designed to analyze and help debug Rust apps using Tokio, in real time! It relies on a special subscriber. diff --git a/crates/telemetry-subscribers/src/lib.rs b/crates/telemetry-subscribers/src/lib.rs index 7fca3bb27d7b8..adb75f1143ce9 100644 --- a/crates/telemetry-subscribers/src/lib.rs +++ b/crates/telemetry-subscribers/src/lib.rs @@ -1,99 +1,20 @@ // Copyright (c) Mysten Labs, Inc. // SPDX-License-Identifier: Apache-2.0 -//! # Telemetry-subscribers -//! -//! This is a library for common telemetry functionality, especially subscribers for [Tokio tracing](https://github.com/tokio-rs/tracing) -//! libraries. Here we simply package many common subscribers, -//! common logs and metrics destinations, etc. into a easy to configure common package. There are also -//! some unique layers such as one to automatically create Prometheus latency histograms for spans. -//! -//! We also purposely separate out logging levels from span creation. This is often needed by production apps -//! as normally it is not desired to log at very high levels, but still desirable to gather sampled span data -//! all the way down to TRACE level spans. -//! -//! Getting started is easy. In your app: -//! -//! ```rust -//! use telemetry_subscribers::TelemetryConfig; -//! let (_guard, _handle) = TelemetryConfig::new().init(); -//! ``` -//! -//! 
It is important to retain the guard until the end of the program. Assign it in the main fn and keep it, -//! for once it drops then log output will stop. -//! -//! There is a builder API available: just do `TelemetryConfig::new()...` Another convenient initialization method -//! is `TelemetryConfig::new().with_env()` to populate the config from environment vars. -//! -//! You can also run the example and see output in ANSI color: -//! -//! ```bash -//! cargo run --example easy-init -//! ``` -//! -//! ## Features -//! - `json` - Bunyan formatter - JSON log output, optional -//! - `tokio-console` - [Tokio-console](https://github.com/tokio-rs/console) subscriber, optional -//! -//! ### Stdout vs file output -//! -//! By default, logs (but not spans) are formatted for human readability and output to stdout, with key-value tags at the end of every line. -//! `RUST_LOG` can be configured for custom logging output, including filtering. -//! -//! By setting `log_file` in the config, one can write log output to a daily-rotated file. -//! -//! ### Tracing and span output -//! -//! Detailed span start and end logs can be generated by defining the `json_log_output` config variable. Note that this causes all output to be in JSON format, which is not as human-readable, so it is not enabled by default. -//! This output can easily be fed to backends such as ElasticSearch for indexing, alerts, aggregation, and analysis. -//! -//! NOTE: JSON output requires the `json` crate feature to be enabled. -//! -//! ### Automatic Prometheus span latencies -//! -//! Included in this library is a tracing-subscriber layer named `PrometheusSpanLatencyLayer`. It will create -//! a Prometheus histogram to track latencies for every span in your app, which is super convenient for tracking -//! span performance in production apps. -//! -//! Enabling this layer can only be done programmatically, by passing in a Prometheus registry to `TelemetryConfig`. -//! -//! ### Span levels vs log levels -//! -//! 
What spans are included for automatic span latencies, etc.? These are controlled by -//! the `span_level` config attribute, or the `TS_SPAN_LEVEL` environment variable. Note that this is -//! separate from `RUST_LOG`, so that you can separately control the logging verbosity from the level of -//! spans that are to be recorded and traced. -//! -//! ### Live async inspection / Tokio Console -//! -//! [Tokio-console](https://github.com/tokio-rs/console) is an awesome CLI tool designed to analyze and help debug Rust apps using Tokio, in real time! It relies on a special subscriber. -//! -//! 1. Build your app using a special flag: `RUSTFLAGS="--cfg tokio_unstable" cargo build` -//! 2. Enable the `tokio-console` feature for this crate. -//! 2. Set the `tokio_console` config setting when running your app (or set TOKIO_CONSOLE env var if using config `with_env()` method) -//! 3. Clone the console repo and `cargo run` to launch the console -//! -//! NOTE: setting tokio TRACE logs is NOT necessary. It says that in the docs but there's no need to change Tokio logging levels at all. The console subscriber has a special filter enabled taking care of that. -//! -//! By default, Tokio console listens on port 6669. To change this setting as well as other setting such as -//! the retention policy, please see the [configuration](https://docs.rs/console-subscriber/latest/console_subscriber/struct.Builder.html#configuration) guide. -//! -//! ### Custom panic hook -//! -//! This library installs a custom panic hook which records a log (event) at ERROR level using the tracing -//! crate. This allows span information from the panic to be properly recorded as well. -//! -//! To exit the process on panic, set the `CRASH_ON_PANIC` environment variable. 
- use crossterm::tty::IsTty; +use once_cell::sync::Lazy; use span_latency_prom::PrometheusSpanLatencyLayer; +use std::time::Duration; use std::{ env, io::{stderr, Write}, str::FromStr, - sync::atomic::{AtomicUsize, Ordering}, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, Mutex, + }, }; -use tracing::Level; +use tracing::{error, info, Level}; use tracing::{metadata::LevelFilter, subscriber::Interest, Metadata, Subscriber}; use tracing_appender::non_blocking::{NonBlocking, WorkerGuard}; use tracing_subscriber::{ @@ -115,6 +36,7 @@ pub type BoxError = Box; /// - log_level: error/warn/info/debug/trace, defaults to info #[derive(Default, Clone, Debug)] pub struct TelemetryConfig { + pub enable_otlp_tracing: bool, /// Enables Tokio Console debugging on port 6669 pub tokio_console: bool, /// Output JSON logs. @@ -143,6 +65,19 @@ pub struct TelemetryGuards { worker_guard: WorkerGuard, } +impl TelemetryGuards { + fn new(config: TelemetryConfig, worker_guard: WorkerGuard) -> Self { + set_global_telemetry_config(config); + Self { worker_guard } + } +} + +impl Drop for TelemetryGuards { + fn drop(&mut self) { + clear_global_telemetry_config(); + } +} + #[derive(Clone, Debug)] pub struct FilterHandle(reload::Handle); @@ -160,6 +95,53 @@ impl FilterHandle { } } +pub struct Filters { + log: FilterHandle, + trace: Option, +} + +impl Filters { + pub fn update_log>(&self, directives: S) -> Result<(), BoxError> { + self.log.update(directives) + } + + pub fn get_log(&self) -> Result { + self.log.get() + } + + pub fn update_trace>( + &self, + directives: S, + duration: Duration, + ) -> Result<(), BoxError> { + if let Some(trace) = &self.trace { + let res = trace.update(directives); + // after duration is elapsed, reset to the env setting + let trace = trace.clone(); + let trace_filter_env = env::var("TRACE_FILTER").unwrap_or_else(|_| "off".to_string()); + tokio::spawn(async move { + tokio::time::sleep(duration).await; + if let Err(e) = trace.update(trace_filter_env) { + 
error!("failed to reset trace filter: {}", e); + } + }); + res + } else { + info!("tracing not enabled, ignoring update"); + Ok(()) + } + } + + pub fn reset_trace(&self) { + if let Some(trace) = &self.trace { + let trace_filter_env = env::var("TRACE_FILTER").unwrap_or_else(|_| "off".to_string()); + if let Err(e) = trace.update(trace_filter_env) { + error!("failed to reset trace filter: {}", e); + } + } + } +} + fn get_output(log_file: Option) -> (NonBlocking, WorkerGuard) { if let Some(logfile_prefix) = log_file { let file_appender = tracing_appender::rolling::daily("", logfile_prefix); @@ -209,9 +191,29 @@ fn set_panic_hook(crash_on_panic: bool) { })); } +static GLOBAL_CONFIG: Lazy>>> = + Lazy::new(|| Arc::new(Mutex::new(None))); + +fn set_global_telemetry_config(config: TelemetryConfig) { + let mut global_config = GLOBAL_CONFIG.lock().unwrap(); + assert!(global_config.is_none()); + *global_config = Some(config); +} + +fn clear_global_telemetry_config() { + let mut global_config = GLOBAL_CONFIG.lock().unwrap(); + *global_config = None; +} + +pub fn get_global_telemetry_config() -> Option { + let global_config = GLOBAL_CONFIG.lock().unwrap(); + global_config.clone() +} + impl TelemetryConfig { pub fn new() -> Self { Self { + enable_otlp_tracing: false, tokio_console: false, json_log_output: false, log_file: None, @@ -265,6 +267,10 @@ impl TelemetryConfig { self.crash_on_panic = true } + if env::var("TRACE_FILTER").is_ok() { + self.enable_otlp_tracing = true + } + if env::var("RUST_LOG_JSON").is_ok() { self.json_log_output = true; } @@ -293,8 +299,9 @@ impl TelemetryConfig { self } - pub fn init(self) -> (TelemetryGuards, FilterHandle) { + pub fn init(self) -> (TelemetryGuards, Filters) { let config = self; + let config_clone = config.clone(); // Setup an EnvFilter for filtering logging output layers. // NOTE: we don't want to use this to filter all layers. 
That causes problems for layers with @@ -304,7 +311,7 @@ impl TelemetryConfig { let env_filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(log_level)); let (log_filter, reload_handle) = reload::Layer::new(env_filter); - let filter_handle = FilterHandle(reload_handle); + let log_filter_handle = FilterHandle(reload_handle); // Separate span level filter. // This is a dumb filter for now - allows all spans that are below a given level. @@ -330,6 +337,42 @@ impl TelemetryConfig { layers.push(span_lat_layer.with_filter(span_filter.clone()).boxed()); } + let mut trace_filter_handle = None; + if config.enable_otlp_tracing { + use opentelemetry::sdk::{self, Resource}; + use opentelemetry_otlp::WithExportConfig; + + let endpoint = + env::var("OTLP_ENDPOINT").unwrap_or_else(|_| "http://localhost:4317".to_string()); + + let tracer = opentelemetry_otlp::new_pipeline() + .tracing() + .with_exporter( + opentelemetry_otlp::new_exporter() + .tonic() + .with_endpoint(endpoint), + ) + .with_trace_config(sdk::trace::config().with_resource(Resource::new(vec![ + opentelemetry::KeyValue::new("service.name", "sui-node"), + ]))) + .install_batch(sdk::runtime::Tokio) + .expect("Could not create async Tracer"); + + // Create a tracing subscriber with the configured tracer + let telemetry = tracing_opentelemetry::layer().with_tracer(tracer); + + // Enable Trace Contexts for tying spans together + opentelemetry::global::set_text_map_propagator( + opentelemetry::sdk::propagation::TraceContextPropagator::new(), + ); + + let trace_env_filter = EnvFilter::try_from_env("TRACE_FILTER").unwrap(); + let (trace_env_filter, reload_handle) = reload::Layer::new(trace_env_filter); + trace_filter_handle = Some(FilterHandle(reload_handle)); + + layers.push(telemetry.with_filter(trace_env_filter).boxed()); + } + let (nb_output, worker_guard) = get_output(config.log_file.clone()); if config.json_log_output { // Output to file or to stderr in a newline-delimited JSON format @@ -367,9 
+410,15 @@ impl TelemetryConfig { // The guard must be returned and kept in the main fn of the app, as when it's dropped then the output // gets flushed and closed. If this is dropped too early then no output will appear! - let guards = TelemetryGuards { worker_guard }; - - (guards, filter_handle) + let guards = TelemetryGuards::new(config_clone, worker_guard); + + ( + guards, + Filters { + log: log_filter_handle, + trace: trace_filter_handle, + }, + ) } } @@ -413,8 +462,6 @@ where /// Globally set a tracing subscriber suitable for testing environments pub fn init_for_testing() { - use once_cell::sync::Lazy; - static LOGGER: Lazy<()> = Lazy::new(|| { let subscriber = ::tracing_subscriber::FmtSubscriber::builder() .with_env_filter( diff --git a/crates/telemetry-subscribers/tests/reload.rs b/crates/telemetry-subscribers/tests/reload.rs index 1a0603b4d4e43..992f723616f14 100644 --- a/crates/telemetry-subscribers/tests/reload.rs +++ b/crates/telemetry-subscribers/tests/reload.rs @@ -18,10 +18,10 @@ fn reload() { info!("Should be able to see this"); debug!("This won't be captured"); - reload_handle.update("debug").unwrap(); + reload_handle.update_log("debug").unwrap(); debug!("Now you can see this!"); - debug!("{}", reload_handle.get().unwrap()); + debug!("{}", reload_handle.get_log().unwrap()); drop(guard); diff --git a/crates/workspace-hack/Cargo.toml b/crates/workspace-hack/Cargo.toml index f21eac8d79956..8e942f854c8c0 100644 --- a/crates/workspace-hack/Cargo.toml +++ b/crates/workspace-hack/Cargo.toml @@ -275,7 +275,7 @@ funty = { version = "1", default-features = false } futures = { version = "0.3", features = ["bilock", "unstable"] } futures-channel = { version = "0.3", features = ["sink", "unstable"] } futures-core = { version = "0.3", features = ["unstable"] } -futures-executor = { version = "0.3", default-features = false, features = ["std"] } +futures-executor = { version = "0.3" } futures-io = { version = "0.3", features = ["unstable"] } futures-lite = { 
version = "1" } futures-sink = { version = "0.3" } @@ -463,7 +463,14 @@ oid-registry = { version = "0.6", features = ["crypto", "x509"] } once_cell = { version = "1" } oorandom = { version = "11", default-features = false } opaque-debug = { version = "0.3", default-features = false } +opentelemetry = { version = "0.20", features = ["metrics", "rt-tokio"] } +opentelemetry-otlp = { version = "0.13" } +opentelemetry-proto = { version = "0.3", default-features = false, features = ["gen-tonic", "traces"] } +opentelemetry-semantic-conventions = { version = "0.12", default-features = false } +opentelemetry_api = { version = "0.20", features = ["logs", "metrics"] } +opentelemetry_sdk = { version = "0.20", features = ["logs", "metrics", "rt-tokio"] } option-ext = { version = "0.2", default-features = false } +ordered-float = { version = "3" } ouroboros = { version = "0.17" } outref = { version = "0.5", default-features = false } overload = { version = "0.1", default-features = false } @@ -517,7 +524,8 @@ prometheus = { version = "0.13" } prometheus-http-query = { version = "0.6", default-features = false, features = ["rustls-tls"] } prometheus-parse = { git = "https://github.com/asonnino/prometheus-parser.git", rev = "75334db", default-features = false } proptest = { version = "1" } -prost = { version = "0.12" } +prost-5ef9efb8ec2df382 = { package = "prost", version = "0.12" } +prost-a6292c17cd707f01 = { package = "prost", version = "0.11" } prost-types = { version = "0.12" } protobuf = { version = "2", default-features = false, features = ["with-bytes"] } quanta = { version = "0.11" } @@ -677,7 +685,8 @@ toml_datetime-d8f496e17d97b5cb = { package = "toml_datetime", version = "0.5", d toml_edit-3575ec1268b04181 = { package = "toml_edit", version = "0.15" } toml_edit-582f2526e08bb6a0 = { package = "toml_edit", version = "0.14", features = ["easy"] } toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19", features = ["serde"] } -tonic = { version = "0.10", 
features = ["tls"] } +tonic-274715c4dabd11b0 = { package = "tonic", version = "0.9" } +tonic-93f6ce9d446188ac = { package = "tonic", version = "0.10", features = ["tls"] } tonic-health = { version = "0.10" } tower = { version = "0.4", features = ["full"] } tower-http = { version = "0.3", features = ["full"] } @@ -687,6 +696,8 @@ tracing = { version = "0.1", features = ["log"] } tracing-appender = { version = "0.2", default-features = false } tracing-core = { version = "0.1" } tracing-error = { version = "0.2" } +tracing-log = { version = "0.1", default-features = false } +tracing-opentelemetry = { version = "0.21" } tracing-serde = { version = "0.1", default-features = false } tracing-subscriber-468e82937335b1c9 = { package = "tracing-subscriber", version = "0.3", default-features = false, features = ["ansi", "env-filter", "json", "smallvec", "time"] } tracing-subscriber-6f8ce4dd05d13bba = { package = "tracing-subscriber", version = "0.2", default-features = false } @@ -1050,7 +1061,7 @@ funty = { version = "1", default-features = false } futures = { version = "0.3", features = ["bilock", "unstable"] } futures-channel = { version = "0.3", features = ["sink", "unstable"] } futures-core = { version = "0.3", features = ["unstable"] } -futures-executor = { version = "0.3", default-features = false, features = ["std"] } +futures-executor = { version = "0.3" } futures-io = { version = "0.3", features = ["unstable"] } futures-lite = { version = "1" } futures-macro = { version = "0.3", default-features = false } @@ -1259,7 +1270,14 @@ oid-registry = { version = "0.6", features = ["crypto", "x509"] } once_cell = { version = "1" } oorandom = { version = "11", default-features = false } opaque-debug = { version = "0.3", default-features = false } +opentelemetry = { version = "0.20", features = ["metrics", "rt-tokio"] } +opentelemetry-otlp = { version = "0.13" } +opentelemetry-proto = { version = "0.3", default-features = false, features = ["gen-tonic", "traces"] } 
+opentelemetry-semantic-conventions = { version = "0.12", default-features = false } +opentelemetry_api = { version = "0.20", features = ["logs", "metrics"] } +opentelemetry_sdk = { version = "0.20", features = ["logs", "metrics", "rt-tokio"] } option-ext = { version = "0.2", default-features = false } +ordered-float = { version = "3" } ouroboros = { version = "0.17" } ouroboros_macro = { version = "0.17", default-features = false, features = ["std"] } outref = { version = "0.5", default-features = false } @@ -1335,9 +1353,11 @@ prometheus-http-query = { version = "0.6", default-features = false, features = prometheus-parse = { git = "https://github.com/asonnino/prometheus-parser.git", rev = "75334db", default-features = false } proptest = { version = "1" } proptest-derive = { version = "0.3", default-features = false } -prost = { version = "0.12" } +prost-5ef9efb8ec2df382 = { package = "prost", version = "0.12" } +prost-a6292c17cd707f01 = { package = "prost", version = "0.11" } prost-build = { version = "0.12" } -prost-derive = { version = "0.12", default-features = false } +prost-derive-5ef9efb8ec2df382 = { package = "prost-derive", version = "0.12", default-features = false } +prost-derive-a6292c17cd707f01 = { package = "prost-derive", version = "0.11", default-features = false } prost-types = { version = "0.12" } protobuf = { version = "2", default-features = false, features = ["with-bytes"] } quanta = { version = "0.11" } @@ -1523,7 +1543,8 @@ toml_datetime-d8f496e17d97b5cb = { package = "toml_datetime", version = "0.5", d toml_edit-3575ec1268b04181 = { package = "toml_edit", version = "0.15" } toml_edit-582f2526e08bb6a0 = { package = "toml_edit", version = "0.14", features = ["easy"] } toml_edit-cdcf2f9584511fe6 = { package = "toml_edit", version = "0.19", features = ["serde"] } -tonic = { version = "0.10", features = ["tls"] } +tonic-274715c4dabd11b0 = { package = "tonic", version = "0.9" } +tonic-93f6ce9d446188ac = { package = "tonic", version = "0.10", 
features = ["tls"] } tonic-build = { version = "0.10" } tonic-health = { version = "0.10" } toolchain_find = { version = "0.2", default-features = false } @@ -1536,6 +1557,8 @@ tracing-appender = { version = "0.2", default-features = false } tracing-attributes = { version = "0.1", default-features = false } tracing-core = { version = "0.1" } tracing-error = { version = "0.2" } +tracing-log = { version = "0.1", default-features = false } +tracing-opentelemetry = { version = "0.21" } tracing-serde = { version = "0.1", default-features = false } tracing-subscriber-468e82937335b1c9 = { package = "tracing-subscriber", version = "0.3", default-features = false, features = ["ansi", "env-filter", "json", "smallvec", "time"] } tracing-subscriber-6f8ce4dd05d13bba = { package = "tracing-subscriber", version = "0.2", default-features = false } diff --git a/docker/grafana-local/docker-compose.yaml b/docker/grafana-local/docker-compose.yaml new file mode 100644 index 0000000000000..c09f7af4c51da --- /dev/null +++ b/docker/grafana-local/docker-compose.yaml @@ -0,0 +1,39 @@ +version: "3" +services: + tempo: + image: grafana/tempo:latest + command: [ "-config.file=/etc/tempo.yaml" ] + volumes: + - ./tempo.yaml:/etc/tempo.yaml + - ${TMPDIR}/tempo-data:/tmp/tempo + ports: + - "14268:14268" # jaeger ingest + - "3200:3200" # tempo + - "9095:9095" # tempo grpc + - "4317:4317" # otlp grpc + - "4318:4318" # otlp http + - "9411:9411" # zipkin + + prometheus: + image: prom/prometheus:latest + command: + - --config.file=/etc/prometheus.yaml + - --web.enable-remote-write-receiver + - --enable-feature=exemplar-storage + volumes: + - ./prometheus.yaml:/etc/prometheus.yaml + ports: + - "9090:9090" + - "9184:9184" + + grafana: + image: grafana/grafana:10.1.1 + volumes: + - ./grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml + environment: + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + - GF_AUTH_DISABLE_LOGIN_FORM=true + - 
GF_FEATURE_TOGGLES_ENABLE=traceqlEditor + ports: + - "3000:3000" diff --git a/docker/grafana-local/grafana-datasources.yaml b/docker/grafana-local/grafana-datasources.yaml new file mode 100644 index 0000000000000..4a3bc2c4e7423 --- /dev/null +++ b/docker/grafana-local/grafana-datasources.yaml @@ -0,0 +1,30 @@ +apiVersion: 1 + +datasources: +- name: Prometheus + type: prometheus + uid: prometheus + access: proxy + orgId: 1 + url: http://prometheus:9090 + basicAuth: false + isDefault: false + version: 1 + editable: false + jsonData: + httpMethod: GET +- name: Tempo + type: tempo + access: proxy + orgId: 1 + url: http://tempo:3200 + basicAuth: false + isDefault: true + version: 1 + editable: false + apiVersion: 1 + uid: tempo + jsonData: + httpMethod: GET + serviceMap: + datasourceUid: prometheus diff --git a/docker/grafana-local/prometheus.yaml b/docker/grafana-local/prometheus.yaml new file mode 100644 index 0000000000000..5ad755595a658 --- /dev/null +++ b/docker/grafana-local/prometheus.yaml @@ -0,0 +1,11 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: [ 'localhost:9090', 'host.docker.internal:9184' ] + - job_name: 'tempo' + static_configs: + - targets: [ 'tempo:3200' ] diff --git a/docker/grafana-local/tempo.yaml b/docker/grafana-local/tempo.yaml new file mode 100644 index 0000000000000..df9eb3a530cf1 --- /dev/null +++ b/docker/grafana-local/tempo.yaml @@ -0,0 +1,55 @@ +server: + http_listen_port: 3200 + +query_frontend: + search: + duration_slo: 5s + throughput_bytes_slo: 1.073741824e+09 + trace_by_id: + duration_slo: 5s + +distributor: + receivers: # this configuration will listen on all ports and protocols that tempo is capable of. + jaeger: # the receives all come from the OpenTelemetry collector. 
more configuration information can + protocols: # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver + thrift_http: # + grpc: # for a production deployment you should only enable the receivers you need! + thrift_binary: + thrift_compact: + zipkin: + otlp: + protocols: + http: + grpc: + opencensus: + +ingester: + max_block_duration: 5m # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally + +compactor: + compaction: + block_retention: 1h # overall Tempo trace retention. set for demo purposes + +metrics_generator: + registry: + external_labels: + source: tempo + cluster: docker-compose + storage: + path: /tmp/tempo/generator/wal + remote_write: + - url: http://prometheus:9090/api/v1/write + send_exemplars: true + +storage: + trace: + backend: local # backend configuration to use + wal: + path: /tmp/tempo/wal # where to store the the wal locally + local: + path: /tmp/tempo/blocks + +overrides: + defaults: + metrics_generator: + processors: [service-graphs, span-metrics] # enables metrics generator diff --git a/sui-execution/latest/sui-adapter/src/execution_engine.rs b/sui-execution/latest/sui-adapter/src/execution_engine.rs index 29bf0cd2781a6..4e53abcfd19f5 100644 --- a/sui-execution/latest/sui-adapter/src/execution_engine.rs +++ b/sui-execution/latest/sui-adapter/src/execution_engine.rs @@ -473,6 +473,7 @@ mod checked { Ok(()) } + #[instrument(level = "debug", skip_all)] fn execution_loop( temporary_store: &mut TemporaryStore<'_>, transaction_kind: TransactionKind, diff --git a/sui-execution/latest/sui-adapter/src/programmable_transactions/context.rs b/sui-execution/latest/sui-adapter/src/programmable_transactions/context.rs index 6ad227fbee178..7b4138f9cb839 100644 --- a/sui-execution/latest/sui-adapter/src/programmable_transactions/context.rs +++ b/sui-execution/latest/sui-adapter/src/programmable_transactions/context.rs @@ -69,6 +69,7 @@ mod checked 
{ execution_mode::ExecutionMode, execution_status::CommandArgumentError, }; + use tracing::instrument; /// Maintains all runtime state specific to programmable transactions pub struct ExecutionContext<'vm, 'state, 'a> { @@ -121,6 +122,7 @@ mod checked { } impl<'vm, 'state, 'a> ExecutionContext<'vm, 'state, 'a> { + #[instrument(name = "ExecutionContext::new", level = "trace", skip_all)] pub fn new( protocol_config: &'a ProtocolConfig, metrics: Arc, diff --git a/sui-execution/latest/sui-adapter/src/programmable_transactions/execution.rs b/sui-execution/latest/sui-adapter/src/programmable_transactions/execution.rs index b2bb71011af52..0247ec74492aa 100644 --- a/sui-execution/latest/sui-adapter/src/programmable_transactions/execution.rs +++ b/sui-execution/latest/sui-adapter/src/programmable_transactions/execution.rs @@ -61,6 +61,7 @@ mod checked { private_generics::{EVENT_MODULE, PRIVATE_TRANSFER_FUNCTIONS, TRANSFER_MODULE}, INIT_FN_NAME, }; + use tracing::instrument; use crate::adapter::substitute_package_id; use crate::programmable_transactions::context::*; @@ -113,6 +114,7 @@ mod checked { } /// Execute a single command + #[instrument(level = "trace", skip_all)] fn execute_command( context: &mut ExecutionContext<'_, '_, '_>, mode_results: &mut Mode::ExecutionResults,