-
Notifications
You must be signed in to change notification settings - Fork 1.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(postgres sink): Add postgres sink #21248
base: master
Are you sure you want to change the base?
Changes from all commits
f47611a
0a0576b
bf2b7af
2ecbd56
a3be3ea
cee4b39
548e996
ea82ad4
ae68fc4
89e63e3
f68f04b
f4cd4c7
9d78d67
984750d
5b1ca45
21f3ad3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -543,6 +543,7 @@ spencergilbert | |
spinlock | ||
SPOF | ||
spog | ||
sqlx | ||
srcaddr | ||
srcport | ||
SREs | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Add a new `postgres` sink which allows sending log events to a PostgreSQL database. | ||
|
||
authors: jorgehermo9 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
use futures::FutureExt; | ||
use tower::ServiceBuilder; | ||
use vector_lib::{ | ||
config::AcknowledgementsConfig, | ||
configurable::{component::GenerateConfig, configurable_component}, | ||
sink::VectorSink, | ||
}; | ||
|
||
use super::{ | ||
service::{PostgresRetryLogic, PostgresService}, | ||
sink::PostgresSink, | ||
}; | ||
use sqlx::{postgres::PgPoolOptions, Pool, Postgres}; | ||
|
||
use crate::{ | ||
config::{Input, SinkConfig, SinkContext}, | ||
sinks::{ | ||
util::{ | ||
BatchConfig, RealtimeSizeBasedDefaultBatchSettings, ServiceBuilderExt, | ||
TowerRequestConfig, UriSerde, | ||
}, | ||
Healthcheck, | ||
}, | ||
}; | ||
|
||
const fn default_pool_size() -> u32 { | ||
5 | ||
} | ||
|
||
/// Configuration for the `postgres` sink. | ||
#[configurable_component(sink("postgres", "Deliver log data to a PostgreSQL database."))] | ||
#[derive(Clone, Debug)] | ||
#[serde(deny_unknown_fields)] | ||
pub struct PostgresConfig { | ||
// TODO: if I used UriSerde instead of String, I couldn't get a url string to use | ||
// in the connection pool, as the password would be redacted with UriSerde::to_string | ||
/// The connection string for the PostgreSQL server. It can contain the username and password. | ||
pub endpoint: String, | ||
|
||
/// The table that data is inserted into. | ||
pub table: String, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we make the table templatable? Like the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is a nice feature but not a must-have, we can do this incrementally. Once we finalized the rest of the comments we can come back to this if you are motivated to add this feature. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay! |
||
|
||
/// The postgres connection pool size. See [this](https://docs.rs/sqlx/latest/sqlx/struct.Pool.html#why-use-a-pool) for more | ||
/// information about why a connection pool should be used. | ||
#[serde(default = "default_pool_size")] | ||
pub pool_size: u32, | ||
|
||
#[configurable(derived)] | ||
#[serde(default)] | ||
pub batch: BatchConfig<RealtimeSizeBasedDefaultBatchSettings>, | ||
|
||
#[configurable(derived)] | ||
#[serde(default)] | ||
pub request: TowerRequestConfig, | ||
|
||
#[configurable(derived)] | ||
#[serde( | ||
default, | ||
deserialize_with = "crate::serde::bool_or_struct", | ||
skip_serializing_if = "crate::serde::is_default" | ||
)] | ||
pub acknowledgements: AcknowledgementsConfig, | ||
} | ||
|
||
impl GenerateConfig for PostgresConfig { | ||
fn generate_config() -> toml::Value { | ||
toml::from_str( | ||
r#"endpoint = "postgres://user:password@localhost/default" | ||
table = "default" | ||
"#, | ||
) | ||
.unwrap() | ||
} | ||
} | ||
|
||
#[async_trait::async_trait] | ||
#[typetag::serde(name = "postgres")] | ||
impl SinkConfig for PostgresConfig { | ||
async fn build(&self, _cx: SinkContext) -> crate::Result<(VectorSink, Healthcheck)> { | ||
// TODO: make connection pool configurable. Or should we just have one connection per sink? | ||
// TODO: it seems that the number of connections in the pool does not affect the throughput of the sink | ||
// does the sink execute batches in parallel? | ||
let connection_pool = PgPoolOptions::new() | ||
.max_connections(self.pool_size) | ||
.connect(&self.endpoint) | ||
.await?; | ||
|
||
let healthcheck = healthcheck(connection_pool.clone()).boxed(); | ||
|
||
let batch_settings = self.batch.into_batcher_settings()?; | ||
let request_settings = self.request.into_settings(); | ||
|
||
let endpoint_uri: UriSerde = self.endpoint.parse()?; | ||
let service = PostgresService::new( | ||
connection_pool, | ||
self.table.clone(), | ||
// TODO: this endpoint is used for metrics' tags. It could contain passwords, | ||
// will it be redacted there? | ||
endpoint_uri.to_string(), | ||
); | ||
let service = ServiceBuilder::new() | ||
.settings(request_settings, PostgresRetryLogic) | ||
.service(service); | ||
|
||
let sink = PostgresSink::new(service, batch_settings); | ||
|
||
Ok((VectorSink::from_event_streamsink(sink), healthcheck)) | ||
} | ||
|
||
// TODO: allow for Input::all() | ||
fn input(&self) -> Input { | ||
Input::log() | ||
} | ||
|
||
fn acknowledgements(&self) -> &AcknowledgementsConfig { | ||
&self.acknowledgements | ||
} | ||
} | ||
|
||
async fn healthcheck(connection_pool: Pool<Postgres>) -> crate::Result<()> { | ||
sqlx::query("SELECT 1").execute(&connection_pool).await?; | ||
Ok(()) | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use super::*; | ||
|
||
#[test] | ||
fn generate_config() { | ||
crate::test_util::test_generate_config::<PostgresConfig>(); | ||
} | ||
|
||
#[test] | ||
fn parse_config() { | ||
let cfg = toml::from_str::<PostgresConfig>( | ||
r#" | ||
endpoint = "postgres://user:password@localhost/default" | ||
table = "mytable" | ||
"#, | ||
) | ||
.unwrap(); | ||
assert_eq!(cfg.endpoint, "postgres://user:password@localhost/default"); | ||
assert_eq!(cfg.table, "mytable"); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
use crate::{ | ||
config::{SinkConfig, SinkContext}, | ||
sinks::{ | ||
postgres::PostgresConfig, | ||
util::{test::load_sink, UriSerde}, | ||
}, | ||
test_util::{components::run_and_assert_sink_compliance, random_string, trace_init}, | ||
}; | ||
use futures::stream; | ||
use serde::{Deserialize, Serialize}; | ||
use sqlx::{Connection, FromRow, PgConnection}; | ||
use std::future::ready; | ||
use vector_lib::event::{BatchNotifier, BatchStatus, BatchStatusReceiver, Event, LogEvent}; | ||
|
||
fn pg_host() -> String { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Copied those utility functions from the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's do some small refactoring here.
|
||
std::env::var("PG_HOST").unwrap_or_else(|_| "localhost".into()) | ||
} | ||
|
||
fn pg_url() -> String { | ||
std::env::var("PG_URL") | ||
.unwrap_or_else(|_| format!("postgres://vector:vector@{}/postgres", pg_host())) | ||
} | ||
|
||
fn gen_table() -> String { | ||
format!("test_{}", random_string(10).to_lowercase()) | ||
} | ||
|
||
fn make_event() -> (Event, BatchStatusReceiver) { | ||
let (batch, receiver) = BatchNotifier::new_with_receiver(); | ||
let mut event = LogEvent::from("raw log line").with_batch_notifier(&batch); | ||
event.insert("host", "example.com"); | ||
let event_payload = event.clone().into_parts().0; | ||
event.insert("payload", event_payload); | ||
(event.into(), receiver) | ||
} | ||
|
||
#[derive(Debug, Serialize, Deserialize, FromRow)] | ||
struct TestEvent { | ||
host: String, | ||
timestamp: String, | ||
message: String, | ||
payload: serde_json::Value, | ||
} | ||
|
||
async fn prepare_config() -> (String, String, PgConnection) { | ||
trace_init(); | ||
|
||
let table = gen_table(); | ||
let endpoint = pg_url(); | ||
let _endpoint: UriSerde = endpoint.parse().unwrap(); | ||
|
||
let cfg = format!( | ||
r#" | ||
endpoint = "{endpoint}" | ||
table = "{table}" | ||
batch.max_events = 1 | ||
"#, | ||
); | ||
|
||
let connection = PgConnection::connect(&endpoint) | ||
.await | ||
.expect("Failed to connect to Postgres"); | ||
|
||
(cfg, table, connection) | ||
} | ||
|
||
async fn insert_event_with_cfg(cfg: String, table: String, mut connection: PgConnection) { | ||
// We store the timestamp as text and not as `timestamp with timezone` postgres type due to | ||
// postgres not supporting nanosecond-resolution (it does support microsecond-resolution). | ||
let create_table_sql = | ||
format!("CREATE TABLE IF NOT EXISTS {table} (host text, timestamp text, message text, payload jsonb)",); | ||
sqlx::query(&create_table_sql) | ||
.execute(&mut connection) | ||
.await | ||
.unwrap(); | ||
|
||
let (config, _) = load_sink::<PostgresConfig>(&cfg).unwrap(); | ||
let (sink, _hc) = config.build(SinkContext::default()).await.unwrap(); | ||
|
||
let (input_event, mut receiver) = make_event(); | ||
run_and_assert_sink_compliance( | ||
sink, | ||
stream::once(ready(input_event.clone())), | ||
&["endpoint", "protocol"], | ||
) | ||
.await; | ||
|
||
let select_all_sql = format!("SELECT * FROM {table}"); | ||
let events: Vec<TestEvent> = sqlx::query_as(&select_all_sql) | ||
.fetch_all(&mut connection) | ||
.await | ||
.unwrap(); | ||
dbg!(&events); | ||
assert_eq!(1, events.len()); | ||
|
||
// drop input_event after comparing with response | ||
{ | ||
let log_event = input_event.into_log(); | ||
let expected = serde_json::to_value(&log_event).unwrap(); | ||
let actual = serde_json::to_value(&events[0]).unwrap(); | ||
assert_eq!(expected, actual); | ||
} | ||
|
||
assert_eq!(receiver.try_recv(), Ok(BatchStatus::Delivered)); | ||
} | ||
|
||
#[tokio::test] | ||
async fn test_postgres_sink() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think just a single test is too little.. But I couldn't figure out anything else to test. This test is very similar to the integration tests from There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are some more interesting things we can do here, at the very least send more than one events. Also, we could test failures such as sending a badly formatted payload. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay, going to include more tests! |
||
let (cfg, table, connection) = prepare_config().await; | ||
insert_event_with_cfg(cfg, table, connection).await; | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
mod config; | ||
#[cfg(all(test, feature = "postgres-integration-tests"))] | ||
mod integration_tests; | ||
mod service; | ||
mod sink; | ||
|
||
pub use self::config::PostgresConfig; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should I call this sink
postgres
or `postgres_logs`
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm good question, could this evolve to handle both logs and metrics in the future?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm wondering whether this could evolve to integrate with other Postgres flavours such as TimescaleDB, which is oriented towards time series.
My thoughts on this: #21308 (comment)
Timescaledb tracking issue: #939
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be interesting to change the input
vector/src/sinks/postgres/config.rs
Line 110 in 5b1ca45