diff --git a/datafusion-cli/README.md b/datafusion-cli/README.md index 05029a018363..6c0cf63ae08f 100644 --- a/datafusion-cli/README.md +++ b/datafusion-cli/README.md @@ -65,6 +65,41 @@ DataFusion CLI v12.0.0 1 row in set. Query took 0.017 seconds. ``` +## Querying S3 Data Sources + +The CLI can query data in S3 if the following environment variables are defined: + +- `AWS_REGION` +- `AWS_ACCESS_KEY_ID` +- `AWS_SECRET_ACCESS_KEY` + +Note that the region must be set to the region where the bucket exists until the following issue is resolved: + +- https://github.com/apache/arrow-rs/issues/2795 + +Example: + +```bash +$ aws s3 cp test.csv s3://my-bucket/ +upload: ./test.csv to s3://my-bucket/test.csv + +$ export AWS_REGION=us-east-1 +$ export AWS_SECRET_ACCESS_KEY=*************************** +$ export AWS_ACCESS_KEY_ID=************** + +$ ./target/release/datafusion-cli +DataFusion CLI v12.0.0 +❯ create external table test stored as csv location 's3://my-bucket/test.csv'; +0 rows in set. Query took 0.374 seconds. +❯ select * from test; ++----------+----------+ +| column_1 | column_2 | ++----------+----------+ +| 1 | 2 | ++----------+----------+ +1 row in set. Query took 0.171 seconds. 
+``` + ## DataFusion-Cli Build the `datafusion-cli` by `cd` into the sub-directory: diff --git a/datafusion-cli/src/object_storage.rs b/datafusion-cli/src/object_storage.rs index 4d21e84a17c3..19993e751c26 100644 --- a/datafusion-cli/src/object_storage.rs +++ b/datafusion-cli/src/object_storage.rs @@ -138,8 +138,8 @@ mod tests { .unwrap_err(); assert!(err.to_string().contains("Generic S3 error: Missing region")); - env::set_var("AWS_DEFAULT_REGION", "us-east-1"); + env::set_var("AWS_REGION", "us-east-1"); assert!(provider.get_by_url(&Url::from_str(s3).unwrap()).is_ok()); - env::remove_var("AWS_DEFAULT_REGION"); + env::remove_var("AWS_REGION"); } } diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 5d2150848afb..c1feb1542fbd 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -39,6 +39,7 @@ async-trait = "0.1.41" datafusion = { path = "../datafusion/core" } futures = "0.3" num_cpus = "1.13.0" +object_store = { version = "0.5.0", features = ["aws"] } prost = "0.11.0" serde = { version = "1.0.136", features = ["derive"] } serde_json = "1.0.82" diff --git a/datafusion-examples/examples/query-aws-s3.rs b/datafusion-examples/examples/query-aws-s3.rs new file mode 100644 index 000000000000..5969eb73e026 --- /dev/null +++ b/datafusion-examples/examples/query-aws-s3.rs @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion::error::Result; +use datafusion::prelude::*; +use object_store::aws::AmazonS3Builder; +use std::env; +use std::sync::Arc; + +/// This example demonstrates querying data in an S3 bucket. +/// +/// The following environment variables must be defined: +/// +/// - AWS_ACCESS_KEY_ID +/// - AWS_SECRET_ACCESS_KEY +/// +#[tokio::main] +async fn main() -> Result<()> { + let ctx = SessionContext::new(); + + // the region must be set to the region where the bucket exists until the following + // issue is resolved + // https://github.com/apache/arrow-rs/issues/2795 + let region = "us-east-1"; + let bucket_name = "nyc-tlc"; + + let s3 = AmazonS3Builder::new() + .with_bucket_name(bucket_name) + .with_region(region) + .with_access_key_id(env::var("AWS_ACCESS_KEY_ID").unwrap()) + .with_secret_access_key(env::var("AWS_SECRET_ACCESS_KEY").unwrap()) + .build()?; + + ctx.runtime_env() + .register_object_store("s3", bucket_name, Arc::new(s3)); + + // cannot query the parquet files from this bucket because the path contains a whitespace + // and we don't support that yet + // https://github.com/apache/arrow-rs/issues/2799 + let path = format!( + "s3://{}/csv_backup/yellow_tripdata_2022-02.csv", + bucket_name + ); + ctx.register_csv("trips", &path, CsvReadOptions::default()) + .await?; + + // execute the query + let df = ctx.sql("SELECT * FROM trips LIMIT 10").await?; + + // print the results + df.show().await?; + + Ok(()) +} diff --git a/docs/source/user-guide/cli.md b/docs/source/user-guide/cli.md index 4299990c0903..e692f4adc7bd 
100644 --- a/docs/source/user-guide/cli.md +++ b/docs/source/user-guide/cli.md @@ -151,6 +151,41 @@ STORED AS CSV LOCATION '/path/to/aggregate_test_100.csv'; ``` +## Querying S3 Data Sources + +The CLI can query data in S3 if the following environment variables are defined: + +- `AWS_REGION` +- `AWS_ACCESS_KEY_ID` +- `AWS_SECRET_ACCESS_KEY` + +Note that the region must be set to the region where the bucket exists until the following issue is resolved: + +- https://github.com/apache/arrow-rs/issues/2795 + +Example: + +```bash +$ aws s3 cp test.csv s3://my-bucket/ +upload: ./test.csv to s3://my-bucket/test.csv + +$ export AWS_REGION=us-east-2 +$ export AWS_SECRET_ACCESS_KEY=*************************** +$ export AWS_ACCESS_KEY_ID=************** + +$ ./target/release/datafusion-cli +DataFusion CLI v12.0.0 +❯ create external table test stored as csv location 's3://my-bucket/test.csv'; +0 rows in set. Query took 0.374 seconds. +❯ select * from test; ++----------+----------+ +| column_1 | column_2 | ++----------+----------+ +| 1 | 2 | ++----------+----------+ +1 row in set. Query took 0.171 seconds. +``` + ## Commands Available commands inside DataFusion CLI are: