From b4ecac13bfdde33fc864112ae3628614e3aa0332 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 15 Aug 2021 12:10:50 -0600 Subject: [PATCH 01/13] Add Ballista support to DataFusion CLI --- datafusion-cli/Cargo.toml | 5 ++- datafusion-cli/README.md | 53 ++++++++++++++++++++++++ datafusion-cli/src/lib.rs | 4 ++ datafusion-cli/src/main.rs | 83 +++++++++++++++++++++++++++++--------- 4 files changed, 124 insertions(+), 21 deletions(-) create mode 100644 datafusion-cli/README.md diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index fda9271876aa..7aad7b6a1364 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -20,7 +20,7 @@ name = "datafusion-cli" version = "4.0.0-SNAPSHOT" authors = ["Apache Arrow "] edition = "2018" -keywords = [ "arrow", "query", "sql", "cli", "repl" ] +keywords = [ "arrow", "datafusion", "ballista", "query", "sql", "cli", "repl" ] license = "Apache-2.0" homepage = "https://github.com/apache/arrow-datafusion" repository = "https://github.com/apache/arrow-datafusion" @@ -31,4 +31,5 @@ clap = "2.33" rustyline = "8.0" tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } datafusion = { path = "../datafusion" } -arrow = { version = "5.0" } +ballista = { path = "../ballista/rust/client" } +arrow = { version = "5.0" } \ No newline at end of file diff --git a/datafusion-cli/README.md b/datafusion-cli/README.md new file mode 100644 index 000000000000..31476be12731 --- /dev/null +++ b/datafusion-cli/README.md @@ -0,0 +1,53 @@ +# DataFusion Command-line Interface + +The DataFusion CLI allows SQL queries to be executed by an in-process DataFusion context, or by a distributed +Ballista context. + +``` +USAGE: + datafusion-cli [FLAGS] [OPTIONS] + +FLAGS: + -h, --help Prints help information + -q, --quiet Reduce printing other than the results and work quietly + -V, --version Prints version information + +OPTIONS: + -c, --batch-size The batch size of each query, or use DataFusion default + -p, --data-path Path to your data, default to current directory + -f, --file ... Execute commands from file(s), then exit + --format Output format [default: table] [possible values: csv, tsv, table, json, ndjson] + --host Ballista scheduler host + --port Ballista scheduler port +``` + +## Example + +Create a CSV file to query. + +```bash +$ echo "1,2" > data.csv +``` + +```sql +$ datafusion-cli + +DataFusion CLI v4.0.0-SNAPSHOT + +> CREATE EXTERNAL TABLE foo (a INT, b INT) STORED AS CSV LOCATION 'data.csv'; +0 rows in set. Query took 0.001 seconds. + +> SELECT * FROM foo; ++---+---+ +| a | b | ++---+---+ +| 1 | 2 | ++---+---+ +1 row in set. Query took 0.017 seconds. +``` + +## Ballista + +```bash +datafusion-cli --host localhost --port 50050 +``` \ No newline at end of file diff --git a/datafusion-cli/src/lib.rs b/datafusion-cli/src/lib.rs index 5b110d315364..fe03eb0a6828 100644 --- a/datafusion-cli/src/lib.rs +++ b/datafusion-cli/src/lib.rs @@ -14,6 +14,10 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. + +#![allow(unused_imports)] +pub const DATAFUSION_CLI_VERSION: &str = env!("CARGO_PKG_VERSION"); + pub mod print_format; use datafusion::arrow::record_batch::RecordBatch; diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index 7742051dd352..b6ba4cee1912 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -17,14 +17,6 @@ #![allow(bare_trait_objects)] -use clap::{crate_version, App, Arg}; -use datafusion::error::Result; -use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; -use datafusion_cli::{ - print_format::{all_print_formats, PrintFormat}, - PrintOptions, -}; -use rustyline::Editor; use std::env; use std::fs::File; use std::io::prelude::*; @@ -32,8 +24,27 @@ use std::io::BufReader; use std::path::Path; use std::time::Instant; +use ballista::context::BallistaContext; +use ballista::prelude::BallistaConfig; +use clap::{crate_version, App, Arg}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::execution::context::{ExecutionConfig, ExecutionContext}; +use datafusion_cli::{ + print_format::{all_print_formats, PrintFormat}, + PrintOptions, DATAFUSION_CLI_VERSION, +}; +use rustyline::Editor; + +/// The CLI supports using a local DataFusion context or a distributed BallistaContext +enum Context { + /// In-process execution with DataFusion + Local(ExecutionContext), + /// Distributed execution with Ballista + Remote(BallistaContext), +} + #[tokio::main] -pub async fn main() { +pub async fn main() -> Result<()> { let matches = App::new("DataFusion") .version(crate_version!()) .about( @@ -82,6 +93,18 @@ pub async fn main() { ) .takes_value(true), ) + .arg( + Arg::with_name("host") + .help("Ballista scheduler host") + .long("host") + .takes_value(true), + ) + .arg( + Arg::with_name("port") + .help("Ballista scheduler port") + .long("port") + .takes_value(true), + ) .arg( Arg::with_name("quiet") .help("Reduce printing other than the results and work quietly") @@ -91,6 +114,13 @@ pub async fn main() { ) .get_matches(); + println!("DataFusion CLI v{}\n", DATAFUSION_CLI_VERSION); + + let host = matches.value_of("host"); + let port = matches + .value_of("port") + .and_then(|port| port.parse::().ok()); + if let Some(path) = matches.value_of("data-path") { let p = Path::new(path); env::set_current_dir(&p).unwrap(); @@ -105,6 +135,18 @@ pub async fn main() { execution_config = execution_config.with_batch_size(batch_size); }; + let ctx: Result = match (host, port) { + (Some(h), Some(p)) => { + let config: BallistaConfig = BallistaConfig::new() + .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; + Ok(Context::Remote(BallistaContext::remote(h, p, &config))) + } + _ => Ok(Context::Local(ExecutionContext::with_config( + execution_config.clone(), + ))), + }; + let mut ctx = ctx?; + let format = matches .value_of("format") .expect("No format is specified") @@ -118,18 +160,19 @@ pub async fn main() { let files = file_paths .map(|file_path| File::open(file_path).unwrap()) .collect::>(); - let mut ctx = ExecutionContext::with_config(execution_config); for file in files { let mut reader = BufReader::new(file); exec_from_lines(&mut ctx, &mut reader, print_options.clone()).await; } } else { - exec_from_repl(execution_config, print_options).await; + exec_from_repl(&mut ctx, print_options).await; } + + Ok(()) } async fn exec_from_lines( - ctx: &mut ExecutionContext, + ctx: &mut Context, reader: &mut BufReader, print_options: PrintOptions, ) { @@ -168,9 +211,7 @@ async fn exec_from_lines( } } -async fn exec_from_repl(execution_config: ExecutionConfig, print_options: PrintOptions) { - let mut ctx = ExecutionContext::with_config(execution_config); - +async fn exec_from_repl(ctx: &mut Context, print_options: PrintOptions) { let mut rl = Editor::<()>::new(); rl.load_history(".history").ok(); @@ -186,7 +227,7 @@ async fn exec_from_repl(execution_config: ExecutionConfig, print_options: PrintO Ok(ref line) if line.trim_end().ends_with(';') => { query.push_str(line.trim_end()); rl.add_history_entry(query.clone()); - match exec_and_print(&mut ctx, print_options.clone(), query).await { + match exec_and_print(ctx, print_options.clone(), query).await { Ok(_) => {} Err(err) => println!("{:?}", err), } @@ -234,15 +275,19 @@ fn is_exit_command(line: &str) -> bool { } async fn exec_and_print( - ctx: &mut ExecutionContext, + ctx: &mut Context, print_options: PrintOptions, sql: String, ) -> Result<()> { let now = Instant::now(); - let df = ctx.sql(&sql)?; - let results = df.collect().await?; + let df = match ctx { + Context::Local(datafusion) => datafusion.sql(&sql)?, + Context::Remote(ballista) => ballista.sql(&sql)?, + }; + let results = df.collect().await?; print_options.print_batches(&results, now)?; + Ok(()) } From 01a04b649cc1ec3d3d20f1a969f93b3009d1876a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 15 Aug 2021 12:18:04 -0600 Subject: [PATCH 02/13] document limitation --- datafusion-cli/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datafusion-cli/README.md b/datafusion-cli/README.md index 31476be12731..eb6ac8780788 100644 --- a/datafusion-cli/README.md +++ b/datafusion-cli/README.md @@ -48,6 +48,9 @@ DataFusion CLI v4.0.0-SNAPSHOT ## Ballista +The DataFusion CLI can connect to a Ballista scheduler for query execution. Note that Ballista does not currently +support the `CREATE EXTERNAL TABLE` SQL command, so this is of limited use. + ```bash datafusion-cli --host localhost --port 50050 ``` \ No newline at end of file From a1bdd5fd210b14c3370d673d57cdf5dd074cb10e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 15 Aug 2021 13:00:34 -0600 Subject: [PATCH 03/13] CLI now works against Ballista --- ballista/rust/client/src/context.rs | 89 ++++++++++++++++++++++------- ballista/rust/core/src/utils.rs | 23 +++++--- datafusion-cli/README.md | 3 +- 3 files changed, 85 insertions(+), 30 deletions(-) diff --git a/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs index 162cd68352ff..63244fa9978b 100644 --- a/ballista/rust/client/src/context.rs +++ b/ballista/rust/client/src/context.rs @@ -23,13 +23,17 @@ use std::path::PathBuf; use std::sync::{Arc, Mutex}; use ballista_core::config::BallistaConfig; -use ballista_core::{datasource::DfTableAdapter, utils::create_datafusion_context}; +use ballista_core::{ + datasource::DfTableAdapter, utils::create_df_ctx_with_ballista_query_planner, +}; use datafusion::catalog::TableReference; use datafusion::dataframe::DataFrame; -use datafusion::error::Result; +use datafusion::error::{DataFusionError, Result}; +use datafusion::execution::dataframe_impl::DataFrameImpl; use datafusion::logical_plan::LogicalPlan; use datafusion::physical_plan::csv::CsvReadOptions; +use datafusion::sql::parser::FileType; struct BallistaContextState { /// Ballista configuration @@ -130,7 +134,7 @@ impl BallistaContext { // use local DataFusion context for now but later this might call the scheduler let guard = self.state.lock().unwrap(); - let mut ctx = create_datafusion_context( + let mut ctx = create_df_ctx_with_ballista_query_planner( &guard.scheduler_host, guard.scheduler_port, guard.config(), @@ -152,7 +156,7 @@ impl BallistaContext { // use local DataFusion context for now but later this might call the scheduler let guard = self.state.lock().unwrap(); - let mut ctx = create_datafusion_context( + let mut ctx = create_df_ctx_with_ballista_query_planner( &guard.scheduler_host, guard.scheduler_port, guard.config(), @@ -164,6 +168,7 @@ impl BallistaContext { /// Register a DataFrame as a table that can be referenced from a SQL query pub fn register_table(&self, name: &str, table: &dyn DataFrame) -> Result<()> { let mut state = self.state.lock().unwrap(); + println!("inserting ballista ctx state - {}", name); state .tables .insert(name.to_owned(), table.to_logical_plan()); @@ -186,24 +191,66 @@ impl BallistaContext { } /// Create a DataFrame from a SQL statement - pub fn sql(&self, sql: &str) -> Result> { - // use local DataFusion context for now but later this might call the scheduler - // register tables - let state = self.state.lock().unwrap(); - let mut ctx = create_datafusion_context( - &state.scheduler_host, - state.scheduler_port, - state.config(), - ); - for (name, plan) in &state.tables { - let plan = ctx.optimize(plan)?; - let execution_plan = ctx.create_physical_plan(&plan)?; - ctx.register_table( - TableReference::Bare { table: name }, - Arc::new(DfTableAdapter::new(plan, execution_plan)), - )?; + pub fn sql(&mut self, sql: &str) -> Result> { + let mut ctx = { + let state = self.state.lock().unwrap(); + create_df_ctx_with_ballista_query_planner( + &state.scheduler_host, + state.scheduler_port, + state.config(), + ) + }; + + // nested block because of locking state + // TODO refactor this + { + println!("registering ballista tables with df context"); + let state = self.state.lock().unwrap(); + for (name, plan) in &state.tables { + let plan = ctx.optimize(plan)?; + let execution_plan = ctx.create_physical_plan(&plan)?; + println!("register= table {} with df context", name); + ctx.register_table( + TableReference::Bare { table: name }, + Arc::new(DfTableAdapter::new(plan, execution_plan)), + )?; + } + println!("done registering tables"); + } + + let plan = ctx.create_logical_plan(sql)?; + + match plan { + LogicalPlan::CreateExternalTable { + ref schema, + ref name, + ref location, + ref file_type, + ref has_header, + } => match file_type { + FileType::CSV => { + println!("registering csv {} on ballista context", name); + self.register_csv( + name, + location, + CsvReadOptions::new() + .schema(&schema.as_ref().to_owned().into()) + .has_header(*has_header), + )?; + Ok(Arc::new(DataFrameImpl::new(ctx.state.clone(), &plan))) + } + FileType::Parquet => { + self.register_parquet(name, location)?; + Ok(Arc::new(DataFrameImpl::new(ctx.state.clone(), &plan))) + } + _ => Err(DataFusionError::NotImplemented(format!( + "Unsupported file type {:?}.", + file_type + ))), + }, + + _ => ctx.sql(sql), } - ctx.sql(sql) } } diff --git a/ballista/rust/core/src/utils.rs b/ballista/rust/core/src/utils.rs index 4187faa6645a..d753f70fa22e 100644 --- a/ballista/rust/core/src/utils.rs +++ b/ballista/rust/core/src/utils.rs @@ -30,6 +30,7 @@ use crate::memory_stream::MemoryStream; use crate::serde::scheduler::PartitionStats; use crate::config::BallistaConfig; +use datafusion::arrow::datatypes::Schema; use datafusion::arrow::error::Result as ArrowResult; use datafusion::arrow::{ array::{ @@ -51,6 +52,7 @@ use datafusion::physical_optimizer::optimizer::PhysicalOptimizerRule; use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion::physical_plan::csv::CsvExec; +use datafusion::physical_plan::empty::EmptyExec; use datafusion::physical_plan::expressions::{BinaryExpr, Column, Literal}; use datafusion::physical_plan::filter::FilterExec; use datafusion::physical_plan::hash_aggregate::HashAggregateExec; @@ -236,8 +238,9 @@ fn build_exec_plan_diagram( Ok(node_id) } -/// Create a DataFusion context that is compatible with Ballista -pub fn create_datafusion_context( +/// Create a DataFusion context that uses the BallistaQueryPlanner to send logical plans +/// to a Ballista scheduler +pub fn create_df_ctx_with_ballista_query_planner( scheduler_host: &str, scheduler_port: u16, config: &BallistaConfig, @@ -272,11 +275,17 @@ impl QueryPlanner for BallistaQueryPlanner { logical_plan: &LogicalPlan, _ctx_state: &ExecutionContextState, ) -> std::result::Result, DataFusionError> { - Ok(Arc::new(DistributedQueryExec::new( - self.scheduler_url.clone(), - self.config.clone(), - logical_plan.clone(), - ))) + match logical_plan { + LogicalPlan::CreateExternalTable { .. } => { + // table state is managed locally in the BallistaContext, not in the scheduler + Ok(Arc::new(EmptyExec::new(false, Arc::new(Schema::empty())))) + } + _ => Ok(Arc::new(DistributedQueryExec::new( + self.scheduler_url.clone(), + self.config.clone(), + logical_plan.clone(), + ))), + } } } diff --git a/datafusion-cli/README.md b/datafusion-cli/README.md index eb6ac8780788..a54bedf88f11 100644 --- a/datafusion-cli/README.md +++ b/datafusion-cli/README.md @@ -48,8 +48,7 @@ DataFusion CLI v4.0.0-SNAPSHOT ## Ballista -The DataFusion CLI can connect to a Ballista scheduler for query execution. Note that Ballista does not currently -support the `CREATE EXTERNAL TABLE` SQL command, so this is of limited use. +The DataFusion CLI can connect to a Ballista scheduler for query execution. ```bash datafusion-cli --host localhost --port 50050 From 51b721eb525756d7ec6974ba7b690659d7f97d2b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 15 Aug 2021 13:24:56 -0600 Subject: [PATCH 04/13] clean up --- ballista/rust/client/src/context.rs | 30 +++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs index 63244fa9978b..4be3290b7e46 100644 --- a/ballista/rust/client/src/context.rs +++ b/ballista/rust/client/src/context.rs @@ -133,12 +133,14 @@ impl BallistaContext { let path = fs::canonicalize(&path)?; // use local DataFusion context for now but later this might call the scheduler - let guard = self.state.lock().unwrap(); - let mut ctx = create_df_ctx_with_ballista_query_planner( - &guard.scheduler_host, - guard.scheduler_port, - guard.config(), - ); + let mut ctx = { + let guard = self.state.lock().unwrap(); + create_df_ctx_with_ballista_query_planner( + &guard.scheduler_host, + guard.scheduler_port, + guard.config(), + ) + }; let df = ctx.read_parquet(path.to_str().unwrap())?; Ok(df) } @@ -155,12 +157,14 @@ impl BallistaContext { let path = fs::canonicalize(&path)?; // use local DataFusion context for now but later this might call the scheduler - let guard = self.state.lock().unwrap(); - let mut ctx = create_df_ctx_with_ballista_query_planner( - &guard.scheduler_host, - guard.scheduler_port, - guard.config(), - ); + let mut ctx = { + let guard = self.state.lock().unwrap(); + create_df_ctx_with_ballista_query_planner( + &guard.scheduler_host, + guard.scheduler_port, + guard.config(), + ) + }; let df = ctx.read_csv(path.to_str().unwrap(), options)?; Ok(df) } @@ -168,7 +172,6 @@ impl BallistaContext { /// Register a DataFrame as a table that can be referenced from a SQL query pub fn register_table(&self, name: &str, table: &dyn DataFrame) -> Result<()> { let mut state = self.state.lock().unwrap(); - println!("inserting ballista ctx state - {}", name); state .tables .insert(name.to_owned(), table.to_logical_plan()); @@ -229,7 +232,6 @@ impl BallistaContext { ref has_header, } => match file_type { FileType::CSV => { - println!("registering csv {} on ballista context", name); self.register_csv( name, location, From 04ab76494558609e270aee8bdde3626a0e9c865b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 15 Aug 2021 13:31:04 -0600 Subject: [PATCH 05/13] clean up --- ballista-examples/src/bin/ballista-sql.rs | 2 +- ballista/rust/client/src/context.rs | 7 +------ benchmarks/src/bin/tpch.rs | 2 +- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/ballista-examples/src/bin/ballista-sql.rs b/ballista-examples/src/bin/ballista-sql.rs index 590ab7bcf7a4..6eb3539ca743 100644 --- a/ballista-examples/src/bin/ballista-sql.rs +++ b/ballista-examples/src/bin/ballista-sql.rs @@ -26,7 +26,7 @@ async fn main() -> Result<()> { let config = BallistaConfig::builder() .set("ballista.shuffle.partitions", "4") .build()?; - let ctx = BallistaContext::remote("localhost", 50050, &config); + let mut ctx = BallistaContext::remote("localhost", 50050, &config); let testdata = datafusion::arrow::util::test_util::arrow_test_data(); diff --git a/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs index 4be3290b7e46..3cc8c7b55f8e 100644 --- a/ballista/rust/client/src/context.rs +++ b/ballista/rust/client/src/context.rs @@ -204,25 +204,20 @@ impl BallistaContext { ) }; - // nested block because of locking state - // TODO refactor this + // register tables with DataFusion context { - println!("registering ballista tables with df context"); let state = self.state.lock().unwrap(); for (name, plan) in &state.tables { let plan = ctx.optimize(plan)?; let execution_plan = ctx.create_physical_plan(&plan)?; - println!("register= table {} with df context", name); ctx.register_table( TableReference::Bare { table: name }, Arc::new(DfTableAdapter::new(plan, execution_plan)), )?; } - println!("done registering tables"); } let plan = ctx.create_logical_plan(sql)?; - match plan { LogicalPlan::CreateExternalTable { ref schema, diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index 29835e266589..d02553ba0dbe 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -262,7 +262,7 @@ async fn benchmark_ballista(opt: BallistaBenchmarkOpt) -> Result<()> { .build() .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; - let ctx = + let mut ctx = BallistaContext::remote(opt.host.unwrap().as_str(), opt.port.unwrap(), &config); // register tables with Ballista context From a02c246a15e8df988a074a3bf257f6526bf3f33b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 15 Aug 2021 13:32:17 -0600 Subject: [PATCH 06/13] revert sql signature change --- ballista-examples/src/bin/ballista-sql.rs | 2 +- ballista/rust/client/src/context.rs | 2 +- benchmarks/src/bin/tpch.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ballista-examples/src/bin/ballista-sql.rs b/ballista-examples/src/bin/ballista-sql.rs index 6eb3539ca743..590ab7bcf7a4 100644 --- a/ballista-examples/src/bin/ballista-sql.rs +++ b/ballista-examples/src/bin/ballista-sql.rs @@ -26,7 +26,7 @@ async fn main() -> Result<()> { let config = BallistaConfig::builder() .set("ballista.shuffle.partitions", "4") .build()?; - let mut ctx = BallistaContext::remote("localhost", 50050, &config); + let ctx = BallistaContext::remote("localhost", 50050, &config); let testdata = datafusion::arrow::util::test_util::arrow_test_data(); diff --git a/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs index 3cc8c7b55f8e..6d8e8ef72319 100644 --- a/ballista/rust/client/src/context.rs +++ b/ballista/rust/client/src/context.rs @@ -194,7 +194,7 @@ impl BallistaContext { } /// Create a DataFrame from a SQL statement - pub fn sql(&mut self, sql: &str) -> Result> { + pub fn sql(&self, sql: &str) -> Result> { let mut ctx = { let state = self.state.lock().unwrap(); create_df_ctx_with_ballista_query_planner( diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index d02553ba0dbe..29835e266589 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -262,7 +262,7 @@ async fn benchmark_ballista(opt: BallistaBenchmarkOpt) -> Result<()> { .build() .map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?; - let mut ctx = + let ctx = BallistaContext::remote(opt.host.unwrap().as_str(), opt.port.unwrap(), &config); // register tables with Ballista context From 6cb381b8c34b63854a5cff5ca6a86a0c1a6647fe Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 15 Aug 2021 13:33:38 -0600 Subject: [PATCH 07/13] rat --- datafusion-cli/README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/datafusion-cli/README.md b/datafusion-cli/README.md index a54bedf88f11..68563a97b5b3 100644 --- a/datafusion-cli/README.md +++ b/datafusion-cli/README.md @@ -1,3 +1,22 @@ + + # DataFusion Command-line Interface The DataFusion CLI allows SQL queries to be executed by an in-process DataFusion context, or by a distributed From cfcf4f0852b646ca968f0ad3d6c662541ac74333 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 15 Aug 2021 14:03:38 -0600 Subject: [PATCH 08/13] clippy --- ballista/rust/client/src/context.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ballista/rust/client/src/context.rs b/ballista/rust/client/src/context.rs index 6d8e8ef72319..ee2f65699309 100644 --- a/ballista/rust/client/src/context.rs +++ b/ballista/rust/client/src/context.rs @@ -234,11 +234,11 @@ impl BallistaContext { .schema(&schema.as_ref().to_owned().into()) .has_header(*has_header), )?; - Ok(Arc::new(DataFrameImpl::new(ctx.state.clone(), &plan))) + Ok(Arc::new(DataFrameImpl::new(ctx.state, &plan))) } FileType::Parquet => { self.register_parquet(name, location)?; - Ok(Arc::new(DataFrameImpl::new(ctx.state.clone(), &plan))) + Ok(Arc::new(DataFrameImpl::new(ctx.state, &plan))) } _ => Err(DataFusionError::NotImplemented(format!( "Unsupported file type {:?}.", From 02dc76a878d6e9b3f639a32300dd1160167d7deb Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 15 Aug 2021 15:30:30 -0600 Subject: [PATCH 09/13] do not show banner in quiet mode --- datafusion-cli/src/main.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index b6ba4cee1912..4a4588812c24 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -114,7 +114,11 @@ pub async fn main() -> Result<()> { ) .get_matches(); - println!("DataFusion CLI v{}\n", DATAFUSION_CLI_VERSION); + let quiet = matches.is_present("quiet"); + + if !quiet { + println!("DataFusion CLI v{}\n", DATAFUSION_CLI_VERSION); + } let host = matches.value_of("host"); let port = matches @@ -153,7 +157,6 @@ pub async fn main() -> Result<()> { .parse::() .expect("Invalid format"); - let quiet = matches.is_present("quiet"); let print_options = PrintOptions { format, quiet }; if let Some(file_paths) = matches.values_of("file") { From 7774c2c0f464e78cd46693c989845a26bf455982 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 15 Aug 2021 15:45:45 -0600 Subject: [PATCH 10/13] crate docs --- datafusion-cli/src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datafusion-cli/src/lib.rs b/datafusion-cli/src/lib.rs index fe03eb0a6828..0678a0a5d8d9 100644 --- a/datafusion-cli/src/lib.rs +++ b/datafusion-cli/src/lib.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#![doc = include_str!("../README.md")] + #![allow(unused_imports)] pub const DATAFUSION_CLI_VERSION: &str = env!("CARGO_PKG_VERSION"); From 698ad35fb8f588a52e17f963eded7e24d9e7bc41 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 15 Aug 2021 15:55:30 -0600 Subject: [PATCH 11/13] fmt --- datafusion-cli/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion-cli/src/lib.rs b/datafusion-cli/src/lib.rs index 0678a0a5d8d9..74b91ac9f82f 100644 --- a/datafusion-cli/src/lib.rs +++ b/datafusion-cli/src/lib.rs @@ -16,7 +16,6 @@ // under the License. #![doc = include_str!("../README.md")] - #![allow(unused_imports)] pub const DATAFUSION_CLI_VERSION: &str = env!("CARGO_PKG_VERSION"); From 2ccc69ee0f91b43d279edbae66a1823ac7509faa Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 15 Aug 2021 19:15:27 -0600 Subject: [PATCH 12/13] no_run --- datafusion-cli/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion-cli/README.md b/datafusion-cli/README.md index 68563a97b5b3..0e60cab15675 100644 --- a/datafusion-cli/README.md +++ b/datafusion-cli/README.md @@ -22,7 +22,7 @@ The DataFusion CLI allows SQL queries to be executed by an in-process DataFusion context, or by a distributed Ballista context. -``` +```no_run USAGE: datafusion-cli [FLAGS] [OPTIONS] @@ -44,11 +44,11 @@ OPTIONS: Create a CSV file to query. -```bash +```bash,no_run $ echo "1,2" > data.csv ``` -```sql +```sql,no_run $ datafusion-cli DataFusion CLI v4.0.0-SNAPSHOT @@ -69,6 +69,6 @@ DataFusion CLI v4.0.0-SNAPSHOT The DataFusion CLI can connect to a Ballista scheduler for query execution. -```bash +```bash,no_run datafusion-cli --host localhost --port 50050 ``` \ No newline at end of file From dfcd61cf690d6a31852e40980e5df2420abd5b80 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 16 Aug 2021 06:44:33 -0600 Subject: [PATCH 13/13] ignore code blocks --- datafusion-cli/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion-cli/README.md b/datafusion-cli/README.md index 0e60cab15675..6a4707e2a1d4 100644 --- a/datafusion-cli/README.md +++ b/datafusion-cli/README.md @@ -22,7 +22,7 @@ The DataFusion CLI allows SQL queries to be executed by an in-process DataFusion context, or by a distributed Ballista context. -```no_run +```ignore USAGE: datafusion-cli [FLAGS] [OPTIONS] @@ -44,11 +44,11 @@ OPTIONS: Create a CSV file to query. -```bash,no_run +```bash,ignore $ echo "1,2" > data.csv ``` -```sql,no_run +```sql,ignore $ datafusion-cli DataFusion CLI v4.0.0-SNAPSHOT @@ -69,6 +69,6 @@ DataFusion CLI v4.0.0-SNAPSHOT The DataFusion CLI can connect to a Ballista scheduler for query execution. -```bash,no_run +```bash,ignore datafusion-cli --host localhost --port 50050 ``` \ No newline at end of file