-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Deprecate SessionContext::create_logical_plan (#4617) #4679
Changes from 1 commit
9c9444b
5c23f09
832dbac
838c9a9
f15f1e1
64c969d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,7 +23,7 @@ use std::sync::Arc; | |
use async_trait::async_trait; | ||
use parquet::file::properties::WriterProperties; | ||
|
||
use datafusion_common::{Column, DFSchema}; | ||
use datafusion_common::{Column, DFSchema, ScalarValue}; | ||
use datafusion_expr::TableProviderFilterPushDown; | ||
|
||
use crate::arrow::datatypes::Schema; | ||
|
@@ -505,17 +505,40 @@ impl DataFrame { | |
self.plan.schema() | ||
} | ||
|
||
/// Return the unoptimized logical plan represented by this DataFrame. | ||
/// Return the unoptimized logical plan | ||
pub fn logical_plan(&self) -> &LogicalPlan { | ||
&self.plan | ||
} | ||
|
||
/// Return the logical plan represented by this DataFrame without running the optimizers | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
/// | ||
/// Note: This method should not be used outside testing, as it loses the snapshot | ||
/// of the [`SessionState`] attached to this [`DataFrame`] and consequently subsequent | ||
/// operations may take place against a different state | ||
pub fn to_unoptimized_plan(self) -> LogicalPlan { | ||
self.plan | ||
} | ||
|
||
/// Return the optimized logical plan represented by this DataFrame. | ||
pub fn to_logical_plan(self) -> Result<LogicalPlan> { | ||
/// | ||
/// Note: This method should not be used outside testing, as it loses the snapshot | ||
/// of the [`SessionState`] attached to this [`DataFrame`] and consequently subsequent | ||
/// operations may take place against a different state | ||
pub fn to_optimized_plan(self) -> Result<LogicalPlan> { | ||
tustvold marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Optimize the plan first for better UX | ||
self.session_state.optimize(&self.plan) | ||
} | ||
|
||
/// Return the optimized logical plan represented by this DataFrame. | ||
/// | ||
/// Note: This method should not be used outside testing, as it loses the snapshot | ||
/// of the [`SessionState`] attached to this [`DataFrame`] and consequently subsequent | ||
/// operations may take place against a different state | ||
#[deprecated(note = "Use DataFrame::to_optimized_plan")] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I opted to deprecate this method as it was a touch confusing, having a method that actually performed a potentially mutating optimization pass without explicitly calling it out |
||
pub fn to_logical_plan(self) -> Result<LogicalPlan> { | ||
self.to_optimized_plan() | ||
} | ||
|
||
/// Return a DataFrame with the explanation of its plan so far. | ||
/// | ||
/// if `analyze` is specified, runs the plan and reports metrics | ||
|
@@ -714,6 +737,12 @@ impl DataFrame { | |
} | ||
} | ||
|
||
/// Convert a prepare logical plan into its inner logical plan with all params replaced with their corresponding values | ||
pub fn with_param_values(self, param_values: Vec<ScalarValue>) -> Result<Self> { | ||
let plan = self.plan.with_param_values(param_values)?; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I plan to move this off |
||
Ok(Self::new(self.session_state, plan)) | ||
} | ||
|
||
/// Cache DataFrame as a memory table. | ||
/// | ||
/// ``` | ||
|
@@ -1014,11 +1043,10 @@ mod tests { | |
let df = df.select(vec![expr])?; | ||
|
||
// build query using SQL | ||
let sql_plan = | ||
ctx.create_logical_plan("SELECT my_fn(c12) FROM aggregate_test_100")?; | ||
let sql_plan = ctx.sql("SELECT my_fn(c12) FROM aggregate_test_100").await?; | ||
|
||
// the two plans should be identical | ||
assert_same_plan(&df.plan, &sql_plan); | ||
assert_same_plan(&df.plan, &sql_plan.plan); | ||
|
||
Ok(()) | ||
} | ||
|
@@ -1128,7 +1156,7 @@ mod tests { | |
async fn create_plan(sql: &str) -> Result<LogicalPlan> { | ||
let mut ctx = SessionContext::new(); | ||
register_aggregate_csv(&mut ctx, "aggregate_test_100").await?; | ||
ctx.create_logical_plan(sql) | ||
Ok(ctx.sql(sql).await?.to_unoptimized_plan()) | ||
} | ||
|
||
async fn test_table_with_name(name: &str) -> Result<DataFrame> { | ||
|
@@ -1308,7 +1336,7 @@ mod tests { | |
\n Inner Join: t1.c1 = t2.c1\ | ||
\n TableScan: t1\ | ||
\n TableScan: t2", | ||
format!("{:?}", df_renamed.clone().to_unoptimized_plan()) | ||
format!("{:?}", df_renamed.logical_plan()) | ||
); | ||
|
||
assert_eq!("\ | ||
|
@@ -1322,7 +1350,7 @@ mod tests { | |
\n SubqueryAlias: t2\ | ||
\n Projection: aggregate_test_100.c1, aggregate_test_100.c2, aggregate_test_100.c3\ | ||
\n TableScan: aggregate_test_100 projection=[c1, c2, c3]", | ||
format!("{:?}", df_renamed.clone().to_logical_plan()?) | ||
format!("{:?}", df_renamed.clone().to_optimized_plan()?) | ||
); | ||
|
||
let df_results = df_renamed.collect().await?; | ||
|
@@ -1469,7 +1497,7 @@ mod tests { | |
|
||
assert_eq!( | ||
"TableScan: ?table? projection=[c2, c3, sum]", | ||
format!("{:?}", cached_df.clone().to_logical_plan()?) | ||
format!("{:?}", cached_df.clone().to_optimized_plan()?) | ||
); | ||
|
||
let df_results = df.collect().await?; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -254,7 +254,21 @@ impl SessionContext { | |
/// This method is `async` because queries of type `CREATE EXTERNAL TABLE` | ||
/// might require the schema to be inferred. | ||
pub async fn sql(&self, sql: &str) -> Result<DataFrame> { | ||
let plan = self.create_logical_plan(sql)?; | ||
let mut statements = DFParser::parse_sql(sql)?; | ||
if statements.len() != 1 { | ||
return Err(DataFusionError::NotImplemented( | ||
"The context currently only supports a single SQL statement".to_string(), | ||
)); | ||
} | ||
|
||
// create a query planner | ||
let plan = { | ||
// TODO: Move catalog off SessionState onto SessionContext | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This somewhat relates to #4607 but also serves avoids issues due to the interior mutability of CatalogList |
||
let state = self.state.read(); | ||
let query_planner = SqlToRel::new(&*state); | ||
query_planner.statement_to_plan(statements.pop_front().unwrap())? | ||
}; | ||
|
||
match plan { | ||
LogicalPlan::CreateExternalTable(cmd) => { | ||
self.create_external_table(&cmd).await | ||
|
@@ -553,6 +567,7 @@ impl SessionContext { | |
/// Creates a logical plan. | ||
/// | ||
/// This function is intended for internal use and should not be called directly. | ||
#[deprecated(note = "Use SessionContext::sql which snapshots the SessionState")] | ||
pub fn create_logical_plan(&self, sql: &str) -> Result<LogicalPlan> { | ||
let mut statements = DFParser::parse_sql(sql)?; | ||
|
||
|
@@ -1791,9 +1806,10 @@ impl SessionState { | |
&self, | ||
logical_plan: &LogicalPlan, | ||
) -> Result<Arc<dyn ExecutionPlan>> { | ||
let planner = self.query_planner.clone(); | ||
let logical_plan = self.optimize(logical_plan)?; | ||
planner.create_physical_plan(&logical_plan, self).await | ||
self.query_planner | ||
.create_physical_plan(&logical_plan, self) | ||
.await | ||
} | ||
|
||
/// return the configuration options | ||
|
@@ -2046,8 +2062,7 @@ mod tests { | |
use std::fs::File; | ||
use std::path::PathBuf; | ||
use std::sync::Weak; | ||
use std::thread::{self, JoinHandle}; | ||
use std::{env, io::prelude::*, sync::Mutex}; | ||
use std::{env, io::prelude::*}; | ||
use tempfile::TempDir; | ||
|
||
#[tokio::test] | ||
|
@@ -2267,23 +2282,21 @@ mod tests { | |
// environment. Usecase is for concurrent planing. | ||
let tmp_dir = TempDir::new()?; | ||
let partition_count = 4; | ||
let ctx = Arc::new(Mutex::new(create_ctx(&tmp_dir, partition_count).await?)); | ||
let ctx = Arc::new(create_ctx(&tmp_dir, partition_count).await?); | ||
|
||
let threads: Vec<JoinHandle<Result<_>>> = (0..2) | ||
let threads: Vec<_> = (0..2) | ||
.map(|_| ctx.clone()) | ||
.map(|ctx_clone| { | ||
thread::spawn(move || { | ||
let ctx = ctx_clone.lock().expect("Locked context"); | ||
.map(|ctx| { | ||
tokio::spawn(async move { | ||
// Ensure we can create logical plan code on a separate thread. | ||
ctx.create_logical_plan( | ||
"SELECT c1, c2 FROM test WHERE c1 > 0 AND c1 < 3", | ||
) | ||
ctx.sql("SELECT c1, c2 FROM test WHERE c1 > 0 AND c1 < 3") | ||
.await | ||
}) | ||
}) | ||
.collect(); | ||
|
||
for thread in threads { | ||
thread.join().expect("Failed to join thread")?; | ||
for handle in threads { | ||
handle.await.unwrap().unwrap(); | ||
} | ||
Ok(()) | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As ctx is interior mutable this would effectively plan, optimize, and execute potentially against at least 3 different states 😱