Support for multi-scheduler deployments #59
Changes from 33 commits
@@ -33,8 +33,7 @@ use datafusion::physical_plan::windows::WindowAggExec;
use datafusion::physical_plan::{
with_new_children_if_necessary, ExecutionPlan, Partitioning,
};
use futures::future::BoxFuture;
use futures::FutureExt;

use log::info;

type PartialQueryStageResult = (Arc<dyn ExecutionPlan>, Vec<Arc<ShuffleWriterExec>>);
@@ -59,15 +58,14 @@ impl DistributedPlanner {
/// Returns a vector of ExecutionPlans, where the root node is a [ShuffleWriterExec].
/// Plans that depend on the input of other plans will have leaf nodes of type [UnresolvedShuffleExec].
/// A [ShuffleWriterExec] is created whenever the partitioning changes.
pub async fn plan_query_stages<'a>(
pub fn plan_query_stages<'a>(
Is there some reason to remove the async here?

I think it was doing that at one time, but the implementation now is not doing IO, so I changed it back to sync.
&'a mut self,
job_id: &'a str,
execution_plan: Arc<dyn ExecutionPlan>,
) -> Result<Vec<Arc<ShuffleWriterExec>>> {
info!("planning query stages");
let (new_plan, mut stages) = self
.plan_query_stages_internal(job_id, execution_plan)
.await?;
let (new_plan, mut stages) =
self.plan_query_stages_internal(job_id, execution_plan)?;
stages.push(create_shuffle_writer(
job_id,
self.next_stage_id(),
@@ -84,97 +82,91 @@ impl DistributedPlanner {
&'a mut self,
job_id: &'a str,
execution_plan: Arc<dyn ExecutionPlan>,
) -> BoxFuture<'a, Result<PartialQueryStageResult>> {
async move {
// recurse down and replace children
if execution_plan.children().is_empty() {
return Ok((execution_plan, vec![]));
}
) -> Result<PartialQueryStageResult> {
// async move {
// recurse down and replace children
if execution_plan.children().is_empty() {
return Ok((execution_plan, vec![]));
}

let mut stages = vec![];
let mut children = vec![];
for child in execution_plan.children() {
let (new_child, mut child_stages) = self
.plan_query_stages_internal(job_id, child.clone())
.await?;
children.push(new_child);
stages.append(&mut child_stages);
}
let mut stages = vec![];
let mut children = vec![];
for child in execution_plan.children() {
let (new_child, mut child_stages) =
self.plan_query_stages_internal(job_id, child.clone())?;
children.push(new_child);
stages.append(&mut child_stages);
}
if let Some(_coalesce) = execution_plan
.as_any()
.downcast_ref::<CoalescePartitionsExec>()
{
let shuffle_writer = create_shuffle_writer(
job_id,
self.next_stage_id(),
children[0].clone(),
None,
)?;
let unresolved_shuffle = Arc::new(UnresolvedShuffleExec::new(
shuffle_writer.stage_id(),
shuffle_writer.schema(),
shuffle_writer.output_partitioning().partition_count(),
shuffle_writer
.shuffle_output_partitioning()
.map(|p| p.partition_count())
.unwrap_or_else(|| {
shuffle_writer.output_partitioning().partition_count()
}),
));
stages.push(shuffle_writer);
Ok((
with_new_children_if_necessary(
execution_plan,
vec![unresolved_shuffle],
)?,
stages,
))
} else if let Some(repart) =
execution_plan.as_any().downcast_ref::<RepartitionExec>()
{
match repart.output_partitioning() {
Partitioning::Hash(_, _) => {
let shuffle_writer = create_shuffle_writer(
job_id,
self.next_stage_id(),
children[0].clone(),
Some(repart.partitioning().to_owned()),
)?;
let unresolved_shuffle = Arc::new(UnresolvedShuffleExec::new(
shuffle_writer.stage_id(),
shuffle_writer.schema(),
shuffle_writer.output_partitioning().partition_count(),
shuffle_writer
.shuffle_output_partitioning()
.map(|p| p.partition_count())
.unwrap_or_else(|| {
shuffle_writer.output_partitioning().partition_count()
}),
));
stages.push(shuffle_writer);
Ok((unresolved_shuffle, stages))
}
_ => {
// remove any non-hash repartition from the distributed plan
Ok((children[0].clone(), stages))
}
if let Some(_coalesce) = execution_plan
.as_any()
.downcast_ref::<CoalescePartitionsExec>()
{
let shuffle_writer = create_shuffle_writer(
job_id,
self.next_stage_id(),
children[0].clone(),
None,
)?;
let unresolved_shuffle = Arc::new(UnresolvedShuffleExec::new(
shuffle_writer.stage_id(),
shuffle_writer.schema(),
shuffle_writer.output_partitioning().partition_count(),
shuffle_writer
.shuffle_output_partitioning()
.map(|p| p.partition_count())
.unwrap_or_else(|| {
shuffle_writer.output_partitioning().partition_count()
}),
));
stages.push(shuffle_writer);
Ok((
with_new_children_if_necessary(execution_plan, vec![unresolved_shuffle])?,
stages,
))
} else if let Some(repart) =
execution_plan.as_any().downcast_ref::<RepartitionExec>()
{
match repart.output_partitioning() {
Partitioning::Hash(_, _) => {
let shuffle_writer = create_shuffle_writer(
job_id,
self.next_stage_id(),
children[0].clone(),
Some(repart.partitioning().to_owned()),
)?;
let unresolved_shuffle = Arc::new(UnresolvedShuffleExec::new(
shuffle_writer.stage_id(),
shuffle_writer.schema(),
shuffle_writer.output_partitioning().partition_count(),
shuffle_writer
.shuffle_output_partitioning()
.map(|p| p.partition_count())
.unwrap_or_else(|| {
shuffle_writer.output_partitioning().partition_count()
}),
));
stages.push(shuffle_writer);
Ok((unresolved_shuffle, stages))
}
_ => {
// remove any non-hash repartition from the distributed plan
Ok((children[0].clone(), stages))
}
} else if let Some(window) =
execution_plan.as_any().downcast_ref::<WindowAggExec>()
{
Err(BallistaError::NotImplemented(format!(
"WindowAggExec with window {:?}",
window
)))
} else {
Ok((
with_new_children_if_necessary(execution_plan, children)?,
stages,
))
}
} else if let Some(window) =
execution_plan.as_any().downcast_ref::<WindowAggExec>()
{
Err(BallistaError::NotImplemented(format!(
"WindowAggExec with window {:?}",
window
)))
} else {
Ok((
with_new_children_if_necessary(execution_plan, children)?,
stages,
))
}
.boxed()
}

/// Generate a new stage ID
@@ -318,9 +310,7 @@ mod test {

let mut planner = DistributedPlanner::new();
let job_uuid = Uuid::new_v4();
let stages = planner
.plan_query_stages(&job_uuid.to_string(), plan)
.await?;
let stages = planner.plan_query_stages(&job_uuid.to_string(), plan)?;
for stage in &stages {
println!("{}", displayable(stage.as_ref()).indent());
}
@@ -432,9 +422,7 @@ order by

let mut planner = DistributedPlanner::new();
let job_uuid = Uuid::new_v4();
let stages = planner
.plan_query_stages(&job_uuid.to_string(), plan)
.await?;
let stages = planner.plan_query_stages(&job_uuid.to_string(), plan)?;
for stage in &stages {
println!("{}", displayable(stage.as_ref()).indent());
}
@@ -580,9 +568,7 @@ order by

let mut planner = DistributedPlanner::new();
let job_uuid = Uuid::new_v4();
let stages = planner
.plan_query_stages(&job_uuid.to_string(), plan)
.await?;
let stages = planner.plan_query_stages(&job_uuid.to_string(), plan)?;

let partial_hash = stages[0].children()[0].clone();
let partial_hash_serde = roundtrip_operator(partial_hash.clone())?;
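The review exchange earlier in the diff notes that plan_query_stages no longer does any IO, so callers now invoke it synchronously instead of awaiting a BoxFuture. A minimal caller-side sketch of the new usage follows; the module paths in the use statements are assumptions about the Ballista crate layout and are not taken from this diff.

```rust
use std::sync::Arc;

use ballista_core::error::Result; // assumed path
use ballista_core::execution_plans::ShuffleWriterExec; // assumed path
use ballista_scheduler::planner::DistributedPlanner; // assumed path
use datafusion::physical_plan::ExecutionPlan;
use uuid::Uuid;

// Plan a physical plan into query stages without awaiting anything.
fn plan_job(plan: Arc<dyn ExecutionPlan>) -> Result<Vec<Arc<ShuffleWriterExec>>> {
    let mut planner = DistributedPlanner::new();
    let job_id = Uuid::new_v4().to_string();
    // Before this change: planner.plan_query_stages(&job_id, plan).await?
    planner.plan_query_stages(&job_id, plan)
}
```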
It isn't clear to me how these stages form a DAG. Are the dependencies between stages stored elsewhere?

No, sorry, I should have explained that. I'll add better docs to this struct, but for now each stage has an output_link: Option<usize> which specifies where it sends its output. If output_link is None then the stage is final and it sends its output to the ExecutionGraph's output_locations. Likewise, each stage has an inputs: HashMap<usize, StageOuput> which "collects" input locations from its input stages.

Got it. Yes, some comments in the structs here would be great.
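To make the linkage described above concrete, here is a minimal sketch of how stages could form a DAG purely through output_link and inputs; the struct definitions and field types are simplified illustrations, not the actual ExecutionGraph implementation.

```rust
use std::collections::HashMap;

// Simplified stand-in for the per-stage output bookkeeping described above.
#[derive(Debug, Default)]
struct StageOutput {
    // Partition locations reported by the producing stage (illustrative only).
    partition_locations: Vec<String>,
}

// Simplified stand-in for a query stage; only the DAG-linkage fields are modeled.
#[derive(Debug)]
struct QueryStage {
    stage_id: usize,
    // The stage that consumes this stage's output. None means this is the
    // final stage and its output goes to the graph-level output_locations.
    output_link: Option<usize>,
    // Input locations "collected" from producing stages, keyed by stage id.
    inputs: HashMap<usize, StageOutput>,
}

fn main() {
    // Stages 1 and 2 feed stage 3; stage 3 is final.
    let stages = vec![
        QueryStage { stage_id: 1, output_link: Some(3), inputs: HashMap::new() },
        QueryStage { stage_id: 2, output_link: Some(3), inputs: HashMap::new() },
        QueryStage {
            stage_id: 3,
            output_link: None,
            inputs: HashMap::from([(1, StageOutput::default()), (2, StageOutput::default())]),
        },
    ];

    // The DAG edges are implied by output_link / inputs; no separate edge list is stored.
    for s in &stages {
        match s.output_link {
            Some(parent) => println!("stage {} -> stage {}", s.stage_id, parent),
            None => println!("stage {} -> ExecutionGraph output_locations", s.stage_id),
        }
    }
}
```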