Skip to content

Commit

Permalink
Merge pull request #440 from broadinstitute/dp-augur
Browse files: browse the repository at this point in the history
simplify augur_from_assemblies workflow
  • Loading branch information
dpark01 authored Dec 1, 2022
2 parents a223f7e + d860d3b commit 0c0e86d
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 69 deletions.
80 changes: 12 additions & 68 deletions pipes/WDL/workflows/augur_from_assemblies.wdl
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
version 1.0

import "../tasks/tasks_nextstrain.wdl" as nextstrain
import "../tasks/tasks_reports.wdl" as reports
import "../tasks/tasks_utils.wdl" as utils

workflow augur_from_assemblies {
Expand All @@ -19,17 +18,10 @@ workflow augur_from_assemblies {

Int min_unambig_genome

String focal_variable = "region"
String focal_value = "North America"

String focal_bin_variable = "division"
Int focal_bin_max = 50

String global_bin_variable = "country"
Int global_bin_max = 50

File? clades_tsv
Array[String]? ancestral_traits_to_infer

Boolean make_snps_vcf = false
}

parameter_meta {
Expand All @@ -48,28 +40,6 @@ workflow augur_from_assemblies {
min_unambig_genome: {
description: "Minimum number of called bases in genome to pass prefilter."
}

focal_variable: {
description: "The dataset will be bifurcated based on this column header."
}
focal_value: {
description: "The dataset will be bifurcated based whether the focal_variable column matches this value or not. Rows that match this value are considered to be part of the 'focal' set of interest, rows that do not are part of the 'global' set."
}

focal_bin_variable: {
description: "The focal subset of samples will be evenly subsampled across the discrete values of this column header."
}
focal_bin_max: {
description: "The output will contain no more than this number of focal samples from each discrete value in the focal_bin_variable column."
}

global_bin_variable: {
description: "The global subset of samples will be evenly subsampled across the discrete values of this column header."
}
global_bin_max: {
description: "The output will contain no more than this number of global samples from each discrete value in the global_bin_variable column."
}

ancestral_traits_to_infer: {
description: "A list of metadata traits to use for ancestral node inference (see https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/traits.html). Multiple traits may be specified; must correspond exactly to column headers in metadata file. Omitting these values will skip ancestral trait inference, and ancestral nodes will not have estimated values for metadata."
}
Expand Down Expand Up @@ -98,9 +68,11 @@ workflow augur_from_assemblies {
ref_fasta = ref_fasta,
basename = "all_samples_aligned.fasta"
}
call nextstrain.snp_sites {
input:
msa_fasta = mafft.aligned_sequences
if(make_snps_vcf) {
call nextstrain.snp_sites {
input:
msa_fasta = mafft.aligned_sequences
}
}


Expand All @@ -126,43 +98,17 @@ workflow augur_from_assemblies {
sample_metadata_tsv = derived_cols.derived_metadata
}

call nextstrain.filter_subsample_sequences as subsample_focal {
input:
sequences_fasta = prefilter.filtered_fasta,
sample_metadata_tsv = derived_cols.derived_metadata,
exclude_where = ["${focal_variable}!=${focal_value}"],
sequences_per_group = focal_bin_max,
group_by = focal_bin_variable
}

call nextstrain.filter_subsample_sequences as subsample_global {
input:
sequences_fasta = prefilter.filtered_fasta,
sample_metadata_tsv = derived_cols.derived_metadata,
exclude_where = ["${focal_variable}=${focal_value}"],
sequences_per_group = global_bin_max,
group_by = global_bin_variable
}

call utils.concatenate as cat_fasta {
input:
infiles = [
subsample_focal.filtered_fasta, subsample_global.filtered_fasta
],
output_name = "subsampled.fasta"
}

call utils.fasta_to_ids {
input:
sequences_fasta = cat_fasta.combined
sequences_fasta = prefilter.filtered_fasta
}


#### augur_from_msa
call nextstrain.augur_mask_sites {
input:
sequences = cat_fasta.combined
sequences = prefilter.filtered_fasta
}
call nextstrain.draft_augur_tree {
input:
Expand Down Expand Up @@ -223,14 +169,12 @@ workflow augur_from_assemblies {
output {
File combined_assemblies = filter_sequences_by_length.filtered_fasta
File multiple_alignment = mafft.aligned_sequences
File unmasked_snps = snp_sites.snps_vcf
File? unmasked_snps = snp_sites.snps_vcf

File metadata_merged = derived_cols.derived_metadata
File keep_list = fasta_to_ids.ids_txt
File subsampled_sequences = cat_fasta.combined
Int focal_kept = subsample_focal.sequences_out
Int global_kept = subsample_global.sequences_out
Int sequences_kept = subsample_focal.sequences_out + subsample_global.sequences_out
File subsampled_sequences = prefilter.filtered_fasta
Int sequences_kept = prefilter.sequences_out
File masked_alignment = augur_mask_sites.masked_sequences
Expand Down
2 changes: 1 addition & 1 deletion test/input/WDL/test_outputs-sarscov2_lineages-local.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"sarscov2_lineages.nextclade_clade": "20A",
"sarscov2_lineages.nextclade_clade": "20C",
"sarscov2_lineages.nextclade_aa_subs": "ORF1b:P314L,ORF3a:Q57H,S:D614G",
"sarscov2_lineages.nextclade_aa_dels": "",
"sarscov2_lineages.pango_lineage": "B.1"
Expand Down

0 comments on commit 0c0e86d

Please sign in to comment.