Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

simplify augur_from_assemblies workflow #440

Merged
merged 3 commits into from
Dec 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 12 additions & 68 deletions pipes/WDL/workflows/augur_from_assemblies.wdl
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
version 1.0

import "../tasks/tasks_nextstrain.wdl" as nextstrain
import "../tasks/tasks_reports.wdl" as reports
import "../tasks/tasks_utils.wdl" as utils

workflow augur_from_assemblies {
Expand All @@ -19,17 +18,10 @@ workflow augur_from_assemblies {

Int min_unambig_genome

String focal_variable = "region"
String focal_value = "North America"

String focal_bin_variable = "division"
Int focal_bin_max = 50

String global_bin_variable = "country"
Int global_bin_max = 50

File? clades_tsv
Array[String]? ancestral_traits_to_infer

Boolean make_snps_vcf = false
}

parameter_meta {
Expand All @@ -48,28 +40,6 @@ workflow augur_from_assemblies {
min_unambig_genome: {
description: "Minimum number of called bases in genome to pass prefilter."
}

focal_variable: {
description: "The dataset will be bifurcated based on this column header."
}
focal_value: {
description: "The dataset will be bifurcated based on whether the focal_variable column matches this value or not. Rows that match this value are considered to be part of the 'focal' set of interest; rows that do not are part of the 'global' set."
}

focal_bin_variable: {
description: "The focal subset of samples will be evenly subsampled across the discrete values of this column header."
}
focal_bin_max: {
description: "The output will contain no more than this number of focal samples from each discrete value in the focal_bin_variable column."
}

global_bin_variable: {
description: "The global subset of samples will be evenly subsampled across the discrete values of this column header."
}
global_bin_max: {
description: "The output will contain no more than this number of global samples from each discrete value in the global_bin_variable column."
}

ancestral_traits_to_infer: {
description: "A list of metadata traits to use for ancestral node inference (see https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/traits.html). Multiple traits may be specified; must correspond exactly to column headers in metadata file. Omitting these values will skip ancestral trait inference, and ancestral nodes will not have estimated values for metadata."
}
Expand Down Expand Up @@ -98,9 +68,11 @@ workflow augur_from_assemblies {
ref_fasta = ref_fasta,
basename = "all_samples_aligned.fasta"
}
call nextstrain.snp_sites {
input:
msa_fasta = mafft.aligned_sequences
if(make_snps_vcf) {
call nextstrain.snp_sites {
input:
msa_fasta = mafft.aligned_sequences
}
}


Expand All @@ -126,43 +98,17 @@ workflow augur_from_assemblies {
sample_metadata_tsv = derived_cols.derived_metadata
}

call nextstrain.filter_subsample_sequences as subsample_focal {
input:
sequences_fasta = prefilter.filtered_fasta,
sample_metadata_tsv = derived_cols.derived_metadata,
exclude_where = ["${focal_variable}!=${focal_value}"],
sequences_per_group = focal_bin_max,
group_by = focal_bin_variable
}

call nextstrain.filter_subsample_sequences as subsample_global {
input:
sequences_fasta = prefilter.filtered_fasta,
sample_metadata_tsv = derived_cols.derived_metadata,
exclude_where = ["${focal_variable}=${focal_value}"],
sequences_per_group = global_bin_max,
group_by = global_bin_variable
}

call utils.concatenate as cat_fasta {
input:
infiles = [
subsample_focal.filtered_fasta, subsample_global.filtered_fasta
],
output_name = "subsampled.fasta"
}

call utils.fasta_to_ids {
input:
sequences_fasta = cat_fasta.combined
sequences_fasta = prefilter.filtered_fasta
}


#### augur_from_msa

call nextstrain.augur_mask_sites {
input:
sequences = cat_fasta.combined
sequences = prefilter.filtered_fasta
}
call nextstrain.draft_augur_tree {
input:
Expand Down Expand Up @@ -223,14 +169,12 @@ workflow augur_from_assemblies {
output {
File combined_assemblies = filter_sequences_by_length.filtered_fasta
File multiple_alignment = mafft.aligned_sequences
File unmasked_snps = snp_sites.snps_vcf
File? unmasked_snps = snp_sites.snps_vcf

File metadata_merged = derived_cols.derived_metadata
File keep_list = fasta_to_ids.ids_txt
File subsampled_sequences = cat_fasta.combined
Int focal_kept = subsample_focal.sequences_out
Int global_kept = subsample_global.sequences_out
Int sequences_kept = subsample_focal.sequences_out + subsample_global.sequences_out
File subsampled_sequences = prefilter.filtered_fasta
Int sequences_kept = prefilter.sequences_out

File masked_alignment = augur_mask_sites.masked_sequences

Expand Down
2 changes: 1 addition & 1 deletion test/input/WDL/test_outputs-sarscov2_lineages-local.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"sarscov2_lineages.nextclade_clade": "20A",
"sarscov2_lineages.nextclade_clade": "20C",
"sarscov2_lineages.nextclade_aa_subs": "ORF1b:P314L,ORF3a:Q57H,S:D614G",
"sarscov2_lineages.nextclade_aa_dels": "",
"sarscov2_lineages.pango_lineage": "B.1"
Expand Down