From 334dedeef45ca6c557b7855de170f6eb1f774531 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 21 Nov 2022 14:56:11 -0500 Subject: [PATCH 1/3] clean up augur_from_assemblies to remove custom subsampling steps --- pipes/WDL/workflows/augur_from_assemblies.wdl | 68 ++----------------- 1 file changed, 4 insertions(+), 64 deletions(-) diff --git a/pipes/WDL/workflows/augur_from_assemblies.wdl b/pipes/WDL/workflows/augur_from_assemblies.wdl index 8ae8f39af..6104a145b 100644 --- a/pipes/WDL/workflows/augur_from_assemblies.wdl +++ b/pipes/WDL/workflows/augur_from_assemblies.wdl @@ -1,7 +1,6 @@ version 1.0 import "../tasks/tasks_nextstrain.wdl" as nextstrain -import "../tasks/tasks_reports.wdl" as reports import "../tasks/tasks_utils.wdl" as utils workflow augur_from_assemblies { @@ -19,15 +18,6 @@ workflow augur_from_assemblies { Int min_unambig_genome - String focal_variable = "region" - String focal_value = "North America" - - String focal_bin_variable = "division" - Int focal_bin_max = 50 - - String global_bin_variable = "country" - Int global_bin_max = 50 - File? clades_tsv Array[String]? ancestral_traits_to_infer } @@ -48,28 +38,6 @@ workflow augur_from_assemblies { min_unambig_genome: { description: "Minimum number of called bases in genome to pass prefilter." } - - focal_variable: { - description: "The dataset will be bifurcated based on this column header." - } - focal_value: { - description: "The dataset will be bifurcated based whether the focal_variable column matches this value or not. Rows that match this value are considered to be part of the 'focal' set of interest, rows that do not are part of the 'global' set." - } - - focal_bin_variable: { - description: "The focal subset of samples will be evenly subsampled across the discrete values of this column header." - } - focal_bin_max: { - description: "The output will contain no more than this number of focal samples from each discrete value in the focal_bin_variable column." - } - - global_bin_variable: { - description: "The global subset of samples will be evenly subsampled across the discrete values of this column header." - } - global_bin_max: { - description: "The output will contain no more than this number of global samples from each discrete value in the global_bin_variable column." - } - ancestral_traits_to_infer: { description: "A list of metadata traits to use for ancestral node inference (see https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/traits.html). Multiple traits may be specified; must correspond exactly to column headers in metadata file. Omitting these values will skip ancestral trait inference, and ancestral nodes will not have estimated values for metadata." } @@ -126,35 +94,9 @@ workflow augur_from_assemblies { sample_metadata_tsv = derived_cols.derived_metadata } - call nextstrain.filter_subsample_sequences as subsample_focal { - input: - sequences_fasta = prefilter.filtered_fasta, - sample_metadata_tsv = derived_cols.derived_metadata, - exclude_where = ["${focal_variable}!=${focal_value}"], - sequences_per_group = focal_bin_max, - group_by = focal_bin_variable - } - - call nextstrain.filter_subsample_sequences as subsample_global { - input: - sequences_fasta = prefilter.filtered_fasta, - sample_metadata_tsv = derived_cols.derived_metadata, - exclude_where = ["${focal_variable}=${focal_value}"], - sequences_per_group = global_bin_max, - group_by = global_bin_variable - } - - call utils.concatenate as cat_fasta { - input: - infiles = [ - subsample_focal.filtered_fasta, subsample_global.filtered_fasta - ], - output_name = "subsampled.fasta" - } - call utils.fasta_to_ids { input: - sequences_fasta = cat_fasta.combined + sequences_fasta = prefilter.filtered_fasta } @@ -162,7 +104,7 @@ workflow augur_from_assemblies { call nextstrain.augur_mask_sites { input: - sequences = cat_fasta.combined + sequences = prefilter.filtered_fasta } call nextstrain.draft_augur_tree { input: @@ -227,10 +169,8 @@ workflow augur_from_assemblies { File metadata_merged = derived_cols.derived_metadata File keep_list = fasta_to_ids.ids_txt - File subsampled_sequences = cat_fasta.combined - Int focal_kept = subsample_focal.sequences_out - Int global_kept = subsample_global.sequences_out - Int sequences_kept = subsample_focal.sequences_out + subsample_global.sequences_out + File subsampled_sequences = prefilter.filtered_fasta + Int sequences_kept = prefilter.sequences_out File masked_alignment = augur_mask_sites.masked_sequences From 159ea052999d0be34ddaf1892e460fd6836fff8c Mon Sep 17 00:00:00 2001 From: golu099 Date: Thu, 17 Nov 2022 17:12:18 -0500 Subject: [PATCH 2/3] Fixing miniwdl testing error on local.json --- test/input/WDL/test_outputs-sarscov2_lineages-local.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/input/WDL/test_outputs-sarscov2_lineages-local.json b/test/input/WDL/test_outputs-sarscov2_lineages-local.json index 804a776ee..bf4191625 100644 --- a/test/input/WDL/test_outputs-sarscov2_lineages-local.json +++ b/test/input/WDL/test_outputs-sarscov2_lineages-local.json @@ -1,5 +1,5 @@ { - "sarscov2_lineages.nextclade_clade": "20A", + "sarscov2_lineages.nextclade_clade": "20C", "sarscov2_lineages.nextclade_aa_subs": "ORF1b:P314L,ORF3a:Q57H,S:D614G", "sarscov2_lineages.nextclade_aa_dels": "", "sarscov2_lineages.pango_lineage": "B.1" From d860d3bf07850eeaf9e27013944613ab9b33f926 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 28 Nov 2022 13:39:52 -0500 Subject: [PATCH 3/3] make snp calling optional --- pipes/WDL/workflows/augur_from_assemblies.wdl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pipes/WDL/workflows/augur_from_assemblies.wdl b/pipes/WDL/workflows/augur_from_assemblies.wdl index 6104a145b..e94ee5574 100644 --- a/pipes/WDL/workflows/augur_from_assemblies.wdl +++ b/pipes/WDL/workflows/augur_from_assemblies.wdl @@ -20,6 +20,8 @@ workflow augur_from_assemblies { File? clades_tsv Array[String]? ancestral_traits_to_infer + + Boolean make_snps_vcf = false } parameter_meta { @@ -66,9 +68,11 @@ workflow augur_from_assemblies { ref_fasta = ref_fasta, basename = "all_samples_aligned.fasta" } - call nextstrain.snp_sites { - input: - msa_fasta = mafft.aligned_sequences + if(make_snps_vcf) { + call nextstrain.snp_sites { + input: + msa_fasta = mafft.aligned_sequences + } } @@ -165,7 +169,7 @@ workflow augur_from_assemblies { output { File combined_assemblies = filter_sequences_by_length.filtered_fasta File multiple_alignment = mafft.aligned_sequences - File unmasked_snps = snp_sites.snps_vcf + File? unmasked_snps = snp_sites.snps_vcf File metadata_merged = derived_cols.derived_metadata File keep_list = fasta_to_ids.ids_txt