From 334dedeef45ca6c557b7855de170f6eb1f774531 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Mon, 21 Nov 2022 14:56:11 -0500
Subject: [PATCH 1/3] clean up augur_from_assemblies to remove custom
 subsampling steps

---
 pipes/WDL/workflows/augur_from_assemblies.wdl | 68 ++-----------------
 1 file changed, 4 insertions(+), 64 deletions(-)

diff --git a/pipes/WDL/workflows/augur_from_assemblies.wdl b/pipes/WDL/workflows/augur_from_assemblies.wdl
index 8ae8f39af..6104a145b 100644
--- a/pipes/WDL/workflows/augur_from_assemblies.wdl
+++ b/pipes/WDL/workflows/augur_from_assemblies.wdl
@@ -1,7 +1,6 @@
 version 1.0
 
 import "../tasks/tasks_nextstrain.wdl" as nextstrain
-import "../tasks/tasks_reports.wdl" as reports
 import "../tasks/tasks_utils.wdl" as utils
 
 workflow augur_from_assemblies {
@@ -19,15 +18,6 @@ workflow augur_from_assemblies {
 
         Int            min_unambig_genome
 
-        String         focal_variable = "region"
-        String         focal_value = "North America"
-
-        String         focal_bin_variable = "division"
-        Int            focal_bin_max = 50
-
-        String         global_bin_variable = "country"
-        Int            global_bin_max = 50
-
         File?          clades_tsv
         Array[String]? ancestral_traits_to_infer
     }
@@ -48,28 +38,6 @@ workflow augur_from_assemblies {
         min_unambig_genome: {
           description: "Minimum number of called bases in genome to pass prefilter."
         }
-
-        focal_variable: {
-            description: "The dataset will be bifurcated based on this column header."
-        }
-        focal_value: {
-            description: "The dataset will be bifurcated based whether the focal_variable column matches this value or not. Rows that match this value are considered to be part of the 'focal' set of interest, rows that do not are part of the 'global' set."
-        }
-
-        focal_bin_variable: {
-            description: "The focal subset of samples will be evenly subsampled across the discrete values of this column header."
-        }
-        focal_bin_max: {
-            description: "The output will contain no more than this number of focal samples from each discrete value in the focal_bin_variable column."
-        }
-
-        global_bin_variable: {
-            description: "The global subset of samples will be evenly subsampled across the discrete values of this column header."
-        }
-        global_bin_max: {
-            description: "The output will contain no more than this number of global samples from each discrete value in the global_bin_variable column."
-        }
-
         ancestral_traits_to_infer: {
           description: "A list of metadata traits to use for ancestral node inference (see https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/traits.html). Multiple traits may be specified; must correspond exactly to column headers in metadata file. Omitting these values will skip ancestral trait inference, and ancestral nodes will not have estimated values for metadata."
         }
@@ -126,35 +94,9 @@ workflow augur_from_assemblies {
             sample_metadata_tsv = derived_cols.derived_metadata
     }
 
-    call nextstrain.filter_subsample_sequences as subsample_focal {
-        input:
-            sequences_fasta     = prefilter.filtered_fasta,
-            sample_metadata_tsv = derived_cols.derived_metadata,
-            exclude_where       = ["${focal_variable}!=${focal_value}"],
-            sequences_per_group = focal_bin_max,
-            group_by            = focal_bin_variable
-    }
-
-    call nextstrain.filter_subsample_sequences as subsample_global {
-        input:
-            sequences_fasta     = prefilter.filtered_fasta,
-            sample_metadata_tsv = derived_cols.derived_metadata,
-            exclude_where       = ["${focal_variable}=${focal_value}"],
-            sequences_per_group = global_bin_max,
-            group_by            = global_bin_variable
-    }
-
-    call utils.concatenate as cat_fasta {
-        input:
-            infiles = [
-                subsample_focal.filtered_fasta, subsample_global.filtered_fasta
-            ],
-            output_name = "subsampled.fasta"
-    }
-
     call utils.fasta_to_ids {
         input:
-            sequences_fasta = cat_fasta.combined
+            sequences_fasta = prefilter.filtered_fasta
     }
 
 
@@ -162,7 +104,7 @@ workflow augur_from_assemblies {
 
     call nextstrain.augur_mask_sites {
         input:
-            sequences = cat_fasta.combined
+            sequences = prefilter.filtered_fasta
     }
     call nextstrain.draft_augur_tree {
         input:
@@ -227,10 +169,8 @@ workflow augur_from_assemblies {
       
       File        metadata_merged      = derived_cols.derived_metadata
       File        keep_list            = fasta_to_ids.ids_txt
-      File        subsampled_sequences = cat_fasta.combined
-      Int         focal_kept           = subsample_focal.sequences_out
-      Int         global_kept          = subsample_global.sequences_out
-      Int         sequences_kept       = subsample_focal.sequences_out + subsample_global.sequences_out
+      File        subsampled_sequences = prefilter.filtered_fasta
+      Int         sequences_kept       = prefilter.sequences_out
       
       File        masked_alignment     = augur_mask_sites.masked_sequences
       

From 159ea052999d0be34ddaf1892e460fd6836fff8c Mon Sep 17 00:00:00 2001
From: golu099 <flavianegrete@gmail.com>
Date: Thu, 17 Nov 2022 17:12:18 -0500
Subject: [PATCH 2/3] Fix miniwdl test: expect clade 20C in local.json

---
 test/input/WDL/test_outputs-sarscov2_lineages-local.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/input/WDL/test_outputs-sarscov2_lineages-local.json b/test/input/WDL/test_outputs-sarscov2_lineages-local.json
index 804a776ee..bf4191625 100644
--- a/test/input/WDL/test_outputs-sarscov2_lineages-local.json
+++ b/test/input/WDL/test_outputs-sarscov2_lineages-local.json
@@ -1,5 +1,5 @@
 {
-  "sarscov2_lineages.nextclade_clade": "20A",
+  "sarscov2_lineages.nextclade_clade": "20C",
   "sarscov2_lineages.nextclade_aa_subs": "ORF1b:P314L,ORF3a:Q57H,S:D614G",
   "sarscov2_lineages.nextclade_aa_dels": "",
   "sarscov2_lineages.pango_lineage": "B.1"

From d860d3bf07850eeaf9e27013944613ab9b33f926 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Mon, 28 Nov 2022 13:39:52 -0500
Subject: [PATCH 3/3] make snp calling optional (disabled by default)

---
 pipes/WDL/workflows/augur_from_assemblies.wdl | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pipes/WDL/workflows/augur_from_assemblies.wdl b/pipes/WDL/workflows/augur_from_assemblies.wdl
index 6104a145b..e94ee5574 100644
--- a/pipes/WDL/workflows/augur_from_assemblies.wdl
+++ b/pipes/WDL/workflows/augur_from_assemblies.wdl
@@ -20,6 +20,8 @@ workflow augur_from_assemblies {
 
         File?          clades_tsv
         Array[String]? ancestral_traits_to_infer
+
+        Boolean        make_snps_vcf = false
     }
 
     parameter_meta {
@@ -66,9 +68,11 @@ workflow augur_from_assemblies {
             ref_fasta = ref_fasta,
             basename  = "all_samples_aligned.fasta"
     }
-    call nextstrain.snp_sites {
-        input:
-            msa_fasta = mafft.aligned_sequences
+    if(make_snps_vcf) {
+        call nextstrain.snp_sites {
+            input:
+                msa_fasta = mafft.aligned_sequences
+        }
     }
 
 
@@ -165,7 +169,7 @@ workflow augur_from_assemblies {
     output {
       File        combined_assemblies  = filter_sequences_by_length.filtered_fasta
       File        multiple_alignment   = mafft.aligned_sequences
-      File        unmasked_snps        = snp_sites.snps_vcf
+      File?       unmasked_snps        = snp_sites.snps_vcf
       
       File        metadata_merged      = derived_cols.derived_metadata
       File        keep_list            = fasta_to_ids.ids_txt