From 7dda13c8d3bb28d79ee47c0c175bb623d8cadf6f Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Mon, 8 Jan 2024 14:15:16 -0500 Subject: [PATCH 01/15] add assembly_method as output for scaffold_and_refine --- pipes/WDL/workflows/scaffold_and_refine.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/pipes/WDL/workflows/scaffold_and_refine.wdl b/pipes/WDL/workflows/scaffold_and_refine.wdl index 57926284c..f57c72f1a 100644 --- a/pipes/WDL/workflows/scaffold_and_refine.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine.wdl @@ -59,6 +59,7 @@ workflow scaffold_and_refine { Int read_pairs_aligned = refine.align_to_self_merged_read_pairs_aligned Float bases_aligned = refine.align_to_self_merged_bases_aligned + String assembly_method = "viral-ngs/scaffold_and_refine" String scaffold_viral_assemble_version = scaffold.viralngs_version String refine_viral_assemble_version = refine.viral_assemble_version } From af641ed7043cd18d72cd81d7e078740dd1d0fd9c Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Mon, 8 Jan 2024 14:15:38 -0500 Subject: [PATCH 02/15] add new workflow scaffold_and_refine_multitaxa --- .dockstore.yml | 5 + .../scaffold_and_refine_multitaxa.wdl | 110 ++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl diff --git a/.dockstore.yml b/.dockstore.yml index b8e53db2c..a5c4ce41e 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -339,6 +339,11 @@ workflows: primaryDescriptorPath: /pipes/WDL/workflows/scaffold_and_refine.wdl testParameterFiles: - empty.json + - name: scaffold_and_refine_multitaxa + subclass: WDL + primaryDescriptorPath: /pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl + testParameterFiles: + - empty.json - name: subsample_by_casecounts subclass: WDL primaryDescriptorPath: /pipes/WDL/workflows/subsample_by_casecounts.wdl diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl new file mode 100644 index 000000000..3f9ca71a7 --- /dev/null +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -0,0 +1,110 @@ +version 1.0 + +import "../tasks/tasks_assembly.wdl" as assembly +import "../tasks/tasks_ncbi.wdl" as ncbi +import "../tasks/tasks_utils.wdl" as utils +import "assemble_refbased.wdl" as assemble_refbased + +workflow scaffold_and_refine_multitaxa { + meta { + description: "Scaffold de novo contigs against a set of possible references and subsequently polish with reads." + author: "Broad Viral Genomics" + email: "viral-ngs@broadinstitute.org" + allowNestedInputs: true + } + + input { + String sample_id + File reads_unmapped_bam + + Array[Pair[Int,Array[String]+]] taxid_to_ref_accessions = [ + (208893, ["KY654518.1"]), # RSV-A + (208895, ["MZ516105.1"]), # RSV-B + (573824, ["NC_038311.1"]), # Rhino A1 + (147711, ["NC_001617.1"]), # Rhino A2 + (185900, ["ON311191.1"]), # Rhino B27 + (1418033, ["ON311169.1"]), # Rhino C15 + (2697049, ["NC_045512.2"]), # SARS-CoV-2 Wuhan Hu-1 + (641809, ["NC_026438.1", "NC_026435.1", "NC_026437.1", "NC_026433.1", "NC_026436.1", "NC_026434.1", "NC_026431.1", "NC_026432.1"]), # Flu A/California/07/2009 H1N1 + (335341, ["NC_007373.1", "NC_007372.1", "NC_007371.1", "NC_007366.1", "NC_007369.1", "NC_007368.1", "NC_007367.1", "NC_007370.1"]), # Flu A/New York/392/2004 H3N2 + (518987, ["NC_002204.1", "NC_002205.1", "NC_002206.1", "NC_002207.1", "NC_002208.1", "NC_002209.1", "NC_002210.1", "NC_002211.1"]), # Flu B/Lee/1940 + (162145, ["NC_039199.1"]), # metapneumo + (12730, ["NC_003461.1"]), # paraflu 1 + (11216, ["NC_001796.2"]) # paraflu 3 + ] + } + + Array[String] assembly_header = ["sample_id", "taxid", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"] + + scatter(taxon in reference_genome_fastas) { + call ncbi.download_annotations { + accessions = taxon.right, + combined_out_prefix = taxon.left + } + call assembly.scaffold { + input: + reads_bam = reads_unmapped_bam, + reference_genome_fasta = download_annotations.combined_fasta, + min_length_fraction = 0, + min_unambig = 0 + } + call assemble_refbased.assemble_refbased as refine { + input: + reads_unmapped_bams = [reads_unmapped_bam], + reference_fasta = scaffold.scaffold_fasta, + sample_name = sample_id + } + # to do: if pre-impute unambig length > some fraction of ref genome, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single + # to do: if biosample attributes file provided, run ncbi.biosample_to_genbank + + Map[String, String] stats_by_taxon = { + "sample_id" : sample_id, + "taxid" : taxon.left, + + "assembly_fasta" : refine.assembly_fasta, + "aligned_only_reads_bam" : refine.align_to_self_merged_aligned_only_bam, + "coverage_plot" : refine.align_to_self_merged_coverage_plot, + "assembly_length" : refine.assembly_length, + "assembly_length_unambiguous" : refine.assembly_length_unambiguous, + "reads_aligned" : refine.align_to_self_merged_reads_aligned, + "mean_coverage" : refine.align_to_self_merged_mean_coverage, + + "intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta, + "assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous, + + "replicate_concordant_sites" : refine.replicate_concordant_sites, + "replicate_discordant_snps" : refine.replicate_discordant_snps, + "replicate_discordant_indels" : refine.replicate_discordant_indels, + "replicate_discordant_vcf" : refine.replicate_discordant_vcf, + + "isnvsFile" : refine.align_to_self_isnvs_vcf, + "aligned_bam" : refine.align_to_self_merged_aligned_only_bam, + "coverage_tsv" : refine.align_to_self_merged_coverage_tsv, + "read_pairs_aligned" : refine.align_to_self_merged_read_pairs_aligned, + "bases_aligned" : refine.align_to_self_merged_bases_aligned + } + + scatter(h in assembly_header) { + String stat_by_taxon = stats_by_taxon[h] + } + } + + ### summary stats + call utils.concatenate { + input: + infiles = [write_tsv([assembly_header]), write_tsv(stat_by_taxon)], + output_name = "assembly_metadata-~{sample_id}.tsv" + } + + output { + Array[Map[String,String]] assembly_stats_by_taxon = stats_by_taxon + File assembly_stats_by_taxon_tsv = concatenate.combined + + Int num_read_groups = refine.num_read_groups[0] + Int num_libraries = refine.num_libraries[0] + + String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa" + String scaffold_viral_assemble_version = scaffold.viralngs_version + String refine_viral_assemble_version = refine.viral_assemble_version + } +} From 6048a2370b42e6fcf2383cad229ad6ffb7cc10fb Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Mon, 8 Jan 2024 14:21:28 -0500 Subject: [PATCH 03/15] input: input --- pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index 3f9ca71a7..bc6cce0b2 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -38,8 +38,9 @@ workflow scaffold_and_refine_multitaxa { scatter(taxon in reference_genome_fastas) { call ncbi.download_annotations { - accessions = taxon.right, - combined_out_prefix = taxon.left + input: + accessions = taxon.right, + combined_out_prefix = taxon.left } call assembly.scaffold { input: From a014452e5ea690cafa0a08dd596968fb143f0916 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Mon, 8 Jan 2024 14:27:50 -0500 Subject: [PATCH 04/15] update var name --- pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index bc6cce0b2..2c79371e7 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -36,7 +36,7 @@ workflow scaffold_and_refine_multitaxa { Array[String] assembly_header = ["sample_id", "taxid", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"] - scatter(taxon in reference_genome_fastas) { + scatter(taxon in taxid_to_ref_accessions) { call ncbi.download_annotations { input: accessions = taxon.right, From c4f9d264ee364b05fd19942108b8497ba3287d28 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Mon, 8 Jan 2024 14:32:52 -0500 Subject: [PATCH 05/15] scaffold expects choices (feed it one) --- pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index 2c79371e7..ccf4d4766 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -45,7 +45,7 @@ workflow scaffold_and_refine_multitaxa { call assembly.scaffold { input: reads_bam = reads_unmapped_bam, - reference_genome_fasta = download_annotations.combined_fasta, + reference_genome_fasta = [download_annotations.combined_fasta], min_length_fraction = 0, min_unambig = 0 } From bb2ea8d2dda40ca6300dc0d1dbfe1c769b3d5747 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Mon, 8 Jan 2024 14:38:50 -0500 Subject: [PATCH 06/15] pick first one for version numbers --- pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index ccf4d4766..12135f64a 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -105,7 +105,7 @@ workflow scaffold_and_refine_multitaxa { Int num_libraries = refine.num_libraries[0] String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa" - String scaffold_viral_assemble_version = scaffold.viralngs_version - String refine_viral_assemble_version = refine.viral_assemble_version + String scaffold_viral_assemble_version = scaffold.viralngs_version[0] + String refine_viral_assemble_version = refine.viral_assemble_version[0] } } From 3317336c9814a4de28f4133fee5aee547693e779 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Tue, 9 Jan 2024 10:14:42 -0500 Subject: [PATCH 07/15] make scaffolding optionally resilient to empty outputs --- pipes/WDL/tasks/tasks_assembly.wdl | 6 ++++-- pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index ef6bb4f27..f26d3b7cc 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -113,6 +113,7 @@ task scaffold { Float? min_length_fraction Float? min_unambig Int replace_length=55 + Boolean allow_incomplete_output = false Int? nucmer_max_gap Int? nucmer_min_match @@ -168,7 +169,7 @@ task scaffold { description: "When scaffolding contigs to the reference via nucmer, this specifies the -l parameter to nucmer (the minimal size of a maximal exact match). Our default is 10 (down from nucmer default of 20) to allow for more divergence.", category: "advanced" } - nucmer_min_cluster:{ + nucmer_min_cluster:{ description: "When scaffolding contigs to the reference via nucmer, this specifies the -c parameter to nucmer (minimum cluster length). Our default is the nucmer default of 65 bp.", category: "advanced" } @@ -214,6 +215,7 @@ task scaffold { --outReference ~{sample_name}.scaffolding_chosen_ref.fasta \ --outStats ~{sample_name}.scaffolding_stats.txt \ --outAlternateContigs ~{sample_name}.scaffolding_alt_contigs.fasta \ + ~{true='--allow_incomplete_output' false="" allow_incomplete_output} \ --loglevel=DEBUG grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | cut -c 2- | cut -f 1 -d ' ' > ~{sample_name}.scaffolding_chosen_refs.txt @@ -428,7 +430,7 @@ task align_reads { String aligner = "minimap2" String? aligner_options - Boolean? skip_mark_dupes = false + Boolean skip_mark_dupes = false Int? machine_mem_gb String docker = "quay.io/broadinstitute/viral-core:2.2.4" diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index 12135f64a..a95865713 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -47,7 +47,8 @@ workflow scaffold_and_refine_multitaxa { reads_bam = reads_unmapped_bam, reference_genome_fasta = [download_annotations.combined_fasta], min_length_fraction = 0, - min_unambig = 0 + min_unambig = 0, + allow_incomplete_output = true } call assemble_refbased.assemble_refbased as refine { input: From 916216de2b63b69e44e23aa04e65ab46ed87f673 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Tue, 9 Jan 2024 10:45:57 -0500 Subject: [PATCH 08/15] update viral-assemble docker upstream --- pipes/WDL/tasks/tasks_assembly.wdl | 8 ++++---- pipes/WDL/tasks/tasks_reports.wdl | 2 +- requirements-modules.txt | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index f26d3b7cc..a91e0f089 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -15,7 +15,7 @@ task assemble { String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt") Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0" + String docker = "quay.io/broadinstitute/viral-assemble:2.2.4.0" } parameter_meta{ reads_unmapped_bam: { @@ -122,7 +122,7 @@ task scaffold { Float? scaffold_min_pct_contig_aligned Int? machine_mem_gb - String docker="quay.io/broadinstitute/viral-assemble:2.1.33.0" + String docker="quay.io/broadinstitute/viral-assemble:2.2.4.0" # do this in multiple steps in case the input doesn't actually have "assembly1-x" in the name String sample_name = basename(basename(contigs_fasta, ".fasta"), ".assembly1-spades") @@ -568,7 +568,7 @@ task refine_assembly_with_aligned_reads { Int min_coverage = 3 Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0" + String docker = "quay.io/broadinstitute/viral-assemble:2.2.4.0" } Int disk_size = 375 @@ -693,7 +693,7 @@ task refine_2x_and_plot { String? plot_coverage_novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100 -k" Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0" + String docker = "quay.io/broadinstitute/viral-assemble:2.2.4.0" # do this in two steps in case the input doesn't actually have "cleaned" in the name String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".cleaned") diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index d49084c11..64dc8467f 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -630,7 +630,7 @@ task compare_two_genomes { File genome_two String out_basename - String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0" + String docker = "quay.io/broadinstitute/viral-assemble:2.2.4.0" } Int disk_size = 50 diff --git a/requirements-modules.txt b/requirements-modules.txt index 73aa2a5e8..2cf41a97a 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -1,5 +1,5 @@ broadinstitute/viral-core=2.2.4 -broadinstitute/viral-assemble=2.1.33.0 +broadinstitute/viral-assemble=2.2.4.0 broadinstitute/viral-classify=2.2.3.0 broadinstitute/viral-phylo=2.1.20.2 broadinstitute/py3-bio=0.1.2 From f0c682018f6e3e5a77c88d118c63e5dacfd42fbe Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Tue, 9 Jan 2024 15:38:02 -0500 Subject: [PATCH 09/15] guard against non-zero exit codes from grep on empty input --- pipes/WDL/tasks/tasks_assembly.wdl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index a91e0f089..930cfac8a 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -228,8 +228,10 @@ task scaffold { --maskErrors \ --loglevel=DEBUG + set +e +o pipefail grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\n' | wc -c | tee assembly_preimpute_length grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\nNn' | wc -c | tee assembly_preimpute_length_unambiguous + set -e -o pipefail #Input assembly/contigs, FASTA, already ordered oriented and merged with the reference gneome (FASTA) assembly.py impute_from_reference \ From 70e3173783056036940597351497c04d695f8ecb Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 10 Jan 2024 10:21:36 -0500 Subject: [PATCH 10/15] small cleanups of task alignment_metrics --- pipes/WDL/tasks/tasks_reports.wdl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index 64dc8467f..a788aa13c 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -11,10 +11,10 @@ task alignment_metrics { File? primers_bed String? amplicon_set Int? min_coverage - Int? max_amp_len=5000 - Int? max_amplicons=500 + Int max_amp_len=5000 + Int max_amplicons=500 - Int? machine_mem_gb + Int machine_mem_gb=13 String docker = "quay.io/broadinstitute/viral-core:2.2.4" } @@ -100,8 +100,8 @@ task alignment_metrics { } runtime { - docker: "~{docker}" - memory: select_first([machine_mem_gb, 13]) + " GB" + docker: docker + memory: machine_mem_gb + " GB" cpu: 2 disks: "local-disk " + disk_size + " HDD" disk: disk_size + " GB" # TES From 074a754f51f657e5a806b56bf240874718dacd8a Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 10 Jan 2024 10:46:06 -0500 Subject: [PATCH 11/15] make task run_discordance happy with empty fasta input --- pipes/WDL/tasks/tasks_assembly.wdl | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index 930cfac8a..ee8e026da 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -856,8 +856,8 @@ task run_discordance { read_utils.py --version | tee VERSION - # create 2-col table with read group ids in both cols python3 < filtered.vcf From cb98982634e1a405583b3aeaf5ee73c5265604d4 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 10 Jan 2024 11:24:07 -0500 Subject: [PATCH 12/15] defend task alignment_metrics against empty fasta inputs --- pipes/WDL/tasks/tasks_reports.wdl | 43 +++++++++++++++++-------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index a788aa13c..44d128398 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -31,25 +31,30 @@ task alignment_metrics { cp "~{ref_fasta}" reference.fasta picard $XMX CreateSequenceDictionary -R reference.fasta - # get Picard metrics and clean up the junky outputs - picard $XMX CollectRawWgsMetrics \ - -R reference.fasta \ - -I "~{aligned_bam}" \ - -O picard_raw.raw_wgs_metrics.txt - grep -v \# picard_raw.raw_wgs_metrics.txt | grep . | head -2 > picard_clean.raw_wgs_metrics.txt - - picard $XMX CollectAlignmentSummaryMetrics \ - -R reference.fasta \ - -I "~{aligned_bam}" \ - -O picard_raw.alignment_metrics.txt - grep -v \# picard_raw.alignment_metrics.txt | grep . | head -4 > picard_clean.alignment_metrics.txt - - picard $XMX CollectInsertSizeMetrics \ - -I "~{aligned_bam}" \ - -O picard_raw.insert_size_metrics.txt \ - -H picard_raw.insert_size_metrics.pdf \ - --INCLUDE_DUPLICATES true - grep -v \# picard_raw.insert_size_metrics.txt | grep . | head -2 > picard_clean.insert_size_metrics.txt + if [ -s "~{ref_fasta}" ]; then + # get Picard metrics and clean up the junky outputs + picard $XMX CollectRawWgsMetrics \ + -R reference.fasta \ + -I "~{aligned_bam}" \ + -O picard_raw.raw_wgs_metrics.txt + grep -v \# picard_raw.raw_wgs_metrics.txt | grep . | head -2 > picard_clean.raw_wgs_metrics.txt + + picard $XMX CollectAlignmentSummaryMetrics \ + -R reference.fasta \ + -I "~{aligned_bam}" \ + -O picard_raw.alignment_metrics.txt + grep -v \# picard_raw.alignment_metrics.txt | grep . | head -4 > picard_clean.alignment_metrics.txt + + picard $XMX CollectInsertSizeMetrics \ + -I "~{aligned_bam}" \ + -O picard_raw.insert_size_metrics.txt \ + -H picard_raw.insert_size_metrics.pdf \ + --INCLUDE_DUPLICATES true + grep -v \# picard_raw.insert_size_metrics.txt | grep . | head -2 > picard_clean.insert_size_metrics.txt + else + # ref_fasta is empty -> Picard will fail + touch picard_clean.raw_wgs_metrics.txt picard_clean.alignment_metrics.txt picard_clean.insert_size_metrics.txt + fi # prepend the sample name in order to facilitate tsv joining later SAMPLE=$(samtools view -H "~{aligned_bam}" | grep ^@RG | perl -lape 's/^@RG.*SM:(\S+).*$/$1/' | sort | uniq) From b3f4699e64c603d762979ed42a5dbb2ebce05151 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 10 Jan 2024 14:20:21 -0500 Subject: [PATCH 13/15] more special handling of empty fasta for run_discordance --- pipes/WDL/tasks/tasks_assembly.wdl | 32 ++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index ee8e026da..d2c45fb6b 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -874,13 +874,18 @@ task run_discordance { # detect empty fasta situation and manually create empty VCF import os.path if (os.path.getsize('~{reference_fasta}') == 0): + sample_name = [[x[3:] for x in h if x.startswith('SM:')][0] for h in header if h[0]=='@RG'][0] with open('everything.vcf', 'wt') as outf: outf.write('##fileformat=VCFv4.3') - outf.write('\t'.join(('#CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT'))+'\n') + outf.write('##ALT=') + outf.write('##INFO=') + outf.write('##FORMAT=') + outf.write('##FORMAT=') + outf.write('\t'.join(('#CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT',sample_name))+'\n') CODE - # bcftools call snps while treating each RG as a separate sample if [ ! -f everything.vcf ]; then + # bcftools call snps while treating each RG as a separate sample bcftools mpileup \ -G readgroups.txt -d 10000 -a "FORMAT/DP,FORMAT/AD" \ -q 1 -m 2 -Ou \ @@ -889,16 +894,23 @@ task run_discordance { -P 0 -m --ploidy 1 \ --threads $(nproc) \ -Ov -o everything.vcf - fi - # mask all GT calls when less than 3 reads - cat everything.vcf | bcftools filter -e "FMT/DP<~{min_coverage}" -S . > filtered.vcf - cat filtered.vcf | bcftools filter -i "MAC>0" > "~{out_basename}.discordant.vcf" + # mask all GT calls when less than 3 reads + cat everything.vcf | bcftools filter -e "FMT/DP<~{min_coverage}" -S . > filtered.vcf + cat filtered.vcf | bcftools filter -i "MAC>0" > "~{out_basename}.discordant.vcf" + + # tally outputs + bcftools filter -i 'MAC=0' filtered.vcf | bcftools query -f '%POS\n' | wc -l | tee num_concordant + bcftools filter -i 'TYPE="snp"' "~{out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_snps + bcftools filter -i 'TYPE!="snp"' "~{out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_indels - # tally outputs - bcftools filter -i 'MAC=0' filtered.vcf | bcftools query -f '%POS\n' | wc -l | tee num_concordant - bcftools filter -i 'TYPE="snp"' "~{out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_snps - bcftools filter -i 'TYPE!="snp"' "~{out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_indels + else + # handle empty case + cp everything.vcf "~{out_basename}.discordant.vcf" + echo 0 > num_concordant + echo 0 > num_discordant_snps + echo 0 > num_discordant_indels + fi } output { From fa6b226cc3be705b82afcd539f6e15113d6ba593 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 5 Feb 2024 13:58:47 -0500 Subject: [PATCH 14/15] add percent_reference_covered --- pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index a95865713..12231fe91 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -32,9 +32,11 @@ workflow scaffold_and_refine_multitaxa { (12730, ["NC_003461.1"]), # paraflu 1 (11216, ["NC_001796.2"]) # paraflu 3 ] + + # Float min_pct_reference_covered = 0.1 } - Array[String] assembly_header = ["sample_id", "taxid", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"] + Array[String] assembly_header = ["sample_id", "taxid", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"] scatter(taxon in taxid_to_ref_accessions) { call ncbi.download_annotations { @@ -56,7 +58,7 @@ workflow scaffold_and_refine_multitaxa { reference_fasta = scaffold.scaffold_fasta, sample_name = sample_id } - # to do: if pre-impute unambig length > some fraction of ref genome, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single + # to do: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single # to do: if biosample attributes file provided, run ncbi.biosample_to_genbank Map[String, String] stats_by_taxon = { @@ -70,6 +72,7 @@ workflow scaffold_and_refine_multitaxa { "assembly_length_unambiguous" : refine.assembly_length_unambiguous, "reads_aligned" : refine.align_to_self_merged_reads_aligned, "mean_coverage" : refine.align_to_self_merged_mean_coverage, + "percent_reference_covered" : 1.0 * refine.assembly_length_unambiguous / refine.reference_genome_length, "intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta, "assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous, From 9c51e00d8bf70713d7e334caa165ae8d3b04c2dd Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 5 Feb 2024 14:02:10 -0500 Subject: [PATCH 15/15] update reference list --- pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl index 12231fe91..eac47dc1f 100644 --- a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -21,16 +21,23 @@ workflow scaffold_and_refine_multitaxa { (208893, ["KY654518.1"]), # RSV-A (208895, ["MZ516105.1"]), # RSV-B (573824, ["NC_038311.1"]), # Rhino A1 - (147711, ["NC_001617.1"]), # Rhino A2 (185900, ["ON311191.1"]), # Rhino B27 (1418033, ["ON311169.1"]), # Rhino C15 + (463676, ["JN837686.2"]), # Rhino C45 + (11137, ["NC_002645.1"]), # HCoV 229E + (290028, ["NC_006577.2"]), # HCoV HKU1 + (277944, ["NC_005831.2"]), # HCoV NL63 + (31631, ["NC_006213.1"]), # HCoV OC43 (2697049, ["NC_045512.2"]), # SARS-CoV-2 Wuhan Hu-1 (641809, ["NC_026438.1", "NC_026435.1", "NC_026437.1", "NC_026433.1", "NC_026436.1", "NC_026434.1", "NC_026431.1", "NC_026432.1"]), # Flu A/California/07/2009 H1N1 (335341, ["NC_007373.1", "NC_007372.1", "NC_007371.1", "NC_007366.1", "NC_007369.1", "NC_007368.1", "NC_007367.1", "NC_007370.1"]), # Flu A/New York/392/2004 H3N2 (518987, ["NC_002204.1", "NC_002205.1", "NC_002206.1", "NC_002207.1", "NC_002208.1", "NC_002209.1", "NC_002210.1", "NC_002211.1"]), # Flu B/Lee/1940 (162145, ["NC_039199.1"]), # metapneumo (12730, ["NC_003461.1"]), # paraflu 1 - (11216, ["NC_001796.2"]) # paraflu 3 + (2560525, ["NC_003443.1"]), # paraflu 2 + (11216, ["NC_001796.2"]), # paraflu 3 + (11224, ["NC_021928.1"]), # paraflu 4 + (129951, ["NC_001405.1"]) # adenovirus C ] # Float min_pct_reference_covered = 0.1