diff --git a/.dockstore.yml b/.dockstore.yml index a3fba3a43..be93a4ad5 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -349,6 +349,11 @@ workflows: primaryDescriptorPath: /pipes/WDL/workflows/scaffold_and_refine.wdl testParameterFiles: - empty.json + - name: scaffold_and_refine_multitaxa + subclass: WDL + primaryDescriptorPath: /pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl + testParameterFiles: + - empty.json - name: subsample_by_casecounts subclass: WDL primaryDescriptorPath: /pipes/WDL/workflows/subsample_by_casecounts.wdl diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index ef6bb4f27..d2c45fb6b 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -15,7 +15,7 @@ task assemble { String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt") Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0" + String docker = "quay.io/broadinstitute/viral-assemble:2.2.4.0" } parameter_meta{ reads_unmapped_bam: { @@ -113,6 +113,7 @@ task scaffold { Float? min_length_fraction Float? min_unambig Int replace_length=55 + Boolean allow_incomplete_output = false Int? nucmer_max_gap Int? nucmer_min_match @@ -121,7 +122,7 @@ task scaffold { Float? scaffold_min_pct_contig_aligned Int? machine_mem_gb - String docker="quay.io/broadinstitute/viral-assemble:2.1.33.0" + String docker="quay.io/broadinstitute/viral-assemble:2.2.4.0" # do this in multiple steps in case the input doesn't actually have "assembly1-x" in the name String sample_name = basename(basename(contigs_fasta, ".fasta"), ".assembly1-spades") @@ -168,7 +169,7 @@ task scaffold { description: "When scaffolding contigs to the reference via nucmer, this specifies the -l parameter to nucmer (the minimal size of a maximal exact match). Our default is 10 (down from nucmer default of 20) to allow for more divergence.", category: "advanced" } - nucmer_min_cluster:{ + nucmer_min_cluster:{ description: "When scaffolding contigs to the reference via nucmer, this specifies the -c parameter to nucmer (minimum cluster length). Our default is the nucmer default of 65 bp.", category: "advanced" } @@ -214,6 +215,7 @@ task scaffold { --outReference ~{sample_name}.scaffolding_chosen_ref.fasta \ --outStats ~{sample_name}.scaffolding_stats.txt \ --outAlternateContigs ~{sample_name}.scaffolding_alt_contigs.fasta \ + ~{true='--allow_incomplete_output' false="" allow_incomplete_output} \ --loglevel=DEBUG grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | cut -c 2- | cut -f 1 -d ' ' > ~{sample_name}.scaffolding_chosen_refs.txt @@ -226,8 +228,10 @@ task scaffold { --maskErrors \ --loglevel=DEBUG + set +e +o pipefail grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\n' | wc -c | tee assembly_preimpute_length grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\nNn' | wc -c | tee assembly_preimpute_length_unambiguous + set -e -o pipefail #Input assembly/contigs, FASTA, already ordered oriented and merged with the reference gneome (FASTA) assembly.py impute_from_reference \ @@ -428,7 +432,7 @@ task align_reads { String aligner = "minimap2" String? aligner_options - Boolean? skip_mark_dupes = false + Boolean skip_mark_dupes = false Int? machine_mem_gb String docker = "quay.io/broadinstitute/viral-core:2.2.4" @@ -566,7 +570,7 @@ task refine_assembly_with_aligned_reads { Int min_coverage = 3 Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0" + String docker = "quay.io/broadinstitute/viral-assemble:2.2.4.0" } Int disk_size = 375 @@ -691,7 +695,7 @@ task refine_2x_and_plot { String? plot_coverage_novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100 -k" Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0" + String docker = "quay.io/broadinstitute/viral-assemble:2.2.4.0" # do this in two steps in case the input doesn't actually have "cleaned" in the name String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".cleaned") @@ -852,8 +856,8 @@ task run_discordance { read_utils.py --version | tee VERSION - # create 2-col table with read group ids in both cols python3 <') + outf.write('##INFO=') + outf.write('##FORMAT=') + outf.write('##FORMAT=') + outf.write('\t'.join(('#CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT',sample_name))+'\n') CODE - # bcftools call snps while treating each RG as a separate sample - bcftools mpileup \ - -G readgroups.txt -d 10000 -a "FORMAT/DP,FORMAT/AD" \ - -q 1 -m 2 -Ou \ - -f "~{reference_fasta}" "~{reads_aligned_bam}" \ - | bcftools call \ - -P 0 -m --ploidy 1 \ - --threads $(nproc) \ - -Ov -o everything.vcf - - # mask all GT calls when less than 3 reads - cat everything.vcf | bcftools filter -e "FMT/DP<~{min_coverage}" -S . > filtered.vcf - cat filtered.vcf | bcftools filter -i "MAC>0" > "~{out_basename}.discordant.vcf" - - # tally outputs - bcftools filter -i 'MAC=0' filtered.vcf | bcftools query -f '%POS\n' | wc -l | tee num_concordant - bcftools filter -i 'TYPE="snp"' "~{out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_snps - bcftools filter -i 'TYPE!="snp"' "~{out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_indels + if [ ! -f everything.vcf ]; then + # bcftools call snps while treating each RG as a separate sample + bcftools mpileup \ + -G readgroups.txt -d 10000 -a "FORMAT/DP,FORMAT/AD" \ + -q 1 -m 2 -Ou \ + -f "~{reference_fasta}" "~{reads_aligned_bam}" \ + | bcftools call \ + -P 0 -m --ploidy 1 \ + --threads $(nproc) \ + -Ov -o everything.vcf + + # mask all GT calls when less than 3 reads + cat everything.vcf | bcftools filter -e "FMT/DP<~{min_coverage}" -S . > filtered.vcf + cat filtered.vcf | bcftools filter -i "MAC>0" > "~{out_basename}.discordant.vcf" + + # tally outputs + bcftools filter -i 'MAC=0' filtered.vcf | bcftools query -f '%POS\n' | wc -l | tee num_concordant + bcftools filter -i 'TYPE="snp"' "~{out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_snps + bcftools filter -i 'TYPE!="snp"' "~{out_basename}.discordant.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_discordant_indels + + else + # handle empty case + cp everything.vcf "~{out_basename}.discordant.vcf" + echo 0 > num_concordant + echo 0 > num_discordant_snps + echo 0 > num_discordant_indels + fi } output { diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index 4b44f5ed9..1b4188e11 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -11,10 +11,10 @@ task alignment_metrics { File? primers_bed String? amplicon_set Int? min_coverage - Int? max_amp_len=5000 - Int? max_amplicons=500 + Int max_amp_len=5000 + Int max_amplicons=500 - Int? machine_mem_gb + Int machine_mem_gb=13 String docker = "quay.io/broadinstitute/viral-core:2.2.4" } @@ -31,25 +31,30 @@ task alignment_metrics { cp "~{ref_fasta}" reference.fasta picard $XMX CreateSequenceDictionary -R reference.fasta - # get Picard metrics and clean up the junky outputs - picard $XMX CollectRawWgsMetrics \ - -R reference.fasta \ - -I "~{aligned_bam}" \ - -O picard_raw.raw_wgs_metrics.txt - grep -v \# picard_raw.raw_wgs_metrics.txt | grep . | head -2 > picard_clean.raw_wgs_metrics.txt - - picard $XMX CollectAlignmentSummaryMetrics \ - -R reference.fasta \ - -I "~{aligned_bam}" \ - -O picard_raw.alignment_metrics.txt - grep -v \# picard_raw.alignment_metrics.txt | grep . | head -4 > picard_clean.alignment_metrics.txt - - picard $XMX CollectInsertSizeMetrics \ - -I "~{aligned_bam}" \ - -O picard_raw.insert_size_metrics.txt \ - -H picard_raw.insert_size_metrics.pdf \ - --INCLUDE_DUPLICATES true - grep -v \# picard_raw.insert_size_metrics.txt | grep . | head -2 > picard_clean.insert_size_metrics.txt + if [ -s "~{ref_fasta}" ]; then + # get Picard metrics and clean up the junky outputs + picard $XMX CollectRawWgsMetrics \ + -R reference.fasta \ + -I "~{aligned_bam}" \ + -O picard_raw.raw_wgs_metrics.txt + grep -v \# picard_raw.raw_wgs_metrics.txt | grep . | head -2 > picard_clean.raw_wgs_metrics.txt + + picard $XMX CollectAlignmentSummaryMetrics \ + -R reference.fasta \ + -I "~{aligned_bam}" \ + -O picard_raw.alignment_metrics.txt + grep -v \# picard_raw.alignment_metrics.txt | grep . | head -4 > picard_clean.alignment_metrics.txt + + picard $XMX CollectInsertSizeMetrics \ + -I "~{aligned_bam}" \ + -O picard_raw.insert_size_metrics.txt \ + -H picard_raw.insert_size_metrics.pdf \ + --INCLUDE_DUPLICATES true + grep -v \# picard_raw.insert_size_metrics.txt | grep . | head -2 > picard_clean.insert_size_metrics.txt + else + # ref_fasta is empty -> Picard will fail + touch picard_clean.raw_wgs_metrics.txt picard_clean.alignment_metrics.txt picard_clean.insert_size_metrics.txt + fi # prepend the sample name in order to facilitate tsv joining later SAMPLE=$(samtools view -H "~{aligned_bam}" | grep ^@RG | perl -lape 's/^@RG.*SM:(\S+).*$/$1/' | sort | uniq) @@ -100,8 +105,8 @@ task alignment_metrics { } runtime { - docker: "~{docker}" - memory: select_first([machine_mem_gb, 13]) + " GB" + docker: docker + memory: machine_mem_gb + " GB" cpu: 2 disks: "local-disk " + disk_size + " HDD" disk: disk_size + " GB" # TES @@ -630,7 +635,7 @@ task compare_two_genomes { File genome_two String out_basename - String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0" + String docker = "quay.io/broadinstitute/viral-assemble:2.2.4.0" } Int disk_size = 50 diff --git a/pipes/WDL/workflows/scaffold_and_refine.wdl b/pipes/WDL/workflows/scaffold_and_refine.wdl index 57926284c..f57c72f1a 100644 --- a/pipes/WDL/workflows/scaffold_and_refine.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine.wdl @@ -59,6 +59,7 @@ workflow scaffold_and_refine { Int read_pairs_aligned = refine.align_to_self_merged_read_pairs_aligned Float bases_aligned = refine.align_to_self_merged_bases_aligned + String assembly_method = "viral-ngs/scaffold_and_refine" String scaffold_viral_assemble_version = scaffold.viralngs_version String refine_viral_assemble_version = refine.viral_assemble_version } diff --git a/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl new file mode 100644 index 000000000..eac47dc1f --- /dev/null +++ b/pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl @@ -0,0 +1,122 @@ +version 1.0 + +import "../tasks/tasks_assembly.wdl" as assembly +import "../tasks/tasks_ncbi.wdl" as ncbi +import "../tasks/tasks_utils.wdl" as utils +import "assemble_refbased.wdl" as assemble_refbased + +workflow scaffold_and_refine_multitaxa { + meta { + description: "Scaffold de novo contigs against a set of possible references and subsequently polish with reads." + author: "Broad Viral Genomics" + email: "viral-ngs@broadinstitute.org" + allowNestedInputs: true + } + + input { + String sample_id + File reads_unmapped_bam + + Array[Pair[Int,Array[String]+]] taxid_to_ref_accessions = [ + (208893, ["KY654518.1"]), # RSV-A + (208895, ["MZ516105.1"]), # RSV-B + (573824, ["NC_038311.1"]), # Rhino A1 + (185900, ["ON311191.1"]), # Rhino B27 + (1418033, ["ON311169.1"]), # Rhino C15 + (463676, ["JN837686.2"]), # Rhino C45 + (11137, ["NC_002645.1"]), # HCoV 229E + (290028, ["NC_006577.2"]), # HCoV HKU1 + (277944, ["NC_005831.2"]), # HCoV NL63 + (31631, ["NC_006213.1"]), # HCoV OC43 + (2697049, ["NC_045512.2"]), # SARS-CoV-2 Wuhan Hu-1 + (641809, ["NC_026438.1", "NC_026435.1", "NC_026437.1", "NC_026433.1", "NC_026436.1", "NC_026434.1", "NC_026431.1", "NC_026432.1"]), # Flu A/California/07/2009 H1N1 + (335341, ["NC_007373.1", "NC_007372.1", "NC_007371.1", "NC_007366.1", "NC_007369.1", "NC_007368.1", "NC_007367.1", "NC_007370.1"]), # Flu A/New York/392/2004 H3N2 + (518987, ["NC_002204.1", "NC_002205.1", "NC_002206.1", "NC_002207.1", "NC_002208.1", "NC_002209.1", "NC_002210.1", "NC_002211.1"]), # Flu B/Lee/1940 + (162145, ["NC_039199.1"]), # metapneumo + (12730, ["NC_003461.1"]), # paraflu 1 + (2560525, ["NC_003443.1"]), # paraflu 2 + (11216, ["NC_001796.2"]), # paraflu 3 + (11224, ["NC_021928.1"]), # paraflu 4 + (129951, ["NC_001405.1"]) # adenovirus C + ] + + # Float min_pct_reference_covered = 0.1 + } + + Array[String] assembly_header = ["sample_id", "taxid", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"] + + scatter(taxon in taxid_to_ref_accessions) { + call ncbi.download_annotations { + input: + accessions = taxon.right, + combined_out_prefix = taxon.left + } + call assembly.scaffold { + input: + reads_bam = reads_unmapped_bam, + reference_genome_fasta = [download_annotations.combined_fasta], + min_length_fraction = 0, + min_unambig = 0, + allow_incomplete_output = true + } + call assemble_refbased.assemble_refbased as refine { + input: + reads_unmapped_bams = [reads_unmapped_bam], + reference_fasta = scaffold.scaffold_fasta, + sample_name = sample_id + } + # to do: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single + # to do: if biosample attributes file provided, run ncbi.biosample_to_genbank + + Map[String, String] stats_by_taxon = { + "sample_id" : sample_id, + "taxid" : taxon.left, + + "assembly_fasta" : refine.assembly_fasta, + "aligned_only_reads_bam" : refine.align_to_self_merged_aligned_only_bam, + "coverage_plot" : refine.align_to_self_merged_coverage_plot, + "assembly_length" : refine.assembly_length, + "assembly_length_unambiguous" : refine.assembly_length_unambiguous, + "reads_aligned" : refine.align_to_self_merged_reads_aligned, + "mean_coverage" : refine.align_to_self_merged_mean_coverage, + "percent_reference_covered" : 1.0 * refine.assembly_length_unambiguous / refine.reference_genome_length, + + "intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta, + "assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous, + + "replicate_concordant_sites" : refine.replicate_concordant_sites, + "replicate_discordant_snps" : refine.replicate_discordant_snps, + "replicate_discordant_indels" : refine.replicate_discordant_indels, + "replicate_discordant_vcf" : refine.replicate_discordant_vcf, + + "isnvsFile" : refine.align_to_self_isnvs_vcf, + "aligned_bam" : refine.align_to_self_merged_aligned_only_bam, + "coverage_tsv" : refine.align_to_self_merged_coverage_tsv, + "read_pairs_aligned" : refine.align_to_self_merged_read_pairs_aligned, + "bases_aligned" : refine.align_to_self_merged_bases_aligned + } + + scatter(h in assembly_header) { + String stat_by_taxon = stats_by_taxon[h] + } + } + + ### summary stats + call utils.concatenate { + input: + infiles = [write_tsv([assembly_header]), write_tsv(stat_by_taxon)], + output_name = "assembly_metadata-~{sample_id}.tsv" + } + + output { + Array[Map[String,String]] assembly_stats_by_taxon = stats_by_taxon + File assembly_stats_by_taxon_tsv = concatenate.combined + + Int num_read_groups = refine.num_read_groups[0] + Int num_libraries = refine.num_libraries[0] + + String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa" + String scaffold_viral_assemble_version = scaffold.viralngs_version[0] + String refine_viral_assemble_version = refine.viral_assemble_version[0] + } +} diff --git a/requirements-modules.txt b/requirements-modules.txt index 73aa2a5e8..2cf41a97a 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -1,5 +1,5 @@ broadinstitute/viral-core=2.2.4 -broadinstitute/viral-assemble=2.1.33.0 +broadinstitute/viral-assemble=2.2.4.0 broadinstitute/viral-classify=2.2.3.0 broadinstitute/viral-phylo=2.1.20.2 broadinstitute/py3-bio=0.1.2