Skip to content

Commit

Permalink
Merge pull request #422 from broadinstitute/dp-denovo-update
Browse files Browse the repository at this point in the history
incorporate assemble_refbased as refine/polish for denovo
  • Loading branch information
dpark01 authored May 26, 2022
2 parents 7aafe3d + b208b9a commit a47d310
Show file tree
Hide file tree
Showing 15 changed files with 103 additions and 150 deletions.
72 changes: 4 additions & 68 deletions pipes/WDL/tasks/tasks_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ task assemble {
Int? spades_n_reads = 10000000
Int? spades_min_contig_len = 0

String? assembler = "spades"
Boolean? always_succeed = false
String assembler = "spades"
Boolean always_succeed = false

# do this in two steps in case the input doesn't actually have "taxfilt" in the name
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt")
Expand Down Expand Up @@ -70,7 +70,7 @@ task scaffold {
File reads_bam
Array[File]+ reference_genome_fasta

String? aligner
String? aligner="muscle"
Float? min_length_fraction
Float? min_unambig
Int? replace_length=55
Expand Down Expand Up @@ -148,7 +148,7 @@ task scaffold {

runtime {
docker: docker
memory: select_first([machine_mem_gb, 15]) + " GB"
memory: select_first([machine_mem_gb, 31]) + " GB"
cpu: 4
disks: "local-disk 375 LOCAL"
dx_instance_type: "mem1_ssd1_v2_x8"
Expand Down Expand Up @@ -496,70 +496,6 @@ task refine_assembly_with_aligned_reads {
}
}
task refine {
meta {
description: "This step refines/polishes a genome based on unmapped short reads, aligning them (with either novoalign or bwa), and producing a new consensus genome. Uses GATK3 Unified Genotyper to produce new consensus. Produces new genome (fasta), variant calls (VCF), and figures of merit."
}
input {
File assembly_fasta
File reads_unmapped_bam
File? novocraft_license
String? novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100"
Float? major_cutoff = 0.5
Int? min_coverage = 1
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-assemble:2.1.16.1"
String assembly_basename=basename(basename(assembly_fasta, ".fasta"), ".scaffold")
}
command {
set -ex -o pipefail
# find 90% memory
mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 90)
assembly.py --version | tee VERSION
ln -s ${assembly_fasta} assembly.fasta
read_utils.py novoindex \
assembly.fasta \
${"--NOVOALIGN_LICENSE_PATH=" + novocraft_license} \
--loglevel=DEBUG
assembly.py refine_assembly \
assembly.fasta \
${reads_unmapped_bam} \
${assembly_basename}.refined.fasta \
--outVcf ${assembly_basename}.sites.vcf.gz \
--min_coverage ${min_coverage} \
--major_cutoff ${major_cutoff} \
--novo_params="${novoalign_options}" \
--JVMmemory "$mem_in_mb"m \
${"--NOVOALIGN_LICENSE_PATH=" + novocraft_license} \
--loglevel=DEBUG
}
output {
File refined_assembly_fasta = "${assembly_basename}.refined.fasta"
File sites_vcf_gz = "${assembly_basename}.sites.vcf.gz"
String viralngs_version = read_string("VERSION")
}
runtime {
docker: docker
memory: select_first([machine_mem_gb, 7]) + " GB"
cpu: 8
disks: "local-disk 375 LOCAL"
dx_instance_type: "mem1_ssd1_v2_x8"
maxRetries: 2
}
}
task refine_2x_and_plot {
meta {
Expand Down
1 change: 1 addition & 0 deletions pipes/WDL/workflows/align_and_count.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ workflow align_and_count_report {
description: "Align reads to reference with minimap2 and count the number of hits. Results are returned in the format of 'samtools idxstats'."
author: "Broad Viral Genomics"
email: "[email protected]"
allowNestedInputs: true
}

call reports.align_and_count
Expand Down
1 change: 1 addition & 0 deletions pipes/WDL/workflows/align_and_count_multiple_report.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ workflow align_and_count_multiple_report {
description: "Count the number of times reads map to provided reference sequences. Useful for counting spike-ins, etc."
author: "Broad Viral Genomics"
email: "[email protected]"
allowNestedInputs: true
}

input {
Expand Down
1 change: 1 addition & 0 deletions pipes/WDL/workflows/align_and_plot.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ workflow align_and_plot {
description: "Align reads to reference and produce coverage plots and statistics."
author: "Broad Viral Genomics"
email: "[email protected]"
allowNestedInputs: true
}

call assembly.align_reads as align
Expand Down
98 changes: 39 additions & 59 deletions pipes/WDL/workflows/assemble_denovo.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,17 @@ version 1.0
import "../tasks/tasks_taxon_filter.wdl" as taxon_filter
import "../tasks/tasks_read_utils.wdl" as read_utils
import "../tasks/tasks_assembly.wdl" as assembly
import "../tasks/tasks_intrahost.wdl" as intrahost
import "assemble_refbased.wdl" as assemble_refbased

workflow assemble_denovo {


meta {
description: "Assisted de novo viral genome assembly from raw reads."
author: "Broad Viral Genomics"
email: "[email protected]"
allowNestedInputs: true
}

input {
File reads_unmapped_bam

Expand All @@ -19,18 +26,7 @@ workflow assemble_denovo {
File? filter_to_taxon_db
File trim_clip_db

File? novocraft_license

Boolean call_isnvs=false

String assembler="spades"
Float? scaffold_min_length_fraction
Float? scaffold_min_unambig
Int? scaffold_replace_length=55
Int? nucmer_max_gap
Int? nucmer_min_match
Int? nucmer_min_cluster
Float? scaffold_min_pct_contig_aligned
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".cleaned")
}

parameter_meta {
Expand All @@ -57,8 +53,6 @@ workflow assemble_denovo {
}
}

String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".cleaned")

if(length(deplete_bmtaggerDbs) + length(deplete_blastDbs) + length(deplete_bwaDbs) > 0) {
call taxon_filter.deplete_taxa {
input:
Expand Down Expand Up @@ -87,48 +81,31 @@ workflow assemble_denovo {
reads_unmapped_bam = rmdup_ubam.dedup_bam,
trim_clip_db = trim_clip_db,
always_succeed = true,
assembler = assembler,
sample_name = sample_name
}

call assembly.scaffold {
input:
contigs_fasta = assemble.contigs_fasta,
reads_bam = select_first([filter_to_taxon.taxfilt_bam, deplete_taxa.cleaned_bam, reads_unmapped_bam]),
reference_genome_fasta = reference_genome_fasta,
min_length_fraction = scaffold_min_length_fraction,
min_unambig = scaffold_min_unambig,
replace_length = scaffold_replace_length,
nucmer_max_gap = nucmer_max_gap,
nucmer_min_match = nucmer_min_match,
nucmer_min_cluster = nucmer_min_cluster,
scaffold_min_pct_contig_aligned = scaffold_min_pct_contig_aligned
}

call assembly.refine_2x_and_plot {
input:
assembly_fasta = scaffold.scaffold_fasta,
reads_unmapped_bam = select_first([deplete_taxa.cleaned_bam, reads_unmapped_bam]),
novocraft_license = novocraft_license,
sample_name = sample_name
contigs_fasta = assemble.contigs_fasta,
reads_bam = select_first([filter_to_taxon.taxfilt_bam, deplete_taxa.cleaned_bam, reads_unmapped_bam]),
reference_genome_fasta = reference_genome_fasta
}

if(call_isnvs) {
call intrahost.isnvs_per_sample {
input:
assembly_fasta = refine_2x_and_plot.final_assembly_fasta,
mapped_bam = refine_2x_and_plot.aligned_bam
}
call assemble_refbased.assemble_refbased as refine {
input:
reads_unmapped_bams = [rmdup_ubam.dedup_bam],
reference_fasta = scaffold.scaffold_fasta,
sample_name = sample_name
}

output {
File final_assembly_fasta = refine_2x_and_plot.final_assembly_fasta
File aligned_only_reads_bam = refine_2x_and_plot.aligned_only_reads_bam
File coverage_plot = refine_2x_and_plot.coverage_plot
Int assembly_length = refine_2x_and_plot.assembly_length
Int assembly_length_unambiguous = refine_2x_and_plot.assembly_length_unambiguous
Int reads_aligned = refine_2x_and_plot.reads_aligned
Float mean_coverage = refine_2x_and_plot.mean_coverage
File final_assembly_fasta = refine.assembly_fasta
File aligned_only_reads_bam = refine.align_to_self_merged_aligned_only_bam
File coverage_plot = refine.align_to_self_merged_coverage_plot
Int assembly_length = refine.assembly_length
Int assembly_length_unambiguous = refine.assembly_length_unambiguous
Int reads_aligned = refine.align_to_self_merged_reads_aligned
Float mean_coverage = refine.align_to_self_merged_mean_coverage

File cleaned_bam = select_first([deplete_taxa.cleaned_bam, reads_unmapped_bam])
File? cleaned_fastqc = deplete_taxa.cleaned_fastqc
Expand All @@ -155,22 +132,25 @@ workflow assemble_denovo {
String scaffolding_chosen_ref_name = scaffold.scaffolding_chosen_ref_name
File scaffolding_stats = scaffold.scaffolding_stats
File scaffolding_alt_contigs = scaffold.scaffolding_alt_contigs

Int replicate_concordant_sites = refine.replicate_concordant_sites
Int replicate_discordant_snps = refine.replicate_discordant_snps
Int replicate_discordant_indels = refine.replicate_discordant_indels
Int num_read_groups = refine.num_read_groups
Int num_libraries = refine.num_libraries
File replicate_discordant_vcf = refine.replicate_discordant_vcf

File isnvs_vcf = refine.align_to_self_isnvs_vcf

File? isnvsFile = isnvs_per_sample.isnvsFile

File aligned_bam = refine_2x_and_plot.aligned_bam
File aligned_only_reads_bam_idx = refine_2x_and_plot.aligned_only_reads_bam_idx
File aligned_only_reads_fastqc = refine_2x_and_plot.aligned_only_reads_fastqc
File coverage_tsv = refine_2x_and_plot.coverage_tsv
Int read_pairs_aligned = refine_2x_and_plot.read_pairs_aligned
Float bases_aligned = refine_2x_and_plot.bases_aligned
File aligned_bam = refine.align_to_self_merged_aligned_and_unaligned_bam[0]
File aligned_only_reads_fastqc = refine.align_to_ref_per_input_fastqc[0]
File coverage_tsv = refine.align_to_self_merged_coverage_tsv
Int read_pairs_aligned = refine.align_to_self_merged_read_pairs_aligned
Float bases_aligned = refine.align_to_self_merged_bases_aligned

String? deplete_viral_classify_version = deplete_taxa.viralngs_version
String? taxfilt_viral_classify_version = filter_to_taxon.viralngs_version
String assemble_viral_assemble_version = assemble.viralngs_version
String scaffold_viral_assemble_version = scaffold.viralngs_version
String refine_viral_assemble_version = refine_2x_and_plot.viralngs_version
String? isnvs_viral_phylo_version = isnvs_per_sample.viralngs_version
}

}
19 changes: 18 additions & 1 deletion pipes/WDL/workflows/assemble_refbased.wdl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
version 1.0

import "../tasks/tasks_assembly.wdl" as assembly
import "../tasks/tasks_intrahost.wdl" as intrahost
import "../tasks/tasks_reports.wdl" as reports
import "../tasks/tasks_read_utils.wdl" as read_utils

Expand Down Expand Up @@ -108,6 +109,12 @@ workflow assemble_refbased {
}
File aligned_trimmed_bam = select_first([merge_align_to_ref.out_bam, ivar_trim.aligned_trimmed_bam[0]])

call intrahost.lofreq as isnvs_ref {
input:
reference_fasta = reference_fasta,
aligned_bam = aligned_trimmed_bam
}

call reports.alignment_metrics {
input:
aligned_bam = aligned_trimmed_bam,
Expand Down Expand Up @@ -158,6 +165,12 @@ workflow assemble_refbased {
}
File aligned_self_bam = select_first([merge_align_to_self.out_bam, align_to_self.aligned_only_reads_bam[0]])

call intrahost.lofreq as isnvs_self {
input:
reference_fasta = call_consensus.refined_assembly_fasta,
aligned_bam = aligned_self_bam
}

call reports.plot_coverage as plot_self_coverage {
input:
aligned_reads_bam = aligned_self_bam,
Expand Down Expand Up @@ -198,18 +211,22 @@ workflow assemble_refbased {
Int align_to_ref_merged_reads_aligned = plot_ref_coverage.reads_aligned
Int align_to_ref_merged_read_pairs_aligned = plot_ref_coverage.read_pairs_aligned
Float align_to_ref_merged_bases_aligned = plot_ref_coverage.bases_aligned
File align_to_ref_isnvs_vcf = isnvs_ref.report_vcf

File picard_metrics_wgs = alignment_metrics.wgs_metrics
File picard_metrics_alignment = alignment_metrics.alignment_metrics
File picard_metrics_insert_size = alignment_metrics.insert_size_metrics


Array[File] align_to_self_merged_aligned_and_unaligned_bam = align_to_self.aligned_bam

File align_to_self_merged_aligned_only_bam = aligned_self_bam
File align_to_self_merged_coverage_plot = plot_self_coverage.coverage_plot
File align_to_self_merged_coverage_tsv = plot_self_coverage.coverage_tsv
Int align_to_self_merged_reads_aligned = plot_self_coverage.reads_aligned
Int align_to_self_merged_read_pairs_aligned = plot_self_coverage.read_pairs_aligned
Float align_to_self_merged_bases_aligned = plot_self_coverage.bases_aligned
Float align_to_self_merged_mean_coverage = plot_self_coverage.mean_coverage
File align_to_self_isnvs_vcf = isnvs_self.report_vcf

String align_to_ref_viral_core_version = align_to_ref.viralngs_version[0]
String ivar_version = ivar_trim.ivar_version[0]
Expand Down
1 change: 1 addition & 0 deletions pipes/WDL/workflows/classify_kraken2.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ workflow classify_kraken2 {
description: "Taxonomic classification of sequences via kraken2 (or kraken2x, depending on the database provided)."
author: "Broad Viral Genomics"
email: "[email protected]"
allowNestedInputs: true
}

call metagenomics.kraken2
Expand Down
1 change: 1 addition & 0 deletions pipes/WDL/workflows/classify_single.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ workflow classify_single {
description: "Runs raw reads through taxonomic classification (Kraken2), human read depletion (based on Kraken2), de novo assembly (SPAdes), and FASTQC/multiQC of reads."
author: "Broad Viral Genomics"
email: "[email protected]"
allowNestedInputs: true
}

input {
Expand Down
1 change: 1 addition & 0 deletions pipes/WDL/workflows/demux_only.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ workflow demux_only {
description: "Picard-based demultiplexing and basecalling from a tarball of a raw BCL directory."
author: "Broad Viral Genomics"
email: "[email protected]"
allowNestedInputs: true
}

input {
Expand Down
1 change: 1 addition & 0 deletions pipes/WDL/workflows/demux_plus.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ workflow demux_plus {
description: "Picard-based demultiplexing and basecalling from a tarball of a raw BCL directory, followed by basic metagenomics and QC metrics. Intended for automatic triggering post upload on DNAnexus."
author: "Broad Viral Genomics"
email: "[email protected]"
allowNestedInputs: true
}

input {
Expand Down
1 change: 1 addition & 0 deletions pipes/WDL/workflows/fastq_to_ubam.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ workflow fastq_to_ubam {
description: "Convert reads from fastq format (single or paired) to unaligned BAM format."
author: "Broad Viral Genomics"
email: "[email protected]"
allowNestedInputs: true
}

call tasks_read_utils.FastqToUBAM
Expand Down
1 change: 1 addition & 0 deletions pipes/WDL/workflows/fetch_sra_to_bam.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ workflow fetch_sra_to_bam {
description: "Retrieve reads from the NCBI Short Read Archive in unaligned BAM format with relevant metadata encoded."
author: "Broad Viral Genomics"
email: "[email protected]"
allowNestedInputs: true
}

call ncbi_tools.Fetch_SRA_to_BAM
Expand Down
1 change: 1 addition & 0 deletions pipes/WDL/workflows/genbank.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ workflow genbank {
description: "Prepare assemblies for Genbank submission. This includes annotation by simple coordinate transfer from Genbank annotations and a multiple alignment. See https://viral-pipelines.readthedocs.io/en/latest/ncbi_submission.html for details."
author: "Broad Viral Genomics"
email: "[email protected]"
allowNestedInputs: true
}

input {
Expand Down
Loading

0 comments on commit a47d310

Please sign in to comment.