Skip to content

Commit

Permalink
Merge pull request #423 from broadinstitute/dp-denovo-update
Browse files Browse the repository at this point in the history
new metagenomic_denovo workflow
  • Loading branch information
dpark01 authored May 29, 2022
2 parents a47d310 + 204570e commit 017370c
Show file tree
Hide file tree
Showing 6 changed files with 274 additions and 7 deletions.
5 changes: 5 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,11 @@ workflows:
primaryDescriptorPath: /pipes/WDL/workflows/merge_vcfs.wdl
testParameterFiles:
- empty.json
- name: metagenomic_denovo
subclass: WDL
primaryDescriptorPath: /pipes/WDL/workflows/metagenomic_denovo.wdl
testParameterFiles:
- empty.json
- name: multiqc_only
subclass: WDL
primaryDescriptorPath: /pipes/WDL/workflows/multiqc_only.wdl
Expand Down
8 changes: 4 additions & 4 deletions pipes/WDL/tasks/tasks_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ task assemble {
File reads_unmapped_bam
File trim_clip_db

Int? spades_n_reads = 10000000
Int? spades_min_contig_len = 0
Int spades_n_reads = 10000000
Int spades_min_contig_len = 0

String assembler = "spades"
Boolean always_succeed = false
Expand Down Expand Up @@ -70,10 +70,10 @@ task scaffold {
File reads_bam
Array[File]+ reference_genome_fasta

String? aligner="muscle"
String aligner="muscle"
Float? min_length_fraction
Float? min_unambig
Int? replace_length=55
Int replace_length=55

Int? nucmer_max_gap
Int? nucmer_min_match
Expand Down
3 changes: 2 additions & 1 deletion pipes/WDL/tasks/tasks_read_utils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ task FastqToUBAM {
String? readgroup_name
String? platform_unit
String? run_date
String? platform_name
String platform_name
String? sequencing_center
String docker = "quay.io/broadinstitute/viral-core:2.1.33"
Expand All @@ -323,6 +323,7 @@ task FastqToUBAM {
fastq_2: { description: "Unaligned read2 file in fastq format. This should be empty for single-end read conversion and required for paired-end reads. If provided, it must match fastq_1 in length and order.", patterns: ["*.fastq", "*.fastq.gz", "*.fq", "*.fq.gz"] }
sample_name: { description: "Sample name. This is required and will populate the 'SM' read group value and will be used as the output filename (must be filename-friendly)." }
library_name: { description: "Library name. This is required and will populate the 'LB' read group value. SM & LB combinations must be identical for any sequencing reads generated from the same sequencing library, and must be distinct for any reads generated from different libraries." }
platform_name: { description: "Sequencing platform. This is required and will populate the 'PL' read group value. Must be one of CAPILLARY, DNBSEQ, HELICOS, ILLUMINA, IONTORRENT, LS454, ONT, PACBIO, or SOLID." }
}
command {
set -ex -o pipefail
Expand Down
1 change: 0 additions & 1 deletion pipes/WDL/workflows/classify_single.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ version 1.0

import "../tasks/tasks_metagenomics.wdl" as metagenomics
import "../tasks/tasks_read_utils.wdl" as read_utils
import "../tasks/tasks_taxon_filter.wdl" as taxon_filter
import "../tasks/tasks_assembly.wdl" as assembly
import "../tasks/tasks_reports.wdl" as reports
Expand Down
261 changes: 261 additions & 0 deletions pipes/WDL/workflows/metagenomic_denovo.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
version 1.0

import "../tasks/tasks_taxon_filter.wdl" as taxon_filter
import "../tasks/tasks_read_utils.wdl" as read_utils
import "../tasks/tasks_assembly.wdl" as assembly
import "../tasks/tasks_metagenomics.wdl" as metagenomics
import "../tasks/tasks_reports.wdl" as reports
import "assemble_refbased.wdl" as assemble_refbased
workflow metagenomic_denovo {

meta {
description: "Assisted de novo viral genome assembly (SPAdes, scaffolding, and polishing) from metagenomic raw reads. Runs raw reads through taxonomic classification (Kraken2), human read depletion (based on Kraken2 and optionally using BWA, BLASTN, and/or BMTAGGER databases), and FASTQC/multiQC of reads."
author: "Broad Viral Genomics"
email: "[email protected]"
allowNestedInputs: true
}

input {
String sample_name
File fastq_1
File? fastq_2
String library_name="1"
String? run_date_iso
String sequencing_platform

Array[File]+ reference_genome_fasta

File kraken2_db_tgz
File krona_taxonomy_tab
File ncbi_taxdump_tgz

Array[File] deplete_bmtaggerDbs = []
Array[File] deplete_blastDbs = []
Array[File] deplete_bwaDbs =[]

Array[String] taxa_to_dehost = ["Vertebrata"]
Array[String] taxa_to_avoid_assembly = ["Vertebrata", "other sequences", "Bacteria"]

File? filter_to_taxon_db
File? spikein_db

File trim_clip_db
}

parameter_meta {
fastq_1: { description: "Unaligned read1 file in fastq format", patterns: ["*.fastq", "*.fastq.gz", "*.fq", "*.fq.gz"] }
fastq_2: { description: "Unaligned read2 file in fastq format. This should be empty for single-end read conversion and required for paired-end reads. If provided, it must match fastq_1 in length and order.", patterns: ["*.fastq", "*.fastq.gz", "*.fq", "*.fq.gz"] }
sample_name: { description: "Sample name. This is required and will populate the 'SM' read group value and will be used as the output filename (must be filename-friendly)." }
sequencing_platform: { description: "Sequencing platform. This is required and will populate the 'PL' read group value. Must be one of CAPILLARY, DNBSEQ, HELICOS, ILLUMINA, IONTORRENT, LS454, ONT, PACBIO, or SOLID." }

reference_genome_fasta: {
description: "After denovo assembly, large contigs are scaffolded against a reference genome to determine orientation and to join contigs together, before further polishing by reads. You must supply at least one reference genome (all segments/chromomes in a single fasta file). If more than one reference is provided, contigs will be scaffolded against all of them and the one with the most complete assembly will be chosen for downstream polishing.",
patterns: ["*.fasta"]
}
deplete_bmtaggerDbs: {
description: "Optional list of databases to use for bmtagger-based depletion. Sequences in fasta format will be indexed on the fly, pre-bmtagger-indexed databases may be provided as tarballs.",
patterns: ["*.fasta", "*.fasta.gz", "*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"]
}
deplete_blastDbs: {
description: "Optional list of databases to use for blastn-based depletion. Sequences in fasta format will be indexed on the fly, pre-blast-indexed databases may be provided as tarballs.",
patterns: ["*.fasta", "*.fasta.gz", "*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"]
}
deplete_bwaDbs: {
description: "Optional list of databases to use for bwa mem-based depletion. Sequences in fasta format will be indexed on the fly, pre-bwa-indexed databases may be provided as tarballs.",
patterns: ["*.fasta", "*.fasta.gz", "*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"]
}
filter_to_taxon_db: {
description: "Optional database to use to filter read set to those that match by LASTAL. Sequences in fasta format will be indexed on the fly.",
patterns: ["*.fasta"]
}
spikein_db: {
description: "ERCC/SDSI spike-in sequences",
patterns: ["*.fasta", "*.fasta.gz", "*.fasta.zst"]
}
trim_clip_db: {
description: "Adapter sequences to remove via trimmomatic prior to SPAdes assembly",
patterns: ["*.fasta", "*.fasta.gz", "*.fasta.zst"]
}
kraken2_db_tgz: {
description: "Pre-built Kraken database tarball containing three files: hash.k2d, opts.k2d, and taxo.k2d.",
patterns: ["*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"]
}
krona_taxonomy_tab: {
description: "Krona taxonomy database containing a single file: taxonomy.tab, or possibly just a compressed taxonomy.tab",
patterns: ["*.tab.zst", "*.tab.gz", "*.tab", "*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"]
}
ncbi_taxdump_tgz: {
description: "An NCBI taxdump.tar.gz file that contains, at the minimum, a nodes.dmp and names.dmp file.",
patterns: ["*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"]
}
}

# bundle 1 or 2 input files along with their metadata
call read_utils.FastqToUBAM {
input:
fastq_1 = fastq_1,
fastq_2 = fastq_2,
sample_name = sample_name,
library_name = library_name,
run_date = run_date_iso,
platform_name = sequencing_platform
}
File reads_bam = FastqToUBAM.unmapped_bam

# metagenomics, QC, etc
call reports.fastqc as fastqc_raw {
input: reads_bam = reads_bam
}
if(defined(spikein_db)) {
call reports.align_and_count as spikein {
input:
reads_bam = reads_bam,
ref_db = select_first([spikein_db])
}
}
call metagenomics.kraken2 as kraken2 {
input:
reads_bam = reads_bam,
kraken2_db_tgz = kraken2_db_tgz,
krona_taxonomy_db_tgz = krona_taxonomy_tab
}

# deplete host reads: kraken2 followed by (optional) bmtagger, blastn, and/or bwa
# resulting read set can be published to SRA
call metagenomics.filter_bam_to_taxa as deplete_k2 {
input:
classified_bam = reads_bam,
classified_reads_txt_gz = kraken2.kraken2_reads_report,
ncbi_taxonomy_db_tgz = ncbi_taxdump_tgz,
exclude_taxa = true,
taxonomic_names = taxa_to_dehost,
out_filename_suffix = "host_depleted"
}
if(length(deplete_bmtaggerDbs) + length(deplete_blastDbs) + length(deplete_bwaDbs) > 0) {
call taxon_filter.deplete_taxa {
input:
raw_reads_unmapped_bam = deplete_k2.bam_filtered_to_taxa,
bmtaggerDbs = deplete_bmtaggerDbs,
blastDbs = deplete_blastDbs,
bwaDbs = deplete_bwaDbs
}
}
File dehosted_bam = select_first([deplete_taxa.cleaned_bam, deplete_k2.bam_filtered_to_taxa])
call reports.fastqc as fastqc_dehosted {
input: reads_bam = dehosted_bam
}

# taxonomically focus/filter read set: remove Bacterial and artificial taxa (kraken2) and optionally filter to desired LASTAL database
call metagenomics.filter_bam_to_taxa as filter_acellular {
input:
classified_bam = dehosted_bam,
classified_reads_txt_gz = kraken2.kraken2_reads_report,
ncbi_taxonomy_db_tgz = ncbi_taxdump_tgz,
exclude_taxa = true,
taxonomic_names = taxa_to_avoid_assembly,
out_filename_suffix = "acellular"
}
if(defined(filter_to_taxon_db)) {
call taxon_filter.filter_to_taxon {
input:
reads_unmapped_bam = dehosted_bam,
lastal_db_fasta = select_first([filter_to_taxon_db])
}
}
File taxfiltered_bam = select_first([filter_to_taxon.taxfilt_bam, dehosted_bam])
call reports.fastqc as fastqc_taxfilt {
input: reads_bam = taxfiltered_bam
}

# alignment-free duplicate removal
call read_utils.rmdup_ubam {
input:
reads_unmapped_bam = taxfiltered_bam
}

# denovo assembly (with taxfiltered/rmdup reads)
call assembly.assemble {
input:
reads_unmapped_bam = rmdup_ubam.dedup_bam,
trim_clip_db = trim_clip_db,
always_succeed = true,
sample_name = sample_name
}

# scaffold contigs to one or more ref genomes and impute with reference
call assembly.scaffold {
input:
contigs_fasta = assemble.contigs_fasta,
reads_bam = dehosted_bam,
reference_genome_fasta = reference_genome_fasta
}

# polish/refine with dehosted reads
call assemble_refbased.assemble_refbased as refine {
input:
reads_unmapped_bams = [dehosted_bam],
reference_fasta = scaffold.scaffold_fasta,
sample_name = sample_name
}

output {
File final_assembly_fasta = refine.assembly_fasta
File aligned_only_reads_bam = refine.align_to_self_merged_aligned_only_bam
File coverage_plot = refine.align_to_self_merged_coverage_plot
Int assembly_length = refine.assembly_length
Int assembly_length_unambiguous = refine.assembly_length_unambiguous
Int reads_aligned = refine.align_to_self_merged_reads_aligned
Float mean_coverage = refine.align_to_self_merged_mean_coverage

File kraken2_summary_report = kraken2.kraken2_summary_report
File kraken2_krona_plot = kraken2.krona_report_html

File raw_unmapped_bam = reads_bam
File depleted_bam = dehosted_bam
File taxfilt_bam = taxfiltered_bam
File dedup_bam = rmdup_ubam.dedup_bam
File denovo_in_bam = assemble.subsampBam

Int read_counts_raw = deplete_k2.classified_taxonomic_filter_read_count_pre
Int read_counts_depleted = select_first([deplete_taxa.depletion_read_count_post, deplete_k2.classified_taxonomic_filter_read_count_post])
Int read_counts_taxfilt = select_first([filter_to_taxon.filter_read_count_post, filter_acellular.classified_taxonomic_filter_read_count_post])
Int read_counts_dedup = rmdup_ubam.dedup_read_count_post
Int read_counts_denovo_in = assemble.subsample_read_count

File raw_fastqc = fastqc_raw.fastqc_html
File depleted_fastqc = fastqc_dehosted.fastqc_html
File taxfilt_fastqc = fastqc_taxfilt.fastqc_html
File dedup_fastqc = rmdup_ubam.dedup_fastqc

File contigs_fasta = assemble.contigs_fasta
File scaffold_fasta = scaffold.scaffold_fasta
File intermediate_scaffold_fasta = scaffold.intermediate_scaffold_fasta
File intermediate_gapfill_fasta = scaffold.intermediate_gapfill_fasta
Int assembly_preimpute_length = scaffold.assembly_preimpute_length
Int assembly_preimpute_length_unambiguous = scaffold.assembly_preimpute_length_unambiguous
String scaffolding_chosen_ref_name = scaffold.scaffolding_chosen_ref_name
File scaffolding_stats = scaffold.scaffolding_stats
File scaffolding_alt_contigs = scaffold.scaffolding_alt_contigs

Int replicate_concordant_sites = refine.replicate_concordant_sites
Int replicate_discordant_snps = refine.replicate_discordant_snps
Int replicate_discordant_indels = refine.replicate_discordant_indels
Int num_read_groups = refine.num_read_groups
Int num_libraries = refine.num_libraries
File replicate_discordant_vcf = refine.replicate_discordant_vcf

File isnvs_vcf = refine.align_to_self_isnvs_vcf

File aligned_bam = refine.align_to_self_merged_aligned_and_unaligned_bam[0]
File aligned_only_reads_fastqc = refine.align_to_ref_per_input_fastqc[0]
File coverage_tsv = refine.align_to_self_merged_coverage_tsv
Int read_pairs_aligned = refine.align_to_self_merged_read_pairs_aligned
Float bases_aligned = refine.align_to_self_merged_bases_aligned

File? spikein_hits = spikein.report

String viral_classify_version = kraken2.viralngs_version
String viral_assemble_version = assemble.viralngs_version
}
}
3 changes: 2 additions & 1 deletion test/input/WDL/test_inputs-fastq_to_ubam-local.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@
"fastq_to_ubam.FastqToUBAM.fastq_1": "test/input/in1.fastq",
"fastq_to_ubam.FastqToUBAM.fastq_2": "test/input/in2.fastq",
"fastq_to_ubam.FastqToUBAM.sample_name": "mysample",
"fastq_to_ubam.FastqToUBAM.library_name": "mylib"
"fastq_to_ubam.FastqToUBAM.library_name": "mylib",
"fastq_to_ubam.FastqToUBAM.platform_name": "ILLUMINA"
}

0 comments on commit 017370c

Please sign in to comment.