Skip to content

Commit

Permalink
Merge pull request #443 from broadinstitute/dp-denovo
Browse files Browse the repository at this point in the history
genbank prep and assemble_denovo updates
  • Loading branch information
dpark01 authored Dec 13, 2022
2 parents fb1ad41 + 6484b9e commit d4c3ff0
Show file tree
Hide file tree
Showing 30 changed files with 344 additions and 199 deletions.
4 changes: 2 additions & 2 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -279,9 +279,9 @@ workflows:
primaryDescriptorPath: /pipes/WDL/workflows/sarscov2_lineages.wdl
testParameterFiles:
- /test/input/WDL/test_inputs-sarscov2_lineages-local.json
- name: read_depths
- name: calc_bam_read_depths
subclass: WDL
primaryDescriptorPath: /pipes/WDL/workflows/read_depths.wdl
primaryDescriptorPath: /pipes/WDL/workflows/calc_bam_read_depths.wdl
testParameterFiles:
- empty.json
- name: sarscov2_gisaid_ingest
Expand Down
46 changes: 21 additions & 25 deletions pipes/WDL/tasks/tasks_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,16 @@ task assemble {
File trim_clip_db

Int spades_n_reads = 10000000
Int spades_min_contig_len = 0
Int? spades_min_contig_len
String? spades_options

String assembler = "spades"
Boolean always_succeed = false

# do this in two steps in case the input doesn't actually have "taxfilt" in the name
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt")

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-assemble:2.1.20.2"
String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0"
}

Int disk_size = 375
Expand All @@ -30,28 +29,23 @@ task assemble {

assembly.py --version | tee VERSION

if [[ "~{assembler}" == "spades" ]]; then
assembly.py assemble_spades \
~{reads_unmapped_bam} \
~{trim_clip_db} \
~{sample_name}.assembly1-~{assembler}.fasta \
~{'--nReads=' + spades_n_reads} \
~{true="--alwaysSucceed" false="" always_succeed} \
~{'--minContigLen=' + spades_min_contig_len} \
~{'--spadesOpts="' + spades_options + '"'} \
--memLimitGb $mem_in_gb \
--outReads=~{sample_name}.subsamp.bam \
--loglevel=DEBUG
else
echo "unrecognized assembler ~{assembler}" >&2
exit 1
fi
assembly.py assemble_spades \
~{reads_unmapped_bam} \
~{trim_clip_db} \
~{sample_name}.assembly1-spades.fasta \
~{'--nReads=' + spades_n_reads} \
~{true="--alwaysSucceed" false="" always_succeed} \
~{'--minContigLen=' + spades_min_contig_len} \
~{'--spadesOpts="' + spades_options + '"'} \
--memLimitGb $mem_in_gb \
--outReads=~{sample_name}.subsamp.bam \
--loglevel=DEBUG

samtools view -c ~{sample_name}.subsamp.bam | tee subsample_read_count >&2
}

output {
File contigs_fasta = "~{sample_name}.assembly1-~{assembler}.fasta"
File contigs_fasta = "~{sample_name}.assembly1-spades.fasta"
File subsampBam = "~{sample_name}.subsamp.bam"
Int subsample_read_count = read_int("subsample_read_count")
String viralngs_version = read_string("VERSION")
Expand Down Expand Up @@ -83,10 +77,11 @@ task scaffold {
Int? nucmer_max_gap
Int? nucmer_min_match
Int? nucmer_min_cluster
Int? scaffold_min_contig_len
Float? scaffold_min_pct_contig_aligned

Int? machine_mem_gb
String docker="quay.io/broadinstitute/viral-assemble:2.1.20.2"
String docker="quay.io/broadinstitute/viral-assemble:2.1.33.0"

# do this in multiple steps in case the input doesn't actually have "assembly1-x" in the name
String sample_name = basename(basename(contigs_fasta, ".fasta"), ".assembly1-spades")
Expand All @@ -106,6 +101,7 @@ task scaffold {
~{contigs_fasta} \
~{sep=' ' reference_genome_fasta} \
~{sample_name}.intermediate_scaffold.fasta \
~{'--min_contig_len=' + scaffold_min_contig_len} \
~{'--maxgap=' + nucmer_max_gap} \
~{'--minmatch=' + nucmer_min_match} \
~{'--mincluster=' + nucmer_min_cluster} \
Expand All @@ -115,7 +111,7 @@ task scaffold {
--outAlternateContigs ~{sample_name}.scaffolding_alt_contigs.fasta \
--loglevel=DEBUG

grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | cut -c 2- | tr '\n' '\t' > ~{sample_name}.scaffolding_chosen_ref.txt
grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | cut -c 2- | cut -f 1 -d ' ' > ~{sample_name}.scaffolding_chosen_refs.txt

assembly.py gapfill_gap2seq \
~{sample_name}.intermediate_scaffold.fasta \
Expand Down Expand Up @@ -146,7 +142,7 @@ task scaffold {
File intermediate_gapfill_fasta = "~{sample_name}.intermediate_gapfill.fasta"
Int assembly_preimpute_length = read_int("assembly_preimpute_length")
Int assembly_preimpute_length_unambiguous = read_int("assembly_preimpute_length_unambiguous")
String scaffolding_chosen_ref_name = read_string("~{sample_name}.scaffolding_chosen_ref.txt")
Array[String] scaffolding_chosen_ref_names = read_lines("~{sample_name}.scaffolding_chosen_refs.txt")
File scaffolding_chosen_ref = "~{sample_name}.scaffolding_chosen_ref.fasta"
File scaffolding_stats = "~{sample_name}.scaffolding_stats.txt"
File scaffolding_alt_contigs = "~{sample_name}.scaffolding_alt_contigs.fasta"
Expand Down Expand Up @@ -428,7 +424,7 @@ task refine_assembly_with_aligned_reads {
Int min_coverage = 3
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-assemble:2.1.20.2"
String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0"
}
Int disk_size = 375
Expand Down Expand Up @@ -538,7 +534,7 @@ task refine_2x_and_plot {
String? plot_coverage_novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100 -k"
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-assemble:2.1.20.2"
String docker = "quay.io/broadinstitute/viral-assemble:2.1.33.0"
# do this in two steps in case the input doesn't actually have "cleaned" in the name
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".cleaned")
Expand Down
6 changes: 3 additions & 3 deletions pipes/WDL/tasks/tasks_interhost.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ task multi_align_mafft_ref {
Float? mafft_gapOpeningPenalty

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
}

String fasta_basename = basename(reference_fasta, '.fasta')
Expand Down Expand Up @@ -56,7 +56,7 @@ task multi_align_mafft {
Float? mafft_gapOpeningPenalty

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
}

Int disk_size = 200
Expand Down Expand Up @@ -282,7 +282,7 @@ task merge_vcfs_gatk {
File ref_fasta

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"

String output_prefix = "merged"
}
Expand Down
6 changes: 3 additions & 3 deletions pipes/WDL/tasks/tasks_intrahost.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ task isnvs_per_sample {
Boolean removeDoublyMappedReads = true

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"

String sample_name = basename(basename(basename(mapped_bam, ".bam"), ".all"), ".mapped")
}
Expand Down Expand Up @@ -222,7 +222,7 @@ task isnvs_vcf {
Boolean naiveFilter = false

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
}

parameter_meta {
Expand Down Expand Up @@ -296,7 +296,7 @@ task annotate_vcf_snpeff {
String? emailAddress

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"

String output_basename = basename(basename(in_vcf, ".gz"), ".vcf")
}
Expand Down
16 changes: 8 additions & 8 deletions pipes/WDL/tasks/tasks_metagenomics.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ task krakenuniq {
File krona_taxonomy_db_tgz # taxonomy.tab
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
}

Int disk_size = 750
Expand Down Expand Up @@ -140,7 +140,7 @@ task build_krakenuniq_db {
Int? zstd_compression_level

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
}

Int disk_size = 750
Expand Down Expand Up @@ -210,7 +210,7 @@ task kraken2 {
Int? min_base_qual

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
}

parameter_meta {
Expand Down Expand Up @@ -345,7 +345,7 @@ task build_kraken2_db {
Int? zstd_compression_level

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
}

Int disk_size = 750
Expand Down Expand Up @@ -487,7 +487,7 @@ task blastx {
File krona_taxonomy_db_tgz

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
}

parameter_meta {
Expand Down Expand Up @@ -577,7 +577,7 @@ task krona {
Int? magnitude_column

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
}

Int disk_size = 50
Expand Down Expand Up @@ -684,7 +684,7 @@ task filter_bam_to_taxa {
String out_filename_suffix = "filtered"

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
}

String out_basename = basename(classified_bam, ".bam") + "." + out_filename_suffix
Expand Down Expand Up @@ -771,7 +771,7 @@ task kaiju {
File krona_taxonomy_db_tgz # taxonomy/taxonomy.tab
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0"
}

String input_basename = basename(reads_unmapped_bam, ".bam")
Expand Down
30 changes: 16 additions & 14 deletions pipes/WDL/tasks/tasks_ncbi.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ task download_fasta {
Array[String]+ accessions
String emailAddress

String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
}

command {
Expand Down Expand Up @@ -38,27 +38,29 @@ task download_annotations {
String emailAddress
String combined_out_prefix

String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
}

command {
command <<<
set -ex -o pipefail
ncbi.py --version | tee VERSION
ncbi.py fetch_feature_tables \
${emailAddress} \
~{emailAddress} \
./ \
${sep=' ' accessions} \
~{sep=' ' accessions} \
--loglevel DEBUG
mkdir -p combined
ncbi.py fetch_fastas \
${emailAddress} \
~{emailAddress} \
./ \
${sep=' ' accessions} \
--combinedFilePrefix "${combined_out_prefix}" \
~{sep=' ' accessions} \
--combinedFilePrefix "combined/~{combined_out_prefix}" \
--forceOverwrite \
--loglevel DEBUG
}
>>>

output {
File combined_fasta = "${combined_out_prefix}.fasta"
File combined_fasta = "~{combined_out_prefix}.fasta"
Array[File] genomes_fasta = glob("*.fasta")
Array[File] features_tbl = glob("*.tbl")
String viralngs_version = read_string("VERSION")
Expand All @@ -83,7 +85,7 @@ task annot_transfer {
File reference_fasta
Array[File]+ reference_feature_table

String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
}

parameter_meta {
Expand Down Expand Up @@ -137,7 +139,7 @@ task align_and_annot_transfer_single {
Array[File]+ reference_fastas
Array[File]+ reference_feature_tables

String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
}

parameter_meta {
Expand Down Expand Up @@ -564,7 +566,7 @@ task biosample_to_genbank {
File? filter_to_ids
Boolean s_dropout_note = true
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
}
String base = basename(biosample_attributes, ".txt")
command {
Expand Down Expand Up @@ -730,7 +732,7 @@ task prepare_genbank {
String? assembly_method_version
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.0"
String docker = "quay.io/broadinstitute/viral-phylo:2.1.20.2"
}
parameter_meta {
Expand Down
30 changes: 30 additions & 0 deletions pipes/WDL/tasks/tasks_ncbi_tools.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,36 @@ task Fetch_SRA_to_BAM {
}
}
# Look up a single GenBank nucleotide accession with NCBI EDirect and report
# the qualifiers of the first feature in its feature table, plus the taxid and
# organism name derived from those qualifiers.
#
# Inputs:
#   genbank_accession  accession string passed to `esearch -db nuccore`
#   docker             image providing esearch/efetch/jq
# Outputs:
#   metadata   qualifier name -> value map read from <accession>.metadata.json
#   taxid      value of the `taxon:` db_xref (empty if the record has none)
#   organism   value of the `organism` qualifier
task fetch_genbank_metadata {
  input {
    String genbank_accession
    String docker = "quay.io/broadinstitute/ncbi-tools:2.10.7.10"
  }
  Int disk_size = 50
  command <<<
    # pipefail: without it, a failed esearch (or jq) is hidden by the exit
    # status of the last stage of its pipeline and the task carries on with an
    # empty or partial file, only to fail later when outputs are evaluated.
    set -e -o pipefail
    source /opt/miniconda/bin/activate $CONDA_DEFAULT_ENV # for miniwdl / non-login docker runners

    # Fetch the full GenBank record as JSON.
    esearch -db nuccore -q "~{genbank_accession}" | efetch -db nuccore -format gb -mode xml -json > gb.json

    # Flatten the qualifiers of the first feature (GBFeature[0]) into a single
    # {name: value} object. jq's `add` keeps the last value when a qualifier
    # name repeats.
    # NOTE(review): GBFeature[0] is presumably the `source` feature -- confirm.
    jq -r '[.GBSet.GBSeq."GBSeq_feature-table".GBFeature[0].GBFeature_quals.GBQualifier|.[]|{(.GBQualifier_name): .GBQualifier_value}]|add ' gb.json > "~{genbank_accession}".metadata.json

    # Extract the numeric part of a "taxon:NNNN" db_xref. grep exits non-zero
    # when nothing matches; tolerate that so a record without a taxon xref
    # still produces an empty taxid.txt, exactly as it did before pipefail.
    jq -r '.db_xref' "~{genbank_accession}".metadata.json | { grep '^taxon:' || true; } | cut -f 2 -d : > taxid.txt

    jq -r '.organism' "~{genbank_accession}".metadata.json > organism.txt
  >>>
  output {
    Map[String,String] metadata = read_json("~{genbank_accession}.metadata.json")
    String taxid = read_string("taxid.txt")
    String organism = read_string("organism.txt")
  }
  runtime {
    cpu: 1
    memory: "1 GB"
    disks: "local-disk " + disk_size + " LOCAL"
    disk: disk_size + " GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x2"
    docker: docker
    maxRetries: 2
  }
}
task biosample_tsv_filter_preexisting {
input {
File meta_submit_tsv
Expand Down
Loading

0 comments on commit d4c3ff0

Please sign in to comment.