Skip to content

Commit

Permalink
Merge pull request #547 from broadinstitute/dp-scaffold
Browse files Browse the repository at this point in the history
scaffolding regression fixes plus docker updates
  • Loading branch information
dpark01 authored Aug 4, 2024
2 parents 05119b5 + 6148cec commit e28bec4
Show file tree
Hide file tree
Showing 16 changed files with 173 additions and 82 deletions.
8 changes: 4 additions & 4 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@
sys.path.insert(0, os.path.dirname(os.path.abspath('.')))

# -- Mock out the heavyweight pip packages, esp those that require C ----
import mock
MOCK_MODULES = []
for mod_name in MOCK_MODULES:
sys.modules[mod_name] = mock.Mock()
#import mock
#MOCK_MODULES = []
#for mod_name in MOCK_MODULES:
# sys.modules[mod_name] = mock.Mock()

# -- Obtain GIT version --
def _git_version():
Expand Down
1 change: 0 additions & 1 deletion docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,5 @@ Sphinx==7.4.7 #override sphinx pinning done by RTD: https://docs.readthedocs.io/
sphinx-argparse==0.5.2
sphinx-rtd-theme>=2.0.0
PyYAML==6.0.1
mock==5.0.1
recommonmark
wdl-aid==1.0.0
5 changes: 3 additions & 2 deletions github_actions_ci/install-wdl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@ fetch_jar_from_github () {
ln -s $_jar_fname $_tool_name.jar
}

fetch_jar_from_github broadinstitute cromwell womtool 86
fetch_jar_from_github broadinstitute cromwell cromwell 86
fetch_jar_from_github broadinstitute cromwell womtool 87
fetch_jar_from_github broadinstitute cromwell cromwell 87
fetch_jar_from_github dnanexus dxWDL dxWDL v1.50
fetch_jar_from_github dnanexus dxCompiler dxCompiler 2.11.6

TGZ=dx-toolkit-v0.311.0-ubuntu-20.04-amd64.tar.gz
echo "Fetching $TGZ"
Expand Down
80 changes: 54 additions & 26 deletions pipes/WDL/tasks/tasks_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ task assemble {
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt")

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.4"
String docker = "quay.io/broadinstitute/viral-assemble:2.3.2.0"
}
parameter_meta{
reads_unmapped_bam: {
Expand Down Expand Up @@ -115,7 +115,7 @@ task select_references {
Int? skani_s
Int? skani_c

String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.4"
String docker = "quay.io/broadinstitute/viral-assemble:2.3.2.0"
Int machine_mem_gb = 4
Int cpu = 2
Int disk_size = 100
Expand Down Expand Up @@ -206,7 +206,7 @@ task scaffold {
Float? scaffold_min_pct_contig_aligned
Int? machine_mem_gb
String docker="quay.io/broadinstitute/viral-assemble:2.3.1.4"
String docker="quay.io/broadinstitute/viral-assemble:2.3.2.0"
# do this in multiple steps in case the input doesn't actually have "assembly1-x" in the name
String sample_name = basename(basename(contigs_fasta, ".fasta"), ".assembly1-spades")
Expand Down Expand Up @@ -298,27 +298,55 @@ task scaffold {
~{'-s ' + skani_s} \
~{'-c ' + skani_c} \
--loglevel=DEBUG
CHOSEN_REF_FASTA=$(cut -f 1 "~{sample_name}.refs_skani_dist.full.tsv" | tail +2 | head -1)
cut -f 3 "~{sample_name}.refs_skani_dist.full.tsv" | tail +2 | head -1 > SKANI_ANI
cut -f 4 "~{sample_name}.refs_skani_dist.full.tsv" | tail +2 | head -1 > SKANI_REF_AF
cut -f 5 "~{sample_name}.refs_skani_dist.full.tsv" | tail +2 | head -1 > SKANI_CONTIGS_AF
basename "$CHOSEN_REF_FASTA" .fasta > CHOSEN_REF_BASENAME
assembly.py order_and_orient \
"~{contigs_fasta}" \
"$CHOSEN_REF_FASTA" \
"~{sample_name}".intermediate_scaffold.fasta \
~{'--min_contig_len=' + scaffold_min_contig_len} \
~{'--maxgap=' + nucmer_max_gap} \
~{'--minmatch=' + nucmer_min_match} \
~{'--mincluster=' + nucmer_min_cluster} \
~{'--min_pct_contig_aligned=' + scaffold_min_pct_contig_aligned} \
--outReference "~{sample_name}".scaffolding_chosen_ref.fasta \
--outStats "~{sample_name}".scaffolding_stats.txt \
--outAlternateContigs ~{sample_name}.scaffolding_alt_contigs.fasta \
~{true='--allow_incomplete_output' false="" allow_incomplete_output} \
--loglevel=DEBUG
# sometimes skani fails; if so, just fall back to sending all refs downstream
if [[ $(wc -l <"~{sample_name}.refs_skani_dist.full.tsv") -ge 2 ]]; then
# skani reference selection worked: just try one reference
CHOSEN_REF_FASTA=$(cut -f 1 "~{sample_name}.refs_skani_dist.full.tsv" | tail +2 | head -1)
basename "$CHOSEN_REF_FASTA" .fasta > CHOSEN_REF_BASENAME
assembly.py order_and_orient \
"~{contigs_fasta}" \
"$CHOSEN_REF_FASTA" \
"~{sample_name}".intermediate_scaffold.fasta \
~{'--min_contig_len=' + scaffold_min_contig_len} \
~{'--maxgap=' + nucmer_max_gap} \
~{'--minmatch=' + nucmer_min_match} \
~{'--mincluster=' + nucmer_min_cluster} \
~{'--min_pct_contig_aligned=' + scaffold_min_pct_contig_aligned} \
--outReference "~{sample_name}".scaffolding_chosen_ref.fasta \
--outStats "~{sample_name}".scaffolding_stats.txt \
--outAlternateContigs ~{sample_name}.scaffolding_alt_contigs.fasta \
~{true='--allow_incomplete_output' false="" allow_incomplete_output} \
--loglevel=DEBUG
cut -f 3 "~{sample_name}.refs_skani_dist.full.tsv" | tail +2 | head -1 > SKANI_ANI
cut -f 4 "~{sample_name}.refs_skani_dist.full.tsv" | tail +2 | head -1 > SKANI_REF_AF
cut -f 5 "~{sample_name}.refs_skani_dist.full.tsv" | tail +2 | head -1 > SKANI_CONTIGS_AF
else
# skani reference selection failed: try all references
echo "0" > SKANI_ANI
echo "0" > SKANI_REF_AF
echo "0" > SKANI_CONTIGS_AF
echo "" > CHOSEN_REF_BASENAME
assembly.py order_and_orient \
"~{contigs_fasta}" \
"~{sep='" "' reference_genome_fasta}" \
"~{sample_name}".intermediate_scaffold.fasta \
~{'--min_contig_len=' + scaffold_min_contig_len} \
~{'--maxgap=' + nucmer_max_gap} \
~{'--minmatch=' + nucmer_min_match} \
~{'--mincluster=' + nucmer_min_cluster} \
~{'--min_pct_contig_aligned=' + scaffold_min_pct_contig_aligned} \
--outReference "~{sample_name}".scaffolding_chosen_ref.fasta \
--outStats "~{sample_name}".scaffolding_stats.txt \
--outAlternateContigs ~{sample_name}.scaffolding_alt_contigs.fasta \
~{true='--allow_incomplete_output' false="" allow_incomplete_output} \
--loglevel=DEBUG
fi
grep '^>' "~{sample_name}".scaffolding_chosen_ref.fasta | cut -c 2- | cut -f 1 -d ' ' > "~{sample_name}".scaffolding_chosen_refs.txt
assembly.py gapfill_gap2seq \
Expand Down Expand Up @@ -555,7 +583,7 @@ task align_reads {
Boolean skip_mark_dupes = false
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-core:2.3.1"
String docker = "quay.io/broadinstitute/viral-core:2.3.2"
String sample_name = basename(basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt"), ".clean")
}
Expand Down Expand Up @@ -692,7 +720,7 @@ task refine_assembly_with_aligned_reads {
Int min_coverage = 3
Int machine_mem_gb = 15
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.4"
String docker = "quay.io/broadinstitute/viral-assemble:2.3.2.0"
}
Int disk_size = 375
Expand Down Expand Up @@ -817,7 +845,7 @@ task refine_2x_and_plot {
String? plot_coverage_novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100 -k"
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.4"
String docker = "quay.io/broadinstitute/viral-assemble:2.3.2.0"
# do this in two steps in case the input doesn't actually have "cleaned" in the name
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".cleaned")
Expand Down Expand Up @@ -953,7 +981,7 @@ task run_discordance {
String out_basename = "run"
Int min_coverage = 4
String docker = "quay.io/broadinstitute/viral-core:2.3.1"
String docker = "quay.io/broadinstitute/viral-core:2.3.2"
}
parameter_meta {
reads_aligned_bam: {
Expand Down
4 changes: 2 additions & 2 deletions pipes/WDL/tasks/tasks_demux.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ task merge_tarballs {
String out_filename

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-core:2.3.1"
String docker = "quay.io/broadinstitute/viral-core:2.3.2"
}

Int disk_size = 2625
Expand Down Expand Up @@ -163,7 +163,7 @@ task illumina_demux {
Int? machine_mem_gb
Int disk_size = 2625
String docker = "quay.io/broadinstitute/viral-core:2.3.1"
String docker = "quay.io/broadinstitute/viral-core:2.3.2"
}
parameter_meta {
Expand Down
2 changes: 1 addition & 1 deletion pipes/WDL/tasks/tasks_interhost.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ task index_ref {
File? novocraft_license

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-core:2.3.1"
String docker = "quay.io/broadinstitute/viral-core:2.3.2"
}

Int disk_size = 100
Expand Down
2 changes: 1 addition & 1 deletion pipes/WDL/tasks/tasks_megablast.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ task trim_rmdup_subsamp {
Int machine_mem_gb = 128
Int cpu = 16
Int disk_size_gb = 100
String docker ="quay.io/broadinstitute/viral-assemble:2.3.1.4"
String docker ="quay.io/broadinstitute/viral-assemble:2.3.2.0"
}
parameter_meta {
inBam: {
Expand Down
71 changes: 68 additions & 3 deletions pipes/WDL/tasks/tasks_ncbi.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ task structured_comments {

File? filter_to_ids

String docker = "quay.io/broadinstitute/viral-core:2.3.1"
String docker = "quay.io/broadinstitute/viral-core:2.3.2"
}
String out_base = basename(assembly_stats_tsv, '.txt')
command <<<
Expand Down Expand Up @@ -272,7 +272,7 @@ task rename_fasta_header {
String out_basename = basename(genome_fasta, ".fasta")
String docker = "quay.io/broadinstitute/viral-core:2.3.1"
String docker = "quay.io/broadinstitute/viral-core:2.3.2"
}
command {
set -e
Expand Down Expand Up @@ -437,7 +437,7 @@ task sra_meta_prep {
Boolean paired
String out_name = "sra_metadata.tsv"
String docker="quay.io/broadinstitute/viral-core:2.3.1"
String docker="quay.io/broadinstitute/viral-core:2.3.2"
}
Int disk_size = 100
parameter_meta {
Expand Down Expand Up @@ -1010,3 +1010,68 @@ task vadr {
}
}
task sequence_rename_by_species {
  meta {
    description: "Rename sequences based on species-specific naming conventions for many viral taxa."
  }
  input {
    String sample_id
    String organism_name
    File   biosample_attributes
    String taxid
    File   taxdump_tgz

    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.2"
  }
  # NOTE(review): organism_name and biosample_attributes are accepted but not yet
  # consumed by the command; presumably they will feed the per-taxon naming
  # rules once those are implemented -- confirm with the task author.
  command <<<
    set -e

    # unpack the taxonomy dump into ./taxdump so it can be loaded below
    mkdir -p taxdump
    read_utils.py extract_tarball "~{taxdump_tgz}" taxdump

    python3 << CODE
    import metagenomics
    taxdb = metagenomics.TaxonomyDb(tax_dir='taxdump', load_nodes=True, load_gis=False)
    taxid = int('~{taxid}')

    # the query taxid followed by every ancestor reported by the taxonomy db;
    # built once and reused by every branch below
    lineage = [taxid] + taxdb.get_ordered_ancestors(taxid)

    # Placeholder name: none of the species-specific branches below builds a
    # name yet, so every sample currently falls through with its sample_id.
    # The output section reads the file "assembly_name_genbank", so it must
    # always be written or the task fails at output evaluation.
    # TODO: replace with the real convention inside each branch.
    assembly_name_genbank = '~{sample_id}'

    if 3052310 in lineage:
        # LASV
        pass
    elif 186538 in lineage:
        # ZEBOV
        pass
    elif 11250 in lineage:
        # RSV -- no real convention! Some coalescence around this:
        # <type>/<host lowercase>/Country/ST-Institution-LabID/Year
        # e.g. RSV-A/human/USA/MA-Broad-1234/2020
        pass
    elif 2697049 in lineage:
        # SARS-CoV-2
        # SARS-CoV-2/<host lowercase>/Country/ST-Institution-LabID/Year
        # e.g. SARS-CoV-2/human/USA/MA-Broad-1234/2020
        pass
    elif 11320 in lineage or 11520 in lineage:
        # Flu A or B
        # <type>/<hostname if not human>/<geoloc>/seqUID/year
        # e.g. A/Massachusetts/Broad_MGH-1234/2001 or A/chicken/Hokkaido/TU25-3/2022 or B/Rhode Island/RISHL-1234/2024
        pass
    elif 12059 in lineage:
        # Enterovirus (including rhinos)
        pass
    else:
        # everything else
        pass

    # emit the file consumed by read_string() in the output section
    with open('assembly_name_genbank', 'wt') as outf:
        print(assembly_name_genbank, file=outf)
    CODE
  >>>
  output {
    String assembly_name_genbank = read_string("assembly_name_genbank")
  }
  runtime {
    docker: docker
    memory: "1 GB"
    cpu: 1
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
}
4 changes: 2 additions & 2 deletions pipes/WDL/tasks/tasks_nextstrain.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ task derived_cols {
String? lab_highlight_loc
Array[File] table_map = []
String docker = "quay.io/broadinstitute/viral-core:2.3.1"
String docker = "quay.io/broadinstitute/viral-core:2.3.2"
Int disk_size = 50
}
parameter_meta {
Expand Down Expand Up @@ -891,7 +891,7 @@ task filter_sequences_to_list {
String out_fname = sub(sub(basename(sequences, ".zst"), ".vcf", ".filtered.vcf"), ".fasta$", ".filtered.fasta")
# Prior docker image: "nextstrain/base:build-20240318T173028Z"
String docker = "quay.io/broadinstitute/viral-core:2.3.1"
String docker = "quay.io/broadinstitute/viral-core:2.3.2"
Int disk_size = 750
}
parameter_meta {
Expand Down
14 changes: 7 additions & 7 deletions pipes/WDL/tasks/tasks_read_utils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ task group_bams_by_sample {
task get_bam_samplename {
input {
File bam
String docker = "quay.io/broadinstitute/viral-core:2.3.1"
String docker = "quay.io/broadinstitute/viral-core:2.3.2"
}
Int disk_size = round(size(bam, "GB")) + 50
command <<<
Expand All @@ -111,7 +111,7 @@ task get_sample_meta {
input {
Array[File] samplesheets_extended
String docker = "quay.io/broadinstitute/viral-core:2.3.1"
String docker = "quay.io/broadinstitute/viral-core:2.3.2"
}
Int disk_size = 50
command <<<
Expand Down Expand Up @@ -172,7 +172,7 @@ task merge_and_reheader_bams {
File? reheader_table
String out_basename = basename(in_bams[0], ".bam")
String docker = "quay.io/broadinstitute/viral-core:2.3.1"
String docker = "quay.io/broadinstitute/viral-core:2.3.2"
Int disk_size = 750
Int machine_mem_gb = 4
}
Expand Down Expand Up @@ -244,7 +244,7 @@ task rmdup_ubam {
String method = "mvicuna"
Int machine_mem_gb = 7
String docker = "quay.io/broadinstitute/viral-core:2.3.1"
String docker = "quay.io/broadinstitute/viral-core:2.3.2"
}
Int disk_size = 375
Expand Down Expand Up @@ -303,7 +303,7 @@ task downsample_bams {
Boolean deduplicateAfter = false
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-core:2.3.1"
String docker = "quay.io/broadinstitute/viral-core:2.3.2"
}
Int disk_size = 750
Expand Down Expand Up @@ -367,7 +367,7 @@ task FastqToUBAM {
String? sequencing_center
String? additional_picard_options
String docker = "quay.io/broadinstitute/viral-core:2.3.1"
String docker = "quay.io/broadinstitute/viral-core:2.3.2"
}
Int disk_size = 375
parameter_meta {
Expand Down Expand Up @@ -418,7 +418,7 @@ task read_depths {
File aligned_bam
String out_basename = basename(aligned_bam, '.bam')
String docker = "quay.io/broadinstitute/viral-core:2.3.1"
String docker = "quay.io/broadinstitute/viral-core:2.3.2"
}
Int disk_size = 200
command <<<
Expand Down
Loading

0 comments on commit e28bec4

Please sign in to comment.