Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reference selection improvements and nextclade generalization #529

Merged
merged 7 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,11 @@ workflows:
primaryDescriptorPath: /pipes/WDL/workflows/subsample_by_metadata_with_focal.wdl
testParameterFiles:
- /empty.json
- name: taxid_to_nextclade
subclass: WDL
primaryDescriptorPath: /pipes/WDL/workflows/taxid_to_nextclade.wdl
testParameterFiles:
- /empty.json
- name: terra_table_to_tsv
subclass: WDL
primaryDescriptorPath: /pipes/WDL/workflows/terra_table_to_tsv.wdl
Expand Down
24 changes: 19 additions & 5 deletions pipes/WDL/tasks/tasks_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ task assemble {
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt")

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.3"
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.4"
}
parameter_meta{
reads_unmapped_bam: {
Expand Down Expand Up @@ -111,7 +111,11 @@ task select_references {
Array[File] reference_genomes_fastas
File contigs_fasta

String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.3"
Int? skani_m
Int? skani_s
Int? skani_c

String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.4"
Int machine_mem_gb = 4
Int cpu = 2
Int disk_size = 100
Expand All @@ -128,6 +132,9 @@ task select_references {
"~{contigs_basename}.refs_skani_dist.full.tsv" \
"~{contigs_basename}.refs_skani_dist.top.tsv" \
"~{contigs_basename}.ref_clusters.tsv" \
~{'-m ' + skani_m} \
~{'-s ' + skani_s} \
~{'-c ' + skani_c} \
--loglevel=DEBUG

# create basename-only version of ref_clusters output file
Expand Down Expand Up @@ -188,14 +195,18 @@ task scaffold {
Int replace_length=55
Boolean allow_incomplete_output = false

Int? skani_m
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In a future PR we should say more about these single-character params, even if it's only to have entries in the parameter_meta section point to the skani docs.

Int? skani_s
Int? skani_c

Int? nucmer_max_gap
Int? nucmer_min_match
Int? nucmer_min_cluster
Int? scaffold_min_contig_len
Float? scaffold_min_pct_contig_aligned

Int? machine_mem_gb
String docker="quay.io/broadinstitute/viral-assemble:2.3.1.3"
String docker="quay.io/broadinstitute/viral-assemble:2.3.1.4"

# do this in multiple steps in case the input doesn't actually have "assembly1-x" in the name
String sample_name = basename(basename(contigs_fasta, ".fasta"), ".assembly1-spades")
Expand Down Expand Up @@ -283,6 +294,9 @@ task scaffold {
"~{sample_name}.refs_skani_dist.full.tsv" \
"~{sample_name}.refs_skani_dist.top.tsv" \
"~{sample_name}.ref_clusters.tsv" \
~{'-m ' + skani_m} \
~{'-s ' + skani_s} \
~{'-c ' + skani_c} \
--loglevel=DEBUG
CHOSEN_REF_FASTA=$(cut -f 1 "~{sample_name}.refs_skani_dist.full.tsv" | tail +2 | head -1)
cut -f 3 "~{sample_name}.refs_skani_dist.full.tsv" | tail +2 | head -1 > SKANI_ANI
Expand Down Expand Up @@ -677,7 +691,7 @@ task refine_assembly_with_aligned_reads {
Int min_coverage = 3

Int machine_mem_gb = 15
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.3"
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.4"
}

Int disk_size = 375
Expand Down Expand Up @@ -802,7 +816,7 @@ task refine_2x_and_plot {
String? plot_coverage_novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100 -k"

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.3"
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.4"

# do this in two steps in case the input doesn't actually have "cleaned" in the name
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".cleaned")
Expand Down
63 changes: 53 additions & 10 deletions pipes/WDL/tasks/tasks_nextstrain.wdl
Original file line number Diff line number Diff line change
@@ -1,5 +1,40 @@
version 1.0

task taxid_to_nextclade_dataset_name {
    meta {
        description: "Look up the Nextclade dataset name that corresponds to a taxonomy ID. Emits an empty string when the taxid has no entry in the built-in lookup table."
    }
    input {
        String taxid

        # Exposed as an input (same default as before) so the image can be
        # overridden or pinned per run, like the other tasks in this repository.
        String docker = "python:slim"
    }
    parameter_meta {
        taxid: {
            description: "Taxonomy ID to look up. Must parse as an integer; a non-integer value makes the task fail."
        }
        docker: {
            description: "Container image used to run the lookup. Only a python3 interpreter is needed."
        }
    }
    command <<<
        python3 <<CODE
        # The taxid arrives as a string; the table below is keyed by int.
        # int() raises ValueError on non-numeric input, which fails the task.
        taxid = int("~{taxid}")
        taxid_to_dataset_map = {
            2697049 : 'sars-cov-2',
            641809  : 'flu_h1n1pdm_ha',
            335341  : 'flu_h3n2_ha',
            518987  : 'flu_vic_ha',
            208893  : 'rsv_a',
            208895  : 'rsv_b',
            10244   : 'MPXV',
            619591  : 'hMPXV'
        }
        # Always write the file, even for an unknown taxid, so that the
        # read_string() in the output section has something to read.
        with open('DATASET_NAME', 'wt') as outf:
            outf.write(taxid_to_dataset_map.get(taxid, '') + '\n')
        CODE
    >>>
    runtime {
        docker: docker
        memory: "1 GB"
        cpu: 1
        disks: "local-disk 50 HDD"
        disk: "50 GB" # TES
        dx_instance_type: "mem1_ssd1_v2_x2"
        maxRetries: 2
    }
    output {
        # Empty string when the taxid is not present in the lookup table.
        String nextclade_dataset_name = read_string("DATASET_NAME")
    }
}

task nextclade_one_sample {
meta {
description: "Nextclade classification of one sample. Leaving optional inputs unspecified will use SARS-CoV-2 defaults."
Expand All @@ -17,7 +52,7 @@ task nextclade_one_sample {
String docker = "nextstrain/nextclade:2.14.0"
}
String basename = basename(genome_fasta, ".fasta")
command {
command <<<
set -e
apt-get update
apt-get -y install python3
Expand Down Expand Up @@ -54,17 +89,23 @@ task nextclade_one_sample {
--output-tree "~{basename}".nextclade.auspice.json \
"~{genome_fasta}"
python3 <<CODE
# transpose table
import codecs
import codecs, csv
cols = [('clade', 'NEXTCLADE_CLADE'),
('short-clade', 'NEXTCLADE_SHORTCLADE'),
('subclade', 'NEXTCLADE_SUBCLADE'),
('aaSubstitutions', 'NEXTCLADE_AASUBS'),
('aaDeletions', 'NEXTCLADE_AADELS')]
out = {}
with codecs.open('~{basename}.nextclade.tsv', 'r', encoding='utf-8') as inf:
with codecs.open('transposed.tsv', 'w', encoding='utf-8') as outf:
for c in zip(*(l.rstrip().split('\t') for l in inf)):
outf.write('\t'.join(c)+'\n')
for line in csv.DictReader(inf, delimiter='\t'):
for k,fname in cols:
if line.get(k):
out[k] = line[k]
for k, fname in cols:
with codecs.open(fname, 'w', encoding='utf-8') as outf:
outf.write(out.get(k, '')+'\n')
CODE
grep ^clade\\W transposed.tsv | cut -f 2 | grep -v clade > NEXTCLADE_CLADE
grep ^aaSubstitutions\\W transposed.tsv | cut -f 2 | grep -v aaSubstitutions > NEXTCLADE_AASUBS
grep ^aaDeletions\\W transposed.tsv | cut -f 2 | grep -v aaDeletions > NEXTCLADE_AADELS
}
>>>
runtime {
docker: docker
memory: "3 GB"
Expand All @@ -80,6 +121,8 @@ task nextclade_one_sample {
File auspice_json = "~{basename}.nextclade.auspice.json"
File nextclade_tsv = "~{basename}.nextclade.tsv"
String nextclade_clade = read_string("NEXTCLADE_CLADE")
String nextclade_shortclade = read_string("NEXTCLADE_SHORTCLADE")
String nextclade_subclade = read_string("NEXTCLADE_SUBCLADE")
String aa_subs_csv = read_string("NEXTCLADE_AASUBS")
String aa_dels_csv = read_string("NEXTCLADE_AADELS")
}
Expand Down
2 changes: 1 addition & 1 deletion pipes/WDL/tasks/tasks_reports.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,7 @@ task compare_two_genomes {
File genome_two
String out_basename

String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.3"
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.4"
}

Int disk_size = 50
Expand Down
2 changes: 2 additions & 0 deletions pipes/WDL/workflows/nextclade_single.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ workflow nextclade_single {
File nextclade_json = nextclade_one_sample.nextclade_json
String nextclade_aa_subs = nextclade_one_sample.aa_subs_csv
String nextclade_aa_dels = nextclade_one_sample.aa_dels_csv
String nextclade_shortclade = nextclade_one_sample.nextclade_shortclade
String nextclade_subclade = nextclade_one_sample.nextclade_subclade
String nextclade_version = nextclade_one_sample.nextclade_version
}
}
15 changes: 15 additions & 0 deletions pipes/WDL/workflows/taxid_to_nextclade.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
version 1.0

import "../tasks/tasks_nextstrain.wdl" as nextstrain

workflow taxid_to_nextclade {
# Thin wrapper that runs the taxid_to_nextclade_dataset_name task from
# tasks_nextstrain.wdl on its own and re-exports its single output.
meta {
description: "Convert taxids to a nextclade dataset name"
}

# No input block is given on this call, so the task's required `taxid` input
# is not bound inside this workflow; presumably it has to be supplied by
# whoever launches the workflow -- confirm against the execution engine in use.
call nextstrain.taxid_to_nextclade_dataset_name

output {
# The dataset name string produced by the task call above.
String nextclade_dataset = taxid_to_nextclade_dataset_name.nextclade_dataset_name
}
}
2 changes: 1 addition & 1 deletion requirements-modules.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
broadinstitute/viral-core=2.3.1
broadinstitute/viral-assemble=2.3.1.3
broadinstitute/viral-assemble=2.3.1.4
broadinstitute/viral-classify=2.2.4.0
broadinstitute/viral-phylo=2.1.20.2
broadinstitute/py3-bio=0.1.2
Expand Down