Skip to content

Commit

Permalink
Merge pull request #529 from broadinstitute/dp-scaffold
Browse files Browse the repository at this point in the history
WiP refsel improvements
  • Loading branch information
dpark01 authored Apr 4, 2024
2 parents 6a8f9c1 + 0ffc09b commit 967ccb9
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 17 deletions.
5 changes: 5 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,11 @@ workflows:
primaryDescriptorPath: /pipes/WDL/workflows/subsample_by_metadata_with_focal.wdl
testParameterFiles:
- /empty.json
- name: taxid_to_nextclade
subclass: WDL
primaryDescriptorPath: /pipes/WDL/workflows/taxid_to_nextclade.wdl
testParameterFiles:
- /empty.json
- name: terra_table_to_tsv
subclass: WDL
primaryDescriptorPath: /pipes/WDL/workflows/terra_table_to_tsv.wdl
Expand Down
24 changes: 19 additions & 5 deletions pipes/WDL/tasks/tasks_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ task assemble {
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt")

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.3"
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.4"
}
parameter_meta{
reads_unmapped_bam: {
Expand Down Expand Up @@ -111,7 +111,11 @@ task select_references {
Array[File] reference_genomes_fastas
File contigs_fasta

String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.3"
Int? skani_m
Int? skani_s
Int? skani_c

String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.4"
Int machine_mem_gb = 4
Int cpu = 2
Int disk_size = 100
Expand All @@ -128,6 +132,9 @@ task select_references {
"~{contigs_basename}.refs_skani_dist.full.tsv" \
"~{contigs_basename}.refs_skani_dist.top.tsv" \
"~{contigs_basename}.ref_clusters.tsv" \
~{'-m ' + skani_m} \
~{'-s ' + skani_s} \
~{'-c ' + skani_c} \
--loglevel=DEBUG

# create basename-only version of ref_clusters output file
Expand Down Expand Up @@ -188,14 +195,18 @@ task scaffold {
Int replace_length=55
Boolean allow_incomplete_output = false
Int? skani_m
Int? skani_s
Int? skani_c
Int? nucmer_max_gap
Int? nucmer_min_match
Int? nucmer_min_cluster
Int? scaffold_min_contig_len
Float? scaffold_min_pct_contig_aligned
Int? machine_mem_gb
String docker="quay.io/broadinstitute/viral-assemble:2.3.1.3"
String docker="quay.io/broadinstitute/viral-assemble:2.3.1.4"
# do this in multiple steps in case the input doesn't actually have "assembly1-x" in the name
String sample_name = basename(basename(contigs_fasta, ".fasta"), ".assembly1-spades")
Expand Down Expand Up @@ -283,6 +294,9 @@ task scaffold {
"~{sample_name}.refs_skani_dist.full.tsv" \
"~{sample_name}.refs_skani_dist.top.tsv" \
"~{sample_name}.ref_clusters.tsv" \
~{'-m ' + skani_m} \
~{'-s ' + skani_s} \
~{'-c ' + skani_c} \
--loglevel=DEBUG
CHOSEN_REF_FASTA=$(cut -f 1 "~{sample_name}.refs_skani_dist.full.tsv" | tail +2 | head -1)
cut -f 3 "~{sample_name}.refs_skani_dist.full.tsv" | tail +2 | head -1 > SKANI_ANI
Expand Down Expand Up @@ -677,7 +691,7 @@ task refine_assembly_with_aligned_reads {
Int min_coverage = 3
Int machine_mem_gb = 15
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.3"
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.4"
}
Int disk_size = 375
Expand Down Expand Up @@ -802,7 +816,7 @@ task refine_2x_and_plot {
String? plot_coverage_novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100 -k"
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.3"
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.4"
# do this in two steps in case the input doesn't actually have "cleaned" in the name
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".cleaned")
Expand Down
63 changes: 53 additions & 10 deletions pipes/WDL/tasks/tasks_nextstrain.wdl
Original file line number Diff line number Diff line change
@@ -1,5 +1,40 @@
version 1.0

task taxid_to_nextclade_dataset_name {
  meta {
    description: "Look up a Nextclade dataset name for a numeric taxonomy ID using a small built-in table. Emits an empty string when the taxid is not in the table."
  }
  input {
    String taxid

    # Exposed as an input (with the previous hard-coded value as the default) so the
    # image can be overridden or pinned by callers, like the other tasks in this file.
    String docker = "python:slim"
  }
  parameter_meta {
    taxid: {
      description: "Taxonomy ID to look up. Passed as a string but must parse as an integer; a non-integer value makes the embedded Python script raise and the task fail."
    }
    docker: {
      description: "Container image used to run the lookup script. It only needs to provide python3."
    }
  }
  command <<<
    python3 <<CODE
    # int() raises ValueError on a non-integer taxid, which fails the task
    taxid = int("~{taxid}")
    taxid_to_dataset_map = {
        2697049 : 'sars-cov-2',
        641809 : 'flu_h1n1pdm_ha',
        335341 : 'flu_h3n2_ha',
        518987 : 'flu_vic_ha',
        208893 : 'rsv_a',
        208895 : 'rsv_b',
        10244 : 'MPXV',
        619591 : 'hMPXV'
    }
    # taxids missing from the table produce an empty line, not an error
    with open('DATASET_NAME', 'wt') as outf:
        outf.write(taxid_to_dataset_map.get(taxid, '') + '\n')
    CODE
  >>>
  runtime {
    docker: docker
    memory: "1 GB"
    cpu: 1
    disks: "local-disk 50 HDD"
    disk: "50 GB" # TES
    dx_instance_type: "mem1_ssd1_v2_x2"
    maxRetries: 2
  }
  output {
    # Contents of DATASET_NAME as written by the script above (empty string if unmapped)
    String nextclade_dataset_name = read_string("DATASET_NAME")
  }
}
task nextclade_one_sample {
meta {
description: "Nextclade classification of one sample. Leaving optional inputs unspecified will use SARS-CoV-2 defaults."
Expand All @@ -17,7 +52,7 @@ task nextclade_one_sample {
String docker = "nextstrain/nextclade:2.14.0"
}
String basename = basename(genome_fasta, ".fasta")
command {
command <<<
set -e
apt-get update
apt-get -y install python3
Expand Down Expand Up @@ -54,17 +89,23 @@ task nextclade_one_sample {
--output-tree "~{basename}".nextclade.auspice.json \
"~{genome_fasta}"
python3 <<CODE
# transpose table
import codecs
import codecs, csv
cols = [('clade', 'NEXTCLADE_CLADE'),
('short-clade', 'NEXTCLADE_SHORTCLADE'),
('subclade', 'NEXTCLADE_SUBCLADE'),
('aaSubstitutions', 'NEXTCLADE_AASUBS'),
('aaDeletions', 'NEXTCLADE_AADELS')]
out = {}
with codecs.open('~{basename}.nextclade.tsv', 'r', encoding='utf-8') as inf:
with codecs.open('transposed.tsv', 'w', encoding='utf-8') as outf:
for c in zip(*(l.rstrip().split('\t') for l in inf)):
outf.write('\t'.join(c)+'\n')
for line in csv.DictReader(inf, delimiter='\t'):
for k,fname in cols:
if line.get(k):
out[k] = line[k]
for k, fname in cols:
with codecs.open(fname, 'w', encoding='utf-8') as outf:
outf.write(out.get(k, '')+'\n')
CODE
grep ^clade\\W transposed.tsv | cut -f 2 | grep -v clade > NEXTCLADE_CLADE
grep ^aaSubstitutions\\W transposed.tsv | cut -f 2 | grep -v aaSubstitutions > NEXTCLADE_AASUBS
grep ^aaDeletions\\W transposed.tsv | cut -f 2 | grep -v aaDeletions > NEXTCLADE_AADELS
}
>>>
runtime {
docker: docker
memory: "3 GB"
Expand All @@ -80,6 +121,8 @@ task nextclade_one_sample {
File auspice_json = "~{basename}.nextclade.auspice.json"
File nextclade_tsv = "~{basename}.nextclade.tsv"
String nextclade_clade = read_string("NEXTCLADE_CLADE")
String nextclade_shortclade = read_string("NEXTCLADE_SHORTCLADE")
String nextclade_subclade = read_string("NEXTCLADE_SUBCLADE")
String aa_subs_csv = read_string("NEXTCLADE_AASUBS")
String aa_dels_csv = read_string("NEXTCLADE_AADELS")
}
Expand Down
2 changes: 1 addition & 1 deletion pipes/WDL/tasks/tasks_reports.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,7 @@ task compare_two_genomes {
File genome_two
String out_basename
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.3"
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.4"
}
Int disk_size = 50
Expand Down
2 changes: 2 additions & 0 deletions pipes/WDL/workflows/nextclade_single.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ workflow nextclade_single {
File nextclade_json = nextclade_one_sample.nextclade_json
String nextclade_aa_subs = nextclade_one_sample.aa_subs_csv
String nextclade_aa_dels = nextclade_one_sample.aa_dels_csv
String nextclade_shortclade = nextclade_one_sample.nextclade_shortclade
String nextclade_subclade = nextclade_one_sample.nextclade_subclade
String nextclade_version = nextclade_one_sample.nextclade_version
}
}
15 changes: 15 additions & 0 deletions pipes/WDL/workflows/taxid_to_nextclade.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
version 1.0

import "../tasks/tasks_nextstrain.wdl" as nextstrain

# Thin workflow wrapper that runs the taxid_to_nextclade_dataset_name task
# (from the tasks file imported under the "nextstrain" namespace) on its own.
workflow taxid_to_nextclade {
meta {
description: "Convert taxids to a nextclade dataset name"
}

# The call binds no inputs here, so the task's inputs are left for whoever
# launches this workflow to supply.
call nextstrain.taxid_to_nextclade_dataset_name

output {
# Pass-through of the task's nextclade_dataset_name output under a shorter name.
String nextclade_dataset = taxid_to_nextclade_dataset_name.nextclade_dataset_name
}
}
2 changes: 1 addition & 1 deletion requirements-modules.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
broadinstitute/viral-core=2.3.1
broadinstitute/viral-assemble=2.3.1.3
broadinstitute/viral-assemble=2.3.1.4
broadinstitute/viral-classify=2.2.4.0
broadinstitute/viral-phylo=2.1.20.2
broadinstitute/py3-bio=0.1.2
Expand Down

0 comments on commit 967ccb9

Please sign in to comment.