Skip to content

Commit

Permalink
Merge pull request #528 from broadinstitute/dp-scaffold
Browse files Browse the repository at this point in the history
scaffolding and reference selection based on ANI
  • Loading branch information
dpark01 authored Mar 25, 2024
2 parents 9b2059a + 971fe78 commit 6a8f9c1
Show file tree
Hide file tree
Showing 6 changed files with 235 additions and 101 deletions.
141 changes: 116 additions & 25 deletions pipes/WDL/tasks/tasks_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ task assemble {
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt")

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-assemble:2.3.0.0"
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.3"
}
parameter_meta{
reads_unmapped_bam: {
Expand Down Expand Up @@ -103,6 +103,79 @@ task assemble {

}

task select_references {
meta {
description: "Evaluate reference genomes based on ANI similarity to provided contigs. This will emit an ANI-rank-ordered table of references and will also cluster reference genomes based on ANI similarity to each other, picking only the top hit (based on ANI similairty to contigs) of each cluster. Default parameters for skani are tuned for viral genomes (-m 50 -s 85 --no-learned-ani --slow --robust --no-marker-index). This tool can tolerate a large number of reference genomes as input."
}
input {
Array[File] reference_genomes_fastas
File contigs_fasta

String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.3"
Int machine_mem_gb = 4
Int cpu = 2
Int disk_size = 100
}
String contigs_basename = basename(basename(contigs_fasta, '.fasta'), '.assembly1-spades')

command <<<
set -e

# run skani, find top hits, cluster references
assembly.py skani_contigs_to_refs \
"~{contigs_fasta}" \
"~{sep='" "' reference_genomes_fastas}" \
"~{contigs_basename}.refs_skani_dist.full.tsv" \
"~{contigs_basename}.refs_skani_dist.top.tsv" \
"~{contigs_basename}.ref_clusters.tsv" \
--loglevel=DEBUG

# create basename-only version of ref_clusters output file
# create tar-bundles of ref_clusters fastas, since Cromwell doesn't delocalize files in a Array[Array[File]] = read_tsv
python3 <<CODE
import os, os.path, shutil, tarfile
os.mkdir("clusters")
with open("~{contigs_basename}.ref_clusters.tsv", 'r') as inf:
with open("~{contigs_basename}.ref_clusters.basenames.tsv", 'w') as outf:
for line in inf:
fnames = line.strip().split('\t')
assert fnames
basefnames = list([f[:-6] if f.endswith('.fasta') else f for f in map(os.path.basename, fnames)])
outf.write('\t'.join(basefnames) + '\n')
with tarfile.open(os.path.join("clusters", basefnames[0] + "-" + str(len(basefnames)) + ".tar.gz"), "w:gz") as tarball:
for f in fnames:
shutil.copy(f, ".")
tarball.add(os.path.basename(f))
os.unlink(os.path.basename(f))
CODE
# create top-hits output files
cut -f 1 "~{contigs_basename}.refs_skani_dist.top.tsv" | tail +2 > TOP_FASTAS
for f in $(cat TOP_FASTAS); do basename "$f" .fasta; done > TOP_FASTAS_BASENAMES
>>>
output {
Array[File] matched_reference_clusters_fastas_tars = glob("clusters/*.tar.gz")
Array[Array[String]] matched_reference_clusters_basenames = read_tsv("~{contigs_basename}.ref_clusters.basenames.tsv")
Array[String] top_matches_per_cluster_basenames = read_lines("TOP_FASTAS_BASENAMES")
Array[File] top_matches_per_cluster_fastas = read_lines("TOP_FASTAS")
File skani_dist_full_tsv = "~{contigs_basename}.refs_skani_dist.full.tsv"
File skani_dist_top_tsv = "~{contigs_basename}.refs_skani_dist.top.tsv"
}
runtime {
docker: docker
memory: machine_mem_gb + " GB"
cpu: cpu
disks: "local-disk " + disk_size + " LOCAL"
disk: disk_size + " GB" # TESs
dx_instance_type: "mem1_ssd1_v2_x2"
preemptible: 2
maxRetries: 2
}
}
task scaffold {
input {
File contigs_fasta
Expand All @@ -122,7 +195,7 @@ task scaffold {
Float? scaffold_min_pct_contig_aligned
Int? machine_mem_gb
String docker="quay.io/broadinstitute/viral-assemble:2.3.0.0"
String docker="quay.io/broadinstitute/viral-assemble:2.3.1.3"
# do this in multiple steps in case the input doesn't actually have "assembly1-x" in the name
String sample_name = basename(basename(contigs_fasta, ".fasta"), ".assembly1-spades")
Expand Down Expand Up @@ -203,53 +276,67 @@ task scaffold {
assembly.py --version | tee VERSION
# use skani to choose top hit
assembly.py skani_contigs_to_refs \
"~{contigs_fasta}" \
"~{sep='" "' reference_genome_fasta}" \
"~{sample_name}.refs_skani_dist.full.tsv" \
"~{sample_name}.refs_skani_dist.top.tsv" \
"~{sample_name}.ref_clusters.tsv" \
--loglevel=DEBUG
CHOSEN_REF_FASTA=$(cut -f 1 "~{sample_name}.refs_skani_dist.full.tsv" | tail +2 | head -1)
cut -f 3 "~{sample_name}.refs_skani_dist.full.tsv" | tail +2 | head -1 > SKANI_ANI
cut -f 4 "~{sample_name}.refs_skani_dist.full.tsv" | tail +2 | head -1 > SKANI_REF_AF
cut -f 5 "~{sample_name}.refs_skani_dist.full.tsv" | tail +2 | head -1 > SKANI_CONTIGS_AF
basename "$CHOSEN_REF_FASTA" .fasta > CHOSEN_REF_BASENAME
assembly.py order_and_orient \
~{contigs_fasta} \
~{sep=' ' reference_genome_fasta} \
~{sample_name}.intermediate_scaffold.fasta \
"~{contigs_fasta}" \
"$CHOSEN_REF_FASTA" \
"~{sample_name}".intermediate_scaffold.fasta \
~{'--min_contig_len=' + scaffold_min_contig_len} \
~{'--maxgap=' + nucmer_max_gap} \
~{'--minmatch=' + nucmer_min_match} \
~{'--mincluster=' + nucmer_min_cluster} \
~{'--min_pct_contig_aligned=' + scaffold_min_pct_contig_aligned} \
--outReference ~{sample_name}.scaffolding_chosen_ref.fasta \
--outStats ~{sample_name}.scaffolding_stats.txt \
--outReference "~{sample_name}".scaffolding_chosen_ref.fasta \
--outStats "~{sample_name}".scaffolding_stats.txt \
--outAlternateContigs ~{sample_name}.scaffolding_alt_contigs.fasta \
~{true='--allow_incomplete_output' false="" allow_incomplete_output} \
--loglevel=DEBUG
grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | cut -c 2- | cut -f 1 -d ' ' > ~{sample_name}.scaffolding_chosen_refs.txt
grep '^>' "~{sample_name}".scaffolding_chosen_ref.fasta | cut -c 2- | cut -f 1 -d ' ' > "~{sample_name}".scaffolding_chosen_refs.txt
assembly.py gapfill_gap2seq \
~{sample_name}.intermediate_scaffold.fasta \
~{reads_bam} \
~{sample_name}.intermediate_gapfill.fasta \
"~{sample_name}".intermediate_scaffold.fasta \
"~{reads_bam}" \
"~{sample_name}".intermediate_gapfill.fasta \
--memLimitGb $mem_in_gb \
--maskErrors \
--loglevel=DEBUG
set +e +o pipefail
grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\n' | wc -c | tee assembly_preimpute_length
grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\nNn' | wc -c | tee assembly_preimpute_length_unambiguous
grep '^>' ~{sample_name}.intermediate_gapfill.fasta | wc -l | tee assembly_num_segments_recovered
grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | wc -l | tee reference_num_segments_required
grep -v '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | tr -d '\n' | wc -c | tee reference_length
grep -v '^>' "~{sample_name}".intermediate_gapfill.fasta | tr -d '\n' | wc -c | tee assembly_preimpute_length
grep -v '^>' "~{sample_name}".intermediate_gapfill.fasta | tr -d '\nNn' | wc -c | tee assembly_preimpute_length_unambiguous
grep '^>' "~{sample_name}".intermediate_gapfill.fasta | wc -l | tee assembly_num_segments_recovered
grep '^>' "~{sample_name}".scaffolding_chosen_ref.fasta | wc -l | tee reference_num_segments_required
grep -v '^>' "~{sample_name}".scaffolding_chosen_ref.fasta | tr -d '\n' | wc -c | tee reference_length
set -e -o pipefail
if ~{true='true' false='false' allow_incomplete_output} && ! cmp -s assembly_num_segments_recovered reference_num_segments_required
then
# draft assembly does not have enough segments--and that's okay (allow_incomplete_output=true)
file_utils.py rename_fasta_sequences \
~{sample_name}.intermediate_gapfill.fasta \
~{sample_name}.scaffolded_imputed.fasta \
"~{sample_name}".intermediate_gapfill.fasta \
"~{sample_name}".scaffolded_imputed.fasta \
"~{sample_name}" --suffix_always --loglevel=DEBUG
else
# draft assembly must have the right number of segments (fail if not)
assembly.py impute_from_reference \
~{sample_name}.intermediate_gapfill.fasta \
~{sample_name}.scaffolding_chosen_ref.fasta \
~{sample_name}.scaffolded_imputed.fasta \
--newName ~{sample_name} \
"~{sample_name}".intermediate_gapfill.fasta \
"~{sample_name}".scaffolding_chosen_ref.fasta \
"~{sample_name}".scaffolded_imputed.fasta \
--newName "~{sample_name}" \
~{'--replaceLength=' + replace_length} \
~{'--minLengthFraction=' + min_length_fraction} \
~{'--minUnambig=' + min_unambig} \
Expand All @@ -268,9 +355,13 @@ task scaffold {
Int reference_num_segments_required = read_int("reference_num_segments_required")
Int reference_length = read_int("reference_length")
Array[String] scaffolding_chosen_ref_names = read_lines("~{sample_name}.scaffolding_chosen_refs.txt")
String scaffolding_chosen_ref_basename = read_string("CHOSEN_REF_BASENAME")
File scaffolding_chosen_ref = "~{sample_name}.scaffolding_chosen_ref.fasta"
File scaffolding_stats = "~{sample_name}.scaffolding_stats.txt"
File scaffolding_stats = "~{sample_name}.refs_skani_dist.full.tsv"
File scaffolding_alt_contigs = "~{sample_name}.scaffolding_alt_contigs.fasta"
Float skani_ani = read_float("SKANI_ANI")
Float skani_ref_aligned_frac = read_float("SKANI_REF_AF")
Float skani_contigs_aligned_frac = read_float("SKANI_CONTIGS_AF")
String viralngs_version = read_string("VERSION")
}
Expand Down Expand Up @@ -586,7 +677,7 @@ task refine_assembly_with_aligned_reads {
Int min_coverage = 3
Int machine_mem_gb = 15
String docker = "quay.io/broadinstitute/viral-assemble:2.3.0.0"
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.3"
}
Int disk_size = 375
Expand Down Expand Up @@ -711,7 +802,7 @@ task refine_2x_and_plot {
String? plot_coverage_novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100 -k"
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-assemble:2.3.0.0"
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.3"
# do this in two steps in case the input doesn't actually have "cleaned" in the name
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".cleaned")
Expand Down
2 changes: 1 addition & 1 deletion pipes/WDL/tasks/tasks_reports.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,7 @@ task compare_two_genomes {
File genome_two
String out_basename
String docker = "quay.io/broadinstitute/viral-assemble:2.3.0.0"
String docker = "quay.io/broadinstitute/viral-assemble:2.3.1.3"
}
Int disk_size = 50
Expand Down
35 changes: 34 additions & 1 deletion pipes/WDL/tasks/tasks_utils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,35 @@ task sed {
}
}
task tar_extract {
meta {
description: "Extract a tar file"
}
input {
File tar_file
Int disk_size = 375
String tar_opts = "-z"
}
command <<<
mkdir -p unpack
cd unpack
tar -xv ~{tar_opts} -f "~{tar_file}"
>>>
runtime {
docker: "quay.io/broadinstitute/viral-baseimage:0.2.0"
memory: "2 GB"
cpu: 1
disks: "local-disk " + disk_size + " LOCAL"
disk: disk_size + " GB" # TES
dx_instance_type: "mem1_ssd1_v2_x2"
maxRetries: 2
preemptible: 1
}
output {
Array[File] files = glob("unpack/*")
}
}
task fasta_to_ids {
meta {
description: "Return the headers only from a fasta file"
Expand Down Expand Up @@ -202,15 +231,19 @@ task fetch_row_from_tsv {
String idx_col
String idx_val
Array[String] set_default_keys = []
Array[String] add_header = []
}
Int disk_size = 50
command <<<
python3 << CODE
import csv, gzip, json
open_or_gzopen = lambda *args, **kwargs: gzip.open(*args, **kwargs) if args[0].endswith('.gz') else open(*args, **kwargs)
out_dict = {}
fieldnames = "~{sep='*' add_header}".split("*")
if not fieldnames:
fieldnames = None
with open_or_gzopen('~{tsv}', 'rt') as inf:
for row in csv.DictReader(inf, delimiter='\t'):
for row in csv.DictReader(inf, delimiter='\t', fieldnames=fieldnames):
if row.get('~{idx_col}') == '~{idx_val}':
out_dict = row
break
Expand Down
Loading

0 comments on commit 6a8f9c1

Please sign in to comment.