Merge pull request #511 from broadinstitute/dp-scaffold
more scaffolding updates
dpark01 authored Feb 17, 2024
2 parents eae3562 + 88ca4d1 commit 847d661
Showing 9 changed files with 268 additions and 88 deletions.
4 changes: 2 additions & 2 deletions github_actions_ci/install-wdl.sh
@@ -12,8 +12,8 @@ fetch_jar_from_github () {
ln -s $_jar_fname $_tool_name.jar
}

fetch_jar_from_github broadinstitute cromwell womtool 61
fetch_jar_from_github broadinstitute cromwell cromwell 61
fetch_jar_from_github broadinstitute cromwell womtool 86
fetch_jar_from_github broadinstitute cromwell cromwell 86
fetch_jar_from_github dnanexus dxWDL dxWDL v1.50

TGZ=dx-toolkit-v0.311.0-ubuntu-20.04-amd64.tar.gz
37 changes: 26 additions & 11 deletions pipes/WDL/tasks/tasks_assembly.wdl
@@ -231,19 +231,31 @@ task scaffold {
set +e +o pipefail
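# record pre-impute stats (total base count, unambiguous base count, segment counts); tee writes each number to a file that the task outputs read back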
grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\n' | wc -c | tee assembly_preimpute_length
grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\nNn' | wc -c | tee assembly_preimpute_length_unambiguous
grep '^>' ~{sample_name}.intermediate_gapfill.fasta | wc -l | tee assembly_num_segments_recovered
grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | wc -l | tee reference_num_segments_required
grep -v '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | tr -d '\n' | wc -c | tee reference_length
set -e -o pipefail

# Input assembly/contigs (FASTA), already ordered, oriented, and merged with the reference genome (FASTA)
assembly.py impute_from_reference \
~{sample_name}.intermediate_gapfill.fasta \
~{sample_name}.scaffolding_chosen_ref.fasta \
~{sample_name}.scaffolded_imputed.fasta \
--newName ~{sample_name} \
~{'--replaceLength=' + replace_length} \
~{'--minLengthFraction=' + min_length_fraction} \
~{'--minUnambig=' + min_unambig} \
~{'--aligner=' + aligner} \
--loglevel=DEBUG
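# cmp -s silently compares the two tee'd count files; a nonzero exit means the number of recovered segments differs from the number required by the chosen reference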
if ~{true='true' false='false' allow_incomplete_output} && ! cmp -s assembly_num_segments_recovered reference_num_segments_required
then
# draft assembly does not have enough segments--and that's okay (allow_incomplete_output=true)
file_utils.py rename_fasta_sequences \
~{sample_name}.intermediate_gapfill.fasta \
~{sample_name}.scaffolded_imputed.fasta \
"~{sample_name}" --suffix_always --loglevel=DEBUG
else
# draft assembly must have the right number of segments (fail if not)
assembly.py impute_from_reference \
~{sample_name}.intermediate_gapfill.fasta \
~{sample_name}.scaffolding_chosen_ref.fasta \
~{sample_name}.scaffolded_imputed.fasta \
--newName ~{sample_name} \
~{'--replaceLength=' + replace_length} \
~{'--minLengthFraction=' + min_length_fraction} \
~{'--minUnambig=' + min_unambig} \
~{'--aligner=' + aligner} \
--loglevel=DEBUG
fi
}

output {
@@ -252,6 +264,9 @@ task scaffold {
File intermediate_gapfill_fasta = "~{sample_name}.intermediate_gapfill.fasta"
Int assembly_preimpute_length = read_int("assembly_preimpute_length")
Int assembly_preimpute_length_unambiguous = read_int("assembly_preimpute_length_unambiguous")
Int assembly_num_segments_recovered = read_int("assembly_num_segments_recovered")
Int reference_num_segments_required = read_int("reference_num_segments_required")
Int reference_length = read_int("reference_length")
Array[String] scaffolding_chosen_ref_names = read_lines("~{sample_name}.scaffolding_chosen_refs.txt")
File scaffolding_chosen_ref = "~{sample_name}.scaffolding_chosen_ref.fasta"
File scaffolding_stats = "~{sample_name}.scaffolding_stats.txt"
104 changes: 96 additions & 8 deletions pipes/WDL/tasks/tasks_metagenomics.wdl
@@ -11,7 +11,7 @@ task krakenuniq {
File krona_taxonomy_db_tgz # taxonomy.tab
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" #skip-global-version-pin
String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" #skip-global-version-pin
}

Int disk_size = 750
@@ -143,7 +143,7 @@ task build_krakenuniq_db {
Int? zstd_compression_level

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" #skip-global-version-pin
String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" #skip-global-version-pin
}

Int disk_size = 750
@@ -213,7 +213,7 @@ task kraken2 {
Int? min_base_qual

Int machine_mem_gb = 72
String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

parameter_meta {
@@ -326,6 +326,94 @@ task kraken2 {
}
}

task report_primary_kraken_taxa {
meta {
description: "Interprets a kraken (or kraken2 or krakenuniq) summary report file and emits the primary contributing taxa under a focal taxon of interest."
}
input {
File kraken_summary_report
String focal_taxon = "Viruses"

String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}
String out_basename = basename(kraken_summary_report, '.txt')
Int disk_size = 50
Int machine_mem_gb = 2

command <<<
set -e
metagenomics.py taxlevel_plurality "~{kraken_summary_report}" "~{focal_taxon}" "~{out_basename}.ranked_focal_report.tsv"
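# keep only the top-ranked data row: head -2 takes the header plus the first row, then tail +2 drops the header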
cat "~{out_basename}.ranked_focal_report.tsv" | head -2 | tail +2 > TOPROW
cut -f 2 TOPROW > NUM_FOCAL
cut -f 4 TOPROW > PCT_OF_FOCAL
cut -f 7 TOPROW > NUM_READS
cut -f 8 TOPROW > TAX_RANK
cut -f 9 TOPROW > TAX_ID
cut -f 10 TOPROW > TAX_NAME
>>>

output {
String focal_tax_name = focal_taxon
File ranked_focal_report = "~{out_basename}.ranked_focal_report.tsv"
Int total_focal_reads = read_int("NUM_FOCAL")
Float percent_of_focal = read_float("PCT_OF_FOCAL")
Int num_reads = read_int("NUM_READS")
String tax_rank = read_string("TAX_RANK")
String tax_id = read_string("TAX_ID")
String tax_name = read_string("TAX_NAME")
}

runtime {
docker: docker
memory: machine_mem_gb + " GB"
cpu: 1
disks: "local-disk " + disk_size + " LOCAL"
disk: disk_size + " GB" # TES
dx_instance_type: "mem1_ssd1_v2_x2"
preemptible: 2
maxRetries: 2
}
}

task filter_refs_to_found_taxa {
meta {
description: "Filters a taxid_to_ref_accessions_tsv to the set of taxa found in a focal_report."
}
input {
File taxid_to_ref_accessions_tsv
File focal_report_tsv
File taxdump_tgz
Int min_read_count = 100

String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}
String ref_basename = basename(taxid_to_ref_accessions_tsv, '.tsv')
String hits_basename = basename(focal_report_tsv, '.tsv')
Int disk_size = 50

command <<<
set -e
mkdir -p taxdump
read_utils.py extract_tarball "~{taxdump_tgz}" taxdump
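# presumably keeps only taxa whose focal-report read count meets min_read_count, writing a filtered copy of the accessions TSV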
metagenomics.py filter_taxids_to_focal_hits "~{taxid_to_ref_accessions_tsv}" "~{focal_report_tsv}" taxdump ~{min_read_count} "~{ref_basename}-~{hits_basename}.tsv"
>>>

output {
File filtered_taxid_to_ref_accessions_tsv = "~{ref_basename}-~{hits_basename}.tsv"
}

runtime {
docker: docker
memory: "2 GB"
cpu: 1
disks: "local-disk " + disk_size + " LOCAL"
disk: disk_size + " GB" # TES
dx_instance_type: "mem1_ssd1_v2_x2"
preemptible: 2
maxRetries: 2
}
}

task build_kraken2_db {
meta {
description: "Builds a custom kraken2 database. Outputs tar.zst tarballs of kraken2 database, associated krona taxonomy db, and an ncbi taxdump.tar.gz. Requires live internet access if any standard_libraries are specified or if taxonomy_db_tgz is absent."
@@ -348,7 +436,7 @@ task build_kraken2_db {
Int? zstd_compression_level

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

Int disk_size = 750
@@ -490,7 +578,7 @@ task blastx {
File krona_taxonomy_db_tgz

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

parameter_meta {
@@ -580,7 +668,7 @@ task krona {
Int? magnitude_column

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

Int disk_size = 50
@@ -687,7 +775,7 @@ task filter_bam_to_taxa {
String out_filename_suffix = "filtered"

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

String out_basename = basename(classified_bam, ".bam") + "." + out_filename_suffix
@@ -774,7 +862,7 @@ task kaiju {
File krona_taxonomy_db_tgz # taxonomy/taxonomy.tab
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

String input_basename = basename(reads_unmapped_bam, ".bam")
2 changes: 1 addition & 1 deletion pipes/WDL/tasks/tasks_reports.wdl
@@ -488,7 +488,7 @@ task aggregate_metagenomics_reports {
String aggregate_taxlevel_focus = "species"
Int aggregate_top_N_hits = 5
String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}
parameter_meta {
6 changes: 3 additions & 3 deletions pipes/WDL/tasks/tasks_taxon_filter.wdl
@@ -14,7 +14,7 @@ task deplete_taxa {

Int? cpu=8
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

parameter_meta {
@@ -113,7 +113,7 @@ task filter_to_taxon {
String? neg_control_prefixes_space_separated = "neg water NTC"

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

# do this in two steps in case the input doesn't actually have "cleaned" in the name
@@ -172,7 +172,7 @@ task build_lastal_db {
File sequences_fasta

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
}

String db_name = basename(sequences_fasta, ".fasta")
29 changes: 29 additions & 0 deletions pipes/WDL/tasks/tasks_utils.wdl
@@ -712,6 +712,35 @@ task s3_copy {
}
}
task string_split {
meta {
description: "Split a string by a delimiter."
}
input {
String joined_string
String delimiter
}
command <<<
set -e
python3<<CODE
with open('TOKENS', 'wt') as outf:
for token in "~{joined_string}".split("~{delimiter}"):
outf.write(token + '\n')
CODE
>>>
output {
Array[String] tokens = read_lines("TOKENS")
}
runtime {
docker: "python:slim"
memory: "1 GB"
cpu: 1
disks: "local-disk 50 SSD"
disk: "50 GB" # TES
maxRetries: 2
}
}
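A minimal usage sketch of the new task (a hypothetical call, not part of this commit; the alias and accession strings are illustrative, reusing accessions that appear elsewhere in this diff):

    call utils.string_split as split_accessions {
        input:
            joined_string = "NC_026438.1;NC_026435.1",
            delimiter = ";"
    }

Here split_accessions.tokens would read back as ["NC_026438.1", "NC_026435.1"].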
task filter_sequences_by_length {
meta {
description: "Filter sequences in a fasta file to enforce a minimum count of non-N bases."
13 changes: 13 additions & 0 deletions pipes/WDL/workflows/classify_single.wdl
@@ -121,6 +121,10 @@ workflow classify_single {
trim_clip_db = trim_clip_db,
always_succeed = true
}
call metagenomics.report_primary_kraken_taxa {
input:
kraken_summary_report = kraken2.kraken2_summary_report
}

output {
File cleaned_reads_unaligned_bam = deplete.bam_filtered_to_taxa
@@ -134,6 +138,15 @@ workflow classify_single {

File kraken2_summary_report = kraken2.kraken2_summary_report
File kraken2_krona_plot = kraken2.krona_report_html
File kraken2_top_taxa_report = report_primary_kraken_taxa.ranked_focal_report
String kraken2_focal_taxon_name = report_primary_kraken_taxa.focal_tax_name
Int kraken2_focal_total_reads = report_primary_kraken_taxa.total_focal_reads
String kraken2_top_taxon_id = report_primary_kraken_taxa.tax_id
String kraken2_top_taxon_name = report_primary_kraken_taxa.tax_name
String kraken2_top_taxon_rank = report_primary_kraken_taxa.tax_rank
Int kraken2_top_taxon_num_reads = report_primary_kraken_taxa.num_reads
Float kraken2_top_taxon_pct_of_focal = report_primary_kraken_taxa.percent_of_focal

File raw_fastqc = merge_raw_reads.fastqc
File cleaned_fastqc = fastqc_cleaned.fastqc_html
File spikein_report = spikein.report
159 changes: 97 additions & 62 deletions pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl
@@ -1,6 +1,7 @@
version 1.0

import "../tasks/tasks_assembly.wdl" as assembly
import "../tasks/tasks_metagenomics.wdl" as metagenomics
import "../tasks/tasks_ncbi.wdl" as ncbi
import "../tasks/tasks_utils.wdl" as utils
import "assemble_refbased.wdl" as assemble_refbased
@@ -17,39 +18,45 @@ workflow scaffold_and_refine_multitaxa {
String sample_id
File reads_unmapped_bam

Array[Pair[Int,Array[String]+]] taxid_to_ref_accessions = [
(208893, ["KY654518.1"]), # RSV-A
(208895, ["MZ516105.1"]), # RSV-B
(573824, ["NC_038311.1"]), # Rhino A1
(185900, ["ON311191.1"]), # Rhino B27
(1418033, ["ON311169.1"]), # Rhino C15
(463676, ["JN837686.2"]), # Rhino C45
(11137, ["NC_002645.1"]), # HCoV 229E
(290028, ["NC_006577.2"]), # HCoV HKU1
(277944, ["NC_005831.2"]), # HCoV NL63
(31631, ["NC_006213.1"]), # HCoV OC43
(2697049, ["NC_045512.2"]), # SARS-CoV-2 Wuhan Hu-1
(641809, ["NC_026438.1", "NC_026435.1", "NC_026437.1", "NC_026433.1", "NC_026436.1", "NC_026434.1", "NC_026431.1", "NC_026432.1"]), # Flu A/California/07/2009 H1N1
(335341, ["NC_007373.1", "NC_007372.1", "NC_007371.1", "NC_007366.1", "NC_007369.1", "NC_007368.1", "NC_007367.1", "NC_007370.1"]), # Flu A/New York/392/2004 H3N2
(518987, ["NC_002204.1", "NC_002205.1", "NC_002206.1", "NC_002207.1", "NC_002208.1", "NC_002209.1", "NC_002210.1", "NC_002211.1"]), # Flu B/Lee/1940
(162145, ["NC_039199.1"]), # metapneumo
(12730, ["NC_003461.1"]), # paraflu 1
(2560525, ["NC_003443.1"]), # paraflu 2
(11216, ["NC_001796.2"]), # paraflu 3
(11224, ["NC_021928.1"]), # paraflu 4
(129951, ["NC_001405.1"]) # adenovirus C
]
File taxid_to_ref_accessions_tsv
File? focal_report_tsv
File? ncbi_taxdump_tgz

# Float min_pct_reference_covered = 0.1
}
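For illustration, a hypothetical taxid_to_ref_accessions_tsv row (tab-separated), assuming the three-column layout noted in the scatter comment below (taxon = [taxid, taxname, semicolon_delim_accession_list]); the taxid and accession come from the removed inline array, and the tax name spelling is an assumption:

    2697049	Severe acute respiratory syndrome coronavirus 2	NC_045512.2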

Array[String] assembly_header = ["sample_id", "taxid", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"]
Int min_scaffold_unambig = 10

# if kraken reports are available, filter scaffold list to observed hits (output might be empty!)
if(defined(focal_report_tsv) && defined(ncbi_taxdump_tgz)) {
call metagenomics.filter_refs_to_found_taxa {
input:
taxid_to_ref_accessions_tsv = taxid_to_ref_accessions_tsv,
taxdump_tgz = select_first([ncbi_taxdump_tgz]),
focal_report_tsv = select_first([focal_report_tsv])
}
}

Array[String] assembly_header = ["sample_id", "taxid", "tax_name", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"]
Array[Array[String]] taxid_to_ref_accessions = read_tsv(select_first([filter_refs_to_found_taxa.filtered_taxid_to_ref_accessions_tsv, taxid_to_ref_accessions_tsv]))
scatter(taxon in taxid_to_ref_accessions) {
# Cromwell's read_tsv emits [[""]] for an empty (0-byte) input file; turn it into []
if(length(taxon)>1) {
Array[String] taxid_to_ref_accessions_fix_cromwell_read_tsv_bug = taxon
}
}

scatter(taxon in select_all(taxid_to_ref_accessions_fix_cromwell_read_tsv_bug)) {
# taxon = [taxid, taxname, semicolon_delim_accession_list]
call utils.string_split {
input:
joined_string = taxon[2],
delimiter = ";"
}
call ncbi.download_annotations {
input:
accessions = taxon.right,
combined_out_prefix = taxon.left
accessions = string_split.tokens,
combined_out_prefix = taxon[0]
}
call assembly.scaffold {
input:
@@ -59,41 +66,64 @@ workflow scaffold_and_refine_multitaxa {
min_unambig = 0,
allow_incomplete_output = true
}
call assemble_refbased.assemble_refbased as refine {
input:
reads_unmapped_bams = [reads_unmapped_bam],
reference_fasta = scaffold.scaffold_fasta,
sample_name = sample_id
if (scaffold.assembly_preimpute_length_unambiguous > min_scaffold_unambig) {
# polish de novo assembly with reads
call assemble_refbased.assemble_refbased as refine {
input:
reads_unmapped_bams = [reads_unmapped_bam],
reference_fasta = scaffold.scaffold_fasta,
sample_name = sample_id
}
String assembly_method_denovo = "viral-ngs/assemble_denovo"
}
# to do: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single
# to do: if biosample attributes file provided, run ncbi.biosample_to_genbank
if (scaffold.assembly_preimpute_length_unambiguous <= min_scaffold_unambig) {
# fall back to refbased assembly if de novo fails
call assemble_refbased.assemble_refbased as ref_based {
input:
reads_unmapped_bams = [reads_unmapped_bam],
reference_fasta = download_annotations.combined_fasta,
sample_name = sample_id
}
String assembly_method_refbased = "viral-ngs/assemble_refbased"
}

# TO DO: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single
# TO DO: if biosample attributes file provided, run ncbi.biosample_to_genbank
String taxid = taxon[0]
String tax_name = taxon[1]
Int assembly_length_unambiguous = select_first([refine.assembly_length_unambiguous, ref_based.assembly_length_unambiguous])
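# a 0-to-1 fraction despite the name: e.g. 28500 unambiguous bases over a 29903 nt reference gives ~0.95 (illustrative numbers)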
Float percent_reference_covered = 1.0 * assembly_length_unambiguous / scaffold.reference_length
File assembly_fasta = select_first([refine.assembly_fasta, ref_based.assembly_fasta])
Map[String, String] stats_by_taxon = {
"sample_id" : sample_id,
"taxid" : taxon.left,

"assembly_fasta" : refine.assembly_fasta,
"aligned_only_reads_bam" : refine.align_to_self_merged_aligned_only_bam,
"coverage_plot" : refine.align_to_self_merged_coverage_plot,
"assembly_length" : refine.assembly_length,
"assembly_length_unambiguous" : refine.assembly_length_unambiguous,
"reads_aligned" : refine.align_to_self_merged_reads_aligned,
"mean_coverage" : refine.align_to_self_merged_mean_coverage,
"percent_reference_covered" : 1.0 * refine.assembly_length_unambiguous / refine.reference_genome_length,

"intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta,
"taxid" : taxid,
"tax_name" : tax_name,

"assembly_fasta" : assembly_fasta,
"aligned_only_reads_bam" : select_first([refine.align_to_self_merged_aligned_only_bam, ref_based.align_to_self_merged_aligned_only_bam]),
"coverage_plot" : select_first([refine.align_to_self_merged_coverage_plot, ref_based.align_to_self_merged_coverage_plot]),
"assembly_length" : select_first([refine.assembly_length, ref_based.assembly_length]),
"assembly_length_unambiguous" : assembly_length_unambiguous,
"reads_aligned" : select_first([refine.align_to_self_merged_reads_aligned, ref_based.align_to_self_merged_reads_aligned]),
"mean_coverage" : select_first([refine.align_to_self_merged_mean_coverage, ref_based.align_to_self_merged_mean_coverage]),
"percent_reference_covered" : percent_reference_covered,

"intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta,
"assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous,

"replicate_concordant_sites" : refine.replicate_concordant_sites,
"replicate_discordant_snps" : refine.replicate_discordant_snps,
"replicate_discordant_indels" : refine.replicate_discordant_indels,
"replicate_discordant_vcf" : refine.replicate_discordant_vcf,
"replicate_concordant_sites" : select_first([refine.replicate_concordant_sites, ref_based.replicate_concordant_sites]),
"replicate_discordant_snps" : select_first([refine.replicate_discordant_snps, ref_based.replicate_discordant_snps]),
"replicate_discordant_indels" : select_first([refine.replicate_discordant_indels, ref_based.replicate_discordant_indels]),
"replicate_discordant_vcf" : select_first([refine.replicate_discordant_vcf, ref_based.replicate_discordant_vcf]),

"isnvsFile" : refine.align_to_self_isnvs_vcf,
"aligned_bam" : refine.align_to_self_merged_aligned_only_bam,
"coverage_tsv" : refine.align_to_self_merged_coverage_tsv,
"read_pairs_aligned" : refine.align_to_self_merged_read_pairs_aligned,
"bases_aligned" : refine.align_to_self_merged_bases_aligned
"isnvsFile" : select_first([refine.align_to_self_isnvs_vcf, ref_based.align_to_self_isnvs_vcf]),
"aligned_bam" : select_first([refine.align_to_self_merged_aligned_only_bam, ref_based.align_to_self_merged_aligned_only_bam]),
"coverage_tsv" : select_first([refine.align_to_self_merged_coverage_tsv, ref_based.align_to_self_merged_coverage_tsv]),
"read_pairs_aligned" : select_first([refine.align_to_self_merged_read_pairs_aligned, ref_based.align_to_self_merged_read_pairs_aligned]),
"bases_aligned" : select_first([refine.align_to_self_merged_bases_aligned, ref_based.align_to_self_merged_bases_aligned]),

"assembly_method" : select_first([assembly_method_denovo, assembly_method_refbased])
}

scatter(h in assembly_header) {
@@ -109,14 +139,19 @@ workflow scaffold_and_refine_multitaxa {
}

output {
Array[Map[String,String]] assembly_stats_by_taxon = stats_by_taxon
File assembly_stats_by_taxon_tsv = concatenate.combined

Int num_read_groups = refine.num_read_groups[0]
Int num_libraries = refine.num_libraries[0]

String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa"
String scaffold_viral_assemble_version = scaffold.viralngs_version[0]
String refine_viral_assemble_version = refine.viral_assemble_version[0]
Array[Map[String,String]] assembly_stats_by_taxon = stats_by_taxon
File assembly_stats_by_taxon_tsv = concatenate.combined
String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa"

## TO DO: some summary stats on stats_by_taxon: how many rows, numbers from the best row, etc
#String assembly_top_taxon_id =
#String assembly_top_length_unambiguous =
#Float assembly_top_pct_ref_cov =
#File assembly_top_fasta =
Array[String] assembly_all_taxids = taxid
Array[String] assembly_all_taxnames = tax_name
Array[Int] assembly_all_lengths_unambig = assembly_length_unambiguous
Array[Float] assembly_all_pct_ref_cov = percent_reference_covered
Array[File] assembly_all_fastas = assembly_fasta
}
}
2 changes: 1 addition & 1 deletion requirements-modules.txt
@@ -1,6 +1,6 @@
broadinstitute/viral-core=2.2.4
broadinstitute/viral-assemble=2.2.4.0
broadinstitute/viral-classify=2.2.3.0
broadinstitute/viral-classify=2.2.4.0
broadinstitute/viral-phylo=2.1.20.2
broadinstitute/py3-bio=0.1.2
broadinstitute/beast-beagle-cuda=1.10.5pre
