-
Notifications
You must be signed in to change notification settings - Fork 27
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
more scaffolding updates #511
Changes from 17 commits
7064708
0593083
0bedc96
98f9bbd
e39919b
a85d7c9
9e12088
02cf671
d824518
fa07252
031a294
8a9b26f
165eb66
8c898c9
1080d49
1a77bf7
d31c14a
526cece
f02a58b
ca24b2d
bc6bee7
6a71e1a
93d455f
88ca4d1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
version 1.0 | ||
|
||
import "../tasks/tasks_assembly.wdl" as assembly | ||
import "../tasks/tasks_metagenomics.wdl" as metagenomics | ||
import "../tasks/tasks_ncbi.wdl" as ncbi | ||
import "../tasks/tasks_utils.wdl" as utils | ||
import "assemble_refbased.wdl" as assemble_refbased | ||
|
@@ -17,39 +18,37 @@ workflow scaffold_and_refine_multitaxa { | |
String sample_id | ||
File reads_unmapped_bam | ||
|
||
Array[Pair[Int,Array[String]+]] taxid_to_ref_accessions = [ | ||
(208893, ["KY654518.1"]), # RSV-A | ||
(208895, ["MZ516105.1"]), # RSV-B | ||
(573824, ["NC_038311.1"]), # Rhino A1 | ||
(185900, ["ON311191.1"]), # Rhino B27 | ||
(1418033, ["ON311169.1"]), # Rhino C15 | ||
(463676, ["JN837686.2"]), # Rhino C45 | ||
(11137, ["NC_002645.1"]), # HCoV 229E | ||
(290028, ["NC_006577.2"]), # HCoV HKU1 | ||
(277944, ["NC_005831.2"]), # HCoV NL63 | ||
(31631, ["NC_006213.1"]), # HCoV OC43 | ||
(2697049, ["NC_045512.2"]), # SARS-CoV-2 Wuhan Hu-1 | ||
(641809, ["NC_026438.1", "NC_026435.1", "NC_026437.1", "NC_026433.1", "NC_026436.1", "NC_026434.1", "NC_026431.1", "NC_026432.1"]), # Flu A/California/07/2009 H1N1 | ||
(335341, ["NC_007373.1", "NC_007372.1", "NC_007371.1", "NC_007366.1", "NC_007369.1", "NC_007368.1", "NC_007367.1", "NC_007370.1"]), # Flu A/New York/392/2004 H3N2 | ||
(518987, ["NC_002204.1", "NC_002205.1", "NC_002206.1", "NC_002207.1", "NC_002208.1", "NC_002209.1", "NC_002210.1", "NC_002211.1"]), # Flu B/Lee/1940 | ||
(162145, ["NC_039199.1"]), # metapneumo | ||
(12730, ["NC_003461.1"]), # paraflu 1 | ||
(2560525, ["NC_003443.1"]), # paraflu 2 | ||
(11216, ["NC_001796.2"]), # paraflu 3 | ||
(11224, ["NC_021928.1"]), # paraflu 4 | ||
(129951, ["NC_001405.1"]) # adenovirus C | ||
] | ||
File taxid_to_ref_accessions_tsv | ||
File? focal_report_tsv | ||
File? ncbi_taxdump_tgz | ||
|
||
# Float min_pct_reference_covered = 0.1 | ||
} | ||
|
||
Array[String] assembly_header = ["sample_id", "taxid", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"] | ||
# if kraken reports are available, filter scaffold list to observed hits (output might be empty!) | ||
if(defined(focal_report_tsv) && defined(ncbi_taxdump_tgz)) { | ||
call metagenomics.filter_refs_to_found_taxa { | ||
input: | ||
taxid_to_ref_accessions_tsv = taxid_to_ref_accessions_tsv, | ||
taxdump_tgz = select_first([ncbi_taxdump_tgz]), | ||
focal_report_tsv = select_first([focal_report_tsv]) | ||
} | ||
} | ||
|
||
Array[Array[String]] taxid_to_ref_accessions = read_tsv(select_first([filter_refs_to_found_taxa.filtered_taxid_to_ref_accessions_tsv, taxid_to_ref_accessions_tsv])) | ||
Array[String] assembly_header = ["sample_id", "taxid", "tax_name", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned"] | ||
|
||
scatter(taxon in taxid_to_ref_accessions) { | ||
# taxon = [taxid, taxname, semicolon_delim_accession_list] | ||
call utils.string_split { | ||
input: | ||
joined_string = taxon[2], | ||
delimiter = ";" | ||
} | ||
call ncbi.download_annotations { | ||
input: | ||
accessions = taxon.right, | ||
combined_out_prefix = taxon.left | ||
accessions = string_split.tokens, | ||
combined_out_prefix = taxon[0] | ||
} | ||
call assembly.scaffold { | ||
input: | ||
|
@@ -65,12 +64,17 @@ workflow scaffold_and_refine_multitaxa { | |
reference_fasta = scaffold.scaffold_fasta, | ||
sample_name = sample_id | ||
} | ||
# to do: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single | ||
# to do: if biosample attributes file provided, run ncbi.biosample_to_genbank | ||
# TO DO: if percent_reference_covered > some threshold, run ncbi.rename_fasta_header and ncbi.align_and_annot_transfer_single | ||
# TO DO: if biosample attributes file provided, run ncbi.biosample_to_genbank | ||
|
||
if (refine.reference_genome_length > 0) { | ||
Float percent_reference_covered = 1.0 * refine.assembly_length_unambiguous / refine.reference_genome_length | ||
} | ||
|
||
Map[String, String] stats_by_taxon = { | ||
"sample_id" : sample_id, | ||
"taxid" : taxon.left, | ||
"taxid" : taxon[0], | ||
"tax_name" : taxon[1], | ||
|
||
"assembly_fasta" : refine.assembly_fasta, | ||
"aligned_only_reads_bam" : refine.align_to_self_merged_aligned_only_bam, | ||
|
@@ -79,7 +83,7 @@ workflow scaffold_and_refine_multitaxa { | |
"assembly_length_unambiguous" : refine.assembly_length_unambiguous, | ||
"reads_aligned" : refine.align_to_self_merged_reads_aligned, | ||
"mean_coverage" : refine.align_to_self_merged_mean_coverage, | ||
"percent_reference_covered" : 1.0 * refine.assembly_length_unambiguous / refine.reference_genome_length, | ||
"percent_reference_covered" : select_first([percent_reference_covered, 0.0]), | ||
|
||
"intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta, | ||
"assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous, | ||
|
@@ -109,14 +113,10 @@ workflow scaffold_and_refine_multitaxa { | |
} | ||
|
||
output { | ||
Array[Map[String,String]] assembly_stats_by_taxon = stats_by_taxon | ||
File assembly_stats_by_taxon_tsv = concatenate.combined | ||
|
||
Int num_read_groups = refine.num_read_groups[0] | ||
Int num_libraries = refine.num_libraries[0] | ||
Array[Map[String,String]] assembly_stats_by_taxon = stats_by_taxon | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Any reason we can't make this type There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Mostly just because of how we construct it (see the scatter in the WDL above), and that WDL 1.0 lacks a lot of the basic methods for navigating |
||
File assembly_stats_by_taxon_tsv = concatenate.combined | ||
String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa" | ||
|
||
String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa" | ||
String scaffold_viral_assemble_version = scaffold.viralngs_version[0] | ||
String refine_viral_assemble_version = refine.viral_assemble_version[0] | ||
# TO DO: some summary stats on stats_by_taxon: how many rows, numbers from the best row, etc | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It may be nice to break out `tax_name` and `percent_reference_covered` for the "top" viral assembly into separate workflow outputs, for easier search and filtering on Terra (where "top" could be defined as the most complete assembly, or the most abundant taxon in terms of # of reads or # of matching distinct k-mers).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added into the TO DO comments at the bottom of the WDL. I think this will require a small bespoke tsv-parsing task for this purpose. It will also need to be resilient to the empty-output scenario (i.e., there is no top assembly because none were attempted or were successful).