From bb3cbe606379a930483ad193124140e0f0baa55d Mon Sep 17 00:00:00 2001
From: Chris Tomkins-Tinch
Date: Sun, 4 Aug 2024 16:03:22 -0300
Subject: [PATCH] bugfix dx deploy failures (#548)

* formatting of megablast wdl for consistency; add memory reporting to kraken2 and krakenuniq tasks

* (re)formatting of blastoff.wdl

* change in scope of blast_hits_output in ChunkBlastHits

* trigger dnanexus deployment on PRs targeting master

* cancel earlier actions if a commit is added to a branch or a PR

* apply GH Actions concurrency limit only to the deploy_dnanexus job

* allow the user to pass an email address as an input to fetch_sra_to_bam (for NCBI API access)
---
 .github/workflows/build.yml              |   6 +-
 pipes/WDL/tasks/tasks_megablast.wdl      | 580 ++++++++++++-----------
 pipes/WDL/tasks/tasks_metagenomics.wdl   |  10 +
 pipes/WDL/tasks/tasks_read_utils.wdl     |   2 +-
 pipes/WDL/workflows/blastoff.wdl         |  42 +-
 pipes/WDL/workflows/fetch_sra_to_bam.wdl |   8 +-
 6 files changed, 348 insertions(+), 300 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d082a880f..6619c4add 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -292,7 +292,11 @@ jobs:
   deploy_dnanexus:
     #if: ${{ github.event.ref == 'dnanexus' }}
-    if: github.event_name == 'release' || github.ref == 'refs/heads/master' || github.ref == 'refs/heads/dnanexus'
+    if: github.event_name == 'release' || github.ref == 'refs/heads/master' || github.ref == 'refs/heads/dnanexus' || (github.event_name == 'pull_request' && github.base_ref == 'master')
+    # cancel earlier actions if a commit is added to a branch or a PR
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+      cancel-in-progress: true
     needs:
       - validate_wdl_womtool
      - validate_wdl_miniwdl
diff --git a/pipes/WDL/tasks/tasks_megablast.wdl b/pipes/WDL/tasks/tasks_megablast.wdl
index dd1a9a52b..7b9822545 100755
--- a/pipes/WDL/tasks/tasks_megablast.wdl
+++ b/pipes/WDL/tasks/tasks_megablast.wdl
@@ -4,16 +4,20 @@ task trim_rmdup_subsamp {
    meta {
        description: "Trim reads via trimmomatic, remove duplicate reads, and subsample to a desired read count (default of 10,000,000); bam in, bam out."
" } + input { File inBam String bam_basename = basename(inBam, '.bam') File clipDb Int n_reads=10000000 + Int machine_mem_gb = 128 - Int cpu = 16 - Int disk_size_gb = 100 - String docker ="quay.io/broadinstitute/viral-assemble:2.3.2.0" + Int cpu = 16 + Int disk_size_gb = 100 + + String docker = "quay.io/broadinstitute/viral-assemble:2.3.2.0" } + parameter_meta { inBam: { description: "Input BAM file", @@ -28,6 +32,7 @@ task trim_rmdup_subsamp { category: "required" } } + command <<< set -ex o pipefail assembly.py --version | tee VERSION @@ -43,16 +48,19 @@ task trim_rmdup_subsamp { #-f 4 (f = include only) (4 = unmapped reads) https://broadinstitute.github.io/picard/explain-flags.html samtools fasta "$(pwd)/outbam.bam" > "~{bam_basename}.fasta" >>> -output { - File trimmed_fasta = "~{bam_basename}.fasta" -} -runtime { - docker:docker - memory: machine_mem_gb + "GB" - cpu: cpu - disks: "local-disk " + disk_size_gb + " LOCAL" - dx_instance_type: "n2-highmem-4" -} + + output { + File trimmed_fasta = "~{bam_basename}.fasta" + } + + runtime { + docker: docker + memory: machine_mem_gb + "GB" + cpu: cpu + disks: "local-disk " + disk_size_gb + " LOCAL" + + dx_instance_type: "n2-highmem-4" + } } task lca_megablast { @@ -66,10 +74,12 @@ task lca_megablast { String db_name File taxonomy_db_tgz String fasta_basename = basename(trimmed_fasta, ".fasta") + Int machine_mem_gb = 500 - Int cpu = 16 - Int disk_size_gb = 300 - String docker = "quay.io/broadinstitute/viral-classify:2.2.4.2" + Int cpu = 16 + Int disk_size_gb = 300 + + String docker = "quay.io/broadinstitute/viral-classify:2.2.4.2" } parameter_meta { trimmed_fasta: { @@ -90,301 +100,315 @@ task lca_megablast { } } command <<< - #Extract BLAST DB tarball - read_utils.py extract_tarball \ - ~{blast_db_tgz} . \ - --loglevel=DEBUG - - # Extract taxonomy DB tarball - read_utils.py extract_tarball \ - ~{taxonomy_db_tgz} . \ - --loglevel=DEBUG - - ''' - #Extract taxid map file tarball - read_utils.py extract_tarball \ - ~{taxdb} . \ - --loglevel=DEBUG - ''' - - #Set permissions - chmod +x /opt/viral-ngs/source/retrieve_top_blast_hits_LCA_for_each_sequence.pl - chmod +x /opt/viral-ngs/source/LCA_table_to_kraken_output_format.pl - # Verify BLAST database - blastdbcmd -db "~{db_name}" -info - if [ $? -ne 0 ]; then - echo "Database '~{db_name}' not found or is inaccessible." - exit 1 - else - echo "Database '~{db_name}' found and accessible." - fi - #miniwdl run worked when the Title DB was same as called under db. Remade DB, make sure to note title of DB. - #Log start time - START_TIME=$(date +%s) - # Run megablast against nt - blastn -task megablast -query "~{trimmed_fasta}" -db "~{db_name}" -max_target_seqs 50 -num_threads `nproc` -outfmt "6 qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue" -out "~{fasta_basename}.fasta_megablast_nt.tsv" - #Log end time - END_TIME=$(date +%s) - # Calculate elapsed time - ELAPSED_TIME=$(($END_TIME - $START_TIME)) - echo "BLAST step took $ELAPSED_TIME seconds." 
-        # Run LCA
-        retrieve_top_blast_hits_LCA_for_each_sequence.pl "~{fasta_basename}.fasta_megablast_nt.tsv" nodes.dmp 10 > "~{fasta_basename}.fasta_megablast_nt.tsv_LCA.txt"
-        # Run Krona output conversion
-        LCA_table_to_kraken_output_format.pl "~{fasta_basename}.fasta_megablast_nt.tsv_LCA.txt" "~{trimmed_fasta}" > "~{fasta_basename}.kraken.tsv"
-        # Done
-    >>>
-
-output {
-    File LCA_output = "~{fasta_basename}.fasta_megablast_nt.tsv_LCA.txt"
-    File kraken_output_fromat = "~{fasta_basename}.kraken.tsv"
-    File elapsed_time_normal_blastn = "blast_elapsed_time.txt"
-}
-runtime {
-    docker:docker
-    memory: machine_mem_gb + "GB"
-    cpu: cpu
-    disks: "local-disk" + disk_size_gb + "HDD"
-    dx_instance_type: "n2-highmem-16"
-}
+        #Extract BLAST DB tarball
+        read_utils.py extract_tarball \
+            ~{blast_db_tgz} . \
+            --loglevel=DEBUG
+
+        # Extract taxonomy DB tarball
+        read_utils.py extract_tarball \
+            ~{taxonomy_db_tgz} . \
+            --loglevel=DEBUG
+
+        # (disabled) Extract taxid map file tarball:
+        #read_utils.py extract_tarball \
+        #    taxdb . \
+        #    --loglevel=DEBUG
+
+        #Set permissions
+        chmod +x /opt/viral-ngs/source/retrieve_top_blast_hits_LCA_for_each_sequence.pl
+        chmod +x /opt/viral-ngs/source/LCA_table_to_kraken_output_format.pl
+        # Verify BLAST database
+        blastdbcmd -db "~{db_name}" -info
+        if [ $? -ne 0 ]; then
+            echo "Database '~{db_name}' not found or is inaccessible."
+            exit 1
+        else
+            echo "Database '~{db_name}' found and accessible."
+        fi
+        #miniwdl run worked when the Title DB was same as called under db. Remade DB, make sure to note title of DB.
+        #Log start time
+        START_TIME=$(date +%s)
+        # Run megablast against nt
+        blastn -task megablast -query "~{trimmed_fasta}" -db "~{db_name}" -max_target_seqs 50 -num_threads `nproc` -outfmt "6 qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue" -out "~{fasta_basename}.fasta_megablast_nt.tsv"
+        #Log end time
+        END_TIME=$(date +%s)
+        # Calculate elapsed time
+        ELAPSED_TIME=$(($END_TIME - $START_TIME))
+        echo "BLAST step took $ELAPSED_TIME seconds." > blast_elapsed_time.txt
+        # Run LCA
+        retrieve_top_blast_hits_LCA_for_each_sequence.pl "~{fasta_basename}.fasta_megablast_nt.tsv" nodes.dmp 10 > "~{fasta_basename}.fasta_megablast_nt.tsv_LCA.txt"
+        # Run Krona output conversion
+        LCA_table_to_kraken_output_format.pl "~{fasta_basename}.fasta_megablast_nt.tsv_LCA.txt" "~{trimmed_fasta}" > "~{fasta_basename}.kraken.tsv"
+        # Done
+    >>>
+
+    output {
+        File LCA_output = "~{fasta_basename}.fasta_megablast_nt.tsv_LCA.txt"
+        File kraken_output_format = "~{fasta_basename}.kraken.tsv"
+        File elapsed_time_normal_blastn = "blast_elapsed_time.txt"
+    }
+
+    runtime {
+        docker: docker
+        memory: machine_mem_gb + "GB"
+        cpu: cpu
+        disks: "local-disk " + disk_size_gb + " HDD"
+
+        dx_instance_type: "n2-highmem-16"
+    }
}
+
task ChunkBlastHits {
    meta {
        description: "Process BLAST hits from a FASTA file by dividing the file into smaller chunks for parallel processing (blastn_chunked_fasta)."
    }
+
    input {
        File inFasta
        File blast_db_tgz
        File? taxidlist
        String db_name
        String tasks = "megablast"
        Int chunkSize=1000000
        String outfmt = "6 qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue"
        Int max_target_seqs = 1
        String output_type = "full_line"
        String? log_dir
-        String blast_hits_output = "~{fasta_basename}_new_output.txt"
-        Int machine_mem_gb = 64
-        Int cpu = 16
-        Int disk_size_gb = 300
-        String docker = "quay.io/broadinstitute/viral-classify:fn_blast" #skip-global-version-pin
-    }
-    String fasta_basename = basename(inFasta, ".fasta")
-    #setting current working directory as logging outputs
-    String log_dir_final = select_first([log_dir, "."])
+
+        Int machine_mem_gb = 64
+        Int cpu = 16
+        Int disk_size_gb = 300
+
+        String docker = "quay.io/broadinstitute/viral-classify:fn_blast" #skip-global-version-pin
+    }
+
+    String fasta_basename = basename(inFasta, ".fasta")
+    String blast_hits_output = "~{fasta_basename}_new_output.txt"
+    #setting current working directory as logging outputs
+    String log_dir_final = select_first([log_dir, "."])
+
+    command <<<
+        #Extract tarball contents
+        read_utils.py extract_tarball \
+            ~{blast_db_tgz} . \
+            --loglevel=DEBUG
+        export LOG_DIR=~{log_dir_final}
+        echo "Using $(nproc) CPU cores."
+        echo "Asked for ~{machine_mem_gb} memory GB"
+        #Adding taxidlist input as optional
+        TAXIDLIST_OPTION=""
+        if [ -n "~{taxidlist}" ]; then
+            TAXIDLIST_OPTION="--taxidlist ~{taxidlist}"
+        fi
+        #COMMAND
+        time python /opt/viral-ngs/viral-classify/taxon_filter.py chunk_blast_hits "~{inFasta}" "~{db_name}" "~{blast_hits_output}" --outfmt '~{outfmt}' --chunkSize ~{chunkSize} --task '~{tasks}' --max_target_seqs "~{max_target_seqs}" --output_type "~{output_type}" $TAXIDLIST_OPTION
+        #add taxidlist to command only if user input
+
+        # Extract runtime
+        grep "Completed the WHOLE blastn_chunked_fasta in" ~{log_dir_final}/blast_py.log | awk '{print $NF}' > ~{log_dir_final}/duration_seconds.txt
+
+        if [ -f /sys/fs/cgroup/memory.peak ]; then
+            cat /sys/fs/cgroup/memory.peak
+        elif [ -f /sys/fs/cgroup/memory/memory.peak ]; then
+            cat /sys/fs/cgroup/memory/memory.peak
+        elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then
+            cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes
+        else
+            echo "0"
+        fi > MEM_BYTES
+        cat /proc/loadavg > CPU_LOAD
+    >>>
+
+    output {
+        File blast_hits = "~{blast_hits_output}"
+        File blast_py_log = "~{log_dir_final}/blast_py.log"
+        File duration_seconds = "~{log_dir_final}/duration_seconds.txt"
+        Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
+        String cpu_load = read_string("CPU_LOAD")
+    }
+
+    runtime {
+        docker: docker
+        cpu: cpu
+        memory: machine_mem_gb + " GB"
+        disks: "local-disk " + disk_size_gb + " LOCAL"
+        dx_instance_type: "n2-standard-16"
+    }
}

task blastoff {
-    meta{
+    meta {
        description:"Blastoff wrapper"
    }
+
    input{
        File trimmed_fasta
        String outfmt = "6 qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue"
        String tasks = "megablast"
        Int chunkSize = 5000000
        Int max_target_seqs = 50
        String output_type = "full_line"
        String? log_dir
        Int host_species = 9606
        Int stage2_min_id = 98
        Int stage2_min_qcov = 98 #Are these id/qcov b/w stage 2 & 3 ever different?
        Int stage3_min_id = 98
        Int stage3_min_qcov = 98
        File blast_db_tgz
        File taxonomy_db_tgz
        String db_name
        String fasta_basename = basename(trimmed_fasta, ".fasta")
+
        Int machine_mem_gb = 64
        Int cpu = 16
        Int disk_size_gb = 300
+
        String docker = "quay.io/broadinstitute/viral-classify:fn_blast" #skip-global-version-pin
    }
-    #setting current working directory as logging outputs
-    String log_dir_final = select_first([log_dir, "."])
-    command <<<
-        #Extract BLAST DB tarball
-        read_utils.py extract_tarball \
-            ~{blast_db_tgz} . \
-            --loglevel=DEBUG
-
-        # Extract taxonomy DB tarball (includes nodes.dmp)
-        read_utils.py extract_tarball \
-            ~{taxonomy_db_tgz} . \
-            --loglevel=DEBUG
-        export LOG_DIR=~{log_dir_final}
-        #Note threads and memory asked
-        echo "Using $(nproc) CPU cores."
-        echo "Asked for ~{machine_mem_gb} memory GB"
-        #permissions
-        chmod +x /opt/viral-ngs/source/retrieve_top_blast_hits_LCA_for_each_sequence.pl
-        chmod +x /opt/viral-ngs/source/retrieve_most_common_taxonids_in_LCA_output.pl
-        chmod +x /opt/viral-ngs/source/filter_LCA_matches.pl
-        chmod +x /opt/viral-ngs/source/add_one_value_column.pl
-        chmod +x /opt/viral-ngs/source/retrieve_sequences_appearing_or_not_appearing_in_table.pl
-        chmod +x /opt/viral-ngs/source/generate_LCA_table_for_sequences_with_no_matches.pl
-        chmod +x /opt/viral-ngs/source/concatenate_tables.pl
-        chmod +x /opt/viral-ngs/source/LCA_table_to_kraken_output_format.pl
-        chmod +x /opt/viral-ngs/source/select_random_sequences.pl
-
-
-        #STAGE 1
-        #Subsamples 100 random reads from original FASTA file
-        select_random_sequences.pl "~{trimmed_fasta}" 100 > "~{fasta_basename}_subsampled.fasta"
-        #run megablast on random reads x nt
-        #switched output from out to txt for readability issues
-        #blastn -task megablast -query "~{fasta_basename}_subsampled.fasta" -db "~{db_name}" -max_target_seqs 50 -num_threads `nproc` -outfmt "6 qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue" -out "~{fasta_basename}_subsampled.fasta_megablast_nt.tsv"
-        python /opt/viral-ngs/viral-classify/taxon_filter.py chunk_blast_hits "~{fasta_basename}_subsampled.fasta" "~{db_name}" "~{fasta_basename}_subsampled.fasta_megablast_nt.tsv" --outfmt '~{outfmt}' --chunkSize ~{chunkSize} --task '~{tasks}' --max_target_seqs "~{max_target_seqs}" --output_type "~{output_type}"
-        # Run LCA
-        retrieve_top_blast_hits_LCA_for_each_sequence.pl "~{fasta_basename}_subsampled.fasta_megablast_nt.tsv" nodes.dmp 1 1 > "~{fasta_basename}_subsampled.fasta_megablast_nt.tsv_LCA.txt"
-        #Looks for most frequently matched taxon IDs and outputs a list
-        retrieve_most_common_taxonids_in_LCA_output.pl "~{fasta_basename}_subsampled.fasta_megablast_nt.tsv_LCA.txt" species 10 1 > "sample_specific_db_taxa.txt"
-        # Create an empty sample_specific_db_taxa.txt if it doesn't exist
-        touch sample_specific_db_taxa.txt
-        #adding host_species to sample_specific_db_taxa.txt
-        echo "~{host_species}" >> "sample_specific_db_taxa.txt"
-        #ensure file is sorted and unique
-        sort sample_specific_db_taxa.txt | uniq > sample_specific_db_taxa_unique.txt
-        mv sample_specific_db_taxa_unique.txt sample_specific_db_taxa.txt
-        echo "input sequences to stage 2:"
-        grep ">" "~{trimmed_fasta}" | wc -l
-        echo "--END STAGE 1"
-
-        #STAGE 2
-        echo "---START STAGE 2"
-        echo "megablast sample-specific database start"
-        #Run blastn w/ taxidlist specific
-        #blastn -task megablast -query "~{fasta_basename}_subsampled.fasta" -db "~{db_name}" -max_target_seqs 50 -num_threads `nproc` -taxidlist "sample_specific_db_taxa.txt" -outfmt "6 qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue" -out "~{fasta_basename}_megablast_sample_specific_db.tsv"
-        python /opt/viral-ngs/viral-classify/taxon_filter.py chunk_blast_hits "~{trimmed_fasta}" "~{db_name}" "~{fasta_basename}_megablast_sample_specific_db.tsv" --outfmt '~{outfmt}' --chunkSize ~{chunkSize} --task '~{tasks}' --max_target_seqs "~{max_target_seqs}" --output_type "~{output_type}" --taxidlist "sample_specific_db_taxa.txt"
-        #Run LCA on last output
-        retrieve_top_blast_hits_LCA_for_each_sequence.pl "~{fasta_basename}_megablast_sample_specific_db.tsv" nodes.dmp 2 > "~{fasta_basename}_megablast_sample_specific_db_LCA.txt"
-        #filter
-        filter_LCA_matches.pl "~{fasta_basename}_megablast_sample_specific_db_LCA.txt" 1 0 0 "~{stage2_min_id}" 999 "~{stage2_min_qcov}" 999 > "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt"
"~{stage2_min_id}" 999 "~{stage2_min_qcov}" 999 > "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" - #add one clmn value: database - add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" "database" "sample-specific" > "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt_column_added.txt" - mv "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" - #add one clmn value: classified - add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" "classified" "classified" > "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt_column_added.txt" - mv "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" - #retrieves collection of unclassified sequences - retrieve_sequences_appearing_or_not_appearing_in_table.pl "~{trimmed_fasta}" "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" 0 0 > "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta" - # megablast_sample_specific_db_${sample_fasta}_unclassified.fasta = "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta" - echo "input sequences to stage 3" - grep ">" "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta" | wc -l - echo "--END STAGE 2" - echo "---START STAGE 3" + #setting current working directory as logging outputs + String log_dir_final = select_first([log_dir, "."]) - #Stage 3 - #/blast/results/${sample_fasta}_megablast_sample_specific_db_megablast_nt.out = "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.tsv" - #blastn -task megablast -query "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta" -db "~{db_name}" -max_target_seqs 50 -num_threads `nproc` -outfmt "6 qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue" -out "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.tsv" - python /opt/viral-ngs/viral-classify/taxon_filter.py chunk_blast_hits "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta" "~{db_name}" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.tsv" --outfmt '~{outfmt}' --chunkSize ~{chunkSize} --task '~{tasks}' --max_target_seqs "~{max_target_seqs}" --output_type "~{output_type}" - retrieve_top_blast_hits_LCA_for_each_sequence.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.tsv" nodes.dmp 10 > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt" - filter_LCA_matches.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt" 0 0 0 "~{stage3_min_id}" 999 "~{stage3_min_qcov}" 999 > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" - #add one column: database, nt - add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" "database" "nt" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_classified.txt_column_added.txt" - mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_classified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" - #add one column: classified, classified - add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" "classified" "classified" > 
"~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_classified.txt_column_added.txt" - mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_classified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" - filter_LCA_matches.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt" 0 0 0 "~{stage3_min_id}" 999 "~{stage3_min_qcov}" 999 1 > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" - add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" "database" "nt" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt_column_added.txt" - mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" - add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" "classified" "unclassified" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt_column_added.txt" - mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" - generate_LCA_table_for_sequences_with_no_matches.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt" "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" - add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" "database" "nt" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt_column_added.txt" - mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" - add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" "classified" "unclassified (no matches)" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt_column_added.txt" - mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" - #Table 1 (classified sequences from stage 2) "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" - #Table 2 (classified sequences from stage 3) "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" - #Table 3 (unclassified sequences from stage 3) "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" - #Table 4 (no-blast-hits sequences from stage 3) "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" - concatenate_tables.pl "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" > "~{fasta_basename}_blastoff.txt" - awk 'BEGIN {FS="\t";OFS="\t"} 
-        mv blastoff_no_first_column.txt "~{fasta_basename}_blastoff.txt"
-
-        LCA_table_to_kraken_output_format.pl "~{fasta_basename}_blastoff.txt" "~{trimmed_fasta}" > "~{fasta_basename}_blastoff_kraken.txt"
-
-
-        if [ -f /sys/fs/cgroup/memory.peak ]; then
-            cat /sys/fs/cgroup/memory.peak
-        elif [ -f /sys/fs/cgroup/memory/memory.peak ]; then
-            cat /sys/fs/cgroup/memory/memory.peak
-        elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then
-            cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes
-        else
-            echo "0"
-        fi > MEM_BYTES
-        cat /proc/loadavg > CPU_LOAD
+    #setting current working directory as logging outputs
+    String log_dir_final = select_first([log_dir, "."])
+
+    command <<<
+        #Extract BLAST DB tarball
+        read_utils.py extract_tarball \
+            ~{blast_db_tgz} . \
+            --loglevel=DEBUG
+
+        # Extract taxonomy DB tarball (includes nodes.dmp)
+        read_utils.py extract_tarball \
+            ~{taxonomy_db_tgz} . \
+            --loglevel=DEBUG
+
+        export LOG_DIR=~{log_dir_final}
+        #Note threads and memory asked
+        echo "Using $(nproc) CPU cores."
+        echo "Asked for ~{machine_mem_gb} memory GB"
+        #permissions
+        chmod +x /opt/viral-ngs/source/retrieve_top_blast_hits_LCA_for_each_sequence.pl
+        chmod +x /opt/viral-ngs/source/retrieve_most_common_taxonids_in_LCA_output.pl
+        chmod +x /opt/viral-ngs/source/filter_LCA_matches.pl
+        chmod +x /opt/viral-ngs/source/add_one_value_column.pl
+        chmod +x /opt/viral-ngs/source/retrieve_sequences_appearing_or_not_appearing_in_table.pl
+        chmod +x /opt/viral-ngs/source/generate_LCA_table_for_sequences_with_no_matches.pl
+        chmod +x /opt/viral-ngs/source/concatenate_tables.pl
+        chmod +x /opt/viral-ngs/source/LCA_table_to_kraken_output_format.pl
+        chmod +x /opt/viral-ngs/source/select_random_sequences.pl
+
+        #STAGE 1
+        #Subsamples 100 random reads from original FASTA file
+        select_random_sequences.pl "~{trimmed_fasta}" 100 > "~{fasta_basename}_subsampled.fasta"
+        #run megablast on random reads x nt
+        #switched output from out to txt for readability issues
+        #blastn -task megablast -query "~{fasta_basename}_subsampled.fasta" -db "~{db_name}" -max_target_seqs 50 -num_threads `nproc` -outfmt "6 qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue" -out "~{fasta_basename}_subsampled.fasta_megablast_nt.tsv"
+        python /opt/viral-ngs/viral-classify/taxon_filter.py chunk_blast_hits "~{fasta_basename}_subsampled.fasta" "~{db_name}" "~{fasta_basename}_subsampled.fasta_megablast_nt.tsv" --outfmt '~{outfmt}' --chunkSize ~{chunkSize} --task '~{tasks}' --max_target_seqs "~{max_target_seqs}" --output_type "~{output_type}"
+        # Run LCA
+        retrieve_top_blast_hits_LCA_for_each_sequence.pl "~{fasta_basename}_subsampled.fasta_megablast_nt.tsv" nodes.dmp 1 1 > "~{fasta_basename}_subsampled.fasta_megablast_nt.tsv_LCA.txt"
+        #Looks for most frequently matched taxon IDs and outputs a list
+        retrieve_most_common_taxonids_in_LCA_output.pl "~{fasta_basename}_subsampled.fasta_megablast_nt.tsv_LCA.txt" species 10 1 > "sample_specific_db_taxa.txt"
+        # Create an empty sample_specific_db_taxa.txt if it doesn't exist
+        touch sample_specific_db_taxa.txt
+        #adding host_species to sample_specific_db_taxa.txt
+        echo "~{host_species}" >> "sample_specific_db_taxa.txt"
+        #ensure file is sorted and unique
+        sort sample_specific_db_taxa.txt | uniq > sample_specific_db_taxa_unique.txt
+        mv sample_specific_db_taxa_unique.txt sample_specific_db_taxa.txt
+        echo "input sequences to stage 2:"
+        grep ">" "~{trimmed_fasta}" | wc -l
+        echo "--END STAGE 1"
+
+        #STAGE 2
+        echo "---START STAGE 2"
+        echo "megablast sample-specific database start"
+        #Run blastn w/ taxidlist specific
+        #blastn -task megablast -query "~{fasta_basename}_subsampled.fasta" -db "~{db_name}" -max_target_seqs 50 -num_threads `nproc` -taxidlist "sample_specific_db_taxa.txt" -outfmt "6 qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue" -out "~{fasta_basename}_megablast_sample_specific_db.tsv"
+        python /opt/viral-ngs/viral-classify/taxon_filter.py chunk_blast_hits "~{trimmed_fasta}" "~{db_name}" "~{fasta_basename}_megablast_sample_specific_db.tsv" --outfmt '~{outfmt}' --chunkSize ~{chunkSize} --task '~{tasks}' --max_target_seqs "~{max_target_seqs}" --output_type "~{output_type}" --taxidlist "sample_specific_db_taxa.txt"
+        #Run LCA on last output
+        retrieve_top_blast_hits_LCA_for_each_sequence.pl "~{fasta_basename}_megablast_sample_specific_db.tsv" nodes.dmp 2 > "~{fasta_basename}_megablast_sample_specific_db_LCA.txt"
+        #filter
+        filter_LCA_matches.pl "~{fasta_basename}_megablast_sample_specific_db_LCA.txt" 1 0 0 "~{stage2_min_id}" 999 "~{stage2_min_qcov}" 999 > "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt"
+        #add one clmn value: database
+        add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" "database" "sample-specific" > "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt_column_added.txt"
+        mv "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt"
+        #add one clmn value: classified
+        add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" "classified" "classified" > "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt_column_added.txt"
+        mv "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt"
+        #retrieves collection of unclassified sequences
+        retrieve_sequences_appearing_or_not_appearing_in_table.pl "~{trimmed_fasta}" "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" 0 0 > "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta"
+        # megablast_sample_specific_db_${sample_fasta}_unclassified.fasta = "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta"
+        echo "input sequences to stage 3"
+        grep ">" "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta" | wc -l
+        echo "--END STAGE 2"
+        echo "---START STAGE 3"
+
+        #Stage 3
+        #/blast/results/${sample_fasta}_megablast_sample_specific_db_megablast_nt.out = "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.tsv"
+        #blastn -task megablast -query "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta" -db "~{db_name}" -max_target_seqs 50 -num_threads `nproc` -outfmt "6 qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue" -out "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.tsv"
+        python /opt/viral-ngs/viral-classify/taxon_filter.py chunk_blast_hits "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta" "~{db_name}" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.tsv" --outfmt '~{outfmt}' --chunkSize ~{chunkSize} --task '~{tasks}' --max_target_seqs "~{max_target_seqs}" --output_type "~{output_type}"
+        retrieve_top_blast_hits_LCA_for_each_sequence.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.tsv" nodes.dmp 10 > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt"
+        filter_LCA_matches.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt" 0 0 0 "~{stage3_min_id}" 999 "~{stage3_min_qcov}" 999 > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt"
+        #add one column: database, nt
+        add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" "database" "nt" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_classified.txt_column_added.txt"
+        mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_classified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt"
+        #add one column: classified, classified
+        add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" "classified" "classified" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_classified.txt_column_added.txt"
+        mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_classified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt"
+        filter_LCA_matches.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt" 0 0 0 "~{stage3_min_id}" 999 "~{stage3_min_qcov}" 999 1 > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt"
+        add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" "database" "nt" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt_column_added.txt"
+        mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt"
+        add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" "classified" "unclassified" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt_column_added.txt"
+        mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt"
+        generate_LCA_table_for_sequences_with_no_matches.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt" "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt"
+        add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" "database" "nt" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt_column_added.txt"
+        mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt"
+        add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" "classified" "unclassified (no matches)" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt_column_added.txt"
+        mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt"
+        #Table 1 (classified sequences from stage 2) "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt"
"~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" + #Table 2 (classified sequences from stage 3) "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" + #Table 3 (unclassified sequences from stage 3) "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" + #Table 4 (no-blast-hits sequences from stage 3) "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" + concatenate_tables.pl "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" > "~{fasta_basename}_blastoff.txt" + awk 'BEGIN {FS="\t";OFS="\t"} {for (i=2; i<=NF; i++) printf "%s%s", $i, (i blastoff_no_first_column.txt + mv blastoff_no_first_column.txt "~{fasta_basename}_blastoff.txt" + + LCA_table_to_kraken_output_format.pl "~{fasta_basename}_blastoff.txt" "~{trimmed_fasta}" > "~{fasta_basename}_blastoff_kraken.txt" + + + if [ -f /sys/fs/cgroup/memory.peak ]; then + cat /sys/fs/cgroup/memory.peak + elif [ -f /sys/fs/cgroup/memory/memory.peak ]; then + cat /sys/fs/cgroup/memory/memory.peak + elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then + cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes + else + echo "0" + fi > MEM_BYTES + cat /proc/loadavg > CPU_LOAD >>> - output{ - File most_popular_taxon_id = "sample_specific_db_taxa.txt" - File blastoff_results = "~{fasta_basename}_blastoff.txt" - File blastoff_kraken = "~{fasta_basename}_blastoff_kraken.txt" - File blast_py_log = "~{log_dir_final}/blast_py.log" - Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000) - String cpu_load = read_string("CPU_LOAD") + output { + File most_popular_taxon_id = "sample_specific_db_taxa.txt" + File blastoff_results = "~{fasta_basename}_blastoff.txt" + File blastoff_kraken = "~{fasta_basename}_blastoff_kraken.txt" + File blast_py_log = "~{log_dir_final}/blast_py.log" + + Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000) + String cpu_load = read_string("CPU_LOAD") } + runtime{ - docker:docker + docker: docker memory: machine_mem_gb + "GB" - cpu: cpu - disks: "local-disk " + disk_size_gb + " HDD" + cpu: cpu + disks: "local-disk " + disk_size_gb + " HDD" + dx_instance_type: "n2-highmem-8" } - } diff --git a/pipes/WDL/tasks/tasks_metagenomics.wdl b/pipes/WDL/tasks/tasks_metagenomics.wdl index 8e2a5b6ff..32f0303bf 100644 --- a/pipes/WDL/tasks/tasks_metagenomics.wdl +++ b/pipes/WDL/tasks/tasks_metagenomics.wdl @@ -108,6 +108,8 @@ task krakenuniq { # merge all krona reports ktImportKrona -o krakenuniq.krona.combined.html *.krakenuniq-krona.html + + { if [ -f /sys/fs/cgroup/memory.peak ]; then cat /sys/fs/cgroup/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes; else echo "0"; fi } > MEM_BYTES >>> output { @@ -115,6 +117,9 @@ task krakenuniq { Array[File] krakenuniq_summary_reports = glob("*.krakenuniq-summary_report.txt") Array[File] krona_report_html = glob("*.krakenuniq-krona.html") File krona_report_merged_html = "krakenuniq.krona.combined.html" + + Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000) + String viralngs_version = read_string("VERSION") } @@ -305,12 +310,17 @@ task kraken2 { --loglevel=DEBUG wait # pigz reads.txt + + { if [ -f /sys/fs/cgroup/memory.peak 
  >>>

  output {
    File kraken2_reads_report = "~{out_basename}.kraken2.reads.txt.gz"
    File kraken2_summary_report = "~{out_basename}.kraken2.report.txt"
    File krona_report_html = "~{out_basename}.kraken2.krona.html"
+
+    Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
+
    String viralngs_version = read_string("VERSION")
  }
diff --git a/pipes/WDL/tasks/tasks_read_utils.wdl b/pipes/WDL/tasks/tasks_read_utils.wdl
index 715c10cc0..a42aad19e 100644
--- a/pipes/WDL/tasks/tasks_read_utils.wdl
+++ b/pipes/WDL/tasks/tasks_read_utils.wdl
@@ -428,7 +428,7 @@ task read_depths {
    >>>
    output {
-        File read_depths = "~{out_basename}.read_depths.txt"
+        File read_depths = "~{out_basename}.read_depths.txt"
    }
    runtime {
        docker: docker
diff --git a/pipes/WDL/workflows/blastoff.wdl b/pipes/WDL/workflows/blastoff.wdl
index 89401658b..15791a400 100644
--- a/pipes/WDL/workflows/blastoff.wdl
+++ b/pipes/WDL/workflows/blastoff.wdl
@@ -1,39 +1,43 @@
 version 1.0

-import "../tasks/tasks_megablast.wdl" as tools
+import "../tasks/tasks_megablast.wdl" as megablast

 workflow megablast {
    meta {
-        desription: "Runs megablast followed by LCA for taxon identification."
-        author: "Broad Viral Genomics"
-        email: "viral-ngs@broadinstitute.org"
+        description: "Runs megablast followed by LCA for taxon identification."
+        author: "Broad Viral Genomics"
+        email: "viral-ngs@broadinstitute.org"
        allowNestedInputs: true
    }
+
    input {
-        File inBam
-        File clipDb
-        File blast_db_tgz
-        File taxonomy_db_tgz
-        Int host_species
-        String db_name
+        File inBam
+        File clipDb
+        File blast_db_tgz
+        File taxonomy_db_tgz
+        Int host_species
+        String db_name
    }
-    call tools.trim_rmdup_subsamp {
+
+    call megablast.trim_rmdup_subsamp {
        input:
            inBam = inBam,
            clipDb = clipDb
    }
-    call tools.blastoff {
+
+    call megablast.blastoff {
        input:
-            trimmed_fasta = trim_rmdup_subsamp.trimmed_fasta,
-            blast_db_tgz = blast_db_tgz,
+            trimmed_fasta = trim_rmdup_subsamp.trimmed_fasta,
+            blast_db_tgz = blast_db_tgz,
            taxonomy_db_tgz = taxonomy_db_tgz,
-            host_species = host_species,
-            db_name = db_name
+            host_species = host_species,
+            db_name = db_name
    }
+
    output {
-        File most_popular_taxon_id = blastoff.most_popular_taxon_id
-        File blastoff_txt_results = blastoff.blastoff_results
-        File blastoff_kraken = blastoff.blastoff_kraken
+        File most_popular_taxon_id = blastoff.most_popular_taxon_id
+        File blastoff_txt_results = blastoff.blastoff_results
+        File blastoff_kraken = blastoff.blastoff_kraken
    }
 }
\ No newline at end of file
diff --git a/pipes/WDL/workflows/fetch_sra_to_bam.wdl b/pipes/WDL/workflows/fetch_sra_to_bam.wdl
index 8b226e957..cd1a2dd0f 100644
--- a/pipes/WDL/workflows/fetch_sra_to_bam.wdl
+++ b/pipes/WDL/workflows/fetch_sra_to_bam.wdl
@@ -11,12 +11,18 @@ workflow fetch_sra_to_bam {
        allowNestedInputs: true
    }

+    input {
+        String? email_address
+    }
+
    call terra.check_terra_env

+    String? user_email_address = select_first([email_address, check_terra_env.user_email, ""])
+
    #if(check_terra_env.is_running_on_terra) {
    call ncbi_tools.Fetch_SRA_to_BAM {
        input:
-            email_address = select_first([check_terra_env.user_email, ""])
+            email_address = user_email_address
    }
    #}
    #if(!check_terra_env.is_running_on_terra) {
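--
Usage note: with the new fetch_sra_to_bam input above, the NCBI API contact email can
be supplied directly in the workflow inputs JSON instead of being derived from the
Terra environment. A minimal sketch of such an inputs file follows; the nested
"Fetch_SRA_to_BAM.SRA_ID" key is an assumption for illustration (use whatever
accession input the Fetch_SRA_to_BAM task actually declares), and the accession shown
is a placeholder:

    {
      "fetch_sra_to_bam.email_address": "user@example.org",
      "fetch_sra_to_bam.Fetch_SRA_to_BAM.SRA_ID": "SRR000001"
    }

When "email_address" is omitted, the select_first() fallback uses
check_terra_env.user_email and finally the empty string, preserving the previous
behavior for runs on Terra.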