From 2d54ef0606d0b1bd8fb919e6a619aad39be44a6c Mon Sep 17 00:00:00 2001
From: golu099
Date: Wed, 5 Jun 2024 14:40:57 -0400
Subject: [PATCH 1/6] updating with relevant Blastoff scripts to new branch for PR request

---
 .dockstore.yml                          |  10 +
 pipes/WDL/tasks/tasks_megablast.wdl     | 390 ++++++++++++++++++++++++
 pipes/WDL/workflows/blastoff.wdl        |  39 +++
 pipes/WDL/workflows/megablast_chunk.wdl |  30 ++
 4 files changed, 469 insertions(+)
 create mode 100644 pipes/WDL/tasks/tasks_megablast.wdl
 create mode 100644 pipes/WDL/workflows/blastoff.wdl
 create mode 100644 pipes/WDL/workflows/megablast_chunk.wdl

diff --git a/.dockstore.yml b/.dockstore.yml
index 2e5185717..34df25cfa 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -414,3 +414,13 @@ workflows:
     primaryDescriptorPath: /pipes/WDL/workflows/create_enterics_qc_viz_general.wdl
     testParameterFiles:
       - /empty.json
+  - name: blastoff
+    subclass: WDL
+    primaryDescriptorPath: /pipes/WDL/workflows/blastoff.wdl
+    testParameterFiles:
+      - /empty.json
+   - name: chunk_blast
+    subclass: WDL
+    primaryDescriptorPath: /pipes/WDL/workflows/megablast_chunk.wdl
+    testParameterFiles:
+      - /empty.json
diff --git a/pipes/WDL/tasks/tasks_megablast.wdl b/pipes/WDL/tasks/tasks_megablast.wdl
new file mode 100644
index 000000000..5daef0591
--- /dev/null
+++ b/pipes/WDL/tasks/tasks_megablast.wdl
@@ -0,0 +1,390 @@
+version 1.0
+
+task trim_rmdup_subsamp {
+    meta {
+        description: "Trim reads via trimmomatic, remove duplicate reads, and subsample to a desired read count (default of 10,000,000); bam in, bam out."
+    }
+    input {
+        File inBam
+        String bam_basename = basename(inBam, '.bam')
+        File clipDb
+        Int n_reads = 10000000
+        Int machine_mem_gb = 128
+        Int cpu = 16
+        Int disk_size_gb = 100
+        String docker ="quay.io/broadinstitute/viral-assemble:2.3.1.3"
+    }
+    parameter_meta {
+        inBam: {
+            description: "Input BAM file",
+            category: "required"
+        }
+        clipDb: {
+            description: "FASTA file of sequences to trim from the ends of reads. These include various sequencing adapters and primer sequences that may be on the ends of reads, including those for most of the Illumina kits we use.",
+            category: "required"
+        }
+        n_reads: {
+            description: "Maximum number of reads to keep after subsampling (default: 10,000,000).",
+            category: "optional"
+        }
+    }
+    command <<<
+        set -ex -o pipefail
+        assembly.py --version | tee VERSION
+        # BAM -> FASTQ -> trimmed/deduplicated/subsampled BAM: https://github.com/broadinstitute/viral-assemble/blob/80bcc1da5c6a0174362ca9fd8bc0b49ee0b4103b/assembly.py#L91
+        assembly.py trim_rmdup_subsamp \
+            "~{inBam}" \
+            "~{clipDb}" \
+            "$(pwd)/outbam.bam" \
+            ~{'--n_reads=' + n_reads}
+
+        # samtools: convert the cleaned BAM to FASTA
+        # (SAM flag reference, e.g. -f 4 = keep only unmapped reads: https://broadinstitute.github.io/picard/explain-flags.html)
+        samtools fasta "$(pwd)/outbam.bam" > "~{bam_basename}.fasta"
+    >>>
+    output {
+        File trimmed_fasta = "~{bam_basename}.fasta"
+    }
+    runtime {
+        docker: docker
+        memory: machine_mem_gb + " GB"
+        cpu: cpu
+        disks: "local-disk " + disk_size_gb + " LOCAL"
+        dx_instance_type: "n2-highmem-4"
+    }
+}
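+
+# Typical usage, mirroring pipes/WDL/workflows/blastoff.wdl added later in this
+# patch (abbreviated sketch; the workflow also wires up the DB tarballs and outputs):
+#   call tools.trim_rmdup_subsamp { input: inBam = inBam, clipDb = clipDb }
+#   call tools.blastoff { input: trimmed_fasta = trim_rmdup_subsamp.trimmed_fasta, ... }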
+
+task lca_megablast {
+    meta {
+        description: "Runs megablast followed by LCA for taxon identification."
+    }
+    input {
+        File trimmed_fasta
+        File blast_db_tgz
+        File taxdb
+        String db_name
+        File taxonomy_db_tgz
+        String fasta_basename = basename(trimmed_fasta, ".fasta")
+        Int machine_mem_gb = 500
+        Int cpu = 16
+        Int disk_size_gb = 300
+        String docker = "quay.io/broadinstitute/viral-classify:2.2.4.2"
+    }
+    parameter_meta {
+        trimmed_fasta: {
+            description: "Input FASTA file of cleaned (trimmed, deduplicated) reads.",
+            category: "required"
+        }
+        blast_db_tgz: {
+            description: "Compressed BLAST database.",
+            category: "required"
+        }
+        db_name: {
+            description: "BLAST database name (e.g. nt).",
+            category: "other"
+        }
+        taxonomy_db_tgz: {
+            description: "Compressed taxonomy database.",
+            category: "required"
+        }
+    }
+    command <<<
+        # Extract BLAST DB tarball
+        read_utils.py extract_tarball \
+            ~{blast_db_tgz} . \
+            --loglevel=DEBUG
+
+        # Extract taxonomy DB tarball
+        read_utils.py extract_tarball \
+            ~{taxonomy_db_tgz} . \
+            --loglevel=DEBUG
+
+        # Extract taxid map file tarball (currently disabled)
+        #read_utils.py extract_tarball \
+        #    ~{taxdb} . \
+        #    --loglevel=DEBUG
+
+        # Set permissions
+        chmod +x /opt/viral-ngs/source/retrieve_top_blast_hits_LCA_for_each_sequence.pl
+        chmod +x /opt/viral-ngs/source/LCA_table_to_kraken_output_format.pl
+        # Verify the BLAST database is present and readable
+        blastdbcmd -db "~{db_name}" -info
+        if [ $? -ne 0 ]; then
+            echo "Database '~{db_name}' not found or is inaccessible."
+            exit 1
+        else
+            echo "Database '~{db_name}' found and accessible."
+        fi
+        # miniwdl run worked when the DB title matched the name passed to -db; the DB was remade, so note its title.
+        # Log start time
+        START_TIME=$(date +%s)
+        # Run megablast against nt
+        blastn -task megablast -query "~{trimmed_fasta}" -db "~{db_name}" -max_target_seqs 50 -num_threads `nproc` -outfmt "6 qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue" -out "~{fasta_basename}.fasta_megablast_nt.tsv"
+        # Log end time
+        END_TIME=$(date +%s)
+        # Calculate elapsed time
+        ELAPSED_TIME=$(($END_TIME - $START_TIME))
+        echo "BLAST step took $ELAPSED_TIME seconds." > blast_elapsed_time.txt
+        # Run LCA
+        retrieve_top_blast_hits_LCA_for_each_sequence.pl "~{fasta_basename}.fasta_megablast_nt.tsv" nodes.dmp 10 > "~{fasta_basename}.fasta_megablast_nt.tsv_LCA.txt"
+        # Convert the LCA table to kraken-style output
+        LCA_table_to_kraken_output_format.pl "~{fasta_basename}.fasta_megablast_nt.tsv_LCA.txt" "~{trimmed_fasta}" > "~{fasta_basename}.kraken.tsv"
+        # Done
+    >>>
+    output {
+        File LCA_output = "~{fasta_basename}.fasta_megablast_nt.tsv_LCA.txt"
+        File kraken_output_format = "~{fasta_basename}.kraken.tsv"
+        File elapsed_time_normal_blastn = "blast_elapsed_time.txt"
+    }
+    runtime {
+        docker: docker
+        memory: machine_mem_gb + " GB"
+        cpu: cpu
+        disks: "local-disk " + disk_size_gb + " HDD"
+        dx_instance_type: "n2-highmem-16"
+    }
+}
+
+task ChunkBlastHits {
+    meta {
+        description: "Process BLAST hits from a FASTA file by dividing the file into smaller chunks for parallel processing (blastn_chunked_fasta)."
+    }
+    input {
+        File inFasta
+        File blast_db_tgz
+        File? taxidlist
+        String db_name
+        String tasks = "megablast"
+        Int chunkSize = 1000000
+        String outfmt = "6 qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue"
+        Int max_target_seqs = 1
+        String output_type = "full_line"
+        String? log_dir
+        String blast_hits_output = basename(inFasta, ".fasta") + "_new_output.txt"
+        Int machine_mem_gb = 64
+        Int cpu = 16
+        Int disk_size_gb = 300
+        String docker = "quay.io/broadinstitute/viral-classify:fn_blast"
+    }
+    String fasta_basename = basename(inFasta, ".fasta")
+    #default the log directory to the current working directory
+    String log_dir_final = select_first([log_dir, "."])
+    command <<<
+        # Extract tarball contents
+        read_utils.py extract_tarball \
+            ~{blast_db_tgz} . \
+            --loglevel=DEBUG
+        export LOG_DIR=~{log_dir_final}
+        echo "Using $(nproc) CPU cores."
+        echo "Asked for ~{machine_mem_gb} GB of memory"
+        # Build the optional --taxidlist argument only if the user supplied one
+        TAXIDLIST_OPTION=""
+        if [ -n "~{taxidlist}" ]; then
+            TAXIDLIST_OPTION="--taxidlist ~{taxidlist}"
+        fi
+        # Run the chunked BLAST
+        time python /opt/viral-ngs/viral-classify/taxon_filter.py chunk_blast_hits "~{inFasta}" "~{db_name}" "~{blast_hits_output}" --outfmt '~{outfmt}' --chunkSize ~{chunkSize} --task '~{tasks}' --max_target_seqs "~{max_target_seqs}" --output_type "~{output_type}" $TAXIDLIST_OPTION
+
+        # Extract runtime
+        grep "Completed the WHOLE blastn_chunked_fasta in" ~{log_dir_final}/blast_py.log | awk '{print $NF}' > ~{log_dir_final}/duration_seconds.txt
+
+        # Record peak memory usage (path depends on the cgroup version in use)
+        if [ -f /sys/fs/cgroup/memory.peak ]; then
+            cat /sys/fs/cgroup/memory.peak
+        elif [ -f /sys/fs/cgroup/memory/memory.peak ]; then
+            cat /sys/fs/cgroup/memory/memory.peak
+        elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then
+            cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes
+        else
+            echo "0"
+        fi > MEM_BYTES
+        cat /proc/loadavg > CPU_LOAD
+    >>>
+    output {
+        File blast_hits = "~{blast_hits_output}"
+        File blast_py_log = "~{log_dir_final}/blast_py.log"
+        File duration_seconds = "~{log_dir_final}/duration_seconds.txt"
+        Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
+        String cpu_load = read_string("CPU_LOAD")
+    }
+    runtime {
+        docker: docker
+        cpu: cpu
+        memory: machine_mem_gb + " GB"
+        disks: "local-disk " + disk_size_gb + " LOCAL"
+        dx_instance_type: "n2-standard-16"
+    }
+}
+
+task blastoff {
+    meta {
+        description: "Blastoff wrapper: three-stage megablast with LCA-based taxon assignment."
+    }
+    input {
+        File trimmed_fasta
+        String outfmt = "6 qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue"
+        String tasks = "megablast"
+        Int chunkSize = 5000000
+        Int max_target_seqs = 50
+        String output_type = "full_line"
+        String? log_dir
+        Int host_species = 9606
+        Int stage2_min_id = 98
+        Int stage2_min_qcov = 98
+        # Are the id/qcov cutoffs ever different between stages 2 and 3?
+        Int stage3_min_id = 98
+        Int stage3_min_qcov = 98
+        File blast_db_tgz
+        File taxonomy_db_tgz
+        String db_name
+        String fasta_basename = basename(trimmed_fasta, ".fasta")
+        Int machine_mem_gb = 64
+        Int cpu = 16
+        Int disk_size_gb = 300
+        String docker = "quay.io/broadinstitute/viral-classify:fn_blast"
+
+    }
+    #default the log directory to the current working directory
+    String log_dir_final = select_first([log_dir, "."])
+    command <<<
+        # Extract BLAST DB tarball
+        read_utils.py extract_tarball \
+            ~{blast_db_tgz} . \
+            --loglevel=DEBUG
+
+        # Extract taxonomy DB tarball (includes nodes.dmp)
+        read_utils.py extract_tarball \
+            ~{taxonomy_db_tgz} . \
+            --loglevel=DEBUG
+
+        export LOG_DIR=~{log_dir_final}
+        # Note the threads and memory available
+        echo "Using $(nproc) CPU cores."
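+        # Overview of the three-stage strategy implemented below (restated from
+        # the per-stage comments and echoes in this command):
+        #   Stage 1: megablast a small random subsample of reads against the full
+        #            database to learn which taxa dominate this sample.
+        #   Stage 2: megablast ALL reads against only those taxa (plus the host
+        #            species) via --taxidlist, which is far cheaper than full nt.
+        #   Stage 3: send only the reads stage 2 left unclassified back to the
+        #            full database, then merge the classified, unclassified, and
+        #            no-hit tables into one result.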
+ echo "Asked for ~{machine_mem_gb} memory GB" + #permissions + chmod +x /opt/viral-ngs/source/retrieve_top_blast_hits_LCA_for_each_sequence.pl + chmod +x /opt/viral-ngs/source/retrieve_most_common_taxonids_in_LCA_output.pl + chmod +x /opt/viral-ngs/source/filter_LCA_matches.pl + chmod +x /opt/viral-ngs/source/add_one_value_column.pl + chmod +x /opt/viral-ngs/source/retrieve_sequences_appearing_or_not_appearing_in_table.pl + chmod +x /opt/viral-ngs/source/generate_LCA_table_for_sequences_with_no_matches.pl + chmod +x /opt/viral-ngs/source/concatenate_tables.pl + chmod +x /opt/viral-ngs/source/LCA_table_to_kraken_output_format.pl + chmod +x /opt/viral-ngs/source/select_random_sequences.pl + + + #STAGE 1 + #Subsamples 100 random reads from original FASTA file + select_random_sequences.pl "~{trimmed_fasta}" 100 > "~{fasta_basename}_subsampled.fasta" + #run megablast on random reads x nt + #switched output from out to txt for readability issues + #blastn -task megablast -query "~{fasta_basename}_subsampled.fasta" -db "~{db_name}" -max_target_seqs 50 -num_threads `nproc` -outfmt "6 qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue" -out "~{fasta_basename}_subsampled.fasta_megablast_nt.tsv" + python /opt/viral-ngs/viral-classify/taxon_filter.py chunk_blast_hits "~{fasta_basename}_subsampled.fasta" "~{db_name}" "~{fasta_basename}_subsampled.fasta_megablast_nt.tsv" --outfmt '~{outfmt}' --chunkSize ~{chunkSize} --task '~{tasks}' --max_target_seqs "~{max_target_seqs}" --output_type "~{output_type}" + # Run LCA + retrieve_top_blast_hits_LCA_for_each_sequence.pl "~{fasta_basename}_subsampled.fasta_megablast_nt.tsv" nodes.dmp 1 1 > "~{fasta_basename}_subsampled.fasta_megablast_nt.tsv_LCA.txt" + #Looks for most frequently matched taxon IDs and outputs a list + retrieve_most_common_taxonids_in_LCA_output.pl "~{fasta_basename}_subsampled.fasta_megablast_nt.tsv_LCA.txt" species 10 1 > "sample_specific_db_taxa.txt" + # Create an empty sample_specific_db_taxa.txt if it doesn't exist + touch sample_specific_db_taxa.txt + #adding host_species to sample_specific_db_taxa.txt + echo "~{host_species}" >> "sample_specific_db_taxa.txt" + #ensure file is sorted and unique + sort sample_specific_db_taxa.txt | uniq > sample_specific_db_taxa_unique.txt + mv sample_specific_db_taxa_unique.txt sample_specific_db_taxa.txt + echo "input sequences to stage 2:" + grep ">" "~{trimmed_fasta}" | wc -l + echo "--END STAGE 1" + + #STAGE 2 + echo "---START STAGE 2" + echo "megablast sample-specific database start" + #Run blastn w/ taxidlist specific + #blastn -task megablast -query "~{fasta_basename}_subsampled.fasta" -db "~{db_name}" -max_target_seqs 50 -num_threads `nproc` -taxidlist "sample_specific_db_taxa.txt" -outfmt "6 qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue" -out "~{fasta_basename}_megablast_sample_specific_db.tsv" + python /opt/viral-ngs/viral-classify/taxon_filter.py chunk_blast_hits "~{trimmed_fasta}" "~{db_name}" "~{fasta_basename}_megablast_sample_specific_db.tsv" --outfmt '~{outfmt}' --chunkSize ~{chunkSize} --task '~{tasks}' --max_target_seqs "~{max_target_seqs}" --output_type "~{output_type}" --taxidlist "sample_specific_db_taxa.txt" + #Run LCA on last output + retrieve_top_blast_hits_LCA_for_each_sequence.pl "~{fasta_basename}_megablast_sample_specific_db.tsv" nodes.dmp 2 > "~{fasta_basename}_megablast_sample_specific_db_LCA.txt" + #filter + filter_LCA_matches.pl "~{fasta_basename}_megablast_sample_specific_db_LCA.txt" 1 0 0 
"~{stage2_min_id}" 999 "~{stage2_min_qcov}" 999 > "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" + #add one clmn value: database + add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" "database" "sample-specific" > "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt_column_added.txt" + mv "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" + #add one clmn value: classified + add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" "classified" "classified" > "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt_column_added.txt" + mv "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" + #retrieves collection of unclassified sequences + retrieve_sequences_appearing_or_not_appearing_in_table.pl "~{trimmed_fasta}" "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" 0 0 > "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta" + # megablast_sample_specific_db_${sample_fasta}_unclassified.fasta = "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta" + echo "input sequences to stage 3" + grep ">" "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta" | wc -l + echo "--END STAGE 2" + echo "---START STAGE 3" + + #Stage 3 + #/blast/results/${sample_fasta}_megablast_sample_specific_db_megablast_nt.out = "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.tsv" + #blastn -task megablast -query "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta" -db "~{db_name}" -max_target_seqs 50 -num_threads `nproc` -outfmt "6 qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue" -out "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.tsv" + python /opt/viral-ngs/viral-classify/taxon_filter.py chunk_blast_hits "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta" "~{db_name}" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.tsv" --outfmt '~{outfmt}' --chunkSize ~{chunkSize} --task '~{tasks}' --max_target_seqs "~{max_target_seqs}" --output_type "~{output_type}" + retrieve_top_blast_hits_LCA_for_each_sequence.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.tsv" nodes.dmp 10 > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt" + filter_LCA_matches.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt" 0 0 0 "~{stage3_min_id}" 999 "~{stage3_min_qcov}" 999 > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" + #add one column: database, nt + add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" "database" "nt" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_classified.txt_column_added.txt" + mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_classified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" + #add one column: classified, classified + add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" "classified" "classified" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_classified.txt_column_added.txt" + mv 
"~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_classified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" + filter_LCA_matches.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt" 0 0 0 "~{stage3_min_id}" 999 "~{stage3_min_qcov}" 999 1 > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" + add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" "database" "nt" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt_column_added.txt" + mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" + add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" "classified" "unclassified" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt_column_added.txt" + mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" + generate_LCA_table_for_sequences_with_no_matches.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt" "~{fasta_basename}_megablast_sample_specific_db_unclassified.fasta" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" + add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" "database" "nt" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt_column_added.txt" + mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" + add_one_value_column.pl "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" "classified" "unclassified (no matches)" > "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt_column_added.txt" + mv "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt_column_added.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" + #Table 1 (classified sequences from stage 2) "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" + #Table 2 (classified sequences from stage 3) "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" + #Table 3 (unclassified sequences from stage 3) "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" + #Table 4 (no-blast-hits sequences from stage 3) "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" + concatenate_tables.pl "~{fasta_basename}_megablast_sample_specific_LCA.txt_classified.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt_LCA_classified.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_unclassified.txt" "~{fasta_basename}_megablast_sample_specific_db_megablast_nt.out_LCA.txt_no_hits.txt" > "~{fasta_basename}_blastoff.txt" + awk 'BEGIN {FS="\t";OFS="\t"} {for (i=2; i<=NF; i++) printf "%s%s", $i, (i blastoff_no_first_column.txt + mv blastoff_no_first_column.txt 
"~{fasta_basename}_blastoff.txt" + + LCA_table_to_kraken_output_format.pl "~{fasta_basename}_blastoff.txt" "~{trimmed_fasta}" > "~{fasta_basename}_blastoff_kraken.txt" + + + if [ -f /sys/fs/cgroup/memory.peak ]; then + cat /sys/fs/cgroup/memory.peak + elif [ -f /sys/fs/cgroup/memory/memory.peak ]; then + cat /sys/fs/cgroup/memory/memory.peak + elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then + cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes + else + echo "0" + fi > MEM_BYTES + cat /proc/loadavg > CPU_LOAD + >>> + output{ + + File most_popular_taxon_id = "sample_specific_db_taxa.txt" + File blastoff_results = "~{fasta_basename}_blastoff.txt" + File blastoff_kraken = "~{fasta_basename}_blastoff_kraken.txt" + File blast_py_log = "~{log_dir_final}/blast_py.log" + Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000) + String cpu_load = read_string("CPU_LOAD") + + } + runtime{ + docker:docker + memory: machine_mem_gb + "GB" + cpu: cpu + disks: "local-disk " + disk_size_gb + " HDD" + dx_instance_type: "n2-highmem-8" + } + +} diff --git a/pipes/WDL/workflows/blastoff.wdl b/pipes/WDL/workflows/blastoff.wdl new file mode 100644 index 000000000..89401658b --- /dev/null +++ b/pipes/WDL/workflows/blastoff.wdl @@ -0,0 +1,39 @@ +version 1.0 + +import "../tasks/tasks_megablast.wdl" as tools + +workflow megablast { + meta { + desription: "Runs megablast followed by LCA for taxon identification." + author: "Broad Viral Genomics" + email: "viral-ngs@broadinstitute.org" + allowNestedInputs: true + } + input { + File inBam + File clipDb + File blast_db_tgz + File taxonomy_db_tgz + Int host_species + String db_name + } + call tools.trim_rmdup_subsamp { + input: + inBam = inBam, + clipDb = clipDb + } + call tools.blastoff { + input: + trimmed_fasta = trim_rmdup_subsamp.trimmed_fasta, + blast_db_tgz = blast_db_tgz, + taxonomy_db_tgz = taxonomy_db_tgz, + host_species = host_species, + db_name = db_name + + } + output { + File most_popular_taxon_id = blastoff.most_popular_taxon_id + File blastoff_txt_results = blastoff.blastoff_results + File blastoff_kraken = blastoff.blastoff_kraken + } +} \ No newline at end of file diff --git a/pipes/WDL/workflows/megablast_chunk.wdl b/pipes/WDL/workflows/megablast_chunk.wdl new file mode 100644 index 000000000..de4af7379 --- /dev/null +++ b/pipes/WDL/workflows/megablast_chunk.wdl @@ -0,0 +1,30 @@ +version 1.0 + +import "../tasks/tasks_megablast.wdl" as tools + +workflow chunk_megablast { + meta { + description: "Chunk megablast function" + author: "Broad Viral Genomics" + email: "viral-negs@broadinstitute.org" + allowNestedInputs: true + } + input { + File inFasta + File blast_db_tgz + String db_name + } + call tools.ChunkBlastHits { + input: + inFasta = inFasta, + blast_db_tgz = blast_db_tgz, + db_name = db_name + } + output { + File blast_hits = ChunkBlastHits.blast_hits + File blast_filter_logs = ChunkBlastHits.blast_py_log + File chunk_blast_runtime = ChunkBlastHits.duration_seconds + Int max_ram_gb = ChunkBlastHits.max_ram_gb + String cpu_load = ChunkBlastHits.cpu_load + } +} From 5c41876c01aff2e724660d7d8d0920de4e12f341 Mon Sep 17 00:00:00 2001 From: golu099 Date: Sat, 8 Jun 2024 17:40:20 -0400 Subject: [PATCH 2/6] updating docker string to new image --- .dockstore.yml | 2 +- pipes/WDL/tasks/tasks_megablast.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.dockstore.yml b/.dockstore.yml index 34df25cfa..0c9012485 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -419,7 +419,7 @@ workflows: primaryDescriptorPath: 
     primaryDescriptorPath: /pipes/WDL/workflows/blastoff.wdl
     testParameterFiles:
       - /empty.json
-   - name: chunk_blast
+  - name: chunk_blast
     subclass: WDL
     primaryDescriptorPath: /pipes/WDL/workflows/megablast_chunk.wdl
     testParameterFiles:
diff --git a/pipes/WDL/tasks/tasks_megablast.wdl b/pipes/WDL/tasks/tasks_megablast.wdl
index 5daef0591..45f218c35 100644
--- a/pipes/WDL/tasks/tasks_megablast.wdl
+++ b/pipes/WDL/tasks/tasks_megablast.wdl
@@ -247,7 +247,7 @@ task blastoff {
         Int machine_mem_gb = 64
         Int cpu = 16
         Int disk_size_gb = 300
-        String docker = "quay.io/broadinstitute/viral-classify:fn_blast"
+        String docker = "quay.io/broadinstitute/viral-classify:fn_cleaned_blast"
 
     }
     #default the log directory to the current working directory
From 2a383f7d2a39fd920e2ee514393fb6ad26860da9 Mon Sep 17 00:00:00 2001
From: golu099
Date: Fri, 14 Jun 2024 14:54:45 -0400
Subject: [PATCH 3/6] adding krona to the metagenomics wdl

---
 pipes/WDL/workflows/blastoff.wdl | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/pipes/WDL/workflows/blastoff.wdl b/pipes/WDL/workflows/blastoff.wdl
index 89401658b..12bfe54fe 100644
--- a/pipes/WDL/workflows/blastoff.wdl
+++ b/pipes/WDL/workflows/blastoff.wdl
@@ -1,6 +1,7 @@
 version 1.0
 
 import "../tasks/tasks_megablast.wdl" as tools
+import "../tasks/tasks_metagenomics.wdl" as metagenomics
 
 workflow megablast {
     meta {
@@ -13,9 +14,17 @@ workflow megablast {
         File inBam
         File clipDb
         File blast_db_tgz
+        File krona_taxonomy_tab
         File taxonomy_db_tgz
         Int host_species
         String db_name
+        String sample_name = basename(in_bam, '.bam')
+    }
+    parameter_meta {
+        krona_taxonomy_tab: {
+            description: "Krona taxonomy database containing a single file: 'taxonomy.tab' (exact name), or possibly just a compressed 'taxonomy.tab'",
+            patterns: ["*.tab.zst", "*.tab.gz", "*.tab", "*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"]
+        }
     }
     call tools.trim_rmdup_subsamp {
         input:
@@ -29,11 +38,19 @@ workflow megablast {
             taxonomy_db_tgz = taxonomy_db_tgz,
             host_species = host_species,
             db_name = db_name
+    call metagenomics.krona {
+        input:
+            reports_txt_gz = blastoff.blastoff_kraken,
+            krona_taxonomy_db_tgz = krona_taxonomy_tab,
+            input_type = "tsv",
+            out_basename = "~{sample_name}.krona"
+    }
 
     }
     output {
         File most_popular_taxon_id = blastoff.most_popular_taxon_id
         File blastoff_txt_results = blastoff.blastoff_results
         File blastoff_kraken = blastoff.blastoff_kraken
+        File krona_html = metagenomics.krona_report_html
     }
 }
\ No newline at end of file
From 2a10fea20442caa97860eca837516674bc153eea Mon Sep 17 00:00:00 2001
From: golu099
Date: Fri, 14 Jun 2024 15:14:54 -0400
Subject: [PATCH 4/6] fixing bracket issue

---
 pipes/WDL/workflows/blastoff.wdl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pipes/WDL/workflows/blastoff.wdl b/pipes/WDL/workflows/blastoff.wdl
index 12bfe54fe..74a17f01a 100644
--- a/pipes/WDL/workflows/blastoff.wdl
+++ b/pipes/WDL/workflows/blastoff.wdl
@@ -38,6 +38,7 @@ workflow megablast {
             taxonomy_db_tgz = taxonomy_db_tgz,
             host_species = host_species,
             db_name = db_name
+    }
     call metagenomics.krona {
         input:
             reports_txt_gz = blastoff.blastoff_kraken,
From 015e03ee4ab9540396a8aebbb8dd3a42836131aa Mon Sep 17 00:00:00 2001
From: golu099
Date: Wed, 19 Jun 2024 17:55:59 -0400
Subject: [PATCH 5/6] fixing blastoff wdl krona output

---
 pipes/WDL/tasks/tasks_megablast.wdl | 4 ++--
 pipes/WDL/workflows/blastoff.wdl    | 9 ++++-----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_megablast.wdl b/pipes/WDL/tasks/tasks_megablast.wdl
index 45f218c35..2943c7fd5 100644
--- a/pipes/WDL/tasks/tasks_megablast.wdl
+++ b/pipes/WDL/tasks/tasks_megablast.wdl
@@ -356,7 +356,7 @@ task blastoff {
         mv blastoff_no_first_column.txt "~{fasta_basename}_blastoff.txt"
 
-        LCA_table_to_kraken_output_format.pl "~{fasta_basename}_blastoff.txt" "~{trimmed_fasta}" > "~{fasta_basename}_blastoff_kraken.txt"
+        LCA_table_to_kraken_output_format.pl "~{fasta_basename}_blastoff.txt" "~{trimmed_fasta}" > "~{fasta_basename}_blastoff_kraken.tsv"
 
 
         if [ -f /sys/fs/cgroup/memory.peak ]; then
@@ -373,7 +373,7 @@ task blastoff {
 
         File most_popular_taxon_id = "sample_specific_db_taxa.txt"
         File blastoff_results = "~{fasta_basename}_blastoff.txt"
-        File blastoff_kraken = "~{fasta_basename}_blastoff_kraken.txt"
+        File blastoff_kraken = "~{fasta_basename}_blastoff_kraken.tsv"
         File blast_py_log = "~{log_dir_final}/blast_py.log"
         Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
         String cpu_load = read_string("CPU_LOAD")
diff --git a/pipes/WDL/workflows/blastoff.wdl b/pipes/WDL/workflows/blastoff.wdl
index 74a17f01a..da326d2e4 100644
--- a/pipes/WDL/workflows/blastoff.wdl
+++ b/pipes/WDL/workflows/blastoff.wdl
@@ -5,7 +5,7 @@ import "../tasks/tasks_metagenomics.wdl" as metagenomics
 
 workflow megablast {
     meta {
-        desription: "Runs megablast followed by LCA for taxon identification."
+        description: "Runs megablast followed by LCA for taxon identification."
         author: "Broad Viral Genomics"
         email: "viral-ngs@broadinstitute.org"
         allowNestedInputs: true
@@ -18,7 +18,7 @@ workflow megablast {
         File taxonomy_db_tgz
         Int host_species
         String db_name
-        String sample_name = basename(in_bam, '.bam')
+        String sample_name = basename(inBam, '.bam')
     }
     parameter_meta {
         krona_taxonomy_tab: {
@@ -41,17 +41,16 @@ workflow megablast {
     }
     call metagenomics.krona {
         input:
-            reports_txt_gz = blastoff.blastoff_kraken,
+            reports_txt_gz = [blastoff.blastoff_kraken],
             krona_taxonomy_db_tgz = krona_taxonomy_tab,
             input_type = "tsv",
             out_basename = "~{sample_name}.krona"
     }
 
-    }
     output {
         File most_popular_taxon_id = blastoff.most_popular_taxon_id
         File blastoff_txt_results = blastoff.blastoff_results
         File blastoff_kraken = blastoff.blastoff_kraken
-        File krona_html = metagenomics.krona_report_html
+        File krona_html = krona.krona_report_html
     }
 }
\ No newline at end of file
From 9e62a6dcfe4a0b321bbdfdfa76fc6c14a3262318 Mon Sep 17 00:00:00 2001
From: golu099
Date: Tue, 25 Jun 2024 12:06:09 -0400
Subject: [PATCH 6/6] skip older docker tags

---
 pipes/WDL/tasks/tasks_megablast.wdl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_megablast.wdl b/pipes/WDL/tasks/tasks_megablast.wdl
index 2943c7fd5..5e6fe67e2 100644
--- a/pipes/WDL/tasks/tasks_megablast.wdl
+++ b/pipes/WDL/tasks/tasks_megablast.wdl
@@ -12,7 +12,7 @@ task trim_rmdup_subsamp {
         Int machine_mem_gb = 128
         Int cpu = 16
         Int disk_size_gb = 100
-        String docker ="quay.io/broadinstitute/viral-assemble:2.3.1.3"
+        String docker ="quay.io/broadinstitute/viral-assemble:2.3.1.3" #skip-global-version-pin
     }
     parameter_meta {
         inBam: {
@@ -69,7 +69,7 @@ task lca_megablast {
         Int machine_mem_gb = 500
         Int cpu = 16
         Int disk_size_gb = 300
-        String docker = "quay.io/broadinstitute/viral-classify:2.2.4.2"
+        String docker = "quay.io/broadinstitute/viral-classify:2.2.4.2" #skip-global-version-pin
     }
     parameter_meta {
         trimmed_fasta: {
@@ -167,7 +167,7 @@ task ChunkBlastHits {
         Int machine_mem_gb = 64
         Int cpu = 16
         Int disk_size_gb = 300
-        String docker = "quay.io/broadinstitute/viral-classify:fn_blast"
+        String docker = "quay.io/broadinstitute/viral-classify:fn_blast" #skip-global-version-pin
     }
     String fasta_basename = basename(inFasta, ".fasta")
     #default the log directory to the current working directory
@@ -247,7 +247,7 @@ task blastoff {
         Int machine_mem_gb = 64
         Int cpu = 16
        Int disk_size_gb = 300
-        String docker = "quay.io/broadinstitute/viral-classify:fn_cleaned_blast"
+        String docker = "quay.io/broadinstitute/viral-classify:fn_cleaned_blast" #skip-global-version-pin
 
     }
     #default the log directory to the current working directory