diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2a1a1a21d..20e870208 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -35,9 +35,9 @@ jobs: runs-on: ubuntu-20.04 steps: - name: checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # fetch git tags (tagged releases) because - # actions/checkout@v3 does either a full checkout or a shallow checkout without tags + # actions/checkout@v4 does either a full checkout or a shallow checkout without tags - name: fetch tags run: git fetch --prune --unshallow --tags - name: Programmatic environment setup @@ -67,7 +67,7 @@ jobs: run: | env - name: install python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.8' - name: install system dependencies @@ -88,9 +88,9 @@ jobs: runs-on: ubuntu-20.04 steps: - name: checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # fetch git tags (tagged releases) because - # actions/checkout@v3 does either a full checkout or a shallow checkout without tags + # actions/checkout@v4 does either a full checkout or a shallow checkout without tags - name: fetch tags run: git fetch --prune --unshallow --tags - name: Programmatic environment setup @@ -138,9 +138,9 @@ jobs: runs-on: ubuntu-20.04 steps: - name: checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # fetch git tags (tagged releases) because - # actions/checkout@v3 does either a full checkout or a shallow checkout without tags + # actions/checkout@v4 does either a full checkout or a shallow checkout without tags - name: fetch tags run: git fetch --prune --unshallow --tags - name: Programmatic environment setup @@ -166,7 +166,7 @@ jobs: echo "GITHUB_ACTIONS_BRANCH=$GITHUB_ACTIONS_BRANCH" echo "GITHUB_ACTIONS_BRANCH=$GITHUB_ACTIONS_BRANCH" >> $GITHUB_ENV - name: install python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.8' - name: install 
docs dependencies @@ -183,9 +183,9 @@ jobs: runs-on: ubuntu-20.04 steps: - name: checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # fetch git tags (tagged releases) because - # actions/checkout@v3 does either a full checkout or a shallow checkout without tags + # actions/checkout@v4 does either a full checkout or a shallow checkout without tags - name: fetch tags run: git fetch --prune --unshallow --tags - name: Programmatic environment setup @@ -238,9 +238,9 @@ jobs: runs-on: ubuntu-20.04 steps: - name: checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # fetch git tags (tagged releases) because - # actions/checkout@v3 does either a full checkout or a shallow checkout without tags + # actions/checkout@v4 does either a full checkout or a shallow checkout without tags - name: fetch tags run: git fetch --prune --unshallow --tags - name: Programmatic environment setup @@ -273,7 +273,7 @@ jobs: sudo rm -rf "/usr/local/share/boost" sudo rm -rf "$AGENT_TOOLSDIRECTORY" - name: install python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.8' - name: install system dependencies @@ -304,9 +304,9 @@ jobs: DX_PROJECT: project-F8PQ6380xf5bK0Qk0YPjB17P steps: - name: checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # fetch git tags (tagged releases) because - # actions/checkout@v3 does either a full checkout or a shallow checkout without tags + # actions/checkout@v4 does either a full checkout or a shallow checkout without tags - name: fetch tags run: git fetch --prune --unshallow --tags - name: Programmatic environment setup @@ -337,7 +337,7 @@ jobs: echo "${{ github.event.action }}" echo "${{ github.event.pull_request.merged }}" - name: install python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.8' - name: install java diff --git a/pipes/WDL/tasks/tasks_assembly.wdl b/pipes/WDL/tasks/tasks_assembly.wdl index 
a0ff7e0fe..cab357939 100644 --- a/pipes/WDL/tasks/tasks_assembly.wdl +++ b/pipes/WDL/tasks/tasks_assembly.wdl @@ -450,7 +450,7 @@ task align_reads { Boolean skip_mark_dupes = false Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" String sample_name = basename(basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt"), ".clean") } @@ -846,7 +846,7 @@ task run_discordance { String out_basename = "run" Int min_coverage = 4 - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } parameter_meta { reads_aligned_bam: { diff --git a/pipes/WDL/tasks/tasks_demux.wdl b/pipes/WDL/tasks/tasks_demux.wdl index c3e2d4ff5..7ad347fca 100644 --- a/pipes/WDL/tasks/tasks_demux.wdl +++ b/pipes/WDL/tasks/tasks_demux.wdl @@ -6,7 +6,7 @@ task merge_tarballs { String out_filename Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } Int disk_size = 2625 @@ -163,7 +163,7 @@ task illumina_demux { Int? machine_mem_gb Int disk_size = 2625 - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } parameter_meta { diff --git a/pipes/WDL/tasks/tasks_interhost.wdl b/pipes/WDL/tasks/tasks_interhost.wdl index daa2aca22..a3f657387 100644 --- a/pipes/WDL/tasks/tasks_interhost.wdl +++ b/pipes/WDL/tasks/tasks_interhost.wdl @@ -351,7 +351,7 @@ task index_ref { File? novocraft_license Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } Int disk_size = 100 diff --git a/pipes/WDL/tasks/tasks_ncbi.wdl b/pipes/WDL/tasks/tasks_ncbi.wdl index e7da74972..6d4c02c1c 100644 --- a/pipes/WDL/tasks/tasks_ncbi.wdl +++ b/pipes/WDL/tasks/tasks_ncbi.wdl @@ -192,7 +192,7 @@ task structured_comments { File? 
filter_to_ids - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } String out_base = basename(assembly_stats_tsv, '.txt') command <<< @@ -272,7 +272,7 @@ task rename_fasta_header { String out_basename = basename(genome_fasta, ".fasta") - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } command { set -e @@ -437,7 +437,7 @@ task sra_meta_prep { Boolean paired String out_name = "sra_metadata.tsv" - String docker="quay.io/broadinstitute/viral-core:2.2.4" + String docker="quay.io/broadinstitute/viral-core:2.3.0" } Int disk_size = 100 parameter_meta { diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl index 614d6a93e..3ed618939 100644 --- a/pipes/WDL/tasks/tasks_nextstrain.wdl +++ b/pipes/WDL/tasks/tasks_nextstrain.wdl @@ -280,7 +280,7 @@ task derived_cols { String? lab_highlight_loc Array[File] table_map = [] - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" Int disk_size = 50 } parameter_meta { @@ -848,7 +848,7 @@ task filter_sequences_to_list { String out_fname = sub(sub(basename(sequences, ".zst"), ".vcf", ".filtered.vcf"), ".fasta$", ".filtered.fasta") # Prior docker image: "nextstrain/base:build-20211012T204409Z" - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" Int disk_size = 750 } parameter_meta { diff --git a/pipes/WDL/tasks/tasks_read_utils.wdl b/pipes/WDL/tasks/tasks_read_utils.wdl index 6a4720651..195198e0b 100644 --- a/pipes/WDL/tasks/tasks_read_utils.wdl +++ b/pipes/WDL/tasks/tasks_read_utils.wdl @@ -84,7 +84,7 @@ task group_bams_by_sample { task get_bam_samplename { input { File bam - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } Int disk_size = round(size(bam, "GB")) + 
50 command <<< @@ -111,7 +111,7 @@ task get_sample_meta { input { Array[File] samplesheets_extended - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } Int disk_size = 50 command <<< @@ -172,7 +172,7 @@ task merge_and_reheader_bams { File? reheader_table String out_basename = basename(in_bams[0], ".bam") - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } Int disk_size = 750 @@ -244,7 +244,7 @@ task rmdup_ubam { String method = "mvicuna" Int machine_mem_gb = 7 - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } Int disk_size = 375 @@ -303,7 +303,7 @@ task downsample_bams { Boolean deduplicateAfter = false Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } Int disk_size = 750 @@ -367,7 +367,7 @@ task FastqToUBAM { String? sequencing_center String? 
additional_picard_options - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } Int disk_size = 375 parameter_meta { @@ -418,7 +418,7 @@ task read_depths { File aligned_bam String out_basename = basename(aligned_bam, '.bam') - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } Int disk_size = 200 command <<< diff --git a/pipes/WDL/tasks/tasks_reports.wdl b/pipes/WDL/tasks/tasks_reports.wdl index d38faa7ce..3c9b192fa 100644 --- a/pipes/WDL/tasks/tasks_reports.wdl +++ b/pipes/WDL/tasks/tasks_reports.wdl @@ -15,7 +15,7 @@ task alignment_metrics { Int max_amplicons=500 Int machine_mem_gb=13 - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } String out_basename = basename(aligned_bam, ".bam") @@ -136,7 +136,7 @@ task plot_coverage { String? plotXLimits # of the form "min max" (ints, space between) String? plotYLimits # of the form "min max" (ints, space between) - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } Int disk_size = 375 @@ -283,7 +283,7 @@ task coverage_report { Array[File] mapped_bam_idx # optional.. 
speeds it up if you provide it, otherwise we auto-index String out_report_name = "coverage_report.txt" - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } Int disk_size = 375 @@ -350,7 +350,7 @@ task fastqc { input { File reads_bam - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } parameter_meta { reads_bam:{ @@ -392,8 +392,13 @@ task align_and_count { File ref_db Int topNHits = 3 + Boolean filter_bam_to_proper_primary_mapped_reads = false + Boolean do_not_require_proper_mapped_pairs_when_filtering = false + Boolean keep_singletons_when_filtering = false + Boolean keep_duplicates_when_filtering = false + Int? machine_mem_gb - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } String reads_basename=basename(reads_bam, ".bam") @@ -411,28 +416,62 @@ task align_and_count { pattern: ["*.FASTA"], category: "required" } + filter_bam_to_proper_primary_mapped_reads: { + description: "If specified, reads will be filtered after alignment to include only those flagged as properly paired.", + category: "optional" + } + do_not_require_proper_mapped_pairs_when_filtering: { + description: "Do not require reads to be properly paired when filtering", + category: "optional" + } + keep_singletons_when_filtering: { + description: "Keep singletons when filtering", + category: "optional" + } + keep_duplicates_when_filtering: { + description: "Do not exclude reads marked as duplicates when filtering", + category: "optional" + } } - command { + command <<< set -ex -o pipefail read_utils.py --version | tee VERSION - ln -s "${reads_bam}" "${reads_basename}.bam" + ln -s "~{reads_bam}" "~{reads_basename}.bam" read_utils.py minimap2_idxstats \ - "${reads_basename}.bam" \ - "${ref_db}" \ - --outStats "${reads_basename}.count.${ref_basename}.txt.unsorted" \ + "~{reads_basename}.bam" \ + 
"~{ref_db}" \ + --outStats "~{reads_basename}.count.~{ref_basename}.txt.unsorted" \ + ~{true="--filterReadsAfterAlignment" false="" filter_bam_to_proper_primary_mapped_reads} \ + ~{true="--doNotRequirePairsToBeProper" false="" do_not_require_proper_mapped_pairs_when_filtering} \ + ~{true="--keepSingletons" false="" keep_singletons_when_filtering} \ + ~{true="--keepDuplicates" false="" keep_duplicates_when_filtering} \ --loglevel=DEBUG - sort -b -r -n -k3 "${reads_basename}.count.${ref_basename}.txt.unsorted" > "${reads_basename}.count.${ref_basename}.txt" - head -n ${topNHits} "${reads_basename}.count.${ref_basename}.txt" > "${reads_basename}.count.${ref_basename}.top_${topNHits}_hits.txt" - head -1 "${reads_basename}.count.${ref_basename}.txt" | cut -f 1 > "${reads_basename}.count.${ref_basename}.top.txt" - } + sort -b -r -n -k3 "~{reads_basename}.count.~{ref_basename}.txt.unsorted" > "~{reads_basename}.count.~{ref_basename}.txt" + head -n ~{topNHits} "~{reads_basename}.count.~{ref_basename}.txt" > "~{reads_basename}.count.~{ref_basename}.top_~{topNHits}_hits.txt" + TOP_HIT="$(head -1 '~{reads_basename}.count.~{ref_basename}.txt' | cut -f 1 | tee '~{reads_basename}.count.~{ref_basename}.top.txt')" + + TOTAL_COUNT_OF_TOP_HIT=$(grep -E "^($TOP_HIT)" "~{reads_basename}.count.~{ref_basename}.txt" | cut -f3 | tee TOTAL_COUNT_OF_TOP_HIT) + TOTAL_COUNT_OF_LESSER_HITS=$(grep -vE "^(\*|$TOP_HIT)" "~{reads_basename}.count.~{ref_basename}.txt" | cut -f3 | paste -sd+ - | bc -l | tee TOTAL_COUNT_OF_LESSER_HITS) + PCT_MAPPING_TO_LESSER_HITS=$( echo "scale=3; 100 * $TOTAL_COUNT_OF_LESSER_HITS / ($TOTAL_COUNT_OF_LESSER_HITS + $TOTAL_COUNT_OF_TOP_HIT)" | \ + bc -l | awk '{printf "%.3f\n", $0}' | tee '~{reads_basename}.count.~{ref_basename}.pct_lesser_hits_of_mapped.txt' ) + + TOTAL_READS_IN_INPUT=$(samtools view -c "~{reads_basename}.bam") + PCT_OF_INPUT_READS_MAPPED=$( echo "scale=3; 100 * ($TOTAL_COUNT_OF_LESSER_HITS + $TOTAL_COUNT_OF_TOP_HIT) / $TOTAL_READS_IN_INPUT" | \ + bc 
-l | awk '{printf "%.3f\n", $0}' | tee '~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt' ) + >>> output { - File report = "${reads_basename}.count.${ref_basename}.txt" - File report_top_hits = "${reads_basename}.count.${ref_basename}.top_${topNHits}_hits.txt" - String top_hit_id = read_string("${reads_basename}.count.${ref_basename}.top.txt") + File report = "~{reads_basename}.count.~{ref_basename}.txt" + + File report_top_hits = "~{reads_basename}.count.~{ref_basename}.top_~{topNHits}_hits.txt" + String top_hit_id = read_string("~{reads_basename}.count.~{ref_basename}.top.txt") + + String pct_total_reads_mapped = read_string('~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt') + String pct_lesser_hits_of_mapped = read_string('~{reads_basename}.count.~{ref_basename}.pct_lesser_hits_of_mapped.txt') + String viralngs_version = read_string("VERSION") } @@ -453,7 +492,7 @@ task align_and_count_summary { String output_prefix = "count_summary" - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } Int disk_size = 100 diff --git a/pipes/WDL/tasks/tasks_taxon_filter.wdl b/pipes/WDL/tasks/tasks_taxon_filter.wdl index 7da05cf2b..0907bc3f9 100644 --- a/pipes/WDL/tasks/tasks_taxon_filter.wdl +++ b/pipes/WDL/tasks/tasks_taxon_filter.wdl @@ -211,7 +211,7 @@ task merge_one_per_sample { Boolean rmdup = false Int machine_mem_gb = 7 - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } Int disk_size = 750 diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index bbef0044c..293f907eb 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -104,7 +104,7 @@ task zcat { { if [ -f /sys/fs/cgroup/memory.peak ]; then cat /sys/fs/cgroup/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.peak ]; then cat /sys/fs/cgroup/memory/memory.peak; elif [ -f 
/sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes; else echo "0"; fi } > MEM_BYTES >>> runtime { - docker: "quay.io/broadinstitute/viral-core:2.2.4" + docker: "quay.io/broadinstitute/viral-core:2.3.0" memory: "1 GB" cpu: cpus disks: "local-disk " + disk_size + " LOCAL" @@ -399,7 +399,7 @@ task tsv_join { runtime { memory: "~{machine_mem_gb} GB" cpu: 4 - docker: "quay.io/broadinstitute/viral-core:2.2.4" + docker: "quay.io/broadinstitute/viral-core:2.3.0" disks: "local-disk " + disk_size + " HDD" disk: disk_size + " GB" # TES dx_instance_type: "mem1_ssd1_v2_x4" @@ -486,7 +486,7 @@ task tsv_stack { input { Array[File]+ input_tsvs String out_basename - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" } Int disk_size = 50 @@ -749,7 +749,7 @@ task filter_sequences_by_length { File sequences_fasta Int min_non_N = 1 - String docker = "quay.io/broadinstitute/viral-core:2.2.4" + String docker = "quay.io/broadinstitute/viral-core:2.3.0" Int disk_size = 750 } parameter_meta { diff --git a/pipes/WDL/workflows/align_and_count.wdl b/pipes/WDL/workflows/align_and_count.wdl index 950bca48d..468ffe58e 100644 --- a/pipes/WDL/workflows/align_and_count.wdl +++ b/pipes/WDL/workflows/align_and_count.wdl @@ -22,8 +22,13 @@ workflow align_and_count_report { call reports.align_and_count output { - File report = align_and_count.report - File report_top_hits = align_and_count.report_top_hits - String viral_core_version = align_and_count.viralngs_version + File report = align_and_count.report + File report_top_hits = align_and_count.report_top_hits + String tophit = align_and_count.top_hit_id + + String pct_mapped_of_total_reads = align_and_count.pct_total_reads_mapped + String pct_mapped_to_lesser_hits = align_and_count.pct_lesser_hits_of_mapped + + String viral_core_version = align_and_count.viralngs_version } } diff --git a/pipes/WDL/workflows/classify_single.wdl 
b/pipes/WDL/workflows/classify_single.wdl index bea728228..42de1fac0 100644 --- a/pipes/WDL/workflows/classify_single.wdl +++ b/pipes/WDL/workflows/classify_single.wdl @@ -151,6 +151,8 @@ workflow classify_single { File cleaned_fastqc = fastqc_cleaned.fastqc_html File spikein_report = spikein.report String spikein_tophit = spikein.top_hit_id + String spikein_pct_of_total_reads = spikein.pct_total_reads_mapped + String spikein_pct_lesser_hits = spikein.pct_lesser_hits_of_mapped String kraken2_viral_classify_version = kraken2.viralngs_version String deplete_viral_classify_version = deplete.viralngs_version diff --git a/pipes/WDL/workflows/metagenomic_denovo.wdl b/pipes/WDL/workflows/metagenomic_denovo.wdl index bebf5db73..4dfbb55ab 100644 --- a/pipes/WDL/workflows/metagenomic_denovo.wdl +++ b/pipes/WDL/workflows/metagenomic_denovo.wdl @@ -255,6 +255,9 @@ workflow metagenomic_denovo { Float bases_aligned = refine.align_to_self_merged_bases_aligned File? spikein_hits = spikein.report + String? spikein_tophit = spikein.top_hit_id + String? spikein_pct_of_total_reads = spikein.pct_total_reads_mapped + String? spikein_pct_lesser_hits = spikein.pct_lesser_hits_of_mapped String viral_classify_version = kraken2.viralngs_version String viral_assemble_version = assemble.viralngs_version diff --git a/requirements-modules.txt b/requirements-modules.txt index d0f852682..53718d00a 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -1,4 +1,4 @@ -broadinstitute/viral-core=2.2.4 +broadinstitute/viral-core=2.3.0 broadinstitute/viral-assemble=2.2.4.0 broadinstitute/viral-classify=2.2.4.0 broadinstitute/viral-phylo=2.1.20.2