Skip to content

Commit

Permalink
Merge pull request #525 from broadinstitute/ct-filter-align-and-count…
Browse files Browse the repository at this point in the history
…-fns

expanded parameterization of align_and_count and additional output metrics
  • Loading branch information
dpark01 authored Mar 11, 2024
2 parents 3b25433 + 78f8fa0 commit 1ce64ab
Show file tree
Hide file tree
Showing 14 changed files with 109 additions and 60 deletions.
32 changes: 16 additions & 16 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
# fetch git tags (tagged releases) because
# actions/checkout@v3 does either a full checkout or a shallow checkout without tags
# actions/checkout@v4 does either a full checkout or a shallow checkout without tags
- name: fetch tags
run: git fetch --prune --unshallow --tags
- name: Programmatic environment setup
Expand Down Expand Up @@ -67,7 +67,7 @@ jobs:
run: |
env
- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: '3.8'
- name: install system dependencies
Expand All @@ -88,9 +88,9 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
# fetch git tags (tagged releases) because
# actions/checkout@v3 does either a full checkout or a shallow checkout without tags
# actions/checkout@v4 does either a full checkout or a shallow checkout without tags
- name: fetch tags
run: git fetch --prune --unshallow --tags
- name: Programmatic environment setup
Expand Down Expand Up @@ -138,9 +138,9 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
# fetch git tags (tagged releases) because
# actions/checkout@v3 does either a full checkout or a shallow checkout without tags
# actions/checkout@v4 does either a full checkout or a shallow checkout without tags
- name: fetch tags
run: git fetch --prune --unshallow --tags
- name: Programmatic environment setup
Expand All @@ -166,7 +166,7 @@ jobs:
echo "GITHUB_ACTIONS_BRANCH=$GITHUB_ACTIONS_BRANCH"
echo "GITHUB_ACTIONS_BRANCH=$GITHUB_ACTIONS_BRANCH" >> $GITHUB_ENV
- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: '3.8'
- name: install docs dependencies
Expand All @@ -183,9 +183,9 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
# fetch git tags (tagged releases) because
# actions/checkout@v3 does either a full checkout or a shallow checkout without tags
# actions/checkout@v4 does either a full checkout or a shallow checkout without tags
- name: fetch tags
run: git fetch --prune --unshallow --tags
- name: Programmatic environment setup
Expand Down Expand Up @@ -238,9 +238,9 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
# fetch git tags (tagged releases) because
# actions/checkout@v3 does either a full checkout or a shallow checkout without tags
# actions/checkout@v4 does either a full checkout or a shallow checkout without tags
- name: fetch tags
run: git fetch --prune --unshallow --tags
- name: Programmatic environment setup
Expand Down Expand Up @@ -273,7 +273,7 @@ jobs:
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: '3.8'
- name: install system dependencies
Expand Down Expand Up @@ -304,9 +304,9 @@ jobs:
DX_PROJECT: project-F8PQ6380xf5bK0Qk0YPjB17P
steps:
- name: checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
# fetch git tags (tagged releases) because
# actions/checkout@v3 does either a full checkout or a shallow checkout without tags
# actions/checkout@v4 does either a full checkout or a shallow checkout without tags
- name: fetch tags
run: git fetch --prune --unshallow --tags
- name: Programmatic environment setup
Expand Down Expand Up @@ -337,7 +337,7 @@ jobs:
echo "${{ github.event.action }}"
echo "${{ github.event.pull_request.merged }}"
- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: '3.8'
- name: install java
Expand Down
4 changes: 2 additions & 2 deletions pipes/WDL/tasks/tasks_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,7 @@ task align_reads {
Boolean skip_mark_dupes = false
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
String sample_name = basename(basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt"), ".clean")
}
Expand Down Expand Up @@ -846,7 +846,7 @@ task run_discordance {
String out_basename = "run"
Int min_coverage = 4
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
parameter_meta {
reads_aligned_bam: {
Expand Down
4 changes: 2 additions & 2 deletions pipes/WDL/tasks/tasks_demux.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ task merge_tarballs {
String out_filename

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}

Int disk_size = 2625
Expand Down Expand Up @@ -163,7 +163,7 @@ task illumina_demux {
Int? machine_mem_gb
Int disk_size = 2625
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
parameter_meta {
Expand Down
2 changes: 1 addition & 1 deletion pipes/WDL/tasks/tasks_interhost.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ task index_ref {
File? novocraft_license

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}

Int disk_size = 100
Expand Down
6 changes: 3 additions & 3 deletions pipes/WDL/tasks/tasks_ncbi.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ task structured_comments {

File? filter_to_ids

String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
String out_base = basename(assembly_stats_tsv, '.txt')
command <<<
Expand Down Expand Up @@ -272,7 +272,7 @@ task rename_fasta_header {
String out_basename = basename(genome_fasta, ".fasta")
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
command {
set -e
Expand Down Expand Up @@ -437,7 +437,7 @@ task sra_meta_prep {
Boolean paired
String out_name = "sra_metadata.tsv"
String docker="quay.io/broadinstitute/viral-core:2.2.4"
String docker="quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 100
parameter_meta {
Expand Down
4 changes: 2 additions & 2 deletions pipes/WDL/tasks/tasks_nextstrain.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ task derived_cols {
String? lab_highlight_loc
Array[File] table_map = []
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
Int disk_size = 50
}
parameter_meta {
Expand Down Expand Up @@ -848,7 +848,7 @@ task filter_sequences_to_list {
String out_fname = sub(sub(basename(sequences, ".zst"), ".vcf", ".filtered.vcf"), ".fasta$", ".filtered.fasta")
# Prior docker image: "nextstrain/base:build-20211012T204409Z"
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
Int disk_size = 750
}
parameter_meta {
Expand Down
14 changes: 7 additions & 7 deletions pipes/WDL/tasks/tasks_read_utils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ task group_bams_by_sample {
task get_bam_samplename {
input {
File bam
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = round(size(bam, "GB")) + 50
command <<<
Expand All @@ -111,7 +111,7 @@ task get_sample_meta {
input {
Array[File] samplesheets_extended
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 50
command <<<
Expand Down Expand Up @@ -172,7 +172,7 @@ task merge_and_reheader_bams {
File? reheader_table
String out_basename = basename(in_bams[0], ".bam")
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 750
Expand Down Expand Up @@ -244,7 +244,7 @@ task rmdup_ubam {
String method = "mvicuna"
Int machine_mem_gb = 7
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 375
Expand Down Expand Up @@ -303,7 +303,7 @@ task downsample_bams {
Boolean deduplicateAfter = false
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 750
Expand Down Expand Up @@ -367,7 +367,7 @@ task FastqToUBAM {
String? sequencing_center
String? additional_picard_options
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 375
parameter_meta {
Expand Down Expand Up @@ -418,7 +418,7 @@ task read_depths {
File aligned_bam
String out_basename = basename(aligned_bam, '.bam')
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 200
command <<<
Expand Down
75 changes: 57 additions & 18 deletions pipes/WDL/tasks/tasks_reports.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ task alignment_metrics {
Int max_amplicons=500

Int machine_mem_gb=13
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}

String out_basename = basename(aligned_bam, ".bam")
Expand Down Expand Up @@ -136,7 +136,7 @@ task plot_coverage {
String? plotXLimits # of the form "min max" (ints, space between)
String? plotYLimits # of the form "min max" (ints, space between)
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}

Int disk_size = 375
Expand Down Expand Up @@ -283,7 +283,7 @@ task coverage_report {
Array[File] mapped_bam_idx # optional.. speeds it up if you provide it, otherwise we auto-index
String out_report_name = "coverage_report.txt"
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 375
Expand Down Expand Up @@ -350,7 +350,7 @@ task fastqc {
input {
File reads_bam
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
parameter_meta {
reads_bam:{
Expand Down Expand Up @@ -392,8 +392,13 @@ task align_and_count {
File ref_db
Int topNHits = 3
Boolean filter_bam_to_proper_primary_mapped_reads = false
Boolean do_not_require_proper_mapped_pairs_when_filtering = false
Boolean keep_singletons_when_filtering = false
Boolean keep_duplicates_when_filtering = false
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
String reads_basename=basename(reads_bam, ".bam")
Expand All @@ -411,28 +416,62 @@ task align_and_count {
pattern: ["*.FASTA"],
category: "required"
}
filter_bam_to_proper_primary_mapped_reads: {
                                  description: "If specified, reads will be filtered after alignment to include only those flagged as properly paired.",
category: "optional"
}
do_not_require_proper_mapped_pairs_when_filtering: {
description: "Do not require reads to be properly paired when filtering",
category: "optional"
}
keep_singletons_when_filtering: {
description: "Keep singletons when filtering",
category: "optional"
}
keep_duplicates_when_filtering: {
description: "Do not exclude reads marked as duplicates when filtering",
category: "optional"
}
}
command {
command <<<
set -ex -o pipefail
read_utils.py --version | tee VERSION
ln -s "${reads_bam}" "${reads_basename}.bam"
ln -s "~{reads_bam}" "~{reads_basename}.bam"
read_utils.py minimap2_idxstats \
"${reads_basename}.bam" \
"${ref_db}" \
--outStats "${reads_basename}.count.${ref_basename}.txt.unsorted" \
"~{reads_basename}.bam" \
"~{ref_db}" \
--outStats "~{reads_basename}.count.~{ref_basename}.txt.unsorted" \
~{true="--filterReadsAfterAlignment" false="" filter_bam_to_proper_primary_mapped_reads} \
~{true="--doNotRequirePairsToBeProper" false="" do_not_require_proper_mapped_pairs_when_filtering} \
~{true="--keepSingletons" false="" keep_singletons_when_filtering} \
~{true="--keepDuplicates" false="" keep_duplicates_when_filtering} \
--loglevel=DEBUG
sort -b -r -n -k3 "${reads_basename}.count.${ref_basename}.txt.unsorted" > "${reads_basename}.count.${ref_basename}.txt"
head -n ${topNHits} "${reads_basename}.count.${ref_basename}.txt" > "${reads_basename}.count.${ref_basename}.top_${topNHits}_hits.txt"
head -1 "${reads_basename}.count.${ref_basename}.txt" | cut -f 1 > "${reads_basename}.count.${ref_basename}.top.txt"
}
sort -b -r -n -k3 "~{reads_basename}.count.~{ref_basename}.txt.unsorted" > "~{reads_basename}.count.~{ref_basename}.txt"
head -n ~{topNHits} "~{reads_basename}.count.~{ref_basename}.txt" > "~{reads_basename}.count.~{ref_basename}.top_~{topNHits}_hits.txt"
TOP_HIT="$(head -1 '~{reads_basename}.count.~{ref_basename}.txt' | cut -f 1 | tee '~{reads_basename}.count.~{ref_basename}.top.txt')"
TOTAL_COUNT_OF_TOP_HIT=$(grep -E "^($TOP_HIT)" "~{reads_basename}.count.~{ref_basename}.txt" | cut -f3 | tee TOTAL_COUNT_OF_TOP_HIT)
TOTAL_COUNT_OF_LESSER_HITS=$(grep -vE "^(\*|$TOP_HIT)" "~{reads_basename}.count.~{ref_basename}.txt" | cut -f3 | paste -sd+ - | bc -l | tee TOTAL_COUNT_OF_LESSER_HITS)
PCT_MAPPING_TO_LESSER_HITS=$( echo "scale=3; 100 * $TOTAL_COUNT_OF_LESSER_HITS / ($TOTAL_COUNT_OF_LESSER_HITS + $TOTAL_COUNT_OF_TOP_HIT)" | \
bc -l | awk '{printf "%.3f\n", $0}' | tee '~{reads_basename}.count.~{ref_basename}.pct_lesser_hits_of_mapped.txt' )
TOTAL_READS_IN_INPUT=$(samtools view -c "~{reads_basename}.bam")
PCT_OF_INPUT_READS_MAPPED=$( echo "scale=3; 100 * ($TOTAL_COUNT_OF_LESSER_HITS + $TOTAL_COUNT_OF_TOP_HIT) / $TOTAL_READS_IN_INPUT" | \
bc -l | awk '{printf "%.3f\n", $0}' | tee '~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt' )
>>>
output {
File report = "${reads_basename}.count.${ref_basename}.txt"
File report_top_hits = "${reads_basename}.count.${ref_basename}.top_${topNHits}_hits.txt"
String top_hit_id = read_string("${reads_basename}.count.${ref_basename}.top.txt")
File report = "~{reads_basename}.count.~{ref_basename}.txt"
File report_top_hits = "~{reads_basename}.count.~{ref_basename}.top_~{topNHits}_hits.txt"
String top_hit_id = read_string("~{reads_basename}.count.~{ref_basename}.top.txt")
String pct_total_reads_mapped = read_string('~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt')
String pct_lesser_hits_of_mapped = read_string('~{reads_basename}.count.~{ref_basename}.pct_lesser_hits_of_mapped.txt')
String viralngs_version = read_string("VERSION")
}
Expand All @@ -453,7 +492,7 @@ task align_and_count_summary {
String output_prefix = "count_summary"
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 100
Expand Down
Loading

0 comments on commit 1ce64ab

Please sign in to comment.