Skip to content

Commit

Permalink
Merge pull request #525 from broadinstitute/ct-filter-align-and-count…
Browse files Browse the repository at this point in the history
…-fns

expanded parameterization of align_and_count and additional output metrics
  • Loading branch information
dpark01 authored Mar 11, 2024
2 parents 3b25433 + 78f8fa0 commit 1ce64ab
Show file tree
Hide file tree
Showing 14 changed files with 109 additions and 60 deletions.
32 changes: 16 additions & 16 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
# fetch git tags (tagged releases) because
# actions/checkout@v3 does either a full checkout or a shallow checkout without tags
# actions/checkout@v4 does either a full checkout or a shallow checkout without tags
- name: fetch tags
run: git fetch --prune --unshallow --tags
- name: Programmatic environment setup
Expand Down Expand Up @@ -67,7 +67,7 @@ jobs:
run: |
env
- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: '3.8'
- name: install system dependencies
Expand All @@ -88,9 +88,9 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
# fetch git tags (tagged releases) because
# actions/checkout@v3 does either a full checkout or a shallow checkout without tags
# actions/checkout@v4 does either a full checkout or a shallow checkout without tags
- name: fetch tags
run: git fetch --prune --unshallow --tags
- name: Programmatic environment setup
Expand Down Expand Up @@ -138,9 +138,9 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
# fetch git tags (tagged releases) because
# actions/checkout@v3 does either a full checkout or a shallow checkout without tags
# actions/checkout@v4 does either a full checkout or a shallow checkout without tags
- name: fetch tags
run: git fetch --prune --unshallow --tags
- name: Programmatic environment setup
Expand All @@ -166,7 +166,7 @@ jobs:
echo "GITHUB_ACTIONS_BRANCH=$GITHUB_ACTIONS_BRANCH"
echo "GITHUB_ACTIONS_BRANCH=$GITHUB_ACTIONS_BRANCH" >> $GITHUB_ENV
- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: '3.8'
- name: install docs dependencies
Expand All @@ -183,9 +183,9 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
# fetch git tags (tagged releases) because
# actions/checkout@v3 does either a full checkout or a shallow checkout without tags
# actions/checkout@v4 does either a full checkout or a shallow checkout without tags
- name: fetch tags
run: git fetch --prune --unshallow --tags
- name: Programmatic environment setup
Expand Down Expand Up @@ -238,9 +238,9 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
# fetch git tags (tagged releases) because
# actions/checkout@v3 does either a full checkout or a shallow checkout without tags
# actions/checkout@v4 does either a full checkout or a shallow checkout without tags
- name: fetch tags
run: git fetch --prune --unshallow --tags
- name: Programmatic environment setup
Expand Down Expand Up @@ -273,7 +273,7 @@ jobs:
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: '3.8'
- name: install system dependencies
Expand Down Expand Up @@ -304,9 +304,9 @@ jobs:
DX_PROJECT: project-F8PQ6380xf5bK0Qk0YPjB17P
steps:
- name: checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
# fetch git tags (tagged releases) because
# actions/checkout@v3 does either a full checkout or a shallow checkout without tags
# actions/checkout@v4 does either a full checkout or a shallow checkout without tags
- name: fetch tags
run: git fetch --prune --unshallow --tags
- name: Programmatic environment setup
Expand Down Expand Up @@ -337,7 +337,7 @@ jobs:
echo "${{ github.event.action }}"
echo "${{ github.event.pull_request.merged }}"
- name: install python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: '3.8'
- name: install java
Expand Down
4 changes: 2 additions & 2 deletions pipes/WDL/tasks/tasks_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,7 @@ task align_reads {
Boolean skip_mark_dupes = false
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
String sample_name = basename(basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt"), ".clean")
}
Expand Down Expand Up @@ -846,7 +846,7 @@ task run_discordance {
String out_basename = "run"
Int min_coverage = 4
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
parameter_meta {
reads_aligned_bam: {
Expand Down
4 changes: 2 additions & 2 deletions pipes/WDL/tasks/tasks_demux.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ task merge_tarballs {
String out_filename

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}

Int disk_size = 2625
Expand Down Expand Up @@ -163,7 +163,7 @@ task illumina_demux {
Int? machine_mem_gb
Int disk_size = 2625
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
parameter_meta {
Expand Down
2 changes: 1 addition & 1 deletion pipes/WDL/tasks/tasks_interhost.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ task index_ref {
File? novocraft_license

Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}

Int disk_size = 100
Expand Down
6 changes: 3 additions & 3 deletions pipes/WDL/tasks/tasks_ncbi.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ task structured_comments {

File? filter_to_ids

String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
String out_base = basename(assembly_stats_tsv, '.txt')
command <<<
Expand Down Expand Up @@ -272,7 +272,7 @@ task rename_fasta_header {
String out_basename = basename(genome_fasta, ".fasta")
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
command {
set -e
Expand Down Expand Up @@ -437,7 +437,7 @@ task sra_meta_prep {
Boolean paired
String out_name = "sra_metadata.tsv"
String docker="quay.io/broadinstitute/viral-core:2.2.4"
String docker="quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 100
parameter_meta {
Expand Down
4 changes: 2 additions & 2 deletions pipes/WDL/tasks/tasks_nextstrain.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ task derived_cols {
String? lab_highlight_loc
Array[File] table_map = []
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
Int disk_size = 50
}
parameter_meta {
Expand Down Expand Up @@ -848,7 +848,7 @@ task filter_sequences_to_list {
String out_fname = sub(sub(basename(sequences, ".zst"), ".vcf", ".filtered.vcf"), ".fasta$", ".filtered.fasta")
# Prior docker image: "nextstrain/base:build-20211012T204409Z"
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
Int disk_size = 750
}
parameter_meta {
Expand Down
14 changes: 7 additions & 7 deletions pipes/WDL/tasks/tasks_read_utils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ task group_bams_by_sample {
task get_bam_samplename {
input {
File bam
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = round(size(bam, "GB")) + 50
command <<<
Expand All @@ -111,7 +111,7 @@ task get_sample_meta {
input {
Array[File] samplesheets_extended
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 50
command <<<
Expand Down Expand Up @@ -172,7 +172,7 @@ task merge_and_reheader_bams {
File? reheader_table
String out_basename = basename(in_bams[0], ".bam")
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 750
Expand Down Expand Up @@ -244,7 +244,7 @@ task rmdup_ubam {
String method = "mvicuna"
Int machine_mem_gb = 7
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 375
Expand Down Expand Up @@ -303,7 +303,7 @@ task downsample_bams {
Boolean deduplicateAfter = false
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 750
Expand Down Expand Up @@ -367,7 +367,7 @@ task FastqToUBAM {
String? sequencing_center
String? additional_picard_options
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 375
parameter_meta {
Expand Down Expand Up @@ -418,7 +418,7 @@ task read_depths {
File aligned_bam
String out_basename = basename(aligned_bam, '.bam')
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 200
command <<<
Expand Down
75 changes: 57 additions & 18 deletions pipes/WDL/tasks/tasks_reports.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ task alignment_metrics {
Int max_amplicons=500

Int machine_mem_gb=13
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}

String out_basename = basename(aligned_bam, ".bam")
Expand Down Expand Up @@ -136,7 +136,7 @@ task plot_coverage {
String? plotXLimits # of the form "min max" (ints, space between)
String? plotYLimits # of the form "min max" (ints, space between)
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}

Int disk_size = 375
Expand Down Expand Up @@ -283,7 +283,7 @@ task coverage_report {
Array[File] mapped_bam_idx # optional.. speeds it up if you provide it, otherwise we auto-index
String out_report_name = "coverage_report.txt"
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 375
Expand Down Expand Up @@ -350,7 +350,7 @@ task fastqc {
input {
File reads_bam
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
parameter_meta {
reads_bam:{
Expand Down Expand Up @@ -392,8 +392,13 @@ task align_and_count {
File ref_db
Int topNHits = 3
Boolean filter_bam_to_proper_primary_mapped_reads = false
Boolean do_not_require_proper_mapped_pairs_when_filtering = false
Boolean keep_singletons_when_filtering = false
Boolean keep_duplicates_when_filtering = false
Int? machine_mem_gb
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
String reads_basename=basename(reads_bam, ".bam")
Expand All @@ -411,28 +416,62 @@ task align_and_count {
pattern: ["*.FASTA"],
category: "required"
}
filter_bam_to_proper_primary_mapped_reads: {
                                  description: "If specified, reads will be filtered after alignment to include only those flagged as properly paired.",
category: "optional"
}
do_not_require_proper_mapped_pairs_when_filtering: {
description: "Do not require reads to be properly paired when filtering",
category: "optional"
}
keep_singletons_when_filtering: {
description: "Keep singletons when filtering",
category: "optional"
}
keep_duplicates_when_filtering: {
description: "Do not exclude reads marked as duplicates when filtering",
category: "optional"
}
}
command {
command <<<
set -ex -o pipefail
read_utils.py --version | tee VERSION
ln -s "${reads_bam}" "${reads_basename}.bam"
ln -s "~{reads_bam}" "~{reads_basename}.bam"
read_utils.py minimap2_idxstats \
"${reads_basename}.bam" \
"${ref_db}" \
--outStats "${reads_basename}.count.${ref_basename}.txt.unsorted" \
"~{reads_basename}.bam" \
"~{ref_db}" \
--outStats "~{reads_basename}.count.~{ref_basename}.txt.unsorted" \
~{true="--filterReadsAfterAlignment" false="" filter_bam_to_proper_primary_mapped_reads} \
~{true="--doNotRequirePairsToBeProper" false="" do_not_require_proper_mapped_pairs_when_filtering} \
~{true="--keepSingletons" false="" keep_singletons_when_filtering} \
~{true="--keepDuplicates" false="" keep_duplicates_when_filtering} \
--loglevel=DEBUG
sort -b -r -n -k3 "${reads_basename}.count.${ref_basename}.txt.unsorted" > "${reads_basename}.count.${ref_basename}.txt"
head -n ${topNHits} "${reads_basename}.count.${ref_basename}.txt" > "${reads_basename}.count.${ref_basename}.top_${topNHits}_hits.txt"
head -1 "${reads_basename}.count.${ref_basename}.txt" | cut -f 1 > "${reads_basename}.count.${ref_basename}.top.txt"
}
sort -b -r -n -k3 "~{reads_basename}.count.~{ref_basename}.txt.unsorted" > "~{reads_basename}.count.~{ref_basename}.txt"
head -n ~{topNHits} "~{reads_basename}.count.~{ref_basename}.txt" > "~{reads_basename}.count.~{ref_basename}.top_~{topNHits}_hits.txt"
TOP_HIT="$(head -1 '~{reads_basename}.count.~{ref_basename}.txt' | cut -f 1 | tee '~{reads_basename}.count.~{ref_basename}.top.txt')"
TOTAL_COUNT_OF_TOP_HIT=$(grep -E "^($TOP_HIT)" "~{reads_basename}.count.~{ref_basename}.txt" | cut -f3 | tee TOTAL_COUNT_OF_TOP_HIT)
TOTAL_COUNT_OF_LESSER_HITS=$(grep -vE "^(\*|$TOP_HIT)" "~{reads_basename}.count.~{ref_basename}.txt" | cut -f3 | paste -sd+ - | bc -l | tee TOTAL_COUNT_OF_LESSER_HITS)
PCT_MAPPING_TO_LESSER_HITS=$( echo "scale=3; 100 * $TOTAL_COUNT_OF_LESSER_HITS / ($TOTAL_COUNT_OF_LESSER_HITS + $TOTAL_COUNT_OF_TOP_HIT)" | \
bc -l | awk '{printf "%.3f\n", $0}' | tee '~{reads_basename}.count.~{ref_basename}.pct_lesser_hits_of_mapped.txt' )
TOTAL_READS_IN_INPUT=$(samtools view -c "~{reads_basename}.bam")
PCT_OF_INPUT_READS_MAPPED=$( echo "scale=3; 100 * ($TOTAL_COUNT_OF_LESSER_HITS + $TOTAL_COUNT_OF_TOP_HIT) / $TOTAL_READS_IN_INPUT" | \
bc -l | awk '{printf "%.3f\n", $0}' | tee '~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt' )
>>>
output {
File report = "${reads_basename}.count.${ref_basename}.txt"
File report_top_hits = "${reads_basename}.count.${ref_basename}.top_${topNHits}_hits.txt"
String top_hit_id = read_string("${reads_basename}.count.${ref_basename}.top.txt")
File report = "~{reads_basename}.count.~{ref_basename}.txt"
File report_top_hits = "~{reads_basename}.count.~{ref_basename}.top_~{topNHits}_hits.txt"
String top_hit_id = read_string("~{reads_basename}.count.~{ref_basename}.top.txt")
String pct_total_reads_mapped = read_string('~{reads_basename}.count.~{ref_basename}.pct_total_reads_mapped.txt')
String pct_lesser_hits_of_mapped = read_string('~{reads_basename}.count.~{ref_basename}.pct_lesser_hits_of_mapped.txt')
String viralngs_version = read_string("VERSION")
}
Expand All @@ -453,7 +492,7 @@ task align_and_count_summary {
String output_prefix = "count_summary"
String docker = "quay.io/broadinstitute/viral-core:2.2.4"
String docker = "quay.io/broadinstitute/viral-core:2.3.0"
}
Int disk_size = 100
Expand Down
Loading

0 comments on commit 1ce64ab

Please sign in to comment.