Add PathSeq WDL (#4143)

* Add PathSeq WDL
broadinstitute · Jan 29, 2018 · 2b06bb6 · 2b06bb6
1 parent 385fa4a
commit 2b06bb6
Show file tree

Hide file tree

Showing 3 changed files with 262 additions and 0 deletions.
diff --git a/scripts/pathseq/wdl/README.md b/scripts/pathseq/wdl/README.md
@@ -0,0 +1,45 @@
+## Running the PathSeq WDL
+
+### Setting up parameter json file for a run
+
+To get started, *copy* the ``pathseq_pipeline_template.json`` for the workflow and modify the parameters accordingly.
+DO NOT commit your json file to this repo. This file has reasonable default parameters.
+
+PathSeq reference files are available in the [GATK Resource Bundle](https://software.broadinstitute.org/gatk/download/bundle) (located [here](ftp://[email protected]/bundle/beta/PathSeq)).
+
+*Please note that there are optional parameters that do not appear in the template files, since we do not want to specify them by default*
+
+### Docker image
+- "broadinstitute/gatk:4.0.0.0"
+
+### Parameter descriptions
+
+Recommended default values (where possible) are found in ``pathseq_pipeline_template.json``
+
+- ``PathSeqPipelineWorkflow.sample_name`` -- sample ID
+- ``PathSeqPipelineWorkflow.input_bam`` -- sample BAM file
+- ``PathSeqPipelineWorkflow.is_host_aligned`` -- Set to true if the input has already been aligned to a host reference. *NOTE: common human references (e.g. GrCh38) contain decoy sequences such as the Epstein-Barr Virus genome. If this flag is set to true, reads aligning to these decoys will be misidentified as "host" and filtered out.*
+- ``PathSeqPipelineWorkflow.filter_bwa_image`` -- Path to host BWA index image. This corresponds to `pathseq_host.fa.img` in the Resource Bundle.
+- ``PathSeqPipelineWorkflow.kmer_file`` -- Path to host k-mer file. This corresponds to `pathseq_host.bfi` in the Resource Bundle.
+- ``PathSeqPipelineWorkflow.microbe_bwa_image`` -- Path to microbe BWA index image. This corresponds to `pathseq_microbe.fa.img` in the Resource Bundle.
+- ``PathSeqPipelineWorkflow.microbe_fasta`` -- Path to microbe reference FASTA file. This corresponds to `pathseq_microbe.fa` in the Resource Bundle.
+- ``PathSeqPipelineWorkflow.microbe_fasta_dict`` -- Path to microbe reference dictionary file.This corresponds to `pathseq_microbe.dict` in the Resource Bundle.
+- ``PathSeqPipelineWorkflow.taxonomy_file`` -- Path to PathSeq taxonomy file. This corresponds to `pathseq_taxonomy.db` in the Resource Bundle.
+- ``PathSeqPipelineWorkflow.gatk_docker`` -- GATK docker image
+
+Optional parameters:
+
+- ``PathSeqPipelineWorkflow.min_clipped_read_length`` -- Minimum read length after quality trimming. You may need to reduce this if your input reads are shorter than the default value (default 60)
+- ``PathSeqPipelineWorkflow.min_score_identity`` -- Fraction of bases aligned to count a microbe alignment in abundance scoring (default 0.9)
+- ``PathSeqPipelineWorkflow.identity_margin`` -- If a read maps to more than one microbe, the best alignment is counted. Any additional alignments are also counted if they are within this margin of the best alignment (default 0.02)
+- ``PathSeqPipelineWorkflow.divide_by_genome_length`` -- If true, abundance scores are normalized by genome length (default false)
+- ``PathSeqPipelineWorkflow.filter_duplicates`` -- If true, reads with identical sequences will be filtered (default true)
+- ``PathSeqPipelineWorkflow.skip_quality_filters`` -- If true, skips quality filtering steps (default false)
+- ``PathSeqPipelineWorkflow.skip_pre_bwa_repartition`` -- Advanced tuning parameter; recommended if the sample contains a large number (e.g. >10M) microbial reads (default false)
+- ``PathSeqPipelineWorkflow.filter_bwa_seed_length`` -- Sets host alignment MEM length. Reducing this valueincreases host read detection sensitivity but is slower (default 19) 
+- ``PathSeqPipelineWorkflow.host_min_identity`` -- Minimum identity score for reads to be classified as host (default 30)
+- ``PathSeqPipelineWorkflow.bam_partition_size`` -- Advanced tuning parameter; set size of Spark read partitions. Reducing this value increases parallelism but may also increase overhead and affect host BWA alignment (default 4000000)
+- ``PathSeqPipelineWorkflow.mem_gb`` -- Virtual machine (VM) memory in GB (default 208 GB)
+- ``PathSeqPipelineWorkflow.preemptible_attempts`` -- Minimum number of attempts on preemtible VMs before the job will fail (default 3)
+- ``PathSeqPipelineWorkflow.disk_space_gb`` -- VM disk space in GB (automatically sized by default)
+- ``PathSeqPipelineWorkflow.cpu`` -- Number of VM virtual processor cores (default 32)
diff --git a/scripts/pathseq/wdl/pathseq_pipeline.wdl b/scripts/pathseq/wdl/pathseq_pipeline.wdl
@@ -0,0 +1,198 @@
+########################################################################################################################
+## PathSeq Pipeline WDL
+########################################################################################################################
+##
+## Runs the PathSeq pipeline
+##
+## For further info see the GATK Documentation for the PathSeqPipelineSpark tool:
+##   https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_spark_pathseq_PathSeqPipelineSpark.php
+##
+########################################################################################################################
+##
+## Input requirements :
+## - Sequencing data in BAM format
+## - Host and microbe references files available in the GATK Resource Bundle (available on FTP):
+##     https://software.broadinstitute.org/gatk/download/bundle
+##
+## - Input BAM files must comply with the following requirements:
+## - - file must pass validation by ValidateSamFile
+## - - all reads must have an RG tag
+## - - one or more read groups all belong to a single sample (SM)
+##
+## Output:
+## - BAM file containing microbe-mapped reads and reads of unknown sequence
+## - Tab-separated value (.tsv) file of taxonomic abundance scores
+## - Picard-style metrics files for the filter and scoring phases of the pipeline
+##
+########################################################################################################################
+
+task PathseqPipeline {
+
+  # Inputs for this task
+  String sample_name
+  File input_bam
+
+  File kmer_file
+  File filter_bwa_image
+  File microbe_bwa_image
+  File microbe_fasta
+  File microbe_fasta_dict
+  File taxonomy_file
+
+  Boolean is_host_aligned
+  Boolean? skip_quality_filters
+  Boolean? filter_duplicates
+  Boolean? skip_pre_bwa_repartition
+  Boolean? divide_by_genome_length
+  Int? filter_bwa_seed_length
+  Int? host_min_identity
+  Int? min_clipped_read_length
+  Int? bam_partition_size
+  Float? min_score_identity
+  Float? identity_margin
+
+  String bam_output_path = "${sample_name}.pathseq.bam"
+  String scores_output_path = "${sample_name}.pathseq.tsv"
+  String filter_metrics_output_path = "${sample_name}.pathseq.filter_metrics"
+  String score_metrics_output_path = "${sample_name}.pathseq.score_metrics"
+
+  File? gatk4_jar_override
+
+  # Runtime parameters
+  Int? mem_gb
+  String gatk_docker
+  Int? preemptible_attempts
+  Int? disk_space_gb
+  Int? cpu
+  Boolean use_ssd = true
+
+  # You may have to change the following two parameter values depending on the task requirements
+  Int default_ram_mb = 208000
+  # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb).
+  Int default_disk_space_gb = 400
+  # Mem is in units of GB but our command and memory runtime values are in MB
+  Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb
+  Int command_mem = machine_mem - 4000
+
+  Float default_min_score_identity = 0.9
+  Float default_identity_margin = 0.02
+
+  command <<<
+    set -e
+    export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+      gatk --java-options "-Xmx${command_mem}m" \
+      PathSeqPipelineSpark \
+      --input ${input_bam} \
+      --output ${bam_output_path} \
+      --scores-output ${scores_output_path} \
+      --filter-metrics ${filter_metrics_output_path} \
+      --score-metrics ${score_metrics_output_path} \
+      --kmer-file ${kmer_file} \
+      --filter-bwa-image ${filter_bwa_image} \
+      --microbe-bwa-image ${microbe_bwa_image} \
+      --microbe-fasta ${microbe_fasta} \
+      --taxonomy-file ${taxonomy_file} \
+      --bam-partition-size ${select_first([bam_partition_size, 4000000])} \
+      --is-host-aligned ${is_host_aligned} \
+      --skip-quality-filters ${select_first([skip_quality_filters, false])} \
+      --min-clipped-read-length ${select_first([min_clipped_read_length, 60])} \
+      --filter-bwa-seed-length ${select_first([filter_bwa_seed_length, 19])} \
+      --host-min-identity ${select_first([host_min_identity, 30])} \
+      --filter-duplicates ${select_first([filter_duplicates, true])} \
+      --skip-pre-bwa-repartition ${select_first([skip_pre_bwa_repartition, false])} \
+      --min-score-identity ${select_first([min_score_identity, default_min_score_identity])} \
+      --identity-margin ${select_first([identity_margin, default_identity_margin])} \
+      --divide-by-genome-length ${select_first([divide_by_genome_length, true])}
+  >>>
+  runtime {
+    docker: gatk_docker
+    memory: machine_mem + " MB"
+    # Note that the space before SSD and HDD should be included.
+    disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD"
+    preemptible: select_first([preemptible_attempts, 3])
+    cpu: select_first([cpu, 32])
+  }
+  output {
+    File bam_output_pathFile = "${sample_name}.pathseq.bam"
+    File outputScoresFile = "${sample_name}.pathseq.tsv"
+    File outputFilterMetricsFile = "${sample_name}.pathseq.filter_metrics"
+    File outputScoreMetricsFile = "${sample_name}.pathseq.score_metrics"
+  }
+}
+
+workflow PathSeqPipelineWorkflow {
+
+  String sample_name
+  File input_bam
+
+  File kmer_file
+  File filter_bwa_image
+  File microbe_bwa_image
+  File microbe_fasta
+  File microbe_fasta_dict
+  File taxonomy_file
+
+  Boolean is_host_aligned
+  Boolean? filter_duplicates
+  Boolean? skip_pre_bwa_repartition
+  Boolean? divide_by_genome_length
+  Int? filter_bwa_seed_length
+  Int? host_min_identity
+  Int? min_clipped_read_length
+  Int? bam_partition_size
+  Float? min_score_identity
+  Float? identity_margin
+
+  File? gatk4_jar_override
+
+  # Runtime parameters
+  Int? mem_gb
+  String gatk_docker
+  Int? preemptible_attempts
+  Int? cpu
+
+  # Optional input to increase all disk sizes in case of outlier sample with strange size behavior
+  Int? increase_disk_size
+
+  # Some tasks need wiggle room, and we also need to add a small amount of disk to prevent getting a
+  # Cromwell error from asking for 0 disk when the input is less than 1GB.
+  # Also Spark requires some temporary storage.
+  Int additional_disk = select_first([increase_disk_size, 20])
+
+  # Disk sizes for Downsample and PathSeq tasks
+  Float disk_space_gb = size(input_bam, "GB") + size(kmer_file, "GB") + size(filter_bwa_image, "GB") + size(microbe_bwa_image, "GB") + size(microbe_fasta, "GB") + additional_disk
+
+  call PathseqPipeline {
+    input:
+      sample_name=sample_name,
+      input_bam=input_bam,
+      kmer_file=kmer_file,
+      filter_bwa_image=filter_bwa_image,
+      microbe_bwa_image=microbe_bwa_image,
+      microbe_fasta=microbe_fasta,
+      microbe_fasta_dict=microbe_fasta_dict,
+      taxonomy_file=taxonomy_file,
+      is_host_aligned=is_host_aligned,
+      filter_duplicates=filter_duplicates,
+      skip_pre_bwa_repartition=skip_pre_bwa_repartition,
+      divide_by_genome_length=divide_by_genome_length,
+      min_clipped_read_length=min_clipped_read_length,
+      bam_partition_size=bam_partition_size,
+      min_score_identity=min_score_identity,
+      identity_margin=identity_margin,
+      host_min_identity=host_min_identity,
+      filter_bwa_seed_length=filter_bwa_seed_length,
+      gatk4_jar_override=gatk4_jar_override,
+      mem_gb=mem_gb,
+      gatk_docker=gatk_docker,
+      preemptible_attempts=preemptible_attempts,
+      disk_space_gb=disk_space_gb,
+      cpu=cpu
+  }
+  output {
+    File pathseqBam = PathseqPipeline.bam_output_pathFile
+    File pathseqScores = PathseqPipeline.outputScoresFile
+    File pathseqFilterMetrics = PathseqPipeline.outputFilterMetricsFile
+    File pathseqScoreMetrics = PathseqPipeline.outputScoreMetricsFile
+  }
+}
diff --git a/scripts/pathseq/wdl/pathseq_pipeline_template.json b/scripts/pathseq/wdl/pathseq_pipeline_template.json
@@ -0,0 +1,19 @@
+{
+  "PathSeqPipelineWorkflow.sample_name": "sample",
+  "PathSeqPipelineWorkflow.input_bam": "gs://my-bucket/sample.bam",
+
+  "PathSeqPipelineWorkflow.is_host_aligned": false,
+  "PathSeqPipelineWorkflow.min_clipped_read_length": 60,
+  "PathSeqPipelineWorkflow.filter_duplicates": true,
+  "PathSeqPipelineWorkflow.min_score_identity": 0.90,
+  "PathSeqPipelineWorkflow.identity_margin": 0.02,
+  "PathSeqPipelineWorkflow.divide_by_genome_length": true,
+
+  "PathSeqPipelineWorkflow.filter_bwa_image": "gs://my-bucket/references/pathseq_host.fa.img",
+  "PathSeqPipelineWorkflow.kmer_file": "gs://my-bucket/references/pathseq_host.bfi",
+  "PathSeqPipelineWorkflow.microbe_bwa_image": "gs://my-bucket/references/pathseq_microbe.fa.img",
+  "PathSeqPipelineWorkflow.microbe_fasta": "gs://my-bucket/references/pathseq_microbe.fa",
+  "PathSeqPipelineWorkflow.microbe_fasta_dict": "gs://my-bucket/references/pathseq_microbe.dict",
+  "PathSeqPipelineWorkflow.taxonomy_file": "gs://my-bucket/references/pathseq_taxonomy.db",
+  "PathSeqPipelineWorkflow.gatk_docker": "broadinstitute/gatk:4.0.0.0"
+}