From a3de732eecd26c1bb994d352cc5ba6233f1c5c4f Mon Sep 17 00:00:00 2001 From: meganshand Date: Wed, 7 Feb 2024 11:17:14 -0500 Subject: [PATCH] Update UltimaJointGenotyping to use GATK 4.5.0.0 for filtering (#1151) undefined --- ...UltimaGenomicsJointGenotyping.changelog.md | 5 ++ .../UltimaGenomicsJointGenotyping.wdl | 53 +++++++++++++------ .../test_inputs/Plumbing/plumbing.inputs.json | 8 +-- .../Scientific/scientific.inputs.json | 9 ++-- .../TestUltimaGenomicsJointGenotyping.wdl | 6 +-- 5 files changed, 52 insertions(+), 29 deletions(-) diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/UltimaGenomicsJointGenotyping.changelog.md b/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/UltimaGenomicsJointGenotyping.changelog.md index a6c388e712..b3e7a610e9 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/UltimaGenomicsJointGenotyping.changelog.md +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/UltimaGenomicsJointGenotyping.changelog.md @@ -1,3 +1,8 @@ +# 1.1.6 +2024-02-06 (Date of Last Commit) + +* Updated VETS filtering pipeline to GATK version 4.5.0.0. Does not affect outputs. 
+ # 1.1.5 2023-09-08 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/UltimaGenomicsJointGenotyping.wdl b/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/UltimaGenomicsJointGenotyping.wdl index 48e5da0d28..2104739e3d 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/UltimaGenomicsJointGenotyping.wdl +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/UltimaGenomicsJointGenotyping.wdl @@ -1,7 +1,7 @@ version 1.0 import "../../../../../../tasks/broad/JointGenotypingTasks.wdl" as Tasks -import "https://raw.githubusercontent.com/broadinstitute/gatk/4.3.0.0/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl" as Filtering +import "https://raw.githubusercontent.com/broadinstitute/gatk/4.5.0.0/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl" as Filtering import "../../../../../../tasks/broad/UltimaGenomicsGermlineFilteringThreshold.wdl" as FilteringThreshold @@ -11,7 +11,7 @@ import "../../../../../../tasks/broad/UltimaGenomicsGermlineFilteringThreshold.w # For choosing a filtering threshold (where on the ROC curve to filter) a sample with truth data is required. 
workflow UltimaGenomicsJointGenotyping { - String pipeline_version = "1.1.5" + String pipeline_version = "1.1.6" input { File unpadded_intervals_file @@ -51,10 +51,11 @@ workflow UltimaGenomicsJointGenotyping { String flow_order #inputs for training and applying filter model - String snp_annotations - String indel_annotations - Boolean use_allele_specific_annotations + Array[String] snp_annotations + Array[String] indel_annotations String model_backend + String snp_resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz" + String indel_resource_args = "--resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz" Int? top_level_scatter_count Boolean? 
gather_vcfs @@ -154,24 +155,42 @@ workflow UltimaGenomicsJointGenotyping { disk_size_gb = medium_disk } - call Filtering.JointVcfFiltering as TrainAndApplyFilteringModel { + call Filtering.JointVcfFiltering as TrainAndApplyFilteringModelSNPs { input: - vcf = CalculateAverageAnnotations.output_vcf, - vcf_index = CalculateAverageAnnotations.output_vcf_index, + input_vcfs = CalculateAverageAnnotations.output_vcf, + input_vcf_idxs = CalculateAverageAnnotations.output_vcf_index, sites_only_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - snp_annotations = snp_annotations, - indel_annotations = indel_annotations, + sites_only_vcf_idx = SitesOnlyGatherVcf.output_vcf_index, + annotations = snp_annotations, + resource_args = snp_resource_args, model_backend = model_backend, - use_allele_specific_annotations = use_allele_specific_annotations, - basename = callset_name, - gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0" + output_prefix = callset_name, + extract_extra_args = "--mode SNP", + train_extra_args = "--mode SNP", + score_extra_args = "--mode SNP", + gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + } + + call Filtering.JointVcfFiltering as TrainAndApplyFilteringModelINDELs { + input: + input_vcfs = TrainAndApplyFilteringModelSNPs.scored_vcfs, + input_vcf_idxs = TrainAndApplyFilteringModelSNPs.scored_vcf_idxs, + sites_only_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_vcf_idx = SitesOnlyGatherVcf.output_vcf_index, + annotations = indel_annotations, + resource_args = indel_resource_args, + model_backend = model_backend, + output_prefix = callset_name, + extract_extra_args = "--mode INDEL", + train_extra_args = "--mode INDEL", + score_extra_args = "--mode INDEL", + gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" } call FilteringThreshold.ExtractOptimizeSingleSample as FindFilteringThresholdAndFilter { input: - input_vcf = TrainAndApplyFilteringModel.variant_scored_vcf, - input_vcf_index = 
TrainAndApplyFilteringModel.variant_scored_vcf_index, + input_vcf = TrainAndApplyFilteringModelINDELs.scored_vcfs, + input_vcf_index = TrainAndApplyFilteringModelINDELs.scored_vcf_idxs, base_file_name = callset_name, call_sample_name = call_sample_name, truth_vcf = truth_vcf, @@ -188,7 +207,7 @@ workflow UltimaGenomicsJointGenotyping { medium_disk = medium_disk } - scatter (idx in range(length(TrainAndApplyFilteringModel.variant_scored_vcf))) { + scatter (idx in range(length(TrainAndApplyFilteringModelINDELs.scored_vcfs))) { # For large callsets we need to collect metrics from the shards and gather them later. if (!is_small_callset) { call Tasks.CollectVariantCallingMetrics as CollectMetricsSharded { diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/test_inputs/Plumbing/plumbing.inputs.json b/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/test_inputs/Plumbing/plumbing.inputs.json index a271e30aa6..3dc1a947c6 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/test_inputs/Plumbing/plumbing.inputs.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/test_inputs/Plumbing/plumbing.inputs.json @@ -14,17 +14,17 @@ "UltimaGenomicsJointGenotyping.scatter_cross_check_fingerprints":false, "UltimaGenomicsJointGenotyping.unbounded_scatter_count_scale_factor":2.5, "UltimaGenomicsJointGenotyping.unpadded_intervals_file":"gs://gcp-public-data--broad-references/hg38/v0/hg38.even.handcurated.20k.intervals", -"UltimaGenomicsJointGenotyping.snp_annotations": "-A AS_ReadPosRankSum -A AS_FS -A AS_SOR -A AS_QD -A AVERAGE_TREE_SCORE -A AVERAGE_ASSEMBLED_HAPS -A AVERAGE_FILTERED_HAPS", -"UltimaGenomicsJointGenotyping.indel_annotations": "-A AS_MQRankSum -A AS_ReadPosRankSum -A AS_FS -A AS_SOR -A AS_QD -A AVERAGE_TREE_SCORE", +"UltimaGenomicsJointGenotyping.snp_annotations": ["AS_ReadPosRankSum", "AS_FS", "AS_SOR", "AS_QD", "AVERAGE_TREE_SCORE", "AVERAGE_ASSEMBLED_HAPS", "AVERAGE_FILTERED_HAPS"], 
+"UltimaGenomicsJointGenotyping.indel_annotations": ["AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_SOR", "AS_QD", "AVERAGE_TREE_SCORE"], "UltimaGenomicsJointGenotyping.flow_order": "TGCA", "UltimaGenomicsJointGenotyping.ref_fasta_sdf": "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/plumbing/reference_sdf.tar", "UltimaGenomicsJointGenotyping.runs_file": "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/plumbing/runs.conservative.bed", "UltimaGenomicsJointGenotyping.annotation_intervals": ["gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/plumbing/LCR-hs38.bed", "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/plumbing/mappability.0.bed", "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/plumbing/exome.twist.bed"], -"UltimaGenomicsJointGenotyping.use_allele_specific_annotations": true, "UltimaGenomicsJointGenotyping.truth_vcf":"gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/plumbing/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.broad-header.vcf.gz", "UltimaGenomicsJointGenotyping.truth_vcf_index":"gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/plumbing/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.broad-header.vcf.gz", "UltimaGenomicsJointGenotyping.truth_highconf_intervals": "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/plumbing/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed", "UltimaGenomicsJointGenotyping.call_sample_name": "NA12878", "UltimaGenomicsJointGenotyping.truth_sample_name": "HG001", -"UltimaGenomicsJointGenotyping.model_backend": "PYTHON_IFOREST" +"UltimaGenomicsJointGenotyping.model_backend": "PYTHON_IFOREST", +"UltimaGenomicsJointGenotyping.TrainAndApplyFilteringModelSNPs.train_runtime_attributes": {"additional_mem_gb":2} } \ No newline 
at end of file diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/test_inputs/Scientific/scientific.inputs.json b/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/test_inputs/Scientific/scientific.inputs.json index a91bede656..9b6270b0b4 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/test_inputs/Scientific/scientific.inputs.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/UltimaGenomics/test_inputs/Scientific/scientific.inputs.json @@ -14,17 +14,18 @@ "UltimaGenomicsJointGenotyping.scatter_cross_check_fingerprints":false, "UltimaGenomicsJointGenotyping.unbounded_scatter_count_scale_factor":2.5, "UltimaGenomicsJointGenotyping.unpadded_intervals_file":"gs://gcp-public-data--broad-references/hg38/v0/hg38.even.handcurated.20k.intervals", -"UltimaGenomicsJointGenotyping.snp_annotations": "-A AS_ReadPosRankSum -A AS_FS -A AS_SOR -A AS_QD -A AVERAGE_TREE_SCORE -A AVERAGE_ASSEMBLED_HAPS -A AVERAGE_FILTERED_HAPS", -"UltimaGenomicsJointGenotyping.indel_annotations": "-A AS_MQRankSum -A AS_ReadPosRankSum -A AS_FS -A AS_SOR -A AS_QD -A AVERAGE_TREE_SCORE", +"UltimaGenomicsJointGenotyping.snp_annotations": ["AS_ReadPosRankSum", "AS_FS", "AS_SOR", "AS_QD", "AVERAGE_TREE_SCORE", "AVERAGE_ASSEMBLED_HAPS", "AVERAGE_FILTERED_HAPS"], +"UltimaGenomicsJointGenotyping.indel_annotations": ["AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_SOR", "AS_QD", "AVERAGE_TREE_SCORE"], "UltimaGenomicsJointGenotyping.flow_order": "TGCA", "UltimaGenomicsJointGenotyping.ref_fasta_sdf": "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/scientific/reference_sdf.tar", "UltimaGenomicsJointGenotyping.runs_file": "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/scientific/runs.conservative.bed", "UltimaGenomicsJointGenotyping.annotation_intervals": ["gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/scientific/LCR-hs38.bed", 
"gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/scientific/mappability.0.bed", "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/scientific/exome.twist.bed"], -"UltimaGenomicsJointGenotyping.use_allele_specific_annotations": true, "UltimaGenomicsJointGenotyping.truth_vcf":"gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/scientific/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.broad-header.vcf.gz", "UltimaGenomicsJointGenotyping.truth_vcf_index":"gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/scientific/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.broad-header.vcf.gz", "UltimaGenomicsJointGenotyping.truth_highconf_intervals": "gs://broad-gotc-test-storage/UltimaGenomicsJointGenotyping/wgs/scientific/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed", "UltimaGenomicsJointGenotyping.call_sample_name": "NA12878", "UltimaGenomicsJointGenotyping.truth_sample_name": "HG001", -"UltimaGenomicsJointGenotyping.model_backend": "PYTHON_IFOREST" +"UltimaGenomicsJointGenotyping.model_backend": "PYTHON_IFOREST", +"UltimaGenomicsJointGenotyping.TrainAndApplyFilteringModelSNPs.extract_runtime_attributes": {"command_mem_gb":13, "additional_mem_gb":2}, +"UltimaGenomicsJointGenotyping.TrainAndApplyFilteringModelSNPs.train_runtime_attributes": {"command_mem_gb":13, "additional_mem_gb":2} } \ No newline at end of file diff --git a/verification/test-wdls/TestUltimaGenomicsJointGenotyping.wdl b/verification/test-wdls/TestUltimaGenomicsJointGenotyping.wdl index c3138ddb19..de9899439b 100644 --- a/verification/test-wdls/TestUltimaGenomicsJointGenotyping.wdl +++ b/verification/test-wdls/TestUltimaGenomicsJointGenotyping.wdl @@ -33,9 +33,8 @@ workflow TestUltimaGenomicsJointGenotyping { File runs_file Array[File] annotation_intervals String 
flow_order - String snp_annotations - String indel_annotations - Boolean use_allele_specific_annotations + Array[String] snp_annotations + Array[String] indel_annotations String model_backend Int? top_level_scatter_count Boolean? gather_vcfs @@ -83,7 +82,6 @@ workflow TestUltimaGenomicsJointGenotyping { flow_order = flow_order, snp_annotations = snp_annotations, indel_annotations = indel_annotations, - use_allele_specific_annotations = use_allele_specific_annotations, model_backend = model_backend, top_level_scatter_count = top_level_scatter_count, gather_vcfs = gather_vcfs,