From 335bb6f719bb6b0a3ec97adaa8781118785b434f Mon Sep 17 00:00:00 2001
From: Miguel Covarrubias
Date: Mon, 1 Jul 2024 16:35:57 -0400
Subject: [PATCH] Run VariantFiltering for Exomes [VS-1305]

---
 .dockstore.yml                                     | 12 +++++--
 .../gvs-exome-workspace-description.md             |  4 ++-
 .../docs/RUNNING_EXOMES_ON_GVS.md                  | 27 +++++++++------
 .../GvsCalculatePrecisionAndSensitivity.wdl        |  3 ++
 scripts/variantstore/wdl/GvsExtractCallset.wdl     | 22 ++++++++++++
 .../wdl/GvsExtractCohortFromSampleNames.wdl        | 34 ++++++++++++-------
 .../wdl/GvsJointVariantCalling.wdl                 | 13 ++++---
 .../wdl/GvsQuickstartIntegration.wdl               |  4 ++-
 .../wdl/GvsQuickstartVcfIntegration.wdl            |  6 ++--
 .../wdl/GvsRescatterCallsetInterval.wdl            |  2 ++
 scripts/variantstore/wdl/GvsUtils.wdl              |  4 +--
 .../wdl/extract/hail_join_vds_vcfs.py              |  5 +++
 .../tools/gvs/extract/ExtractCohort.java           |  2 +-
 13 files changed, 98 insertions(+), 40 deletions(-)

diff --git a/.dockstore.yml b/.dockstore.yml
index 69200b3d348..2199dcc3afe 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -97,6 +97,7 @@ workflows:
       branches:
         - master
         - ah_var_store
+        - vs_1375_off_target
       tags:
         - /.*/
   - name: GvsBenchmarkExtractTask
@@ -126,6 +127,7 @@ workflows:
       branches:
         - master
         - ah_var_store
+        - vs_1375_off_target
       tags:
         - /.*/
   - name: GvsPopulateAltAllele
@@ -135,6 +137,7 @@ workflows:
       branches:
         - master
         - ah_var_store
+        - vs_1375_off_target
       tags:
         - /.*/
   - name: GvsCreateTables
@@ -164,7 +167,7 @@ workflows:
       branches:
         - master
         - ah_var_store
-        - VS-1379-ploidy
+        - vs_1375_off_target
       tags:
         - /.*/
   - name: GvsImportGenomes
@@ -174,6 +177,7 @@ workflows:
       branches:
         - master
         - ah_var_store
+        - vs_1375_off_target
      tags:
         - /.*/
   - name: GvsBulkIngestGenomes
@@ -183,6 +187,7 @@ workflows:
       branches:
         - master
         - ah_var_store
+        - vs_1375_off_target
       tags:
         - /.*/
   - name: GvsPrepareRangesCallset
@@ -194,6 +199,7 @@ workflows:
       branches:
         - master
         - ah_var_store
+        - vs_1375_off_target
       tags:
         - /.*/
   - name: GvsCreateVATfromVDS
@@ -280,7 +286,7 @@ workflows:
       branches:
         - master
         - ah_var_store
-        - rsa_vs_1168_bge
+        - vs_1375_off_target
       tags:
         - /.*/
   - name: GvsQuickstartVcfIntegration
@@ -308,7 +314,7 @@ workflows:
       branches:
         - master
         - ah_var_store
-        - vs_1396_bq_query_audit
+        - vs_1375_off_target
       tags:
         - /.*/
   - name: GvsIngestTieout
diff --git a/scripts/variantstore/beta_docs/gvs-exome-workspace-description.md b/scripts/variantstore/beta_docs/gvs-exome-workspace-description.md
index f15741c38f0..9ea5e78d0bd 100644
--- a/scripts/variantstore/beta_docs/gvs-exome-workspace-description.md
+++ b/scripts/variantstore/beta_docs/gvs-exome-workspace-description.md
@@ -35,7 +35,9 @@ To see details about input requirements, see [Run Your Own Samples](https://gith
 
 While the GVS has been tested with almost 190,000 single sample exome GVCF files as input, only datasets of up to 100,000 exomes are currently supported by the beta workflow.
 
-Note that, by default, the Exomes Beta workflow uses the Blended Genomes Interval List: `gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list` for its calling regions. For information on how that interval list was generated, see `gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list.README.md`.
+Note that the Exomes Beta workflow uses two interval lists by default:
+1. The calling region interval list defaults to the Blended Genomes Interval List: `gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list`. For information on how that interval list was generated, see `gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list.README.md`.
+1. The target region interval list defaults to `gs://broad-gotc-test-storage/JointGenotyping/inputs/scientific/bge/TwistAllianceClinicalResearchExome_Covered_Targets_hg38.interval_list`. Any variants called outside these target intervals will be *genotype* filtered with `OUTSIDE_OF_TARGETS`.
 
 ### What does it return as output?
 
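To make the new target-region behavior concrete: a call is "outside of targets" when no interval in the target list covers its position. The sketch below is illustrative only and not part of this patch; the local filename, the queried position, and the assumption of non-overlapping intervals are all mine. It parses a Picard-style interval_list (1-based, tab-separated chrom/start/end/strand/name rows after @-prefixed SAM header lines):

    import bisect

    def load_intervals(path):
        """Map chrom -> sorted (start, end) pairs from a Picard interval_list."""
        intervals = {}
        with open(path) as fh:
            for line in fh:
                if line.startswith("@"):
                    continue  # skip SAM-style header lines
                chrom, start, end = line.rstrip("\n").split("\t")[:3]
                intervals.setdefault(chrom, []).append((int(start), int(end)))
        return {chrom: sorted(pairs) for chrom, pairs in intervals.items()}

    def outside_of_targets(intervals, chrom, pos):
        """True when no (non-overlapping) target interval covers (chrom, pos)."""
        pairs = intervals.get(chrom, [])
        i = bisect.bisect_right(pairs, (pos, float("inf"))) - 1
        return i < 0 or pos > pairs[i][1]

    targets = load_intervals("targets.interval_list")  # assumed local copy of the Twist list
    print(outside_of_targets(targets, "chr1", 1_000_000))
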
diff --git a/scripts/variantstore/docs/RUNNING_EXOMES_ON_GVS.md b/scripts/variantstore/docs/RUNNING_EXOMES_ON_GVS.md
index 5bc170aeae5..1ec91368d5f 100644
--- a/scripts/variantstore/docs/RUNNING_EXOMES_ON_GVS.md
+++ b/scripts/variantstore/docs/RUNNING_EXOMES_ON_GVS.md
@@ -31,8 +31,8 @@ This document describes the changes necessary to run exome gVCFs through the GVS
    - This will import the re-blocked gVCF files into GVS. The workflow will check whether data for that sample has already been loaded into GVS. It is designed to be re-run (with the same inputs) if there is a failure during one of the workflow tasks (e.g. BigQuery write API interrupts).
    - This workflow does not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option.
    - You will want to set the `external_sample_names`, `input_vcfs` and `input_vcf_indexes` inputs based on the GCP locations of files that contain lists of these values (in the same order). For NA12878/HG001, the files you will need for ingest are:
-     - `input_vcfs`: `"gs://broad-gotc-test-storage/germline_single_sample/exome/scientific/truth/master/D5327.NA12878/NA12878.rb.g.vcf.gz"`
-     - `input_vcf_indexes`: `"gs://broad-gotc-test-storage/germline_single_sample/exome/scientific/truth/master/D5327.NA12878/NA12878.rb.g.vcf.gz.tbi"`
+     - `input_vcfs`: `"gs://broad-gotc-test-storage/ExomeGermlineSingleSample/truth/scientific/master/D5327.NA12878/NA12878.rb.g.vcf.gz"`
+     - `input_vcf_indexes`: `"gs://broad-gotc-test-storage/ExomeGermlineSingleSample/truth/scientific/master/D5327.NA12878/NA12878.rb.g.vcf.gz.tbi"`
    - Set the `interval_list` input to `"gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list"`
 1. `GvsPopulateAltAllele` workflow
    - This step loads data into the `alt_allele` table from the `vet_*` tables in preparation for running the filtering step.
@@ -40,22 +40,25 @@ This document describes the changes necessary to run exome gVCFs through the GVS
 1. `GvsCreateFilterSet` workflow
    - This step calculates features from the `alt_allele` table, and trains the VETS model along with site-level QC filters and loads them into BigQuery into a series of `filter_set_*` tables.
    - This workflow does not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option.
-   - Set the `interval_list` input to `"gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list"`
-   - Set the `use_VQSR_lite` input to `true` to use VETS (instead of VQSR)
+   - Set the `GvsCreateFilterSet.interval_list` input to `"gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list"`. *Make sure you're not setting `interval_list` on any of the called tasks by mistake!*
 1. `GvsPrepareRangesCallset` workflow
    - This workflow transforms the data in the vet tables into a schema optimized for VCF extraction.
    - This workflow will need to be run once to extract the callset as a whole and an additional time to create the files used to calculate Precision and Sensitivity (with `control_samples` set to `true`).
    - This workflow does not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option.
 1. `GvsExtractCallset` workflow
+   - This workflow is used to extract non-control samples only. See the `GvsCalculatePrecisionAndSensitivity` instructions below for running Precision and Sensitivity on control samples.
    - This workflow takes the tables created in the `Prepare` step to output joint VCF shards.
-   - This workflow will need to be run once to extract the callset into VCF shards and an additional time to calculate Precision and Sensitivity (with `control_samples` set to `true`).
+   - This workflow is run only to extract the callset into VCF shards (`control_samples` left at, or explicitly set to, its default value of `false`).
    - This workflow does not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option.
-   - Set the parameter `use_interval_weights` to `false`. This avoids a bug seen in WeightedSplitIntervals when using exomes, forcing it to use the standard version of SplitIntervals.
-   - Set the `output_gcs_dir` to a GCS location to collect all the VCF shards into one place. If you are running it twice (to calculate Precision and Sensitivity), be sure to provide distinct locations for each.
+   - Set the `output_gcs_dir` to a GCS location to collect all the VCF shards into one place.
 1. `GvsCalculatePrecisionAndSensitivity` workflow
-   - This workflow needs to be run with the control sample VCF shards from `GvsExtractCallset` step which were placed in the `output_gcs_dir` GCS location.
    - This workflow does not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option.
-   - Truth inputs for NA12878/HG001:
-     - `truth_beds`: `"gs://gvs-internal/truth/HG001.exome_evaluation_regions.v1.1.bed"`
-     - `truth_vcfs`: `"gs://gvs-internal/truth/HG001_exome_filtered.recode.vcf.gz"`
+   - Input values (including truth inputs) for NA12878/HG001:
+     - `truth_beds`: `[ "gs://gvs-internal/truth/HG001.exome_evaluation_regions.v1.1.bed" ]`
+     - `truth_vcfs`: `[ "gs://gvs-internal/truth/HG001_exome_filtered.recode.vcf.gz" ]`
-     - `truth_vcf_indices`: `"gs://gvs-internal/truth/HG001_exome_filtered.recode.vcf.gz.tbi"`
+     - `truth_vcf_indices`: `[ "gs://gvs-internal/truth/HG001_exome_filtered.recode.vcf.gz.tbi" ]`
+     - `vcf_eval_bed_file`: `"gs://gvs-internal/truth/HG001.exome_evaluation_regions.v1.1.bed"`
+     - `chromosomes`: `[ "chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22" ]`
+     - `interval_list`: `"gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list"`
+     - `target_interval_list`: `"gs://broad-gotc-test-storage/JointGenotyping/inputs/scientific/bge/TwistAllianceClinicalResearchExome_Covered_Targets_hg38.interval_list"`
+     - `is_wgs`: `false`
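For reference, when launching `GvsCalculatePrecisionAndSensitivity` with `Run workflow with inputs defined by file paths`, the values above end up in a Terra inputs JSON. A minimal sketch of that JSON built with Python; the fully qualified `GvsCalculatePrecisionAndSensitivity.*` key names are assumed from the workflow name, and required inputs not listed above (e.g. `dataset_name`, `project_id`, `filter_set_name`, `sample_names`) are omitted:

    import json

    inputs = {
        "GvsCalculatePrecisionAndSensitivity.truth_beds":
            ["gs://gvs-internal/truth/HG001.exome_evaluation_regions.v1.1.bed"],
        "GvsCalculatePrecisionAndSensitivity.truth_vcfs":
            ["gs://gvs-internal/truth/HG001_exome_filtered.recode.vcf.gz"],
        "GvsCalculatePrecisionAndSensitivity.truth_vcf_indices":
            ["gs://gvs-internal/truth/HG001_exome_filtered.recode.vcf.gz.tbi"],
        "GvsCalculatePrecisionAndSensitivity.vcf_eval_bed_file":
            "gs://gvs-internal/truth/HG001.exome_evaluation_regions.v1.1.bed",
        "GvsCalculatePrecisionAndSensitivity.chromosomes":
            [f"chr{i}" for i in range(1, 23)],  # chr1 .. chr22
        "GvsCalculatePrecisionAndSensitivity.interval_list":
            "gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list",
        "GvsCalculatePrecisionAndSensitivity.target_interval_list":
            "gs://broad-gotc-test-storage/JointGenotyping/inputs/scientific/bge/TwistAllianceClinicalResearchExome_Covered_Targets_hg38.interval_list",
        "GvsCalculatePrecisionAndSensitivity.is_wgs": False,
    }
    print(json.dumps(inputs, indent=2))
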
diff --git a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl
index 6bd6134c2c5..6e1f6c58657 100644
--- a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl
+++ b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl
@@ -10,6 +10,7 @@ workflow GvsCalculatePrecisionAndSensitivity {
     String dataset_name
     String filter_set_name
     File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
+    File? target_interval_list
     File? vcf_eval_bed_file
     Array[String] chromosomes = ["chr20"]
     String project_id
@@ -37,6 +38,7 @@ workflow GvsCalculatePrecisionAndSensitivity {
     chromosomes: "The chromosome(s) on which to analyze precision and sensitivity. The default value for this is `['chr20']`."
     dataset_name: "The GVS BigQuery dataset name."
     filter_set_name: "The filter_set_name used to generate the callset."
+    target_interval_list: "The intervals outside of which sites will be OUTSIDE_OF_TARGETS filtered."
     interval_list: "The intervals over which to extract VCFs for calculating precision and sensitivity."
     project_id: "The Google Project ID where the GVS lives."
     sample_names: "A list of the sample names that are controls and that will be used for the analysis. For every element on the list of sample names there must be a corresponding element on the list of `truth_vcfs`, `truth_vcf_indices`, and `truth_beds`."
@@ -79,6 +81,7 @@ workflow GvsCalculatePrecisionAndSensitivity {
       call_set_identifier = call_set_identifier,
       filter_set_name = filter_set_name,
       control_samples = true,
+      target_interval_list = target_interval_list,
       interval_list = interval_list,
       extract_scatter_count_override = extract_scatter_count_override,
       cloud_sdk_docker = effective_cloud_sdk_docker,
diff --git a/scripts/variantstore/wdl/GvsExtractCallset.wdl b/scripts/variantstore/wdl/GvsExtractCallset.wdl
index fe594689dd3..59e1849a5e0 100644
--- a/scripts/variantstore/wdl/GvsExtractCallset.wdl
+++ b/scripts/variantstore/wdl/GvsExtractCallset.wdl
@@ -29,6 +29,8 @@ workflow GvsExtractCallset {
     File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
     File interval_weights_bed = "gs://gvs_quickstart_storage/weights/gvs_full_vet_weights_1kb_padded_orig.bed"
 
+    File? target_interval_list
+
     String? variants_docker
     String? cloud_sdk_docker
     String? gatk_docker
@@ -252,6 +254,7 @@ workflow GvsExtractCallset {
         convert_filtered_genotypes_to_nocalls = convert_filtered_genotypes_to_nocalls,
         write_cost_to_db = write_cost_to_db,
         maximum_alternate_alleles = maximum_alternate_alleles,
+        target_interval_list = target_interval_list,
     }
   }
 
@@ -348,6 +351,8 @@ task ExtractTask {
     Int? local_sort_max_records_in_ram = 10000000
     Int? maximum_alternate_alleles
 
+    File? target_interval_list
+
     # for call-caching -- check if DB tables haven't been updated since the last run
     String max_last_modified_timestamp
   }
@@ -422,6 +427,23 @@ task ExtractTask {
         --shard-identifier ~{intervals_name} \
         ~{cost_observability_line}
 
+      if [[ -n "~{target_interval_list}" ]]
+      then
+        pre_off_target_vcf="pre_off_target_~{output_file}"
+        mv ~{output_file} ${pre_off_target_vcf}
+        mv ~{output_file}.tbi "${pre_off_target_vcf}.tbi"
+
+        gatk --java-options "-Xmx${memory_mb}m" \
+          IndexFeatureFile \
+          -I ~{target_interval_list}
+
+        gatk --java-options "-Xmx${memory_mb}m" \
+          VariantFiltration \
+          ~{"--filter-not-in-mask --mask-name OUTSIDE_OF_TARGETS --mask-description 'Outside of sequencing target intervals' --mask " + target_interval_list} \
+          -O ~{output_file} \
+          -V ${pre_off_target_vcf}
+      fi
+
       # Drop trailing slash if one exists
       OUTPUT_GCS_DIR=$(echo ~{output_gcs_dir} | sed 's/\/$//')
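One subtlety in the ExtractTask change above: the whole VariantFiltration argument group is generated by `~{"--filter-not-in-mask ... --mask " + target_interval_list}`. As I read the WDL 1.0 placeholder rules, concatenating a `String` with an undefined optional yields an undefined value, and an undefined placeholder in a task command renders as an empty string, so the arguments disappear entirely when no target list is supplied, consistent with the `if [[ -n "~{target_interval_list}" ]]` guard. A rough Python analogue of that semantics (function name and values are mine):

    def placeholder(literal, optional_value):
        # WDL: ~{"literal" + optional}; an undefined optional propagates
        # through the concatenation and the placeholder renders as ""
        return "" if optional_value is None else literal + optional_value

    assert placeholder("--mask ", None) == ""
    assert placeholder("--mask ", "targets.interval_list") == "--mask targets.interval_list"
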
diff --git a/scripts/variantstore/wdl/GvsExtractCohortFromSampleNames.wdl b/scripts/variantstore/wdl/GvsExtractCohortFromSampleNames.wdl
index 0e936917979..c2754793bdd 100644
--- a/scripts/variantstore/wdl/GvsExtractCohortFromSampleNames.wdl
+++ b/scripts/variantstore/wdl/GvsExtractCohortFromSampleNames.wdl
@@ -12,6 +12,7 @@ workflow GvsExtractCohortFromSampleNames {
     # cohort_sample_names_array will take precedence over cohort_sample_names if both are set
     Array[String]? cohort_sample_names_array
     File? cohort_sample_names
+    Boolean is_wgs = true
 
     String gvs_project
     String gvs_dataset
@@ -42,6 +43,8 @@ workflow GvsExtractCohortFromSampleNames {
     Int? split_intervals_disk_size_override
     Int? split_intervals_mem_override
 
+    File? target_interval_list
+
     String? git_branch_or_tag
     String? gatk_docker
     File? gatk_override
@@ -79,15 +82,6 @@ workflow GvsExtractCohortFromSampleNames {
       cloud_sdk_docker = effective_cloud_sdk_docker
   }
 
-  Int effective_scatter_count = if defined(extract_scatter_count_override) then select_first([extract_scatter_count_override])
-                                else if GetNumSamplesLoaded.num_samples < 100 then 50 # Quickstart
-                                  else if GetNumSamplesLoaded.num_samples < 1000 then 250
-                                    else if GetNumSamplesLoaded.num_samples < 5000 then 500
-                                      else if GetNumSamplesLoaded.num_samples < 20000 then 1000 # Stroke Anderson
-                                        else if GetNumSamplesLoaded.num_samples < 50000 then 5000
-                                          else if GetNumSamplesLoaded.num_samples < 100000 then 10000 # Charlie
-                                            else 20000
-
   # writing the array to a file has to be done in a task
   # https://support.terra.bio/hc/en-us/community/posts/360071465631-write-lines-write-map-write-tsv-write-json-fail-when-run-in-a-workflow-rather-than-in-a-task
   if (defined(cohort_sample_names_array)) {
@@ -99,9 +93,22 @@ workflow GvsExtractCohortFromSampleNames {
   }
 
   File cohort_sample_names_file = select_first([write_array_task.output_file, cohort_sample_names])
+  Int effective_sample_count = length(read_lines(cohort_sample_names_file))
+
+  Int effective_scatter_count = if defined(extract_scatter_count_override) then select_first([extract_scatter_count_override])
+                                else if is_wgs then
+                                  if effective_sample_count < 5000 then 1 # This results in 1 VCF per chromosome.
+                                  else if effective_sample_count < 20000 then 2000 # Stroke Anderson
+                                  else if effective_sample_count < 50000 then 10000
+                                  else 20000
+                                else
+                                  if effective_sample_count < 5000 then 1 # This results in 1 VCF per chromosome.
+                                  else if effective_sample_count < 20000 then 1000
+                                  else if effective_sample_count < 50000 then 2500
+                                  else 7500
 
   # allow an interval list to be passed in, but default it to our standard one if no args are here
-  File working_interval_list = select_first([interval_list, "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"])
+  File effective_interval_list = select_first([interval_list, "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"])
 
 
   call GvsPrepareCallset.GvsPrepareCallset {
@@ -119,7 +126,7 @@ workflow GvsExtractCohortFromSampleNames {
       write_cost_to_db = write_cost_to_db,
       cloud_sdk_docker = effective_cloud_sdk_docker,
       enable_extract_table_ttl = true,
-      interval_list = working_interval_list,
+      interval_list = effective_interval_list,
       control_samples = control_samples,
       cloud_sdk_docker = effective_cloud_sdk_docker,
       git_hash = effective_git_hash,
@@ -150,7 +157,7 @@ workflow GvsExtractCohortFromSampleNames {
       split_intervals_mem_override = split_intervals_mem_override,
      extract_memory_override_gib = extract_memory_override,
       disk_override = extract_disk_override,
-      interval_list = working_interval_list,
+      interval_list = effective_interval_list,
       control_samples = control_samples,
 
@@ -159,7 +166,8 @@ workflow GvsExtractCohortFromSampleNames {
       gatk_docker = effective_gatk_docker,
       git_hash = effective_git_hash,
       variants_docker = effective_variants_docker,
-      write_cost_to_db = write_cost_to_db
+      write_cost_to_db = write_cost_to_db,
+      target_interval_list = target_interval_list,
   }
 
   output {
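The scatter count is now derived from the size of the cohort being extracted (`effective_sample_count`, the number of lines in the sample-names file) plus the new `is_wgs` flag, instead of from the total number of samples loaded into GVS. Here is the ladder mirrored in Python for a quick worked example (thresholds copied from the WDL above; the function itself is illustrative only):

    def effective_scatter_count(sample_count, is_wgs, override=None):
        if override is not None:  # models extract_scatter_count_override
            return override
        if is_wgs:
            if sample_count < 5000: return 1      # 1 VCF per chromosome
            if sample_count < 20000: return 2000
            if sample_count < 50000: return 10000
            return 20000
        if sample_count < 5000: return 1          # 1 VCF per chromosome
        if sample_count < 20000: return 1000
        if sample_count < 50000: return 2500
        return 7500

    assert effective_scatter_count(12000, is_wgs=True) == 2000
    assert effective_scatter_count(12000, is_wgs=False) == 1000
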
diff --git a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl
index a98f74bb69b..fb6ee176b57 100644
--- a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl
+++ b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl
@@ -58,6 +58,8 @@ workflow GvsJointVariantCalling {
     String? extract_table_prefix
     String? filter_set_name
 
+    File? target_interval_list
+
     # Overrides to be passed to GvsCreateFilterSet
     Int? INDEL_VQSR_CLASSIC_max_gaussians_override = 4
     Int? INDEL_VQSR_CLASSIC_mem_gb_override
@@ -70,15 +72,15 @@ workflow GvsJointVariantCalling {
     Int? maximum_alternate_alleles
   }
 
-  # If is_wgs is true, we'll use the WGS interval list else, we'll use the Exome interval list. We'll currently use
-  # the same weighted bed file for whole genomes and exomes.
-  # But, if interval_list is defined, we'll use that instead of choosing based on is_wgs
+  # If `is_wgs` is true we'll use the WGS interval list, otherwise we'll use the Exome interval list.
+  # We currently use the same weighted bed file for whole genomes and exomes. However, if `interval_list` is defined,
+  # we'll use that instead of choosing based on `is_wgs`.
   File default_interval_list = if (is_wgs) then "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list" else "gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list"
   File interval_list_to_use = select_first([interval_list, default_interval_list])
 
-  # the call_set_identifier string is used to name many different things throughout this workflow (BQ tables, vcfs etc),
-  # and so make sure nothing is broken by creative users, we replace spaces and underscores with hyphens
+  # The `call_set_identifier` string is used to name many different things throughout this workflow (BQ tables, VCFs, etc.).
+  # To make sure nothing is broken by creative users, we replace spaces and underscores with hyphens.
   String effective_extract_output_file_base_name = select_first([extract_output_file_base_name, sub(call_set_identifier, "\\s+|\_+", "-")])
   String effective_extract_table_prefix = select_first([extract_table_prefix, sub(call_set_identifier, "\\s+|\_+", "-")])
   String effective_filter_set_name = select_first([filter_set_name, sub(call_set_identifier, "\\s+|\_+", "-")])
@@ -228,6 +230,7 @@ workflow GvsJointVariantCalling {
       bgzip_output_vcfs = bgzip_output_vcfs,
       is_wgs = is_wgs,
       maximum_alternate_alleles = maximum_alternate_alleles,
+      target_interval_list = target_interval_list,
   }
 
   output {
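The reworded comments above describe the `sub(call_set_identifier, "\\s+|\_+", "-")` calls, which collapse runs of whitespace or underscores into single hyphens before the identifier is reused in BigQuery table and VCF names (the `\_` escape is redundant but harmless). The same transformation in Python, with a worked example of mine:

    import re

    def sanitize(call_set_identifier):
        # spaces and underscores would otherwise leak into BQ table / VCF names
        return re.sub(r"\s+|_+", "-", call_set_identifier)

    assert sanitize("my exome_callset v1") == "my-exome-callset-v1"
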
diff --git a/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl
index 92d8c6a8fbd..8e79e82268e 100644
--- a/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl
+++ b/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl
@@ -21,6 +21,7 @@ workflow GvsQuickstartIntegration {
     String exome_vcf_files_column_name = "hg38_reblocked_gvcf"
     String exome_vcf_index_files_column_name = "hg38_reblocked_gvcf_index"
     String exome_sample_set_name = "exome_integration_sample_set"
+    File? target_interval_list = "gs://broad-gotc-test-storage/JointGenotyping/inputs/scientific/bge/TwistAllianceClinicalResearchExome_Covered_Targets_hg38.interval_list"
     String? basic_docker
     String? cloud_sdk_docker
     String? cloud_sdk_slim_docker
@@ -34,7 +35,7 @@ workflow GvsQuickstartIntegration {
   File full_wgs_interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
   File full_exome_interval_list = "gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list"
   String expected_subdir = if (!chr20_X_Y_only) then "all_chrs/" else ""
-  File expected_output_prefix = "gs://gvs-internal-quickstart/integration/2024-05-23/" + expected_subdir
+  File expected_output_prefix = "gs://gvs-internal-quickstart/integration/2024-07-03/" + expected_subdir
 
   # WDL 1.0 trick to set a variable ('none') to be undefined.
   if (false) {
@@ -250,6 +251,7 @@ workflow GvsQuickstartIntegration {
       workspace_id = GetToolVersions.workspace_id,
       submission_id = GetToolVersions.submission_id,
       maximum_alternate_alleles = maximum_alternate_alleles,
+      target_interval_list = target_interval_list,
   }
 
   if (QuickstartVcfExomeIntegration.used_tighter_gcp_quotas) {
diff --git a/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl
index 0d1d5322f8b..55ed3229fa9 100644
--- a/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl
+++ b/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl
@@ -34,6 +34,7 @@ workflow GvsQuickstartVcfIntegration {
     String? workspace_id
     String? submission_id
+    File? target_interval_list
     Int? maximum_alternate_alleles
   }
   String project_id = "gvs-internal"
 
@@ -113,6 +114,7 @@ workflow GvsQuickstartVcfIntegration {
       git_hash = effective_git_hash,
       tighter_gcp_quotas = false,
       maximum_alternate_alleles = maximum_alternate_alleles,
+      target_interval_list = target_interval_list,
       extract_output_gcs_dir = extract_output_gcs_dir,
   }
 
@@ -231,7 +233,7 @@ task AssertIdenticalOutputs {
       actual="actual/$vcf"
       expected="expected/$vcf"
       set +o errexit
-      cmp <(grep '^#' $actual) <(grep '^#' $expected)
+      cmp <(grep '^#' $actual | grep -E -v '^##GATKCommandLine=') <(grep '^#' $expected | grep -E -v '^##GATKCommandLine=')
       rc=$?
       set -o errexit
       if [[ $rc -ne 0 ]]; then
@@ -260,7 +262,7 @@ task AssertIdenticalOutputs {
       expected="expected/$vcf"
       actual="actual/$vcf"
       set +o errexit
-      cmp $actual $expected
+      cmp <(grep -E -v '^##GATKCommandLine=' $actual) <(grep -E -v '^##GATKCommandLine=' $expected)
       rc=$?
       set -o errexit
       if [[ $rc -ne 0 ]]; then
diff --git a/scripts/variantstore/wdl/GvsRescatterCallsetInterval.wdl b/scripts/variantstore/wdl/GvsRescatterCallsetInterval.wdl
index 79e554cf718..acda6574935 100644
--- a/scripts/variantstore/wdl/GvsRescatterCallsetInterval.wdl
+++ b/scripts/variantstore/wdl/GvsRescatterCallsetInterval.wdl
@@ -14,6 +14,7 @@ workflow GvsRescatterCallsetInterval {
     String output_file_base_name
     String project_id
     Int re_scatter_count
+    File? target_interval_list
 
     Int? extract_preemptible_override
     String? final_output_gcs_dir
@@ -50,6 +51,7 @@ workflow GvsRescatterCallsetInterval {
       interval_list = sub(interval_file_dir, "/$", "") + '/' + intervals_to_scatter[i] + "-scattered.interval_list",
      extract_preemptible_override = extract_preemptible_override,
       filter_set_name = filter_set_name,
+      target_interval_list = target_interval_list,
       gatk_docker = effective_gatk_docker,
       cloud_sdk_docker = effective_cloud_sdk_docker,
       variants_docker = effective_variants_docker,
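The two `cmp` changes above exist because the new VariantFiltration step stamps a `##GATKCommandLine=` header line into extracted VCFs, and that line carries run-specific values, so byte-for-byte comparison against stored expected outputs would always fail. A sketch of the idea behind the `grep -E -v '^##GATKCommandLine='` filtering (the header strings here are invented for the example):

    def comparable_lines(vcf_text):
        # drop the run-specific ##GATKCommandLine= header before comparing
        return [line for line in vcf_text.splitlines()
                if not line.startswith("##GATKCommandLine=")]

    actual = "##fileformat=VCFv4.2\n##GATKCommandLine=<ID=VariantFiltration,...>\n#CHROM\n"
    expected = "##fileformat=VCFv4.2\n#CHROM\n"
    assert comparable_lines(actual) == comparable_lines(expected)
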
diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl
index f3a322b79db..e20551c3b54 100644
--- a/scripts/variantstore/wdl/GvsUtils.wdl
+++ b/scripts/variantstore/wdl/GvsUtils.wdl
@@ -72,9 +72,9 @@ task GetToolVersions {
     # GVS generally uses the smallest `alpine` version of the Google Cloud SDK as it suffices for most tasks, but
     # there are a handful of tasks that require the larger GNU libc-based `slim`.
     String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim"
-    String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-07-01-alpine-dd25f0664fb5"
+    String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-07-16-alpine-0b965b8d2679"
     String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19"
-    String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-06-17-gatkbase-37738272d538"
+    String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-07-03-gatkbase-60fb4424dac7"
     String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest"
     String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623"
     String plink_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/plink2:2024-04-23-slim-a0a65f52cc0e"
diff --git a/scripts/variantstore/wdl/extract/hail_join_vds_vcfs.py b/scripts/variantstore/wdl/extract/hail_join_vds_vcfs.py
index 8ad3402df8d..cfb03ff3e0e 100644
--- a/scripts/variantstore/wdl/extract/hail_join_vds_vcfs.py
+++ b/scripts/variantstore/wdl/extract/hail_join_vds_vcfs.py
@@ -16,6 +16,11 @@ def vcf_mt(vcf_paths):
     # Setting array_elements_required to false is done as a workaround because Hail has a hard time with unconventional fields with empty values e.g. AS_YNG=.,.,.
     # Avoiding explicitly acknowledging the use of missing elements in arrays requires Hail to make a decision in several ambiguous cases
     mt = hl.import_vcf(vcf_paths, force_bgz=True, reference_genome='GRCh38', array_elements_required=False).key_rows_by('locus')
+
+    # The 'Number' attribute of the 'FT' format specifier has changed from '1' to '.', causing the Hail VCF import logic
+    # to change its datatype representation of 'FT' from 'string' to 'array'. Below is some Hail logic to unwrap
+    # any non-missing arrays, which in current GVS practice always have a single element.
+    mt = mt.annotate_entries(FT=hl.or_missing(hl.is_defined(mt.FT), mt.FT[0]))
     return mt
 
 
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohort.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohort.java
index 2d7136828c7..d608dd18d83 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohort.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohort.java
@@ -393,7 +393,7 @@ else if (vqScoreFilteringType.equals(VQScoreFilteringType.GENOTYPE)) {
         }
 
         if (vqScoreFilteringType.equals(VQScoreFilteringType.GENOTYPE)) {
-            extraHeaderLines.add(new VCFFormatHeaderLine("FT", 1, VCFHeaderLineType.String, "Sample Genotype Filter Field"));
+            extraHeaderLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
        }
 
         if (emitPLs) {
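The last two diffs are two halves of one change: swapping the hand-rolled FT header line for htsjdk's standard definition changes FT's `Number` from `1` to `.` (as the comment in hail_join_vds_vcfs.py notes), so Hail imports FT as an array of strings, which the `annotate_entries` call unwraps again. In plain Python terms, assuming the arrays always hold a single element as that comment states:

    # before: ##FORMAT=<ID=FT,Number=1,Type=String,Description="Sample Genotype Filter Field">
    # after:  ##FORMAT=<ID=FT,Number=.,Type=String,...>  (exact Description comes from htsjdk)
    def unwrap_ft(ft):
        # missing stays missing; a defined one-element array becomes its element
        return None if ft is None else ft[0]

    assert unwrap_ft(None) is None
    assert unwrap_ft(["OUTSIDE_OF_TARGETS"]) == "OUTSIDE_OF_TARGETS"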