From 9c6bb49c0a87c9cc1cbd80cd94e39ecca691c52c Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Mon, 3 Jun 2024 15:53:19 -0400 Subject: [PATCH 01/13] change hardcoded reference base path --- .../broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index 14b8a97035..a91d37a9b3 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -40,7 +40,7 @@ workflow ImputationBeagle { scatter (contig_index in range(length(contigs))) { # these are specific to hg38 - contig is format 'chr1' - String reference_filename = reference_panel_path + "hgdp.tgp.gwaspy.merged." + contigs[contig_index] + ".merged.AN_added.bcf.ac2" + String reference_filename = reference_panel_path + "sim.5k." + contigs[contig_index] String genetic_map_filename = genetic_maps_path + "plink." + contigs[contig_index] + ".GRCh38.withchr.map" ReferencePanelContig referencePanelContig = { From 5491d1461d665377bff2d0d427678df926c1b46a Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Mon, 3 Jun 2024 22:08:59 -0400 Subject: [PATCH 02/13] change to 6k samples --- .../broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index a91d37a9b3..f37b7240a6 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -40,7 +40,7 @@ workflow ImputationBeagle { scatter (contig_index in range(length(contigs))) { # these are specific to hg38 - contig is format 'chr1' - String reference_filename = reference_panel_path + "sim.5k." + contigs[contig_index] + String reference_filename = reference_panel_path + "sim.6k." + contigs[contig_index] String genetic_map_filename = genetic_maps_path + "plink." + contigs[contig_index] + ".GRCh38.withchr.map" ReferencePanelContig referencePanelContig = { From d205ccc95eda00975f36fd5983874d5e636c2f96 Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Mon, 3 Jun 2024 22:24:56 -0400 Subject: [PATCH 03/13] change to 3k samples --- .../broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index f37b7240a6..77aaeea05b 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -40,7 +40,7 @@ workflow ImputationBeagle { scatter (contig_index in range(length(contigs))) { # these are specific to hg38 - contig is format 'chr1' - String reference_filename = reference_panel_path + "sim.6k." + contigs[contig_index] + String reference_filename = reference_panel_path + "sim.3k." + contigs[contig_index] String genetic_map_filename = genetic_maps_path + "plink." + contigs[contig_index] + ".GRCh38.withchr.map" ReferencePanelContig referencePanelContig = { From c373c6821a2b94e14d1bd8dddf9b76753a49530c Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Tue, 4 Jun 2024 10:25:14 -0400 Subject: [PATCH 04/13] update liftoverVCFs wdl to have configurable memory --- .../arrays/imputation_beagle/ImputationBeaglePreChunk.wdl | 2 +- pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index 77aaeea05b..72e6608c35 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -40,7 +40,7 @@ workflow ImputationBeagle { scatter (contig_index in range(length(contigs))) { # these are specific to hg38 - contig is format 'chr1' - String reference_filename = reference_panel_path + "sim.3k." + contigs[contig_index] + String reference_filename = reference_panel_path + "sim.3k.hg38." + contigs[contig_index] String genetic_map_filename = genetic_maps_path + "plink." + contigs[contig_index] + ".GRCh38.withchr.map" ReferencePanelContig referencePanelContig = { diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl index 5db6a2481c..d740c873ac 100644 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -57,6 +57,7 @@ task LiftOverArrays { Int max_retries Int preemptible_tries Int min_disk_size + Int mem_gb = 64 } Int disk_size_from_file = (ceil(size(input_vcf, "GiB") + size(liftover_chain, "GiB") + size(reference_fasta, "GiB")) * 2) + 20 @@ -84,7 +85,7 @@ task LiftOverArrays { runtime { docker: docker - memory: "16 GiB" + memory: "~{mem_gb} GiB" cpu: "1" disks: "local-disk ~{disk_size} HDD" maxRetries: max_retries From 270652ed09dbd2a85e8ad5bd256a2d7cb572c891 Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Tue, 4 Jun 2024 11:06:15 -0400 Subject: [PATCH 05/13] set java initial and max heap size --- pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl index d740c873ac..3cfa393b76 100644 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -59,6 +59,8 @@ task LiftOverArrays { Int min_disk_size Int mem_gb = 64 } + Int command_mem_gb = mem_gb - 2 + Int max_heap_gb = mem_gb - 1 Int disk_size_from_file = (ceil(size(input_vcf, "GiB") + size(liftover_chain, "GiB") + size(reference_fasta, "GiB")) * 2) + 20 Int disk_size = if ( disk_size_from_file > min_disk_size ) then disk_size_from_file else min_disk_size @@ -67,7 +69,7 @@ task LiftOverArrays { command <<< set -euo pipefail - gatk --java-options "-Xms4g -Xmx15g" \ + gatk --java-options "-Xms~{command_mem_gb}g -Xmx~{max_heap_gb}g" \ LiftoverVcf \ --INPUT ~{input_vcf} \ --OUTPUT ~{output_basename}.liftedover.vcf \ From f727155e039a71c88876c2fe79ce9c97fda3ea1d Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Tue, 4 Jun 2024 13:10:21 -0400 Subject: [PATCH 06/13] try less records in memory --- pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl index 3cfa393b76..e866aba439 100644 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -76,7 +76,7 @@ task LiftOverArrays { --CHAIN ~{liftover_chain} \ --REJECT ~{output_basename}.rejected_variants.vcf \ --REFERENCE_SEQUENCE ~{reference_fasta} \ - --MAX_RECORDS_IN_RAM 100000 + --MAX_RECORDS_IN_RAM 10000 # compress vcf - this creates a file with .gz suffix bgzip ~{output_basename}.liftedover.vcf From 2e5191f32f2dcada01ca19361951f4248a458b4d Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Tue, 4 Jun 2024 15:19:39 -0400 Subject: [PATCH 07/13] change reference panel path to 5k sim samples --- .../broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index 72e6608c35..7d2e7d6c5a 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -40,7 +40,7 @@ workflow ImputationBeagle { scatter (contig_index in range(length(contigs))) { # these are specific to hg38 - contig is format 'chr1' - String reference_filename = reference_panel_path + "sim.3k.hg38." + contigs[contig_index] + String reference_filename = reference_panel_path + "sim.5k.hg38." + contigs[contig_index] String genetic_map_filename = genetic_maps_path + "plink." + contigs[contig_index] + ".GRCh38.withchr.map" ReferencePanelContig referencePanelContig = { From 5f07c3bc83f8a0b04e87a0f5eb631f81f9732b30 Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Tue, 4 Jun 2024 18:05:33 -0400 Subject: [PATCH 08/13] dont need hg38 prefix --- .../broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index 7d2e7d6c5a..a91d37a9b3 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -40,7 +40,7 @@ workflow ImputationBeagle { scatter (contig_index in range(length(contigs))) { # these are specific to hg38 - contig is format 'chr1' - String reference_filename = reference_panel_path + "sim.5k.hg38." + contigs[contig_index] + String reference_filename = reference_panel_path + "sim.5k." + contigs[contig_index] String genetic_map_filename = genetic_maps_path + "plink." + contigs[contig_index] + ".GRCh38.withchr.map" ReferencePanelContig referencePanelContig = { From ca151140f2faaaf4eb4078d929d0c7b4abe91044 Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Wed, 5 Jun 2024 11:28:08 -0400 Subject: [PATCH 09/13] add optional error count override for testing --- .../arrays/imputation_beagle/ImputationBeaglePreChunk.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index a91d37a9b3..8dcca56d88 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -24,6 +24,8 @@ workflow ImputationBeagle { String bref3_suffix = ".bref3" String gatk_docker = "broadinstitute/gatk-nightly:2024-06-06-4.5.0.0-36-g2a420e483-NIGHTLY-SNAPSHOT" + + Int? error_count_override } call tasks.CountSamples { @@ -107,7 +109,7 @@ workflow ImputationBeagle { # if any chunk for any chromosome fail CheckChunks, then we will not impute run any task in the next scatter, # namely phasing and imputing which would be the most costly to throw away - Int n_failed_chunks_int = read_int(StoreContigLevelChunksInfo.n_failed_chunks) + Int n_failed_chunks_int = select_first([error_count_override, read_int(StoreContigLevelChunksInfo.n_failed_chunks)]) call tasks.ErrorWithMessageIfErrorCountNotZero as FailQCNChunks { input: errorCount = n_failed_chunks_int, From 3d2d47951da20fa41d196c60072fc026f29443a4 Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Thu, 6 Jun 2024 17:21:09 -0400 Subject: [PATCH 10/13] change reference panel to 10k sim panel --- .../broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index 8dcca56d88..515a126305 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -42,7 +42,7 @@ workflow ImputationBeagle { scatter (contig_index in range(length(contigs))) { # these are specific to hg38 - contig is format 'chr1' - String reference_filename = reference_panel_path + "sim.5k." + contigs[contig_index] + String reference_filename = reference_panel_path + "sim.10k." + contigs[contig_index] String genetic_map_filename = genetic_maps_path + "plink." + contigs[contig_index] + ".GRCh38.withchr.map" ReferencePanelContig referencePanelContig = { From a21204a189287a1e09fda01f97a100c2d2e4529c Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Fri, 7 Jun 2024 16:25:48 -0400 Subject: [PATCH 11/13] revert lifovervcf changes and change name of reference file basename --- .../imputation_beagle/ImputationBeaglePreChunk.wdl | 6 +++--- .../broad/arrays/imputation_beagle/LiftoverVcfs.wdl | 11 ++++------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index 515a126305..d4d14a7196 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -42,12 +42,12 @@ workflow ImputationBeagle { scatter (contig_index in range(length(contigs))) { # these are specific to hg38 - contig is format 'chr1' - String reference_filename = reference_panel_path + "sim.10k." + contigs[contig_index] + String reference_basename = reference_panel_path + "sim.10k." + contigs[contig_index] String genetic_map_filename = genetic_maps_path + "plink." + contigs[contig_index] + ".GRCh38.withchr.map" ReferencePanelContig referencePanelContig = { - "interval_list": reference_filename + interval_list_suffix, - "bref3": reference_filename + bref3_suffix, + "interval_list": reference_basename + interval_list_suffix, + "bref3": reference_basename + bref3_suffix, "contig": contigs[contig_index], "genetic_map": genetic_map_filename } diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl index e866aba439..fb6e0b3c9a 100644 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -57,26 +57,23 @@ task LiftOverArrays { Int max_retries Int preemptible_tries Int min_disk_size - Int mem_gb = 64 } - Int command_mem_gb = mem_gb - 2 - Int max_heap_gb = mem_gb - 1 Int disk_size_from_file = (ceil(size(input_vcf, "GiB") + size(liftover_chain, "GiB") + size(reference_fasta, "GiB")) * 2) + 20 Int disk_size = if ( disk_size_from_file > min_disk_size ) then disk_size_from_file else min_disk_size - + command <<< set -euo pipefail - gatk --java-options "-Xms~{command_mem_gb}g -Xmx~{max_heap_gb}g" \ + gatk --java-options "-Xms4g -Xmx15g" \ LiftoverVcf \ --INPUT ~{input_vcf} \ --OUTPUT ~{output_basename}.liftedover.vcf \ --CHAIN ~{liftover_chain} \ --REJECT ~{output_basename}.rejected_variants.vcf \ --REFERENCE_SEQUENCE ~{reference_fasta} \ - --MAX_RECORDS_IN_RAM 10000 + --MAX_RECORDS_IN_RAM 100000 # compress vcf - this creates a file with .gz suffix bgzip ~{output_basename}.liftedover.vcf @@ -87,7 +84,7 @@ task LiftOverArrays { runtime { docker: docker - memory: "~{mem_gb} GiB" + memory: "16 GiB" cpu: "1" disks: "local-disk ~{disk_size} HDD" maxRetries: max_retries From afe34992c5fd2a4acd9f9c1bd94b2a7cc3cff992 Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Sun, 9 Jun 2024 14:01:25 -0400 Subject: [PATCH 12/13] spacing --- pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl index fb6e0b3c9a..a4aa93d0dd 100644 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -61,8 +61,7 @@ task LiftOverArrays { Int disk_size_from_file = (ceil(size(input_vcf, "GiB") + size(liftover_chain, "GiB") + size(reference_fasta, "GiB")) * 2) + 20 Int disk_size = if ( disk_size_from_file > min_disk_size ) then disk_size_from_file else min_disk_size - - + command <<< set -euo pipefail From c49363fa5722bd59b09782d6f4fbf6c16e51f041 Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Mon, 10 Jun 2024 13:14:57 -0400 Subject: [PATCH 13/13] rename reference base prefix variable --- .../arrays/imputation_beagle/ImputationBeaglePreChunk.wdl | 4 ++-- pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl index d4d14a7196..7d79f9c754 100644 --- a/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl +++ b/pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl @@ -15,7 +15,7 @@ workflow ImputationBeagle { File ref_dict # for reheadering / adding contig lengths in the header of the ouptut VCF, and calculating contig lengths Array[String] contigs - String reference_panel_path # path to the bucket where the reference panel files are stored for all contigs + String reference_panel_path_prefix # path + file prefix to the bucket where the reference panel files are stored for all contigs String genetic_maps_path # path to the bucket where genetic maps are stored for all contigs String output_basename # the basename for intermediate and output files @@ -42,7 +42,7 @@ workflow ImputationBeagle { scatter (contig_index in range(length(contigs))) { # these are specific to hg38 - contig is format 'chr1' - String reference_basename = reference_panel_path + "sim.10k." + contigs[contig_index] + String reference_basename = reference_panel_path_prefix + "." + contigs[contig_index] String genetic_map_filename = genetic_maps_path + "plink." + contigs[contig_index] + ".GRCh38.withchr.map" ReferencePanelContig referencePanelContig = { diff --git a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl index a4aa93d0dd..47d3392662 100644 --- a/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl +++ b/pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl @@ -61,7 +61,7 @@ task LiftOverArrays { Int disk_size_from_file = (ceil(size(input_vcf, "GiB") + size(liftover_chain, "GiB") + size(reference_fasta, "GiB")) * 2) + 20 Int disk_size = if ( disk_size_from_file > min_disk_size ) then disk_size_from_file else min_disk_size - + command <<< set -euo pipefail