diff --git a/.dockstore.yml b/.dockstore.yml
index de11d9eed1a..bd7281690ae 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -141,7 +141,7 @@ workflows:
       branches:
         - master
         - ah_var_store
-        - gg_VS-785_RegenerateTheVATTsv
+        - mc_nirvana_reference_disk
   - name: GvsCreateVATFilesFromBigQuery
     subclass: WDL
     primaryDescriptorPath: /scripts/variantstore/variant_annotations_table/GvsCreateVATFilesFromBigQuery.wdl
@@ -252,6 +252,14 @@ workflows:
       branches:
         - master
         - ah_var_store
+  - name: NirvanaReferenceDisk
+    subclass: WDL
+    primaryDescriptorPath: /scripts/variantstore/wdl/NirvanaReferenceDisk.wdl
+    filters:
+      branches:
+        - master
+        - ah_var_store
+        - mc_nirvana_reference_disk
   - name: MitochondriaPipeline
     subclass: WDL
     primaryDescriptorPath: /scripts/mitochondria_m2_wdl/MitochondriaPipeline.wdl
diff --git a/scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl b/scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl
index 4786794b84e..4a6a17aff5a 100644
--- a/scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl
+++ b/scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl
@@ -28,8 +28,6 @@ workflow GvsCreateVATfromVDS {
     File reference_dict = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict"
     File reference_index = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai"
 
-    File nirvana_data_directory = "gs://gvs_quickstart_storage/Nirvana/Nirvana-references-2022-10-07.tgz"
-
     call MakeSubpopulationFilesAndReadSchemaFiles {
         input:
             input_ancestry_file = ancestry_file
@@ -85,7 +83,6 @@ workflow GvsCreateVATfromVDS {
         input:
             input_vcf = StripCustomAnnotationsFromSitesOnlyVCF.output_vcf,
             output_annotated_file_name = "${vcf_filename}_annotated",
-            nirvana_data_tar = nirvana_data_directory,
             custom_annotations_file = StripCustomAnnotationsFromSitesOnlyVCF.output_custom_annotations_file,
     }
 
@@ -321,8 +318,63 @@ task AnnotateVCF {
     input {
         File input_vcf
         String output_annotated_file_name
-        File nirvana_data_tar
         File custom_annotations_file
+
+        # Callers do not need to supply these "inputs". They are listed here so that Cromwell will connect the
+        # Nirvana reference image and create the appropriate symlinks.
+        # Specific Nirvana references sourced from firecloud-develop cromwell-reference-images.conf.
+        Array[File] references = [
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/phyloP_hg38.npd",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/phyloP_hg38.npd.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_gene_scores_2.1.nga",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_Phase_3_v3_plus.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_Phase_3_v3_plus_refMinor.rma.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/References/Homo_sapiens.GRCh38.Nirvana.dat",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/Cache/GRCh38/Both.sift.ndb",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinGen_Dosage_Sensitivity_Map_20220512.nsi",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/SpliceAi_1.3.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/dbSNP_151_globalMinor.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/TOPMed_freeze_5.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/Gerp_20110522.gsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/Cache/GRCh38/Both.transcripts.ndb",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/MultiZ100Way_20171006.pcs",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/dbSNP_155.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinVar_20220505.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/dbSNP_155.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/Cache/GRCh38/Both.polyphen.ndb",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/TOPMed_freeze_5.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_SV_2.1.nsi",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinGen_disease_validity_curations_20220512.nga",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/OMIM_20220516.nga",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/REVEL_20200205.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/GME_20160618.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/DANN_20200205.gsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/PrimateAI_0.2.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinVar_20220505.nsi",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/GME_20160618.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_Phase_3_v3_plus.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_LCR_2.1.lcr",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/PrimateAI_0.2.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinGen_Dosage_Sensitivity_Map_20220512.nga",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/MITOMAP_SV_20200819.nsi",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/REVEL_20200205.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinGen_20160414.nsi",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/SpliceAi_1.3.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/DECIPHER_201509.nsi",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/DANN_20200205.gsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_3.1.2.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/COSMIC_GeneFusions_94.gfj",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/dbSNP_151_globalMinor.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/FusionCatcher_1.33.gfs",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_(SV)_Phase_3_v5a.nsi",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/MITOMAP_20200819.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinVar_20220505.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_Phase_3_v3_plus_refMinor.rma",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/Gerp_20110522.gsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/MITOMAP_20200819.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_3.1.2.nsa"
+        ]
+
     }
 
     File monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh"
@@ -338,8 +390,6 @@ task AnnotateVCF {
     String path_reference = "/References/Homo_sapiens.GRCh38.Nirvana.dat"
 
     command <<<
-        # set -e
-
         bash ~{monitoring_script} > monitoring.log &
 
         # Prepend date, time and pwd to xtrace log entries.
@@ -349,10 +399,7 @@ task AnnotateVCF {
 
         # =======================================
         # Handle our data sources:
-        echo "Extracting annotation data sources tar/gzip file..."
-        mkdir datasources_dir
-        tar zxvf ~{nirvana_data_tar} -C datasources_dir ## --strip-components 2
-        DATA_SOURCES_FOLDER="$PWD/datasources_dir/references"
+        DATA_SOURCES_FOLDER="/cromwell_root/broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1"
 
         # =======================================
         echo "Creating custom annotations"
diff --git a/scripts/variantstore/wdl/NirvanaReferenceDisk.wdl b/scripts/variantstore/wdl/NirvanaReferenceDisk.wdl
new file mode 100644
index 00000000000..0b2a6fe50a8
--- /dev/null
+++ b/scripts/variantstore/wdl/NirvanaReferenceDisk.wdl
@@ -0,0 +1,92 @@
+version 1.0
+
+workflow NirvanaReferenceDisk {
+    input {
+    }
+
+    call ReferenceDiskTask {
+    }
+
+    output {
+        File stdout = ReferenceDiskTask.stdout
+    }
+}
+
+
+# Proof of concept for using the Nirvana reference image for VAT annotations in AoU Echo+ callsets.
+task ReferenceDiskTask {
+    input {
+        # Callers do not need to supply these "inputs". They are declared only so that Cromwell will connect the
+        # Nirvana reference image and create the appropriate symlinks (the command expects them under /cromwell_root).
+        # Specific Nirvana references sourced from firecloud-develop cromwell-reference-images.conf.
+        Array[File] references = [
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/phyloP_hg38.npd",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/phyloP_hg38.npd.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_gene_scores_2.1.nga",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_Phase_3_v3_plus.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_Phase_3_v3_plus_refMinor.rma.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/References/Homo_sapiens.GRCh38.Nirvana.dat",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/Cache/GRCh38/Both.sift.ndb",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinGen_Dosage_Sensitivity_Map_20220512.nsi",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/SpliceAi_1.3.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/dbSNP_151_globalMinor.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/TOPMed_freeze_5.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/Gerp_20110522.gsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/Cache/GRCh38/Both.transcripts.ndb",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/MultiZ100Way_20171006.pcs",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/dbSNP_155.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinVar_20220505.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/dbSNP_155.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/Cache/GRCh38/Both.polyphen.ndb",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/TOPMed_freeze_5.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_SV_2.1.nsi",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinGen_disease_validity_curations_20220512.nga",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/OMIM_20220516.nga",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/REVEL_20200205.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/GME_20160618.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/DANN_20200205.gsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/PrimateAI_0.2.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinVar_20220505.nsi",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/GME_20160618.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_Phase_3_v3_plus.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_LCR_2.1.lcr",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/PrimateAI_0.2.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinGen_Dosage_Sensitivity_Map_20220512.nga",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/MITOMAP_SV_20200819.nsi",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/REVEL_20200205.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinGen_20160414.nsi",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/SpliceAi_1.3.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/DECIPHER_201509.nsi",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/DANN_20200205.gsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_3.1.2.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/COSMIC_GeneFusions_94.gfj",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/dbSNP_151_globalMinor.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/FusionCatcher_1.33.gfs",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_(SV)_Phase_3_v5a.nsi",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/MITOMAP_20200819.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinVar_20220505.nsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_Phase_3_v3_plus_refMinor.rma",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/Gerp_20110522.gsa.idx",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/MITOMAP_20200819.nsa",
+            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_3.1.2.nsa"
+        ]
+    }
+
+    command <<<
+        # Prepend date, time and pwd to xtrace log entries.
+        PS4='\D{+%F %T} \w $ '
+        set -o errexit -o nounset -o pipefail -o xtrace
+
+        NIRVANA_REFERENCE_DIR=/cromwell_root/broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1
+        # Print every path under the reference directory; this listing is what the task's stdout captures.
+        find "$NIRVANA_REFERENCE_DIR" -print
+    >>>
+
+    runtime {
+        docker: "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19"
+    }
+    # Expose the captured stdout file itself: read_string() yields the file's contents as a String, not a File.
+    output {
+        File stdout = stdout()
+    }
+}