Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
mcovarr committed Jan 24, 2023
1 parent 6a56dea commit 9272ab1
Show file tree
Hide file tree
Showing 3 changed files with 158 additions and 11 deletions.
10 changes: 9 additions & 1 deletion .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ workflows:
branches:
- master
- ah_var_store
- gg_VS-785_RegenerateTheVATTsv
- mc_nirvana_reference_disk
- name: GvsCreateVATFilesFromBigQuery
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/variant_annotations_table/GvsCreateVATFilesFromBigQuery.wdl
Expand Down Expand Up @@ -252,6 +252,14 @@ workflows:
branches:
- master
- ah_var_store
- name: NirvanaReferenceDisk
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/NirvanaReferenceDisk.wdl
filters:
branches:
- master
- ah_var_store
- mc_nirvana_reference_disk
- name: MitochondriaPipeline
subclass: WDL
primaryDescriptorPath: /scripts/mitochondria_m2_wdl/MitochondriaPipeline.wdl
Expand Down
67 changes: 57 additions & 10 deletions scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ workflow GvsCreateVATfromVDS {
File reference_dict = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict"
File reference_index = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai"

File nirvana_data_directory = "gs://gvs_quickstart_storage/Nirvana/Nirvana-references-2022-10-07.tgz"

call MakeSubpopulationFilesAndReadSchemaFiles {
input:
input_ancestry_file = ancestry_file
Expand Down Expand Up @@ -85,7 +83,6 @@ workflow GvsCreateVATfromVDS {
input:
input_vcf = StripCustomAnnotationsFromSitesOnlyVCF.output_vcf,
output_annotated_file_name = "${vcf_filename}_annotated",
nirvana_data_tar = nirvana_data_directory,
custom_annotations_file = StripCustomAnnotationsFromSitesOnlyVCF.output_custom_annotations_file,
}

Expand Down Expand Up @@ -321,8 +318,63 @@ task AnnotateVCF {
input {
File input_vcf
String output_annotated_file_name
File nirvana_data_tar
File custom_annotations_file

# These "inputs" do not need to be provided by the caller; they are listed here so that Cromwell will connect the
# Nirvana reference image and create the appropriate symlinks.
# The specific Nirvana references are sourced from firecloud-develop cromwell-reference-images.conf.
Array[File] references = [
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/phyloP_hg38.npd",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/phyloP_hg38.npd.idx",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_gene_scores_2.1.nga",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_Phase_3_v3_plus.nsa",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_Phase_3_v3_plus_refMinor.rma.idx",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/References/Homo_sapiens.GRCh38.Nirvana.dat",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/Cache/GRCh38/Both.sift.ndb",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinGen_Dosage_Sensitivity_Map_20220512.nsi",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/SpliceAi_1.3.nsa",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/dbSNP_151_globalMinor.nsa",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/TOPMed_freeze_5.nsa",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/Gerp_20110522.gsa",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/Cache/GRCh38/Both.transcripts.ndb",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/MultiZ100Way_20171006.pcs",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/dbSNP_155.nsa.idx",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinVar_20220505.nsa",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/dbSNP_155.nsa",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/Cache/GRCh38/Both.polyphen.ndb",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/TOPMed_freeze_5.nsa.idx",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_SV_2.1.nsi",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinGen_disease_validity_curations_20220512.nga",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/OMIM_20220516.nga",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/REVEL_20200205.nsa",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/GME_20160618.nsa.idx",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/DANN_20200205.gsa.idx",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/PrimateAI_0.2.nsa",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinVar_20220505.nsi",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/GME_20160618.nsa",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_Phase_3_v3_plus.nsa.idx",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_LCR_2.1.lcr",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/PrimateAI_0.2.nsa.idx",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinGen_Dosage_Sensitivity_Map_20220512.nga",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/MITOMAP_SV_20200819.nsi",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/REVEL_20200205.nsa.idx",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinGen_20160414.nsi",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/SpliceAi_1.3.nsa.idx",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/DECIPHER_201509.nsi",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/DANN_20200205.gsa",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_3.1.2.nsa.idx",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/COSMIC_GeneFusions_94.gfj",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/dbSNP_151_globalMinor.nsa.idx",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/FusionCatcher_1.33.gfs",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_(SV)_Phase_3_v5a.nsi",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/MITOMAP_20200819.nsa.idx",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinVar_20220505.nsa.idx",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_Phase_3_v3_plus_refMinor.rma",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/Gerp_20110522.gsa.idx",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/MITOMAP_20200819.nsa",
"gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_3.1.2.nsa"
]

}

File monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh"
Expand All @@ -338,8 +390,6 @@ task AnnotateVCF {
String path_reference = "/References/Homo_sapiens.GRCh38.Nirvana.dat"

command <<<
# set -e

bash ~{monitoring_script} > monitoring.log &

# Prepend date, time and pwd to xtrace log entries.
Expand All @@ -349,10 +399,7 @@ task AnnotateVCF {
# =======================================
# Handle our data sources:

echo "Extracting annotation data sources tar/gzip file..."
mkdir datasources_dir
tar zxvf ~{nirvana_data_tar} -C datasources_dir ## --strip-components 2
DATA_SOURCES_FOLDER="$PWD/datasources_dir/references"
DATA_SOURCES_FOLDER="/cromwell_root/broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1"

# =======================================
echo "Creating custom annotations"
Expand Down
92 changes: 92 additions & 0 deletions scripts/variantstore/wdl/NirvanaReferenceDisk.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
version 1.0

# Single-step workflow: runs ReferenceDiskTask and surfaces the file it reports as its stdout output.
# Takes no inputs; the task supplies its own defaults.
workflow NirvanaReferenceDisk {
    input {}

    # No call inputs are passed; the task's declared defaults are used as-is.
    call ReferenceDiskTask

    output {
        # Forwarded unchanged from the task's `stdout` output.
        File stdout = ReferenceDiskTask.stdout
    }
}


# Proof of concept for using the Nirvana reference image for VAT annotations in AoU Echo+ callsets.
#
# The task recursively lists everything found under the hard-coded Nirvana reference directory and
# returns the captured standard output of that listing as a file.
#
# Inputs:
#   references - defaulted list of Nirvana 3.18.1 reference files; callers are not expected to override it.
# Outputs:
#   stdout     - the file holding the task's captured standard output (the `find` listing).
task ReferenceDiskTask {
    input {
        # These "inputs" do not need to be provided by the caller; they are listed here so that Cromwell will
        # connect the Nirvana reference image and create the appropriate symlinks.
        # Specific Nirvana references sourced from firecloud-develop cromwell-reference-images.conf.
        Array[File] references = [
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/phyloP_hg38.npd",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/phyloP_hg38.npd.idx",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_gene_scores_2.1.nga",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_Phase_3_v3_plus.nsa",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_Phase_3_v3_plus_refMinor.rma.idx",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/References/Homo_sapiens.GRCh38.Nirvana.dat",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/Cache/GRCh38/Both.sift.ndb",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinGen_Dosage_Sensitivity_Map_20220512.nsi",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/SpliceAi_1.3.nsa",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/dbSNP_151_globalMinor.nsa",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/TOPMed_freeze_5.nsa",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/Gerp_20110522.gsa",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/Cache/GRCh38/Both.transcripts.ndb",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/MultiZ100Way_20171006.pcs",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/dbSNP_155.nsa.idx",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinVar_20220505.nsa",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/dbSNP_155.nsa",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/Cache/GRCh38/Both.polyphen.ndb",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/TOPMed_freeze_5.nsa.idx",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_SV_2.1.nsi",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinGen_disease_validity_curations_20220512.nga",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/OMIM_20220516.nga",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/REVEL_20200205.nsa",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/GME_20160618.nsa.idx",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/DANN_20200205.gsa.idx",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/PrimateAI_0.2.nsa",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinVar_20220505.nsi",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/GME_20160618.nsa",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_Phase_3_v3_plus.nsa.idx",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_LCR_2.1.lcr",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/PrimateAI_0.2.nsa.idx",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinGen_Dosage_Sensitivity_Map_20220512.nga",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/MITOMAP_SV_20200819.nsi",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/REVEL_20200205.nsa.idx",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinGen_20160414.nsi",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/SpliceAi_1.3.nsa.idx",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/DECIPHER_201509.nsi",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/DANN_20200205.gsa",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_3.1.2.nsa.idx",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/COSMIC_GeneFusions_94.gfj",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/dbSNP_151_globalMinor.nsa.idx",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/FusionCatcher_1.33.gfs",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_(SV)_Phase_3_v5a.nsi",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/MITOMAP_20200819.nsa.idx",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/ClinVar_20220505.nsa.idx",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/1000_Genomes_Project_Phase_3_v3_plus_refMinor.rma",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/Gerp_20110522.gsa.idx",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/MITOMAP_20200819.nsa",
            "gs://broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1/SupplementaryAnnotation/GRCh38/gnomAD_3.1.2.nsa"
        ]
    }

    command <<<
        # Prepend date, time and pwd to xtrace log entries.
        PS4='\D{+%F %T} \w $ '
        set -o errexit -o nounset -o pipefail -o xtrace

        NIRVANA_REFERENCE_DIR=/cromwell_root/broad-public-datasets/gvs/vat-annotations/Nirvana/3.18.1

        # Print every path under the reference directory; with errexit set, a missing directory fails the task.
        # The variable is quoted so the path is always passed to find as a single argument.
        find "$NIRVANA_REFERENCE_DIR" -print
    >>>

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19"
    }

    output {
        # stdout() already yields the File holding the captured standard output. Wrapping it in read_string()
        # would turn the listing's contents into a String and then coerce that text into a (bogus) file path.
        File stdout = stdout()
    }
}

0 comments on commit 9272ab1

Please sign in to comment.