
VS-757 - Use JASIX to make sub-jsons of annotated output of Nirvana (#8133)

* Use Nirvana's Jasix tool to split the annotated JSON into sub-JSONs that are more tractable for our Python scripts (see the sketch below).
* No longer copy the annotations file to the output bucket for debugging.
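
In outline, the AnnotateVCF task now runs Jasix twice over the single Nirvana JSON to pull the genes and positions sections out into separate files, and the downstream prep tasks read those sub-JSONs instead of the full annotation file. A minimal Python sketch of the split step (file names are illustrative; the Jasix flags mirror the ones used in the WDL below):

    # Split the single Nirvana annotation JSON into section-level sub-JSONs with Jasix.
    # Jasix reads the .jsi index that Nirvana writes next to the annotated .json.gz.
    import subprocess

    annotated_json = "annotated.json.gz"  # illustrative name for the full Nirvana output
    sub_jsons = {
        "genes": "annotated.genes.json.gz",          # consumed by PrepGenesAnnotationJson
        "positions": "annotated.positions.json.gz",  # consumed by PrepVtAnnotationJson
    }

    for section, out_file in sub_jsons.items():
        subprocess.run(
            ["dotnet", "/Nirvana/Jasix.dll",
             "--in", annotated_json,
             "--section", section,
             "--out", out_file],
            check=True)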
gbggrant authored Dec 22, 2022
1 parent 080d66a commit 10a737c
Showing 4 changed files with 50 additions and 35 deletions.
1 change: 0 additions & 1 deletion .dockstore.yml
@@ -141,7 +141,6 @@ workflows:
branches:
- master
- ah_var_store
- rc-vs-651-vat-from-vds
- name: GvsValidateVat
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/variant_annotations_table/GvsValidateVAT.wdl
51 changes: 30 additions & 21 deletions scripts/variantstore/wdl/GvsCreateVATfromVDS.wdl
@@ -29,8 +29,6 @@ workflow GvsCreateVATfromVDS {

File nirvana_data_directory = "gs://gvs_quickstart_storage/Nirvana/Nirvana-references-2022-10-07.tgz"

## TODO: where do we need to validate that there are no hemis?
call MakeSubpopulationFilesAndReadSchemaFiles {
input:
input_ancestry_file = ancestry_file
@@ -93,14 +91,14 @@

call PrepVtAnnotationJson {
input:
annotation_json = AnnotateVCF.annotation_json,
positions_annotation_json = AnnotateVCF.positions_annotation_json,
output_file_suffix = "${vcf_filename}.json.gz",
output_path = output_path,
}

call PrepGenesAnnotationJson {
input:
annotation_json = AnnotateVCF.annotation_json,
genes_annotation_json = AnnotateVCF.genes_annotation_json,
output_file_suffix = "${vcf_filename}.json.gz",
output_path = output_path,
}
@@ -176,7 +174,7 @@ task MakeSubpopulationFilesAndReadSchemaFiles {
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:VS-561_var_store_2022_12_08"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_12_22"
memory: "1 GB"
preemptible: 3
cpu: "1"
@@ -221,7 +219,7 @@ task StripCustomAnnotationsFromSitesOnlyVCF {
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:VS-561_var_store_2022_12_08"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_12_22"
memory: "7 GiB"
cpu: "2"
preemptible: 3
@@ -265,7 +263,8 @@ task RemoveDuplicatesFromSitesOnlyVCF {
bcftools view --threads 4 -i 'REF~"N"' -O u sites_only.bcf | bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > track_dropped.tsv

echo_date "VAT: filter out sites with N's in the reference AND sites with AC=0"
# TODO - NOTE - we are NOT tracking the sites with AC=0 - should we? For Lee (Rori is checking)
## NOTE: Sites that were filtered out because of AC=0 are not recorded in the 'track_dropped.tsv' file, but can be
## determined by examining the sites-only VCF provided to this WDL.
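## For example (illustrative only, not run by this task), the dropped AC=0 sites
## could be listed from the input sites-only VCF with:
##   bcftools query -i 'AC=0' -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' <input_sites_only_vcf>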
bcftools view --threads 4 -e 'REF~"N" || AC=0' -O b sites_only.bcf -o filtered_sites_only.bcf
rm sites_only.bcf

@@ -331,9 +330,11 @@ task AnnotateVCF {
File monitoring_script = "gs://gvs_quickstart_storage/cromwell_monitoring_script.sh"
}
String annotation_json_name = output_annotated_file_name + ".json.gz"
String annotation_json_name_jsi = annotation_json_name + ".jsi"
String gene_annotation_json_name = output_annotated_file_name + ".genes.json.gz"
String positions_annotation_json_name = output_annotated_file_name + ".positions.json.gz"
String nirvana_location = "/Nirvana/Nirvana.dll"
String custom_creation_location = "/Nirvana/SAUtils.dll"
String jasix_location = "/Nirvana/Jasix.dll"
String path = "/Cache/GRCh38/Both"
String path_supplementary_annotations = "/SupplementaryAnnotation/GRCh38"
String path_reference = "/References/Homo_sapiens.GRCh38.Nirvana.dat"
@@ -370,7 +371,6 @@ task AnnotateVCF {
# =======================================
# Create Nirvana annotations:


dotnet ~{nirvana_location} \
-i ~{input_vcf} \
-c $DATA_SOURCES_FOLDER~{path} \
@@ -379,6 +379,19 @@
-r $DATA_SOURCES_FOLDER~{path_reference} \
-o ~{output_annotated_file_name}

# https://illumina.github.io/NirvanaDocumentation/introduction/parsing-json#jasix
# Parse out the Genes section into a separate annotated json
dotnet ~{jasix_location} \
--in ~{annotation_json_name} \
--section genes \
--out ~{gene_annotation_json_name}

# Parse out the Positions section into a separate annotated json
dotnet ~{jasix_location} \
--in ~{annotation_json_name} \
--section positions \
--out ~{positions_annotation_json_name}

>>>
# ------------------------------------------------
# Runtime settings:
@@ -392,15 +405,15 @@ task AnnotateVCF {
# ------------------------------------------------
# Outputs:
output {
File annotation_json = "~{annotation_json_name}"
File annotation_json_jsi = "~{annotation_json_name_jsi}"
File genes_annotation_json = "~{gene_annotation_json_name}"
File positions_annotation_json = "~{positions_annotation_json_name}"
File monitoring_log = "monitoring.log"
}
}

task PrepVtAnnotationJson {
input {
File annotation_json
File positions_annotation_json
String output_file_suffix
String output_path
}
@@ -409,7 +422,6 @@ task PrepVtAnnotationJson {

String output_vt_json = "vat_vt_bq_load" + output_file_suffix
String output_vt_gcp_path = output_path + 'vt/'
String output_annotations_gcp_path = output_path + 'annotations/'

command <<<
set -o errexit -o nounset -o pipefail -o xtrace
@@ -420,12 +432,9 @@
# Prepend date, time and pwd to xtrace log entries.
PS4='\D{+%F %T} \w $ '

# for debugging purposes only
gsutil cp ~{annotation_json} '~{output_annotations_gcp_path}'

## the annotation jsons are split into the specific VAT schema
python3 /app/create_vt_bqloadjson_from_annotations.py \
--annotated_json ~{annotation_json} \
--annotated_json ~{positions_annotation_json} \
--output_vt_json ~{output_vt_json}

gsutil cp ~{output_vt_json} '~{output_vt_gcp_path}'
@@ -434,7 +443,7 @@
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:VS-561_var_store_2022_12_08"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_12_22"
memory: "7 GB"
preemptible: 3
cpu: "1"
@@ -451,7 +460,7 @@ task PrepVtAnnotationJson {

task PrepGenesAnnotationJson {
input {
File annotation_json
File genes_annotation_json
String output_file_suffix
String output_path
}
@@ -472,7 +481,7 @@

## the annotation jsons are split into the specific VAT schema
python3 /app/create_genes_bqloadjson_from_annotations.py \
--annotated_json ~{annotation_json} \
--annotated_json ~{genes_annotation_json} \
--output_genes_json ~{output_genes_json}

gsutil cp ~{output_genes_json} '~{output_genes_gcp_path}'
@@ -481,7 +490,7 @@
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:VS-561_var_store_2022_12_08"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_12_22"
memory: "7 GB"
preemptible: 3
cpu: "1"
create_genes_bqloadjson_from_annotations.py
@@ -3,6 +3,7 @@
import gzip
import argparse
import logging
import sys

vat_nirvana_omim_dictionary = {
"omim_phenotypes_id": "mimNumber", # nullable
@@ -19,17 +20,19 @@ def make_genes_json(annotated_json, output_genes_json):
json_data = open(annotated_json, 'rb')

logging.info(f"Loading the genes json data")
genes = ijson.items(json_data, 'genes.item', use_float=True)
genes = ijson.items(json_data, 'item', use_float=True)
logging.info(f"Done loading the genes json data")

gene_count = 0
for gene_line in genes:
gene_count += 1
logging.info(f"gene_line: {gene_line}")
if gene_line.get("omim") != None:
row = {}
row["gene_symbol"] = gene_line.get("name")
omim_line = gene_line["omim"][0]
if len(gene_line.get("omim")) > 1:
print("WARNING: An assumption about the possible count of omim values is incorrect.", gene_line.get("name"),len(gene_line.get("omim")))
logging.warn("WARNING: An assumption about the possible count of omim values is incorrect.", gene_line.get("name"),len(gene_line.get("omim")))
row["gene_omim_id"] = omim_line.get("mimNumber")
if omim_line.get("phenotypes") != None:
phenotypes = omim_line["phenotypes"]
@@ -46,6 +49,9 @@
output_genes_file.close()
json_data.close()

if gene_count == 0:
logging.info(f"ERROR - Found no items in annotated json file: {annotated_json}")
sys.exit(1)

def make_annotation_jsons(annotated_json, output_genes_json):
logging.basicConfig(
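For reference, the ijson prefix change above ('genes.item' to 'item') reflects that the Jasix-extracted genes file is a bare top-level JSON array, whereas the full Nirvana output nests the genes in an array under a top-level "genes" key; the same applies to 'positions.item' in the positions script below. A small sketch of the difference (hypothetical plain-JSON file names, gzip handling omitted):

    import ijson

    # Full Nirvana output: gene objects live in an array under the top-level "genes" key.
    with open("annotated.json", "rb") as f:        # hypothetical name
        for gene in ijson.items(f, "genes.item", use_float=True):
            print(gene.get("name"))

    # Jasix --section genes output: a top-level array of gene objects.
    with open("annotated.genes.json", "rb") as f:  # hypothetical name
        for gene in ijson.items(f, "item", use_float=True):
            print(gene.get("name"))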
create_vt_bqloadjson_from_annotations.py
@@ -5,6 +5,7 @@
import gzip
import argparse
import logging
import sys

vat_nirvana_positions_dictionary = {
"position": "position", # required
@@ -108,21 +109,21 @@
def check_filtering(variant):
# skip any row (with a warning) if no gvsAnnotations exist
if variant.get("gvsAnnotations") == None: # <-- enum since we need this to be in tandem with the custom annotations header / template
print("WARNING: There has been an error in creating custom annotations for AC/AF/AN", variant.get("vid"))
logging.warn("WARNING: There has been an error in creating custom annotations for AC/AF/AN", variant.get("vid"))
return False
# skip any row (with a warning) if the AC value is 0
elif variant["gvsAnnotations"].get("AC") == 0:
print("WARNING: Its AC is 0 so we are dropping this variant", variant.get("vid"))
logging.warn("WARNING: Its AC is 0 so we are dropping this variant", variant.get("vid"))
return False
# skip any row (with a warning) if AC, AN or AF is missing
elif variant["gvsAnnotations"].get("AC") == None:
print("WARNING: There has been an error-- there is no AC value---should AN be 0 for this variant?", variant.get("vid"))
logging.warn("WARNING: There has been an error-- there is no AC value---should AN be 0 for this variant?", variant.get("vid"))
return False
elif variant["gvsAnnotations"].get("AN") == None:
print("WARNING: There has been an error-- there is an AC value---but no AN value", variant.get("vid"))
logging.warn("WARNING: There has been an error-- there is an AC value---but no AN value", variant.get("vid"))
return False
elif variant["gvsAnnotations"].get("AF") == None:
print("WARNING: There has been an error-- there is an AC value---but no AF value", variant.get("vid"))
logging.warn("WARNING: There has been an error-- there is an AC value---but no AF value", variant.get("vid"))
return False
else:
return True
@@ -282,14 +283,11 @@ def make_positions_json(annotated_json, output_json):
else:
json_data = open(annotated_json, 'rb')

positions = ijson.items(json_data, 'positions.item', use_float=True)
positions = ijson.items(json_data, 'item', use_float=True)

last_chrom = ""
position_count = 0
for p in positions:
chrom=p['chromosome']
if (chrom != last_chrom):
last_chrom = chrom
logging.info(f"Chrom: {chrom}")
position_count += 1
position=p['position']
refAllele=p['refAllele'] # ref_allele
altAlleles=p['altAlleles'] # this is an array that we need to split into each alt_allele
@@ -331,6 +329,9 @@ def make_positions_json(annotated_json, output_json):
output_file.close()
json_data.close()

if position_count == 0:
logging.error(f"ERROR - Found no items in annotated json file: {annotated_json}")
sys.exit(1)

def make_annotation_jsons(annotated_json, output_json):
logging.basicConfig(
