Skip to content

Commit

Permalink
add initial notebook copy pasta (#8008)
Browse files Browse the repository at this point in the history
* add initial notebook copy pasta

* pr review updates

* update dockstore

* update docker image
  • Loading branch information
RoriCremer authored Sep 16, 2022
1 parent f427624 commit 53cf8a7
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 6 deletions.
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,6 @@ task GenerateHailScript {
File hail_script = 'hail_script.py'
}
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:vs_605_hail_codegen"
docker: "us.gcr.io/broad-dsde-methods/variantstore:rc_616_var_store_2022_09_06"
}
}
49 changes: 44 additions & 5 deletions scripts/variantstore/wdl/extract/generate_hail_gvs_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ def generate_gvs_import_script(avro_prefix, gcs_listing, vds_output_path, vcf_ou
# pip install hail-0.2.97-py3-none-any.whl
# gcloud auth application-default login
# curl -sSL https://broad.io/install-gcs-connector | python3
#
import hail as hl
Expand All @@ -97,10 +96,50 @@ def generate_gvs_import_script(avro_prefix, gcs_listing, vds_output_path, vcf_ou
vds = hl.vds.read_vds('{vds_output_path}')
mt = hl.vds.to_dense_mt(vds)
fail_case = 'FAIL'
mt = mt.annotate_entries(FT=hl.if_else(mt.FT, 'PASS', fail_case))
hl.export_vcf(mt, '{vcf_output_path}')
from datetime import datetime
start = datetime.now()
current_time = start.strftime("%H:%M:%S")
print("Start Time =", current_time)
## * Hard filter out non-passing sites !!!!!TODO ok wait, DO WE want to do this? I guess it depends on what we are handing off and when.
# note: no AC/AN and AF for filtered out positions
vd = vds.variant_data
filtered_vd = vd.filter_rows(hl.len(vd.filters)==0)
filtered_vds = hl.vds.VariantDataset(vds.reference_data, filtered_vd) # now we apply it back to the vds
## * Replace LGT with GT ( for easier calculations later )
filtered_vd = filtered_vds.variant_data
filtered_vd = filtered_vd.annotate_entries(GT=hl.vds.lgt_to_gt(filtered_vd.LGT, filtered_vd.LA) )
filtered_vds = hl.vds.VariantDataset(filtered_vds.reference_data, filtered_vd)
## * Respect the FT flag by setting all failing GTs to a no call
# TODO We dont seem to be using the dense matrix table here (TODO do we need to?)
# Logic for assigning non passing GTs as no-calls to ensure that AC,AN and AF respect the FT flag
# filtered_vd.FT is True ⇒ GT keeps its current value
# filtered_vd.FT is False ⇒ GT assigned no-call
# filtered_vd.FT is missing ⇒ GT keeps its current value
filtered_vd = filtered_vd.annotate_entries(GT=hl.or_missing(hl.coalesce(filtered_vd.FT, True), filtered_vd.GT))
# TODO drop LGT now that it will be different than the GT
## * Turn the GQ0s into no calls so that ANs are correct
rd = filtered_vds.reference_data
rd = rd.filter_entries(rd.GQ > 0) ## would be better to drop these once its a dense mt?
filtered_vds = hl.vds.VariantDataset(rd, filtered_vd)
## * Create a DENSE MATRIX TABLE to calculate AC, AN, AF and TODO: Sample Count
mt = hl.vds.to_dense_mt(filtered_vds)
mt = hl.variant_qc(mt)
mt = mt.annotate_rows(AC=mt.variant_qc.AC, AN=mt.variant_qc.AN, AF=mt.variant_qc.AF)
mt = mt.drop('variant_qc')
# mt = hl.vds.to_dense_mt(vds)
# fail_case = 'FAIL'
# mt = mt.annotate_entries(FT=hl.if_else(mt.FT, 'PASS', fail_case))
# hl.export_vcf(mt, '{vcf_output_path}')
"""
return hail_script

Expand Down

0 comments on commit 53cf8a7

Please sign in to comment.