From 53cf8a72b0b47a80d86385d40f413572ae87b1c7 Mon Sep 17 00:00:00 2001 From: RoriCremer <6863459+RoriCremer@users.noreply.github.com> Date: Fri, 16 Sep 2022 10:42:10 -0400 Subject: [PATCH] add initial notebook copy pasta (#8008) * add initial notebook copy pasta * pr review updates * update dockstore * update docker image --- .../wdl/GvsExtractAvroFilesForHail.wdl | 2 +- .../wdl/extract/generate_hail_gvs_import.py | 49 +++++++++++++++++-- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl b/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl index a2587754d60..e1b8552f3dd 100644 --- a/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl +++ b/scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl @@ -267,6 +267,6 @@ task GenerateHailScript { File hail_script = 'hail_script.py' } runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:vs_605_hail_codegen" + docker: "us.gcr.io/broad-dsde-methods/variantstore:rc_616_var_store_2022_09_06" } } diff --git a/scripts/variantstore/wdl/extract/generate_hail_gvs_import.py b/scripts/variantstore/wdl/extract/generate_hail_gvs_import.py index 29b547bc520..4113034683c 100644 --- a/scripts/variantstore/wdl/extract/generate_hail_gvs_import.py +++ b/scripts/variantstore/wdl/extract/generate_hail_gvs_import.py @@ -80,7 +80,6 @@ def generate_gvs_import_script(avro_prefix, gcs_listing, vds_output_path, vcf_ou # pip install hail-0.2.97-py3-none-any.whl # gcloud auth application-default login # curl -sSL https://broad.io/install-gcs-connector | python3 -# import hail as hl @@ -97,10 +96,50 @@ def generate_gvs_import_script(avro_prefix, gcs_listing, vds_output_path, vcf_ou vds = hl.vds.read_vds('{vds_output_path}') -mt = hl.vds.to_dense_mt(vds) -fail_case = 'FAIL' -mt = mt.annotate_entries(FT=hl.if_else(mt.FT, 'PASS', fail_case)) -hl.export_vcf(mt, '{vcf_output_path}') +from datetime import datetime +start = datetime.now() +current_time = start.strftime("%H:%M:%S") +print("Start Time =", current_time) + +## * Hard filter out non-passing sites !!!!!TODO ok wait, DO WE want to do this? I guess it depends on what we are handing off and when. +# note: no AC/AN and AF for filtered out positions +vd = vds.variant_data +filtered_vd = vd.filter_rows(hl.len(vd.filters)==0) +filtered_vds = hl.vds.VariantDataset(vds.reference_data, filtered_vd) # now we apply it back to the vds + +## * Replace LGT with GT ( for easier calculations later ) +filtered_vd = filtered_vds.variant_data +filtered_vd = filtered_vd.annotate_entries(GT=hl.vds.lgt_to_gt(filtered_vd.LGT, filtered_vd.LA) ) +filtered_vds = hl.vds.VariantDataset(filtered_vds.reference_data, filtered_vd) + +## * Respect the FT flag by setting all failing GTs to a no call +# TODO We dont seem to be using the dense matrix table here (TODO do we need to?) + +# Logic for assigning non passing GTs as no-calls to ensure that AC,AN and AF respect the FT flag +# filtered_vd.FT is True ⇒ GT keeps its current value +# filtered_vd.FT is False ⇒ GT assigned no-call +# filtered_vd.FT is missing ⇒ GT keeps its current value + +filtered_vd = filtered_vd.annotate_entries(GT=hl.or_missing(hl.coalesce(filtered_vd.FT, True), filtered_vd.GT)) +# TODO drop LGT now that it will be different than the GT + + +## * Turn the GQ0s into no calls so that ANs are correct +rd = filtered_vds.reference_data +rd = rd.filter_entries(rd.GQ > 0) ## would be better to drop these once its a dense mt? +filtered_vds = hl.vds.VariantDataset(rd, filtered_vd) + +## * Create a DENSE MATRIX TABLE to calculate AC, AN, AF and TODO: Sample Count +mt = hl.vds.to_dense_mt(filtered_vds) +mt = hl.variant_qc(mt) +mt = mt.annotate_rows(AC=mt.variant_qc.AC, AN=mt.variant_qc.AN, AF=mt.variant_qc.AF) +mt = mt.drop('variant_qc') + + +# mt = hl.vds.to_dense_mt(vds) +# fail_case = 'FAIL' +# mt = mt.annotate_entries(FT=hl.if_else(mt.FT, 'PASS', fail_case)) +# hl.export_vcf(mt, '{vcf_output_path}') """ return hail_script