Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AG-1314] JSON Schema Validation GX Prototyping #111

Merged
merged 19 commits into from
Jan 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ datasets:
goterm_name: go_terms
destination: *dest
gx_folder: syn53127958
gx_nested_columns:
- gene_biodomains

- neuropath_corr:
files:
Expand Down
28 changes: 15 additions & 13 deletions gx_suite_definitions/genes_biodomains.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,17 @@
"outputs": [],
"source": [
"import synapseclient\n",
"import json\n",
"\n",
"import pandas as pd\n",
"import great_expectations as gx\n",
"\n",
"from agoradatatools.gx import GreatExpectationsRunner\n",
"\n",
"context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')\n",
"\n",
"from expectations.expect_column_values_to_have_list_length import ExpectColumnValuesToHaveListLength\n",
"from expectations.expect_column_values_to_have_list_length_in_range import ExpectColumnValuesToHaveListLengthInRange\n",
"from expectations.expect_column_values_to_have_list_members import ExpectColumnValuesToHaveListMembers\n",
"from expectations.expect_column_values_to_have_list_members_of_type import ExpectColumnValuesToHaveListMembersOfType\n",
"from expectations.expect_column_values_to_have_list_of_dict_with_expected_values import ExpectColumnValuesToHaveListOfDictWithExpectedValues\n"
"from expectations.expect_column_values_to_have_list_members import ExpectColumnValuesToHaveListMembers"
]
},
{
Expand Down Expand Up @@ -65,10 +66,11 @@
"metadata": {},
"outputs": [],
"source": [
"validator = context.sources.pandas_default.read_json(\n",
" genes_biodomains_data_file\n",
")\n",
"validator.expectation_suite_name = \"genes_biodomains\"\n"
"df = pd.read_json(genes_biodomains_data_file)\n",
"nested_columns = ['gene_biodomains']\n",
"df = GreatExpectationsRunner.convert_nested_columns_to_json(df, nested_columns)\n",
"validator = context.sources.pandas_default.read_dataframe(df)\n",
"validator.expectation_suite_name = \"genes_biodomains\""
]
},
{
Expand Down Expand Up @@ -100,12 +102,12 @@
"outputs": [],
"source": [
"# gene_biodomains\n",
"validator.expect_column_values_to_be_of_type(\"gene_biodomains\", \"list\")\n",
"validator.expect_column_values_to_be_of_type(\"gene_biodomains\", \"str\")\n",
"validator.expect_column_values_to_not_be_null(\"gene_biodomains\")\n",
"validator.expect_column_values_to_have_list_length_in_range(column=\"gene_biodomains\", list_length_range=[1, 19])\n",
"validator.expect_column_values_to_have_list_members_of_type(column=\"gene_biodomains\", member_type=\"dict\")\n",
"biodomain_list = ['Apoptosis', 'Vasculature', 'Lipid Metabolism', 'Proteostasis', 'Immune Response', 'Autophagy', 'Mitochondrial Metabolism', 'Structural Stabilization', 'Synapse', 'Endolysosome', 'Metal Binding and Homeostasis', 'Oxidative Stress', 'Epigenetic', 'APP Metabolism', 'Cell Cycle', 'DNA Repair', 'RNA Spliceosome', 'Tau Homeostasis', 'Myelination']\n",
"validator.expect_column_values_to_have_list_of_dict_with_expected_values(column=\"gene_biodomains\", list_dict_values={\"key\": \"biodomain\", \"values\": biodomain_list})\n"
"#get JSON schema\n",
"with open(\"../src/agoradatatools/great_expectations/gx/json_schemas/genes_biodomains/gene_biodomains_schema.json\", \"r\") as file:\n",
" gene_biodomains_schema = json.load(file)\n",
"validator.expect_column_values_to_match_json_schema(\"gene_biodomains\", json_schema=gene_biodomains_schema)"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
"expectation_type": "expect_column_values_to_be_of_type",
"kwargs": {
"column": "gene_biodomains",
"type_": "list"
"type_": "str"
},
"meta": {}
},
Expand All @@ -56,51 +56,76 @@
"meta": {}
},
{
"expectation_type": "expect_column_values_to_have_list_length_in_range",
"kwargs": {
"column": "gene_biodomains",
"list_length_range": [
1,
19
]
},
"meta": {}
},
{
"expectation_type": "expect_column_values_to_have_list_members_of_type",
"kwargs": {
"column": "gene_biodomains",
"member_type": "dict"
},
"meta": {}
},
{
"expectation_type": "expect_column_values_to_have_list_of_dict_with_expected_values",
"expectation_type": "expect_column_values_to_match_json_schema",
"kwargs": {
"column": "gene_biodomains",
"list_dict_values": {
"key": "biodomain",
"values": [
"Apoptosis",
"Vasculature",
"Lipid Metabolism",
"Proteostasis",
"Immune Response",
"Autophagy",
"Mitochondrial Metabolism",
"Structural Stabilization",
"Synapse",
"Endolysosome",
"Metal Binding and Homeostasis",
"Oxidative Stress",
"Epigenetic",
"APP Metabolism",
"Cell Cycle",
"DNA Repair",
"RNA Spliceosome",
"Tau Homeostasis",
"Myelination"
]
"json_schema": {
"$id": "https://github.com/Sage-Bionetworks/agora-data-tools/src/agoradatatools/great_expectations/gx/json_schemas/genes_biodomains/gene_biodomains_schema.json",
"$schema": "https://json-schema.org/draft/2019-09/schema",
"default": [],
"items": {
"default": {},
"properties": {
"biodomain": {
"default": "",
"enum": [
"Apoptosis",
"Vasculature",
"Lipid Metabolism",
"Proteostasis",
"Immune Response",
"Autophagy",
"Mitochondrial Metabolism",
"Structural Stabilization",
"Synapse",
"Endolysosome",
"Metal Binding and Homeostasis",
"Oxidative Stress",
"Epigenetic",
"APP Metabolism",
"Cell Cycle",
"DNA Repair",
"RNA Spliceosome",
"Tau Homeostasis",
"Myelination"
],
"type": "string"
},
"go_terms": {
"default": [],
"items": {
"UniqueItems": true,
"type": "string"
},
"maxItems": 100,
"minItems": 1,
"type": "array"
},
"n_biodomain_terms": {
"default": 0,
"type": "integer"
},
"n_gene_biodomain_terms": {
"default": 0,
"type": "integer"
},
"pct_linking_terms": {
"maximum": 100,
"minimum": 0,
"type": "number"
jaclynbeck-sage marked this conversation as resolved.
Show resolved Hide resolved
}
},
"required": [
"biodomain",
"go_terms",
"n_biodomain_terms",
"n_gene_biodomain_terms",
"pct_linking_terms"
],
"type": "object"
},
"title": "Gene Biodomains",
"type": "array"
}
},
"meta": {}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
{
BWMac marked this conversation as resolved.
Show resolved Hide resolved
"$schema": "https://json-schema.org/draft/2019-09/schema",
"$id": "https://github.com/Sage-Bionetworks/agora-data-tools/src/agoradatatools/great_expectations/gx/json_schemas/genes_biodomains/gene_biodomains_schema.json",
"type": "array",
"default": [],
"title": "Gene Biodomains",
"items": {
"type": "object",
"default": {},
"required": [
"biodomain",
"go_terms",
"n_biodomain_terms",
"n_gene_biodomain_terms",
"pct_linking_terms"
],
"properties": {
"biodomain": {
"type": "string",
"default": "",
"enum": [
"Apoptosis",
"Vasculature",
"Lipid Metabolism",
"Proteostasis",
"Immune Response",
"Autophagy",
"Mitochondrial Metabolism",
"Structural Stabilization",
"Synapse",
"Endolysosome",
"Metal Binding and Homeostasis",
"Oxidative Stress",
"Epigenetic",
"APP Metabolism",
"Cell Cycle",
"DNA Repair",
"RNA Spliceosome",
"Tau Homeostasis",
"Myelination"
]
},
"go_terms": {
"type": "array",
"default": [],
"minItems": 1,
"maxItems": 100,
"items": {
"type": "string",
"UniqueItems": true
}
},
"n_biodomain_terms": {
"type": "integer",
"default": 0
},
"n_gene_biodomain_terms": {
"type": "integer",
"default": 0
},
"pct_linking_terms": {
"type": "number",
"minimum": 0,
"maximum": 100
}
}
}
}
33 changes: 29 additions & 4 deletions src/agoradatatools/gx.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import logging
import os
import shutil
import json
import typing

import pandas as pd

import great_expectations as gx
from great_expectations.checkpoint.types.checkpoint_result import CheckpointResult
Expand All @@ -15,13 +19,19 @@ class GreatExpectationsRunner:
"""Class to run great expectations on a dataset and upload the HTML report to Synapse"""

def __init__(
self, syn: Synapse, dataset_path: str, dataset_name: str, upload_folder: str
self,
syn: Synapse,
dataset_path: str,
dataset_name: str,
upload_folder: str,
nested_columns: typing.List[str] = None,
):
"""Initialize the class"""
self.syn = syn
self.dataset_path = dataset_path
self.expectation_suite_name = dataset_name
self.upload_folder = upload_folder
self.nested_columns = nested_columns
self.gx_project_dir = self._get_data_context_location()

self.context = gx.get_context(project_root_dir=self.gx_project_dir)
Expand Down Expand Up @@ -97,14 +107,29 @@ def _upload_results_file_to_synapse(self, results_path: str) -> None:
),
)

@staticmethod
def convert_nested_columns_to_json(
    df: pd.DataFrame, nested_columns: typing.List[str]
) -> pd.DataFrame:
    """Serialize the values of nested columns to JSON strings.

    Every value in each named column is passed through ``json.dumps`` so
    that nested structures (lists, dicts) become JSON-parseable strings.

    Args:
        df: DataFrame holding the dataset to convert.
        nested_columns: Names of the columns whose values should be
            serialized. ``None`` or an empty list converts nothing.

    Returns:
        A new DataFrame with the requested columns converted. The input
        DataFrame is left unmodified.

    Raises:
        KeyError: If any name in ``nested_columns`` is not a column of ``df``.
    """
    # Work on a copy so the caller's DataFrame keeps its original objects.
    converted = df.copy()
    if not nested_columns:
        return converted

    # Report every missing column at once instead of failing on the first.
    missing = [column for column in nested_columns if column not in converted.columns]
    if missing:
        raise KeyError(f"Nested columns not found in DataFrame: {missing}")

    for column in nested_columns:
        converted[column] = converted[column].apply(json.dumps)
    return converted

def run(self) -> None:
"""Run great expectations on a dataset and upload the results to Synapse"""
if not self._check_if_expectation_suite_exists():
return

logger.info(f"Running data validation on {self.expectation_suite_name}")
validator = self.context.sources.pandas_default.read_json(
self.dataset_path,
)

gx_df = pd.read_json(self.dataset_path)
if self.nested_columns:
gx_df = self.convert_nested_columns_to_json(
df=gx_df, nested_columns=self.nested_columns
)

validator = self.context.sources.pandas_default.read_dataframe(gx_df)
expectation_suite = self.context.get_expectation_suite(
self.expectation_suite_name
)
Expand Down
3 changes: 3 additions & 0 deletions src/agoradatatools/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ def process_dataset(
dataset_path=json_path,
dataset_name=dataset_name,
upload_folder=dataset_obj[dataset_name]["gx_folder"],
nested_columns=dataset_obj[dataset_name]["gx_nested_columns"]
if "gx_nested_columns" in dataset_obj[dataset_name].keys()
else None,
)
gx_runner.run()

Expand Down
2 changes: 2 additions & 0 deletions test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ datasets:
goterm_name: go_terms
destination: *dest
gx_folder: syn53127956
gx_nested_columns:
- gene_biodomains

- neuropath_corr:
files:
Expand Down
Loading
Loading