diff --git a/config.yaml b/config.yaml index 1f22fe95..f282dcf0 100644 --- a/config.yaml +++ b/config.yaml @@ -63,6 +63,8 @@ datasets: goterm_name: go_terms destination: *dest gx_folder: syn53127958 + gx_nested_columns: + - gene_biodomains - neuropath_corr: files: diff --git a/gx_suite_definitions/genes_biodomains.ipynb b/gx_suite_definitions/genes_biodomains.ipynb index 23eeea6d..1994ceea 100644 --- a/gx_suite_definitions/genes_biodomains.ipynb +++ b/gx_suite_definitions/genes_biodomains.ipynb @@ -7,16 +7,17 @@ "outputs": [], "source": [ "import synapseclient\n", + "import json\n", "\n", + "import pandas as pd\n", "import great_expectations as gx\n", "\n", + "from agoradatatools.gx import GreatExpectationsRunner\n", + "\n", "context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')\n", "\n", "from expectations.expect_column_values_to_have_list_length import ExpectColumnValuesToHaveListLength\n", - "from expectations.expect_column_values_to_have_list_length_in_range import ExpectColumnValuesToHaveListLengthInRange\n", - "from expectations.expect_column_values_to_have_list_members import ExpectColumnValuesToHaveListMembers\n", - "from expectations.expect_column_values_to_have_list_members_of_type import ExpectColumnValuesToHaveListMembersOfType\n", - "from expectations.expect_column_values_to_have_list_of_dict_with_expected_values import ExpectColumnValuesToHaveListOfDictWithExpectedValues\n" + "from expectations.expect_column_values_to_have_list_members import ExpectColumnValuesToHaveListMembers" ] }, { @@ -65,10 +66,11 @@ "metadata": {}, "outputs": [], "source": [ - "validator = context.sources.pandas_default.read_json(\n", - " genes_biodomains_data_file\n", - ")\n", - "validator.expectation_suite_name = \"genes_biodomains\"\n" + "df = pd.read_json(genes_biodomains_data_file)\n", + "nested_columns = ['gene_biodomains']\n", + "df = GreatExpectationsRunner.convert_nested_columns_to_json(df, nested_columns)\n", + "validator = 
context.sources.pandas_default.read_dataframe(df)\n", + "validator.expectation_suite_name = \"genes_biodomains\"" ] }, { @@ -100,12 +102,12 @@ "outputs": [], "source": [ "# gene_biodomains\n", - "validator.expect_column_values_to_be_of_type(\"gene_biodomains\", \"list\")\n", + "validator.expect_column_values_to_be_of_type(\"gene_biodomains\", \"str\")\n", "validator.expect_column_values_to_not_be_null(\"gene_biodomains\")\n", - "validator.expect_column_values_to_have_list_length_in_range(column=\"gene_biodomains\", list_length_range=[1, 19])\n", - "validator.expect_column_values_to_have_list_members_of_type(column=\"gene_biodomains\", member_type=\"dict\")\n", - "biodomain_list = ['Apoptosis', 'Vasculature', 'Lipid Metabolism', 'Proteostasis', 'Immune Response', 'Autophagy', 'Mitochondrial Metabolism', 'Structural Stabilization', 'Synapse', 'Endolysosome', 'Metal Binding and Homeostasis', 'Oxidative Stress', 'Epigenetic', 'APP Metabolism', 'Cell Cycle', 'DNA Repair', 'RNA Spliceosome', 'Tau Homeostasis', 'Myelination']\n", - "validator.expect_column_values_to_have_list_of_dict_with_expected_values(column=\"gene_biodomains\", list_dict_values={\"key\": \"biodomain\", \"values\": biodomain_list})\n" + "#get JSON schema\n", + "with open(\"../src/agoradatatools/great_expectations/gx/json_schemas/genes_biodomains/gene_biodomains_schema.json\", \"r\") as file:\n", + " gene_biodomains_schema = json.load(file)\n", + "validator.expect_column_values_to_match_json_schema(\"gene_biodomains\", json_schema=gene_biodomains_schema)" ] }, { diff --git a/src/agoradatatools/great_expectations/gx/expectations/genes_biodomains.json b/src/agoradatatools/great_expectations/gx/expectations/genes_biodomains.json index 91b009a2..9e024954 100644 --- a/src/agoradatatools/great_expectations/gx/expectations/genes_biodomains.json +++ b/src/agoradatatools/great_expectations/gx/expectations/genes_biodomains.json @@ -44,7 +44,7 @@ "expectation_type": "expect_column_values_to_be_of_type", "kwargs": 
{ "column": "gene_biodomains", - "type_": "list" + "type_": "str" }, "meta": {} }, @@ -56,51 +56,76 @@ "meta": {} }, { - "expectation_type": "expect_column_values_to_have_list_length_in_range", - "kwargs": { - "column": "gene_biodomains", - "list_length_range": [ - 1, - 19 - ] - }, - "meta": {} - }, - { - "expectation_type": "expect_column_values_to_have_list_members_of_type", - "kwargs": { - "column": "gene_biodomains", - "member_type": "dict" - }, - "meta": {} - }, - { - "expectation_type": "expect_column_values_to_have_list_of_dict_with_expected_values", + "expectation_type": "expect_column_values_to_match_json_schema", "kwargs": { "column": "gene_biodomains", - "list_dict_values": { - "key": "biodomain", - "values": [ - "Apoptosis", - "Vasculature", - "Lipid Metabolism", - "Proteostasis", - "Immune Response", - "Autophagy", - "Mitochondrial Metabolism", - "Structural Stabilization", - "Synapse", - "Endolysosome", - "Metal Binding and Homeostasis", - "Oxidative Stress", - "Epigenetic", - "APP Metabolism", - "Cell Cycle", - "DNA Repair", - "RNA Spliceosome", - "Tau Homeostasis", - "Myelination" - ] + "json_schema": { + "$id": "https://github.com/Sage-Bionetworks/agora-data-tools/src/agoradatatools/great_expectations/gx/json_schemas/genes_biodomains/gene_biodomains_schema.json", + "$schema": "https://json-schema.org/draft/2019-09/schema", + "default": [], + "items": { + "default": {}, + "properties": { + "biodomain": { + "default": "", + "enum": [ + "Apoptosis", + "Vasculature", + "Lipid Metabolism", + "Proteostasis", + "Immune Response", + "Autophagy", + "Mitochondrial Metabolism", + "Structural Stabilization", + "Synapse", + "Endolysosome", + "Metal Binding and Homeostasis", + "Oxidative Stress", + "Epigenetic", + "APP Metabolism", + "Cell Cycle", + "DNA Repair", + "RNA Spliceosome", + "Tau Homeostasis", + "Myelination" + ], + "type": "string" + }, + "go_terms": { + "default": [], + "items": { + "type": "string" + }, + "maxItems": 100, + 
"minItems": 1, + "type": "array", + "uniqueItems": true + }, + "n_biodomain_terms": { + "default": 0, + "type": "integer" + }, + "n_gene_biodomain_terms": { + "default": 0, + "type": "integer" + }, + "pct_linking_terms": { + "maximum": 100, + "minimum": 0, + "type": "number" + } + }, + "required": [ + "biodomain", + "go_terms", + "n_biodomain_terms", + "n_gene_biodomain_terms", + "pct_linking_terms" + ], + "type": "object" + }, + "title": "Gene Biodomains", + "type": "array" } }, "meta": {} diff --git a/src/agoradatatools/great_expectations/gx/json_schemas/genes_biodomains/gene_biodomains_schema.json b/src/agoradatatools/great_expectations/gx/json_schemas/genes_biodomains/gene_biodomains_schema.json new file mode 100644 index 00000000..03949207 --- /dev/null +++ b/src/agoradatatools/great_expectations/gx/json_schemas/genes_biodomains/gene_biodomains_schema.json @@ -0,0 +1,68 @@ +{ + "$schema": "https://json-schema.org/draft/2019-09/schema", + "$id": "https://github.com/Sage-Bionetworks/agora-data-tools/src/agoradatatools/great_expectations/gx/json_schemas/genes_biodomains/gene_biodomains_schema.json", + "type": "array", + "default": [], + "title": "Gene Biodomains", + "items": { + "type": "object", + "default": {}, + "required": [ + "biodomain", + "go_terms", + "n_biodomain_terms", + "n_gene_biodomain_terms", + "pct_linking_terms" + ], + "properties": { + "biodomain": { + "type": "string", + "default": "", + "enum": [ + "Apoptosis", + "Vasculature", + "Lipid Metabolism", + "Proteostasis", + "Immune Response", + "Autophagy", + "Mitochondrial Metabolism", + "Structural Stabilization", + "Synapse", + "Endolysosome", + "Metal Binding and Homeostasis", + "Oxidative Stress", + "Epigenetic", + "APP Metabolism", + "Cell Cycle", + "DNA Repair", + "RNA Spliceosome", + "Tau Homeostasis", + "Myelination" + ] + }, + "go_terms": { + "type": "array", + "default": [], + "minItems": 1, + "maxItems": 100, + "uniqueItems": true, + "items": { + "type": "string" + } + }, + "n_biodomain_terms": { + "type": 
@staticmethod
def convert_nested_columns_to_json(
    df: pd.DataFrame, nested_columns: typing.Optional[typing.List[str]]
) -> pd.DataFrame:
    """Return a copy of ``df`` with the given columns serialized to JSON strings.

    Each non-missing cell of every column named in ``nested_columns`` is
    replaced by ``json.dumps(cell)``. The input frame is never modified.

    Args:
        df: Frame holding the columns to convert.
        nested_columns: Names of the columns to serialize. ``None`` or an
            empty list converts nothing.

    Returns:
        A new DataFrame with the same columns as ``df``.

    Raises:
        KeyError: If a name in ``nested_columns`` is not a column of ``df``.
    """

    def _to_json(value):
        # json.dumps(None) is the string "null" and json.dumps(nan) is "NaN";
        # emitting those would turn a missing cell into a non-null string.
        # Scalars are checked first because pd.isna() on a list or dict
        # returns an array rather than a single bool.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return value
        return json.dumps(value)

    # Work on a copy so the caller's frame keeps its original nested values.
    converted = df.copy()
    for column in nested_columns or []:
        converted[column] = converted[column].apply(_to_json)
    return converted
pd.read_json(self.dataset_path) + if self.nested_columns: + gx_df = self.convert_nested_columns_to_json( + df=gx_df, nested_columns=self.nested_columns + ) + + validator = self.context.sources.pandas_default.read_dataframe(gx_df) expectation_suite = self.context.get_expectation_suite( self.expectation_suite_name ) diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py index ce74c4e8..92f3bc4e 100644 --- a/src/agoradatatools/process.py +++ b/src/agoradatatools/process.py @@ -125,6 +125,9 @@ def process_dataset( dataset_path=json_path, dataset_name=dataset_name, upload_folder=dataset_obj[dataset_name]["gx_folder"], + nested_columns=dataset_obj[dataset_name]["gx_nested_columns"] + if "gx_nested_columns" in dataset_obj[dataset_name].keys() + else None, ) gx_runner.run() diff --git a/test_config.yaml b/test_config.yaml index fbc1ad8a..d359220b 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -63,6 +63,8 @@ datasets: goterm_name: go_terms destination: *dest gx_folder: syn53127956 + gx_nested_columns: + - gene_biodomains - neuropath_corr: files: diff --git a/tests/test_gx.py b/tests/test_gx.py index def12f64..23d4a947 100644 --- a/tests/test_gx.py +++ b/tests/test_gx.py @@ -1,8 +1,11 @@ import os import shutil +import json from unittest import mock from unittest.mock import patch +import pandas as pd + import pytest from great_expectations.checkpoint.types.checkpoint_result import CheckpointResult from great_expectations.data_context import FileDataContext @@ -22,12 +25,14 @@ def setup_method(self, syn): dataset_path="./tests/test_assets/gx/metabolomics.json", dataset_name="metabolomics", upload_folder="test_folder", + nested_columns=None, ) self.bad_runner = GreatExpectationsRunner( syn=syn, dataset_path="./tests/test_assets/gx/not_supported_dataset.json", dataset_name="not_supported_dataset", upload_folder="test_folder", + nested_columns=None, ) def test_that_an_initialized_runner_has_the_attributes_it_should(self, syn): @@ -104,17 +109,66 @@ 
def test_that_convert_nested_columns_to_json_converts_nested_columns_to_json(self):
    """Only the listed column is serialized; the other column keeps its list."""
    df = pd.DataFrame({"a": [[1, 2, 3]], "b": [[4, 5, 6]]})
    expected = pd.DataFrame({"a": [[1, 2, 3]], "b": ["[4, 5, 6]"]})
    result = self.good_runner.convert_nested_columns_to_json(df, ["b"])
    # The converted cell must round-trip back to the original list.
    assert json.loads(result["b"][0]) == [4, 5, 6]
    pd.testing.assert_frame_equal(result, expected)

def test_that_convert_nested_columns_to_json_does_nothing_if_no_nested_columns(
    self,
):
    """An empty column list leaves every value as it was."""
    df = pd.DataFrame({"a": [[1, 2, 3]], "b": [[4, 5, 6]]})
    result = self.good_runner.convert_nested_columns_to_json(df, [])
    pd.testing.assert_frame_equal(result, df)

def test_run_when_expectation_suite_exists_and_nested_columns(
    self,
):
    """run() hands the loaded frame and the configured columns to the converter."""
    loaded_df = pd.DataFrame()
    with patch.object(
        self.good_runner, "_check_if_expectation_suite_exists", return_value=True
    ), patch.object(
        pd, "read_json", return_value=loaded_df
    ) as patch_read_json, patch.object(
        self.good_runner,
        "convert_nested_columns_to_json",
        return_value=pd.DataFrame(),
    ) as patch_convert_nested_columns_to_json, patch.object(
        self.good_runner, "_get_results_path", return_value="test_path"
    ) as patch_get_results_path, patch.object(
        self.good_runner, "_upload_results_file_to_synapse", return_value=None
    ) as patch_upload_results_file_to_synapse:
        self.good_runner.nested_columns = ["a"]
        self.good_runner.run()
        patch_read_json.assert_called_once_with(
            self.good_runner.dataset_path,
        )
        patch_convert_nested_columns_to_json.assert_called_once()
        # Inspect the recorded keyword arguments directly so the frame is
        # matched by identity rather than through DataFrame.__eq__.
        convert_kwargs = patch_convert_nested_columns_to_json.call_args[1]
        assert convert_kwargs["df"] is loaded_df
        assert convert_kwargs["nested_columns"] == ["a"]
        patch_get_results_path.assert_called_once()
        patch_upload_results_file_to_synapse.assert_called_once_with("test_path")
return_value=pd.DataFrame() + ) as patch_read_json, patch.object( + self.good_runner, + "convert_nested_columns_to_json", + return_value=pd.DataFrame(), + ) as patch_convert_nested_columns_to_json, patch.object( self.good_runner, "_get_results_path", return_value="test_path" ) as patch_get_results_path, patch.object( self.good_runner, "_upload_results_file_to_synapse", return_value=None ) as patch_upload_results_file_to_synapse: self.good_runner.run() + patch_read_json.assert_called_once_with( + self.good_runner.dataset_path, + ) + patch_convert_nested_columns_to_json.assert_not_called() patch_get_results_path.assert_called_once() patch_upload_results_file_to_synapse.assert_called_once_with("test_path") @@ -124,10 +178,18 @@ def test_that_run_does_not_complete_when_check_if_expectation_suite_exists_is_fa with patch.object( self.good_runner, "_check_if_expectation_suite_exists", return_value=False ), patch.object( + pd, "read_json", return_value=pd.DataFrame() + ) as patch_read_json, patch.object( + self.good_runner, + "convert_nested_columns_to_json", + return_value=pd.DataFrame(), + ) as patch_convert_nested_columns_to_json, patch.object( self.good_runner, "_get_results_path", return_value="test_path" ) as patch_get_results_path, patch.object( self.good_runner, "_upload_results_file_to_synapse", return_value=None ) as patch_upload_results_file_to_synapse: self.good_runner.run() + patch_read_json.assert_not_called() + patch_convert_nested_columns_to_json.assert_not_called() patch_get_results_path.assert_not_called() patch_upload_results_file_to_synapse.assert_not_called()