From 09ecfd05f07372fa0139a0cacfe69f058bf694db Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 27 Jun 2024 13:26:35 -0600 Subject: [PATCH 1/6] adds gene_info expectation suite --- config.yaml | 6 + gx_suite_definitions/gene_info.ipynb | 2774 +++++++++++++++++ .../gx/expectations/gene_info.json | 653 ++++ .../json_schemas/gene_info/druggability.json | 40 + .../json_schemas/gene_info/ensembl_info.json | 31 + .../gene_info/median_expression.json | 53 + .../gene_info/target_nominations.json | 87 + test_config.yaml | 6 + 8 files changed, 3650 insertions(+) create mode 100644 gx_suite_definitions/gene_info.ipynb create mode 100644 src/agoradatatools/great_expectations/gx/expectations/gene_info.json create mode 100644 src/agoradatatools/great_expectations/gx/json_schemas/gene_info/druggability.json create mode 100644 src/agoradatatools/great_expectations/gx/json_schemas/gene_info/ensembl_info.json create mode 100644 src/agoradatatools/great_expectations/gx/json_schemas/gene_info/median_expression.json create mode 100644 src/agoradatatools/great_expectations/gx/json_schemas/gene_info/target_nominations.json diff --git a/config.yaml b/config.yaml index 85540671..f1666796 100644 --- a/config.yaml +++ b/config.yaml @@ -196,6 +196,12 @@ datasets: agora_rename: symbol: hgnc_symbol destination: *dest + gx_enabled: true + gx_nested_columns: + - target_nominations + - median_expression + - druggability + - ensembl_info - team_info: files: diff --git a/gx_suite_definitions/gene_info.ipynb b/gx_suite_definitions/gene_info.ipynb new file mode 100644 index 00000000..84d97020 --- /dev/null +++ b/gx_suite_definitions/gene_info.ipynb @@ -0,0 +1,2774 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "import pandas as pd\n", + "\n", + "import great_expectations as gx\n", + "import synapseclient\n", + "\n", + "from agoradatatools.gx import GreatExpectationsRunner\n", + "\n", + "context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')\n", + "\n", + "from expectations.expect_column_values_to_have_list_length import \\\n", + " ExpectColumnValuesToHaveListLength\n", + "from expectations.expect_column_values_to_have_list_members import \\\n", + " ExpectColumnValuesToHaveListMembers\n", + "from expectations.expect_column_values_to_have_list_members_of_type import \\\n", + " ExpectColumnValuesToHaveListMembersOfType" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Expectation Suite for Gene Info Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get Example Data File" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Welcome, Brad Macdonald!\n", + "\n", + "INFO: 2024-06-27 11:52:39 | synapseclient_default | Welcome, Brad Macdonald!\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "UPGRADE AVAILABLE\n", + "\n", + "A more recent version of the Synapse Client (4.3.0) is available. Your version (4.0.0) can be upgraded by typing:\n", + " pip install --upgrade synapseclient\n", + "\n", + "Python Synapse Client version 4.3.0 release notes\n", + "\n", + "https://python-docs.synapse.org/news/\n", + "\n" + ] + } + ], + "source": [ + "syn = synapseclient.Synapse()\n", + "syn.login()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "gene_info_data_file = syn.get(\"syn17015359\").path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Validator Object on Data File" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_json(gene_info_data_file)\n", + "nested_columns = ['target_nominations', 'median_expression', 'druggability', 'ensembl_info']\n", + "df = GreatExpectationsRunner.convert_nested_columns_to_json(df, nested_columns)\n", + "validator = context.sources.pandas_default.read_dataframe(df)\n", + "validator.expectation_suite_name = \"gene_info\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add Expectations to Validator Object For Each Column" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[WARNING] <>:6: DeprecationWarning: invalid escape sequence \\d\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING: 2024-06-27 11:52:42 | py.warnings | <>:6: DeprecationWarning: invalid escape sequence \\d\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[WARNING] <>:6: DeprecationWarning: invalid escape sequence \\d\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING: 2024-06-27 11:52:42 | py.warnings | <>:6: DeprecationWarning: invalid escape sequence \\d\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[WARNING] /var/folders/sr/3g4hnkfd4ld306tty7kqf1rr0000gr/T/ipykernel_64698/2773054897.py:6: DeprecationWarning: invalid escape sequence \\d\n", + " validator.expect_column_values_to_match_regex(\"ensembl_gene_id\", \"^ENSG\\d{11}$\")\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING: 2024-06-27 11:52:42 | py.warnings | /var/folders/sr/3g4hnkfd4ld306tty7kqf1rr0000gr/T/ipykernel_64698/2773054897.py:6: DeprecationWarning: invalid escape sequence \\d\n", + " validator.expect_column_values_to_match_regex(\"ensembl_gene_id\", \"^ENSG\\d{11}$\")\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[WARNING] /Users/bmacdonald/.local/share/virtualenvs/agora-data-tools-CK0oUlHB/lib/python3.9/site-packages/great_expectations/expectations/expectation.py:1481: UserWarning: `result_format` configured at the Validator-level will not be persisted. Please add the configuration to your Checkpoint config or checkpoint_run() method instead.\n", + " warnings.warn(\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING: 2024-06-27 11:52:42 | py.warnings | /Users/bmacdonald/.local/share/virtualenvs/agora-data-tools-CK0oUlHB/lib/python3.9/site-packages/great_expectations/expectations/expectation.py:1481: UserWarning: `result_format` configured at the Validator-level will not be persisted. Please add the configuration to your Checkpoint config or checkpoint_run() method instead.\n", + " warnings.warn(\n", + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6234a4d6a77c480485a6f02cf50734e2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Calculating Metrics: 0%| | 0/8 [00:00 Date: Thu, 27 Jun 2024 13:33:57 -0600 Subject: [PATCH 2/6] clear jupyter outputs --- gx_suite_definitions/gene_info.ipynb | 2441 +------------------------- 1 file changed, 50 insertions(+), 2391 deletions(-) diff --git a/gx_suite_definitions/gene_info.ipynb b/gx_suite_definitions/gene_info.ipynb index 84d97020..3993777f 100644 --- a/gx_suite_definitions/gene_info.ipynb +++ b/gx_suite_definitions/gene_info.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 69, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -41,36 +41,9 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Welcome, Brad Macdonald!\n", - "\n", - "INFO: 2024-06-27 11:52:39 | synapseclient_default | Welcome, Brad Macdonald!\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "UPGRADE AVAILABLE\n", - "\n", - "A more recent version of the Synapse Client (4.3.0) is available. Your version (4.0.0) can be upgraded by typing:\n", - " pip install --upgrade synapseclient\n", - "\n", - "Python Synapse Client version 4.3.0 release notes\n", - "\n", - "https://python-docs.synapse.org/news/\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "syn = synapseclient.Synapse()\n", "syn.login()\n" @@ -78,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -94,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -114,248 +87,9 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[WARNING] <>:6: DeprecationWarning: invalid escape sequence \\d\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING: 2024-06-27 11:52:42 | py.warnings | <>:6: DeprecationWarning: invalid escape sequence \\d\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[WARNING] <>:6: DeprecationWarning: invalid escape sequence \\d\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING: 2024-06-27 11:52:42 | py.warnings | <>:6: DeprecationWarning: invalid escape sequence \\d\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[WARNING] /var/folders/sr/3g4hnkfd4ld306tty7kqf1rr0000gr/T/ipykernel_64698/2773054897.py:6: DeprecationWarning: invalid escape sequence \\d\n", - " validator.expect_column_values_to_match_regex(\"ensembl_gene_id\", \"^ENSG\\d{11}$\")\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING: 2024-06-27 11:52:42 | py.warnings | /var/folders/sr/3g4hnkfd4ld306tty7kqf1rr0000gr/T/ipykernel_64698/2773054897.py:6: DeprecationWarning: invalid escape sequence \\d\n", - " validator.expect_column_values_to_match_regex(\"ensembl_gene_id\", \"^ENSG\\d{11}$\")\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[WARNING] /Users/bmacdonald/.local/share/virtualenvs/agora-data-tools-CK0oUlHB/lib/python3.9/site-packages/great_expectations/expectations/expectation.py:1481: UserWarning: `result_format` configured at the Validator-level will not be persisted. Please add the configuration to your Checkpoint config or checkpoint_run() method instead.\n", - " warnings.warn(\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING: 2024-06-27 11:52:42 | py.warnings | /Users/bmacdonald/.local/share/virtualenvs/agora-data-tools-CK0oUlHB/lib/python3.9/site-packages/great_expectations/expectations/expectation.py:1481: UserWarning: `result_format` configured at the Validator-level will not be persisted. Please add the configuration to your Checkpoint config or checkpoint_run() method instead.\n", - " warnings.warn(\n", - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6234a4d6a77c480485a6f02cf50734e2", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Calculating Metrics: 0%| | 0/8 [00:00 Date: Thu, 27 Jun 2024 13:51:01 -0600 Subject: [PATCH 3/6] pre-commit --- .../great_expectations/gx/expectations/gene_info.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agoradatatools/great_expectations/gx/expectations/gene_info.json b/src/agoradatatools/great_expectations/gx/expectations/gene_info.json index 33b2f581..1d7655c1 100644 --- a/src/agoradatatools/great_expectations/gx/expectations/gene_info.json +++ b/src/agoradatatools/great_expectations/gx/expectations/gene_info.json @@ -650,4 +650,4 @@ "meta": { "great_expectations_version": "0.18.1" } -} \ No newline at end of file +} From d38d9a0b77530e843c55d4c73e616cadcccb38a6 Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 27 Jun 2024 14:55:01 -0600 Subject: [PATCH 4/6] fixes typo --- gx_suite_definitions/gene_info.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gx_suite_definitions/gene_info.ipynb b/gx_suite_definitions/gene_info.ipynb index 3993777f..319948ec 100644 --- a/gx_suite_definitions/gene_info.ipynb +++ b/gx_suite_definitions/gene_info.ipynb @@ -95,7 +95,7 @@ "validator.expect_column_values_to_be_of_type(\"ensembl_gene_id\", \"str\")\n", "validator.expect_column_values_to_not_be_null(\"ensembl_gene_id\")\n", "validator.expect_column_value_lengths_to_equal(\"ensembl_gene_id\", 15)\n", - "# checks format and allowed chatacters\n", + "# checks format and allowed characters\n", "validator.expect_column_values_to_match_regex(\"ensembl_gene_id\", \"^ENSG\\d{11}$\")\n", "validator.expect_column_values_to_be_unique(\"ensembl_gene_id\")\n" ] From 3078ca0aaa2ff44223f574a85983aaeef5d1ae10 Mon Sep 17 00:00:00 2001 From: bwmac Date: Mon, 1 Jul 2024 16:15:59 -0600 Subject: [PATCH 5/6] updates from review --- gx_suite_definitions/gene_info.ipynb | 13 +---- .../gx/expectations/gene_info.json | 53 ++++++++----------- .../gene_info/target_nominations.json | 14 ++--- 3 files changed, 30 insertions(+), 50 deletions(-) diff --git a/gx_suite_definitions/gene_info.ipynb b/gx_suite_definitions/gene_info.ipynb index 319948ec..4d4b327f 100644 --- a/gx_suite_definitions/gene_info.ipynb +++ b/gx_suite_definitions/gene_info.ipynb @@ -94,7 +94,6 @@ "# ensembl_gene_id\n", "validator.expect_column_values_to_be_of_type(\"ensembl_gene_id\", \"str\")\n", "validator.expect_column_values_to_not_be_null(\"ensembl_gene_id\")\n", - "validator.expect_column_value_lengths_to_equal(\"ensembl_gene_id\", 15)\n", "# checks format and allowed characters\n", "validator.expect_column_values_to_match_regex(\"ensembl_gene_id\", \"^ENSG\\d{11}$\")\n", "validator.expect_column_values_to_be_unique(\"ensembl_gene_id\")\n" @@ -224,7 +223,7 @@ "validator.expect_column_values_to_be_of_type(\"target_nominations\", \"str\")\n", "with open(\"../src/agoradatatools/great_expectations/gx/json_schemas/gene_info/target_nominations.json\", \"r\") as file:\n", " target_nominations_schema = json.load(file)\n", - "validator.expect_column_values_to_match_json_schema(\"target_nominations\", json_schema=target_nominations_schema, mostly=0.98)" + "validator.expect_column_values_to_match_json_schema(\"target_nominations\", json_schema=target_nominations_schema)" ] }, { @@ -344,16 +343,6 @@ "validator.expect_column_values_to_match_json_schema(\"ensembl_info\", ensembl_info_schema)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# multi-field\n", - "validator.expect_compound_columns_to_be_unique([\"hgnc_symbol\", \"ensembl_gene_id\"])" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/src/agoradatatools/great_expectations/gx/expectations/gene_info.json b/src/agoradatatools/great_expectations/gx/expectations/gene_info.json index 1d7655c1..af1bc89d 100644 --- a/src/agoradatatools/great_expectations/gx/expectations/gene_info.json +++ b/src/agoradatatools/great_expectations/gx/expectations/gene_info.json @@ -17,14 +17,6 @@ }, "meta": {} }, - { - "expectation_type": "expect_column_value_lengths_to_equal", - "kwargs": { - "column": "ensembl_gene_id", - "value": 15 - }, - "meta": {} - }, { "expectation_type": "expect_column_values_to_match_regex", "kwargs": { @@ -250,13 +242,19 @@ "maxLength": 25, "minLength": 4, "pattern": "^syn\\d+$", - "type": "string" + "type": [ + "string", + "null" + ] }, "data_used_to_support_target_selection": { "maxLength": 2000, "minLength": 15, - "pattern": "^((?!\ufffd).)*$", - "type": "string" + "pattern": "^(?:(?![\ufffd]).|[\r\n])*(? Date: Mon, 1 Jul 2024 16:16:15 -0600 Subject: [PATCH 6/6] pre-commit --- .../great_expectations/gx/expectations/gene_info.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agoradatatools/great_expectations/gx/expectations/gene_info.json b/src/agoradatatools/great_expectations/gx/expectations/gene_info.json index af1bc89d..ea351cac 100644 --- a/src/agoradatatools/great_expectations/gx/expectations/gene_info.json +++ b/src/agoradatatools/great_expectations/gx/expectations/gene_info.json @@ -641,4 +641,4 @@ "meta": { "great_expectations_version": "0.18.1" } -} \ No newline at end of file +}