diff --git a/config.yaml b/config.yaml index f1666796..c50b8dee 100644 --- a/config.yaml +++ b/config.yaml @@ -275,6 +275,11 @@ datasets: geneticsscore: genetics_score omicsscore: multi_omics_score destination: *dest + gx_enabled: true + gx_nested_columns: + - target_risk_score + - genetics_score + - multi_omics_score - rna_distribution_data: files: *rna_diff_expr_data_files diff --git a/gx_suite_definitions/distribution_data.ipynb b/gx_suite_definitions/distribution_data.ipynb new file mode 100644 index 00000000..798a8317 --- /dev/null +++ b/gx_suite_definitions/distribution_data.ipynb @@ -0,0 +1,201 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import synapseclient\n", + "import json\n", + "\n", + "import pandas as pd\n", + "import great_expectations as gx\n", + "\n", + "from agoradatatools.gx import GreatExpectationsRunner\n", + "\n", + "context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Expectation Suite for Distribution Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get Example Data File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "syn = synapseclient.Synapse()\n", + "syn.login()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "distribution_data_file = syn.get(\"syn27572407\").path\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Validator Object on Data File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_json(distribution_data_file)\n", + "nested_columns = ['target_risk_score', 'genetics_score', 'multi_omics_score']\n", + "df = GreatExpectationsRunner.convert_nested_columns_to_json(df, nested_columns)\n", + "validator = context.sources.pandas_default.read_dataframe(df)\n", + "validator.expectation_suite_name = \"distribution_data\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add Expectations to Validator Object For Each Column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# target_risk_score\n", + "validator.expect_column_values_to_be_of_type(\"target_risk_score\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"target_risk_score\")\n", + "#get JSON schema\n", + "with open(\"../src/agoradatatools/great_expectations/gx/json_schemas/distribution_data/target_risk_score.json\", \"r\") as file:\n", + " target_risk_score_schema = json.load(file)\n", + "validator.expect_column_values_to_match_json_schema(\"target_risk_score\", json_schema=target_risk_score_schema)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# genetics_score\n", + "validator.expect_column_values_to_be_of_type(\"genetics_score\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"genetics_score\")\n", + "#get JSON schema\n", + "with open(\"../src/agoradatatools/great_expectations/gx/json_schemas/distribution_data/genetics_score.json\", \"r\") as file:\n", + " genetics_score_schema = json.load(file)\n", + "validator.expect_column_values_to_match_json_schema(\"genetics_score\", json_schema=genetics_score_schema)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# multi_omics_score\n", + "validator.expect_column_values_to_be_of_type(\"multi_omics_score\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"multi_omics_score\")\n", + "#get JSON schema\n", + "with open(\"../src/agoradatatools/great_expectations/gx/json_schemas/distribution_data/multi_omics_score.json\", \"r\") as file:\n", + " multi_omics_score_schema = json.load(file)\n", + "validator.expect_column_values_to_match_json_schema(\"multi_omics_score\", json_schema=multi_omics_score_schema)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save Expectation Suite" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "validator.save_expectation_suite(discard_failed_expectations=False)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Checkpoint and View Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint = context.add_or_update_checkpoint(\n", + " name=\"agora-test-checkpoint\",\n", + " validator=validator,\n", + ")\n", + "checkpoint_result = checkpoint.run()\n", + "context.view_validation_result(checkpoint_result)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build Data Docs - Click on Expectation Suite to View All Expectations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "context.build_data_docs()\n", + "context.open_data_docs()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/agoradatatools/great_expectations/gx/expectations/distribution_data.json b/src/agoradatatools/great_expectations/gx/expectations/distribution_data.json new file mode 100644 index 00000000..9f05f876 --- /dev/null +++ b/src/agoradatatools/great_expectations/gx/expectations/distribution_data.json @@ -0,0 +1,352 @@ +{ + "data_asset_type": null, + "expectation_suite_name": "distribution_data", + "expectations": [ + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "target_risk_score", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "target_risk_score" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_match_json_schema", + "kwargs": { + "column": "target_risk_score", + "json_schema": { + "$id": "http://example.com/example.json", + "$schema": "https://json-schema.org/draft/2019-09/schema", + "default": {}, + "properties": { + "bins": { + "default": [], + "items": { + "default": [], + "items": { + "allOf": [ + { + "default": 0, + "minimum": 0, + "type": "number" + }, + { + "default": 0, + "maximum": 5, + "type": "number" + } + ] + }, + "maxItems": 2, + "minItems": 2, + "type": "array" + }, + "maxItems": 10, + "minItems": 10, + "type": "array" + }, + "distribution": { + "default": [], + "items": { + "minimum": 0, + "type": "number" + }, + "maxItems": 10, + "minItems": 10, + "type": "array" + }, + "first_quartile": { + "default": 0.0, + "type": "number" + }, + "max": { + "default": 0.0, + "type": "number" + }, + "mean": { + "default": 0.0, + "type": "number" + }, + "min": { + "default": 0.0, + "type": "number" + }, + "name": { + "default": "Target Risk Score", + "pattern": "Target Risk Score", + "type": "string" + }, + "syn_id": { + "default": "syn25913473", + "pattern": "syn25913473", + "type": "string" + }, + "third_quartile": { + "default": 0.0, + "type": "number" + }, + "wiki_id": { + "default": "621071", + "pattern": "621071", + "type": "string" + } + }, + "required": [ + "distribution", + "bins", + "min", + "max", + "mean", + "first_quartile", + "third_quartile", + "name", + "syn_id", + "wiki_id" + ], + "title": "Target Risk Score Schema", + "type": "object" + } + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "genetics_score", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "genetics_score" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_match_json_schema", + "kwargs": { + "column": "genetics_score", + "json_schema": { + "$id": "http://example.com/example.json", + "$schema": "https://json-schema.org/draft/2019-09/schema", + "default": {}, + "properties": { + "bins": { + "default": [], + "items": { + "default": [], + "items": { + "allOf": [ + { + "default": 0, + "minimum": 0, + "type": "number" + }, + { + "default": 0, + "maximum": 3, + "type": "number" + } + ] + }, + "maxItems": 2, + "minItems": 2, + "type": "array" + }, + "maxItems": 10, + "minItems": 10, + "type": "array" + }, + "distribution": { + "default": [], + "items": { + "minimum": 0, + "type": "number" + }, + "maxItems": 10, + "minItems": 10, + "type": "array" + }, + "first_quartile": { + "default": 0.0, + "type": "number" + }, + "max": { + "default": 0.0, + "type": "number" + }, + "mean": { + "default": 0.0, + "type": "number" + }, + "min": { + "default": 0.0, + "type": "number" + }, + "name": { + "default": "Genetic Risk Score", + "pattern": "Genetic Risk Score", + "type": "string" + }, + "syn_id": { + "default": "syn25913473", + "pattern": "syn25913473", + "type": "string" + }, + "third_quartile": { + "default": 0.0, + "type": "number" + }, + "wiki_id": { + "default": "621069", + "pattern": "621069", + "type": "string" + } + }, + "required": [ + "distribution", + "bins", + "min", + "max", + "mean", + "first_quartile", + "third_quartile", + "name", + "syn_id", + "wiki_id" + ], + "title": "Genetics Score Schema", + "type": "object" + } + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "multi_omics_score", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "multi_omics_score" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_match_json_schema", + "kwargs": { + "column": "multi_omics_score", + "json_schema": { + "$id": "http://example.com/example.json", + "$schema": "https://json-schema.org/draft/2019-09/schema", + "default": {}, + "properties": { + "bins": { + "default": [], + "items": { + "default": [], + "items": { + "allOf": [ + { + "default": 0, + "minimum": 0, + "type": "number" + }, + { + "default": 0, + "maximum": 2, + "type": "number" + } + ] + }, + "maxItems": 2, + "minItems": 2, + "type": "array" + }, + "maxItems": 10, + "minItems": 10, + "type": "array" + }, + "distribution": { + "default": [], + "items": { + "minimum": 0, + "type": "number" + }, + "maxItems": 10, + "minItems": 10, + "type": "array" + }, + "first_quartile": { + "default": 0.0, + "type": "number" + }, + "max": { + "default": 0.0, + "type": "number" + }, + "mean": { + "default": 0.0, + "type": "number" + }, + "min": { + "default": 0.0, + "type": "number" + }, + "name": { + "default": "Multi-omic Risk Score", + "pattern": "Multi-omic Risk Score", + "type": "string" + }, + "syn_id": { + "default": "syn25913473", + "pattern": "syn25913473", + "type": "string" + }, + "third_quartile": { + "default": 0.0, + "type": "number" + }, + "wiki_id": { + "default": "621070", + "pattern": "621070", + "type": "string" + } + }, + "required": [ + "distribution", + "bins", + "min", + "max", + "mean", + "first_quartile", + "third_quartile", + "name", + "syn_id", + "wiki_id" + ], + "title": "Multi Omics Score Schema", + "type": "object" + } + }, + "meta": {} + } + ], + "ge_cloud_id": null, + "meta": { + "great_expectations_version": "0.18.1" + } +} diff --git a/src/agoradatatools/great_expectations/gx/json_schemas/distribution_data/genetics_score.json b/src/agoradatatools/great_expectations/gx/json_schemas/distribution_data/genetics_score.json new file mode 100644 index 00000000..d577f669 --- /dev/null +++ b/src/agoradatatools/great_expectations/gx/json_schemas/distribution_data/genetics_score.json @@ -0,0 +1,90 @@ +{ + "$schema": "https://json-schema.org/draft/2019-09/schema", + "$id": "http://example.com/example.json", + "type": "object", + "default": {}, + "title": "Genetics Score Schema", + "required": [ + "distribution", + "bins", + "min", + "max", + "mean", + "first_quartile", + "third_quartile", + "name", + "syn_id", + "wiki_id" + ], + "properties": { + "distribution": { + "type": "array", + "default": [], + "minItems": 10, + "maxItems": 10, + "items": { + "type": "number", + "minimum": 0 + } + }, + "bins": { + "type": "array", + "default": [], + "minItems": 10, + "maxItems": 10, + "items": { + "type": "array", + "default": [], + "minItems": 2, + "maxItems": 2, + "items": { + "allOf": [{ + "type": "number", + "default": 0, + "minimum": 0 + }, + { + "type": "number", + "default": 0, + "maximum": 3 + }] + } + } + }, + "min": { + "type": "number", + "default": 0.0 + }, + "max": { + "type": "number", + "default": 0.0 + }, + "mean": { + "type": "number", + "default": 0.0 + }, + "first_quartile": { + "type": "number", + "default": 0.0 + }, + "third_quartile": { + "type": "number", + "default": 0.0 + }, + "name": { + "type": "string", + "default": "Genetic Risk Score", + "pattern": "Genetic Risk Score" + }, + "syn_id": { + "type": "string", + "default": "syn25913473", + "pattern": "syn25913473" + }, + "wiki_id": { + "type": "string", + "default": "621069", + "pattern": "621069" + } + } +} diff --git a/src/agoradatatools/great_expectations/gx/json_schemas/distribution_data/multi_omics_score.json b/src/agoradatatools/great_expectations/gx/json_schemas/distribution_data/multi_omics_score.json new file mode 100644 index 00000000..e29bd64f --- /dev/null +++ b/src/agoradatatools/great_expectations/gx/json_schemas/distribution_data/multi_omics_score.json @@ -0,0 +1,90 @@ +{ + "$schema": "https://json-schema.org/draft/2019-09/schema", + "$id": "http://example.com/example.json", + "type": "object", + "default": {}, + "title": "Multi Omics Score Schema", + "required": [ + "distribution", + "bins", + "min", + "max", + "mean", + "first_quartile", + "third_quartile", + "name", + "syn_id", + "wiki_id" + ], + "properties": { + "distribution": { + "type": "array", + "default": [], + "minItems": 10, + "maxItems": 10, + "items": { + "type": "number", + "minimum": 0 + } + }, + "bins": { + "type": "array", + "default": [], + "minItems": 10, + "maxItems": 10, + "items": { + "type": "array", + "default": [], + "minItems": 2, + "maxItems": 2, + "items": { + "allOf": [{ + "type": "number", + "default": 0, + "minimum": 0 + }, + { + "type": "number", + "default": 0, + "maximum": 2 + }] + } + } + }, + "min": { + "type": "number", + "default": 0.0 + }, + "max": { + "type": "number", + "default": 0.0 + }, + "mean": { + "type": "number", + "default": 0.0 + }, + "first_quartile": { + "type": "number", + "default": 0.0 + }, + "third_quartile": { + "type": "number", + "default": 0.0 + }, + "name": { + "type": "string", + "default": "Multi-omic Risk Score", + "pattern": "Multi-omic Risk Score" + }, + "syn_id": { + "type": "string", + "default": "syn25913473", + "pattern": "syn25913473" + }, + "wiki_id": { + "type": "string", + "default": "621070", + "pattern": "621070" + } + } +} diff --git a/src/agoradatatools/great_expectations/gx/json_schemas/distribution_data/target_risk_score.json b/src/agoradatatools/great_expectations/gx/json_schemas/distribution_data/target_risk_score.json new file mode 100644 index 00000000..647a6ee2 --- /dev/null +++ b/src/agoradatatools/great_expectations/gx/json_schemas/distribution_data/target_risk_score.json @@ -0,0 +1,90 @@ +{ + "$schema": "https://json-schema.org/draft/2019-09/schema", + "$id": "http://example.com/example.json", + "type": "object", + "default": {}, + "title": "Target Risk Score Schema", + "required": [ + "distribution", + "bins", + "min", + "max", + "mean", + "first_quartile", + "third_quartile", + "name", + "syn_id", + "wiki_id" + ], + "properties": { + "distribution": { + "type": "array", + "default": [], + "minItems": 10, + "maxItems": 10, + "items": { + "type": "number", + "minimum": 0 + } + }, + "bins": { + "type": "array", + "default": [], + "minItems": 10, + "maxItems": 10, + "items": { + "type": "array", + "default": [], + "minItems": 2, + "maxItems": 2, + "items": { + "allOf": [{ + "type": "number", + "default": 0, + "minimum": 0 + }, + { + "type": "number", + "default": 0, + "maximum": 5 + }] + } + } + }, + "min": { + "type": "number", + "default": 0.0 + }, + "max": { + "type": "number", + "default": 0.0 + }, + "mean": { + "type": "number", + "default": 0.0 + }, + "first_quartile": { + "type": "number", + "default": 0.0 + }, + "third_quartile": { + "type": "number", + "default": 0.0 + }, + "name": { + "type": "string", + "default": "Target Risk Score", + "pattern": "Target Risk Score" + }, + "syn_id": { + "type": "string", + "default": "syn25913473", + "pattern": "syn25913473" + }, + "wiki_id": { + "type": "string", + "default": "621071", + "pattern": "621071" + } + } +} diff --git a/test_config.yaml b/test_config.yaml index e520b011..88e10dd9 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -275,6 +275,11 @@ datasets: geneticsscore: genetics_score omicsscore: multi_omics_score destination: *dest + gx_enabled: true + gx_nested_columns: + - target_risk_score + - genetics_score + - multi_omics_score - rna_distribution_data: files: *rna_diff_expr_data_files