diff --git a/.gitignore b/.gitignore index 51c9dc49..20c38245 100644 --- a/.gitignore +++ b/.gitignore @@ -139,3 +139,5 @@ test_staging_dir/ # dev config file dev_config.yaml + +.vscode/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2bd1d3b7..833463a4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -198,13 +198,13 @@ These expectations are defined in the `/great_expectations/gx/plugins/expectatio #### Nested Columns -If the transform includes nested columns (example: `druggability` column in `gene_info` tranform), please follow these steps: -1. Add the nested column name to the `gx_nested_columns` flag in the configuration file for the specific transform. This will convert the column values to a JSON parsable string. +If the transform includes nested columns (example: `druggability` column in `gene_info` transform), please follow these four steps: +1. In the config file, add the nested column name to the `gx_nested_columns` flag for the specific transform. This will convert the column values to a JSON parsable string. ``` gx_nested_columns: - ``` -1. When creating the validator object in the gx_suite_definitions notebook, the nested column(s) must be included in the `nested_columns` list. +2. When creating the validator object in the gx_suite_definitions notebook, the nested column(s) must be included in the `nested_columns` list. ``` df = pd.read_json() nested_columns = [''] @@ -212,11 +212,11 @@ df = GreatExpectationsRunner.convert_nested_columns_to_json(df, nested_columns) validator = context.sources.pandas_default.read_dataframe(df) validator.expectation_suite_name = "" ``` -1. When validating the value type of the nested column, make sure to specify it as a string (see Step 1 for reasoning): +3. When validating the value type of the nested column, specify it as a string (see Step 1 for reasoning): ``` validator.expect_column_values_to_be_of_type("", "str") ``` -1. 
A JSON file containing the expected schema must be added here: `src/agoradatatools/great_expectations/gx/json_schemas//.json`. Use the [JSON schema tool](https://jsonschema.net/app/schemas/0) to create the schema template for your nested column. +4. A JSON file containing the expected schema must be added here: `src/agoradatatools/great_expectations/gx/json_schemas//.json`. Use the [JSON schema tool](https://jsonschema.net/app/schemas/0) to create the schema template for your nested column. ### DockerHub diff --git a/gx_suite_definitions/biomarkers.ipynb b/gx_suite_definitions/biomarkers.ipynb new file mode 100644 index 00000000..aec0b4f0 --- /dev/null +++ b/gx_suite_definitions/biomarkers.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import synapseclient\n", + "\n", + "import great_expectations as gx\n", + "import pandas as pd\n", + "import json\n", + "\n", + "context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')\n", + "\n", + "from agoradatatools.gx import GreatExpectationsRunner\n", + "from expectations.expect_column_values_to_have_list_length import ExpectColumnValuesToHaveListLength\n", + "from expectations.expect_column_values_to_have_list_members import ExpectColumnValuesToHaveListMembers\n", + "from expectations.expect_column_values_to_have_list_members_of_type import ExpectColumnValuesToHaveListMembersOfType" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Expectation Suite for Biomarkers Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "syn = synapseclient.Synapse()\n", + "syn.login()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "biomarkers_data_file = syn.get(\"syn63540269\").path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "## Create Validator Object on Data File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_json(biomarkers_data_file)\n", + "nested_columns = ['points']\n", + "df = GreatExpectationsRunner.convert_nested_columns_to_json(df, nested_columns)\n", + "validator = context.sources.pandas_default.read_dataframe(df)\n", + "validator.expectation_suite_name = \"biomarkers\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add Expectations to Validator Object For Each Column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the list of unique values for fields\n", + "with open(\"../src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/biomarkers_unique_field_values.json\", \"r\") as file:\n", + " unique_field_values = json.load(file)\n", + "print(unique_field_values)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# points\n", + "validator.expect_column_values_to_be_of_type(\"points\", \"str\")\n", + "with open(\"../src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/points.json\", \"r\") as file:\n", + " points_schema = json.load(file)\n", + "validator.expect_column_values_to_match_json_schema(\"points\", json_schema=points_schema)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# model\n", + "validator.expect_column_values_to_be_of_type(\"model\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"model\")\n", + "\n", + "# List of accepted values\n", + "field_name = \"model\"\n", + "if field_name in unique_field_values:\n", + " validator.expect_column_distinct_values_to_be_in_set(field_name, unique_field_values[field_name])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": 
{}, + "outputs": [], + "source": [ + "# type\n", + "validator.expect_column_values_to_be_of_type(\"type\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"type\")\n", + "\n", + "# List of accepted values\n", + "field_name = \"type\"\n", + "if field_name in unique_field_values:\n", + " validator.expect_column_distinct_values_to_be_in_set(field_name, unique_field_values[field_name])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# units\n", + "validator.expect_column_values_to_be_of_type(\"units\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"units\")\n", + "\n", + "# List of accepted values\n", + "field_name = \"units\"\n", + "if field_name in unique_field_values:\n", + " validator.expect_column_distinct_values_to_be_in_set(field_name, unique_field_values[field_name])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# age_death\n", + "validator.expect_column_values_to_be_of_type(\"age_death\", \"int\")\n", + "validator.expect_column_values_to_not_be_null(\"age_death\")\n", + "validator.expect_column_values_to_be_between(\"age_death\", strict_min_value=0, max_value=100)\n", + "\n", + "# List of accepted values\n", + "field_name = \"age_death\"\n", + "if field_name in unique_field_values:\n", + " validator.expect_column_distinct_values_to_be_in_set(field_name, unique_field_values[field_name])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# tissue\n", + "validator.expect_column_values_to_be_of_type(\"tissue\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"tissue\")\n", + "\n", + "# List of accepted values\n", + "field_name = \"tissue\"\n", + "if field_name in unique_field_values:\n", + " validator.expect_column_distinct_values_to_be_in_set(field_name, unique_field_values[field_name])" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# unique entries ExpectSelectColumnValuesToBeUniqueWithinRecord\n", + "validator.expect_select_column_values_to_be_unique_within_record(column_list=[\"model\", \"type\", \"age_death\", \"tissue\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "validator.save_expectation_suite(discard_failed_expectations=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Checkpoint and View Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint = context.add_or_update_checkpoint(\n", + " name=\"agora-test-checkpoint\",\n", + " validator=validator,\n", + ")\n", + "checkpoint_result = checkpoint.run()\n", + "context.view_validation_result(checkpoint_result)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build Data Docs - Click on Expectation Suite to View All Expectations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "context.build_data_docs()\n", + "context.open_data_docs()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/gx_suite_definitions/pathology.ipynb b/gx_suite_definitions/pathology.ipynb new file mode 100644 index 00000000..17583b56 --- /dev/null +++ b/gx_suite_definitions/pathology.ipynb @@ 
-0,0 +1,268 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import synapseclient\n", + "\n", + "import great_expectations as gx\n", + "import pandas as pd\n", + "import json\n", + "\n", + "context = gx.get_context(project_root_dir='../src/agoradatatools/great_expectations')\n", + "\n", + "from agoradatatools.gx import GreatExpectationsRunner\n", + "from expectations.expect_column_values_to_have_list_length import ExpectColumnValuesToHaveListLength\n", + "from expectations.expect_column_values_to_have_list_members import ExpectColumnValuesToHaveListMembers\n", + "from expectations.expect_column_values_to_have_list_members_of_type import ExpectColumnValuesToHaveListMembersOfType" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Expectation Suite for Pathology Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "syn = synapseclient.Synapse()\n", + "syn.login()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pathology_data_file = syn.get(\"syn63644533\").path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Validator Object on Data File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_json(pathology_data_file)\n", + "nested_columns = ['points']\n", + "df = GreatExpectationsRunner.convert_nested_columns_to_json(df, nested_columns)\n", + "validator = context.sources.pandas_default.read_dataframe(df)\n", + "validator.expectation_suite_name = \"pathology\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add Expectations to Validator Object For Each Column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the list 
of unique values for fields\n", + "with open(\"../src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/pathology_unique_field_values.json\", \"r\") as file:\n", + " unique_field_values = json.load(file)\n", + "print(unique_field_values)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# points\n", + "validator.expect_column_values_to_be_of_type(\"points\", \"str\")\n", + "with open(\"../src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/points.json\", \"r\") as file:\n", + " points_schema = json.load(file)\n", + "validator.expect_column_values_to_match_json_schema(\"points\", json_schema=points_schema)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# model\n", + "validator.expect_column_values_to_be_of_type(\"model\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"model\")\n", + "\n", + "# List of accepted values\n", + "field_name = \"model\"\n", + "if field_name in unique_field_values:\n", + " validator.expect_column_distinct_values_to_be_in_set(field_name, unique_field_values[field_name])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# type\n", + "validator.expect_column_values_to_be_of_type(\"type\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"type\")\n", + "\n", + "# List of accepted values\n", + "field_name = \"type\"\n", + "if field_name in unique_field_values:\n", + " validator.expect_column_distinct_values_to_be_in_set(field_name, unique_field_values[field_name])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# units\n", + "validator.expect_column_values_to_be_of_type(\"units\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"units\")\n", + "\n", + "# List of accepted values\n", + "field_name = 
\"units\"\n", + "if field_name in unique_field_values:\n", + " validator.expect_column_distinct_values_to_be_in_set(field_name, unique_field_values[field_name])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# age_death\n", + "validator.expect_column_values_to_be_of_type(\"age_death\", \"int\")\n", + "validator.expect_column_values_to_not_be_null(\"age_death\")\n", + "validator.expect_column_values_to_be_between(\"age_death\", strict_min_value=0, max_value=100)\n", + "\n", + "# List of accepted values\n", + "field_name = \"age_death\"\n", + "if field_name in unique_field_values:\n", + " validator.expect_column_distinct_values_to_be_in_set(field_name, unique_field_values[field_name])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# tissue\n", + "validator.expect_column_values_to_be_of_type(\"tissue\", \"str\")\n", + "validator.expect_column_values_to_not_be_null(\"tissue\")\n", + "\n", + "# List of accepted values\n", + "field_name = \"tissue\"\n", + "if field_name in unique_field_values:\n", + " validator.expect_column_distinct_values_to_be_in_set(field_name, unique_field_values[field_name])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# unique entries ExpectSelectColumnValuesToBeUniqueWithinRecord\n", + "validator.expect_select_column_values_to_be_unique_within_record(column_list=[\"model\", \"type\", \"age_death\", \"tissue\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "validator.save_expectation_suite(discard_failed_expectations=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Checkpoint and View Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint = 
context.add_or_update_checkpoint(\n", + " name=\"agora-test-checkpoint\",\n", + " validator=validator,\n", + ")\n", + "checkpoint_result = checkpoint.run()\n", + "context.view_validation_result(checkpoint_result)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build Data Docs - Click on Expectation Suite to View All Expectations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "context.build_data_docs()\n", + "context.open_data_docs()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/modelad_test_config.yaml b/modelad_test_config.yaml index d09652a2..96e3eac1 100644 --- a/modelad_test_config.yaml +++ b/modelad_test_config.yaml @@ -1,7 +1,7 @@ destination: &dest syn51498092 staging_path: ./staging -gx_folder: none -gx_table: none +gx_folder: syn63141015 +gx_table: syn63891939 datasets: - biomarkers: files: @@ -15,6 +15,9 @@ datasets: custom_transformations: 1 column_rename: agedeath: age_death + gx_enabled: true + gx_nested_columns: + - points - pathology: files: @@ -28,3 +31,6 @@ datasets: custom_transformations: 1 column_rename: agedeath: age_death + gx_enabled: true + gx_nested_columns: + - points diff --git a/src/agoradatatools/great_expectations/gx/expectations/biomarkers.json b/src/agoradatatools/great_expectations/gx/expectations/biomarkers.json new file mode 100644 index 00000000..4e7952e5 --- /dev/null +++ 
b/src/agoradatatools/great_expectations/gx/expectations/biomarkers.json @@ -0,0 +1,191 @@ +{ + "data_asset_type": null, + "expectation_suite_name": "biomarkers", + "expectations": [ + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "points", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_match_json_schema", + "kwargs": { + "column": "points", + "json_schema": { + "$id": "http://example.com/example.json", + "$schema": "https://json-schema.org/draft/2019-09/schema", + "default": [], + "items": { + "properties": { + "genotype": { + "type": "string" + }, + "measurement": { + "type": [ + "number", + "integer" + ] + }, + "sex": { + "type": "string" + } + }, + "required": [ + "genotype", + "measurement", + "sex" + ], + "title": "A Schema", + "type": "object" + }, + "title": "Points Schema", + "type": "array" + } + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "model", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "model" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_distinct_values_to_be_in_set", + "kwargs": { + "column": "model", + "value_set": [ + "3xTG-AD", + "5xFAD", + "Trem2-R47H_NSS", + "Abca7*v1599M" + ] + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "type", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "type" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_distinct_values_to_be_in_set", + "kwargs": { + "column": "type", + "value_set": [ + "Insoluble Abeta40", + "Insoluble Abeta42", + "Soluble Abeta40", + "Soluble Abeta42", + "NfL" + ] + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": 
"units", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "units" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "age_death", + "type_": "int" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "age_death" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "age_death", + "max_value": 100, + "strict_min_value": 0 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "tissue", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "tissue" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_distinct_values_to_be_in_set", + "kwargs": { + "column": "tissue", + "value_set": [ + "cerebral cortex", + "hippocampus", + "plasma" + ] + }, + "meta": {} + }, + { + "expectation_type": "expect_select_column_values_to_be_unique_within_record", + "kwargs": { + "column_list": [ + "model", + "type", + "age_death", + "tissue", + "units" + ] + }, + "meta": {} + } + ], + "ge_cloud_id": null, + "meta": { + "great_expectations_version": "0.18.1" + } +} diff --git a/src/agoradatatools/great_expectations/gx/expectations/pathology.json b/src/agoradatatools/great_expectations/gx/expectations/pathology.json new file mode 100644 index 00000000..f0c0803d --- /dev/null +++ b/src/agoradatatools/great_expectations/gx/expectations/pathology.json @@ -0,0 +1,193 @@ +{ + "data_asset_type": null, + "expectation_suite_name": "pathology", + "expectations": [ + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "points", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_match_json_schema", + "kwargs": { + "column": 
"points", + "json_schema": { + "$id": "http://example.com/example.json", + "$schema": "https://json-schema.org/draft/2019-09/schema", + "default": [], + "items": { + "properties": { + "genotype": { + "type": "string" + }, + "measurement": { + "type": [ + "number", + "integer" + ] + }, + "sex": { + "type": "string" + } + }, + "required": [ + "genotype", + "measurement", + "sex" + ], + "title": "A Schema", + "type": "object" + }, + "title": "Points Schema", + "type": "array" + } + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "model", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "model" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_distinct_values_to_be_in_set", + "kwargs": { + "column": "model", + "value_set": [ + "3xTG-AD", + "5xFAD", + "Trem2-R47H_NSS", + "Abca7*V1599M" + ] + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "type", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "type" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_distinct_values_to_be_in_set", + "kwargs": { + "column": "type", + "value_set": [ + "Microglia Cell Density (IBA1)", + "Plaque Density (Thio-S)", + "Plaque Size (Thio-S)", + "Astrocyte Cell Density (GFAP)", + "Astrocyte Cell Density (S100B)", + "Phospho-tau (AT8)", + "Dystrophic Neurites (LAMP1)", + "Tau (HT7)" + ] + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "units", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "units" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "age_death", + "type_": "int" 
+ }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "age_death" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_between", + "kwargs": { + "column": "age_death", + "max_value": 100, + "strict_min_value": 0 + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_be_of_type", + "kwargs": { + "column": "tissue", + "type_": "str" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_values_to_not_be_null", + "kwargs": { + "column": "tissue" + }, + "meta": {} + }, + { + "expectation_type": "expect_column_distinct_values_to_be_in_set", + "kwargs": { + "column": "tissue", + "value_set": [ + "hippocampus", + "cerebral cortex" + ] + }, + "meta": {} + }, + { + "expectation_type": "expect_select_column_values_to_be_unique_within_record", + "kwargs": { + "column_list": [ + "model", + "type", + "age_death", + "tissue", + "units" + ] + }, + "meta": {} + } + ], + "ge_cloud_id": null, + "meta": { + "great_expectations_version": "0.18.1" + } +} diff --git a/src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/biomarkers_unique_field_values.json b/src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/biomarkers_unique_field_values.json new file mode 100644 index 00000000..25aac928 --- /dev/null +++ b/src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/biomarkers_unique_field_values.json @@ -0,0 +1,24 @@ +{ + "model": [ + "3xTG-AD", + "5xFAD", + "Trem2-R47H_NSS", + "Abca7*v1599M" + ], + "type": [ + "Insoluble Abeta40", + "Insoluble Abeta42", + "Soluble Abeta40", + "Soluble Abeta42", + "NfL" + ], + "tissue": [ + "cerebral cortex", + "hippocampus", + "plasma" + ], + "sex": [ + "female", + "male" + ] +} diff --git a/src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/generate_field_validation_lists.ipynb b/src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/generate_field_validation_lists.ipynb new file 
mode 100644 index 00000000..6b17e27c --- /dev/null +++ b/src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/generate_field_validation_lists.ipynb @@ -0,0 +1,149 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generate JSON Files of Expected Field Values for Datasets\n", + "We need to identify and standardize all possible values for a set of fields in the MODEL-AD immunohisto data (biomarkers and pathology - or more). Although it is not ideal to create a validation set directly from the data, we decided it would be best to start here and manually update the lists as needed.\n", + "\n", + "#### General steps:\n", + "1. Define your datasets\n", + "2. Download the data\n", + "3. Get unique set of values for each field of interest\n", + "4. Output the information to a JSON file to be read during gx validation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from agoradatatools.etl import extract, utils" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### User specified values\n", + "You must create a dataset object with the following structure.\n", + "```\n", + "datasets = {\n", + " \"dataset_name\": {\n", + " \"synapse_id\": \"ID\",\n", + " \"fields\": {\n", + " \"field_name\": [],\n", + " \"field_name\": [],\n", + " ...\n", + " }\n", + " },\n", + " ...\n", + "}\n", + "```\n", + "If you want to extract more fields, add '\"field_name\": []' to the \"fields\" dictionary. After running the notebook, the unique field values will be stored as a list for each field." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# User specified values\n", + "\n", + "datasets = {\n", + " \"biomarkers\": {\n", + " \"synapse_id\": \"syn61250724.1\",\n", + " \"fields\": {\n", + " \"model\": [],\n", + " \"type\": [],\n", + " \"tissue\": [],\n", + " \"sex\": []\n", + " }\n", + " },\n", + " \"pathology\": {\n", + " \"synapse_id\": \"syn61357279\",\n", + " \"fields\": {\n", + " \"model\": [],\n", + " \"type\": [],\n", + " \"tissue\": [],\n", + " \"sex\": []\n", + " }\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Log into Synapse\n", + "syn = utils._login_to_synapse()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download data as dataframes\n", + "for dataset in datasets:\n", + " df = extract.get_entity_as_df(syn_id=datasets[dataset][\"synapse_id\"], source=\"csv\", syn=syn)\n", + " df = utils.standardize_column_names(df=df)\n", + " df = utils.standardize_values(df=df)\n", + " datasets[dataset][\"df\"] = df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get unique values for each field\n", + "for dataset in datasets:\n", + " for field in datasets[dataset][\"fields\"]:\n", + " datasets[dataset][\"fields\"][field] = datasets[dataset][\"df\"][field].unique().tolist()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write to json\n", + "for dataset in datasets:\n", + " with open(f\"{dataset}_unique_field_values.json\", \"w\") as f:\n", + " json.dump(datasets[dataset][\"fields\"], f, indent=4)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "synapseclient", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/pathology_unique_field_values.json b/src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/pathology_unique_field_values.json new file mode 100644 index 00000000..ebb6d16a --- /dev/null +++ b/src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/pathology_unique_field_values.json @@ -0,0 +1,26 @@ +{ + "model": [ + "3xTG-AD", + "5xFAD", + "Trem2-R47H_NSS", + "Abca7*V1599M" + ], + "type": [ + "Microglia Cell Density (IBA1)", + "Plaque Density (Thio-S)", + "Plaque Size (Thio-S)", + "Astrocyte Cell Density (GFAP)", + "Astrocyte Cell Density (S100B)", + "Phospho-tau (AT8)", + "Dystrophic Neurites (LAMP1)", + "Tau (HT7)" + ], + "tissue": [ + "hippocampus", + "cerebral cortex" + ], + "sex": [ + "female", + "male" + ] +} diff --git a/src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/points.json b/src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/points.json new file mode 100644 index 00000000..73fbbf1a --- /dev/null +++ b/src/agoradatatools/great_expectations/gx/json_schemas/immunohisto/points.json @@ -0,0 +1,30 @@ +{ + "$schema": "https://json-schema.org/draft/2019-09/schema", + "$id": "http://example.com/example.json", + "type": "array", + "default": [], + "title": "Points Schema", + "items": { + "type": "object", + "title": "A Schema", + "required": [ + "genotype", + "measurement", + "sex" + ], + "properties": { + "genotype": { + "type": "string" + }, + "measurement": { + "type": [ + "number", + "integer" + ] + }, + "sex": { + "type": "string" + } + } + } +}