Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First cut at a python notebook to validate inputs. #7845

Merged
merged 1 commit into from
May 16, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
281 changes: 281 additions & 0 deletions scripts/variantstore/InputValidation.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Input Validation\n",
"\n",
"This python notebook is intended to allow you to quickly validate the inputs for a Joint Call Set.\n",
"To run it:\n",
"\n",
"Define the variable `sample_set_id` (below) to the name of the sample_set (in the current workspace) containing the list of samples to process\n",
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit

Suggested change
"Define the variable `sample_set_id` (below) to the name of the sample_set (in the current workspace) containing the list of samples to process\n",
"Define the variable `sample_set_id` (below) to the name of the sample_set (in the current workspace) containing the list of samples to process.\n",

"\n",
"The notebook will validate that:\n",
"- the sample set that you have listed is found\n",
"- there are no duplicate samples in the sample set\n",
"- there are no empty sample names in the sample set\n",
"- each sample has a corresponding reblocked_gvcf index\n",
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe

Suggested change
"- each sample has a corresponding reblocked_gvcf index\n",
"- each sample has a reblocked gVCF and a corresponding reblocked gVCF index\n",

"\n",
"In order to help determine which column the reblocked_gvcf and its index are stored in, the notebook will look for a defined column name among a predefined list of such column names (see `reblocked_gvcf_fields` and `reblocked_gvcf_index_fields` below). The script expects to find one of those column names in the sample table. If you are using column names that are NOT in those lists, they will need to be added.\n"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"sample_set_id = \"MSSNG_20_sample_callet\"\n"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import math\n",
"\n",
"from firecloud import api as fapi\n",
"from tqdm import tqdm\n"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"def get_field_name(possible_field_names, attribute_names, type_string):\n",
" error_seen = False\n",
" \n",
" field_names_found = set()\n",
" for field_name in possible_field_names:\n",
" if (field_name in attribute_names):\n",
" field_names_found.add(field_name)\n",
"\n",
" field_name = None\n",
" if (len(field_names_found) > 0):\n",
" if (len(field_names_found) == 1):\n",
" field_name = list(field_names_found)[0]\n",
" else:\n",
" error_seen = True\n",
" print(f\"ERROR: There are multiple columns in the 'sample' datatable {str(field_names_found)} that potentially contain reblocked gvcfs\")\n",
" else:\n",
" error_seen = True\n",
" print(f\"ERROR: No column for {type_string} in the 'sample' datatable\")\n",
" return field_name, error_seen\n"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"code_folding": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Validating inputs for: \n",
"Project: broad-dsp-spec-ops-fc\n",
"Workspace: GVS - MSSNG Example Callset\n",
"Found 20 samples in sample_set 'MSSNG_20_sample_callet'\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 2.76it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Successfully Validated GVS Inputs\n",
"Validated that all samples have non-empty, and unique names\n",
"Validated that reblocked gvcfs and indexes were found in the data model\n",
"Validated that all samples' reblocked gvcfs have corresponding indexes\n",
"\n",
"FYI: The name of the column in the datamodel that contains the reblocked gvcfs is: reblocked_gvcf\n",
"FYI: The name of the column in the datamodel that contains the reblocked gvcf indices is: reblocked_gvcf_index\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"ws_project = os.environ['WORKSPACE_NAMESPACE']\n",
"ws_name = os.environ['WORKSPACE_NAME']\n",
"\n",
"print(\"Validating inputs for: \")\n",
"print(\"Project: \" + ws_project)\n",
"print(\"Workspace: \" + ws_name)\n",
"\n",
"errors_seen = False\n",
"\n",
"# This is a list of all of the *possible* field names for reblocked gvcfs and their corresponding indices\n",
"reblocked_gvcf_fields = ['reblocked_gvcf', \n",
" 'reblocked_gvcf_path',\n",
" 'hg38_reblocked_gvcf']\n",
"reblocked_gvcf_index_fields = [\n",
" 'reblocked_gvcf_index',\n",
" 'reblocked_gvcf_index_path',\n",
" 'hg38_reblocked_gvcf_index']\n",
"\n",
"\n",
"entity_types = fapi.list_entity_types(ws_project, ws_name).json()\n",
"if ((\"sample\" not in entity_types) or (\"sample_set\" not in entity_types)):\n",
" errors_seen = True\n",
" print(f\"ERROR: Not all expected entities (sample, sample_set) were found in workspace\")\n",
"\n",
"if (not errors_seen):\n",
" sample_set = fapi.get_entity(ws_project, ws_name, \"sample_set\", sample_set_id).json()\n",
" if (\"attributes\" not in sample_set):\n",
" errors_seen = True\n",
" error_message = sample_set[\"message\"]\n",
" print(f\"ERROR: Looking up {sample_set_id}: {error_message}\")\n",
" \n",
"if (not errors_seen):\n",
" samples_in_sample_set = set()\n",
" samples_dupes = set()\n",
" \n",
" attributes = sample_set[\"attributes\"]\n",
" for entity in sample_set['attributes']['samples']['items']:\n",
" sample_id = entity['entityName']\n",
"\n",
" if sample_id in samples_in_sample_set:\n",
" samples_dupes.add(sample_id)\n",
" else:\n",
" samples_in_sample_set.add(sample_id) \n",
"\n",
" # Are there any empty sample_ids?\n",
" if ('' in samples_in_sample_set):\n",
" errors_seen = True\n",
" samples_in_sample_set.delete('')\n",
" print(\"ERROR: sample_id set to an empty string.\")\n",
"\n",
" # Are all the sample names unique?\n",
" if (len(samples_dupes) > 0):\n",
" errors_seen = True\n",
" print(f\"ERROR: Found {str(len(samples_dupes))} duplicate sample_ids: \")\n",
" print(list(dict.fromkeys(samples_dupes)))\n",
"\n",
" print(f\"Found {str(len(samples_in_sample_set))} samples in sample_set '{sample_set_id}'\")\n",
"\n",
"if (not errors_seen):\n",
" # Inspect sample table - determine possible names for reblocked_gvcfs.\n",
" gvcf_field, error_seen = get_field_name(reblocked_gvcf_fields, attribute_names, \"reblocked gvcf\")\n",
" if (error_seen):\n",
" errors_seen = True\n",
"\n",
" # Inspect sample table - determine possible names for reblocked_gvcf indices.\n",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fun fact: three counts of indices, five counts of indexes 😄

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I go back and forth on that.

" gvcf_index_field, error_seen = get_field_name(reblocked_gvcf_index_fields, attribute_names, \"reblocked gvcf index\")\n",
" if (error_seen):\n",
" errors_seen = True\n",
" \n",
"if (not errors_seen):\n",
" entity_count = entity_types[etype][\"count\"]\n",
"\n",
" page_size = 1000\n",
" num_pages = int(math.ceil(float(entity_count) / page_size))\n",
"\n",
" # get entities by page where each page has page_size # of rows using API call\n",
" for page in tqdm(range(1, num_pages + 1)):\n",
" page_of_entitites = fapi.get_entities_query(ws_project, ws_name, etype, page=page,\n",
" page_size=page_size).json()\n",
"\n",
" for entity in page_of_entitites['results']:\n",
" sample_id = entity['name']\n",
"\n",
" if (sample_id in samples_in_sample_set):\n",
" reblocked_gvcf = None\n",
" reblocked_gvcf_index = None\n",
"\n",
" if (gvcf_field in entity['attributes']):\n",
" reblocked_gvcf = entity['attributes'][gvcf_field]\n",
" if (gvcf_index_field in entity['attributes']):\n",
" reblocked_gvcf_index = entity['attributes'][gvcf_index_field]\n",
"\n",
" if (reblocked_gvcf is not None) and (reblocked_gvcf_index is not None):\n",
" reblocked_gvcf_name = reblocked_gvcf.split('/')[-1]\n",
" expected_reblocked_gvcf_index_name = reblocked_gvcf_name + \".tbi\"\n",
" reblocked_gvcf_index_name = reblocked_gvcf_index.split('/')[-1]\n",
" if (reblocked_gvcf_index_name != expected_reblocked_gvcf_index_name):\n",
" errors_seen = True\n",
" print(f\"ERROR: Did not find expected index file (named: {expected_reblocked_gvcf_index_name}) for reblocked_gvcf: {reblocked_gvcf}\")\n",
" else:\n",
" errors_seen = True\n",
" if (reblocked_gvcf is not None):\n",
" print(f\"ERROR: reblocked_gvcf not found for sample_id: {sample_id}\")\n",
" if (reblocked_gvcf_index is not None):\n",
" print(f\"ERROR: reblocked_gvcf_index not found for sample_id: {sample_id}\")\n",
"\n",
"\n",
"if (errors_seen):\n",
" print(\"\\nErrors were seen - The inputs have not been validated\\n\")\n",
"else:\n",
" print(\"Successfully Validated GVS Inputs\")\n",
" print(\"Validated that all samples have non-empty, and unique names\")\n",
" print(\"Validated that reblocked gvcfs and indices were found in the data model\")\n",
" print(\"Validated that all samples' reblocked gvcfs have corresponding indices\\n\")\n",
"\n",
" print(f\"FYI: The name of the column in the datamodel that contains the reblocked gvcfs is: {gvcf_field}\")\n",
" print(f\"FYI: The name of the column in the datamodel that contains the reblocked gvcf indices is: {gvcf_index_field}\")\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}