broadinstitute · gbggrant · May 16, 2022 · May 6, 2022 · rsasch · May 13, 2022
diff --git a/scripts/variantstore/InputValidation.ipynb b/scripts/variantstore/InputValidation.ipynb
@@ -0,0 +1,281 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Input Validation\n",
+    "\n",
+    "This Python notebook lets you quickly validate the inputs for a Joint Call Set before processing them.\n",
+    "To run it:\n",
+    "\n",
+    "Define the variable `sample_set_id` (below) to the name of the sample_set (in the current workspace) containing the list of samples to process\n",
-    "Define the variable `sample_set_id` (below) to the name of the sample_set (in the current workspace) containing the list of samples to process\n",
+    "Define the variable `sample_set_id` (below) to the name of the sample_set (in the current workspace) containing the list of samples to process.\n",
-    "Define the variable `sample_set_id` (below) to the name of the sample_set (in the current workspace) containing the list of samples to process\n",
+    "Define the variable `sample_set_id` (below) to the name of the sample_set (in the current workspace) containing the list of samples to process.\n",
+    "\n",
+    "The notebook will validate that:\n",
+    "- the sample set that you have listed is found\n",
+    "- there are no duplicate samples in the sample set\n",
+    "- there are no empty sample names in the sample set\n",
+    "- each sample has a corresponding reblocked_gvcf index\n",
-    "- each sample has a corresponding reblocked_gvcf index\n",
+    "- each sample has a reblocked gVCF and a corresponding reblocked gVCF index\n",
-    "- each sample has a corresponding reblocked_gvcf index\n",
+    "- each sample has a reblocked gVCF and a corresponding reblocked gVCF index\n",
+    "\n",
+    "To determine which columns hold the reblocked gVCF and its index, the notebook looks in the sample table for a column name from each of two predefined lists (see `reblocked_gvcf_fields` and `reblocked_gvcf_index_fields` below). It expects to find exactly one column name from each list in the sample table. If you are using column names that are NOT in those lists, they will need to be added to the lists.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample_set_id = \"MSSNG_20_sample_callet\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import math\n",
+    "\n",
+    "from firecloud import api as fapi\n",
+    "from tqdm import tqdm\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_field_name(possible_field_names, attribute_names, type_string):\n",
+    "    error_seen = False\n",
+    "    \n",
+    "    field_names_found = set()\n",
+    "    for field_name in possible_field_names:\n",
+    "        if (field_name in attribute_names):\n",
+    "            field_names_found.add(field_name)\n",
+    "\n",
+    "    field_name = None\n",
+    "    if (len(field_names_found) > 0):\n",
+    "        if (len(field_names_found) == 1):\n",
+    "            field_name = list(field_names_found)[0]\n",
+    "        else:\n",
+    "            error_seen = True\n",
+    "            print(f\"ERROR: There are multiple columns in the 'sample' datatable {str(field_names_found)} that potentially contain reblocked gvcfs\")\n",
+    "    else:\n",
+    "        error_seen = True\n",
+    "        print(f\"ERROR: No column for {type_string} in the 'sample' datatable\")\n",
+    "    return field_name, error_seen\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "metadata": {
+    "code_folding": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Validating inputs for: \n",
+      "Project: broad-dsp-spec-ops-fc\n",
+      "Workspace: GVS - MSSNG Example Callset\n",
+      "Found 20 samples in sample_set 'MSSNG_20_sample_callet'\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 1/1 [00:00<00:00,  2.76it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Successfully Validated GVS Inputs\n",
+      "Validated that all samples have non-empty, and unique names\n",
+      "Validated that reblocked gvcfs and indexes were found in the data model\n",
+      "Validated that all samples' reblocked gvcfs have corresponding indexes\n",
+      "\n",
+      "FYI: The name of the column in the datamodel that contains the reblocked gvcfs is: reblocked_gvcf\n",
+      "FYI: The name of the column in the datamodel that contains the reblocked gvcf indices is: reblocked_gvcf_index\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# The workspace to validate is identified by these two environment variables.\n",
+    "ws_project = os.environ['WORKSPACE_NAMESPACE']\n",
+    "ws_name = os.environ['WORKSPACE_NAME']\n",
+    "\n",
+    "print(\"Validating inputs for: \")\n",
+    "print(\"Project: \" + ws_project)\n",
+    "print(\"Workspace: \" + ws_name)\n",
+    "\n",
+    "errors_seen = False\n",
+    "\n",
+    "# Entity type (data table) whose rows are inspected for gvcf / index columns.\n",
+    "etype = \"sample\"\n",
+    "\n",
+    "# This is a list of all of the *possible* field names for reblocked gvcfs and their corresponding indices\n",
+    "reblocked_gvcf_fields = ['reblocked_gvcf', \n",
+    "                         'reblocked_gvcf_path',\n",
+    "                         'hg38_reblocked_gvcf']\n",
+    "reblocked_gvcf_index_fields = [\n",
+    "                         'reblocked_gvcf_index',\n",
+    "                         'reblocked_gvcf_index_path',\n",
+    "                         'hg38_reblocked_gvcf_index']\n",
+    "\n",
+    "\n",
+    "# Step 1: both the sample table and the sample_set table must exist.\n",
+    "entity_types = fapi.list_entity_types(ws_project, ws_name).json()\n",
+    "if ((etype not in entity_types) or (\"sample_set\" not in entity_types)):\n",
+    "    errors_seen = True\n",
+    "    print(\"ERROR: Not all expected entities (sample, sample_set) were found in workspace\")\n",
+    "\n",
+    "# Step 2: the requested sample_set must be retrievable.\n",
+    "if (not errors_seen):\n",
+    "    sample_set = fapi.get_entity(ws_project, ws_name, \"sample_set\", sample_set_id).json()\n",
+    "    if (\"attributes\" not in sample_set):\n",
+    "        errors_seen = True\n",
+    "        # Fall back to the whole payload if the response carries no 'message' key.\n",
+    "        error_message = sample_set.get(\"message\", sample_set)\n",
+    "        print(f\"ERROR: Looking up {sample_set_id}: {error_message}\")\n",
+    "    \n",
+    "# Step 3: sample names in the set must be non-empty and unique.\n",
+    "if (not errors_seen):\n",
+    "    samples_in_sample_set = set()\n",
+    "    samples_dupes = set()\n",
+    "    \n",
+    "    for entity in sample_set['attributes']['samples']['items']:\n",
+    "        sample_id = entity['entityName']\n",
+    "\n",
+    "        if sample_id in samples_in_sample_set:\n",
+    "            samples_dupes.add(sample_id)\n",
+    "        else:\n",
+    "            samples_in_sample_set.add(sample_id)\n",
+    "\n",
+    "    # Are there any empty sample_ids?\n",
+    "    if ('' in samples_in_sample_set):\n",
+    "        errors_seen = True\n",
+    "        # Drop the empty name so it is not counted below (sets have discard/remove, not delete).\n",
+    "        samples_in_sample_set.discard('')\n",
+    "        print(\"ERROR: sample_id set to an empty string.\")\n",
+    "\n",
+    "    # Are all the sample names unique?\n",
+    "    if (len(samples_dupes) > 0):\n",
+    "        errors_seen = True\n",
+    "        print(f\"ERROR: Found {str(len(samples_dupes))}  duplicate sample_ids: \")\n",
+    "        print(sorted(samples_dupes))\n",
+    "\n",
+    "    print(f\"Found {str(len(samples_in_sample_set))} samples in sample_set '{sample_set_id}'\")\n",
+    "\n",
+    "# Step 4: work out which columns hold the reblocked gvcfs and their indices.\n",
+    "if (not errors_seen):\n",
+    "    # Column names of the sample table, taken from the entity type metadata fetched above.\n",
+    "    attribute_names = entity_types[etype][\"attributeNames\"]\n",
+    "\n",
+    "    # Inspect sample table - determine possible names for reblocked_gvcfs.\n",
+    "    gvcf_field, error_seen = get_field_name(reblocked_gvcf_fields, attribute_names, \"reblocked gvcf\")\n",
+    "    if (error_seen):\n",
+    "        errors_seen = True\n",
+    "\n",
+    "    # Inspect sample table - determine possible names for reblocked_gvcf indices.\n",
+    "    gvcf_index_field, error_seen = get_field_name(reblocked_gvcf_index_fields, attribute_names, \"reblocked gvcf index\")\n",
+    "    if (error_seen):\n",
+    "        errors_seen = True\n",
+    "    \n",
+    "# Step 5: every sample of the set needs a gvcf and an index named <gvcf file name>.tbi.\n",
+    "if (not errors_seen):\n",
+    "    entity_count = entity_types[etype][\"count\"]\n",
+    "\n",
+    "    page_size = 1000\n",
+    "    num_pages = int(math.ceil(float(entity_count) / page_size))\n",
+    "\n",
+    "    # get entities by page where each page has page_size # of rows using API call\n",
+    "    for page in tqdm(range(1, num_pages + 1)):\n",
+    "        page_of_entities = fapi.get_entities_query(ws_project, ws_name, etype, page=page,\n",
+    "                                                   page_size=page_size).json()\n",
+    "\n",
+    "        for entity in page_of_entities['results']:\n",
+    "            sample_id = entity['name']\n",
+    "\n",
+    "            # Rows of the sample table that are not part of the sample set are ignored.\n",
+    "            if (sample_id not in samples_in_sample_set):\n",
+    "                continue\n",
+    "\n",
+    "            reblocked_gvcf = entity['attributes'].get(gvcf_field)\n",
+    "            reblocked_gvcf_index = entity['attributes'].get(gvcf_index_field)\n",
+    "\n",
+    "            if (reblocked_gvcf is not None) and (reblocked_gvcf_index is not None):\n",
+    "                # Only the file names are compared, not the directories they live in.\n",
+    "                reblocked_gvcf_name = reblocked_gvcf.split('/')[-1]\n",
+    "                expected_reblocked_gvcf_index_name = reblocked_gvcf_name + \".tbi\"\n",
+    "                reblocked_gvcf_index_name = reblocked_gvcf_index.split('/')[-1]\n",
+    "                if (reblocked_gvcf_index_name != expected_reblocked_gvcf_index_name):\n",
+    "                    errors_seen = True\n",
+    "                    print(f\"ERROR: Did not find expected index file (named: {expected_reblocked_gvcf_index_name}) for reblocked_gvcf: {reblocked_gvcf}\")\n",
+    "            else:\n",
+    "                errors_seen = True\n",
+    "                # Report whichever of the two values is actually missing.\n",
+    "                if (reblocked_gvcf is None):\n",
+    "                    print(f\"ERROR: reblocked_gvcf not found for sample_id: {sample_id}\")\n",
+    "                if (reblocked_gvcf_index is None):\n",
+    "                    print(f\"ERROR: reblocked_gvcf_index not found for sample_id: {sample_id}\")\n",
+    "\n",
+    "\n",
+    "# Final summary.\n",
+    "if (errors_seen):\n",
+    "    print(\"\\nErrors were seen - The inputs have not been validated\\n\")\n",
+    "else:\n",
+    "    print(\"Successfully Validated GVS Inputs\")\n",
+    "    print(\"Validated that all samples have non-empty, and unique names\")\n",
+    "    print(\"Validated that reblocked gvcfs and indices were found in the data model\")\n",
+    "    print(\"Validated that all samples' reblocked gvcfs have corresponding indices\\n\")\n",
+    "\n",
+    "    print(f\"FYI: The name of the column in the datamodel that contains the reblocked gvcfs is: {gvcf_field}\")\n",
+    "    print(f\"FYI: The name of the column in the datamodel that contains the reblocked gvcf indices is: {gvcf_index_field}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.12"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}