From 4042dbdf910806c988660652df40aac93830d375 Mon Sep 17 00:00:00 2001
From: rbpatt2019 <rb.patterson.cross@gmail.com>
Date: Tue, 6 Jul 2021 11:10:19 +0100
Subject: [PATCH] docs(.): Add documentation for lut_check

Adds proper documentation for code introduced by
93248724cfc264ff82e66a635ff3ba882fce2971. Updates docstrings in relevant
`scripts` files and modifies the necessary sphinx rst files. Ensures
doctests continue to pass by adding pandas as a dep to the relevant
environment.

See: #2
---
 README.rst                        | 4 ++--
 docs/data_handling.rst            | 7 +++++++
 docs/data_handling_tests.rst      | 6 ++++++
 environments/doc_tests.txt        | 1 +
 scripts/data_handling/request.py  | 8 +++++++-
 scripts/multithreading/request.py | 3 +++
 scripts/request.py                | 8 ++++++++
 7 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/README.rst b/README.rst
index b6c79b7..53f6175 100644
--- a/README.rst
+++ b/README.rst
@@ -103,7 +103,7 @@ The following will do the trick:
    xargs -n1 curl -sL |
    tar xzf -
 
-After querying the github api to ge the most recent release information,
+After querying the GitHub API to get the most recent release information,
 we grep for the desired URL,
 split the line and extract the field,
 trim superfluous characters,
@@ -195,7 +195,7 @@ as we ran it,
 with the software versions,
 as we used them.
 To further aid in this effort,
-`nox`_ and `pre-commit` are used,
+`nox`_ and `pre-commit`_ are used,
 which also ensures that development happens in reproducible environments.
 
 Unfortunately,
diff --git a/docs/data_handling.rst b/docs/data_handling.rst
index 0262e24..efcfde7 100644
--- a/docs/data_handling.rst
+++ b/docs/data_handling.rst
@@ -5,6 +5,13 @@ data_handling Module
 
 .. automodule:: scripts.data_handling
 
+data_handling.request
+---------------------
+
+.. automodule:: scripts.data_handling.request
+   :members:
+   :private-members:
+
 data_handling.process
 ---------------------
 
diff --git a/docs/data_handling_tests.rst b/docs/data_handling_tests.rst
index fe411d4..402b47e 100644
--- a/docs/data_handling_tests.rst
+++ b/docs/data_handling_tests.rst
@@ -5,6 +5,12 @@ Tests for the data_handling Module
 
 .. automodule:: tests.data_handling
 
+Tests for the data_handling.request Submodule
+---------------------------------------------
+
+.. automodule:: tests.data_handling.test_request
+   :members:
+
 Tests for the data_handling.process Submodule
 ---------------------------------------------
 
diff --git a/environments/doc_tests.txt b/environments/doc_tests.txt
index 6413abf..98ccc51 100644
--- a/environments/doc_tests.txt
+++ b/environments/doc_tests.txt
@@ -1,2 +1,3 @@
 xdoctest==0.15.4
 pytest==6.2.4
+pandas==1.2.4
diff --git a/scripts/data_handling/request.py b/scripts/data_handling/request.py
index bd765f4..a921a61 100644
--- a/scripts/data_handling/request.py
+++ b/scripts/data_handling/request.py
@@ -29,7 +29,6 @@ def lut_check(gene: str, lut: pd.DataFrame) -> Optional[str]:  # type: ignore
     Common reasons (at least for me!) that a gene might not be found include
     spelling errors and name errors (ie. using NGN2 instead of NEUROG2).
 
-
     Parameters
     ----------
     gene : str
@@ -41,6 +40,13 @@ def lut_check(gene: str, lut: pd.DataFrame) -> Optional[str]:  # type: ignore
     -------
     Optional[str]
 
+    Examples
+    --------
+    >>> lut = pd.DataFrame.from_dict({"name": ["ASCL1"], "id": ["ENSG00000139352.3"]})
+    >>> lut_check("ASCL1", lut)
+    'ENSG00000139352.3'
+    >>> lut_check("NotAGene", lut)
+
     """
     with contextlib.suppress(IndexError):
         return lut.loc[lut["name"] == gene, "id"].values[0]
diff --git a/scripts/multithreading/request.py b/scripts/multithreading/request.py
index 48c9364..a542683 100644
--- a/scripts/multithreading/request.py
+++ b/scripts/multithreading/request.py
@@ -69,6 +69,9 @@ def _get_session(region: str) -> requests.Session:
 def gtex_request(region: str, gene: str, output: str) -> None:
     """Make a thead-safe gtex request against medianTranscriptExpression.
 
+    If ``gene`` is a str, a query is made to GTEx; if ``gene`` is ``None``,
+    a blank file is created and no query is performed.
+
     A thread local session is provided by a call to ``_get_session``.
     This allows the reuse of sessions, which, among other things,
     provides significant speed ups.
diff --git a/scripts/request.py b/scripts/request.py
index 41e3d3b..fcd9379 100644
--- a/scripts/request.py
+++ b/scripts/request.py
@@ -4,6 +4,14 @@
 This step queries the GTEx API for transcript expression data in the region
 specified by the user,
 using a user provided list of genes names.
+To simplify the user experience,
+the user should provide common gene names.
+These are then automatically converted to the GTEx-required Ensembl IDs
+by referencing Gencode v26, as this is the version used by GTEx.
+If a gene name is not found in Gencode,
+then a warning is dumped to the logs,
+and a blank file created to propagate this error downstream.
+
 As the GTEx API is quite straightforward,
 these queries can be made using the standard `requests.session`_ object.
 Data were pulled from the ``gtex_v8`` dataset limited to the