From 4042dbdf910806c988660652df40aac93830d375 Mon Sep 17 00:00:00 2001 From: rbpatt2019 Date: Tue, 6 Jul 2021 11:10:19 +0100 Subject: [PATCH] docs(.): Add documentation for lut_check Adds proper documentation for code introduced by 93248724cfc264ff82e66a635ff3ba882fce2971. Updates docstrings in relevant `scripts` files and modifies the necessary sphinx rst files. Ensures doctests continue to pass by adding pandas as a dep to the relevant environment. See: #2 --- README.rst | 4 ++-- docs/data_handling.rst | 7 +++++++ docs/data_handling_tests.rst | 6 ++++++ environments/doc_tests.txt | 1 + scripts/data_handling/request.py | 8 +++++++- scripts/multithreading/request.py | 3 +++ scripts/request.py | 8 ++++++++ 7 files changed, 34 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index b6c79b7..53f6175 100644 --- a/README.rst +++ b/README.rst @@ -103,7 +103,7 @@ The following will do the trick: xargs -n1 curl -sL | tar xzf - -After querying the github api to ge the most recent release information, +After querying the github api to get the most recent release information, we grep for the desired URL, split the line and extract the field, trim superfluous characters, @@ -195,7 +195,7 @@ as we ran it, with the software versions, as we used them. To further aid in this effort, -`nox`_ and `pre-commit` are used, +`nox`_ and `pre-commit`_ are used, which also ensures that development happens in reproducible environments. Unfortunately, diff --git a/docs/data_handling.rst b/docs/data_handling.rst index 0262e24..efcfde7 100644 --- a/docs/data_handling.rst +++ b/docs/data_handling.rst @@ -5,6 +5,13 @@ data_handling Module .. automodule:: scripts.data_handling +data_handling.request +--------------------- + +.. 
automodule:: scripts.data_handling.request + :members: + :private-members: + data_handling.process --------------------- diff --git a/docs/data_handling_tests.rst b/docs/data_handling_tests.rst index fe411d4..402b47e 100644 --- a/docs/data_handling_tests.rst +++ b/docs/data_handling_tests.rst @@ -5,6 +5,12 @@ Tests for the data_handling Module .. automodule:: tests.data_handling +Tests for the data_handling.request Submodule +--------------------------------------------- + +.. automodule:: tests.data_handling.test_request + :members: + Tests for the data_handling.process Submodule --------------------------------------------- diff --git a/environments/doc_tests.txt b/environments/doc_tests.txt index 6413abf..98ccc51 100644 --- a/environments/doc_tests.txt +++ b/environments/doc_tests.txt @@ -1,2 +1,3 @@ xdoctest==0.15.4 pytest==6.2.4 +pandas==1.2.4 diff --git a/scripts/data_handling/request.py b/scripts/data_handling/request.py index bd765f4..a921a61 100644 --- a/scripts/data_handling/request.py +++ b/scripts/data_handling/request.py @@ -29,7 +29,6 @@ def lut_check(gene: str, lut: pd.DataFrame) -> Optional[str]: # type: ignore Common reasons (at least for me!) that a gene might not be found include spelling errors and name errors (ie. using NGN2 instead of NEUROG2). 
- Parameters ---------- gene : str @@ -41,6 +40,13 @@ def lut_check(gene: str, lut: pd.DataFrame) -> Optional[str]: # type: ignore ------- Optional[str] + Example + ------- + >>> lut = pd.DataFrame.from_dict({"name": ["ASCL1"], "id": ["ENSG00000139352.3"]}) + >>> lut_check("ASCL1", lut) + 'ENSG00000139352.3' + >>> lut_check("NotAGene", lut) + """ with contextlib.suppress(IndexError): return lut.loc[lut["name"] == gene, "id"].values[0] diff --git a/scripts/multithreading/request.py b/scripts/multithreading/request.py index 48c9364..a542683 100644 --- a/scripts/multithreading/request.py +++ b/scripts/multithreading/request.py @@ -69,6 +69,9 @@ def _get_session(region: str) -> requests.Session: def gtex_request(region: str, gene: str, output: str) -> None: """Make a thead-safe gtex request against medianTranscriptExpression. + If gene is a str, then a query is made to GTEx; however, if gene is None, then + a blank file is created and no query is performed. + A thread local session is provided by a call to ``_get_session``. This allows the reuse of sessions, which, among other things, provides significant speed ups. diff --git a/scripts/request.py b/scripts/request.py index 41e3d3b..fcd9379 100644 --- a/scripts/request.py +++ b/scripts/request.py @@ -4,6 +4,14 @@ This step queries the GTEx API for transcript expression data in the region specified by the user, using a user provided list of genes names. +To simplify the user experience, +the user should provide common gene names. +These are then automatically converted to the GTEx-required Ensembl IDs +by referencing Gencode v26 as this is the version used by GTEx. +If a gene name is not found in Gencode, +then a warning is dumped to the logs, +and a blank file created to propagate this error downstream. + As the GTEx API is quite straightforward, these queries can be made using the standard `requests.session`_ object. Data were pulled from the ``gtex_v8`` dataset limited to the