tests(data_handling): increase coverage

Adds test coverage for the scripts.data_handling.process module. This necessitated adding pandas as a testing dependency to handle some data checks.
IMS-Bio2Core-Facility · Jun 11, 2021 · 5c34730 · 5c34730
1 parent b387600
commit 5c34730
Show file tree

Hide file tree

Showing 3 changed files with 81 additions and 0 deletions.
diff --git a/environments/tests.txt b/environments/tests.txt
@@ -5,3 +5,4 @@ pytest-sugar==0.9.4
 pytest-cov==2.12.0
 coverage==5.5
 typeguard==2.12.0
+pandas==1.2.4
diff --git a/tests/data_handling/__init__.py b/tests/data_handling/__init__.py
@@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+"""Tests for the data_handling submodule."""
diff --git a/tests/data_handling/test_process.py b/tests/data_handling/test_process.py
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+"""Tests for the scripts.data_handling.process submodule.
+
+These unit tests are designed to test that the results of the query
+are handled correctly.
+They assume the data returned by the API query is formatted correctly,
+as tests of data returned by real-world API queries are best left to integration tests.
+As such,
+representative data is included in the tests.data module.
+
+Attributes
+----------
+GTEX_PATH : Path
+    Path to representative GTEx data.
+BM_PATH : Path
+    Path to representative BioMart data.
+MANE : pd.DataFrame
+    Representative MANE data.
+"""
+from pathlib import Path
+
+import pandas as pd
+
+from scripts.data_handling.process import merge_data
+
+# Anchor the fixtures on this file's location (tests/data_handling/) rather than
+# on the current working directory, so the module can be imported and the
+# tests collected from any directory.
+_DATA_DIR = Path(__file__).resolve().parent.parent / "data"
+
+GTEX_PATH = _DATA_DIR / "gtex_message.csv"
+BM_PATH = _DATA_DIR / "biomart_message.csv"
+MANE: pd.DataFrame = pd.read_csv(_DATA_DIR / "mane_minimal.csv", index_col=0)
+
+
+def test_returns_dataframe() -> None:
+    """It returns a DataFrame (or a subclass of one)."""
+    results = merge_data(GTEX_PATH, BM_PATH, MANE)
+    assert isinstance(results, pd.DataFrame)
+
+
+def test_results_columns() -> None:
+    """Its columns are named correctly.
+
+    The check is order-independent: every returned column must be expected,
+    every expected column must be returned, and the counts must match
+    (which also rules out duplicated column names).
+    """
+    columns = [
+        "gencodeId",
+        "geneSymbol",
+        "tissueSiteDetailId",
+        "transcriptId",
+        "median",
+        "unit",
+        "datasetId",
+        "refseq",
+        "#NCBI_GeneID",
+        "HGNC_ID",
+        "name",
+        "RefSeq_prot",
+        "Ensembl_prot",
+        "MANE_status",
+        "GRCh38_chr",
+        "chr_start",
+        "chr_end",
+        "chr_strand",
+    ]
+    results = merge_data(GTEX_PATH, BM_PATH, MANE)
+    found = set(results.columns)
+    assert found <= set(columns), "Found an unexpected column."
+    # Without this, a duplicated result column could mask a missing one
+    # while still satisfying the length check below.
+    assert set(columns) <= found, "An expected column is missing."
+    assert len(results.columns) == len(
+        columns
+    ), f"There should be {len(columns)} columns"
+
+
+def test_sorted_results() -> None:
+    """The results are sorted by median (non-decreasing).
+
+    As the ``merge_data`` function technically sorts on "MANE_status" as well,
+    it would be ideal to test that sort, too.
+    However, it is impossible to know in advance how many will have this status,
+    so we cannot check count.
+    Additionally, we cannot check the sort as most values are NaN,
+    and knowing the correct order would require prior knowledge about the number
+    of GTEx transcripts and the number with MANE status.
+    """
+    results = merge_data(GTEX_PATH, BM_PATH, MANE)
+    assert results["median"].is_monotonic_increasing
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# -- coding: utf-8 --
		"""Tests for the data_handling submodule."""