Skip to content

Commit

Permalink
tests(data_handling): increase coverage
Browse files Browse the repository at this point in the history
Adds test coverage for the scripts.data_handling.process module. This
necessitated adding pandas as a testing dependency to handle some data
checks.
  • Loading branch information
rbpatt2019 committed Jun 11, 2021
1 parent b387600 commit 5c34730
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 0 deletions.
1 change: 1 addition & 0 deletions environments/tests.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ pytest-sugar==0.9.4
pytest-cov==2.12.0
coverage==5.5
typeguard==2.12.0
pandas==1.2.4
2 changes: 2 additions & 0 deletions tests/data_handling/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-
"""Tests for the data_handling submodule."""
78 changes: 78 additions & 0 deletions tests/data_handling/test_process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-
"""Tests for the scripts.data_handling.process submodule.
These unit tests are designed to test that the results of the query
are handled correctly.
They assume the data returned by the API query is formatted correctly,
as tests of data returned by real-world API queries are best left to integration tests.
As such,
representative data is included in the tests.data module.
Attributes
----------
GTEX_PATH : Path
Path to representative GTEx data.
BM_PATH : Path
Path to representative BioMart data.
MANE : pd.DataFrame
Representative MANE data.
"""
from pathlib import Path

import pandas as pd

from scripts.data_handling.process import merge_data

# Fixture locations are relative paths, so they resolve against the working
# directory the test run is started from.
GTEX_PATH = Path("tests") / "data" / "gtex_message.csv"
BM_PATH = Path("tests") / "data" / "biomart_message.csv"
# The MANE table is loaded once at import time, using the first CSV column
# as the index.
MANE: pd.DataFrame = pd.read_csv(
    Path("tests") / "data" / "mane_minimal.csv",
    index_col=0,
)


def test_returns_dataframe() -> None:
    """It returns a DataFrame."""
    results = merge_data(GTEX_PATH, BM_PATH, MANE)
    # isinstance is the idiomatic type check; unlike ``type(x) == T`` it also
    # accepts DataFrame subclasses.
    assert isinstance(results, pd.DataFrame)


def test_results_columns() -> None:
    """Its columns are named correctly.

    Column order is not checked. Every expected name must be present,
    no other name may appear, and the total count must match so that a
    duplicated column cannot mask a missing one.
    """
    columns = [
        "gencodeId",
        "geneSymbol",
        "tissueSiteDetailId",
        "transcriptId",
        "median",
        "unit",
        "datasetId",
        "refseq",
        "#NCBI_GeneID",
        "HGNC_ID",
        "name",
        "RefSeq_prot",
        "Ensembl_prot",
        "MANE_status",
        "GRCh38_chr",
        "chr_start",
        "chr_end",
        "chr_strand",
    ]
    results = merge_data(GTEX_PATH, BM_PATH, MANE)
    found = list(results.columns)

    # Check both directions so the failure message names the offending columns.
    unexpected = [col for col in found if col not in columns]
    missing = [col for col in columns if col not in found]

    assert not unexpected, f"Found an unexpected column: {unexpected}"
    assert not missing, f"Expected columns are missing: {missing}"
    # With both checks above passing, a length mismatch can only mean duplicates.
    assert len(found) == len(columns), f"There should be {len(columns)} columns"


def test_sorted_results() -> None:
    """The results are sorted by median.

    As the ``merge_data`` function technically sorts on "MANE_status" as well,
    it would be ideal to test that sort, too.
    However, it is impossible to know in advance how many will have this status,
    so we cannot check count.
    Additionally, we cannot check the sort as most values are NaN,
    and knowing the correct order would require prior knowledge about the number
    of GTEx transcripts and the number with MANE status.
    """
    results = merge_data(GTEX_PATH, BM_PATH, MANE)
    # ``Series.is_monotonic`` was an alias of ``is_monotonic_increasing`` and has
    # been removed from newer pandas releases; the explicit name works on both.
    assert results["median"].is_monotonic_increasing

0 comments on commit 5c34730

Please sign in to comment.