feat(data_handling): change process output to csv

Change output of process to csv. This also simplifies its multithreading, eliminating the need for a complex, class-based pipeline. BREAKING CHANGE: Real-world tests showed that it was quite challenging, if not impossible, to create two Excel files that could be compared byte-by-byte and be identical, even if their content looked the same by eye. To obviate this, and to ease pipeline testing, the decision was made to move to the more reliable (and transparent) csv format.
IMS-Bio2Core-Facility · Jul 14, 2021 · 98f6225 · 98f6225
1 parent 129334d
commit 98f6225
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 188 deletions.
diff --git a/gtexquery/data_handling/process.py b/gtexquery/data_handling/process.py
@@ -1,45 +1,39 @@
 # -*- coding: utf-8 -*-
-"""Data handling for *process* step.
+"""Data handling for *process* step."""
 
-Attributes
-----------
-Pathlike : TypeVar
-    A custom type to unify string and Path
-"""
 import logging
 from pathlib import Path
-from typing import TypeVar
+from typing import Union
 
 import pandas as pd
 
 logger = logging.getLogger(__name__)
 
 
-Pathlike = TypeVar("Pathlike", Path, str)
-
-
 def merge_data(
-    gtex_path: Pathlike, bm_path: Pathlike, mane: pd.DataFrame
-) -> pd.DataFrame:
+    gtex_path: Union[Path, str],
+    bm_path: Union[Path, str],
+    mane: pd.DataFrame,
+    out_path: Union[Path, str],
+) -> None:
     """Merge the data from previous pipeline queries.
 
     Parameters
     ----------
-    gtex_path : Pathlike
+    gtex_path : Union[Path, str]
         Path to the file containing GTEx query data.
-    bm_path : Pathlike
+    bm_path : Union[Path, str]
         Path to the file containing BioMart query data.
     mane : pd.DataFrame
-        A DataFrame containing MANE annotations
-
-    Returns
-    -------
-    pd.DataFrame
-        The merged DataFrame, containing the GTEx query,
-        the BioMart query, and the MANE annotations.
-
+        A DataFrame containing MANE annotations.
+    out_path : Union[Path, str]
+        Path to the output file.
     """
     gtex = pd.read_csv(gtex_path, header=0, index_col=None)
+
+    gene = gtex["geneSymbol"].unique()[0]
+    logger.info(f"Processing data for gene {gene}")
+
     bm = pd.read_csv(bm_path, header=0, index_col=None)
     data = (
         gtex.merge(bm, on=["geneSymbol", "gencodeId", "transcriptId"], how="outer")
@@ -50,27 +44,5 @@ def merge_data(
         )
         .sort_values(["median", "MANE_status"])
     )
-    return data
-
-
-def write_data(data: pd.DataFrame, writer: pd.ExcelWriter) -> None:
-    """Write a DataFrame to an Excel file.
-
-    Note
-    ----
-    This function is best used within a ``with`` block, so that:
-
-    #. The ``ExcelWriter`` is already open.
-    #. It will be properly closed.
-
-    Parameters
-    ----------
-    data : pd.DataFrame
-        The DataFrame to be written.
-    writer : pd.ExcelWriter
-        An **open** pandas ExcelWriter.
-
-    """
-    gene = data["geneSymbol"].unique()[0]
-    data.to_excel(writer, index=False, sheet_name=gene)
-    logger.info(f"{gene} add to output file.")
+    data.to_csv(out_path, index=False)
+    logger.info(f"Gene {gene} processed!")
diff --git a/gtexquery/multithreading/process.py b/gtexquery/multithreading/process.py
diff --git a/tests/data_handling/test_process.py b/tests/data_handling/test_process.py
@@ -14,6 +14,7 @@
     A minimal MANE dataset
 """
 from io import StringIO
+from pathlib import Path
 
 import pandas as pd
 
@@ -29,17 +30,19 @@
 MANE: pd.DataFrame = pd.read_csv(StringIO(MANE_CONTENTS))
 
 
-def test_returns_dataframe() -> None:
-    """It returns a DataFrame."""
-    results = merge_data(
+def test_writes_file(tmp_path: Path) -> None:
+    """It writes a file."""
+    out_path = tmp_path / "out.csv"
+    merge_data(
         CustomTempFile(GTEX_CONTENTS).filename,
         CustomTempFile(BIOMART_CONTENTS).filename,
         MANE,
+        out_path,
     )
-    assert type(results) == pd.DataFrame
+    assert out_path.stat().st_size > 0
 
 
-def test_results_columns() -> None:
+def test_results_columns(tmp_path: Path) -> None:
     """Its columns are named correctly ."""
     columns = [
         "gencodeId",
@@ -61,18 +64,21 @@ def test_results_columns() -> None:
         "chr_end",
         "chr_strand",
     ]
-    results = merge_data(
+    out_path = tmp_path / "out.csv"
+    merge_data(
         CustomTempFile(GTEX_CONTENTS).filename,
         CustomTempFile(BIOMART_CONTENTS).filename,
         MANE,
+        out_path,
     )
+    results = pd.read_csv(out_path, index_col=None)
     assert all(x in columns for x in results.columns), "Found an unexpected column."
     assert len(results.columns) == len(
         columns
     ), f"There should be {len(columns)} columns"
 
 
-def test_sorted_results() -> None:
+def test_sorted_results(tmp_path: Path) -> None:
     """The results are sorted by median.
 
     As the ``merge_data`` function technically sorts on "MANE_status" as well,
@@ -82,10 +88,18 @@ def test_sorted_results() -> None:
     Additionally, we cannot check the sort as most values are NaN,
     and knowing the correct order would require prior knowledge about the number
     of GTEx transcripts and the number with MANE status.
+
+    Parameters
+    ----------
+    tmp_path : Path
+        pytest fixture for temporary path
     """
-    results = merge_data(
+    out_path = tmp_path / "out.csv"
+    merge_data(
         CustomTempFile(GTEX_CONTENTS).filename,
         CustomTempFile(BIOMART_CONTENTS).filename,
         MANE,
+        out_path,
     )
+    results = pd.read_csv(out_path, index_col=None)
     assert results["median"].is_monotonic