Skip to content

Commit

Permalink
feat(data_handling): change process output to csv
Browse files Browse the repository at this point in the history
Change output of process to csv. This also simplifies its multithreading, eliminating the need for a complex, class-based pipeline.

BREAKING CHANGE: Real-world tests showed that it was quite challenging, if not impossible, to create two Excel files that were identical when compared byte-for-byte, even if their content looked the same by eye. To obviate this problem and to ease pipeline testing, the decision was made to move to the more reliable (and transparent) CSV format.
  • Loading branch information
rbpatt2019 committed Jul 14, 2021
1 parent 129334d commit 98f6225
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 188 deletions.
64 changes: 18 additions & 46 deletions gtexquery/data_handling/process.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,39 @@
# -*- coding: utf-8 -*-
"""Data handling for *process* step.
"""Data handling for *process* step."""

Attributes
----------
Pathlike : TypeVar
A custom type to unify string and Path
"""
import logging
from pathlib import Path
from typing import TypeVar
from typing import Union

import pandas as pd

logger = logging.getLogger(__name__)


Pathlike = TypeVar("Pathlike", Path, str)


def merge_data(
gtex_path: Pathlike, bm_path: Pathlike, mane: pd.DataFrame
) -> pd.DataFrame:
gtex_path: Union[Path, str],
bm_path: Union[Path, str],
mane: pd.DataFrame,
out_path: Union[Path, str],
) -> None:
"""Merge the data from previous pipeline queries.
Parameters
----------
gtex_path : Pathlike
gtex_path : Union[Path, str]
Path to the file containing GTEx query data.
bm_path : Pathlike
bm_path : Union[Path, str]
Path to the file containing BioMart query data.
mane : pd.DataFrame
A DataFrame containing MANE annotations
Returns
-------
pd.DataFrame
The merged DataFrame, containing the GTEx query,
the BioMart query, and the MANE annotations.
A DataFrame containing MANE annotations.
out_path : Union[Path, str]
Path to the output file.
"""
gtex = pd.read_csv(gtex_path, header=0, index_col=None)

gene = gtex["geneSymbol"].unique()[0]
logger.info(f"Processing data for gene {gene}")

bm = pd.read_csv(bm_path, header=0, index_col=None)
data = (
gtex.merge(bm, on=["geneSymbol", "gencodeId", "transcriptId"], how="outer")
Expand All @@ -50,27 +44,5 @@ def merge_data(
)
.sort_values(["median", "MANE_status"])
)
return data


def write_data(data: pd.DataFrame, writer: pd.ExcelWriter) -> None:
    """Add a DataFrame to an open Excel workbook as a new sheet.

    The sheet is named after the first unique value found in the
    ``geneSymbol`` column of ``data``.

    Note
    ----
    Call this function inside a ``with`` block, so that:

    #. The ``ExcelWriter`` is already open.
    #. It will be properly closed afterwards.

    Parameters
    ----------
    data : pd.DataFrame
        The DataFrame to be written. It must contain a ``geneSymbol`` column.
    writer : pd.ExcelWriter
        An **open** pandas ExcelWriter.
    """
    # The first unique symbol doubles as the sheet name and the log label.
    symbols = data["geneSymbol"].unique()
    gene = symbols[0]
    data.to_excel(writer, sheet_name=gene, index=False)
    logger.info(f"{gene} add to output file.")
data.to_csv(out_path, index=False)
logger.info(f"Gene {gene} processed!")
134 changes: 0 additions & 134 deletions gtexquery/multithreading/process.py

This file was deleted.

30 changes: 22 additions & 8 deletions tests/data_handling/test_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
A minimal MANE dataset
"""
from io import StringIO
from pathlib import Path

import pandas as pd

Expand All @@ -29,17 +30,19 @@
MANE: pd.DataFrame = pd.read_csv(StringIO(MANE_CONTENTS))


def test_returns_dataframe() -> None:
"""It returns a DataFrame."""
results = merge_data(
def test_writes_file(tmp_path: Path) -> None:
"""It writes a file."""
out_path = tmp_path / "out.csv"
merge_data(
CustomTempFile(GTEX_CONTENTS).filename,
CustomTempFile(BIOMART_CONTENTS).filename,
MANE,
out_path,
)
assert type(results) == pd.DataFrame
assert out_path.stat().st_size > 0


def test_results_columns() -> None:
def test_results_columns(tmp_path: Path) -> None:
"""Its columns are named correctly ."""
columns = [
"gencodeId",
Expand All @@ -61,18 +64,21 @@ def test_results_columns() -> None:
"chr_end",
"chr_strand",
]
results = merge_data(
out_path = tmp_path / "out.csv"
merge_data(
CustomTempFile(GTEX_CONTENTS).filename,
CustomTempFile(BIOMART_CONTENTS).filename,
MANE,
out_path,
)
results = pd.read_csv(out_path, index_col=None)
assert all(x in columns for x in results.columns), "Found an unexpected column."
assert len(results.columns) == len(
columns
), f"There should be {len(columns)} columns"


def test_sorted_results() -> None:
def test_sorted_results(tmp_path: Path) -> None:
"""The results are sorted by median.
As the ``merge_data`` function technically sorts on "MANE_status" as well,
Expand All @@ -82,10 +88,18 @@ def test_sorted_results() -> None:
Additionally, we cannot check the sort as most values are NaN,
and knowing the correct order would require prior knowledge about the number
of GTEx transcripts and the number with MANE status.
Parameters
----------
tmp_path : Path
pytest fixture for temporary path
"""
results = merge_data(
out_path = tmp_path / "out.csv"
merge_data(
CustomTempFile(GTEX_CONTENTS).filename,
CustomTempFile(BIOMART_CONTENTS).filename,
MANE,
out_path,
)
results = pd.read_csv(out_path, index_col=None)
assert results["median"].is_monotonic

0 comments on commit 98f6225

Please sign in to comment.