Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MG45 - ADT: Pathology data ETL #149

Merged
merged 5 commits into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions modelad_test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,16 @@ datasets:
custom_transformations: 1
column_rename:
agedeath: age_death

# Pathology dataset entry: a single CSV input file, written out as JSON.
- pathology:
    files:
      - name: pathology
        id: syn61357279
        format: csv
    final_format: json
    # Provenance lists the same id as the input file above.
    provenance:
      - syn61357279
    # Alias to the `dest` anchor, which is defined outside this hunk.
    destination: *dest
    # NOTE(review): presumably this flag enables the custom-transformation step;
    # process.py routes the "pathology" dataset name to immunohisto_transform.
    custom_transformations: 1
    # immunohisto_transform groups on "age_death" by default, so the raw
    # "agedeath" column is renamed to match.
    column_rename:
      agedeath: age_death
6 changes: 4 additions & 2 deletions src/agoradatatools/etl/transform/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
)
from agoradatatools.etl.transform.team_info import transform_team_info
from agoradatatools.etl.transform.proteomics import transform_proteomics
from agoradatatools.etl.transform.biomarkers import transform_biomarkers
from agoradatatools.etl.transform.immunohisto_transform import (
immunohisto_transform,
)

__all__ = [
"transform_distribution_data",
Expand All @@ -29,5 +31,5 @@
"transform_rnaseq_differential_expression",
"transform_team_info",
"transform_proteomics",
"transform_biomarkers",
"immunohisto_transform",
]
46 changes: 0 additions & 46 deletions src/agoradatatools/etl/transform/biomarkers.py

This file was deleted.

52 changes: 52 additions & 0 deletions src/agoradatatools/etl/transform/immunohisto_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""
This module contains the transformation logic for the biomarkers and pathology datasets.
This is for the Model AD project.
"""

from typing import Dict, List, Optional

import pandas as pd


def immunohisto_transform(
    datasets: Dict[str, pd.DataFrame],
    dataset_name: str,
    group_columns: Optional[List[str]] = None,
    extra_columns: Optional[List[str]] = None,
    extra_column_name: str = "points",
) -> pd.DataFrame:
    """Group one dataset by ``group_columns`` and nest the remaining values.

    Looks up ``datasets[dataset_name]``, replaces missing values with the
    string ``"none"``, and produces one output row per distinct combination of
    ``group_columns``. Each output row carries the group key values plus a
    list of ``{extra_column: value}`` records (one per source row in the
    group) stored under ``extra_column_name``.

    Args:
        datasets (Dict[str, pd.DataFrame]): Dictionary of dataset names mapped
            to their DataFrame.
        dataset_name (str): The name of the dataset to transform.
        group_columns (List[str], optional): Columns to group by. Defaults to
            ['model', 'type', 'age_death', 'tissue', 'units'].
        extra_columns (List[str], optional): Columns collected into the nested
            records of each group. Defaults to
            ['genotype', 'measurement', 'sex'].
        extra_column_name (str, optional): Name of the output column holding
            the nested records. Defaults to 'points'.

    Returns:
        pd.DataFrame: One row per group, with the group columns followed by
        ``extra_column_name``.

    Raises:
        KeyError: If ``dataset_name`` is not a key of ``datasets``.
        ValueError: If any of ``group_columns`` or ``extra_columns`` is not a
            column of the dataset.
    """
    # Build the defaults per call instead of using list literals in the
    # signature, which would be shared by every invocation. Copying also
    # lets callers pass any sequence (e.g. a tuple) of column names.
    if group_columns is None:
        group_columns = ["model", "type", "age_death", "tissue", "units"]
    else:
        group_columns = list(group_columns)
    if extra_columns is None:
        extra_columns = ["genotype", "measurement", "sex"]
    else:
        extra_columns = list(extra_columns)

    dataset = datasets[dataset_name]

    missing_columns = [
        col for col in group_columns + extra_columns if col not in dataset.columns
    ]
    if missing_columns:
        raise ValueError(
            f"{dataset_name} dataset missing columns: {', '.join(missing_columns)}"
        )

    # pandas' groupby skips rows whose key is NaN by default, so missing
    # values are replaced first; fillna returns a new frame, leaving the
    # caller's DataFrame untouched.
    dataset = dataset.fillna("none")

    data_rows = []
    for group_key, group in dataset.groupby(group_columns):
        # Older pandas versions yield a bare scalar (not a 1-tuple) when
        # grouping by a single column; normalise so zip pairs names with values.
        if not isinstance(group_key, tuple):
            group_key = (group_key,)
        entry = dict(zip(group_columns, group_key))
        entry[extra_column_name] = group[extra_columns].to_dict("records")
        data_rows.append(entry)

    return pd.DataFrame(data_rows)
6 changes: 4 additions & 2 deletions src/agoradatatools/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,10 @@ def apply_custom_transformations(
if dataset_name in ["proteomics", "proteomics_tmt", "proteomics_srm"]:
df = datasets[dataset_name]
return transform.transform_proteomics(df=df)
if dataset_name == "biomarkers":
return transform.transform_biomarkers(datasets=datasets)
if dataset_name in ["biomarkers", "pathology"]:
return transform.immunohisto_transform(
datasets=datasets, dataset_name=dataset_name
)
else:
return None

Expand Down
73 changes: 0 additions & 73 deletions tests/transform/test_biomarkers.py

This file was deleted.

85 changes: 85 additions & 0 deletions tests/transform/test_immunohisto_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import os

import pandas as pd
import pytest

from agoradatatools.etl.transform.immunohisto_transform import (
immunohisto_transform,
)


class TestTransformGeneralModelAD:
    """File-driven tests for ``immunohisto_transform``.

    Inputs are CSV files under ``<data_files_path>/input`` and expected
    results are JSON files under ``<data_files_path>/output``.
    """

    data_files_path = "tests/test_assets/immunohisto_transform"

    # (input CSV, expected output JSON) pairs; ``pass_test_ids`` labels them
    # in the same order.
    pass_test_data = [
        (
            "immunohisto_transform_good_test_input.csv",
            "immunohisto_transform_good_test_output.json",
        ),
        (
            "immunohisto_transform_duplicated_input.csv",
            "immunohisto_transform_duplicated_output.json",
        ),
        (
            "immunohisto_transform_none_input.csv",
            "immunohisto_transform_none_output.json",
        ),
        (
            "immunohisto_transform_missing_input.csv",
            "immunohisto_transform_missing_output.json",
        ),
        (
            "immunohisto_transform_extra_column.csv",
            "immunohisto_transform_extra_column_output.json",
        ),
    ]
    pass_test_ids = [
        "Pass with good fake data",
        "Pass with duplicated data",
        "Pass with none data",
        "Pass with missing data",
        "Pass with extra column",
    ]

    # Input CSVs for which the transform is expected to raise.
    fail_test_data = ["immunohisto_transform_missing_column.csv"]
    fail_test_ids = ["Fail with missing column"]

    def _load_input(self, file_name):
        """Read ``file_name`` from the ``input`` directory of the test assets."""
        csv_path = os.path.join(self.data_files_path, "input", file_name)
        return pd.read_csv(csv_path)

    @pytest.mark.parametrize(
        "immunohisto_transform_file, expected_output_file",
        pass_test_data,
        ids=pass_test_ids,
    )
    def test_immunohisto_transform_should_pass(
        self, immunohisto_transform_file, expected_output_file
    ):
        """The transformed input frame must equal the frame read from the expected JSON."""
        input_datasets = {
            "immunohisto_transform": self._load_input(immunohisto_transform_file)
        }
        transformed = immunohisto_transform(
            datasets=input_datasets,
            dataset_name="immunohisto_transform",
        )
        output_df = pd.DataFrame(transformed)

        expected_path = os.path.join(
            self.data_files_path, "output", expected_output_file
        )
        expected_df = pd.read_json(expected_path)

        pd.testing.assert_frame_equal(output_df, expected_df)

    @pytest.mark.parametrize(
        "immunohisto_transform_file", fail_test_data, ids=fail_test_ids
    )
    def test_immunohisto_transform_should_fail(
        self, immunohisto_transform_file, error_type: BaseException = ValueError
    ):
        """The transform must raise ``error_type`` (``ValueError`` by default)."""
        input_datasets = {
            "immunohisto_transform": self._load_input(immunohisto_transform_file)
        }
        with pytest.raises(error_type):
            immunohisto_transform(
                datasets=input_datasets,
                dataset_name="immunohisto_transform",
            )