Skip to content

Commit

Permalink
load_profiles should work with PurePath types (#285)
Browse files Browse the repository at this point in the history
* add test that breaks upon purepath

* add back pytest

* switch type check to purepath to capture both path and purepath instance types

* Update pycytominer/tests/test_cyto_utils/test_load.py

Co-authored-by: Dave Bunten <[email protected]>

* add docstring

* fix type hint

* move file path resolve to test_parquet function

this change required propagation to other files

* check no file found

adding various other improvements as well, including testing load_profiles() with parquet files and other software gardening

* remove extra space

* add clarifying comments and update docstring

---------

Co-authored-by: Dave Bunten <[email protected]>
  • Loading branch information
gwaybio and d33bs authored Jun 5, 2023
1 parent 470fc56 commit 182745a
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 46 deletions.
40 changes: 15 additions & 25 deletions pycytominer/cyto_utils/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@
from typing import Union


def is_path_a_parquet_file(file: Union[str, pathlib.Path]) -> bool:
def is_path_a_parquet_file(file: Union[str, pathlib.PurePath]) -> bool:
"""Checks if the provided file path is a parquet file.
Identify parquet files by inspecting the file extensions.
If the file does not end with `parquet`, this will return False, else True.
Parameters
----------
file : Union[str, pathlib.Path]
file : Union[str, pathlib.PurePath]
path to parquet file
Returns
Expand All @@ -30,14 +30,13 @@ def is_path_a_parquet_file(file: Union[str, pathlib.Path]) -> bool:
FileNotFoundError
Raised if the provided path in the `file` does not exist
"""
# type checking
if not isinstance(file, (str, pathlib.Path)):
raise TypeError(f"file must be a str or pathlib.Path not {type(file)}")

# Convert str to pathlib.Path object and absolute path
# check if the file also exists while converting to absolute path
if isinstance(file, str):
file = pathlib.PurePath(file)
try:
# strict=true tests if path exists
file = pathlib.Path(file).resolve(strict=True)
except FileNotFoundError as e:
print("load_profiles() didn't find the path.", e, sep="\n")

# Check if file path is a parquet file
if file.suffix.lower() == ".parquet":
Expand Down Expand Up @@ -77,7 +76,7 @@ def load_profiles(profiles):
Parameters
----------
profiles : {str, pandas.DataFrame}
profiles : {str, pathlib.Path, pandas.DataFrame}
file location or actual pandas dataframe of profiles
Return
Expand All @@ -88,17 +87,15 @@ def load_profiles(profiles):
-------
FileNotFoundError
Raised if the provided profile does not exists
"""
if not isinstance(profiles, pd.DataFrame):
# Check if path exists and load depending on file type
if is_path_a_parquet_file(profiles):
return pd.read_parquet(profiles, engine="pyarrow")

try:
else:
delim = infer_delim(profiles)
profiles = pd.read_csv(profiles, sep=delim)
except FileNotFoundError:
raise FileNotFoundError(f"{profiles} profile file not found")
return pd.read_csv(profiles, sep=delim)

return profiles

Expand Down Expand Up @@ -176,12 +173,9 @@ def load_npz_features(npz_file, fallback_feature_prefix="DP", metadata=True):
# Load metadata
if "metadata" in files:
metadata = npz["metadata"].item()
metadata_df = pd.DataFrame(
metadata, index=range(0, df.shape[0]), dtype=str
)
metadata_df = pd.DataFrame(metadata, index=range(0, df.shape[0]), dtype=str)
metadata_df.columns = [
f"Metadata_{x}" if not x.startswith("Metadata_") else x
for x in metadata_df
f"Metadata_{x}" if not x.startswith("Metadata_") else x for x in metadata_df
]

# Determine the appropriate metadata prefix
Expand All @@ -200,16 +194,12 @@ def load_npz_features(npz_file, fallback_feature_prefix="DP", metadata=True):

# Append metadata with features
if "metadata" in files:
df = metadata_df.merge(
df, how="outer", left_index=True, right_index=True
)
df = metadata_df.merge(df, how="outer", left_index=True, right_index=True)

return df


def load_npz_locations(
npz_file, location_x_col_index=0, location_y_col_index=1
):
def load_npz_locations(npz_file, location_x_col_index=0, location_y_col_index=1):
"""
Load an npz file storing locations and, sometimes, metadata.
Expand Down
54 changes: 33 additions & 21 deletions pycytominer/tests/test_cyto_utils/test_load.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import random
import pytest
import pathlib
import tempfile
import numpy as np
import pandas as pd
Expand All @@ -27,9 +28,7 @@
output_platemap_file_gzip = "{}.gz".format(output_platemap_file)
output_npz_file = os.path.join(tmpdir, "test_npz.npz")
output_npz_with_model_file = os.path.join(tmpdir, "test_npz_withmodel.npz")
output_npz_without_metadata_file = os.path.join(
tmpdir, "test_npz_withoutmetadata.npz"
)
output_npz_without_metadata_file = os.path.join(tmpdir, "test_npz_withoutmetadata.npz")


# Example .npz file with real data
Expand Down Expand Up @@ -88,23 +87,17 @@
# Write to temp files
data_df.to_csv(output_data_file, sep="\t", index=False)
data_df.to_csv(output_data_comma_file, sep=",", index=False)
data_df.to_csv(
output_data_gzip_file, sep="\t", index=False, compression="gzip"
)
data_df.to_csv(output_data_gzip_file, sep="\t", index=False, compression="gzip")
data_df.to_parquet(output_data_parquet, engine="pyarrow")

platemap_df.to_csv(output_platemap_file, sep="\t", index=False)
platemap_df.to_csv(output_platemap_comma_file, sep=",", index=False)
platemap_df.to_csv(
output_platemap_file_gzip, sep="\t", index=False, compression="gzip"
)
platemap_df.to_csv(output_platemap_file_gzip, sep="\t", index=False, compression="gzip")

# Write npz temp files
key_values = {k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()}
npz_metadata_dict.update(npz_model_key)
key_with_model_values = {
k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()
}
key_with_model_values = {k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()}

np.savez_compressed(output_npz_file, features=npz_feats, metadata=key_values)
np.savez_compressed(
Expand Down Expand Up @@ -133,12 +126,12 @@ def test_load_profiles():
profiles_gzip = load_profiles(output_data_gzip_file)
pd.testing.assert_frame_equal(data_df, profiles_gzip)

platemap = load_platemap(output_data_comma_file, add_metadata_id=False)
pd.testing.assert_frame_equal(data_df, profiles)

profiles_from_frame = load_profiles(data_df)
pd.testing.assert_frame_equal(data_df, profiles_from_frame)

profiles_from_parquet = load_profiles(output_data_parquet)
pd.testing.assert_frame_equal(data_df, profiles_from_parquet)


def test_load_platemap():
platemap = load_platemap(output_platemap_file, add_metadata_id=False)
Expand All @@ -150,9 +143,7 @@ def test_load_platemap():
platemap = load_platemap(output_platemap_file_gzip, add_metadata_id=False)
pd.testing.assert_frame_equal(platemap, platemap_df)

platemap_with_annotation = load_platemap(
output_platemap_file, add_metadata_id=True
)
platemap_with_annotation = load_platemap(output_platemap_file, add_metadata_id=True)
platemap_df.columns = [f"Metadata_{x}" for x in platemap_df.columns]
pd.testing.assert_frame_equal(platemap_with_annotation, platemap_df)

Expand Down Expand Up @@ -194,9 +185,7 @@ def test_load_npz():

# Check real data
assert real_data_df.shape == (206, 54)
assert all(
[x in real_data_df.columns for x in core_cols + ["Metadata_Model"]]
)
assert all([x in real_data_df.columns for x in core_cols + ["Metadata_Model"]])
assert len(real_data_df.Metadata_Model.unique()) == 1
assert real_data_df.Metadata_Model.unique()[0] == "cnn"
assert real_data_df.drop(
Expand Down Expand Up @@ -250,3 +239,26 @@ def test_is_path_a_parquet_file():

# checking if the same df is produced from parquet and csv files
pd.testing.assert_frame_equal(parquet_profile_test, csv_profile_test)


def test_load_profiles_file_path_input():
    """
    Confirm that `load_profiles()` accepts equivalent path-like inputs.

    A str, a pathlib.Path, and a pathlib.PurePath pointing at the same
    file should all load identical data; a path that does not exist
    should raise FileNotFoundError.
    """
    # Build the same target file as three different path-like types.
    csv_name = "test_data.csv"
    as_str: str = os.path.join(tmpdir, csv_name)
    as_path: pathlib.Path = pathlib.Path(tmpdir, csv_name)
    as_purepath: pathlib.PurePath = pathlib.PurePath(tmpdir, csv_name)

    loaded_str = load_profiles(as_str)
    loaded_path = load_profiles(as_path)
    loaded_purepath = load_profiles(as_purepath)

    # Every path variant must yield the same dataframe.
    pd.testing.assert_frame_equal(loaded_str, loaded_path)
    pd.testing.assert_frame_equal(loaded_purepath, loaded_path)

    # A non-existing file should surface as FileNotFoundError.
    missing: pathlib.Path = pathlib.Path(tmpdir, "file_not_exist.csv")
    with pytest.raises(FileNotFoundError, match="No such file or directory"):
        load_profiles(missing)

0 comments on commit 182745a

Please sign in to comment.