Skip to content

Commit

Permalink
load_profiles should work with PurePath types (#285)
Browse files Browse the repository at this point in the history
* add test that breaks upon purepath

* add back pytest

* switch type check to purepath to capture both path and purepath instance types

* Update pycytominer/tests/test_cyto_utils/test_load.py

Co-authored-by: Dave Bunten <[email protected]>

* add docstring

* fix type hint

* move file path resolve to test_parquet function

this change required propagation to other files

* check no file found

adding various other improvements as well, including testing load_profiles() with parquet files and other software gardening

* remove extra space

* add clarifying comments and update docstring

---------

Co-authored-by: Dave Bunten <[email protected]>
  • Loading branch information
gwaybio and d33bs authored Jun 5, 2023
1 parent 470fc56 commit 182745a
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 46 deletions.
40 changes: 15 additions & 25 deletions pycytominer/cyto_utils/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@
from typing import Union


def is_path_a_parquet_file(file: Union[str, pathlib.Path]) -> bool:
def is_path_a_parquet_file(file: Union[str, pathlib.PurePath]) -> bool:
"""Checks if the provided file path is a parquet file.
Identify parquet files by inspecting the file extensions.
If the file does not end with `parquet`, this will return False, else True.
Parameters
----------
file : Union[str, pathlib.Path]
file : Union[str, pathlib.PurePath]
path to parquet file
Returns
Expand All @@ -30,14 +30,13 @@ def is_path_a_parquet_file(file: Union[str, pathlib.Path]) -> bool:
FileNotFoundError
Raised if the provided path in the `file` does not exist
"""
# type checking
if not isinstance(file, (str, pathlib.Path)):
raise TypeError(f"file must be a str or pathlib.Path not {type(file)}")

# Convert str to pathlib.Path object and absolute path
# check if the file also exists while converting to absolute path
if isinstance(file, str):
file = pathlib.PurePath(file)
try:
# strict=true tests if path exists
file = pathlib.Path(file).resolve(strict=True)
except FileNotFoundError as e:
print("load_profiles() didn't find the path.", e, sep="\n")

# Check if file path is a parquet file
if file.suffix.lower() == ".parquet":
Expand Down Expand Up @@ -77,7 +76,7 @@ def load_profiles(profiles):
Parameters
----------
profiles : {str, pandas.DataFrame}
profiles : {str, pathlib.Path, pandas.DataFrame}
file location or actual pandas dataframe of profiles
Return
Expand All @@ -88,17 +87,15 @@ def load_profiles(profiles):
-------
FileNotFoundError
Raised if the provided profile does not exists
"""
if not isinstance(profiles, pd.DataFrame):
# Check if path exists and load depending on file type
if is_path_a_parquet_file(profiles):
return pd.read_parquet(profiles, engine="pyarrow")

try:
else:
delim = infer_delim(profiles)
profiles = pd.read_csv(profiles, sep=delim)
except FileNotFoundError:
raise FileNotFoundError(f"{profiles} profile file not found")
return pd.read_csv(profiles, sep=delim)

return profiles

Expand Down Expand Up @@ -176,12 +173,9 @@ def load_npz_features(npz_file, fallback_feature_prefix="DP", metadata=True):
# Load metadata
if "metadata" in files:
metadata = npz["metadata"].item()
metadata_df = pd.DataFrame(
metadata, index=range(0, df.shape[0]), dtype=str
)
metadata_df = pd.DataFrame(metadata, index=range(0, df.shape[0]), dtype=str)
metadata_df.columns = [
f"Metadata_{x}" if not x.startswith("Metadata_") else x
for x in metadata_df
f"Metadata_{x}" if not x.startswith("Metadata_") else x for x in metadata_df
]

# Determine the appropriate metadata prefix
Expand All @@ -200,16 +194,12 @@ def load_npz_features(npz_file, fallback_feature_prefix="DP", metadata=True):

# Append metadata with features
if "metadata" in files:
df = metadata_df.merge(
df, how="outer", left_index=True, right_index=True
)
df = metadata_df.merge(df, how="outer", left_index=True, right_index=True)

return df


def load_npz_locations(
npz_file, location_x_col_index=0, location_y_col_index=1
):
def load_npz_locations(npz_file, location_x_col_index=0, location_y_col_index=1):
"""
Load an npz file storing locations and, sometimes, metadata.
Expand Down
54 changes: 33 additions & 21 deletions pycytominer/tests/test_cyto_utils/test_load.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import random
import pytest
import pathlib
import tempfile
import numpy as np
import pandas as pd
Expand All @@ -27,9 +28,7 @@
output_platemap_file_gzip = "{}.gz".format(output_platemap_file)
output_npz_file = os.path.join(tmpdir, "test_npz.npz")
output_npz_with_model_file = os.path.join(tmpdir, "test_npz_withmodel.npz")
output_npz_without_metadata_file = os.path.join(
tmpdir, "test_npz_withoutmetadata.npz"
)
output_npz_without_metadata_file = os.path.join(tmpdir, "test_npz_withoutmetadata.npz")


# Example .npz file with real data
Expand Down Expand Up @@ -88,23 +87,17 @@
# Write to temp files
data_df.to_csv(output_data_file, sep="\t", index=False)
data_df.to_csv(output_data_comma_file, sep=",", index=False)
data_df.to_csv(
output_data_gzip_file, sep="\t", index=False, compression="gzip"
)
data_df.to_csv(output_data_gzip_file, sep="\t", index=False, compression="gzip")
data_df.to_parquet(output_data_parquet, engine="pyarrow")

platemap_df.to_csv(output_platemap_file, sep="\t", index=False)
platemap_df.to_csv(output_platemap_comma_file, sep=",", index=False)
platemap_df.to_csv(
output_platemap_file_gzip, sep="\t", index=False, compression="gzip"
)
platemap_df.to_csv(output_platemap_file_gzip, sep="\t", index=False, compression="gzip")

# Write npz temp files
key_values = {k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()}
npz_metadata_dict.update(npz_model_key)
key_with_model_values = {
k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()
}
key_with_model_values = {k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()}

np.savez_compressed(output_npz_file, features=npz_feats, metadata=key_values)
np.savez_compressed(
Expand Down Expand Up @@ -133,12 +126,12 @@ def test_load_profiles():
profiles_gzip = load_profiles(output_data_gzip_file)
pd.testing.assert_frame_equal(data_df, profiles_gzip)

platemap = load_platemap(output_data_comma_file, add_metadata_id=False)
pd.testing.assert_frame_equal(data_df, profiles)

profiles_from_frame = load_profiles(data_df)
pd.testing.assert_frame_equal(data_df, profiles_from_frame)

profiles_from_parquet = load_profiles(output_data_parquet)
pd.testing.assert_frame_equal(data_df, profiles_from_parquet)


def test_load_platemap():
platemap = load_platemap(output_platemap_file, add_metadata_id=False)
Expand All @@ -150,9 +143,7 @@ def test_load_platemap():
platemap = load_platemap(output_platemap_file_gzip, add_metadata_id=False)
pd.testing.assert_frame_equal(platemap, platemap_df)

platemap_with_annotation = load_platemap(
output_platemap_file, add_metadata_id=True
)
platemap_with_annotation = load_platemap(output_platemap_file, add_metadata_id=True)
platemap_df.columns = [f"Metadata_{x}" for x in platemap_df.columns]
pd.testing.assert_frame_equal(platemap_with_annotation, platemap_df)

Expand Down Expand Up @@ -194,9 +185,7 @@ def test_load_npz():

# Check real data
assert real_data_df.shape == (206, 54)
assert all(
[x in real_data_df.columns for x in core_cols + ["Metadata_Model"]]
)
assert all([x in real_data_df.columns for x in core_cols + ["Metadata_Model"]])
assert len(real_data_df.Metadata_Model.unique()) == 1
assert real_data_df.Metadata_Model.unique()[0] == "cnn"
assert real_data_df.drop(
Expand Down Expand Up @@ -250,3 +239,26 @@ def test_is_path_a_parquet_file():

# checking if the same df is produced from parquet and csv files
pd.testing.assert_frame_equal(parquet_profile_test, csv_profile_test)


def test_load_profiles_file_path_input():
    """
    Confirm that `load_profiles()` accepts equivalent path-like inputs.

    A str, a pathlib.Path, and a pathlib.PurePath pointing at the same
    file should all load identical data; a path that does not exist
    should raise FileNotFoundError.
    """
    # Build the same target file as three different path-like types.
    csv_name = "test_data.csv"
    as_str: str = os.path.join(tmpdir, csv_name)
    as_path: pathlib.Path = pathlib.Path(tmpdir, csv_name)
    as_purepath: pathlib.PurePath = pathlib.PurePath(tmpdir, csv_name)

    loaded_str = load_profiles(as_str)
    loaded_path = load_profiles(as_path)
    loaded_purepath = load_profiles(as_purepath)

    # Every path variant must yield the same dataframe.
    pd.testing.assert_frame_equal(loaded_str, loaded_path)
    pd.testing.assert_frame_equal(loaded_purepath, loaded_path)

    # A non-existing file should surface as FileNotFoundError.
    missing: pathlib.Path = pathlib.Path(tmpdir, "file_not_exist.csv")
    with pytest.raises(FileNotFoundError, match="No such file or directory"):
        load_profiles(missing)

0 comments on commit 182745a

Please sign in to comment.