Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementing parquet loading in load_profiles function #262

Merged
merged 32 commits into from
Mar 23, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
39a02a2
added new function `infer_profile_file_type`
axiomcura Mar 15, 2023
3fd6f8e
Fixed Unicode Bug
axiomcura Mar 15, 2023
59dbfa3
fixed csv error
axiomcura Mar 16, 2023
fbec069
improved variable names
axiomcura Mar 16, 2023
32b780c
removed unwanted comments
axiomcura Mar 16, 2023
ef9e22f
added extension based inference for parquet
axiomcura Mar 20, 2023
55cc36b
Update pycytominer/cyto_utils/load.py
axiomcura Mar 20, 2023
cf7f1b2
Update pycytominer/tests/test_cyto_utils/test_load.py
axiomcura Mar 20, 2023
5645c23
edited pathlib imports, documentation fixed
axiomcura Mar 20, 2023
3df2715
applied black formatting
axiomcura Mar 20, 2023
c5d3d84
added typing
axiomcura Mar 20, 2023
5350322
updated tests
axiomcura Mar 20, 2023
8625cec
update tests
axiomcura Mar 20, 2023
352c656
testing update
axiomcura Mar 20, 2023
20ae6e7
Update pycytominer/cyto_utils/load.py
axiomcura Mar 21, 2023
a367fae
Update pycytominer/cyto_utils/load.py
axiomcura Mar 21, 2023
ab3dfe4
added black formatting
axiomcura Mar 21, 2023
76ca362
update pathing
axiomcura Mar 21, 2023
5a0240e
fixed docs
axiomcura Mar 21, 2023
b23b0a9
black formatting
axiomcura Mar 21, 2023
3e0e073
tests update
axiomcura Mar 21, 2023
13bf9c7
Update pycytominer/cyto_utils/load.py
axiomcura Mar 21, 2023
e94623e
Update pycytominer/cyto_utils/load.py
axiomcura Mar 21, 2023
1b173f5
Update pycytominer/cyto_utils/load.py
axiomcura Mar 21, 2023
51a9178
test update
axiomcura Mar 21, 2023
a0f0708
Update pycytominer/cyto_utils/load.py
axiomcura Mar 21, 2023
4eac164
fixed typo
axiomcura Mar 21, 2023
2d17c24
added comments
axiomcura Mar 21, 2023
25c9e36
Update pycytominer/cyto_utils/load.py
axiomcura Mar 22, 2023
c647e65
replaced `.absolute()` with `.resolve()`
axiomcura Mar 22, 2023
175db2a
applied black formatting
axiomcura Mar 22, 2023
5f7ecae
removed try and accept block
axiomcura Mar 23, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 52 additions & 7 deletions pycytominer/cyto_utils/load.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,48 @@
import csv
import gzip
from pathlib import Path
axiomcura marked this conversation as resolved.
Show resolved Hide resolved
import numpy as np
import pandas as pd

def is_path_a_parquet_file(file: "str | Path") -> bool:
    """Check whether an existing file path points to a parquet file.

    Identification is done purely by inspecting the file extension
    (case-insensitively); the file contents are never read.

    Parameters
    ----------
    file : str or pathlib.Path
        Path to the file to inspect.

    Returns
    -------
    bool
        True if the final suffix of the path is `.parquet` (in any letter
        case), else False. Only the last suffix is considered, so a compound
        name such as `profiles.parquet.gz` returns False.

    Raises
    ------
    TypeError
        Raised if `file` is neither a str nor a pathlib.Path
    FileNotFoundError
        Raised if the provided path in `file` does not exist
    """
    # type checking
    # -- Path objects are accepted alongside str so callers that already work
    # -- with pathlib are not forced to convert back to str first
    if not isinstance(file, (str, Path)):
        raise TypeError(f"file must be a str or pathlib.Path, not {type(file)}")

    # converting to a Path object (under a new name, so the parameter is not
    # rebound to a different type)
    # -- checking if it exists
    file_path = Path(file)
    if not file_path.exists():
        raise FileNotFoundError(f"{str(file_path.absolute())} does not exists")

    # checking if file path is a parquet file
    return file_path.suffix.lower() == ".parquet"


def infer_delim(file):
"""
Expand Down Expand Up @@ -41,20 +81,25 @@ def load_profiles(profiles):
Return
------
pandas DataFrame of profiles

Raises
------
FileNotFoundError
Raised if the provided
axiomcura marked this conversation as resolved.
Show resolved Hide resolved
"""
if not isinstance(profiles, pd.DataFrame):
try:
delim = infer_delim(profiles)
profiles = pd.read_csv(profiles, sep=delim)
except FileNotFoundError:
axiomcura marked this conversation as resolved.
Show resolved Hide resolved
raise FileNotFoundError(f"{profiles} profile file not found")

if is_path_a_parquet_file(profiles):
return pd.read_parquet(profiles, engine="pyarrow")

delim = infer_delim(profiles)
profiles = pd.read_csv(profiles, sep=delim)

return profiles


def load_platemap(platemap, add_metadata_id=True):
"""
Unless a dataframe is provided, load the given platemap dataframe from path or string
axiomcura marked this conversation as resolved.
Show resolved Hide resolved

Parameters
----------
platemap : pandas dataframe
Expand Down
22 changes: 21 additions & 1 deletion pycytominer/tests/test_cyto_utils/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
load_npz_features,
load_npz_locations,
)
from pycytominer.cyto_utils.load import infer_delim
from pycytominer.cyto_utils.load import (infer_delim,
axiomcura marked this conversation as resolved.
Show resolved Hide resolved
axiomcura marked this conversation as resolved.
Show resolved Hide resolved
is_path_a_parquet_file)

random.seed(123)

Expand Down Expand Up @@ -196,3 +197,22 @@ def test_load_npz():
load_npz_locations(
example_npz_file_locations, location_x_col_index=0, location_y_col_index=2
)

def test_infer_plate_files():
    """Check parquet detection by extension and parquet loading via load_profiles."""
    # local import keeps this test independent of the module's import block
    from pathlib import Path

    # file paths
    # -- anchored on this test module's location instead of the current
    # -- working directory, so the test passes wherever pytest is invoked from
    data_dir = (
        Path(__file__).resolve().parent.parent
        / "test_data"
        / "cytominer_database_example_data"
    )
    csv_file = str(data_dir / "test_SQ00014613.csv.gz")
    parquet_file = str(data_dir / "test_SQ00014613.parquet")

    # checking parquet file
    check_pass = is_path_a_parquet_file(parquet_file)
    check_fail = is_path_a_parquet_file(csv_file)

    # checking if the correct booleans are returned
    # -- `assert(value, True)` asserts a non-empty tuple, which is always
    # -- truthy, so the booleans are asserted directly instead
    assert check_pass is True
    assert check_fail is False

    # loading in pandas dataframe from parquet file
    # -- load_profiles must return the same frame as reading the file directly
    parquet_df = pd.read_parquet(parquet_file)
    parquet_profile_test = load_profiles(parquet_file)
    pd.testing.assert_frame_equal(parquet_profile_test, parquet_df)