Skip to content

Commit

Permalink
Closes Bears-R-Us#3421 testing equivalence module
Browse files Browse the repository at this point in the history
  • Loading branch information
ajpotts committed Jul 22, 2024
1 parent e14ff6a commit 030b0fc
Show file tree
Hide file tree
Showing 9 changed files with 1,283 additions and 272 deletions.
4 changes: 2 additions & 2 deletions PROTO_tests/tests/dataframe_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,9 +595,9 @@ def test_groupby_standard(self):
s = ak.DataFrame({"a": ak.Categorical(ak.array(["a", "a", "a", "b"]))}).groupby("a").size()
pds = pd.Series(
data=np.array([3, 1]),
index=pd.Index(data=np.array(["a", "b"], dtype="<U7"), name="a"),
index=pd.Index(data=pd.Categorical(np.array(["a", "b"])), name="a"),
)
assert_series_equal(pds, s.to_pandas())
assert_series_equal(pds, s.to_pandas(), check_categorical=False)

def test_gb_series(self):
df = self.build_ak_df()
Expand Down
642 changes: 528 additions & 114 deletions PROTO_tests/tests/testing/asserters_test.py

Large diffs are not rendered by default.

49 changes: 37 additions & 12 deletions arkouda/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
)

import numpy as np
from pandas import Categorical as pd_Categorical
from typeguard import typechecked

from arkouda.client import generic_msg
Expand Down Expand Up @@ -48,8 +49,8 @@ class Categorical:
Parameters
----------
values : Strings
String values to convert to categories
values : Strings, Categorical, pd.Categorical
Values to convert to categories
NAvalue : str scalar
The value to use to represent missing/null data
Expand Down Expand Up @@ -107,16 +108,32 @@ def __init__(self, values, **kwargs) -> None:
self._categories_used = self.categories[unique_codes]
else:
# Typical initialization, called with values
if not isinstance(values, Strings):
raise ValueError(("Categorical: inputs other than " + "Strings not yet supported"))
g = GroupBy(values)
self.categories = g.unique_keys
self.codes = g.broadcast(arange(self.categories.size), permute=True)
self.permutation = cast(pdarray, g.permutation)
self.segments = g.segments
# Make a copy because N/A value must be added below
self._categories_used = self.categories[:]

if isinstance(values, pd_Categorical):
self.values = array(values.to_numpy())
self.categories = array(values.categories)
self.codes = array(values.codes.astype("int64"))
self._categories_used = self.categories[unique(self.codes)]
self.permutation = None
self.segments = None
elif isinstance(values, Categorical):
self.values = values.values
self.categories = values.categories
self.codes = values.codes
self._categories_used = values._categories_used
self.permutation = values.permutation
self.segments = values.segments
elif isinstance(values, Strings):
g = GroupBy(values)
self.categories = g.unique_keys
self.codes = g.broadcast(arange(self.categories.size), permute=True)
self.permutation = cast(pdarray, g.permutation)
self.segments = g.segments
# Make a copy because N/A value must be added below
self._categories_used = self.categories[:]
else:
raise ValueError(
("Categorical: inputs other than " + "Strings or pd.Categorical not yet supported")
)
# When read from file or attached, NA code will be passed as a pdarray
# Otherwise, the NA value is set to a string
if "_akNAcode" in kwargs and kwargs["_akNAcode"] is not None:
Expand Down Expand Up @@ -399,6 +416,14 @@ def to_ndarray(self) -> np.ndarray:
valcodes = self.codes.to_ndarray()
return idx[valcodes]

def to_pandas(self) -> pd_Categorical:
    """
    Build the pandas Categorical that mirrors this Categorical.

    Returns
    -------
    pd_Categorical
        A pandas Categorical constructed via ``from_codes`` from the
        ndarray forms of ``self.codes`` and ``self.categories``.
    """
    # Materialize both components as ndarrays first (codes, then
    # categories), then let pandas pair each code with its category.
    code_values = self.codes.to_ndarray()
    category_values = self.categories.to_ndarray()
    return pd_Categorical.from_codes(codes=code_values, categories=category_values)

def to_list(self) -> List:
"""
Convert the Categorical to a list, transferring data from
Expand Down
26 changes: 19 additions & 7 deletions arkouda/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -897,12 +897,16 @@ def __init__(self, initialdata=None, index=None, columns=None):
self._set_index(index)
self.data = {}
for key in initialdata.columns:
self.data[key] = (
SegArray.from_multi_array([array(r) for r in initialdata[key]])
if hasattr(initialdata[key], "values")
and isinstance(initialdata[key].values[0], (list, np.ndarray))
else array(initialdata[key])
)
if hasattr(initialdata[key], "values") and isinstance(
initialdata[key].values[0], (list, np.ndarray)
):
self.data[key] = SegArray.from_multi_array([array(r) for r in initialdata[key]])
elif hasattr(initialdata[key], "values") and isinstance(
initialdata[key].values, pd.Categorical
):
self.data[key] = Categorical(initialdata[key].values)
else:
self.data[key] = array(initialdata[key])

self.data.update()
return
Expand Down Expand Up @@ -2888,6 +2892,9 @@ def to_pandas(self, datalimit=maxTransferBytes, retain_index=False):
nbytes += (val.dtype).itemsize * self._nrows
elif isinstance(val, Strings):
nbytes += val.nbytes
elif isinstance(val, Categorical):
nbytes += val.codes.nbytes
nbytes += val.categories.nbytes

KB = 1024
MB = KB * KB
Expand Down Expand Up @@ -2919,7 +2926,12 @@ def to_pandas(self, datalimit=maxTransferBytes, retain_index=False):
try:
# in order for proper pandas functionality, SegArrays must be seen as 1d
# and therefore need to be converted to list
pandas_data[key] = val.to_ndarray() if not isinstance(val, SegArray) else val.to_list()
if isinstance(val, SegArray):
pandas_data[key] = val.to_list()
elif isinstance(val, Categorical):
pandas_data[key] = val.to_pandas()
else:
pandas_data[key] = val.to_ndarray()
except TypeError:
raise IndexError("Bad index type or format.")

Expand Down
Loading

0 comments on commit 030b0fc

Please sign in to comment.