Closes Bears-R-Us#3510: Series.to_pandas to handle categoricals
ajpotts committed Jul 26, 2024
1 parent dd0d1f9 commit 716cfeb
Showing 2 changed files with 101 additions and 46 deletions.
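Before the file-by-file diff, a minimal sketch of the round trip this change targets: exporting a Categorical-backed `ak.Series` with `to_pandas` and building an `ak.Series` from a `pd.Series` (including categorical dtype). It assumes a locally running `arkouda_server` reachable with default connection settings; the data values below are illustrative, not taken from the commit.

```python
import pandas as pd

import arkouda as ak

ak.connect()  # assumes arkouda_server is running locally on the default port

cat = ak.Categorical(ak.array(["low", "high", "low", "mid"]))
idx = ak.arange(4)

# A Categorical-backed ak.Series can be exported to pandas ...
ak_ser = ak.Series(cat, index=idx)
pd_ser = ak_ser.to_pandas()  # pd.Series holding a pandas Categorical

# ... and a pandas Series (categorical or otherwise) can seed an ak.Series.
back = ak.Series(pd.Series(pd.Categorical(["low", "high", "low", "mid"])))

ak.disconnect()
```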
43 changes: 37 additions & 6 deletions PROTO_tests/tests/series_test.py
@@ -1,10 +1,12 @@
import numpy as np
import pandas as pd
import pytest
from pandas.testing import assert_frame_equal, assert_series_equal
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from pandas.testing import assert_series_equal as pd_assert_series_equal

import arkouda as ak
from arkouda.series import Series
from arkouda.testing import assert_series_equal as ak_assert_series_equal

DTYPES = [ak.int64, ak.uint64, ak.bool_, ak.float64, ak.bigint, ak.str_]
NO_STRING = [ak.int64, ak.uint64, ak.bool_, ak.float64, ak.bigint]
@@ -41,6 +43,35 @@ def test_series_creation(self, dtype):
with pytest.raises(ValueError):
ak.Series(data=ak.arange(3), index=ak.arange(6))

@pytest.mark.parametrize("size", pytest.prob_size)
def test_series_creation_pandas_series(self, size):
str_vals = ak.random_strings_uniform(9, 10, size)
idx = ak.arange(size) * -1

vals = [str_vals, ak.Categorical(str_vals), ak.arange(size) * -2]
for val in vals:
if isinstance(val, ak.Categorical):
pd_ser = pd.Series(val.to_pandas(), idx.to_ndarray())
else:
pd_ser = pd.Series(val.to_ndarray(), idx.to_ndarray())
ak_ser = Series(pd_ser)
expected = Series(val, index=idx)
ak_assert_series_equal(ak_ser, expected)

@pytest.mark.parametrize("size", pytest.prob_size)
def test_to_pandas(self, size):
str_vals = ak.random_strings_uniform(9, 10, size)
idx = ak.arange(size)

vals = [str_vals, ak.Categorical(str_vals), ak.arange(size) * -2]
for val in vals:
ak_ser = Series(val, idx)
if isinstance(val, ak.Categorical):
pd_ser = pd.Series(val.to_pandas(), idx.to_ndarray())
else:
pd_ser = pd.Series(val.to_ndarray(), idx.to_ndarray())
pd_assert_series_equal(ak_ser.to_pandas(), pd_ser)

@pytest.mark.parametrize("dtype", INTEGRAL_TYPES)
@pytest.mark.parametrize("dtype_index", [ak.int64, ak.uint64])
def test_lookup(self, dtype, dtype_index):
@@ -171,7 +202,7 @@ def test_concat(self):
"val_1": [0, 0, 0, 0, 0, 5, 6, 7, 8, 9, 10],
}
)
assert_frame_equal(ref_df, df.to_pandas())
pd_assert_frame_equal(ref_df, df.to_pandas())

def list_helper(arr):
return arr.to_list() if isinstance(arr, (ak.pdarray, ak.Index)) else arr.tolist()
@@ -188,11 +219,11 @@ def list_helper(arr):
{"idx": [0, 1, 2, 3, 4], "val_0": [0, 1, 2, 3, 4], "val_1": [5, 6, 7, 8, 9]}
)
assert isinstance(df, ak.DataFrame)
assert_frame_equal(ref_df, df.to_pandas())
pd_assert_frame_equal(ref_df, df.to_pandas())
else:
ref_df = pd.DataFrame({0: [0, 1, 2, 3, 4], 1: [5, 6, 7, 8, 9]})
assert isinstance(df, pd.DataFrame)
assert_frame_equal(ref_df, df)
pd_assert_frame_equal(ref_df, df)

def test_index_as_index_compat(self):
# added to validate functionality for issue #1506
@@ -367,8 +398,8 @@ def test_series_segarray_to_pandas(self):
akdf = ak.DataFrame({"test": sa})
pddf = pd.DataFrame({"test": sa.to_list()})

assert_frame_equal(akdf.to_pandas(), pddf)
assert_series_equal(akdf.to_pandas()["test"], pddf["test"], check_names=False)
pd_assert_frame_equal(akdf.to_pandas(), pddf)
pd_assert_series_equal(akdf.to_pandas()["test"], pddf["test"], check_names=False)

def test_getitem_scalars(self):
ints = [0, 1, 3, 7, 3]
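A standalone sketch of the comparison pattern the new tests use, pulled out of the test class for readability. It assumes a running Arkouda server; `size` is an arbitrary illustrative value (the tests use `pytest.prob_size`). `ak_assert_series_equal` compares two Arkouda Series, while `pd_assert_series_equal` checks the exported pandas objects.

```python
import pandas as pd
from pandas.testing import assert_series_equal as pd_assert_series_equal

import arkouda as ak
from arkouda.series import Series
from arkouda.testing import assert_series_equal as ak_assert_series_equal

ak.connect()
size = 100  # illustrative

str_vals = ak.random_strings_uniform(9, 10, size)
idx = ak.arange(size)
cat = ak.Categorical(str_vals)

# Build a pandas Series from the Categorical, construct an ak.Series from it,
# and compare against a Series built directly from Arkouda objects.
pd_ser = pd.Series(cat.to_pandas(), idx.to_ndarray())
ak_assert_series_equal(Series(pd_ser), Series(cat, index=idx))

# Export to pandas and compare on the pandas side.
pd_assert_series_equal(Series(cat, index=idx).to_pandas(), pd_ser)

ak.disconnect()
```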
104 changes: 64 additions & 40 deletions arkouda/series.py
@@ -17,7 +17,6 @@
from arkouda.index import Index, MultiIndex
from arkouda.numeric import cast as akcast
from arkouda.numeric import isnan, value_counts
from arkouda.segarray import SegArray
from arkouda.pdarrayclass import (
RegistrationError,
any,
@@ -27,8 +26,9 @@
)
from arkouda.pdarraycreation import arange, array, full, zeros
from arkouda.pdarraysetops import argsort, concatenate, in1d, indexof1d
from arkouda.segarray import SegArray
from arkouda.strings import Strings
from arkouda.util import convert_if_categorical, get_callback, is_float
from arkouda.util import get_callback, is_float

# pd.set_option("display.max_colwidth", 65) is being called in DataFrame.py. This will resolve BitVector
# truncation issues. If issues arise, that's where to look for it.
@@ -133,11 +133,16 @@ class Series:
@typechecked
def __init__(
self,
data: Union[Tuple, List, groupable_element_type, Series, SegArray],
data: Union[Tuple, List, groupable_element_type, Series, SegArray, pd.Series, pd.Categorical],
name=None,
index: Optional[Union[pdarray, Strings, Tuple, List, Index]] = None,
):

if isinstance(data, pd.Categorical):
data = Categorical(data)

self.registered_name: Optional[str] = None

if index is None and isinstance(data, (tuple, list)) and len(data) == 2:
# handles the previous `ar_tuple` case
if not isinstance(data[0], (pdarray, Index, Strings, Categorical, list, tuple)):
@@ -146,6 +151,13 @@ def __init__(
raise TypeError("values must be a pdarray, Strings, SegArray, or Categorical")
self.values = data[1] if not isinstance(data[1], Series) else data[1].values
self.index = Index.factory(index) if index else Index.factory(data[0])
elif isinstance(data, pd.Series):
if isinstance(data.values, pd.Categorical):
self.values = Categorical(data.values)
else:
self.values = array(data.values)
self.index = Index(data.index)
self.name = data.name
elif isinstance(data, tuple) and len(data) != 2:
raise TypeError("Series initialization requires a tuple of (index, values)")
else:
@@ -162,7 +174,10 @@ def __init__(
raise ValueError(
"Index size does not match data size: {} != {}".format(self.index.size, self.values.size)
)
self.name = name
if name is None and isinstance(data, (Series, pd.Series)):
self.name = data.name
else:
self.name = name
self.size = self.index.size

def __len__(self):
@@ -737,16 +752,21 @@ def to_pandas(self) -> pd.Series:
import copy

idx = self.index.to_pandas()
val = convert_if_categorical(self.values)
# pandas errors when ndarray formatted like a segarray is
# passed into Series but works when it's just a list of lists
vals_on_client = val.to_list() if isinstance(val, SegArray) else val.to_ndarray()

if isinstance(self.values, Categorical):
val = self.values.to_pandas()
elif isinstance(self.values, SegArray):
# pandas errors when ndarray formatted like a segarray is
# passed into Series but works when it's just a list of lists
val = self.values.to_list()
else:
val = self.values.to_ndarray()

if isinstance(self.name, str):
name = copy.copy(self.name)
return pd.Series(vals_on_client, index=idx, name=name)
return pd.Series(val, index=idx, name=name)
else:
return pd.Series(vals_on_client, index=idx)
return pd.Series(val, index=idx)

def to_markdown(self, mode="wt", index=True, tablefmt="grid", storage_options=None, **kwargs):
r"""
@@ -917,46 +937,50 @@ def register(self, user_defined_name: str):
"objType": self.objType,
"num_idxs": 1,
"idx_names": [
(
json.dumps(
{
"codes": self.index.values.codes.name,
"categories": self.index.values.categories.name,
"NA_codes": self.index.values._akNAcode.name,
**(
{"permutation": self.index.values.permutation.name}
if self.index.values.permutation is not None
else {}
),
**(
{"segments": self.index.values.segments.name}
if self.index.values.segments is not None
else {}
),
}
)
if isinstance(self.index.values, Categorical)
else self.index.values.name
)
],
"idx_types": [self.index.values.objType],
"values": (
json.dumps(
{
"codes": self.index.values.codes.name,
"categories": self.index.values.categories.name,
"NA_codes": self.index.values._akNAcode.name,
"codes": self.values.codes.name,
"categories": self.values.categories.name,
"NA_codes": self.values._akNAcode.name,
**(
{"permutation": self.index.values.permutation.name}
if self.index.values.permutation is not None
{"permutation": self.values.permutation.name}
if self.values.permutation is not None
else {}
),
**(
{"segments": self.index.values.segments.name}
if self.index.values.segments is not None
{"segments": self.values.segments.name}
if self.values.segments is not None
else {}
),
}
)
if isinstance(self.index.values, Categorical)
else self.index.values.name
],
"idx_types": [self.index.values.objType],
"values": json.dumps(
{
"codes": self.values.codes.name,
"categories": self.values.categories.name,
"NA_codes": self.values._akNAcode.name,
**(
{"permutation": self.values.permutation.name}
if self.values.permutation is not None
else {}
),
**(
{"segments": self.values.segments.name}
if self.values.segments is not None
else {}
),
}
)
if isinstance(self.values, Categorical)
else self.values.name,
if isinstance(self.values, Categorical)
else self.values.name
),
"val_type": self.values.objType,
},
)
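For reference, an illustrative sketch (not taken from the repository) of the kind of `values` payload `register` assembles when the Series holds a Categorical: the names of the component arrays are bundled into one JSON document, with `permutation` and `segments` included only when they exist. The server-side names below are hypothetical placeholders.

```python
import json

# Hypothetical component names; real names are generated by the Arkouda server.
categorical_parts = {
    "codes": "id_3_codes",
    "categories": "id_3_categories",
    "NA_codes": "id_3_nacode",
}
permutation = "id_3_permutation"  # may be None for some Categoricals
segments = "id_3_segments"        # may be None for some Categoricals

values_payload = json.dumps(
    {
        **categorical_parts,
        **({"permutation": permutation} if permutation is not None else {}),
        **({"segments": segments} if segments is not None else {}),
    }
)
print(values_payload)
```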
