From a01ab52d21f478735e41a00e001e517c0d79bc01 Mon Sep 17 00:00:00 2001 From: Amanda Potts Date: Thu, 18 Jul 2024 11:54:11 -0400 Subject: [PATCH] Closes #3510-Series.to_pandas to handle categoricals --- PROTO_tests/tests/series_test.py | 56 +++++++++++++++-- arkouda/series.py | 104 +++++++++++++++++++------------ 2 files changed, 114 insertions(+), 46 deletions(-) diff --git a/PROTO_tests/tests/series_test.py b/PROTO_tests/tests/series_test.py index 74f970a2bb..46ebc629a3 100644 --- a/PROTO_tests/tests/series_test.py +++ b/PROTO_tests/tests/series_test.py @@ -1,10 +1,12 @@ import numpy as np import pandas as pd import pytest -from pandas.testing import assert_frame_equal, assert_series_equal +from pandas.testing import assert_frame_equal as pd_assert_frame_equal +from pandas.testing import assert_series_equal as pd_assert_series_equal import arkouda as ak from arkouda.series import Series +from arkouda.testing import assert_series_equal as ak_assert_series_equal DTYPES = [ak.int64, ak.uint64, ak.bool_, ak.float64, ak.bigint, ak.str_] NO_STRING = [ak.int64, ak.uint64, ak.bool_, ak.float64, ak.bigint] @@ -41,6 +43,48 @@ def test_series_creation(self, dtype): with pytest.raises(ValueError): ak.Series(data=ak.arange(3), index=ak.arange(6)) + @pytest.mark.parametrize("size", pytest.prob_size) + def test_series_creation_pandas_series(self, size): + str_vals = ak.random_strings_uniform(9, 10, size) + cat_vals = ak.Categorical(str_vals) + num_vals = ak.arange(size) * -2 + idx = ak.arange(size) * -1 + + pd_str = pd.Series(str_vals.to_ndarray(), idx.to_ndarray()) + pd_cat = pd.Series(cat_vals.to_pandas(), idx.to_ndarray()) + pd_num = pd.Series(num_vals.to_ndarray(), idx.to_ndarray()) + + ak_str = Series(pd_str) + ak_cat = Series(pd_cat) + ak_num = Series(pd_num) + + expected_str = Series(str_vals, index=idx) + expected_cat = Series(cat_vals, index=idx) + expected_num = Series(num_vals, index=idx) + + ak_assert_series_equal(ak_str, expected_str) + ak_assert_series_equal(ak_cat, expected_cat) + ak_assert_series_equal(ak_num, expected_num) + + @pytest.mark.parametrize("size", pytest.prob_size) + def test_to_pandas(self, size): + str_vals = ak.random_strings_uniform(9, 10, size) + cat_vals = ak.Categorical(str_vals) + num_vals = ak.arange(size) * -1 + idx = ak.arange(size) + + ak_str = Series(str_vals, idx) + ak_cat = Series(cat_vals, idx) + ak_num = Series(num_vals, idx) + + pd_str = pd.Series(str_vals.to_ndarray(), idx.to_ndarray()) + pd_cat = pd.Series(cat_vals.to_pandas(), idx.to_ndarray()) + pd_num = pd.Series(num_vals.to_ndarray(), idx.to_ndarray()) + + pd_assert_series_equal(ak_str.to_pandas(), pd_str) + pd_assert_series_equal(ak_cat.to_pandas(), pd_cat) + pd_assert_series_equal(ak_num.to_pandas(), pd_num) + @pytest.mark.parametrize("dtype", INTEGRAL_TYPES) @pytest.mark.parametrize("dtype_index", [ak.int64, ak.uint64]) def test_lookup(self, dtype, dtype_index): @@ -171,7 +215,7 @@ def test_concat(self): "val_1": [0, 0, 0, 0, 0, 5, 6, 7, 8, 9, 10], } ) - assert_frame_equal(ref_df, df.to_pandas()) + pd_assert_frame_equal(ref_df, df.to_pandas()) def list_helper(arr): return arr.to_list() if isinstance(arr, (ak.pdarray, ak.Index)) else arr.tolist() @@ -188,11 +232,11 @@ def list_helper(arr): {"idx": [0, 1, 2, 3, 4], "val_0": [0, 1, 2, 3, 4], "val_1": [5, 6, 7, 8, 9]} ) assert isinstance(df, ak.DataFrame) - assert_frame_equal(ref_df, df.to_pandas()) + pd_assert_frame_equal(ref_df, df.to_pandas()) else: ref_df = pd.DataFrame({0: [0, 1, 2, 3, 4], 1: [5, 6, 7, 8, 9]}) assert isinstance(df, pd.DataFrame) - assert_frame_equal(ref_df, df) + pd_assert_frame_equal(ref_df, df) def test_index_as_index_compat(self): # added to validate functionality for issue #1506 @@ -367,8 +411,8 @@ def test_series_segarray_to_pandas(self): akdf = ak.DataFrame({"test": sa}) pddf = pd.DataFrame({"test": sa.to_list()}) - assert_frame_equal(akdf.to_pandas(), pddf) - assert_series_equal(akdf.to_pandas()["test"], pddf["test"], check_names=False) + pd_assert_frame_equal(akdf.to_pandas(), pddf) + pd_assert_series_equal(akdf.to_pandas()["test"], pddf["test"], check_names=False) def test_getitem_scalars(self): ints = [0, 1, 3, 7, 3] diff --git a/arkouda/series.py b/arkouda/series.py index a3592cd562..c75c7feb1f 100644 --- a/arkouda/series.py +++ b/arkouda/series.py @@ -17,7 +17,6 @@ from arkouda.index import Index, MultiIndex from arkouda.numeric import cast as akcast from arkouda.numeric import isnan, value_counts -from arkouda.segarray import SegArray from arkouda.pdarrayclass import ( RegistrationError, any, @@ -27,8 +26,9 @@ ) from arkouda.pdarraycreation import arange, array, full, zeros from arkouda.pdarraysetops import argsort, concatenate, in1d, indexof1d +from arkouda.segarray import SegArray from arkouda.strings import Strings -from arkouda.util import convert_if_categorical, get_callback, is_float +from arkouda.util import get_callback, is_float # pd.set_option("display.max_colwidth", 65) is being called in DataFrame.py. This will resolve BitVector # truncation issues. If issues arise, that's where to look for it. @@ -133,11 +133,16 @@ class Series: @typechecked def __init__( self, - data: Union[Tuple, List, groupable_element_type, Series, SegArray], + data: Union[Tuple, List, groupable_element_type, Series, SegArray, pd.Series, pd.Categorical], name=None, index: Optional[Union[pdarray, Strings, Tuple, List, Index]] = None, ): + + if isinstance(data, pd.Categorical): + data = Categorical(data) + self.registered_name: Optional[str] = None + if index is None and isinstance(data, (tuple, list)) and len(data) == 2: # handles the previous `ar_tuple` case if not isinstance(data[0], (pdarray, Index, Strings, Categorical, list, tuple)): @@ -146,6 +151,13 @@ def __init__( raise TypeError("values must be a pdarray, Strings, SegArray, or Categorical") self.values = data[1] if not isinstance(data[1], Series) else data[1].values self.index = Index.factory(index) if index else Index.factory(data[0]) + elif isinstance(data, pd.Series): + if isinstance(data.values, pd.Categorical): + self.values = Categorical(data.values) + else: + self.values = array(data.values) + self.index = Index(data.index) + self.name = data.name elif isinstance(data, tuple) and len(data) != 2: raise TypeError("Series initialization requries a tuple of (index, values)") else: @@ -162,7 +174,10 @@ def __init__( raise ValueError( "Index size does not match data size: {} != {}".format(self.index.size, self.values.size) ) - self.name = name + if name is None and isinstance(data, (Series, pd.Series)): + self.name = data.name + else: + self.name = name self.size = self.index.size def __len__(self): @@ -737,16 +752,21 @@ def to_pandas(self) -> pd.Series: import copy idx = self.index.to_pandas() - val = convert_if_categorical(self.values) - # pandas errors when ndarray formatted like a segarray is - # passed into Series but works when it's just a list of lists - vals_on_client = val.to_list() if isinstance(val, SegArray) else val.to_ndarray() + + if isinstance(self.values, Categorical): + val = self.values.to_pandas() + elif isinstance(self.values, SegArray): + # pandas errors when ndarray formatted like a segarray is + # passed into Series but works when it's just a list of lists + val = self.values.to_list() + else: + val = self.values.to_ndarray() if isinstance(self.name, str): name = copy.copy(self.name) - return pd.Series(vals_on_client, index=idx, name=name) + return pd.Series(val, index=idx, name=name) else: - return pd.Series(vals_on_client, index=idx) + return pd.Series(val, index=idx) def to_markdown(self, mode="wt", index=True, tablefmt="grid", storage_options=None, **kwargs): r""" @@ -917,46 +937,50 @@ def register(self, user_defined_name: str): "objType": self.objType, "num_idxs": 1, "idx_names": [ + ( + json.dumps( + { + "codes": self.index.values.codes.name, + "categories": self.index.values.categories.name, + "NA_codes": self.index.values._akNAcode.name, + **( + {"permutation": self.index.values.permutation.name} + if self.index.values.permutation is not None + else {} + ), + **( + {"segments": self.index.values.segments.name} + if self.index.values.segments is not None + else {} + ), + } + ) + if isinstance(self.index.values, Categorical) + else self.index.values.name + ) + ], + "idx_types": [self.index.values.objType], + "values": ( json.dumps( { - "codes": self.index.values.codes.name, - "categories": self.index.values.categories.name, - "NA_codes": self.index.values._akNAcode.name, + "codes": self.values.codes.name, + "categories": self.values.categories.name, + "NA_codes": self.values._akNAcode.name, **( - {"permutation": self.index.values.permutation.name} - if self.index.values.permutation is not None + {"permutation": self.values.permutation.name} + if self.values.permutation is not None else {} ), **( - {"segments": self.index.values.segments.name} - if self.index.values.segments is not None + {"segments": self.values.segments.name} + if self.values.segments is not None else {} ), } ) - if isinstance(self.index.values, Categorical) - else self.index.values.name - ], - "idx_types": [self.index.values.objType], - "values": json.dumps( - { - "codes": self.values.codes.name, - "categories": self.values.categories.name, - "NA_codes": self.values._akNAcode.name, - **( - {"permutation": self.values.permutation.name} - if self.values.permutation is not None - else {} - ), - **( - {"segments": self.values.segments.name} - if self.values.segments is not None - else {} - ), - } - ) - if isinstance(self.values, Categorical) - else self.values.name, + if isinstance(self.values, Categorical) + else self.values.name + ), "val_type": self.values.objType, }, )