Closes Bears-R-Us#3421 testing equivalence module

ajpotts · Jul 22, 2024 · 030b0fc
1 parent e14ff6a
commit 030b0fc
Show file tree

Hide file tree

Showing 9 changed files with 1,283 additions and 272 deletions.
diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py
@@ -595,9 +595,9 @@ def test_groupby_standard(self):
         s = ak.DataFrame({"a": ak.Categorical(ak.array(["a", "a", "a", "b"]))}).groupby("a").size()
         pds = pd.Series(
             data=np.array([3, 1]),
-            index=pd.Index(data=np.array(["a", "b"], dtype="<U7"), name="a"),
+            index=pd.Index(data=pd.Categorical(np.array(["a", "b"])), name="a"),
         )
-        assert_series_equal(pds, s.to_pandas())
+        assert_series_equal(pds, s.to_pandas(), check_categorical=False)
 
     def test_gb_series(self):
         df = self.build_ak_df()

diff --git a/PROTO_tests/tests/testing/asserters_test.py b/PROTO_tests/tests/testing/asserters_test.py
diff --git a/arkouda/categorical.py b/arkouda/categorical.py
@@ -16,6 +16,7 @@
 )
 
 import numpy as np
+from pandas import Categorical as pd_Categorical
 from typeguard import typechecked
 
 from arkouda.client import generic_msg
@@ -48,8 +49,8 @@ class Categorical:
 
     Parameters
     ----------
-    values : Strings
-        String values to convert to categories
+    values : Strings, Categorical, pd.Categorical
+        Values to convert to categories
     NAvalue : str scalar
         The value to use to represent missing/null data
 
@@ -107,16 +108,32 @@ def __init__(self, values, **kwargs) -> None:
             self._categories_used = self.categories[unique_codes]
         else:
             # Typical initialization, called with values
-            if not isinstance(values, Strings):
-                raise ValueError(("Categorical: inputs other than " + "Strings not yet supported"))
-            g = GroupBy(values)
-            self.categories = g.unique_keys
-            self.codes = g.broadcast(arange(self.categories.size), permute=True)
-            self.permutation = cast(pdarray, g.permutation)
-            self.segments = g.segments
-            # Make a copy because N/A value must be added below
-            self._categories_used = self.categories[:]
-
+            if isinstance(values, pd_Categorical):
+                self.values = array(values.to_numpy())
+                self.categories = array(values.categories)
+                self.codes = array(values.codes.astype("int64"))
+                self._categories_used = self.categories[unique(self.codes)]
+                self.permutation = None
+                self.segments = None
+            elif isinstance(values, Categorical):
+                self.values = values.values
+                self.categories = values.categories
+                self.codes = values.codes
+                self._categories_used = values._categories_used
+                self.permutation = values.permutation
+                self.segments = values.segments
+            elif isinstance(values, Strings):
+                g = GroupBy(values)
+                self.categories = g.unique_keys
+                self.codes = g.broadcast(arange(self.categories.size), permute=True)
+                self.permutation = cast(pdarray, g.permutation)
+                self.segments = g.segments
+                # Make a copy because N/A value must be added below
+                self._categories_used = self.categories[:]
+            else:
+                raise ValueError(
+                    ("Categorical: inputs other than " + "Strings or pd.Categorical not yet supported")
+                )
         # When read from file or attached, NA code will be passed as a pdarray
         # Otherwise, the NA value is set to a string
         if "_akNAcode" in kwargs and kwargs["_akNAcode"] is not None:
@@ -399,6 +416,14 @@ def to_ndarray(self) -> np.ndarray:
             valcodes = self.codes.to_ndarray()
         return idx[valcodes]
 
+    def to_pandas(self) -> pd_Categorical:
+        """
+        Return a pandas Categorical built from this object's codes and categories.
+        """
+        return pd_Categorical.from_codes(
+            codes=self.codes.to_ndarray(), categories=self.categories.to_ndarray()
+        )
+
     def to_list(self) -> List:
         """
         Convert the Categorical to a list, transferring data from

diff --git a/arkouda/dataframe.py b/arkouda/dataframe.py
@@ -897,12 +897,16 @@ def __init__(self, initialdata=None, index=None, columns=None):
                 self._set_index(index)
             self.data = {}
             for key in initialdata.columns:
-                self.data[key] = (
-                    SegArray.from_multi_array([array(r) for r in initialdata[key]])
-                    if hasattr(initialdata[key], "values")
-                    and isinstance(initialdata[key].values[0], (list, np.ndarray))
-                    else array(initialdata[key])
-                )
+                if hasattr(initialdata[key], "values") and isinstance(
+                    initialdata[key].values[0], (list, np.ndarray)
+                ):
+                    self.data[key] = SegArray.from_multi_array([array(r) for r in initialdata[key]])
+                elif hasattr(initialdata[key], "values") and isinstance(
+                    initialdata[key].values, pd.Categorical
+                ):
+                    self.data[key] = Categorical(initialdata[key].values)
+                else:
+                    self.data[key] = array(initialdata[key])
 
             self.data.update()
             return
@@ -2888,6 +2892,9 @@ def to_pandas(self, datalimit=maxTransferBytes, retain_index=False):
                 nbytes += (val.dtype).itemsize * self._nrows
             elif isinstance(val, Strings):
                 nbytes += val.nbytes
+            elif isinstance(val, Categorical):
+                nbytes += val.codes.nbytes
+                nbytes += val.categories.nbytes
 
         KB = 1024
         MB = KB * KB
@@ -2919,7 +2926,12 @@ def to_pandas(self, datalimit=maxTransferBytes, retain_index=False):
             try:
                 # in order for proper pandas functionality, SegArrays must be seen as 1d
                 # and therefore need to be converted to list
-                pandas_data[key] = val.to_ndarray() if not isinstance(val, SegArray) else val.to_list()
+                if isinstance(val, SegArray):
+                    pandas_data[key] = val.to_list()
+                elif isinstance(val, Categorical):
+                    pandas_data[key] = val.to_pandas()
+                else:
+                    pandas_data[key] = val.to_ndarray()
             except TypeError:
                 raise IndexError("Bad index type or format.")