diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py index c2bf81dd8a..aba2c2afd9 100644 --- a/PROTO_tests/tests/dataframe_test.py +++ b/PROTO_tests/tests/dataframe_test.py @@ -595,9 +595,9 @@ def test_groupby_standard(self): s = ak.DataFrame({"a": ak.Categorical(ak.array(["a", "a", "a", "b"]))}).groupby("a").size() pds = pd.Series( data=np.array([3, 1]), - index=pd.Index(data=np.array(["a", "b"], dtype=" DataFrame: index=Index(ak.arange(6, dtype=index_dtype), name=index_name), ) - def test_assert_almost_equal(self): - size = 10 + @staticmethod + def perturb(a: ak.pdarray, atol: float, rtol: float, rng=None): + if rng is None: + rng = ak.random.default_rng() + return a + rtol * a + atol * rng.random() + + @staticmethod + def convert(obj, as_arkouda: bool): + if not isinstance( + obj, + ( + ak.DataFrame, + ak.Series, + ak.Index, + ak.MultiIndex, + ak.pdarray, + ak.Categorical, + ak.Strings, + ak.SegArray, + ), + ): + raise TypeError("obj must be an arkouda object.") + + if as_arkouda: + return obj + elif isinstance(obj, (ak.pdarray, ak.Strings)): + return obj.to_ndarray() + elif isinstance(obj, (ak.DataFrame)): + return obj.to_pandas(retain_index=True) + elif isinstance(obj, (ak.Series, ak.Index, ak.SegArray, ak.Categorical)): + return obj.to_pandas() + return None + + def get_converter(self, as_arkouda: bool): + def converter(obj): + return self.convert(obj, as_arkouda) + + return converter + + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_almost_equal(self, size, left_as_arkouda, right_as_arkouda): + + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) rng = ak.random.default_rng() atol = 0.001 rtol = 0.001 a = ak.arange(size, dtype="float64") - a2 = a + rtol * a + atol * rng.random() 
+ a2 = self.perturb(a, atol=atol, rtol=rtol, rng=rng) a3 = a + rtol + atol - assert_almost_equal(a, a2, atol=atol, rtol=rtol) + if both_ak: + assert_almost_equal(a, a2, atol=atol, rtol=rtol) + assert_almost_equivalent(convert_left(a), convert_right(a2), atol=atol, rtol=rtol) with pytest.raises(AssertionError): - assert_almost_equal(a, a3, atol=atol, rtol=rtol) + if both_ak: + assert_almost_equal(a, a3, atol=atol, rtol=rtol) + assert_almost_equivalent(convert_left(a), convert_right(a3), atol=atol, rtol=rtol) idx = Index(a) idx2 = Index(a2) idx3 = Index(a3) - assert_almost_equal(idx, idx2, atol=atol, rtol=rtol) + if both_ak: + assert_almost_equal(idx, idx2, atol=atol, rtol=rtol) + assert_almost_equivalent(convert_left(idx), convert_right(idx2), atol=atol, rtol=rtol) with pytest.raises(AssertionError): - assert_almost_equal(idx, idx3, atol=atol, rtol=rtol) + if both_ak: + assert_almost_equal(idx, idx3, atol=atol, rtol=rtol) + assert_almost_equivalent(convert_left(idx), convert_right(idx3), atol=atol, rtol=rtol) s = Series(a) s2 = Series(a2) s3 = Series(a3) - assert_almost_equal(s, s2, atol=atol, rtol=rtol) + if both_ak: + assert_almost_equal(s, s2, atol=atol, rtol=rtol) + assert_almost_equivalent(convert_left(s), convert_right(s2), atol=atol, rtol=rtol) with pytest.raises(AssertionError): - assert_almost_equal(s, s3, atol=atol, rtol=rtol) + if both_ak: + assert_almost_equal(s, s3, atol=atol, rtol=rtol) + assert_almost_equivalent(convert_left(s), convert_right(s3), atol=atol, rtol=rtol) df = DataFrame({"col1": a}, index=idx) df2 = DataFrame({"col1": a2}, index=idx2) df3 = DataFrame({"col1": a3}, index=idx3) - assert_almost_equal(df, df2, atol=atol, rtol=rtol) + if both_ak: + assert_almost_equal(df, df2, atol=atol, rtol=rtol) + assert_almost_equivalent(convert_left(df), convert_right(df2), atol=atol, rtol=rtol) with pytest.raises(AssertionError): - assert_almost_equal(df, df3, atol=atol, rtol=rtol) + if both_ak: + assert_almost_equal(df, df3, atol=atol, rtol=rtol) + 
assert_almost_equivalent(convert_left(df), convert_right(df3), atol=atol, rtol=rtol) + + def test_assert_almost_equal_scalars(self): + atol = 0.001 + rtol = 0.001 assert_almost_equal(True, True, atol=atol, rtol=rtol) + assert_almost_equivalent(True, True, atol=atol, rtol=rtol) with pytest.raises(AssertionError): assert_almost_equal(True, False, atol=atol, rtol=rtol) + assert_almost_equivalent(True, False, atol=atol, rtol=rtol) assert_almost_equal(1.0, 1.0, atol=atol, rtol=rtol) + assert_almost_equivalent(1.0, 1.0, atol=atol, rtol=rtol) with pytest.raises(AssertionError): assert_almost_equal(1.0, 1.5, atol=atol, rtol=rtol) + assert_almost_equivalent(1.0, 1.5, atol=atol, rtol=rtol) - def test_assert_index_equal(self): - size = 10 + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_index_equal(self, size, left_as_arkouda, right_as_arkouda): + + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) # exact i1 = Index(ak.arange(size, dtype="float64")) i2 = Index(ak.arange(size, dtype="int64")) - assert_index_equal(i1, i2, exact=False) + if both_ak: + assert_index_equal(i1, i2, exact=False) + assert_index_equivalent(convert_left(i1), convert_right(i2), exact=False) with pytest.raises(AssertionError): - assert_index_equal(i1, i2, exact=True) + if both_ak: + assert_index_equal(i1, i2, exact=True) + assert_index_equivalent(convert_left(i1), convert_right(i2), exact=True) # check_names i3 = Index(ak.arange(size), name="name1") i4 = Index(ak.arange(size), name="name1") i5 = Index(ak.arange(size), name="name2") - assert_index_equal(i3, i4, check_names=True) - assert_index_equal(i3, i5, check_names=False) + if both_ak: + assert_index_equal(i3, i4, check_names=True) + assert_index_equivalent(convert_left(i3), convert_right(i4), 
check_names=True) + + if both_ak: + assert_index_equal(i3, i5, check_names=False) + assert_index_equivalent(convert_left(i3), convert_right(i5), check_names=False) + with pytest.raises(AssertionError): - assert_index_equal(i3, i5, check_names=True) + if both_ak: + assert_index_equal(i3, i5, check_names=True) + assert_index_equivalent(convert_left(i3), convert_right(i5), check_names=True) + + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_index_equal_categorical(self, size, left_as_arkouda, right_as_arkouda): + + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) - def test_assert_index_equal_categorical(self): # check_categorical # check_order i1 = Index(Categorical(ak.array(["a", "a", "b"]))) @@ -135,25 +236,51 @@ def test_assert_index_equal_categorical(self): i4 = Index(Categorical(ak.array(["a", "b", "c"]))) i5 = Index(Categorical(ak.array(["a", "a", "b"])).sort_values()) - assert_index_equal(i1, i1) - assert_index_equal(i1, i3, check_order=False) + if both_ak: + assert_index_equal(i1, i1) + assert_index_equivalent(convert_left(i1), convert_right(i1)) + + if both_ak: + assert_index_equal(i1, i3, check_order=False) + assert_index_equivalent(convert_left(i1), convert_right(i3), check_order=False) + with pytest.raises(AssertionError): - assert_index_equal(i1, i3, check_order=True) + if both_ak: + assert_index_equal(i1, i3, check_order=True) + assert_index_equivalent(convert_left(i1), convert_right(i3), check_order=True) with pytest.raises(AssertionError): - assert_index_equal(i1, i3, check_categorical=False) + if both_ak: + assert_index_equal(i1, i3, check_categorical=False) + assert_index_equivalent(convert_left(i1), convert_right(i3), check_categorical=False) with pytest.raises(AssertionError): - assert_index_equal(i1, 
i4, check_categorical=False) - assert_index_equal(i1, i5, check_order=True, check_categorical=True) + if both_ak: + assert_index_equal(i1, i4, check_categorical=False) + assert_index_equivalent(convert_left(i1), convert_right(i4), check_categorical=False) + if both_ak: + assert_index_equal(i1, i5, check_order=True, check_categorical=True) + assert_index_equivalent( + convert_left(i1), convert_right(i5), check_order=True, check_categorical=True + ) - def test_assert_index_equal_check_exact(self): - size = 10 + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_index_equal_check_exact(self, size, left_as_arkouda, right_as_arkouda): + + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) # check_exact i1 = Index(ak.arange(size, dtype="float64")) i2 = Index(ak.arange(size) + 1e-9) - assert_index_equal(i1, i2, check_exact=False) + if both_ak: + assert_index_equal(i1, i2, check_exact=False) + assert_index_equivalent(convert_left(i1), convert_right(i2), check_exact=False) with pytest.raises(AssertionError): - assert_index_equal(i1, i2, check_exact=True) + if both_ak: + assert_index_equal(i1, i2, check_exact=True) + assert_index_equivalent(convert_left(i1), convert_right(i2), check_exact=True) # rtol # atol @@ -164,24 +291,49 @@ def test_assert_index_equal_check_exact(self): rtol = 0.001 i3_atol = Index(ak.arange(size) + atol * rng.random()) - assert_index_equal(i3_float, i3_atol, check_exact=False, atol=atol) + if both_ak: + assert_index_equal(i3_float, i3_atol, check_exact=False, atol=atol) + assert_index_equivalent( + convert_left(i3_float), convert_right(i3_atol), check_exact=False, atol=atol + ) i3_atol_rtol = Index(ak.arange(size) + rtol * ak.arange(size) + atol * rng.random()) - assert_index_equal(i3_float, i3_atol_rtol, 
check_exact=False, atol=atol, rtol=rtol) + if both_ak: + assert_index_equal(i3_float, i3_atol_rtol, check_exact=False, atol=atol, rtol=rtol) + assert_index_equivalent( + convert_left(i3_float), convert_right(i3_atol_rtol), check_exact=False, atol=atol, rtol=rtol + ) i3_2rtol = Index(ak.arange(size) + ak.arange(size) * 2 * rtol) with pytest.raises(AssertionError): - assert_index_equal(i3_float, i3_2rtol, check_exact=False, rtol=rtol) + if both_ak: + assert_index_equal(i3_float, i3_2rtol, check_exact=False, rtol=rtol) + assert_index_equivalent( + convert_left(i3_float), convert_right(i3_2rtol), check_exact=False, rtol=rtol + ) i3_2atol = Index(ak.arange(size) + 2 * atol) with pytest.raises(AssertionError): - assert_index_equal(i3_float, i3_2atol, check_exact=False, atol=atol) + if both_ak: + assert_index_equal(i3_float, i3_2atol, check_exact=False, atol=atol) + assert_index_equivalent( + convert_left(i3_float), convert_right(i3_2atol), check_exact=False, atol=atol + ) + + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_index_equal_multiindex(self, size, left_as_arkouda, right_as_arkouda): + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) - def test_assert_index_equal_multiindex(self): m1 = self.build_multi_index(self) m2 = self.build_multi_index(self) - assert_index_equal(m1, m2) + if both_ak: + assert_index_equal(m1, m2) + assert_index_equivalent(convert_left(m1), convert_right(m2)) def test_assert_attr_equal_index(self): idx = self.build_index(self) @@ -318,35 +470,75 @@ def test_assert_categorical_equal(self): with pytest.raises(AssertionError): assert_categorical_equal(c1, c2, check_category_order=True) - def test_assert_series_equal_check_names(self): + @pytest.mark.parametrize("size", pytest.prob_size) + 
@pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_series_equal_check_names(self, size, left_as_arkouda, right_as_arkouda): + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) + s = Series(ak.array(["a", "b", "c"]), index=Index(ak.arange(3)), name="test") - assert_series_equal(s, s) + if both_ak: + assert_series_equal(s, s) + assert_series_equivalent(convert_left(s), convert_right(s)) # check_names s_diff_name = Series(ak.array(["a", "b", "c"]), index=Index(ak.arange(3)), name="different_name") - assert_series_equal(s, s_diff_name, check_names=False) - with pytest.raises(AssertionError): - assert_series_equal(s, s_diff_name, check_names=True) + if both_ak: + assert_series_equal(s, s_diff_name, check_names=False) + assert_series_equivalent(convert_left(s), convert_right(s_diff_name), check_names=False) + with pytest.raises(AssertionError): + if both_ak: + assert_series_equal(s, s_diff_name, check_names=True) + assert_series_equivalent(convert_left(s), convert_right(s_diff_name), check_names=True) + + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_series_equal(self, size, left_as_arkouda, right_as_arkouda): + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) - def test_assert_series_equal(self): s = Series(ak.array([1, 0, 2]), index=Index(ak.arange(3))) s_float = Series(ak.array([1.0, 0.0, 2.0]), index=Index(ak.arange(3) * 1.0)) - assert_series_equal(s, s) - assert_series_equal(s_float, s_float) + if both_ak: + assert_series_equal(s, s) + assert_series_equivalent(convert_left(s), convert_right(s)) + if both_ak: + 
assert_series_equal(s_float, s_float) + assert_series_equivalent(convert_left(s_float), convert_right(s_float)) # check_dtype - assert_series_equal(s, s_float, check_dtype=False, check_index_type=False) + if both_ak: + assert_series_equal(s, s_float, check_dtype=False, check_index_type=False) + assert_series_equivalent( + convert_left(s), convert_right(s_float), check_dtype=False, check_index_type=False + ) with pytest.raises(AssertionError): - assert_series_equal(s, s_float, check_dtype=False, check_index_type=True) + if both_ak: + assert_series_equal(s, s_float, check_dtype=False, check_index_type=True) + assert_series_equivalent( + convert_left(s), convert_right(s_float), check_dtype=False, check_index_type=True + ) with pytest.raises(AssertionError): - assert_series_equal(s, s_float, check_dtype=True, check_index_type=False) + if both_ak: + assert_series_equal(s, s_float, check_dtype=True, check_index_type=False) + assert_series_equivalent( + convert_left(s), convert_right(s_float), check_dtype=True, check_index_type=False + ) # check_index s_diff_index = Series(ak.array([1, 0, 2]), index=Index(ak.arange(3) * 2.0)) - assert_series_equal(s, s_diff_index, check_index=False) + if both_ak: + assert_series_equal(s, s_diff_index, check_index=False) + assert_series_equivalent(convert_left(s), convert_right(s_diff_index), check_index=False) with pytest.raises(AssertionError): - assert_series_equal(s, s_diff_index, check_index=True) + if both_ak: + assert_series_equal(s, s_diff_index, check_index=True) + assert_series_equivalent(convert_left(s), convert_right(s_diff_index), check_index=True) rng = ak.random.default_rng() atol = 0.001 @@ -373,22 +565,60 @@ def test_assert_series_equal(self): index=Index(ak.arange(3) + 2 * atol), ) - assert_series_equal(s_float, s_atol, check_exact=False, atol=atol) - assert_series_equal(s_float, s_rtol_atol, check_exact=False, atol=atol, rtol=rtol) + if both_ak: + assert_series_equal(s_float, s_atol, check_exact=False, atol=atol) + 
assert_series_equivalent( + convert_left(s_float), convert_right(s_atol), check_exact=False, atol=atol + ) + if both_ak: + assert_series_equal(s_float, s_rtol_atol, check_exact=False, atol=atol, rtol=rtol) + assert_series_equivalent( + convert_left(s_float), convert_right(s_rtol_atol), check_exact=False, atol=atol, rtol=rtol + ) with pytest.raises(AssertionError): - assert_series_equal(s_float, s_2rtol, check_exact=False, rtol=rtol) + if both_ak: + assert_series_equal(s_float, s_2rtol, check_exact=False, rtol=rtol) + assert_series_equivalent( + convert_left(s_float), convert_right(s_2rtol), check_exact=False, rtol=rtol + ) with pytest.raises(AssertionError): - assert_series_equal(s_float, s_2atol, check_exact=False, atol=atol) + if both_ak: + assert_series_equal(s_float, s_2atol, check_exact=False, atol=atol) + assert_series_equivalent( + convert_left(s_float), convert_right(s_2atol), check_exact=False, atol=atol + ) + + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_series_equal_check_like(self, size, left_as_arkouda, right_as_arkouda): + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) - def test_assert_series_equal_check_like(self): # check_like s_unordered_index = Series(ak.array([1, 0, 2]), index=Index(ak.array([0, 2, 1]))) s_ordered_index = s_unordered_index.sort_index() - assert_series_equal(s_ordered_index, s_unordered_index, check_like=True) + if both_ak: + assert_series_equal(s_ordered_index, s_unordered_index, check_like=True) + assert_series_equivalent( + convert_left(s_ordered_index), convert_right(s_unordered_index), check_like=True + ) with pytest.raises(AssertionError): - assert_series_equal(s_ordered_index, s_unordered_index, check_like=False) + if both_ak: + assert_series_equal(s_ordered_index, 
s_unordered_index, check_like=False) + assert_series_equivalent( + convert_left(s_ordered_index), convert_right(s_unordered_index), check_like=False + ) + + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_series_equal_categorical(self, size, left_as_arkouda, right_as_arkouda): + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) - def test_assert_series_equal_categorical(self): # check_categorical # check_category_order @@ -402,75 +632,177 @@ def test_assert_series_equal_categorical(self): index=Index(Categorical(ak.array(["a", "a", "b"]))), name="test", ) - assert_series_equal(s3a, s3a) - with pytest.raises(AssertionError): - assert_series_equal(s3a, s3b, check_categorical=True, check_category_order=True) - assert_series_equal(s3a, s3b, check_categorical=True, check_category_order=False) + if both_ak: + assert_series_equal(s3a, s3a) + assert_series_equivalent(convert_left(s3a), convert_right(s3a)) + with pytest.raises(AssertionError): + if both_ak: + assert_series_equal(s3a, s3b, check_categorical=True, check_category_order=True) + assert_series_equivalent( + convert_left(s3a), convert_right(s3b), check_categorical=True, check_category_order=True + ) + if both_ak: + assert_series_equal(s3a, s3b, check_categorical=True, check_category_order=False) + assert_series_equivalent( + convert_left(s3a), convert_right(s3b), check_categorical=True, check_category_order=False + ) + + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_frame_equal(self, size, left_as_arkouda, right_as_arkouda): + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + 
convert_right = self.get_converter(right_as_arkouda) - def test_assert_frame_equal(self): df = self.build_ak_df(self) df2 = self.build_ak_df(self) - assert_frame_equal(df, df2) + if both_ak: + assert_frame_equal(df, df2) + assert_frame_equivalent(convert_left(df), convert_right(df2)) + + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_frame_equal_segarray(self, size, left_as_arkouda, right_as_arkouda): + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) - def test_assert_frame_equal_segarray(self): akdf = ak.DataFrame({"rand": ak.SegArray(ak.array([0, 3, 9]), ak.arange(10))}) - assert_frame_equal(akdf, akdf) + if both_ak: + assert_frame_equal(akdf, akdf) + assert_frame_equivalent(convert_left(akdf), convert_right(akdf)) + + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_frame_equal_check_dtype(self, size, left_as_arkouda, right_as_arkouda): + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) - def test_assert_frame_equal_check_dtype(self): df = self.build_ak_df(self) # check_dtype df_cpy = df.copy(deep=True) - assert_frame_equal(df, df_cpy, check_dtype=True) - df_cpy["day"] = cast(df_cpy["day"], dt="float64") - assert_frame_equal(df_cpy, df_cpy, check_dtype=True) - assert_frame_equal(df, df_cpy, check_dtype=False) - with pytest.raises(AssertionError): + if both_ak: assert_frame_equal(df, df_cpy, check_dtype=True) + assert_frame_equivalent(convert_left(df), convert_right(df_cpy), check_dtype=True) + df_cpy["day"] = cast(df_cpy["day"], dt="float64") + if both_ak: + 
assert_frame_equal(df_cpy, df_cpy, check_dtype=True) + assert_frame_equivalent(convert_left(df_cpy), convert_right(df_cpy), check_dtype=True) + if both_ak: + assert_frame_equal(df, df_cpy, check_dtype=False) + assert_frame_equivalent(convert_left(df), convert_right(df_cpy), check_dtype=False) + with pytest.raises(AssertionError): + if both_ak: + assert_frame_equal(df, df_cpy, check_dtype=True) + assert_frame_equivalent(convert_left(df), convert_right(df_cpy), check_dtype=True) + + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_frame_equal_check_index_type(self, size, left_as_arkouda, right_as_arkouda): + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) - def test_assert_frame_equal_check_index_type(self): df = self.build_ak_df(self) # check_index_type df_float_index = self.build_ak_df(self, index_dtype="float64") - assert_frame_equal(df, df_float_index, check_index_type=False) - with pytest.raises(AssertionError): - assert_frame_equal(df, df_float_index, check_index_type=True) + if both_ak: + assert_frame_equal(df, df_float_index, check_index_type=False) + assert_frame_equivalent(convert_left(df), convert_right(df_float_index), check_index_type=False) + with pytest.raises(AssertionError): + if both_ak: + assert_frame_equal(df, df_float_index, check_index_type=True) + assert_frame_equivalent( + convert_left(df), convert_right(df_float_index), check_index_type=True + ) + + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_frame_equal_check_names(self, size, left_as_arkouda, right_as_arkouda): + both_ak = left_as_arkouda and right_as_arkouda + convert_left = 
self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) - def test_assert_frame_equal_check_names(self): # check_names df_name1 = self.build_ak_df(self, index_name="name1") df_name2 = self.build_ak_df(self, index_name="name2") - assert_frame_equal(df_name1, df_name2, check_names=False) - with pytest.raises(AssertionError): - assert_frame_equal(df_name1, df_name2, check_names=True) + if both_ak: + assert_frame_equal(df_name1, df_name2, check_names=False) + assert_frame_equivalent(convert_left(df_name1), convert_right(df_name2), check_names=False) + with pytest.raises(AssertionError): + if both_ak: + assert_frame_equal(df_name1, df_name2, check_names=True) + assert_frame_equivalent(convert_left(df_name1), convert_right(df_name2), check_names=True) + + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_frame_equal_check_like(self, size, left_as_arkouda, right_as_arkouda): + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) - def test_assert_frame_equal_check_like(self): df = self.build_ak_df(self) # check_like df_sorted = df.sort_values("amount") - assert_frame_equal(df, df_sorted, check_like=True) + if both_ak: + assert_frame_equal(df, df_sorted, check_like=True) + assert_frame_equivalent(convert_left(df), convert_right(df_sorted), check_like=True) with pytest.raises(AssertionError): - assert_frame_equal(df, df_sorted, check_like=False) + if both_ak: + assert_frame_equal(df, df_sorted, check_like=False) + assert_frame_equivalent(convert_left(df), convert_right(df_sorted), check_like=False) df_new_col_order = df[["bi", "userID", "day", "item", "amount", "userName"]] - assert_frame_equal(df, df_new_col_order, check_like=True) - with pytest.raises(AssertionError): - assert_frame_equal(df, 
df_new_col_order, check_column_type=True) + if both_ak: + assert_frame_equal(df, df_new_col_order, check_like=True) + assert_frame_equivalent(convert_left(df), convert_right(df_new_col_order), check_like=True) + with pytest.raises(AssertionError): + if both_ak: + assert_frame_equal(df, df_new_col_order, check_column_type=True) + assert_frame_equivalent( + convert_left(df), convert_right(df_new_col_order), check_column_type=True + ) + + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_frame_equal_check_categorical(self, size, left_as_arkouda, right_as_arkouda): + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) - def test_assert_frame_equal_check_categorical(self): # check_categorical df = self.build_ak_df(self) df["userName"] = Categorical(df["userName"]) df_ordered = self.build_ak_df(self) df_ordered["userName"] = Categorical(df_ordered["userName"]).sort_values() - assert_frame_equal(df, df_ordered, check_categorical=False) + if both_ak: + assert_frame_equal(df, df_ordered, check_categorical=False) + assert_frame_equivalent(convert_left(df), convert_right(df_ordered), check_categorical=False) with pytest.raises(AssertionError): - assert_frame_equal(df, df_ordered, check_categorical=True) + if both_ak: + assert_frame_equal(df, df_ordered, check_categorical=True) + assert_frame_equivalent(convert_left(df), convert_right(df_ordered), check_categorical=True) + + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_frame_equal_check_exact(self, size, left_as_arkouda, right_as_arkouda): + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + 
convert_right = self.get_converter(right_as_arkouda) - def test_assert_frame_equal_check_exact(self): # check_exact # rtol # atol @@ -484,16 +816,37 @@ def test_assert_frame_equal_check_exact(self): df_rtol_atol["amount"] + rtol * df_rtol_atol["amount"] + rng.random() * atol ) - assert_frame_equal(df, df_rtol_atol, check_exact=False, atol=atol, rtol=rtol) + if both_ak: + assert_frame_equal(df, df_rtol_atol, check_exact=False, atol=atol, rtol=rtol) + assert_frame_equivalent( + convert_left(df), convert_right(df_rtol_atol), check_exact=False, atol=atol, rtol=rtol + ) with pytest.raises(AssertionError): - assert_frame_equal(df, df_rtol_atol, check_exact=True) + if both_ak: + assert_frame_equal(df, df_rtol_atol, check_exact=True) + assert_frame_equivalent(convert_left(df), convert_right(df_rtol_atol), check_exact=True) with pytest.raises(AssertionError): - assert_frame_equal(df, df_rtol_atol, check_exact=False, rtol=rtol) + if both_ak: + assert_frame_equal(df, df_rtol_atol, check_exact=False, rtol=rtol) + assert_frame_equivalent( + convert_left(df), convert_right(df_rtol_atol), check_exact=False, rtol=rtol + ) with pytest.raises(AssertionError): - assert_frame_equal(df, df_rtol_atol, check_exact=False, atol=atol) + if both_ak: + assert_frame_equal(df, df_rtol_atol, check_exact=False, atol=atol) + assert_frame_equivalent( + convert_left(df), convert_right(df_rtol_atol), check_exact=False, atol=atol + ) + + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_equal(self, size, left_as_arkouda, right_as_arkouda): + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) - def test_assert_equal(self): size = 10 a = ak.arange(size) a2 = a + 1 @@ -504,33 +857,59 @@ def test_assert_equal(self): df = DataFrame({"col": a}, index=idx) df2 = 
DataFrame({"col": a2}, index=idx2) - assert_equal(a, a) + if both_ak: + assert_equal(a, a) + assert_equivalent(convert_left(a), convert_right(a)) with pytest.raises(AssertionError): - assert_equal(a, a2) + if both_ak: + assert_equal(a, a2) + assert_equivalent(convert_left(a), convert_right(a2)) - assert_equal(idx, idx) + if both_ak: + assert_equal(idx, idx) + assert_equivalent(convert_left(idx), convert_right(idx)) with pytest.raises(AssertionError): - assert_equal(idx, idx2) + if both_ak: + assert_equal(idx, idx2) + assert_equivalent(convert_left(idx), convert_right(idx2)) - assert_equal(s, s) + if both_ak: + assert_equal(s, s) + assert_equivalent(convert_left(s), convert_right(s)) with pytest.raises(AssertionError): - assert_equal(s, s2) + if both_ak: + assert_equal(s, s2) + assert_equivalent(convert_left(s), convert_right(s2)) - assert_equal(df, df) + if both_ak: + assert_equal(df, df) + assert_equivalent(convert_left(df), convert_right(df)) with pytest.raises(AssertionError): - assert_equal(df, df2) + if both_ak: + assert_equal(df, df2) + assert_equivalent(convert_left(df), convert_right(df2)) + + def test_assert_equal_scalars(self): st = "string1" st2 = "string2" + assert_equal(st, st) + assert_equivalent(st, st) + with pytest.raises(AssertionError): assert_equal(st, st2) + assert_equivalent(st, st2) n = 1.0 n2 = 1.5 + assert_equal(n, n) + assert_equivalent(n, n) + with pytest.raises(AssertionError): assert_equal(n, n2) + assert_equivalent(n, n2) def test_assert_contains_all(self): d = {"a": 1, "b": 2, "c": 3} @@ -553,38 +932,63 @@ def test_assert_copy(self): arrays3 = [ak.arange(10), ak.arange(10)] assert_copy(arrays, arrays3) - def test_assert_arkouda_array_equal(self): + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_arkouda_array_equal(self, size, left_as_arkouda, right_as_arkouda): + both_ak = left_as_arkouda and 
right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) + size = 10 a = ak.arange(size) a2 = a + 1 - assert_arkouda_array_equal(a, a) + if both_ak: + assert_arkouda_array_equal(a, a) + assert_arkouda_array_equivalent(convert_left(a), convert_right(a)) with pytest.raises(AssertionError): - assert_arkouda_array_equal(a, a2) + if both_ak: + assert_arkouda_array_equal(a, a2) + assert_arkouda_array_equivalent(convert_left(a), convert_right(a2)) s = ak.array(["a", "b", "b"]) s2 = ak.array(["a", "b", "c"]) - assert_arkouda_array_equal(s, s) + if both_ak: + assert_arkouda_array_equal(s, s) + assert_arkouda_array_equivalent(convert_left(s), convert_right(s)) with pytest.raises(AssertionError): - assert_arkouda_array_equal(s, s2) + if both_ak: + assert_arkouda_array_equal(s, s2) + assert_arkouda_array_equivalent(convert_left(s), convert_right(s2)) c = Categorical(s) c2 = Categorical(s2) - assert_arkouda_array_equal(c, c) + if both_ak: + assert_arkouda_array_equal(c, c) + assert_arkouda_array_equivalent(convert_left(c), convert_right(c)) with pytest.raises(AssertionError): - assert_arkouda_array_equal(c, c2) + if both_ak: + assert_arkouda_array_equal(c, c2) + assert_arkouda_array_equivalent(convert_left(c), convert_right(c2)) with pytest.raises(AssertionError): - assert_arkouda_array_equal(a, s) + if both_ak: + assert_arkouda_array_equal(a, s) + assert_arkouda_array_equivalent(convert_left(a), convert_right(s)) with pytest.raises(AssertionError): - assert_arkouda_array_equal(s, c) + if both_ak: + assert_arkouda_array_equal(s, c) + assert_arkouda_array_equivalent(convert_left(s), convert_right(c)) def test_assert_arkouda_segarray_equal(self): + seg = ak.SegArray(ak.array([0, 3, 9]), ak.arange(10)) seg_cpy = ak.SegArray(ak.array([0, 3, 9]), ak.arange(10)) seg_float = ak.SegArray(ak.array([0, 3, 9]), ak.arange(10, dtype="float64")) assert_arkouda_segarray_equal(seg, seg) + assert_arkouda_segarray_equal(seg, 
seg, check_same="same") with pytest.raises(AssertionError): assert_arkouda_segarray_equal(seg, seg, check_same="copy") @@ -598,10 +1002,20 @@ def test_assert_arkouda_segarray_equal(self): with pytest.raises(AssertionError): assert_arkouda_segarray_equal(seg, seg_float, check_dtype=True) - def test_assert_arkouda_array_equal_bigint(self): - size = 10 + @pytest.mark.parametrize("size", pytest.prob_size) + @pytest.mark.parametrize("left_as_arkouda", [True, False]) + @pytest.mark.parametrize("right_as_arkouda", [True, False]) + def test_assert_arkouda_array_equal_bigint(self, size, left_as_arkouda, right_as_arkouda): + both_ak = left_as_arkouda and right_as_arkouda + convert_left = self.get_converter(left_as_arkouda) + convert_right = self.get_converter(right_as_arkouda) + a = ak.arange(size, dtype=ak.bigint) + (2**64 - size - 1) a2 = a + 1 - assert_arkouda_array_equal(a, a) + if both_ak: + assert_arkouda_array_equal(a, a) + assert_arkouda_array_equivalent(convert_left(a), convert_right(a)) with pytest.raises(AssertionError): - assert_arkouda_array_equal(a, a2) + if both_ak: + assert_arkouda_array_equal(a, a2) + assert_arkouda_array_equivalent(convert_left(a), convert_right(a2)) diff --git a/arkouda/categorical.py b/arkouda/categorical.py index 5cea888a4f..13c9a7e312 100644 --- a/arkouda/categorical.py +++ b/arkouda/categorical.py @@ -16,6 +16,7 @@ ) import numpy as np +from pandas import Categorical as pd_Categorical from typeguard import typechecked from arkouda.client import generic_msg @@ -48,8 +49,8 @@ class Categorical: Parameters ---------- - values : Strings - String values to convert to categories + values : Strings, Categorical, pd.Categorical + Values to convert to categories NAvalue : str scalar The value to use to represent missing/null data @@ -107,16 +108,32 @@ def __init__(self, values, **kwargs) -> None: self._categories_used = self.categories[unique_codes] else: # Typical initialization, called with values - if not isinstance(values, Strings): - raise 
ValueError(("Categorical: inputs other than " + "Strings not yet supported")) - g = GroupBy(values) - self.categories = g.unique_keys - self.codes = g.broadcast(arange(self.categories.size), permute=True) - self.permutation = cast(pdarray, g.permutation) - self.segments = g.segments - # Make a copy because N/A value must be added below - self._categories_used = self.categories[:] - + if isinstance(values, pd_Categorical): + self.values = array(values.to_numpy()) + self.categories = array(values.categories) + self.codes = array(values.codes.astype("int64")) + self._categories_used = self.categories[unique(self.codes)] + self.permutation = None + self.segments = None + elif isinstance(values, Categorical): + self.values = values.values + self.categories = values.categories + self.codes = values.codes + self._categories_used = values._categories_used + self.permutation = values.permutation + self.segments = values.segments + elif isinstance(values, Strings): + g = GroupBy(values) + self.categories = g.unique_keys + self.codes = g.broadcast(arange(self.categories.size), permute=True) + self.permutation = cast(pdarray, g.permutation) + self.segments = g.segments + # Make a copy because N/A value must be added below + self._categories_used = self.categories[:] + else: + raise ValueError( + ("Categorical: inputs other than " + "Strings or pd.Categorical not yet supported") + ) # When read from file or attached, NA code will be passed as a pdarray # Otherwise, the NA value is set to a string if "_akNAcode" in kwargs and kwargs["_akNAcode"] is not None: @@ -399,6 +416,14 @@ def to_ndarray(self) -> np.ndarray: valcodes = self.codes.to_ndarray() return idx[valcodes] + def to_pandas(self) -> pd_Categorical: + """ + Return the equivalent Pandas Categorical. 
+ """ + return pd_Categorical.from_codes( + codes=self.codes.to_ndarray(), categories=self.categories.to_ndarray() + ) + def to_list(self) -> List: """ Convert the Categorical to a list, transferring data from diff --git a/arkouda/dataframe.py b/arkouda/dataframe.py index 218b5ef386..94ed80b584 100644 --- a/arkouda/dataframe.py +++ b/arkouda/dataframe.py @@ -897,12 +897,16 @@ def __init__(self, initialdata=None, index=None, columns=None): self._set_index(index) self.data = {} for key in initialdata.columns: - self.data[key] = ( - SegArray.from_multi_array([array(r) for r in initialdata[key]]) - if hasattr(initialdata[key], "values") - and isinstance(initialdata[key].values[0], (list, np.ndarray)) - else array(initialdata[key]) - ) + if hasattr(initialdata[key], "values") and isinstance( + initialdata[key].values[0], (list, np.ndarray) + ): + self.data[key] = SegArray.from_multi_array([array(r) for r in initialdata[key]]) + elif hasattr(initialdata[key], "values") and isinstance( + initialdata[key].values, pd.Categorical + ): + self.data[key] = Categorical(initialdata[key].values) + else: + self.data[key] = array(initialdata[key]) self.data.update() return @@ -2888,6 +2892,9 @@ def to_pandas(self, datalimit=maxTransferBytes, retain_index=False): nbytes += (val.dtype).itemsize * self._nrows elif isinstance(val, Strings): nbytes += val.nbytes + elif isinstance(val, Categorical): + nbytes += val.codes.nbytes + nbytes += val.categories.nbytes KB = 1024 MB = KB * KB @@ -2919,7 +2926,12 @@ def to_pandas(self, datalimit=maxTransferBytes, retain_index=False): try: # in order for proper pandas functionality, SegArrays must be seen as 1d # and therefore need to be converted to list - pandas_data[key] = val.to_ndarray() if not isinstance(val, SegArray) else val.to_list() + if isinstance(val, SegArray): + pandas_data[key] = val.to_list() + elif isinstance(val, Categorical): + pandas_data[key] = val.to_pandas() + else: + pandas_data[key] = val.to_ndarray() except TypeError: 
raise IndexError("Bad index type or format.") diff --git a/arkouda/index.py b/arkouda/index.py index d4887c362c..47afef4d55 100644 --- a/arkouda/index.py +++ b/arkouda/index.py @@ -34,7 +34,7 @@ class Index: Parameters ---------- - values: List, pdarray, Strings, Categorical, pandas.Index, or Index + values: List, pdarray, Strings, Categorical, pandas.Categorical, pandas.Index, or Index name : str, default=None Name to be stored in the index. allow_list = False, @@ -68,20 +68,27 @@ class Index: @typechecked def __init__( self, - values: Union[List, pdarray, Strings, Categorical, pd.Index, "Index"], + values: Union[List, pdarray, Strings, Categorical, pd.Index, "Index", pd.Categorical], name: Optional[str] = None, allow_list=False, max_list_size=1000, ): self.max_list_size = max_list_size self.registered_name: Optional[str] = None + + if isinstance(values, pd.Categorical): + values = Categorical(values) + if isinstance(values, Index): self.values = values.values self.size = values.size self.dtype = values.dtype self.name = name if name else values.name elif isinstance(values, pd.Index): - self.values = array(values.values) + if isinstance(values.values, pd.Categorical): + self.values = Categorical(values.values) + else: + self.values = array(values.values) self.size = values.size self.dtype = self.values.dtype self.name = name if name else values.name @@ -382,10 +389,16 @@ def memory_usage(self, unit="B"): return convert_bytes(self.values.nbytes, unit=unit) def to_pandas(self): + """ + Return the equivalent Pandas Index. 
+ """ if isinstance(self.values, list): val = ndarray(self.values) + elif isinstance(self.values, Categorical): + val = self.values.to_pandas() + return pd.CategoricalIndex(data=val, dtype=val.dtype, name=self.name) else: - val = convert_if_categorical(self.values).to_ndarray() + val = self.values.to_ndarray() return pd.Index(data=val, dtype=val.dtype, name=self.name) def to_ndarray(self): @@ -459,25 +472,27 @@ def register(self, user_defined_name): "objType": self.objType, "num_idxs": 1, "idx_names": [ - json.dumps( - { - "codes": self.values.codes.name, - "categories": self.values.categories.name, - "NA_codes": self.values._akNAcode.name, - **( - {"permutation": self.values.permutation.name} - if self.values.permutation is not None - else {} - ), - **( - {"segments": self.values.segments.name} - if self.values.segments is not None - else {} - ), - } + ( + json.dumps( + { + "codes": self.values.codes.name, + "categories": self.values.categories.name, + "NA_codes": self.values._akNAcode.name, + **( + {"permutation": self.values.permutation.name} + if self.values.permutation is not None + else {} + ), + **( + {"segments": self.values.segments.name} + if self.values.segments is not None + else {} + ), + } + ) + if isinstance(self.values, Categorical) + else self.values.name ) - if isinstance(self.values, Categorical) - else self.values.name ], "idx_types": [self.values.objType], }, @@ -714,24 +729,26 @@ def to_hdf( raise TypeError("Unable to write Index to hdf when values are a list.") index_data = [ - self.values.name - if not isinstance(self.values, (Categorical_)) - else json.dumps( - { - "codes": self.values.codes.name, - "categories": self.values.categories.name, - "NA_codes": self.values._akNAcode.name, - **( - {"permutation": self.values.permutation.name} - if self.values.permutation is not None - else {} - ), - **( - {"segments": self.values.segments.name} - if self.values.segments is not None - else {} - ), - } + ( + self.values.name + if not 
isinstance(self.values, (Categorical_)) + else json.dumps( + { + "codes": self.values.codes.name, + "categories": self.values.categories.name, + "NA_codes": self.values._akNAcode.name, + **( + {"permutation": self.values.permutation.name} + if self.values.permutation is not None + else {} + ), + **( + {"segments": self.values.segments.name} + if self.values.segments is not None + else {} + ), + } + ) ) ] return typecast( @@ -805,24 +822,26 @@ def update_hdf( file_type = _get_hdf_filetype(prefix_path + "*") index_data = [ - self.values.name - if not isinstance(self.values, (Categorical_)) - else json.dumps( - { - "codes": self.values.codes.name, - "categories": self.values.categories.name, - "NA_codes": self.values._akNAcode.name, - **( - {"permutation": self.values.permutation.name} - if self.values.permutation is not None - else {} - ), - **( - {"segments": self.values.segments.name} - if self.values.segments is not None - else {} - ), - } + ( + self.values.name + if not isinstance(self.values, (Categorical_)) + else json.dumps( + { + "codes": self.values.codes.name, + "categories": self.values.categories.name, + "NA_codes": self.values._akNAcode.name, + **( + {"permutation": self.values.permutation.name} + if self.values.permutation is not None + else {} + ), + **( + {"segments": self.values.segments.name} + if self.values.segments is not None + else {} + ), + } + ) ) ] @@ -1214,8 +1233,10 @@ def memory_usage(self, unit="B"): return convert_bytes(nbytes, unit=unit) def to_pandas(self): - idx = [convert_if_categorical(i) for i in self.index] - mi = pd.MultiIndex.from_arrays([i.to_ndarray() for i in idx], names=self.names) + mi = pd.MultiIndex.from_arrays( + [i.to_pandas() if isinstance(i, Categorical) else i.to_ndarray() for i in self.index], + names=self.names, + ) return pd.Series(index=mi, dtype="float64", name=self.name).index def set_dtype(self, dtype): @@ -1279,17 +1300,23 @@ def register(self, user_defined_name): "objType": self.objType, "num_idxs": 
len(self.levels), "idx_names": [ - json.dumps( - { - "codes": v.codes.name, - "categories": v.categories.name, - "NA_codes": v._akNAcode.name, - **({"permutation": v.permutation.name} if v.permutation is not None else {}), - **({"segments": v.segments.name} if v.segments is not None else {}), - } + ( + json.dumps( + { + "codes": v.codes.name, + "categories": v.categories.name, + "NA_codes": v._akNAcode.name, + **( + {"permutation": v.permutation.name} + if v.permutation is not None + else {} + ), + **({"segments": v.segments.name} if v.segments is not None else {}), + } + ) + if isinstance(v, Categorical) + else v.name ) - if isinstance(v, Categorical) - else v.name for v in self.levels ], "idx_types": [v.objType for v in self.levels], @@ -1409,16 +1436,18 @@ def to_hdf( from arkouda.io import _file_type_to_int, _mode_str_to_int index_data = [ - obj.name - if not isinstance(obj, (Categorical_)) - else json.dumps( - { - "codes": obj.codes.name, - "categories": obj.categories.name, - "NA_codes": obj._akNAcode.name, - **({"permutation": obj.permutation.name} if obj.permutation is not None else {}), - **({"segments": obj.segments.name} if obj.segments is not None else {}), - } + ( + obj.name + if not isinstance(obj, (Categorical_)) + else json.dumps( + { + "codes": obj.codes.name, + "categories": obj.categories.name, + "NA_codes": obj._akNAcode.name, + **({"permutation": obj.permutation.name} if obj.permutation is not None else {}), + **({"segments": obj.segments.name} if obj.segments is not None else {}), + } + ) ) for obj in self.levels ] @@ -1498,16 +1527,18 @@ def update_hdf( file_type = _get_hdf_filetype(prefix_path + "*") index_data = [ - obj.name - if not isinstance(obj, (Categorical_)) - else json.dumps( - { - "codes": obj.codes.name, - "categories": obj.categories.name, - "NA_codes": obj._akNAcode.name, - **({"permutation": obj.permutation.name} if obj.permutation is not None else {}), - **({"segments": obj.segments.name} if obj.segments is not None else {}), 
- } + ( + obj.name + if not isinstance(obj, (Categorical_)) + else json.dumps( + { + "codes": obj.codes.name, + "categories": obj.categories.name, + "NA_codes": obj._akNAcode.name, + **({"permutation": obj.permutation.name} if obj.permutation is not None else {}), + **({"segments": obj.segments.name} if obj.segments is not None else {}), + } + ) ) for obj in self.levels ] diff --git a/arkouda/series.py b/arkouda/series.py index a3592cd562..c75c7feb1f 100644 --- a/arkouda/series.py +++ b/arkouda/series.py @@ -17,7 +17,6 @@ from arkouda.index import Index, MultiIndex from arkouda.numeric import cast as akcast from arkouda.numeric import isnan, value_counts -from arkouda.segarray import SegArray from arkouda.pdarrayclass import ( RegistrationError, any, @@ -27,8 +26,9 @@ ) from arkouda.pdarraycreation import arange, array, full, zeros from arkouda.pdarraysetops import argsort, concatenate, in1d, indexof1d +from arkouda.segarray import SegArray from arkouda.strings import Strings -from arkouda.util import convert_if_categorical, get_callback, is_float +from arkouda.util import get_callback, is_float # pd.set_option("display.max_colwidth", 65) is being called in DataFrame.py. This will resolve BitVector # truncation issues. If issues arise, that's where to look for it. 
@@ -133,11 +133,16 @@ class Series: @typechecked def __init__( self, - data: Union[Tuple, List, groupable_element_type, Series, SegArray], + data: Union[Tuple, List, groupable_element_type, Series, SegArray, pd.Series, pd.Categorical], name=None, index: Optional[Union[pdarray, Strings, Tuple, List, Index]] = None, ): + + if isinstance(data, pd.Categorical): + data = Categorical(data) + self.registered_name: Optional[str] = None + if index is None and isinstance(data, (tuple, list)) and len(data) == 2: # handles the previous `ar_tuple` case if not isinstance(data[0], (pdarray, Index, Strings, Categorical, list, tuple)): @@ -146,6 +151,13 @@ def __init__( raise TypeError("values must be a pdarray, Strings, SegArray, or Categorical") self.values = data[1] if not isinstance(data[1], Series) else data[1].values self.index = Index.factory(index) if index else Index.factory(data[0]) + elif isinstance(data, pd.Series): + if isinstance(data.values, pd.Categorical): + self.values = Categorical(data.values) + else: + self.values = array(data.values) + self.index = Index(data.index) + self.name = data.name elif isinstance(data, tuple) and len(data) != 2: raise TypeError("Series initialization requries a tuple of (index, values)") else: @@ -162,7 +174,10 @@ def __init__( raise ValueError( "Index size does not match data size: {} != {}".format(self.index.size, self.values.size) ) - self.name = name + if name is None and isinstance(data, (Series, pd.Series)): + self.name = data.name + else: + self.name = name self.size = self.index.size def __len__(self): @@ -737,16 +752,21 @@ def to_pandas(self) -> pd.Series: import copy idx = self.index.to_pandas() - val = convert_if_categorical(self.values) - # pandas errors when ndarray formatted like a segarray is - # passed into Series but works when it's just a list of lists - vals_on_client = val.to_list() if isinstance(val, SegArray) else val.to_ndarray() + + if isinstance(self.values, Categorical): + val = self.values.to_pandas() + elif 
isinstance(self.values, SegArray): + # pandas errors when ndarray formatted like a segarray is + # passed into Series but works when it's just a list of lists + val = self.values.to_list() + else: + val = self.values.to_ndarray() if isinstance(self.name, str): name = copy.copy(self.name) - return pd.Series(vals_on_client, index=idx, name=name) + return pd.Series(val, index=idx, name=name) else: - return pd.Series(vals_on_client, index=idx) + return pd.Series(val, index=idx) def to_markdown(self, mode="wt", index=True, tablefmt="grid", storage_options=None, **kwargs): r""" @@ -917,46 +937,50 @@ def register(self, user_defined_name: str): "objType": self.objType, "num_idxs": 1, "idx_names": [ + ( + json.dumps( + { + "codes": self.index.values.codes.name, + "categories": self.index.values.categories.name, + "NA_codes": self.index.values._akNAcode.name, + **( + {"permutation": self.index.values.permutation.name} + if self.index.values.permutation is not None + else {} + ), + **( + {"segments": self.index.values.segments.name} + if self.index.values.segments is not None + else {} + ), + } + ) + if isinstance(self.index.values, Categorical) + else self.index.values.name + ) + ], + "idx_types": [self.index.values.objType], + "values": ( json.dumps( { - "codes": self.index.values.codes.name, - "categories": self.index.values.categories.name, - "NA_codes": self.index.values._akNAcode.name, + "codes": self.values.codes.name, + "categories": self.values.categories.name, + "NA_codes": self.values._akNAcode.name, **( - {"permutation": self.index.values.permutation.name} - if self.index.values.permutation is not None + {"permutation": self.values.permutation.name} + if self.values.permutation is not None else {} ), **( - {"segments": self.index.values.segments.name} - if self.index.values.segments is not None + {"segments": self.values.segments.name} + if self.values.segments is not None else {} ), } ) - if isinstance(self.index.values, Categorical) - else self.index.values.name 
- ], - "idx_types": [self.index.values.objType], - "values": json.dumps( - { - "codes": self.values.codes.name, - "categories": self.values.categories.name, - "NA_codes": self.values._akNAcode.name, - **( - {"permutation": self.values.permutation.name} - if self.values.permutation is not None - else {} - ), - **( - {"segments": self.values.segments.name} - if self.values.segments is not None - else {} - ), - } - ) - if isinstance(self.values, Categorical) - else self.values.name, + if isinstance(self.values, Categorical) + else self.values.name + ), "val_type": self.values.objType, }, ) diff --git a/arkouda/testing/__init__.py b/arkouda/testing/__init__.py index af956e108f..b076e5193a 100644 --- a/arkouda/testing/__init__.py +++ b/arkouda/testing/__init__.py @@ -16,6 +16,14 @@ assert_is_sorted, assert_series_equal, ) +from ._equivalence_asserters import ( + assert_almost_equivalent, + assert_arkouda_array_equivalent, + assert_equivalent, + assert_frame_equivalent, + assert_index_equivalent, + assert_series_equivalent, +) __all__ = [ "assert_almost_equal", @@ -34,4 +42,10 @@ "assert_index_equal", "assert_is_sorted", "assert_series_equal", + "assert_almost_equivalent", + "assert_arkouda_array_equivalent", + "assert_equivalent", + "assert_frame_equivalent", + "assert_index_equivalent", + "assert_series_equivalent", ] diff --git a/arkouda/testing/_asserters.py b/arkouda/testing/_asserters.py index 64ae92f166..2ba9646168 100644 --- a/arkouda/testing/_asserters.py +++ b/arkouda/testing/_asserters.py @@ -11,6 +11,7 @@ DataFrame, Index, MultiIndex, + SegArray, Series, Strings, argsort, @@ -18,7 +19,6 @@ pdarray, sort, ) -from arkouda import SegArray from arkouda import sum as aksum from arkouda.util import is_numeric @@ -539,6 +539,10 @@ def assert_arkouda_pdarray_equal( # both classes must be an ak.pdarray _check_isinstance(left, right, pdarray) + assert len(left) == len( + right + ), f"Arrays were not same size. 
left had length {len(left)} and right had length {len(right)}" + def _get_base(obj): return obj.base if getattr(obj, "base", None) is not None else obj @@ -715,8 +719,8 @@ def _raise(left: Strings, right: Strings, err_msg): def assert_arkouda_array_equal( - left: pdarray | Strings | Categorical, - right: pdarray | Strings | Categorical, + left: pdarray | Strings | Categorical | SegArray, + right: pdarray | Strings | Categorical | SegArray, check_dtype: bool = True, err_msg=None, check_same=None, @@ -724,11 +728,11 @@ def assert_arkouda_array_equal( index_values=None, ) -> None: """ - Check that 'ak.pdarray' or 'ak.Strings' or 'ak.Categorical' is equivalent. + Check that 'ak.pdarray' or 'ak.Strings', 'ak.Categorical', or 'ak.SegArray' is equivalent. Parameters ---------- - left, right : arkouda.pdarray or arkouda.Strings or arkouda.Categorical + left, right : arkouda.pdarray or arkouda.Strings or arkouda.Categorical or arkouda.SegArray The two arrays to be compared. check_dtype : bool, default True Check dtype if both a and b are ak.pdarray. @@ -1088,7 +1092,7 @@ def assert_equal(left, right, **kwargs) -> None: Parameters ---------- - left, right : Index, Series, DataFrame, or np.pdarray + left, right : Index, Series, DataFrame, or pdarray The two items to be compared. **kwargs All keyword arguments are passed through to the underlying assert method. 
@@ -1101,7 +1105,7 @@ def assert_equal(left, right, **kwargs) -> None: assert_series_equal(left, right, **kwargs) elif isinstance(left, DataFrame): assert_frame_equal(left, right, **kwargs) - elif isinstance(left, pdarray): + elif isinstance(left, (pdarray, Strings, Categorical, SegArray)): assert_arkouda_array_equal(left, right, **kwargs) elif isinstance(left, str): assert kwargs == {} diff --git a/arkouda/testing/_equivalence_asserters.py b/arkouda/testing/_equivalence_asserters.py new file mode 100644 index 0000000000..20a6bb3f56 --- /dev/null +++ b/arkouda/testing/_equivalence_asserters.py @@ -0,0 +1,487 @@ +from __future__ import annotations + +import numpy as np +import pandas as pd +from numpy import bool_, floating, integer, str_ + +from arkouda import ( + Categorical, + DataFrame, + Index, + MultiIndex, + SegArray, + Series, + Strings, + array, + pdarray, +) +from arkouda.testing import ( + assert_almost_equal, + assert_arkouda_array_equal, + assert_frame_equal, + assert_index_equal, + assert_series_equal, +) + +DEBUG = True + +__all__ = [ + "assert_almost_equivalent", + "assert_arkouda_array_equivalent", + "assert_equivalent", + "assert_frame_equivalent", + "assert_index_equivalent", + "assert_series_equivalent", +] + + +def _convert_to_arkouda(obj): + """ + Convert a numpy or pandas object to an arkouda object. 
+ """ + + if isinstance( + obj, + ( + DataFrame, + Series, + Index, + MultiIndex, + SegArray, + Categorical, + Strings, + pdarray, + str_, + integer, + floating, + bool_, + bool, + float, + ), + ): + return obj + + if not isinstance( + obj, (pd.MultiIndex, pd.Index, pd.Series, pd.DataFrame, pd.Categorical, np.ndarray) + ): + raise TypeError(f"obj must be an arkouda, numpy or pandas object, but was type: {type(obj)}") + + if isinstance(obj, pd.MultiIndex): + return MultiIndex(obj) + elif isinstance(obj, pd.Index): + return Index(obj) + elif isinstance(obj, pd.Series): + return Series(obj) + elif isinstance(obj, pd.DataFrame): + return DataFrame(obj) + elif isinstance(obj, pd.Categorical): + return Categorical(obj) + elif isinstance(obj, np.ndarray): + return array(obj) + return None + + +def assert_almost_equivalent( + left, + right, + rtol: float = 1.0e-5, + atol: float = 1.0e-8, +) -> None: + """ + Check that the left and right objects are approximately equal. + + By approximately equal, we refer to objects that are numbers or that + contain numbers which may be equivalent to specific levels of precision. + + If the objects are pandas or numpy objects, they are converted to arkouda objects. + Then assert_almost_equal is applied to the result. + + Parameters + ---------- + left : object + right : object + rtol : float, default 1e-5 + Relative tolerance. + atol : float, default 1e-8 + Absolute tolerance. + + Warning + ------- + This function cannot be used on pdarray of size > ak.client.maxTransferBytes + because it converts pdarrays to numpy arrays and calls np.allclose. 
+ + See Also + -------- + assert_almost_equal + """ + __tracebackhide__ = not DEBUG + + assert_almost_equal( + _convert_to_arkouda(left), + _convert_to_arkouda(right), + rtol=rtol, + atol=atol, + ) + + +def assert_index_equivalent( + left: Index | pd.Index, + right: Index | pd.Index, + exact: bool = True, + check_names: bool = True, + check_exact: bool = True, + check_categorical: bool = True, + check_order: bool = True, + rtol: float = 1.0e-5, + atol: float = 1.0e-8, + obj: str = "Index", +) -> None: + """ + Check that left and right Index are equal. + + If the objects are pandas.Index, they are converted to arkouda.Index. + Then assert_index_equal is applied to the result. + + Parameters + ---------- + left : Index or pandas.Index + right : Index or pandas.Index + exact : bool, default True + Whether to check the Index class, dtype and inferred_type + are identical. + check_names : bool, default True + Whether to check the names attribute. + check_exact : bool, default True + Whether to compare number exactly. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. + check_order : bool, default True + Whether to compare the order of index entries as well as their values. + If True, both indexes must contain the same elements, in the same order. + If False, both indexes must contain the same elements, but in any order. + rtol : float, default 1e-5 + Relative tolerance. Only used when check_exact is False. + atol : float, default 1e-8 + Absolute tolerance. Only used when check_exact is False. + obj : str, default 'Index' + Specify object name being compared, internally used to show appropriate + assertion message.
+ + See Also + -------- + assert_index_equal + + Examples + -------- + >>> from arkouda import testing as tm + >>> import pandas as pd + >>> a = ak.Index([1, 2, 3]) + >>> b = pd.Index([1, 2, 3]) + >>> tm.assert_index_equivalent(a, b) + """ + __tracebackhide__ = not DEBUG + + if not isinstance(left, (Index, pd.Index)) or not isinstance(right, (Index, pd.Index)): + raise TypeError( + f"left and right must be type arkouda.Index, or pandas.Index. " + f"Instead types were {type(left)} and {type(right)}" + ) + + assert_index_equal( + _convert_to_arkouda(left), + _convert_to_arkouda(right), + exact=exact, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + check_order=check_order, + rtol=rtol, + atol=atol, + obj=obj, + ) + + +def assert_arkouda_array_equivalent( + left: pdarray | Strings | Categorical | SegArray | np.ndarray | pd.Categorical, + right: pdarray | Strings | Categorical | SegArray | np.ndarray | pd.Categorical, + check_dtype: bool = True, + err_msg=None, + check_same=None, + obj: str = "pdarray", + index_values=None, +) -> None: + """ + Check that 'np.ndarray', 'pd.Categorical', 'ak.pdarray', 'ak.Strings', + 'ak.Categorical', or 'ak.SegArray' is equivalent. + + np.ndarray's and pd.Categorical's will be converted to the arkouda equivalent. + Then assert_arkouda_array_equal will be applied to the result. + + Parameters + ---------- + left, right : np.ndarray, pd.Categorical, arkouda.pdarray or arkouda.Strings or arkouda.Categorical + The two arrays to be compared. + check_dtype : bool, default True + Check dtype if both a and b are ak.pdarray or np.ndarray. + err_msg : str, default None + If provided, used as assertion message. + check_same : None|'copy'|'same', default None + Ensure left and right refer/do not refer to the same memory area. + obj : str, default 'pdarray' + Specify object name being compared, internally used to show appropriate + assertion message.
+ index_values : Index | arkouda.pdarray, default None + optional index (shared by both left and right), used in output. + + See Also + -------- + assert_arkouda_array_equal + """ + __tracebackhide__ = not DEBUG + + if not isinstance( + left, (np.ndarray, pd.Categorical, pdarray, Strings, Categorical, SegArray) + ) or not isinstance(right, (np.ndarray, pd.Categorical, pdarray, Strings, Categorical, SegArray)): + raise TypeError( + f"left and right must be type np.ndarray, pdarray, Strings, " + f"Categorical, or SegArray. " + f"Instead types were {type(left)} and {type(right)}" + ) + + assert_arkouda_array_equal( + _convert_to_arkouda(left), + _convert_to_arkouda(right), + check_dtype=check_dtype, + err_msg=err_msg, + check_same=check_same, + obj=obj, + index_values=index_values, + ) + + +def assert_series_equivalent( + left: Series | pd.Series, + right: Series | pd.Series, + check_dtype: bool = True, + check_index_type: bool = True, + check_series_type: bool = True, + check_names: bool = True, + check_exact: bool = False, + check_categorical: bool = True, + check_category_order: bool = True, + rtol: float = 1.0e-5, + atol: float = 1.0e-8, + obj: str = "Series", + *, + check_index: bool = True, + check_like: bool = False, +) -> None: + """ + Check that left and right Series are equal. + + pd.Series's will be converted to the arkouda equivalent. + Then assert_series_equal will be applied to the result. + + Parameters + ---------- + left : Series or pd.Series + right : Series or pd.Series + check_dtype : bool, default True + Whether to check the Series dtype is identical. + check_index_type : bool, default True + Whether to check the Index class, dtype and inferred_type + are identical. + check_series_type : bool, default True + Whether to check the Series class is identical. + check_names : bool, default True + Whether to check the Series and Index names attribute. + check_exact : bool, default False + Whether to compare number exactly. 
+ check_categorical : bool, default True + Whether to compare internal Categorical exactly. + check_category_order : bool, default True + Whether to compare category order of internal Categoricals. + rtol : float, default 1e-5 + Relative tolerance. Only used when check_exact is False. + atol : float, default 1e-8 + Absolute tolerance. Only used when check_exact is False. + obj : str, default 'Series' + Specify object name being compared, internally used to show appropriate + assertion message. + check_index : bool, default True + Whether to check index equivalence. If False, then compare only values. + check_like : bool, default False + If True, ignore the order of the index. Must be False if check_index is False. + Note: same labels must be with the same data. + + See Also + -------- + assert_series_equal + + Examples + -------- + >>> from arkouda import testing as tm + >>> import pandas as pd + >>> a = ak.Series([1, 2, 3, 4]) + >>> b = pd.Series([1, 2, 3, 4]) + >>> tm.assert_series_equivalent(a, b) + """ + __tracebackhide__ = not DEBUG + + if not isinstance(left, (Series, pd.Series)) or not isinstance(right, (Series, pd.Series)): + raise TypeError( + f"left and right must be type arkouda.Series or pandas.Series. " + f"Instead types were {type(left)} and {type(right)}." 
+ ) + + assert_series_equal( + _convert_to_arkouda(left), + _convert_to_arkouda(right), + check_dtype=check_dtype, + check_index_type=check_index_type, + check_series_type=check_series_type, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + check_category_order=check_category_order, + rtol=rtol, + atol=atol, + obj=obj, + check_index=check_index, + check_like=check_like, + ) + + +def assert_frame_equivalent( + left: DataFrame | pd.DataFrame, + right: DataFrame | pd.DataFrame, + check_dtype: bool = True, + check_index_type: bool = True, + check_column_type: bool = True, + check_frame_type: bool = True, + check_names: bool = True, + check_exact: bool = True, + check_categorical: bool = True, + check_like: bool = False, + rtol: float = 1.0e-5, + atol: float = 1.0e-8, + obj: str = "DataFrame", +) -> None: + """ + Check that left and right DataFrame are equal. + + This function is intended to compare two DataFrames and output any + differences. It is mostly intended for use in unit tests. + Additional parameters allow varying the strictness of the + equality checks performed. + + pd.DataFrame's will be converted to the arkouda equivalent. + Then assert_frame_equal will be applied to the result. + + Parameters + ---------- + left : DataFrame or pd.DataFrame + First DataFrame to compare. + right : DataFrame or pd.DataFrame + Second DataFrame to compare. + check_dtype : bool, default True + Whether to check the DataFrame dtype is identical. + check_index_type : bool, default True + Whether to check the Index class, dtype and inferred_type + are identical. + check_column_type : bool, default True + Whether to check the columns class, dtype and inferred_type + are identical. Is passed as the ``exact`` argument of + :func:`assert_index_equal`. + check_frame_type : bool, default True + Whether to check the DataFrame class is identical.
+ check_names : bool, default True + Whether to check that the `names` attribute for both the `index` + and `column` attributes of the DataFrame is identical. + check_exact : bool, default False + Whether to compare number exactly. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. + check_like : bool, default False + If True, ignore the order of index & columns. + Note: index labels must match their respective rows + (same as in columns) - same labels must be with the same data. + rtol : float, default 1e-5 + Relative tolerance. Only used when check_exact is False. + atol : float, default 1e-8 + Absolute tolerance. Only used when check_exact is False. + obj : str, default 'DataFrame' + Specify object name being compared, internally used to show appropriate + assertion message. + + See Also + -------- + assert_frame_equal + + Examples + -------- + This example shows comparing two DataFrames that are equal + but with columns of differing dtypes. + + >>> from arkouda.testing import assert_frame_equivalent + >>> import pandas as pd + >>> df1 = ak.DataFrame({'a': [1, 2], 'b': [3, 4]}) + >>> df2 = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]}) + >>> assert_frame_equivalent(df1, df1) + + """ + __tracebackhide__ = not DEBUG + + if not isinstance(left, (DataFrame, pd.DataFrame)) or not isinstance( + right, (DataFrame, pd.DataFrame) + ): + raise TypeError( + f"left and right must be type arkouda.DataFrame or pandas.DataFrame. " + f"Instead types were {type(left)} and {type(right)}." 
+ ) + + assert_frame_equal( + _convert_to_arkouda(left), + _convert_to_arkouda(right), + check_dtype=check_dtype, + check_index_type=check_index_type, + check_column_type=check_column_type, + check_frame_type=check_frame_type, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + check_like=check_like, + rtol=rtol, + atol=atol, + obj=obj, + ) + + +def assert_equivalent(left, right, **kwargs) -> None: + """ + Wrapper for tm.assert_*_equivalent to dispatch to the appropriate test function. + + Parameters + ---------- + left, right : Index, pd.Index, Series, pd.Series, DataFrame, pd.DataFrame, + Strings, Categorical, pd.Categorical, SegArray, pdarray, np.ndarray, + The two items to be compared. + **kwargs + All keyword arguments are passed through to the underlying assert method. + """ + __tracebackhide__ = not DEBUG + + if isinstance(left, (Index, pd.Index)): + assert_index_equivalent(left, right, **kwargs) + elif isinstance(left, (Series, pd.Series)): + assert_series_equivalent(left, right, **kwargs) + elif isinstance(left, (DataFrame, pd.DataFrame)): + assert_frame_equivalent(left, right, **kwargs) + elif isinstance(left, (pdarray, np.ndarray, Strings, Categorical, pd.Categorical, SegArray)): + assert_arkouda_array_equivalent(left, right, **kwargs) + elif isinstance(left, str): + assert kwargs == {} + assert left == right + else: + assert kwargs == {} + assert_almost_equivalent(left, right)