From ee5b196b69df03693d4799962056f35d5a34761d Mon Sep 17 00:00:00 2001 From: Ethan-DeBandi99 <16845933+Ethan-DeBandi99@users.noreply.github.com> Date: Tue, 11 Jul 2023 09:34:53 -0400 Subject: [PATCH 1/8] Existing tests converted. Still need to review for missing coverage and possible simplications. --- PROTO_tests/tests/dataframe_test.py | 716 ++++++++++++++++++++++++++++ 1 file changed, 716 insertions(+) create mode 100644 PROTO_tests/tests/dataframe_test.py diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py new file mode 100644 index 0000000000..87ea2b273c --- /dev/null +++ b/PROTO_tests/tests/dataframe_test.py @@ -0,0 +1,716 @@ +import arkouda as ak +import pandas as pd +import numpy as np +import pytest +import random +import string +import tempfile +import glob +import os +from arkouda import io_util + + +class TestDataFrame: + df_test_base_tmp = "{}/df_test".format(os.getcwd()) + io_util.get_directory(df_test_base_tmp) + + @staticmethod + def build_ak_df(): + username = ak.array(["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"]) + userid = ak.array([111, 222, 111, 333, 222, 111]) + item = ak.array([0, 0, 1, 1, 2, 0]) + day = ak.array([5, 5, 6, 5, 6, 6]) + amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6]) + bi = ak.arange(2 ** 200, 2 ** 200 + 6) + return ak.DataFrame( + {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi} + ) + + @staticmethod + def build_pd_df(): + username = ["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"] + userid = [111, 222, 111, 333, 222, 111] + item = [0, 0, 1, 1, 2, 0] + day = [5, 5, 6, 5, 6, 6] + amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6] + bi = [2 ** 200, 2 ** 200 + 1, 2 ** 200 + 2, 2 ** 200 + 3, 2 ** 200 + 4, 2 ** 200 + 5] + return pd.DataFrame( + {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi} + ) + + @staticmethod + def build_ak_df_duplicates(): + username = ak.array(["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"]) + userid = ak.array([111, 222, 111, 333, 222, 111]) + item = ak.array([0, 1, 0, 2, 1, 0]) + day = ak.array([5, 5, 5, 5, 5, 5]) + return ak.DataFrame({"userName": username, "userID": userid, "item": item, "day": day}) + + @staticmethod + def build_pd_df_duplicates(): + username = ["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"] + userid = [111, 222, 111, 333, 222, 111] + item = [0, 1, 0, 2, 1, 0] + day = [5, 5, 5, 5, 5, 5] + return pd.DataFrame({"userName": username, "userID": userid, "item": item, "day": day}) + + @staticmethod + def build_ak_append(): + username = ak.array(["John", "Carol"]) + userid = ak.array([444, 333]) + item = ak.array([0, 2]) + day = ak.array([1, 2]) + amount = ak.array([0.5, 5.1]) + bi = ak.array([2 ** 200 + 6, 2 ** 200 + 7]) + return ak.DataFrame( + {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi} + ) + + @staticmethod + def build_pd_df_append(): + username = ["Alice", "Bob", "Alice", "Carol", "Bob", "Alice", "John", "Carol"] + userid = [111, 222, 111, 333, 222, 111, 444, 333] + item = [0, 0, 1, 1, 2, 0, 0, 2] + day = [5, 5, 6, 5, 6, 6, 1, 2] + amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6, 0.5, 5.1] + bi = [ + 2 ** 200, + 2 ** 200 + 1, + 2 ** 200 + 2, + 2 ** 200 + 3, + 2 ** 200 + 4, + 2 ** 200 + 5, + 2 ** 200 + 6, + 2 ** 200 + 7, + ] + return pd.DataFrame( + {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi} + ) + + @staticmethod + def build_ak_keyerror(): + userid = ak.array([444, 333]) + item = ak.array([0, 2]) + return ak.DataFrame({"user_id": userid, "item": item}) + + @staticmethod + def build_ak_typeerror(): + username = ak.array([111, 222, 111, 333, 222, 111]) + userid = ak.array(["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"]) + item = ak.array([0, 0, 1, 1, 2, 0]) + day = ak.array([5, 5, 6, 5, 6, 6]) + amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6]) + bi = ak.arange(2 ** 200, 2 ** 200 + 6) + return ak.DataFrame( + {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi} + ) + + def test_dataframe_creation(self): + # Validate empty DataFrame + df = ak.DataFrame() + assert isinstance(df, ak.DataFrame) + assert df.empty + + df = self.build_ak_df() + ref_df = self.build_pd_df() + assert isinstance(df, ak.DataFrame) + assert len(df) == 6 + assert ref_df.equals(df.to_pandas()) + + def test_client_type_creation(self): + f = ak.Fields(ak.arange(10), ["A", "B", "c"]) + ip = ak.ip_address(ak.arange(10)) + d = ak.Datetime(ak.arange(10)) + bv = ak.BitVector(ak.arange(10), width=4) + + df_dict = {"fields": f, "ip": ip, "date": d, "bitvector": bv} + df = ak.DataFrame(df_dict) + pd_d = [pd.to_datetime(x, unit="ns") for x in d.to_list()] + pddf = pd.DataFrame( + {"fields": f.to_list(), "ip": ip.to_list(), "date": pd_d, "bitvector": bv.to_list()} + ) + shape = f"({df._shape_str()})".replace("(", "[").replace(")", "]") + pd.set_option("display.max_rows", 4) + s = df.__repr__().replace(f" ({df._shape_str()})", f"\n\n{shape}") + assert s == pddf.__repr__() + + pd.set_option("display.max_rows", 10) + pdf = pd.DataFrame({"a": list(range(1000)), "b": list(range(1000))}) + pdf["a"] = pdf["a"].apply(lambda x: "AA" + str(x)) + pdf["b"] = pdf["b"].apply(lambda x: "BB" + str(x)) + df = ak.DataFrame(pdf) + shape = f"({df._shape_str()})".replace("(", "[").replace(")", "]") + s = df.__repr__().replace(f" ({df._shape_str()})", f"\n\n{shape}") + assert s, pdf.__repr__() + + def test_boolean_indexing(self): + df = self.build_ak_df() + ref_df = self.build_pd_df() + row = df[df["userName"] == "Carol"] + + assert len(row) == 1 + assert ref_df[ref_df["userName"] == "Carol"].equals(row.to_pandas(retain_index=True)) + + def test_column_indexing(self): + df = self.build_ak_df() + assert isinstance(df.userName, ak.Series) + assert isinstance(df.userID, ak.Series) + assert isinstance(df.item, ak.Series) + assert isinstance(df.day, ak.Series) + assert isinstance(df.amount, ak.Series) + assert isinstance(df.bi, ak.Series) + for col in ("userName", "userID", "item", "day", "amount", "bi"): + assert isinstance(df[col], (ak.pdarray, ak.Strings, ak.Categorical)) + assert isinstance(df[["userName", "amount", "bi"]], ak.DataFrame) + assert isinstance(df[("userID", "item", "day", "bi")], ak.DataFrame) + assert isinstance(df.index, ak.Index) + + def test_dtype_prop(self): + str_arr = ak.array( + ["".join(random.choices(string.ascii_letters + string.digits, k=5)) for _ in range(3)] + ) + df_dict = { + "i": ak.arange(3), + "c_1": ak.arange(3, 6, 1), + "c_2": ak.arange(6, 9, 1), + "c_3": str_arr, + "c_4": ak.Categorical(str_arr), + "c_5": ak.SegArray(ak.array([0, 9, 14]), ak.arange(20)), + "c_6": ak.arange(2**200, 2**200 + 3), + } + akdf = ak.DataFrame(df_dict) + assert len(akdf.columns) == len(akdf.dtypes) + + def test_from_pandas(self): + username = ["Alice", "Bob", "Alice", "Carol", "Bob", "Alice", "John", "Carol"] + userid = [111, 222, 111, 333, 222, 111, 444, 333] + item = [0, 0, 1, 1, 2, 0, 0, 2] + day = [5, 5, 6, 5, 6, 6, 1, 2] + amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6, 0.5, 5.1] + bi = 2**200 + bi_arr = [bi, bi + 1, bi + 2, bi + 3, bi + 4, bi + 5, bi + 6, bi + 7] + ref_df = pd.DataFrame( + { + "userName": username, + "userID": userid, + "item": item, + "day": day, + "amount": amount, + "bi": bi_arr, + } + ) + + df = ak.DataFrame(ref_df) + + assert ((ref_df == df.to_pandas()).all()).all() + + df = ak.DataFrame.from_pandas(ref_df) + assert ((ref_df == df.to_pandas()).all()).all() + + def test_drop(self): + # create an arkouda df. + df = self.build_ak_df() + # create pandas df to validate functionality against + pd_df = self.build_pd_df() + + # test out of place drop + df_drop = df.drop([0, 1, 2]) + pddf_drop = pd_df.drop(labels=[0, 1, 2]) + pddf_drop.reset_index(drop=True, inplace=True) + assert pddf_drop.equals(df_drop.to_pandas()) + + df_drop = df.drop("userName", axis=1) + pddf_drop = pd_df.drop(labels=["userName"], axis=1) + assert pddf_drop.equals(df_drop.to_pandas()) + + # Test dropping columns + df.drop("userName", axis=1, inplace=True) + pd_df.drop(labels=["userName"], axis=1, inplace=True) + + assert ((df.to_pandas() == pd_df).all()).all() + + # Test dropping rows + df.drop([0, 2, 5], inplace=True) + # pandas retains original indexes when dropping rows, need to reset to line up with arkouda + pd_df.drop(labels=[0, 2, 5], inplace=True) + pd_df.reset_index(drop=True, inplace=True) + + assert pd_df.equals(df.to_pandas()) + + # verify that index keys must be ints + with pytest.raises(TypeError): + df.drop("index") + + # verify axis can only be 0 or 1 + with pytest.raises(ValueError): + df.drop("amount", 15) + + def test_drop_duplicates(self): + df = self.build_ak_df_duplicates() + ref_df = self.build_pd_df_duplicates() + + dedup = df.drop_duplicates() + dedup_pd = ref_df.drop_duplicates() + # pandas retains original indexes when dropping dups, need to reset to line up with arkouda + dedup_pd.reset_index(drop=True, inplace=True) + + dedup_test = dedup.to_pandas().sort_values("userName").reset_index(drop=True) + dedup_pd_test = dedup_pd.sort_values("userName").reset_index(drop=True) + + assert dedup_test.equals(dedup_pd_test) + + def test_shape(self): + df = self.build_ak_df() + + row, col = df.shape + assert row == 6 + assert col == 6 + + def test_reset_index(self): + df = self.build_ak_df() + + slice_df = df[ak.array([1, 3, 5])] + assert slice_df.index.to_list() == [1, 3, 5] + + df_reset = slice_df.reset_index() + assert df_reset.index.to_list() == [0, 1, 2] + assert slice_df.index.to_list(), [1, 3, 5] + + slice_df.reset_index(inplace=True) + assert slice_df.index.to_list(), [0, 1, 2] + + def test_rename(self): + df = self.build_ak_df() + + rename = {"userName": "name_col", "userID": "user_id"} + + # Test out of Place - column + df_rename = df.rename(rename, axis=1) + assert "user_id" in df_rename.columns + assert "name_col" in df_rename.columns + assert "userName" not in df_rename.columns + assert "userID" not in df_rename.columns + assert "userID" in df.columns + assert "userName" in df.columns + assert "user_id" not in df.columns + assert "name_col" not in df.columns + + # Test in place - column + df.rename(column=rename, inplace=True) + assert "user_id" in df.columns + assert "name_col" in df.columns + assert "userName" not in df.columns + assert "userID" not in df.columns + + # prep for index renaming + rename_idx = {1: 17, 2: 93} + conf = list(range(6)) + conf[1] = 17 + conf[2] = 93 + + # Test out of Place - index + df_rename = df.rename(rename_idx) + assert df_rename.index.values.to_list() == conf + assert df.index.values.to_list() == list(range(6)) + + # Test in place - index + df.rename(index=rename_idx, inplace=True) + assert df.index.values.to_list() == conf + + def test_append(self): + df = self.build_ak_df() + df_toappend = self.build_ak_append() + + df.append(df_toappend) + + ref_df = self.build_pd_df_append() + + # dataframe equality returns series with bool result for each row. + assert ref_df.equals(df.to_pandas()) + + idx = np.arange(8) + assert idx.tolist() == df.index.index.to_list() + + df_keyerror = self.build_ak_keyerror() + with pytest.raises(KeyError): + df.append(df_keyerror) + + df_typeerror = self.build_ak_typeerror() + with pytest.raises(TypeError): + df.append(df_typeerror) + + def test_concat(self): + df = self.build_ak_df() + df_toappend = self.build_ak_append() + + glued = ak.DataFrame.concat([df, df_toappend]) + + ref_df = self.build_pd_df_append() + + # dataframe equality returns series with bool result for each row. + assert ref_df.equals(glued.to_pandas()) + + df_keyerror = self.build_ak_keyerror() + with pytest.raises(KeyError): + ak.DataFrame.concat([df, df_keyerror]) + + df_typeerror = self.build_ak_typeerror() + with pytest.raises(TypeError): + ak.DataFrame.concat([df, df_typeerror]) + + def test_head(self): + df = self.build_ak_df() + ref_df = self.build_pd_df() + + hdf = df.head(3) + hdf_ref = ref_df.head(3).reset_index(drop=True) + assert hdf_ref.equals(hdf.to_pandas()) + + def test_tail(self): + df = self.build_ak_df() + ref_df = self.build_pd_df() + + hdf = df.tail(2) + hdf_ref = ref_df.tail(2).reset_index(drop=True) + assert hdf_ref.equals(hdf.to_pandas()) + + def test_groupby_standard(self): + df = self.build_ak_df() + gb = df.GroupBy("userName") + keys, count = gb.count() + assert keys.to_list() == ["Bob", "Alice", "Carol"] + assert count.to_list() == [2, 3, 1] + assert gb.permutation.to_list() == [1, 4, 0, 2, 5, 3] + + gb = df.GroupBy(["userName", "userID"]) + keys, count = gb.count() + assert len(keys) == 2 + assert keys[0].to_list() == ["Carol", "Bob", "Alice"] + assert keys[1].to_list() == [333, 222, 111] + assert count.to_list() == [1, 2, 3] + + # testing counts with IPv4 column + s = ak.DataFrame({"a": ak.IPv4(ak.arange(1, 5))}).groupby("a").count() + pds = pd.Series( + data=np.ones(4, dtype=np.int64), + index=pd.Index(data=np.array(["0.0.0.1", "0.0.0.2", "0.0.0.3", "0.0.0.4"], dtype=" 2**63 + # to avoid loss of precision see (#1983) + df = pd.DataFrame({"Test": [2**64 - 1, 0]}) + assert df["Test"].dtype == ak.uint64 + + def test_head_tail_resetting_index(self): + # Test that issue #2183 is resolved + df = ak.DataFrame({"cnt": ak.arange(65)}) + # Note we have to call __repr__ to trigger head_tail_server call + + bool_idx = df[df["cnt"] > 3] + bool_idx.__repr__() + assert bool_idx.index.index.to_list() == list(range(4, 65)) + + slice_idx = df[:] + slice_idx.__repr__() + assert slice_idx.index.index.to_list() == list(range(65)) + + # verify it persists non-int Index + idx = ak.concatenate([ak.zeros(5, bool), ak.ones(60, bool)]) + df = ak.DataFrame({"cnt": ak.arange(65)}, index=idx) + + bool_idx = df[df["cnt"] > 3] + bool_idx.__repr__() + # the new index is first False and rest True (because we lose first 4), so equivalent to arange(61, bool) + assert bool_idx.index.index.to_list() == ak.arange(61, dtype=bool).to_list() + + slice_idx = df[:] + slice_idx.__repr__() + assert slice_idx.index.index.to_list() == idx.to_list() + + def test_ipv4_columns(self): + # test with single IPv4 column + df = ak.DataFrame({ + 'a': ak.arange(10), + 'b': ak.IPv4(ak.arange(10)) + }) + with tempfile.TemporaryDirectory(dir=TestDataFrame.df_test_base_tmp) as tmp_dirname: + fname = tmp_dirname + "/ipv4_df" + df.to_parquet(fname) + + data = ak.read(fname+"*") + rddf = ak.DataFrame({ + 'a': data['a'], + 'b': ak.IPv4(data['b']) + }) + + assert df['a'].to_list() == rddf['a'].to_list() + assert df['b'].to_list() == rddf['b'].to_list() + + # test with multiple + df = ak.DataFrame({ + 'a': ak.IPv4(ak.arange(10)), + 'b': ak.IPv4(ak.arange(10)) + }) + with tempfile.TemporaryDirectory(dir=TestDataFrame.df_test_base_tmp) as tmp_dirname: + fname = tmp_dirname + "/ipv4_df" + df.to_parquet(fname) + + data = ak.read(fname + "*") + rddf = ak.DataFrame({ + 'a': ak.IPv4(data['a']), + 'b': ak.IPv4(data['b']) + }) + + assert df['a'].to_list() == rddf['a'].to_list() + assert df['b'].to_list() == rddf['b'].to_list() + + # test replacement of IPv4 with uint representation + df = ak.DataFrame({ + 'a': ak.IPv4(ak.arange(10)) + }) + df['a'] = df['a'].export_uint() + assert ak.arange(10).to_list() == df['a'].to_list() + + def test_subset(self): + df = ak.DataFrame({ + 'a': ak.arange(100), + 'b': ak.randint(0, 20, 100), + 'c': ak.random_strings_uniform(0, 16, 100), + 'd': ak.randint(25, 75, 100) + }) + df2 = df[['a', 'b']] + assert ['a', 'b'] == df2.columns + assert df.index.to_list() == df2.index.to_list() + assert df['a'].to_list() == df2['a'].to_list() + assert df['b'].to_list() == df2['b'].to_list() From 576f96d9f9369df317860124bcd8902cd3565b28 Mon Sep 17 00:00:00 2001 From: Ethan-DeBandi99 <16845933+Ethan-DeBandi99@users.noreply.github.com> Date: Thu, 13 Jul 2023 08:10:43 -0400 Subject: [PATCH 2/8] Updating creation testing coverage. --- PROTO_tests/tests/dataframe_test.py | 111 ++++++++++++---------------- arkouda/dataframe.py | 10 +-- pytest_PROTO.ini | 2 +- 3 files changed, 52 insertions(+), 71 deletions(-) diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py index 87ea2b273c..3412160d33 100644 --- a/PROTO_tests/tests/dataframe_test.py +++ b/PROTO_tests/tests/dataframe_test.py @@ -1,5 +1,6 @@ import arkouda as ak import pandas as pd +from pandas.testing import assert_frame_equal import numpy as np import pytest import random @@ -105,17 +106,57 @@ def build_ak_typeerror(): {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi} ) - def test_dataframe_creation(self): + @pytest.mark.parametrize("size", pytest.prob_size) + def test_dataframe_creation(self, size): # Validate empty DataFrame df = ak.DataFrame() assert isinstance(df, ak.DataFrame) assert df.empty - df = self.build_ak_df() - ref_df = self.build_pd_df() - assert isinstance(df, ak.DataFrame) - assert len(df) == 6 - assert ref_df.equals(df.to_pandas()) + # Validation of Creation from Pandas + pddf = pd.DataFrame({ + "int": np.arange(size), + "uint": np.random.randint(0, size/2, size, dtype=np.uint64), + "bigint": np.arange(2**200, 2**200+size), + "bool": np.random.randint(0, 1, size=size, dtype=bool), + "segarray": [np.random.randint(0, size / 2, 2) for i in range(size)] + }) + akdf = ak.DataFrame(pddf) + assert isinstance(akdf, ak.DataFrame) + assert len(akdf) == size + assert_frame_equal(pddf, akdf.to_pandas()) + + # validation of creation from dictionary + akdf = ak.DataFrame({ + "int": ak.arange(size), + "uint": ak.array(pddf["uint"]), + "bigint": ak.arange(2 ** 200, 2 ** 200 + size), + "bool": ak.array(pddf["bool"]), + "segarray": ak.SegArray.from_multi_array([ak.array(x) for x in pddf["segarray"]]) + }) + assert isinstance(akdf, ak.DataFrame) + assert len(akdf) == size + + assert_frame_equal(pddf, akdf.to_pandas()) + + # validation of creation from list + x = [ + np.arange(size), + np.random.randint(0, 5, size), + np.random.randint(5, 10, size), + ] + pddf = pd.DataFrame(x) + l = [ak.array(val) for val in list(zip(x[0], x[1], x[2]))] + akdf = ak.DataFrame(l) + assert isinstance(akdf, ak.DataFrame) + assert len(akdf) == len(pddf) + # arkouda does not allow for numeric columns. + assert akdf.columns == [str(x) for x in pddf.columns.values] + # use the columns from the pandas created for equivalence check + # these should be equivalent + ak_to_pd = akdf.to_pandas() + ak_to_pd.columns = pddf.columns + assert_frame_equal(pddf, ak_to_pd) def test_client_type_creation(self): f = ak.Fields(ak.arange(10), ["A", "B", "c"]) @@ -537,64 +578,6 @@ def test_copy(self): df_copy.__setitem__("userID", ak.array([1, 2, 1, 3, 2, 1])) assert df.__repr__() == df_copy.__repr__() - # TODO - This should be covered in HDF5 and Parquet testing - def test_save(self): - i = list(range(3)) - c1 = [9, 7, 17] - c2 = [2, 4, 6] - df_dict = {"i": ak.array(i), "c_1": ak.array(c1), "c_2": ak.array(c2)} - - akdf = ak.DataFrame(df_dict) - - validation_df = pd.DataFrame( - { - "i": i, - "c_1": c1, - "c_2": c2, - } - ) - with tempfile.TemporaryDirectory(dir=self.df_test_base_tmp) as tmp_dirname: - akdf.to_parquet(f"{tmp_dirname}/testName") - - ak_loaded = ak.DataFrame.load(f"{tmp_dirname}/testName") - assert validation_df.equals(ak_loaded[akdf.columns].to_pandas()) - - # test save with index true - akdf.to_parquet(f"{tmp_dirname}/testName_with_index.pq", index=True) - assert ( - len(glob.glob(f"{tmp_dirname}/testName_with_index*.pq")) == ak.get_config()["numLocales"] - ) - - # Test for df having seg array col - df = ak.DataFrame({"a": ak.arange(10), "b": ak.SegArray(ak.arange(10), ak.arange(10))}) - df.to_hdf(f"{tmp_dirname}/seg_test.h5") - assert ( - len(glob.glob(f"{tmp_dirname}/seg_test*.h5")) == ak.get_config()["numLocales"] - ) - ak_loaded = ak.DataFrame.load(f"{tmp_dirname}/seg_test.h5") - assert df.to_pandas().equals(ak_loaded.to_pandas()) - - # test with segarray with _ in column name - df_dict = { - "c_1": ak.arange(3, 6), - "c_2": ak.arange(6, 9), - "c_3": ak.SegArray(ak.array([0, 9, 14]), ak.arange(20)), - } - akdf = ak.DataFrame(df_dict) - akdf.to_hdf(f"{tmp_dirname}/seg_test.h5") - assert ( - len(glob.glob(f"{tmp_dirname}/seg_test*.h5")) == ak.get_config()["numLocales"] - ) - ak_loaded = ak.DataFrame.load(f"{tmp_dirname}/seg_test.h5") - assert akdf.to_pandas().equals(ak_loaded.to_pandas()) - - # test load_all and read workflows - ak_load_all = ak.DataFrame(ak.load_all(f"{tmp_dirname}/seg_test.h5")) - assert akdf.to_pandas().equals(ak_load_all.to_pandas()) - - ak_read = ak.DataFrame(ak.read(f"{tmp_dirname}/seg_test*")) - assert akdf.to_pandas().equals(ak_read.to_pandas()) - def test_isin(self): df = ak.DataFrame({"col_A": ak.array([7, 3]), "col_B": ak.array([1, 9])}) diff --git a/arkouda/dataframe.py b/arkouda/dataframe.py index e6f4a2a82b..4145f4e6fa 100644 --- a/arkouda/dataframe.py +++ b/arkouda/dataframe.py @@ -275,13 +275,11 @@ def __init__(self, initialdata=None, index=None): else: self._set_index(index) self.data = {} - # convert the lists defining each column into a pdarray - # pd.DataFrame.values is stored as rows, we need lists to be columns - for key, val in initialdata.to_dict("list").items(): + for key in initialdata.columns: self.data[key] = ( - SegArray.from_multi_array([array(r) for r in val]) - if isinstance(val[0], list) - else array(val) + SegArray.from_multi_array([array(r) for r in initialdata[key]]) + if isinstance(initialdata[key][0], (list, np.ndarray)) + else array(initialdata[key]) ) self.data.update() diff --git a/pytest_PROTO.ini b/pytest_PROTO.ini index afdb98cf87..68cbf3e01e 100644 --- a/pytest_PROTO.ini +++ b/pytest_PROTO.ini @@ -2,7 +2,7 @@ addopts = --benchmark-disable --benchmark-skip - --size=100 + --size=5 filterwarnings = ignore:Version mismatch between client .* testpaths = From b80986aaddbc844a715c285980793d3aea484b6d Mon Sep 17 00:00:00 2001 From: Ethan-DeBandi99 <16845933+Ethan-DeBandi99@users.noreply.github.com> Date: Mon, 17 Jul 2023 09:45:22 -0400 Subject: [PATCH 3/8] DataFrame testing new framework final. Adds to_list to ak.Series and updates pdarray creation to detect numpy.object_ with str elements. --- PROTO_tests/tests/dataframe_test.py | 148 +++++++++++----------------- arkouda/pdarraycreation.py | 6 +- arkouda/series.py | 5 + pytest_PROTO.ini | 2 +- 4 files changed, 71 insertions(+), 90 deletions(-) diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py index 3412160d33..6a0b50f133 100644 --- a/PROTO_tests/tests/dataframe_test.py +++ b/PROTO_tests/tests/dataframe_test.py @@ -1,6 +1,6 @@ import arkouda as ak import pandas as pd -from pandas.testing import assert_frame_equal +from pandas.testing import assert_frame_equal, assert_series_equal import numpy as np import pytest import random @@ -170,19 +170,13 @@ def test_client_type_creation(self): pddf = pd.DataFrame( {"fields": f.to_list(), "ip": ip.to_list(), "date": pd_d, "bitvector": bv.to_list()} ) - shape = f"({df._shape_str()})".replace("(", "[").replace(")", "]") - pd.set_option("display.max_rows", 4) - s = df.__repr__().replace(f" ({df._shape_str()})", f"\n\n{shape}") - assert s == pddf.__repr__() - - pd.set_option("display.max_rows", 10) - pdf = pd.DataFrame({"a": list(range(1000)), "b": list(range(1000))}) - pdf["a"] = pdf["a"].apply(lambda x: "AA" + str(x)) - pdf["b"] = pdf["b"].apply(lambda x: "BB" + str(x)) - df = ak.DataFrame(pdf) - shape = f"({df._shape_str()})".replace("(", "[").replace(")", "]") - s = df.__repr__().replace(f" ({df._shape_str()})", f"\n\n{shape}") - assert s, pdf.__repr__() + assert_frame_equal(pddf, df.to_pandas()) + pddf = pd.DataFrame({"a": list(range(1000)), "b": list(range(1000))}) + pddf["a"] = pddf["a"].apply(lambda x: "AA" + str(x)) + pddf["b"] = pddf["b"].apply(lambda x: "BB" + str(x)) + + df = ak.DataFrame(pddf) + assert_frame_equal(pddf, df.to_pandas()) def test_boolean_indexing(self): df = self.build_ak_df() @@ -194,22 +188,32 @@ def test_boolean_indexing(self): def test_column_indexing(self): df = self.build_ak_df() - assert isinstance(df.userName, ak.Series) - assert isinstance(df.userID, ak.Series) - assert isinstance(df.item, ak.Series) - assert isinstance(df.day, ak.Series) - assert isinstance(df.amount, ak.Series) - assert isinstance(df.bi, ak.Series) - for col in ("userName", "userID", "item", "day", "amount", "bi"): - assert isinstance(df[col], (ak.pdarray, ak.Strings, ak.Categorical)) - assert isinstance(df[["userName", "amount", "bi"]], ak.DataFrame) - assert isinstance(df[("userID", "item", "day", "bi")], ak.DataFrame) + ref_df = self.build_pd_df() + + # index validation assert isinstance(df.index, ak.Index) + assert df.index.to_list() == ref_df.index.to_list() + + # column validation [] and . access + for cname, col, ref_col in zip(df.columns, [df.userName, df.userID, df.item, df.day, df.amount, df.bi], [ref_df.userName, ref_df.userID, ref_df.item, ref_df.day, ref_df.amount, ref_df.bi]): + assert isinstance(col, ak.Series) + assert col.to_list() == ref_col.to_list() + assert isinstance(df[cname], (ak.pdarray, ak.Strings, ak.Categorical)) + assert df[cname].to_list() == ref_df[cname].to_list() + + # check mult-column list + col_list = ["userName", "amount", "bi"] + assert isinstance(df[col_list], ak.DataFrame) + assert_frame_equal(df[col_list].to_pandas(), ref_df[col_list]) + + # check multi-column tuple + col_tup = ("userID", "item", "day", "bi") + assert isinstance(df[col_tup], ak.DataFrame) + # pandas only supports lists of columns, not tuples + assert_frame_equal(df[col_tup].to_pandas(), ref_df[list(col_tup)]) def test_dtype_prop(self): - str_arr = ak.array( - ["".join(random.choices(string.ascii_letters + string.digits, k=5)) for _ in range(3)] - ) + str_arr = ak.random_strings_uniform(1, 5, 3) df_dict = { "i": ak.arange(3), "c_1": ak.arange(3, 6, 1), @@ -221,32 +225,15 @@ def test_dtype_prop(self): } akdf = ak.DataFrame(df_dict) assert len(akdf.columns) == len(akdf.dtypes) + # dtypes returns objType for categorical, segarray. We should probably fix + # this and add a df.objTypes property. pdarrays return actual dtype + for ref_type, c in zip(["int64", "int64", "int64", "str", "Categorical", "SegArray", "bigint"], akdf.columns): + assert ref_type == str(akdf.dtypes[c]) def test_from_pandas(self): - username = ["Alice", "Bob", "Alice", "Carol", "Bob", "Alice", "John", "Carol"] - userid = [111, 222, 111, 333, 222, 111, 444, 333] - item = [0, 0, 1, 1, 2, 0, 0, 2] - day = [5, 5, 6, 5, 6, 6, 1, 2] - amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6, 0.5, 5.1] - bi = 2**200 - bi_arr = [bi, bi + 1, bi + 2, bi + 3, bi + 4, bi + 5, bi + 6, bi + 7] - ref_df = pd.DataFrame( - { - "userName": username, - "userID": userid, - "item": item, - "day": day, - "amount": amount, - "bi": bi_arr, - } - ) - + ref_df = self.build_pd_df() df = ak.DataFrame(ref_df) - - assert ((ref_df == df.to_pandas()).all()).all() - - df = ak.DataFrame.from_pandas(ref_df) - assert ((ref_df == df.to_pandas()).all()).all() + assert_frame_equal(ref_df, df.to_pandas()) def test_drop(self): # create an arkouda df. @@ -258,17 +245,17 @@ def test_drop(self): df_drop = df.drop([0, 1, 2]) pddf_drop = pd_df.drop(labels=[0, 1, 2]) pddf_drop.reset_index(drop=True, inplace=True) - assert pddf_drop.equals(df_drop.to_pandas()) + assert_frame_equal(pddf_drop, df_drop.to_pandas()) df_drop = df.drop("userName", axis=1) pddf_drop = pd_df.drop(labels=["userName"], axis=1) - assert pddf_drop.equals(df_drop.to_pandas()) + assert_frame_equal(pddf_drop, df_drop.to_pandas()) # Test dropping columns df.drop("userName", axis=1, inplace=True) pd_df.drop(labels=["userName"], axis=1, inplace=True) - assert ((df.to_pandas() == pd_df).all()).all() + assert_frame_equal(pddf_drop, df_drop.to_pandas()) # Test dropping rows df.drop([0, 2, 5], inplace=True) @@ -276,7 +263,7 @@ def test_drop(self): pd_df.drop(labels=[0, 2, 5], inplace=True) pd_df.reset_index(drop=True, inplace=True) - assert pd_df.equals(df.to_pandas()) + assert_frame_equal(pddf_drop, df_drop.to_pandas()) # verify that index keys must be ints with pytest.raises(TypeError): @@ -298,7 +285,7 @@ def test_drop_duplicates(self): dedup_test = dedup.to_pandas().sort_values("userName").reset_index(drop=True) dedup_pd_test = dedup_pd.sort_values("userName").reset_index(drop=True) - assert dedup_test.equals(dedup_pd_test) + assert_frame_equal(dedup_pd_test, dedup_test) def test_shape(self): df = self.build_ak_df() @@ -367,7 +354,7 @@ def test_append(self): ref_df = self.build_pd_df_append() # dataframe equality returns series with bool result for each row. - assert ref_df.equals(df.to_pandas()) + assert_frame_equal(ref_df, df.to_pandas()) idx = np.arange(8) assert idx.tolist() == df.index.index.to_list() @@ -389,7 +376,7 @@ def test_concat(self): ref_df = self.build_pd_df_append() # dataframe equality returns series with bool result for each row. - assert ref_df.equals(glued.to_pandas()) + assert_frame_equal(ref_df, glued.to_pandas()) df_keyerror = self.build_ak_keyerror() with pytest.raises(KeyError): @@ -405,15 +392,15 @@ def test_head(self): hdf = df.head(3) hdf_ref = ref_df.head(3).reset_index(drop=True) - assert hdf_ref.equals(hdf.to_pandas()) + assert_frame_equal(hdf_ref, hdf.to_pandas()) def test_tail(self): df = self.build_ak_df() ref_df = self.build_pd_df() - hdf = df.tail(2) - hdf_ref = ref_df.tail(2).reset_index(drop=True) - assert hdf_ref.equals(hdf.to_pandas()) + tdf = df.tail(2) + tdf_ref = ref_df.tail(2).reset_index(drop=True) + assert_frame_equal(tdf_ref, tdf.to_pandas()) def test_groupby_standard(self): df = self.build_ak_df() @@ -436,12 +423,12 @@ def test_groupby_standard(self): data=np.ones(4, dtype=np.int64), index=pd.Index(data=np.array(["0.0.0.1", "0.0.0.2", "0.0.0.3", "0.0.0.4"], dtype=" pd.Series: val = convert_if_categorical(self.values) return pd.Series(val.to_ndarray(), index=idx) + @typechecked() + def to_list(self) -> list: + p = self.to_pandas() + return p.to_list() + @typechecked def value_counts(self, sort: bool = True) -> Series: """Return a Series containing counts of unique values. diff --git a/pytest_PROTO.ini b/pytest_PROTO.ini index 68cbf3e01e..afdb98cf87 100644 --- a/pytest_PROTO.ini +++ b/pytest_PROTO.ini @@ -2,7 +2,7 @@ addopts = --benchmark-disable --benchmark-skip - --size=5 + --size=100 filterwarnings = ignore:Version mismatch between client .* testpaths = From 41688695a6de81557d2a56fb358e979f30b6ce44 Mon Sep 17 00:00:00 2001 From: Ethan-DeBandi99 <16845933+Ethan-DeBandi99@users.noreply.github.com> Date: Mon, 17 Jul 2023 10:08:37 -0400 Subject: [PATCH 4/8] Correcting formatting errors. --- PROTO_tests/tests/dataframe_test.py | 185 ++++++++++++++++------------ arkouda/pdarraycreation.py | 1 - 2 files changed, 109 insertions(+), 77 deletions(-) diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py index 6a0b50f133..0b4636ff4c 100644 --- a/PROTO_tests/tests/dataframe_test.py +++ b/PROTO_tests/tests/dataframe_test.py @@ -1,13 +1,12 @@ -import arkouda as ak -import pandas as pd -from pandas.testing import assert_frame_equal, assert_series_equal +import os +import tempfile + import numpy as np +import pandas as pd import pytest -import random -import string -import tempfile -import glob -import os +from pandas.testing import assert_frame_equal, assert_series_equal + +import arkouda as ak from arkouda import io_util @@ -22,9 +21,16 @@ def build_ak_df(): item = ak.array([0, 0, 1, 1, 2, 0]) day = ak.array([5, 5, 6, 5, 6, 6]) amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6]) - bi = ak.arange(2 ** 200, 2 ** 200 + 6) + bi = ak.arange(2**200, 2**200 + 6) return ak.DataFrame( - {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi} + { + "userName": username, + "userID": userid, + "item": item, + "day": day, + "amount": amount, + "bi": bi, + } ) @staticmethod @@ -34,9 +40,16 @@ def build_pd_df(): item = [0, 0, 1, 1, 2, 0] day = [5, 5, 6, 5, 6, 6] amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6] - bi = [2 ** 200, 2 ** 200 + 1, 2 ** 200 + 2, 2 ** 200 + 3, 2 ** 200 + 4, 2 ** 200 + 5] + bi = [2**200, 2**200 + 1, 2**200 + 2, 2**200 + 3, 2**200 + 4, 2**200 + 5] return pd.DataFrame( - {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi} + { + "userName": username, + "userID": userid, + "item": item, + "day": day, + "amount": amount, + "bi": bi, + } ) @staticmethod @@ -62,9 +75,16 @@ def build_ak_append(): item = ak.array([0, 2]) day = ak.array([1, 2]) amount = ak.array([0.5, 5.1]) - bi = ak.array([2 ** 200 + 6, 2 ** 200 + 7]) + bi = ak.array([2**200 + 6, 2**200 + 7]) return ak.DataFrame( - {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi} + { + "userName": username, + "userID": userid, + "item": item, + "day": day, + "amount": amount, + "bi": bi, + } ) @staticmethod @@ -75,17 +95,24 @@ def build_pd_df_append(): day = [5, 5, 6, 5, 6, 6, 1, 2] amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6, 0.5, 5.1] bi = [ - 2 ** 200, - 2 ** 200 + 1, - 2 ** 200 + 2, - 2 ** 200 + 3, - 2 ** 200 + 4, - 2 ** 200 + 5, - 2 ** 200 + 6, - 2 ** 200 + 7, + 2**200, + 2**200 + 1, + 2**200 + 2, + 2**200 + 3, + 2**200 + 4, + 2**200 + 5, + 2**200 + 6, + 2**200 + 7, ] return pd.DataFrame( - {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi} + { + "userName": username, + "userID": userid, + "item": item, + "day": day, + "amount": amount, + "bi": bi, + } ) @staticmethod @@ -101,9 +128,16 @@ def build_ak_typeerror(): item = ak.array([0, 0, 1, 1, 2, 0]) day = ak.array([5, 5, 6, 5, 6, 6]) amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6]) - bi = ak.arange(2 ** 200, 2 ** 200 + 6) + bi = ak.arange(2**200, 2**200 + 6) return ak.DataFrame( - {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi} + { + "userName": username, + "userID": userid, + "item": item, + "day": day, + "amount": amount, + "bi": bi, + } ) @pytest.mark.parametrize("size", pytest.prob_size) @@ -114,26 +148,30 @@ def test_dataframe_creation(self, size): assert df.empty # Validation of Creation from Pandas - pddf = pd.DataFrame({ - "int": np.arange(size), - "uint": np.random.randint(0, size/2, size, dtype=np.uint64), - "bigint": np.arange(2**200, 2**200+size), - "bool": np.random.randint(0, 1, size=size, dtype=bool), - "segarray": [np.random.randint(0, size / 2, 2) for i in range(size)] - }) + pddf = pd.DataFrame( + { + "int": np.arange(size), + "uint": np.random.randint(0, size / 2, size, dtype=np.uint64), + "bigint": np.arange(2**200, 2**200 + size), + "bool": np.random.randint(0, 1, size=size, dtype=bool), + "segarray": [np.random.randint(0, size / 2, 2) for i in range(size)], + } + ) akdf = ak.DataFrame(pddf) assert isinstance(akdf, ak.DataFrame) assert len(akdf) == size assert_frame_equal(pddf, akdf.to_pandas()) # validation of creation from dictionary - akdf = ak.DataFrame({ - "int": ak.arange(size), - "uint": ak.array(pddf["uint"]), - "bigint": ak.arange(2 ** 200, 2 ** 200 + size), - "bool": ak.array(pddf["bool"]), - "segarray": ak.SegArray.from_multi_array([ak.array(x) for x in pddf["segarray"]]) - }) + akdf = ak.DataFrame( + { + "int": ak.arange(size), + "uint": ak.array(pddf["uint"]), + "bigint": ak.arange(2**200, 2**200 + size), + "bool": ak.array(pddf["bool"]), + "segarray": ak.SegArray.from_multi_array([ak.array(x) for x in pddf["segarray"]]), + } + ) assert isinstance(akdf, ak.DataFrame) assert len(akdf) == size @@ -146,8 +184,8 @@ def test_dataframe_creation(self, size): np.random.randint(5, 10, size), ] pddf = pd.DataFrame(x) - l = [ak.array(val) for val in list(zip(x[0], x[1], x[2]))] - akdf = ak.DataFrame(l) + l_cols = [ak.array(val) for val in list(zip(x[0], x[1], x[2]))] + akdf = ak.DataFrame(l_cols) assert isinstance(akdf, ak.DataFrame) assert len(akdf) == len(pddf) # arkouda does not allow for numeric columns. @@ -195,7 +233,11 @@ def test_column_indexing(self): assert df.index.to_list() == ref_df.index.to_list() # column validation [] and . access - for cname, col, ref_col in zip(df.columns, [df.userName, df.userID, df.item, df.day, df.amount, df.bi], [ref_df.userName, ref_df.userID, ref_df.item, ref_df.day, ref_df.amount, ref_df.bi]): + for cname, col, ref_col in zip( + df.columns, + [df.userName, df.userID, df.item, df.day, df.amount, df.bi], + [ref_df.userName, ref_df.userID, ref_df.item, ref_df.day, ref_df.amount, ref_df.bi], + ): assert isinstance(col, ak.Series) assert col.to_list() == ref_col.to_list() assert isinstance(df[cname], (ak.pdarray, ak.Strings, ak.Categorical)) @@ -227,7 +269,9 @@ def test_dtype_prop(self): assert len(akdf.columns) == len(akdf.dtypes) # dtypes returns objType for categorical, segarray. We should probably fix # this and add a df.objTypes property. pdarrays return actual dtype - for ref_type, c in zip(["int64", "int64", "int64", "str", "Categorical", "SegArray", "bigint"], akdf.columns): + for ref_type, c in zip( + ["int64", "int64", "int64", "str", "Categorical", "SegArray", "bigint"], akdf.columns + ): assert ref_type == str(akdf.dtypes[c]) def test_from_pandas(self): @@ -608,7 +652,8 @@ def test_head_tail_resetting_index(self): bool_idx = df[df["cnt"] > 3] bool_idx.__repr__() - # the new index is first False and rest True (because we lose first 4), so equivalent to arange(61, bool) + # the new index is first False and rest True (because we lose first 4), + # so equivalent to arange(61, bool) assert bool_idx.index.index.to_list() == ak.arange(61, dtype=bool).to_list() slice_idx = df[:] @@ -617,55 +662,43 @@ def test_head_tail_resetting_index(self): def test_ipv4_columns(self): # test with single IPv4 column - df = ak.DataFrame({ - 'a': ak.arange(10), - 'b': ak.IPv4(ak.arange(10)) - }) + df = ak.DataFrame({"a": ak.arange(10), "b": ak.IPv4(ak.arange(10))}) with tempfile.TemporaryDirectory(dir=TestDataFrame.df_test_base_tmp) as tmp_dirname: fname = tmp_dirname + "/ipv4_df" df.to_parquet(fname) - data = ak.read(fname+"*") - rddf = ak.DataFrame({ - 'a': data['a'], - 'b': ak.IPv4(data['b']) - }) + data = ak.read(fname + "*") + rddf = ak.DataFrame({"a": data["a"], "b": ak.IPv4(data["b"])}) assert_frame_equal(df.to_pandas(), rddf.to_pandas()) # test with multiple - df = ak.DataFrame({ - 'a': ak.IPv4(ak.arange(10)), - 'b': ak.IPv4(ak.arange(10)) - }) + df = ak.DataFrame({"a": ak.IPv4(ak.arange(10)), "b": ak.IPv4(ak.arange(10))}) with tempfile.TemporaryDirectory(dir=TestDataFrame.df_test_base_tmp) as tmp_dirname: fname = tmp_dirname + "/ipv4_df" df.to_parquet(fname) data = ak.read(fname + "*") - rddf = ak.DataFrame({ - 'a': ak.IPv4(data['a']), - 'b': ak.IPv4(data['b']) - }) + rddf = ak.DataFrame({"a": ak.IPv4(data["a"]), "b": ak.IPv4(data["b"])}) assert_frame_equal(df.to_pandas(), rddf.to_pandas()) # test replacement of IPv4 with uint representation - df = ak.DataFrame({ - 'a': ak.IPv4(ak.arange(10)) - }) - df['a'] = df['a'].export_uint() - assert ak.arange(10).to_list() == df['a'].to_list() + df = ak.DataFrame({"a": ak.IPv4(ak.arange(10))}) + df["a"] = df["a"].export_uint() + assert ak.arange(10).to_list() == df["a"].to_list() def test_subset(self): - df = ak.DataFrame({ - 'a': ak.arange(100), - 'b': ak.randint(0, 20, 100), - 'c': ak.random_strings_uniform(0, 16, 100), - 'd': ak.randint(25, 75, 100) - }) - df2 = df[['a', 'b']] - assert ['a', 'b'] == df2.columns + df = ak.DataFrame( + { + "a": ak.arange(100), + "b": ak.randint(0, 20, 100), + "c": ak.random_strings_uniform(0, 16, 100), + "d": ak.randint(25, 75, 100), + } + ) + df2 = df[["a", "b"]] + assert ["a", "b"] == df2.columns assert df.index.to_list() == df2.index.to_list() - assert df['a'].to_list() == df2['a'].to_list() - assert df['b'].to_list() == df2['b'].to_list() + assert df["a"].to_list() == df2["a"].to_list() + assert df["b"].to_list() == df2["b"].to_list() diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py index 0d25bde0e4..4450fc5632 100755 --- a/arkouda/pdarraycreation.py +++ b/arkouda/pdarraycreation.py @@ -241,7 +241,6 @@ def array( else: raise TypeError("Must be an iterable or have a numeric DType") - # Check if array of strings # if a.dtype == numpy.object_ need to check first element if "U" in a.dtype.kind or (a.dtype == np.object_ and isinstance(a[0], str)): From 3bcfbe228bdd9b434764b96bfa8f3ca61ccd7c5d Mon Sep 17 00:00:00 2001 From: Ethan-DeBandi99 <16845933+Ethan-DeBandi99@users.noreply.github.com> Date: Wed, 19 Jul 2023 11:57:49 -0400 Subject: [PATCH 5/8] Addressing review feedback --- PROTO_tests/tests/dataframe_test.py | 104 +++++++++------------------- 1 file changed, 32 insertions(+), 72 deletions(-) diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py index 0b4636ff4c..bb9097ba5e 100644 --- a/PROTO_tests/tests/dataframe_test.py +++ b/PROTO_tests/tests/dataframe_test.py @@ -14,25 +14,6 @@ class TestDataFrame: df_test_base_tmp = "{}/df_test".format(os.getcwd()) io_util.get_directory(df_test_base_tmp) - @staticmethod - def build_ak_df(): - username = ak.array(["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"]) - userid = ak.array([111, 222, 111, 333, 222, 111]) - item = ak.array([0, 0, 1, 1, 2, 0]) - day = ak.array([5, 5, 6, 5, 6, 6]) - amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6]) - bi = ak.arange(2**200, 2**200 + 6) - return ak.DataFrame( - { - "userName": username, - "userID": userid, - "item": item, - "day": day, - "amount": amount, - "bi": bi, - } - ) - @staticmethod def build_pd_df(): username = ["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"] @@ -40,7 +21,8 @@ def build_pd_df(): item = [0, 0, 1, 1, 2, 0] day = [5, 5, 6, 5, 6, 6] amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6] - bi = [2**200, 2**200 + 1, 2**200 + 2, 2**200 + 3, 2**200 + 4, 2**200 + 5] + bi = [2 ** 200, 2 ** 200 + 1, 2 ** 200 + 2, 2 ** 200 + 3, 2 ** 200 + 4, 2 ** 200 + 5] + ui = np.arange(6).astype(ak.uint64) return pd.DataFrame( { "userName": username, @@ -49,16 +31,13 @@ def build_pd_df(): "day": day, "amount": amount, "bi": bi, + "ui": ui } ) @staticmethod - def build_ak_df_duplicates(): - username = ak.array(["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"]) - userid = ak.array([111, 222, 111, 333, 222, 111]) - item = ak.array([0, 1, 0, 2, 1, 0]) - day = ak.array([5, 5, 5, 5, 5, 5]) - return ak.DataFrame({"userName": username, "userID": userid, "item": item, "day": day}) + def build_ak_df(): + return ak.DataFrame(TestDataFrame.build_pd_df()) @staticmethod def build_pd_df_duplicates(): @@ -68,6 +47,10 @@ def build_pd_df_duplicates(): day = [5, 5, 5, 5, 5, 5] return pd.DataFrame({"userName": username, "userID": userid, "item": item, "day": day}) + @staticmethod + def build_ak_df_duplicates(): + return ak.DataFrame(TestDataFrame.build_pd_df_duplicates()) + @staticmethod def build_ak_append(): username = ak.array(["John", "Carol"]) @@ -76,6 +59,7 @@ def build_ak_append(): day = ak.array([1, 2]) amount = ak.array([0.5, 5.1]) bi = ak.array([2**200 + 6, 2**200 + 7]) + ui = ak.array([6, 7], dtype=ak.uint64) return ak.DataFrame( { "userName": username, @@ -84,6 +68,7 @@ def build_ak_append(): "day": day, "amount": amount, "bi": bi, + "ui": ui } ) @@ -94,16 +79,8 @@ def build_pd_df_append(): item = [0, 0, 1, 1, 2, 0, 0, 2] day = [5, 5, 6, 5, 6, 6, 1, 2] amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6, 0.5, 5.1] - bi = [ - 2**200, - 2**200 + 1, - 2**200 + 2, - 2**200 + 3, - 2**200 + 4, - 2**200 + 5, - 2**200 + 6, - 2**200 + 7, - ] + bi = (np.arange(8) + 2**200).tolist() + ui = np.arange(8).astype(ak.uint64) return pd.DataFrame( { "userName": username, @@ -112,6 +89,7 @@ def build_pd_df_append(): "day": day, "amount": amount, "bi": bi, + "ui": ui } ) @@ -129,6 +107,7 @@ def build_ak_typeerror(): day = ak.array([5, 5, 6, 5, 6, 6]) amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6]) bi = ak.arange(2**200, 2**200 + 6) + ui = ak.arange(6, dtype=ak.uint64) return ak.DataFrame( { "userName": username, @@ -137,6 +116,7 @@ def build_ak_typeerror(): "day": day, "amount": amount, "bi": bi, + "ui": ui } ) @@ -184,8 +164,7 @@ def test_dataframe_creation(self, size): np.random.randint(5, 10, size), ] pddf = pd.DataFrame(x) - l_cols = [ak.array(val) for val in list(zip(x[0], x[1], x[2]))] - akdf = ak.DataFrame(l_cols) + akdf = ak.DataFrame([ak.array(val) for val in list(zip(*x))]) assert isinstance(akdf, ak.DataFrame) assert len(akdf) == len(pddf) # arkouda does not allow for numeric columns. @@ -233,11 +212,13 @@ def test_column_indexing(self): assert df.index.to_list() == ref_df.index.to_list() # column validation [] and . access - for cname, col, ref_col in zip( - df.columns, - [df.userName, df.userID, df.item, df.day, df.amount, df.bi], - [ref_df.userName, ref_df.userID, ref_df.item, ref_df.day, ref_df.amount, ref_df.bi], - ): + # for cname, col, ref_col in zip( + # df.columns, + # [df.userName, df.userID, df.item, df.day, df.amount, df.bi], + # [ref_df.userName, ref_df.userID, ref_df.item, ref_df.day, ref_df.amount, ref_df.bi], + # ): + for cname in df.columns: + col, ref_col = getattr(df, cname), getattr(ref_df, cname) assert isinstance(col, ak.Series) assert col.to_list() == ref_col.to_list() assert isinstance(df[cname], (ak.pdarray, ak.Strings, ak.Categorical)) @@ -261,7 +242,7 @@ def test_dtype_prop(self): "c_1": ak.arange(3, 6, 1), "c_2": ak.arange(6, 9, 1), "c_3": str_arr, - "c_4": ak.Categorical(str_arr), + "c_4": ak.Categorical(ak.array(["str"] * 3)), "c_5": ak.SegArray(ak.array([0, 9, 14]), ak.arange(20)), "c_6": ak.arange(2**200, 2**200 + 3), } @@ -274,11 +255,6 @@ def test_dtype_prop(self): ): assert ref_type == str(akdf.dtypes[c]) - def test_from_pandas(self): - ref_df = self.build_pd_df() - df = ak.DataFrame(ref_df) - assert_frame_equal(ref_df, df.to_pandas()) - def test_drop(self): # create an arkouda df. df = self.build_ak_df() @@ -299,7 +275,7 @@ def test_drop(self): df.drop("userName", axis=1, inplace=True) pd_df.drop(labels=["userName"], axis=1, inplace=True) - assert_frame_equal(pddf_drop, df_drop.to_pandas()) + assert_frame_equal(pd_df, df.to_pandas()) # Test dropping rows df.drop([0, 2, 5], inplace=True) @@ -307,7 +283,7 @@ def test_drop(self): pd_df.drop(labels=[0, 2, 5], inplace=True) pd_df.reset_index(drop=True, inplace=True) - assert_frame_equal(pddf_drop, df_drop.to_pandas()) + assert_frame_equal(pd_df, df.to_pandas()) # verify that index keys must be ints with pytest.raises(TypeError): @@ -336,7 +312,7 @@ def test_shape(self): row, col = df.shape assert row == 6 - assert col == 6 + assert col == 7 def test_reset_index(self): df = self.build_ak_df() @@ -391,9 +367,8 @@ def test_rename(self): def test_append(self): df = self.build_ak_df() - df_toappend = self.build_ak_append() - df.append(df_toappend) + df.append(self.build_ak_append()) ref_df = self.build_pd_df_append() @@ -401,6 +376,7 @@ def test_append(self): assert_frame_equal(ref_df, df.to_pandas()) idx = np.arange(8) + print(type(df.index.index)) assert idx.tolist() == df.index.index.to_list() df_keyerror = self.build_ak_keyerror() @@ -413,9 +389,8 @@ def test_append(self): def test_concat(self): df = self.build_ak_df() - df_toappend = self.build_ak_append() - glued = ak.DataFrame.concat([df, df_toappend]) + glued = ak.DataFrame.concat([df, self.build_ak_append()]) ref_df = self.build_pd_df_append() @@ -475,22 +450,7 @@ def test_groupby_standard(self): assert_series_equal(pds, s.to_pandas()) def test_gb_series(self): - username = ak.array(["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"]) - userid = ak.array([111, 222, 111, 333, 222, 111]) - item = ak.array([0, 0, 1, 1, 2, 0]) - day = ak.array([5, 5, 6, 5, 6, 6]) - amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6]) - bi = ak.arange(2**200, 2**200 + 6) - df = ak.DataFrame( - { - "userName": username, - "userID": userid, - "item": item, - "day": day, - "amount": amount, - "bi": bi, - } - ) + df = self.build_ak_df() gb = df.GroupBy("userName", use_series=True) From 169f0d0bb1be0571b9a41776946b94443e7981a8 Mon Sep 17 00:00:00 2001 From: Ethan-DeBandi99 <16845933+Ethan-DeBandi99@users.noreply.github.com> Date: Thu, 20 Jul 2023 08:01:16 -0400 Subject: [PATCH 6/8] Address comments from Pierce --- PROTO_tests/tests/dataframe_test.py | 55 +++++++++-------------------- 1 file changed, 16 insertions(+), 39 deletions(-) diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py index bb9097ba5e..4547ad0add 100644 --- a/PROTO_tests/tests/dataframe_test.py +++ b/PROTO_tests/tests/dataframe_test.py @@ -22,7 +22,7 @@ def build_pd_df(): day = [5, 5, 6, 5, 6, 6] amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6] bi = [2 ** 200, 2 ** 200 + 1, 2 ** 200 + 2, 2 ** 200 + 3, 2 ** 200 + 4, 2 ** 200 + 5] - ui = np.arange(6).astype(ak.uint64) + ui = (np.arange(6).astype(ak.uint64)) + 2**63 return pd.DataFrame( { "userName": username, @@ -59,7 +59,7 @@ def build_ak_append(): day = ak.array([1, 2]) amount = ak.array([0.5, 5.1]) bi = ak.array([2**200 + 6, 2**200 + 7]) - ui = ak.array([6, 7], dtype=ak.uint64) + ui = ak.array([6, 7], dtype=ak.uint64) + 2**63 return ak.DataFrame( { "userName": username, @@ -80,7 +80,7 @@ def build_pd_df_append(): day = [5, 5, 6, 5, 6, 6, 1, 2] amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6, 0.5, 5.1] bi = (np.arange(8) + 2**200).tolist() - ui = np.arange(8).astype(ak.uint64) + ui = (np.arange(8).astype(ak.uint64)) + 2**63 return pd.DataFrame( { "userName": username, @@ -107,7 +107,7 @@ def build_ak_typeerror(): day = ak.array([5, 5, 6, 5, 6, 6]) amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6]) bi = ak.arange(2**200, 2**200 + 6) - ui = ak.arange(6, dtype=ak.uint64) + ui = ak.arange(6, dtype=ak.uint64) + 2**63 return ak.DataFrame( { "userName": username, @@ -188,6 +188,13 @@ def test_client_type_creation(self): {"fields": f.to_list(), "ip": ip.to_list(), "date": pd_d, "bitvector": bv.to_list()} ) assert_frame_equal(pddf, df.to_pandas()) + + # validate that set max_rows adjusts the repr properly + shape = f"({df._shape_str()})".replace("(", "[").replace(")", "]") + pd.set_option("display.max_rows", 4) + s = df.__repr__().replace(f" ({df._shape_str()})", f"\n\n{shape}") + assert s == pddf.__repr__() + pddf = pd.DataFrame({"a": list(range(1000)), "b": list(range(1000))}) pddf["a"] = pddf["a"].apply(lambda x: "AA" + str(x)) pddf["b"] = pddf["b"].apply(lambda x: "BB" + str(x)) @@ -195,6 +202,11 @@ def test_client_type_creation(self): df = ak.DataFrame(pddf) assert_frame_equal(pddf, df.to_pandas()) + pd.set_option("display.max_rows", 10) + shape = f"({df._shape_str()})".replace("(", "[").replace(")", "]") + s = df.__repr__().replace(f" ({df._shape_str()})", f"\n\n{shape}") + assert s == pddf.__repr__() + def test_boolean_indexing(self): df = self.build_ak_df() ref_df = self.build_pd_df() @@ -211,12 +223,6 @@ def test_column_indexing(self): assert isinstance(df.index, ak.Index) assert df.index.to_list() == ref_df.index.to_list() - # column validation [] and . access - # for cname, col, ref_col in zip( - # df.columns, - # [df.userName, df.userID, df.item, df.day, df.amount, df.bi], - # [ref_df.userName, ref_df.userID, ref_df.item, ref_df.day, ref_df.amount, ref_df.bi], - # ): for cname in df.columns: col, ref_col = getattr(df, cname), getattr(ref_df, cname) assert isinstance(col, ak.Series) @@ -376,7 +382,6 @@ def test_append(self): assert_frame_equal(ref_df, df.to_pandas()) idx = np.arange(8) - print(type(df.index.index)) assert idx.tolist() == df.index.index.to_list() df_keyerror = self.build_ak_keyerror() @@ -620,34 +625,6 @@ def test_head_tail_resetting_index(self): slice_idx.__repr__() assert slice_idx.index.index.to_list() == idx.to_list() - def test_ipv4_columns(self): - # test with single IPv4 column - df = ak.DataFrame({"a": ak.arange(10), "b": ak.IPv4(ak.arange(10))}) - with tempfile.TemporaryDirectory(dir=TestDataFrame.df_test_base_tmp) as tmp_dirname: - fname = tmp_dirname + "/ipv4_df" - df.to_parquet(fname) - - data = ak.read(fname + "*") - rddf = ak.DataFrame({"a": data["a"], "b": ak.IPv4(data["b"])}) - - assert_frame_equal(df.to_pandas(), rddf.to_pandas()) - - # test with multiple - df = ak.DataFrame({"a": ak.IPv4(ak.arange(10)), "b": ak.IPv4(ak.arange(10))}) - with tempfile.TemporaryDirectory(dir=TestDataFrame.df_test_base_tmp) as tmp_dirname: - fname = tmp_dirname + "/ipv4_df" - df.to_parquet(fname) - - data = ak.read(fname + "*") - rddf = ak.DataFrame({"a": ak.IPv4(data["a"]), "b": ak.IPv4(data["b"])}) - - assert_frame_equal(df.to_pandas(), rddf.to_pandas()) - - # test replacement of IPv4 with uint representation - df = ak.DataFrame({"a": ak.IPv4(ak.arange(10))}) - df["a"] = df["a"].export_uint() - assert ak.arange(10).to_list() == df["a"].to_list() - def test_subset(self): df = ak.DataFrame( { From a8ba48dc2fd67cd9bb10766de0bf292d45af5fa9 Mon Sep 17 00:00:00 2001 From: Ethan-DeBandi99 <16845933+Ethan-DeBandi99@users.noreply.github.com> Date: Thu, 20 Jul 2023 12:00:02 -0400 Subject: [PATCH 7/8] Update PROTO_tests/tests/dataframe_test.py Co-authored-by: pierce <48131946+pierce314159@users.noreply.github.com> --- PROTO_tests/tests/dataframe_test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py index 4547ad0add..5d7ecaec7f 100644 --- a/PROTO_tests/tests/dataframe_test.py +++ b/PROTO_tests/tests/dataframe_test.py @@ -11,8 +11,6 @@ class TestDataFrame: - df_test_base_tmp = "{}/df_test".format(os.getcwd()) - io_util.get_directory(df_test_base_tmp) @staticmethod def build_pd_df(): From c3e22ce3c8f283e494962cf8e8827ad4a497854c Mon Sep 17 00:00:00 2001 From: pierce <48131946+pierce314159@users.noreply.github.com> Date: Thu, 20 Jul 2023 13:11:56 -0400 Subject: [PATCH 8/8] Update PROTO_tests/tests/dataframe_test.py --- PROTO_tests/tests/dataframe_test.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py index 5d7ecaec7f..4c2b8d84a4 100644 --- a/PROTO_tests/tests/dataframe_test.py +++ b/PROTO_tests/tests/dataframe_test.py @@ -1,13 +1,9 @@ -import os -import tempfile - import numpy as np import pandas as pd import pytest from pandas.testing import assert_frame_equal, assert_series_equal import arkouda as ak -from arkouda import io_util class TestDataFrame: