From ee5b196b69df03693d4799962056f35d5a34761d Mon Sep 17 00:00:00 2001
From: Ethan-DeBandi99 <16845933+Ethan-DeBandi99@users.noreply.github.com>
Date: Tue, 11 Jul 2023 09:34:53 -0400
Subject: [PATCH 1/8] Existing tests converted. Still need to review for
 missing coverage and possible simplifications.

---
 PROTO_tests/tests/dataframe_test.py | 716 ++++++++++++++++++++++++++++
 1 file changed, 716 insertions(+)
 create mode 100644 PROTO_tests/tests/dataframe_test.py

diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py
new file mode 100644
index 0000000000..87ea2b273c
--- /dev/null
+++ b/PROTO_tests/tests/dataframe_test.py
@@ -0,0 +1,716 @@
+import arkouda as ak
+import pandas as pd
+import numpy as np
+import pytest
+import random
+import string
+import tempfile
+import glob
+import os
+from arkouda import io_util
+
+
+class TestDataFrame:
+    df_test_base_tmp = "{}/df_test".format(os.getcwd())
+    io_util.get_directory(df_test_base_tmp)
+
+    @staticmethod
+    def build_ak_df():
+        username = ak.array(["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"])
+        userid = ak.array([111, 222, 111, 333, 222, 111])
+        item = ak.array([0, 0, 1, 1, 2, 0])
+        day = ak.array([5, 5, 6, 5, 6, 6])
+        amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6])
+        bi = ak.arange(2 ** 200, 2 ** 200 + 6)
+        return ak.DataFrame(
+            {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi}
+        )
+
+    @staticmethod
+    def build_pd_df():
+        username = ["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"]
+        userid = [111, 222, 111, 333, 222, 111]
+        item = [0, 0, 1, 1, 2, 0]
+        day = [5, 5, 6, 5, 6, 6]
+        amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6]
+        bi = [2 ** 200, 2 ** 200 + 1, 2 ** 200 + 2, 2 ** 200 + 3, 2 ** 200 + 4, 2 ** 200 + 5]
+        return pd.DataFrame(
+            {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi}
+        )
+
+    @staticmethod
+    def build_ak_df_duplicates():
+        username = ak.array(["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"])
+        userid = ak.array([111, 222, 111, 333, 222, 111])
+        item = ak.array([0, 1, 0, 2, 1, 0])
+        day = ak.array([5, 5, 5, 5, 5, 5])
+        return ak.DataFrame({"userName": username, "userID": userid, "item": item, "day": day})
+
+    @staticmethod
+    def build_pd_df_duplicates():
+        username = ["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"]
+        userid = [111, 222, 111, 333, 222, 111]
+        item = [0, 1, 0, 2, 1, 0]
+        day = [5, 5, 5, 5, 5, 5]
+        return pd.DataFrame({"userName": username, "userID": userid, "item": item, "day": day})
+
+    @staticmethod
+    def build_ak_append():
+        username = ak.array(["John", "Carol"])
+        userid = ak.array([444, 333])
+        item = ak.array([0, 2])
+        day = ak.array([1, 2])
+        amount = ak.array([0.5, 5.1])
+        bi = ak.array([2 ** 200 + 6, 2 ** 200 + 7])
+        return ak.DataFrame(
+            {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi}
+        )
+
+    @staticmethod
+    def build_pd_df_append():
+        username = ["Alice", "Bob", "Alice", "Carol", "Bob", "Alice", "John", "Carol"]
+        userid = [111, 222, 111, 333, 222, 111, 444, 333]
+        item = [0, 0, 1, 1, 2, 0, 0, 2]
+        day = [5, 5, 6, 5, 6, 6, 1, 2]
+        amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6, 0.5, 5.1]
+        bi = [
+            2 ** 200,
+            2 ** 200 + 1,
+            2 ** 200 + 2,
+            2 ** 200 + 3,
+            2 ** 200 + 4,
+            2 ** 200 + 5,
+            2 ** 200 + 6,
+            2 ** 200 + 7,
+        ]
+        return pd.DataFrame(
+            {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi}
+        )
+
+    @staticmethod
+    def build_ak_keyerror():
+        userid = ak.array([444, 333])
+        item = ak.array([0, 2])
+        return ak.DataFrame({"user_id": userid, "item": item})
+
+    @staticmethod
+    def build_ak_typeerror():
+        username = ak.array([111, 222, 111, 333, 222, 111])
+        userid = ak.array(["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"])
+        item = ak.array([0, 0, 1, 1, 2, 0])
+        day = ak.array([5, 5, 6, 5, 6, 6])
+        amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6])
+        bi = ak.arange(2 ** 200, 2 ** 200 + 6)
+        return ak.DataFrame(
+            {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi}
+        )
+
+    def test_dataframe_creation(self):
+        # Validate empty DataFrame
+        df = ak.DataFrame()
+        assert isinstance(df, ak.DataFrame)
+        assert df.empty
+
+        df = self.build_ak_df()
+        ref_df = self.build_pd_df()
+        assert isinstance(df, ak.DataFrame)
+        assert len(df) == 6
+        assert ref_df.equals(df.to_pandas())
+
+    def test_client_type_creation(self):
+        f = ak.Fields(ak.arange(10), ["A", "B", "c"])
+        ip = ak.ip_address(ak.arange(10))
+        d = ak.Datetime(ak.arange(10))
+        bv = ak.BitVector(ak.arange(10), width=4)
+
+        df_dict = {"fields": f, "ip": ip, "date": d, "bitvector": bv}
+        df = ak.DataFrame(df_dict)
+        pd_d = [pd.to_datetime(x, unit="ns") for x in d.to_list()]
+        pddf = pd.DataFrame(
+            {"fields": f.to_list(), "ip": ip.to_list(), "date": pd_d, "bitvector": bv.to_list()}
+        )
+        shape = f"({df._shape_str()})".replace("(", "[").replace(")", "]")
+        pd.set_option("display.max_rows", 4)
+        s = df.__repr__().replace(f" ({df._shape_str()})", f"\n\n{shape}")
+        assert s == pddf.__repr__()
+
+        pd.set_option("display.max_rows", 10)
+        pdf = pd.DataFrame({"a": list(range(1000)), "b": list(range(1000))})
+        pdf["a"] = pdf["a"].apply(lambda x: "AA" + str(x))
+        pdf["b"] = pdf["b"].apply(lambda x: "BB" + str(x))
+        df = ak.DataFrame(pdf)
+        shape = f"({df._shape_str()})".replace("(", "[").replace(")", "]")
+        s = df.__repr__().replace(f" ({df._shape_str()})", f"\n\n{shape}")
+        assert s == pdf.__repr__()
+
+    def test_boolean_indexing(self):
+        df = self.build_ak_df()
+        ref_df = self.build_pd_df()
+        row = df[df["userName"] == "Carol"]
+
+        assert len(row) == 1
+        assert ref_df[ref_df["userName"] == "Carol"].equals(row.to_pandas(retain_index=True))
+
+    def test_column_indexing(self):
+        df = self.build_ak_df()
+        assert isinstance(df.userName, ak.Series)
+        assert isinstance(df.userID, ak.Series)
+        assert isinstance(df.item, ak.Series)
+        assert isinstance(df.day, ak.Series)
+        assert isinstance(df.amount, ak.Series)
+        assert isinstance(df.bi, ak.Series)
+        for col in ("userName", "userID", "item", "day", "amount", "bi"):
+            assert isinstance(df[col], (ak.pdarray, ak.Strings, ak.Categorical))
+        assert isinstance(df[["userName", "amount", "bi"]], ak.DataFrame)
+        assert isinstance(df[("userID", "item", "day", "bi")], ak.DataFrame)
+        assert isinstance(df.index, ak.Index)
+
+    def test_dtype_prop(self):
+        str_arr = ak.array(
+            ["".join(random.choices(string.ascii_letters + string.digits, k=5)) for _ in range(3)]
+        )
+        df_dict = {
+            "i": ak.arange(3),
+            "c_1": ak.arange(3, 6, 1),
+            "c_2": ak.arange(6, 9, 1),
+            "c_3": str_arr,
+            "c_4": ak.Categorical(str_arr),
+            "c_5": ak.SegArray(ak.array([0, 9, 14]), ak.arange(20)),
+            "c_6": ak.arange(2**200, 2**200 + 3),
+        }
+        akdf = ak.DataFrame(df_dict)
+        assert len(akdf.columns) == len(akdf.dtypes)
+
+    def test_from_pandas(self):
+        username = ["Alice", "Bob", "Alice", "Carol", "Bob", "Alice", "John", "Carol"]
+        userid = [111, 222, 111, 333, 222, 111, 444, 333]
+        item = [0, 0, 1, 1, 2, 0, 0, 2]
+        day = [5, 5, 6, 5, 6, 6, 1, 2]
+        amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6, 0.5, 5.1]
+        bi = 2**200
+        bi_arr = [bi, bi + 1, bi + 2, bi + 3, bi + 4, bi + 5, bi + 6, bi + 7]
+        ref_df = pd.DataFrame(
+            {
+                "userName": username,
+                "userID": userid,
+                "item": item,
+                "day": day,
+                "amount": amount,
+                "bi": bi_arr,
+            }
+        )
+
+        df = ak.DataFrame(ref_df)
+
+        assert ((ref_df == df.to_pandas()).all()).all()
+
+        df = ak.DataFrame.from_pandas(ref_df)
+        assert ((ref_df == df.to_pandas()).all()).all()
+
+    def test_drop(self):
+        # create an arkouda df.
+        df = self.build_ak_df()
+        # create pandas df to validate functionality against
+        pd_df = self.build_pd_df()
+
+        # test out of place drop
+        df_drop = df.drop([0, 1, 2])
+        pddf_drop = pd_df.drop(labels=[0, 1, 2])
+        pddf_drop.reset_index(drop=True, inplace=True)
+        assert pddf_drop.equals(df_drop.to_pandas())
+
+        df_drop = df.drop("userName", axis=1)
+        pddf_drop = pd_df.drop(labels=["userName"], axis=1)
+        assert pddf_drop.equals(df_drop.to_pandas())
+
+        # Test dropping columns
+        df.drop("userName", axis=1, inplace=True)
+        pd_df.drop(labels=["userName"], axis=1, inplace=True)
+
+        assert ((df.to_pandas() == pd_df).all()).all()
+
+        # Test dropping rows
+        df.drop([0, 2, 5], inplace=True)
+        # pandas retains original indexes when dropping rows, need to reset to line up with arkouda
+        pd_df.drop(labels=[0, 2, 5], inplace=True)
+        pd_df.reset_index(drop=True, inplace=True)
+
+        assert pd_df.equals(df.to_pandas())
+
+        # verify that index keys must be ints
+        with pytest.raises(TypeError):
+            df.drop("index")
+
+        # verify axis can only be 0 or 1
+        with pytest.raises(ValueError):
+            df.drop("amount", 15)
+
+    def test_drop_duplicates(self):
+        df = self.build_ak_df_duplicates()
+        ref_df = self.build_pd_df_duplicates()
+
+        dedup = df.drop_duplicates()
+        dedup_pd = ref_df.drop_duplicates()
+        # pandas retains original indexes when dropping dups, need to reset to line up with arkouda
+        dedup_pd.reset_index(drop=True, inplace=True)
+
+        dedup_test = dedup.to_pandas().sort_values("userName").reset_index(drop=True)
+        dedup_pd_test = dedup_pd.sort_values("userName").reset_index(drop=True)
+
+        assert dedup_test.equals(dedup_pd_test)
+
+    def test_shape(self):
+        df = self.build_ak_df()
+
+        row, col = df.shape
+        assert row == 6
+        assert col == 6
+
+    def test_reset_index(self):
+        df = self.build_ak_df()
+
+        slice_df = df[ak.array([1, 3, 5])]
+        assert slice_df.index.to_list() == [1, 3, 5]
+
+        df_reset = slice_df.reset_index()
+        assert df_reset.index.to_list() == [0, 1, 2]
+        assert slice_df.index.to_list() == [1, 3, 5]
+
+        slice_df.reset_index(inplace=True)
+        assert slice_df.index.to_list() == [0, 1, 2]
+
+    def test_rename(self):
+        df = self.build_ak_df()
+
+        rename = {"userName": "name_col", "userID": "user_id"}
+
+        # Test out of Place - column
+        df_rename = df.rename(rename, axis=1)
+        assert "user_id" in df_rename.columns
+        assert "name_col" in df_rename.columns
+        assert "userName" not in df_rename.columns
+        assert "userID" not in df_rename.columns
+        assert "userID" in df.columns
+        assert "userName" in df.columns
+        assert "user_id" not in df.columns
+        assert "name_col" not in df.columns
+
+        # Test in place - column
+        df.rename(column=rename, inplace=True)
+        assert "user_id" in df.columns
+        assert "name_col" in df.columns
+        assert "userName" not in df.columns
+        assert "userID" not in df.columns
+
+        # prep for index renaming
+        rename_idx = {1: 17, 2: 93}
+        conf = list(range(6))
+        conf[1] = 17
+        conf[2] = 93
+
+        # Test out of Place - index
+        df_rename = df.rename(rename_idx)
+        assert df_rename.index.values.to_list() == conf
+        assert df.index.values.to_list() == list(range(6))
+
+        # Test in place - index
+        df.rename(index=rename_idx, inplace=True)
+        assert df.index.values.to_list() == conf
+
+    def test_append(self):
+        df = self.build_ak_df()
+        df_toappend = self.build_ak_append()
+
+        df.append(df_toappend)
+
+        ref_df = self.build_pd_df_append()
+
+        # dataframe equality returns series with bool result for each row.
+        assert ref_df.equals(df.to_pandas())
+
+        idx = np.arange(8)
+        assert idx.tolist() == df.index.index.to_list()
+
+        df_keyerror = self.build_ak_keyerror()
+        with pytest.raises(KeyError):
+            df.append(df_keyerror)
+
+        df_typeerror = self.build_ak_typeerror()
+        with pytest.raises(TypeError):
+            df.append(df_typeerror)
+
+    def test_concat(self):
+        df = self.build_ak_df()
+        df_toappend = self.build_ak_append()
+
+        glued = ak.DataFrame.concat([df, df_toappend])
+
+        ref_df = self.build_pd_df_append()
+
+        # dataframe equality returns series with bool result for each row.
+        assert ref_df.equals(glued.to_pandas())
+
+        df_keyerror = self.build_ak_keyerror()
+        with pytest.raises(KeyError):
+            ak.DataFrame.concat([df, df_keyerror])
+
+        df_typeerror = self.build_ak_typeerror()
+        with pytest.raises(TypeError):
+            ak.DataFrame.concat([df, df_typeerror])
+
+    def test_head(self):
+        df = self.build_ak_df()
+        ref_df = self.build_pd_df()
+
+        hdf = df.head(3)
+        hdf_ref = ref_df.head(3).reset_index(drop=True)
+        assert hdf_ref.equals(hdf.to_pandas())
+
+    def test_tail(self):
+        df = self.build_ak_df()
+        ref_df = self.build_pd_df()
+
+        hdf = df.tail(2)
+        hdf_ref = ref_df.tail(2).reset_index(drop=True)
+        assert hdf_ref.equals(hdf.to_pandas())
+
+    def test_groupby_standard(self):
+        df = self.build_ak_df()
+        gb = df.GroupBy("userName")
+        keys, count = gb.count()
+        assert keys.to_list() == ["Bob", "Alice", "Carol"]
+        assert count.to_list() == [2, 3, 1]
+        assert gb.permutation.to_list() == [1, 4, 0, 2, 5, 3]
+
+        gb = df.GroupBy(["userName", "userID"])
+        keys, count = gb.count()
+        assert len(keys) == 2
+        assert keys[0].to_list() == ["Carol", "Bob", "Alice"]
+        assert keys[1].to_list() == [333, 222, 111]
+        assert count.to_list() == [1, 2, 3]
+
+        # testing counts with IPv4 column
+        s = ak.DataFrame({"a": ak.IPv4(ak.arange(1, 5))}).groupby("a").count()
+        pds = pd.Series(
+            data=np.ones(4, dtype=np.int64),
+            index=pd.Index(data=np.array(["0.0.0.1", "0.0.0.2", "0.0.0.3", "0.0.0.4"], dtype="<U7")),
+        )
+        assert s.to_pandas().equals(other=pds)
+
+        # testing counts with Categorical column
+        s = ak.DataFrame({"a": ak.Categorical(ak.array(["a", "a", "a", "b"]))}).groupby("a").count()
+        pds = pd.Series(data=np.array([3, 1]), index=pd.Index(data=np.array(["a", "b"], dtype="<U7")))
+        assert s.to_pandas().equals(other=pds)
+
+    def test_gb_series(self):
+        username = ak.array(["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"])
+        userid = ak.array([111, 222, 111, 333, 222, 111])
+        item = ak.array([0, 0, 1, 1, 2, 0])
+        day = ak.array([5, 5, 6, 5, 6, 6])
+        amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6])
+        bi = ak.arange(2**200, 2**200 + 6)
+        df = ak.DataFrame(
+            {
+                "userName": username,
+                "userID": userid,
+                "item": item,
+                "day": day,
+                "amount": amount,
+                "bi": bi,
+            }
+        )
+
+        gb = df.GroupBy("userName", use_series=True)
+
+        c = gb.count()
+        assert isinstance(c, ak.Series)
+        assert c.index.to_list() == ["Bob", "Alice", "Carol"]
+        assert c.values.to_list() == [2, 3, 1]
+
+    def test_to_pandas(self):
+        df = self.build_ak_df()
+        pd_df = self.build_pd_df()
+
+        assert pd_df.equals(df.to_pandas())
+
+        slice_df = df[ak.array([1, 3, 5])]
+        pd_df = slice_df.to_pandas(retain_index=True)
+        assert pd_df.index.tolist() == [1, 3, 5]
+
+        pd_df = slice_df.to_pandas()
+        assert pd_df.index.tolist() == [0, 1, 2]
+
+    def test_argsort(self):
+        df = self.build_ak_df()
+
+        p = df.argsort(key="userName")
+        assert p.to_list() == [0, 2, 5, 1, 4, 3]
+
+        p = df.argsort(key="userName", ascending=False)
+        assert p.to_list() == [3, 4, 1, 5, 2, 0]
+
+    def test_coargsort(self):
+        df = self.build_ak_df()
+
+        p = df.coargsort(keys=["userID", "amount"])
+        assert p.to_list() == [0, 5, 2, 1, 4, 3]
+
+        p = df.coargsort(keys=["userID", "amount"], ascending=False)
+        assert p.to_list() == [3, 4, 1, 2, 5, 0]
+
+    def test_sort_values(self):
+        userid = [111, 222, 111, 333, 222, 111]
+        userid_ak = ak.array(userid)
+
+        # sort userid to build dataframes to reference
+        userid.sort()
+
+        df = ak.DataFrame({"userID": userid_ak})
+        ord = df.sort_values()
+        assert ord.to_pandas().equals(pd.DataFrame(data=userid, columns=["userID"]))
+        ord = df.sort_values(ascending=False)
+        userid.reverse()
+        assert ord.to_pandas().equals(pd.DataFrame(data=userid, columns=["userID"]))
+
+        df = self.build_ak_df()
+        ord = df.sort_values(by="userID")
+        ref_df = self.build_pd_df()
+        ref_df = ref_df.sort_values(by="userID").reset_index(drop=True)
+        assert ref_df.equals(ord.to_pandas())
+
+        ord = df.sort_values(by=["userID", "day"])
+        ref_df = ref_df.sort_values(by=["userID", "day"]).reset_index(drop=True)
+        assert ref_df.equals(ord.to_pandas())
+
+        with pytest.raises(TypeError):
+            df.sort_values(by=1)
+
+    def test_intx(self):
+        username = ak.array(["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"])
+        userid = ak.array([111, 222, 111, 333, 222, 111])
+        df_1 = ak.DataFrame({"user_name": username, "user_id": userid})
+
+        username = ak.array(["Bob", "Alice"])
+        userid = ak.array([222, 445])
+        df_2 = ak.DataFrame({"user_name": username, "user_id": userid})
+
+        rows = ak.intx(df_1, df_2)
+        assert rows.to_list() == [False, True, False, False, True, False]
+
+        df_3 = ak.DataFrame({"user_name": username, "user_number": userid})
+        with pytest.raises(ValueError):
+            rows = ak.intx(df_1, df_3)
+
+    def test_apply_perm(self):
+        df = self.build_ak_df()
+        ref_df = self.build_pd_df()
+
+        ord = df.sort_values(by="userID")
+        perm_list = [0, 3, 1, 5, 4, 2]
+        default_perm = ak.array(perm_list)
+        ord.apply_permutation(default_perm)
+
+        ord_ref = ref_df.sort_values(by="userID").reset_index(drop=True)
+        ord_ref = ord_ref.reindex(perm_list).reset_index(drop=True)
+        assert ord_ref.equals(ord.to_pandas())
+
+    def test_filter_by_range(self):
+        userid = ak.array([111, 222, 111, 333, 222, 111])
+        amount = ak.array([0, 1, 1, 2, 3, 15])
+        df = ak.DataFrame({"userID": userid, "amount": amount})
+
+        filtered = df.filter_by_range(keys=["userID"], low=1, high=2)
+        assert filtered.to_list() == [False, True, False, True, True, False]
+
+    def test_copy(self):
+        username = ak.array(["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"])
+        userid = ak.array([111, 222, 111, 333, 222, 111])
+        df = ak.DataFrame({"userName": username, "userID": userid})
+
+        df_copy = df.copy(deep=True)
+        assert df.__repr__() == df_copy.__repr__()
+
+        df_copy.__setitem__("userID", ak.array([1, 2, 1, 3, 2, 1]))
+        assert df.__repr__() != df_copy.__repr__()
+
+        df_copy = df.copy(deep=False)
+        df_copy.__setitem__("userID", ak.array([1, 2, 1, 3, 2, 1]))
+        assert df.__repr__() == df_copy.__repr__()
+
+    # TODO - This should be covered in HDF5 and Parquet testing
+    def test_save(self):
+        i = list(range(3))
+        c1 = [9, 7, 17]
+        c2 = [2, 4, 6]
+        df_dict = {"i": ak.array(i), "c_1": ak.array(c1), "c_2": ak.array(c2)}
+
+        akdf = ak.DataFrame(df_dict)
+
+        validation_df = pd.DataFrame(
+            {
+                "i": i,
+                "c_1": c1,
+                "c_2": c2,
+            }
+        )
+        with tempfile.TemporaryDirectory(dir=self.df_test_base_tmp) as tmp_dirname:
+            akdf.to_parquet(f"{tmp_dirname}/testName")
+
+            ak_loaded = ak.DataFrame.load(f"{tmp_dirname}/testName")
+            assert validation_df.equals(ak_loaded[akdf.columns].to_pandas())
+
+            # test save with index true
+            akdf.to_parquet(f"{tmp_dirname}/testName_with_index.pq", index=True)
+            assert (
+                len(glob.glob(f"{tmp_dirname}/testName_with_index*.pq")) == ak.get_config()["numLocales"]
+            )
+
+            # Test for df having seg array col
+            df = ak.DataFrame({"a": ak.arange(10), "b": ak.SegArray(ak.arange(10), ak.arange(10))})
+            df.to_hdf(f"{tmp_dirname}/seg_test.h5")
+            assert (
+                len(glob.glob(f"{tmp_dirname}/seg_test*.h5")) == ak.get_config()["numLocales"]
+            )
+            ak_loaded = ak.DataFrame.load(f"{tmp_dirname}/seg_test.h5")
+            assert df.to_pandas().equals(ak_loaded.to_pandas())
+
+            # test with segarray with _ in column name
+            df_dict = {
+                "c_1": ak.arange(3, 6),
+                "c_2": ak.arange(6, 9),
+                "c_3": ak.SegArray(ak.array([0, 9, 14]), ak.arange(20)),
+            }
+            akdf = ak.DataFrame(df_dict)
+            akdf.to_hdf(f"{tmp_dirname}/seg_test.h5")
+            assert (
+                len(glob.glob(f"{tmp_dirname}/seg_test*.h5")) == ak.get_config()["numLocales"]
+            )
+            ak_loaded = ak.DataFrame.load(f"{tmp_dirname}/seg_test.h5")
+            assert akdf.to_pandas().equals(ak_loaded.to_pandas())
+
+            # test load_all and read workflows
+            ak_load_all = ak.DataFrame(ak.load_all(f"{tmp_dirname}/seg_test.h5"))
+            assert akdf.to_pandas().equals(ak_load_all.to_pandas())
+
+            ak_read = ak.DataFrame(ak.read(f"{tmp_dirname}/seg_test*"))
+            assert akdf.to_pandas().equals(ak_read.to_pandas())
+
+    def test_isin(self):
+        df = ak.DataFrame({"col_A": ak.array([7, 3]), "col_B": ak.array([1, 9])})
+
+        # test against pdarray
+        test_df = df.isin(ak.array([0, 1]))
+        assert test_df["col_A"].to_list() == [False, False]
+        assert test_df["col_B"].to_list() == [True, False]
+
+        # Test against dict
+        test_df = df.isin({"col_A": ak.array([0, 3])})
+        assert test_df["col_A"].to_list() == [False, True]
+        assert test_df["col_B"].to_list() == [False, False]
+
+        # test against series
+        i = ak.Index(ak.arange(2))
+        s = ak.Series(data=ak.array([3, 9]), index=i.index)
+        test_df = df.isin(s)
+        assert test_df["col_A"].to_list() == [False, False]
+        assert test_df["col_B"].to_list() == [False, True]
+
+        # test against another dataframe
+        other_df = ak.DataFrame({"col_A": ak.array([7, 3], dtype=ak.bigint), "col_C": ak.array([0, 9])})
+        test_df = df.isin(other_df)
+        assert test_df["col_A"].to_list() == [True, True]
+        assert test_df["col_B"].to_list() == [False, False]
+
+    def test_multiindex_compat(self):
+        # Added for testing Issue #1505
+        df = ak.DataFrame({"a": ak.arange(10), "b": ak.arange(10), "c": ak.arange(10)})
+        df.groupby(["a", "b"]).sum("c")
+
+    def test_uint_greediness(self):
+        # default to uint when all supportedInt and any value > 2**63
+        # to avoid loss of precision see (#1983)
+        df = pd.DataFrame({"Test": [2**64 - 1, 0]})
+        assert df["Test"].dtype == ak.uint64
+
+    def test_head_tail_resetting_index(self):
+        # Test that issue #2183 is resolved
+        df = ak.DataFrame({"cnt": ak.arange(65)})
+        # Note we have to call __repr__ to trigger head_tail_server call
+
+        bool_idx = df[df["cnt"] > 3]
+        bool_idx.__repr__()
+        assert bool_idx.index.index.to_list() == list(range(4, 65))
+
+        slice_idx = df[:]
+        slice_idx.__repr__()
+        assert slice_idx.index.index.to_list() == list(range(65))
+
+        # verify it persists non-int Index
+        idx = ak.concatenate([ak.zeros(5, bool), ak.ones(60, bool)])
+        df = ak.DataFrame({"cnt": ak.arange(65)}, index=idx)
+
+        bool_idx = df[df["cnt"] > 3]
+        bool_idx.__repr__()
+        # the new index is first False and rest True (because we lose first 4), so equivalent to arange(61, bool)
+        assert bool_idx.index.index.to_list() == ak.arange(61, dtype=bool).to_list()
+
+        slice_idx = df[:]
+        slice_idx.__repr__()
+        assert slice_idx.index.index.to_list() == idx.to_list()
+
+    def test_ipv4_columns(self):
+        # test with single IPv4 column
+        df = ak.DataFrame({
+            'a': ak.arange(10),
+            'b': ak.IPv4(ak.arange(10))
+        })
+        with tempfile.TemporaryDirectory(dir=TestDataFrame.df_test_base_tmp) as tmp_dirname:
+            fname = tmp_dirname + "/ipv4_df"
+            df.to_parquet(fname)
+
+            data = ak.read(fname+"*")
+            rddf = ak.DataFrame({
+                'a': data['a'],
+                'b': ak.IPv4(data['b'])
+            })
+
+            assert df['a'].to_list() == rddf['a'].to_list()
+            assert df['b'].to_list() == rddf['b'].to_list()
+
+        # test with multiple
+        df = ak.DataFrame({
+            'a': ak.IPv4(ak.arange(10)),
+            'b': ak.IPv4(ak.arange(10))
+        })
+        with tempfile.TemporaryDirectory(dir=TestDataFrame.df_test_base_tmp) as tmp_dirname:
+            fname = tmp_dirname + "/ipv4_df"
+            df.to_parquet(fname)
+
+            data = ak.read(fname + "*")
+            rddf = ak.DataFrame({
+                'a': ak.IPv4(data['a']),
+                'b': ak.IPv4(data['b'])
+            })
+
+            assert df['a'].to_list() == rddf['a'].to_list()
+            assert df['b'].to_list() == rddf['b'].to_list()
+
+        # test replacement of IPv4 with uint representation
+        df = ak.DataFrame({
+            'a': ak.IPv4(ak.arange(10))
+        })
+        df['a'] = df['a'].export_uint()
+        assert ak.arange(10).to_list() == df['a'].to_list()
+
+    def test_subset(self):
+        df = ak.DataFrame({
+            'a': ak.arange(100),
+            'b': ak.randint(0, 20, 100),
+            'c': ak.random_strings_uniform(0, 16, 100),
+            'd': ak.randint(25, 75, 100)
+        })
+        df2 = df[['a', 'b']]
+        assert ['a', 'b'] == df2.columns
+        assert df.index.to_list() == df2.index.to_list()
+        assert df['a'].to_list() == df2['a'].to_list()
+        assert df['b'].to_list() == df2['b'].to_list()

From 576f96d9f9369df317860124bcd8902cd3565b28 Mon Sep 17 00:00:00 2001
From: Ethan-DeBandi99 <16845933+Ethan-DeBandi99@users.noreply.github.com>
Date: Thu, 13 Jul 2023 08:10:43 -0400
Subject: [PATCH 2/8] Updating creation testing coverage.

---
 PROTO_tests/tests/dataframe_test.py | 111 ++++++++++++----------------
 arkouda/dataframe.py                |  10 +--
 pytest_PROTO.ini                    |   2 +-
 3 files changed, 52 insertions(+), 71 deletions(-)

diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py
index 87ea2b273c..3412160d33 100644
--- a/PROTO_tests/tests/dataframe_test.py
+++ b/PROTO_tests/tests/dataframe_test.py
@@ -1,5 +1,6 @@
 import arkouda as ak
 import pandas as pd
+from pandas.testing import assert_frame_equal
 import numpy as np
 import pytest
 import random
@@ -105,17 +106,57 @@ def build_ak_typeerror():
             {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi}
         )
 
-    def test_dataframe_creation(self):
+    @pytest.mark.parametrize("size", pytest.prob_size)
+    def test_dataframe_creation(self, size):
         # Validate empty DataFrame
         df = ak.DataFrame()
         assert isinstance(df, ak.DataFrame)
         assert df.empty
 
-        df = self.build_ak_df()
-        ref_df = self.build_pd_df()
-        assert isinstance(df, ak.DataFrame)
-        assert len(df) == 6
-        assert ref_df.equals(df.to_pandas())
+        # Validation of Creation from Pandas
+        pddf = pd.DataFrame({
+            "int": np.arange(size),
+            "uint": np.random.randint(0, size/2, size, dtype=np.uint64),
+            "bigint": np.arange(2**200, 2**200+size),
+            "bool": np.random.randint(0, 2, size=size, dtype=bool),
+            "segarray": [np.random.randint(0, size / 2, 2) for i in range(size)]
+        })
+        akdf = ak.DataFrame(pddf)
+        assert isinstance(akdf, ak.DataFrame)
+        assert len(akdf) == size
+        assert_frame_equal(pddf, akdf.to_pandas())
+
+        # validation of creation from dictionary
+        akdf = ak.DataFrame({
+            "int": ak.arange(size),
+            "uint": ak.array(pddf["uint"]),
+            "bigint": ak.arange(2 ** 200, 2 ** 200 + size),
+            "bool": ak.array(pddf["bool"]),
+            "segarray": ak.SegArray.from_multi_array([ak.array(x) for x in pddf["segarray"]])
+        })
+        assert isinstance(akdf, ak.DataFrame)
+        assert len(akdf) == size
+
+        assert_frame_equal(pddf, akdf.to_pandas())
+
+        # validation of creation from list
+        x = [
+            np.arange(size),
+            np.random.randint(0, 5, size),
+            np.random.randint(5, 10, size),
+        ]
+        pddf = pd.DataFrame(x)
+        l = [ak.array(val) for val in list(zip(x[0], x[1], x[2]))]
+        akdf = ak.DataFrame(l)
+        assert isinstance(akdf, ak.DataFrame)
+        assert len(akdf) == len(pddf)
+        # arkouda does not allow for numeric columns.
+        assert akdf.columns == [str(x) for x in pddf.columns.values]
+        # use the columns from the pandas created for equivalence check
+        # these should be equivalent
+        ak_to_pd = akdf.to_pandas()
+        ak_to_pd.columns = pddf.columns
+        assert_frame_equal(pddf, ak_to_pd)
 
     def test_client_type_creation(self):
         f = ak.Fields(ak.arange(10), ["A", "B", "c"])
@@ -537,64 +578,6 @@ def test_copy(self):
         df_copy.__setitem__("userID", ak.array([1, 2, 1, 3, 2, 1]))
         assert df.__repr__() == df_copy.__repr__()
 
-    # TODO - This should be covered in HDF5 and Parquet testing
-    def test_save(self):
-        i = list(range(3))
-        c1 = [9, 7, 17]
-        c2 = [2, 4, 6]
-        df_dict = {"i": ak.array(i), "c_1": ak.array(c1), "c_2": ak.array(c2)}
-
-        akdf = ak.DataFrame(df_dict)
-
-        validation_df = pd.DataFrame(
-            {
-                "i": i,
-                "c_1": c1,
-                "c_2": c2,
-            }
-        )
-        with tempfile.TemporaryDirectory(dir=self.df_test_base_tmp) as tmp_dirname:
-            akdf.to_parquet(f"{tmp_dirname}/testName")
-
-            ak_loaded = ak.DataFrame.load(f"{tmp_dirname}/testName")
-            assert validation_df.equals(ak_loaded[akdf.columns].to_pandas())
-
-            # test save with index true
-            akdf.to_parquet(f"{tmp_dirname}/testName_with_index.pq", index=True)
-            assert (
-                len(glob.glob(f"{tmp_dirname}/testName_with_index*.pq")) == ak.get_config()["numLocales"]
-            )
-
-            # Test for df having seg array col
-            df = ak.DataFrame({"a": ak.arange(10), "b": ak.SegArray(ak.arange(10), ak.arange(10))})
-            df.to_hdf(f"{tmp_dirname}/seg_test.h5")
-            assert (
-                len(glob.glob(f"{tmp_dirname}/seg_test*.h5")) == ak.get_config()["numLocales"]
-            )
-            ak_loaded = ak.DataFrame.load(f"{tmp_dirname}/seg_test.h5")
-            assert df.to_pandas().equals(ak_loaded.to_pandas())
-
-            # test with segarray with _ in column name
-            df_dict = {
-                "c_1": ak.arange(3, 6),
-                "c_2": ak.arange(6, 9),
-                "c_3": ak.SegArray(ak.array([0, 9, 14]), ak.arange(20)),
-            }
-            akdf = ak.DataFrame(df_dict)
-            akdf.to_hdf(f"{tmp_dirname}/seg_test.h5")
-            assert (
-                len(glob.glob(f"{tmp_dirname}/seg_test*.h5")) == ak.get_config()["numLocales"]
-            )
-            ak_loaded = ak.DataFrame.load(f"{tmp_dirname}/seg_test.h5")
-            assert akdf.to_pandas().equals(ak_loaded.to_pandas())
-
-            # test load_all and read workflows
-            ak_load_all = ak.DataFrame(ak.load_all(f"{tmp_dirname}/seg_test.h5"))
-            assert akdf.to_pandas().equals(ak_load_all.to_pandas())
-
-            ak_read = ak.DataFrame(ak.read(f"{tmp_dirname}/seg_test*"))
-            assert akdf.to_pandas().equals(ak_read.to_pandas())
-
     def test_isin(self):
         df = ak.DataFrame({"col_A": ak.array([7, 3]), "col_B": ak.array([1, 9])})
 
diff --git a/arkouda/dataframe.py b/arkouda/dataframe.py
index e6f4a2a82b..4145f4e6fa 100644
--- a/arkouda/dataframe.py
+++ b/arkouda/dataframe.py
@@ -275,13 +275,11 @@ def __init__(self, initialdata=None, index=None):
             else:
                 self._set_index(index)
             self.data = {}
-            # convert the lists defining each column into a pdarray
-            # pd.DataFrame.values is stored as rows, we need lists to be columns
-            for key, val in initialdata.to_dict("list").items():
+            for key in initialdata.columns:
                 self.data[key] = (
-                    SegArray.from_multi_array([array(r) for r in val])
-                    if isinstance(val[0], list)
-                    else array(val)
+                    SegArray.from_multi_array([array(r) for r in initialdata[key]])
+                    if isinstance(initialdata[key].iloc[0], (list, np.ndarray))
+                    else array(initialdata[key])
                 )
 
             self.data.update()
diff --git a/pytest_PROTO.ini b/pytest_PROTO.ini
index afdb98cf87..68cbf3e01e 100644
--- a/pytest_PROTO.ini
+++ b/pytest_PROTO.ini
@@ -2,7 +2,7 @@
 addopts =
     --benchmark-disable
     --benchmark-skip
-    --size=100
+    --size=5
 filterwarnings =
     ignore:Version mismatch between client .*
 testpaths =

From b80986aaddbc844a715c285980793d3aea484b6d Mon Sep 17 00:00:00 2001
From: Ethan-DeBandi99 <16845933+Ethan-DeBandi99@users.noreply.github.com>
Date: Mon, 17 Jul 2023 09:45:22 -0400
Subject: [PATCH 3/8] DataFrame testing new framework final. Adds to_list to
 ak.Series and updates pdarray creation to detect numpy.object_ with str
 elements.

---
 PROTO_tests/tests/dataframe_test.py | 148 +++++++++++-----------------
 arkouda/pdarraycreation.py          |   6 +-
 arkouda/series.py                   |   5 +
 pytest_PROTO.ini                    |   2 +-
 4 files changed, 71 insertions(+), 90 deletions(-)

diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py
index 3412160d33..6a0b50f133 100644
--- a/PROTO_tests/tests/dataframe_test.py
+++ b/PROTO_tests/tests/dataframe_test.py
@@ -1,6 +1,6 @@
 import arkouda as ak
 import pandas as pd
-from pandas.testing import assert_frame_equal
+from pandas.testing import assert_frame_equal, assert_series_equal
 import numpy as np
 import pytest
 import random
@@ -170,19 +170,13 @@ def test_client_type_creation(self):
         pddf = pd.DataFrame(
             {"fields": f.to_list(), "ip": ip.to_list(), "date": pd_d, "bitvector": bv.to_list()}
         )
-        shape = f"({df._shape_str()})".replace("(", "[").replace(")", "]")
-        pd.set_option("display.max_rows", 4)
-        s = df.__repr__().replace(f" ({df._shape_str()})", f"\n\n{shape}")
-        assert s == pddf.__repr__()
-
-        pd.set_option("display.max_rows", 10)
-        pdf = pd.DataFrame({"a": list(range(1000)), "b": list(range(1000))})
-        pdf["a"] = pdf["a"].apply(lambda x: "AA" + str(x))
-        pdf["b"] = pdf["b"].apply(lambda x: "BB" + str(x))
-        df = ak.DataFrame(pdf)
-        shape = f"({df._shape_str()})".replace("(", "[").replace(")", "]")
-        s = df.__repr__().replace(f" ({df._shape_str()})", f"\n\n{shape}")
-        assert s, pdf.__repr__()
+        assert_frame_equal(pddf, df.to_pandas())
+        pddf = pd.DataFrame({"a": list(range(1000)), "b": list(range(1000))})
+        pddf["a"] = pddf["a"].apply(lambda x: "AA" + str(x))
+        pddf["b"] = pddf["b"].apply(lambda x: "BB" + str(x))
+
+        df = ak.DataFrame(pddf)
+        assert_frame_equal(pddf, df.to_pandas())
 
     def test_boolean_indexing(self):
         df = self.build_ak_df()
@@ -194,22 +188,32 @@ def test_boolean_indexing(self):
 
     def test_column_indexing(self):
         df = self.build_ak_df()
-        assert isinstance(df.userName, ak.Series)
-        assert isinstance(df.userID, ak.Series)
-        assert isinstance(df.item, ak.Series)
-        assert isinstance(df.day, ak.Series)
-        assert isinstance(df.amount, ak.Series)
-        assert isinstance(df.bi, ak.Series)
-        for col in ("userName", "userID", "item", "day", "amount", "bi"):
-            assert isinstance(df[col], (ak.pdarray, ak.Strings, ak.Categorical))
-        assert isinstance(df[["userName", "amount", "bi"]], ak.DataFrame)
-        assert isinstance(df[("userID", "item", "day", "bi")], ak.DataFrame)
+        ref_df = self.build_pd_df()
+
+        # index validation
         assert isinstance(df.index, ak.Index)
+        assert df.index.to_list() == ref_df.index.to_list()
+
+        # column validation [] and . access
+        for cname, col, ref_col in zip(df.columns, [df.userName, df.userID, df.item, df.day, df.amount, df.bi], [ref_df.userName, ref_df.userID, ref_df.item, ref_df.day, ref_df.amount, ref_df.bi]):
+            assert isinstance(col, ak.Series)
+            assert col.to_list() == ref_col.to_list()
+            assert isinstance(df[cname], (ak.pdarray, ak.Strings, ak.Categorical))
+            assert df[cname].to_list() == ref_df[cname].to_list()
+
+        # check multi-column list
+        col_list = ["userName", "amount", "bi"]
+        assert isinstance(df[col_list], ak.DataFrame)
+        assert_frame_equal(df[col_list].to_pandas(), ref_df[col_list])
+
+        # check multi-column tuple
+        col_tup = ("userID", "item", "day", "bi")
+        assert isinstance(df[col_tup], ak.DataFrame)
+        # pandas only supports lists of columns, not tuples
+        assert_frame_equal(df[col_tup].to_pandas(), ref_df[list(col_tup)])
 
     def test_dtype_prop(self):
-        str_arr = ak.array(
-            ["".join(random.choices(string.ascii_letters + string.digits, k=5)) for _ in range(3)]
-        )
+        str_arr = ak.random_strings_uniform(1, 5, 3)
         df_dict = {
             "i": ak.arange(3),
             "c_1": ak.arange(3, 6, 1),
@@ -221,32 +225,15 @@ def test_dtype_prop(self):
         }
         akdf = ak.DataFrame(df_dict)
         assert len(akdf.columns) == len(akdf.dtypes)
+        # dtypes returns objType for categorical, segarray. We should probably fix
+        # this and add a df.objTypes property. pdarrays return actual dtype
+        for ref_type, c in zip(["int64", "int64", "int64", "str", "Categorical", "SegArray", "bigint"], akdf.columns):
+            assert ref_type == str(akdf.dtypes[c])
 
     def test_from_pandas(self):
-        username = ["Alice", "Bob", "Alice", "Carol", "Bob", "Alice", "John", "Carol"]
-        userid = [111, 222, 111, 333, 222, 111, 444, 333]
-        item = [0, 0, 1, 1, 2, 0, 0, 2]
-        day = [5, 5, 6, 5, 6, 6, 1, 2]
-        amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6, 0.5, 5.1]
-        bi = 2**200
-        bi_arr = [bi, bi + 1, bi + 2, bi + 3, bi + 4, bi + 5, bi + 6, bi + 7]
-        ref_df = pd.DataFrame(
-            {
-                "userName": username,
-                "userID": userid,
-                "item": item,
-                "day": day,
-                "amount": amount,
-                "bi": bi_arr,
-            }
-        )
-
+        ref_df = self.build_pd_df()
         df = ak.DataFrame(ref_df)
-
-        assert ((ref_df == df.to_pandas()).all()).all()
-
-        df = ak.DataFrame.from_pandas(ref_df)
-        assert ((ref_df == df.to_pandas()).all()).all()
+        assert_frame_equal(ref_df, df.to_pandas())
 
     def test_drop(self):
         # create an arkouda df.
@@ -258,17 +245,17 @@ def test_drop(self):
         df_drop = df.drop([0, 1, 2])
         pddf_drop = pd_df.drop(labels=[0, 1, 2])
         pddf_drop.reset_index(drop=True, inplace=True)
-        assert pddf_drop.equals(df_drop.to_pandas())
+        assert_frame_equal(pddf_drop, df_drop.to_pandas())
 
         df_drop = df.drop("userName", axis=1)
         pddf_drop = pd_df.drop(labels=["userName"], axis=1)
-        assert pddf_drop.equals(df_drop.to_pandas())
+        assert_frame_equal(pddf_drop, df_drop.to_pandas())
 
         # Test dropping columns
         df.drop("userName", axis=1, inplace=True)
         pd_df.drop(labels=["userName"], axis=1, inplace=True)
 
-        assert ((df.to_pandas() == pd_df).all()).all()
+        assert_frame_equal(pd_df, df.to_pandas())
 
         # Test dropping rows
         df.drop([0, 2, 5], inplace=True)
@@ -276,7 +263,7 @@ def test_drop(self):
         pd_df.drop(labels=[0, 2, 5], inplace=True)
         pd_df.reset_index(drop=True, inplace=True)
 
-        assert pd_df.equals(df.to_pandas())
+        assert_frame_equal(pd_df, df.to_pandas())
 
         # verify that index keys must be ints
         with pytest.raises(TypeError):
@@ -298,7 +285,7 @@ def test_drop_duplicates(self):
         dedup_test = dedup.to_pandas().sort_values("userName").reset_index(drop=True)
         dedup_pd_test = dedup_pd.sort_values("userName").reset_index(drop=True)
 
-        assert dedup_test.equals(dedup_pd_test)
+        assert_frame_equal(dedup_pd_test, dedup_test)
 
     def test_shape(self):
         df = self.build_ak_df()
@@ -367,7 +354,7 @@ def test_append(self):
         ref_df = self.build_pd_df_append()
 
         # dataframe equality returns series with bool result for each row.
-        assert ref_df.equals(df.to_pandas())
+        assert_frame_equal(ref_df, df.to_pandas())
 
         idx = np.arange(8)
         assert idx.tolist() == df.index.index.to_list()
@@ -389,7 +376,7 @@ def test_concat(self):
         ref_df = self.build_pd_df_append()
 
         # dataframe equality returns series with bool result for each row.
-        assert ref_df.equals(glued.to_pandas())
+        assert_frame_equal(ref_df, glued.to_pandas())
 
         df_keyerror = self.build_ak_keyerror()
         with pytest.raises(KeyError):
@@ -405,15 +392,15 @@ def test_head(self):
 
         hdf = df.head(3)
         hdf_ref = ref_df.head(3).reset_index(drop=True)
-        assert hdf_ref.equals(hdf.to_pandas())
+        assert_frame_equal(hdf_ref, hdf.to_pandas())
 
     def test_tail(self):
         df = self.build_ak_df()
         ref_df = self.build_pd_df()
 
-        hdf = df.tail(2)
-        hdf_ref = ref_df.tail(2).reset_index(drop=True)
-        assert hdf_ref.equals(hdf.to_pandas())
+        tdf = df.tail(2)
+        tdf_ref = ref_df.tail(2).reset_index(drop=True)
+        assert_frame_equal(tdf_ref, tdf.to_pandas())
 
     def test_groupby_standard(self):
         df = self.build_ak_df()
@@ -436,12 +423,12 @@ def test_groupby_standard(self):
             data=np.ones(4, dtype=np.int64),
             index=pd.Index(data=np.array(["0.0.0.1", "0.0.0.2", "0.0.0.3", "0.0.0.4"], dtype="<U7")),
         )
-        assert s.to_pandas().equals(other=pds)
+        assert_series_equal(pds, s.to_pandas())
 
         # testing counts with Categorical column
         s = ak.DataFrame({"a": ak.Categorical(ak.array(["a", "a", "a", "b"]))}).groupby("a").count()
         pds = pd.Series(data=np.array([3, 1]), index=pd.Index(data=np.array(["a", "b"], dtype="<U7")))
-        assert s.to_pandas().equals(other=pds)
+        assert_series_equal(pds, s.to_pandas())
 
     def test_gb_series(self):
         username = ak.array(["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"])
@@ -468,19 +455,6 @@ def test_gb_series(self):
         assert c.index.to_list() == ["Bob", "Alice", "Carol"]
         assert c.values.to_list() == [2, 3, 1]
 
-    def test_to_pandas(self):
-        df = self.build_ak_df()
-        pd_df = self.build_pd_df()
-
-        assert pd_df.equals(df.to_pandas())
-
-        slice_df = df[ak.array([1, 3, 5])]
-        pd_df = slice_df.to_pandas(retain_index=True)
-        assert pd_df.index.tolist() == [1, 3, 5]
-
-        pd_df = slice_df.to_pandas()
-        assert pd_df.index.tolist() == [0, 1, 2]
-
     def test_argsort(self):
         df = self.build_ak_df()
 
@@ -508,20 +482,20 @@ def test_sort_values(self):
 
         df = ak.DataFrame({"userID": userid_ak})
         ord = df.sort_values()
-        assert ord.to_pandas().equals(pd.DataFrame(data=userid, columns=["userID"]))
+        assert_frame_equal(pd.DataFrame(data=userid, columns=["userID"]), ord.to_pandas())
         ord = df.sort_values(ascending=False)
         userid.reverse()
-        assert ord.to_pandas().equals(pd.DataFrame(data=userid, columns=["userID"]))
+        assert_frame_equal(pd.DataFrame(data=userid, columns=["userID"]), ord.to_pandas())
 
         df = self.build_ak_df()
         ord = df.sort_values(by="userID")
         ref_df = self.build_pd_df()
         ref_df = ref_df.sort_values(by="userID").reset_index(drop=True)
-        assert ref_df.equals(ord.to_pandas())
+        assert_frame_equal(ref_df, ord.to_pandas())
 
         ord = df.sort_values(by=["userID", "day"])
         ref_df = ref_df.sort_values(by=["userID", "day"]).reset_index(drop=True)
-        assert ref_df.equals(ord.to_pandas())
+        assert_frame_equal(ref_df, ord.to_pandas())
 
         with pytest.raises(TypeError):
             df.sort_values(by=1)
@@ -553,7 +527,7 @@ def test_apply_perm(self):
 
         ord_ref = ref_df.sort_values(by="userID").reset_index(drop=True)
         ord_ref = ord_ref.reindex(perm_list).reset_index(drop=True)
-        assert ord_ref.equals(ord.to_pandas())
+        assert_frame_equal(ord_ref, ord.to_pandas())
 
     def test_filter_by_range(self):
         userid = ak.array([111, 222, 111, 333, 222, 111])
@@ -569,14 +543,14 @@ def test_copy(self):
         df = ak.DataFrame({"userName": username, "userID": userid})
 
         df_copy = df.copy(deep=True)
-        assert df.__repr__() == df_copy.__repr__()
+        assert_frame_equal(df.to_pandas(), df_copy.to_pandas())
 
         df_copy.__setitem__("userID", ak.array([1, 2, 1, 3, 2, 1]))
-        assert df.__repr__() != df_copy.__repr__()
+        assert df["userID"].to_list() != df_copy["userID"].to_list()
 
         df_copy = df.copy(deep=False)
         df_copy.__setitem__("userID", ak.array([1, 2, 1, 3, 2, 1]))
-        assert df.__repr__() == df_copy.__repr__()
+        assert_frame_equal(df.to_pandas(), df_copy.to_pandas())
 
     def test_isin(self):
         df = ak.DataFrame({"col_A": ak.array([7, 3]), "col_B": ak.array([1, 9])})
@@ -657,8 +631,7 @@ def test_ipv4_columns(self):
                 'b': ak.IPv4(data['b'])
             })
 
-            assert df['a'].to_list() == rddf['a'].to_list()
-            assert df['b'].to_list() == rddf['b'].to_list()
+            assert_frame_equal(df.to_pandas(), rddf.to_pandas())
 
         # test with multiple
         df = ak.DataFrame({
@@ -675,8 +648,7 @@ def test_ipv4_columns(self):
                 'b': ak.IPv4(data['b'])
             })
 
-            assert df['a'].to_list() == rddf['a'].to_list()
-            assert df['b'].to_list() == rddf['b'].to_list()
+            assert_frame_equal(df.to_pandas(), rddf.to_pandas())
 
         # test replacement of IPv4 with uint representation
         df = ak.DataFrame({
diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py
index e8327ea338..0d25bde0e4 100755
--- a/arkouda/pdarraycreation.py
+++ b/arkouda/pdarraycreation.py
@@ -229,6 +229,7 @@ def array(
                 a = np.array(a)
         except (RuntimeError, TypeError, ValueError):
             raise TypeError("a must be a pdarray, np.ndarray, or convertible to a numpy array")
+
     # Return multi-dimensional arrayview
     if a.ndim != 1:
         # TODO add order
@@ -239,8 +240,11 @@ def array(
                 return flat_a.reshape(a.shape)
         else:
             raise TypeError("Must be an iterable or have a numeric DType")
+
+
     # Check if array of strings
-    if "U" in a.dtype.kind:
+    # if a.dtype == numpy.object_ need to check first element
+    if "U" in a.dtype.kind or (a.dtype == np.object_ and isinstance(a[0], str)):
         # encode each string and add a null byte terminator
         encoded = [i for i in itertools.chain.from_iterable(map(lambda x: x.encode() + b"\x00", a))]
         nbytes = len(encoded)
diff --git a/arkouda/series.py b/arkouda/series.py
index 6b35275508..8496907783 100644
--- a/arkouda/series.py
+++ b/arkouda/series.py
@@ -358,6 +358,11 @@ def to_pandas(self) -> pd.Series:
         val = convert_if_categorical(self.values)
         return pd.Series(val.to_ndarray(), index=idx)
 
+    @typechecked()
+    def to_list(self) -> list:
+        p = self.to_pandas()
+        return p.to_list()
+
     @typechecked
     def value_counts(self, sort: bool = True) -> Series:
         """Return a Series containing counts of unique values.
diff --git a/pytest_PROTO.ini b/pytest_PROTO.ini
index 68cbf3e01e..afdb98cf87 100644
--- a/pytest_PROTO.ini
+++ b/pytest_PROTO.ini
@@ -2,7 +2,7 @@
 addopts =
     --benchmark-disable
     --benchmark-skip
-    --size=5
+    --size=100
 filterwarnings =
     ignore:Version mismatch between client .*
 testpaths =

From 41688695a6de81557d2a56fb358e979f30b6ce44 Mon Sep 17 00:00:00 2001
From: Ethan-DeBandi99 <16845933+Ethan-DeBandi99@users.noreply.github.com>
Date: Mon, 17 Jul 2023 10:08:37 -0400
Subject: [PATCH 4/8] Correcting formatting errors.

---
 PROTO_tests/tests/dataframe_test.py | 185 ++++++++++++++++------------
 arkouda/pdarraycreation.py          |   1 -
 2 files changed, 109 insertions(+), 77 deletions(-)

diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py
index 6a0b50f133..0b4636ff4c 100644
--- a/PROTO_tests/tests/dataframe_test.py
+++ b/PROTO_tests/tests/dataframe_test.py
@@ -1,13 +1,12 @@
-import arkouda as ak
-import pandas as pd
-from pandas.testing import assert_frame_equal, assert_series_equal
+import os
+import tempfile
+
 import numpy as np
+import pandas as pd
 import pytest
-import random
-import string
-import tempfile
-import glob
-import os
+from pandas.testing import assert_frame_equal, assert_series_equal
+
+import arkouda as ak
 from arkouda import io_util
 
 
@@ -22,9 +21,16 @@ def build_ak_df():
         item = ak.array([0, 0, 1, 1, 2, 0])
         day = ak.array([5, 5, 6, 5, 6, 6])
         amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6])
-        bi = ak.arange(2 ** 200, 2 ** 200 + 6)
+        bi = ak.arange(2**200, 2**200 + 6)
         return ak.DataFrame(
-            {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi}
+            {
+                "userName": username,
+                "userID": userid,
+                "item": item,
+                "day": day,
+                "amount": amount,
+                "bi": bi,
+            }
         )
 
     @staticmethod
@@ -34,9 +40,16 @@ def build_pd_df():
         item = [0, 0, 1, 1, 2, 0]
         day = [5, 5, 6, 5, 6, 6]
         amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6]
-        bi = [2 ** 200, 2 ** 200 + 1, 2 ** 200 + 2, 2 ** 200 + 3, 2 ** 200 + 4, 2 ** 200 + 5]
+        bi = [2**200, 2**200 + 1, 2**200 + 2, 2**200 + 3, 2**200 + 4, 2**200 + 5]
         return pd.DataFrame(
-            {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi}
+            {
+                "userName": username,
+                "userID": userid,
+                "item": item,
+                "day": day,
+                "amount": amount,
+                "bi": bi,
+            }
         )
 
     @staticmethod
@@ -62,9 +75,16 @@ def build_ak_append():
         item = ak.array([0, 2])
         day = ak.array([1, 2])
         amount = ak.array([0.5, 5.1])
-        bi = ak.array([2 ** 200 + 6, 2 ** 200 + 7])
+        bi = ak.array([2**200 + 6, 2**200 + 7])
         return ak.DataFrame(
-            {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi}
+            {
+                "userName": username,
+                "userID": userid,
+                "item": item,
+                "day": day,
+                "amount": amount,
+                "bi": bi,
+            }
         )
 
     @staticmethod
@@ -75,17 +95,24 @@ def build_pd_df_append():
         day = [5, 5, 6, 5, 6, 6, 1, 2]
         amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6, 0.5, 5.1]
         bi = [
-            2 ** 200,
-            2 ** 200 + 1,
-            2 ** 200 + 2,
-            2 ** 200 + 3,
-            2 ** 200 + 4,
-            2 ** 200 + 5,
-            2 ** 200 + 6,
-            2 ** 200 + 7,
+            2**200,
+            2**200 + 1,
+            2**200 + 2,
+            2**200 + 3,
+            2**200 + 4,
+            2**200 + 5,
+            2**200 + 6,
+            2**200 + 7,
         ]
         return pd.DataFrame(
-            {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi}
+            {
+                "userName": username,
+                "userID": userid,
+                "item": item,
+                "day": day,
+                "amount": amount,
+                "bi": bi,
+            }
         )
 
     @staticmethod
@@ -101,9 +128,16 @@ def build_ak_typeerror():
         item = ak.array([0, 0, 1, 1, 2, 0])
         day = ak.array([5, 5, 6, 5, 6, 6])
         amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6])
-        bi = ak.arange(2 ** 200, 2 ** 200 + 6)
+        bi = ak.arange(2**200, 2**200 + 6)
         return ak.DataFrame(
-            {"userName": username, "userID": userid, "item": item, "day": day, "amount": amount, "bi": bi}
+            {
+                "userName": username,
+                "userID": userid,
+                "item": item,
+                "day": day,
+                "amount": amount,
+                "bi": bi,
+            }
         )
 
     @pytest.mark.parametrize("size", pytest.prob_size)
@@ -114,26 +148,30 @@ def test_dataframe_creation(self, size):
         assert df.empty
 
         # Validation of Creation from Pandas
-        pddf = pd.DataFrame({
-            "int": np.arange(size),
-            "uint": np.random.randint(0, size/2, size, dtype=np.uint64),
-            "bigint": np.arange(2**200, 2**200+size),
-            "bool": np.random.randint(0, 1, size=size, dtype=bool),
-            "segarray": [np.random.randint(0, size / 2, 2) for i in range(size)]
-        })
+        pddf = pd.DataFrame(
+            {
+                "int": np.arange(size),
+                "uint": np.random.randint(0, size / 2, size, dtype=np.uint64),
+                "bigint": np.arange(2**200, 2**200 + size),
+                "bool": np.random.randint(0, 2, size=size, dtype=bool),
+                "segarray": [np.random.randint(0, size / 2, 2) for i in range(size)],
+            }
+        )
         akdf = ak.DataFrame(pddf)
         assert isinstance(akdf, ak.DataFrame)
         assert len(akdf) == size
         assert_frame_equal(pddf, akdf.to_pandas())
 
         # validation of creation from dictionary
-        akdf = ak.DataFrame({
-            "int": ak.arange(size),
-            "uint": ak.array(pddf["uint"]),
-            "bigint": ak.arange(2 ** 200, 2 ** 200 + size),
-            "bool": ak.array(pddf["bool"]),
-            "segarray": ak.SegArray.from_multi_array([ak.array(x) for x in pddf["segarray"]])
-        })
+        akdf = ak.DataFrame(
+            {
+                "int": ak.arange(size),
+                "uint": ak.array(pddf["uint"]),
+                "bigint": ak.arange(2**200, 2**200 + size),
+                "bool": ak.array(pddf["bool"]),
+                "segarray": ak.SegArray.from_multi_array([ak.array(x) for x in pddf["segarray"]]),
+            }
+        )
         assert isinstance(akdf, ak.DataFrame)
         assert len(akdf) == size
 
@@ -146,8 +184,8 @@ def test_dataframe_creation(self, size):
             np.random.randint(5, 10, size),
         ]
         pddf = pd.DataFrame(x)
-        l = [ak.array(val) for val in list(zip(x[0], x[1], x[2]))]
-        akdf = ak.DataFrame(l)
+        l_cols = [ak.array(val) for val in list(zip(x[0], x[1], x[2]))]
+        akdf = ak.DataFrame(l_cols)
         assert isinstance(akdf, ak.DataFrame)
         assert len(akdf) == len(pddf)
         # arkouda does not allow for numeric columns.
@@ -195,7 +233,11 @@ def test_column_indexing(self):
         assert df.index.to_list() == ref_df.index.to_list()
 
         # column validation [] and . access
-        for cname, col, ref_col in zip(df.columns, [df.userName, df.userID, df.item, df.day, df.amount, df.bi], [ref_df.userName, ref_df.userID, ref_df.item, ref_df.day, ref_df.amount, ref_df.bi]):
+        for cname, col, ref_col in zip(
+            df.columns,
+            [df.userName, df.userID, df.item, df.day, df.amount, df.bi],
+            [ref_df.userName, ref_df.userID, ref_df.item, ref_df.day, ref_df.amount, ref_df.bi],
+        ):
             assert isinstance(col, ak.Series)
             assert col.to_list() == ref_col.to_list()
             assert isinstance(df[cname], (ak.pdarray, ak.Strings, ak.Categorical))
@@ -227,7 +269,9 @@ def test_dtype_prop(self):
         assert len(akdf.columns) == len(akdf.dtypes)
         # dtypes returns objType for categorical, segarray. We should probably fix
         # this and add a df.objTypes property. pdarrays return actual dtype
-        for ref_type, c in zip(["int64", "int64", "int64", "str", "Categorical", "SegArray", "bigint"], akdf.columns):
+        for ref_type, c in zip(
+            ["int64", "int64", "int64", "str", "Categorical", "SegArray", "bigint"], akdf.columns
+        ):
             assert ref_type == str(akdf.dtypes[c])
 
     def test_from_pandas(self):
@@ -608,7 +652,8 @@ def test_head_tail_resetting_index(self):
 
         bool_idx = df[df["cnt"] > 3]
         bool_idx.__repr__()
-        # the new index is first False and rest True (because we lose first 4), so equivalent to arange(61, bool)
+        # the new index is first False and rest True (because we lose first 4),
+        # so equivalent to arange(61, bool)
         assert bool_idx.index.index.to_list() == ak.arange(61, dtype=bool).to_list()
 
         slice_idx = df[:]
@@ -617,55 +662,43 @@ def test_head_tail_resetting_index(self):
 
     def test_ipv4_columns(self):
         # test with single IPv4 column
-        df = ak.DataFrame({
-            'a': ak.arange(10),
-            'b': ak.IPv4(ak.arange(10))
-        })
+        df = ak.DataFrame({"a": ak.arange(10), "b": ak.IPv4(ak.arange(10))})
         with tempfile.TemporaryDirectory(dir=TestDataFrame.df_test_base_tmp) as tmp_dirname:
             fname = tmp_dirname + "/ipv4_df"
             df.to_parquet(fname)
 
-            data = ak.read(fname+"*")
-            rddf = ak.DataFrame({
-                'a': data['a'],
-                'b': ak.IPv4(data['b'])
-            })
+            data = ak.read(fname + "*")
+            rddf = ak.DataFrame({"a": data["a"], "b": ak.IPv4(data["b"])})
 
             assert_frame_equal(df.to_pandas(), rddf.to_pandas())
 
         # test with multiple
-        df = ak.DataFrame({
-            'a': ak.IPv4(ak.arange(10)),
-            'b': ak.IPv4(ak.arange(10))
-        })
+        df = ak.DataFrame({"a": ak.IPv4(ak.arange(10)), "b": ak.IPv4(ak.arange(10))})
         with tempfile.TemporaryDirectory(dir=TestDataFrame.df_test_base_tmp) as tmp_dirname:
             fname = tmp_dirname + "/ipv4_df"
             df.to_parquet(fname)
 
             data = ak.read(fname + "*")
-            rddf = ak.DataFrame({
-                'a': ak.IPv4(data['a']),
-                'b': ak.IPv4(data['b'])
-            })
+            rddf = ak.DataFrame({"a": ak.IPv4(data["a"]), "b": ak.IPv4(data["b"])})
 
             assert_frame_equal(df.to_pandas(), rddf.to_pandas())
 
         # test replacement of IPv4 with uint representation
-        df = ak.DataFrame({
-            'a': ak.IPv4(ak.arange(10))
-        })
-        df['a'] = df['a'].export_uint()
-        assert ak.arange(10).to_list() == df['a'].to_list()
+        df = ak.DataFrame({"a": ak.IPv4(ak.arange(10))})
+        df["a"] = df["a"].export_uint()
+        assert ak.arange(10).to_list() == df["a"].to_list()
 
     def test_subset(self):
-        df = ak.DataFrame({
-            'a': ak.arange(100),
-            'b': ak.randint(0, 20, 100),
-            'c': ak.random_strings_uniform(0, 16, 100),
-            'd': ak.randint(25, 75, 100)
-        })
-        df2 = df[['a', 'b']]
-        assert ['a', 'b'] == df2.columns
+        df = ak.DataFrame(
+            {
+                "a": ak.arange(100),
+                "b": ak.randint(0, 20, 100),
+                "c": ak.random_strings_uniform(0, 16, 100),
+                "d": ak.randint(25, 75, 100),
+            }
+        )
+        df2 = df[["a", "b"]]
+        assert ["a", "b"] == df2.columns
         assert df.index.to_list() == df2.index.to_list()
-        assert df['a'].to_list() == df2['a'].to_list()
-        assert df['b'].to_list() == df2['b'].to_list()
+        assert df["a"].to_list() == df2["a"].to_list()
+        assert df["b"].to_list() == df2["b"].to_list()
diff --git a/arkouda/pdarraycreation.py b/arkouda/pdarraycreation.py
index 0d25bde0e4..4450fc5632 100755
--- a/arkouda/pdarraycreation.py
+++ b/arkouda/pdarraycreation.py
@@ -241,7 +241,6 @@ def array(
         else:
             raise TypeError("Must be an iterable or have a numeric DType")
 
-
     # Check if array of strings
     # if a.dtype == numpy.object_ need to check first element
     if "U" in a.dtype.kind or (a.dtype == np.object_ and isinstance(a[0], str)):

From 3bcfbe228bdd9b434764b96bfa8f3ca61ccd7c5d Mon Sep 17 00:00:00 2001
From: Ethan-DeBandi99 <16845933+Ethan-DeBandi99@users.noreply.github.com>
Date: Wed, 19 Jul 2023 11:57:49 -0400
Subject: [PATCH 5/8] Addressing review feedback

---
 PROTO_tests/tests/dataframe_test.py | 104 +++++++++-------------------
 1 file changed, 32 insertions(+), 72 deletions(-)

diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py
index 0b4636ff4c..bb9097ba5e 100644
--- a/PROTO_tests/tests/dataframe_test.py
+++ b/PROTO_tests/tests/dataframe_test.py
@@ -14,25 +14,6 @@ class TestDataFrame:
     df_test_base_tmp = "{}/df_test".format(os.getcwd())
     io_util.get_directory(df_test_base_tmp)
 
-    @staticmethod
-    def build_ak_df():
-        username = ak.array(["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"])
-        userid = ak.array([111, 222, 111, 333, 222, 111])
-        item = ak.array([0, 0, 1, 1, 2, 0])
-        day = ak.array([5, 5, 6, 5, 6, 6])
-        amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6])
-        bi = ak.arange(2**200, 2**200 + 6)
-        return ak.DataFrame(
-            {
-                "userName": username,
-                "userID": userid,
-                "item": item,
-                "day": day,
-                "amount": amount,
-                "bi": bi,
-            }
-        )
-
     @staticmethod
     def build_pd_df():
         username = ["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"]
@@ -40,7 +21,8 @@ def build_pd_df():
         item = [0, 0, 1, 1, 2, 0]
         day = [5, 5, 6, 5, 6, 6]
         amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6]
-        bi = [2**200, 2**200 + 1, 2**200 + 2, 2**200 + 3, 2**200 + 4, 2**200 + 5]
+        bi = [2 ** 200, 2 ** 200 + 1, 2 ** 200 + 2, 2 ** 200 + 3, 2 ** 200 + 4, 2 ** 200 + 5]
+        ui = np.arange(6).astype(ak.uint64)
         return pd.DataFrame(
             {
                 "userName": username,
@@ -49,16 +31,13 @@ def build_pd_df():
                 "day": day,
                 "amount": amount,
                 "bi": bi,
+                "ui": ui
             }
         )
 
     @staticmethod
-    def build_ak_df_duplicates():
-        username = ak.array(["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"])
-        userid = ak.array([111, 222, 111, 333, 222, 111])
-        item = ak.array([0, 1, 0, 2, 1, 0])
-        day = ak.array([5, 5, 5, 5, 5, 5])
-        return ak.DataFrame({"userName": username, "userID": userid, "item": item, "day": day})
+    def build_ak_df():
+        return ak.DataFrame(TestDataFrame.build_pd_df())
 
     @staticmethod
     def build_pd_df_duplicates():
@@ -68,6 +47,10 @@ def build_pd_df_duplicates():
         day = [5, 5, 5, 5, 5, 5]
         return pd.DataFrame({"userName": username, "userID": userid, "item": item, "day": day})
 
+    @staticmethod
+    def build_ak_df_duplicates():
+        return ak.DataFrame(TestDataFrame.build_pd_df_duplicates())
+
     @staticmethod
     def build_ak_append():
         username = ak.array(["John", "Carol"])
@@ -76,6 +59,7 @@ def build_ak_append():
         day = ak.array([1, 2])
         amount = ak.array([0.5, 5.1])
         bi = ak.array([2**200 + 6, 2**200 + 7])
+        ui = ak.array([6, 7], dtype=ak.uint64)
         return ak.DataFrame(
             {
                 "userName": username,
@@ -84,6 +68,7 @@ def build_ak_append():
                 "day": day,
                 "amount": amount,
                 "bi": bi,
+                "ui": ui
             }
         )
 
@@ -94,16 +79,8 @@ def build_pd_df_append():
         item = [0, 0, 1, 1, 2, 0, 0, 2]
         day = [5, 5, 6, 5, 6, 6, 1, 2]
         amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6, 0.5, 5.1]
-        bi = [
-            2**200,
-            2**200 + 1,
-            2**200 + 2,
-            2**200 + 3,
-            2**200 + 4,
-            2**200 + 5,
-            2**200 + 6,
-            2**200 + 7,
-        ]
+        bi = (np.arange(8) + 2**200).tolist()
+        ui = np.arange(8).astype(ak.uint64)
         return pd.DataFrame(
             {
                 "userName": username,
@@ -112,6 +89,7 @@ def build_pd_df_append():
                 "day": day,
                 "amount": amount,
                 "bi": bi,
+                "ui": ui
             }
         )
 
@@ -129,6 +107,7 @@ def build_ak_typeerror():
         day = ak.array([5, 5, 6, 5, 6, 6])
         amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6])
         bi = ak.arange(2**200, 2**200 + 6)
+        ui = ak.arange(6, dtype=ak.uint64)
         return ak.DataFrame(
             {
                 "userName": username,
@@ -137,6 +116,7 @@ def build_ak_typeerror():
                 "day": day,
                 "amount": amount,
                 "bi": bi,
+                "ui": ui
             }
         )
 
@@ -184,8 +164,7 @@ def test_dataframe_creation(self, size):
             np.random.randint(5, 10, size),
         ]
         pddf = pd.DataFrame(x)
-        l_cols = [ak.array(val) for val in list(zip(x[0], x[1], x[2]))]
-        akdf = ak.DataFrame(l_cols)
+        akdf = ak.DataFrame([ak.array(val) for val in list(zip(*x))])
         assert isinstance(akdf, ak.DataFrame)
         assert len(akdf) == len(pddf)
         # arkouda does not allow for numeric columns.
@@ -233,11 +212,13 @@ def test_column_indexing(self):
         assert df.index.to_list() == ref_df.index.to_list()
 
         # column validation [] and . access
-        for cname, col, ref_col in zip(
-            df.columns,
-            [df.userName, df.userID, df.item, df.day, df.amount, df.bi],
-            [ref_df.userName, ref_df.userID, ref_df.item, ref_df.day, ref_df.amount, ref_df.bi],
-        ):
+        # for cname, col, ref_col in zip(
+        #     df.columns,
+        #     [df.userName, df.userID, df.item, df.day, df.amount, df.bi],
+        #     [ref_df.userName, ref_df.userID, ref_df.item, ref_df.day, ref_df.amount, ref_df.bi],
+        # ):
+        for cname in df.columns:
+            col, ref_col = getattr(df, cname), getattr(ref_df, cname)
             assert isinstance(col, ak.Series)
             assert col.to_list() == ref_col.to_list()
             assert isinstance(df[cname], (ak.pdarray, ak.Strings, ak.Categorical))
@@ -261,7 +242,7 @@ def test_dtype_prop(self):
             "c_1": ak.arange(3, 6, 1),
             "c_2": ak.arange(6, 9, 1),
             "c_3": str_arr,
-            "c_4": ak.Categorical(str_arr),
+            "c_4": ak.Categorical(ak.array(["str"] * 3)),
             "c_5": ak.SegArray(ak.array([0, 9, 14]), ak.arange(20)),
             "c_6": ak.arange(2**200, 2**200 + 3),
         }
@@ -274,11 +255,6 @@ def test_dtype_prop(self):
         ):
             assert ref_type == str(akdf.dtypes[c])
 
-    def test_from_pandas(self):
-        ref_df = self.build_pd_df()
-        df = ak.DataFrame(ref_df)
-        assert_frame_equal(ref_df, df.to_pandas())
-
     def test_drop(self):
         # create an arkouda df.
         df = self.build_ak_df()
@@ -299,7 +275,7 @@ def test_drop(self):
         df.drop("userName", axis=1, inplace=True)
         pd_df.drop(labels=["userName"], axis=1, inplace=True)
 
-        assert_frame_equal(pddf_drop, df_drop.to_pandas())
+        assert_frame_equal(pd_df, df.to_pandas())
 
         # Test dropping rows
         df.drop([0, 2, 5], inplace=True)
@@ -307,7 +283,7 @@ def test_drop(self):
         pd_df.drop(labels=[0, 2, 5], inplace=True)
         pd_df.reset_index(drop=True, inplace=True)
 
-        assert_frame_equal(pddf_drop, df_drop.to_pandas())
+        assert_frame_equal(pd_df, df.to_pandas())
 
         # verify that index keys must be ints
         with pytest.raises(TypeError):
@@ -336,7 +312,7 @@ def test_shape(self):
 
         row, col = df.shape
         assert row == 6
-        assert col == 6
+        assert col == 7
 
     def test_reset_index(self):
         df = self.build_ak_df()
@@ -391,9 +367,8 @@ def test_rename(self):
 
     def test_append(self):
         df = self.build_ak_df()
-        df_toappend = self.build_ak_append()
 
-        df.append(df_toappend)
+        df.append(self.build_ak_append())
 
         ref_df = self.build_pd_df_append()
 
@@ -401,6 +376,7 @@ def test_append(self):
         assert_frame_equal(ref_df, df.to_pandas())
 
         idx = np.arange(8)
+        print(type(df.index.index))
         assert idx.tolist() == df.index.index.to_list()
 
         df_keyerror = self.build_ak_keyerror()
@@ -413,9 +389,8 @@ def test_append(self):
 
     def test_concat(self):
         df = self.build_ak_df()
-        df_toappend = self.build_ak_append()
 
-        glued = ak.DataFrame.concat([df, df_toappend])
+        glued = ak.DataFrame.concat([df, self.build_ak_append()])
 
         ref_df = self.build_pd_df_append()
 
@@ -475,22 +450,7 @@ def test_groupby_standard(self):
         assert_series_equal(pds, s.to_pandas())
 
     def test_gb_series(self):
-        username = ak.array(["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"])
-        userid = ak.array([111, 222, 111, 333, 222, 111])
-        item = ak.array([0, 0, 1, 1, 2, 0])
-        day = ak.array([5, 5, 6, 5, 6, 6])
-        amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6])
-        bi = ak.arange(2**200, 2**200 + 6)
-        df = ak.DataFrame(
-            {
-                "userName": username,
-                "userID": userid,
-                "item": item,
-                "day": day,
-                "amount": amount,
-                "bi": bi,
-            }
-        )
+        df = self.build_ak_df()
 
         gb = df.GroupBy("userName", use_series=True)
 

From 169f0d0bb1be0571b9a41776946b94443e7981a8 Mon Sep 17 00:00:00 2001
From: Ethan-DeBandi99 <16845933+Ethan-DeBandi99@users.noreply.github.com>
Date: Thu, 20 Jul 2023 08:01:16 -0400
Subject: [PATCH 6/8] Address comments from Pierce

---
 PROTO_tests/tests/dataframe_test.py | 55 +++++++++--------------------
 1 file changed, 16 insertions(+), 39 deletions(-)

diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py
index bb9097ba5e..4547ad0add 100644
--- a/PROTO_tests/tests/dataframe_test.py
+++ b/PROTO_tests/tests/dataframe_test.py
@@ -22,7 +22,7 @@ def build_pd_df():
         day = [5, 5, 6, 5, 6, 6]
         amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6]
         bi = [2 ** 200, 2 ** 200 + 1, 2 ** 200 + 2, 2 ** 200 + 3, 2 ** 200 + 4, 2 ** 200 + 5]
-        ui = np.arange(6).astype(ak.uint64)
+        ui = (np.arange(6).astype(ak.uint64)) + 2**63
         return pd.DataFrame(
             {
                 "userName": username,
@@ -59,7 +59,7 @@ def build_ak_append():
         day = ak.array([1, 2])
         amount = ak.array([0.5, 5.1])
         bi = ak.array([2**200 + 6, 2**200 + 7])
-        ui = ak.array([6, 7], dtype=ak.uint64)
+        ui = ak.array([6, 7], dtype=ak.uint64) + 2**63
         return ak.DataFrame(
             {
                 "userName": username,
@@ -80,7 +80,7 @@ def build_pd_df_append():
         day = [5, 5, 6, 5, 6, 6, 1, 2]
         amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6, 0.5, 5.1]
         bi = (np.arange(8) + 2**200).tolist()
-        ui = np.arange(8).astype(ak.uint64)
+        ui = (np.arange(8).astype(ak.uint64)) + 2**63
         return pd.DataFrame(
             {
                 "userName": username,
@@ -107,7 +107,7 @@ def build_ak_typeerror():
         day = ak.array([5, 5, 6, 5, 6, 6])
         amount = ak.array([0.5, 0.6, 1.1, 1.2, 4.3, 0.6])
         bi = ak.arange(2**200, 2**200 + 6)
-        ui = ak.arange(6, dtype=ak.uint64)
+        ui = ak.arange(6, dtype=ak.uint64) + 2**63
         return ak.DataFrame(
             {
                 "userName": username,
@@ -188,6 +188,13 @@ def test_client_type_creation(self):
             {"fields": f.to_list(), "ip": ip.to_list(), "date": pd_d, "bitvector": bv.to_list()}
         )
         assert_frame_equal(pddf, df.to_pandas())
+
+        # validate that setting display.max_rows adjusts the repr properly
+        shape = f"({df._shape_str()})".replace("(", "[").replace(")", "]")
+        pd.set_option("display.max_rows", 4)
+        s = df.__repr__().replace(f" ({df._shape_str()})", f"\n\n{shape}")
+        assert s == pddf.__repr__()
+
         pddf = pd.DataFrame({"a": list(range(1000)), "b": list(range(1000))})
         pddf["a"] = pddf["a"].apply(lambda x: "AA" + str(x))
         pddf["b"] = pddf["b"].apply(lambda x: "BB" + str(x))
@@ -195,6 +202,11 @@ def test_client_type_creation(self):
         df = ak.DataFrame(pddf)
         assert_frame_equal(pddf, df.to_pandas())
 
+        pd.set_option("display.max_rows", 10)
+        shape = f"({df._shape_str()})".replace("(", "[").replace(")", "]")
+        s = df.__repr__().replace(f" ({df._shape_str()})", f"\n\n{shape}")
+        assert s == pddf.__repr__()
+
     def test_boolean_indexing(self):
         df = self.build_ak_df()
         ref_df = self.build_pd_df()
@@ -211,12 +223,6 @@ def test_column_indexing(self):
         assert isinstance(df.index, ak.Index)
         assert df.index.to_list() == ref_df.index.to_list()
 
-        # column validation [] and . access
-        # for cname, col, ref_col in zip(
-        #     df.columns,
-        #     [df.userName, df.userID, df.item, df.day, df.amount, df.bi],
-        #     [ref_df.userName, ref_df.userID, ref_df.item, ref_df.day, ref_df.amount, ref_df.bi],
-        # ):
         for cname in df.columns:
             col, ref_col = getattr(df, cname), getattr(ref_df, cname)
             assert isinstance(col, ak.Series)
@@ -376,7 +382,6 @@ def test_append(self):
         assert_frame_equal(ref_df, df.to_pandas())
 
         idx = np.arange(8)
-        print(type(df.index.index))
         assert idx.tolist() == df.index.index.to_list()
 
         df_keyerror = self.build_ak_keyerror()
@@ -620,34 +625,6 @@ def test_head_tail_resetting_index(self):
         slice_idx.__repr__()
         assert slice_idx.index.index.to_list() == idx.to_list()
 
-    def test_ipv4_columns(self):
-        # test with single IPv4 column
-        df = ak.DataFrame({"a": ak.arange(10), "b": ak.IPv4(ak.arange(10))})
-        with tempfile.TemporaryDirectory(dir=TestDataFrame.df_test_base_tmp) as tmp_dirname:
-            fname = tmp_dirname + "/ipv4_df"
-            df.to_parquet(fname)
-
-            data = ak.read(fname + "*")
-            rddf = ak.DataFrame({"a": data["a"], "b": ak.IPv4(data["b"])})
-
-            assert_frame_equal(df.to_pandas(), rddf.to_pandas())
-
-        # test with multiple
-        df = ak.DataFrame({"a": ak.IPv4(ak.arange(10)), "b": ak.IPv4(ak.arange(10))})
-        with tempfile.TemporaryDirectory(dir=TestDataFrame.df_test_base_tmp) as tmp_dirname:
-            fname = tmp_dirname + "/ipv4_df"
-            df.to_parquet(fname)
-
-            data = ak.read(fname + "*")
-            rddf = ak.DataFrame({"a": ak.IPv4(data["a"]), "b": ak.IPv4(data["b"])})
-
-            assert_frame_equal(df.to_pandas(), rddf.to_pandas())
-
-        # test replacement of IPv4 with uint representation
-        df = ak.DataFrame({"a": ak.IPv4(ak.arange(10))})
-        df["a"] = df["a"].export_uint()
-        assert ak.arange(10).to_list() == df["a"].to_list()
-
     def test_subset(self):
         df = ak.DataFrame(
             {

From a8ba48dc2fd67cd9bb10766de0bf292d45af5fa9 Mon Sep 17 00:00:00 2001
From: Ethan-DeBandi99 <16845933+Ethan-DeBandi99@users.noreply.github.com>
Date: Thu, 20 Jul 2023 12:00:02 -0400
Subject: [PATCH 7/8] Update PROTO_tests/tests/dataframe_test.py

Co-authored-by: pierce <48131946+pierce314159@users.noreply.github.com>
---
 PROTO_tests/tests/dataframe_test.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py
index 4547ad0add..5d7ecaec7f 100644
--- a/PROTO_tests/tests/dataframe_test.py
+++ b/PROTO_tests/tests/dataframe_test.py
@@ -11,8 +11,6 @@
 
 
 class TestDataFrame:
-    df_test_base_tmp = "{}/df_test".format(os.getcwd())
-    io_util.get_directory(df_test_base_tmp)
 
     @staticmethod
     def build_pd_df():

From c3e22ce3c8f283e494962cf8e8827ad4a497854c Mon Sep 17 00:00:00 2001
From: pierce <48131946+pierce314159@users.noreply.github.com>
Date: Thu, 20 Jul 2023 13:11:56 -0400
Subject: [PATCH 8/8] Update PROTO_tests/tests/dataframe_test.py

---
 PROTO_tests/tests/dataframe_test.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py
index 5d7ecaec7f..4c2b8d84a4 100644
--- a/PROTO_tests/tests/dataframe_test.py
+++ b/PROTO_tests/tests/dataframe_test.py
@@ -1,13 +1,9 @@
-import os
-import tempfile
-
 import numpy as np
 import pandas as pd
 import pytest
 from pandas.testing import assert_frame_equal, assert_series_equal
 
 import arkouda as ak
-from arkouda import io_util
 
 
 class TestDataFrame: