From 1a0f77b8460eaafbdfd0f776cb1240606956ee9e Mon Sep 17 00:00:00 2001
From: Brandon Neth
Date: Wed, 12 Jun 2024 14:26:41 -0500
Subject: [PATCH 1/5] Revert "missed an iloc call that was affecting benchmark
 performance"

This reverts commit d664ac9e7327b532f9cc408cb4b4a650a19110c9.
---
 arkouda/dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arkouda/dataframe.py b/arkouda/dataframe.py
index 00503b1b9a..f23840052d 100644
--- a/arkouda/dataframe.py
+++ b/arkouda/dataframe.py
@@ -1211,7 +1211,7 @@ def _get_head_tail(self):
             if isinstance(self[col], Categorical):
                 newdf[col] = self[col].categories[self[col].codes[idx]]
             else:
-                newdf[col] = self[col].iloc[idx]
+                newdf[col] = self[col][idx]
         newdf._set_index(self.index.index[idx])
         return newdf.to_pandas(retain_index=True)

From a2f00ed5efa3d7f28b7c803d9508061807d3106f Mon Sep 17 00:00:00 2001
From: Brandon Neth
Date: Wed, 12 Jun 2024 14:26:43 -0500
Subject: [PATCH 2/5] Revert "missing iloc usage"

This reverts commit 31d3185621481b21195f139b870a2dd1ae44c3cb.
---
 arkouda/dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arkouda/dataframe.py b/arkouda/dataframe.py
index f23840052d..1f51bc0bdd 100644
--- a/arkouda/dataframe.py
+++ b/arkouda/dataframe.py
@@ -6274,7 +6274,7 @@ def _inner_join_merge(
         new_dict[new_col] = left[col].iloc[left_inds]
     for col in right_cols:
         new_col = col + right_suffix if col in col_intersect else col
-        new_dict[new_col] = right[col].iloc[right_inds]
+        new_dict[new_col] = right[col][right_inds]
     ret_df = DataFrame(new_dict)

     if sort is True:

From dfe2b89642e6eb171ec99f3e57dcf37086e9be00 Mon Sep 17 00:00:00 2001
From: Brandon Neth
Date: Wed, 12 Jun 2024 15:27:21 -0500
Subject: [PATCH 3/5] stragglers for reverting dataframe indexing
---
 PROTO_tests/tests/dataframe_test.py | 960 +--------------------------
 arkouda/dataframe.py                | 976 +++++-----------------------
 tests/dataframe_test.py             | 681 +-------------------
 tests/io_test.py                    |   6 +-
 tests/parquet_test.py               |  14 +-
 tests/segarray_test.py              |   2 +-
 tests/series_test.py                |   9 -
 tests/symbol_table_test.py          |   2 +-
 8 files changed, 220 insertions(+), 2430 deletions(-)

diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py
index c7018ade0c..31e020175d 100644
--- a/PROTO_tests/tests/dataframe_test.py
+++ b/PROTO_tests/tests/dataframe_test.py
@@ -318,7 +318,7 @@ def test_column_indexing(self):
             col, ref_col = getattr(df, cname), getattr(ref_df, cname)
             assert isinstance(col, ak.Series)
             assert col.to_list() == ref_col.to_list()
-            assert isinstance(df[cname].values, (ak.pdarray, ak.Strings, ak.Categorical))
+            assert isinstance(df[cname], (ak.pdarray, ak.Strings, ak.Categorical))
            assert df[cname].to_list() == ref_df[cname].to_list()

         # check mult-column list
@@ -326,6 +326,12 @@ def test_column_indexing(self):
         assert isinstance(df[col_list], ak.DataFrame)
         assert_frame_equal(df[col_list].to_pandas(), ref_df[col_list])

+        # check multi-column tuple
+        col_tup = ("userID", "item", "day", "bi")
+        assert isinstance(df[col_tup], ak.DataFrame)
+        # pandas only supports lists of columns, not tuples
+        assert_frame_equal(df[col_tup].to_pandas(), ref_df[list(col_tup)])
+
     def test_dtype_prop(self):
         str_arr = ak.random_strings_uniform(1, 5, 3)
         df_dict = {
@@ -410,7 +416,7 @@ def test_shape(self):
     def test_reset_index(self):
         df = self.build_ak_df()

-        slice_df = df.loc[ak.array([1, 3, 5])]
+        slice_df = df[ak.array([1, 3, 5])]
         assert slice_df.index.to_list() == [1, 3, 5]

         df_reset = slice_df.reset_index()
@@ -813,7 +819,7 @@ def test_sort_index(self):
                 "t",
             ]
         )
-        ak_df["negs"] = -1 * ak_df["int64"].values
+        ak_df["negs"] = -1 * ak_df["int64"]

         group_bys = [
             "gb_id",
@@ -861,7 +867,7 @@ def test_apply_perm(self):
         default_perm = ak.array(perm_list)
         ord.apply_permutation(default_perm)

-        ord_ref = ref_df.sort_values(by="userID")
+        ord_ref = ref_df.sort_values(by="userID").reset_index(drop=True)
         ord_ref = ord_ref.reindex(perm_list).reset_index(drop=True)
         assert_frame_equal(ord_ref, ord.to_pandas())

@@ -1028,7 +1034,7 @@ def test_multi_col_merge(self):
                 for col in sorted_column_names:
                     from_ak = ak_merge[col].to_ndarray()
                     from_pd = pd_merge[col].to_numpy()
-                    if isinstance(ak_merge[col].values, ak.pdarray):
+                    if isinstance(ak_merge[col], ak.pdarray):
                         assert np.allclose(np.sort(from_ak), np.sort(from_pd), equal_nan=True)
                     else:
                         # we have to cast to str because pandas arrays converted to numpy
@@ -1215,950 +1221,6 @@ def test_to_markdown(self):
         )
         assert df.to_markdown(tablefmt="jira") == df.to_pandas().to_markdown(tablefmt="jira")

-    def test_column_init(self):
-        unlabeled_data = [[1, 2], [True, False], ["foo", "bar"], [2.3, -1.8]]
-        good_labels = ["one1", "two2", "three3", "four4"]
-        bad_labels1 = ["one", "two"]
-        bad_labels2 = good_labels + ["five"]
-
-        df = ak.DataFrame(unlabeled_data, columns=good_labels)
-        assert df.columns.values == good_labels
-        assert df["one1"][0] == 1
-        assert df["three3"][0] == "foo"
-        assert df["four4"][1] == -1.8
-
-        with pytest.raises(ValueError):
-            df = ak.DataFrame(unlabeled_data, columns=bad_labels1)
-        with pytest.raises(ValueError):
-            df = ak.DataFrame(unlabeled_data, columns=bad_labels2)
-        with pytest.raises(TypeError):
-            df = ak.DataFrame(unlabeled_data, columns=["one", "two", 3, "four"])
-
-    def test_from_pandas(self):
-        username = ["Alice", "Bob", "Alice", "Carol", "Bob", "Alice", "John", "Carol"]
-        userid = [111, 222, 111, 333, 222, 111, 444, 333]
-        item = [0, 0, 1, 1, 2, 0, 0, 2]
-        day = [5, 5, 6, 5, 6, 6, 1, 2]
-        amount = [0.5, 0.6, 1.1, 1.2, 4.3, 0.6, 0.5, 5.1]
-        bi = 2**200
-        bi_arr = [bi, bi + 1, bi + 2, bi + 3, bi + 4, bi + 5, bi + 6, bi + 7]
-        ref_df = pd.DataFrame(
-            {
-                "userName": username,
-                "userID": userid,
-                "item": item,
-                "day": day,
-                "amount": amount,
-                "bi": bi_arr,
-            }
-        )
-
-        df = ak.DataFrame(ref_df)
-        assert ((ref_df == df.to_pandas()).all()).all()
-
-        df = ak.DataFrame.from_pandas(ref_df)
-        assert ((ref_df == df.to_pandas()).all()).all()
-
-    def test_to_pandas(self):
-        df = self.build_ak_df()
-        pd_df = self.build_pd_df()
-
-        assert_frame_equal(pd_df, df.to_pandas())
-
-        slice_df = df.iloc[ak.array([1, 3, 5])]
-        pd_df = slice_df.to_pandas(retain_index=True)
-        assert pd_df.index.tolist() == [1, 3, 5]
-
-        pd_df = slice_df.to_pandas()
-        assert pd_df.index.tolist() == [0, 1, 2]
-
-    def test_merge(self):
-        df1 = ak.DataFrame(
-            {
-                "key": ak.arange(4),
-                "value1": ak.array(["A", "B", "C", "D"]),
-                "value3": ak.arange(4, dtype=ak.int64),
-            }
-        )
-
-        df2 = ak.DataFrame(
-            {
-                "key": ak.arange(2, 6, 1),
-                "value1": ak.array(["A", "B", "D", "F"]),
-                "value2": ak.array(["apple", "banana", "cherry", "date"]),
-                "value3": ak.ones(4, dtype=ak.int64),
-            }
-        )
-
-        ij_expected_df = ak.DataFrame(
-            {
-                "key": ak.array([2, 3]),
-                "value1_x": ak.array(["C", "D"]),
-                "value3_x": ak.array([2, 3]),
-                "value1_y": ak.array(["A", "B"]),
-                "value2": ak.array(["apple", "banana"]),
-                "value3_y": ak.array([1, 1]),
-            }
-        )
-
-        ij_merged_df = ak.merge(df1, df2, how="inner", on="key")
-
-        assert_frame_equal(
-            ij_expected_df.to_pandas(retain_index=True), ij_merged_df.to_pandas(retain_index=True)
-        )
-
-        rj_expected_df = ak.DataFrame(
-            {
-                "key": ak.array([2, 3, 4, 5]),
-                "value1_x": ak.array(["C", "D", "nan", "nan"]),
-                "value3_x": ak.array([2.0, 3.0, np.nan, np.nan]),
-                "value1_y": ak.array(["A", "B", "D", "F"]),
-                "value2": ak.array(["apple", "banana", "cherry", "date"]),
-                "value3_y": ak.array([1, 1, 1, 1]),
-            }
-        )
-
-        rj_merged_df = ak.merge(df1, df2, how="right", on="key")
-
-        assert_frame_equal(
-            rj_expected_df.to_pandas(retain_index=True), rj_merged_df.to_pandas(retain_index=True)
-        )
-
-        rj_merged_df2 = ak.merge(df1, df2, how="right", on="key", convert_ints=False)
-
-        assert rj_merged_df2.dtypes == {
-            "key": "int64",
-            "value1_x": "str",
-            "value3_x": "int64",
-            "value1_y": "str",
-            "value2": "str",
-            "value3_y": "int64",
-        }
-
-        lj_expected_df = ak.DataFrame(
-            {
-                "key": ak.array(
-                    [
-                        0,
-                        1,
-                        2,
-                        3,
-                    ]
-                ),
-                "value1_y": ak.array(
-                    [
-                        "nan",
-                        "nan",
-                        "A",
-                        "B",
-                    ]
-                ),
-                "value2": ak.array(
-                    [
-                        "nan",
-                        "nan",
-                        "apple",
-                        "banana",
-                    ]
-                ),
-                "value3_y": ak.array(
-                    [
-                        np.nan,
-                        np.nan,
-                        1.0,
-                        1.0,
-                    ]
-                ),
-                "value1_x": ak.array(
-                    [
-                        "A",
-                        "B",
-                        "C",
-                        "D",
-                    ]
-                ),
-                "value3_x": ak.array(
-                    [
-                        0,
-                        1,
-                        2,
-                        3,
-                    ]
-                ),
-            }
-        )
-
-        lj_merged_df = ak.merge(df1, df2, how="left", on="key")
-
-        assert_frame_equal(
-            lj_expected_df.to_pandas(retain_index=True), lj_merged_df.to_pandas(retain_index=True)
-        )
-
-        lj_merged_df2 = ak.merge(df1, df2, how="left", on="key", convert_ints=False)
-
-        assert lj_merged_df2.dtypes == {
-            "key": "int64",
-            "value1_y": "str",
-            "value2": "str",
-            "value3_y": "int64",
-            "value1_x": "str",
-            "value3_x": "int64",
-        }
-
-        oj_expected_df = ak.DataFrame(
-            {
-                "key": ak.array([0, 1, 2, 3, 4, 5]),
-                "value1_y": ak.array(["nan", "nan", "A", "B", "D", "F"]),
-                "value2": ak.array(["nan", "nan", "apple", "banana", "cherry", "date"]),
-                "value3_y": ak.array([np.nan, np.nan, 1.0, 1.0, 1.0, 1.0]),
-                "value1_x": ak.array(
-                    [
-                        "A",
-                        "B",
-                        "C",
-                        "D",
-                        "nan",
-                        "nan",
-                    ]
-                ),
-                "value3_x": ak.array([0.0, 1.0, 2.0, 3.0, np.nan, np.nan]),
-            }
-        )
-
-        oj_merged_df = ak.merge(df1, df2, how="outer", on="key")
-
-        assert_frame_equal(
-            oj_expected_df.to_pandas(retain_index=True), oj_merged_df.to_pandas(retain_index=True)
-        )
-
-        oj_merged_df2 = ak.merge(df1, df2, how="outer", on="key", convert_ints=False)
-
-        assert oj_merged_df2.dtypes == {
-            "key": "int64",
-            "value1_y": "str",
-            "value2": "str",
-            "value3_y": "int64",
-            "value1_x": "str",
-            "value3_x": "int64",
-        }
-
-    def test_ipv4_columns(self, df_test_base_tmp):
-        # test with single IPv4 column
-        df = ak.DataFrame({"a": ak.arange(10), "b": ak.IPv4(ak.arange(10))})
-        with tempfile.TemporaryDirectory(dir=df_test_base_tmp) as tmp_dirname:
-            fname = tmp_dirname + "/ipv4_df"
-            df.to_parquet(fname)
-
-            data = ak.read(fname + "*")
-            rddf = ak.DataFrame({"a": data["a"], "b": ak.IPv4(data["b"])})
-
-        assert df["a"].values.to_list() == rddf["a"].values.to_list()
-        assert df["b"].values.to_list() == rddf["b"].values.to_list()
-
-        # test with multiple
-        df = ak.DataFrame({"a": ak.IPv4(ak.arange(10)), "b": ak.IPv4(ak.arange(10))})
-        with tempfile.TemporaryDirectory(dir=df_test_base_tmp) as tmp_dirname:
-            fname = tmp_dirname + "/ipv4_df"
-            df.to_parquet(fname)
-
-            data = ak.read(fname + "*")
-            rddf = ak.DataFrame({"a": ak.IPv4(data["a"]), "b": ak.IPv4(data["b"])})
-
-        assert df["a"].values.to_list() == rddf["a"].values.to_list()
-        assert df["b"].values.to_list() == rddf["b"].values.to_list()
-
-        # test replacement of IPv4 with uint representation
-        df = ak.DataFrame({"a": ak.IPv4(ak.arange(10))})
-        df["a"] = df["a"].values.export_uint()
-        assert ak.arange(10).to_list() == df["a"].values.to_list()
-
-    def test_save(self, df_test_base_tmp):
-        i = list(range(3))
-        c1 = [9, 7, 17]
-        c2 = [2, 4, 6]
-        df_dict = {"i": ak.array(i), "c_1": ak.array(c1), "c_2": ak.array(c2)}
-
-        akdf = ak.DataFrame(df_dict)
-
-        validation_df = pd.DataFrame(
-            {
-                "i": i,
-                "c_1": c1,
-                "c_2": c2,
-            }
-        )
-        with tempfile.TemporaryDirectory(dir=df_test_base_tmp) as tmp_dirname:
-            akdf.to_parquet(f"{tmp_dirname}/testName")
-
-            ak_loaded = ak.DataFrame.load(f"{tmp_dirname}/testName")
-            assert_frame_equal(validation_df, ak_loaded[akdf.columns.values].to_pandas())
-
-            # test save with index true
-            akdf.to_parquet(f"{tmp_dirname}/testName_with_index.pq", index=True)
-            assert (
-                len(glob.glob(f"{tmp_dirname}/testName_with_index*.pq")) == ak.get_config()["numLocales"]
-            )
-
-            # Test for df having seg array col
-            df = ak.DataFrame({"a": ak.arange(10), "b": ak.SegArray(ak.arange(10), ak.arange(10))})
-            df.to_hdf(f"{tmp_dirname}/seg_test.h5")
-            assert len(glob.glob(f"{tmp_dirname}/seg_test*.h5")) == ak.get_config()["numLocales"]
-            ak_loaded = ak.DataFrame.load(f"{tmp_dirname}/seg_test.h5")
-            assert_frame_equal(df.to_pandas(), ak_loaded.to_pandas())
-
-            # test with segarray with _ in column name
-            df_dict = {
-                "c_1": ak.arange(3, 6),
-                "c_2": ak.arange(6, 9),
-                "c_3": ak.SegArray(ak.array([0, 9, 14]), ak.arange(20)),
-            }
-            akdf = ak.DataFrame(df_dict)
-            akdf.to_hdf(f"{tmp_dirname}/seg_test.h5")
-            assert len(glob.glob(f"{tmp_dirname}/seg_test*.h5")) == ak.get_config()["numLocales"]
-            ak_loaded = ak.DataFrame.load(f"{tmp_dirname}/seg_test.h5")
-            assert_frame_equal(akdf.to_pandas(), ak_loaded.to_pandas())
-
-            # test load_all and read workflows
-            ak_load_all = ak.DataFrame(ak.load_all(f"{tmp_dirname}/seg_test.h5"))
-            assert_frame_equal(akdf.to_pandas(), ak_load_all.to_pandas())
-
-            ak_read = ak.DataFrame(ak.read(f"{tmp_dirname}/seg_test*"))
-            assert_frame_equal(akdf.to_pandas(), ak_read.to_pandas())
-
-    def make_dfs_and_refs(self):
-        ints = [0, 2, 3, 7, 3]
-        floats = [0.0, 1.5, 0.5, 1.5, -1.0]
-        strings = ["A", "C", "C", "DE", "Z"]
-
-        unordered_index = [9, 3, 0, 23, 3]
-        string_index = ["one", "two", "three", "four", "five"]
-
-        # default index
-        df1 = ak.DataFrame(
-            {"ints": ak.array(ints), "floats": ak.array(floats), "strings": ak.array(strings)}
-        )
-        _df1 = pd.DataFrame(
-            {"ints": np.array(ints), "floats": np.array(floats), "strings": np.array(strings)}
-        )
-
-        # unorderd index, integer labels
-        df2 = ak.DataFrame(
-            {1: ak.array(ints), 2: ak.array(floats), 3: ak.array(strings)}, index=unordered_index
-        )
-        _df2 = pd.DataFrame(
-            {1: np.array(ints), 2: np.array(floats), 3: np.array(strings)}, index=unordered_index
-        )
-
-        # string index
-        df3 = ak.DataFrame(
-            {"ints": ak.array(ints), "floats": ak.array(floats), "strings": ak.array(strings)},
-            index=string_index,
-        )
-        _df3 = pd.DataFrame(
-            {"ints": np.array(ints), "floats": np.array(floats), "strings": np.array(strings)},
-            index=string_index,
-        )
-
-        return (df1, _df1, df2, _df2, df3, _df3)
-
-    def test_getitem_scalars_and_slice(self):
-        default_index = [0, 1, 2, 3, 4]
-        unordered_index = [9, 3, 0, 23, 3]
-        string_index = ["one", "two", "three", "four", "five"]
-
-        ints = [0, 2, 3, 7, 3]
-        floats = [0.0, 1.5, 0.5, 1.5, -1.0]
-        strings = ["A", "C", "C", "DE", "Z"]
-
-        # group 1: string labels
-        df1, _df1, df2, _df2, df3, _df3 = self.make_dfs_and_refs()
-
-        string_keys = ["ints", "floats", "strings"]
-        int_keys = [1, 2, 3]
-
-        dfs = [df1, df2, df3]
-        _dfs = [_df1, _df2, _df3]
-        keys_list = [string_keys, int_keys, string_keys]
-        indexes = [default_index, unordered_index, string_index]
-        for df, _df, keys, index in zip(dfs, _dfs, keys_list, indexes):
-            # single column label returns a series
-            for key in keys:
-                access1_ = _df[key]
-                access1 = df[key]
-                assert isinstance(access1_, pd.Series)
-                assert isinstance(access1, ak.Series)
-                assert access1_.values.tolist() == access1.values.to_list()
-                assert access1_.index.tolist() == access1.index.to_list()
-
-            # matching behavior for nonexistant label
-            with pytest.raises(KeyError):
-                _access2 = _df[keys[0] * 100]
-            with pytest.raises(KeyError):
-                access2 = df[keys[0] * 100]
-
-            # result reference behavior
-            _access3 = _df[keys[0]]
-            access3 = df[keys[0]]
-            access3[index[0]] = 100
-            _access3[index[0]] = 100
-            assert _df[keys[0]][index[0]] == df[keys[0]][index[0]]
-
-            # key type matches column label types
-            with pytest.raises(TypeError):
-                if isinstance(keys[0], int):
-                    a = df["int"]
-                else:
-                    a = df[3]
-            with pytest.raises(TypeError):
-                b = df[1.0]
-
-        # slice both bounds
-        _slice_access = _df1[1:4]
-        slice_access = df1[1:4]
-        assert_frame_equal(_slice_access, slice_access.to_pandas(retain_index=True))
-
-        # slice high bound
-        _slice_access = _df1[:3]
-        slice_access = df1[:3]
-        assert_frame_equal(_slice_access, slice_access.to_pandas(retain_index=True))
-
-        # slice low bound
-        _slice_access = _df1[3:]
-        slice_access = df1[3:]
-        assert_frame_equal(_slice_access, slice_access.to_pandas(retain_index=True))
-
-        # slice no bounds
-        _slice_access = _df1[:]
-        slice_access = df1[:]
-        assert_frame_equal(_slice_access, slice_access.to_pandas(retain_index=True))
-
-        _d = pd.DataFrame(
-            {"ints": np.array(ints), "floats": np.array(floats), "strings": np.array(strings)},
-            index=[0, 2, 5, 1, 5],
-        )
-        _a = _d[1:4]
-        d = ak.DataFrame(
-            {"ints": ak.array(ints), "floats": ak.array(floats), "strings": ak.array(strings)},
-            index=ak.array([0, 2, 5, 1, 5]),
-        )
-        a = d[1:4]
-        assert_frame_equal(_a, a.to_pandas(retain_index=True))
-
-        # priority when same index and label types
-        df2 = ak.DataFrame(
-            {"A": ak.array(ints), "floats": ak.array(floats), "strings": ak.array(strings)},
-            index=ak.array(strings),
-        )
-        _df2 = pd.DataFrame(
-            {"A": pd.array(ints), "floats": pd.array(floats), "strings": pd.array(strings)},
-            index=pd.array(strings),
-        )
-
-        access4 = df2["A"]
-        _access4 = _df2["A"]
-        assert isinstance(_access4, pd.Series)
-        assert isinstance(access4, ak.Series)
-        # arkouda to_pandas creates a list of objects for the index rather than a list of strings
-        assert _access4.values.tolist() == access4.values.to_list()
-        assert _access4.index.tolist() == access4.index.to_list()
-
-    def test_getitem_vectors(self):
-        (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs()
-
-        # multiple columns
-        _access1 = _df1[["ints", "floats"]]
-        access1 = df1[["ints", "floats"]]
-        assert_frame_equal(_access1, access1.to_pandas(retain_index=True))
-
-        _access2 = _df1[np.array(["ints", "floats"])]
-        access2 = df1[ak.array(["ints", "floats"])]
-        assert_frame_equal(_access2, access2.to_pandas(retain_index=True))
-
-        # boolean mask
-        _access3 = _df1[_df1["ints"] == 3]
-        access3 = df1[df1["ints"] == 3]
-        assert_frame_equal(_access3, access3.to_pandas(retain_index=True))
-
-        # boolean mask of incorrect length
-        bad = [True, True, False, False]
-        with pytest.raises(ValueError):
-            _df1[np.array(bad)]
-        with pytest.raises(ValueError):
-            df1[ak.array(bad)]
-
-        # one key present one missing
-        with pytest.raises(KeyError):
-            _access4 = _df1[["ints", "not"]]
-        with pytest.raises(KeyError):
-            access4 = df1[["ints", "not"]]
-
-        # repeated index
-
-        _access5 = _df2[[1, 2]]
-        access5 = df2[[1, 2]]
-        assert_frame_equal(_access5, access5.to_pandas(retain_index=True))
-
-        # arg order
-        _access6 = _df2[[2, 1]]
-        access6 = df2[[2, 1]]
-        assert_frame_equal(_access6, access6.to_pandas(retain_index=True))
-
-    def test_setitem_scalars(self):
-        (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs()
-
-        # add new column
-        new_ints = [8, 9, -10, 8, 12]
-        _df1["new"] = np.array(new_ints)
-        df1["new"] = ak.array(new_ints)
-        assert_frame_equal(_df1, df1.to_pandas(retain_index=True))
-
-        # modify existing column
-        _df1["ints"] = np.array([1, 2, 3, 4, 5])
-        df1["ints"] = ak.array([1, 2, 3, 4, 5])
-        assert_frame_equal(_df1, df1.to_pandas(retain_index=True))
-
-        # setting scalar value
-        _df1["ints"] = 100
-        df1["ints"] = 100
-
-        # indexing with boolean mask, array value
-        _df1[_df1["ints"] == 100]["ints"] = np.array([1, 2, 3, 4, 5])
-        df1[df1["ints"] == 100]["ints"] = ak.array([1, 2, 3, 4, 5])
-        assert_frame_equal(_df1, df1.to_pandas(retain_index=True))
-
-        # indexing with boolean mask, array value, incorrect length
-        with pytest.raises(ValueError):
-            _df1[np.array([True, True, False, False, False])]["ints"] = np.array([1, 2, 3, 4])
-        with pytest.raises(ValueError):
-            df1[ak.array([True, True, False, False, False])]["ints"] = ak.array([1, 2, 3, 4])
-
-        # incorrect column index type
-        with pytest.raises(TypeError):
-            df1[1] = ak.array([1, 2, 3, 4, 5])
-
-        # integer column labels, integer index labels
-        # add new column
-        new_ints = [8, 9, -10, 8, 12]
-
-        _df2[4] = np.array(new_ints)
-        df2[4] = ak.array(new_ints)
-        assert_frame_equal(_df2, df2.to_pandas(retain_index=True))
-
-        # modify existing column
-        _df2[1] = np.array([1, 2, 3, 4, 5])
-        df2[1] = ak.array([1, 2, 3, 4, 5])
-        assert_frame_equal(_df2, df2.to_pandas(retain_index=True))
-
-        # indexing with boolean mask, scalar value
-        _df2[_df2[1] == 3][1] = 101
-        df2[df2[1] == 3][1] = 101
-        assert_frame_equal(_df2, df2.to_pandas(retain_index=True))
-
-        # setting to scalar value
-        _df2[1] = 100
-        df2[1] = 100
-        assert_frame_equal(_df2, df2.to_pandas(retain_index=True))
-
-        # indexing with boolean mask, array value
-        _df2[_df2[1] == 100][1] = np.array([1, 2, 3, 4, 5])
-        df2[df2[1] == 100][1] = ak.array([1, 2, 3, 4, 5])
-        assert_frame_equal(_df2, df2.to_pandas(retain_index=True))
-
-        # indexing with boolean mask, array value, incorrect length
-        with pytest.raises(ValueError):
-            _df2[np.array([True, True, False, False, False])][1] = np.array([1, 2, 3, 4])
-        with pytest.raises(ValueError):
-            df2[ak.array([True, True, False, False, False])][1] = ak.array([1, 2, 3, 4])
-
-        # incorrect column index type
-        with pytest.raises(TypeError):
-            df2["new column"] = ak.array([1, 2, 3, 4, 5])
-
-    def test_setitem_vectors(self):
-        ints = [0, 1, 3, 7, 3]
-        floats = [0.0, 1.5, 0.5, 1.5, -1.0]
-        strings = ["A", "C", "C", "DE", "Z"]
-
-        ints2 = [8, 9, -10, 8, 12]
-        floats2 = [8.5, 5.0, 6.2, 1.2, 0.0]
-        strings2 = ["B", "D", "D", "EF", "Y"]
-
-        _df = pd.DataFrame(
-            {"ints": np.array(ints), "floats": np.array(floats), "strings": np.array(strings)}
-        )
-        df = ak.DataFrame(
-            {"ints": ak.array(ints), "floats": ak.array(floats), "strings": ak.array(strings)}
-        )
-
-        _df2 = pd.DataFrame(
-            {"ints": np.array(ints2), "floats": np.array(floats2), "strings": np.array(strings2)}
-        )
-        df2 = ak.DataFrame(
-            {"ints": ak.array(ints2), "floats": ak.array(floats2), "strings": ak.array(strings2)}
-        )
-
-        # assignment of one dataframe access to another
-        _df[["ints", "floats"]] = _df2[["ints", "floats"]]
-        df[["ints", "floats"]] = df2[["ints", "floats"]]
-        assert_frame_equal(_df, df.to_pandas())
-
-        # new contents for dataframe being read
-        _df2["ints"] = np.array(ints)
-        df2["ints"] = ak.array(ints)
-        _df2["floats"] = np.array(floats)
-        df2["floats"] = ak.array(floats)
-
-        # assignment of one dataframe access to another, different order
-        _df[["floats", "ints"]] = _df2[["floats", "ints"]]
-        df[["floats", "ints"]] = df2[["floats", "ints"]]
-        assert_frame_equal(_df, df.to_pandas())
-
-        # inserting multiple columns at once
-        _df[["new1", "new2"]] = _df2[["ints", "floats"]]
-        df[["new1", "new2"]] = df2[["ints", "floats"]]
-        assert_frame_equal(_df, df.to_pandas())
-
-        # reset values
-        _df2["ints"] = np.array(ints2)
-        df2["ints"] = ak.array(ints2)
-        _df2["floats"] = np.array(floats2)
-        df2["floats"] = ak.array(floats2)
-
-        # boolean mask, accessing two columns
-        _df[_df["ints"] == 3][["ints", "floats"]] = _df2[0:2][["ints", "floats"]]
-        df[df["ints"] == 3][["ints", "floats"]] = df2[0:2][["ints", "floats"]]
-        assert_frame_equal(_df, df.to_pandas())
-
-    def test_loc_get(self):
-        (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs()
-
-        # single label for row
-        _loc1 = _df1.loc[2]
-        loc1 = df1.loc[2]
-        assert isinstance(_loc1, pd.Series)
-        assert isinstance(loc1, ak.DataFrame)
-        for column in _loc1.index:
-            assert _loc1[column] == loc1[column].values[0]
-
-        # list of labels
-        _loc2 = _df1.loc[[2, 3, 4]]
-        loc2 = df1.loc[[2, 3, 4]]
-        assert_frame_equal(_loc2, loc2.to_pandas(retain_index=True))
-
-        # slice of labels
-        _loc3 = _df1.loc[1:3]
-        loc3 = df1.loc[1:3]
-        assert_frame_equal(_loc3, loc3.to_pandas(retain_index=True))
-
-        # boolean array of same length as array being sliced
-        _loc4 = _df1.loc[[True, True, False, False, True]]
-        loc4 = df1.loc[ak.array([True, True, False, False, True])]
-        assert_frame_equal(_loc4, loc4.to_pandas(retain_index=True))
-
-        # alignable boolean Series
-        _loc5 = _df1.loc[_df1["ints"] == 3]
-        loc5 = df1.loc[df1["ints"] == 3]
-        assert_frame_equal(_loc5, loc5.to_pandas(retain_index=True))
-
-        # single label for row and column
-        _loc6 = _df1.loc[2, "floats"]
-        loc6 = df1.loc[2, "floats"]
-        assert _loc6 == loc6
-
-        # slice with label for row and single label for column
-        _loc7 = _df1.loc[1:3, "floats"]
-        loc7 = df1.loc[1:3, "floats"]
-        assert isinstance(_loc7, pd.Series)
-        assert isinstance(loc7, ak.Series)
-        for column in _loc7.index:
-            assert _loc7.values.tolist() == loc7.values.to_list()
-
-        # boolean array for row and array of labels for columns
-        _loc8 = _df1.loc[[True, True, False, False, True], ["ints", "floats"]]
-        loc8 = df1.loc[ak.array([True, True, False, False, True]), ["ints", "floats"]]
-        assert_frame_equal(_loc8, loc8.to_pandas(retain_index=True))
-
-    def test_loc_set_scalar(self):
-        (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs()
-        # single row, single column, scalar value
-        _df1.loc[2, "floats"] = 100.0
-        df1.loc[2, "floats"] = 100.0
-        assert_frame_equal(_df1, df1.to_pandas(retain_index=True))
-
-        # multiple rows, single column, scalar value
-        _df1.loc[[2, 3, 4], "floats"] = 101.0
-        df1.loc[[2, 3, 4], "floats"] = 101.0
-        assert_frame_equal(_df1, df1.to_pandas(retain_index=True))
-
-        # setting an entire column
-        _df1.loc[:, "floats"] = 99.0
-        df1.loc[:, "floats"] = 99.0
-        assert_frame_equal(_df1, df1.to_pandas(retain_index=True))
-
-        _df1.loc[1:3, "floats"] = 98.0
-        df1.loc[1:3, "floats"] = 98.0
-        assert_frame_equal(_df1, df1.to_pandas(retain_index=True))
-
-        # setting value for rows matching boolean
-        _df1.loc[_df1["ints"] == 3, "floats"] = 102.0
-        df1.loc[df1["ints"] == 3, "floats"] = 102.0
-        assert_frame_equal(_df1, df1.to_pandas(retain_index=True))
-
-        # incorrect column index type
-        with pytest.raises(TypeError):
-            df1.loc[2, 1] = 100.0
-
-        # incorrect row index type
-        with pytest.raises(TypeError):
-            df1.loc[1.0, "floats"] = 100.0
-
-    def test_loc_set_vector(self):
-        (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs()
-
-        # two rows, one column, two values
-        _df1.loc[[2, 3], "floats"] = np.array([100.0, 101.0])
-        df1.loc[[2, 3], "floats"] = ak.array([100.0, 101.0])
-        assert_frame_equal(_df1, df1.to_pandas(retain_index=True))
-
-        # setting with Series matches index labels, not positions
-        _df1.loc[:, "floats"] = pd.Series([100.0, 101.0, 102.0, 103.0, 104.0], index=[0, 1, 2, 3, 4])
-        df1.loc[:, "floats"] = ak.Series(
-            ak.array([100.0, 101.0, 102.0, 103.0, 104.0]), index=ak.array([0, 1, 2, 3, 4])
-        )
-        assert_frame_equal(_df1, df1.to_pandas(retain_index=True))
-
-        # setting with Series with unordered index
-        _df1.loc[:, "ints"] = pd.Series([2, 3, 4, 5, 6], index=[3, 2, 1, 0, 4])
-        df1.loc[:, "ints"] = ak.Series(ak.array([2, 3, 4, 5, 6]), index=ak.array([3, 2, 1, 0, 4]))
-        assert_frame_equal(_df1, df1.to_pandas(retain_index=True))
-
-        # setting with Series against an array of indices
-        _df1.loc[np.array([2, 3, 4]), "floats"] = pd.Series([70.0, 71.0, 72.0], index=[3, 4, 2])
-        df1.loc[ak.array([2, 3, 4]), "floats"] = ak.Series(
-            ak.array([70.0, 71.0, 72.0]), index=ak.array([3, 4, 2])
-        )
-        assert_frame_equal(_df1, df1.to_pandas(retain_index=True))
-
-    def test_set_new_values(self):
-        (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs()
-
-        # new column
-        _df1.loc[2, "not"] = 100.0
-        df1.loc[2, "not"] = 100.0
-        assert_frame_equal(_df1, df1.to_pandas(retain_index=True))
-
-        # TODO: The following two lines behave differently because pandas
-        # converts the int column to floating point to accomodate the nan
-        # value of the new column
-        # _df1.loc[100, 'floats'] = 100.0
-        # df1.loc[100, 'floats'] = 100.0
-        # assert_frame_equal(_df1, df1.to_pandas(retain_index=True))
-
-        # cannot add new rows to a dataframe with string column
-        with pytest.raises(ValueError):
-            df2.loc[100, 7] = 100.0
-
-    def test_iloc_get(self):
-        (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs()
-
-        for _df1, df1 in zip([_df1, _df2, _df3], [df1, df2, df3]):
-            # integer input
-            _iloc1 = _df1.iloc[2]
-            iloc1 = df1.iloc[2]
-            assert isinstance(_iloc1, pd.Series)
-            assert isinstance(iloc1, ak.DataFrame)
-            for column in _iloc1.index:
-                assert _iloc1[column] == iloc1[column].values[0]
-
-            # list of integers
-            _iloc2 = _df1.iloc[[2, 3, 4]]
-            iloc2 = df1.iloc[[2, 3, 4]]
-            assert_frame_equal(_iloc2, iloc2.to_pandas(retain_index=True))
-
-            # list of unordered integers
-            _iloc3 = _df1.iloc[[4, 2, 3]]
-            iloc3 = df1.iloc[[4, 2, 3]]
-            assert_frame_equal(_iloc3, iloc3.to_pandas(retain_index=True))
-
-            # array of integers
-            _iloc4 = _df1.iloc[np.array([2, 3, 4])]
-            iloc4 = df1.iloc[ak.array([2, 3, 4])]
-            assert_frame_equal(_iloc4, iloc4.to_pandas(retain_index=True))
-
-            # array of unordered integers
-            _iloc5 = _df1.iloc[np.array([4, 2, 3])]
-            iloc5 = df1.iloc[ak.array([4, 2, 3])]
-            assert_frame_equal(_iloc5, iloc5.to_pandas(retain_index=True))
-
-            # slice object with ints
-            _iloc6 = _df1.iloc[1:3]
-            iloc6 = df1.iloc[1:3]
-            assert_frame_equal(_iloc6, iloc6.to_pandas(retain_index=True))
-
-            # slice object with no lower bound
-            _iloc7 = _df1.iloc[:3]
-            iloc7 = df1.iloc[:3]
-            assert_frame_equal(_iloc7, iloc7.to_pandas(retain_index=True))
-
-            # slice object with no upper bound
-            _iloc8 = _df1.iloc[3:]
-            iloc8 = df1.iloc[3:]
-            assert_frame_equal(_iloc8, iloc8.to_pandas(retain_index=True))
-
-            # slice object with no bounds
-            _iloc9 = _df1.iloc[:]
-            iloc9 = df1.iloc[:]
-            assert_frame_equal(_iloc9, iloc9.to_pandas(retain_index=True))
-
-            # boolean array
-            _iloc10 = _df1.iloc[[True, True, False, False, True]]
-            iloc10 = df1.iloc[ak.array([True, True, False, False, True])]
-            assert_frame_equal(_iloc10, iloc10.to_pandas(retain_index=True))
-
-            # boolean array of incorrect length
-            with pytest.raises(IndexError):
-                _df1.iloc[[True, True, False, False]]
-            with pytest.raises(IndexError):
-                df1.iloc[ak.array([True, True, False, False])]
-
-            # tuple of row and column indexes
-            _iloc11 = _df1.iloc[2, 1]
-            iloc11 = df1.iloc[2, 1]
-            assert isinstance(_iloc11, np.float64)
-            assert isinstance(iloc11, np.float64)
-            assert _iloc11 == iloc11
-
-            # integer row, list column
-            _iloc12 = _df1.iloc[2, [0, 1]]
-            iloc12 = df1.iloc[2, [0, 1]]
-            assert isinstance(_iloc12, pd.Series)
-            assert isinstance(iloc12, ak.DataFrame)
-            for column in _iloc12.index:
-                assert _iloc12[column] == iloc12[column].values[0]
-
-            # list row, integer column
-            _iloc13 = _df1.iloc[[2, 3], 1]
-            iloc13 = df1.iloc[[2, 3], 1]
-            assert isinstance(_iloc13, pd.Series)
-            assert isinstance(iloc13, ak.Series)
-            for column in _iloc13.index:
-                assert _iloc13[column] == iloc13[column]
-
-            # list row, list column
-            _iloc14 = _df1.iloc[[2, 3], [0, 1]]
-            iloc14 = df1.iloc[[2, 3], [0, 1]]
-            assert_frame_equal(_iloc14, iloc14.to_pandas(retain_index=True))
-
-            # slice row, boolean array column
-            _iloc15 = _df1.iloc[1:3, [True, False, True]]
-            iloc15 = df1.iloc[1:3, [True, False, True]]
-            assert_frame_equal(_iloc15, iloc15.to_pandas(retain_index=True))
-
-            # raises IndexError if requested indexer is out-of-bounds
-            with pytest.raises(IndexError):
-                _df1.iloc[100]
-            with pytest.raises(IndexError):
-                df1.iloc[100]
-            with pytest.raises(IndexError):
-                _df1.iloc[100, 1]
-            with pytest.raises(IndexError):
-                df1.iloc[100, 1]
-            with pytest.raises(IndexError):
-                _df1.iloc[[0, 2, 100], 1]
-            with pytest.raises(IndexError):
-                df1.iloc[[0, 2, 100], 1]
-            with pytest.raises(IndexError):
-                _df1.iloc[1, 100]
-            with pytest.raises(IndexError):
-                df1.iloc[1, 100]
-
-        pass
-
-    def test_iloc_set(self):
-        (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs()
-
-        for _df, df in zip([_df1, _df2, _df3], [df1, df2, df3]):
-            # tuple of integers
-            _df.iloc[2, 1] = 100.0
-            df.iloc[2, 1] = 100.0
-            assert_frame_equal(_df, df.to_pandas(retain_index=True))
-
-            # list row, integer column
-            _df.iloc[[2, 3], 1] = 102.0
-            df.iloc[[2, 3], 1] = 102.0
-            assert_frame_equal(_df, df.to_pandas(retain_index=True))
-
-            # slice row, integer column
-            _df.iloc[1:3, 1] = 103.0
-            df.iloc[1:3, 1] = 103.0
-            assert_frame_equal(_df, df.to_pandas(retain_index=True))
-
-            # slice row, no lower bound, integer column
-            _df.iloc[:3, 1] = 104.0
-            df.iloc[:3, 1] = 104.0
-            assert_frame_equal(_df, df.to_pandas(retain_index=True))
-
-            # slice row, no upper bound, integer column
-            _df.iloc[3:, 1] = 105.0
-            df.iloc[3:, 1] = 105.0
-            assert_frame_equal(_df, df.to_pandas(retain_index=True))
-
-            # slice row, no bounds, integer column
-            _df.iloc[:, 1] = 106.0
-            df.iloc[:, 1] = 106.0
-            assert_frame_equal(_df, df.to_pandas(retain_index=True))
-
-            # string columns immutable
-            with pytest.raises(TypeError):
-                df.iloc[2, 2] = "new string"
-        pass
-
-    def test_at(self):
-        (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs()
-
-        # single label for row and column
-        _at1 = _df1.at[2, "floats"]
-        at1 = df1.at[2, "floats"]
-        assert _at1 == at1
-
-        # does not support lists
-        with pytest.raises(pd.errors.InvalidIndexError):
-            _df1.at[[2, 3], "floats"]
-        with pytest.raises(ValueError):
-            df1.at[[2, 3], "floats"]
-
-        # assignment
-        _df1.at[2, "floats"] = 100.0
-        df1.at[2, "floats"] = 100.0
-        assert_frame_equal(_df1, df1.to_pandas())
-
-        pass
-
-    def test_iat(self):
-        (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs()
-
-        # single label for row and column
-        _iat1 = _df1.iat[2, 1]
-        iat1 = df1.iat[2, 1]
-        assert _iat1 == iat1
-
-        # does not support lists
-        with pytest.raises(ValueError):
-            _df1.iat[[2, 3], 1]
-        with pytest.raises(ValueError):
-            df1.iat[[2, 3], 1]
-
-        # indices must be integers
-        with pytest.raises(ValueError):
-            _df1.iat[1, "floats"]
-        with pytest.raises(ValueError):
-            df1.iat[1, "floats"]
-
-        # assignment
-        _df1.iat[2, 1] = 100.0
-        df1.iat[2, 1] = 100.0
-        assert_frame_equal(_df1, df1.to_pandas())
-
     def test_sample_hypothesis_testing(self):
         # perform a weighted sample and use chisquare to test
         # if the observed frequency matches the expected frequency

diff --git a/arkouda/dataframe.py b/arkouda/dataframe.py
index 1f51bc0bdd..c8fe464b98 100644
--- a/arkouda/dataframe.py
+++ b/arkouda/dataframe.py
@@ -31,15 +31,12 @@
 from arkouda.join import inner_join
 from arkouda.numeric import cast as akcast
 from arkouda.numeric import cumsum, where
-from arkouda.pdarrayclass import RegistrationError
-from arkouda.pdarrayclass import any as akany
-from arkouda.pdarrayclass import pdarray
-from arkouda.pdarrayclass import sum as aksum
+from arkouda.pdarrayclass import RegistrationError, pdarray
 from arkouda.pdarraycreation import arange, array, create_pdarray, full, zeros
-from arkouda.pdarraysetops import concatenate, in1d, indexof1d, intersect1d
+from arkouda.pdarraysetops import concatenate, in1d, intersect1d
 from arkouda.row import Row
 from arkouda.segarray import SegArray
-from arkouda.series import Series, is_supported_scalar
+from arkouda.series import Series
 from arkouda.sorting import argsort, coargsort
 from arkouda.strings import Strings
 from arkouda.timeclass import Datetime, Timedelta
@@ -317,7 +314,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None, random_state=No
         |  5 |   3 |   8 |
         +----+-----+-----+
         """
-        return self.df.loc[
+        return self.df[
             self.gb.sample(
                 values=self.df.index.values,
                 n=n,
@@ -761,11 +758,9 @@ def __init__(self, initialdata=None, index=None, columns=None):
                 for key, val in initialdata.items():
                     if isinstance(val, (list, tuple)):
                         val = array(val)
-                    if isinstance(val, Series):
-                        val = val.values
                     if not isinstance(val, self._COLUMN_CLASSES):
                         raise ValueError(f"Values must be one of {self._COLUMN_CLASSES}.")
-                    if isinstance(key, str) and key.lower() == "index":
+                    if key.lower() == "index":
                         # handles the index as an Index object instead of a column
                         self._set_index(val)
                         continue
@@ -809,18 +804,17 @@ def __init__(self, initialdata=None, index=None, columns=None):

         # If the index param was passed in, use that instead of
         # creating a new one.
-        if index is not None:
-            self._set_index(index)
-        elif self.index is None:
+        if self.index is None:
             self._set_index(arange(self._nrows))
-
+        else:
+            self._set_index(index)
         self.update_nrows()

     def __getattr__(self, key):
         if key not in self.columns.values:
             raise AttributeError(f"Attribute {key} not found")
         # Should this be cached?
-        return self[key]
+        return Series(data=self[key], index=self.index.index)

     def __dir__(self):
         return dir(DataFrame) + self.columns.values + ["columns"]

@@ -838,289 +832,74 @@ def __delitem__(self, key):
             self._empty = True
         self.update_nrows()

-    def validate_key(self, key):
-        if key is None:
-            raise ValueError("Key cannot be None")
-        if isinstance(key, Series):
-            # TODO: check index alignment
-            return self.validate_key(key.values)
-        if isinstance(key, list):
-            return self.validate_key(array(key))
-
-        if isinstance(key, slice):
-            if key.start is not None and key.start < 0:
-                raise ValueError("Negative start index not supported")
-            if key.stop is not None and key.stop > len(self):
-                raise ValueError("Slice stop index out of range")
-            return key
-
-        if is_supported_scalar(key):
-            if len(self.columns) == 0:
-                # Empty DataFrame, scalar key is valid
-                return key
-            if self.column_label_type() != resolve_scalar_dtype(key):
-                raise TypeError(f"Expected key of type {self.column_label_type()}, received {type(key)}")
-            return key
-
-        if isinstance(key, pdarray) and key.dtype == akbool:
-            if len(key) != len(self):
-                raise ValueError("Boolean mask arguments must have the same length as the DataFrame.")
-            return key
-
-        if isinstance(key, (pdarray, Strings, Categorical, SegArray)):
-            k = key[0]
-            if len(self.columns) != 0 and resolve_scalar_dtype(k) != self.column_label_type():
-                raise TypeError(
-                    "Expected key of type {}, received {}".format(type(self.columns[0]), type(k))
-                )
-            return key
-
-        raise TypeError("Indexing with keys of type {} not supported".format(type(key)))
-
-    def column_label_type(self):
-        if len(self._columns) != 0:
-            return resolve_scalar_dtype(self._columns[0])
-        else:
-            return None
-
     def __getitem__(self, key):
-        """
-        Name-based indexing of columns, except for integer slices and boolean masks,
-        which does position-based indexing of rows.
-
-        Parameters
-        ----------
-        key : str, int, float, list, pdarray, slice
-            The column label(s) the resulting Series or DataFrame should contain. If
-            using a slice, the indices of the desired rows. If using a boolean mask,
-            a pdarray where "True" entries correspond to desired rows.
-
-        Returns
-        -------
-        Series, DataFrame
-            The columns or rows of the DataFrame. If only one column label is
-            provided, the return type is a Series. Otherwise a DataFrame is returned.
-
-        Raises
-        ------
-        KeyError
-            Raised if a column label is not present in the DataFrame
-        ValueError
-            Raised if a boolean mask has the incorrect length or a slice's bounds are
-            out of range
-        TypeError
-            Raised if the key is not a supported type
-        """
         # convert series to underlying values
         # Should check for index alignment
-        key = self.validate_key(key)
-
-        # if a scalar argument, return a Series
-        if is_supported_scalar(key):
-            if key not in self._columns:
-                raise KeyError("column {} not present in DataFrame".format(key))
-            values = UserDict.__getitem__(self, key)
-            index = self.index
-            return Series(values, index=index)
-
-        # boolean mask
-        if isinstance(key, pdarray) and key.dtype == akbool:
-            return self._get_rows(key)
-
-        # slice
-        if isinstance(key, slice):
-            return self._get_rows(key)
-
-        if isinstance(key, (pdarray, Strings)):
-            for k in key.to_ndarray():
-                if k not in self.columns:
-                    raise KeyError("column {} not present in DataFrame".format(k))
+        if isinstance(key, Series):
+            key = key.values
+
+        # Select rows using an integer pdarray
+        if isinstance(key, pdarray):
+            if key.dtype == akbool:
+                key = arange(key.size)[key]
+            result = {}
+            for k in self._columns:
+                result[k] = UserDict.__getitem__(self, k)[key]
+            # To stay consistent with numpy, provide the old index values
+            return DataFrame(initialdata=result, index=self.index.index[key])
+
+        # Select rows or columns using a list
+        if isinstance(key, (list, tuple)):
             result = DataFrame()
             if len(key) <= 0:
                 return result
-            for k in key.to_ndarray():
-                result.data[k] = UserDict.__getitem__(self, k)
-                result._columns.append(k)
-                result._empty = False
-            result._set_index(self.index)
-            return result
-
-        raise TypeError("key not supported: {}".format(key))
-
-    def validate_value(self, value):
-        if isinstance(value, Series):
-            # TODO: check index alignment
-            return self.validate_value(value.values)
-        if isinstance(value, list):
-            return self.validate_value(array(value))
-        if isinstance(value, tuple):
-            raise TypeError("DataFrame does not support tuple values")
-
-        return value
-
-    def __setitem__(self, key, value):
-        """
-        Inserts/updates columns in the DataFrame. Can also be used to
-        update one DataFrame with values from another.
-
-        Parameters
-        ----------
-        key : str, int, float, list, pdarray
-            The column label(s) the resulting Series or DataFrame should contain.
-        value : str, int, float, list, pdarray, Series, DataFrame
-            The value(s) that should be inserted or updated within the DataFrame.
-
-        Raises
-        ------
-        KeyError
-            Raised if a column label is not present in the DataFrame
-        IndexError
-            Raised if a boolean mask has the incorrect length
-        TypeError
-            Raised if the key or value are not a supported type
-        """
-        self.update_nrows()
-
-        key = self.validate_key(key)
-        value = self.validate_value(value)
-
-        # adding first column
-        if len(self._columns) == 0 and is_supported_scalar(key):
-            self._columns.append(key)
-            self._empty = False
-            if is_supported_scalar(value):
-                value = full(1, value, resolve_scalar_dtype(value))
-            UserDict.__setitem__(self, key, value)
-            self._set_index(Index(arange(len(value))))
-            self.update_nrows()
-            return
-
-        # Update or insert a single column into the dataframe
-        if resolve_scalar_dtype(key) == self.column_label_type():
-            if is_supported_scalar(value):
-                value = full(len(self), value, resolve_scalar_dtype(value))
-            assert isinstance(value, (pdarray, Strings, Categorical, SegArray))
-            if len(value) != len(self):
-                raise ValueError("Column length must match DataFrame length")
-
-            # Set a single column in the dataframe using a scalar value
-            if key not in self.columns:
-                self._empty = False
-                self._columns.append(key)
-
-            UserDict.__setitem__(self, key, value)
-            return
-
-        # Boolean mask
-        elif isinstance(key, pdarray) and key.dtype == akbool:
-            if not isinstance(value, DataFrame):
-                raise ValueError("Expected DataFrame type for boolean mask assignment")
-            shared_columns = intersect1d(array(self.columns.values), array(value.columns.values))
-            if len(shared_columns) != len(self.columns) or len(shared_columns) != len(value.columns):
-                raise ValueError("Right-hand side columns do not match left-hand side columns")
-            if len(value) != aksum(key):
-                raise IndexError("Boolean mask length must match DataFrame length")
-            for k in self.columns:
-                self[k].values[key] = value[k].values
-            return
-
-        # Index lists
-        if isinstance(key, (pdarray, Strings)):
-            if isinstance(value, DataFrame):
-                if not len(key) == len(value.columns):
-                    raise ValueError(
-                        f"Number of keys and values must match: {len(key)} != {len(value.columns)}"
-                    )
+            if len({type(x) for x in key}) > 1:
+                raise TypeError("Invalid selector: too many types in list.")
+            if isinstance(key[0], str):
+                for k in key:
+                    result[k] = self[k]
+                result._empty = False
+                result._set_index(self.index)  # column lens remain the same. Copy the indexing
+                return result
             else:
-                raise ValueError("When setting multiple columns, value must be a DataFrame")
-
-            for k, valueColumn in zip(key.to_ndarray(), value.columns):
-                v = value[valueColumn].values
-                if len(v) != len(self):
-                    raise ValueError("Column length must match DataFrame length")
-                if k not in self.columns:
-                    self._empty = False
-                    self._columns.append(k)
-                UserDict.__setitem__(self, k, v)
-            return
-
-        raise TypeError("Setting on dataframe with unexpected type: {}".format(type(key)))
-
-    @property
-    def loc(self):
-        """
-        Label-based row indexing. Supports getting and setting. If there is
-        a single indexing argument, it is interpreted as a row selector. If
-        there are two, the first is interpreted as a row selector and the
-        second as a column selector. Selectors can be scalar label values;
-        lists, pdarrays, or slices of label values; boolean masks as pdarrays;
-        or Series. Setting values with .loc requires both the row and column
-        selectors to be present.
-
-        Raises
-        ------
-        KeyError
-            Raised if a label is not present in the DataFrame.
-        TypeError
-            Raised if the key or value types are not supported.
-        """
-        return _LocIndexer(self)
-
-    @property
-    def iloc(self):
-        """
-        Position-based row indexing. Supports getting and setting. If there
-        is a single indexing argument, it is interpreted as a row selector.
-        If there are two, the first is interpreted as a row selector and the
-        second as a column selector. Selectors can be scalar integer values;
-        lists, pdarrays, or slices of integer values; boolean masks as pdarrays;
-        or integer Series. Setting values with .iloc requires both the row and
-        column selectors to be present.
-
-        Raises
-        ------
-        TypeError
-            Raised if the keys or value types are not supported.
-
-        IndexError
-            Raised if an index is out of range.
-
-        ValueError
-            Raised if a boolean mask is of the wrong length.
-        """
-        return _ILocIndexer(self)
-
-    @property
-    def at(self) -> AtIndexer:
-        """
-        Access a single value for a row/column by label. Similar to `.loc`.
-        Use only if you need to get or set a single value.
+                raise TypeError(
+                    "DataFrames only support lists for column indexing. "
+                    "All list entries must be of type str."
+                )

-        Raises
-        ------
-        KeyError
-            Raised if a label is not present in the DataFrame.
-        TypeError
-            Raised if the key or value types are not supported.
-        """
-        return AtIndexer(self)
+        # Select a single row using an integer
+        if isinstance(key, int):
+            result = {}
+            row = array([key])
+            for k in self._columns:
+                result[k] = (UserDict.__getitem__(self, k)[row])[0]
+            return Row(result)
+
+        # Select a single column using a string
+        elif isinstance(key, str):
+            if key not in self.keys():
+                raise KeyError(f"Invalid column name '{key}'.")
+            return UserDict.__getitem__(self, key)
+
+        # Select rows using a slice
+        elif isinstance(key, slice):
+            # result = DataFrame()
+            rtn_data = {}
+            s = key
+            for k in self._columns:
+                rtn_data[k] = UserDict.__getitem__(self, k)[s]
+            return DataFrame(initialdata=rtn_data, index=self.index.index[arange(self._nrows)[s]])
+        else:
+            raise IndexError("Invalid selector: unknown error.")

-    @property
-    def iat(self) -> IAtIndexer:
-        """
-        Access a single value for a row/column pair by integer position.
-        Similar to `.iloc`. Use only if you need to get or set a single value.
+    def __setitem__(self, key, value):
+        self.update_nrows()

-        Raises
-        ------
-        IndexError
-            Raised if an index is out of range.
-        TypeError
-            Raised if the key or value types are not supported.
-        """
-        return IAtIndexer(self)
+        # If this is the first column added, we must create an index column.
+        add_index = False
+        if self._empty:
+            add_index = True

-    def set_row(self, key, value):
         # Set a single row in the dataframe using a dict of values
         if isinstance(key, int):
             for k in self._columns:
@@ -1143,6 +922,28 @@ def set_row(self, key, value):
                     continue
                 self[k][key] = v

+        # Set a single column in the dataframe using a an arkouda array
+        elif isinstance(key, str):
+            if not isinstance(value, self._COLUMN_CLASSES):
+                raise ValueError(f"Column must be one of {self._COLUMN_CLASSES}.")
+            elif self._nrows is not None and self._nrows != value.size:
+                raise ValueError(f"Expected size {self._nrows} but received size {value.size}.")
+            else:
+                self._empty = False
+                UserDict.__setitem__(self, key, value)
+                # Update the index values
+                if key not in self._columns:
+                    self._columns.append(key)
+
+        # Do nothing and return if there's no valid data
+        else:
+            raise ValueError("No valid data received.")
+
+        # Update the dataframe indices and metadata.
+        if add_index:
+            self.update_nrows()
+            self._set_index(arange(self._nrows))
+
     def __len__(self):
         """
         Return the number of rows.
@@ -1223,10 +1024,10 @@ def _get_head_tail_server(self):
         if self._nrows <= maxrows:
             newdf = DataFrame()
             for col in self._columns:
-                if isinstance(self[col].values, Categorical):
-                    newdf[col] = self[col].values.categories[self[col].values.codes]
+                if isinstance(self[col], Categorical):
+                    newdf[col] = self[col].categories[self[col].codes]
                 else:
-                    newdf[col] = self[col].values
+                    newdf[col] = self[col]
             newdf._set_index(self.index)
             return newdf.to_pandas(retain_index=True)
         # Being 1 above the threshold causes the PANDAS formatter to split the data frame vertically
@@ -1235,27 +1036,22 @@ def _get_head_tail_server(self):
         )
         msg_list = []
         for col in self._columns:
-            if isinstance(self[col].values, Categorical):
-                msg_list.append(
-                    f"Categorical+{col}+{self[col].values.codes.name}"
-                    f"+{self[col].values.categories.name}"
-                )
-            elif isinstance(self[col].values, SegArray):
-                msg_list.append(
-                    f"SegArray+{col}+{self[col].values.segments.name}+{self[col].values.values.name}"
-                )
-            elif isinstance(self[col].values, Strings):
-                msg_list.append(f"Strings+{col}+{self[col].values.name}")
-            elif isinstance(self[col].values, Fields):
-                msg_list.append(f"Fields+{col}+{self[col].values.name}")
-            elif isinstance(self[col].values, IPv4):
-                msg_list.append(f"IPv4+{col}+{self[col].values.name}")
-            elif isinstance(self[col].values, Datetime):
-                msg_list.append(f"Datetime+{col}+{self[col].values.name}")
-            elif isinstance(self[col].values, BitVector):
-                msg_list.append(f"BitVector+{col}+{self[col].values.name}")
+            if isinstance(self[col], Categorical):
+                msg_list.append(f"Categorical+{col}+{self[col].codes.name}+{self[col].categories.name}")
+            elif isinstance(self[col], SegArray):
+                msg_list.append(f"SegArray+{col}+{self[col].segments.name}+{self[col].values.name}")
+            elif isinstance(self[col], Strings):
+                msg_list.append(f"Strings+{col}+{self[col].name}")
+            elif isinstance(self[col], Fields):
+                msg_list.append(f"Fields+{col}+{self[col].name}")
+            elif isinstance(self[col], IPv4):
+                msg_list.append(f"IPv4+{col}+{self[col].name}")
+            elif isinstance(self[col], Datetime):
+                msg_list.append(f"Datetime+{col}+{self[col].name}")
+            elif isinstance(self[col], BitVector):
+                msg_list.append(f"BitVector+{col}+{self[col].name}")
             else:
-                msg_list.append(f"pdarray+{col}+{self[col].values.name}")
+                msg_list.append(f"pdarray+{col}+{self[col].name}")

         repMsg = cast(
             str,
@@ -1285,11 +1081,11 @@ def _get_head_tail_server(self):
             elif t == "Fields":
                 df_dict[msg[1]] = Fields(
                     create_pdarray(msg[2]),
-                    self[msg[1]].values.names,
-                    MSB_left=self[msg[1]].values.MSB_left,
-                    pad=self[msg[1]].values.padchar,
-                    separator=self[msg[1]].values.separator,
-                    show_int=self[msg[1]].values.show_int,
+                    self[msg[1]].names,
+                    MSB_left=self[msg[1]].MSB_left,
+                    pad=self[msg[1]].padchar,
+                    separator=self[msg[1]].separator,
+                    show_int=self[msg[1]].show_int,
                 )
             elif t == "IPv4":
                 df_dict[msg[1]] = IPv4(create_pdarray(msg[2]))
@@ -1298,8 +1094,8 @@ def _get_head_tail_server(self):
             elif t == "BitVector":
                 df_dict[msg[1]] = BitVector(
                     create_pdarray(msg[2]),
-                    width=self[msg[1]].values.width,
-                    reverse=self[msg[1]].values.reverse,
+                    width=self[msg[1]].width,
+                    reverse=self[msg[1]].reverse,
                 )
             else:
                 df_dict[msg[1]] = create_pdarray(msg[2])
@@ -1308,72 +1104,6 @@ def _get_head_tail_server(self):
         new_df._set_index(self.index.index[idx])
         return new_df.to_pandas(retain_index=True)[self._columns]

-    def _get_rows(self, key):
-        """
-        Gets rows of the dataframe based with the provided indices
-        """
-        if not isinstance(key, (pdarray, slice)):
-            raise TypeError("_get_rows requires pdarray of row indices or a slice")
-
-        if isinstance(key, slice):
-            start = key.start if key.start is not None else 0
-            stop = key.stop if key.stop is not None else len(self)
-            step = key.step if key.step is not None else 1
-            key = arange(start, stop, step)
-        if key.dtype == akbool:
-            key = arange(key.size)[key]
-        result = {}
-        for k in self._columns:
-            result[k] = UserDict.__getitem__(self, k)[key]
-        # To stay consistent with numpy, provide the old index values
-        return DataFrame(initialdata=result, index=self.index.index[key])
-
-    def column_labels(self):
-        """
-        Return the column labels.
-        """
-        return self._columns
-
-    def _add_new_rows(self, key):
-        # If the key is a scalar, convert it to an array
-        if is_supported_scalar(key) and dtype(type(key)) == self.index.dtype:
-            key = array([key])
-
-        # Cannot add new rows to dataframes with String columns
-        for k in self._columns:
-            if isinstance(UserDict.__getitem__(self, k), Strings):
-                raise ValueError(
-                    "This DataFrame has a column of type ak.Strings;"
-                    " so this DataFrame is immutable. This feature could change"
-                    " if arkouda supports mutable Strings in the future."
-                )
-
-        if isinstance(key, pdarray) and key.dtype == self.index.dtype:
-            new_keys = key[in1d(key, self.index.values, invert=True)]
-            self._set_index(self.index.concat(Index(new_keys)))
-            for k in self._columns:
-                current_col = UserDict.__getitem__(self, k)
-                default_val = np.nan if current_col.dtype == akfloat64 else 0
-                new_col = concatenate(
-                    [current_col, full(len(new_keys), default_val, dtype=current_col.dtype)]
-                )
-                UserDict.__setitem__(self, k, new_col)
-
-            self.update_nrows()
-        else:
-            raise ValueError("Invalid key type for adding new rows")
-
-    def _add_column(self, key, dtype):
-        """
-        Adds a column to the DataFrame with the given key and dtype.
-        """
-        if key in self.columns:
-            raise ValueError(f"Column {key} already exists in DataFrame")
-        default_value = 0
-        if dtype == akfloat64:
-            default_value = np.nan
-        self[key] = full(len(self), default_value, dtype=dtype)
-
     def transfer(self, hostname, port):
         """
         Sends a DataFrame to a different Arkouda server.
@@ -1648,7 +1378,6 @@ def drop(
         if len(obj._columns) == 0:
             obj._set_index(None)
             obj._empty = True
-            obj.update_nrows()

         if not inplace:

@@ -1726,9 +1455,9 @@ def drop_duplicates(self, subset=None, keep="first"):
         if keep == "last":
             _segment_ends = concatenate([gp.segments[1:] - 1, array([gp.permutation.size - 1])])
-            return self.iloc[gp.permutation[_segment_ends]]
+            return self[gp.permutation[_segment_ends]]
         else:
-            return self.iloc[gp.permutation[gp.segments]]
+            return self[gp.permutation[gp.segments]]

     @property
     def size(self):
@@ -2033,6 +1762,7 @@ def reset_index(self, size: Optional[int] = None, inplace: bool = False) -> Unio
         """
         obj = self if inplace else self.copy()

+
         if not size:
             obj.update_nrows()
             obj._set_index(arange(obj._nrows))
@@ -2100,36 +1830,6 @@ def info(self):
             rows = " row"
         return "DataFrame([" + keystr + "], {:,}".format(self._nrows) + rows + ", " + str(mem) + ")"

-    def items(self):
-        """
-        Iterate over (column name, column) pairs.
-
-        Returns
-        -------
-        generator
-            A generator of (column name, column) pairs.
-
-        Examples
-        --------
-
-        >>> import arkouda as ak
-        >>> ak.connect()
-        >>> df = ak.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
-        >>> for key, value in df.items():
-        ...     print(key, value)
-        col1 [1 2]
-        col2 [3 4]
-        """
-        for key in self._columns:
-            elt = UserDict.__getitem__(self, key)
-            if isinstance(elt, Series):
-                elt = elt.values
-            yield key, elt
-
-    def values(self):
-        for key, elts in self.items():
-            yield elts
-
     def update_nrows(self):
         """
         Computes the number of rows on the arkouda server and updates the size parameter.
@@ -2139,7 +1839,7 @@ def update_nrows(self):
             if val is not None:
                 sizes.add(val.size)
         if len(sizes) > 1:
-            raise ValueError("Size mismatch in DataFrame columns: ", sizes, ".")
+            raise ValueError("Size mismatch in DataFrame columns.")
         if len(sizes) == 0:
             self._nrows = None
         else:
@@ -2453,13 +2153,13 @@ def append(self, other, ordered=True):
         tmp_data = {}
         for key in keylist:
             try:
-                tmp_data[key] = util_concatenate([self[key], other[key]], ordered=ordered).values
+                tmp_data[key] = util_concatenate([self[key], other[key]], ordered=ordered)
             except TypeError as e:
                 raise TypeError(
                     f"Incompatible types for column {key}: {type(self[key])} vs {type(other[key])}"
                 ) from e
         self.data = tmp_data
-        self._set_index(self.index.concat(other.index))
+
         # Clean up
         self.update_nrows()
         self.reset_index(inplace=True)
@@ -3044,7 +2744,7 @@ def to_pandas(self, datalimit=maxTransferBytes, retain_index=False):
         # Proceed with conversion if possible
         pandas_data = {}
         for key in self._columns:
-            val = self[key].values
+            val = self[key]
             try:
                 # in order for proper pandas functionality, SegArrays must be seen as 1d
                 # and therefore need to be converted to list
@@ -3805,15 +3505,15 @@ def argsort(self, key, ascending=True):
         if self._empty:
             return array([], dtype=akint64)
         if ascending:
-            return argsort(self.data[key])
+            return argsort(self[key])
         else:
-            if isinstance(self.data[key], pdarray) and self.data[key].dtype in (
+            if isinstance(self[key], pdarray) and self[key].dtype in (
                 akint64,
                 akfloat64,
             ):
-                return argsort(-self.data[key])
+                return argsort(-self[key])
            else:
-                return argsort(self.data[key])[arange(self._nrows - 1, -1, -1)]
+                return argsort(self[key])[arange(self._nrows - 1, -1, -1)]

     def coargsort(self, keys, ascending=True):
         """
@@ -3858,7 +3558,7 @@ def coargsort(self, keys, ascending=True):
             return array([], dtype=akint64)
         arrays = []
         for key in keys:
-            arrays.append(self[key].values)
+            arrays.append(self[key])
         i = coargsort(arrays)
         if not ascending:
             i = i[arange(self._nrows - 1, -1, -1)]
@@ -3872,7 +3572,7 @@ def _reindex(self, idx):
         else:
             new_index = Index(self.index[idx])

-        return DataFrame(self.iloc[idx], index=new_index)
+        return DataFrame(self[idx], index=new_index)

     def sort_index(self, ascending=True):
         """
@@ -3997,7 +3697,7 @@ def sort_values(self, by=None, ascending=True):
                 i = self.coargsort(by, ascending=ascending)
             else:
                 raise TypeError("Column name(s) must be str or list/tuple of str")
-        return self.iloc[i]
+        return self[i]

     def apply_permutation(self, perm):
         """
@@ -4202,16 +3902,20 @@ def copy(self, deep=True):
         +----+--------+--------+
         """
+
         if deep:
             res = DataFrame()
             res._size = self._nrows
             res._bytes = self._bytes
             res._empty = self._empty
+            res._columns = self._columns[:]  # if this is not a slice, droping columns modifies both
+
+            for key, val in self.items():
+                res[key] = val[:]

-            for col in self._columns:
-                res[col] = self[col].iloc[:]
             # if this is not a slice, renaming indexes with update both
             res._set_index(Index(self.index.index[:]))
+
             return res
         else:
             return DataFrame(self)
@@ -4519,9 +4223,9 @@ def count(self, axis: Union[int, str] = 0, numeric_only=False) -> Series:
             index_values_list = []
             count_values_list = []
             for col in self.columns:
-                if is_numeric(self[col].values):
+                if is_numeric(self[col]):
                     index_values_list.append(col)
-                    count_values_list.append((~isnan(self[col].values)).sum())
+                    count_values_list.append((~isnan(self[col])).sum())
                 elif not numeric_only or self[col].dtype == bool:
                     index_values_list.append(col)
                     # Non-numeric columns do not have NaN values.
@@ -4531,12 +4235,12 @@ def count(self, axis: Union[int, str] = 0, numeric_only=False) -> Series:
             first = True
             count_values = arange(0)
             for col in self.columns:
-                if is_numeric(self[col].values):
+                if is_numeric(self[col]):
                     if first:
-                        count_values = akcast(~isnan(self[col].values), dt="int64")
+                        count_values = akcast(~isnan(self[col]), dt="int64")
                         first = False
                     else:
-                        count_values += ~isnan(self[col].values)
+                        count_values += ~isnan(self[col])
                 elif not numeric_only or self[col].dtype == bool:
                     if first:
                         count_values = full(self.index.size, 1, dtype=akint64)
@@ -4611,7 +4315,7 @@ def numeric_help(d):
         args = {
             "size": len(self.columns.values),
             "columns": self.columns.values,
-            "data_names": [numeric_help(self[c].values) for c in self.columns.values],
+            "data_names": [numeric_help(self[c]) for c in self.columns.values],
         }

         ret_dict = json.loads(generic_msg(cmd="corrMatrix", args=args))
@@ -4828,8 +4532,8 @@ def isna(self) -> DataFrame:
         from arkouda.util import is_numeric

         def is_nan_col(col: str):
-            if is_numeric(self[col].values):
-                return isnan(self[col].values)
+            if is_numeric(self[col]):
+                return isnan(self[col])
             else:
                 return full(self.shape[0], False, dtype=akbool)
@@ -4883,8 +4587,8 @@ def notna(self) -> DataFrame:
         from arkouda.util import is_numeric

         def not_nan_col(col: str):
-            if is_numeric(self[col].values):
-                return ~isnan(self[col].values)
+            if is_numeric(self[col]):
+                return ~isnan(self[col])
             else:
                 return full(self.shape[0], True, dtype=akbool)
@@ -4969,7 +4673,7 @@ def any(self, axis=0) -> Union[Series, bool]:
         bool_cols = [col for col in self.columns.values if self.dtypes[col] == "bool"]
         if (isinstance(axis, int) and axis == 0) or (isinstance(axis, str) and axis == "index"):
             return Series(
-                array([akany(self[col].values) for col in bool_cols]),
+                array([akany(self[col]) for col in bool_cols]),
                 index=Index(bool_cols),
             )
         elif (isinstance(axis, int) and axis == 1) or (isinstance(axis, str) and axis == "columns"):
@@ -4985,7 +4689,7 @@ def any(self, axis=0) -> Union[Series, bool]:
                 mask = full(self.shape[0], False, dtype=bool)
                 return Series(mask, index=self.index.values[:])
         elif axis is None:
-            return any([akany(self[col].values) for col in bool_cols])
+            return any([akany(self[col]) for col in bool_cols])
         else:
             raise ValueError("axis must have value 0, 1, 'index', 'columns', or None.")
@@ -5066,7 +4770,7 @@ def all(self, axis=0) -> Union[Series, bool]:
         bool_cols = [col for col in self.columns.values if self.dtypes[col] == "bool"]
         if (isinstance(axis, int) and axis == 0) or (isinstance(axis, str) and axis == "index"):
             return Series(
-                array([akall(self[col].values) for col in bool_cols]),
+                array([akall(self[col]) for col in bool_cols]),
                 index=Index(bool_cols),
             )
         elif (isinstance(axis, int) and axis == 1) or (isinstance(axis, str) and axis == "columns"):
@@ -5083,7 +4787,7 @@ def all(self, axis=0) -> Union[Series, bool]:
                 return Series(mask, index=self.index.values[:])

         elif axis is None:
-            return all([akall(self[col].values) for col in bool_cols])
+            return all([akall(self[col]) for col in bool_cols])
         else:
             raise ValueError("axis must have value 0, 1, 'index', 'columns', or None.")
@@ -5247,7 +4951,7 @@ def dropna(
         if isinstance(mask, Series):
             for col, truth in zip(mask.index.values.to_list(), mask.values.to_list()):
                 if truth is True:
-                    result[col] = self[col].values[:]
+                    result[col] = self[col][:]

         if ignore_index is True and result.empty is False:
             result = result.reset_index()
@@ -5625,341 +5329,6 @@ def from_return_msg(cls, rep_msg):

         return cls(columns, idx)

-class _LocIndexer:
-    def __init__(self, df):
-        self.df = df
-
-    def __getitem__(self, key):
-        if isinstance(key, tuple) and len(key) == 2:
-            return self._get_row_col(key[0], key[1])
-        if isinstance(key, list):
-            key = array(key)
-        if isinstance(key, Series):
-            key = key.values
-        if is_supported_scalar(key) and self.df.index.dtype == dtype(type(key)):
-            return self.df._get_rows(indexof1d(array([key]), self.df.index.values))
-
-        if isinstance(key, pdarray) and key.dtype == self.df.index.dtype:
-            return self.df._get_rows(indexof1d(key, self.df.index.values))
-
-        if isinstance(key, slice):
-            if key.start is not None and akfind(array([key.start]), self.df.index.values)[0] == -1:
-                raise KeyError(f"Index {key.start} not found in DataFrame index")
-            if key.stop is not None and akfind(array([key.stop]), self.df.index.values)[0] == -1:
-                raise KeyError(f"Index {key.stop} not found in DataFrame index")
-
-            start_idx = (
-                akfind(array([key.start]), self.df.index.values)[0] if key.start is not None else 0
-            )
-            stop_idx = (
-                akfind(array([key.stop]), self.df.index.values)[0] + 1
-                if key.stop is not None
-                else self.df.index.size
-            )
-
-            indices = arange(start_idx, stop_idx)
-            return self.df._get_rows(indices)
-
-        if isinstance(key, pdarray) and key.dtype == akbool:
-            return self.df._get_rows(key)
-
-        return None
-
-    def _get_row_col(self, row_key, col_key):
-        return self[row_key][col_key]
-
-    def __setitem__(self, key, val):
-        if isinstance(key, tuple) and len(key) == 2:
-            self._set_row_col(key[0], key[1], val)
-            return
-        else:
-            raise ValueError(
-                "Invalid key type. '.loc' indexing only supports keys with row and column selectors."
- ) - - def _set_row_col(self, row_key, col_key, val): - if isinstance(row_key, list): - row_key = array(row_key) - if isinstance(row_key, Series): - row_key = row_key.values - if is_supported_scalar(col_key) and col_key not in self.df.columns.values: - self.df._add_column(col_key, dtype(type(val))) - - if is_supported_scalar(val): - return self._set_row_col_scalar_val(row_key, col_key, val) - else: - assert isinstance(val, (pdarray, Series, Strings, SegArray)), "Invalid value type" - return self._set_row_col_vector_val(row_key, col_key, val) - - def _set_row_col_scalar_val(self, row_key, col_key, val): - if is_supported_scalar(row_key): - if not self.df.index.dtype == dtype(type(row_key)): - raise TypeError("Row key must be of the same type as the DataFrame index") - if akfind(array([row_key]), self.df.index.values)[0] == -1: - self.df._add_new_rows(row_key) - # updating a single row - row_idx = indexof1d(array([row_key]), self.df.index.values) - if row_idx.size == 0: - raise ValueError(f"Index {row_key} not found in DataFrame index") - - self.df.data[col_key][row_idx] = val - - if isinstance(row_key, pdarray) and row_key.dtype == self.df.index.dtype: - if akany(in1d(row_key, self.df.index.values, invert=True)): - self.df._add_new_rows(row_key) - # updating multiple rows - row_idx = indexof1d(row_key, self.df.index.values) - self.df.data[col_key][row_idx] = val - - if isinstance(row_key, pdarray) and row_key.dtype == akbool: - self.df.data[col_key][row_key] = val - if isinstance(row_key, slice): - if ( - row_key.start is not None - and akfind(array([row_key.start]), self.df.index.values)[0] == -1 - ): - raise KeyError(f"Index {row_key.start} not found in DataFrame index") - if row_key.stop is not None and akfind(array([row_key.stop]), self.df.index.values)[0] == -1: - raise KeyError(f"Index {row_key.stop} not found in DataFrame index") - - start_idx = ( - akfind(array([row_key.start]), self.df.index.values)[0] - if row_key.start is not None - else 0 - ) - stop_idx = ( - akfind(array([row_key.stop]), self.df.index.values)[0] + 1 - if row_key.stop is not None - else self.df.index.size - ) - indices = arange(start_idx, stop_idx) - self.df.data[col_key][indices] = val - return None - - def _set_row_col_vector_val(self, row_key, col_key, val): - if isinstance(val, Series): - aligned_indices = indexof1d(val.index.values, self.df.index.values) - self.df.data[col_key][aligned_indices] = val.values - return - if isinstance(row_key, pdarray) and row_key.dtype == self.df.index.dtype: - if akany(in1d(row_key, self.df.index.values, invert=True)): - self.df._add_new_rows(row_key) - # updating multiple rows - row_idx = indexof1d(row_key, self.df.index.values) - self.df.data[col_key][row_idx] = val - if isinstance(row_key, slice): - if ( - row_key.start is not None - and akfind(array([row_key.start]), self.df.index.values)[0] == -1 - ): - raise ValueError(f"Index {row_key.start} not found in DataFrame index") - if row_key.stop is not None and akfind(array([row_key.stop]), self.df.index.values)[0] == -1: - raise ValueError(f"Index {row_key.stop} not found in DataFrame index") - - start_idx = ( - akfind(array([row_key.start]), self.df.index.values)[0] - if row_key.start is not None - else 0 - ) - # should the below have + 1 like the other stop_idxs? 
- stop_idx = ( - akfind(array([row_key.stop]), self.df.index.values)[0] - if row_key.stop is not None - else self.df.index.size - ) - - indices = arange(start_idx, stop_idx) - self.df.data[col_key][indices] = val - return None - - -class _ILocIndexer: - def __init__(self, df): - self.df = df - - def __getitem__(self, key): - if isinstance(key, tuple) and len(key) == 2: - return self._get_row_col(key[0], key[1]) - if isinstance(key, list): - key = array(key) - if isinstance(key, Series): - key = key.values - - if is_supported_scalar(key): - if not isinstance(key, int): - raise TypeError("iloc key must be an integer") - if key >= len(self.df) or key < -len(self.df): - raise IndexError("Index out of range") - return self.df._get_rows(array([key])) - - if isinstance(key, pdarray): - if key.dtype == akint64: - if akany(key < -len(self.df)) or akany(key >= len(self.df)): - raise IndexError("Index out of range") - return self.df._get_rows(key) - if key.dtype == akbool: - if key.size != self.df.index.size: - raise IndexError("Boolean array must be the same size as the DataFrame index") - return self.df._get_rows(key) - raise TypeError("Invalid dtype for iloc key, must be int or bool: {}".format(key.dtype)) - - if isinstance(key, slice): - if key.start is not None and not isinstance(key.start, int): - raise TypeError("Start of slice must be an integer") - if key.stop is not None and not isinstance(key.stop, int): - raise TypeError("Stop of slice must be an integer") - if key.step is not None and not isinstance(key.step, int): - raise TypeError("Step of slice must be an integer") - start = key.start if key.start is not None else 0 - stop = key.stop if key.stop is not None else self.df.index.size - step = key.step if key.step is not None else 1 - if start < 0 or start >= len(self.df) or stop < 0 or stop > len(self.df) or step <= 0: - raise IndexError("Slice index out of range") - return self.df._get_rows(arange(start, stop, step)) - - raise TypeError("Invalid iloc key: {}".format(key)) - - def _get_row_col(self, row_key, col_key): - row_indexed = self[row_key] - - if isinstance(col_key, list): - col_key = array(col_key) - if isinstance(col_key, Series): - col_key = col_key.values - - if isinstance(row_indexed, DataFrame): - if isinstance(col_key, int): - column_name = row_indexed.columns[col_key] - column = row_indexed[column_name] - if len(column) == 1: - return column.values[0] - else: - return column - - if isinstance(col_key, pdarray): - column_array = array(row_indexed.columns) - if col_key.dtype == akbool or col_key.dtype == akint64: - return row_indexed[column_array[col_key]] - raise ValueError( - "Invalid dtype for iloc key, must be int or bool: {}".format(col_key.dtype) - ) - - def __setitem__(self, key, val): - if isinstance(key, tuple) and len(key) == 2: - self._set_row_col(key[0], key[1], val) - return - else: - raise ValueError( - "Invalid key type. '.iloc' indexing only supports keys with row and column selectors." 
- ) - - def _set_row_col(self, row_key, col_key, val): - if isinstance(row_key, list): - row_key = array(row_key) - if isinstance(row_key, Series): - row_key = row_key.values - - # Only supports setting a single column at a time - if not isinstance(col_key, int): - raise ValueError("Column key must be an integer") - if col_key >= len(self.df.columns) or col_key < -len(self.df.columns): - raise IndexError("Index out of range") - - row_indices = None - - if is_supported_scalar(row_key): - if not isinstance(row_key, int): - raise ValueError("Row key must be an integer") - if row_key >= len(self.df) or row_key < -len(self.df): - raise IndexError("Index out of range") - row_indices = array([row_key]) - elif isinstance(row_key, pdarray): - if row_key.dtype == akint64: - if akany(row_key < -len(self.df)) or akany(row_key >= len(self.df)): - raise IndexError("Index out of range") - row_indices = row_key - elif row_key.dtype == akbool: - if row_key.size != self.df.index.size: - raise IndexError("Boolean array must be the same size as the DataFrame index") - row_indices = row_key - else: - raise ValueError( - "Invalid dtype for iloc key, must be int or bool: {}".format(row_key.dtype) - ) - elif isinstance(row_key, slice): - if row_key.start is not None and not isinstance(row_key.start, int): - raise ValueError("Start of slice must be an integer") - if row_key.stop is not None and not isinstance(row_key.stop, int): - raise ValueError("Stop of slice must be an integer") - if row_key.step is not None and not isinstance(row_key.step, int): - raise ValueError("Step of slice must be an integer") - start = row_key.start if row_key.start is not None else 0 - stop = row_key.stop if row_key.stop is not None else self.df.index.size - step = row_key.step if row_key.step is not None else 1 - row_indices = arange(start, stop, step) - else: - raise TypeError("invalid row key type: {}".format(type(row_key))) - - if is_supported_scalar(val): - self.df.data[self.df.columns[col_key]][row_indices] = val - elif isinstance(val, pdarray): - if val.size != len(row_indices): - raise ValueError("Value array must be the same size as the row indices") - self.df.data[self.df.columns[col_key]][row_indices] = val - else: - raise ValueError("Invalid value type: {}".format(type(val))) - - -class AtIndexer: - def __init__(self, df) -> None: - self.df = df - - def __getitem__(self, key): - if not isinstance(key, tuple) or len(key) != 2: - raise ValueError(".at requires a row key and a column key") - (row, col) = key - if not is_supported_scalar(row): - raise ValueError(".at only supports scalar row keys") - if not is_supported_scalar(col): - raise ValueError(".at only supports scalar column keys") - return self.df.loc[row, col] - - def __setitem__(self, key, val): - if not isinstance(key, tuple) or len(key) != 2: - raise ValueError(".at requires a row key and a column key") - (row, col) = key - if not is_supported_scalar(row): - raise ValueError(".at only supports scalar row keys") - if not is_supported_scalar(col): - raise ValueError(".at only supports scalar column keys") - self.df.loc[row, col] = val - - -class IAtIndexer: - def __init__(self, df) -> None: - self.df = df - - def __getitem__(self, key): - if not isinstance(key, tuple) or len(key) != 2: - raise ValueError(".iat requires a row key and a column key") - (row, col) = key - if not isinstance(row, int): - raise ValueError(".iat requires integer row keys") - if not isinstance(col, int): - raise ValueError(".iat requires integer column keys") - return self.df.iloc[row, col] 
- - def __setitem__(self, key, val): - if not isinstance(key, tuple) or len(key) != 2: - raise ValueError(".iat requires a row key and a column key") - (row, col) = key - if not is_supported_scalar(row): - raise ValueError(".iat requires integer row keys") - if not is_supported_scalar(col): - raise ValueError(".iat requires integer column keys") - self.df.iloc[row, col] = val - - def intx(a, b): """ Find all the rows that are in both dataframes. @@ -6256,22 +5625,20 @@ def _inner_join_merge( """ left_cols, right_cols = left.columns.values.copy(), right.columns.values.copy() if isinstance(on, str): - left_inds, right_inds = inner_join(left[on].values, right[on].values) - new_dict = {on: left[on].iloc[left_inds]} + left_inds, right_inds = inner_join(left[on], right[on]) + new_dict = {on: left[on][left_inds]} left_cols.remove(on) right_cols.remove(on) else: - left_inds, right_inds = inner_join( - [left[col].values for col in on], [right[col].values for col in on] - ) - new_dict = {col: left[col].iloc[left_inds] for col in on} + left_inds, right_inds = inner_join([left[col] for col in on], [right[col] for col in on]) + new_dict = {col: left[col][left_inds] for col in on} for col in on: left_cols.remove(col) right_cols.remove(col) for col in left_cols: new_col = col + left_suffix if col in col_intersect else col - new_dict[new_col] = left[col].iloc[left_inds] + new_dict[new_col] = left[col][left_inds] for col in right_cols: new_col = col + right_suffix if col in col_intersect else col new_dict[new_col] = right[col][right_inds] @@ -6329,13 +5696,13 @@ def _right_join_merge( in_left = _inner_join_merge(left, right, on, col_intersect, left_suffix, right_suffix, sort=False) in_left_cols, left_cols = in_left.columns.values.copy(), left.columns.values.copy() if isinstance(on, str): - left_at_on = left[on].values - right_at_on = right[on].values + left_at_on = left[on] + right_at_on = right[on] left_cols.remove(on) in_left_cols.remove(on) else: - left_at_on = [left[col].values for col in on] - right_at_on = [right[col].values for col in on] + left_at_on = [left[col] for col in on] + right_at_on = [right[col] for col in on] for col in on: left_cols.remove(col) in_left_cols.remove(col) @@ -6349,10 +5716,10 @@ def _right_join_merge( nan_cols = list(set(in_left) - set(not_in_left)) for col in nan_cols: if convert_ints is True and in_left[col].dtype == int: - in_left[col] = akcast(in_left[col].values, akfloat64) + in_left[col] = akcast(in_left[col], akfloat64) # Create a nan array for all values not in the left df - not_in_left[col] = __nulls_like(in_left[col].values, len(not_in_left)) + not_in_left[col] = __nulls_like(in_left[col], len(not_in_left)) ret_df = DataFrame.append(in_left, not_in_left) if sort is True: ret_df = ret_df.sort_values(on).reset_index() @@ -6409,14 +5776,14 @@ def _outer_join_merge( ) if isinstance(on, str): - left_at_on = left[on].values - right_at_on = right[on].values + left_at_on = left[on] + right_at_on = right[on] left_cols.remove(on) right_cols.remove(on) else: - left_at_on = [left[col].values for col in on] - right_at_on = [right[col].values for col in on] + left_at_on = [left[col] for col in on] + right_at_on = [right[col] for col in on] for col in on: left_cols.remove(col) right_cols.remove(col) @@ -6438,25 +5805,25 @@ def _outer_join_merge( for col in set(left_nan_cols).union(set(right_nan_cols)): if convert_ints is True and inner[col].dtype == int: - inner[col] = akcast(inner[col].values, akfloat64) + inner[col] = akcast(inner[col], akfloat64) if col in left_nan_cols: if 
convert_ints is True and not_in_right[col].dtype == int: - not_in_right[col] = akcast(not_in_right[col].values, akfloat64) + not_in_right[col] = akcast(not_in_right[col], akfloat64) elif col in not_in_left.columns.values: - not_in_right[col] = akcast(not_in_right[col].values, not_in_left[col].dtype) + not_in_right[col] = akcast(not_in_right[col], not_in_left[col].dtype) if col in right_nan_cols: if convert_ints is True and not_in_left[col].dtype == int: - not_in_left[col] = akcast(not_in_left[col].values, akfloat64) + not_in_left[col] = akcast(not_in_left[col], akfloat64) elif col in not_in_right.columns.values: - not_in_left[col] = akcast(not_in_left[col].values, not_in_right[col].dtype) + not_in_left[col] = akcast(not_in_left[col], not_in_right[col].dtype) for col in left_nan_cols: # Create a nan array for all values not in the left df - not_in_left[col] = __nulls_like(inner[col].values, len(not_in_left)) + not_in_left[col] = __nulls_like(inner[col], len(not_in_left)) for col in right_nan_cols: # Create a nan array for all values not in the left df - not_in_right[col] = __nulls_like(inner[col].values, len(not_in_right)) + not_in_right[col] = __nulls_like(inner[col], len(not_in_right)) ret_df = DataFrame.append(DataFrame.append(inner, not_in_left), not_in_right) if sort is True: @@ -6654,8 +6021,7 @@ def merge( if not isinstance(on, str): if not all( - isinstance(left[col].values, (pdarray, Strings)) - and isinstance(right[col].values, (pdarray, Strings)) + isinstance(left[col], (pdarray, Strings)) and isinstance(right[col], (pdarray, Strings)) for col in on ): raise ValueError("All columns of a multi-column merge must be pdarrays") diff --git a/tests/dataframe_test.py b/tests/dataframe_test.py index bd7bb687da..8366ca4b4d 100644 --- a/tests/dataframe_test.py +++ b/tests/dataframe_test.py @@ -204,13 +204,13 @@ def test_convenience_init(self): for df in dict_dfs + lists_dfs: self.assertTrue(isinstance(df, ak.DataFrame)) - self.assertTrue(isinstance(df["0"].values, ak.pdarray)) + self.assertTrue(isinstance(df["0"], ak.pdarray)) self.assertEqual(df["0"].dtype, int) - self.assertTrue(isinstance(df["1"].values, ak.pdarray)) + self.assertTrue(isinstance(df["1"], ak.pdarray)) self.assertEqual(df["1"].dtype, bool) - self.assertTrue(isinstance(df["2"].values, ak.Strings)) + self.assertTrue(isinstance(df["2"], ak.Strings)) self.assertEqual(df["2"].dtype, str) - self.assertTrue(isinstance(df["3"].values, ak.pdarray)) + self.assertTrue(isinstance(df["3"], ak.pdarray)) self.assertEqual(df["3"].dtype, float) def test_column_init(self): @@ -240,6 +240,20 @@ def test_boolean_indexing(self): self.assertEqual(len(row), 1) self.assertTrue(ref_df[ref_df["userName"] == "Carol"].equals(row.to_pandas(retain_index=True))) + def test_column_indexing(self): + df = build_ak_df() + self.assertTrue(isinstance(df.userName, ak.Series)) + self.assertTrue(isinstance(df.userID, ak.Series)) + self.assertTrue(isinstance(df.item, ak.Series)) + self.assertTrue(isinstance(df.day, ak.Series)) + self.assertTrue(isinstance(df.amount, ak.Series)) + self.assertTrue(isinstance(df.bi, ak.Series)) + for col in ("userName", "userID", "item", "day", "amount", "bi"): + self.assertTrue(isinstance(df[col], (ak.pdarray, ak.Strings, ak.Categorical))) + self.assertTrue(isinstance(df[["userName", "amount", "bi"]], ak.DataFrame)) + self.assertTrue(isinstance(df[("userID", "item", "day", "bi")], ak.DataFrame)) + self.assertTrue(isinstance(df.index, ak.Index)) + def test_dtype_prop(self): str_arr = ak.array( 
["".join(random.choices(string.ascii_letters + string.digits, k=5)) for _ in range(3)] @@ -344,7 +358,7 @@ def test_shape(self): def test_reset_index(self): df = build_ak_df() - slice_df = df.iloc[ak.array([1, 3, 5])] + slice_df = df[ak.array([1, 3, 5])] self.assertListEqual(slice_df.index.to_list(), [1, 3, 5]) df_reset = slice_df.reset_index() @@ -425,7 +439,6 @@ def test_concat(self): glued = ak.DataFrame.concat([df, df_toappend]) ref_df = build_pd_df_append() - assert_frame_equal(ref_df, glued.to_pandas()) # dataframe equality returns series with bool result for each row. self.assertTrue(ref_df.equals(glued.to_pandas())) @@ -687,7 +700,7 @@ def test_to_pandas(self): self.assertTrue(pd_df.equals(df.to_pandas())) - slice_df = df.iloc[ak.array([1, 3, 5])] + slice_df = df[ak.array([1, 3, 5])] pd_df = slice_df.to_pandas(retain_index=True) self.assertEqual(pd_df.index.tolist(), [1, 3, 5]) @@ -765,7 +778,7 @@ def test_sort_index(self): "t", ] ) - ak_df["negs"] = -1 * ak_df["int64"].values + ak_df["negs"] = -1 * ak_df["int64"] group_bys = [ "gb_id", @@ -811,7 +824,8 @@ def test_apply_perm(self): perm_list = [0, 3, 1, 5, 4, 2] default_perm = ak.array(perm_list) ord.apply_permutation(default_perm) - ord_ref = ref_df.sort_values(by="userID") + + ord_ref = ref_df.sort_values(by="userID").reset_index(drop=True) ord_ref = ord_ref.reindex(perm_list).reset_index(drop=True) self.assertTrue(ord_ref.equals(ord.to_pandas())) @@ -1022,7 +1036,7 @@ def test_ipv4_columns(self): # test replacement of IPv4 with uint representation df = ak.DataFrame({"a": ak.IPv4(ak.arange(10))}) - df["a"] = df["a"].values.export_uint() + df["a"] = df["a"].export_uint() self.assertListEqual(ak.arange(10).to_list(), df["a"].to_list()) def test_subset(self): @@ -1461,7 +1475,7 @@ def test_multi_col_merge(self): for col in sorted_columns: from_ak = ak_merge[col].to_ndarray() from_pd = pd_merge[col].to_numpy() - if isinstance(ak_merge[col].values, ak.pdarray): + if isinstance(ak_merge[col], ak.pdarray): self.assertTrue( np.allclose(np.sort(from_ak), np.sort(from_pd), equal_nan=True) ) @@ -1531,7 +1545,7 @@ def test_sample_hypothesis_testing(self): weighted_sample = g.sample(n=num_samples, replace=True, weights=weights, random_state=rng) # count how many of each category we saw - uk, f_obs = ak.GroupBy(weighted_sample["vals"].values).size() + uk, f_obs = ak.GroupBy(weighted_sample["vals"]).size() # I think the keys should always be sorted but just in case if not ak.is_sorted(uk): @@ -1599,649 +1613,6 @@ def test_sample_flags(self): print(f"Failure with seed:\n{seed}") self.assertTrue(res) - def make_dfs_and_refs(self): - ints = [0, 2, 3, 7, 3] - floats = [0.0, 1.5, 0.5, 1.5, -1.0] - strings = ["A", "C", "C", "DE", "Z"] - - unordered_index = [9, 3, 0, 23, 3] - string_index = ["one", "two", "three", "four", "five"] - - # default index - df1 = ak.DataFrame( - {"ints": ak.array(ints), "floats": ak.array(floats), "strings": ak.array(strings)} - ) - _df1 = pd.DataFrame( - {"ints": np.array(ints), "floats": np.array(floats), "strings": np.array(strings)} - ) - - # unorderd index, integer labels - df2 = ak.DataFrame( - {1: ak.array(ints), 2: ak.array(floats), 3: ak.array(strings)}, index=unordered_index - ) - _df2 = pd.DataFrame( - {1: np.array(ints), 2: np.array(floats), 3: np.array(strings)}, index=unordered_index - ) - - # string index - df3 = ak.DataFrame( - {"ints": ak.array(ints), "floats": ak.array(floats), "strings": ak.array(strings)}, - index=string_index, - ) - _df3 = pd.DataFrame( - {"ints": np.array(ints), "floats": 
np.array(floats), "strings": np.array(strings)}, - index=string_index, - ) - - return (df1, _df1, df2, _df2, df3, _df3) - - def test_getitem_scalars_and_slice(self): - default_index = [0, 1, 2, 3, 4] - unordered_index = [9, 3, 0, 23, 3] - string_index = ["one", "two", "three", "four", "five"] - - ints = [0, 2, 3, 7, 3] - floats = [0.0, 1.5, 0.5, 1.5, -1.0] - strings = ["A", "C", "C", "DE", "Z"] - - # group 1: string labels - df1, _df1, df2, _df2, df3, _df3 = self.make_dfs_and_refs() - - string_keys = ["ints", "floats", "strings"] - int_keys = [1, 2, 3] - - dfs = [df1, df2, df3] - _dfs = [_df1, _df2, _df3] - keys_list = [string_keys, int_keys, string_keys] - indexes = [default_index, unordered_index, string_index] - for df, _df, keys, index in zip(dfs, _dfs, keys_list, indexes): - # single column label returns a series - for key in keys: - access1_ = _df[key] - access1 = df[key] - self.assertIsInstance(access1_, pd.Series) - self.assertIsInstance(access1, ak.Series) - self.assertListEqual(access1_.values.tolist(), access1.values.to_list()) - self.assertListEqual(access1_.index.tolist(), access1.index.to_list()) - - # matching behavior for nonexistant label - with self.assertRaises(KeyError): - _access2 = _df[keys[0] * 100] - with self.assertRaises(KeyError): - access2 = df[keys[0] * 100] - - # result reference behavior - _access3 = _df[keys[0]] - access3 = df[keys[0]] - access3[index[0]] = 100 - _access3[index[0]] = 100 - self.assertEqual(_df[keys[0]][index[0]], df[keys[0]][index[0]]) - - # key type matches column label types - with self.assertRaises(TypeError): - if isinstance(keys[0], int): - a = df["int"] - else: - a = df[3] - with self.assertRaises(TypeError): - b = df[1.0] - - # slice both bounds - _slice_access = _df1[1:4] - slice_access = df1[1:4] - assert_frame_equal(_slice_access, slice_access.to_pandas(retain_index=True)) - - # slice high bound - _slice_access = _df1[:3] - slice_access = df1[:3] - assert_frame_equal(_slice_access, slice_access.to_pandas(retain_index=True)) - - # slice low bound - _slice_access = _df1[3:] - slice_access = df1[3:] - assert_frame_equal(_slice_access, slice_access.to_pandas(retain_index=True)) - - # slice no bounds - _slice_access = _df1[:] - slice_access = df1[:] - assert_frame_equal(_slice_access, slice_access.to_pandas(retain_index=True)) - - _d = pd.DataFrame( - {"ints": np.array(ints), "floats": np.array(floats), "strings": np.array(strings)}, - index=[0, 2, 5, 1, 5], - ) - _a = _d[1:4] - d = ak.DataFrame( - {"ints": ak.array(ints), "floats": ak.array(floats), "strings": ak.array(strings)}, - index=ak.array([0, 2, 5, 1, 5]), - ) - a = d[1:4] - assert_frame_equal(_a, a.to_pandas(retain_index=True)) - - # priority when same index and label types - df2 = ak.DataFrame( - {"A": ak.array(ints), "floats": ak.array(floats), "strings": ak.array(strings)}, - index=ak.array(strings), - ) - _df2 = pd.DataFrame( - {"A": pd.array(ints), "floats": pd.array(floats), "strings": pd.array(strings)}, - index=pd.array(strings), - ) - - access4 = df2["A"] - _access4 = _df2["A"] - self.assertIsInstance(_access4, pd.Series) - self.assertIsInstance(access4, ak.Series) - # arkouda to_pandas creates a list of objects for the index rather than a list of strings - self.assertListEqual(_access4.values.tolist(), access4.values.to_list()) - self.assertListEqual(_access4.index.tolist(), access4.index.to_list()) - - def test_getitem_vectors(self): - (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs() - - # multiple columns - _access1 = _df1[["ints", "floats"]] - access1 
= df1[["ints", "floats"]] - assert_frame_equal(_access1, access1.to_pandas(retain_index=True)) - - _access2 = _df1[np.array(["ints", "floats"])] - access2 = df1[ak.array(["ints", "floats"])] - assert_frame_equal(_access2, access2.to_pandas(retain_index=True)) - - # boolean mask - _access3 = _df1[_df1["ints"] == 3] - access3 = df1[df1["ints"] == 3] - assert_frame_equal(_access3, access3.to_pandas(retain_index=True)) - - # boolean mask of incorrect length - bad = [True, True, False, False] - with self.assertRaises(ValueError): - _df1[np.array(bad)] - with self.assertRaises(ValueError): - df1[ak.array(bad)] - - # one key present one missing - with self.assertRaises(KeyError): - _access4 = _df1[["ints", "not"]] - with self.assertRaises(KeyError): - access4 = df1[["ints", "not"]] - - # repeated index - - _access5 = _df2[[1, 2]] - access5 = df2[[1, 2]] - assert_frame_equal(_access5, access5.to_pandas(retain_index=True)) - - # arg order - _access6 = _df2[[2, 1]] - access6 = df2[[2, 1]] - assert_frame_equal(_access6, access6.to_pandas(retain_index=True)) - - def test_setitem_scalars(self): - (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs() - - # add new column - new_ints = [8, 9, -10, 8, 12] - _df1["new"] = np.array(new_ints) - df1["new"] = ak.array(new_ints) - assert_frame_equal(_df1, df1.to_pandas(retain_index=True)) - - # modify existing column - _df1["ints"] = np.array([1, 2, 3, 4, 5]) - df1["ints"] = ak.array([1, 2, 3, 4, 5]) - assert_frame_equal(_df1, df1.to_pandas(retain_index=True)) - - # setting scalar value - _df1["ints"] = 100 - df1["ints"] = 100 - - # indexing with boolean mask, array value - _df1[_df1["ints"] == 100]["ints"] = np.array([1, 2, 3, 4, 5]) - df1[df1["ints"] == 100]["ints"] = ak.array([1, 2, 3, 4, 5]) - assert_frame_equal(_df1, df1.to_pandas(retain_index=True)) - - # indexing with boolean mask, array value, incorrect length - with self.assertRaises(ValueError): - _df1[np.array([True, True, False, False, False])]["ints"] = np.array([1, 2, 3, 4]) - with self.assertRaises(ValueError): - df1[ak.array([True, True, False, False, False])]["ints"] = ak.array([1, 2, 3, 4]) - - # incorrect column index type - with self.assertRaises(TypeError): - df1[1] = ak.array([1, 2, 3, 4, 5]) - - # integer column labels, integer index labels - # add new column - new_ints = [8, 9, -10, 8, 12] - - _df2[4] = np.array(new_ints) - df2[4] = ak.array(new_ints) - assert_frame_equal(_df2, df2.to_pandas(retain_index=True)) - - # modify existing column - _df2[1] = np.array([1, 2, 3, 4, 5]) - df2[1] = ak.array([1, 2, 3, 4, 5]) - assert_frame_equal(_df2, df2.to_pandas(retain_index=True)) - - # indexing with boolean mask, scalar value - _df2[_df2[1] == 3][1] = 101 - df2[df2[1] == 3][1] = 101 - assert_frame_equal(_df2, df2.to_pandas(retain_index=True)) - - # setting to scalar value - _df2[1] = 100 - df2[1] = 100 - assert_frame_equal(_df2, df2.to_pandas(retain_index=True)) - - # indexing with boolean mask, array value - _df2[_df2[1] == 100][1] = np.array([1, 2, 3, 4, 5]) - df2[df2[1] == 100][1] = ak.array([1, 2, 3, 4, 5]) - assert_frame_equal(_df2, df2.to_pandas(retain_index=True)) - - # indexing with boolean mask, array value, incorrect length - with self.assertRaises(ValueError): - _df2[np.array([True, True, False, False, False])][1] = np.array([1, 2, 3, 4]) - with self.assertRaises(ValueError): - df2[ak.array([True, True, False, False, False])][1] = ak.array([1, 2, 3, 4]) - - # incorrect column index type - with self.assertRaises(TypeError): - df2["new column"] = ak.array([1, 2, 3, 4, 5]) - - def 
test_setitem_vectors(self): - ints = [0, 1, 3, 7, 3] - floats = [0.0, 1.5, 0.5, 1.5, -1.0] - strings = ["A", "C", "C", "DE", "Z"] - - ints2 = [8, 9, -10, 8, 12] - floats2 = [8.5, 5.0, 6.2, 1.2, 0.0] - strings2 = ["B", "D", "D", "EF", "Y"] - - _df = pd.DataFrame( - {"ints": np.array(ints), "floats": np.array(floats), "strings": np.array(strings)} - ) - df = ak.DataFrame( - {"ints": ak.array(ints), "floats": ak.array(floats), "strings": ak.array(strings)} - ) - - _df2 = pd.DataFrame( - {"ints": np.array(ints2), "floats": np.array(floats2), "strings": np.array(strings2)} - ) - df2 = ak.DataFrame( - {"ints": ak.array(ints2), "floats": ak.array(floats2), "strings": ak.array(strings2)} - ) - - # assignment of one dataframe access to another - _df[["ints", "floats"]] = _df2[["ints", "floats"]] - df[["ints", "floats"]] = df2[["ints", "floats"]] - assert_frame_equal(_df, df.to_pandas()) - - # new contents for dataframe being read - _df2["ints"] = np.array(ints) - df2["ints"] = ak.array(ints) - _df2["floats"] = np.array(floats) - df2["floats"] = ak.array(floats) - - # assignment of one dataframe access to another, different order - _df[["floats", "ints"]] = _df2[["floats", "ints"]] - df[["floats", "ints"]] = df2[["floats", "ints"]] - assert_frame_equal(_df, df.to_pandas()) - - # inserting multiple columns at once - _df[["new1", "new2"]] = _df2[["ints", "floats"]] - df[["new1", "new2"]] = df2[["ints", "floats"]] - assert_frame_equal(_df, df.to_pandas()) - - # reset values - _df2["ints"] = np.array(ints2) - df2["ints"] = ak.array(ints2) - _df2["floats"] = np.array(floats2) - df2["floats"] = ak.array(floats2) - - # boolean mask, accessing two columns - _df[_df["ints"] == 3][["ints", "floats"]] = _df2[0:2][["ints", "floats"]] - df[df["ints"] == 3][["ints", "floats"]] = df2[0:2][["ints", "floats"]] - assert_frame_equal(_df, df.to_pandas()) - - _df3 = pd.DataFrame({"ints": np.array(ints), "floats": np.array(floats)}) - df3 = ak.DataFrame({"ints": ak.array(ints), "floats": ak.array(floats)}) - _df4 = pd.DataFrame({"ints": np.array(ints2), "floats": np.array(floats2)}) - df4 = ak.DataFrame({"ints": ak.array(ints2), "floats": ak.array(floats2)}) - # boolean mask, assignment of dataframe - _df3[[True, True, False, False, False]] = _df4[0:2] - df3[[True, True, False, False, False]] = df4[0:2] - assert_frame_equal(_df3, df3.to_pandas()) - - def test_loc_get(self): - (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs() - - # single label for row - _loc1 = _df1.loc[2] - loc1 = df1.loc[2] - self.assertIsInstance(_loc1, pd.Series) - self.assertIsInstance(loc1, ak.DataFrame) - for column in _loc1.index: - self.assertEqual(_loc1[column], loc1[column].values[0]) - - # list of labels - _loc2 = _df1.loc[[2, 3, 4]] - loc2 = df1.loc[[2, 3, 4]] - assert_frame_equal(_loc2, loc2.to_pandas(retain_index=True)) - - # slice of labels - _loc3 = _df1.loc[1:3] - loc3 = df1.loc[1:3] - assert_frame_equal(_loc3, loc3.to_pandas(retain_index=True)) - - # boolean array of same length as array being sliced - _loc4 = _df1.loc[[True, True, False, False, True]] - loc4 = df1.loc[ak.array([True, True, False, False, True])] - assert_frame_equal(_loc4, loc4.to_pandas(retain_index=True)) - - # alignable boolean Series - _loc5 = _df1.loc[_df1["ints"] == 3] - loc5 = df1.loc[df1["ints"] == 3] - assert_frame_equal(_loc5, loc5.to_pandas(retain_index=True)) - - # single label for row and column - _loc6 = _df1.loc[2, "floats"] - loc6 = df1.loc[2, "floats"] - self.assertEqual(_loc6, loc6) - - # slice with label for row and single label for column 
- _loc7 = _df1.loc[1:3, "floats"] - loc7 = df1.loc[1:3, "floats"] - self.assertIsInstance(_loc7, pd.Series) - self.assertIsInstance(loc7, ak.Series) - for column in _loc7.index: - self.assertListEqual(_loc7.values.tolist(), loc7.values.to_list()) - - # boolean array for row and array of labels for columns - _loc8 = _df1.loc[[True, True, False, False, True], ["ints", "floats"]] - loc8 = df1.loc[ak.array([True, True, False, False, True]), ["ints", "floats"]] - assert_frame_equal(_loc8, loc8.to_pandas(retain_index=True)) - - def test_loc_set_scalar(self): - (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs() - # single row, single column, scalar value - _df1.loc[2, "floats"] = 100.0 - df1.loc[2, "floats"] = 100.0 - assert_frame_equal(_df1, df1.to_pandas(retain_index=True)) - - # multiple rows, single column, scalar value - _df1.loc[[2, 3, 4], "floats"] = 101.0 - df1.loc[[2, 3, 4], "floats"] = 101.0 - assert_frame_equal(_df1, df1.to_pandas(retain_index=True)) - - # setting an entire column - _df1.loc[:, "floats"] = 99.0 - df1.loc[:, "floats"] = 99.0 - assert_frame_equal(_df1, df1.to_pandas(retain_index=True)) - - _df1.loc[1:3, "floats"] = 98.0 - df1.loc[1:3, "floats"] = 98.0 - assert_frame_equal(_df1, df1.to_pandas(retain_index=True)) - - # setting value for rows matching boolean - _df1.loc[_df1["ints"] == 3, "floats"] = 102.0 - df1.loc[df1["ints"] == 3, "floats"] = 102.0 - assert_frame_equal(_df1, df1.to_pandas(retain_index=True)) - - # incorrect column index type - with self.assertRaises(TypeError): - df1.loc[2, 1] = 100.0 - - # incorrect row index type - with self.assertRaises(TypeError): - df1.loc[1.0, "floats"] = 100.0 - - def test_loc_set_vector(self): - (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs() - - # two rows, one column, two values - _df1.loc[[2, 3], "floats"] = np.array([100.0, 101.0]) - df1.loc[[2, 3], "floats"] = ak.array([100.0, 101.0]) - assert_frame_equal(_df1, df1.to_pandas(retain_index=True)) - - # setting with Series matches index labels, not positions - _df1.loc[:, "floats"] = pd.Series([100.0, 101.0, 102.0, 103.0, 104.0], index=[0, 1, 2, 3, 4]) - df1.loc[:, "floats"] = ak.Series( - ak.array([100.0, 101.0, 102.0, 103.0, 104.0]), index=ak.array([0, 1, 2, 3, 4]) - ) - assert_frame_equal(_df1, df1.to_pandas(retain_index=True)) - - # setting with Series with unordered index - _df1.loc[:, "ints"] = pd.Series([2, 3, 4, 5, 6], index=[3, 2, 1, 0, 4]) - df1.loc[:, "ints"] = ak.Series(ak.array([2, 3, 4, 5, 6]), index=ak.array([3, 2, 1, 0, 4])) - assert_frame_equal(_df1, df1.to_pandas(retain_index=True)) - - # setting with Series against an array of indices - _df1.loc[np.array([2, 3, 4]), "floats"] = pd.Series([70.0, 71.0, 72.0], index=[3, 4, 2]) - df1.loc[ak.array([2, 3, 4]), "floats"] = ak.Series( - ak.array([70.0, 71.0, 72.0]), index=ak.array([3, 4, 2]) - ) - assert_frame_equal(_df1, df1.to_pandas(retain_index=True)) - - def test_set_new_values(self): - (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs() - - # new column - _df1.loc[2, "not"] = 100.0 - df1.loc[2, "not"] = 100.0 - assert_frame_equal(_df1, df1.to_pandas(retain_index=True)) - - # TODO: The following two lines behave differently because pandas - # converts the int column to floating point to accomodate the nan - # value of the new column - # _df1.loc[100, 'floats'] = 100.0 - # df1.loc[100, 'floats'] = 100.0 - # assert_frame_equal(_df1, df1.to_pandas(retain_index=True)) - - # cannot add new rows to a dataframe with string column - with self.assertRaises(ValueError): - df2.loc[100, 7] = 
100.0 - - def test_iloc_get(self): - (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs() - - for _df1, df1 in zip([_df1, _df2, _df3], [df1, df2, df3]): - # integer input - _iloc1 = _df1.iloc[2] - iloc1 = df1.iloc[2] - self.assertIsInstance(_iloc1, pd.Series) - self.assertIsInstance(iloc1, ak.DataFrame) - for column in _iloc1.index: - self.assertEqual(_iloc1[column], iloc1[column].values[0]) - - # list of integers - _iloc2 = _df1.iloc[[2, 3, 4]] - iloc2 = df1.iloc[[2, 3, 4]] - assert_frame_equal(_iloc2, iloc2.to_pandas(retain_index=True)) - - # list of unordered integers - _iloc3 = _df1.iloc[[4, 2, 3]] - iloc3 = df1.iloc[[4, 2, 3]] - assert_frame_equal(_iloc3, iloc3.to_pandas(retain_index=True)) - - # array of integers - _iloc4 = _df1.iloc[np.array([2, 3, 4])] - iloc4 = df1.iloc[ak.array([2, 3, 4])] - assert_frame_equal(_iloc4, iloc4.to_pandas(retain_index=True)) - - # array of unordered integers - _iloc5 = _df1.iloc[np.array([4, 2, 3])] - iloc5 = df1.iloc[ak.array([4, 2, 3])] - assert_frame_equal(_iloc5, iloc5.to_pandas(retain_index=True)) - - # slice object with ints - _iloc6 = _df1.iloc[1:3] - iloc6 = df1.iloc[1:3] - assert_frame_equal(_iloc6, iloc6.to_pandas(retain_index=True)) - - # slice object with no lower bound - _iloc7 = _df1.iloc[:3] - iloc7 = df1.iloc[:3] - assert_frame_equal(_iloc7, iloc7.to_pandas(retain_index=True)) - - # slice object with no upper bound - _iloc8 = _df1.iloc[3:] - iloc8 = df1.iloc[3:] - assert_frame_equal(_iloc8, iloc8.to_pandas(retain_index=True)) - - # slice object with no bounds - _iloc9 = _df1.iloc[:] - iloc9 = df1.iloc[:] - assert_frame_equal(_iloc9, iloc9.to_pandas(retain_index=True)) - - # boolean array - _iloc10 = _df1.iloc[[True, True, False, False, True]] - iloc10 = df1.iloc[ak.array([True, True, False, False, True])] - assert_frame_equal(_iloc10, iloc10.to_pandas(retain_index=True)) - - # boolean array of incorrect length - with self.assertRaises(IndexError): - _df1.iloc[[True, True, False, False]] - with self.assertRaises(IndexError): - df1.iloc[ak.array([True, True, False, False])] - - # tuple of row and column indexes - _iloc11 = _df1.iloc[2, 1] - iloc11 = df1.iloc[2, 1] - self.assertIsInstance(_iloc11, np.float64) - self.assertIsInstance(iloc11, np.float64) - self.assertEqual(_iloc11, iloc11) - - # integer row, list column - _iloc12 = _df1.iloc[2, [0, 1]] - iloc12 = df1.iloc[2, [0, 1]] - self.assertIsInstance(_iloc12, pd.Series) - self.assertIsInstance(iloc12, ak.DataFrame) - for column in _iloc12.index: - self.assertEqual(_iloc12[column], iloc12[column].values[0]) - - # list row, integer column - _iloc13 = _df1.iloc[[2, 3], 1] - iloc13 = df1.iloc[[2, 3], 1] - self.assertIsInstance(_iloc13, pd.Series) - self.assertIsInstance(iloc13, ak.Series) - for column in _iloc13.index: - self.assertEqual(_iloc13[column], iloc13[column]) - - # list row, list column - _iloc14 = _df1.iloc[[2, 3], [0, 1]] - iloc14 = df1.iloc[[2, 3], [0, 1]] - assert_frame_equal(_iloc14, iloc14.to_pandas(retain_index=True)) - - # slice row, boolean array column - _iloc15 = _df1.iloc[1:3, [True, False, True]] - iloc15 = df1.iloc[1:3, [True, False, True]] - assert_frame_equal(_iloc15, iloc15.to_pandas(retain_index=True)) - - # raises IndexError if requested indexer is out-of-bounds - with self.assertRaises(IndexError): - _df1.iloc[100] - with self.assertRaises(IndexError): - df1.iloc[100] - with self.assertRaises(IndexError): - _df1.iloc[100, 1] - with self.assertRaises(IndexError): - df1.iloc[100, 1] - with self.assertRaises(IndexError): - _df1.iloc[[0, 2, 100], 1] - 
with self.assertRaises(IndexError): - df1.iloc[[0, 2, 100], 1] - with self.assertRaises(IndexError): - _df1.iloc[1, 100] - with self.assertRaises(IndexError): - df1.iloc[1, 100] - - pass - - def test_iloc_set(self): - (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs() - - for _df, df in zip([_df1, _df2, _df3], [df1, df2, df3]): - # tuple of integers - _df.iloc[2, 1] = 100.0 - df.iloc[2, 1] = 100.0 - assert_frame_equal(_df, df.to_pandas(retain_index=True)) - - # list row, integer column - _df.iloc[[2, 3], 1] = 102.0 - df.iloc[[2, 3], 1] = 102.0 - assert_frame_equal(_df, df.to_pandas(retain_index=True)) - - # slice row, integer column - _df.iloc[1:3, 1] = 103.0 - df.iloc[1:3, 1] = 103.0 - assert_frame_equal(_df, df.to_pandas(retain_index=True)) - - # slice row, no lower bound, integer column - _df.iloc[:3, 1] = 104.0 - df.iloc[:3, 1] = 104.0 - assert_frame_equal(_df, df.to_pandas(retain_index=True)) - - # slice row, no upper bound, integer column - _df.iloc[3:, 1] = 105.0 - df.iloc[3:, 1] = 105.0 - assert_frame_equal(_df, df.to_pandas(retain_index=True)) - - # slice row, no bounds, integer column - _df.iloc[:, 1] = 106.0 - df.iloc[:, 1] = 106.0 - assert_frame_equal(_df, df.to_pandas(retain_index=True)) - - # string columns immutable - with self.assertRaises(TypeError): - df.iloc[2, 2] = "new string" - pass - - def test_at(self): - (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs() - - # single label for row and column - _at1 = _df1.at[2, "floats"] - at1 = df1.at[2, "floats"] - self.assertEqual(_at1, at1) - - # does not support lists - with self.assertRaises(pd.errors.InvalidIndexError): - _df1.at[[2, 3], "floats"] - with self.assertRaises(ValueError): - df1.at[[2, 3], "floats"] - - # assignment - _df1.at[2, "floats"] = 100.0 - df1.at[2, "floats"] = 100.0 - assert_frame_equal(_df1, df1.to_pandas()) - - pass - - def test_iat(self): - (df1, _df1, df2, _df2, df3, _df3) = self.make_dfs_and_refs() - - # single label for row and column - _iat1 = _df1.iat[2, 1] - iat1 = df1.iat[2, 1] - self.assertEqual(_iat1, iat1) - - # does not support lists - with self.assertRaises(ValueError): - _df1.iat[[2, 3], 1] - with self.assertRaises(ValueError): - df1.iat[[2, 3], 1] - - # indices must be integers - with self.assertRaises(ValueError): - _df1.iat[1, "floats"] - with self.assertRaises(ValueError): - df1.iat[1, "floats"] - - # assignment - _df1.iat[2, 1] = 100.0 - df1.iat[2, 1] = 100.0 - assert_frame_equal(_df1, df1.to_pandas()) - def pda_to_str_helper(pda): return ak.array([f"str {i}" for i in pda.to_list()]) diff --git a/tests/io_test.py b/tests/io_test.py index a8a15cfd4f..f8ad846874 100644 --- a/tests/io_test.py +++ b/tests/io_test.py @@ -1336,7 +1336,7 @@ def test_segarr_edge(self): df.to_hdf(f"{tmp_dirname}/seg_test") rd_data = ak.read_hdf(f"{tmp_dirname}/seg_test*").popitem()[1] - self.assertListEqual(df["c_11"].values.to_list(), rd_data.to_list()) + self.assertListEqual(df["c_11"].to_list(), rd_data.to_list()) df = ak.DataFrame({"c_2": ak.SegArray(ak.array([0, 9, 14]), ak.arange(-10, 10))}) with tempfile.TemporaryDirectory(dir=IOTest.io_test_dir) as tmp_dirname: @@ -1383,8 +1383,8 @@ def test_special_dtypes(self): self.assertIsInstance(rd_df["datetime"], ak.Datetime) self.assertIsInstance(rd_df["timedelta"], ak.Timedelta) self.assertListEqual(df["ip"].to_list(), rd_df["ip"].to_list()) - self.assertListEqual(df["datetime"].values.to_list(), rd_df["datetime"].to_list()) - self.assertListEqual(df["timedelta"].values.to_list(), rd_df["timedelta"].to_list()) + 
self.assertListEqual(df["datetime"].to_list(), rd_df["datetime"].to_list()) + self.assertListEqual(df["timedelta"].to_list(), rd_df["timedelta"].to_list()) def test_index(self): tests = [ diff --git a/tests/parquet_test.py b/tests/parquet_test.py index 85d445a450..0ca97caf63 100644 --- a/tests/parquet_test.py +++ b/tests/parquet_test.py @@ -485,7 +485,7 @@ def test_multicol_write(self): # read files and ensure that all resulting fields are as expected rd_data = ak.read_parquet(f"{tmp_dirname}/multicol_parquet*") for k, v in rd_data.items(): - self.assertListEqual(v.to_list(), akdf[k].values.to_list()) + self.assertListEqual(v.to_list(), akdf[k].to_list()) # extra insurance, check dataframes are equivalent rd_df = ak.DataFrame(rd_data) @@ -517,20 +517,20 @@ def test_read_nested(self): data = ak.read_parquet(fname + "_*") self.assertTrue("idx" in data) self.assertTrue("seg" in data) - self.assertListEqual(df["idx"].values.to_list(), data["idx"].to_list()) - self.assertListEqual(df["seg"].values.to_list(), data["seg"].to_list()) + self.assertListEqual(df["idx"].to_list(), data["idx"].to_list()) + self.assertListEqual(df["seg"].to_list(), data["seg"].to_list()) # test read with read_nested=false and no supplied datasets data = ak.read_parquet(fname + "_*", read_nested=False).popitem()[1] self.assertIsInstance(data, ak.pdarray) - self.assertListEqual(df["idx"].values.to_list(), data.to_list()) + self.assertListEqual(df["idx"].to_list(), data.to_list()) # test read with read_nested=false and user supplied datasets. Should ignore read_nested data = ak.read_parquet(fname + "_*", datasets=["idx", "seg"], read_nested=False) self.assertTrue("idx" in data) self.assertTrue("seg" in data) - self.assertListEqual(df["idx"].values.to_list(), data["idx"].to_list()) - self.assertListEqual(df["seg"].values.to_list(), data["seg"].to_list()) + self.assertListEqual(df["idx"].to_list(), data["idx"].to_list()) + self.assertListEqual(df["seg"].to_list(), data["seg"].to_list()) def test_segarray_string(self): words = ak.array(["one,two,three", "uno,dos,tres"]) @@ -619,7 +619,7 @@ def test_empty_segs_segarray(self): pddf.to_parquet(file_path) akdf = ak.DataFrame(ak.read_parquet(file_path)) - to_pd = pd.Series(akdf["rand"].values.to_list()) + to_pd = pd.Series(akdf["rand"].to_list()) # raises an error if the two series aren't equal # we can't use np.allclose(pddf['rand'].to_list, akdf['rand'].to_list) since these # are lists of lists. assert_series_equal handles this and properly handles nans. 
diff --git a/tests/segarray_test.py b/tests/segarray_test.py
index cfe5947973..0d5607c041 100644
--- a/tests/segarray_test.py
+++ b/tests/segarray_test.py
@@ -823,7 +823,7 @@ def test_equality(self):
         )
 
         for col in df.columns:
-            a = df[col].values
+            a = df[col]
             if a.dtype == ak.float64:
                 a = a.to_ndarray()
             if isinstance(a[0], np.ndarray):
diff --git a/tests/series_test.py b/tests/series_test.py
index caac5d934b..4096cf6f48 100644
--- a/tests/series_test.py
+++ b/tests/series_test.py
@@ -809,12 +809,3 @@ def test_fillna(self):
         fill_values3 = 100.0
 
         self.assertListEqual(data.fillna(fill_values3).to_list(), [1.0, 100.0, 3.0, 100.0, 5.0])
-
-    def test_series_segarray_to_pandas(self):
-        # reproducer for issue #3222
-        sa = ak.SegArray(ak.arange(0, 30, 3), ak.arange(30))
-        akdf = ak.DataFrame({"test": sa})
-        pddf = pd.DataFrame({"test": sa.to_list()})
-
-        assert_frame_equal(akdf.to_pandas(), pddf)
-        assert_series_equal(akdf["test"].to_pandas(), pddf["test"], check_names=False)
diff --git a/tests/symbol_table_test.py b/tests/symbol_table_test.py
index 4890c6db9e..faad01b995 100644
--- a/tests/symbol_table_test.py
+++ b/tests/symbol_table_test.py
@@ -630,7 +630,7 @@ def test_registered_component(self):
         # verify that components seen as registered after original unregistered
         s.unregister()
 
-        self.assertTrue(df["SegArray"].values.is_registered())
+        self.assertTrue(df["SegArray"].is_registered())
 
         cleanup()
 

From 3b0272de4969f54798e95803cd303c4b5feefb6f Mon Sep 17 00:00:00 2001
From: Brandon Neth
Date: Wed, 12 Jun 2024 16:57:01 -0500
Subject: [PATCH 4/5] updates to remove .values calls

---
 PROTO_tests/tests/dataframe_test.py | 18 +++++++++---------
 PROTO_tests/tests/io_test.py        | 22 +++++++++++-----------
 PROTO_tests/tests/series_test.py    |  9 ---------
 3 files changed, 20 insertions(+), 29 deletions(-)

diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py
index 31e020175d..2ee38c060d 100644
--- a/PROTO_tests/tests/dataframe_test.py
+++ b/PROTO_tests/tests/dataframe_test.py
@@ -252,14 +252,14 @@ def test_convenience_init(self):
 
         for df in dict_dfs + lists_dfs:
             assert isinstance(df, ak.DataFrame)
-            assert isinstance(df["0"].values, ak.pdarray)
-            assert df["0"].values.dtype == int
-            assert isinstance(df["1"].values, ak.pdarray)
-            assert df["1"].values.dtype == bool
-            assert isinstance(df["2"].values, ak.Strings)
-            assert df["2"].values.dtype == str
-            assert isinstance(df["3"].values, ak.pdarray)
-            assert df["3"].values.dtype == float
+            assert isinstance(df["0"], ak.pdarray)
+            assert df["0"].dtype == int
+            assert isinstance(df["1"], ak.pdarray)
+            assert df["1"].dtype == bool
+            assert isinstance(df["2"], ak.Strings)
+            assert df["2"].dtype == str
+            assert isinstance(df["3"], ak.pdarray)
+            assert df["3"].dtype == float
 
     def test_client_type_creation(self):
         f = ak.Fields(ak.arange(10), ["A", "B", "c"])
@@ -1242,7 +1242,7 @@ def test_sample_hypothesis_testing(self):
         weighted_sample = g.sample(n=num_samples, replace=True, weights=weights, random_state=rng)
 
         # count how many of each category we saw
-        uk, f_obs = ak.GroupBy(weighted_sample["vals"].values).size()
+        uk, f_obs = ak.GroupBy(weighted_sample["vals"]).size()
 
         # I think the keys should always be sorted but just in case
         if not ak.is_sorted(uk):
diff --git a/PROTO_tests/tests/io_test.py b/PROTO_tests/tests/io_test.py
index 2556011b7b..bfe1bff758 100644
--- a/PROTO_tests/tests/io_test.py
+++ b/PROTO_tests/tests/io_test.py
@@ -483,7 +483,7 @@ def test_read_nested(self):
         assert "idx" in data
         assert "seg" in data
         assert df["idx"].to_list()
== data["idx"].to_list() - assert df["seg"].values.to_list() == data["seg"].to_list() + assert df["seg"].to_list() == data["seg"].to_list() # test read with read_nested=false and no supplied datasets data = ak.read_parquet(f"{file_name}*", read_nested=False)["idx"] @@ -495,7 +495,7 @@ def test_read_nested(self): assert "idx" in data assert "seg" in data assert df["idx"].to_list() == data["idx"].to_list() - assert df["seg"].values.to_list() == data["seg"].to_list() + assert df["seg"].to_list() == data["seg"].to_list() @pytest.mark.parametrize("comp", COMPRESSIONS) def test_ipv4_columns(self, comp): @@ -524,7 +524,7 @@ def test_ipv4_columns(self, comp): # test replacement of IPv4 with uint representation df = ak.DataFrame({"a": ak.IPv4(ak.arange(10))}) - df["a"] = df["a"].values.export_uint() + df["a"] = df["a"].export_uint() assert ak.arange(10).to_list() == df["a"].to_list() def test_empty_segs_segarray(self): @@ -562,7 +562,7 @@ def test_empty_segs_segarray(self): pddf.to_parquet(file_path) akdf = ak.DataFrame(ak.read_parquet(file_path)) - to_pd = pd.Series(akdf["rand"].values.to_list()) + to_pd = pd.Series(akdf["rand"].to_list()) # raises an error if the two series aren't equal # we can't use np.allclose(pddf['rand'].to_list, akdf['rand'].to_list) since these # are lists of lists. assert_series_equal handles this and properly handles nans. @@ -702,7 +702,7 @@ def test_read_and_write_with_dict(self): for col_name in akdf.columns.values: gen_arr = ak.read_hdf(f"{file_name}*", datasets=[col_name])[col_name] if akdf[col_name].dtype != ak.float64: - assert akdf[col_name].values.to_list() == gen_arr.to_list() + assert akdf[col_name].to_list() == gen_arr.to_list() else: a = akdf[col_name].to_ndarray() b = gen_arr.to_ndarray() @@ -742,7 +742,7 @@ def test_read_and_write_with_dict(self): # verify generic load works gen_arr = ak.load(path_prefix=file_name, dataset=col_name)[col_name] if akdf[col_name].dtype != ak.float64: - assert akdf[col_name].values.to_list() == gen_arr.to_list() + assert akdf[col_name].to_list() == gen_arr.to_list() else: a = akdf[col_name].to_ndarray() b = gen_arr.to_ndarray() @@ -754,7 +754,7 @@ def test_read_and_write_with_dict(self): # verify generic load works with file_format parameter gen_arr = ak.load(path_prefix=file_name, dataset=col_name, file_format="HDF5")[col_name] if akdf[col_name].dtype != ak.float64: - assert akdf[col_name].values.to_list() == gen_arr.to_list() + assert akdf[col_name].to_list() == gen_arr.to_list() else: a = akdf[col_name].to_ndarray() b = gen_arr.to_ndarray() @@ -1229,7 +1229,7 @@ def test_hdf_overwrite_dataframe(self): data = ak.read_hdf(f"{file_name}*") odf_keys = list(odf.keys()) for key in df.keys(): - assert (data[key] == (odf[key].values if key in odf_keys else df[key].values)).all() + assert (data[key] == (odf[key] if key in odf_keys else df[key])).all() def test_overwrite_segarray(self): sa1 = ak.SegArray(ak.arange(0, 1000, 5), ak.arange(1000)) @@ -1429,9 +1429,9 @@ def test_special_objtype(self): assert isinstance(rd_df["ip"], ak.IPv4) assert isinstance(rd_df["datetime"], ak.Datetime) assert isinstance(rd_df["timedelta"], ak.Timedelta) - assert df["ip"].values.to_list() == rd_df["ip"].to_list() - assert df["datetime"].values.to_list() == rd_df["datetime"].to_list() - assert df["timedelta"].values.to_list() == rd_df["timedelta"].to_list() + assert df["ip"].to_list() == rd_df["ip"].to_list() + assert df["datetime"].to_list() == rd_df["datetime"].to_list() + assert df["timedelta"].to_list() == rd_df["timedelta"].to_list() class TestCSV: 
diff --git a/PROTO_tests/tests/series_test.py b/PROTO_tests/tests/series_test.py index 4619e86ede..1d66930089 100644 --- a/PROTO_tests/tests/series_test.py +++ b/PROTO_tests/tests/series_test.py @@ -341,12 +341,3 @@ def test_fillna(self): fill_values3 = 100.0 assert data.fillna(fill_values3).to_list() == [1.0, 100.0, 3.0, 100.0, 5.0] - - def test_series_segarray_to_pandas(self): - # reproducer for issue #3222 - sa = ak.SegArray(ak.arange(0, 30, 3), ak.arange(30)) - akdf = ak.DataFrame({"test": sa}) - pddf = pd.DataFrame({"test": sa.to_list()}) - - assert_frame_equal(akdf.to_pandas(), pddf) - assert_series_equal(akdf['test'].to_pandas(), pddf['test'], check_names=False) From 9ea36511c7903b8a1cb32670cb6f69f02c042268 Mon Sep 17 00:00:00 2001 From: Brandon Neth Date: Wed, 12 Jun 2024 23:51:27 -0400 Subject: [PATCH 5/5] flake changes --- arkouda/dataframe.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/arkouda/dataframe.py b/arkouda/dataframe.py index c8fe464b98..267d35c7cd 100644 --- a/arkouda/dataframe.py +++ b/arkouda/dataframe.py @@ -13,16 +13,13 @@ from numpy._typing import _8Bit, _16Bit, _32Bit, _64Bit from typeguard import typechecked -from arkouda.alignment import find as akfind from arkouda.categorical import Categorical from arkouda.client import generic_msg, maxTransferBytes from arkouda.client_dtypes import BitVector, Fields, IPv4 from arkouda.dtypes import BigInt from arkouda.dtypes import bool as akbool -from arkouda.dtypes import dtype from arkouda.dtypes import float64 as akfloat64 from arkouda.dtypes import int64 as akint64 -from arkouda.dtypes import resolve_scalar_dtype from arkouda.dtypes import uint64 as akuint64 from arkouda.groupbyclass import GROUPBY_REDUCTION_TYPES from arkouda.groupbyclass import GroupBy as akGroupBy
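
Note: taken together, the five patches in this series restore the pre-indexer
DataFrame access semantics: df[label] returns the column's underlying array
(pdarray, Strings, Categorical, or SegArray) rather than an ak.Series, a list
or tuple of labels returns a DataFrame, and integer arrays, slices, and boolean
masks select rows. A minimal sketch of the restored behavior, consistent with
the tests updated above (the data values are illustrative only):

    import arkouda as ak

    ak.connect()
    df = ak.DataFrame({"userID": ak.array([111, 222, 111]),
                       "amount": ak.array([0.5, 0.6, 1.1])})

    col = df["userID"]               # the underlying ak.pdarray, not a Series
    sub = df[["userID", "amount"]]   # list of labels -> ak.DataFrame
    rows = df[ak.array([0, 2])]      # int pdarray selects rows by position,
                                     # preserving the original index values
    head = df[0:2]                   # slices select rows as well
    hits = df[df["userID"] == 111]   # boolean mask of matching length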