From 2f584913243079221746480dca951ce7d8c3f244 Mon Sep 17 00:00:00 2001 From: Sheilah Kirui Date: Wed, 2 Dec 2020 12:49:58 -0800 Subject: [PATCH 01/14] implementing update() function --- python/cudf/cudf/core/dataframe.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 107d2d20e38..8cc8f3690a3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1434,6 +1434,10 @@ def add(self, other, axis="columns", level=None, fill_value=None): return self._apply_op("add", other, fill_value) + + def update(): + + def __add__(self, other): return self._apply_op("__add__", other) From 9c2b1aced1a2f161c2417dc17f60965d3cef3c5a Mon Sep 17 00:00:00 2001 From: Sheilah Kirui Date: Tue, 8 Dec 2020 21:30:43 -0800 Subject: [PATCH 02/14] stash changes to switch branch --- python/cudf/cudf/core/dataframe.py | 1 + python/cudf/cudf/tests/test_dataframe.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8cc8f3690a3..1a4c18147d0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1436,6 +1436,7 @@ def add(self, other, axis="columns", level=None, fill_value=None): def update(): + pass def __add__(self, other): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 69eb70e7201..b08e2347753 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7995,3 +7995,7 @@ def test_dataframe_from_pandas_duplicate_columns(): ValueError, match="Duplicate column names are not allowed" ): gd.from_pandas(pdf) + + + +def test_dataframe_for_update_function(data,other,overwrite, errors): From 7d31eb52f51092c8c2a6fa8926663ebf2c382dc3 Mon Sep 17 00:00:00 2001 From: Sheilah Kirui Date: Tue, 8 Dec 2020 21:32:01 -0800 Subject: [PATCH 03/14] stash changes to switch branch --- python/cudf/cudf/core/dataframe.py | 33 ++++++++++++++++++++++-- python/cudf/cudf/tests/test_dataframe.py | 1 + 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1a4c18147d0..85722e5ee19 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1435,8 +1435,37 @@ def add(self, other, axis="columns", level=None, fill_value=None): return self._apply_op("add", other, fill_value) - def update(): - pass +def update(self, other, join="left", overwrite=True, filter_func=None, errors="ignore"): + # TODO: Support other joins + if join != "left": # pragma: no cover + raise NotImplementedError("Only left join is supported") + if errors not in ["ignore", "raise"]: + raise ValueError("The parameter errors must be either 'ignore' or 'raise'") + + other = other.reindex(df.index, axis=0) + other = other.reindex(df.columns, axis=1) + + for col in df.columns: + this = df[col] + that = other[col] + + if errors == "raise": + mask_this = that.notna() + mask_that = this.notna() + if any(mask_this & mask_that): + raise ValueError("Data overlaps.") + + if overwrite: + mask = that.isna() + else: + mask = this.notna() + + # don't overwrite columns unnecessarily + if mask.all(): + continue + + df.loc[mask, col] = this[mask] + df.loc[~mask, col] = that[~mask] def __add__(self, other): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index b08e2347753..077f00730f8 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7999,3 +7999,4 @@ def test_dataframe_from_pandas_duplicate_columns(): def test_dataframe_for_update_function(data,other,overwrite, errors): + From b72a8d59ce37509647fb05b9bf66f3d4bf45bdff Mon Sep 17 00:00:00 2001 From: Sheilah Kirui Date: Mon, 14 Dec 2020 20:46:08 -0800 Subject: [PATCH 04/14] Made edits to update() --- python/cudf/cudf/core/dataframe.py | 102 ++++++++++++++++------- python/cudf/cudf/tests/test_dataframe.py | 34 +++++++- 2 files changed, 106 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 85722e5ee19..d7d56f74edb 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1435,37 +1435,81 @@ def add(self, other, axis="columns", level=None, fill_value=None): return self._apply_op("add", other, fill_value) -def update(self, other, join="left", overwrite=True, filter_func=None, errors="ignore"): - # TODO: Support other joins - if join != "left": # pragma: no cover - raise NotImplementedError("Only left join is supported") - if errors not in ["ignore", "raise"]: - raise ValueError("The parameter errors must be either 'ignore' or 'raise'") - - other = other.reindex(df.index, axis=0) - other = other.reindex(df.columns, axis=1) - - for col in df.columns: - this = df[col] - that = other[col] - - if errors == "raise": - mask_this = that.notna() - mask_that = this.notna() - if any(mask_this & mask_that): - raise ValueError("Data overlaps.") - - if overwrite: - mask = that.isna() - else: - mask = this.notna() + def update(self, other, join="left", overwrite=True, filter_func=None, errors="ignore"): + """ + Modify a DataFrame in place using non-NA values from another DataFrame. + + Aligns on indices. There is no return value. + + Parameters + ---------- + + other : DataFrame, or object coercible into a DataFrame + Should have at least one matching index/column label with the + original DataFrame.If a Series is passed, its name attribute must + be set, and that will be used as the column name to align with the + original DataFrame. + + join : {‘left’}, default ‘left’ + Only left join is implemented, keeping the index and + columns of the original object. + + overwrite : {True, False}, default True + How to handle non-NA values for overlapping keys: + True: overwrite original DataFrame’s values with values from other. + False: only update values that are NA in the original DataFrame. + + filter_func : callable(1d-array) -> bool 1d-array, optional + Can choose to replace values other than NA. + Return True for values that should be updated. + + errors : {‘raise’, ‘ignore’}, default ‘ignore’ + If ‘raise’, will raise a ValueError if the DataFrame and other both contain non-NA data in the same place. + + + Returns + ------- + None : method directly changes calling object + + Raises + ------- + ValueError + - When ``errors``= 'raise' and there’s overlapping non-NA data. + - When ``errors`` is not either 'ignore' or 'raise' + + NotImplementedError + - If ``join`` != ‘left’ + + """ + # TODO: Support other joins + if join != "left": + raise NotImplementedError("Only left join is supported") + if errors not in ["ignore", "raise"]: + raise ValueError("The parameter errors must be either 'ignore' or 'raise'") + + other = other.reindex(self.index, axis=0) + other = other.reindex(self.columns, axis=1) + for col in self.columns: + original = self[col] + new = other[col] + + if errors == "raise": + mask_original = original.notna() + mask_new = new.notna() + if any(mask_original & mask_new): + raise ValueError("Data overlaps.") + + if overwrite: + mask = new.isna() #overwrite original with values from other + else: + mask = original.notna() #only update values that are na in the original - # don't overwrite columns unnecessarily - if mask.all(): - continue + # don't overwrite columns unnecessarily + if mask.all(): + continue - df.loc[mask, col] = this[mask] - df.loc[~mask, col] = that[~mask] + self.loc[mask, col] = original[mask] + self.loc[~mask, col] = newr[~mask] def __add__(self, other): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 077f00730f8..8d025342bc2 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7997,6 +7997,38 @@ def test_dataframe_from_pandas_duplicate_columns(): gd.from_pandas(pdf) +@pytest.mark.parametrize("join",["left", "right"],) +@pytest.mark.parametrize("overwrite",[True, False],) +@pytest.mark.parametrize("errors",["raise", "ignore"],) +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": [3, 4, 5]}, + {"a": [1.0, 2.0, 3.0], "b": [3.0, 4.0, 5.0]}, + {"a": ["n", "p", "q"],"b": ["t", "l", "s"]}, + {"a": [False, False, True], "b": [True, True, False]}, + {"a": [2.0, np.nan, 4.0], "b": [np.nan, np.nan, np.nan]}, + {"a": [np.nan, np.nan, np.nan], "b": [np.nan, np.nan, np.nan]}, + ], +) +@pytest.mark.parametrize( + "data2", + [ + {"a": [7, 5, 8], "b": [2.0, 7.0, 9.0]}, + {"a": ["yp", "zn", "uf"],"b": ["md", "rl", "sv"]}, + {"a": [True, False, True], "b":[3.0, 4.0, 5.0], "c": [False, True, False]}, + {"a": [np.nan, np.nan, np.nan], "b": [np.nan, np.nan, np.nan]}, + {"a": [np.nan, 2.0, False], "b": [2, np.nan, "pl"], "c": [np.nan, np.nan, np.nan]}, + ], +) +def test_update_for_dataframes(data, data2, join, overwrite, errors): + pdf = pd.DataFrame(data) + gdf = gd.DataFrame(data) + + other_pd = pd.DataFrame(data2) + other_gd = gd.DataFrame(data2) -def test_dataframe_for_update_function(data,other,overwrite, errors): + expect = pdf.update(other_pd, join, overwrite, errors) + got = gdf.update(other_gd, join, overwrite, errors) + assert_eq(expect, got) From a2bbab352d1538a96c0c8df0e3e401858014b84b Mon Sep 17 00:00:00 2001 From: Sheilah Kirui Date: Mon, 14 Dec 2020 23:01:51 -0800 Subject: [PATCH 05/14] Made edits to update() --- python/cudf/cudf/core/dataframe.py | 20 +++++++++++++++----- python/cudf/cudf/tests/test_dataframe.py | 9 +++++---- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d7d56f74edb..3608837587a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1435,7 +1435,9 @@ def add(self, other, axis="columns", level=None, fill_value=None): return self._apply_op("add", other, fill_value) - def update(self, other, join="left", overwrite=True, filter_func=None, errors="ignore"): + def update( + self, other, join="left", overwrite=True, filter_func=None, errors="ignore" + ): """ Modify a DataFrame in place using non-NA values from another DataFrame. @@ -1487,20 +1489,28 @@ def update(self, other, join="left", overwrite=True, filter_func=None, errors="i if errors not in ["ignore", "raise"]: raise ValueError("The parameter errors must be either 'ignore' or 'raise'") + if not isinstance(other, DataFrame): + other = DataFrame(other) + other = other.reindex(self.index, axis=0) other = other.reindex(self.columns, axis=1) for col in self.columns: - original = self[col] + original = self[col] new = other[col] - +# if filter_func is not None: +# with np.errstate(all="ignore"): +# > mask = ~filter_func(original) | new.isna() + # else: + if errors == "raise": - mask_original = original.notna() - mask_new = new.notna() + mask_original = new.notna() + mask_new = original.notna() if any(mask_original & mask_new): raise ValueError("Data overlaps.") if overwrite: mask = new.isna() #overwrite original with values from other + else: mask = original.notna() #only update values that are na in the original diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 8d025342bc2..d2c675856c4 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7999,13 +7999,14 @@ def test_dataframe_from_pandas_duplicate_columns(): @pytest.mark.parametrize("join",["left", "right"],) @pytest.mark.parametrize("overwrite",[True, False],) +@pytest.mark.parametrize("filter_func",[None],) @pytest.mark.parametrize("errors",["raise", "ignore"],) @pytest.mark.parametrize( "data", [ {"a": [1, 2, 3], "b": [3, 4, 5]}, {"a": [1.0, 2.0, 3.0], "b": [3.0, 4.0, 5.0]}, - {"a": ["n", "p", "q"],"b": ["t", "l", "s"]}, + #{"a": ["n", "p", "q"],"b": ["t", "l", "s"]}, {"a": [False, False, True], "b": [True, True, False]}, {"a": [2.0, np.nan, 4.0], "b": [np.nan, np.nan, np.nan]}, {"a": [np.nan, np.nan, np.nan], "b": [np.nan, np.nan, np.nan]}, @@ -8015,13 +8016,13 @@ def test_dataframe_from_pandas_duplicate_columns(): "data2", [ {"a": [7, 5, 8], "b": [2.0, 7.0, 9.0]}, - {"a": ["yp", "zn", "uf"],"b": ["md", "rl", "sv"]}, + #{"a": ["yp", "zn", "uf"],"b": ["md", "rl", "sv"]}, {"a": [True, False, True], "b":[3.0, 4.0, 5.0], "c": [False, True, False]}, {"a": [np.nan, np.nan, np.nan], "b": [np.nan, np.nan, np.nan]}, - {"a": [np.nan, 2.0, False], "b": [2, np.nan, "pl"], "c": [np.nan, np.nan, np.nan]}, + {"a": [np.nan, 2.0, False], "b": [2, np.nan, 5.0], "c": [np.nan, np.nan, np.nan]}, ], ) -def test_update_for_dataframes(data, data2, join, overwrite, errors): +def test_update_for_dataframes(data, data2, join, overwrite, filter_func, errors): pdf = pd.DataFrame(data) gdf = gd.DataFrame(data) From 73c36832c01e3dce66cf1ba40087519c058f5cfe Mon Sep 17 00:00:00 2001 From: Sheilah Kirui Date: Wed, 16 Dec 2020 14:07:24 -0800 Subject: [PATCH 06/14] edits --- python/cudf/cudf/core/dataframe.py | 34 +++++++++++------------- python/cudf/cudf/tests/test_dataframe.py | 9 +++---- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3608837587a..8dffd2a2260 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1435,9 +1435,7 @@ def add(self, other, axis="columns", level=None, fill_value=None): return self._apply_op("add", other, fill_value) - def update( - self, other, join="left", overwrite=True, filter_func=None, errors="ignore" - ): + def update(self, other, join="left", overwrite=True, filter_func=None, errors="ignore"): """ Modify a DataFrame in place using non-NA values from another DataFrame. @@ -1488,38 +1486,36 @@ def update( raise NotImplementedError("Only left join is supported") if errors not in ["ignore", "raise"]: raise ValueError("The parameter errors must be either 'ignore' or 'raise'") + if filter_func != None: + raise NotImplementedError("filter_func is not supported yet") if not isinstance(other, DataFrame): other = DataFrame(other) other = other.reindex(self.index, axis=0) other = other.reindex(self.columns, axis=1) + for col in self.columns: - original = self[col] - new = other[col] -# if filter_func is not None: -# with np.errstate(all="ignore"): -# > mask = ~filter_func(original) | new.isna() - # else: + this = df[col] + that = other[col] if errors == "raise": - mask_original = new.notna() - mask_new = original.notna() - if any(mask_original & mask_new): - raise ValueError("Data overlaps.") - - if overwrite: - mask = new.isna() #overwrite original with values from other + mask_this = that.notna() + mask_that = this.notna() + if any(mask_this & mask_that): + raise ValueError("Data overlaps.") + if overwrite: + mask = that.isna() else: - mask = original.notna() #only update values that are na in the original + mask = this.notna() # don't overwrite columns unnecessarily if mask.all(): continue - self.loc[mask, col] = original[mask] - self.loc[~mask, col] = newr[~mask] + df.loc[mask, col] = this[mask] + df.loc[~mask, col] = that[~mask] def __add__(self, other): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d2c675856c4..770472c7d39 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7999,14 +7999,12 @@ def test_dataframe_from_pandas_duplicate_columns(): @pytest.mark.parametrize("join",["left", "right"],) @pytest.mark.parametrize("overwrite",[True, False],) -@pytest.mark.parametrize("filter_func",[None],) @pytest.mark.parametrize("errors",["raise", "ignore"],) @pytest.mark.parametrize( "data", [ {"a": [1, 2, 3], "b": [3, 4, 5]}, {"a": [1.0, 2.0, 3.0], "b": [3.0, 4.0, 5.0]}, - #{"a": ["n", "p", "q"],"b": ["t", "l", "s"]}, {"a": [False, False, True], "b": [True, True, False]}, {"a": [2.0, np.nan, 4.0], "b": [np.nan, np.nan, np.nan]}, {"a": [np.nan, np.nan, np.nan], "b": [np.nan, np.nan, np.nan]}, @@ -8016,13 +8014,12 @@ def test_dataframe_from_pandas_duplicate_columns(): "data2", [ {"a": [7, 5, 8], "b": [2.0, 7.0, 9.0]}, - #{"a": ["yp", "zn", "uf"],"b": ["md", "rl", "sv"]}, - {"a": [True, False, True], "b":[3.0, 4.0, 5.0], "c": [False, True, False]}, + {"a": [True, False, True], "b":[3.0, 4.0, 5.0]}, {"a": [np.nan, np.nan, np.nan], "b": [np.nan, np.nan, np.nan]}, - {"a": [np.nan, 2.0, False], "b": [2, np.nan, 5.0], "c": [np.nan, np.nan, np.nan]}, + {"a": [np.nan, 2.0, False], "b": [2, np.nan, 5.0]}, ], ) -def test_update_for_dataframes(data, data2, join, overwrite, filter_func, errors): +def test_update_for_dataframes(data, data2, join, overwrite, errors): pdf = pd.DataFrame(data) gdf = gd.DataFrame(data) From 32730a0ab768c8ea501841e904361ac1c9ecd21a Mon Sep 17 00:00:00 2001 From: Sheilah Kirui Date: Thu, 17 Dec 2020 17:59:20 -0800 Subject: [PATCH 07/14] All tests passing. ready for review --- python/cudf/cudf/core/dataframe.py | 8 ++--- python/cudf/cudf/tests/test_dataframe.py | 37 ++++++++++++++++++++---- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 50f5d4d0902..b6abf67d8b9 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1516,13 +1516,13 @@ def update(self, other, join="left", overwrite=True, filter_func=None, errors="i other = other.reindex(self.columns, axis=1) for col in self.columns: - this = df[col] + this = self[col] that = other[col] if errors == "raise": mask_this = that.notna() mask_that = this.notna() - if any(mask_this & mask_that): + if ((mask_this & mask_that).any()): raise ValueError("Data overlaps.") if overwrite: @@ -1534,8 +1534,8 @@ def update(self, other, join="left", overwrite=True, filter_func=None, errors="i if mask.all(): continue - df.loc[mask, col] = this[mask] - df.loc[~mask, col] = that[~mask] + self.loc[mask, col] = this[mask] + self.loc[~mask, col] = that[~mask] def __add__(self, other): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 02d674bc4a1..dafd5f856b5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8178,9 +8178,10 @@ def test_agg_for_dataframe_with_string_columns(aggs): gdf.agg(aggs) -@pytest.mark.parametrize("join",["left", "right"],) +@pytest.mark.parametrize("join",["left"],) @pytest.mark.parametrize("overwrite",[True, False],) -@pytest.mark.parametrize("errors",["raise", "ignore"],) +@pytest.mark.parametrize("filter_func",[None],) +@pytest.mark.parametrize("errors",["ignore"],) @pytest.mark.parametrize( "data", [ @@ -8197,17 +8198,41 @@ def test_agg_for_dataframe_with_string_columns(aggs): {"a": [7, 5, 8], "b": [2.0, 7.0, 9.0]}, {"a": [True, False, True], "b":[3.0, 4.0, 5.0]}, {"a": [np.nan, np.nan, np.nan], "b": [np.nan, np.nan, np.nan]}, - {"a": [np.nan, 2.0, False], "b": [2, np.nan, 5.0]}, + {"a": [np.nan, 2.0, np.nan], "b": [2, np.nan, 5.0]}, ], ) -def test_update_for_dataframes(data, data2, join, overwrite, errors): +def test_update_for_dataframes(data, data2, join, overwrite, filter_func, errors): pdf = pd.DataFrame(data) gdf = gd.DataFrame(data) other_pd = pd.DataFrame(data2) other_gd = gd.DataFrame(data2) - expect = pdf.update(other_pd, join, overwrite, errors) - got = gdf.update(other_gd, join, overwrite, errors) + expect = pdf.update(other_pd, join, overwrite, filter_func, errors) + got = gdf.update(other_gd, join, overwrite, filter_func, errors) assert_eq(expect, got) + +@pytest.mark.parametrize("join",["right"],) +def test_update_for_right_join(join): + pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) + gdf = gd.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) + + other_pd = pd.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) + other_gd = gd.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) + + with pytest.raises( + NotImplementedError, + match="Only left join is supported" + ): + gdf.update(other_gd,join) + +@pytest.mark.parametrize("errors",["raise"],) +def test_update_for_data_overlap(errors): + pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) + gdf = gd.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) + + other_pd = pd.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) + other_gd = gd.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) + + assert_exceptions_equal(lambda: pdf.update(other_pd,errors), lambda: gdf.update(other_gd,errors)) \ No newline at end of file From 7e05ed59becd69fff6fe38e1e267c79afda2a9ae Mon Sep 17 00:00:00 2001 From: Sheilah Kirui Date: Tue, 5 Jan 2021 19:59:03 -0800 Subject: [PATCH 08/14] Addressed Prem's review comments --- python/cudf/cudf/core/dataframe.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b6abf67d8b9..34776957edf 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1463,10 +1463,9 @@ def update(self, other, join="left", overwrite=True, filter_func=None, errors="i Parameters ---------- - other : DataFrame, or object coercible into a DataFrame Should have at least one matching index/column label with the - original DataFrame.If a Series is passed, its name attribute must + original DataFrame. If a Series is passed, its name attribute must be set, and that will be used as the column name to align with the original DataFrame. @@ -1479,8 +1478,8 @@ def update(self, other, join="left", overwrite=True, filter_func=None, errors="i True: overwrite original DataFrame’s values with values from other. False: only update values that are NA in the original DataFrame. - filter_func : callable(1d-array) -> bool 1d-array, optional - Can choose to replace values other than NA. + filter_func : None + filter_func is not supported yet Return True for values that should be updated. errors : {‘raise’, ‘ignore’}, default ‘ignore’ @@ -1504,7 +1503,7 @@ def update(self, other, join="left", overwrite=True, filter_func=None, errors="i # TODO: Support other joins if join != "left": raise NotImplementedError("Only left join is supported") - if errors not in ["ignore", "raise"]: + if errors not in {"ignore", "raise"}: raise ValueError("The parameter errors must be either 'ignore' or 'raise'") if filter_func != None: raise NotImplementedError("filter_func is not supported yet") @@ -1512,8 +1511,9 @@ def update(self, other, join="left", overwrite=True, filter_func=None, errors="i if not isinstance(other, DataFrame): other = DataFrame(other) - other = other.reindex(self.index, axis=0) - other = other.reindex(self.columns, axis=1) + if self.index != self.columns: + other = other.reindex(self.index, axis=0) + other = other.reindex(self.columns, axis=1) for col in self.columns: this = self[col] @@ -1534,9 +1534,8 @@ def update(self, other, join="left", overwrite=True, filter_func=None, errors="i if mask.all(): continue - self.loc[mask, col] = this[mask] - self.loc[~mask, col] = that[~mask] - + self[col] = this.where(mask, that) + def __add__(self, other): return self._apply_op("__add__", other) From b923e16cb7a70373dcb12c58b9633c5c552f8e0c Mon Sep 17 00:00:00 2001 From: Sheilah Kirui Date: Fri, 8 Jan 2021 12:30:00 -0800 Subject: [PATCH 09/14] Addressed review comments. Ready for review --- python/cudf/cudf/core/dataframe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ad4d8646500..8261a842e54 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1513,9 +1513,10 @@ def update(self, other, join="left", overwrite=True, filter_func=None, errors="i if not isinstance(other, DataFrame): other = DataFrame(other) - if self.index != self.columns: - other = other.reindex(self.index, axis=0) + if self.columns.any() != other.columns.any(): other = other.reindex(self.columns, axis=1) + if self.index.any() != other.index.any(): + other = other.reindex(self.index, axis=0) for col in self.columns: this = self[col] From 388b6f90b0343eaf46fd5d562176ebe157a95b56 Mon Sep 17 00:00:00 2001 From: Sheilah Kirui Date: Mon, 11 Jan 2021 14:18:09 -0800 Subject: [PATCH 10/14] addressed review comments --- python/cudf/cudf/core/dataframe.py | 44 +++++++++------- python/cudf/cudf/tests/test_dataframe.py | 64 ++++++++++++++++-------- 2 files changed, 69 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8261a842e54..f502ed3c22c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1456,8 +1456,14 @@ def add(self, other, axis="columns", level=None, fill_value=None): return self._apply_op("add", other, fill_value) - - def update(self, other, join="left", overwrite=True, filter_func=None, errors="ignore"): + def update( + self, + other, + join="left", + overwrite=True, + filter_func=None, + errors="ignore", + ): """ Modify a DataFrame in place using non-NA values from another DataFrame. @@ -1466,13 +1472,13 @@ def update(self, other, join="left", overwrite=True, filter_func=None, errors="i Parameters ---------- other : DataFrame, or object coercible into a DataFrame - Should have at least one matching index/column label with the + Should have at least one matching index/column label with the original DataFrame. If a Series is passed, its name attribute must - be set, and that will be used as the column name to align with the + be set, and that will be used as the column name to align with the original DataFrame. join : {‘left’}, default ‘left’ - Only left join is implemented, keeping the index and + Only left join is implemented, keeping the index and columns of the original object. overwrite : {True, False}, default True @@ -1485,13 +1491,14 @@ def update(self, other, join="left", overwrite=True, filter_func=None, errors="i Return True for values that should be updated. errors : {‘raise’, ‘ignore’}, default ‘ignore’ - If ‘raise’, will raise a ValueError if the DataFrame and other both contain non-NA data in the same place. + If ‘raise’, will raise a ValueError if the DataFrame and other + both contain non-NA data in the same place. Returns ------- None : method directly changes calling object - + Raises ------- ValueError @@ -1503,29 +1510,31 @@ def update(self, other, join="left", overwrite=True, filter_func=None, errors="i """ # TODO: Support other joins - if join != "left": + if join != "left": raise NotImplementedError("Only left join is supported") if errors not in {"ignore", "raise"}: - raise ValueError("The parameter errors must be either 'ignore' or 'raise'") - if filter_func != None: + raise ValueError( + "The parameter errors must be either 'ignore' or 'raise'" + ) + if filter_func is not None: raise NotImplementedError("filter_func is not supported yet") if not isinstance(other, DataFrame): other = DataFrame(other) - if self.columns.any() != other.columns.any(): + if not self.columns.equals(other.columns): other = other.reindex(self.columns, axis=1) - if self.index.any() != other.index.any(): + if not self.index.equals(other.index): other = other.reindex(self.index, axis=0) - for col in self.columns: + for col in self.columns: this = self[col] - that = other[col] - + that = other[col] + if errors == "raise": mask_this = that.notna() mask_that = this.notna() - if ((mask_this & mask_that).any()): + if (mask_this & mask_that).any(): raise ValueError("Data overlaps.") if overwrite: @@ -1538,8 +1547,7 @@ def update(self, other, join="left", overwrite=True, filter_func=None, errors="i continue self[col] = this.where(mask, that) - - + def __add__(self, other): return self._apply_op("__add__", other) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f9036312bd6..b62dab59272 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8193,30 +8193,45 @@ def test_agg_for_dataframe_with_string_columns(aggs): gdf.agg(aggs) -@pytest.mark.parametrize("join",["left"],) -@pytest.mark.parametrize("overwrite",[True, False],) -@pytest.mark.parametrize("filter_func",[None],) -@pytest.mark.parametrize("errors",["ignore"],) +@pytest.mark.parametrize( + "join", ["left"], +) +@pytest.mark.parametrize( + "overwrite", [True, False], +) +@pytest.mark.parametrize( + "filter_func", [None], +) +@pytest.mark.parametrize( + "errors", ["ignore"], +) @pytest.mark.parametrize( "data", [ {"a": [1, 2, 3], "b": [3, 4, 5]}, - {"a": [1.0, 2.0, 3.0], "b": [3.0, 4.0, 5.0]}, - {"a": [False, False, True], "b": [True, True, False]}, - {"a": [2.0, np.nan, 4.0], "b": [np.nan, np.nan, np.nan]}, - {"a": [np.nan, np.nan, np.nan], "b": [np.nan, np.nan, np.nan]}, + {"e": [1.0, 2.0, 3.0], "d": [3.0, 4.0, 5.0]}, + {"b": [4.0, 3.0, 1.0], "c": [2.0, 5.0, 8.0]}, + {"c": [True, False, False], "d": [False, True, True]}, + {"g": [2.0, np.nan, 4.0], "n": [np.nan, np.nan, np.nan]}, + {"b": [np.nan, np.nan, 3.0], "c": [6.0, np.nan, np.nan]}, + {"d": [np.nan, np.nan, np.nan], "e": [np.nan, np.nan, np.nan]}, + {"b": [np.nan, np.nan, np.nan], "c": [np.nan, np.nan, np.nan]}, ], ) @pytest.mark.parametrize( "data2", [ + {"b": [3, 5, 6], "e": [8, 2, 1]}, {"a": [7, 5, 8], "b": [2.0, 7.0, 9.0]}, - {"a": [True, False, True], "b":[3.0, 4.0, 5.0]}, + {"c": [True, False, True], "d": [3.0, 4.0, 5.0]}, + {"e": [False, False, True], "g": [True, True, False]}, {"a": [np.nan, np.nan, np.nan], "b": [np.nan, np.nan, np.nan]}, - {"a": [np.nan, 2.0, np.nan], "b": [2, np.nan, 5.0]}, + {"b": [np.nan, 2.0, np.nan], "c": [2, np.nan, 5.0]}, ], ) -def test_update_for_dataframes(data, data2, join, overwrite, filter_func, errors): +def test_update_for_dataframes( + data, data2, join, overwrite, filter_func, errors +): pdf = pd.DataFrame(data) gdf = gd.DataFrame(data) @@ -8225,24 +8240,26 @@ def test_update_for_dataframes(data, data2, join, overwrite, filter_func, errors expect = pdf.update(other_pd, join, overwrite, filter_func, errors) got = gdf.update(other_gd, join, overwrite, filter_func, errors) - + assert_eq(expect, got) -@pytest.mark.parametrize("join",["right"],) + +@pytest.mark.parametrize( + "join", ["right"], +) def test_update_for_right_join(join): - pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) gdf = gd.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) - - other_pd = pd.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) other_gd = gd.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) with pytest.raises( - NotImplementedError, - match="Only left join is supported" + NotImplementedError, match="Only left join is supported" ): - gdf.update(other_gd,join) + gdf.update(other_gd, join) -@pytest.mark.parametrize("errors",["raise"],) + +@pytest.mark.parametrize( + "errors", ["raise"], +) def test_update_for_data_overlap(errors): pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) gdf = gd.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) @@ -8250,4 +8267,9 @@ def test_update_for_data_overlap(errors): other_pd = pd.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) other_gd = gd.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) - assert_exceptions_equal(lambda: pdf.update(other_pd,errors), lambda: gdf.update(other_gd,errors)) \ No newline at end of file + assert_exceptions_equal( + lfunc=pdf.update, + rfunc=gdf.update, + lfunc_args_and_kwargs=([other_pd, errors], {}), + rfunc_args_and_kwargs=([other_gd, errors], {}), + ) From c6d6d597ca5dfc3af728159b88e139d30efe19e7 Mon Sep 17 00:00:00 2001 From: Sheilah Kirui Date: Tue, 12 Jan 2021 16:06:03 -0800 Subject: [PATCH 11/14] added test for unequal indices --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 29 +++++++++++++++++++----- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f502ed3c22c..47eba33cd24 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1477,7 +1477,7 @@ def update( be set, and that will be used as the column name to align with the original DataFrame. - join : {‘left’}, default ‘left’ + join : {`left`}, default `left` Only left join is implemented, keeping the index and columns of the original object. diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index b62dab59272..e3ac8520547 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8210,23 +8210,40 @@ def test_agg_for_dataframe_with_string_columns(aggs): [ {"a": [1, 2, 3], "b": [3, 4, 5]}, {"e": [1.0, 2.0, 3.0], "d": [3.0, 4.0, 5.0]}, - {"b": [4.0, 3.0, 1.0], "c": [2.0, 5.0, 8.0]}, {"c": [True, False, False], "d": [False, True, True]}, {"g": [2.0, np.nan, 4.0], "n": [np.nan, np.nan, np.nan]}, - {"b": [np.nan, np.nan, 3.0], "c": [6.0, np.nan, np.nan]}, {"d": [np.nan, np.nan, np.nan], "e": [np.nan, np.nan, np.nan]}, - {"b": [np.nan, np.nan, np.nan], "c": [np.nan, np.nan, np.nan]}, + {"a": [1.0, 2, 3], "b": pd.Series([4.0, 8.0, 3.0], index=[1, 2, 3])}, + { + "d": [1.0, 2.0, 3.0], + "c": pd.Series([np.nan, np.nan, np.nan], index=[1, 2, 3]), + }, + { + "a": [False, True, False], + "b": pd.Series([1.0, 2.0, np.nan], index=[1, 2, 3]), + }, + { + "a": [np.nan, np.nan, np.nan], + "e": pd.Series([np.nan, np.nan, np.nan], index=[1, 2, 3]), + }, ], ) @pytest.mark.parametrize( "data2", [ {"b": [3, 5, 6], "e": [8, 2, 1]}, - {"a": [7, 5, 8], "b": [2.0, 7.0, 9.0]}, {"c": [True, False, True], "d": [3.0, 4.0, 5.0]}, {"e": [False, False, True], "g": [True, True, False]}, - {"a": [np.nan, np.nan, np.nan], "b": [np.nan, np.nan, np.nan]}, - {"b": [np.nan, 2.0, np.nan], "c": [2, np.nan, 5.0]}, + {"g": [np.nan, np.nan, np.nan], "c": [np.nan, np.nan, np.nan]}, + {"a": [7, 5, 8], "b": pd.Series([2.0, 7.0, 9.0], index=[0, 1, 2])}, + { + "b": [np.nan, 2.0, np.nan], + "c": pd.Series([2, np.nan, 5.0], index=[2, 3, 4]), + }, + { + "a": [True, np.nan, True], + "d": pd.Series([False, True, np.nan], index=[0, 1, 3]), + }, ], ) def test_update_for_dataframes( From d4a34a6395e0ac797f24fa0aab219ff1aee34b07 Mon Sep 17 00:00:00 2001 From: skirui-source Date: Tue, 19 Jan 2021 10:29:03 -0800 Subject: [PATCH 12/14] Fixed changes in Docstrings --- python/cudf/cudf/core/dataframe.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 47eba33cd24..fd2f8be08f7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1477,7 +1477,7 @@ def update( be set, and that will be used as the column name to align with the original DataFrame. - join : {`left`}, default `left` + join : {'left'}, default 'left' Only left join is implemented, keeping the index and columns of the original object. @@ -1488,10 +1488,10 @@ def update( filter_func : None filter_func is not supported yet - Return True for values that should be updated. + Return True for values that should be updated.S - errors : {‘raise’, ‘ignore’}, default ‘ignore’ - If ‘raise’, will raise a ValueError if the DataFrame and other + errors : {'raise', 'ignore'}, default 'ignore' + If 'raise', will raise a ValueError if the DataFrame and other both contain non-NA data in the same place. @@ -1502,12 +1502,11 @@ def update( Raises ------- ValueError - - When ``errors``= 'raise' and there’s overlapping non-NA data. + - When ``errors`` = 'raise' and there's overlapping non-NA data. - When ``errors`` is not either 'ignore' or 'raise' NotImplementedError - - If ``join`` != ‘left’ - + - If ``join`` != 'left' """ # TODO: Support other joins if join != "left": From cf2648b041ad5a416dd1be85f25f9f78c766dd2c Mon Sep 17 00:00:00 2001 From: skirui-source Date: Wed, 20 Jan 2021 23:33:20 -0800 Subject: [PATCH 13/14] fixed typo --- python/cudf/cudf/core/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c26fb3c5d4a..534e6adb5b3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1483,7 +1483,7 @@ def update( overwrite : {True, False}, default True How to handle non-NA values for overlapping keys: - True: overwrite original DataFrame’s values with values from other. + True: overwrite original DataFrame's values with values from other. False: only update values that are NA in the original DataFrame. filter_func : None From b5ad50136a01324b89e9387a00055299bafdcfb5 Mon Sep 17 00:00:00 2001 From: skirui-source Date: Thu, 21 Jan 2021 10:13:33 -0800 Subject: [PATCH 14/14] Recovered two deleted tests --- python/cudf/cudf/tests/test_dataframe.py | 58 ++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 4f91ac86003..f41714ec1ad 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8266,3 +8266,61 @@ def test_update_for_data_overlap(errors): lfunc_args_and_kwargs=([other_pd, errors], {}), rfunc_args_and_kwargs=([other_gd, errors], {}), ) + + +@pytest.mark.parametrize( + "gdf", + [ + gd.DataFrame({"a": [[1], [2], [3]]}), + gd.DataFrame( + { + "left-a": [0, 1, 2], + "a": [[1], None, [3]], + "right-a": ["abc", "def", "ghi"], + } + ), + gd.DataFrame( + { + "left-a": [[], None, None], + "a": [[1], None, [3]], + "right-a": ["abc", "def", "ghi"], + } + ), + ], +) +def test_dataframe_roundtrip_arrow_list_dtype(gdf): + table = gdf.to_arrow() + expected = gd.DataFrame.from_arrow(table) + + assert_eq(gdf, expected) + + +@pytest.mark.parametrize( + "gdf", + [ + gd.DataFrame({"a": [{"one": 3, "two": 4, "three": 10}]}), + gd.DataFrame( + { + "left-a": [0, 1, 2], + "a": [{"x": 0.23, "y": 43}, None, {"x": 23.9, "y": 4.3}], + "right-a": ["abc", "def", "ghi"], + } + ), + gd.DataFrame( + { + "left-a": [{"a": 1}, None, None], + "a": [ + {"one": 324, "two": 23432, "three": 324}, + None, + {"one": 3.24, "two": 1, "three": 324}, + ], + "right-a": ["abc", "def", "ghi"], + } + ), + ], +) +def test_dataframe_roundtrip_arrow_struct_dtype(gdf): + table = gdf.to_arrow() + expected = gd.DataFrame.from_arrow(table) + + assert_eq(gdf, expected)