From 544f039ae23743169e45ee3c99886f26847698df Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Fri, 1 Jul 2022 13:36:54 -0400 Subject: [PATCH] Fix index alignment for Series objects with repeated index (#11103) The logic for aligning the indexes of two Series objects with a repeated value of the index is broken. We previously introduced an optimization that avoids a join operation, but unfortunately, that doesn't work in every situation, as seen in https://github.com/rapidsai/cudf/issues/11094. This PR removes that special-casing. Closes #11094 Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/11103 --- python/cudf/cudf/core/series.py | 26 ++++++++++---------------- python/cudf/cudf/tests/test_binops.py | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 0cb63fb99a4..e9e1b175839 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4712,22 +4712,16 @@ def _align_indices(series_list, how="outer", allow_non_unique=False): if all_index_equal: return series_list - if how == "outer": - combined_index = cudf.core.reshape.concat( - [sr.index for sr in series_list] - ).unique() - combined_index.names = new_index_names - else: - combined_index = series_list[0].index - for sr in series_list[1:]: - combined_index = ( - cudf.DataFrame(index=sr.index).join( - cudf.DataFrame(index=combined_index), - sort=True, - how="inner", - ) - ).index - combined_index.names = new_index_names + combined_index = series_list[0].index + for sr in series_list[1:]: + combined_index = ( + cudf.DataFrame(index=sr.index).join( + cudf.DataFrame(index=combined_index), + sort=True, + how=how, + ) + ).index + combined_index.names = new_index_names # align all Series to the combined index result = [ diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 0d1bac6aead..2397dba7f76 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -593,6 +593,14 @@ def test_different_shapes_and_columns_with_unaligned_indices(binop): cd_frame["y"] = cd_frame["y"].astype(np.float64) utils.assert_eq(cd_frame, pd_frame) + pdf1 = pd.DataFrame({"x": [1, 1]}, index=["a", "a"]) + pdf2 = pd.DataFrame({"x": [2]}, index=["a"]) + gdf1 = cudf.DataFrame.from_pandas(pdf1) + gdf2 = cudf.DataFrame.from_pandas(pdf2) + pd_frame = binop(pdf1, pdf2) + cd_frame = binop(gdf1, gdf2) + utils.assert_eq(pd_frame, cd_frame) + @pytest.mark.parametrize( "df2", @@ -2962,3 +2970,14 @@ def test_binops_dot(df, other): got = df @ other utils.assert_eq(expected, got) + + +def test_binop_series_with_repeated_index(): + # GH: #11094 + psr1 = pd.Series([1, 1], index=["a", "a"]) + psr2 = pd.Series([1], index=["a"]) + gsr1 = cudf.from_pandas(psr1) + gsr2 = cudf.from_pandas(psr2) + expected = psr1 - psr2 + got = gsr1 - gsr2 + utils.assert_eq(expected, got)