From 34b54caf8e8d51284cb351cae0f943232130b181 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 1 Oct 2021 07:34:38 -0700 Subject: [PATCH] Deprecate method parameters to DataFrame.join, DataFrame.merge. (#9291) The `method` parameter to `DataFrame.join` and `DataFrame.merge` [isn't used internally](https://github.com/rapidsai/cudf/blob/e2098e56f0cb209b1d916ce617c04533444a056a/python/cudf/cudf/core/join/join.py#L90) after changes in #7454. This PR updates the docstrings and adds deprecation notices via `FutureWarning` as discussed in #9347. The parameter is now deprecated in the public API. I removed all internal uses of the `method` parameter. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/9291 --- python/cudf/cudf/core/dataframe.py | 28 ++++++++++++++++----- python/cudf/cudf/core/frame.py | 2 -- python/cudf/cudf/core/join/join.py | 3 --- python/cudf/cudf/core/series.py | 1 - python/cudf/cudf/tests/test_joining.py | 34 ++++++-------------------- 5 files changed, 30 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 60f654f4836..9e11792fdd4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4399,7 +4399,7 @@ def merge( sort=False, lsuffix=None, rsuffix=None, - method="hash", + method=None, indicator=False, suffixes=("_x", "_y"), ): @@ -4444,8 +4444,9 @@ def merge( suffixes: Tuple[str, str], defaults to ('_x', '_y') Suffixes applied to overlapping column names on the left and right sides - method : {‘hash’, ‘sort’}, default ‘hash’ - The implementation method to be used for the operation. + method : + This parameter is unused. It is deprecated and will be removed in a + future version. Returns ------- @@ -4507,6 +4508,13 @@ def merge( else: lsuffix, rsuffix = suffixes + if method is not None: + warnings.warn( + "The 'method' argument is deprecated and will be removed " + "in a future version of cudf.", + FutureWarning, + ) + # Compute merge gdf_result = super()._merge( right, @@ -4517,7 +4525,6 @@ def merge( right_index=right_index, how=how, sort=sort, - method=method, indicator=indicator, suffixes=suffixes, ) @@ -4532,7 +4539,7 @@ def join( lsuffix="", rsuffix="", sort=False, - method="hash", + method=None, ): """Join columns with other DataFrame on index or on a key column. @@ -4546,6 +4553,9 @@ def join( column names when avoiding conflicts. sort : bool Set to True to ensure sorted ordering. + method : + This parameter is unused. It is deprecated and will be removed in a + future version. Returns ------- @@ -4559,6 +4569,13 @@ def join( - *on* is not supported yet due to lack of multi-index support. """ + if method is not None: + warnings.warn( + "The 'method' argument is deprecated and will be removed " + "in a future version of cudf.", + FutureWarning, + ) + lhs = self rhs = other @@ -4568,7 +4585,6 @@ def join( right_index=True, how=how, suffixes=(lsuffix, rsuffix), - method=method, sort=sort, ) df.index.name = ( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index b0315b7e8f1..444354ab52d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3552,7 +3552,6 @@ def _merge( right_index=False, how="inner", sort=False, - method="hash", indicator=False, suffixes=("_x", "_y"), ): @@ -3575,7 +3574,6 @@ def _merge( right_index=right_index, how=how, sort=sort, - method=method, indicator=indicator, suffixes=suffixes, ) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 276038146e1..55540d362ac 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -29,7 +29,6 @@ def merge( right_index, how, sort, - method, indicator, suffixes, ): @@ -47,7 +46,6 @@ def merge( right_index=right_index, how=how, sort=sort, - method=method, indicator=indicator, suffixes=suffixes, ) @@ -87,7 +85,6 @@ def __init__( right_index, how, sort, - method, indicator, suffixes, ): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 18eec84ccf6..7fd8efb1dab 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4651,7 +4651,6 @@ def merge( right_index=right_index, how=how, sort=sort, - method=method, indicator=False, suffixes=suffixes, ) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index b18cce60bfd..bcce80e0a26 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -21,47 +21,30 @@ def make_params(): np.random.seed(0) hows = _JOIN_TYPES - methods = "hash,sort".split(",") # Test specific cases (1) aa = [0, 0, 4, 5, 5] bb = [0, 0, 2, 3, 5] for how in hows: - if how in ["left", "inner", "right", "leftanti", "leftsemi"]: - for method in methods: - yield (aa, bb, how, method) - else: - yield (aa, bb, how, "sort") + yield (aa, bb, how) # Test specific cases (2) aa = [0, 0, 1, 2, 3] bb = [0, 1, 2, 2, 3] for how in hows: - if how in ["left", "inner", "right", "leftanti", "leftsemi"]: - for method in methods: - yield (aa, bb, how, method) - else: - yield (aa, bb, how, "sort") + yield (aa, bb, how) # Test large random integer inputs aa = np.random.randint(0, 50, 100) bb = np.random.randint(0, 50, 100) for how in hows: - if how in ["left", "inner", "right", "leftanti", "leftsemi"]: - for method in methods: - yield (aa, bb, how, method) - else: - yield (aa, bb, how, "sort") + yield (aa, bb, how) # Test floating point inputs aa = np.random.random(50) bb = np.random.random(50) for how in hows: - if how in ["left", "inner", "right", "leftanti", "leftsemi"]: - for method in methods: - yield (aa, bb, how, method) - else: - yield (aa, bb, how, "sort") + yield (aa, bb, how) def pd_odd_joins(left, right, join_type): @@ -102,8 +85,8 @@ def assert_join_results_equal(expect, got, how, **kwargs): raise ValueError(f"Not a join result: {type(expect).__name__}") -@pytest.mark.parametrize("aa,bb,how,method", make_params()) -def test_dataframe_join_how(aa, bb, how, method): +@pytest.mark.parametrize("aa,bb,how", make_params()) +def test_dataframe_join_how(aa, bb, how): df = cudf.DataFrame() df["a"] = aa df["b"] = bb @@ -122,7 +105,7 @@ def work_pandas(df, how): def work_gdf(df): df1 = df.set_index("a") df2 = df.set_index("b") - joined = df1.join(df2, how=how, sort=True, method=method) + joined = df1.join(df2, how=how, sort=True) return joined expect = work_pandas(df.to_pandas(), how) @@ -136,8 +119,7 @@ def work_gdf(df): assert got.index.name is None assert list(expect.columns) == list(got.columns) - # test disabled until libgdf sort join gets updated with new api - if method == "hash": + if how in {"left", "inner", "right", "leftanti", "leftsemi"}: assert_eq(sorted(expect.index.values), sorted(got.index.values)) if how != "outer": # Newly introduced ambiguous ValueError thrown when