From 4ce6674641def5a68dce633d3a21f17438ae48de Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Mon, 6 May 2024 14:43:20 -0500
Subject: [PATCH] Return `int64` when pandas compatible mode is turned on for `get_indexer` (#15659)

Fixes: #15658

This PR makes a change to `get_indexer` to return `int64` indices when pandas compatible mode is turned on.

Forks out of https://github.com/rapidsai/cudf/pull/14534

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15659
---
 python/cudf/cudf/core/_base_index.py |  6 ++++++
 python/cudf/cudf/core/index.py       |  8 ++++----
 python/cudf/cudf/core/multiindex.py  |  7 ++++---
 python/cudf/cudf/tests/test_index.py | 20 ++++++++++++++++++++
 4 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index b5630ff9a54..fe0f39f9d0a 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -2205,3 +2205,9 @@ def _split(self, splits):
 
 def _get_result_name(left_name, right_name):
     return left_name if _is_same_name(left_name, right_name) else None
+
+
+def _return_get_indexer_result(result):
+    if cudf.get_option("mode.pandas_compatible"):
+        return result.astype("int64")
+    return result
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index b51751a1b55..a2ad10a0590 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -38,7 +38,7 @@
     is_list_like,
     is_scalar,
 )
-from cudf.core._base_index import BaseIndex
+from cudf.core._base_index import BaseIndex, _return_get_indexer_result
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.column import (
     CategoricalColumn,
@@ -1256,11 +1256,11 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
         )
 
         if not len(self):
-            return result.values
+            return _return_get_indexer_result(result.values)
         try:
             lcol, rcol = _match_join_keys(needle, self._column, "inner")
         except ValueError:
-            return result.values
+            return _return_get_indexer_result(result.values)
 
         scatter_map, indices = libcudf.join.join([lcol], [rcol], how="inner")
         (result,) = libcudf.copying.scatter([indices], scatter_map, [result])
@@ -1287,7 +1287,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
                 "{['ffill'/'pad', 'bfill'/'backfill', 'nearest', None]}"
             )
 
-        return result_series.to_cupy()
+        return _return_get_indexer_result(result_series.to_cupy())
 
     @_cudf_nvtx_annotate
     def get_loc(self, key):
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 1ab42df111f..c3184f51a4c 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -23,6 +23,7 @@
 from cudf.api.extensions import no_default
 from cudf.api.types import is_integer, is_list_like, is_object_dtype
 from cudf.core import column
+from cudf.core._base_index import _return_get_indexer_result
 from cudf.core.frame import Frame
 from cudf.core.index import (
     BaseIndex,
@@ -1858,11 +1859,11 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
             dtype=libcudf.types.size_type_dtype,
         )
         if not len(self):
-            return result.values
+            return _return_get_indexer_result(result.values)
         try:
             target = cudf.MultiIndex.from_tuples(target)
         except TypeError:
-            return result.values
+            return _return_get_indexer_result(result.values)
 
         join_keys = [
             _match_join_keys(lcol, rcol, "inner")
@@ -1892,7 +1893,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
                 "{['ffill'/'pad', 'bfill'/'backfill', None]}"
             )
 
-        return result_series.to_cupy()
+        return _return_get_indexer_result(result_series.to_cupy())
 
     @_cudf_nvtx_annotate
     def get_loc(self, key):
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 104a5fc0ffa..4ff1beb0a9a 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -1741,6 +1741,10 @@ def test_get_indexer_single_unique_numeric(idx, key, method):
 
     assert_eq(expected, got)
 
+    with cudf.option_context("mode.pandas_compatible", True):
+        got = gi.get_indexer(key, method=method)
+    assert_eq(expected, got, check_dtype=True)
+
 
 @pytest.mark.parametrize(
     "idx",
@@ -1770,6 +1774,12 @@ def test_get_indexer_rangeindex(idx, key, method, tolerance):
 
     assert_eq(expected, got)
 
+    with cudf.option_context("mode.pandas_compatible", True):
+        got = gi.get_indexer(
+            key, method=method, tolerance=None if method is None else tolerance
+        )
+    assert_eq(expected, got, check_dtype=True)
+
 
 @pytest.mark.parametrize(
     "idx",
@@ -1950,6 +1960,11 @@ def test_get_indexer_single_duplicate_string(idx, key, method):
 
     assert_eq(expected, got)
 
+    with cudf.option_context("mode.pandas_compatible", True):
+        got = gi.get_indexer(key, method=method)
+
+    assert_eq(expected, got, check_dtype=True)
+
 
 @pytest.mark.parametrize(
     "idx",
@@ -2009,6 +2024,11 @@ def test_get_indexer_multi_numeric(idx, key, method):
 
     assert_eq(expected, got)
 
+    with cudf.option_context("mode.pandas_compatible", True):
+        got = gi.get_indexer(key, method=method)
+
+    assert_eq(expected, got, check_dtype=True)
+
 
 @pytest.mark.parametrize(
     "idx",