From a2840c646be2e2d1baf7e8f53da0ae2f7f320ada Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 5 Aug 2021 15:51:03 -0500 Subject: [PATCH] Fix concatenation of `cudf.RangeIndex` (#8970) Fixes: #6872 In cudf, we have been concatenating a collection of `RangeIndex`'s by materializing each one of them, but instead we should rather be introspecting each RangeIndex to decide whether to materialize of not. This PR fixes it. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/8970 --- python/cudf/cudf/core/index.py | 51 +++++++++++++++++-- python/cudf/cudf/tests/test_index.py | 19 +++++++ python/dask_cudf/dask_cudf/tests/test_core.py | 2 +- 3 files changed, 68 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 691b6ab2e29..c94aa940ec5 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -4,7 +4,7 @@ import pickle from numbers import Number -from typing import Any, Dict, Optional, Tuple, Type, Union +from typing import Any, Dict, List, Optional, Tuple, Type, Union import cupy import numpy as np @@ -588,13 +588,18 @@ def sum(self): @classmethod def _concat(cls, objs): - data = concat_columns([o._values for o in objs]) + if all(isinstance(obj, RangeIndex) for obj in objs): + result = _concat_range_index(objs) + else: + data = concat_columns([o._values for o in objs]) + result = as_index(data) + names = {obj.name for obj in objs} if len(names) == 1: [name] = names else: name = None - result = as_index(data) + result.name = name return result @@ -3043,3 +3048,43 @@ def __new__( ) return as_index(data, copy=copy, dtype=dtype, name=name, **kwargs) + + +def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: + """ + An internal Utility function to concat RangeIndex objects. + """ + start = step = next_ = None + + # Filter the empty indexes + non_empty_indexes = [obj for obj in indexes if len(obj)] + + if not non_empty_indexes: + # Here all "indexes" had 0 length, i.e. were empty. + # In this case return an empty range index. + return RangeIndex(0, 0) + + for obj in non_empty_indexes: + if start is None: + # This is set by the first non-empty index + start = obj.start + if step is None and len(obj) > 1: + step = obj.step + elif step is None: + # First non-empty index had only one element + if obj.start == start: + result = as_index(concat_columns([x._values for x in indexes])) + return result + step = obj.start - start + + non_consecutive = (step != obj.step and len(obj) > 1) or ( + next_ is not None and obj.start != next_ + ) + if non_consecutive: + result = as_index(concat_columns([x._values for x in indexes])) + return result + if step is not None: + next_ = obj[-1] + step + + stop = non_empty_indexes[-1].stop if next_ is None else next_ + return RangeIndex(start, stop, step) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index f03454c479a..3f58eb3d6e7 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2316,3 +2316,22 @@ def test_get_loc_multi_string(idx, key, method): got = gi.get_loc(key, method=method) assert_eq(expected, got) + + +@pytest.mark.parametrize( + "objs", + [ + [pd.RangeIndex(0, 10), pd.RangeIndex(10, 20)], + [pd.RangeIndex(10, 20), pd.RangeIndex(22, 40), pd.RangeIndex(50, 60)], + [pd.RangeIndex(10, 20, 2), pd.RangeIndex(20, 40, 2)], + ], +) +def test_range_index_concat(objs): + cudf_objs = [cudf.from_pandas(obj) for obj in objs] + + actual = cudf.concat(cudf_objs) + + expected = objs[0] + for obj in objs[1:]: + expected = expected.append(obj) + assert_eq(expected, actual) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index cf5203a22e5..ace9701b677 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -59,7 +59,7 @@ def test_from_cudf_with_generic_idx(): ddf = dgd.from_cudf(cdf, npartitions=2) - assert isinstance(ddf.index.compute(), cudf.core.index.GenericIndex) + assert isinstance(ddf.index.compute(), cudf.RangeIndex) dd.assert_eq(ddf.loc[1:2, ["a"]], cdf.loc[1:2, ["a"]])