Fix concatenation of cudf.RangeIndex (#8970)

Fixes: #6872 In cudf, we have been concatenating a collection of `RangeIndex`'s by materializing each one of them, but instead we should rather be introspecting each RangeIndex to decide whether to materialize of not. This PR fixes it.  Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) - Vyas Ramasubramani (https://github.com/vyasr) URL: #8970
rapidsai · Aug 5, 2021 · db63f61 · db63f61
1 parent e11b054
commit db63f61
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 4 deletions.
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
@@ -4,7 +4,7 @@
 
 import pickle
 from numbers import Number
-from typing import Any, Dict, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
 
 import cupy
 import numpy as np
@@ -588,13 +588,18 @@ def sum(self):
 
     @classmethod
     def _concat(cls, objs):
-        data = concat_columns([o._values for o in objs])
+        if all(isinstance(obj, RangeIndex) for obj in objs):
+            result = _concat_range_index(objs)
+        else:
+            data = concat_columns([o._values for o in objs])
+            result = as_index(data)
+
         names = {obj.name for obj in objs}
         if len(names) == 1:
             [name] = names
         else:
             name = None
-        result = as_index(data)
+
         result.name = name
         return result
 
@@ -3032,3 +3037,43 @@ def __new__(
             )
 
         return as_index(data, copy=copy, dtype=dtype, name=name, **kwargs)
+
+
+def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex:
+    """
+    An internal Utility function to concat RangeIndex objects.
+    """
+    start = step = next_ = None
+
+    # Filter the empty indexes
+    non_empty_indexes = [obj for obj in indexes if len(obj)]
+
+    if not non_empty_indexes:
+        # Here all "indexes" had 0 length, i.e. were empty.
+        # In this case return an empty range index.
+        return RangeIndex(0, 0)
+
+    for obj in non_empty_indexes:
+        if start is None:
+            # This is set by the first non-empty index
+            start = obj.start
+            if step is None and len(obj) > 1:
+                step = obj.step
+        elif step is None:
+            # First non-empty index had only one element
+            if obj.start == start:
+                result = as_index(concat_columns([x._values for x in indexes]))
+                return result
+            step = obj.start - start
+
+        non_consecutive = (step != obj.step and len(obj) > 1) or (
+            next_ is not None and obj.start != next_
+        )
+        if non_consecutive:
+            result = as_index(concat_columns([x._values for x in indexes]))
+            return result
+        if step is not None:
+            next_ = obj[-1] + step
+
+    stop = non_empty_indexes[-1].stop if next_ is None else next_
+    return RangeIndex(start, stop, step)
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
@@ -2316,3 +2316,22 @@ def test_get_loc_multi_string(idx, key, method):
         got = gi.get_loc(key, method=method)
 
         assert_eq(expected, got)
+
+
+@pytest.mark.parametrize(
+    "objs",
+    [
+        [pd.RangeIndex(0, 10), pd.RangeIndex(10, 20)],
+        [pd.RangeIndex(10, 20), pd.RangeIndex(22, 40), pd.RangeIndex(50, 60)],
+        [pd.RangeIndex(10, 20, 2), pd.RangeIndex(20, 40, 2)],
+    ],
+)
+def test_range_index_concat(objs):
+    cudf_objs = [cudf.from_pandas(obj) for obj in objs]
+
+    actual = cudf.concat(cudf_objs)
+
+    expected = objs[0]
+    for obj in objs[1:]:
+        expected = expected.append(obj)
+    assert_eq(expected, actual)
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -59,7 +59,7 @@ def test_from_cudf_with_generic_idx():
 
     ddf = dgd.from_cudf(cdf, npartitions=2)
 
-    assert isinstance(ddf.index.compute(), cudf.core.index.GenericIndex)
+    assert isinstance(ddf.index.compute(), cudf.RangeIndex)
     dd.assert_eq(ddf.loc[1:2, ["a"]], cdf.loc[1:2, ["a"]])