Fix DataFrame slicing issues for empty cases (#10310)

Closes #10292

Authors:
- https://github.com/brandon-b-miller

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)

URL: #10310
rapidsai · Feb 22, 2022 · 58810af · 58810af
1 parent 36e8825
commit 58810af
Show file tree

Hide file tree

Showing 3 changed files with 64 additions and 14 deletions.
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -2289,7 +2289,9 @@ def arange(
     if step is None:
         step = 1
 
-    size = int(np.ceil((stop - start) / step))
+    size = len(range(int(start), int(stop), int(step)))
+    if size == 0:
+        return as_column([], dtype=dtype)
 
     return libcudf.filling.sequence(
         size,

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -53,7 +53,7 @@
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.frame import Frame, _drop_rows_by_labels
 from cudf.core.groupby.groupby import DataFrameGroupBy
-from cudf.core.index import BaseIndex, RangeIndex, as_index
+from cudf.core.index import BaseIndex, Index, RangeIndex, as_index
 from cudf.core.indexed_frame import (
     IndexedFrame,
     _FrameIndexer,
@@ -1219,13 +1219,54 @@ def _slice(self: T, arg: slice) -> T:
             return self
         start, stop, stride = arg.indices(num_rows)
 
+        # early stop for empty cases
+        if len(range(start, stop, stride)) == 0:
+            columns = ColumnAccessor(
+                {
+                    colname: column.column_empty_like(col, newsize=0)
+                    for colname, col in self._data.items()
+                },
+                multiindex=self._data.multiindex,
+                level_names=self._data.level_names,
+            )
+
+            if isinstance(self.index, MultiIndex):
+                mi_columns = ColumnAccessor(
+                    {
+                        colname: column.column_empty_like(col, newsize=0)
+                        for colname, col in self.index._data.items()
+                    }
+                )
+                return DataFrame._from_data(
+                    columns,
+                    index=MultiIndex._from_data(
+                        mi_columns, name=self.index.name
+                    ),
+                )
+            else:
+                return DataFrame._from_data(
+                    columns,
+                    index=(
+                        RangeIndex(
+                            start=start,
+                            stop=stop,
+                            step=stride,
+                            name=self.index.name,
+                        )
+                        if isinstance(self.index, RangeIndex)
+                        else Index(
+                            [], dtype=self.index.dtype, name=self.index.name
+                        )
+                    ),
+                )
+
         # This is just to handle RangeIndex type, stop
         # it from materializing unnecessarily
         keep_index = True
         if self.index is not None and isinstance(self.index, RangeIndex):
             if self._num_columns == 0:
                 result = self._empty_like(keep_index)
-                result._index = self.index[start:stop]
+                result._index = self.index[start:stop:stride]
                 return result
             keep_index = False
 

diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
@@ -1294,8 +1294,8 @@ def test_loc_datetime_index(sli, is_dataframe):
 @pytest.mark.parametrize(
     "gdf_kwargs",
     [
-        {"data": {"a": range(100000)}},
-        {"data": {"a": range(100000), "b": range(100000)}},
+        {"data": {"a": range(1000)}},
+        {"data": {"a": range(1000), "b": range(1000)}},
         {
             "data": {
                 "a": range(20),
@@ -1304,26 +1304,33 @@ def test_loc_datetime_index(sli, is_dataframe):
             }
         },
         {"index": [1, 2, 3]},
-        {"index": range(100000)},
+        {"index": range(1000)},
         {"columns": ["a", "b", "c", "d"]},
-        {"columns": ["a"], "index": range(100000)},
-        {"columns": ["a", "col2", "...col n"], "index": range(100000)},
-        {"index": cudf.Series(range(100000)).astype("str")},
+        {"columns": ["a"], "index": range(1000)},
+        {"columns": ["a", "col2", "...col n"], "index": range(1000)},
+        {"index": cudf.Series(range(1000)).astype("str")},
         {
             "columns": ["a", "b", "c", "d"],
-            "index": cudf.Series(range(100000)).astype("str"),
+            "index": cudf.Series(range(1000)).astype("str"),
         },
     ],
 )
 @pytest.mark.parametrize(
     "slice",
     [
-        slice(25000, 50000),
-        slice(25000, 25001),
-        slice(50000),
+        slice(6, None),  # start but no stop, [6:]
+        slice(None, None, 3),  # only step, [::3]
+        slice(1, 10, 2),  # start, stop, step
+        slice(3, -5, 2),  # negative stop
+        slice(-2, -4),  # slice is empty
+        slice(-10, -20, -1),  # reversed slice
+        slice(None),  # slices everything, same as [:]
+        slice(250, 500),
+        slice(250, 251),
+        slice(50),
         slice(1, 10),
         slice(10, 20),
-        slice(15, 24000),
+        slice(15, 24),
         slice(6),
     ],
 )