Skip to content

Commit

Permalink
Fix DataFrame slicing issues for empty cases (#10310)
Browse files: browse the repository at this point in the history
Closes #10292

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #10310
  • Loading branch information
brandon-b-miller authored Feb 22, 2022
1 parent 36e8825 commit 58810af
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 14 deletions.
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -2289,7 +2289,9 @@ def arange(
if step is None:
step = 1

size = int(np.ceil((stop - start) / step))
size = len(range(int(start), int(stop), int(step)))
if size == 0:
return as_column([], dtype=dtype)

return libcudf.filling.sequence(
size,
Expand Down
45 changes: 43 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.frame import Frame, _drop_rows_by_labels
from cudf.core.groupby.groupby import DataFrameGroupBy
from cudf.core.index import BaseIndex, RangeIndex, as_index
from cudf.core.index import BaseIndex, Index, RangeIndex, as_index
from cudf.core.indexed_frame import (
IndexedFrame,
_FrameIndexer,
Expand Down Expand Up @@ -1219,13 +1219,54 @@ def _slice(self: T, arg: slice) -> T:
return self
start, stop, stride = arg.indices(num_rows)

# Return early when the slice selects no rows
if len(range(start, stop, stride)) == 0:
columns = ColumnAccessor(
{
colname: column.column_empty_like(col, newsize=0)
for colname, col in self._data.items()
},
multiindex=self._data.multiindex,
level_names=self._data.level_names,
)

if isinstance(self.index, MultiIndex):
mi_columns = ColumnAccessor(
{
colname: column.column_empty_like(col, newsize=0)
for colname, col in self.index._data.items()
}
)
return DataFrame._from_data(
columns,
index=MultiIndex._from_data(
mi_columns, name=self.index.name
),
)
else:
return DataFrame._from_data(
columns,
index=(
RangeIndex(
start=start,
stop=stop,
step=stride,
name=self.index.name,
)
if isinstance(self.index, RangeIndex)
else Index(
[], dtype=self.index.dtype, name=self.index.name
)
),
)

# Special-case RangeIndex so that it is not
# materialized unnecessarily
keep_index = True
if self.index is not None and isinstance(self.index, RangeIndex):
if self._num_columns == 0:
result = self._empty_like(keep_index)
result._index = self.index[start:stop]
result._index = self.index[start:stop:stride]
return result
keep_index = False

Expand Down
29 changes: 18 additions & 11 deletions python/cudf/cudf/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1294,8 +1294,8 @@ def test_loc_datetime_index(sli, is_dataframe):
@pytest.mark.parametrize(
"gdf_kwargs",
[
{"data": {"a": range(100000)}},
{"data": {"a": range(100000), "b": range(100000)}},
{"data": {"a": range(1000)}},
{"data": {"a": range(1000), "b": range(1000)}},
{
"data": {
"a": range(20),
Expand All @@ -1304,26 +1304,33 @@ def test_loc_datetime_index(sli, is_dataframe):
}
},
{"index": [1, 2, 3]},
{"index": range(100000)},
{"index": range(1000)},
{"columns": ["a", "b", "c", "d"]},
{"columns": ["a"], "index": range(100000)},
{"columns": ["a", "col2", "...col n"], "index": range(100000)},
{"index": cudf.Series(range(100000)).astype("str")},
{"columns": ["a"], "index": range(1000)},
{"columns": ["a", "col2", "...col n"], "index": range(1000)},
{"index": cudf.Series(range(1000)).astype("str")},
{
"columns": ["a", "b", "c", "d"],
"index": cudf.Series(range(100000)).astype("str"),
"index": cudf.Series(range(1000)).astype("str"),
},
],
)
@pytest.mark.parametrize(
"slice",
[
slice(25000, 50000),
slice(25000, 25001),
slice(50000),
slice(6, None), # start but no stop, [6:]
slice(None, None, 3), # only step, [::3]
slice(1, 10, 2), # start, stop, step
slice(3, -5, 2), # negative stop
slice(-2, -4), # slice is empty
slice(-10, -20, -1), # reversed slice
slice(None), # slices everything, same as [:]
slice(250, 500),
slice(250, 251),
slice(50),
slice(1, 10),
slice(10, 20),
slice(15, 24000),
slice(15, 24),
slice(6),
],
)
Expand Down

0 comments on commit 58810af

Please sign in to comment.