Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Dataframe.sort_index optimizations #9238

Merged
48 changes: 10 additions & 38 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3980,44 +3980,16 @@ def sort_index(
3 1 2
2 3 1
"""
if kind is not None:
raise NotImplementedError("kind is not yet supported")

if not sort_remaining:
raise NotImplementedError(
"sort_remaining == False is not yet supported"
)

if axis in (0, "index"):
if level is not None and isinstance(self.index, cudf.MultiIndex):
# Pandas currently don't handle na_position
# in case of MultiIndex
if ascending is True:
na_position = "first"
else:
na_position = "last"

if is_list_like(level):
labels = [
self.index._get_level_label(lvl) for lvl in level
]
else:
labels = [self.index._get_level_label(level)]
inds = self.index.to_frame(index=False)[labels].argsort(
ascending=ascending, na_position=na_position
)
else:
inds = self.index.argsort(
ascending=ascending, na_position=na_position
)
outdf = self.take(inds)
else:
labels = sorted(self._data.names, reverse=not ascending)
outdf = self[labels]

if ignore_index is True:
outdf = outdf.reset_index(drop=True)
return self._mimic_inplace(outdf, inplace=inplace)
return super()._sort_index(
axis=axis,
level=level,
ascending=ascending,
inplace=inplace,
kind=kind,
na_position=na_position,
sort_remaining=sort_remaining,
ignore_index=ignore_index,
)

def sort_values(
self,
Expand Down
89 changes: 89 additions & 0 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,95 @@ def _explode(self, explode_column: Any, ignore_index: bool):
res.index.names = self._index.names
return res

def _sort_index(
self,
axis=0,
level=None,
ascending=True,
inplace=False,
kind=None,
na_position="last",
sort_remaining=True,
ignore_index=False,
):
"""
Helper for .sort_index

Parameters
----------
axis : {0 or ‘index’, 1 or ‘columns’}, default 0
The axis along which to sort. The value 0 identifies the rows,
and 1 identifies the columns.
level : int or level name or list of ints or list of level names
If not None, sort on values in specified index level(s).
This is only useful in the case of MultiIndex.
ascending : bool, default True
Sort ascending vs. descending.
inplace : bool, default False
If True, perform operation in-place.
kind : sorting method such as `quick sort` and others.
Not yet supported.
na_position : {‘first’, ‘last’}, default ‘last’
Puts NaNs at the beginning if first; last puts NaNs at the end.
sort_remaining : bool, default True
Not yet supported
ignore_index : bool, default False
if True, index will be replaced with RangeIndex.

Returns
-------
DataFrame/Series or None
"""
if kind is not None:
raise NotImplementedError("kind is not yet supported")

if not sort_remaining:
raise NotImplementedError(
"sort_remaining == False is not yet supported"
)

if axis in (0, "index"):
if isinstance(self.index, cudf.MultiIndex):
if level is None:
midx_data = self.index.to_frame(index=False)
else:
# Pandas currently don't handle na_position
# in case of MultiIndex
if ascending is True:
na_position = "first"
else:
na_position = "last"

if cudf.api.types.is_list_like(level):
labels = [
self.index._get_level_label(lvl) for lvl in level
]
else:
labels = [self.index._get_level_label(level)]
midx_data = cudf.DataFrame._from_data(
self.index._data.select_by_label(labels)
)
inds = midx_data.argsort(
ascending=ascending, na_position=na_position
)
outdf = self.take(inds)
elif (ascending and self.index.is_monotonic_increasing) or (
not ascending and self.index.is_monotonic_decreasing
):
outdf = self.copy()
Comment on lines +504 to +507
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just wondering, is_monotonic_* is available for both Index and MultiIndex. Maybe this optimization can be applied regardless of object type?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We would have to adhere to extracting level, which will be a DataFrame and again round-trip that back to MultiIndex object to do an is_monotonic_* check which seems to be inefficient and memory consuming.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also out of the context of this PR.. I can see the reason why we need to convert the index into a dataframe is because it's depending on argsort and take. Hopefully we can sink them into Frame so that there's no such need to convert to dataframes.

The difficulty of sinking argsort is that I believe Series depends on a single column sort while DataFrame depends on a multi column sort.

else:
inds = self.index.argsort(
ascending=ascending, na_position=na_position
)
outdf = self.take(inds)
else:
labels = sorted(self._data.names, reverse=not ascending)
outdf = self[labels]

if ignore_index is True:
outdf = outdf.reset_index(drop=True)
return self._mimic_inplace(outdf, inplace=inplace)

def _get_columns_by_label(self, labels, downcast=False):
"""
Returns columns of the Frame specified by `labels`
Expand Down
39 changes: 36 additions & 3 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2912,14 +2912,39 @@ def argsort(self, ascending=True, na_position="last"):
"""
return self._sort(ascending=ascending, na_position=na_position)[1]

def sort_index(self, ascending=True):
def sort_index(
self,
axis=0,
level=None,
ascending=True,
inplace=False,
kind=None,
na_position="last",
sort_remaining=True,
ignore_index=False,
):
"""
Sort by the index.

Parameters
----------
axis : {0 or ‘index’, 1 or ‘columns’}, default 0
Axis to direct sorting. This can only be 0 for Series.
level : int or level name or list of ints or list of level names
If not None, sort on values in specified index level(s).
This is only useful in the case of MultiIndex.
ascending : bool, default True
Sort ascending vs. descending.
inplace : bool, default False
If True, perform operation in-place.
kind : sorting method such as `quick sort` and others.
Not yet supported.
na_position : {‘first’, ‘last’}, default ‘last’
Puts NaNs at the beginning if first; last puts NaNs at the end.
sort_remaining : bool, default True
Not yet supported
ignore_index : bool, default False
if True, index will be replaced with RangeIndex.

Returns
-------
Expand Down Expand Up @@ -2952,8 +2977,16 @@ def sort_index(self, ascending=True):
1 c
dtype: object
"""
inds = self.index.argsort(ascending=ascending)
return self.take(inds)
return super()._sort_index(
axis=axis,
level=level,
ascending=ascending,
inplace=inplace,
kind=kind,
na_position=na_position,
sort_remaining=sort_remaining,
ignore_index=ignore_index,
)

def sort_values(
self,
Expand Down
27 changes: 21 additions & 6 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2898,20 +2898,34 @@ def test_dataframe_empty_sort_index():
expect = pdf.sort_index()
got = gdf.sort_index()

assert_eq(expect, got)
assert_eq(expect, got, check_index_type=True)


@pytest.mark.parametrize(
"index",
[
pd.RangeIndex(0, 3, 1),
[3.0, 1.0, np.nan],
pytest.param(
pd.RangeIndex(2, -1, -1),
marks=[
pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/43591"
)
],
),
],
)
@pytest.mark.parametrize("axis", [0, 1, "index", "columns"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("ignore_index", [True, False])
@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize("na_position", ["first", "last"])
def test_dataframe_sort_index(
axis, ascending, inplace, ignore_index, na_position
index, axis, ascending, inplace, ignore_index, na_position
):
pdf = pd.DataFrame(
{"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]},
index=[3.0, 1.0, np.nan],
{"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]}, index=index,
)
gdf = cudf.DataFrame.from_pandas(pdf)

Expand All @@ -2931,9 +2945,9 @@ def test_dataframe_sort_index(
)

if inplace is True:
assert_eq(pdf, gdf)
assert_eq(pdf, gdf, check_index_type=True)
else:
assert_eq(expected, got)
assert_eq(expected, got, check_index_type=True)


@pytest.mark.parametrize("axis", [0, 1, "index", "columns"])
Expand Down Expand Up @@ -2972,6 +2986,7 @@ def test_dataframe_mulitindex_sort_index(
gdf = cudf.DataFrame.from_pandas(pdf)

# ignore_index is supported in v.1.0

expected = pdf.sort_index(
axis=axis,
level=level,
Expand Down
51 changes: 51 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,7 @@ def test_series_datetime_value_counts(data, nulls, normalize, dropna):
expected.reset_index(drop=True),
got.reset_index(drop=True),
check_dtype=False,
check_index_type=True,
)


Expand Down Expand Up @@ -547,11 +548,13 @@ def test_categorical_value_counts(dropna, normalize, num_elements):
pdf_value_counts.sort_index(),
gdf_value_counts.sort_index(),
check_dtype=False,
check_index_type=True,
)
assert_eq(
pdf_value_counts.reset_index(drop=True),
gdf_value_counts.reset_index(drop=True),
check_dtype=False,
check_index_type=True,
)


Expand Down Expand Up @@ -1228,3 +1231,51 @@ def test_series_upcast_float16(data):
actual_series = cudf.Series(data)
expected_series = cudf.Series(data, dtype="float32")
assert_eq(actual_series, expected_series)


@pytest.mark.parametrize(
"index",
[
pd.RangeIndex(0, 3, 1),
[3.0, 1.0, np.nan],
["a", "z", None],
pytest.param(
pd.RangeIndex(4, -1, -2),
marks=[
pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/43591"
)
],
),
],
)
@pytest.mark.parametrize("axis", [0, "index"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("ignore_index", [True, False])
@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize("na_position", ["first", "last"])
def test_series_sort_index(
index, axis, ascending, inplace, ignore_index, na_position
):
ps = pd.Series([10, 3, 12], index=index)
gs = cudf.from_pandas(ps)

expected = ps.sort_index(
axis=axis,
ascending=ascending,
ignore_index=ignore_index,
inplace=inplace,
na_position=na_position,
)
got = gs.sort_index(
axis=axis,
ascending=ascending,
ignore_index=ignore_index,
inplace=inplace,
na_position=na_position,
)

if inplace is True:
assert_eq(ps, gs, check_index_type=True)
else:
assert_eq(expected, got, check_index_type=True)
12 changes: 6 additions & 6 deletions python/cudf/cudf/tests/test_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,12 @@ def test_series_argsort(nelem, dtype, asc):
def test_series_sort_index(nelem, asc):
np.random.seed(0)
sr = Series((100 * np.random.random(nelem)))
orig = sr.to_array()
got = sr.sort_values().sort_index(ascending=asc).to_array()
if not asc:
# Reverse the array for descending sort
got = got[::-1]
np.testing.assert_array_equal(orig, got)
psr = sr.to_pandas()

expected = psr.sort_index(ascending=asc)
got = sr.sort_index(ascending=asc)

assert_eq(expected, got)


@pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]])
Expand Down
1 change: 0 additions & 1 deletion python/dask_cudf/dask_cudf/tests/test_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,6 @@ def test_struct_field_integer(data):
def test_dask_struct_field_Key_Error(data):
got = dgd.from_cudf(Series(data), 2)

# import pdb; pdb.set_trace()
with pytest.raises(KeyError):
got.struct.field("notakey").compute()

Expand Down