From 5a82585279219521e226ad22f4f33f7b6f5eb9d6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 16 Sep 2021 18:13:06 -0500 Subject: [PATCH] `Dataframe.sort_index` optimizations (#9238) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes: #9234 - [x] This PR introduces optimizations to `sort_index` when there is an already sorted `Index` object and avoids sorting them and performing a `take` operation on them. This **alleviates** a lot of **memory pressure** and has **a 3x to 6x speed up.** On a T4 GPU: `This PR`: ```python In [1]: import cudf In [2]: df = cudf.DataFrame({'a':[1, 2, 3]*100000000, 'b':['a', 'b', 'c']*100000000, 'c':[0.0, 0.12, 10.12]*100000000}) In [3]: %timeit df.sort_index() 174 ms ± 368 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) ``` `branch-21.10`: Won't fit into memory and will error :( on T4 as it tries to perform argsort on an already sorted index. `THIS PR`: ```python In [1]: import cudf In [2]: df = cudf.DataFrame({'a':[1, 2, 3]*10000000, 'b':['a', 'b', 'c']*10000000, 'c':[0.0, 0.12, 10.12]*10000000}) In [3]: %timeit df.sort_index(ascending=False) 69.1 ms ± 221 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) In [4]: %timeit df.sort_index() 15.2 ms ± 213 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) In [5]: df_reversed = df[::-1] In [6]: %timeit df_reversed.sort_index() 72.6 ms ± 433 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) In [7]: %timeit df_reversed.sort_index(ascending=False) 24.1 ms ± 423 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) ``` `branch-21.10`: ```python In [1]: import cudf In [2]: df = cudf.DataFrame({'a':[1, 2, 3]*10000000, 'b':['a', 'b', 'c']*10000000, 'c':[0.0, 0.12, 10.12]*10000000}) In [3]: %timeit df.sort_index(ascending=False) 71.6 ms ± 141 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) In [4]: %timeit df.sort_index() 71.7 ms ± 189 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) In [5]: df_reversed = df[::-1] In [6]: %timeit df_reversed.sort_index() 69.1 ms ± 201 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) In [7]: %timeit df_reversed.sort_index(ascending=False) 69 ms ± 127 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) ``` - [x] Also expands params to `Series.sort_index` and refactored the common implementation to `Frame._sort_index`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Michael Wang (https://github.com/isVoid) - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/cudf/pull/9238 --- python/cudf/cudf/core/dataframe.py | 48 +++------- python/cudf/cudf/core/frame.py | 89 +++++++++++++++++++ python/cudf/cudf/core/series.py | 39 +++++++- python/cudf/cudf/tests/test_dataframe.py | 27 ++++-- python/cudf/cudf/tests/test_series.py | 51 +++++++++++ python/cudf/cudf/tests/test_sorting.py | 12 +-- .../dask_cudf/tests/test_accessor.py | 1 - 7 files changed, 213 insertions(+), 54 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b8fe4fcaff6..1143f85a4e6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3980,44 +3980,16 @@ def sort_index( 3 1 2 2 3 1 """ - if kind is not None: - raise NotImplementedError("kind is not yet supported") - - if not sort_remaining: - raise NotImplementedError( - "sort_remaining == False is not yet supported" - ) - - if axis in (0, "index"): - if level is not None and isinstance(self.index, cudf.MultiIndex): - # Pandas currently don't handle na_position - # in case of MultiIndex - if ascending is True: - na_position = "first" - else: - na_position = "last" - - if is_list_like(level): - labels = [ - self.index._get_level_label(lvl) for lvl in level - ] - else: - labels = [self.index._get_level_label(level)] - inds = self.index.to_frame(index=False)[labels].argsort( - ascending=ascending, na_position=na_position - ) - else: - inds = self.index.argsort( - ascending=ascending, na_position=na_position - ) - outdf = self.take(inds) - else: - labels = sorted(self._data.names, reverse=not ascending) - outdf = self[labels] - - if ignore_index is True: - outdf = outdf.reset_index(drop=True) - return self._mimic_inplace(outdf, inplace=inplace) + return super()._sort_index( + axis=axis, + level=level, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + sort_remaining=sort_remaining, + ignore_index=ignore_index, + ) def sort_values( self, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 64b96458218..09a6df67da5 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -429,6 +429,95 @@ def _explode(self, explode_column: Any, ignore_index: bool): res.index.names = self._index.names return res + def _sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind=None, + na_position="last", + sort_remaining=True, + ignore_index=False, + ): + """ + Helper for .sort_index + + Parameters + ---------- + axis : {0 or ‘index’, 1 or ‘columns’}, default 0 + The axis along which to sort. The value 0 identifies the rows, + and 1 identifies the columns. + level : int or level name or list of ints or list of level names + If not None, sort on values in specified index level(s). + This is only useful in the case of MultiIndex. + ascending : bool, default True + Sort ascending vs. descending. + inplace : bool, default False + If True, perform operation in-place. + kind : sorting method such as `quick sort` and others. + Not yet supported. + na_position : {‘first’, ‘last’}, default ‘last’ + Puts NaNs at the beginning if first; last puts NaNs at the end. + sort_remaining : bool, default True + Not yet supported + ignore_index : bool, default False + if True, index will be replaced with RangeIndex. + + Returns + ------- + DataFrame/Series or None + """ + if kind is not None: + raise NotImplementedError("kind is not yet supported") + + if not sort_remaining: + raise NotImplementedError( + "sort_remaining == False is not yet supported" + ) + + if axis in (0, "index"): + if isinstance(self.index, cudf.MultiIndex): + if level is None: + midx_data = self.index.to_frame(index=False) + else: + # Pandas currently don't handle na_position + # in case of MultiIndex + if ascending is True: + na_position = "first" + else: + na_position = "last" + + if cudf.api.types.is_list_like(level): + labels = [ + self.index._get_level_label(lvl) for lvl in level + ] + else: + labels = [self.index._get_level_label(level)] + midx_data = cudf.DataFrame._from_data( + self.index._data.select_by_label(labels) + ) + inds = midx_data.argsort( + ascending=ascending, na_position=na_position + ) + outdf = self.take(inds) + elif (ascending and self.index.is_monotonic_increasing) or ( + not ascending and self.index.is_monotonic_decreasing + ): + outdf = self.copy() + else: + inds = self.index.argsort( + ascending=ascending, na_position=na_position + ) + outdf = self.take(inds) + else: + labels = sorted(self._data.names, reverse=not ascending) + outdf = self[labels] + + if ignore_index is True: + outdf = outdf.reset_index(drop=True) + return self._mimic_inplace(outdf, inplace=inplace) + def _get_columns_by_label(self, labels, downcast=False): """ Returns columns of the Frame specified by `labels` diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 55400d03bf7..4ad2c325eeb 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2912,14 +2912,39 @@ def argsort(self, ascending=True, na_position="last"): """ return self._sort(ascending=ascending, na_position=na_position)[1] - def sort_index(self, ascending=True): + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind=None, + na_position="last", + sort_remaining=True, + ignore_index=False, + ): """ Sort by the index. Parameters ---------- + axis : {0 or ‘index’, 1 or ‘columns’}, default 0 + Axis to direct sorting. This can only be 0 for Series. + level : int or level name or list of ints or list of level names + If not None, sort on values in specified index level(s). + This is only useful in the case of MultiIndex. ascending : bool, default True Sort ascending vs. descending. + inplace : bool, default False + If True, perform operation in-place. + kind : sorting method such as `quick sort` and others. + Not yet supported. + na_position : {‘first’, ‘last’}, default ‘last’ + Puts NaNs at the beginning if first; last puts NaNs at the end. + sort_remaining : bool, default True + Not yet supported + ignore_index : bool, default False + if True, index will be replaced with RangeIndex. Returns ------- @@ -2952,8 +2977,16 @@ def sort_index(self, ascending=True): 1 c dtype: object """ - inds = self.index.argsort(ascending=ascending) - return self.take(inds) + return super()._sort_index( + axis=axis, + level=level, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + sort_remaining=sort_remaining, + ignore_index=ignore_index, + ) def sort_values( self, diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c95b41f3509..376fc3e6b88 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2898,20 +2898,34 @@ def test_dataframe_empty_sort_index(): expect = pdf.sort_index() got = gdf.sort_index() - assert_eq(expect, got) + assert_eq(expect, got, check_index_type=True) +@pytest.mark.parametrize( + "index", + [ + pd.RangeIndex(0, 3, 1), + [3.0, 1.0, np.nan], + pytest.param( + pd.RangeIndex(2, -1, -1), + marks=[ + pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/43591" + ) + ], + ), + ], +) @pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("na_position", ["first", "last"]) def test_dataframe_sort_index( - axis, ascending, inplace, ignore_index, na_position + index, axis, ascending, inplace, ignore_index, na_position ): pdf = pd.DataFrame( - {"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]}, - index=[3.0, 1.0, np.nan], + {"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]}, index=index, ) gdf = cudf.DataFrame.from_pandas(pdf) @@ -2931,9 +2945,9 @@ def test_dataframe_sort_index( ) if inplace is True: - assert_eq(pdf, gdf) + assert_eq(pdf, gdf, check_index_type=True) else: - assert_eq(expected, got) + assert_eq(expected, got, check_index_type=True) @pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) @@ -2972,6 +2986,7 @@ def test_dataframe_mulitindex_sort_index( gdf = cudf.DataFrame.from_pandas(pdf) # ignore_index is supported in v.1.0 + expected = pdf.sort_index( axis=axis, level=level, diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index d29ba08a848..11deb6c0842 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -512,6 +512,7 @@ def test_series_datetime_value_counts(data, nulls, normalize, dropna): expected.reset_index(drop=True), got.reset_index(drop=True), check_dtype=False, + check_index_type=True, ) @@ -547,11 +548,13 @@ def test_categorical_value_counts(dropna, normalize, num_elements): pdf_value_counts.sort_index(), gdf_value_counts.sort_index(), check_dtype=False, + check_index_type=True, ) assert_eq( pdf_value_counts.reset_index(drop=True), gdf_value_counts.reset_index(drop=True), check_dtype=False, + check_index_type=True, ) @@ -1228,3 +1231,51 @@ def test_series_upcast_float16(data): actual_series = cudf.Series(data) expected_series = cudf.Series(data, dtype="float32") assert_eq(actual_series, expected_series) + + +@pytest.mark.parametrize( + "index", + [ + pd.RangeIndex(0, 3, 1), + [3.0, 1.0, np.nan], + ["a", "z", None], + pytest.param( + pd.RangeIndex(4, -1, -2), + marks=[ + pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/43591" + ) + ], + ), + ], +) +@pytest.mark.parametrize("axis", [0, "index"]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("ignore_index", [True, False]) +@pytest.mark.parametrize("inplace", [True, False]) +@pytest.mark.parametrize("na_position", ["first", "last"]) +def test_series_sort_index( + index, axis, ascending, inplace, ignore_index, na_position +): + ps = pd.Series([10, 3, 12], index=index) + gs = cudf.from_pandas(ps) + + expected = ps.sort_index( + axis=axis, + ascending=ascending, + ignore_index=ignore_index, + inplace=inplace, + na_position=na_position, + ) + got = gs.sort_index( + axis=axis, + ascending=ascending, + ignore_index=ignore_index, + inplace=inplace, + na_position=na_position, + ) + + if inplace is True: + assert_eq(ps, gs, check_index_type=True) + else: + assert_eq(expected, got, check_index_type=True) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index ef9f853bd11..fed391ac6be 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -106,12 +106,12 @@ def test_series_argsort(nelem, dtype, asc): def test_series_sort_index(nelem, asc): np.random.seed(0) sr = Series((100 * np.random.random(nelem))) - orig = sr.to_array() - got = sr.sort_values().sort_index(ascending=asc).to_array() - if not asc: - # Reverse the array for descending sort - got = got[::-1] - np.testing.assert_array_equal(orig, got) + psr = sr.to_pandas() + + expected = psr.sort_index(ascending=asc) + got = sr.sort_index(ascending=asc) + + assert_eq(expected, got) @pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]]) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 8227023aa51..805927dd474 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -486,7 +486,6 @@ def test_struct_field_integer(data): def test_dask_struct_field_Key_Error(data): got = dgd.from_cudf(Series(data), 2) - # import pdb; pdb.set_trace() with pytest.raises(KeyError): got.struct.field("notakey").compute()