From 2a1b83bd8c361a28db71e172b428d32bd44c67fd Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 15 Sep 2021 10:46:22 -0700 Subject: [PATCH 01/10] optimize DataFrame.sort_index --- python/cudf/cudf/core/dataframe.py | 7 ++++++- python/dask_cudf/dask_cudf/tests/test_accessor.py | 1 - 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0da31cfe583..d90b2d236f6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4002,11 +4002,16 @@ def sort_index( inds = self.index._source_data[labels].argsort( ascending=ascending, na_position=na_position ) + outdf = self.take(inds) + elif (ascending and self.index.is_monotonic_increasing) or ( + not ascending and self.index.is_monotonic_decreasing + ): + outdf = self.copy() else: inds = self.index.argsort( ascending=ascending, na_position=na_position ) - outdf = self.take(inds) + outdf = self.take(inds) else: labels = sorted(self._data.names, reverse=not ascending) outdf = self[labels] diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 8227023aa51..805927dd474 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -486,7 +486,6 @@ def test_struct_field_integer(data): def test_dask_struct_field_Key_Error(data): got = dgd.from_cudf(Series(data), 2) - # import pdb; pdb.set_trace() with pytest.raises(KeyError): got.struct.field("notakey").compute() From 8e74cd046fddb9577ff15543a111efcb9c218c0d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 15 Sep 2021 13:37:05 -0700 Subject: [PATCH 02/10] fix sort_index --- python/cudf/cudf/core/dataframe.py | 53 ++++----------------- python/cudf/cudf/core/frame.py | 60 ++++++++++++++++++++++++ python/cudf/cudf/core/series.py | 39 +++++++++++++-- python/cudf/cudf/tests/test_dataframe.py | 7 +-- python/cudf/cudf/tests/test_series.py | 3 ++ python/cudf/cudf/tests/test_sorting.py | 12 ++--- 6 files changed, 119 insertions(+), 55 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d90b2d236f6..e3bbce658c6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3976,49 +3976,16 @@ def sort_index( 3 1 2 2 3 1 """ - if kind is not None: - raise NotImplementedError("kind is not yet supported") - - if not sort_remaining: - raise NotImplementedError( - "sort_remaining == False is not yet supported" - ) - - if axis in (0, "index"): - if level is not None and isinstance(self.index, cudf.MultiIndex): - # Pandas currently don't handle na_position - # in case of MultiIndex - if ascending is True: - na_position = "first" - else: - na_position = "last" - - if is_list_like(level): - labels = [ - self.index._get_level_label(lvl) for lvl in level - ] - else: - labels = [self.index._get_level_label(level)] - inds = self.index._source_data[labels].argsort( - ascending=ascending, na_position=na_position - ) - outdf = self.take(inds) - elif (ascending and self.index.is_monotonic_increasing) or ( - not ascending and self.index.is_monotonic_decreasing - ): - outdf = self.copy() - else: - inds = self.index.argsort( - ascending=ascending, na_position=na_position - ) - outdf = self.take(inds) - else: - labels = sorted(self._data.names, reverse=not ascending) - outdf = self[labels] - - if ignore_index is True: - outdf = outdf.reset_index(drop=True) - return self._mimic_inplace(outdf, inplace=inplace) + return super()._sort_index( + axis=axis, + level=level, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + sort_remaining=sort_remaining, + ignore_index=ignore_index, + ) def sort_values( self, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 037f6f7ff94..74cdb929621 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -428,6 +428,66 @@ def _explode(self, explode_column: Any, ignore_index: bool): res.index.names = self._index.names return res + def _sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind=None, + na_position="last", + sort_remaining=True, + ignore_index=False, + ): + + if kind is not None: + raise NotImplementedError("kind is not yet supported") + + if not sort_remaining: + raise NotImplementedError( + "sort_remaining == False is not yet supported" + ) + + if axis in (0, "index"): + if isinstance(self.index, cudf.MultiIndex): + if level is None: + midx_data = self.index._source_data + else: + # Pandas currently don't handle na_position + # in case of MultiIndex + if ascending is True: + na_position = "first" + else: + na_position = "last" + + if cudf.utils.dtypes.is_list_like(level): + labels = [ + self.index._get_level_label(lvl) for lvl in level + ] + else: + labels = [self.index._get_level_label(level)] + midx_data = self.index._source_data[labels] + inds = midx_data.argsort( + ascending=ascending, na_position=na_position + ) + outdf = self.take(inds) + elif (ascending and self.index.is_monotonic_increasing) or ( + not ascending and self.index.is_monotonic_decreasing + ): + outdf = self.copy() + else: + inds = self.index.argsort( + ascending=ascending, na_position=na_position + ) + outdf = self.take(inds) + else: + labels = sorted(self._data.names, reverse=not ascending) + outdf = self[labels] + + if ignore_index is True: + outdf = outdf.reset_index(drop=True) + return self._mimic_inplace(outdf, inplace=inplace) + def _get_columns_by_label(self, labels, downcast=False): """ Returns columns of the Frame specified by `labels` diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8ccd967b1b3..868e2256925 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2908,14 +2908,39 @@ def argsort(self, ascending=True, na_position="last"): """ return self._sort(ascending=ascending, na_position=na_position)[1] - def sort_index(self, ascending=True): + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind=None, + na_position="last", + sort_remaining=True, + ignore_index=False, + ): """ Sort by the index. Parameters ---------- + axis : {0 or ‘index’, 1 or ‘columns’}, default 0 + Axis to direct sorting. This can only be 0 for Series. + level : int or level name or list of ints or list of level names + If not None, sort on values in specified index level(s). + This is only useful in the case of MultiIndex. ascending : bool, default True Sort ascending vs. descending. + inplace : bool, default False + If True, perform operation in-place. + kind : sorting method such as `quick sort` and others. + Not yet supported. + na_position : {‘first’, ‘last’}, default ‘last’ + Puts NaNs at the beginning if first; last puts NaNs at the end. + sort_remaining : bool, default True + Not yet supported + ignore_index : bool, default False + if True, index will be replaced with RangeIndex. Returns ------- @@ -2948,8 +2973,16 @@ def sort_index(self, ascending=True): 1 c dtype: object """ - inds = self.index.argsort(ascending=ascending) - return self.take(inds) + return super()._sort_index( + axis=axis, + level=level, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + sort_remaining=sort_remaining, + ignore_index=ignore_index, + ) def sort_values( self, diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 03a140c138b..74b0b8cd4cd 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2898,7 +2898,7 @@ def test_dataframe_empty_sort_index(): expect = pdf.sort_index() got = gdf.sort_index() - assert_eq(expect, got) + assert_eq(expect, got, check_index_type=True) @pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) @@ -2931,9 +2931,9 @@ def test_dataframe_sort_index( ) if inplace is True: - assert_eq(pdf, gdf) + assert_eq(pdf, gdf, check_index_type=True) else: - assert_eq(expected, got) + assert_eq(expected, got, check_index_type=True) @pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) @@ -2972,6 +2972,7 @@ def test_dataframe_mulitindex_sort_index( gdf = cudf.DataFrame.from_pandas(pdf) # ignore_index is supported in v.1.0 + expected = pdf.sort_index( axis=axis, level=level, diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index d29ba08a848..0103895ce59 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -512,6 +512,7 @@ def test_series_datetime_value_counts(data, nulls, normalize, dropna): expected.reset_index(drop=True), got.reset_index(drop=True), check_dtype=False, + check_index_type=True, ) @@ -547,11 +548,13 @@ def test_categorical_value_counts(dropna, normalize, num_elements): pdf_value_counts.sort_index(), gdf_value_counts.sort_index(), check_dtype=False, + check_index_type=True, ) assert_eq( pdf_value_counts.reset_index(drop=True), gdf_value_counts.reset_index(drop=True), check_dtype=False, + check_index_type=True, ) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index ef9f853bd11..fed391ac6be 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -106,12 +106,12 @@ def test_series_argsort(nelem, dtype, asc): def test_series_sort_index(nelem, asc): np.random.seed(0) sr = Series((100 * np.random.random(nelem))) - orig = sr.to_array() - got = sr.sort_values().sort_index(ascending=asc).to_array() - if not asc: - # Reverse the array for descending sort - got = got[::-1] - np.testing.assert_array_equal(orig, got) + psr = sr.to_pandas() + + expected = psr.sort_index(ascending=asc) + got = sr.sort_index(ascending=asc) + + assert_eq(expected, got) @pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]]) From 0eb3302f828b6e76709926ad5eb09e0ae5e3379a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 15 Sep 2021 13:49:51 -0700 Subject: [PATCH 03/10] doc --- python/cudf/cudf/core/frame.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 74cdb929621..c7725764836 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -439,7 +439,34 @@ def _sort_index( sort_remaining=True, ignore_index=False, ): + """ + Helper for .sort_index + Parameters + ---------- + axis : {0 or ‘index’, 1 or ‘columns’}, default 0 + The axis along which to sort. The value 0 identifies the rows, + and 1 identifies the columns. + level : int or level name or list of ints or list of level names + If not None, sort on values in specified index level(s). + This is only useful in the case of MultiIndex. + ascending : bool, default True + Sort ascending vs. descending. + inplace : bool, default False + If True, perform operation in-place. + kind : sorting method such as `quick sort` and others. + Not yet supported. + na_position : {‘first’, ‘last’}, default ‘last’ + Puts NaNs at the beginning if first; last puts NaNs at the end. + sort_remaining : bool, default True + Not yet supported + ignore_index : bool, default False + if True, index will be replaced with RangeIndex. + + Returns + ------- + DataFrame/Series or None + """ if kind is not None: raise NotImplementedError("kind is not yet supported") From 509dd6cdfbd4dfca77bfb82d3fda44ec0aafe756 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 15 Sep 2021 16:20:47 -0700 Subject: [PATCH 04/10] add tests --- python/cudf/cudf/tests/test_dataframe.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 74b0b8cd4cd..80f03dbb916 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2901,17 +2901,31 @@ def test_dataframe_empty_sort_index(): assert_eq(expect, got, check_index_type=True) +@pytest.mark.parametrize( + "index", + [ + pd.RangeIndex(0, 3, 1), + [3.0, 1.0, np.nan], + pytest.param( + pd.RangeIndex(2, -1, -1), + marks=[ + pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/43591" + ) + ], + ), + ], +) @pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("na_position", ["first", "last"]) def test_dataframe_sort_index( - axis, ascending, inplace, ignore_index, na_position + index, axis, ascending, inplace, ignore_index, na_position ): pdf = pd.DataFrame( - {"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]}, - index=[3.0, 1.0, np.nan], + {"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]}, index=index, ) gdf = cudf.DataFrame.from_pandas(pdf) From 36fe12ed27e122782396cd0a386a57bc392bc56d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 15 Sep 2021 16:27:24 -0700 Subject: [PATCH 05/10] add Series.sort_index tests --- python/cudf/cudf/tests/test_series.py | 48 +++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 0103895ce59..11deb6c0842 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1231,3 +1231,51 @@ def test_series_upcast_float16(data): actual_series = cudf.Series(data) expected_series = cudf.Series(data, dtype="float32") assert_eq(actual_series, expected_series) + + +@pytest.mark.parametrize( + "index", + [ + pd.RangeIndex(0, 3, 1), + [3.0, 1.0, np.nan], + ["a", "z", None], + pytest.param( + pd.RangeIndex(4, -1, -2), + marks=[ + pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/43591" + ) + ], + ), + ], +) +@pytest.mark.parametrize("axis", [0, "index"]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("ignore_index", [True, False]) +@pytest.mark.parametrize("inplace", [True, False]) +@pytest.mark.parametrize("na_position", ["first", "last"]) +def test_series_sort_index( + index, axis, ascending, inplace, ignore_index, na_position +): + ps = pd.Series([10, 3, 12], index=index) + gs = cudf.from_pandas(ps) + + expected = ps.sort_index( + axis=axis, + ascending=ascending, + ignore_index=ignore_index, + inplace=inplace, + na_position=na_position, + ) + got = gs.sort_index( + axis=axis, + ascending=ascending, + ignore_index=ignore_index, + inplace=inplace, + na_position=na_position, + ) + + if inplace is True: + assert_eq(ps, gs, check_index_type=True) + else: + assert_eq(expected, got, check_index_type=True) From dc859593f11e78745a7dbfc952edc5ec06cc5dc3 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 15 Sep 2021 17:00:45 -0700 Subject: [PATCH 06/10] update api --- python/cudf/cudf/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 845d370e5fb..6a18361e290 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -488,7 +488,7 @@ def _sort_index( else: na_position = "last" - if cudf.utils.dtypes.is_list_like(level): + if cudf.api.types.is_list_like(level): labels = [ self.index._get_level_label(lvl) for lvl in level ] From b878e6bb11d0a9f9daf911fc6d351ab455a434f4 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 15 Sep 2021 20:00:58 -0500 Subject: [PATCH 07/10] Update python/cudf/cudf/core/frame.py --- python/cudf/cudf/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9f4d5855e2f..bda9e113849 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -494,7 +494,7 @@ def _sort_index( ] else: labels = [self.index._get_level_label(level)] - midx_data = self.index._source_data[labels] + midx_data = self.index.to_frame(index=False)[labels] inds = midx_data.argsort( ascending=ascending, na_position=na_position ) From ffa07ea72b68cc4e27b181be8eb6d8ef3e9f34bc Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 15 Sep 2021 20:01:03 -0500 Subject: [PATCH 08/10] Update python/cudf/cudf/core/frame.py --- python/cudf/cudf/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index bda9e113849..5ba11a55f5d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -479,7 +479,7 @@ def _sort_index( if axis in (0, "index"): if isinstance(self.index, cudf.MultiIndex): if level is None: - midx_data = self.index._source_data + midx_data = self.index.to_frame(index=False) else: # Pandas currently don't handle na_position # in case of MultiIndex From 68f1e9720e508e618acc97b6e6d0f3502a2af214 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 16 Sep 2021 14:46:08 -0500 Subject: [PATCH 09/10] Update python/cudf/cudf/core/frame.py Co-authored-by: Michael Wang --- python/cudf/cudf/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5ba11a55f5d..2c763c60341 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -494,7 +494,7 @@ def _sort_index( ] else: labels = [self.index._get_level_label(level)] - midx_data = self.index.to_frame(index=False)[labels] + midx_data = cudf.DataFrame._from_data(self.index._data.select_by_label(labels)) inds = midx_data.argsort( ascending=ascending, na_position=na_position ) From 2c8110b7292c4c8bf2b860244c3f74ed562d7478 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 16 Sep 2021 13:02:08 -0700 Subject: [PATCH 10/10] style --- python/cudf/cudf/core/frame.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 2c763c60341..09a6df67da5 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -494,7 +494,9 @@ def _sort_index( ] else: labels = [self.index._get_level_label(level)] - midx_data = cudf.DataFrame._from_data(self.index._data.select_by_label(labels)) + midx_data = cudf.DataFrame._from_data( + self.index._data.select_by_label(labels) + ) inds = midx_data.argsort( ascending=ascending, na_position=na_position )