From 2d84400c8d1a6186840483e6172d6eb2929171d2 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Apr 2022 15:24:25 -0700 Subject: [PATCH 1/8] Rewrites transpose cython API and update `dataframe.transpose` --- python/cudf/cudf/_lib/transpose.pyx | 60 ++++------------------------- python/cudf/cudf/_lib/utils.pyx | 8 ++-- python/cudf/cudf/core/dataframe.py | 37 +++++++++++++++--- 3 files changed, 42 insertions(+), 63 deletions(-) diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index 931a2702612..b9eea6169bd 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -1,7 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - -import cudf -from cudf.api.types import is_categorical_dtype +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair @@ -9,65 +6,22 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.transpose cimport transpose as cpp_transpose -from cudf._lib.utils cimport data_from_table_view, table_view_from_table - +from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns -def transpose(source): - """Transpose index and columns. - See Also - -------- - cudf.core.DataFrame.transpose +def transpose(list source_columns): + """Transpose m n-row columns into n m-row columns """ - - if source._num_columns == 0: - return source - - cats = None - columns = source._columns - dtype = columns[0].dtype - - if is_categorical_dtype(dtype): - if any(not is_categorical_dtype(c.dtype) for c in columns): - raise ValueError('Columns must all have the same dtype') - cats = list(c.categories for c in columns) - cats = cudf.core.column.concat_columns(cats).unique() - source = cudf.core.frame.Frame(index=source._index, data=[ - (name, col._set_categories(cats, is_unique=True).codes) - for name, col in source._data.items() - ]) - elif any(c.dtype != dtype for c in columns): - raise ValueError('Columns must all have the same dtype') - cdef pair[unique_ptr[column], table_view] c_result - cdef table_view c_input = table_view_from_table( - source, ignore_index=True) + cdef table_view c_input = table_view_from_columns(source_columns) with nogil: c_result = move(cpp_transpose(c_input)) result_owner = Column.from_unique_ptr(move(c_result.first)) - data, _ = data_from_table_view( + return columns_from_table_view( c_result.second, - owner=result_owner, - column_names=range(c_input.num_rows()) + owners=[result_owner] * c_result.second.num_columns() ) - - if cats is not None: - data= [ - (name, cudf.core.column.column.build_categorical_column( - codes=cudf.core.column.column.build_column( - col.base_data, dtype=col.dtype), - mask=col.base_mask, - size=col.size, - categories=cats, - offset=col.offset, - )) - for name, col in data.items() - ] - - return data diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 8557f430e25..643a1adca9f 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -317,10 +317,10 @@ cdef columns_from_table_view( ): """ Given a ``cudf::table_view``, construsts a list of columns from it, - along with referencing an ``owner`` Python object that owns the memory - lifetime. ``owner`` must be either None or a list of column. If ``owner`` - is a list of columns, the owner of the `i`th ``cudf::column_view`` in the - table view is ``owners[i]``. For more about memory ownership, + along with referencing an owner Python object that owns the memory + lifetime. owner must be either None or a list of column. If owner + is a list of columns, the owner of the `i`th ``cudf::column_view`` + in the table view is ``owners[i]``. For more about memory ownership, see ``Column.from_column_view``. """ diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8893b85c97c..063539c2c7a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3193,17 +3193,42 @@ def transpose(self): Difference from pandas: Not supporting *copy* because default and only behavior is copy=True """ - # Never transpose a MultiIndex - remove the existing columns and - # replace with a RangeIndex. Afterward, reassign. - columns = self.index.copy(deep=False) + index = self._data.to_pandas_index() + columns = self.index.copy(deep=False) if self._num_columns == 0 or self._num_rows == 0: return DataFrame(index=index, columns=columns) + + # No column from index is transposed with libcudf. + source_columns = [*self._columns] + source_dtype = source_columns[0].dtype + if is_categorical_dtype(source_dtype): + if any(not is_categorical_dtype(c.dtype) for c in source_columns): + raise ValueError("Columns must all have the same dtype") + cats = list(c.categories for c in source_columns) + cats = cudf.core.column.concat_columns(cats).unique() + source_columns = [ + col._set_categories(cats, is_unique=True).codes + for col in source_columns + ] + + if any(c.dtype != source_columns[0].dtype for c in source_columns): + raise ValueError("Columns must all have the same dtype") + + result_columns = libcudf.transpose.transpose(source_columns) + + if is_categorical_dtype(source_dtype): + result_columns = [ + codes._with_type_metadata( + cudf.core.dtypes.CategoricalDtype(categories=cats) + ) + for codes in result_columns + ] + # Set the old column names as the new index result = self.__class__._from_data( - # Cython renames the columns to the range [0...ncols] - libcudf.transpose.transpose(self), - as_index(index), + {i: col for i, col in enumerate(result_columns)}, + index=as_index(index), ) # Set the old index as the new column names result.columns = columns From fd2bb8e50760e31b2ea0c5c38e9f2e84fe038b39 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Apr 2022 15:50:11 -0700 Subject: [PATCH 2/8] Refactor `is_sorted` --- python/cudf/cudf/_lib/sort.pyx | 51 ++++++++++++++-------------------- python/cudf/cudf/core/frame.py | 16 ++++++++++- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 3aa0b35e90e..d34f3a7aa87 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -1,6 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - -import pandas as pd +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -23,19 +21,24 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport null_order, null_policy, order from cudf._lib.sort cimport underlying_type_t_rank_method -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport ( + columns_from_unique_ptr, + data_from_unique_ptr, + table_view_from_columns, + table_view_from_table, +) def is_sorted( - source_table, object ascending=None, object null_position=None + list source_columns, object ascending=None, object null_position=None ): """ Checks whether the rows of a `table` are sorted in lexicographical order. Parameters ---------- - source_table : Frame - Frame whose columns are to be checked for sort order + source_columns : list of columns + columns to be checked for sort order ascending : None or list-like of booleans None or list-like of boolean values indicating expected sort order of each column. If list-like, size of list-like must be len(columns). If @@ -58,51 +61,39 @@ def is_sorted( cdef vector[null_order] null_precedence if ascending is None: - column_order = vector[order]( - source_table._num_columns, order.ASCENDING - ) - elif pd.api.types.is_list_like(ascending): - if len(ascending) != source_table._num_columns: + column_order = vector[order](len(source_columns), order.ASCENDING) + else: + if len(ascending) != len(source_columns): raise ValueError( - f"Expected a list-like of length {source_table._num_columns}, " + f"Expected a list-like of length {len(source_columns)}, " f"got length {len(ascending)} for `ascending`" ) column_order = vector[order]( - source_table._num_columns, order.DESCENDING + len(source_columns), order.DESCENDING ) for idx, val in enumerate(ascending): if val: column_order[idx] = order.ASCENDING - else: - raise TypeError( - f"Expected a list-like or None for `ascending`, got " - f"{type(ascending)}" - ) if null_position is None: null_precedence = vector[null_order]( - source_table._num_columns, null_order.AFTER + len(source_columns), null_order.AFTER ) - elif pd.api.types.is_list_like(null_position): - if len(null_position) != source_table._num_columns: + else: + if len(null_position) != len(source_columns): raise ValueError( - f"Expected a list-like of length {source_table._num_columns}, " + f"Expected a list-like of length {len(source_columns)}, " f"got length {len(null_position)} for `null_position`" ) null_precedence = vector[null_order]( - source_table._num_columns, null_order.AFTER + len(source_columns), null_order.AFTER ) for idx, val in enumerate(null_position): if val: null_precedence[idx] = null_order.BEFORE - else: - raise TypeError( - f"Expected a list-like or None for `null_position`, got " - f"{type(null_position)}" - ) cdef bool c_result - cdef table_view source_table_view = table_view_from_table(source_table) + cdef table_view source_table_view = table_view_from_columns(source_columns) with nogil: c_result = cpp_is_sorted( source_table_view, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 806cdf14c71..e99f0fc8f78 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2441,8 +2441,22 @@ def _is_sorted(self, ascending=None, null_position=None): Returns True, if sorted as expected by ``ascending`` and ``null_position``, False otherwise. """ + if ascending is not None and not cudf.api.types.is_list_like( + ascending + ): + raise TypeError( + f"Expected a list-like or None for `ascending`, got " + f"{type(ascending)}" + ) + if null_position is not None and not cudf.api.types.is_list_like( + null_position + ): + raise TypeError( + f"Expected a list-like or None for `null_position`, got " + f"{type(null_position)}" + ) return libcudf.sort.is_sorted( - self, ascending=ascending, null_position=null_position + [*self._columns], ascending=ascending, null_position=null_position ) @_cudf_nvtx_annotate From db9ac6eb4e1f0daba08b91a0e69c33eaac8c3bb6 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Apr 2022 16:13:27 -0700 Subject: [PATCH 3/8] Refactor `order_by` --- python/cudf/cudf/_lib/sort.pyx | 12 ++++++------ python/cudf/cudf/core/frame.py | 14 ++++++++------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index d34f3a7aa87..d36cd20792a 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -104,21 +104,21 @@ def is_sorted( return c_result -def order_by(source_table, object ascending, str na_position): +def order_by(list columns_from_table, object ascending, str na_position): """ - Sorting the table ascending/descending + Get index to sort the table in ascending/descending order. Parameters ---------- - source_table : table which will be sorted - ascending : list of boolean values which correspond to each column + columns_from_table : columns from the table which will be sorted + ascending : sequence of boolean values which correspond to each column in source_table signifying order of each column True - Ascending and False - Descending na_position : whether null value should show up at the "first" or "last" position of **all** sorted column. """ - cdef table_view source_table_view = table_view_from_table( - source_table, ignore_index=True + cdef table_view source_table_view = table_view_from_columns( + columns_from_table ) cdef vector[order] column_order column_order.reserve(len(ascending)) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e99f0fc8f78..38182a71676 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2273,15 +2273,17 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): # Get an int64 column consisting of the indices required to sort self # according to the columns specified in by. - to_sort = ( - self - if by is None - else self._get_columns_by_label(list(by), downcast=False) - ) + to_sort = [ + *( + self + if by is None + else self._get_columns_by_label(list(by), downcast=False) + )._columns + ] # If given a scalar need to construct a sequence of length # of columns if np.isscalar(ascending): - ascending = [ascending] * to_sort._num_columns + ascending = [ascending] * len(to_sort) return libcudf.sort.order_by(to_sort, ascending, na_position) From 46ab81272aad972f0bee186f115688c738b6dd29 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Apr 2022 16:26:45 -0700 Subject: [PATCH 4/8] Rename sort.pyx variables --- python/cudf/cudf/_lib/sort.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index d36cd20792a..9ad6b36f87a 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -125,13 +125,13 @@ def order_by(list columns_from_table, object ascending, str na_position): cdef vector[null_order] null_precedence null_precedence.reserve(len(ascending)) - for i in ascending: - if i is True: + for asc in ascending: + if asc: column_order.push_back(order.ASCENDING) else: column_order.push_back(order.DESCENDING) - if i ^ (na_position == "first"): + if asc ^ (na_position == "first"): null_precedence.push_back(null_order.AFTER) else: null_precedence.push_back(null_order.BEFORE) From 174280544e2d4cbbc535e3346c58e77e7f747fa3 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Apr 2022 16:29:38 -0700 Subject: [PATCH 5/8] Refactor `digitize` --- python/cudf/cudf/_lib/sort.pyx | 18 +++++++++--------- python/cudf/cudf/core/column/numerical.py | 4 +--- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 9ad6b36f87a..56db54ee2a9 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -145,21 +145,21 @@ def order_by(list columns_from_table, object ascending, str na_position): return Column.from_unique_ptr(move(c_result)) -def digitize(source_values_table, bins, bool right=False): +def digitize(list source_columns, list bins, bool right=False): """ Return the indices of the bins to which each value in source_table belongs. Parameters ---------- - source_table : Input table to be binned. - bins : Frame containing columns of bins + source_columns : Input columns to be binned. + bins : List containing columns of bins right : Indicating whether the intervals include the right or the left bin edge. """ - cdef table_view bins_view = table_view_from_table(bins) - cdef table_view source_values_table_view = table_view_from_table( - source_values_table + cdef table_view bins_view = table_view_from_columns(bins) + cdef table_view source_table_view = table_view_from_columns( + source_columns ) cdef vector[order] column_order = ( vector[order]( @@ -175,11 +175,11 @@ def digitize(source_values_table, bins, bool right=False): ) cdef unique_ptr[column] c_result - if right is True: + if right: with nogil: c_result = move(lower_bound( bins_view, - source_values_table_view, + source_table_view, column_order, null_precedence) ) @@ -187,7 +187,7 @@ def digitize(source_values_table, bins, bool right=False): with nogil: c_result = move(upper_bound( bins_view, - source_values_table_view, + source_table_view, column_order, null_precedence) ) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 216faaa8250..e7b8d62f886 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -774,6 +774,4 @@ def digitize( if bin_col.nullable: raise ValueError("`bins` cannot contain null entries.") - return as_column( - libcudf.sort.digitize(column.as_frame(), bin_col.as_frame(), right) - ) + return as_column(libcudf.sort.digitize([column], [bin_col], right)) From 6a765c923bd72c58b0db47db0faef5ee1b4a795e Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Apr 2022 16:30:19 -0700 Subject: [PATCH 6/8] Reduce digitize tests counts, move to test_series.py --- python/cudf/cudf/tests/test_dataframe.py | 29 ------------------------ python/cudf/cudf/tests/test_series.py | 29 ++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 07261534777..21488f1e8cd 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2468,35 +2468,6 @@ def test_arrow_handle_no_index_name(pdf, gdf): assert_eq(expect, got) -@pytest.mark.parametrize("num_rows", [1, 3, 10, 100]) -@pytest.mark.parametrize("num_bins", [1, 2, 4, 20]) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) -@pytest.mark.parametrize("series_bins", [True, False]) -def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): - data = np.random.randint(0, 100, num_rows).astype(dtype) - bins = np.unique(np.sort(np.random.randint(2, 95, num_bins).astype(dtype))) - s = cudf.Series(data) - if series_bins: - s_bins = cudf.Series(bins) - indices = s.digitize(s_bins, right) - else: - indices = s.digitize(bins, right) - np.testing.assert_array_equal( - np.digitize(data, bins, right), indices.to_numpy() - ) - - -def test_series_digitize_invalid_bins(): - s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") - bins = cudf.Series([2, None, None, 50, 90], dtype="int32") - - with pytest.raises( - ValueError, match="`bins` cannot contain null entries." - ): - _ = s.digitize(bins) - - def test_pandas_non_contiguious(): arr1 = np.random.sample([5000, 10]) assert arr1.flags["C_CONTIGUOUS"] is True diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index fccb9f680d9..87fb9bff7ed 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1782,3 +1782,32 @@ def test_diff_many_dtypes(data): gs = cudf.from_pandas(ps) assert_eq(ps.diff(), gs.diff()) assert_eq(ps.diff(periods=2), gs.diff(periods=2)) + + +@pytest.mark.parametrize("num_rows", [1, 100]) +@pytest.mark.parametrize("num_bins", [1, 10]) +@pytest.mark.parametrize("right", [True, False]) +@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) +@pytest.mark.parametrize("series_bins", [True, False]) +def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): + data = np.random.randint(0, 100, num_rows).astype(dtype) + bins = np.unique(np.sort(np.random.randint(2, 95, num_bins).astype(dtype))) + s = cudf.Series(data) + if series_bins: + s_bins = cudf.Series(bins) + indices = s.digitize(s_bins, right) + else: + indices = s.digitize(bins, right) + np.testing.assert_array_equal( + np.digitize(data, bins, right), indices.to_numpy() + ) + + +def test_series_digitize_invalid_bins(): + s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") + bins = cudf.Series([2, None, None, 50, 90], dtype="int32") + + with pytest.raises( + ValueError, match="`bins` cannot contain null entries." + ): + _ = s.digitize(bins) From 2993fbc89a3020cb138a8f0c2ebfb79a1e8ed337 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Apr 2022 16:59:58 -0700 Subject: [PATCH 7/8] Refactor `rank` --- python/cudf/cudf/_lib/sort.pyx | 26 +++++++------------------- python/cudf/cudf/core/frame.py | 9 ++++++--- 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 56db54ee2a9..faa4279c1ca 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -21,12 +21,7 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport null_order, null_policy, order from cudf._lib.sort cimport underlying_type_t_rank_method -from cudf._lib.utils cimport ( - columns_from_unique_ptr, - data_from_unique_ptr, - table_view_from_columns, - table_view_from_table, -) +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def is_sorted( @@ -203,15 +198,13 @@ class RankMethod(IntEnum): DENSE = < underlying_type_t_rank_method > rank_method.DENSE -def rank_columns(source_table, object method, str na_option, +def rank_columns(list source_columns, object method, str na_option, bool ascending, bool pct ): """ Compute numerical data ranks (1 through n) of each column in the dataframe """ - cdef table_view source_table_view = table_view_from_table( - source_table, ignore_index=True - ) + cdef table_view source_table_view = table_view_from_columns(source_columns) cdef rank_method c_rank_method = < rank_method > ( < underlying_type_t_rank_method > method @@ -251,7 +244,7 @@ def rank_columns(source_table, object method, str na_option, cdef vector[unique_ptr[column]] c_results cdef column_view c_view cdef Column col - for col in source_table._columns: + for col in source_columns: c_view = col.view() with nogil: c_results.push_back(move( @@ -265,11 +258,6 @@ def rank_columns(source_table, object method, str na_option, ) )) - cdef unique_ptr[table] c_result - c_result.reset(new table(move(c_results))) - data, _ = data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=None - ) - return data, source_table._index + return [Column.from_unique_ptr( + move(c_results[i]) + ) for i in range(c_results.size())] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 38182a71676..52de513141b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1365,11 +1365,14 @@ def rank( if source.empty: return source.astype("float64") - data, index = libcudf.sort.rank_columns( - source, method_enum, na_option, ascending, pct + result_columns = libcudf.sort.rank_columns( + [*source._columns], method_enum, na_option, ascending, pct ) - return self._from_data(data, index).astype(np.float64) + return self.__class__._from_data( + dict(zip(source._column_names, result_columns)), + index=source._index, + ).astype(np.float64) @_cudf_nvtx_annotate def shift(self, periods=1, freq=None, axis=0, fill_value=None): From e2f7c27a0fd1ce1b0a7466e41c535eba49ed2072 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 15 Apr 2022 17:02:38 -0700 Subject: [PATCH 8/8] Move rank to `indexed_frame` --- python/cudf/cudf/core/frame.py | 86 ------------------------- python/cudf/cudf/core/indexed_frame.py | 87 ++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 86 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 52de513141b..0d3e310dfa1 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1288,92 +1288,6 @@ def _quantiles( result._copy_type_metadata(self) return result - @_cudf_nvtx_annotate - def rank( - self, - axis=0, - method="average", - numeric_only=None, - na_option="keep", - ascending=True, - pct=False, - ): - """ - Compute numerical data ranks (1 through n) along axis. - By default, equal values are assigned a rank that is the average of the - ranks of those values. - - Parameters - ---------- - axis : {0 or 'index'}, default 0 - Index to direct ranking. - method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' - How to rank the group of records that have the same value - (i.e. ties): - * average: average rank of the group - * min: lowest rank in the group - * max: highest rank in the group - * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups. - numeric_only : bool, optional - For DataFrame objects, rank only numeric columns if set to True. - na_option : {'keep', 'top', 'bottom'}, default 'keep' - How to rank NaN values: - * keep: assign NaN rank to NaN values - * top: assign smallest rank to NaN values if ascending - * bottom: assign highest rank to NaN values if ascending. - ascending : bool, default True - Whether or not the elements should be ranked in ascending order. - pct : bool, default False - Whether or not to display the returned rankings in percentile - form. - - Returns - ------- - same type as caller - Return a Series or DataFrame with data ranks as values. - """ - if isinstance(self, cudf.BaseIndex): - warnings.warn( - "Index.rank is deprecated and will be removed.", - FutureWarning, - ) - - if method not in {"average", "min", "max", "first", "dense"}: - raise KeyError(method) - - method_enum = libcudf.sort.RankMethod[method.upper()] - if na_option not in {"keep", "top", "bottom"}: - raise ValueError( - "na_option must be one of 'keep', 'top', or 'bottom'" - ) - - if axis not in (0, "index"): - raise NotImplementedError( - f"axis must be `0`/`index`, " - f"axis={axis} is not yet supported in rank" - ) - - source = self - if numeric_only: - numeric_cols = ( - name - for name in self._data.names - if _is_non_decimal_numeric_dtype(self._data[name]) - ) - source = self._get_columns_by_label(numeric_cols) - if source.empty: - return source.astype("float64") - - result_columns = libcudf.sort.rank_columns( - [*source._columns], method_enum, na_option, ascending, pct - ) - - return self.__class__._from_data( - dict(zip(source._column_names, result_columns)), - index=source._index, - ).astype(np.float64) - @_cudf_nvtx_annotate def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ea722ec3968..7a00f6043b8 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3545,6 +3545,93 @@ def ge( other=other, op="__ge__", fill_value=fill_value, can_reindex=True ) + @_cudf_nvtx_annotate + def rank( + self, + axis=0, + method="average", + numeric_only=None, + na_option="keep", + ascending=True, + pct=False, + ): + """ + Compute numerical data ranks (1 through n) along axis. + + By default, equal values are assigned a rank that is the average of the + ranks of those values. + + Parameters + ---------- + axis : {0 or 'index'}, default 0 + Index to direct ranking. + method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + How to rank the group of records that have the same value + (i.e. ties): + * average: average rank of the group + * min: lowest rank in the group + * max: highest rank in the group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups. + numeric_only : bool, optional + For DataFrame objects, rank only numeric columns if set to True. + na_option : {'keep', 'top', 'bottom'}, default 'keep' + How to rank NaN values: + * keep: assign NaN rank to NaN values + * top: assign smallest rank to NaN values if ascending + * bottom: assign highest rank to NaN values if ascending. + ascending : bool, default True + Whether or not the elements should be ranked in ascending order. + pct : bool, default False + Whether or not to display the returned rankings in percentile + form. + + Returns + ------- + same type as caller + Return a Series or DataFrame with data ranks as values. + """ + if isinstance(self, cudf.BaseIndex): + warnings.warn( + "Index.rank is deprecated and will be removed.", + FutureWarning, + ) + + if method not in {"average", "min", "max", "first", "dense"}: + raise KeyError(method) + + method_enum = libcudf.sort.RankMethod[method.upper()] + if na_option not in {"keep", "top", "bottom"}: + raise ValueError( + "na_option must be one of 'keep', 'top', or 'bottom'" + ) + + if axis not in (0, "index"): + raise NotImplementedError( + f"axis must be `0`/`index`, " + f"axis={axis} is not yet supported in rank" + ) + + source = self + if numeric_only: + numeric_cols = ( + name + for name in self._data.names + if _is_non_decimal_numeric_dtype(self._data[name]) + ) + source = self._get_columns_by_label(numeric_cols) + if source.empty: + return source.astype("float64") + + result_columns = libcudf.sort.rank_columns( + [*source._columns], method_enum, na_option, ascending, pct + ) + + return self.__class__._from_data( + dict(zip(source._column_names, result_columns)), + index=source._index, + ).astype(np.float64) + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`."""