diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 3aa0b35e90e..faa4279c1ca 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -1,6 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - -import pandas as pd +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -23,19 +21,19 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport null_order, null_policy, order from cudf._lib.sort cimport underlying_type_t_rank_method -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def is_sorted( - source_table, object ascending=None, object null_position=None + list source_columns, object ascending=None, object null_position=None ): """ Checks whether the rows of a `table` are sorted in lexicographical order. Parameters ---------- - source_table : Frame - Frame whose columns are to be checked for sort order + source_columns : list of columns + columns to be checked for sort order ascending : None or list-like of booleans None or list-like of boolean values indicating expected sort order of each column. If list-like, size of list-like must be len(columns). If @@ -58,51 +56,39 @@ def is_sorted( cdef vector[null_order] null_precedence if ascending is None: - column_order = vector[order]( - source_table._num_columns, order.ASCENDING - ) - elif pd.api.types.is_list_like(ascending): - if len(ascending) != source_table._num_columns: + column_order = vector[order](len(source_columns), order.ASCENDING) + else: + if len(ascending) != len(source_columns): raise ValueError( - f"Expected a list-like of length {source_table._num_columns}, " + f"Expected a list-like of length {len(source_columns)}, " f"got length {len(ascending)} for `ascending`" ) column_order = vector[order]( - source_table._num_columns, order.DESCENDING + len(source_columns), order.DESCENDING ) for idx, val in enumerate(ascending): if val: column_order[idx] = order.ASCENDING - else: - raise TypeError( - f"Expected a list-like or None for `ascending`, got " - f"{type(ascending)}" - ) if null_position is None: null_precedence = vector[null_order]( - source_table._num_columns, null_order.AFTER + len(source_columns), null_order.AFTER ) - elif pd.api.types.is_list_like(null_position): - if len(null_position) != source_table._num_columns: + else: + if len(null_position) != len(source_columns): raise ValueError( - f"Expected a list-like of length {source_table._num_columns}, " + f"Expected a list-like of length {len(source_columns)}, " f"got length {len(null_position)} for `null_position`" ) null_precedence = vector[null_order]( - source_table._num_columns, null_order.AFTER + len(source_columns), null_order.AFTER ) for idx, val in enumerate(null_position): if val: null_precedence[idx] = null_order.BEFORE - else: - raise TypeError( - f"Expected a list-like or None for `null_position`, got " - f"{type(null_position)}" - ) cdef bool c_result - cdef table_view source_table_view = table_view_from_table(source_table) + cdef table_view source_table_view = table_view_from_columns(source_columns) with nogil: c_result = cpp_is_sorted( source_table_view, @@ -113,34 +99,34 @@ def is_sorted( return c_result -def order_by(source_table, object ascending, str na_position): +def order_by(list columns_from_table, object ascending, str na_position): """ - Sorting the table 
ascending/descending + Get index to sort the table in ascending/descending order. Parameters ---------- - source_table : table which will be sorted - ascending : list of boolean values which correspond to each column + columns_from_table : columns from the table which will be sorted + ascending : sequence of boolean values which correspond to each column in source_table signifying order of each column True - Ascending and False - Descending na_position : whether null value should show up at the "first" or "last" position of **all** sorted column. """ - cdef table_view source_table_view = table_view_from_table( - source_table, ignore_index=True + cdef table_view source_table_view = table_view_from_columns( + columns_from_table ) cdef vector[order] column_order column_order.reserve(len(ascending)) cdef vector[null_order] null_precedence null_precedence.reserve(len(ascending)) - for i in ascending: - if i is True: + for asc in ascending: + if asc: column_order.push_back(order.ASCENDING) else: column_order.push_back(order.DESCENDING) - if i ^ (na_position == "first"): + if asc ^ (na_position == "first"): null_precedence.push_back(null_order.AFTER) else: null_precedence.push_back(null_order.BEFORE) @@ -154,21 +140,21 @@ def order_by(source_table, object ascending, str na_position): return Column.from_unique_ptr(move(c_result)) -def digitize(source_values_table, bins, bool right=False): +def digitize(list source_columns, list bins, bool right=False): """ Return the indices of the bins to which each value in source_table belongs. Parameters ---------- - source_table : Input table to be binned. - bins : Frame containing columns of bins + source_columns : Input columns to be binned. + bins : List containing columns of bins right : Indicating whether the intervals include the right or the left bin edge. 
""" - cdef table_view bins_view = table_view_from_table(bins) - cdef table_view source_values_table_view = table_view_from_table( - source_values_table + cdef table_view bins_view = table_view_from_columns(bins) + cdef table_view source_table_view = table_view_from_columns( + source_columns ) cdef vector[order] column_order = ( vector[order]( @@ -184,11 +170,11 @@ def digitize(source_values_table, bins, bool right=False): ) cdef unique_ptr[column] c_result - if right is True: + if right: with nogil: c_result = move(lower_bound( bins_view, - source_values_table_view, + source_table_view, column_order, null_precedence) ) @@ -196,7 +182,7 @@ def digitize(source_values_table, bins, bool right=False): with nogil: c_result = move(upper_bound( bins_view, - source_values_table_view, + source_table_view, column_order, null_precedence) ) @@ -212,15 +198,13 @@ class RankMethod(IntEnum): DENSE = < underlying_type_t_rank_method > rank_method.DENSE -def rank_columns(source_table, object method, str na_option, +def rank_columns(list source_columns, object method, str na_option, bool ascending, bool pct ): """ Compute numerical data ranks (1 through n) of each column in the dataframe """ - cdef table_view source_table_view = table_view_from_table( - source_table, ignore_index=True - ) + cdef table_view source_table_view = table_view_from_columns(source_columns) cdef rank_method c_rank_method = < rank_method > ( < underlying_type_t_rank_method > method @@ -260,7 +244,7 @@ def rank_columns(source_table, object method, str na_option, cdef vector[unique_ptr[column]] c_results cdef column_view c_view cdef Column col - for col in source_table._columns: + for col in source_columns: c_view = col.view() with nogil: c_results.push_back(move( @@ -274,11 +258,6 @@ def rank_columns(source_table, object method, str na_option, ) )) - cdef unique_ptr[table] c_result - c_result.reset(new table(move(c_results))) - data, _ = data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=None - ) - return data, source_table._index + return [Column.from_unique_ptr( + move(c_results[i]) + ) for i in range(c_results.size())] diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index 931a2702612..b9eea6169bd 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -1,7 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - -import cudf -from cudf.api.types import is_categorical_dtype +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair @@ -9,65 +6,22 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.transpose cimport transpose as cpp_transpose -from cudf._lib.utils cimport data_from_table_view, table_view_from_table - +from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns -def transpose(source): - """Transpose index and columns. 
- See Also - -------- - cudf.core.DataFrame.transpose +def transpose(list source_columns): + """Transpose m n-row columns into n m-row columns """ - - if source._num_columns == 0: - return source - - cats = None - columns = source._columns - dtype = columns[0].dtype - - if is_categorical_dtype(dtype): - if any(not is_categorical_dtype(c.dtype) for c in columns): - raise ValueError('Columns must all have the same dtype') - cats = list(c.categories for c in columns) - cats = cudf.core.column.concat_columns(cats).unique() - source = cudf.core.frame.Frame(index=source._index, data=[ - (name, col._set_categories(cats, is_unique=True).codes) - for name, col in source._data.items() - ]) - elif any(c.dtype != dtype for c in columns): - raise ValueError('Columns must all have the same dtype') - cdef pair[unique_ptr[column], table_view] c_result - cdef table_view c_input = table_view_from_table( - source, ignore_index=True) + cdef table_view c_input = table_view_from_columns(source_columns) with nogil: c_result = move(cpp_transpose(c_input)) result_owner = Column.from_unique_ptr(move(c_result.first)) - data, _ = data_from_table_view( + return columns_from_table_view( c_result.second, - owner=result_owner, - column_names=range(c_input.num_rows()) + owners=[result_owner] * c_result.second.num_columns() ) - - if cats is not None: - data= [ - (name, cudf.core.column.column.build_categorical_column( - codes=cudf.core.column.column.build_column( - col.base_data, dtype=col.dtype), - mask=col.base_mask, - size=col.size, - categories=cats, - offset=col.offset, - )) - for name, col in data.items() - ] - - return data diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 8557f430e25..643a1adca9f 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -317,10 +317,10 @@ cdef columns_from_table_view( ): """ Given a ``cudf::table_view``, construsts a list of columns from it, - along with referencing an ``owner`` Python object that owns the memory - lifetime. ``owner`` must be either None or a list of column. If ``owner`` - is a list of columns, the owner of the `i`th ``cudf::column_view`` in the - table view is ``owners[i]``. For more about memory ownership, + along with referencing an owner Python object that owns the memory + lifetime. owner must be either None or a list of column. If owner + is a list of columns, the owner of the `i`th ``cudf::column_view`` + in the table view is ``owners[i]``. For more about memory ownership, see ``Column.from_column_view``. """ diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 216faaa8250..e7b8d62f886 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -774,6 +774,4 @@ def digitize( if bin_col.nullable: raise ValueError("`bins` cannot contain null entries.") - return as_column( - libcudf.sort.digitize(column.as_frame(), bin_col.as_frame(), right) - ) + return as_column(libcudf.sort.digitize([column], [bin_col], right)) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 50255b07077..d87cb788a7e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3194,17 +3194,42 @@ def transpose(self): Difference from pandas: Not supporting *copy* because default and only behavior is copy=True """ - # Never transpose a MultiIndex - remove the existing columns and - # replace with a RangeIndex. Afterward, reassign. 
- columns = self.index.copy(deep=False) + index = self._data.to_pandas_index() + columns = self.index.copy(deep=False) if self._num_columns == 0 or self._num_rows == 0: return DataFrame(index=index, columns=columns) + + # No column from index is transposed with libcudf. + source_columns = [*self._columns] + source_dtype = source_columns[0].dtype + if is_categorical_dtype(source_dtype): + if any(not is_categorical_dtype(c.dtype) for c in source_columns): + raise ValueError("Columns must all have the same dtype") + cats = list(c.categories for c in source_columns) + cats = cudf.core.column.concat_columns(cats).unique() + source_columns = [ + col._set_categories(cats, is_unique=True).codes + for col in source_columns + ] + + if any(c.dtype != source_columns[0].dtype for c in source_columns): + raise ValueError("Columns must all have the same dtype") + + result_columns = libcudf.transpose.transpose(source_columns) + + if is_categorical_dtype(source_dtype): + result_columns = [ + codes._with_type_metadata( + cudf.core.dtypes.CategoricalDtype(categories=cats) + ) + for codes in result_columns + ] + # Set the old column names as the new index result = self.__class__._from_data( - # Cython renames the columns to the range [0...ncols] - libcudf.transpose.transpose(self), - as_index(index), + {i: col for i, col in enumerate(result_columns)}, + index=as_index(index), ) # Set the old index as the new column names result.columns = columns diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d10f7c690bf..e5863b52a5d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1289,89 +1289,6 @@ def _quantiles( column_names=self._column_names, ) - @_cudf_nvtx_annotate - def rank( - self, - axis=0, - method="average", - numeric_only=None, - na_option="keep", - ascending=True, - pct=False, - ): - """ - Compute numerical data ranks (1 through n) along axis. - By default, equal values are assigned a rank that is the average of the - ranks of those values. - - Parameters - ---------- - axis : {0 or 'index'}, default 0 - Index to direct ranking. - method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' - How to rank the group of records that have the same value - (i.e. ties): - * average: average rank of the group - * min: lowest rank in the group - * max: highest rank in the group - * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups. - numeric_only : bool, optional - For DataFrame objects, rank only numeric columns if set to True. - na_option : {'keep', 'top', 'bottom'}, default 'keep' - How to rank NaN values: - * keep: assign NaN rank to NaN values - * top: assign smallest rank to NaN values if ascending - * bottom: assign highest rank to NaN values if ascending. - ascending : bool, default True - Whether or not the elements should be ranked in ascending order. - pct : bool, default False - Whether or not to display the returned rankings in percentile - form. - - Returns - ------- - same type as caller - Return a Series or DataFrame with data ranks as values. 
- """ - if isinstance(self, cudf.BaseIndex): - warnings.warn( - "Index.rank is deprecated and will be removed.", - FutureWarning, - ) - - if method not in {"average", "min", "max", "first", "dense"}: - raise KeyError(method) - - method_enum = libcudf.sort.RankMethod[method.upper()] - if na_option not in {"keep", "top", "bottom"}: - raise ValueError( - "na_option must be one of 'keep', 'top', or 'bottom'" - ) - - if axis not in (0, "index"): - raise NotImplementedError( - f"axis must be `0`/`index`, " - f"axis={axis} is not yet supported in rank" - ) - - source = self - if numeric_only: - numeric_cols = ( - name - for name in self._data.names - if _is_non_decimal_numeric_dtype(self._data[name]) - ) - source = self._get_columns_by_label(numeric_cols) - if source.empty: - return source.astype("float64") - - data, index = libcudf.sort.rank_columns( - source, method_enum, na_option, ascending, pct - ) - - return self._from_data(data, index).astype(np.float64) - @_cudf_nvtx_annotate def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" @@ -2219,15 +2136,17 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): # Get an int64 column consisting of the indices required to sort self # according to the columns specified in by. - to_sort = ( - self - if by is None - else self._get_columns_by_label(list(by), downcast=False) - ) + to_sort = [ + *( + self + if by is None + else self._get_columns_by_label(list(by), downcast=False) + )._columns + ] # If given a scalar need to construct a sequence of length # of columns if np.isscalar(ascending): - ascending = [ascending] * to_sort._num_columns + ascending = [ascending] * len(to_sort) return libcudf.sort.order_by(to_sort, ascending, na_position) @@ -2387,8 +2306,22 @@ def _is_sorted(self, ascending=None, null_position=None): Returns True, if sorted as expected by ``ascending`` and ``null_position``, False otherwise. """ + if ascending is not None and not cudf.api.types.is_list_like( + ascending + ): + raise TypeError( + f"Expected a list-like or None for `ascending`, got " + f"{type(ascending)}" + ) + if null_position is not None and not cudf.api.types.is_list_like( + null_position + ): + raise TypeError( + f"Expected a list-like or None for `null_position`, got " + f"{type(null_position)}" + ) return libcudf.sort.is_sorted( - self, ascending=ascending, null_position=null_position + [*self._columns], ascending=ascending, null_position=null_position ) @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ddb3082af96..fedbaed28db 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3577,6 +3577,93 @@ def ge( other=other, op="__ge__", fill_value=fill_value, can_reindex=True ) + @_cudf_nvtx_annotate + def rank( + self, + axis=0, + method="average", + numeric_only=None, + na_option="keep", + ascending=True, + pct=False, + ): + """ + Compute numerical data ranks (1 through n) along axis. + + By default, equal values are assigned a rank that is the average of the + ranks of those values. + + Parameters + ---------- + axis : {0 or 'index'}, default 0 + Index to direct ranking. + method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' + How to rank the group of records that have the same value + (i.e. 
ties): + * average: average rank of the group + * min: lowest rank in the group + * max: highest rank in the group + * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups. + numeric_only : bool, optional + For DataFrame objects, rank only numeric columns if set to True. + na_option : {'keep', 'top', 'bottom'}, default 'keep' + How to rank NaN values: + * keep: assign NaN rank to NaN values + * top: assign smallest rank to NaN values if ascending + * bottom: assign highest rank to NaN values if ascending. + ascending : bool, default True + Whether or not the elements should be ranked in ascending order. + pct : bool, default False + Whether or not to display the returned rankings in percentile + form. + + Returns + ------- + same type as caller + Return a Series or DataFrame with data ranks as values. + """ + if isinstance(self, cudf.BaseIndex): + warnings.warn( + "Index.rank is deprecated and will be removed.", + FutureWarning, + ) + + if method not in {"average", "min", "max", "first", "dense"}: + raise KeyError(method) + + method_enum = libcudf.sort.RankMethod[method.upper()] + if na_option not in {"keep", "top", "bottom"}: + raise ValueError( + "na_option must be one of 'keep', 'top', or 'bottom'" + ) + + if axis not in (0, "index"): + raise NotImplementedError( + f"axis must be `0`/`index`, " + f"axis={axis} is not yet supported in rank" + ) + + source = self + if numeric_only: + numeric_cols = ( + name + for name in self._data.names + if _is_non_decimal_numeric_dtype(self._data[name]) + ) + source = self._get_columns_by_label(numeric_cols) + if source.empty: + return source.astype("float64") + + result_columns = libcudf.sort.rank_columns( + [*source._columns], method_enum, na_option, ascending, pct + ) + + return self.__class__._from_data( + dict(zip(source._column_names, result_columns)), + index=source._index, + ).astype(np.float64) + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2685524add4..957277d7f9b 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -2467,35 +2467,6 @@ def test_arrow_handle_no_index_name(pdf, gdf): assert_eq(expect, got) -@pytest.mark.parametrize("num_rows", [1, 3, 10, 100]) -@pytest.mark.parametrize("num_bins", [1, 2, 4, 20]) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) -@pytest.mark.parametrize("series_bins", [True, False]) -def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): - data = np.random.randint(0, 100, num_rows).astype(dtype) - bins = np.unique(np.sort(np.random.randint(2, 95, num_bins).astype(dtype))) - s = cudf.Series(data) - if series_bins: - s_bins = cudf.Series(bins) - indices = s.digitize(s_bins, right) - else: - indices = s.digitize(bins, right) - np.testing.assert_array_equal( - np.digitize(data, bins, right), indices.to_numpy() - ) - - -def test_series_digitize_invalid_bins(): - s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") - bins = cudf.Series([2, None, None, 50, 90], dtype="int32") - - with pytest.raises( - ValueError, match="`bins` cannot contain null entries." 
- ): - _ = s.digitize(bins) - - def test_pandas_non_contiguious(): arr1 = np.random.sample([5000, 10]) assert arr1.flags["C_CONTIGUOUS"] is True diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index fccb9f680d9..87fb9bff7ed 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1782,3 +1782,32 @@ def test_diff_many_dtypes(data): gs = cudf.from_pandas(ps) assert_eq(ps.diff(), gs.diff()) assert_eq(ps.diff(periods=2), gs.diff(periods=2)) + + +@pytest.mark.parametrize("num_rows", [1, 100]) +@pytest.mark.parametrize("num_bins", [1, 10]) +@pytest.mark.parametrize("right", [True, False]) +@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) +@pytest.mark.parametrize("series_bins", [True, False]) +def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): + data = np.random.randint(0, 100, num_rows).astype(dtype) + bins = np.unique(np.sort(np.random.randint(2, 95, num_bins).astype(dtype))) + s = cudf.Series(data) + if series_bins: + s_bins = cudf.Series(bins) + indices = s.digitize(s_bins, right) + else: + indices = s.digitize(bins, right) + np.testing.assert_array_equal( + np.digitize(data, bins, right), indices.to_numpy() + ) + + +def test_series_digitize_invalid_bins(): + s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") + bins = cudf.Series([2, None, None, 50, 90], dtype="int32") + + with pytest.raises( + ValueError, match="`bins` cannot contain null entries." + ): + _ = s.digitize(bins)