Skip to content

Commit

Permalink
Move rank to indexed_frame
Browse files Browse the repository at this point in the history
  • Loading branch information
isVoid committed Apr 16, 2022
1 parent 2993fbc commit e2f7c27
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 86 deletions.
86 changes: 0 additions & 86 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1288,92 +1288,6 @@ def _quantiles(
result._copy_type_metadata(self)
return result

@_cudf_nvtx_annotate
def rank(
    self,
    axis=0,
    method="average",
    numeric_only=None,
    na_option="keep",
    ascending=True,
    pct=False,
):
    """
    Rank the values of every column (1 through n) along the index axis.

    Tied values receive, by default, the average of the ranks they span.

    Parameters
    ----------
    axis : {0 or 'index'}, default 0
        Axis to rank along. Only ``0``/``'index'`` is accepted.
    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        How records sharing the same value (ties) are ranked:

        * average: average rank of the group
        * min: lowest rank in the group
        * max: highest rank in the group
        * first: ranks assigned in order they appear in the array
        * dense: like 'min', but rank always increases by 1 between groups.
    numeric_only : bool, optional
        When truthy, only the columns accepted by
        ``_is_non_decimal_numeric_dtype`` are ranked.
    na_option : {'keep', 'top', 'bottom'}, default 'keep'
        How NaN values are ranked:

        * keep: assign NaN rank to NaN values
        * top: assign smallest rank to NaN values if ascending
        * bottom: assign highest rank to NaN values if ascending.
    ascending : bool, default True
        Whether the elements are ranked in ascending order.
    pct : bool, default False
        Whether the returned rankings are expressed in percentile form.

    Returns
    -------
    same type as caller
        Object built with ``self.__class__._from_data`` holding the ranks,
        cast to float64.

    Raises
    ------
    KeyError
        If ``method`` is not one of the supported tie-breaking rules.
    ValueError
        If ``na_option`` is not 'keep', 'top' or 'bottom'.
    NotImplementedError
        If ``axis`` is anything other than ``0`` or ``'index'``.
    """
    if isinstance(self, cudf.BaseIndex):
        warnings.warn(
            "Index.rank is deprecated and will be removed.",
            FutureWarning,
        )

    # Argument validation. The order of these checks decides which error
    # is reported when several arguments are invalid at once.
    supported_methods = {"average", "min", "max", "first", "dense"}
    if method not in supported_methods:
        raise KeyError(method)
    method_enum = libcudf.sort.RankMethod[method.upper()]

    supported_na_options = {"keep", "top", "bottom"}
    if na_option not in supported_na_options:
        raise ValueError(
            "na_option must be one of 'keep', 'top', or 'bottom'"
        )

    if axis not in (0, "index"):
        raise NotImplementedError(
            f"axis must be `0`/`index`, "
            f"axis={axis} is not yet supported in rank"
        )

    if numeric_only:
        # Lazily select the labels of the non-decimal numeric columns.
        numeric_labels = (
            label
            for label in self._data.names
            if _is_non_decimal_numeric_dtype(self._data[label])
        )
        frame = self._get_columns_by_label(numeric_labels)
        if frame.empty:
            # Nothing to rank: hand back the empty selection as float64.
            return frame.astype("float64")
    else:
        frame = self

    ranked_columns = libcudf.sort.rank_columns(
        list(frame._columns), method_enum, na_option, ascending, pct
    )

    ranked = self.__class__._from_data(
        {
            name: column
            for name, column in zip(frame._column_names, ranked_columns)
        },
        index=frame._index,
    )
    return ranked.astype(np.float64)

@_cudf_nvtx_annotate
def shift(self, periods=1, freq=None, axis=0, fill_value=None):
"""Shift values by `periods` positions."""
Expand Down
87 changes: 87 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3545,6 +3545,93 @@ def ge(
other=other, op="__ge__", fill_value=fill_value, can_reindex=True
)

@_cudf_nvtx_annotate
def rank(
    self,
    axis=0,
    method="average",
    numeric_only=None,
    na_option="keep",
    ascending=True,
    pct=False,
):
    """
    Compute numerical data ranks (1 through n) along axis.

    By default, equal values are assigned a rank that is the average of
    the ranks of those values.

    Parameters
    ----------
    axis : {0 or 'index'}, default 0
        Index to direct ranking. Any other value is rejected.
    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        Rule applied to groups of records that have the same value
        (i.e. ties):

        * average: average rank of the group
        * min: lowest rank in the group
        * max: highest rank in the group
        * first: ranks assigned in order they appear in the array
        * dense: like 'min', but rank always increases by 1 between groups.
    numeric_only : bool, optional
        If truthy, rank only the columns for which
        ``_is_non_decimal_numeric_dtype`` holds.
    na_option : {'keep', 'top', 'bottom'}, default 'keep'
        Treatment of NaN values:

        * keep: assign NaN rank to NaN values
        * top: assign smallest rank to NaN values if ascending
        * bottom: assign highest rank to NaN values if ascending.
    ascending : bool, default True
        Whether or not the elements should be ranked in ascending order.
    pct : bool, default False
        Whether or not to display the returned rankings in percentile
        form.

    Returns
    -------
    same type as caller
        The data ranks as float64 values, constructed through
        ``self.__class__._from_data``.

    Raises
    ------
    KeyError
        ``method`` is not a supported tie-breaking rule.
    ValueError
        ``na_option`` is not one of 'keep', 'top', 'bottom'.
    NotImplementedError
        ``axis`` is neither ``0`` nor ``'index'``.
    """
    # NOTE(review): this copy lives in indexed_frame.py; confirm whether
    # any cudf.BaseIndex subclass can still reach this method. If not,
    # the deprecation branch below is dead code.
    if isinstance(self, cudf.BaseIndex):
        warnings.warn(
            "Index.rank is deprecated and will be removed.",
            FutureWarning,
        )

    # Checks run in a fixed order: method, na_option, then axis.
    if method not in {"average", "min", "max", "first", "dense"}:
        raise KeyError(method)
    method_enum = libcudf.sort.RankMethod[method.upper()]

    if na_option not in {"keep", "top", "bottom"}:
        message = "na_option must be one of 'keep', 'top', or 'bottom'"
        raise ValueError(message)

    if axis not in (0, "index"):
        message = (
            f"axis must be `0`/`index`, "
            f"axis={axis} is not yet supported in rank"
        )
        raise NotImplementedError(message)

    to_rank = self
    if numeric_only:
        # Narrow the input to its non-decimal numeric columns.
        selected = (
            col_name
            for col_name in self._data.names
            if _is_non_decimal_numeric_dtype(self._data[col_name])
        )
        to_rank = self._get_columns_by_label(selected)
        if to_rank.empty:
            # No numeric columns left, so there is nothing to rank.
            return to_rank.astype("float64")

    input_columns = [*to_rank._columns]
    rank_results = libcudf.sort.rank_columns(
        input_columns, method_enum, na_option, ascending, pct
    )

    # Pair each ranked column with the label of the column it came from.
    data = dict(zip(to_rank._column_names, rank_results))
    out = self.__class__._from_data(data, index=to_rank._index)
    return out.astype(np.float64)


def _check_duplicate_level_names(specified, level_names):
"""Raise if any of `specified` has duplicates in `level_names`."""
Expand Down

0 comments on commit e2f7c27

Please sign in to comment.