Skip to content

Commit

Permalink
Add Series.drop api (rapidsai#7304)
Browse files Browse the repository at this point in the history
Closes rapidsai#7045 

This PR introduces the `Series.drop` API. `Series.drop` allows users to drop certain elements of a series, specified by the `labels` or `index` parameter.

Example:
```python3
>>> s = cudf.Series([1, 2, 3], index=['x', 'y', 'z'])
>>> s.drop(labels=['y'])
x    1
z    3
dtype: int64
```

- [x] Add series test case
- [x] Move common code path from `DataFrame.drop` to helper function
- [x] Add typing annotation
- [x] Add docstring

Authors:
  - Michael Wang (@isVoid)

Approvers:
  - Ashwin Srinath (@shwina)
  - GALI PREM SAGAR (@galipremsagar)

URL: rapidsai#7304
  • Loading branch information
isVoid authored and hyperbolic2346 committed Mar 23, 2021
1 parent 4a0be16 commit 2818928
Show file tree
Hide file tree
Showing 5 changed files with 421 additions and 45 deletions.
71 changes: 28 additions & 43 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.

from __future__ import division
from __future__ import annotations, division

import inspect
import itertools
Expand All @@ -10,7 +10,7 @@
import warnings
from collections import OrderedDict, defaultdict
from collections.abc import Iterable, Sequence
from typing import Any, Set, TypeVar
from typing import Any, Optional, Set, TypeVar

import cupy
import numpy as np
Expand All @@ -30,7 +30,7 @@
from cudf.core.abc import Serializable
from cudf.core.column import as_column, column_empty
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.frame import Frame
from cudf.core.frame import Frame, _drop_rows_by_labels
from cudf.core.groupby.groupby import DataFrameGroupBy
from cudf.core.index import Index, RangeIndex, as_index
from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer
Expand Down Expand Up @@ -495,7 +495,12 @@ def _from_table(cls, table, index=None):
return out

@classmethod
def _from_data(cls, data, index=None, columns=None):
def _from_data(
cls,
data: ColumnAccessor,
index: Optional[Index] = None,
columns: Any = None,
) -> DataFrame:
out = cls.__new__(cls)
out._data = data
if index is None:
Expand Down Expand Up @@ -3364,46 +3369,26 @@ def drop(
)

if inplace:
outdf = self
out = self
else:
outdf = self.copy()
out = self.copy()

if axis in (1, "columns"):
target = _get_host_unique(target)

_drop_columns(outdf, target, errors)
_drop_columns(out, target, errors)
elif axis in (0, "index"):
if not isinstance(target, (cudf.Series, cudf.Index)):
target = column.as_column(target)

if isinstance(self._index, cudf.MultiIndex):
if level is None:
level = 0

levels_index = outdf.index.get_level_values(level)
if errors == "raise" and not target.isin(levels_index).all():
raise KeyError("One or more values not found in axis")

# TODO : Could use anti-join as a future optimization
sliced_df = outdf.take(~levels_index.isin(target))
sliced_df._index.names = self._index.names
else:
if errors == "raise" and not target.isin(outdf.index).all():
raise KeyError("One or more values not found in axis")

sliced_df = outdf.join(
cudf.DataFrame(index=target), how="leftanti"
)
dropped = _drop_rows_by_labels(out, target, level, errors)

if columns is not None:
columns = _get_host_unique(columns)
_drop_columns(sliced_df, columns, errors)
_drop_columns(dropped, columns, errors)

outdf._data = sliced_df._data
outdf._index = sliced_df._index
out._data = dropped._data
out._index = dropped._index

if not inplace:
return outdf
return out

def _drop_column(self, name):
"""Drop a column by *name*
Expand Down Expand Up @@ -7962,17 +7947,6 @@ def _get_union_of_series_names(series_list):
return names_list


def _drop_columns(df, columns, errors):
    """Drop each label in ``columns`` from ``df`` in place.

    Every label is passed to ``df._drop_column``. A ``KeyError`` raised by
    that call is swallowed when ``errors == "ignore"`` and re-raised for
    any other value of ``errors``.
    """
    suppress_missing = errors == "ignore"
    for label in columns:
        try:
            df._drop_column(label)
        except KeyError:
            if not suppress_missing:
                raise


def _get_host_unique(array):
if isinstance(
array, (cudf.Series, cudf.Index, cudf.core.column.ColumnBase)
Expand All @@ -7982,3 +7956,14 @@ def _get_host_unique(array):
return [array]
else:
return set(array)


def _drop_columns(df: DataFrame, columns: Iterable, errors: str):
    """Drop each label in ``columns`` from ``df`` in place.

    Every label is passed to ``df._drop_column``. A ``KeyError`` raised by
    that call is swallowed when ``errors == "ignore"`` and re-raised for
    any other value of ``errors``.
    """
    suppress_missing = errors == "ignore"
    for label in columns:
        try:
            df._drop_column(label)
        except KeyError:
            if not suppress_missing:
                raise
84 changes: 83 additions & 1 deletion python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import functools
import warnings
from collections import OrderedDict, abc as abc
from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, overload
from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, Union, overload

import cupy
import numpy as np
Expand All @@ -18,6 +18,7 @@

import cudf
from cudf import _lib as libcudf
from cudf._typing import ColumnLike, DataFrameOrSeries
from cudf.core.column import as_column, build_categorical_column, column_empty
from cudf.utils.dtypes import (
is_categorical_dtype,
Expand Down Expand Up @@ -3744,3 +3745,84 @@ def _is_series(obj):
instead of checking for isinstance(obj, cudf.Series)
"""
return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None


def _drop_rows_by_labels(
    obj: DataFrameOrSeries,
    labels: Union[ColumnLike, abc.Iterable, str],
    level: Union[int, str, None],
    errors: str,
) -> DataFrameOrSeries:
    """Remove the rows of ``obj`` whose index label appears in ``labels``.

    Parameters
    ----------
    obj : DataFrame or Series
        Object to drop rows from.
    labels : column-like, iterable or str
        Index labels to drop. Anything that is not already a
        ``cudf.Series`` or ``cudf.Index`` is converted with ``as_column``.
    level : int, str or None
        Only used when ``obj`` has a ``MultiIndex``: the level (position or
        name) that ``labels`` are matched against. ``None`` selects level 0.
    errors : str
        If ``"raise"``, a ``KeyError`` is raised when some items in
        ``labels`` do not exist in the targeted index (level). Any other
        value skips that check.

    Returns
    -------
    DataFrame or Series
        A new object of the same class as ``obj`` without the matching rows.

    Raises
    ------
    ValueError
        If ``level`` is an int greater than or equal to the number of index
        levels.
    KeyError
        If ``errors == "raise"`` and a label is missing from the index.
    """
    if isinstance(level, int) and level >= obj.index.nlevels:
        raise ValueError("Param level out of bounds.")

    if not isinstance(labels, (cudf.Series, cudf.Index)):
        labels = as_column(labels)

    if isinstance(obj._index, cudf.MultiIndex):
        if level is None:
            level = 0

        levels_index = obj.index.get_level_values(level)
        if errors == "raise" and not labels.isin(levels_index).all():
            raise KeyError("One or more values not found in axis")

        # Positional form of ``level``; the working frame below has integer
        # column names, so every lookup into it must use this position.
        if isinstance(level, int):
            ilevel = level
        else:
            ilevel = obj._index.names.index(level)

        # 1. Merge Index df and data df along column axis:
        # | id | ._index df | data column(s) |
        idx_nlv = obj._index.nlevels
        # Work on a copy: the frame is renamed and extended below, and doing
        # that on the index's own backing frame would corrupt ``obj._index``.
        working_df = obj._index._source_data.copy()
        working_df.columns = list(range(idx_nlv))
        for i, col in enumerate(obj._data):
            working_df[idx_nlv + i] = obj._data[col]
        # 2. Set `level` as common index:
        # | level | ._index df w/o level | data column(s) |
        # Uses ``ilevel`` rather than ``level``: the columns were just
        # relabelled to integers, so a level *name* would not be found.
        working_df = working_df.set_index(ilevel)

        # 3. Use "leftanti" join to drop
        # TODO: use internal API with "leftanti" and specify left and right
        # join keys to bypass logic check
        to_join = cudf.DataFrame(index=cudf.Index(labels, name=ilevel))
        join_res = working_df.join(to_join, how="leftanti")

        # 4. Reconstruct original layout, and rename
        join_res.insert(
            ilevel, name=join_res._index.name, value=join_res._index
        )
        join_res = join_res.reset_index(drop=True)

        midx = cudf.MultiIndex.from_frame(
            join_res.iloc[:, 0:idx_nlv], names=obj._index.names
        )

        if isinstance(obj, cudf.Series):
            return obj.__class__._from_data(
                join_res.iloc[:, idx_nlv:]._data, index=midx, name=obj.name
            )
        else:
            return obj.__class__._from_data(
                join_res.iloc[:, idx_nlv:]._data,
                index=midx,
                columns=obj.columns,
            )

    else:
        if errors == "raise" and not labels.isin(obj.index).all():
            raise KeyError("One or more values not found in axis")

        key_df = cudf.DataFrame(index=labels)
        if isinstance(obj, cudf.Series):
            # Joins are done on a frame, so round-trip the Series through a
            # one-column frame and restore its name afterwards.
            res = obj.to_frame(name="tmp").join(key_df, how="leftanti")["tmp"]
            res.name = obj.name
            return res
        else:
            return obj.join(key_df, how="leftanti")
124 changes: 123 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
from cudf.core.column.lists import ListMethods
from cudf.core.column.string import StringMethods
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.frame import Frame
from cudf.core.frame import Frame, _drop_rows_by_labels
from cudf.core.groupby.groupby import SeriesGroupBy
from cudf.core.index import Index, RangeIndex, as_index
from cudf.core.indexing import _SeriesIlocIndexer, _SeriesLocIndexer
Expand Down Expand Up @@ -545,6 +545,128 @@ def to_arrow(self):
"""
return self._column.to_arrow()

def drop(
    self,
    labels=None,
    axis=0,
    index=None,
    columns=None,
    level=None,
    inplace=False,
    errors="raise",
):
    """
    Return Series with specified index labels removed.

    Remove elements of a Series based on specifying the index labels.
    When using a multi-index, labels on different levels can be removed by
    specifying the level.

    Parameters
    ----------
    labels : single label or list-like
        Index labels to drop.
    axis : 0, default 0
        Redundant for application on Series.
    index : single label or list-like
        Redundant for application on Series. But ``index`` can be used
        instead of ``labels``
    columns : single label or list-like
        This parameter is ignored. Use ``index`` or ``labels`` to specify.
    level : int or level name, optional
        For MultiIndex, level from which the labels will be removed.
    inplace : bool, default False
        If False, return a copy. Otherwise, do operation
        inplace and return None.
    errors : {'ignore', 'raise'}, default 'raise'
        If 'ignore', suppress error and only existing labels are
        dropped.

    Returns
    -------
    Series or None
        Series with specified index labels removed or None if
        ``inplace=True``

    Raises
    ------
    KeyError
        If any of the labels is not found in the selected axis and
        ``errors='raise'``
    ValueError
        If ``labels`` is combined with ``index``/``columns``, if ``labels``
        is given with ``axis`` 1 or ``'columns'``, or if none of
        ``labels``, ``index`` and ``columns`` is specified.

    See Also
    --------
    Series.reindex
        Return only specified index labels of Series
    Series.dropna
        Return series without null values
    Series.drop_duplicates
        Return series with duplicate values removed
    cudf.core.dataframe.DataFrame.drop
        Drop specified labels from rows or columns in dataframe

    Examples
    --------
    >>> s = cudf.Series([1,2,3], index=['x', 'y', 'z'])
    >>> s
    x    1
    y    2
    z    3
    dtype: int64

    Drop labels x and z

    >>> s.drop(labels=['x', 'z'])
    y    2
    dtype: int64

    Drop a label from the second level in MultiIndex Series.

    >>> midx = cudf.MultiIndex.from_product([[0, 1, 2], ['x', 'y']])
    >>> s = cudf.Series(range(6), index=midx)
    >>> s
    0  x    0
       y    1
    1  x    2
       y    3
    2  x    4
       y    5
    dtype: int64
    >>> s.drop(labels='y', level=1)
    0  x    0
    1  x    2
    2  x    4
    dtype: int64
    """
    if labels is not None:
        if index is not None or columns is not None:
            raise ValueError(
                "Cannot specify both 'labels' and 'index'/'columns'"
            )
        # A Series only has the index axis; reject both spellings of the
        # column axis instead of silently dropping rows for "columns".
        if axis in (1, "columns"):
            raise ValueError(f"No axis named {axis} for object type Series")
        target = labels
    elif index is not None:
        target = index
    elif columns is not None:
        target = []  # Ignore parameter columns
    else:
        raise ValueError(
            "Need to specify at least one of 'labels', "
            "'index' or 'columns'"
        )

    if inplace:
        out = self
    else:
        out = self.copy()

    dropped = _drop_rows_by_labels(out, target, level, errors)

    # Swap the filtered data/index into ``out`` so that the in-place case
    # mutates ``self`` rather than rebinding a local name.
    out._data = dropped._data
    out._index = dropped._index

    if not inplace:
        return out

def __copy__(self, deep=True):
    """Return ``self.copy(deep)``; ``deep`` defaults to ``True``."""
    duplicate = self.copy(deep)
    return duplicate

Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,15 @@ def test_dataframe_drop_raises():
expected_error_message="One or more values not found in axis",
)

# label dtype mismatch
assert_exceptions_equal(
lfunc=pdf.drop,
rfunc=df.drop,
lfunc_args_and_kwargs=([3],),
rfunc_args_and_kwargs=([3],),
expected_error_message="One or more values not found in axis",
)

expect = pdf.drop("p", errors="ignore")
actual = df.drop("p", errors="ignore")

Expand Down
Loading

0 comments on commit 2818928

Please sign in to comment.