Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Series.drop api #7304

Merged
merged 22 commits into from
Mar 10, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 31 additions & 40 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@
import cudf
from cudf import _lib as libcudf
from cudf._lib.null_mask import MaskState, create_null_mask
from cudf._typing import ColumnLike
from cudf.core import column, reshape
from cudf.core.abc import Serializable
from cudf.core.column import as_column, column_empty
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.frame import Frame
from cudf.core.frame import Frame, _drop_rows_by_labels
from cudf.core.groupby.groupby import DataFrameGroupBy
from cudf.core.index import Index, RangeIndex, as_index
from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer
Expand Down Expand Up @@ -3269,46 +3270,26 @@ def drop(
)

if inplace:
outdf = self
out = self
else:
outdf = self.copy()
out = self.copy()

if axis in (1, "columns"):
target = _get_host_unique(target)

_drop_columns(outdf, target, errors)
_drop_columns(out, target, errors)
elif axis in (0, "index"):
if not isinstance(target, (cudf.Series, cudf.Index)):
target = column.as_column(target)

if isinstance(self._index, cudf.MultiIndex):
if level is None:
level = 0

levels_index = outdf.index.get_level_values(level)
if errors == "raise" and not target.isin(levels_index).all():
raise KeyError("One or more values not found in axis")

# TODO : Could use anti-join as a future optimization
sliced_df = outdf.take(~levels_index.isin(target))
sliced_df._index.names = self._index.names
else:
if errors == "raise" and not target.isin(outdf.index).all():
raise KeyError("One or more values not found in axis")

sliced_df = outdf.join(
cudf.DataFrame(index=target), how="leftanti"
)
dropped = _drop_rows_by_labels(out, target, level, errors)

if columns is not None:
columns = _get_host_unique(columns)
_drop_columns(sliced_df, columns, errors)
_drop_columns(dropped, columns, errors)

outdf._data = sliced_df._data
outdf._index = sliced_df._index
out._data = dropped._data
out._index = dropped._index

if not inplace:
return outdf
return out

def _drop_column(self, name):
"""Drop a column by *name*
Expand Down Expand Up @@ -7362,6 +7343,16 @@ def equals(self, other):
return False
return super().equals(other)

def _drop_rows_by_labels(
    self: "cudf.DataFrame", labels: ColumnLike
) -> "cudf.DataFrame":
    """Return the rows of this frame whose index label is not in `labels`.

    The removal is expressed as a left-anti join between this frame and
    an otherwise empty frame indexed by `labels`.

    labels: a list of labels specifying the rows to drop
    """
    # TODO: this goes through the user-facing ``join`` entry point; once a
    # lower-level join API is available, call it directly so the argument
    # validation and dtype handling of the merge code path can be skipped.
    labels_frame = cudf.DataFrame(index=labels)
    return self.join(labels_frame, how="leftanti")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice - very elegant!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the join parameters are fixed here, it might be worth jumping straight to the cython without first creating a dataframe. Something like

idx = Table(index=labels)
res = cudf._lib.join.join(self, idx, 'leftanti', None, left_on=[], right_on=[], left_index=True, right_index=True)
return self.__class__._from_table(res)

This way you avoid all the Python overhead of constructing a dataframe and then a bunch of unnecessary checking and typecasting that the merge codepath uses, which isn't necessarily needed in this case. The downside is you might want to throw if the labels passed don't match the dtype of the target series index.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a fast path that could exist in the merge code that we're not currently enabling?

Copy link
Contributor

@brandon-b-miller brandon-b-miller Feb 5, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure how useful it would be for users, but after the upcoming join improvements it might be useful as an internal utility for join at least, since join is just a narrow case of merge with left_index=True, right_index=True and no on parameters.

A lot of the merge code is there just to validate all the combinations of arguments a user could pass and fix their dtypes if necessary. We should be able to get around that internally if we know something about what our cython API expects and are sure what we are passing to it is right. It would also help if cudf._lib.join.join had a better "developer facing" API, so the actual call to it didn't seem so weird.

The counterargument is that to avoid libcudf itself erroring in an invalid case, we have to do a dtype check, to make sure the labels and the index are the same type. It could be argued that we might as well let the full user facing merge code error since it's already built to error in that case. But then, if someone passes something invalid, it's almost just as weird that the error comes from inside join.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A lot of the merge code is there just to validate all the combinations of arguments a user could pass and fix their dtypes if necessary. We should be able to get around that internally...

Won't we still need dtype validation, etc., here? The user-provided inputs to drop() could be of different dtype from the index columns.

It would also help if cudf._lib.join.join had a better "developer facing" API, so the actual call to it didn't seem so weird.

Agree that we need a better developer-facing join API. FWIW, that won't be coming from Cython though, as the Cython API is going to be very simple (just returning a tuple of gathermaps).

But then, if someone passes something invalid, it's almost just as weird that the error comes from inside join

Maybe we could wrap the call to join here in a try...except and raise a more informative error message?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine with that, I guess my only question is in general, should we avoid using our own user facing API entrypoints internally in favor of a lower level API, just to avoid all the extra overhead we put in the user facing APIs that are meant to check every edge case of everything they could possibly be doing wrong?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we're in 100% agreement that we should use internal APIs that bypass Pandas semantics for efficiency. My only point is that doesn't always have to be calling all the way down to Cython. For example _gather is an internal API at the Python level.

Here specifically, it looks like we need a join API without all the bells and whistles of the top-level merge() method, and we currently don't really have that.

Copy link
Contributor Author

@isVoid isVoid Feb 8, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

btw in drop there is this line:

if errors == "raise" and not labels.isin(levels_index).all():
  raise KeyError("One or more values not found in axis")

which I think guards against the dtype mismatch issue? @brandon-b-miller

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Offline sync with @brandon-b-miller: there is room to optimize the join parameter logic checks, but we should defer that until the join refactor is finished.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now, can we add a TODO here to optimize later by calling a lower-level join API? Ideally, the TODO should document exactly what logic checks we would like to avoid here.


_accessors = set() # type: Set[Any]


Expand Down Expand Up @@ -7608,17 +7599,6 @@ def _get_union_of_series_names(series_list):
return names_list


def _drop_columns(df, columns, errors):
    """Drop every column named in `columns` from `df`, in place.

    Each name is passed to ``df._drop_column``. A ``KeyError`` raised for
    a name is suppressed when `errors` is ``"ignore"`` and re-raised for
    any other value of `errors`.
    """
    ignore_missing = errors == "ignore"
    for col_name in columns:
        try:
            df._drop_column(col_name)
        except KeyError:
            if not ignore_missing:
                raise


def _get_host_unique(array):
if isinstance(
array, (cudf.Series, cudf.Index, cudf.core.column.ColumnBase)
Expand All @@ -7628,3 +7608,14 @@ def _get_host_unique(array):
return [array]
else:
return set(array)


def _drop_columns(df: DataFrame, columns: Iterable, errors: str):
    """Remove the given columns from `df` in place.

    df: frame whose ``_drop_column`` method is called once per name
    columns: iterable of column names to remove
    errors: ``"ignore"`` suppresses the ``KeyError`` raised for a name;
        any other value lets it propagate
    """
    for label in columns:
        try:
            df._drop_column(label)
        except KeyError:
            # Only the explicit "ignore" policy silences a failed drop.
            if errors != "ignore":
                raise
35 changes: 35 additions & 0 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import cudf
from cudf import _lib as libcudf
from cudf._typing import ColumnLike
from cudf.core.column import as_column, build_categorical_column, column_empty
from cudf.utils.dtypes import (
is_categorical_dtype,
Expand Down Expand Up @@ -3812,3 +3813,37 @@ def _is_series(obj):
instead of checking for isinstance(obj, cudf.Series)
"""
return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None


def _drop_rows_by_labels(
    obj: Union[cudf.DataFrame, cudf.Series],
    labels: Union[ColumnLike, abc.Iterable, str],
    level: Union[int, str],
    errors: str,
) -> Union[cudf.DataFrame, cudf.Series]:
    """Remove the rows of `obj` whose index label appears in `labels`.

    If `errors='raise'`, a ``KeyError`` is raised when some items in
    `labels` do not exist in the index of `obj` (or, for a `MultiIndex`,
    in the selected level).

    Parameter `level` is ignored if `obj._index` is not `MultiIndex`;
    for a `MultiIndex`, `level=None` selects level 0.
    """
    if not isinstance(labels, (cudf.Series, cudf.Index)):
        labels = as_column(labels)

    if not isinstance(obj._index, cudf.MultiIndex):
        if errors == "raise" and not labels.isin(obj.index).all():
            raise KeyError("One or more values not found in axis")
        # Delegate to the object's own row-dropping implementation.
        return obj._drop_rows_by_labels(labels)

    if level is None:
        level = 0

    level_values = obj.index.get_level_values(level)
    if errors == "raise" and not labels.isin(level_values).all():
        raise KeyError("One or more values not found in axis")

    # TODO : Could use anti-join as a future optimization
    keep_mask = ~level_values.isin(labels)
    remaining = obj.take(keep_mask)
    # Carry the original level names over to the result's index.
    remaining._index.names = obj._index.names
    return remaining
136 changes: 135 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from cudf.core.column.lists import ListMethods
from cudf.core.column.string import StringMethods
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.frame import Frame
from cudf.core.frame import Frame, _drop_rows_by_labels
from cudf.core.groupby.groupby import SeriesGroupBy
from cudf.core.index import Index, RangeIndex, as_index
from cudf.core.indexing import _SeriesIlocIndexer, _SeriesLocIndexer
Expand Down Expand Up @@ -554,6 +554,128 @@ def copy(self, deep=True):
result.index = self.index.copy(deep=deep)
return result

def drop(
    self,
    labels=None,
    axis=0,
    index=None,
    columns=None,
    level=None,
    inplace=False,
    errors="raise",
):
    """
    Return Series with specified index labels removed.

    Remove elements of a Series based on specifying the index labels.
    When using a multi-index, labels on different levels can be removed by
    specifying the level.

    Parameters
    ----------
    labels : single label or list-like
        Index labels to drop.
    axis : 0, default 0
        Redundant for application on Series.
    index : single label or list-like
        Redundant for application on Series. But ``index`` can be used
        instead of ``labels``
    columns : single label or list-like
        This parameter is ignored. Use ``index`` or ``labels`` to specify.
    level : int or level name, optional
        For MultiIndex, level from which the labels will be removed.
    inplace : bool, default False
        If False, return a copy. Otherwise, do operation
        inplace and return None.
    errors : {'ignore', 'raise'}, default 'raise'
        If 'ignore', suppress error and only existing labels are
        dropped.

    Returns
    -------
    Series or None
        Series with specified index labels removed or None if
        ``inplace=True``

    Raises
    ------
    KeyError
        If any of the labels is not found in the selected axis and
        ``errors='raise'``
    ValueError
        If ``labels`` is combined with ``index``/``columns``, if
        ``labels`` is given together with the column axis
        (``axis=1`` or ``axis='columns'``), or if none of ``labels``,
        ``index`` and ``columns`` is specified.

    See Also
    --------
    Series.reindex
        Return only specified index labels of Series
    Series.dropna
        Return series without null values
    Series.drop_duplicates
        Return series with duplicate values removed
    cudf.core.dataframe.DataFrame.drop
        Drop specified labels from rows or columns in dataframe

    Examples
    --------
    >>> s = cudf.Series([1,2,3], index=['x', 'y', 'z'])
    >>> s
    x    1
    y    2
    z    3
    dtype: int64

    Drop labels x and z

    >>> s.drop(labels=['x', 'z'])
    y    2
    dtype: int64

    Drop a label from the second level in MultiIndex Series.

    >>> midx = cudf.MultiIndex.from_product([[0, 1, 2], ['x', 'y']])
    >>> s = cudf.Series(range(6), index=midx)
    >>> s
    0  x    0
       y    1
    1  x    2
       y    3
    2  x    4
       y    5
    >>> s.drop(labels='y', level=1)
    0  x    0
    1  x    2
    2  x    4
    """
    if labels is not None:
        if index is not None or columns is not None:
            raise ValueError(
                "Cannot specify both 'labels' and 'index'/'columns'"
            )
        # A Series only has a row axis: reject the column axis whether it
        # is spelled as a number or by name instead of silently treating
        # ``axis="columns"`` as the index.
        if axis in (1, "columns"):
            raise ValueError(f"No axis named {axis} for object type Series")
        target = labels
    elif index is not None:
        target = index
    elif columns is not None:
        target = []  # Ignore parameter columns
    else:
        raise ValueError(
            "Need to specify at least one of 'labels', "
            "'index' or 'columns'"
        )

    out = self if inplace else self.copy()

    dropped = _drop_rows_by_labels(out, target, level, errors)

    # Rebind the data and index on ``out`` so that an in-place drop
    # mutates ``self`` rather than returning a new object.
    out._data = dropped._data
    out._index = dropped._index

    return None if inplace else out

def __copy__(self, deep=True):
    """Support ``copy.copy`` by delegating to ``self.copy``.

    deep: forwarded unchanged to ``self.copy``
    """
    duplicate = self.copy(deep)
    return duplicate

Expand Down Expand Up @@ -4567,6 +4689,18 @@ def keys(self):
"""
return self.index

def _drop_rows_by_labels(self, labels):
    """Delete the rows specified by the `labels` parameter.

    The series is converted to a one-column frame under a placeholder
    column name, the frame's `_drop_rows_by_labels` does the removal, and
    the surviving column is returned with this series' name restored.

    labels: a list of labels specifying the rows to drop
    """
    placeholder = "tmp"
    frame = self.to_frame(name=placeholder)
    remaining = frame._drop_rows_by_labels(labels)[placeholder]
    # Put the original name back in place of the placeholder.
    remaining.name = self.name

    return remaining

_accessors = set() # type: Set[Any]


Expand Down
Loading