Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Series.drop api #7304

Merged
merged 22 commits into from
Mar 10, 2021
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 40 additions & 43 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.

from __future__ import division
from __future__ import annotations, division

import inspect
import itertools
Expand All @@ -10,7 +10,7 @@
import warnings
from collections import OrderedDict, defaultdict
from collections.abc import Iterable, Mapping, Sequence
from typing import Any, Set, TypeVar
from typing import Any, Optional, Set, TypeVar

import cupy
import numpy as np
Expand All @@ -26,11 +26,12 @@
import cudf
from cudf import _lib as libcudf
from cudf._lib.null_mask import MaskState, create_null_mask
from cudf._typing import ColumnLike
from cudf.core import column, reshape
from cudf.core.abc import Serializable
from cudf.core.column import as_column, column_empty
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.frame import Frame
from cudf.core.frame import Frame, _drop_rows_by_labels
from cudf.core.groupby.groupby import DataFrameGroupBy
from cudf.core.index import Index, RangeIndex, as_index
from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer
Expand Down Expand Up @@ -491,7 +492,12 @@ def _from_table(cls, table, index=None):
return out

@classmethod
def _from_data(cls, data, index=None, columns=None):
def _from_data(
cls,
data: ColumnAccessor,
index: Optional[Index] = None,
columns: Any = None,
) -> DataFrame:
out = cls.__new__(cls)
out._data = data
if index is None:
Expand Down Expand Up @@ -3315,46 +3321,26 @@ def drop(
)

if inplace:
outdf = self
out = self
else:
outdf = self.copy()
out = self.copy()

if axis in (1, "columns"):
target = _get_host_unique(target)

_drop_columns(outdf, target, errors)
_drop_columns(out, target, errors)
elif axis in (0, "index"):
if not isinstance(target, (cudf.Series, cudf.Index)):
target = column.as_column(target)

if isinstance(self._index, cudf.MultiIndex):
if level is None:
level = 0

levels_index = outdf.index.get_level_values(level)
if errors == "raise" and not target.isin(levels_index).all():
raise KeyError("One or more values not found in axis")

# TODO : Could use anti-join as a future optimization
sliced_df = outdf.take(~levels_index.isin(target))
sliced_df._index.names = self._index.names
else:
if errors == "raise" and not target.isin(outdf.index).all():
raise KeyError("One or more values not found in axis")

sliced_df = outdf.join(
cudf.DataFrame(index=target), how="leftanti"
)
dropped = _drop_rows_by_labels(out, target, level, errors)

if columns is not None:
columns = _get_host_unique(columns)
_drop_columns(sliced_df, columns, errors)
_drop_columns(dropped, columns, errors)

outdf._data = sliced_df._data
outdf._index = sliced_df._index
out._data = dropped._data
out._index = dropped._index

if not inplace:
return outdf
return out

def _drop_column(self, name):
"""Drop a column by *name*
Expand Down Expand Up @@ -7408,6 +7394,17 @@ def equals(self, other):
return False
return super().equals(other)

def _drop_rows_by_labels(self, labels: ColumnLike) -> "cudf.DataFrame":
    """Drop the rows identified by `labels` using a left-anti join.

    An empty DataFrame indexed by `labels` is built and joined against
    this frame with ``how="leftanti"``; the join result is returned.

    labels: the index labels specifying the rows to drop
    """
    labels_frame = cudf.DataFrame(index=labels)
    return self.join(labels_frame, how="leftanti")

_accessors = set() # type: Set[Any]


Expand Down Expand Up @@ -7654,17 +7651,6 @@ def _get_union_of_series_names(series_list):
return names_list


def _drop_columns(df, columns, errors):
    """Drop every name in `columns` from `df` via `df._drop_column`.

    A KeyError raised while dropping a name is swallowed when `errors`
    is "ignore"; for any other value of `errors` it propagates.
    """
    for name in columns:
        try:
            df._drop_column(name)
        except KeyError:
            # Missing columns are only tolerated in "ignore" mode.
            if errors != "ignore":
                raise


def _get_host_unique(array):
if isinstance(
array, (cudf.Series, cudf.Index, cudf.core.column.ColumnBase)
Expand All @@ -7674,3 +7660,14 @@ def _get_host_unique(array):
return [array]
else:
return set(array)


def _drop_columns(df: DataFrame, columns: Iterable, errors: str):
    """Remove each column named in `columns` from `df`, in iteration order.

    Each name is passed to `df._drop_column`. If that raises KeyError, the
    error is suppressed when `errors` equals "ignore" and re-raised for any
    other value.
    """
    for column_name in columns:
        try:
            df._drop_column(column_name)
        except KeyError:
            if errors == "ignore":
                continue
            raise
81 changes: 79 additions & 2 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import operator
import warnings
from collections import OrderedDict, abc as abc
from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, overload
from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, Union, overload

import cupy
import numpy as np
Expand All @@ -18,6 +18,7 @@

import cudf
from cudf import _lib as libcudf
from cudf._typing import ColumnLike, DataFrameOrSeries
from cudf.core.column import as_column, build_categorical_column, column_empty
from cudf.utils.dtypes import (
is_categorical_dtype,
Expand All @@ -27,7 +28,6 @@
min_scalar_type,
)


T = TypeVar("T", bound="Frame")

if TYPE_CHECKING:
Expand Down Expand Up @@ -3838,3 +3838,80 @@ def _is_series(obj):
instead of checking for isinstance(obj, cudf.Series)
"""
return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None


def _drop_rows_by_labels(
    obj: DataFrameOrSeries,
    labels: Union[ColumnLike, abc.Iterable, str],
    level: Union[int, str, None],
    errors: str,
) -> DataFrameOrSeries:
    """Remove the rows of `obj` whose index label appears in `labels`.

    Parameters
    ----------
    obj
        The DataFrame or Series to drop rows from.
    labels
        Label(s) to drop. Anything that is not already a `cudf.Series` or
        `cudf.Index` is converted with `as_column`.
    level
        For a MultiIndex, the level (position or name) whose values are
        matched against `labels`; `None` is treated as level 0. For other
        index types it is only used by the bounds check below.
    errors
        If equal to ``"raise"``, a KeyError is raised when some item in
        `labels` is not present in the index (or in the selected level).

    Raises
    ------
    ValueError
        If `level` is an int greater than or equal to `obj.index.nlevels`.
    KeyError
        If `errors == "raise"` and one or more labels are not found.
    """
    if isinstance(level, int) and level >= obj.index.nlevels:
        raise ValueError("Param level out of bounds.")

    if not isinstance(labels, (cudf.Series, cudf.Index)):
        labels = as_column(labels)

    res: DataFrameOrSeries
    if isinstance(obj._index, cudf.MultiIndex):
        if level is None:
            level = 0

        levels_index = obj.index.get_level_values(level)
        if errors == "raise" and not labels.isin(levels_index).all():
            raise KeyError("One or more values not found in axis")

        # Resolve `level` to its position; a non-int level is looked up
        # by name in the index names.
        if isinstance(level, int):
            ilevel = level
        else:
            ilevel = obj._index.names.index(level)

        # 1. Merge Index df and data df along column axis:
        # | id | ._index df | data column(s) |
        # Every column is given a positional integer label so index and
        # data columns cannot collide by name.
        idx_nlv = obj._index.nlevels
        working_df = obj._index._source_data
        working_df.columns = list(range(idx_nlv))
        for i, col in enumerate(obj._data):
            working_df[idx_nlv + i] = obj._data[col]
        # 2. Set `level` as common index:
        # | level | ._index df w/o level | data column(s) |
        # The columns were relabelled positionally above, so the level has
        # to be addressed by position (`ilevel`), not by its original name.
        working_df = working_df.set_index(ilevel)

        # 3. Use "leftanti" join to drop
        # TODO: replace with Brandon's suggestion
        to_join = cudf.DataFrame(index=cudf.Index(labels, name=ilevel))
        join_res = working_df.join(to_join, how="leftanti")

        # 4. Reconstruct original layout, and rename
        join_res.insert(
            ilevel, name=join_res._index.name, value=join_res._index
        )
        join_res = join_res.reset_index(drop=True)

        # The first `idx_nlv` columns are the index levels again; the rest
        # are the data columns.
        midx = cudf.MultiIndex.from_frame(
            join_res.iloc[:, 0:idx_nlv], names=obj._index.names
        )

        if isinstance(obj, cudf.Series):
            res = obj.__class__._from_data(
                join_res.iloc[:, idx_nlv:]._data, index=midx, name=obj.name
            )
        else:
            res = obj.__class__._from_data(
                join_res.iloc[:, idx_nlv:]._data,
                index=midx,
                columns=obj.columns,
            )

    else:
        if errors == "raise" and not labels.isin(obj.index).all():
            raise KeyError("One or more values not found in axis")

        # Non-MultiIndex objects supply their own implementation.
        res = obj._drop_rows_by_labels(labels)

    return res
Loading