From 2818928283aaa7e823aaf2475cc6b7579b554cb8 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 10 Mar 2021 11:50:56 -0800
Subject: [PATCH] Add `Series.drop` api (#7304)

Closes #7045

This PR introduces the `Series.drop` API. `Series.drop` allows users to drop elements from a series, selected by index label through the `labels` or `index` parameter.

Example:
```python3
>>> s = cudf.Series([1, 2, 3], index=['x', 'y', 'z'])
>>> s.drop(labels=['y'])
x    1
z    3
dtype: int64
```

- [x] Add series test case
- [x] Move common code path from `DataFrame.drop` to helper function
- [x] Add typing annotation
- [x] Add docstring

Authors:
  - Michael Wang (@isVoid)

Approvers:
  - Ashwin Srinath (@shwina)
  - GALI PREM SAGAR (@galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/7304
---
 python/cudf/cudf/core/dataframe.py       |  71 ++++-----
 python/cudf/cudf/core/frame.py           |  84 ++++++++++-
 python/cudf/cudf/core/series.py          | 124 +++++++++++++++-
 python/cudf/cudf/tests/test_dataframe.py |   9 ++
 python/cudf/cudf/tests/test_series.py    | 178 +++++++++++++++++++++++
 5 files changed, 421 insertions(+), 45 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 375f2886c29..5b59519423a 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1,6 +1,6 @@
 # Copyright (c) 2018-2021, NVIDIA CORPORATION.
 
-from __future__ import division
+from __future__ import annotations, division
 
 import inspect
 import itertools
@@ -10,7 +10,7 @@
 import warnings
 from collections import OrderedDict, defaultdict
 from collections.abc import Iterable, Sequence
-from typing import Any, Set, TypeVar
+from typing import Any, Optional, Set, TypeVar
 
 import cupy
 import numpy as np
@@ -30,7 +30,7 @@
 from cudf.core.abc import Serializable
 from cudf.core.column import as_column, column_empty
 from cudf.core.column_accessor import ColumnAccessor
-from cudf.core.frame import Frame
+from cudf.core.frame import Frame, _drop_rows_by_labels
 from cudf.core.groupby.groupby import DataFrameGroupBy
 from cudf.core.index import Index, RangeIndex, as_index
 from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer
@@ -495,7 +495,12 @@ def _from_table(cls, table, index=None):
         return out
 
     @classmethod
-    def _from_data(cls, data, index=None, columns=None):
+    def _from_data(
+        cls,
+        data: ColumnAccessor,
+        index: Optional[Index] = None,
+        columns: Any = None,
+    ) -> DataFrame:
         out = cls.__new__(cls)
         out._data = data
         if index is None:
@@ -3364,46 +3369,26 @@ def drop(
             )
 
         if inplace:
-            outdf = self
+            out = self
         else:
-            outdf = self.copy()
+            out = self.copy()
 
         if axis in (1, "columns"):
             target = _get_host_unique(target)
 
-            _drop_columns(outdf, target, errors)
+            _drop_columns(out, target, errors)
         elif axis in (0, "index"):
-            if not isinstance(target, (cudf.Series, cudf.Index)):
-                target = column.as_column(target)
-
-            if isinstance(self._index, cudf.MultiIndex):
-                if level is None:
-                    level = 0
-
-                levels_index = outdf.index.get_level_values(level)
-                if errors == "raise" and not target.isin(levels_index).all():
-                    raise KeyError("One or more values not found in axis")
-
-                # TODO : Could use anti-join as a future optimization
-                sliced_df = outdf.take(~levels_index.isin(target))
-                sliced_df._index.names = self._index.names
-            else:
-                if errors == "raise" and not target.isin(outdf.index).all():
-                    raise KeyError("One or more values not found in axis")
-
-                sliced_df = outdf.join(
-                    cudf.DataFrame(index=target), how="leftanti"
-                )
+            dropped = _drop_rows_by_labels(out, target, level, errors)
 
             if columns is not None:
                 columns = _get_host_unique(columns)
-                _drop_columns(sliced_df, columns, errors)
+                _drop_columns(dropped, columns, errors)
 
-            outdf._data = sliced_df._data
-            outdf._index = sliced_df._index
+            out._data = dropped._data
+            out._index = dropped._index
 
         if not inplace:
-            return outdf
+            return out
 
     def _drop_column(self, name):
         """Drop a column by *name*
@@ -7962,17 +7947,6 @@ def _get_union_of_series_names(series_list):
     return names_list
 
 
-def _drop_columns(df, columns, errors):
-    for c in columns:
-        try:
-            df._drop_column(c)
-        except KeyError as e:
-            if errors == "ignore":
-                pass
-            else:
-                raise e
-
-
 def _get_host_unique(array):
     if isinstance(
         array, (cudf.Series, cudf.Index, cudf.core.column.ColumnBase)
@@ -7982,3 +7956,14 @@ def _get_host_unique(array):
         return [array]
     else:
         return set(array)
+
+
+def _drop_columns(df: DataFrame, columns: Iterable, errors: str):
+    """Drop `columns` from `df`; missing ones raise unless errors="ignore"."""
+    reraise = errors != "ignore"
+    for name in columns:
+        try:
+            df._drop_column(name)
+        except KeyError:
+            if reraise:
+                raise
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index fb18d9c3cf9..dd2bc8a83d5 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -6,7 +6,7 @@
 import functools
 import warnings
 from collections import OrderedDict, abc as abc
-from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, overload
+from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, Union, overload
 
 import cupy
 import numpy as np
@@ -18,6 +18,7 @@
 
 import cudf
 from cudf import _lib as libcudf
+from cudf._typing import ColumnLike, DataFrameOrSeries
 from cudf.core.column import as_column, build_categorical_column, column_empty
 from cudf.utils.dtypes import (
     is_categorical_dtype,
@@ -3744,3 +3745,84 @@ def _is_series(obj):
     instead of checking for isinstance(obj, cudf.Series)
     """
     return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None
+
+
+def _drop_rows_by_labels(
+    obj: DataFrameOrSeries,
+    labels: Union[ColumnLike, abc.Iterable, str],
+    level: Union[int, str],
+    errors: str,
+) -> DataFrameOrSeries:
+    """Return a new object built from `obj` minus the rows in `labels`.
+
+    `labels` holds the index labels to remove; on a MultiIndex they are
+    matched against `level` (a position or a level name, level 0 if None).
+
+    Raises
+    ------
+    ValueError
+        If an integer `level` is >= the number of index levels.
+    KeyError
+        If `errors == "raise"` and some label is not found in the index.
+    """
+    if isinstance(level, int) and level >= obj.index.nlevels:
+        raise ValueError("Param level out of bounds.")
+
+    if not isinstance(labels, (cudf.Series, cudf.Index)):
+        labels = as_column(labels)
+
+    if not isinstance(obj._index, cudf.MultiIndex):
+        if errors == "raise" and not labels.isin(obj.index).all():
+            raise KeyError("One or more values not found in axis")
+
+        key_df = cudf.DataFrame(index=labels)
+        if isinstance(obj, cudf.Series):
+            # Go through a one-column frame so DataFrame.join can be used.
+            res = obj.to_frame(name="tmp").join(key_df, how="leftanti")["tmp"]
+            res.name = obj.name
+            return res
+        return obj.join(key_df, how="leftanti")
+
+    if level is None:
+        level = 0
+
+    levels_index = obj.index.get_level_values(level)
+    if errors == "raise" and not labels.isin(levels_index).all():
+        raise KeyError("One or more values not found in axis")
+
+    # The working frame below is keyed by level position, so a level *name*
+    # has to be translated to its position before it is used as a column key.
+    if isinstance(level, int):
+        ilevel = level
+    else:
+        ilevel = obj._index.names.index(level)
+
+    # 1. Merge Index df and data df along column axis:
+    # | id | ._index df | data column(s) |
+    idx_nlv = obj._index.nlevels
+    # Shallow copy: the frame is relabeled/extended below and may be shared.
+    working_df = obj._index._source_data.copy(deep=False)
+    working_df.columns = list(range(idx_nlv))
+    for i, col in enumerate(obj._data):
+        working_df[idx_nlv + i] = obj._data[col]
+    # 2. Set `level` as common index:
+    # | level | ._index df w/o level | data column(s) |
+    working_df = working_df.set_index(ilevel)
+
+    # 3. Use "leftanti" join to drop
+    # TODO: use internal API with "leftanti" and specify left and right
+    # join keys to bypass logic check
+    to_join = cudf.DataFrame(index=cudf.Index(labels, name=ilevel))
+    join_res = working_df.join(to_join, how="leftanti")
+
+    # 4. Reconstruct original layout, and rename
+    join_res.insert(ilevel, name=join_res._index.name, value=join_res._index)
+    join_res = join_res.reset_index(drop=True)
+
+    midx = cudf.MultiIndex.from_frame(
+        join_res.iloc[:, 0:idx_nlv], names=obj._index.names
+    )
+    data = join_res.iloc[:, idx_nlv:]._data
+    if isinstance(obj, cudf.Series):
+        return obj.__class__._from_data(data, index=midx, name=obj.name)
+    return obj.__class__._from_data(data, index=midx, columns=obj.columns)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index dba7775060f..f59c91be4d5 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -37,7 +37,7 @@
 from cudf.core.column.lists import ListMethods
 from cudf.core.column.string import StringMethods
 from cudf.core.column_accessor import ColumnAccessor
-from cudf.core.frame import Frame
+from cudf.core.frame import Frame, _drop_rows_by_labels
 from cudf.core.groupby.groupby import SeriesGroupBy
 from cudf.core.index import Index, RangeIndex, as_index
 from cudf.core.indexing import _SeriesIlocIndexer, _SeriesLocIndexer
@@ -545,6 +545,128 @@ def to_arrow(self):
         """
         return self._column.to_arrow()
 
+    def drop(
+        self,
+        labels=None,
+        axis=0,
+        index=None,
+        columns=None,
+        level=None,
+        inplace=False,
+        errors="raise",
+    ):
+        """
+        Return Series with specified index labels removed.
+
+        Remove elements of a Series based on specifying the index labels.
+        When using a multi-index, labels on different levels can be removed by
+        specifying the level.
+
+        Parameters
+        ----------
+        labels : single label or list-like
+            Index labels to drop.
+        axis : 0 or 'index', default 0
+            Redundant for application on Series.
+        index : single label or list-like
+            Alternative to ``labels``; the two are equivalent on a Series.
+        columns : single label or list-like
+            This parameter is ignored. Use ``index`` or ``labels`` to specify.
+        level : int or level name, optional
+            For MultiIndex, level from which the labels will be removed.
+        inplace : bool, default False
+            If False, return a copy. Otherwise, do operation
+            inplace and return None.
+        errors : {'ignore', 'raise'}, default 'raise'
+            If 'ignore', suppress the error and drop only existing labels.
+
+        Returns
+        -------
+        Series or None
+            Series with the labels removed, or None if ``inplace=True``.
+
+        Raises
+        ------
+        KeyError
+            If any of the labels is not found in the selected axis and
+            ``errors='raise'``
+        ValueError
+            If none of ``labels``, ``index``, ``columns`` is given, or if
+            ``labels`` is combined with ``index``/``columns`` or ``axis=1``.
+
+        See Also
+        --------
+        Series.reindex
+            Return only specified index labels of Series
+        Series.dropna
+            Return series without null values
+        Series.drop_duplicates
+            Return series with duplicate values removed
+        cudf.core.dataframe.DataFrame.drop
+            Drop specified labels from rows or columns in dataframe
+
+        Examples
+        --------
+        >>> s = cudf.Series([1,2,3], index=['x', 'y', 'z'])
+        >>> s
+        x    1
+        y    2
+        z    3
+        dtype: int64
+
+        Drop labels x and z
+
+        >>> s.drop(labels=['x', 'z'])
+        y    2
+        dtype: int64
+
+        Drop a label from the second level in MultiIndex Series.
+
+        >>> midx = cudf.MultiIndex.from_product([[0, 1, 2], ['x', 'y']])
+        >>> s = cudf.Series(range(6), index=midx)
+        >>> s
+        0  x    0
+           y    1
+        1  x    2
+           y    3
+        2  x    4
+           y    5
+        dtype: int64
+        >>> s.drop(labels='y', level=1)
+        0  x    0
+        1  x    2
+        2  x    4
+        dtype: int64
+        """
+        if labels is not None:
+            if index is not None or columns is not None:
+                raise ValueError(
+                    "Cannot specify both 'labels' and 'index'/'columns'"
+                )
+            if axis in (1, "columns"):
+                raise ValueError(
+                    f"No axis named {axis} for object type Series"
+                )
+            target = labels
+        elif index is not None:
+            target = index
+        elif columns is not None:
+            target = []  # Ignore parameter columns
+        else:
+            raise ValueError(
+                "Need to specify at least one of 'labels', "
+                "'index' or 'columns'"
+            )
+
+        out = self if inplace else self.copy()
+        dropped = _drop_rows_by_labels(out, target, level, errors)
+        # Adopt the result's data and index so ``inplace=True`` mutates self.
+        out._data = dropped._data
+        out._index = dropped._index
+
+        if not inplace:
+            return out
+
     def __copy__(self, deep=True):
         return self.copy(deep)
 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 69f6ecfeb17..ffd66e18314 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -524,6 +524,15 @@ def test_dataframe_drop_raises():
         expected_error_message="One or more values not found in axis",
     )
 
+    # label dtype mismatch
+    assert_exceptions_equal(
+        lfunc=pdf.drop,
+        rfunc=df.drop,
+        lfunc_args_and_kwargs=([3],),
+        rfunc_args_and_kwargs=([3],),
+        expected_error_message="One or more values not found in axis",
+    )
+
     expect = pdf.drop("p", errors="ignore")
     actual = df.drop("p", errors="ignore")
 
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index ab9d3d91f73..a1b4236719d 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -940,3 +940,181 @@ def test_fillna_with_nan(data, nan_as_null, fill_value):
     actual = gs.fillna(fill_value)
 
     assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "ps",
+    [
+        pd.Series(["a"] * 20, index=range(0, 20)),
+        pd.Series(["b", None] * 10, index=range(0, 20), name="ASeries"),
+    ],
+)
+@pytest.mark.parametrize(
+    "labels",  # scalar, list and pd.Index spellings of the labels to drop
+    [[1], [0], 1, 5, [5, 9], pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])],
+)
+@pytest.mark.parametrize("inplace", [True, False])
+def test_series_drop_labels(ps, labels, inplace):
+    ps = ps.copy()  # keep the shared parametrized Series intact
+    gs = cudf.from_pandas(ps)
+
+    expected = ps.drop(labels=labels, axis=0, inplace=inplace)
+    actual = gs.drop(labels=labels, axis=0, inplace=inplace)
+
+    if inplace:  # drop() returned None; compare the mutated objects
+        expected = ps
+        actual = gs
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "ps",
+    [
+        pd.Series(["a"] * 20, index=range(0, 20)),
+        pd.Series(["b", None] * 10, index=range(0, 20), name="ASeries"),
+    ],
+)
+@pytest.mark.parametrize(
+    "index",  # same label spellings, passed through ``index=``
+    [[1], [0], 1, 5, [5, 9], pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])],
+)
+@pytest.mark.parametrize("inplace", [True, False])
+def test_series_drop_index(ps, index, inplace):
+    ps = ps.copy()  # keep the shared parametrized Series intact
+    gs = cudf.from_pandas(ps)
+
+    expected = ps.drop(index=index, inplace=inplace)
+    actual = gs.drop(index=index, inplace=inplace)
+
+    if inplace:  # drop() returned None; compare the mutated objects
+        expected = ps
+        actual = gs
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "ps",
+    [
+        pd.Series(
+            ["a" if i % 2 == 0 else "b" for i in range(0, 10)],
+            index=pd.MultiIndex(
+                levels=[
+                    ["lama", "cow", "falcon"],
+                    ["speed", "weight", "length"],
+                ],
+                codes=[  # last row repeats ("cow", "weight")
+                    [0, 0, 0, 1, 1, 1, 2, 2, 2, 1],  # into levels[0]
+                    [0, 1, 2, 0, 1, 2, 0, 1, 2, 1],  # into levels[1]
+                ],
+            ),
+            name="abc",
+        )
+    ],
+)
+@pytest.mark.parametrize(
+    "index,level",
+    [
+        ("cow", 0),  # labels taken from level 0
+        ("lama", 0),
+        ("falcon", 0),
+        ("speed", 1),  # labels taken from level 1
+        ("weight", 1),
+        ("length", 1),
+        pytest.param(
+            "cow",
+            None,  # level left unset
+            marks=pytest.mark.xfail(
+                reason="https://github.com/pandas-dev/pandas/issues/36293"
+            ),
+        ),
+        pytest.param(
+            "lama",
+            None,  # level left unset
+            marks=pytest.mark.xfail(
+                reason="https://github.com/pandas-dev/pandas/issues/36293"
+            ),
+        ),
+        pytest.param(
+            "falcon",
+            None,  # level left unset
+            marks=pytest.mark.xfail(
+                reason="https://github.com/pandas-dev/pandas/issues/36293"
+            ),
+        ),
+    ],
+)
+@pytest.mark.parametrize("inplace", [True, False])
+def test_series_drop_multiindex(ps, index, level, inplace):
+    ps = ps.copy()  # keep the shared parametrized Series intact
+    gs = cudf.from_pandas(ps)
+
+    expected = ps.drop(index=index, inplace=inplace, level=level)
+    actual = gs.drop(index=index, inplace=inplace, level=level)
+
+    if inplace:  # drop() returned None; compare the mutated objects
+        expected = ps
+        actual = gs
+
+    assert_eq(expected, actual)
+
+
+def test_series_drop_edge_inputs():
+    gs = cudf.Series([42], name="a")
+    ps = gs.to_pandas()
+    # ``columns`` alone (list or scalar) must give the same result as pandas.
+    assert_eq(ps.drop(columns=["b"]), gs.drop(columns=["b"]))
+
+    assert_eq(ps.drop(columns="b"), gs.drop(columns="b"))
+    # Positional labels combined with ``columns=`` are rejected.
+    assert_exceptions_equal(
+        lfunc=ps.drop,
+        rfunc=gs.drop,
+        lfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}),
+        rfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}),
+        expected_error_message="Cannot specify both",
+    )
+    # No labels, index or columns at all.
+    assert_exceptions_equal(
+        lfunc=ps.drop,
+        rfunc=gs.drop,
+        lfunc_args_and_kwargs=([], {}),
+        rfunc_args_and_kwargs=([], {}),
+        expected_error_message="Need to specify at least one",
+    )
+    # Labels with axis=1: a Series has no such axis.
+    assert_exceptions_equal(
+        lfunc=ps.drop,
+        rfunc=gs.drop,
+        lfunc_args_and_kwargs=(["b"], {"axis": 1}),
+        rfunc_args_and_kwargs=(["b"], {"axis": 1}),
+        expected_error_message="No axis named 1",
+    )
+
+
+def test_series_drop_raises():
+    gs = cudf.Series([10, 20, 30], index=["x", "y", "z"], name="c")
+    ps = gs.to_pandas()
+    # Label of the right type that is absent from the index.
+    assert_exceptions_equal(
+        lfunc=ps.drop,
+        rfunc=gs.drop,
+        lfunc_args_and_kwargs=(["p"],),
+        rfunc_args_and_kwargs=(["p"],),
+        expected_error_message="One or more values not found in axis",
+    )
+
+    # label dtype mismatch: integer label against a string index
+    assert_exceptions_equal(
+        lfunc=ps.drop,
+        rfunc=gs.drop,
+        lfunc_args_and_kwargs=([3],),
+        rfunc_args_and_kwargs=([3],),
+        expected_error_message="One or more values not found in axis",
+    )
+    # errors="ignore" suppresses the error for the missing label.
+    expect = ps.drop("p", errors="ignore")
+    actual = gs.drop("p", errors="ignore")
+
+    assert_eq(actual, expect)