From 2818928283aaa7e823aaf2475cc6b7579b554cb8 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 10 Mar 2021 11:50:56 -0800 Subject: [PATCH] Add `Series.drop` api (#7304) Closes #7045 This PR introduces the `Series.drop` API. `Series.drop` allows users to drop certain elements in the series specified by the `labels` or `index` parameter. Example: ```python3 >>> s = cudf.Series([1, 2, 3], index=['x', 'y', 'z']) >>> s.drop(labels=['y']) x 1 z 3 dtype: int64 ``` - [x] Add series test case - [x] Move common code path from `DataFrame.drop` to helper function - [x] Add typing annotation - [x] Add docstring Authors: - Michael Wang (@isVoid) Approvers: - Ashwin Srinath (@shwina) - GALI PREM SAGAR (@galipremsagar) URL: https://github.com/rapidsai/cudf/pull/7304 --- python/cudf/cudf/core/dataframe.py | 71 ++++----- python/cudf/cudf/core/frame.py | 84 ++++++++++- python/cudf/cudf/core/series.py | 124 +++++++++++++++- python/cudf/cudf/tests/test_dataframe.py | 9 ++ python/cudf/cudf/tests/test_series.py | 178 +++++++++++++++++++++++ 5 files changed, 421 insertions(+), 45 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 375f2886c29..5b59519423a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1,6 +1,6 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. 
-from __future__ import division +from __future__ import annotations, division import inspect import itertools @@ -10,7 +10,7 @@ import warnings from collections import OrderedDict, defaultdict from collections.abc import Iterable, Sequence -from typing import Any, Set, TypeVar +from typing import Any, Optional, Set, TypeVar import cupy import numpy as np @@ -30,7 +30,7 @@ from cudf.core.abc import Serializable from cudf.core.column import as_column, column_empty from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame +from cudf.core.frame import Frame, _drop_rows_by_labels from cudf.core.groupby.groupby import DataFrameGroupBy from cudf.core.index import Index, RangeIndex, as_index from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer @@ -495,7 +495,12 @@ def _from_table(cls, table, index=None): return out @classmethod - def _from_data(cls, data, index=None, columns=None): + def _from_data( + cls, + data: ColumnAccessor, + index: Optional[Index] = None, + columns: Any = None, + ) -> DataFrame: out = cls.__new__(cls) out._data = data if index is None: @@ -3364,46 +3369,26 @@ def drop( ) if inplace: - outdf = self + out = self else: - outdf = self.copy() + out = self.copy() if axis in (1, "columns"): target = _get_host_unique(target) - _drop_columns(outdf, target, errors) + _drop_columns(out, target, errors) elif axis in (0, "index"): - if not isinstance(target, (cudf.Series, cudf.Index)): - target = column.as_column(target) - - if isinstance(self._index, cudf.MultiIndex): - if level is None: - level = 0 - - levels_index = outdf.index.get_level_values(level) - if errors == "raise" and not target.isin(levels_index).all(): - raise KeyError("One or more values not found in axis") - - # TODO : Could use anti-join as a future optimization - sliced_df = outdf.take(~levels_index.isin(target)) - sliced_df._index.names = self._index.names - else: - if errors == "raise" and not target.isin(outdf.index).all(): - raise 
KeyError("One or more values not found in axis") - - sliced_df = outdf.join( - cudf.DataFrame(index=target), how="leftanti" - ) + dropped = _drop_rows_by_labels(out, target, level, errors) if columns is not None: columns = _get_host_unique(columns) - _drop_columns(sliced_df, columns, errors) + _drop_columns(dropped, columns, errors) - outdf._data = sliced_df._data - outdf._index = sliced_df._index + out._data = dropped._data + out._index = dropped._index if not inplace: - return outdf + return out def _drop_column(self, name): """Drop a column by *name* @@ -7962,17 +7947,6 @@ def _get_union_of_series_names(series_list): return names_list -def _drop_columns(df, columns, errors): - for c in columns: - try: - df._drop_column(c) - except KeyError as e: - if errors == "ignore": - pass - else: - raise e - - def _get_host_unique(array): if isinstance( array, (cudf.Series, cudf.Index, cudf.core.column.ColumnBase) @@ -7982,3 +7956,14 @@ def _get_host_unique(array): return [array] else: return set(array) + + +def _drop_columns(df: DataFrame, columns: Iterable, errors: str): + for c in columns: + try: + df._drop_column(c) + except KeyError as e: + if errors == "ignore": + pass + else: + raise e diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index fb18d9c3cf9..dd2bc8a83d5 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import functools import warnings from collections import OrderedDict, abc as abc -from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, overload +from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, Union, overload import cupy import numpy as np @@ -18,6 +18,7 @@ import cudf from cudf import _lib as libcudf +from cudf._typing import ColumnLike, DataFrameOrSeries from cudf.core.column import as_column, build_categorical_column, column_empty from cudf.utils.dtypes import ( is_categorical_dtype, @@ -3744,3 +3745,84 @@ def _is_series(obj): instead of checking for 
isinstance(obj, cudf.Series) """ return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None + + +def _drop_rows_by_labels( + obj: DataFrameOrSeries, + labels: Union[ColumnLike, abc.Iterable, str], + level: Union[int, str], + errors: str, +) -> DataFrameOrSeries: + """Remove rows specified by `labels`. If `errors="raise"`, an error is raised + if some items in `labels` do not exist in `obj._index`. + + Will raise if level(int) is greater than or equal to index nlevels + """ + if isinstance(level, int) and level >= obj.index.nlevels: + raise ValueError("Param level out of bounds.") + + if not isinstance(labels, (cudf.Series, cudf.Index)): + labels = as_column(labels) + + if isinstance(obj._index, cudf.MultiIndex): + if level is None: + level = 0 + + levels_index = obj.index.get_level_values(level) + if errors == "raise" and not labels.isin(levels_index).all(): + raise KeyError("One or more values not found in axis") + + if isinstance(level, int): + ilevel = level + else: + ilevel = obj._index.names.index(level) + + # 1. Merge Index df and data df along column axis: + # | id | ._index df | data column(s) | + idx_nlv = obj._index.nlevels + working_df = obj._index._source_data + working_df.columns = [i for i in range(idx_nlv)] + for i, col in enumerate(obj._data): + working_df[idx_nlv + i] = obj._data[col] + # 2. Set `level` as common index: + # | level | ._index df w/o level | data column(s) | + working_df = working_df.set_index(level) + + # 3. Use "leftanti" join to drop + # TODO: use internal API with "leftanti" and specify left and right + # join keys to bypass logic check + to_join = cudf.DataFrame(index=cudf.Index(labels, name=level)) + join_res = working_df.join(to_join, how="leftanti") + + # 4. 
Reconstruct original layout, and rename + join_res.insert( + ilevel, name=join_res._index.name, value=join_res._index + ) + join_res = join_res.reset_index(drop=True) + + midx = cudf.MultiIndex.from_frame( + join_res.iloc[:, 0:idx_nlv], names=obj._index.names + ) + + if isinstance(obj, cudf.Series): + return obj.__class__._from_data( + join_res.iloc[:, idx_nlv:]._data, index=midx, name=obj.name + ) + else: + return obj.__class__._from_data( + join_res.iloc[:, idx_nlv:]._data, + index=midx, + columns=obj.columns, + ) + + else: + if errors == "raise" and not labels.isin(obj.index).all(): + raise KeyError("One or more values not found in axis") + + key_df = cudf.DataFrame(index=labels) + if isinstance(obj, cudf.Series): + res = obj.to_frame(name="tmp").join(key_df, how="leftanti")["tmp"] + res.name = obj.name + return res + else: + return obj.join(key_df, how="leftanti") diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index dba7775060f..f59c91be4d5 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -37,7 +37,7 @@ from cudf.core.column.lists import ListMethods from cudf.core.column.string import StringMethods from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame +from cudf.core.frame import Frame, _drop_rows_by_labels from cudf.core.groupby.groupby import SeriesGroupBy from cudf.core.index import Index, RangeIndex, as_index from cudf.core.indexing import _SeriesIlocIndexer, _SeriesLocIndexer @@ -545,6 +545,128 @@ def to_arrow(self): """ return self._column.to_arrow() + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ): + """ + Return Series with specified index labels removed. + + Remove elements of a Series based on specifying the index labels. + When using a multi-index, labels on different levels can be removed by + specifying the level. 
+ + Parameters + ---------- + labels : single label or list-like + Index labels to drop. + axis : 0, default 0 + Redundant for application on Series. + index : single label or list-like + Redundant for application on Series. But ``index`` can be used + instead of ``labels`` + columns : single label or list-like + This parameter is ignored. Use ``index`` or ``labels`` to specify. + level : int or level name, optional + For MultiIndex, level from which the labels will be removed. + inplace : bool, default False + If False, return a copy. Otherwise, do operation + inplace and return None. + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and only existing labels are + dropped. + + Returns + ------- + Series or None + Series with specified index labels removed or None if + ``inplace=True`` + + Raises + ------ + KeyError + If any of the labels is not found in the selected axis and + ``errors='raise'`` + + See Also + -------- + Series.reindex + Return only specified index labels of Series + Series.dropna + Return series without null values + Series.drop_duplicates + Return series with duplicate values removed + cudf.core.dataframe.DataFrame.drop + Drop specified labels from rows or columns in dataframe + + Examples + -------- + >>> s = cudf.Series([1,2,3], index=['x', 'y', 'z']) + >>> s + x 1 + y 2 + z 3 + dtype: int64 + + Drop labels x and z + + >>> s.drop(labels=['x', 'z']) + y 2 + dtype: int64 + + Drop a label from the second level in MultiIndex Series. 
+ + >>> midx = cudf.MultiIndex.from_product([[0, 1, 2], ['x', 'y']]) + >>> s = cudf.Series(range(6), index=midx) + >>> s + 0 x 0 + y 1 + 1 x 2 + y 3 + 2 x 4 + y 5 + >>> s.drop(labels='y', level=1) + 0 x 0 + 1 x 2 + 2 x 4 + """ + if labels is not None: + if index is not None or columns is not None: + raise ValueError( + "Cannot specify both 'labels' and 'index'/'columns'" + ) + if axis == 1: + raise ValueError("No axis named 1 for object type Series") + target = labels + elif index is not None: + target = index + elif columns is not None: + target = [] # Ignore parameter columns + else: + raise ValueError( + "Need to specify at least one of 'labels', " + "'index' or 'columns'" + ) + + if inplace: + out = self + else: + out = self.copy() + + dropped = _drop_rows_by_labels(out, target, level, errors) + + out._data = dropped._data + out._index = dropped._index + + if not inplace: + return out + def __copy__(self, deep=True): return self.copy(deep) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 69f6ecfeb17..ffd66e18314 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -524,6 +524,15 @@ def test_dataframe_drop_raises(): expected_error_message="One or more values not found in axis", ) + # label dtype mismatch + assert_exceptions_equal( + lfunc=pdf.drop, + rfunc=df.drop, + lfunc_args_and_kwargs=([3],), + rfunc_args_and_kwargs=([3],), + expected_error_message="One or more values not found in axis", + ) + expect = pdf.drop("p", errors="ignore") actual = df.drop("p", errors="ignore") diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index ab9d3d91f73..a1b4236719d 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -940,3 +940,181 @@ def test_fillna_with_nan(data, nan_as_null, fill_value): actual = gs.fillna(fill_value) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + 
"ps", + [ + pd.Series(["a"] * 20, index=range(0, 20)), + pd.Series(["b", None] * 10, index=range(0, 20), name="ASeries"), + ], +) +@pytest.mark.parametrize( + "labels", + [[1], [0], 1, 5, [5, 9], pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])], +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_series_drop_labels(ps, labels, inplace): + ps = ps.copy() + gs = cudf.from_pandas(ps) + + expected = ps.drop(labels=labels, axis=0, inplace=inplace) + actual = gs.drop(labels=labels, axis=0, inplace=inplace) + + if inplace: + expected = ps + actual = gs + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "ps", + [ + pd.Series(["a"] * 20, index=range(0, 20)), + pd.Series(["b", None] * 10, index=range(0, 20), name="ASeries"), + ], +) +@pytest.mark.parametrize( + "index", + [[1], [0], 1, 5, [5, 9], pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])], +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_series_drop_index(ps, index, inplace): + ps = ps.copy() + gs = cudf.from_pandas(ps) + + expected = ps.drop(index=index, inplace=inplace) + actual = gs.drop(index=index, inplace=inplace) + + if inplace: + expected = ps + actual = gs + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "ps", + [ + pd.Series( + ["a" if i % 2 == 0 else "b" for i in range(0, 10)], + index=pd.MultiIndex( + levels=[ + ["lama", "cow", "falcon"], + ["speed", "weight", "length"], + ], + codes=[ + [0, 0, 0, 1, 1, 1, 2, 2, 2, 1], + [0, 1, 2, 0, 1, 2, 0, 1, 2, 1], + ], + ), + name="abc", + ) + ], +) +@pytest.mark.parametrize( + "index,level", + [ + ("cow", 0), + ("lama", 0), + ("falcon", 0), + ("speed", 1), + ("weight", 1), + ("length", 1), + pytest.param( + "cow", + None, + marks=pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/36293" + ), + ), + pytest.param( + "lama", + None, + marks=pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/36293" + ), + ), + pytest.param( + "falcon", + None, + marks=pytest.mark.xfail( + 
reason="https://github.com/pandas-dev/pandas/issues/36293" + ), + ), + ], +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_series_drop_multiindex(ps, index, level, inplace): + ps = ps.copy() + gs = cudf.from_pandas(ps) + + expected = ps.drop(index=index, inplace=inplace, level=level) + actual = gs.drop(index=index, inplace=inplace, level=level) + + if inplace: + expected = ps + actual = gs + + assert_eq(expected, actual) + + +def test_series_drop_edge_inputs(): + gs = cudf.Series([42], name="a") + ps = gs.to_pandas() + + assert_eq(ps.drop(columns=["b"]), gs.drop(columns=["b"])) + + assert_eq(ps.drop(columns="b"), gs.drop(columns="b")) + + assert_exceptions_equal( + lfunc=ps.drop, + rfunc=gs.drop, + lfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), + rfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), + expected_error_message="Cannot specify both", + ) + + assert_exceptions_equal( + lfunc=ps.drop, + rfunc=gs.drop, + lfunc_args_and_kwargs=([], {}), + rfunc_args_and_kwargs=([], {}), + expected_error_message="Need to specify at least one", + ) + + assert_exceptions_equal( + lfunc=ps.drop, + rfunc=gs.drop, + lfunc_args_and_kwargs=(["b"], {"axis": 1}), + rfunc_args_and_kwargs=(["b"], {"axis": 1}), + expected_error_message="No axis named 1", + ) + + +def test_series_drop_raises(): + gs = cudf.Series([10, 20, 30], index=["x", "y", "z"], name="c") + ps = gs.to_pandas() + + assert_exceptions_equal( + lfunc=ps.drop, + rfunc=gs.drop, + lfunc_args_and_kwargs=(["p"],), + rfunc_args_and_kwargs=(["p"],), + expected_error_message="One or more values not found in axis", + ) + + # dtype specified mismatch + assert_exceptions_equal( + lfunc=ps.drop, + rfunc=gs.drop, + lfunc_args_and_kwargs=([3],), + rfunc_args_and_kwargs=([3],), + expected_error_message="One or more values not found in axis", + ) + + expect = ps.drop("p", errors="ignore") + actual = gs.drop("p", errors="ignore") + + assert_eq(actual, expect)