diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index ef47e843723..4330c565982 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -75,24 +75,22 @@ def drop_nulls(columns: list, how="any", keys=None, thresh=None): return columns_from_unique_ptr(move(c_result)) -def apply_boolean_mask(source_table, Column boolean_mask): +def apply_boolean_mask(columns: list, Column boolean_mask): """ Drops the rows which correspond to False in boolean_mask. Parameters ---------- - source_table : source table whose rows are dropped as per boolean_mask - boolean_mask : a boolean column of same size as source_table + columns : list of columns whose rows are dropped as per boolean_mask + boolean_mask : a boolean column with the same number of rows as columns Returns ------- - Frame obtained from applying mask + columns obtained from applying mask """ - assert pd.api.types.is_bool_dtype(boolean_mask.dtype) - cdef unique_ptr[table] c_result - cdef table_view source_table_view = table_view_from_table(source_table) + cdef table_view source_table_view = table_view_from_columns(columns) cdef column_view boolean_mask_view = boolean_mask.view() with nogil: @@ -103,13 +101,7 @@ def apply_boolean_mask(source_table, Column boolean_mask): ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=( - None if source_table._index - is None else source_table._index_names) - ) + return columns_from_unique_ptr(move(c_result)) def drop_duplicates(columns: list, diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index aa89b8f849f..683f3fefe1c 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -9,8 +9,15 @@ import pandas as pd import cudf +from cudf._lib.stream_compaction import apply_boolean_mask from cudf._typing import DtypeObj -from cudf.api.types import is_dtype_equal, is_integer, is_list_like, is_scalar +from cudf.api.types import ( 
+ is_bool_dtype, + is_dtype_equal, + is_integer, + is_list_like, + is_scalar, +) from cudf.core.abc import Serializable from cudf.core.column import ColumnBase, column from cudf.core.column_accessor import ColumnAccessor @@ -1414,6 +1421,22 @@ def from_pandas(cls, index, nan_as_null=None): def _constructor_expanddim(self): return cudf.MultiIndex + def _apply_boolean_mask(self, boolean_mask): + """Apply boolean mask to each row of `self`. + + Rows corresponding to `False` are dropped. + """ + boolean_mask = cudf.core.column.as_column(boolean_mask) + if not is_bool_dtype(boolean_mask.dtype): + raise ValueError("boolean_mask is not boolean type.") + + result = self.__class__._from_columns( + apply_boolean_mask(list(self._columns), boolean_mask), + column_names=self._column_names, + ) + result._copy_type_metadata(self) + return result + def _split_columns_by_levels(self, levels): if isinstance(levels, int) and levels > 0: raise ValueError(f"Out of bound level: {levels}") diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 18c86f82f9c..a2a909968dc 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -5,8 +5,8 @@ import numpy as np from cudf.core.column import as_column -from cudf.core.frame import Frame from cudf.core.index import Index, RangeIndex +from cudf.core.indexed_frame import IndexedFrame from cudf.core.series import Series @@ -92,7 +92,7 @@ def _index_or_values_interpolation(column, index=None): if num_nan == 0 or num_nan == len(column): return column - to_interp = Frame(data={None: column}, index=index) + to_interp = IndexedFrame(data={None: column}, index=index) known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask)) known_x = known_x_and_y._index._column.values diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c1e037499fc..a966276842f 100644 --- a/python/cudf/cudf/core/column/column.py +++ 
b/python/cudf/cudf/core/column/column.py @@ -35,6 +35,7 @@ ) from cudf._lib.scalar import as_device_scalar from cudf._lib.stream_compaction import ( + apply_boolean_mask, distinct_count as cpp_distinct_count, drop_duplicates, drop_nulls, @@ -997,9 +998,12 @@ def as_decimal32_column( raise NotImplementedError def apply_boolean_mask(self, mask) -> ColumnBase: - mask = as_column(mask, dtype="bool") - return ( - self.as_frame()._apply_boolean_mask(boolean_mask=mask)._as_column() + mask = as_column(mask) + if not is_bool_dtype(mask.dtype): + raise ValueError("boolean_mask is not boolean type.") + + return apply_boolean_mask([self], mask)[0]._with_type_metadata( + self.dtype ) def argsort( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0345966d6bd..6e47c0f41cf 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1461,19 +1461,6 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): return self[out_cols] - def _apply_boolean_mask(self, boolean_mask): - """ - Applies boolean mask to each row of `self`, - rows corresponding to `False` is dropped - """ - result = self.__class__._from_data( - *libcudf.stream_compaction.apply_boolean_mask( - self, as_column(boolean_mask) - ) - ) - result._copy_type_metadata(self) - return result - def interpolate( self, method="linear", diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 2f4d4a88195..7c5783bf637 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -19,6 +19,7 @@ from cudf._typing import ColumnLike from cudf.api.types import ( _is_non_decimal_numeric_dtype, + is_bool_dtype, is_categorical_dtype, is_integer_dtype, is_list_like, @@ -1197,6 +1198,25 @@ def resample( else cudf.core.resample.DataFrameResampler(self, by=by) ) + def _apply_boolean_mask(self, boolean_mask): + """Apply boolean mask to each row of `self`. 
+ + Rows corresponding to `False` are dropped. + """ + boolean_mask = cudf.core.column.as_column(boolean_mask) + if not is_bool_dtype(boolean_mask.dtype): + raise ValueError("boolean_mask is not boolean type.") + + result = self.__class__._from_columns( + libcudf.stream_compaction.apply_boolean_mask( + list(self._index._columns + self._columns), boolean_mask + ), + column_names=self._column_names, + index_names=self._index.names, + ) + result._copy_type_metadata(self) + return result + def _reset_index(self, level, drop, col_level=0, col_fill=""): """Shared path for DataFrame.reset_index and Series.reset_index.""" if level is not None and not isinstance(level, (tuple, list)):