From 813ac97b2143c8d1d8ca95435863f5234408a681 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 11 Jan 2022 15:16:25 -0800 Subject: [PATCH] Use list of column inputs for `apply_boolean_mask` (#9832) This PR brings changes from #9558 to `apply_boolean_mask` and removes the `as_frame` -> `as_column` round trip. Benchmark the column method: ``` ------------------------------------- benchmark 'col0': 2 tests ------------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------- column_apply_boolean_mask[col0] (afte) 87.0090 (1.0) 132.8980 (1.0) 95.8815 (1.0) column_apply_boolean_mask[col0] (befo) 210.4580 (2.42) 307.8270 (2.32) 225.4821 (2.35) ----------------------------------------------------------------------------------------------------- ------------------------------------- benchmark 'col1': 2 tests ------------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------- column_apply_boolean_mask[col1] (afte) 74.2240 (1.0) 110.0600 (1.0) 75.6356 (1.0) column_apply_boolean_mask[col1] (befo) 172.5240 (2.32) 278.5250 (2.53) 176.5672 (2.33) ----------------------------------------------------------------------------------------------------- ------------------------------------- benchmark 'col2': 2 tests ------------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------- column_apply_boolean_mask[col2] (afte) 101.5740 (1.0) 141.8850 (1.0) 110.2334 (1.0) column_apply_boolean_mask[col2] (befo) 234.1140 (2.30) 312.7140 (2.20) 245.5453 (2.23) ----------------------------------------------------------------------------------------------------- ------------------------------------- benchmark 'col3': 2 tests ------------------------------------- Name (time in us) Min Max 
Mean ----------------------------------------------------------------------------------------------------- column_apply_boolean_mask[col3] (afte) 88.7710 (1.0) 142.7500 (1.0) 90.5082 (1.0) column_apply_boolean_mask[col3] (befo) 195.0980 (2.20) 303.1020 (2.12) 199.8368 (2.21) ----------------------------------------------------------------------------------------------------- ``` Dataframe benchmark ``` ----------------------------------- benchmark '100': 2 tests ----------------------------------- Name (time in us) Min Max Mean ------------------------------------------------------------------------------------------------ df_apply_boolean_mask[100] (afte) 380.6770 (1.05) 654.7080 (1.18) 389.3374 (1.03) df_apply_boolean_mask[100] (befo) 362.3220 (1.0) 554.6130 (1.0) 378.7087 (1.0) ------------------------------------------------------------------------------------------------ ----------------------------------- benchmark '10000': 2 tests ----------------------------------- Name (time in us) Min Max Mean -------------------------------------------------------------------------------------------------- df_apply_boolean_mask[10000] (afte) 399.5240 (1.05) 461.6310 (1.0) 405.1225 (1.04) df_apply_boolean_mask[10000] (befo) 379.4080 (1.0) 564.5770 (1.22) 389.6990 (1.0) -------------------------------------------------------------------------------------------------- ``` Authors: - Michael Wang (https://github.com/isVoid) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9832 --- python/cudf/cudf/_lib/stream_compaction.pyx | 18 +++++---------- python/cudf/cudf/core/_base_index.py | 25 ++++++++++++++++++++- python/cudf/cudf/core/algorithms.py | 4 ++-- python/cudf/cudf/core/column/column.py | 10 ++++++--- python/cudf/cudf/core/frame.py | 13 ----------- python/cudf/cudf/core/indexed_frame.py | 20 +++++++++++++++++ 6 files changed, 58 insertions(+), 32 deletions(-) diff 
--git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index ef47e843723..4330c565982 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -75,24 +75,22 @@ def drop_nulls(columns: list, how="any", keys=None, thresh=None): return columns_from_unique_ptr(move(c_result)) -def apply_boolean_mask(source_table, Column boolean_mask): +def apply_boolean_mask(columns: list, Column boolean_mask): """ Drops the rows which correspond to False in boolean_mask. Parameters ---------- - source_table : source table whose rows are dropped as per boolean_mask + columns : list of columns whose rows are dropped as per boolean_mask boolean_mask : a boolean column of same size as source_table Returns ------- - Frame obtained from applying mask + columns obtained from applying mask """ - assert pd.api.types.is_bool_dtype(boolean_mask.dtype) - cdef unique_ptr[table] c_result - cdef table_view source_table_view = table_view_from_table(source_table) + cdef table_view source_table_view = table_view_from_columns(columns) cdef column_view boolean_mask_view = boolean_mask.view() with nogil: @@ -103,13 +101,7 @@ def apply_boolean_mask(source_table, Column boolean_mask): ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=( - None if source_table._index - is None else source_table._index_names) - ) + return columns_from_unique_ptr(move(c_result)) def drop_duplicates(columns: list, diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index aa89b8f849f..683f3fefe1c 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -9,8 +9,15 @@ import pandas as pd import cudf +from cudf._lib.stream_compaction import apply_boolean_mask from cudf._typing import DtypeObj -from cudf.api.types import is_dtype_equal, is_integer, is_list_like, is_scalar +from cudf.api.types import ( + 
is_bool_dtype, + is_dtype_equal, + is_integer, + is_list_like, + is_scalar, +) from cudf.core.abc import Serializable from cudf.core.column import ColumnBase, column from cudf.core.column_accessor import ColumnAccessor @@ -1414,6 +1421,22 @@ def from_pandas(cls, index, nan_as_null=None): def _constructor_expanddim(self): return cudf.MultiIndex + def _apply_boolean_mask(self, boolean_mask): + """Apply boolean mask to each row of `self`. + + Rows corresponding to `False` are dropped. + """ + boolean_mask = cudf.core.column.as_column(boolean_mask) + if not is_bool_dtype(boolean_mask.dtype): + raise ValueError("boolean_mask is not boolean type.") + + result = self.__class__._from_columns( + apply_boolean_mask(list(self._columns), boolean_mask), + column_names=self._column_names, + ) + result._copy_type_metadata(self) + return result + def _split_columns_by_levels(self, levels): if isinstance(levels, int) and levels > 0: raise ValueError(f"Out of bound level: {levels}") diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 18c86f82f9c..a2a909968dc 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -5,8 +5,8 @@ import numpy as np from cudf.core.column import as_column -from cudf.core.frame import Frame from cudf.core.index import Index, RangeIndex +from cudf.core.indexed_frame import IndexedFrame from cudf.core.series import Series @@ -92,7 +92,7 @@ def _index_or_values_interpolation(column, index=None): if num_nan == 0 or num_nan == len(column): return column - to_interp = Frame(data={None: column}, index=index) + to_interp = IndexedFrame(data={None: column}, index=index) known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask)) known_x = known_x_and_y._index._column.values diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c1e037499fc..a966276842f 100644 --- a/python/cudf/cudf/core/column/column.py +++ 
b/python/cudf/cudf/core/column/column.py @@ -35,6 +35,7 @@ ) from cudf._lib.scalar import as_device_scalar from cudf._lib.stream_compaction import ( + apply_boolean_mask, distinct_count as cpp_distinct_count, drop_duplicates, drop_nulls, @@ -997,9 +998,12 @@ def as_decimal32_column( raise NotImplementedError def apply_boolean_mask(self, mask) -> ColumnBase: - mask = as_column(mask, dtype="bool") - return ( - self.as_frame()._apply_boolean_mask(boolean_mask=mask)._as_column() + mask = as_column(mask) + if not is_bool_dtype(mask.dtype): + raise ValueError("boolean_mask is not boolean type.") + + return apply_boolean_mask([self], mask)[0]._with_type_metadata( + self.dtype ) def argsort( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0345966d6bd..6e47c0f41cf 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1461,19 +1461,6 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): return self[out_cols] - def _apply_boolean_mask(self, boolean_mask): - """ - Applies boolean mask to each row of `self`, - rows corresponding to `False` is dropped - """ - result = self.__class__._from_data( - *libcudf.stream_compaction.apply_boolean_mask( - self, as_column(boolean_mask) - ) - ) - result._copy_type_metadata(self) - return result - def interpolate( self, method="linear", diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 2f4d4a88195..7c5783bf637 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -19,6 +19,7 @@ from cudf._typing import ColumnLike from cudf.api.types import ( _is_non_decimal_numeric_dtype, + is_bool_dtype, is_categorical_dtype, is_integer_dtype, is_list_like, @@ -1197,6 +1198,25 @@ def resample( else cudf.core.resample.DataFrameResampler(self, by=by) ) + def _apply_boolean_mask(self, boolean_mask): + """Apply boolean mask to each row of `self`. 
+ + Rows corresponding to `False` are dropped. + """ + boolean_mask = cudf.core.column.as_column(boolean_mask) + if not is_bool_dtype(boolean_mask.dtype): + raise ValueError("boolean_mask is not boolean type.") + + result = self.__class__._from_columns( + libcudf.stream_compaction.apply_boolean_mask( + list(self._index._columns + self._columns), boolean_mask + ), + column_names=self._column_names, + index_names=self._index.names, + ) + result._copy_type_metadata(self) + return result + def _reset_index(self, level, drop, col_level=0, col_fill=""): """Shared path for DataFrame.reset_index and Series.reset_index.""" if level is not None and not isinstance(level, (tuple, list)):