Skip to content

Commit

Permalink
Use list of column inputs for apply_boolean_mask (#9832)
Browse files Browse the repository at this point in the history
This PR brings the changes from #9558 to `apply_boolean_mask` and removes the `as_frame` -> `as_column` round trip. Benchmark results for the column method:

```
------------------------------------- benchmark 'col0': 2 tests -------------------------------------
Name (time in us)                               Min                 Max                Mean          
-----------------------------------------------------------------------------------------------------
column_apply_boolean_mask[col0] (afte)      87.0090 (1.0)      132.8980 (1.0)       95.8815 (1.0)    
column_apply_boolean_mask[col0] (befo)     210.4580 (2.42)     307.8270 (2.32)     225.4821 (2.35)   
-----------------------------------------------------------------------------------------------------

------------------------------------- benchmark 'col1': 2 tests -------------------------------------
Name (time in us)                               Min                 Max                Mean          
-----------------------------------------------------------------------------------------------------
column_apply_boolean_mask[col1] (afte)      74.2240 (1.0)      110.0600 (1.0)       75.6356 (1.0)    
column_apply_boolean_mask[col1] (befo)     172.5240 (2.32)     278.5250 (2.53)     176.5672 (2.33)   
-----------------------------------------------------------------------------------------------------

------------------------------------- benchmark 'col2': 2 tests -------------------------------------
Name (time in us)                               Min                 Max                Mean          
-----------------------------------------------------------------------------------------------------
column_apply_boolean_mask[col2] (afte)     101.5740 (1.0)      141.8850 (1.0)      110.2334 (1.0)    
column_apply_boolean_mask[col2] (befo)     234.1140 (2.30)     312.7140 (2.20)     245.5453 (2.23)   
-----------------------------------------------------------------------------------------------------

------------------------------------- benchmark 'col3': 2 tests -------------------------------------
Name (time in us)                               Min                 Max                Mean          
-----------------------------------------------------------------------------------------------------
column_apply_boolean_mask[col3] (afte)      88.7710 (1.0)      142.7500 (1.0)       90.5082 (1.0)    
column_apply_boolean_mask[col3] (befo)     195.0980 (2.20)     303.1020 (2.12)     199.8368 (2.21)   
-----------------------------------------------------------------------------------------------------
```

DataFrame benchmark:
```
----------------------------------- benchmark '100': 2 tests -----------------------------------
Name (time in us)                          Min                 Max                Mean          
------------------------------------------------------------------------------------------------
df_apply_boolean_mask[100] (afte)     380.6770 (1.05)     654.7080 (1.18)     389.3374 (1.03)   
df_apply_boolean_mask[100] (befo)     362.3220 (1.0)      554.6130 (1.0)      378.7087 (1.0)    
------------------------------------------------------------------------------------------------

----------------------------------- benchmark '10000': 2 tests -----------------------------------
Name (time in us)                            Min                 Max                Mean          
--------------------------------------------------------------------------------------------------
df_apply_boolean_mask[10000] (afte)     399.5240 (1.05)     461.6310 (1.0)      405.1225 (1.04)   
df_apply_boolean_mask[10000] (befo)     379.4080 (1.0)      564.5770 (1.22)     389.6990 (1.0)    
--------------------------------------------------------------------------------------------------
```

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #9832
  • Loading branch information
isVoid authored Jan 11, 2022
1 parent 3216342 commit 813ac97
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 32 deletions.
18 changes: 5 additions & 13 deletions python/cudf/cudf/_lib/stream_compaction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -75,24 +75,22 @@ def drop_nulls(columns: list, how="any", keys=None, thresh=None):
return columns_from_unique_ptr(move(c_result))


def apply_boolean_mask(source_table, Column boolean_mask):
def apply_boolean_mask(columns: list, Column boolean_mask):
"""
Drops the rows which correspond to False in boolean_mask.
Parameters
----------
source_table : source table whose rows are dropped as per boolean_mask
columns : list of columns whose rows are dropped as per boolean_mask
boolean_mask : a boolean column with the same number of rows as the input
Returns
-------
Frame obtained from applying mask
columns obtained from applying mask
"""

assert pd.api.types.is_bool_dtype(boolean_mask.dtype)

cdef unique_ptr[table] c_result
cdef table_view source_table_view = table_view_from_table(source_table)
cdef table_view source_table_view = table_view_from_columns(columns)
cdef column_view boolean_mask_view = boolean_mask.view()

with nogil:
Expand All @@ -103,13 +101,7 @@ def apply_boolean_mask(source_table, Column boolean_mask):
)
)

return data_from_unique_ptr(
move(c_result),
column_names=source_table._column_names,
index_names=(
None if source_table._index
is None else source_table._index_names)
)
return columns_from_unique_ptr(move(c_result))


def drop_duplicates(columns: list,
Expand Down
25 changes: 24 additions & 1 deletion python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,15 @@
import pandas as pd

import cudf
from cudf._lib.stream_compaction import apply_boolean_mask
from cudf._typing import DtypeObj
from cudf.api.types import is_dtype_equal, is_integer, is_list_like, is_scalar
from cudf.api.types import (
is_bool_dtype,
is_dtype_equal,
is_integer,
is_list_like,
is_scalar,
)
from cudf.core.abc import Serializable
from cudf.core.column import ColumnBase, column
from cudf.core.column_accessor import ColumnAccessor
Expand Down Expand Up @@ -1414,6 +1421,22 @@ def from_pandas(cls, index, nan_as_null=None):
def _constructor_expanddim(self):
return cudf.MultiIndex

def _apply_boolean_mask(self, boolean_mask):
"""Apply boolean mask to each row of `self`.
Rows corresponding to `False` are dropped.
Parameters
----------
boolean_mask : column-like
Converted with `as_column`; the resulting column must have a boolean dtype.
Returns
-------
A new object of the same class as `self`, built from the masked columns,
keeping `self`'s column names and type metadata.
Raises
------
ValueError
If `boolean_mask` does not have a boolean dtype after conversion.
"""
boolean_mask = cudf.core.column.as_column(boolean_mask)
if not is_bool_dtype(boolean_mask.dtype):
raise ValueError("boolean_mask is not boolean type.")

# The stream-compaction routine takes a plain list of columns, so the
# columns of `self` are passed directly and the result is rebuilt below.
result = self.__class__._from_columns(
apply_boolean_mask(list(self._columns), boolean_mask),
column_names=self._column_names,
)
# Copy type metadata from `self` onto the rebuilt object.
result._copy_type_metadata(self)
return result

def _split_columns_by_levels(self, levels):
if isinstance(levels, int) and levels > 0:
raise ValueError(f"Out of bound level: {levels}")
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import numpy as np

from cudf.core.column import as_column
from cudf.core.frame import Frame
from cudf.core.index import Index, RangeIndex
from cudf.core.indexed_frame import IndexedFrame
from cudf.core.series import Series


Expand Down Expand Up @@ -92,7 +92,7 @@ def _index_or_values_interpolation(column, index=None):
if num_nan == 0 or num_nan == len(column):
return column

to_interp = Frame(data={None: column}, index=index)
to_interp = IndexedFrame(data={None: column}, index=index)
known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask))

known_x = known_x_and_y._index._column.values
Expand Down
10 changes: 7 additions & 3 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
)
from cudf._lib.scalar import as_device_scalar
from cudf._lib.stream_compaction import (
apply_boolean_mask,
distinct_count as cpp_distinct_count,
drop_duplicates,
drop_nulls,
Expand Down Expand Up @@ -997,9 +998,12 @@ def as_decimal32_column(
raise NotImplementedError

def apply_boolean_mask(self, mask) -> ColumnBase:
mask = as_column(mask, dtype="bool")
return (
self.as_frame()._apply_boolean_mask(boolean_mask=mask)._as_column()
mask = as_column(mask)
if not is_bool_dtype(mask.dtype):
raise ValueError("boolean_mask is not boolean type.")

return apply_boolean_mask([self], mask)[0]._with_type_metadata(
self.dtype
)

def argsort(
Expand Down
13 changes: 0 additions & 13 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1461,19 +1461,6 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None):

return self[out_cols]

def _apply_boolean_mask(self, boolean_mask):
"""
Apply a boolean mask to each row of `self`;
rows corresponding to `False` are dropped.

`boolean_mask` is converted with `as_column` before being handed to
`libcudf.stream_compaction.apply_boolean_mask` together with `self`.
Returns a new object of the same class as `self` with `self`'s type
metadata copied onto it.
"""
# The libcudf call's result is unpacked positionally into `_from_data`.
result = self.__class__._from_data(
*libcudf.stream_compaction.apply_boolean_mask(
self, as_column(boolean_mask)
)
)
result._copy_type_metadata(self)
return result

def interpolate(
self,
method="linear",
Expand Down
20 changes: 20 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from cudf._typing import ColumnLike
from cudf.api.types import (
_is_non_decimal_numeric_dtype,
is_bool_dtype,
is_categorical_dtype,
is_integer_dtype,
is_list_like,
Expand Down Expand Up @@ -1197,6 +1198,25 @@ def resample(
else cudf.core.resample.DataFrameResampler(self, by=by)
)

def _apply_boolean_mask(self, boolean_mask):
"""Apply boolean mask to each row of `self`.
Rows corresponding to `False` are dropped.
Parameters
----------
boolean_mask : column-like
Converted with `as_column`; the resulting column must have a boolean dtype.
Returns
-------
A new object of the same class as `self`, built from the masked index and
data columns, keeping `self`'s column names, index names and type metadata.
Raises
------
ValueError
If `boolean_mask` does not have a boolean dtype after conversion.
"""
boolean_mask = cudf.core.column.as_column(boolean_mask)
if not is_bool_dtype(boolean_mask.dtype):
raise ValueError("boolean_mask is not boolean type.")

# Index columns are placed first, followed by the data columns, so the
# mask is applied to both in a single call.
# NOTE(review): presumably `_from_columns` uses `index_names` to split the
# leading columns back out into the index -- confirm against its definition.
result = self.__class__._from_columns(
libcudf.stream_compaction.apply_boolean_mask(
list(self._index._columns + self._columns), boolean_mask
),
column_names=self._column_names,
index_names=self._index.names,
)
# Copy type metadata from `self` onto the rebuilt object.
result._copy_type_metadata(self)
return result

def _reset_index(self, level, drop, col_level=0, col_fill=""):
"""Shared path for DataFrame.reset_index and Series.reset_index."""
if level is not None and not isinstance(level, (tuple, list)):
Expand Down

0 comments on commit 813ac97

Please sign in to comment.