Use list of column inputs for apply_boolean_mask (#9832)

This PR brings changes from #9558 to `apply_boolean_mask` and removes the `as_frame` -> `as_column` round trip. Benchmark the column method: ``` ------------------------------------- benchmark 'col0': 2 tests ------------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------- column_apply_boolean_mask[col0] (afte) 87.0090 (1.0) 132.8980 (1.0) 95.8815 (1.0) column_apply_boolean_mask[col0] (befo) 210.4580 (2.42) 307.8270 (2.32) 225.4821 (2.35) ----------------------------------------------------------------------------------------------------- ------------------------------------- benchmark 'col1': 2 tests ------------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------- column_apply_boolean_mask[col1] (afte) 74.2240 (1.0) 110.0600 (1.0) 75.6356 (1.0) column_apply_boolean_mask[col1] (befo) 172.5240 (2.32) 278.5250 (2.53) 176.5672 (2.33) ----------------------------------------------------------------------------------------------------- ------------------------------------- benchmark 'col2': 2 tests ------------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------- column_apply_boolean_mask[col2] (afte) 101.5740 (1.0) 141.8850 (1.0) 110.2334 (1.0) column_apply_boolean_mask[col2] (befo) 234.1140 (2.30) 312.7140 (2.20) 245.5453 (2.23) ----------------------------------------------------------------------------------------------------- ------------------------------------- benchmark 'col3': 2 tests ------------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------- column_apply_boolean_mask[col3] (afte) 88.7710 (1.0) 142.7500 (1.0) 90.5082 (1.0) column_apply_boolean_mask[col3] (befo) 195.0980 (2.20) 303.1020 (2.12) 199.8368 (2.21) ----------------------------------------------------------------------------------------------------- ``` Dataframe benchmark ``` ----------------------------------- benchmark '100': 2 tests ----------------------------------- Name (time in us) Min Max Mean ------------------------------------------------------------------------------------------------ df_apply_boolean_mask[100] (afte) 380.6770 (1.05) 654.7080 (1.18) 389.3374 (1.03) df_apply_boolean_mask[100] (befo) 362.3220 (1.0) 554.6130 (1.0) 378.7087 (1.0) ------------------------------------------------------------------------------------------------ ----------------------------------- benchmark '10000': 2 tests ----------------------------------- Name (time in us) Min Max Mean -------------------------------------------------------------------------------------------------- df_apply_boolean_mask[10000] (afte) 399.5240 (1.05) 461.6310 (1.0) 405.1225 (1.04) df_apply_boolean_mask[10000] (befo) 379.4080 (1.0) 564.5770 (1.22) 389.6990 (1.0) -------------------------------------------------------------------------------------------------- ``` Authors: - Michael Wang (https://github.com/isVoid) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) URL: #9832
rapidsai · Jan 11, 2022 · 813ac97 · 813ac97
1 parent 3216342
commit 813ac97
Show file tree

Hide file tree

Showing 6 changed files with 58 additions and 32 deletions.
diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx
@@ -75,24 +75,22 @@ def drop_nulls(columns: list, how="any", keys=None, thresh=None):
     return columns_from_unique_ptr(move(c_result))
 
 
-def apply_boolean_mask(source_table, Column boolean_mask):
+def apply_boolean_mask(columns: list, Column boolean_mask):
     """
     Drops the rows which correspond to False in boolean_mask.
 
     Parameters
     ----------
-    source_table : source table whose rows are dropped as per boolean_mask
+    columns : list of columns whose rows are dropped as per boolean_mask
     boolean_mask : a boolean column of same size as source_table
 
     Returns
     -------
-    Frame obtained from applying mask
+    columns obtained from applying mask
     """
 
-    assert pd.api.types.is_bool_dtype(boolean_mask.dtype)
-
     cdef unique_ptr[table] c_result
-    cdef table_view source_table_view = table_view_from_table(source_table)
+    cdef table_view source_table_view = table_view_from_columns(columns)
     cdef column_view boolean_mask_view = boolean_mask.view()
 
     with nogil:
@@ -103,13 +101,7 @@ def apply_boolean_mask(source_table, Column boolean_mask):
             )
         )
 
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=source_table._column_names,
-        index_names=(
-            None if source_table._index
-            is None else source_table._index_names)
-    )
+    return columns_from_unique_ptr(move(c_result))
 
 
 def drop_duplicates(columns: list,

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
@@ -9,8 +9,15 @@
 import pandas as pd
 
 import cudf
+from cudf._lib.stream_compaction import apply_boolean_mask
 from cudf._typing import DtypeObj
-from cudf.api.types import is_dtype_equal, is_integer, is_list_like, is_scalar
+from cudf.api.types import (
+    is_bool_dtype,
+    is_dtype_equal,
+    is_integer,
+    is_list_like,
+    is_scalar,
+)
 from cudf.core.abc import Serializable
 from cudf.core.column import ColumnBase, column
 from cudf.core.column_accessor import ColumnAccessor
@@ -1414,6 +1421,22 @@ def from_pandas(cls, index, nan_as_null=None):
     def _constructor_expanddim(self):
         return cudf.MultiIndex
 
+    def _apply_boolean_mask(self, boolean_mask):
+        """Apply boolean mask to each row of `self`.
+
+        Rows corresponding to `False` is dropped.
+        """
+        boolean_mask = cudf.core.column.as_column(boolean_mask)
+        if not is_bool_dtype(boolean_mask.dtype):
+            raise ValueError("boolean_mask is not boolean type.")
+
+        result = self.__class__._from_columns(
+            apply_boolean_mask(list(self._columns), boolean_mask),
+            column_names=self._column_names,
+        )
+        result._copy_type_metadata(self)
+        return result
+
     def _split_columns_by_levels(self, levels):
         if isinstance(levels, int) and levels > 0:
             raise ValueError(f"Out of bound level: {levels}")

diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
@@ -5,8 +5,8 @@
 import numpy as np
 
 from cudf.core.column import as_column
-from cudf.core.frame import Frame
 from cudf.core.index import Index, RangeIndex
+from cudf.core.indexed_frame import IndexedFrame
 from cudf.core.series import Series
 
 
@@ -92,7 +92,7 @@ def _index_or_values_interpolation(column, index=None):
     if num_nan == 0 or num_nan == len(column):
         return column
 
-    to_interp = Frame(data={None: column}, index=index)
+    to_interp = IndexedFrame(data={None: column}, index=index)
     known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask))
 
     known_x = known_x_and_y._index._column.values

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -35,6 +35,7 @@
 )
 from cudf._lib.scalar import as_device_scalar
 from cudf._lib.stream_compaction import (
+    apply_boolean_mask,
     distinct_count as cpp_distinct_count,
     drop_duplicates,
     drop_nulls,
@@ -997,9 +998,12 @@ def as_decimal32_column(
         raise NotImplementedError
 
     def apply_boolean_mask(self, mask) -> ColumnBase:
-        mask = as_column(mask, dtype="bool")
-        return (
-            self.as_frame()._apply_boolean_mask(boolean_mask=mask)._as_column()
+        mask = as_column(mask)
+        if not is_bool_dtype(mask.dtype):
+            raise ValueError("boolean_mask is not boolean type.")
+
+        return apply_boolean_mask([self], mask)[0]._with_type_metadata(
+            self.dtype
         )
 
     def argsort(

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
@@ -1461,19 +1461,6 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None):
 
         return self[out_cols]
 
-    def _apply_boolean_mask(self, boolean_mask):
-        """
-        Applies boolean mask to each row of `self`,
-        rows corresponding to `False` is dropped
-        """
-        result = self.__class__._from_data(
-            *libcudf.stream_compaction.apply_boolean_mask(
-                self, as_column(boolean_mask)
-            )
-        )
-        result._copy_type_metadata(self)
-        return result
-
     def interpolate(
         self,
         method="linear",

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
@@ -19,6 +19,7 @@
 from cudf._typing import ColumnLike
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
+    is_bool_dtype,
     is_categorical_dtype,
     is_integer_dtype,
     is_list_like,
@@ -1197,6 +1198,25 @@ def resample(
             else cudf.core.resample.DataFrameResampler(self, by=by)
         )
 
+    def _apply_boolean_mask(self, boolean_mask):
+        """Apply boolean mask to each row of `self`.
+
+        Rows corresponding to `False` is dropped.
+        """
+        boolean_mask = cudf.core.column.as_column(boolean_mask)
+        if not is_bool_dtype(boolean_mask.dtype):
+            raise ValueError("boolean_mask is not boolean type.")
+
+        result = self.__class__._from_columns(
+            libcudf.stream_compaction.apply_boolean_mask(
+                list(self._index._columns + self._columns), boolean_mask
+            ),
+            column_names=self._column_names,
+            index_names=self._index.names,
+        )
+        result._copy_type_metadata(self)
+        return result
+
     def _reset_index(self, level, drop, col_level=0, col_fill=""):
         """Shared path for DataFrame.reset_index and Series.reset_index."""
         if level is not None and not isinstance(level, (tuple, list)):