From 813ac97b2143c8d1d8ca95435863f5234408a681 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Tue, 11 Jan 2022 15:16:25 -0800
Subject: [PATCH] Use list of column inputs for `apply_boolean_mask` (#9832)

This PR brings changes from #9558 to `apply_boolean_mask` and removes the `as_frame` -> `as_column` round trip. Benchmark the column method:

```
------------------------------------- benchmark 'col0': 2 tests -------------------------------------
Name (time in us)                               Min                 Max                Mean
-----------------------------------------------------------------------------------------------------
column_apply_boolean_mask[col0] (afte)      87.0090 (1.0)      132.8980 (1.0)       95.8815 (1.0)
column_apply_boolean_mask[col0] (befo)     210.4580 (2.42)     307.8270 (2.32)     225.4821 (2.35)
-----------------------------------------------------------------------------------------------------

------------------------------------- benchmark 'col1': 2 tests -------------------------------------
Name (time in us)                               Min                 Max                Mean
-----------------------------------------------------------------------------------------------------
column_apply_boolean_mask[col1] (afte)      74.2240 (1.0)      110.0600 (1.0)       75.6356 (1.0)
column_apply_boolean_mask[col1] (befo)     172.5240 (2.32)     278.5250 (2.53)     176.5672 (2.33)
-----------------------------------------------------------------------------------------------------

------------------------------------- benchmark 'col2': 2 tests -------------------------------------
Name (time in us)                               Min                 Max                Mean
-----------------------------------------------------------------------------------------------------
column_apply_boolean_mask[col2] (afte)     101.5740 (1.0)      141.8850 (1.0)      110.2334 (1.0)
column_apply_boolean_mask[col2] (befo)     234.1140 (2.30)     312.7140 (2.20)     245.5453 (2.23)
-----------------------------------------------------------------------------------------------------

------------------------------------- benchmark 'col3': 2 tests -------------------------------------
Name (time in us)                               Min                 Max                Mean
-----------------------------------------------------------------------------------------------------
column_apply_boolean_mask[col3] (afte)      88.7710 (1.0)      142.7500 (1.0)       90.5082 (1.0)
column_apply_boolean_mask[col3] (befo)     195.0980 (2.20)     303.1020 (2.12)     199.8368 (2.21)
-----------------------------------------------------------------------------------------------------
```

Dataframe benchmark
```
----------------------------------- benchmark '100': 2 tests -----------------------------------
Name (time in us)                          Min                 Max                Mean
------------------------------------------------------------------------------------------------
df_apply_boolean_mask[100] (afte)     380.6770 (1.05)     654.7080 (1.18)     389.3374 (1.03)
df_apply_boolean_mask[100] (befo)     362.3220 (1.0)      554.6130 (1.0)      378.7087 (1.0)
------------------------------------------------------------------------------------------------

----------------------------------- benchmark '10000': 2 tests -----------------------------------
Name (time in us)                            Min                 Max                Mean
--------------------------------------------------------------------------------------------------
df_apply_boolean_mask[10000] (afte)     399.5240 (1.05)     461.6310 (1.0)      405.1225 (1.04)
df_apply_boolean_mask[10000] (befo)     379.4080 (1.0)      564.5770 (1.22)     389.6990 (1.0)
--------------------------------------------------------------------------------------------------
```

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/9832
---
 python/cudf/cudf/_lib/stream_compaction.pyx | 18 +++++----------
 python/cudf/cudf/core/_base_index.py        | 25 ++++++++++++++++++++-
 python/cudf/cudf/core/algorithms.py         |  4 ++--
 python/cudf/cudf/core/column/column.py      | 10 ++++++---
 python/cudf/cudf/core/frame.py              | 13 -----------
 python/cudf/cudf/core/indexed_frame.py      | 20 +++++++++++++++++
 6 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx
index ef47e843723..4330c565982 100644
--- a/python/cudf/cudf/_lib/stream_compaction.pyx
+++ b/python/cudf/cudf/_lib/stream_compaction.pyx
@@ -75,24 +75,22 @@ def drop_nulls(columns: list, how="any", keys=None, thresh=None):
     return columns_from_unique_ptr(move(c_result))
 
 
-def apply_boolean_mask(source_table, Column boolean_mask):
+def apply_boolean_mask(columns: list, Column boolean_mask):
     """
     Drops the rows which correspond to False in boolean_mask.
 
     Parameters
     ----------
-    source_table : source table whose rows are dropped as per boolean_mask
+    columns : list of columns whose rows are dropped as per boolean_mask
     boolean_mask : a boolean column of same size as source_table
 
     Returns
     -------
-    Frame obtained from applying mask
+    columns obtained from applying mask
     """
 
-    assert pd.api.types.is_bool_dtype(boolean_mask.dtype)
-
     cdef unique_ptr[table] c_result
-    cdef table_view source_table_view = table_view_from_table(source_table)
+    cdef table_view source_table_view = table_view_from_columns(columns)
     cdef column_view boolean_mask_view = boolean_mask.view()
 
     with nogil:
@@ -103,13 +101,7 @@ def apply_boolean_mask(source_table, Column boolean_mask):
             )
         )
 
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=source_table._column_names,
-        index_names=(
-            None if source_table._index
-            is None else source_table._index_names)
-    )
+    return columns_from_unique_ptr(move(c_result))
 
 
 def drop_duplicates(columns: list,
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index aa89b8f849f..683f3fefe1c 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -9,8 +9,15 @@
 import pandas as pd
 
 import cudf
+from cudf._lib.stream_compaction import apply_boolean_mask
 from cudf._typing import DtypeObj
-from cudf.api.types import is_dtype_equal, is_integer, is_list_like, is_scalar
+from cudf.api.types import (
+    is_bool_dtype,
+    is_dtype_equal,
+    is_integer,
+    is_list_like,
+    is_scalar,
+)
 from cudf.core.abc import Serializable
 from cudf.core.column import ColumnBase, column
 from cudf.core.column_accessor import ColumnAccessor
@@ -1414,6 +1421,22 @@ def from_pandas(cls, index, nan_as_null=None):
     def _constructor_expanddim(self):
         return cudf.MultiIndex
 
+    def _apply_boolean_mask(self, boolean_mask):
+        """Apply boolean mask to each row of `self`.
+
+        Rows corresponding to `False` is dropped.
+        """
+        boolean_mask = cudf.core.column.as_column(boolean_mask)
+        if not is_bool_dtype(boolean_mask.dtype):
+            raise ValueError("boolean_mask is not boolean type.")
+
+        result = self.__class__._from_columns(
+            apply_boolean_mask(list(self._columns), boolean_mask),
+            column_names=self._column_names,
+        )
+        result._copy_type_metadata(self)
+        return result
+
     def _split_columns_by_levels(self, levels):
         if isinstance(levels, int) and levels > 0:
             raise ValueError(f"Out of bound level: {levels}")
diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index 18c86f82f9c..a2a909968dc 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -5,8 +5,8 @@
 import numpy as np
 
 from cudf.core.column import as_column
-from cudf.core.frame import Frame
 from cudf.core.index import Index, RangeIndex
+from cudf.core.indexed_frame import IndexedFrame
 from cudf.core.series import Series
 
 
@@ -92,7 +92,7 @@ def _index_or_values_interpolation(column, index=None):
     if num_nan == 0 or num_nan == len(column):
         return column
 
-    to_interp = Frame(data={None: column}, index=index)
+    to_interp = IndexedFrame(data={None: column}, index=index)
     known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask))
 
     known_x = known_x_and_y._index._column.values
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index c1e037499fc..a966276842f 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -35,6 +35,7 @@
 )
 from cudf._lib.scalar import as_device_scalar
 from cudf._lib.stream_compaction import (
+    apply_boolean_mask,
     distinct_count as cpp_distinct_count,
     drop_duplicates,
     drop_nulls,
@@ -997,9 +998,12 @@ def as_decimal32_column(
         raise NotImplementedError
 
     def apply_boolean_mask(self, mask) -> ColumnBase:
-        mask = as_column(mask, dtype="bool")
-        return (
-            self.as_frame()._apply_boolean_mask(boolean_mask=mask)._as_column()
+        mask = as_column(mask)
+        if not is_bool_dtype(mask.dtype):
+            raise ValueError("boolean_mask is not boolean type.")
+
+        return apply_boolean_mask([self], mask)[0]._with_type_metadata(
+            self.dtype
         )
 
     def argsort(
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 0345966d6bd..6e47c0f41cf 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1461,19 +1461,6 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None):
 
         return self[out_cols]
 
-    def _apply_boolean_mask(self, boolean_mask):
-        """
-        Applies boolean mask to each row of `self`,
-        rows corresponding to `False` is dropped
-        """
-        result = self.__class__._from_data(
-            *libcudf.stream_compaction.apply_boolean_mask(
-                self, as_column(boolean_mask)
-            )
-        )
-        result._copy_type_metadata(self)
-        return result
-
     def interpolate(
         self,
         method="linear",
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 2f4d4a88195..7c5783bf637 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -19,6 +19,7 @@
 from cudf._typing import ColumnLike
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
+    is_bool_dtype,
     is_categorical_dtype,
     is_integer_dtype,
     is_list_like,
@@ -1197,6 +1198,25 @@ def resample(
             else cudf.core.resample.DataFrameResampler(self, by=by)
         )
 
+    def _apply_boolean_mask(self, boolean_mask):
+        """Apply boolean mask to each row of `self`.
+
+        Rows corresponding to `False` is dropped.
+        """
+        boolean_mask = cudf.core.column.as_column(boolean_mask)
+        if not is_bool_dtype(boolean_mask.dtype):
+            raise ValueError("boolean_mask is not boolean type.")
+
+        result = self.__class__._from_columns(
+            libcudf.stream_compaction.apply_boolean_mask(
+                list(self._index._columns + self._columns), boolean_mask
+            ),
+            column_names=self._column_names,
+            index_names=self._index.names,
+        )
+        result._copy_type_metadata(self)
+        return result
+
     def _reset_index(self, level, drop, col_level=0, col_fill=""):
         """Shared path for DataFrame.reset_index and Series.reset_index."""
         if level is not None and not isinstance(level, (tuple, list)):