From 37a3bc3e6d2ebec3f5e1f40cafccc95be9c33b81 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 25 Nov 2024 11:59:58 -0800
Subject: [PATCH 1/4] Remove cudf._lib.null_mask in favor of inlining pylibcudf

---
 python/cudf/cudf/_lib/CMakeLists.txt     |  1 -
 python/cudf/cudf/_lib/__init__.py        |  1 -
 python/cudf/cudf/_lib/null_mask.pyx      | 65 ------------------------
 python/cudf/cudf/core/column/column.py   | 23 +++++----
 python/cudf/cudf/core/column/lists.py    |  9 ++--
 python/cudf/cudf/core/column/string.py   |  6 +--
 python/cudf/cudf/core/column/struct.py   |  5 +-
 python/cudf/cudf/core/dataframe.py       |  9 ++--
 python/cudf/cudf/core/groupby/groupby.py |  9 +++-
 python/cudf/cudf/testing/_utils.py       |  5 +-
 python/cudf/cudf/utils/utils.py          |  3 +-
 11 files changed, 37 insertions(+), 99 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/null_mask.pyx

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 1c2b24d2391..22d6abddeff 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -26,7 +26,6 @@ set(cython_sources
     json.pyx
     lists.pyx
     merge.pyx
-    null_mask.pyx
     orc.pyx
     parquet.pyx
     partitioning.pyx
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 13d05033c11..cad521073ad 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -12,7 +12,6 @@
     join,
     json,
     merge,
-    null_mask,
     nvtext,
     orc,
     parquet,
diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx
deleted file mode 100644
index d54e8e66281..00000000000
--- a/python/cudf/cudf/_lib/null_mask.pyx
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-import pylibcudf
-from pylibcudf.null_mask import MaskState
-
-from cudf.core.buffer import acquire_spill_lock, as_buffer
-
-from cudf._lib.column cimport Column
-
-
-@acquire_spill_lock()
-def copy_bitmask(Column col):
-    """
-    Copies column's validity mask buffer into a new buffer, shifting by the
-    offset if nonzero
-    """
-    if col.base_mask is None:
-        return None
-
-    rmm_db = pylibcudf.null_mask.copy_bitmask(col.to_pylibcudf(mode="read"))
-    buf = as_buffer(rmm_db)
-    return buf
-
-
-def bitmask_allocation_size_bytes(num_bits):
-    """
-    Given a size, calculates the number of bytes that should be allocated for a
-    column validity mask
-    """
-    return pylibcudf.null_mask.bitmask_allocation_size_bytes(num_bits)
-
-
-def create_null_mask(size, state=MaskState.UNINITIALIZED):
-    """
-    Given a size and a mask state, allocate a mask that can properly represent
-    the given size with the given mask state
-
-    Parameters
-    ----------
-    size : int
-        Number of elements the mask needs to be able to represent
-    state : ``MaskState``, default ``MaskState.UNINITIALIZED``
-        State the null mask should be created in
-    """
-    rmm_db = pylibcudf.null_mask.create_null_mask(size, state)
-    buf = as_buffer(rmm_db)
-    return buf
-
-
-@acquire_spill_lock()
-def bitmask_and(list columns):
-    rmm_db, other = pylibcudf.null_mask.bitmask_and(
-        [col.to_pylibcudf(mode="read") for col in columns]
-    )
-    buf = as_buffer(rmm_db)
-    return buf, other
-
-
-@acquire_spill_lock()
-def bitmask_or(list columns):
-    rmm_db, other = pylibcudf.null_mask.bitmask_or(
-        [col.to_pylibcudf(mode="read") for col in columns]
-    )
-    buf = as_buffer(rmm_db)
-    return buf, other
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 53946be1c49..1cb73464854 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -25,11 +25,6 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.column import Column
-from cudf._lib.null_mask import (
-    MaskState,
-    bitmask_allocation_size_bytes,
-    create_null_mask,
-)
 from cudf._lib.scalar import as_device_scalar
 from cudf._lib.stream_compaction import (
     apply_boolean_mask,
@@ -383,7 +378,7 @@ def memory_usage(self) -> int:
         if self.data is not None:
             n += self.data.size
         if self.nullable:
-            n += bitmask_allocation_size_bytes(self.size)
+            n += plc.null_mask.bitmask_allocation_size_bytes(self.size)
         return n
 
     def _fill(
@@ -410,7 +405,11 @@ def _fill(
             )
 
         if not slr.is_valid() and not self.nullable:
-            mask = create_null_mask(self.size, state=MaskState.ALL_VALID)
+            mask = as_buffer(
+                plc.null_mask.create_null_mask(
+                    self.size, plc.null_mask.MaskState.ALL_VALID
+                )
+            )
             self.set_base_mask(mask)
 
         libcudf.filling.fill_in_place(self, begin, end, slr.device_value)
@@ -1549,7 +1548,11 @@ def column_empty(
         data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize))
 
     if masked:
-        mask = create_null_mask(row_count, state=MaskState.ALL_NULL)
+        mask = as_buffer(
+            plc.null_mask.create_null_mask(
+                row_count, plc.null_mask.MaskState.ALL_NULL
+            )
+        )
     else:
         mask = None
 
@@ -2206,7 +2209,9 @@ def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer:
     typestr = desc["typestr"]
     typecode = typestr[1]
     if typecode == "t":
-        mask_size = bitmask_allocation_size_bytes(desc["shape"][0])
+        mask_size = plc.null_mask.bitmask_allocation_size_bytes(
+            desc["shape"][0]
+        )
         return as_buffer(data=desc["data"][0], size=mask_size, owner=obj)
     elif typecode == "b":
         col = as_column(cai_mask)
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 6b25e568f00..55b31e43818 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -10,6 +10,8 @@
 import pyarrow as pa
 from typing_extensions import Self
 
+import pylibcudf as plc
+
 import cudf
 from cudf._lib.lists import (
     concatenate_list_elements,
@@ -79,10 +81,7 @@ def __init__(
 
     @cached_property
     def memory_usage(self):
-        n = 0
-        if self.nullable:
-            n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size)
-
+        n = super().memory_usage
         child0_size = (self.size + 1) * self.base_children[0].dtype.itemsize
         current_base_child = self.base_children[1]
         current_offset = self.offset
@@ -107,7 +106,7 @@ def memory_usage(self):
         ) * current_base_child.dtype.itemsize
 
         if current_base_child.nullable:
-            n += cudf._lib.null_mask.bitmask_allocation_size_bytes(
+            n += plc.null_mask.bitmask_allocation_size_bytes(
                 current_base_child.size
             )
         return n
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index a9ab2d373fd..47763063c4c 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5750,17 +5750,13 @@ def end_offset(self) -> int:
 
     @cached_property
     def memory_usage(self) -> int:
-        n = 0
-        if self.data is not None:
-            n += self.data.size
+        n = super().memory_usage
         if len(self.base_children) == 1:
             child0_size = (self.size + 1) * self.base_children[
                 0
             ].dtype.itemsize
 
             n += child0_size
-        if self.nullable:
-            n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size)
         return n
 
     @property
diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py
index 8f16ba4e15b..2adc6b54bab 100644
--- a/python/cudf/cudf/core/column/struct.py
+++ b/python/cudf/cudf/core/column/struct.py
@@ -101,10 +101,7 @@ def to_pandas(
 
     @cached_property
     def memory_usage(self) -> int:
-        n = 0
-        if self.nullable:
-            n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size)
-
+        n = super().memory_usage
         for child in self.children:
             n += child.memory_usage
 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 728cc47a7c9..abc04df182b 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -45,7 +45,7 @@
 from cudf.core import column, df_protocol, indexing_utils, reshape
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.abc import Serializable
-from cudf.core.buffer import acquire_spill_lock
+from cudf.core.buffer import acquire_spill_lock, as_buffer
 from cudf.core.column import (
     CategoricalColumn,
     ColumnBase,
@@ -3156,9 +3156,10 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None):
 
                 out.append(result._with_type_metadata(col.dtype))
             else:
-                out_mask = cudf._lib.null_mask.create_null_mask(
-                    len(source_col),
-                    state=cudf._lib.null_mask.MaskState.ALL_NULL,
+                out_mask = as_buffer(
+                    plc.null_mask.create_null_mask(
+                        len(source_col), plc.null_mask.MaskState.ALL_NULL
+                    )
                 )
                 out.append(source_col.set_mask(out_mask))
 
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index e59b948aba9..77a0f26dc70 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -14,10 +14,11 @@
 import numpy as np
 import pandas as pd
 
+import pylibcudf as plc
+
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib import groupby as libgroupby
-from cudf._lib.null_mask import bitmask_or
 from cudf._lib.reshape import interleave_columns
 from cudf._lib.sort import segmented_sort_by_key
 from cudf._lib.types import size_type_dtype
@@ -25,6 +26,7 @@
 from cudf.api.types import is_list_like, is_numeric_dtype
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.abc import Serializable
+from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column.column import ColumnBase, StructDtype, as_column
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.copy_types import GatherMap
@@ -1103,7 +1105,10 @@ def ngroup(self, ascending=True):
         """
         index = self.grouping.keys.unique().sort_values()
         num_groups = len(index)
-        _, has_null_group = bitmask_or([*index._columns])
+        with acquire_spill_lock():
+            _, has_null_group = plc.null_mask.bitmask_or(
+                [col.to_pylibcudf(mode="read") for col in index._columns]
+            )
 
         if ascending:
             # Count ascending from 0 to num_groups - 1
diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index a5dc8a5498c..6624a1a150e 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -15,8 +15,9 @@
 from numba.cuda.cudadecl import registry as cuda_decl_registry
 from numba.cuda.cudaimpl import lower as cuda_lower
 
+import pylibcudf as plc
+
 import cudf
-from cudf._lib.null_mask import bitmask_allocation_size_bytes
 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
 from cudf.core.udf.strings_lowering import cast_string_view_to_udf_string
 from cudf.core.udf.strings_typing import StringView, string_view, udf_string
@@ -91,7 +92,7 @@ def random_bitmask(size):
     size : int
         number of bits
     """
-    sz = bitmask_allocation_size_bytes(size)
+    sz = plc.null_mask.bitmask_allocation_size_bytes(size)
     rng = np.random.default_rng(seed=0)
     data = rng.integers(0, 255, dtype="u1", size=sz)
     return data.view("i1")
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index 294253cd119..e6d252b8807 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -11,6 +11,7 @@
 import numpy as np
 import pandas as pd
 
+import pylibcudf as plc
 import rmm
 
 import cudf
@@ -252,7 +253,7 @@ def pa_mask_buffer_to_mask(mask_buf, size):
     """
     Convert PyArrow mask buffer to cuDF mask buffer
     """
-    mask_size = cudf._lib.null_mask.bitmask_allocation_size_bytes(size)
+    mask_size = plc.null_mask.bitmask_allocation_size_bytes(size)
     if mask_buf.size < mask_size:
         dbuf = rmm.DeviceBuffer(size=mask_size)
         dbuf.copy_from_host(np.asarray(mask_buf).view("u1"))

From 78de18e895a56ddc402e3c847f5d5b88312a2b24 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 25 Nov 2024 16:22:31 -0800
Subject: [PATCH 2/4] Another reference

---
 python/cudf/cudf/_lib/column.pyx | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 94dbdf5534d..b0c41b6a179 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -36,7 +36,6 @@ from cudf._lib.types cimport (
     dtype_to_pylibcudf_type,
 )
 
-from cudf._lib.null_mask import bitmask_allocation_size_bytes
 from cudf._lib.types import dtype_from_pylibcudf_column
 
 cimport pylibcudf.libcudf.copying as cpp_copying
@@ -183,7 +182,9 @@ cdef class Column:
 
         if value is not None:
             # bitmask size must be relative to offset = 0 data.
-            required_size = bitmask_allocation_size_bytes(self.base_size)
+            required_size = pylibcudf.null_mask.bitmask_allocation_size_bytes(
+                self.base_size
+            )
             if value.size < required_size:
                 error_msg = (
                     "The Buffer for mask is smaller than expected, "
@@ -220,7 +221,7 @@ cdef class Column:
         and compute new data Buffers zero-copy that use pointer arithmetic to
         properly adjust the pointer.
         """
-        mask_size = bitmask_allocation_size_bytes(self.size)
+        mask_size = pylibcudf.null_mask.bitmask_allocation_size_bytes(self.size)
         required_num_bytes = -(-self.size // 8)  # ceiling divide
         error_msg = (
             "The value for mask is smaller than expected, got {}  bytes, "
@@ -790,13 +791,17 @@ cdef class Column:
                     mask = as_buffer(
                         rmm.DeviceBuffer(
                             ptr=mask_ptr,
-                            size=bitmask_allocation_size_bytes(base_size)
+                            size=pylibcudf.null_mask.bitmask_allocation_size_bytes(
+                                base_size
+                            )
                         )
                     )
             else:
                 mask = as_buffer(
                     data=mask_ptr,
-                    size=bitmask_allocation_size_bytes(base_size),
+                    size=pylibcudf.null_mask.bitmask_allocation_size_bytes(
+                        base_size
+                    ),
                     owner=mask_owner,
                     exposed=True
                 )

From cc3eee5ec96fbd0ee288a01a48902d4963982158 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 25 Nov 2024 18:45:28 -0800
Subject: [PATCH 3/4] address copy_bitmask usage

---
 python/cudf/cudf/_lib/column.pyx | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index b0c41b6a179..9cbe11d61ac 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -11,7 +11,6 @@ import pylibcudf
 import rmm
 
 import cudf
-import cudf._lib as libcudf
 from cudf.core.buffer import (
     Buffer,
     ExposureTrackedBuffer,
@@ -158,7 +157,10 @@ cdef class Column:
             if self.base_mask is None or self.offset == 0:
                 self._mask = self.base_mask
             else:
-                self._mask = libcudf.null_mask.copy_bitmask(self)
+                with acquire_spill_lock():
+                    self._mask = as_buffer(
+                        pylibcudf.null_mask.copy_bitmask(self.to_pylibcudf(mode="read"))
+                    )
         return self._mask
 
     @property

From 125b844c3fc7ca0183cca4c62c340e4b22a97fdb Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 26 Nov 2024 10:59:18 -0800
Subject: [PATCH 4/4] Avoid bitmask_or

---
 python/cudf/cudf/core/groupby/groupby.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index ac0958aa7f0..18dd741a663 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1104,11 +1104,7 @@ def ngroup(self, ascending=True):
         """
         index = self.grouping.keys.unique().sort_values()
         num_groups = len(index)
-        with acquire_spill_lock():
-            _, has_null_group = plc.null_mask.bitmask_or(
-                [col.to_pylibcudf(mode="read") for col in index._columns]
-            )
-
+        has_null_group = any(col.has_nulls() for col in index._columns)
         if ascending:
             # Count ascending from 0 to num_groups - 1
             groups = range(num_groups)