diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 45e0fc345b5..f7c9f6df8ad 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -24,7 +24,6 @@ set(cython_sources interop.pyx json.pyx merge.pyx - null_mask.pyx orc.pyx parquet.pyx reduce.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index c51db601985..7474c4e8cd1 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -11,7 +11,6 @@ interop, json, merge, - null_mask, nvtext, orc, parquet, diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 94dbdf5534d..9cbe11d61ac 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -11,7 +11,6 @@ import pylibcudf import rmm import cudf -import cudf._lib as libcudf from cudf.core.buffer import ( Buffer, ExposureTrackedBuffer, @@ -36,7 +35,6 @@ from cudf._lib.types cimport ( dtype_to_pylibcudf_type, ) -from cudf._lib.null_mask import bitmask_allocation_size_bytes from cudf._lib.types import dtype_from_pylibcudf_column cimport pylibcudf.libcudf.copying as cpp_copying @@ -159,7 +157,10 @@ cdef class Column: if self.base_mask is None or self.offset == 0: self._mask = self.base_mask else: - self._mask = libcudf.null_mask.copy_bitmask(self) + with acquire_spill_lock(): + self._mask = as_buffer( + pylibcudf.null_mask.copy_bitmask(self.to_pylibcudf(mode="read")) + ) return self._mask @property @@ -183,7 +184,9 @@ cdef class Column: if value is not None: # bitmask size must be relative to offset = 0 data. - required_size = bitmask_allocation_size_bytes(self.base_size) + required_size = pylibcudf.null_mask.bitmask_allocation_size_bytes( + self.base_size + ) if value.size < required_size: error_msg = ( "The Buffer for mask is smaller than expected, " @@ -220,7 +223,7 @@ cdef class Column: and compute new data Buffers zero-copy that use pointer arithmetic to properly adjust the pointer. """ - mask_size = bitmask_allocation_size_bytes(self.size) + mask_size = pylibcudf.null_mask.bitmask_allocation_size_bytes(self.size) required_num_bytes = -(-self.size // 8) # ceiling divide error_msg = ( "The value for mask is smaller than expected, got {} bytes, " @@ -790,13 +793,17 @@ cdef class Column: mask = as_buffer( rmm.DeviceBuffer( ptr=mask_ptr, - size=bitmask_allocation_size_bytes(base_size) + size=pylibcudf.null_mask.bitmask_allocation_size_bytes( + base_size + ) ) ) else: mask = as_buffer( data=mask_ptr, - size=bitmask_allocation_size_bytes(base_size), + size=pylibcudf.null_mask.bitmask_allocation_size_bytes( + base_size + ), owner=mask_owner, exposed=True ) diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx deleted file mode 100644 index d54e8e66281..00000000000 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf -from pylibcudf.null_mask import MaskState - -from cudf.core.buffer import acquire_spill_lock, as_buffer - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def copy_bitmask(Column col): - """ - Copies column's validity mask buffer into a new buffer, shifting by the - offset if nonzero - """ - if col.base_mask is None: - return None - - rmm_db = pylibcudf.null_mask.copy_bitmask(col.to_pylibcudf(mode="read")) - buf = as_buffer(rmm_db) - return buf - - -def bitmask_allocation_size_bytes(num_bits): - """ - Given a size, calculates the number of bytes that should be allocated for a - column validity mask - """ - return pylibcudf.null_mask.bitmask_allocation_size_bytes(num_bits) - - -def create_null_mask(size, state=MaskState.UNINITIALIZED): - """ - Given a size and a mask state, allocate a mask that can properly represent - the given size with the given mask state - - Parameters - ---------- - size : int - Number of elements the mask needs to be able to represent - state : ``MaskState``, default ``MaskState.UNINITIALIZED`` - State the null mask should be created in - """ - rmm_db = pylibcudf.null_mask.create_null_mask(size, state) - buf = as_buffer(rmm_db) - return buf - - -@acquire_spill_lock() -def bitmask_and(list columns): - rmm_db, other = pylibcudf.null_mask.bitmask_and( - [col.to_pylibcudf(mode="read") for col in columns] - ) - buf = as_buffer(rmm_db) - return buf, other - - -@acquire_spill_lock() -def bitmask_or(list columns): - rmm_db, other = pylibcudf.null_mask.bitmask_or( - [col.to_pylibcudf(mode="read") for col in columns] - ) - buf = as_buffer(rmm_db) - return buf, other diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f0df4a3c1b3..8ddfd4a54ae 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -25,11 +25,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.null_mask import ( - MaskState, - bitmask_allocation_size_bytes, - create_null_mask, -) from cudf._lib.scalar import as_device_scalar from cudf._lib.stream_compaction import ( apply_boolean_mask, @@ -383,7 +378,7 @@ def memory_usage(self) -> int: if self.data is not None: n += self.data.size if self.nullable: - n += bitmask_allocation_size_bytes(self.size) + n += plc.null_mask.bitmask_allocation_size_bytes(self.size) return n def _fill( @@ -410,7 +405,11 @@ def _fill( ) if not slr.is_valid() and not self.nullable: - mask = create_null_mask(self.size, state=MaskState.ALL_VALID) + mask = as_buffer( + plc.null_mask.create_null_mask( + self.size, plc.null_mask.MaskState.ALL_VALID + ) + ) self.set_base_mask(mask) libcudf.filling.fill_in_place(self, begin, end, slr.device_value) @@ -1553,7 +1552,11 @@ def column_empty( data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) if masked: - mask = create_null_mask(row_count, state=MaskState.ALL_NULL) + mask = as_buffer( + plc.null_mask.create_null_mask( + row_count, plc.null_mask.MaskState.ALL_NULL + ) + ) else: mask = None @@ -2210,7 +2213,9 @@ def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer: typestr = desc["typestr"] typecode = typestr[1] if typecode == "t": - mask_size = bitmask_allocation_size_bytes(desc["shape"][0]) + mask_size = plc.null_mask.bitmask_allocation_size_bytes( + desc["shape"][0] + ) return as_buffer(data=desc["data"][0], size=mask_size, owner=obj) elif typecode == "b": col = as_column(cai_mask) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 9962663e811..42df5123014 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -69,10 +69,7 @@ def __init__( @cached_property def memory_usage(self): - n = 0 - if self.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) - + n = super().memory_usage child0_size = (self.size + 1) * self.base_children[0].dtype.itemsize current_base_child = self.base_children[1] current_offset = self.offset @@ -97,7 +94,7 @@ def memory_usage(self): ) * current_base_child.dtype.itemsize if current_base_child.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes( + n += plc.null_mask.bitmask_allocation_size_bytes( current_base_child.size ) return n diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index a9ab2d373fd..47763063c4c 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5750,17 +5750,13 @@ def end_offset(self) -> int: @cached_property def memory_usage(self) -> int: - n = 0 - if self.data is not None: - n += self.data.size + n = super().memory_usage if len(self.base_children) == 1: child0_size = (self.size + 1) * self.base_children[ 0 ].dtype.itemsize n += child0_size - if self.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) return n @property diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 8f16ba4e15b..2adc6b54bab 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -101,10 +101,7 @@ def to_pandas( @cached_property def memory_usage(self) -> int: - n = 0 - if self.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) - + n = super().memory_usage for child in self.children: n += child.memory_usage diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 73c0af45293..b58ab13be93 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -45,7 +45,7 @@ from cudf.core import column, df_protocol, indexing_utils, reshape from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable -from cudf.core.buffer import acquire_spill_lock +from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column import ( CategoricalColumn, ColumnBase, @@ -3191,9 +3191,10 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): out.append(result._with_type_metadata(col.dtype)) else: - out_mask = cudf._lib.null_mask.create_null_mask( - len(source_col), - state=cudf._lib.null_mask.MaskState.ALL_NULL, + out_mask = as_buffer( + plc.null_mask.create_null_mask( + len(source_col), plc.null_mask.MaskState.ALL_NULL + ) ) out.append(source_col.set_mask(out_mask)) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 315324c130c..e977f037b79 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -19,7 +19,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib import groupby as libgroupby -from cudf._lib.null_mask import bitmask_or from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default @@ -1118,8 +1117,7 @@ def ngroup(self, ascending=True): """ index = self.grouping.keys.unique().sort_values() num_groups = len(index) - _, has_null_group = bitmask_or([*index._columns]) - + has_null_group = any(col.has_nulls() for col in index._columns) if ascending: # Count ascending from 0 to num_groups - 1 groups = range(num_groups) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index a5dc8a5498c..6624a1a150e 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -15,8 +15,9 @@ from numba.cuda.cudadecl import registry as cuda_decl_registry from numba.cuda.cudaimpl import lower as cuda_lower +import pylibcudf as plc + import cudf -from cudf._lib.null_mask import bitmask_allocation_size_bytes from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.core.udf.strings_lowering import cast_string_view_to_udf_string from cudf.core.udf.strings_typing import StringView, string_view, udf_string @@ -91,7 +92,7 @@ def random_bitmask(size): size : int number of bits """ - sz = bitmask_allocation_size_bytes(size) + sz = plc.null_mask.bitmask_allocation_size_bytes(size) rng = np.random.default_rng(seed=0) data = rng.integers(0, 255, dtype="u1", size=sz) return data.view("i1") diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 294253cd119..e6d252b8807 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -11,6 +11,7 @@ import numpy as np import pandas as pd +import pylibcudf as plc import rmm import cudf @@ -252,7 +253,7 @@ def pa_mask_buffer_to_mask(mask_buf, size): """ Convert PyArrow mask buffer to cuDF mask buffer """ - mask_size = cudf._lib.null_mask.bitmask_allocation_size_bytes(size) + mask_size = plc.null_mask.bitmask_allocation_size_bytes(size) if mask_buf.size < mask_size: dbuf = rmm.DeviceBuffer(size=mask_size) dbuf.copy_from_host(np.asarray(mask_buf).view("u1"))