diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 43e2d6031bc..c58bc42327c 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -342,6 +342,7 @@ def clean_all_xml_files(path): "cudf.Series": ("cudf.core.series.Series", "cudf.Series"), "cudf.Index": ("cudf.core.index.Index", "cudf.Index"), "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"), + "DeviceBuffer": ("rmm._lib.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"), } @@ -383,6 +384,7 @@ def _generate_namespaces(namespaces): # Cython types that don't alias cleanly because of # https://github.com/cython/cython/issues/5609 "size_type", + "size_t", "type_id", # Unknown base types "int32_t", diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 505765bba0f..6a2b66e8ea0 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -23,6 +23,7 @@ This page provides API documentation for pylibcudf. join lists merge + null_mask quantiles reduce replace diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/null_mask.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/null_mask.rst new file mode 100644 index 00000000000..4799c62eace --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/null_mask.rst @@ -0,0 +1,6 @@ +========= +null_mask +========= + +.. automodule:: pylibcudf.null_mask + :members: diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx index 3a7b6a59bf3..d54e8e66281 100644 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ b/python/cudf/cudf/_lib/null_mask.pyx @@ -1,39 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from enum import Enum - -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +import pylibcudf +from pylibcudf.null_mask import MaskState from cudf.core.buffer import acquire_spill_lock, as_buffer -from libcpp.memory cimport make_unique, unique_ptr -from libcpp.pair cimport pair -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.null_mask cimport ( - bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes, - bitmask_and as cpp_bitmask_and, - bitmask_or as cpp_bitmask_or, - copy_bitmask as cpp_copy_bitmask, - create_null_mask as cpp_create_null_mask, - underlying_type_t_mask_state, -) -from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport mask_state, size_type - from cudf._lib.column cimport Column -from cudf._lib.utils cimport table_view_from_columns - - -class MaskState(Enum): - """ - Enum for null mask creation state - """ - UNALLOCATED = mask_state.UNALLOCATED - UNINITIALIZED = mask_state.UNINITIALIZED - ALL_VALID = mask_state.ALL_VALID - ALL_NULL = mask_state.ALL_NULL @acquire_spill_lock() @@ -45,33 +17,20 @@ def copy_bitmask(Column col): if col.base_mask is None: return None - cdef column_view col_view = col.view() - cdef device_buffer db - cdef unique_ptr[device_buffer] up_db - - with nogil: - db = move(cpp_copy_bitmask(col_view)) - up_db = move(make_unique[device_buffer](move(db))) - - rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db)) + rmm_db = pylibcudf.null_mask.copy_bitmask(col.to_pylibcudf(mode="read")) buf = as_buffer(rmm_db) return buf -def bitmask_allocation_size_bytes(size_type num_bits): +def bitmask_allocation_size_bytes(num_bits): """ Given a size, calculates the number of bytes that should be allocated for a column validity mask """ - cdef size_t output_size - - with nogil: - output_size = cpp_bitmask_allocation_size_bytes(num_bits) + return pylibcudf.null_mask.bitmask_allocation_size_bytes(num_bits) - return output_size - -def create_null_mask(size_type size, state=MaskState.UNINITIALIZED): +def create_null_mask(size, state=MaskState.UNINITIALIZED): """ Given a size and a mask state, allocate a mask that can properly represent the given size with the given mask state @@ -83,48 +42,24 @@ def create_null_mask(size_type size, state=MaskState.UNINITIALIZED): state : ``MaskState``, default ``MaskState.UNINITIALIZED`` State the null mask should be created in """ - if not isinstance(state, MaskState): - raise TypeError( - "`state` is required to be of type `MaskState`, got " - + (type(state).__name__) - ) - - cdef device_buffer db - cdef unique_ptr[device_buffer] up_db - cdef mask_state c_mask_state = ( - (state.value) - ) - - with nogil: - db = move(cpp_create_null_mask(size, c_mask_state)) - up_db = move(make_unique[device_buffer](move(db))) - - rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db)) + rmm_db = pylibcudf.null_mask.create_null_mask(size, state) buf = as_buffer(rmm_db) return buf @acquire_spill_lock() -def bitmask_and(columns: list): - cdef table_view c_view = table_view_from_columns(columns) - cdef pair[device_buffer, size_type] c_result - cdef unique_ptr[device_buffer] up_db - with nogil: - c_result = move(cpp_bitmask_and(c_view)) - up_db = move(make_unique[device_buffer](move(c_result.first))) - dbuf = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_buffer(dbuf) - return buf, c_result.second +def bitmask_and(list columns): + rmm_db, other = pylibcudf.null_mask.bitmask_and( + [col.to_pylibcudf(mode="read") for col in columns] + ) + buf = as_buffer(rmm_db) + return buf, other @acquire_spill_lock() -def bitmask_or(columns: list): - cdef table_view c_view = table_view_from_columns(columns) - cdef pair[device_buffer, size_type] c_result - cdef unique_ptr[device_buffer] up_db - with nogil: - c_result = move(cpp_bitmask_or(c_view)) - up_db = move(make_unique[device_buffer](move(c_result.first))) - dbuf = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_buffer(dbuf) - return buf, c_result.second +def bitmask_or(list columns): + rmm_db, other = pylibcudf.null_mask.bitmask_or( + [col.to_pylibcudf(mode="read") for col in columns] + ) + buf = as_buffer(rmm_db) + return buf, other diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index f81a32e07f9..a4f17344cb0 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -29,6 +29,7 @@ set(cython_sources join.pyx lists.pyx merge.pyx + null_mask.pyx quantiles.pyx reduce.pyx replace.pyx diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index 71f523fc3cd..841efa59bda 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -15,6 +15,7 @@ from . cimport ( join, lists, merge, + null_mask, quantiles, reduce, replace, @@ -57,6 +58,7 @@ __all__ = [ "join", "lists", "merge", + "null_mask", "quantiles", "reduce", "replace", diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index e784c6c6dd5..d3878a89a6a 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -26,6 +26,7 @@ join, lists, merge, + null_mask, quantiles, reduce, replace, @@ -69,6 +70,7 @@ "join", "lists", "merge", + "null_mask", "quantiles", "reduce", "replace", diff --git a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd index 3fc2c7e8f1e..5f582091b06 100644 --- a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd @@ -8,8 +8,6 @@ from pylibcudf.libcudf.types cimport bitmask_type, mask_state, size_type from rmm._lib.device_buffer cimport device_buffer -ctypedef int32_t underlying_type_t_mask_state - cdef extern from "cudf/null_mask.hpp" namespace "cudf" nogil: cdef device_buffer copy_bitmask "cudf::copy_bitmask" ( diff --git a/python/pylibcudf/pylibcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/null_mask.pxd new file mode 100644 index 00000000000..ab5c0080312 --- /dev/null +++ b/python/pylibcudf/pylibcudf/null_mask.pxd @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.libcudf.types cimport mask_state, size_type + +from rmm._lib.device_buffer cimport DeviceBuffer + +from .column cimport Column + + +cpdef DeviceBuffer copy_bitmask(Column col) + +cpdef size_t bitmask_allocation_size_bytes(size_type number_of_bits) + +cpdef DeviceBuffer create_null_mask(size_type size, mask_state state = *) + +cpdef tuple bitmask_and(list columns) + +cpdef tuple bitmask_or(list columns) diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx new file mode 100644 index 00000000000..5bdde06f21f --- /dev/null +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -0,0 +1,142 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport make_unique +from libcpp.pair cimport pair +from libcpp.utility cimport move +from pylibcudf.libcudf cimport null_mask as cpp_null_mask +from pylibcudf.libcudf.types cimport mask_state, size_type + +from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer + +from pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint + +from .column cimport Column +from .table cimport Table + + +cdef DeviceBuffer buffer_to_python(device_buffer buf): + return DeviceBuffer.c_from_unique_ptr(make_unique[device_buffer](move(buf))) + + +cpdef DeviceBuffer copy_bitmask(Column col): + """Copies ``col``'s bitmask into a ``DeviceBuffer``. + + For details, see :cpp:func:`copy_bitmask`. + + Parameters + ---------- + col : Column + Column whose bitmask needs to be copied + + Returns + ------- + rmm.DeviceBuffer + A ``DeviceBuffer`` containing ``col``'s bitmask, or an empty ``DeviceBuffer`` + if ``col`` is not nullable + """ + cdef device_buffer db + + with nogil: + db = move(cpp_null_mask.copy_bitmask(col.view())) + + return buffer_to_python(move(db)) + +cpdef size_t bitmask_allocation_size_bytes(size_type number_of_bits): + """ + Computes the required bytes necessary to represent the specified number of bits + with a 64B padding boundary. + + For details, see :cpp:func:`bitmask_allocation_size_bytes`. + + Parameters + ---------- + number_of_bits : size_type + The number of bits that need to be represented + + Returns + ------- + size_t + The necessary number of bytes + """ + with nogil: + return cpp_null_mask.bitmask_allocation_size_bytes(number_of_bits) + + +cpdef DeviceBuffer create_null_mask( + size_type size, + mask_state state = mask_state.UNINITIALIZED +): + """Creates a ``DeviceBuffer`` for use as a null value indicator bitmask of a + ``Column``. + + For details, see :cpp:func:`create_null_mask`. + + Parameters + ---------- + size : size_type + The number of elements to be represented by the mask + state : mask_state, optional + The desired state of the mask. Can be one of { MaskState.UNALLOCATED, + MaskState.UNINITIALIZED, MaskState.ALL_VALID, MaskState.ALL_NULL } + (default MaskState.UNINITIALIZED) + + Returns + ------- + rmm.DeviceBuffer + A ``DeviceBuffer`` for use as a null bitmask satisfying the desired size and + state + """ + cdef device_buffer db + + with nogil: + db = move(cpp_null_mask.create_null_mask(size, state)) + + return buffer_to_python(move(db)) + + +cpdef tuple bitmask_and(list columns): + """Performs bitwise AND of the bitmasks of a list of columns. + + For details, see :cpp:func:`bitmask_and`. + + Parameters + ---------- + columns : list + The list of columns + + Returns + ------- + tuple[DeviceBuffer, size_type] + A tuple of the resulting mask and count of unset bits + """ + cdef Table c_table = Table(columns) + cdef pair[device_buffer, size_type] c_result + + with nogil: + c_result = move(cpp_null_mask.bitmask_and(c_table.view())) + + return buffer_to_python(move(c_result.first)), c_result.second + + +cpdef tuple bitmask_or(list columns): + """Performs bitwise OR of the bitmasks of a list of columns. + + For details, see :cpp:func:`bitmask_or`. + + Parameters + ---------- + columns : list + The list of columns + + Returns + ------- + tuple[DeviceBuffer, size_type] + A tuple of the resulting mask and count of unset bits + """ + cdef Table c_table = Table(columns) + cdef pair[device_buffer, size_type] c_result + + with nogil: + c_result = move(cpp_null_mask.bitmask_or(c_table.view())) + + return buffer_to_python(move(c_result.first)), c_result.second diff --git a/python/pylibcudf/pylibcudf/tests/test_null_mask.py b/python/pylibcudf/pylibcudf/tests/test_null_mask.py new file mode 100644 index 00000000000..3edcae59edc --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_null_mask.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from pylibcudf.null_mask import MaskState + +import rmm + + +@pytest.fixture(params=[False, True]) +def nullable(request): + return request.param + + +@pytest.fixture(params=["float32", "float64"]) +def column(request, nullable): + values = [2.5, 2.49, 1.6, 8, -1.5, -1.7, -0.5, 0.5] + typ = {"float32": pa.float32(), "float64": pa.float64()}[request.param] + if nullable: + values[2] = None + return plc.interop.from_arrow(pa.array(values, type=typ)) + + +def test_copy_bitmask(column, nullable): + expected = column.null_mask().obj if nullable else rmm.DeviceBuffer() + got = plc.null_mask.copy_bitmask(column) + + assert expected.size == got.size + assert expected.tobytes() == got.tobytes() + + +def test_bitmask_allocation_size_bytes(): + assert plc.null_mask.bitmask_allocation_size_bytes(0) == 0 + assert plc.null_mask.bitmask_allocation_size_bytes(1) == 64 + assert plc.null_mask.bitmask_allocation_size_bytes(512) == 64 + assert plc.null_mask.bitmask_allocation_size_bytes(513) == 128 + assert plc.null_mask.bitmask_allocation_size_bytes(1024) == 128 + assert plc.null_mask.bitmask_allocation_size_bytes(1025) == 192 + + +@pytest.mark.parametrize("size", [0, 1, 512, 1024]) +@pytest.mark.parametrize( + "state", + [ + MaskState.UNALLOCATED, + MaskState.UNINITIALIZED, + MaskState.ALL_VALID, + MaskState.ALL_NULL, + ], +) +def test_create_null_mask(size, state): + mask = plc.null_mask.create_null_mask(size, state) + + assert mask.size == ( + 0 + if state == MaskState.UNALLOCATED + else plc.null_mask.bitmask_allocation_size_bytes(size) + )