From eb460169786665b1624cb6c4f9b502b800810b37 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 4 Jun 2024 06:32:49 -0500 Subject: [PATCH] Migrate column factories to pylibcudf (#15257) This PR implements `column_factories.hpp` using `pylibcudf` and migrates the cuDF cython to use them cc @vyasr Authors: - https://github.com/brandon-b-miller - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15257 --- cpp/src/column/column_factories.cpp | 17 +- cpp/tests/column/factories_test.cpp | 4 +- cpp/tests/fixed_point/fixed_point_tests.cpp | 2 +- .../api_docs/pylibcudf/column_factories.rst | 6 + .../user_guide/api_docs/pylibcudf/index.rst | 1 + python/cudf/cudf/_lib/column.pyx | 21 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 4 +- .../cudf/_lib/pylibcudf/column_factories.pxd | 52 ++++ .../cudf/_lib/pylibcudf/column_factories.pyx | 205 ++++++++++++++ python/cudf/cudf/_lib/pylibcudf/interop.pyx | 82 ++++++ .../libcudf/column/column_factories.pxd | 73 ++++- python/cudf/cudf/_lib/pylibcudf/types.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/types.pyx | 3 +- .../pylibcudf_tests/test_column_factories.py | 253 ++++++++++++++++++ .../cudf/cudf/pylibcudf_tests/test_interop.py | 69 +++++ 17 files changed, 767 insertions(+), 29 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/column_factories.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/column_factories.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_column_factories.py create mode 100644 python/cudf/cudf/pylibcudf_tests/test_interop.py diff --git a/cpp/src/column/column_factories.cpp 
b/cpp/src/column/column_factories.cpp index e40056fc8a1..0260068d4db 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -65,7 +65,8 @@ std::size_t size_of(data_type element_type) std::unique_ptr make_empty_column(data_type type) { CUDF_EXPECTS(type.id() == type_id::EMPTY || !cudf::is_nested(type), - "make_empty_column is invalid to call on nested types"); + "make_empty_column is invalid to call on nested types", + cudf::data_type_error); return std::make_unique(type, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); } @@ -80,7 +81,9 @@ std::unique_ptr make_numeric_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type."); + CUDF_EXPECTS(type.id() != type_id::EMPTY && is_numeric(type), + "Invalid, non-numeric type.", + cudf::data_type_error); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique( @@ -100,7 +103,7 @@ std::unique_ptr make_fixed_point_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type."); + CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type.", cudf::data_type_error); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique( @@ -120,7 +123,7 @@ std::unique_ptr make_timestamp_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type."); + CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type.", cudf::data_type_error); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique( @@ -140,7 +143,7 @@ std::unique_ptr make_duration_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type."); + CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type.", 
cudf::data_type_error); CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique( @@ -160,7 +163,9 @@ std::unique_ptr make_fixed_width_column(data_type type, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type."); + CUDF_EXPECTS(type.id() != type_id::EMPTY && is_fixed_width(type), + "Invalid, non-fixed-width type.", + cudf::data_type_error); // clang-format off if (is_timestamp (type)) return make_timestamp_column (type, size, state, stream, mr); diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index afebc91dd73..dca36eaa4e7 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -164,7 +164,7 @@ TEST_P(NonNumericFactoryTest, NonNumericThrow) auto column = cudf::make_numeric_column( cudf::data_type{GetParam()}, this->size(), cudf::mask_state::UNALLOCATED); }; - EXPECT_THROW(construct(), cudf::logic_error); + EXPECT_THROW(construct(), cudf::data_type_error); } INSTANTIATE_TEST_CASE_P(NonNumeric, @@ -307,7 +307,7 @@ TEST_P(NonFixedWidthFactoryTest, NonFixedWidthThrow) auto column = cudf::make_fixed_width_column( cudf::data_type{GetParam()}, this->size(), cudf::mask_state::UNALLOCATED); }; - EXPECT_THROW(construct(), cudf::logic_error); + EXPECT_THROW(construct(), cudf::data_type_error); } INSTANTIATE_TEST_CASE_P(NonFixedWidth, diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index 73de1fbaa68..ab7984d4b03 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -498,7 +498,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointColumnWrapper) TYPED_TEST(FixedPointTestAllReps, NoScaleOrWrongTypeID) { EXPECT_THROW(cudf::make_fixed_point_column(cudf::data_type{cudf::type_id::INT32}, 0), - cudf::logic_error); + cudf::data_type_error); } TYPED_TEST(FixedPointTestAllReps, SimpleFixedPointColumnWrapper) 
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst new file mode 100644 index 00000000000..c858135b6ce --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst @@ -0,0 +1,6 @@ +================ +column_factories +================ + +.. automodule:: cudf._lib.pylibcudf.column_factories + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 26875ce7d12..58fea77adaa 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -11,6 +11,7 @@ This page provides API documentation for pylibcudf. aggregation binaryop column + column_factories concatenate copying filling diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f33e121241d..7155017b7af 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -39,14 +39,10 @@ from cudf._lib.types cimport ( from cudf._lib.null_mask import bitmask_allocation_size_bytes from cudf._lib.types import dtype_from_pylibcudf_column -# TODO: We currently need this for "casting" empty pylibcudf columns in -# from_pylibcudf by instead creating an empty numeric column. We will be able -# to remove this once column factories are exposed to pylibcudf. cimport cudf._lib.pylibcudf.libcudf.copying as cpp_copying cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types cimport cudf._lib.pylibcudf.libcudf.unary as libcudf_unary -from cudf._lib.pylibcudf cimport Column as plc_Column from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_contents from cudf._lib.pylibcudf.libcudf.column.column_factories cimport ( make_column_from_scalar as cpp_make_column_from_scalar, @@ -623,22 +619,17 @@ cdef class Column: pylibcudf.Column A new pylibcudf.Column referencing the same data. 
""" - cdef libcudf_types.data_type new_dtype if col.type().id() == pylibcudf.TypeId.TIMESTAMP_DAYS: col = pylibcudf.unary.cast( col, pylibcudf.DataType(pylibcudf.TypeId.TIMESTAMP_SECONDS) ) elif col.type().id() == pylibcudf.TypeId.EMPTY: - new_dtype = libcudf_types.data_type(libcudf_types.type_id.INT8) - # TODO: This function call is what requires cimporting pylibcudf. - # We can remove the cimport once we can directly do - # pylibcudf.column_factories.make_numeric_column or equivalent. - col = plc_Column.from_libcudf( - move( - make_numeric_column( - new_dtype, col.size(), libcudf_types.mask_state.ALL_NULL - ) - ) + new_dtype = pylibcudf.DataType(pylibcudf.TypeId.INT8) + + col = pylibcudf.column_factories.make_numeric_column( + new_dtype, + col.size(), + pylibcudf.column_factories.MaskState.ALL_NULL ) dtype = dtype_from_pylibcudf_column(col) diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index eff14ad549b..7d0676f6def 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -16,6 +16,7 @@ set(cython_sources aggregation.pyx binaryop.pyx column.pyx + column_factories.pyx concatenate.pyx copying.pyx filling.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 4f77f8cbaef..b289d112a90 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -4,6 +4,7 @@ from . cimport ( aggregation, binaryop, + column_factories, concatenate, copying, filling, @@ -40,6 +41,7 @@ __all__ = [ "binaryop", "concatenate", "copying", + "column_factories", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 048b62b6013..2565332f3ed 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -3,6 +3,7 @@ from . 
import ( aggregation, binaryop, + column_factories, concatenate, copying, filling, @@ -27,7 +28,7 @@ from .gpumemoryview import gpumemoryview from .scalar import Scalar from .table import Table -from .types import DataType, TypeId +from .types import DataType, MaskState, TypeId __all__ = [ "Column", @@ -39,6 +40,7 @@ "binaryop", "concatenate", "copying", + "column_factories", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd new file mode 100644 index 00000000000..9dbd74ab16c --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type + +from .column cimport Column +from .types cimport DataType, size_type, type_id + +ctypedef fused MakeEmptyColumnOperand: + DataType + type_id + object + +ctypedef fused MaskArg: + mask_state + object + + +cpdef Column make_empty_column( + MakeEmptyColumnOperand type_or_id +) + +cpdef Column make_numeric_column( + DataType type_, + size_type size, + MaskArg mask, +) + +cpdef Column make_fixed_point_column( + DataType type_, + size_type size, + MaskArg mask, +) + +cpdef Column make_timestamp_column( + DataType type_, + size_type size, + MaskArg mask, +) + +cpdef Column make_duration_column( + DataType type_, + size_type size, + MaskArg mask, +) + +cpdef Column make_fixed_width_column( + DataType type_, + size_type size, + MaskArg mask, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx b/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx new file mode 100644 index 00000000000..ef7f512f0e5 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx @@ -0,0 +1,205 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.column.column_factories cimport ( + make_duration_column as cpp_make_duration_column, + make_empty_column as cpp_make_empty_column, + make_fixed_point_column as cpp_make_fixed_point_column, + make_fixed_width_column as cpp_make_fixed_width_column, + make_numeric_column as cpp_make_numeric_column, + make_timestamp_column as cpp_make_timestamp_column, +) +from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type + +from .types cimport DataType, type_id + +from .types import MaskState, TypeId + + +cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): + cdef unique_ptr[column] result + cdef type_id id + + if MakeEmptyColumnOperand is object: + if isinstance(type_or_id, TypeId): + id = type_or_id + with nogil: + result = move( + cpp_make_empty_column( + id + ) + ) + else: + raise TypeError( + "Must pass a TypeId or DataType" + ) + elif MakeEmptyColumnOperand is DataType: + with nogil: + result = move( + cpp_make_empty_column( + type_or_id.c_obj + ) + ) + elif MakeEmptyColumnOperand is type_id: + with nogil: + result = move( + cpp_make_empty_column( + type_or_id + ) + ) + else: + raise TypeError( + "Must pass a TypeId or DataType" + ) + return Column.from_libcudf(move(result)) + + +cpdef Column make_numeric_column( + DataType type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_numeric_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) + +cpdef Column make_fixed_point_column( + DataType type_, + size_type size, + MaskArg 
mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_fixed_point_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) + + +cpdef Column make_timestamp_column( + DataType type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_timestamp_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) + + +cpdef Column make_duration_column( + DataType type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_duration_column( + type_.c_obj, + size, + state + ) + ) + + return Column.from_libcudf(move(result)) + + +cpdef Column make_fixed_width_column( + DataType type_, + size_type size, + MaskArg mstate +): + + cdef unique_ptr[column] result + cdef mask_state state + + if MaskArg is object: + if isinstance(mstate, MaskState): + state = mstate + else: + raise TypeError("Invalid mask argument") + elif MaskArg is mask_state: + state = mstate + else: + raise TypeError("Invalid mask argument") + with nogil: + result = move( + cpp_make_fixed_width_column( + type_.c_obj, + size, + state + ) + ) + + return 
Column.from_libcudf(move(result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index f172080cece..1e4102e4b64 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -33,6 +33,33 @@ from .scalar cimport Scalar from .table cimport Table from .types cimport DataType, type_id +ARROW_TO_PYLIBCUDF_TYPES = { + pa.int8(): type_id.INT8, + pa.int16(): type_id.INT16, + pa.int32(): type_id.INT32, + pa.int64(): type_id.INT64, + pa.uint8(): type_id.UINT8, + pa.uint16(): type_id.UINT16, + pa.uint32(): type_id.UINT32, + pa.uint64(): type_id.UINT64, + pa.float32(): type_id.FLOAT32, + pa.float64(): type_id.FLOAT64, + pa.bool_(): type_id.BOOL8, + pa.string(): type_id.STRING, + pa.duration('s'): type_id.DURATION_SECONDS, + pa.duration('ms'): type_id.DURATION_MILLISECONDS, + pa.duration('us'): type_id.DURATION_MICROSECONDS, + pa.duration('ns'): type_id.DURATION_NANOSECONDS, + pa.timestamp('s'): type_id.TIMESTAMP_SECONDS, + pa.timestamp('ms'): type_id.TIMESTAMP_MILLISECONDS, + pa.timestamp('us'): type_id.TIMESTAMP_MICROSECONDS, + pa.timestamp('ns'): type_id.TIMESTAMP_NANOSECONDS, + pa.date32(): type_id.TIMESTAMP_DAYS, +} + +LIBCUDF_TO_ARROW_TYPES = { + v: k for k, v in ARROW_TO_PYLIBCUDF_TYPES.items() +} cdef column_metadata _metadata_to_libcudf(metadata): """Convert a ColumnMetadata object to C++ column_metadata. 
@@ -77,6 +104,21 @@ def from_arrow(pyarrow_object, *, DataType data_type=None): raise TypeError("from_arrow only accepts Table and Scalar objects") +@from_arrow.register(pa.DataType) +def _from_arrow_datatype(pyarrow_object): + if isinstance(pyarrow_object, pa.Decimal128Type): + return DataType(type_id.DECIMAL128, scale=-pyarrow_object.scale) + elif isinstance(pyarrow_object, pa.StructType): + return DataType(type_id.STRUCT) + elif isinstance(pyarrow_object, pa.ListType): + return DataType(type_id.LIST) + else: + try: + return DataType(ARROW_TO_PYLIBCUDF_TYPES[pyarrow_object]) + except KeyError: + raise TypeError(f"Unable to convert {pyarrow_object} to cudf datatype") + + @from_arrow.register(pa.Table) def _from_arrow_table(pyarrow_object, *, DataType data_type=None): if data_type is not None: @@ -170,6 +212,46 @@ def to_arrow(cudf_object, metadata=None): raise TypeError("to_arrow only accepts Table and Scalar objects") +@to_arrow.register(DataType) +def _to_arrow_datatype(cudf_object, **kwargs): + """ + Convert a datatype to arrow. + + Translation of some types requires extra information as a keyword + argument. 
Specifically: + + - When translating a decimal type, provide ``precision`` + - When translating a struct type, provide ``fields`` + - When translating a list type, provide the wrapped ``value_type`` + """ + if cudf_object.id() in {type_id.DECIMAL32, type_id.DECIMAL64, type_id.DECIMAL128}: + if not (precision := kwargs.get("precision")): + raise ValueError( + "Precision must be provided for decimal types" + ) + # no pa.decimal32 or pa.decimal64 + return pa.decimal128(precision, -cudf_object.scale()) + elif cudf_object.id() == type_id.STRUCT: + if not (fields := kwargs.get("fields")): + raise ValueError( + "Fields must be provided for struct types" + ) + return pa.struct(fields) + elif cudf_object.id() == type_id.LIST: + if not (value_type := kwargs.get("value_type")): + raise ValueError( + "Value type must be provided for list types" + ) + return pa.list_(value_type) + else: + try: + return LIBCUDF_TO_ARROW_TYPES[cudf_object.id()] + except KeyError: + raise TypeError( + f"Unable to convert {cudf_object.id()} to arrow datatype" + ) + + @to_arrow.register(Table) def _to_arrow_table(cudf_object, metadata=None): if metadata is None: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd index fd22d92cb30..2faff21a77b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd @@ -2,9 +2,17 @@ from libcpp.memory cimport unique_ptr +from rmm._lib.device_buffer cimport device_buffer + from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.pylibcudf.libcudf.types cimport data_type, mask_state, size_type +from cudf._lib.pylibcudf.libcudf.types cimport ( + bitmask_type, + data_type, + mask_state, + size_type, + type_id, +) cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: @@ 
-12,5 +20,64 @@ cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: size_type size, mask_state state) except + - cdef unique_ptr[column] make_column_from_scalar (const scalar & s, - size_type size) except + + cdef unique_ptr[column] make_numeric_column(data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_fixed_point_column( + data_type type, + size_type size, + mask_state state) except + + + cdef unique_ptr[column] make_fixed_point_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_timestamp_column( + data_type type, + size_type size, + mask_state state) except + + + cdef unique_ptr[column] make_timestamp_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_duration_column( + data_type type, + size_type size, + mask_state state) except + + + cdef unique_ptr[column] make_duration_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_fixed_width_column( + data_type type, + size_type size, + mask_state state) except + + + cdef unique_ptr[column] make_fixed_width_column( + data_type type, + size_type size, + device_buffer mask, + size_type null_count) except + + + cdef unique_ptr[column] make_column_from_scalar(const scalar& s, + size_type size) except + + + cdef unique_ptr[column] make_dictionary_from_scalar(const scalar& s, + size_type size) except + + + cdef unique_ptr[column] make_empty_column(type_id id) except + + cdef unique_ptr[column] make_empty_column(data_type type_) except + + + cdef unique_ptr[column] make_dictionary_column( + unique_ptr[column] keys_column, + unique_ptr[column] indices_column) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/types.pxd index e54a259819e..7d3ddca14a1 
100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/types.pxd @@ -13,6 +13,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport ( null_order, null_policy, order, + size_type, sorted, type_id, ) diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index a5248ad0a1f..6dbb287f3c4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -8,6 +8,7 @@ from cudf._lib.pylibcudf.libcudf.types import type_id as TypeId # no-cython-lin from cudf._lib.pylibcudf.libcudf.types import nan_policy as NanPolicy # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import null_policy as NullPolicy # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import interpolation as Interpolation # no-cython-lint, isort:skip +from cudf._lib.pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import nan_equality as NanEquality # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import null_equality as NullEquality # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import null_order as NullOrder # no-cython-lint, isort:skip @@ -22,7 +23,7 @@ cdef class DataType: Parameters ---------- - id : TypeId + id : type_id The type's identifier scale : int The scale associated with the data. Only used for decimal data types. diff --git a/python/cudf/cudf/pylibcudf_tests/test_column_factories.py b/python/cudf/cudf/pylibcudf_tests/test_column_factories.py new file mode 100644 index 00000000000..4c05770a41f --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_column_factories.py @@ -0,0 +1,253 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest +from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq + +from cudf._lib import pylibcudf as plc + +EMPTY_COL_SIZE = 3 + +NUMERIC_TYPES = [ + pa.uint8(), + pa.uint16(), + pa.uint32(), + pa.uint64(), + pa.int8(), + pa.int16(), + pa.int32(), + pa.int64(), + pa.float32(), + pa.float64(), + pa.bool_(), +] + +TIMESTAMP_TYPES = [ + pa.timestamp("s"), + pa.timestamp("ms"), + pa.timestamp("us"), + pa.timestamp("ns"), +] + +DURATION_TYPES = [ + pa.duration("s"), + pa.duration("ms"), + pa.duration("us"), + pa.duration("ns"), +] + +DECIMAL_TYPES = [pa.decimal128(38, 2)] + +STRING_TYPES = [pa.string()] +STRUCT_TYPES = [DEFAULT_STRUCT_TESTING_TYPE] +LIST_TYPES = [pa.list_(pa.int64())] + +ALL_TYPES = ( + NUMERIC_TYPES + + TIMESTAMP_TYPES + + DURATION_TYPES + + STRING_TYPES + + DECIMAL_TYPES + + STRUCT_TYPES + + LIST_TYPES +) + + +@pytest.fixture(scope="module", params=NUMERIC_TYPES, ids=repr) +def numeric_pa_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=DECIMAL_TYPES, + ids=repr, +) +def fixed_point_pa_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=TIMESTAMP_TYPES, + ids=repr, +) +def timestamp_pa_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=DURATION_TYPES, + ids=repr, +) +def duration_pa_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=[ + plc.MaskState.UNALLOCATED, + plc.MaskState.ALL_VALID, + plc.MaskState.ALL_NULL, + plc.MaskState.UNINITIALIZED, + ], + ids=["unallocated", "all_valid", "all_null", "uninitialized"], +) +def mask_state(request): + return request.param + + +def test_make_empty_column_dtype(pa_type): + pa_col = pa.array([], type=pa_type) + + plc_type = plc.interop.from_arrow(pa_col).type() + + if isinstance(pa_type, (pa.ListType, pa.StructType)): + with pytest.raises(ValueError): + plc.column_factories.make_empty_column(plc_type) + return + + cudf_col = 
plc.column_factories.make_empty_column(plc_type) + assert_column_eq(cudf_col, pa_col) + + +def test_make_empty_column_typeid(pa_type): + pa_col = pa.array([], type=pa_type) + + tid = plc.interop.from_arrow(pa_col).type().id() + + if isinstance(pa_type, (pa.ListType, pa.StructType)): + with pytest.raises(ValueError): + plc.column_factories.make_empty_column(tid) + return + + cudf_col = plc.column_factories.make_empty_column(tid) + assert_column_eq(cudf_col, pa_col) + + +def validate_empty_column(col, mask_state, dtype): + assert col.size() == EMPTY_COL_SIZE + + if mask_state == plc.types.MaskState.UNALLOCATED: + assert col.null_count() == 0 + elif mask_state == plc.types.MaskState.ALL_VALID: + assert col.null_count() == 0 + elif mask_state == plc.types.MaskState.ALL_NULL: + assert col.null_count() == EMPTY_COL_SIZE + + assert plc.interop.to_arrow(col).type == dtype + + +def test_make_numeric_column(numeric_pa_type, mask_state): + plc_type = plc.interop.from_arrow(numeric_pa_type) + + got = plc.column_factories.make_numeric_column( + plc_type, EMPTY_COL_SIZE, mask_state + ) + validate_empty_column(got, mask_state, numeric_pa_type) + + +@pytest.mark.parametrize( + "non_numeric_pa_type", [t for t in ALL_TYPES if t not in NUMERIC_TYPES] +) +def test_make_numeric_column_dtype_err(non_numeric_pa_type): + plc_type = plc.interop.from_arrow(non_numeric_pa_type) + with pytest.raises(ValueError): + plc.column_factories.make_numeric_column( + plc_type, 3, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_numeric_column_negative_size_err(numeric_pa_type): + plc_type = plc.interop.from_arrow(numeric_pa_type) + with pytest.raises(RuntimeError): + plc.column_factories.make_numeric_column( + plc_type, -1, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_fixed_point_column(fixed_point_pa_type, mask_state): + plc_type = plc.interop.from_arrow(fixed_point_pa_type) + + got = plc.column_factories.make_fixed_point_column( + plc_type, EMPTY_COL_SIZE, mask_state + ) + + 
validate_empty_column(got, mask_state, fixed_point_pa_type) + + +@pytest.mark.parametrize( + "non_fixed_point_pa_type", [t for t in ALL_TYPES if t not in DECIMAL_TYPES] +) +def test_make_fixed_point_column_dtype_err(non_fixed_point_pa_type): + plc_type = plc.interop.from_arrow(non_fixed_point_pa_type) + with pytest.raises(ValueError): + plc.column_factories.make_fixed_point_column( + plc_type, 3, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_fixed_point_column_negative_size_err(fixed_point_pa_type): + plc_type = plc.interop.from_arrow(fixed_point_pa_type) + with pytest.raises(RuntimeError): + plc.column_factories.make_fixed_point_column( + plc_type, -1, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_timestamp_column(timestamp_pa_type, mask_state): + plc_type = plc.interop.from_arrow(timestamp_pa_type) + + got = plc.column_factories.make_timestamp_column( + plc_type, EMPTY_COL_SIZE, mask_state + ) + validate_empty_column(got, mask_state, timestamp_pa_type) + + +@pytest.mark.parametrize( + "non_timestamp_pa_type", [t for t in ALL_TYPES if t not in TIMESTAMP_TYPES] +) +def test_make_timestamp_column_dtype_err(non_timestamp_pa_type): + plc_type = plc.interop.from_arrow(non_timestamp_pa_type) + with pytest.raises(ValueError): + plc.column_factories.make_timestamp_column( + plc_type, 3, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_timestamp_column_negative_size_err(timestamp_pa_type): + plc_type = plc.interop.from_arrow(timestamp_pa_type) + with pytest.raises(RuntimeError): + plc.column_factories.make_timestamp_column( + plc_type, -1, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_duration_column(duration_pa_type, mask_state): + plc_type = plc.interop.from_arrow(duration_pa_type) + + got = plc.column_factories.make_duration_column( + plc_type, EMPTY_COL_SIZE, mask_state + ) + validate_empty_column(got, mask_state, duration_pa_type) + + +@pytest.mark.parametrize( + "non_duration_pa_type", [t for t in ALL_TYPES if t not in DURATION_TYPES] 
+) +def test_make_duration_column_dtype_err(non_duration_pa_type): + plc_type = plc.interop.from_arrow(non_duration_pa_type) + with pytest.raises(ValueError): + plc.column_factories.make_duration_column( + plc_type, 3, plc.types.MaskState.UNALLOCATED + ) + + +def test_make_duration_column_negative_size_err(duration_pa_type): + plc_type = plc.interop.from_arrow(duration_pa_type) + with pytest.raises(RuntimeError): + plc.column_factories.make_duration_column( + plc_type, -1, plc.types.MaskState.UNALLOCATED + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_interop.py b/python/cudf/cudf/pylibcudf_tests/test_interop.py new file mode 100644 index 00000000000..5c05f460e28 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_interop.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest + +import cudf._lib.pylibcudf as plc + + +def test_list_dtype_roundtrip(): + list_type = pa.list_(pa.int32()) + plc_type = plc.interop.from_arrow(list_type) + + assert plc_type == plc.types.DataType(plc.types.TypeId.LIST) + + with pytest.raises(ValueError): + plc.interop.to_arrow(plc_type) + + arrow_type = plc.interop.to_arrow( + plc_type, value_type=list_type.value_type + ) + assert arrow_type == list_type + + +def test_struct_dtype_roundtrip(): + struct_type = pa.struct([("a", pa.int32()), ("b", pa.string())]) + plc_type = plc.interop.from_arrow(struct_type) + + assert plc_type == plc.types.DataType(plc.types.TypeId.STRUCT) + + with pytest.raises(ValueError): + plc.interop.to_arrow(plc_type) + + arrow_type = plc.interop.to_arrow( + plc_type, + fields=[struct_type.field(i) for i in range(struct_type.num_fields)], + ) + assert arrow_type == struct_type + + +def test_decimal128_roundtrip(): + decimal_type = pa.decimal128(10, 2) + plc_type = plc.interop.from_arrow(decimal_type) + + assert plc_type.id() == plc.types.TypeId.DECIMAL128 + + with pytest.raises(ValueError): + plc.interop.to_arrow(plc_type) + + arrow_type = 
plc.interop.to_arrow( + plc_type, precision=decimal_type.precision + ) + assert arrow_type == decimal_type + + +@pytest.mark.parametrize( + "data_type", + [ + plc.types.DataType(plc.types.TypeId.DECIMAL32), + plc.types.DataType(plc.types.TypeId.DECIMAL64), + ], +) +def test_decimal_other(data_type): + precision = 3 + + with pytest.raises(ValueError): + plc.interop.to_arrow(data_type) + + arrow_type = plc.interop.to_arrow(data_type, precision=precision) + assert arrow_type == pa.decimal128(precision, 0)