From 353d2de0e7a2e83039d82996fdea75924f370c0f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Nov 2024 13:25:59 -0800 Subject: [PATCH] Clean up misc, unneeded pylibcudf.libcudf in cudf._lib (#17309) * Removed `ctypedef const scalar constscalar` usage * Use `dtype_to_pylibcudf_type` where appropriate * Use pylibcudf enums instead of `pylibcudf.libcudf` types Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17309 --- python/cudf/cudf/_lib/copying.pyx | 4 ---- python/cudf/cudf/_lib/groupby.pyx | 4 ---- python/cudf/cudf/_lib/json.pyx | 32 ++++++------------------- python/cudf/cudf/_lib/lists.pyx | 30 ++++++++++++++--------- python/pylibcudf/pylibcudf/io/types.pyx | 2 ++ 5 files changed, 28 insertions(+), 44 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 4221e745e65..8b4d6199600 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -30,14 +30,10 @@ from libcpp.memory cimport make_unique cimport pylibcudf.libcudf.contiguous_split as cpp_contiguous_split from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_table_view -# workaround for https://github.com/cython/cython/issues/3885 -ctypedef const scalar constscalar - def _gather_map_is_valid( gather_map: "cudf.core.column.ColumnBase", diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 1ce6dfab15e..4e712be6738 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -18,8 +18,6 @@ from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf._lib.scalar import as_device_scalar -from pylibcudf.libcudf.scalar.scalar cimport scalar - import pylibcudf from cudf._lib.aggregation import make_aggregation @@ -53,8 +51,6 @@ _DECIMAL_AGGS = { "NUNIQUE", "SUM", } -# workaround for https://github.com/cython/cython/issues/3885 -ctypedef const scalar constscalar @singledispatch diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 7dc9cd01a00..960010899c1 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -1,6 +1,5 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. -import io import os from collections import abc @@ -9,12 +8,9 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from pylibcudf.libcudf.types cimport data_type, type_id -from pylibcudf.types cimport DataType - from cudf._lib.column cimport Column from cudf._lib.io.utils cimport add_df_col_struct_names -from cudf._lib.types cimport dtype_to_data_type +from cudf._lib.types cimport dtype_to_pylibcudf_type from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io import pylibcudf as plc @@ -42,13 +38,9 @@ cpdef read_json(object filepaths_or_buffers, # the encoded memoryview externally to ensure the encoded buffer # isn't destroyed before calling libcudf `read_json()` - for idx in range(len(filepaths_or_buffers)): - if isinstance(filepaths_or_buffers[idx], io.StringIO): - filepaths_or_buffers[idx] = \ - filepaths_or_buffers[idx].read().encode() - elif isinstance(filepaths_or_buffers[idx], str) and \ - not os.path.isfile(filepaths_or_buffers[idx]): - filepaths_or_buffers[idx] = filepaths_or_buffers[idx].encode() + for idx, source in enumerate(filepaths_or_buffers): + if isinstance(source, str) and not os.path.isfile(source): + filepaths_or_buffers[idx] = source.encode() # Setup arguments if compression is not None: @@ -181,7 +173,7 @@ def write_json( ) -cdef _get_cudf_schema_element_from_dtype(object dtype) except *: +def _get_cudf_schema_element_from_dtype(object dtype): dtype = cudf.dtype(dtype) if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( @@ -189,7 +181,7 @@ cdef _get_cudf_schema_element_from_dtype(object dtype) except *: "supported in JSON reader" ) - lib_type = DataType.from_libcudf(dtype_to_data_type(dtype)) + lib_type = dtype_to_pylibcudf_type(dtype) child_types = [] if isinstance(dtype, cudf.StructDtype): @@ -202,23 +194,13 @@ cdef _get_cudf_schema_element_from_dtype(object dtype) except *: _get_cudf_schema_element_from_dtype(dtype.element_type) child_types = [ - ("offsets", DataType.from_libcudf(data_type(type_id.INT32)), []), + ("offsets", plc.DataType(plc.TypeId.INT32), []), ("element", child_lib_type, grandchild_types) ] return lib_type, child_types -cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: - dtype = cudf.dtype(dtype) - if isinstance(dtype, cudf.CategoricalDtype): - raise NotImplementedError( - "CategoricalDtype as dtype is not yet " - "supported in JSON reader" - ) - return dtype_to_data_type(dtype) - - def _dtype_to_names_list(col): if isinstance(col.dtype, cudf.StructDtype): return [(name, _dtype_to_names_list(child)) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 9a2aa4a6130..90a137dd546 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -4,17 +4,13 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from pylibcudf.libcudf.types cimport ( - nan_equality, null_equality, null_order, order, size_type -) +from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column from cudf._lib.utils cimport columns_from_pylibcudf_table import pylibcudf as plc -from pylibcudf cimport Scalar - @acquire_spill_lock() def count_elements(Column col): @@ -39,8 +35,16 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal): return Column.from_pylibcudf( plc.lists.distinct( col.to_pylibcudf(mode="read"), - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL, - nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL, + ( + plc.types.NullEquality.EQUAL + if nulls_equal + else plc.types.NullEquality.UNEQUAL + ), + ( + plc.types.NanEquality.ALL_EQUAL + if nans_all_equal + else plc.types.NanEquality.UNEQUAL + ), ) ) @@ -50,8 +54,12 @@ def sort_lists(Column col, bool ascending, str na_position): return Column.from_pylibcudf( plc.lists.sort_lists( col.to_pylibcudf(mode="read"), - order.ASCENDING if ascending else order.DESCENDING, - null_order.BEFORE if na_position == "first" else null_order.AFTER, + plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING, + ( + plc.types.NullOrder.BEFORE + if na_position == "first" + else plc.types.NullOrder.AFTER + ), False, ) ) @@ -82,7 +90,7 @@ def contains_scalar(Column col, py_search_key): return Column.from_pylibcudf( plc.lists.contains( col.to_pylibcudf(mode="read"), - py_search_key.device_value.c_value, + py_search_key.device_value.c_value, ) ) @@ -92,7 +100,7 @@ def index_of_scalar(Column col, object py_search_key): return Column.from_pylibcudf( plc.lists.index_of( col.to_pylibcudf(mode="read"), - py_search_key.device_value.c_value, + py_search_key.device_value.c_value, plc.lists.DuplicateFindOption.FIND_FIRST, ) ) diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index 5db4eeb9583..7a3f16c4c50 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -182,6 +182,8 @@ cdef class SourceInfo: raise FileNotFoundError( errno.ENOENT, os.strerror(errno.ENOENT), src ) + # TODO: Keep the sources alive (self.byte_sources = sources) + # for str data (e.g. read_json)? c_files.push_back( str(src).encode()) self.c_obj = move(source_info(c_files))