From 353d2de0e7a2e83039d82996fdea75924f370c0f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 13 Nov 2024 13:25:59 -0800
Subject: [PATCH] Clean up misc, unneeded pylibcudf.libcudf in cudf._lib
 (#17309)

* Remove `ctypedef const scalar constscalar` usage
* Use `dtype_to_pylibcudf_type` where appropriate
* Use pylibcudf enums instead of `pylibcudf.libcudf` types

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/17309
---
 python/cudf/cudf/_lib/copying.pyx       |  4 ----
 python/cudf/cudf/_lib/groupby.pyx       |  4 ----
 python/cudf/cudf/_lib/json.pyx          | 32 ++++++-------------------
 python/cudf/cudf/_lib/lists.pyx         | 30 ++++++++++++++---------
 python/pylibcudf/pylibcudf/io/types.pyx |  2 ++
 5 files changed, 28 insertions(+), 44 deletions(-)

diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index 4221e745e65..8b4d6199600 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -30,14 +30,10 @@ from libcpp.memory cimport make_unique
 cimport pylibcudf.libcudf.contiguous_split as cpp_contiguous_split
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport scalar
 from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_table_view
 
-# workaround for https://github.com/cython/cython/issues/3885
-ctypedef const scalar constscalar
-
 
 def _gather_map_is_valid(
     gather_map: "cudf.core.column.ColumnBase",
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 1ce6dfab15e..4e712be6738 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -18,8 +18,6 @@ from cudf._lib.utils cimport columns_from_pylibcudf_table
 
 from cudf._lib.scalar import as_device_scalar
 
-from pylibcudf.libcudf.scalar.scalar cimport scalar
-
 import pylibcudf
 
 from cudf._lib.aggregation import make_aggregation
@@ -53,8 +51,6 @@ _DECIMAL_AGGS = {
     "NUNIQUE",
     "SUM",
 }
-# workaround for https://github.com/cython/cython/issues/3885
-ctypedef const scalar constscalar
 
 
 @singledispatch
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index 7dc9cd01a00..960010899c1 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -1,6 +1,5 @@
 # Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
-import io
 import os
 from collections import abc
 
@@ -9,12 +8,9 @@ from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
 
-from pylibcudf.libcudf.types cimport data_type, type_id
-from pylibcudf.types cimport DataType
-
 from cudf._lib.column cimport Column
 from cudf._lib.io.utils cimport add_df_col_struct_names
-from cudf._lib.types cimport dtype_to_data_type
+from cudf._lib.types cimport dtype_to_pylibcudf_type
 from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io
 
 import pylibcudf as plc
@@ -42,13 +38,9 @@ cpdef read_json(object filepaths_or_buffers,
     # the encoded memoryview externally to ensure the encoded buffer
     # isn't destroyed before calling libcudf `read_json()`
 
-    for idx in range(len(filepaths_or_buffers)):
-        if isinstance(filepaths_or_buffers[idx], io.StringIO):
-            filepaths_or_buffers[idx] = \
-                filepaths_or_buffers[idx].read().encode()
-        elif isinstance(filepaths_or_buffers[idx], str) and \
-                not os.path.isfile(filepaths_or_buffers[idx]):
-            filepaths_or_buffers[idx] = filepaths_or_buffers[idx].encode()
+    for idx, source in enumerate(filepaths_or_buffers):
+        if isinstance(source, str) and not os.path.isfile(source):
+            filepaths_or_buffers[idx] = source.encode()
 
     # Setup arguments
     if compression is not None:
@@ -181,7 +173,7 @@ def write_json(
         )
 
 
-cdef _get_cudf_schema_element_from_dtype(object dtype) except *:
+def _get_cudf_schema_element_from_dtype(object dtype):
     dtype = cudf.dtype(dtype)
     if isinstance(dtype, cudf.CategoricalDtype):
         raise NotImplementedError(
@@ -189,7 +181,7 @@ cdef _get_cudf_schema_element_from_dtype(object dtype) except *:
             "supported in JSON reader"
         )
 
-    lib_type = DataType.from_libcudf(dtype_to_data_type(dtype))
+    lib_type = dtype_to_pylibcudf_type(dtype)
     child_types = []
 
     if isinstance(dtype, cudf.StructDtype):
@@ -202,23 +194,13 @@ cdef _get_cudf_schema_element_from_dtype(object dtype) except *:
             _get_cudf_schema_element_from_dtype(dtype.element_type)
 
         child_types = [
-            ("offsets", DataType.from_libcudf(data_type(type_id.INT32)), []),
+            ("offsets", plc.DataType(plc.TypeId.INT32), []),
             ("element", child_lib_type, grandchild_types)
         ]
 
     return lib_type, child_types
 
 
-cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *:
-    dtype = cudf.dtype(dtype)
-    if isinstance(dtype, cudf.CategoricalDtype):
-        raise NotImplementedError(
-            "CategoricalDtype as dtype is not yet "
-            "supported in JSON reader"
-        )
-    return dtype_to_data_type(dtype)
-
-
 def _dtype_to_names_list(col):
     if isinstance(col.dtype, cudf.StructDtype):
         return [(name, _dtype_to_names_list(child))
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index 9a2aa4a6130..90a137dd546 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -4,17 +4,13 @@ from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
 
-from pylibcudf.libcudf.types cimport (
-    nan_equality, null_equality, null_order, order, size_type
-)
+from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.column cimport Column
 from cudf._lib.utils cimport columns_from_pylibcudf_table
 
 import pylibcudf as plc
 
-from pylibcudf cimport Scalar
-
 
 @acquire_spill_lock()
 def count_elements(Column col):
@@ -39,8 +35,16 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal):
     return Column.from_pylibcudf(
         plc.lists.distinct(
             col.to_pylibcudf(mode="read"),
-            null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL,
-            nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL,
+            (
+                plc.types.NullEquality.EQUAL
+                if nulls_equal
+                else plc.types.NullEquality.UNEQUAL
+            ),
+            (
+                plc.types.NanEquality.ALL_EQUAL
+                if nans_all_equal
+                else plc.types.NanEquality.UNEQUAL
+            ),
         )
     )
 
@@ -50,8 +54,12 @@ def sort_lists(Column col, bool ascending, str na_position):
     return Column.from_pylibcudf(
         plc.lists.sort_lists(
             col.to_pylibcudf(mode="read"),
-            order.ASCENDING if ascending else order.DESCENDING,
-            null_order.BEFORE if na_position == "first" else null_order.AFTER,
+            plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING,
+            (
+                plc.types.NullOrder.BEFORE
+                if na_position == "first"
+                else plc.types.NullOrder.AFTER
+            ),
             False,
         )
     )
@@ -82,7 +90,7 @@ def contains_scalar(Column col, py_search_key):
     return Column.from_pylibcudf(
         plc.lists.contains(
             col.to_pylibcudf(mode="read"),
-            <Scalar> py_search_key.device_value.c_value,
+            py_search_key.device_value.c_value,
         )
     )
 
@@ -92,7 +100,7 @@ def index_of_scalar(Column col, object py_search_key):
     return Column.from_pylibcudf(
         plc.lists.index_of(
             col.to_pylibcudf(mode="read"),
-            <Scalar> py_search_key.device_value.c_value,
+            py_search_key.device_value.c_value,
             plc.lists.DuplicateFindOption.FIND_FIRST,
         )
     )
diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx
index 5db4eeb9583..7a3f16c4c50 100644
--- a/python/pylibcudf/pylibcudf/io/types.pyx
+++ b/python/pylibcudf/pylibcudf/io/types.pyx
@@ -182,6 +182,8 @@ cdef class SourceInfo:
                     raise FileNotFoundError(
                         errno.ENOENT, os.strerror(errno.ENOENT), src
                     )
+                # TODO: Keep the sources alive (self.byte_sources = sources)
+                # for str data (e.g. read_json)?
                 c_files.push_back(<string> str(src).encode())
 
             self.c_obj = move(source_info(c_files))