From e60aad110efcd94003ad78d0f46ac94e531bd1c0 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 1 Mar 2024 18:22:33 -0800
Subject: [PATCH 01/18] Implement search using pylibcudf (#15166)

Contributes to #15162

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15166
---
 .../user_guide/api_docs/pylibcudf/index.rst   |   1 +
 .../user_guide/api_docs/pylibcudf/search.rst  |   6 +
 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt   |   1 +
 python/cudf/cudf/_lib/pylibcudf/__init__.pxd  |   2 +
 python/cudf/cudf/_lib/pylibcudf/__init__.py   |   2 +
 python/cudf/cudf/_lib/pylibcudf/search.pxd    |  21 ++++
 python/cudf/cudf/_lib/pylibcudf/search.pyx    | 116 ++++++++++++++++++
 python/cudf/cudf/_lib/search.pyx              |  91 +++++---------
 8 files changed, 178 insertions(+), 62 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/search.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/search.pyx

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index 73f63ae1343..2e5b3916c65 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -21,6 +21,7 @@ This page provides API documentation for pylibcudf.
     reduce
     rolling
     scalar
+    search
     stream_compaction
     sorting
     replace
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst
new file mode 100644
index 00000000000..aa57bcd9d92
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst
@@ -0,0 +1,6 @@
+======
+search
+======
+
+.. automodule:: cudf._lib.pylibcudf.search
+   :members:
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index 68e6765cc49..fd749a5edc1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -28,6 +28,7 @@ set(cython_sources
     replace.pyx
     rolling.pyx
     scalar.pyx
+    search.pyx
     stream_compaction.pyx
     sorting.pyx
     table.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
index 5ef10fb2ffc..96aa42cc257 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
@@ -14,6 +14,7 @@ from . cimport (
     reduce,
     replace,
     rolling,
+    search,
     sorting,
     stream_compaction,
     types,
@@ -45,6 +46,7 @@ __all__ = [
     "reduce",
     "replace",
     "rolling",
+    "search",
     "stream_compaction",
     "sorting",
     "types",
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py
index 4689c49fdb1..19cc782dd92 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py
@@ -13,6 +13,7 @@
     reduce,
     replace,
     rolling,
+    search,
     sorting,
     stream_compaction,
     types,
@@ -43,6 +44,7 @@
     "reduce",
     "replace",
     "rolling",
+    "search",
     "stream_compaction",
     "sorting",
     "types",
diff --git a/python/cudf/cudf/_lib/pylibcudf/search.pxd b/python/cudf/cudf/_lib/pylibcudf/search.pxd
new file mode 100644
index 00000000000..0faf18b108f
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/search.pxd
@@ -0,0 +1,21 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from .column cimport Column
+from .table cimport Table
+
+
+cpdef Column lower_bound(
+    Table haystack,
+    Table needles,
+    list column_order,
+    list null_precedence,
+)
+
+cpdef Column upper_bound(
+    Table haystack,
+    Table needles,
+    list column_order,
+    list null_precedence,
+)
+
+cpdef Column contains(Column haystack, Column needles)
diff --git a/python/cudf/cudf/_lib/pylibcudf/search.pyx b/python/cudf/cudf/_lib/pylibcudf/search.pyx
new file mode 100644
index 00000000000..a186167af13
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/search.pyx
@@ -0,0 +1,116 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+
+from cudf._lib.cpp cimport search as cpp_search
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.types cimport null_order, order
+
+from .column cimport Column
+from .table cimport Table
+
+
+cpdef Column lower_bound(
+    Table haystack,
+    Table needles,
+    list column_order,
+    list null_precedence,
+):
+    """Find smallest indices in haystack where needles may be inserted to retain order.
+
+    Parameters
+    ----------
+    haystack : Table
+        The search space.
+    needles : Table
+        The values for which to find insertion points.
+    column_order : List[ColumnOrder]
+        Whether each column should be sorted in ascending or descending order.
+    null_precedence : List[NullOrder]
+        Whether nulls should come before or after non-nulls.
+
+    Returns
+    -------
+    Column
+        The insertion points
+    """
+    cdef unique_ptr[column] c_result
+    cdef vector[order] c_orders = column_order
+    cdef vector[null_order] c_null_precedence = null_precedence
+    with nogil:
+        c_result = move(
+            cpp_search.lower_bound(
+                haystack.view(),
+                needles.view(),
+                c_orders,
+                c_null_precedence,
+            )
+        )
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column upper_bound(
+    Table haystack,
+    Table needles,
+    list column_order,
+    list null_precedence,
+):
+    """Find largest indices in haystack where needles may be inserted to retain order.
+
+    Parameters
+    ----------
+    haystack : Table
+        The search space.
+    needles : Table
+        The values for which to find insertion points.
+    column_order : List[ColumnOrder]
+        Whether each column should be sorted in ascending or descending order.
+    null_precedence : List[NullOrder]
+        Whether nulls should come before or after non-nulls.
+
+    Returns
+    -------
+    Column
+        The insertion points
+    """
+    cdef unique_ptr[column] c_result
+    cdef vector[order] c_orders = column_order
+    cdef vector[null_order] c_null_precedence = null_precedence
+    with nogil:
+        c_result = move(
+            cpp_search.upper_bound(
+                haystack.view(),
+                needles.view(),
+                c_orders,
+                c_null_precedence,
+            )
+        )
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column contains(Column haystack, Column needles):
+    """Check whether needles are present in haystack.
+
+    Parameters
+    ----------
+    haystack : Table
+        The search space.
+    needles : Table
+        The values for which to search.
+
+    Returns
+    -------
+    Column
+        Boolean indicator for each needle.
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = move(
+            cpp_search.contains(
+                haystack.view(),
+                needles.view(),
+            )
+        )
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/_lib/search.pyx b/python/cudf/cudf/_lib/search.pyx
index fef3a08c6d7..1ee73949fd3 100644
--- a/python/cudf/cudf/_lib/search.pyx
+++ b/python/cudf/cudf/_lib/search.pyx
@@ -1,18 +1,10 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
-
-cimport cudf._lib.cpp.search as cpp_search
-cimport cudf._lib.cpp.types as libcudf_types
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.utils cimport table_view_from_columns
+
+from cudf._lib import pylibcudf
 
 
 @acquire_spill_lock()
@@ -31,50 +23,31 @@ def search_sorted(
         If 'left', the index of the first suitable location is given.
         If 'right', return the last such index
     """
-    cdef unique_ptr[column] c_result
-    cdef vector[libcudf_types.order] c_column_order
-    cdef vector[libcudf_types.null_order] c_null_precedence
-    cdef libcudf_types.order c_order
-    cdef libcudf_types.null_order c_null_order
-    cdef table_view c_table_data = table_view_from_columns(source)
-    cdef table_view c_values_data = table_view_from_columns(values)
-
     # Note: We are ignoring index columns here
-    c_order = (libcudf_types.order.ASCENDING
-               if ascending
-               else libcudf_types.order.DESCENDING)
-    c_null_order = (
-        libcudf_types.null_order.AFTER
-        if na_position=="last"
-        else libcudf_types.null_order.BEFORE
+    column_order = [
+        pylibcudf.types.Order.ASCENDING
+        if ascending
+        else pylibcudf.types.Order.DESCENDING
+    ] * len(source)
+    null_precedence = [
+        pylibcudf.types.NullOrder.AFTER
+        if na_position == "last"
+        else pylibcudf.types.NullOrder.BEFORE
+    ] * len(source)
+
+    func = getattr(
+        pylibcudf.search,
+        "lower_bound" if side == "left" else "upper_bound",
     )
-    c_column_order = vector[libcudf_types.order](len(source), c_order)
-    c_null_precedence = vector[libcudf_types.null_order](
-        len(source), c_null_order
+    return Column.from_pylibcudf(
+        func(
+            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source]),
+            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]),
+            column_order,
+            null_precedence,
+        )
     )
 
-    if side == 'left':
-        with nogil:
-            c_result = move(
-                cpp_search.lower_bound(
-                    c_table_data,
-                    c_values_data,
-                    c_column_order,
-                    c_null_precedence,
-                )
-            )
-    elif side == 'right':
-        with nogil:
-            c_result = move(
-                cpp_search.upper_bound(
-                    c_table_data,
-                    c_values_data,
-                    c_column_order,
-                    c_null_precedence,
-                )
-            )
-    return Column.from_unique_ptr(move(c_result))
-
 
 @acquire_spill_lock()
 def contains(Column haystack, Column needles):
@@ -87,15 +60,9 @@ def contains(Column haystack, Column needles):
     needles :
         A column of values to search for
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view c_haystack = haystack.view()
-    cdef column_view c_needles = needles.view()
-
-    with nogil:
-        c_result = move(
-            cpp_search.contains(
-                c_haystack,
-                c_needles,
-            )
+    return Column.from_pylibcudf(
+        pylibcudf.search.contains(
+            haystack.to_pylibcudf(mode="read"),
+            needles.to_pylibcudf(mode="read"),
         )
-    return Column.from_unique_ptr(move(c_result))
+    )

From 8dbe7cb12a752c44ce3027b96fc37ab0b0db923d Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Mon, 4 Mar 2024 08:43:02 -0600
Subject: [PATCH 02/18] Disable testChunkedPackTwoPasses for now (#15210)

Signed-off-by: Alessandro Bellina <abellina@nvidia.com>
Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
---
 java/src/test/java/ai/rapids/cudf/TableTest.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 6f0b2b51f4c..bee8d1cbb88 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -3739,7 +3739,7 @@ void testChunkedPackBasic() {
       }
     }
   }
-
+/*
   @Test
   void testChunkedPackTwoPasses() {
     // this test packes ~2MB worth of long into a 1MB bounce buffer
@@ -3768,6 +3768,7 @@ void testChunkedPackTwoPasses() {
       }
     }
   }
+*/
 
   @Test
   void testContiguousSplitWithStrings() {

From 903dcac6a5341c200c4981c7b9d188897164e89c Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 4 Mar 2024 08:43:13 -0600
Subject: [PATCH 03/18] Fix accessing .columns issue (#15212)

---
 python/cudf/cudf/_lib/utils.pyx          |  4 +-
 python/cudf/cudf/core/indexed_frame.py   |  7 ++-
 python/cudf/cudf/tests/test_dataframe.py | 55 ++++++++++++------------
 3 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 896cc55b425..b6637e9df08 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -149,7 +149,9 @@ cpdef generate_pandas_metadata(table, index):
             col
             for col in table._columns
         ],
-        df=table,
+        # It is OKAY to do `.head(0).to_pandas()` because
+        # this method will extract `.columns` metadata only
+        df=table.head(0).to_pandas(),
         column_names=col_names,
         index_levels=index_levels,
         index_descriptors=index_descriptors,
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 8e43000d0a8..3c6e1e17142 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -2872,6 +2872,8 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
             self._column_names,
             None if has_range_index or not keep_index else self._index.names,
         )
+        result._data.label_dtype = self._data.label_dtype
+        result._data.rangeindex = self._data.rangeindex
 
         if keep_index and has_range_index:
             result.index = self.index[start:stop]
@@ -3053,7 +3055,7 @@ def duplicated(self, subset=None, keep="first"):
 
     @_cudf_nvtx_annotate
     def _empty_like(self, keep_index=True) -> Self:
-        return self._from_columns_like_self(
+        result = self._from_columns_like_self(
             libcudf.copying.columns_empty_like(
                 [
                     *(self._index._data.columns if keep_index else ()),
@@ -3063,6 +3065,9 @@ def _empty_like(self, keep_index=True) -> Self:
             self._column_names,
             self._index.names if keep_index else None,
         )
+        result._data.label_dtype = self._data.label_dtype
+        result._data.rangeindex = self._data.rangeindex
+        return result
 
     def _split(self, splits, keep_index=True):
         if self._num_rows == 0:
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 2084db89909..50b14d532e4 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -3012,43 +3012,31 @@ def test_series_rename():
 @pytest.mark.parametrize("data_type", dtypes)
 @pytest.mark.parametrize("nelem", [0, 100])
 def test_head_tail(nelem, data_type):
-    def check_index_equality(left, right):
-        assert left.index.equals(right.index)
-
-    def check_values_equality(left, right):
-        if len(left) == 0 and len(right) == 0:
-            return None
-
-        np.testing.assert_array_equal(left.to_pandas(), right.to_pandas())
-
-    def check_frame_series_equality(left, right):
-        check_index_equality(left, right)
-        check_values_equality(left, right)
-
-    gdf = cudf.DataFrame(
+    pdf = pd.DataFrame(
         {
             "a": np.random.randint(0, 1000, nelem).astype(data_type),
             "b": np.random.randint(0, 1000, nelem).astype(data_type),
         }
     )
+    gdf = cudf.from_pandas(pdf)
 
-    check_frame_series_equality(gdf.head(), gdf[:5])
-    check_frame_series_equality(gdf.head(3), gdf[:3])
-    check_frame_series_equality(gdf.head(-2), gdf[:-2])
-    check_frame_series_equality(gdf.head(0), gdf[0:0])
+    assert_eq(gdf.head(), pdf.head())
+    assert_eq(gdf.head(3), pdf.head(3))
+    assert_eq(gdf.head(-2), pdf.head(-2))
+    assert_eq(gdf.head(0), pdf.head(0))
 
-    check_frame_series_equality(gdf["a"].head(), gdf["a"][:5])
-    check_frame_series_equality(gdf["a"].head(3), gdf["a"][:3])
-    check_frame_series_equality(gdf["a"].head(-2), gdf["a"][:-2])
+    assert_eq(gdf["a"].head(), pdf["a"].head())
+    assert_eq(gdf["a"].head(3), pdf["a"].head(3))
+    assert_eq(gdf["a"].head(-2), pdf["a"].head(-2))
 
-    check_frame_series_equality(gdf.tail(), gdf[-5:])
-    check_frame_series_equality(gdf.tail(3), gdf[-3:])
-    check_frame_series_equality(gdf.tail(-2), gdf[2:])
-    check_frame_series_equality(gdf.tail(0), gdf[0:0])
+    assert_eq(gdf.tail(), pdf.tail())
+    assert_eq(gdf.tail(3), pdf.tail(3))
+    assert_eq(gdf.tail(-2), pdf.tail(-2))
+    assert_eq(gdf.tail(0), pdf.tail(0))
 
-    check_frame_series_equality(gdf["a"].tail(), gdf["a"][-5:])
-    check_frame_series_equality(gdf["a"].tail(3), gdf["a"][-3:])
-    check_frame_series_equality(gdf["a"].tail(-2), gdf["a"][2:])
+    assert_eq(gdf["a"].tail(), pdf["a"].tail())
+    assert_eq(gdf["a"].tail(3), pdf["a"].tail(3))
+    assert_eq(gdf["a"].tail(-2), pdf["a"].tail(-2))
 
 
 def test_tail_for_string():
@@ -4328,6 +4316,17 @@ def test_one_row_head():
     assert_eq(head_pdf, head_gdf)
 
 
+@pytest.mark.parametrize("index", [None, [123], ["a", "b"]])
+def test_no_cols_head(index):
+    pdf = pd.DataFrame(index=index)
+    gdf = cudf.from_pandas(pdf)
+
+    head_gdf = gdf.head()
+    head_pdf = pdf.head()
+
+    assert_eq(head_pdf, head_gdf)
+
+
 @pytest.mark.parametrize("dtype", ALL_TYPES)
 @pytest.mark.parametrize(
     "np_dtype,pd_dtype",

From dbdcc31fe1cbe902d495428da3c68dc59d289dc5 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Mon, 4 Mar 2024 18:22:49 +0000
Subject: [PATCH 04/18] Expose new stable_sort and finish stream_compaction in
 pylibcudf (#15175)

Completes coverage of `sorting.hpp` and `stream_compaction.hpp`

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15175
---
 python/cudf/cudf/_lib/cpp/sorting.pxd         |   7 +-
 .../cudf/cudf/_lib/cpp/stream_compaction.pxd  |  43 +++-
 python/cudf/cudf/_lib/pylibcudf/sorting.pxd   |   2 +
 python/cudf/cudf/_lib/pylibcudf/sorting.pyx   |  39 +++-
 .../cudf/_lib/pylibcudf/stream_compaction.pxd |  34 +++-
 .../cudf/_lib/pylibcudf/stream_compaction.pyx | 185 ++++++++++++++++--
 python/cudf/cudf/_lib/stream_compaction.pyx   |   1 +
 7 files changed, 275 insertions(+), 36 deletions(-)

diff --git a/python/cudf/cudf/_lib/cpp/sorting.pxd b/python/cudf/cudf/_lib/cpp/sorting.pxd
index 68f01003fe6..86dc0f0de95 100644
--- a/python/cudf/cudf/_lib/cpp/sorting.pxd
+++ b/python/cudf/cudf/_lib/cpp/sorting.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
@@ -68,3 +68,8 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil:
         table_view source_table,
         vector[libcudf_types.order] column_order,
         vector[libcudf_types.null_order] null_precedence) except +
+
+    cdef unique_ptr[table] stable_sort(
+        table_view source_table,
+        vector[libcudf_types.order] column_order,
+        vector[libcudf_types.null_order] null_precedence) except +
diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd
index e8539ecb9c3..55854a9444f 100644
--- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd
+++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd
@@ -30,21 +30,28 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil:
                                       vector[size_type] keys,
                                       size_type keep_threshold) except +
 
+    cdef unique_ptr[table] drop_nans(table_view source_table,
+                                     vector[size_type] keys,
+                                     size_type keep_threshold) except +
+
     cdef unique_ptr[table] apply_boolean_mask(
         table_view source_table,
         column_view boolean_mask
     ) except +
 
-    cdef size_type distinct_count(
-        column_view source_table,
-        null_policy null_handling,
-        nan_policy nan_handling) except +
+    cdef unique_ptr[table] unique(
+        table_view input,
+        vector[size_type] keys,
+        duplicate_keep_option keep,
+        null_equality nulls_equal,
+    ) except +
 
-    cdef unique_ptr[table] stable_distinct(
+    cdef unique_ptr[table] distinct(
         table_view input,
         vector[size_type] keys,
         duplicate_keep_option keep,
         null_equality nulls_equal,
+        nan_equality nans_equals,
     ) except +
 
     cdef unique_ptr[column] distinct_indices(
@@ -53,3 +60,29 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil:
         null_equality nulls_equal,
         nan_equality nans_equal,
     ) except +
+
+    cdef unique_ptr[table] stable_distinct(
+        table_view input,
+        vector[size_type] keys,
+        duplicate_keep_option keep,
+        null_equality nulls_equal,
+        nan_equality nans_equal,
+    ) except +
+
+    cdef size_type unique_count(
+        column_view column,
+        null_policy null_handling,
+        nan_policy nan_handling) except +
+
+    cdef size_type unique_count(
+        table_view source_table,
+        null_policy null_handling) except +
+
+    cdef size_type distinct_count(
+        column_view column,
+        null_policy null_handling,
+        nan_policy nan_handling) except +
+
+    cdef size_type distinct_count(
+        table_view source_table,
+        null_policy null_handling) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd b/python/cudf/cudf/_lib/pylibcudf/sorting.pxd
index fb22da0b0fd..3ed241622c0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/sorting.pxd
@@ -59,3 +59,5 @@ cpdef Table stable_sort_by_key(
 )
 
 cpdef Table sort(Table source_table, list column_order, list null_precedence)
+
+cpdef Table stable_sort(Table source_table, list column_order, list null_precedence)
diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx b/python/cudf/cudf/_lib/pylibcudf/sorting.pyx
index 4e73760720a..1668a3efc7c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/sorting.pyx
@@ -50,7 +50,8 @@ cpdef Column stable_sorted_order(
     list column_order,
     list null_precedence,
 ):
-    """Computes the row indices required to sort the table, maintaining input order.
+    """Computes the row indices required to sort the table,
+    preserving order of equal elements.
 
     Parameters
     ----------
@@ -206,7 +207,8 @@ cpdef Table stable_segmented_sort_by_key(
     list column_order,
     list null_precedence,
 ):
-    """Sorts the table by key, within segments, maintaining input order.
+    """Sorts the table by key preserving order of equal elements,
+    within segments.
 
     Parameters
     ----------
@@ -287,7 +289,7 @@ cpdef Table stable_sort_by_key(
     list column_order,
     list null_precedence,
 ):
-    """Sorts the table by key, maintaining input order.
+    """Sorts the table by key preserving order of equal elements.
 
     Parameters
     ----------
@@ -349,3 +351,34 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence):
             )
         )
     return Table.from_libcudf(move(c_result))
+
+
+cpdef Table stable_sort(Table source_table, list column_order, list null_precedence):
+    """Sorts the table preserving order of equal elements.
+
+    Parameters
+    ----------
+    source_table : Table
+        The table to sort.
+    column_order : List[ColumnOrder]
+        Whether each column should be sorted in ascending or descending order.
+    null_precedence : List[NullOrder]
+        Whether nulls should come before or after non-nulls.
+
+    Returns
+    -------
+    Table
+        The sorted table.
+    """
+    cdef unique_ptr[table] c_result
+    cdef vector[order] c_orders = column_order
+    cdef vector[null_order] c_null_precedence = null_precedence
+    with nogil:
+        c_result = move(
+            cpp_sorting.stable_sort(
+                source_table.view(),
+                c_orders,
+                c_null_precedence,
+            )
+        )
+    return Table.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd
index 78adb20021c..29acc21fc05 100644
--- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd
@@ -15,19 +15,21 @@ from .table cimport Table
 
 cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold)
 
-cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask)
+cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold)
 
-cpdef size_type distinct_count(
-    Column source_table,
-    null_policy null_handling,
-    nan_policy nan_handling
+cpdef Table unique(
+    Table input,
+    list keys,
+    duplicate_keep_option keep,
+    null_equality nulls_equal,
 )
 
-cpdef Table stable_distinct(
+cpdef Table distinct(
     Table input,
     list keys,
     duplicate_keep_option keep,
     null_equality nulls_equal,
+    nan_equality nans_equal,
 )
 
 cpdef Column distinct_indices(
@@ -36,3 +38,23 @@ cpdef Column distinct_indices(
     null_equality nulls_equal,
     nan_equality nans_equal,
 )
+
+cpdef Table stable_distinct(
+    Table input,
+    list keys,
+    duplicate_keep_option keep,
+    null_equality nulls_equal,
+    nan_equality nans_equal,
+)
+
+cpdef size_type unique_count(
+    Column column,
+    null_policy null_handling,
+    nan_policy nan_handling
+)
+
+cpdef size_type distinct_count(
+    Column column,
+    null_policy null_handling,
+    nan_policy nan_handling
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx
index 0357866980a..af7a85d31bf 100644
--- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx
@@ -51,6 +51,34 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold):
     return Table.from_libcudf(move(c_result))
 
 
+cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold):
+    """Filters out rows from the input table based on the presence of NaNs.
+
+    Parameters
+    ----------
+    source_table : Table
+        The input table to filter.
+    keys : List[size_type]
+        The list of column indexes to consider for NaN filtering.
+    keep_threshold : size_type
+        The minimum number of non-NaNs required to keep a row.
+
+    Returns
+    -------
+    Table
+        A new table with rows removed based on NaNs.
+    """
+    cdef unique_ptr[table] c_result
+    cdef vector[size_type] c_keys = keys
+    with nogil:
+        c_result = move(
+            cpp_stream_compaction.drop_nulls(
+                source_table.view(), c_keys, keep_threshold
+            )
+        )
+    return Table.from_libcudf(move(c_result))
+
+
 cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask):
     """Filters out rows from the input table based on a boolean mask.
 
@@ -76,39 +104,55 @@ cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask):
     return Table.from_libcudf(move(c_result))
 
 
-cpdef size_type distinct_count(
-    Column source_table,
-    null_policy null_handling,
-    nan_policy nan_handling
+cpdef Table unique(
+    Table input,
+    list keys,
+    duplicate_keep_option keep,
+    null_equality nulls_equal,
 ):
-    """Returns the number of unique elements in the input column.
+    """Filter duplicate consecutive rows from the input table.
 
     Parameters
     ----------
-    source_table : Column
-        The input column to count the unique elements of.
-    null_handling : null_policy
-        Flag to include or exclude nulls from the count.
-    nan_handling : nan_policy
-        Flag to include or exclude NaNs from the count.
+    input : Table
+        The input table to filter
+    keys : list[int]
+        The list of column indexes to consider for filtering.
+    keep : duplicate_keep_option
+        The option to specify which rows to keep in the case of duplicates.
+    nulls_equal : null_equality
+        The option to specify how nulls are handled in the comparison.
 
     Returns
     -------
-    size_type
-        The number of unique elements in the input column.
+    Table
+        New Table with unique rows from each sequence of equivalent rows
+        as specified by keep. In the same order as the input table.
+
+    Notes
+    -----
+    If the input columns to be filtered on are sorted, then
+    unique can produce the same result as stable_distinct, but faster.
     """
-    return cpp_stream_compaction.distinct_count(
-        source_table.view(), null_handling, nan_handling
-    )
+    cdef unique_ptr[table] c_result
+    cdef vector[size_type] c_keys = keys
+    with nogil:
+        c_result = move(
+            cpp_stream_compaction.unique(
+                input.view(), c_keys, keep, nulls_equal
+            )
+        )
+    return Table.from_libcudf(move(c_result))
 
 
-cpdef Table stable_distinct(
+cpdef Table distinct(
     Table input,
     list keys,
     duplicate_keep_option keep,
     null_equality nulls_equal,
+    nan_equality nans_equal,
 ):
-    """Get the distinct rows from the input table, preserving input order.
+    """Get the distinct rows from the input table.
 
     Parameters
     ----------
@@ -120,18 +164,21 @@ cpdef Table stable_distinct(
         The option to specify which rows to keep in the case of duplicates.
     nulls_equal : null_equality
         The option to specify how nulls are handled in the comparison.
+    nans_equal : nan_equality
+        The option to specify how NaNs are handled in the comparison.
 
     Returns
     -------
     Table
-        A new table with distinct rows from the input table.
+        A new table with distinct rows from the input table. The
+        output will not necessarily be in the same order as the input.
     """
     cdef unique_ptr[table] c_result
     cdef vector[size_type] c_keys = keys
     with nogil:
         c_result = move(
-            cpp_stream_compaction.stable_distinct(
-                input.view(), c_keys, keep, nulls_equal
+            cpp_stream_compaction.distinct(
+                input.view(), c_keys, keep, nulls_equal, nans_equal
             )
         )
     return Table.from_libcudf(move(c_result))
@@ -169,3 +216,99 @@ cpdef Column distinct_indices(
             )
         )
     return Column.from_libcudf(move(c_result))
+
+
+cpdef Table stable_distinct(
+    Table input,
+    list keys,
+    duplicate_keep_option keep,
+    null_equality nulls_equal,
+    nan_equality nans_equal,
+):
+    """Get the distinct rows from the input table, preserving input order.
+
+    Parameters
+    ----------
+    input : Table
+        The input table to filter.
+    keys : list
+        The list of column indexes to consider for distinct filtering.
+    keep : duplicate_keep_option
+        The option to specify which rows to keep in the case of duplicates.
+    nulls_equal : null_equality
+        The option to specify how nulls are handled in the comparison.
+    nans_equal : nan_equality
+        The option to specify how NaNs are handled in the comparison.
+
+    Returns
+    -------
+    Table
+        A new table with distinct rows from the input table, preserving
+        the input table order.
+    """
+    cdef unique_ptr[table] c_result
+    cdef vector[size_type] c_keys = keys
+    with nogil:
+        c_result = move(
+            cpp_stream_compaction.stable_distinct(
+                input.view(), c_keys, keep, nulls_equal, nans_equal
+            )
+        )
+    return Table.from_libcudf(move(c_result))
+
+
+cpdef size_type unique_count(
+    Column source,
+    null_policy null_handling,
+    nan_policy nan_handling
+):
+    """Returns the number of unique consecutive elements in the input column.
+
+    Parameters
+    ----------
+    source : Column
+        The input column to count the unique elements of.
+    null_handling : null_policy
+        Flag to include or exclude nulls from the count.
+    nan_handling : nan_policy
+        Flag to include or exclude NaNs from the count.
+
+    Returns
+    -------
+    size_type
+        The number of unique consecutive elements in the input column.
+
+    Notes
+    -----
+    If the input column is sorted, then unique_count can produce the
+    same result as distinct_count, but faster.
+    """
+    return cpp_stream_compaction.unique_count(
+        source.view(), null_handling, nan_handling
+    )
+
+
+cpdef size_type distinct_count(
+    Column source,
+    null_policy null_handling,
+    nan_policy nan_handling
+):
+    """Returns the number of distinct elements in the input column.
+
+    Parameters
+    ----------
+    source : Column
+        The input column to count the unique elements of.
+    null_handling : null_policy
+        Flag to include or exclude nulls from the count.
+    nan_handling : nan_policy
+        Flag to include or exclude NaNs from the count.
+
+    Returns
+    -------
+    size_type
+        The number of distinct elements in the input column.
+    """
+    return cpp_stream_compaction.distinct_count(
+        source.view(), null_handling, nan_handling
+    )
diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx
index 04883eac559..834f91f48d9 100644
--- a/python/cudf/cudf/_lib/stream_compaction.pyx
+++ b/python/cudf/cudf/_lib/stream_compaction.pyx
@@ -109,6 +109,7 @@ def drop_duplicates(list columns,
             keep_option,
             pylibcudf.types.NullEquality.EQUAL
             if nulls_are_equal else pylibcudf.types.NullEquality.UNEQUAL,
+            pylibcudf.types.NanEquality.ALL_EQUAL,
         )
     )
 

From da113015aade79d78628d00578dff22a4dd5cf35 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 4 Mar 2024 13:17:33 -0600
Subject: [PATCH 05/18] Switch `pytest-xdist` algo to `worksteal` (#15207)

This PR switches `pytest-xdist` distribution algorithm to a much more efficient algorithm `worksteal`, that will assign any idle pytest worker to pickup remaining pytests.

I see a 25% time savings when this switch is made locally:
```
`loadscope`:
== 101421 passed, 2115 skipped, 867 xfailed in 1179.48s (0:19:39) ==
`worksteal`:
== 101423 passed, 2115 skipped, 867 xfailed in 891.79s (0:14:51) ==
```

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/15207
---
 ci/test_python_cudf.sh  | 6 +++---
 ci/test_python_other.sh | 4 ++--
 ci/test_wheel_cudf.sh   | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh
index ace71bb0b75..bacb54b3896 100755
--- a/ci/test_python_cudf.sh
+++ b/ci/test_python_cudf.sh
@@ -18,7 +18,7 @@ rapids-logger "pytest cudf"
 ./ci/run_cudf_pytests.sh \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \
   --numprocesses=8 \
-  --dist=loadscope \
+  --dist=worksteal \
   --cov-config=../.coveragerc \
   --cov=cudf \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-coverage.xml" \
@@ -32,7 +32,7 @@ rapids-logger "pytest cudf"
 rapids-logger "pytest for cudf benchmarks"
 ./ci/run_cudf_pytest_benchmarks.sh \
   --numprocesses=8 \
-  --dist=loadscope \
+  --dist=worksteal \
   --cov-config=.coveragerc \
   --cov=cudf \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-benchmark-coverage.xml" \
@@ -41,7 +41,7 @@ rapids-logger "pytest for cudf benchmarks"
 rapids-logger "pytest for cudf benchmarks using pandas"
 ./ci/run_cudf_pandas_pytest_benchmarks.sh \
   --numprocesses=8 \
-  --dist=loadscope \
+  --dist=worksteal \
   --cov-config=.coveragerc \
   --cov=cudf \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-benchmark-pandas-coverage.xml" \
diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh
index bc15747b26a..9cdceb295db 100755
--- a/ci/test_python_other.sh
+++ b/ci/test_python_other.sh
@@ -23,7 +23,7 @@ rapids-logger "pytest dask_cudf"
 ./ci/run_dask_cudf_pytests.sh \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
   --numprocesses=8 \
-  --dist=loadscope \
+  --dist=worksteal \
   --cov-config=../.coveragerc \
   --cov=dask_cudf \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \
@@ -33,7 +33,7 @@ rapids-logger "pytest custreamz"
 ./ci/run_custreamz_pytests.sh \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-custreamz.xml" \
   --numprocesses=8 \
-  --dist=loadscope \
+  --dist=worksteal \
   --cov-config=../.coveragerc \
   --cov=custreamz \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/custreamz-coverage.xml" \
diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh
index b7e8f862ed5..af5779f478a 100755
--- a/ci/test_wheel_cudf.sh
+++ b/ci/test_wheel_cudf.sh
@@ -37,7 +37,7 @@ else
       --cache-clear \
       --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \
       --numprocesses=8 \
-      --dist=loadscope \
+      --dist=worksteal \
       .
     popd
 fi

From 0ff5a2c59cb62d6b3c473885ebbe883d1aae8c4f Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Mon, 4 Mar 2024 15:20:32 -0500
Subject: [PATCH 06/18] Replace local copyright check with pre-commit-hooks
 verify-copyright (#14917)

The local `copyright.py` script is bug-prone. Replace it with a more robust centralized script from `pre-commit-hooks`.

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Jake Awe (https://github.com/AyodeAwe)
  - Karthikeyan (https://github.com/karthikeyann)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/14917
---
 .pre-commit-config.yaml |  13 +-
 ci/checks/copyright.py  | 277 ----------------------------------------
 2 files changed, 7 insertions(+), 283 deletions(-)
 delete mode 100644 ci/checks/copyright.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d302543368e..9235c80bdc9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -126,12 +126,6 @@ repos:
           - cmakelang==0.6.13
         verbose: true
         require_serial: true
-      - id: copyright-check
-        name: copyright-check
-        entry: python ./ci/checks/copyright.py --git-modified-only --update-current-year
-        language: python
-        pass_filenames: false
-        additional_dependencies: [gitpython]
       - id: doxygen-check
         name: doxygen-check
         entry: ./ci/checks/doxygen.sh
@@ -161,6 +155,13 @@ repos:
     hooks:
       - id: ruff
         files: python/.*$
+  - repo: https://github.com/rapidsai/pre-commit-hooks
+    rev: v0.0.1
+    hooks:
+      - id: verify-copyright
+        exclude: |
+          (?x)
+              cpp/include/cudf_test/cxxopts[.]hpp$
 
 
 default_language_version:
diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py
deleted file mode 100644
index dd89b092496..00000000000
--- a/ci/checks/copyright.py
+++ /dev/null
@@ -1,277 +0,0 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import argparse
-import datetime
-import os
-import re
-import sys
-
-import git
-
-FilesToCheck = [
-    re.compile(r"[.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$"),
-    re.compile(r"CMakeLists[.]txt$"),
-    re.compile(r"CMakeLists_standalone[.]txt$"),
-    re.compile(r"setup[.]cfg$"),
-    re.compile(r"meta[.]yaml$"),
-]
-ExemptFiles = [
-    re.compile(r"cpp/include/cudf_test/cxxopts.hpp"),
-]
-
-# this will break starting at year 10000, which is probably OK :)
-CheckSimple = re.compile(
-    r"Copyright *(?:\(c\))? *(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)"
-)
-CheckDouble = re.compile(
-    r"Copyright *(?:\(c\))? *(\d{4})-(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)"  # noqa: E501
-)
-
-
-def checkThisFile(f):
-    if isinstance(f, git.Diff):
-        if f.deleted_file or f.b_blob.size == 0:
-            return False
-        f = f.b_path
-    elif not os.path.exists(f) or os.stat(f).st_size == 0:
-        # This check covers things like symlinks which point to files that DNE
-        return False
-    for exempt in ExemptFiles:
-        if exempt.search(f):
-            return False
-    for checker in FilesToCheck:
-        if checker.search(f):
-            return True
-    return False
-
-
-def modifiedFiles():
-    """Get a set of all modified files, as Diff objects.
-
-    The files returned have been modified in git since the merge base of HEAD
-    and the upstream of the target branch. We return the Diff objects so that
-    we can read only the staged changes.
-    """
-    repo = git.Repo()
-    # Use the environment variable TARGET_BRANCH or RAPIDS_BASE_BRANCH (defined in CI) if possible
-    target_branch = os.environ.get("TARGET_BRANCH", os.environ.get("RAPIDS_BASE_BRANCH"))
-    if target_branch is None:
-        # Fall back to the closest branch if not on CI
-        target_branch = repo.git.describe(
-            all=True, tags=True, match="branch-*", abbrev=0
-        ).lstrip("heads/")
-
-    upstream_target_branch = None
-    if target_branch in repo.heads:
-        # Use the tracking branch of the local reference if it exists. This
-        # returns None if no tracking branch is set.
-        upstream_target_branch = repo.heads[target_branch].tracking_branch()
-    if upstream_target_branch is None:
-        # Fall back to the remote with the newest target_branch. This code
-        # path is used on CI because the only local branch reference is
-        # current-pr-branch, and thus target_branch is not in repo.heads.
-        # This also happens if no tracking branch is defined for the local
-        # target_branch. We use the remote with the latest commit if
-        # multiple remotes are defined.
-        candidate_branches = [
-            remote.refs[target_branch] for remote in repo.remotes
-            if target_branch in remote.refs
-        ]
-        if len(candidate_branches) > 0:
-            upstream_target_branch = sorted(
-                candidate_branches,
-                key=lambda branch: branch.commit.committed_datetime,
-            )[-1]
-        else:
-            # If no remotes are defined, try to use the local version of the
-            # target_branch. If this fails, the repo configuration must be very
-            # strange and we can fix this script on a case-by-case basis.
-            upstream_target_branch = repo.heads[target_branch]
-    merge_base = repo.merge_base("HEAD", upstream_target_branch.commit)[0]
-    diff = merge_base.diff()
-    changed_files = {f for f in diff if f.b_path is not None}
-    return changed_files
-
-
-def getCopyrightYears(line):
-    res = CheckSimple.search(line)
-    if res:
-        return int(res.group(1)), int(res.group(1))
-    res = CheckDouble.search(line)
-    if res:
-        return int(res.group(1)), int(res.group(2))
-    return None, None
-
-
-def replaceCurrentYear(line, start, end):
-    # first turn a simple regex into double (if applicable). then update years
-    res = CheckSimple.sub(r"Copyright (c) \1-\1, NVIDIA CORPORATION", line)
-    res = CheckDouble.sub(
-        rf"Copyright (c) {start:04d}-{end:04d}, NVIDIA CORPORATION",
-        res,
-    )
-    return res
-
-
-def checkCopyright(f, update_current_year):
-    """Checks for copyright headers and their years."""
-    errs = []
-    thisYear = datetime.datetime.now().year
-    lineNum = 0
-    crFound = False
-    yearMatched = False
-
-    if isinstance(f, git.Diff):
-        path = f.b_path
-        lines = f.b_blob.data_stream.read().decode().splitlines(keepends=True)
-    else:
-        path = f
-        with open(f, encoding="utf-8") as fp:
-            lines = fp.readlines()
-
-    for line in lines:
-        lineNum += 1
-        start, end = getCopyrightYears(line)
-        if start is None:
-            continue
-        crFound = True
-        if start > end:
-            e = [
-                path,
-                lineNum,
-                "First year after second year in the copyright "
-                "header (manual fix required)",
-                None,
-            ]
-            errs.append(e)
-        elif thisYear < start or thisYear > end:
-            e = [
-                path,
-                lineNum,
-                "Current year not included in the copyright header",
-                None,
-            ]
-            if thisYear < start:
-                e[-1] = replaceCurrentYear(line, thisYear, end)
-            if thisYear > end:
-                e[-1] = replaceCurrentYear(line, start, thisYear)
-            errs.append(e)
-        else:
-            yearMatched = True
-    # copyright header itself not found
-    if not crFound:
-        e = [
-            path,
-            0,
-            "Copyright header missing or formatted incorrectly "
-            "(manual fix required)",
-            None,
-        ]
-        errs.append(e)
-    # even if the year matches a copyright header, make the check pass
-    if yearMatched:
-        errs = []
-
-    if update_current_year:
-        errs_update = [x for x in errs if x[-1] is not None]
-        if len(errs_update) > 0:
-            lines_changed = ", ".join(str(x[1]) for x in errs_update)
-            print(f"File: {path}. Changing line(s) {lines_changed}")
-            for _, lineNum, __, replacement in errs_update:
-                lines[lineNum - 1] = replacement
-            with open(path, "w", encoding="utf-8") as out_file:
-                out_file.writelines(lines)
-
-    return errs
-
-
-def getAllFilesUnderDir(root, pathFilter=None):
-    retList = []
-    for dirpath, dirnames, filenames in os.walk(root):
-        for fn in filenames:
-            filePath = os.path.join(dirpath, fn)
-            if pathFilter(filePath):
-                retList.append(filePath)
-    return retList
-
-
-def checkCopyright_main():
-    """
-    Checks for copyright headers in all the modified files. In case of local
-    repo, this script will just look for uncommitted files and in case of CI
-    it compares between branches "$PR_TARGET_BRANCH" and "current-pr-branch"
-    """
-    retVal = 0
-
-    argparser = argparse.ArgumentParser(
-        "Checks for a consistent copyright header in git's modified files"
-    )
-    argparser.add_argument(
-        "--update-current-year",
-        dest="update_current_year",
-        action="store_true",
-        required=False,
-        help="If set, "
-        "update the current year if a header is already "
-        "present and well formatted.",
-    )
-    argparser.add_argument(
-        "--git-modified-only",
-        dest="git_modified_only",
-        action="store_true",
-        required=False,
-        help="If set, "
-        "only files seen as modified by git will be "
-        "processed.",
-    )
-
-    args, dirs = argparser.parse_known_args()
-
-    if args.git_modified_only:
-        files = [f for f in modifiedFiles() if checkThisFile(f)]
-    else:
-        files = []
-        for d in [os.path.abspath(d) for d in dirs]:
-            if not os.path.isdir(d):
-                raise ValueError(f"{d} is not a directory.")
-            files += getAllFilesUnderDir(d, pathFilter=checkThisFile)
-
-    errors = []
-    for f in files:
-        errors += checkCopyright(f, args.update_current_year)
-
-    if len(errors) > 0:
-        if any(e[-1] is None for e in errors):
-            print("Copyright headers incomplete in some of the files!")
-        for e in errors:
-            print("  %s:%d Issue: %s" % (e[0], e[1], e[2]))
-        print("")
-        n_fixable = sum(1 for e in errors if e[-1] is not None)
-        path_parts = os.path.abspath(__file__).split(os.sep)
-        file_from_repo = os.sep.join(path_parts[path_parts.index("ci") :])
-        if n_fixable > 0 and not args.update_current_year:
-            print(
-                f"You can run `python {file_from_repo} --git-modified-only "
-                "--update-current-year` and stage the results in git to "
-                f"fix {n_fixable} of these errors.\n"
-            )
-        retVal = 1
-
-    return retVal
-
-
-if __name__ == "__main__":
-    sys.exit(checkCopyright_main())

From d158ccdbe651952bd649cb0f17c41467c5209824 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Mon, 4 Mar 2024 15:25:51 -0500
Subject: [PATCH 07/18] API for JSON unquoted whitespace normalization (#15033)

This work is a follow-up to PR #14931 which provided a proof-of-concept for using the a FST to normalize unquoted whitespaces. This PR implements the pre-processing FST in cuIO and adds a JSON reader option that needs to be set to true to invoke the normalizer.
Addresses feature request #14865

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Robert Maynard (https://github.com/robertmaynard)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15033
---
 cpp/CMakeLists.txt                            |   2 +-
 cpp/include/cudf/io/detail/json.hpp           |  10 +
 cpp/include/cudf/io/json.hpp                  |  31 +++
 ...normalization.cu => json_normalization.cu} | 142 ++++++++++++-
 cpp/src/io/json/read_json.cu                  |   7 +
 .../io/json_whitespace_normalization_test.cu  | 201 ++++--------------
 .../main/java/ai/rapids/cudf/JSONOptions.java |  15 ++
 java/src/main/java/ai/rapids/cudf/Table.java  |   9 +
 java/src/main/native/src/TableJni.cpp         |  27 ++-
 .../test/java/ai/rapids/cudf/TableTest.java   |  49 +++--
 java/src/test/resources/whitespaces.json      |   5 +
 11 files changed, 314 insertions(+), 184 deletions(-)
 rename cpp/src/io/json/{json_quote_normalization.cu => json_normalization.cu} (57%)
 create mode 100644 java/src/test/resources/whitespaces.json

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 5fd6cd3544a..c74963be50d 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -376,7 +376,7 @@ add_library(
   src/io/functions.cpp
   src/io/json/byte_range_info.cu
   src/io/json/json_column.cu
-  src/io/json/json_quote_normalization.cu
+  src/io/json/json_normalization.cu
   src/io/json/json_tree.cu
   src/io/json/nested_json_gpu.cu
   src/io/json/read_json.cu
diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index 0eb0e17ea10..3f7f7e9bb32 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -63,4 +63,14 @@ rmm::device_uvector<char> normalize_single_quotes(rmm::device_uvector<char>&& in
                                                   rmm::cuda_stream_view stream,
                                                   rmm::mr::device_memory_resource* mr);
 
+/**
+ * @brief Normalize unquoted whitespace (space and tab characters) using FST
+ *
+ * @param inbuf Input device buffer
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource to use for device memory allocation
+ */
+rmm::device_uvector<char> normalize_whitespace(rmm::device_uvector<char>&& inbuf,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::mr::device_memory_resource* mr);
 }  // namespace cudf::io::json::detail
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index f0c3d48ab7e..593dd044d51 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -118,6 +118,9 @@ class json_reader_options {
   // Normalize single quotes
   bool _normalize_single_quotes = false;
 
+  // Normalize unquoted spaces and tabs
+  bool _normalize_whitespace = false;
+
   // Whether to recover after an invalid JSON line
   json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;
 
@@ -265,6 +268,13 @@ class json_reader_options {
    */
   bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; }
 
+  /**
+   * @brief Whether the reader should normalize unquoted whitespace characters
+   *
+   * @returns true if the reader should normalize whitespace, false otherwise
+   */
+  bool is_enabled_normalize_whitespace() const { return _normalize_whitespace; }
+
   /**
    * @brief Queries the JSON reader's behavior on invalid JSON lines.
    *
@@ -358,6 +368,14 @@ class json_reader_options {
    */
   void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; }
 
+  /**
+   * @brief Set whether the reader should enable normalization of unquoted whitespace
+   *
+   * @param val Boolean value to indicate whether the reader should normalize unquoted whitespace
+   * characters i.e. tabs and spaces
+   */
+  void enable_normalize_whitespace(bool val) { _normalize_whitespace = val; }
+
   /**
    * @brief Specifies the JSON reader's behavior on invalid JSON lines.
    *
@@ -533,6 +551,19 @@ class json_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Set whether the reader should normalize unquoted whitespace
+   *
+   * @param val Boolean value to indicate whether the reader should normalize unquoted
+   * whitespace
+   * @return this for chaining
+   */
+  json_reader_options_builder& normalize_whitespace(bool val)
+  {
+    options._normalize_whitespace = val;
+    return *this;
+  }
+
   /**
    * @brief Specifies the JSON reader's behavior on invalid JSON lines.
    *
diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_normalization.cu
similarity index 57%
rename from cpp/src/io/json/json_quote_normalization.cu
rename to cpp/src/io/json/json_normalization.cu
index a13b6e0b016..86e4da664a8 100644
--- a/cpp/src/io/json/json_quote_normalization.cu
+++ b/cpp/src/io/json/json_normalization.cu
@@ -32,13 +32,15 @@
 
 namespace cudf::io::json {
 
-using SymbolT       = char;
-using StateT        = char;
+// Type used to represent the atomic symbol type used within the finite-state machine
+using SymbolT = char;
+using StateT  = char;
+
+// Type sufficiently large to index symbols within the input and output (may be unsigned)
 using SymbolOffsetT = uint32_t;
 
 namespace normalize_quotes {
 
-// Type sufficiently large to index symbols within the input and output (may be unsigned)
 enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES };
 enum class dfa_symbol_group_id : uint32_t {
   DOUBLE_QUOTE_CHAR,  ///< Quote character SG: "
@@ -172,6 +174,116 @@ struct TransduceToNormalizedQuotes {
 
 }  // namespace normalize_quotes
 
+namespace normalize_whitespace {
+
+enum class dfa_symbol_group_id : uint32_t {
+  DOUBLE_QUOTE_CHAR,   ///< Quote character SG: "
+  ESCAPE_CHAR,         ///< Escape character SG: '\\'
+  NEWLINE_CHAR,        ///< Newline character SG: '\n'
+  WHITESPACE_SYMBOLS,  ///< Whitespace characters SG: '\t' or ' '
+  OTHER_SYMBOLS,       ///< SG implicitly matching all other characters
+  NUM_SYMBOL_GROUPS    ///< Total number of symbol groups
+};
+// Alias for readability of symbol group ids
+constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
+// The i-th string representing all the characters of a symbol group
+std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
+  {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}};
+
+/**
+ * -------- FST states ---------
+ * -----------------------------
+ * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double
+ *        |   quotes as well as any other character not enclosed by a string. Also handles
+ *        |   newline character present within a string
+ * TT_DQS | Double-quoted string state handling all characters within double quotes except
+ *        |   newline character
+ * TT_DEC | State handling escaped characters inside double-quoted string. Note that this
+ *        |   state is necessary to process escaped double-quote characters. Without this
+ *        |   state, whitespaces following escaped double quotes inside strings may be removed.
+ *
+ * NOTE: An important case NOT handled by this FST is that of whitespace following newline
+ * characters within a string. Consider the following example
+ * Input:           {"a":"x\n y"}
+ * FST output:      {"a":"x\ny"}
+ * Expected output: {"a":"x\n y"}
+ * Such strings are not part of the JSON standard (characters allowed within quotes should
+ * have ASCII at least 0x20 i.e. space character and above) but may be encountered while
+ * reading JSON files
+ */
+enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES };
+// Aliases for readability of the transition table
+constexpr auto TT_OOS        = dfa_states::TT_OOS;
+constexpr auto TT_DQS        = dfa_states::TT_DQS;
+constexpr auto TT_DEC        = dfa_states::TT_DEC;
+constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);
+
+// Transition table
+std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const wna_state_tt{
+  {/* IN_STATE      "       \       \n    <SPC>   OTHER  */
+   /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}},
+   /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}},
+   /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};
+
+// The DFA's starting state
+constexpr StateT start_state = static_cast<StateT>(TT_OOS);
+
+struct TransduceToNormalizedWS {
+  /**
+   * @brief Returns the <relative_offset>-th output symbol on the transition (state_id, match_id).
+   */
+  template <typename StateT, typename SymbolGroupT, typename RelativeOffsetT, typename SymbolT>
+  constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id,
+                                                SymbolGroupT const match_id,
+                                                RelativeOffsetT const relative_offset,
+                                                SymbolT const read_symbol) const
+  {
+    // -------- TRANSLATION TABLE ------------
+    //      Let the alphabet set be Sigma
+    // ---------------------------------------
+    // ---------- NON-SPECIAL CASES: ----------
+    //      Output symbol same as input symbol <s>
+    // state | read_symbol <s>  -> output_symbol <s>
+    // DQS   | Sigma            -> Sigma
+    // OOS   | Sigma\{<SPC>,\t} -> Sigma\{<SPC>,\t}
+    // DEC   | Sigma            -> Sigma
+    // ---------- SPECIAL CASES: --------------
+    //    Input symbol translates to output symbol
+    // OOS   | {<SPC>}          -> <nop>
+    // OOS   | {\t}             -> <nop>
+
+    // Case when read symbol is a space or tab but is unquoted
+    // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function
+    // However, since there is no output in this case i.e. the count returned by
+    // operator()(state_id, match_id, read_symbol) is zero, this function is never called.
+    // So skipping the check for this case.
+
+    // In all other cases, we have an output symbol for the input symbol.
+    // We simply output the input symbol
+    return read_symbol;
+  }
+
+  /**
+   * @brief Returns the number of output characters for a given transition.
+   * During whitespace normalization, we always emit one output character i.e., the input
+   * character, except when we need to remove the space/tab character
+   */
+  template <typename StateT, typename SymbolGroupT, typename SymbolT>
+  constexpr CUDF_HOST_DEVICE uint32_t operator()(StateT const state_id,
+                                                 SymbolGroupT const match_id,
+                                                 SymbolT const read_symbol) const
+  {
+    // Case when read symbol is a space or tab but is unquoted
+    if (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS) &&
+        state_id == static_cast<StateT>(dfa_states::TT_OOS)) {
+      return 0;
+    }
+    return 1;
+  }
+};
+
+}  // namespace normalize_whitespace
+
 namespace detail {
 
 rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT>&& inbuf,
@@ -198,5 +310,29 @@ rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT
   return outbuf;
 }
 
+rmm::device_uvector<SymbolT> normalize_whitespace(rmm::device_uvector<SymbolT>&& inbuf,
+                                                  rmm::cuda_stream_view stream,
+                                                  rmm::mr::device_memory_resource* mr)
+{
+  auto parser = fst::detail::make_fst(
+    fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs),
+    fst::detail::make_transition_table(normalize_whitespace::wna_state_tt),
+    fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}),
+    stream);
+
+  rmm::device_uvector<SymbolT> outbuf(inbuf.size(), stream, mr);
+  rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
+  parser.Transduce(inbuf.data(),
+                   static_cast<SymbolOffsetT>(inbuf.size()),
+                   outbuf.data(),
+                   thrust::make_discard_iterator(),
+                   outbuf_size.data(),
+                   normalize_whitespace::start_state,
+                   stream);
+
+  outbuf.resize(outbuf_size.value(stream), stream);
+  return outbuf;
+}
+
 }  // namespace detail
 }  // namespace cudf::io::json
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index ba8acf2d47a..506d7b6cddc 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -235,6 +235,13 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
       normalize_single_quotes(std::move(buffer), stream, rmm::mr::get_current_device_resource());
   }
 
+  // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is
+  // enabled, invoke pre-processing FST
+  if (reader_opts.is_enabled_normalize_whitespace()) {
+    buffer =
+      normalize_whitespace(std::move(buffer), stream, rmm::mr::get_current_device_resource());
+  }
+
   return device_parse_nested_json(buffer, reader_opts, stream, mr);
   // For debug purposes, use host_parse_nested_json()
 }
diff --git a/cpp/tests/io/json_whitespace_normalization_test.cu b/cpp/tests/io/json_whitespace_normalization_test.cu
index 545d8d2c4f9..336d360063f 100644
--- a/cpp/tests/io/json_whitespace_normalization_test.cu
+++ b/cpp/tests/io/json_whitespace_normalization_test.cu
@@ -13,177 +13,41 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "io/fst/lookup_tables.cuh"
-#include "io/utilities/hostdevice_vector.hpp"
-
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/default_stream.hpp>
+#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 
-#include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/detail/json.hpp>
+#include <cudf/io/json.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
-#include <rmm/cuda_stream.hpp>
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 
-#include <thrust/iterator/discard_iterator.h>
-
-#include <cstdlib>
 #include <string>
 
-namespace {
-// Type used to represent the atomic symbol type used within the finite-state machine
-using SymbolT = char;
-using StateT  = char;
-
-// Type sufficiently large to index symbols within the input and output (may be unsigned)
-using SymbolOffsetT = uint32_t;
-
-enum class dfa_symbol_group_id : uint32_t {
-  DOUBLE_QUOTE_CHAR,   ///< Quote character SG: "
-  ESCAPE_CHAR,         ///< Escape character SG: '\\'
-  NEWLINE_CHAR,        ///< Newline character SG: '\n'
-  WHITESPACE_SYMBOLS,  ///< Whitespace characters SG: '\t' or ' '
-  OTHER_SYMBOLS,       ///< SG implicitly matching all other characters
-  NUM_SYMBOL_GROUPS    ///< Total number of symbol groups
-};
-// Alias for readability of symbol group ids
-constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
-// The i-th string representing all the characters of a symbol group
-std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
-  {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}};
-
-/**
- * -------- FST states ---------
- * -----------------------------
- * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double
- *        |   quotes as well as any other character not enclosed by a string. Also handles
- *        |   newline character present within a string
- * TT_DQS | Double-quoted string state handling all characters within double quotes except
- *        |   newline character
- * TT_DEC | State handling escaped characters inside double-quoted string. Note that this
- *        |   state is necessary to process escaped double-quote characters. Without this
- *        |   state, whitespaces following escaped double quotes inside strings may be removed.
- *
- * NOTE: An important case NOT handled by this FST is that of whitespace following newline
- * characters within a string. Consider the following example
- * Input:           {"a":"x\n y"}
- * FST output:      {"a":"x\ny"}
- * Expected output: {"a":"x\n y"}
- */
-enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES };
-// Aliases for readability of the transition table
-constexpr auto TT_OOS        = dfa_states::TT_OOS;
-constexpr auto TT_DQS        = dfa_states::TT_DQS;
-constexpr auto TT_DEC        = dfa_states::TT_DEC;
-constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);
-
-// Transition table
-std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const wna_state_tt{
-  {/* IN_STATE      "       \       \n    <SPC>   OTHER  */
-   /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}},
-   /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}},
-   /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};
-
-// The DFA's starting state
-constexpr StateT start_state = static_cast<StateT>(TT_OOS);
-
-struct TransduceToNormalizedWS {
-  /**
-   * @brief Returns the <relative_offset>-th output symbol on the transition (state_id, match_id).
-   */
-  template <typename StateT, typename SymbolGroupT, typename RelativeOffsetT, typename SymbolT>
-  constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id,
-                                                SymbolGroupT const match_id,
-                                                RelativeOffsetT const relative_offset,
-                                                SymbolT const read_symbol) const
-  {
-    // -------- TRANSLATION TABLE ------------
-    //      Let the alphabet set be Sigma
-    // ---------------------------------------
-    // ---------- NON-SPECIAL CASES: ----------
-    //      Output symbol same as input symbol <s>
-    // state | read_symbol <s>  -> output_symbol <s>
-    // DQS   | Sigma            -> Sigma
-    // OOS   | Sigma\{<SPC>,\t} -> Sigma\{<SPC>,\t}
-    // DEC   | Sigma            -> Sigma
-    // ---------- SPECIAL CASES: --------------
-    //    Input symbol translates to output symbol
-    // OOS   | {<SPC>}          -> <nop>
-    // OOS   | {\t}             -> <nop>
-
-    // Case when read symbol is a space or tab but is unquoted
-    // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function
-    // However, since there is no output in this case i.e. the count returned by
-    // operator()(state_id, match_id, read_symbol) is zero, this function is never called.
-    // So skipping the check for this case.
-
-    // In all other cases, we have an output symbol for the input symbol.
-    // We simply output the input symbol
-    return read_symbol;
-  }
-
-  /**
-   * @brief Returns the number of output characters for a given transition.
-   * During whitespace normalization, we always emit one output character i.e., the input
-   * character, except when we need to remove the space/tab character
-   */
-  template <typename StateT, typename SymbolGroupT, typename SymbolT>
-  constexpr CUDF_HOST_DEVICE uint32_t operator()(StateT const state_id,
-                                                 SymbolGroupT const match_id,
-                                                 SymbolT const read_symbol) const
-  {
-    // Case when read symbol is a space or tab but is unquoted
-    if (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS) &&
-        state_id == static_cast<StateT>(dfa_states::TT_OOS)) {
-      return 0;
-    }
-    return 1;
-  }
-};
-}  // namespace
-
 // Base test fixture for tests
 struct JsonWSNormalizationTest : public cudf::test::BaseFixture {};
 
-void run_test(std::string const& input, std::string const& output)
+void run_test(std::string const& host_input, std::string const& expected_host_output)
 {
-  auto parser = cudf::io::fst::detail::make_fst(
-    cudf::io::fst::detail::make_symbol_group_lut(wna_sgs),
-    cudf::io::fst::detail::make_transition_table(wna_state_tt),
-    cudf::io::fst::detail::make_translation_functor(TransduceToNormalizedWS{}),
-    cudf::test::get_default_stream());
-
-  auto d_input_scalar = cudf::make_string_scalar(input, cudf::test::get_default_stream());
-  auto& d_input       = static_cast<cudf::scalar_type_t<std::string>&>(*d_input_scalar);
+  auto stream_view  = cudf::get_default_stream();
+  auto device_input = cudf::detail::make_device_uvector_async(
+    host_input, stream_view, rmm::mr::get_current_device_resource());
 
-  // Prepare input & output buffers
-  constexpr std::size_t single_item = 1;
-  cudf::detail::hostdevice_vector<SymbolT> output_gpu(input.size(),
-                                                      cudf::test::get_default_stream());
-  cudf::detail::hostdevice_vector<SymbolOffsetT> output_gpu_size(single_item,
-                                                                 cudf::test::get_default_stream());
+  // Preprocessing FST
+  auto device_fst_output = cudf::io::json::detail::normalize_whitespace(
+    std::move(device_input), stream_view, rmm::mr::get_current_device_resource());
 
-  // Allocate device-side temporary storage & run algorithm
-  parser.Transduce(d_input.data(),
-                   static_cast<SymbolOffsetT>(d_input.size()),
-                   output_gpu.device_ptr(),
-                   thrust::make_discard_iterator(),
-                   output_gpu_size.device_ptr(),
-                   start_state,
-                   cudf::test::get_default_stream());
+  auto const preprocessed_host_output =
+    cudf::detail::make_std_vector_sync(device_fst_output, stream_view);
 
-  // Async copy results from device to host
-  output_gpu.device_to_host_async(cudf::test::get_default_stream());
-  output_gpu_size.device_to_host_async(cudf::test::get_default_stream());
-
-  // Make sure results have been copied back to host
-  cudf::test::get_default_stream().synchronize();
-
-  // Verify results
-  ASSERT_EQ(output_gpu_size[0], output.size());
-  CUDF_TEST_EXPECT_VECTOR_EQUAL(output_gpu, output, output.size());
+  ASSERT_EQ(preprocessed_host_output.size(), expected_host_output.size());
+  CUDF_TEST_EXPECT_VECTOR_EQUAL(
+    preprocessed_host_output, expected_host_output, preprocessed_host_output.size());
 }
 
 TEST_F(JsonWSNormalizationTest, GroundTruth_Spaces)
@@ -259,4 +123,33 @@ TEST_F(JsonWSNormalizationTest, GroundTruth_InvalidInput)
   run_test(input, output);
 }
 
+TEST_F(JsonWSNormalizationTest, ReadJsonOption)
+{
+  // When mixed type fields are read as strings, the table read will differ depending the
+  // value of normalize_whitespace
+
+  // Test input
+  std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}";
+  cudf::io::json_reader_options input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{host_input.data(), host_input.size()})
+      .lines(true)
+      .mixed_types_as_string(true)
+      .normalize_whitespace(true);
+
+  cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options);
+
+  // Expected table
+  std::string const expected_input = R"({ "a" : {"b":"c"}})";
+  cudf::io::json_reader_options expected_input_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{expected_input.data(), expected_input.size()})
+      .lines(true)
+      .mixed_types_as_string(true)
+      .normalize_whitespace(false);
+
+  cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view());
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
index 62496e32f7a..b37d0d88ec9 100644
--- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
@@ -31,6 +31,7 @@ public final class JSONOptions extends ColumnFilterOptions {
   private final boolean lines;
   private final boolean recoverWithNull;
   private final boolean normalizeSingleQuotes;
+  private final boolean normalizeWhitespace;
   private final boolean mixedTypesAsStrings;
   private final boolean keepStringQuotes;
 
@@ -40,6 +41,7 @@ private JSONOptions(Builder builder) {
     lines = builder.lines;
     recoverWithNull = builder.recoverWithNull;
     normalizeSingleQuotes = builder.normalizeSingleQuotes;
+    normalizeWhitespace = builder.normalizeWhitespace;
     mixedTypesAsStrings = builder.mixedTypesAsStrings;
     keepStringQuotes = builder.keepQuotes;
   }
@@ -61,6 +63,10 @@ public boolean isNormalizeSingleQuotes() {
     return normalizeSingleQuotes;
   }
 
+  public boolean isNormalizeWhitespace() {
+    return normalizeWhitespace;
+  }
+
   public boolean isMixedTypesAsStrings() {
     return mixedTypesAsStrings;
   }
@@ -84,6 +90,7 @@ public static final class Builder  extends ColumnFilterOptions.Builder<JSONOptio
 
     private boolean recoverWithNull = false;
     private boolean normalizeSingleQuotes = false;
+    private boolean normalizeWhitespace = false;
 
     private boolean mixedTypesAsStrings = false;
     private boolean keepQuotes = false;
@@ -131,6 +138,14 @@ public Builder withNormalizeSingleQuotes(boolean normalizeSingleQuotes) {
       return this;
     }
 
+    /**
+     * Should the unquoted whitespace be removed.
+     */
+    public Builder withNormalizeWhitespace(boolean normalizeWhitespace) {
+      this.normalizeWhitespace = normalizeWhitespace;
+      return this;
+    }
+
     /**
      * Specify how to handle columns that contain mixed types.
      *
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index c562e08b4c8..a1bdfe9a796 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -252,6 +252,7 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
                                         boolean dayFirst, boolean lines,
                                         boolean recoverWithNulls,
                                         boolean normalizeSingleQuotes,
+                                        boolean normalizeWhitespace,
                                         boolean mixedTypesAsStrings,
                                         boolean keepStringQuotes) throws CudfException;
 
@@ -260,6 +261,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
                                       boolean dayFirst, boolean lines,
                                       boolean recoverWithNulls,
                                       boolean normalizeSingleQuotes,
+                                      boolean normalizeWhitespace,
                                       boolean mixedTypesAsStrings,
                                       boolean keepStringQuotes,
                                       long dsHandle) throws CudfException;
@@ -267,6 +269,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
   private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines,
                                       boolean recoverWithNulls,
                                       boolean normalizeSingleQuotes,
+                                      boolean normalizeWhitespace,
                                       boolean mixedTypesAsStrings,
                                       boolean keepStringQuotes,
                                       long dsHandle) throws CudfException;
@@ -275,6 +278,7 @@ private static native long readAndInferJSON(long address, long length,
                                               boolean lines,
                                               boolean recoverWithNulls,
                                               boolean normalizeSingleQuotes,
+                                              boolean normalizeWhitespace,
                                               boolean mixedTypesAsStrings,
                                               boolean keepStringQuotes) throws CudfException;
 
@@ -1257,6 +1261,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
                     0, 0,
                     opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(),
                     opts.isNormalizeSingleQuotes(),
+                    opts.isNormalizeWhitespace(),
                     opts.isMixedTypesAsStrings(),
                 opts.keepStringQuotes()))) {
 
@@ -1312,6 +1317,7 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
     return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len,
         opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(),
         opts.isNormalizeSingleQuotes(),
+        opts.isNormalizeWhitespace(),
         opts.isMixedTypesAsStrings(), opts.keepStringQuotes()));
   }
 
@@ -1327,6 +1333,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) {
           opts.isLines(),
           opts.isRecoverWithNull(),
           opts.isNormalizeSingleQuotes(),
+          opts.isNormalizeWhitespace(),
           opts.isMixedTypesAsStrings(),
           opts.keepStringQuotes(),
           dsHandle));
@@ -1358,6 +1365,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
             schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null,
             buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(),
             opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
+            opts.isNormalizeWhitespace(),
             opts.isMixedTypesAsStrings(), opts.keepStringQuotes()))) {
       return gatherJSONColumns(schema, twm);
     }
@@ -1375,6 +1383,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) {
     try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(),
         schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(),
         opts.isLines(), opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
+        opts.isNormalizeWhitespace(),
         opts.isMixedTypesAsStrings(), opts.keepStringQuotes(), dsHandle))) {
       return gatherJSONColumns(schema, twm);
     } finally {
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 84f1174fd3f..357705824d2 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1429,8 +1429,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(
     JNIEnv *env, jclass, jboolean day_first, jboolean lines, jboolean recover_with_null,
-    jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes,
-    jlong ds_handle) {
+    jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string,
+    jboolean keep_quotes, jlong ds_handle) {
 
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
 
@@ -1448,8 +1448,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource
             .lines(static_cast<bool>(lines))
             .recovery_mode(recovery_mode)
             .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
-            .keep_quotes(keep_quotes)
-            .mixed_types_as_string(mixed_types_as_string);
+            .normalize_whitespace(static_cast<bool>(normalize_whitespace))
+            .mixed_types_as_string(mixed_types_as_string)
+            .keep_quotes(keep_quotes);
 
     auto result =
         std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
@@ -1461,8 +1462,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
     JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
-    jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string,
-    jboolean keep_quotes) {
+    jboolean recover_with_null, jboolean normalize_single_quotes, jboolean normalize_whitespace,
+    jboolean mixed_types_as_string, jboolean keep_quotes) {
 
   JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
   if (buffer_length <= 0) {
@@ -1484,8 +1485,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
             .lines(static_cast<bool>(lines))
             .recovery_mode(recovery_mode)
             .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
-            .keep_quotes(keep_quotes)
-            .mixed_types_as_string(mixed_types_as_string);
+            .normalize_whitespace(static_cast<bool>(normalize_whitespace))
+            .mixed_types_as_string(mixed_types_as_string)
+            .keep_quotes(keep_quotes);
 
     auto result =
         std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
@@ -1573,8 +1575,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
     JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types,
     jintArray j_scales, jboolean day_first, jboolean lines, jboolean recover_with_null,
-    jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes,
-    jlong ds_handle) {
+    jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string,
+    jboolean keep_quotes, jlong ds_handle) {
 
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
 
@@ -1606,6 +1608,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
             .lines(static_cast<bool>(lines))
             .recovery_mode(recovery_mode)
             .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
+            .normalize_whitespace(static_cast<bool>(normalize_whitespace))
             .mixed_types_as_string(mixed_types_as_string)
             .keep_quotes(keep_quotes);
 
@@ -1646,7 +1649,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
     JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types,
     jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length,
     jboolean day_first, jboolean lines, jboolean recover_with_null,
-    jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes) {
+    jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string,
+    jboolean keep_quotes) {
 
   bool read_buffer = true;
   if (buffer == 0) {
@@ -1693,6 +1697,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
             .lines(static_cast<bool>(lines))
             .recovery_mode(recovery_mode)
             .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
+            .normalize_whitespace(static_cast<bool>(normalize_whitespace))
             .mixed_types_as_string(mixed_types_as_string)
             .keep_quotes(keep_quotes);
 
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index bee8d1cbb88..3f0470d854a 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -88,6 +88,7 @@ public class TableTest extends CudfTestBase {
   private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json");
   private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json");
   private static final File TEST_JSON_SINGLE_QUOTES_FILE = TestUtils.getResourceAsFile("single_quotes.json");
+  private static final File TEST_JSON_WHITESPACES_FILE = TestUtils.getResourceAsFile("whitespaces.json");
   private static final File TEST_MIXED_TYPE_1_JSON = TestUtils.getResourceAsFile("mixed_types_1.json");
   private static final File TEST_MIXED_TYPE_2_JSON = TestUtils.getResourceAsFile("mixed_types_2.json");
 
@@ -349,6 +350,39 @@ void testReadSingleQuotesJSONFile() throws IOException {
   }
 
   @Test
+  void testReadSingleQuotesJSONFileFeatureDisabled() throws IOException {
+    Schema schema = Schema.builder()
+      .column(DType.STRING, "A")
+      .build();
+    JSONOptions opts = JSONOptions.builder()
+      .withLines(true)
+      .withNormalizeSingleQuotes(false)
+      .build();
+    try (MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE)) {
+      assertThrows(CudfException.class, () ->
+        Table.readJSON(schema, opts, source));
+    }
+  }
+
+  @Test
+  void testReadWhitespacesJSONFile() throws IOException {
+    Schema schema = Schema.builder()
+            .column(DType.STRING, "a")
+            .build();
+    JSONOptions opts = JSONOptions.builder()
+            .withLines(true)
+            .withMixedTypesAsStrings(true)
+            .withNormalizeWhitespace(true)
+            .build();
+    try (Table expected = new Table.TestBuilder()
+            .column("b", "50", "[1,2,3,4,5,6,7,8]", "{\"c\":\"d\"}", "b")
+            .build();
+         MultiBufferDataSource source = sourceFrom(TEST_JSON_WHITESPACES_FILE);
+         Table table = Table.readJSON(schema, opts, source)) {
+      assertTablesAreEqual(expected, table);
+    }
+  }
+
   void testReadSingleQuotesJSONFileKeepQuotes() throws IOException {
     Schema schema = Schema.builder()
         .column(DType.STRING, "A")
@@ -547,21 +581,6 @@ void testReadMixedType2JSONFile() throws IOException {
     }
   }
 
-  @Test
-  void testReadSingleQuotesJSONFileFeatureDisabled() throws IOException {
-    Schema schema = Schema.builder()
-      .column(DType.STRING, "A")
-      .build();
-    JSONOptions opts = JSONOptions.builder()
-      .withLines(true)
-      .withNormalizeSingleQuotes(false)
-      .build();
-    try (MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE)) {
-      assertThrows(CudfException.class, () ->
-        Table.readJSON(schema, opts, source));
-    }
-  }
-
   @Test
   void testReadJSONFromDataSource() throws IOException {
     Schema schema = Schema.builder()
diff --git a/java/src/test/resources/whitespaces.json b/java/src/test/resources/whitespaces.json
new file mode 100644
index 00000000000..f5ddd8cde5f
--- /dev/null
+++ b/java/src/test/resources/whitespaces.json
@@ -0,0 +1,5 @@
+{"a":"b"}
+ { "a" : "50" }
+{"a": [1, 2, 3, 4, 5, 6, 7, 8]}
+{"a": {"c": "d"}}
+{"a":   "b"}

From c3cad1d7a0aa799a64ec767edb64686f99be78e6 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 4 Mar 2024 16:22:01 -0600
Subject: [PATCH 08/18] Fix `ListColumn.to_pandas()` to retain `list` type
 (#15155)

Fixes: #14568

This PR fixes `ListColumn.to_pandas()` by calling `ArrowArray.to_pylist()` method to retain `list` type in pandas series.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Richard (Rick) Zamora (https://github.com/rjzamora)

URL: https://github.com/rapidsai/cudf/pull/15155
---
 python/cudf/cudf/core/column/lists.py          | 18 ++++++++++++++++++
 python/cudf/cudf/tests/test_list.py            |  4 +++-
 .../dask_cudf/dask_cudf/tests/test_groupby.py  |  6 +-----
 3 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index b2205af34e8..d1bf0b74d3c 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -6,6 +6,7 @@
 from typing import List, Optional, Sequence, Tuple, Union
 
 import numpy as np
+import pandas as pd
 import pyarrow as pa
 from typing_extensions import Self
 
@@ -288,6 +289,23 @@ def _transform_leaves(self, func, *args, **kwargs) -> Self:
             )
         return lc
 
+    def to_pandas(
+        self,
+        *,
+        index: Optional[pd.Index] = None,
+        nullable: bool = False,
+    ) -> pd.Series:
+        # Can't rely on Column.to_pandas implementation for lists.
+        # Need to perform `to_pylist` to preserve list types.
+        if nullable:
+            raise NotImplementedError(f"{nullable=} is not implemented.")
+
+        pd_series = pd.Series(self.to_arrow().to_pylist(), dtype="object")
+
+        if index is not None:
+            pd_series.index = index
+        return pd_series
+
 
 class ListMethods(ColumnMethods):
     """
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 7ae7ae34b97..f04cb8a91a4 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 import functools
 import operator
@@ -41,6 +41,8 @@ def test_create_list_series(data):
     expect = pd.Series(data)
     got = cudf.Series(data)
     assert_eq(expect, got)
+    assert isinstance(got[0], type(expect[0]))
+    assert isinstance(got.to_pandas()[0], type(expect[0]))
 
 
 @pytest.mark.parametrize(
diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py
index c8cc6e65fa5..30251b88dea 100644
--- a/python/dask_cudf/dask_cudf/tests/test_groupby.py
+++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -702,13 +702,9 @@ def test_is_supported(arg, supported):
 
 def test_groupby_unique_lists():
     df = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]})
-    ddf = dd.from_pandas(df, 2)
     gdf = cudf.from_pandas(df)
     gddf = dask_cudf.from_cudf(gdf, 2)
-    dd.assert_eq(
-        ddf.groupby("a").b.unique().compute(),
-        gddf.groupby("a").b.unique().compute(),
-    )
+
     dd.assert_eq(
         gdf.groupby("a").b.unique(),
         gddf.groupby("a").b.unique().compute(),

From 4f1315587df1d64c384f018d90d4ef4fe69a96be Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Mon, 4 Mar 2024 14:38:53 -0800
Subject: [PATCH 09/18] Update labeler and codeowner configs for CMake files
 (#15208)

When working on #15206, I noticed the `rapids_config.cmake` file was not properly labeled. Based on offline discussions, we also noticed that the file's codeowner was misconfigured as well.

This PR updates both github `labeler` and `CODEOWNER` files to properly handle files with `.cmake` extension.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/15208
---
 .github/CODEOWNERS  | 1 +
 .github/labeler.yml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 9578d32d13d..31cfeaf4ca3 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -11,6 +11,7 @@ python/dask_cudf/  @rapidsai/cudf-dask-codeowners
 cpp/CMakeLists.txt               @rapidsai/cudf-cmake-codeowners
 cpp/libcudf_kafka/CMakeLists.txt @rapidsai/cudf-cmake-codeowners
 **/cmake/                        @rapidsai/cudf-cmake-codeowners
+*.cmake                          @rapidsai/cudf-cmake-codeowners
 
 #java code owners
 java/              @rapidsai/cudf-java-codeowners
diff --git a/.github/labeler.yml b/.github/labeler.yml
index b0b0db9684a..d14344384d1 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -10,6 +10,7 @@ libcudf:
 CMake:
   - '**/CMakeLists.txt'
   - '**/cmake/**'
+  - '**/*.cmake'
 
 cuDF (Java):
   - 'java/**'

From e8c13795709c3561cffcb99b3e435d0b4bb6c397 Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Mon, 4 Mar 2024 15:13:49 -0800
Subject: [PATCH 10/18] Update devcontainers to CUDA Toolkit 12.2 (#15099)

Authors:
  - Paul Taylor (https://github.com/trxcllnt)

Approvers:
  - Jake Awe (https://github.com/AyodeAwe)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15099
---
 .devcontainer/cuda11.8-pip/devcontainer.json              | 2 +-
 .../{cuda12.0-conda => cuda12.2-conda}/devcontainer.json  | 6 +++---
 .../{cuda12.0-pip => cuda12.2-pip}/devcontainer.json      | 8 ++++----
 .github/workflows/pr.yaml                                 | 4 +++-
 4 files changed, 11 insertions(+), 9 deletions(-)
 rename .devcontainer/{cuda12.0-conda => cuda12.2-conda}/devcontainer.json (92%)
 rename .devcontainer/{cuda12.0-pip => cuda12.2-pip}/devcontainer.json (87%)

diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index 84616c25cf2..15b51da8dea 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,7 +5,7 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda11.8-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.04-cpp-cuda11.8-ubuntu22.04"
     }
   },
   "hostRequirements": {"gpu": "optional"},
diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json
similarity index 92%
rename from .devcontainer/cuda12.0-conda/devcontainer.json
rename to .devcontainer/cuda12.2-conda/devcontainer.json
index ef2b34b41a6..31ae8426763 100644
--- a/.devcontainer/cuda12.0-conda/devcontainer.json
+++ b/.devcontainer/cuda12.2-conda/devcontainer.json
@@ -3,7 +3,7 @@
     "context": "${localWorkspaceFolder}/.devcontainer",
     "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
     "args": {
-      "CUDA": "12.0",
+      "CUDA": "12.2",
       "PYTHON_PACKAGE_MANAGER": "conda",
       "BASE": "rapidsai/devcontainers:24.04-cpp-mambaforge-ubuntu22.04"
     }
@@ -15,7 +15,7 @@
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
   ],
-  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.0-envs}"],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.2-envs}"],
   "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
   "workspaceFolder": "/home/coder",
   "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent",
@@ -24,7 +24,7 @@
     "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent",
-    "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
+    "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.2-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
   ],
   "customizations": {
     "vscode": {
diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json
similarity index 87%
rename from .devcontainer/cuda12.0-pip/devcontainer.json
rename to .devcontainer/cuda12.2-pip/devcontainer.json
index d3257b6cf43..93367527a86 100644
--- a/.devcontainer/cuda12.0-pip/devcontainer.json
+++ b/.devcontainer/cuda12.2-pip/devcontainer.json
@@ -3,9 +3,9 @@
     "context": "${localWorkspaceFolder}/.devcontainer",
     "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
     "args": {
-      "CUDA": "12.0",
+      "CUDA": "12.2",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda12.0-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.04-cpp-cuda12.2-ubuntu22.04"
     }
   },
   "hostRequirements": {"gpu": "optional"},
@@ -15,7 +15,7 @@
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
   ],
-  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs}"],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs}"],
   "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
   "workspaceFolder": "/home/coder",
   "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent",
@@ -23,7 +23,7 @@
     "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
-    "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
+    "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
   ],
   "customizations": {
     "vscode": {
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 9e11993048f..4a662ed0f43 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -141,8 +141,10 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@fix/devcontainer-json-location
     with:
+      arch: '["amd64"]'
+      cuda: '["12.2"]'
       build_command: |
         sccache -z;
         build-all -DBUILD_BENCHMARKS=ON --verbose;

From f12b8e1b378ae5a4806bce86a1801c2c488097ac Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 4 Mar 2024 13:18:42 -1000
Subject: [PATCH 11/18] Allow to_pandas to return pandas.ArrowDtype (#15182)

Adds a `arrow_type: bool` parameter to `to_pandas` to allow the conversion to return `pandas.ArrowDtype` in pandas.

(Opens up the dream of cudf to pandas round tripping to happen via arrow formatted data)

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Michael Wang (https://github.com/isVoid)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15182
---
 python/cudf/cudf/core/_base_index.py        | 10 +++-
 python/cudf/cudf/core/column/categorical.py |  8 ++-
 python/cudf/cudf/core/column/column.py      | 21 +++++--
 python/cudf/cudf/core/column/datetime.py    | 53 ++++++++++++------
 python/cudf/cudf/core/column/interval.py    | 12 +++-
 python/cudf/cudf/core/column/numerical.py   | 11 +++-
 python/cudf/cudf/core/column/string.py      | 11 +++-
 python/cudf/cudf/core/column/struct.py      | 21 +++++--
 python/cudf/cudf/core/column/timedelta.py   | 31 +++++++----
 python/cudf/cudf/core/dataframe.py          | 22 ++++++--
 python/cudf/cudf/core/index.py              | 61 ++++++++++++++-------
 python/cudf/cudf/core/multiindex.py         |  6 +-
 python/cudf/cudf/core/series.py             | 32 +++++++++--
 python/cudf/cudf/tests/test_dataframe.py    | 41 ++++++++++++++
 python/cudf/cudf/tests/test_index.py        | 38 +++++++++++++
 python/cudf/cudf/tests/test_multiindex.py   | 34 ++++++++++++
 python/cudf/cudf/tests/test_series.py       | 43 ++++++++++++++-
 17 files changed, 382 insertions(+), 73 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 58e2241e810..de44f392eef 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -910,7 +910,7 @@ def notna(self):
         """
         raise NotImplementedError
 
-    def to_pandas(self, *, nullable: bool = False):
+    def to_pandas(self, *, nullable: bool = False, arrow_type: bool = False):
         """
         Convert to a Pandas Index.
 
@@ -924,6 +924,12 @@ def to_pandas(self, *, nullable: bool = False):
             If ``nullable`` is ``False``, the resulting index will
             either convert null values to ``np.nan`` or ``None``
             depending on the dtype.
+        arrow_type : bool, Default False
+            Return the Index with a ``pandas.ArrowDtype``
+
+        Notes
+        -----
+        nullable and arrow_type cannot both be set to ``True``
 
         Examples
         --------
@@ -937,6 +943,8 @@ def to_pandas(self, *, nullable: bool = False):
         <class 'pandas.core.indexes.base.Index'>
         >>> type(idx)
         <class 'cudf.core.index.Index'>
+        >>> idx.to_pandas(arrow_type=True)
+        Index([-3, 10, 15, 20], dtype='int64[pyarrow]')
         """
         raise NotImplementedError
 
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 9ecd461cf99..4c64e7085c9 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -770,10 +770,16 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]:
         )
 
     def to_pandas(
-        self, *, index: Optional[pd.Index] = None, nullable: bool = False
+        self,
+        *,
+        index: Optional[pd.Index] = None,
+        nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
         if nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
+        elif arrow_type:
+            raise NotImplementedError(f"{arrow_type=} is not implemented.")
 
         if self.categories.dtype.kind == "f":
             new_mask = bools_to_mask(self.notnull())
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index cecdaf70750..be196833f32 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -199,6 +199,7 @@ def to_pandas(
         *,
         index: Optional[pd.Index] = None,
         nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
         """Convert object to pandas type.
 
@@ -206,13 +207,23 @@ def to_pandas(
         """
         # This default implementation does not handle nulls in any meaningful
         # way
-        if nullable:
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
+        elif nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
-        pd_series = self.to_arrow().to_pandas()
+        pa_array = self.to_arrow()
+        if arrow_type:
+            return pd.Series(
+                pd.arrays.ArrowExtensionArray(pa_array), index=index
+            )
+        else:
+            pd_series = pa_array.to_pandas()
 
-        if index is not None:
-            pd_series.index = index
-        return pd_series
+            if index is not None:
+                pd_series.index = index
+            return pd_series
 
     @property
     def values_host(self) -> "np.ndarray":
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index b03b21a7aba..85f07064c97 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -318,18 +318,27 @@ def to_pandas(
         *,
         index: Optional[pd.Index] = None,
         nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
-        if nullable:
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
+        elif nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
-        # `copy=True` workaround until following issue is fixed:
-        # https://issues.apache.org/jira/browse/ARROW-9772
-
-        return pd.Series(
-            self.to_arrow(),
-            copy=True,
-            dtype=self.dtype,
-            index=index,
-        )
+        elif arrow_type:
+            return pd.Series(
+                pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
+            )
+        else:
+            # `copy=True` workaround until following issue is fixed:
+            # https://issues.apache.org/jira/browse/ARROW-9772
+            return pd.Series(
+                self.to_arrow(),
+                copy=True,
+                dtype=self.dtype,
+                index=index,
+            )
 
     @property
     def values(self):
@@ -723,15 +732,25 @@ def to_pandas(
         *,
         index: Optional[pd.Index] = None,
         nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
-        if nullable:
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
+        elif nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
-        series = self._local_time.to_pandas().dt.tz_localize(
-            self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
-        )
-        if index is not None:
-            series.index = index
-        return series
+        elif arrow_type:
+            return pd.Series(
+                pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
+            )
+        else:
+            series = self._local_time.to_pandas().dt.tz_localize(
+                self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
+            )
+            if index is not None:
+                series.index = index
+            return series
 
     def to_arrow(self):
         return pa.compute.assume_timezone(
diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py
index 5d93fa26298..dcec8957bb2 100644
--- a/python/cudf/cudf/core/column/interval.py
+++ b/python/cudf/cudf/core/column/interval.py
@@ -105,15 +105,25 @@ def as_interval_column(self, dtype):
             raise ValueError("dtype must be IntervalDtype")
 
     def to_pandas(
-        self, *, index: Optional[pd.Index] = None, nullable: bool = False
+        self,
+        *,
+        index: Optional[pd.Index] = None,
+        nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
         # Note: This does not handle null values in the interval column.
         # However, this exact sequence (calling __from_arrow__ on the output of
         # self.to_arrow) is currently the best known way to convert interval
         # types into pandas (trying to convert the underlying numerical columns
         # directly is problematic), so we're stuck with this for now.
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
         if nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
+        elif arrow_type:
+            raise NotImplementedError(f"{nullable=} is not implemented.")
         return pd.Series(
             self.dtype.to_pandas().__from_arrow__(self.to_arrow()), index=index
         )
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index b80dd626066..82d82593c77 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -690,8 +690,17 @@ def to_pandas(
         *,
         index: Optional[pd.Index] = None,
         nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
-        if nullable and self.dtype in np_dtypes_to_pandas_dtypes:
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
+        if arrow_type:
+            return pd.Series(
+                pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
+            )
+        elif nullable and self.dtype in np_dtypes_to_pandas_dtypes:
             pandas_nullable_dtype = np_dtypes_to_pandas_dtypes[self.dtype]
             arrow_array = self.to_arrow()
             pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array)
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 2373f94ee97..dea60f58690 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5791,8 +5791,17 @@ def to_pandas(
         *,
         index: Optional[pd.Index] = None,
         nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
-        if nullable:
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
+        if arrow_type:
+            return pd.Series(
+                pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
+            )
+        elif nullable:
             pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow())
             pd_series = pd.Series(pandas_array, copy=False)
         else:
diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py
index 69e9a50956b..1b2ffcc2700 100644
--- a/python/cudf/cudf/core/column/struct.py
+++ b/python/cudf/cudf/core/column/struct.py
@@ -58,14 +58,27 @@ def to_arrow(self):
         )
 
     def to_pandas(
-        self, *, index: Optional[pd.Index] = None, nullable: bool = False
+        self,
+        *,
+        index: Optional[pd.Index] = None,
+        nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
         # We cannot go via Arrow's `to_pandas` because of the following issue:
         # https://issues.apache.org/jira/browse/ARROW-12680
-        if nullable:
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
+        elif nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
-
-        return pd.Series(self.to_arrow().tolist(), dtype="object", index=index)
+        pa_array = self.to_arrow()
+        if arrow_type:
+            return pd.Series(
+                pd.arrays.ArrowExtensionArray(pa_array), index=index
+            )
+        else:
+            return pd.Series(pa_array.tolist(), dtype="object", index=index)
 
     @cached_property
     def memory_usage(self):
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index b911c86fa01..dab2723795e 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -147,20 +147,31 @@ def to_arrow(self) -> pa.Array:
         )
 
     def to_pandas(
-        self, *, index: Optional[pd.Index] = None, nullable: bool = False
+        self,
+        *,
+        index: Optional[pd.Index] = None,
+        nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
         # `copy=True` workaround until following issue is fixed:
         # https://issues.apache.org/jira/browse/ARROW-9772
-
-        if nullable:
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
+        elif nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
-
-        return pd.Series(
-            self.to_arrow(),
-            copy=True,
-            dtype=self.dtype,
-            index=index,
-        )
+        elif arrow_type:
+            return pd.Series(
+                pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
+            )
+        else:
+            return pd.Series(
+                self.to_arrow(),
+                copy=True,
+                dtype=self.dtype,
+                index=index,
+            )
 
     def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
         reflect, op = self._check_reflected_op(op)
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index a0e1a041342..d7d2e1acd85 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5203,7 +5203,9 @@ def describe(
             return res
 
     @_cudf_nvtx_annotate
-    def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame:
+    def to_pandas(
+        self, *, nullable: bool = False, arrow_type: bool = False
+    ) -> pd.DataFrame:
         """
         Convert to a Pandas DataFrame.
 
@@ -5218,11 +5220,17 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame:
             If ``nullable`` is ``False``,
             the resulting columns will either convert null
             values to ``np.nan`` or ``None`` depending on the dtype.
+        arrow_type : bool, Default False
+            Return the Index with a ``pandas.ArrowDtype``
 
         Returns
         -------
         out : Pandas DataFrame
 
+        Notes
+        -----
+        nullable and arrow_type cannot both be set to ``True``
+
         Examples
         --------
         >>> import cudf
@@ -5236,8 +5244,7 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame:
         >>> type(pdf)
         <class 'pandas.core.frame.DataFrame'>
 
-        ``nullable`` parameter can be used to control
-        whether dtype can be Pandas Nullable or not:
+        ``nullable=True`` converts the result to pandas nullable types:
 
         >>> df = cudf.DataFrame({'a': [0, None, 2], 'b': [True, False, None]})
         >>> df
@@ -5265,13 +5272,20 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame:
         a    float64
         b     object
         dtype: object
+
+        ``arrow_type=True`` converts the result to ``pandas.ArrowDtype``:
+
+        >>> df.to_pandas(arrow_type=True).dtypes
+        a    int64[pyarrow]
+        b     bool[pyarrow]
+        dtype: object
         """
         out_data = {}
         out_index = self.index.to_pandas()
 
         for i, col_key in enumerate(self._data):
             out_data[i] = self._data[col_key].to_pandas(
-                index=out_index, nullable=nullable
+                index=out_index, nullable=nullable, arrow_type=arrow_type
             )
 
         out_df = pd.DataFrame(out_data, index=out_index)
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 1b9893d1256..9d481037ec6 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -483,9 +483,13 @@ def dtype(self):
         return _maybe_convert_to_default_type(dtype)
 
     @_cudf_nvtx_annotate
-    def to_pandas(self, *, nullable: bool = False) -> pd.RangeIndex:
+    def to_pandas(
+        self, *, nullable: bool = False, arrow_type: bool = False
+    ) -> pd.RangeIndex:
         if nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
+        elif arrow_type:
+            raise NotImplementedError(f"{arrow_type=} is not implemented.")
         return pd.RangeIndex(
             start=self._start,
             stop=self._stop,
@@ -1521,9 +1525,12 @@ def _clean_nulls_from_index(self):
     def any(self):
         return self._values.any()
 
-    def to_pandas(self, *, nullable: bool = False) -> pd.Index:
+    def to_pandas(
+        self, *, nullable: bool = False, arrow_type: bool = False
+    ) -> pd.Index:
         return pd.Index(
-            self._values.to_pandas(nullable=nullable), name=self.name
+            self._values.to_pandas(nullable=nullable, arrow_type=arrow_type),
+            name=self.name,
         )
 
     def append(self, other):
@@ -2094,18 +2101,26 @@ def isocalendar(self):
         return cudf.core.tools.datetimes._to_iso_calendar(self)
 
     @_cudf_nvtx_annotate
-    def to_pandas(self, *, nullable: bool = False) -> pd.DatetimeIndex:
-        if nullable:
+    def to_pandas(
+        self, *, nullable: bool = False, arrow_type: bool = False
+    ) -> pd.DatetimeIndex:
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
+        elif nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
 
-        freq = (
-            self._freq._maybe_as_fast_pandas_offset()
-            if self._freq is not None
-            else None
-        )
-        return pd.DatetimeIndex(
-            self._values.to_pandas(), name=self.name, freq=freq
-        )
+        result = self._values.to_pandas(arrow_type=arrow_type)
+        if arrow_type:
+            return pd.Index(result, name=self.name)
+        else:
+            freq = (
+                self._freq._maybe_as_fast_pandas_offset()
+                if self._freq is not None
+                else None
+            )
+            return pd.DatetimeIndex(result, name=self.name, freq=freq)
 
     @_cudf_nvtx_annotate
     def _get_dt_field(self, field):
@@ -2426,13 +2441,21 @@ def __getitem__(self, index):
         return value
 
     @_cudf_nvtx_annotate
-    def to_pandas(self, *, nullable: bool = False) -> pd.TimedeltaIndex:
-        if nullable:
+    def to_pandas(
+        self, *, nullable: bool = False, arrow_type: bool = False
+    ) -> pd.TimedeltaIndex:
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
+        elif nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
-        return pd.TimedeltaIndex(
-            self._values.to_pandas(),
-            name=self.name,
-        )
+
+        result = self._values.to_pandas(arrow_type=arrow_type)
+        if arrow_type:
+            return pd.Index(result, name=self.name)
+        else:
+            return pd.TimedeltaIndex(result, name=self.name)
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index df1b1ea10cd..70112044f75 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1574,10 +1574,12 @@ def droplevel(self, level=-1):
             return mi
 
     @_cudf_nvtx_annotate
-    def to_pandas(self, *, nullable: bool = False) -> pd.MultiIndex:
+    def to_pandas(
+        self, *, nullable: bool = False, arrow_type: bool = False
+    ) -> pd.MultiIndex:
         result = self.to_frame(
             index=False, name=list(range(self.nlevels))
-        ).to_pandas(nullable=nullable)
+        ).to_pandas(nullable=nullable, arrow_type=arrow_type)
         return pd.MultiIndex.from_frame(result, names=self.names)
 
     @classmethod
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 3f51ecdf7dc..cb5008af3ad 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1983,10 +1983,14 @@ def any(self, axis=0, bool_only=None, skipna=True, **kwargs):
 
     @_cudf_nvtx_annotate
     def to_pandas(
-        self, *, index: bool = True, nullable: bool = False
+        self,
+        *,
+        index: bool = True,
+        nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
         """
-        Convert to a Pandas Series.
+        Convert to a pandas Series.
 
         Parameters
         ----------
@@ -2003,10 +2007,16 @@ def to_pandas(
             If ``nullable`` is ``False``, the resulting series will
             either convert null values to ``np.nan`` or ``None``
             depending on the dtype.
+        arrow_type : bool, Default False
+            Return the Series with a ``pandas.ArrowDtype``
 
         Returns
         -------
-        out : Pandas Series
+        out : pandas Series
+
+        Notes
+        -----
+        nullable and arrow_type cannot both be set to ``True``
 
         Examples
         --------
@@ -2021,8 +2031,7 @@ def to_pandas(
         >>> type(pds)
         <class 'pandas.core.series.Series'>
 
-        ``nullable`` parameter can be used to control
-        whether dtype can be Pandas Nullable or not:
+        ``nullable=True`` converts the result to pandas nullable types:
 
         >>> ser = cudf.Series([10, 20, None, 30])
         >>> ser
@@ -2043,12 +2052,23 @@ def to_pandas(
         2     NaN
         3    30.0
         dtype: float64
+
+        ``arrow_type=True`` converts the result to ``pandas.ArrowDtype``:
+
+        >>> ser.to_pandas(arrow_type=True)
+        0      10
+        1      20
+        2    <NA>
+        3      30
+        dtype: int64[pyarrow]
         """
         if index is True:
             index = self.index.to_pandas()
         else:
             index = None  # type: ignore[assignment]
-        s = self._column.to_pandas(index=index, nullable=nullable)
+        s = self._column.to_pandas(
+            index=index, nullable=nullable, arrow_type=arrow_type
+        )
         s.name = self.name
         return s
 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 50b14d532e4..3143851ddd6 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10861,3 +10861,44 @@ def test_dataframe_duplicate_index_reindex():
         lfunc_args_and_kwargs=([10, 11, 12, 13], {}),
         rfunc_args_and_kwargs=([10, 11, 12, 13], {}),
     )
+
+
+@pytest.mark.parametrize(
+    "scalar",
+    [
+        1,
+        1.0,
+        "a",
+        datetime.datetime(2020, 1, 1),
+        datetime.timedelta(1),
+        {"1": 2},
+        [1],
+        decimal.Decimal("1.0"),
+    ],
+)
+def test_dataframe_to_pandas_arrow_type_nullable_raises(scalar):
+    pa_array = pa.array([scalar, None])
+    df = cudf.DataFrame({"a": pa_array})
+    with pytest.raises(ValueError):
+        df.to_pandas(nullable=True, arrow_type=True)
+
+
+@pytest.mark.parametrize(
+    "scalar",
+    [
+        1,
+        1.0,
+        "a",
+        datetime.datetime(2020, 1, 1),
+        datetime.timedelta(1),
+        {"1": 2},
+        [1],
+        decimal.Decimal("1.0"),
+    ],
+)
+def test_dataframe_to_pandas_arrow_type(scalar):
+    pa_array = pa.array([scalar, None])
+    df = cudf.DataFrame({"a": pa_array})
+    result = df.to_pandas(arrow_type=True)
+    expected = pd.DataFrame({"a": pd.arrays.ArrowExtensionArray(pa_array)})
+    pd.testing.assert_frame_equal(result, expected)
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index cced05d2217..51e9a3022f4 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -3,6 +3,7 @@
 """
 Test related to Index
 """
+import datetime
 import operator
 import re
 
@@ -3138,3 +3139,40 @@ def test_from_pandas_rangeindex_return_rangeindex():
 def test_index_to_pandas_nullable_notimplemented(idx):
     with pytest.raises(NotImplementedError):
         idx.to_pandas(nullable=True)
+
+
+@pytest.mark.parametrize(
+    "scalar",
+    [
+        1,
+        1.0,
+        "a",
+        datetime.datetime(2020, 1, 1),
+        datetime.timedelta(1),
+        {"1": 2},
+    ],
+)
+def test_index_to_pandas_arrow_type_nullable_raises(scalar):
+    pa_array = pa.array([scalar, None])
+    idx = cudf.Index(pa_array)
+    with pytest.raises(ValueError):
+        idx.to_pandas(nullable=True, arrow_type=True)
+
+
+@pytest.mark.parametrize(
+    "scalar",
+    [
+        1,
+        1.0,
+        "a",
+        datetime.datetime(2020, 1, 1),
+        datetime.timedelta(1),
+        {"1": 2},
+    ],
+)
+def test_index_to_pandas_arrow_type(scalar):
+    pa_array = pa.array([scalar, None])
+    idx = cudf.Index(pa_array)
+    result = idx.to_pandas(arrow_type=True)
+    expected = pd.Index(pd.arrays.ArrowExtensionArray(pa_array))
+    pd.testing.assert_index_equal(result, expected)
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index a13fe333107..4926d79e734 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -3,6 +3,7 @@
 """
 Test related to MultiIndex
 """
+import datetime
 import itertools
 import operator
 import pickle
@@ -13,6 +14,7 @@
 import cupy as cp
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 import pytest
 
 import cudf
@@ -2118,3 +2120,35 @@ def test_multiindex_from_arrays(array):
 def test_multiindex_from_arrays_wrong_arg(arg):
     with pytest.raises(TypeError):
         cudf.MultiIndex.from_arrays(arg)
+
+
+@pytest.mark.parametrize(
+    "scalar",
+    [
+        1,
+        1.0,
+        "a",
+        datetime.datetime(2020, 1, 1),
+        datetime.timedelta(1),
+        {"1": 2},
+    ],
+)
+def test_index_to_pandas_arrow_type_nullable_raises(scalar):
+    pa_array = pa.array([scalar, None])
+    midx = cudf.MultiIndex(levels=[pa_array], codes=[[0]])
+    with pytest.raises(ValueError):
+        midx.to_pandas(nullable=True, arrow_type=True)
+
+
+@pytest.mark.parametrize(
+    "scalar",
+    [1, 1.0, "a", datetime.datetime(2020, 1, 1), datetime.timedelta(1)],
+)
+def test_index_to_pandas_arrow_type(scalar):
+    pa_array = pa.array([scalar, None])
+    midx = cudf.MultiIndex(levels=[pa_array], codes=[[0]])
+    result = midx.to_pandas(arrow_type=True)
+    expected = pd.MultiIndex(
+        levels=[pd.arrays.ArrowExtensionArray(pa_array)], codes=[[0]]
+    )
+    pd.testing.assert_index_equal(result, expected)
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index caf8947e3b0..6b5c0406deb 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -1,5 +1,5 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
+import datetime
 import decimal
 import hashlib
 import operator
@@ -2708,3 +2708,44 @@ def test_series_from_large_string():
     expected = pd.Series(pa_large_string_array)
 
     assert_eq(expected, got)
+
+
+@pytest.mark.parametrize(
+    "scalar",
+    [
+        1,
+        1.0,
+        "a",
+        datetime.datetime(2020, 1, 1),
+        datetime.timedelta(1),
+        {"1": 2},
+        [1],
+        decimal.Decimal("1.0"),
+    ],
+)
+def test_series_to_pandas_arrow_type_nullable_raises(scalar):
+    pa_array = pa.array([scalar, None])
+    ser = cudf.Series(pa_array)
+    with pytest.raises(ValueError):
+        ser.to_pandas(nullable=True, arrow_type=True)
+
+
+@pytest.mark.parametrize(
+    "scalar",
+    [
+        1,
+        1.0,
+        "a",
+        datetime.datetime(2020, 1, 1),
+        datetime.timedelta(1),
+        {"1": 2},
+        [1],
+        decimal.Decimal("1.0"),
+    ],
+)
+def test_series_to_pandas_arrow_type(scalar):
+    pa_array = pa.array([scalar, None])
+    ser = cudf.Series(pa_array)
+    result = ser.to_pandas(arrow_type=True)
+    expected = pd.Series(pd.arrays.ArrowExtensionArray(pa_array))
+    pd.testing.assert_series_equal(result, expected)

From 3571291c533412f8efa4c5d41caa865564b5391b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 4 Mar 2024 15:04:54 -1000
Subject: [PATCH 12/18] Use as_column instead of full (#14698)

Similar to https://github.com/rapidsai/cudf/pull/14689, ensures there's 1 entrypoint to create a column from a scalar.

This builds on https://github.com/rapidsai/cudf/pull/14620

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/14698
---
 python/cudf/cudf/core/column/__init__.py    |   1 -
 python/cudf/cudf/core/column/categorical.py |  12 +--
 python/cudf/cudf/core/column/column.py      | 100 ++++++--------------
 python/cudf/cudf/core/column/decimal.py     |   4 +-
 python/cudf/cudf/core/column/numerical.py   |   3 +-
 python/cudf/cudf/core/column/string.py      |  12 ++-
 python/cudf/cudf/core/column/timedelta.py   |   4 +-
 python/cudf/cudf/core/dataframe.py          |  26 +++--
 python/cudf/cudf/core/index.py              |   6 +-
 python/cudf/cudf/core/indexed_frame.py      |  14 ++-
 python/cudf/cudf/core/multiindex.py         |   8 +-
 python/cudf/cudf/core/series.py             |   5 +-
 python/cudf/cudf/core/tools/datetimes.py    |   4 +-
 python/cudf/cudf/core/window/rolling.py     |   5 +-
 python/cudf/cudf/io/parquet.py              |  14 +--
 python/cudf/cudf/tests/test_testing.py      |   6 +-
 python/cudf/cudf/utils/utils.py             |   6 +-
 python/dask_cudf/dask_cudf/backends.py      |   6 +-
 18 files changed, 101 insertions(+), 135 deletions(-)

diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py
index a1c86b617b0..2a46654ccc2 100644
--- a/python/cudf/cudf/core/column/__init__.py
+++ b/python/cudf/cudf/core/column/__init__.py
@@ -16,7 +16,6 @@
     column_empty_like_same_mask,
     concat_columns,
     deserialize_columns,
-    full,
     serialize_columns,
 )
 from cudf.core.column.datetime import DatetimeColumn  # noqa: F401
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 4c64e7085c9..88bb4521a5b 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -734,8 +734,8 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn:
                 )
             return other
 
-        ary = column.full(
-            len(self), self._encode(other), dtype=self.codes.dtype
+        ary = column.as_column(
+            self._encode(other), length=len(self), dtype=self.codes.dtype
         )
         return column.build_categorical_column(
             categories=self.dtype.categories._values,
@@ -1444,11 +1444,9 @@ def _create_empty_categorical_column(
     return column.build_categorical_column(
         categories=column.as_column(dtype.categories),
         codes=column.as_column(
-            column.full(
-                categorical_column.size,
-                _DEFAULT_CATEGORICAL_VALUE,
-                categorical_column.codes.dtype,
-            )
+            _DEFAULT_CATEGORICAL_VALUE,
+            length=categorical_column.size,
+            dtype=categorical_column.codes.dtype,
         ),
         offset=categorical_column.offset,
         size=categorical_column.size,
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index be196833f32..8941d111d02 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -58,7 +58,6 @@
     infer_dtype,
     is_bool_dtype,
     is_datetime64_dtype,
-    is_decimal_dtype,
     is_dtype_equal,
     is_integer_dtype,
     is_list_dtype,
@@ -866,7 +865,7 @@ def isin(self, values: Sequence) -> ColumnBase:
         except ValueError:
             # pandas functionally returns all False when cleansing via
             # typecasting fails
-            return full(len(self), False, dtype="bool")
+            return as_column(False, length=len(self), dtype="bool")
 
         return lhs._obtain_isin_result(rhs)
 
@@ -893,9 +892,9 @@ def _isin_earlystop(self, rhs: ColumnBase) -> Union[ColumnBase, None]:
             if self.null_count and rhs.null_count:
                 return self.isnull()
             else:
-                return cudf.core.column.full(len(self), False, dtype="bool")
+                return as_column(False, length=len(self), dtype="bool")
         elif self.null_count == 0 and (rhs.null_count == len(rhs)):
-            return cudf.core.column.full(len(self), False, dtype="bool")
+            return as_column(False, length=len(self), dtype="bool")
         else:
             return None
 
@@ -1356,9 +1355,7 @@ def _label_encoding(
             na_sentinel = cudf.Scalar(-1)
 
         def _return_sentinel_column():
-            return cudf.core.column.full(
-                size=len(self), fill_value=na_sentinel, dtype=dtype
-            )
+            return as_column(na_sentinel, dtype=dtype, length=len(self))
 
         if dtype is None:
             dtype = min_scalar_type(max(len(cats), na_sentinel), 8)
@@ -1455,7 +1452,9 @@ def column_empty(
     elif isinstance(dtype, ListDtype):
         data = None
         children = (
-            full(row_count + 1, 0, dtype=libcudf.types.size_type_dtype),
+            as_column(
+                0, length=row_count + 1, dtype=libcudf.types.size_type_dtype
+            ),
             column_empty(row_count, dtype=dtype.element_type),
         )
     elif isinstance(dtype, CategoricalDtype):
@@ -1474,7 +1473,9 @@ def column_empty(
     elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype):
         data = as_buffer(rmm.DeviceBuffer(size=0))
         children = (
-            full(row_count + 1, 0, dtype=libcudf.types.size_type_dtype),
+            as_column(
+                0, length=row_count + 1, dtype=libcudf.types.size_type_dtype
+            ),
         )
     else:
         data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize))
@@ -2017,33 +2018,32 @@ def as_column(
         if dtype is not None:
             data = data.astype(dtype)
 
-    elif isinstance(arbitrary, (pd.Timestamp, pd.Timedelta)):
-        # This will always treat NaTs as nulls since it's not technically a
-        # discrete value like NaN
-        length = length or 1
-        data = as_column(
-            pa.array(pd.Series([arbitrary] * length), from_pandas=True)
-        )
-        if dtype is not None:
-            data = data.astype(dtype)
-
-    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
-        length = length or 1
+    elif is_scalar(arbitrary) and not isinstance(arbitrary, memoryview):
+        if length is None:
+            length = 1
+        elif length < 0:
+            raise ValueError(f"{length=} must be >=0.")
+        if isinstance(arbitrary, pd.Interval):
+            # No cudf.Scalar support yet
+            return as_column(
+                pd.Series([arbitrary] * length),
+                nan_as_null=nan_as_null,
+                dtype=dtype,
+                length=length,
+            )
         if (
-            (nan_as_null is True)
+            nan_as_null is True
             and isinstance(arbitrary, (np.floating, float))
             and np.isnan(arbitrary)
         ):
-            arbitrary = None
             if dtype is None:
-                dtype = cudf.dtype("float64")
-
-        data = as_column(full(length, arbitrary, dtype=dtype))
-        if not nan_as_null and not is_decimal_dtype(data.dtype):
-            if np.issubdtype(data.dtype, np.floating):
-                data = data.fillna(np.nan)
-            elif np.issubdtype(data.dtype, np.datetime64):
-                data = data.fillna(np.datetime64("NaT"))
+                dtype = getattr(arbitrary, "dtype", cudf.dtype("float64"))
+            arbitrary = None
+        arbitrary = cudf.Scalar(arbitrary, dtype=dtype)
+        if length == 0:
+            return column_empty(length, dtype=arbitrary.dtype)
+        else:
+            return ColumnBase.from_scalar(arbitrary, length)
 
     elif hasattr(arbitrary, "__array_interface__"):
         # CUDF assumes values are always contiguous
@@ -2161,8 +2161,6 @@ def as_column(
         return as_column(
             np.asarray(view), dtype=dtype, nan_as_null=nan_as_null
         )
-    elif isinstance(arbitrary, cudf.Scalar):
-        data = ColumnBase.from_scalar(arbitrary, length if length else 1)
     else:
         if dtype is not None:
             # Arrow throws a type error if the input is of
@@ -2505,42 +2503,6 @@ def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]:
     return columns
 
 
-def full(
-    size: int, fill_value: ScalarLike, dtype: Optional[Dtype] = None
-) -> ColumnBase:
-    """
-    Returns a column of given size and dtype, filled with a given value.
-
-    Parameters
-    ----------
-    size : int
-        size of the expected column.
-    fill_value : scalar
-         A scalar value to fill a new array.
-    dtype : default None
-        Data type specifier. It is inferred from other arguments by default.
-
-    Returns
-    -------
-    Column
-
-    Examples
-    --------
-    >>> import cudf
-    >>> col = cudf.core.column.full(size=5, fill_value=7, dtype='int8')
-    >>> col
-    <cudf.core.column.numerical.NumericalColumn object at 0x7fa0912e8b90>
-    >>> cudf.Series(col)
-    0    7
-    1    7
-    2    7
-    3    7
-    4    7
-    dtype: int8
-    """
-    return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size)
-
-
 def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
     """Concatenate a sequence of columns."""
     if len(objs) == 0:
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 0e90b522f2c..b83a6ded416 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -69,8 +69,8 @@ def as_string_column(
     def __pow__(self, other):
         if isinstance(other, int):
             if other == 0:
-                res = cudf.core.column.full(
-                    size=len(self), fill_value=1, dtype=self.dtype
+                res = cudf.core.column.as_column(
+                    1, dtype=self.dtype, length=len(self)
                 )
                 if self.nullable:
                     res = res.set_mask(self.mask)
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 82d82593c77..8d9da8982ac 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -42,7 +42,6 @@
     as_column,
     build_column,
     column,
-    full,
     string,
 )
 from cudf.core.dtypes import CategoricalDtype
@@ -513,7 +512,7 @@ def find_and_replace(
             )
         if len(replacement_col) == 1 and len(to_replace_col) > 1:
             replacement_col = column.as_column(
-                full(len(to_replace_col), replacement[0], self.dtype)
+                replacement[0], length=len(to_replace_col), dtype=self.dtype
             )
         elif len(replacement_col) == 1 and len(to_replace_col) == 0:
             return self.copy()
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index dea60f58690..e947c9375d7 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5499,7 +5499,9 @@ def __init__(
 
         if len(children) == 0 and size != 0:
             # all nulls-column:
-            offsets = column.full(size + 1, 0, dtype=size_type_dtype)
+            offsets = column.as_column(
+                0, length=size + 1, dtype=size_type_dtype
+            )
 
             children = (offsets,)
 
@@ -5930,8 +5932,8 @@ def _binaryop(
                     "__eq__",
                     "__ne__",
                 }:
-                    return column.full(
-                        len(self), op == "__ne__", dtype="bool"
+                    return column.as_column(
+                        op == "__ne__", length=len(self), dtype="bool"
                     ).set_mask(self.mask)
                 else:
                     return NotImplemented
@@ -5940,7 +5942,9 @@ def _binaryop(
                 if isinstance(other, cudf.Scalar):
                     other = cast(
                         StringColumn,
-                        column.full(len(self), other, dtype="object"),
+                        column.as_column(
+                            other, length=len(self), dtype="object"
+                        ),
                     )
 
                 # Explicit types are necessary because mypy infers ColumnBase
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index dab2723795e..ee326b254b9 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -510,7 +510,7 @@ def components(self, index=None) -> "cudf.DataFrame":
                 break
 
         for name in keys_list:
-            res_col = cudf.core.column.full(len(self), 0, dtype="int64")
+            res_col = column.as_column(0, length=len(self), dtype="int64")
             if self.nullable:
                 res_col = res_col.set_mask(self.mask)
             data[name] = res_col
@@ -599,7 +599,7 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn":
         # of nanoseconds.
 
         if self._time_unit != "ns":
-            res_col = cudf.core.column.full(len(self), 0, dtype="int64")
+            res_col = column.as_column(0, length=len(self), dtype="int64")
             if self.nullable:
                 res_col = res_col.set_mask(self.mask)
             return cast("cudf.core.column.NumericalColumn", res_col)
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index d7d2e1acd85..31a748da856 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1407,7 +1407,7 @@ def __setitem__(self, arg, value):
                             allow_non_unique=True,
                         )
                     if is_scalar(value):
-                        self._data[arg] = column.full(len(self), value)
+                        self._data[arg] = as_column(value, length=len(self))
                     else:
                         value = as_column(value)
                         self._data[arg] = value
@@ -1455,8 +1455,8 @@ def __setitem__(self, arg, value):
                 else:
                     for col in arg:
                         if is_scalar(value):
-                            self._data[col] = column.full(
-                                size=len(self), fill_value=value
+                            self._data[col] = as_column(
+                                value, length=len(self)
                             )
                         else:
                             self._data[col] = column.as_column(value)
@@ -3205,10 +3205,16 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
             )
 
         if _is_scalar_or_zero_d_array(value):
-            value = column.full(
-                len(self),
+            dtype = None
+            if isinstance(value, (np.ndarray, cupy.ndarray)):
+                dtype = value.dtype
+                value = value.item()
+            if libcudf.scalar._is_null_host_scalar(value):
+                dtype = "str"
+            value = as_column(
                 value,
-                "str" if libcudf.scalar._is_null_host_scalar(value) else None,
+                length=len(self),
+                dtype=dtype,
             )
 
         if len(self) == 0:
@@ -5912,7 +5918,7 @@ def isin(self, values):
         fill_value = cudf.Scalar(False)
 
         def make_false_column_like_self():
-            return column.full(len(self), fill_value, "bool")
+            return column.as_column(fill_value, length=len(self), dtype="bool")
 
         # Preprocess different input types into a mapping from column names to
         # a list of values to check.
@@ -6031,7 +6037,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only):
                 {
                     name: filtered._data[name]._get_mask_as_column()
                     if filtered._data[name].nullable
-                    else column.full(len(filtered._data[name]), True)
+                    else as_column(True, length=len(filtered._data[name]))
                     for name in filtered._data.names
                 }
             )
@@ -7822,8 +7828,8 @@ def func(left, right, output):
             return output
 
         for name in uncommon_columns:
-            output._data[name] = column.full(
-                size=len(output), fill_value=value, dtype="bool"
+            output._data[name] = as_column(
+                value, length=len(output), dtype="bool"
             )
         return output
 
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 9d481037ec6..bd9dc1ae3da 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1231,9 +1231,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
             )
 
         needle = as_column(target)
-        result = cudf.core.column.full(
-            len(needle),
-            fill_value=-1,
+        result = as_column(
+            -1,
+            length=len(needle),
             dtype=libcudf.types.size_type_dtype,
         )
 
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 3c6e1e17142..df703370f78 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -50,7 +50,7 @@
 from cudf.core._base_index import BaseIndex
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.buffer import acquire_spill_lock
-from cudf.core.column import ColumnBase, as_column, full
+from cudf.core.column import ColumnBase, as_column
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.copy_types import BooleanMask, GatherMap
 from cudf.core.dtypes import ListDtype
@@ -3048,7 +3048,7 @@ def duplicated(self, subset=None, keep="first"):
         (result,) = libcudf.copying.scatter(
             [cudf.Scalar(False, dtype=bool)],
             distinct,
-            [full(len(self), True, dtype=bool)],
+            [as_column(True, length=len(self), dtype=bool)],
             bounds_check=False,
         )
         return cudf.Series(result, index=self.index)
@@ -3327,9 +3327,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs):
 
         # Mask and data column preallocated
         ans_col = _return_arr_from_dtype(retty, len(self))
-        ans_mask = cudf.core.column.full(
-            size=len(self), fill_value=True, dtype="bool"
-        )
+        ans_mask = as_column(True, length=len(self), dtype="bool")
         output_args = [(ans_col, ans_mask), len(self)]
         input_args = _get_input_args_from_frame(self)
         launch_args = output_args + input_args + list(args)
@@ -6260,10 +6258,10 @@ def _get_replacement_values_for_columns(
             values_columns = {
                 col: [value]
                 if _is_non_decimal_numeric_dtype(columns_dtype_map[col])
-                else full(
-                    len(to_replace),
+                else as_column(
                     value,
-                    cudf.dtype(type(value)),
+                    length=len(to_replace),
+                    dtype=cudf.dtype(type(value)),
                 )
                 for col in columns_dtype_map
             }
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 70112044f75..315a21020a2 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -667,7 +667,7 @@ def isin(self, values, level=None):
             self_df = self.to_frame(index=False).reset_index()
             values_df = values_idx.to_frame(index=False)
             idx = self_df.merge(values_df, how="leftsemi")._data["index"]
-            res = cudf.core.column.full(size=len(self), fill_value=False)
+            res = column.as_column(False, length=len(self))
             res[idx] = True
             result = res.values
         else:
@@ -1845,9 +1845,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
                 "index must be monotonic increasing or decreasing"
             )
 
-        result = cudf.core.column.full(
-            len(target),
-            fill_value=-1,
+        result = column.as_column(
+            -1,
+            length=len(target),
             dtype=libcudf.types.size_type_dtype,
         )
         if not len(self):
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index cb5008af3ad..1b18e11c047 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -55,7 +55,6 @@
     IntervalColumn,
     TimeDeltaColumn,
     as_column,
-    full,
 )
 from cudf.core.column.categorical import (
     CategoricalAccessor as CategoricalAccessor,
@@ -1311,7 +1310,7 @@ def map(self, arg, na_action=None) -> "Series":
                 {
                     "x": arg.keys(),
                     "s": arg.values(),
-                    "bool": full(len(arg), True, dtype=self.dtype),
+                    "bool": as_column(True, length=len(arg), dtype=self.dtype),
                 }
             )
             res = lhs.merge(rhs, on="x", how="left").sort_values(
@@ -1333,7 +1332,7 @@ def map(self, arg, na_action=None) -> "Series":
                 {
                     "x": arg.keys(),
                     "s": arg,
-                    "bool": full(len(arg), True, dtype=self.dtype),
+                    "bool": as_column(True, length=len(arg), dtype=self.dtype),
                 }
             )
             res = lhs.merge(rhs, on="x", how="left").sort_values(
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 0e0df4ecf6e..d182b7b4a7c 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -770,7 +770,7 @@ def _isin_datetimelike(
         was_string = len(rhs) and rhs.dtype.kind == "O"
 
         if rhs.dtype.kind in {"f", "i", "u"}:
-            return cudf.core.column.full(len(lhs), False, dtype="bool")
+            return column.as_column(False, length=len(lhs), dtype="bool")
         rhs = rhs.astype(lhs.dtype)
         if was_string:
             warnings.warn(
@@ -787,7 +787,7 @@ def _isin_datetimelike(
     except ValueError:
         # pandas functionally returns all False when cleansing via
         # typecasting fails
-        return cudf.core.column.full(len(lhs), False, dtype="bool")
+        return column.as_column(False, length=len(lhs), dtype="bool")
 
     res = lhs._obtain_isin_result(rhs)
     return res
diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py
index 890e4ecc2f0..2037b1682db 100644
--- a/python/cudf/cudf/core/window/rolling.py
+++ b/python/cudf/cudf/core/window/rolling.py
@@ -9,7 +9,6 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf.api.types import is_integer, is_number
-from cudf.core import column
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column.column import as_column
 from cudf.core.mixins import Reducible
@@ -236,8 +235,8 @@ def _apply_agg_column(self, source_column, agg_name):
             window = None
         else:
             preceding_window = as_column(self.window)
-            following_window = column.full(
-                self.window.size, 0, dtype=self.window.dtype
+            following_window = as_column(
+                0, length=self.window.size, dtype=self.window.dtype
             )
             window = None
 
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 6c70b08384f..bead9c352ef 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -20,7 +20,7 @@
 import cudf
 from cudf._lib import parquet as libparquet
 from cudf.api.types import is_list_like
-from cudf.core.column import build_categorical_column, column_empty, full
+from cudf.core.column import as_column, build_categorical_column, column_empty
 from cudf.utils import ioutils
 from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
 
@@ -762,9 +762,9 @@ def _parquet_to_frame(
             _len = len(dfs[-1])
             if partition_categories and name in partition_categories:
                 # Build the categorical column from `codes`
-                codes = full(
-                    size=_len,
-                    fill_value=partition_categories[name].index(value),
+                codes = as_column(
+                    partition_categories[name].index(value),
+                    length=_len,
                 )
                 dfs[-1][name] = build_categorical_column(
                     categories=partition_categories[name],
@@ -788,10 +788,10 @@ def _parquet_to_frame(
                         masked=True,
                     )
                 else:
-                    dfs[-1][name] = full(
-                        size=_len,
-                        fill_value=value,
+                    dfs[-1][name] = as_column(
+                        value,
                         dtype=_dtype,
+                        length=_len,
                     )
 
     if len(dfs) > 1:
diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py
index 091cd6b57a4..1994536f395 100644
--- a/python/cudf/cudf/tests/test_testing.py
+++ b/python/cudf/cudf/tests/test_testing.py
@@ -6,7 +6,7 @@
 import pytest
 
 import cudf
-from cudf.core.column.column import as_column, full
+from cudf.core.column.column import as_column
 from cudf.testing import (
     assert_frame_equal,
     assert_index_equal,
@@ -172,8 +172,8 @@ def test_assert_column_equal_dtype_edge_cases(other):
     assert_column_equal(base.slice(0, 0), other.slice(0, 0), check_dtype=False)
     assert_column_equal(other.slice(0, 0), base.slice(0, 0), check_dtype=False)
 
-    base = full(len(base), fill_value=cudf.NA, dtype=base.dtype)
-    other = full(len(other), fill_value=cudf.NA, dtype=other.dtype)
+    base = as_column(cudf.NA, length=len(base), dtype=base.dtype)
+    other = as_column(cudf.NA, length=len(other), dtype=other.dtype)
 
     assert_column_equal(base, other, check_dtype=False)
     assert_column_equal(other, base, check_dtype=False)
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index ec5693e14d2..95621cf9519 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 import decimal
 import functools
@@ -396,8 +396,8 @@ def _all_bools_with_nulls(lhs, rhs, bool_fill_value):
     else:
         result_mask = None
 
-    result_col = column.full(
-        size=len(lhs), fill_value=bool_fill_value, dtype=cudf.dtype(np.bool_)
+    result_col = column.as_column(
+        bool_fill_value, dtype=cudf.dtype(np.bool_), length=len(lhs)
     )
     if result_mask is not None:
         result_col = result_col.set_mask(result_mask.as_mask())
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 454cce76ff2..317c45ba582 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -105,8 +105,10 @@ def _get_non_empty_data(s):
         categories = (
             s.categories if len(s.categories) else [UNKNOWN_CATEGORIES]
         )
-        codes = cudf.core.column.full(
-            size=2, fill_value=0, dtype=cudf._lib.types.size_type_dtype
+        codes = cudf.core.column.as_column(
+            0,
+            dtype=cudf._lib.types.size_type_dtype,
+            length=2,
         )
         ordered = s.ordered
         data = cudf.core.column.build_categorical_column(

From 427ce014bbefba17c47fc032c71c3f513f2fce06 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 4 Mar 2024 17:34:20 -1000
Subject: [PATCH 13/18] Add ListColumns.to_pandas(arrow_type=) (#15228)

I think there will be a mypy error on main soon as https://github.com/rapidsai/cudf/pull/15182 and https://github.com/rapidsai/cudf/pull/15155 were merge in close succession (my fault for not rebasing first)

Also address a review I forgot in https://github.com/rapidsai/cudf/pull/15182/files#r1507154770

cc @galipremsagar

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15228
---
 python/cudf/cudf/core/column/interval.py |  2 +-
 python/cudf/cudf/core/column/lists.py    | 18 ++++++++++++------
 python/cudf/cudf/tests/test_series.py    |  2 +-
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py
index dcec8957bb2..dc609f732e0 100644
--- a/python/cudf/cudf/core/column/interval.py
+++ b/python/cudf/cudf/core/column/interval.py
@@ -123,7 +123,7 @@ def to_pandas(
         if nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
         elif arrow_type:
-            raise NotImplementedError(f"{nullable=} is not implemented.")
+            raise NotImplementedError(f"{arrow_type=} is not implemented.")
         return pd.Series(
             self.dtype.to_pandas().__from_arrow__(self.to_arrow()), index=index
         )
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index d1bf0b74d3c..1c2bcbef2ec 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -294,17 +294,23 @@ def to_pandas(
         *,
         index: Optional[pd.Index] = None,
         nullable: bool = False,
+        arrow_type: bool = False,
     ) -> pd.Series:
         # Can't rely on Column.to_pandas implementation for lists.
         # Need to perform `to_pylist` to preserve list types.
+        if arrow_type and nullable:
+            raise ValueError(
+                f"{arrow_type=} and {nullable=} cannot both be set."
+            )
         if nullable:
             raise NotImplementedError(f"{nullable=} is not implemented.")
-
-        pd_series = pd.Series(self.to_arrow().to_pylist(), dtype="object")
-
-        if index is not None:
-            pd_series.index = index
-        return pd_series
+        pa_array = self.to_arrow()
+        if arrow_type:
+            return pd.Series(
+                pd.arrays.ArrowExtensionArray(pa_array), index=index
+            )
+        else:
+            return pd.Series(pa_array.tolist(), dtype="object", index=index)
 
 
 class ListMethods(ColumnMethods):
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 6b5c0406deb..e043f358bbe 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2726,7 +2726,7 @@ def test_series_from_large_string():
 def test_series_to_pandas_arrow_type_nullable_raises(scalar):
     pa_array = pa.array([scalar, None])
     ser = cudf.Series(pa_array)
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError, match=".* cannot both be set"):
         ser.to_pandas(nullable=True, arrow_type=True)
 
 

From cd79fe55d9e4d296f5b865b7b556448fbc50a828 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Mon, 4 Mar 2024 20:04:19 -0800
Subject: [PATCH 14/18] Implement zero-copy host buffer source instead of using
 an arrow implementation (#15189)

Avoids an arrow dependency with a bit of simple code.

No real impact on performance.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/15189
---
 cpp/src/io/utilities/datasource.cpp | 33 ++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index cf2ba369023..d2026473b6c 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -18,7 +18,6 @@
 #include "io/utilities/config_utils.hpp"
 
 #include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/io/arrow_io_source.hpp>
 #include <cudf/io/datasource.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/span.hpp>
@@ -27,7 +26,6 @@
 
 #include <rmm/device_buffer.hpp>
 
-#include <arrow/io/memory.h>
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <unistd.h>
@@ -338,6 +336,33 @@ class device_buffer_source final : public datasource {
   cudf::device_span<std::byte const> _d_buffer;  ///< A non-owning view of the existing device data
 };
 
+// zero-copy host buffer source
+class host_buffer_source final : public datasource {
+ public:
+  explicit host_buffer_source(cudf::host_span<std::byte const> h_buffer) : _h_buffer{h_buffer} {}
+
+  size_t host_read(size_t offset, size_t size, uint8_t* dst) override
+  {
+    auto const count = std::min(size, this->size() - offset);
+    std::memcpy(dst, _h_buffer.data() + offset, count);
+    return count;
+  }
+
+  std::unique_ptr<buffer> host_read(size_t offset, size_t size) override
+  {
+    auto const count = std::min(size, this->size() - offset);
+    return std::make_unique<non_owning_buffer>(
+      reinterpret_cast<uint8_t const*>(_h_buffer.data() + offset), count);
+  }
+
+  [[nodiscard]] bool supports_device_read() const override { return false; }
+
+  [[nodiscard]] size_t size() const override { return _h_buffer.size(); }
+
+ private:
+  cudf::host_span<std::byte const> _h_buffer;  ///< A non-owning view of the existing host data
+};
+
 /**
  * @brief Wrapper class for user implemented data sources
  *
@@ -424,9 +449,7 @@ std::unique_ptr<datasource> datasource::create(host_buffer const& buffer)
 
 std::unique_ptr<datasource> datasource::create(cudf::host_span<std::byte const> buffer)
 {
-  // Use Arrow IO buffer class for zero-copy reads of host memory
-  return std::make_unique<arrow_io_source>(std::make_shared<arrow::io::BufferReader>(
-    reinterpret_cast<uint8_t const*>(buffer.data()), buffer.size()));
+  return std::make_unique<host_buffer_source>(buffer);
 }
 
 std::unique_ptr<datasource> datasource::create(cudf::device_span<std::byte const> buffer)

From f804aa69ca22124f648aba70096df6f1efe27629 Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Mon, 4 Mar 2024 23:26:09 -0600
Subject: [PATCH 15/18] Fix testchunkedPackTwoPasses to copy from the bounce
 buffer (#15220)

This is a follow on from https://github.com/rapidsai/cudf/pull/15210. We bring back the test and fix it so it copies from the right buffer this time. I also set the original column to have some values and nulls, to make sure we are checking something interesting.

Authors:
  - Alessandro Bellina (https://github.com/abellina)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Gera Shegalov (https://github.com/gerashegalov)

URL: https://github.com/rapidsai/cudf/pull/15220
---
 java/src/test/java/ai/rapids/cudf/TableTest.java | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 3f0470d854a..44dd20561bf 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -3758,12 +3758,16 @@ void testChunkedPackBasic() {
       }
     }
   }
-/*
+
   @Test
   void testChunkedPackTwoPasses() {
     // this test packes ~2MB worth of long into a 1MB bounce buffer
     // this is 3 iterations because of the validity buffer
     Long[] longs = new Long[256*1024];
+    // Initialize elements at odd-numbered indices
+    for (int i = 1; i < longs.length; i += 2) {
+      longs[i] = (long)i;
+    }
     try (Table t1 = new Table.TestBuilder().column(longs).build();
          DeviceMemoryBuffer bounceBuffer = DeviceMemoryBuffer.allocate(1L*1024*1024);
          ChunkedPack cp = t1.makeChunkedPack(1L*1024*1024);
@@ -3776,7 +3780,7 @@ void testChunkedPackTwoPasses() {
       while (cp.hasNext()) {
         long copied = cp.next(bounceBuffer);
         target.copyFromDeviceBufferAsync(
-          offset, target, 0, copied, Cuda.DEFAULT_STREAM);
+          offset, bounceBuffer, 0, copied, Cuda.DEFAULT_STREAM);
         offset += copied;
       }
 
@@ -3787,7 +3791,6 @@ void testChunkedPackTwoPasses() {
       }
     }
   }
-*/
 
   @Test
   void testContiguousSplitWithStrings() {

From 8d073e4ca0a6cb9d9a4d9fe5e4e0147f01d7eb36 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 5 Mar 2024 08:06:46 -0500
Subject: [PATCH 16/18] Change strings_column_view::char_size to return int64
 (#15197)

Changes the `cudf::strings_column_view::chars_size()` function to return `int64_t` instead of `size_type`

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/15197
---
 cpp/benchmarks/string/case.cpp                   | 4 +++-
 cpp/include/cudf/strings/strings_column_view.hpp | 2 +-
 cpp/src/strings/strings_column_view.cpp          | 8 ++++----
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp
index 639a3dc1181..a7db972d39f 100644
--- a/cpp/benchmarks/string/case.cpp
+++ b/cpp/benchmarks/string/case.cpp
@@ -45,7 +45,9 @@ void bench_case(nvbench::state& state)
       cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126);  // nice ASCII range
     auto input        = cudf::strings_column_view(col_view);
     auto ascii_column = create_random_column(
-      cudf::type_id::INT8, row_count{input.chars_size(cudf::get_default_stream())}, ascii_profile);
+      cudf::type_id::INT8,
+      row_count{static_cast<cudf::size_type>(input.chars_size(cudf::get_default_stream()))},
+      ascii_profile);
     auto ascii_data = ascii_column->view();
 
     col_view = cudf::column_view(col_view.type(),
diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp
index 840a2dd1165..036589e17fe 100644
--- a/cpp/include/cudf/strings/strings_column_view.hpp
+++ b/cpp/include/cudf/strings/strings_column_view.hpp
@@ -112,7 +112,7 @@ class strings_column_view : private column_view {
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @return Number of bytes in the chars child column
    */
-  [[nodiscard]] size_type chars_size(rmm::cuda_stream_view stream) const noexcept;
+  [[nodiscard]] int64_t chars_size(rmm::cuda_stream_view stream) const noexcept;
 
   /**
    * @brief Return an iterator for the chars child column.
diff --git a/cpp/src/strings/strings_column_view.cpp b/cpp/src/strings/strings_column_view.cpp
index 6be22d8e729..83ae916afc3 100644
--- a/cpp/src/strings/strings_column_view.cpp
+++ b/cpp/src/strings/strings_column_view.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <cudf/detail/get_value.cuh>
+#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -45,10 +45,10 @@ strings_column_view::offset_iterator strings_column_view::offsets_end() const
   return offsets_begin() + size() + 1;
 }
 
-size_type strings_column_view::chars_size(rmm::cuda_stream_view stream) const noexcept
+int64_t strings_column_view::chars_size(rmm::cuda_stream_view stream) const noexcept
 {
-  if (size() == 0) return 0;
-  return detail::get_value<size_type>(offsets(), offsets().size() - 1, stream);
+  if (size() == 0) { return 0L; }
+  return cudf::strings::detail::get_offset_value(offsets(), offsets().size() - 1, stream);
 }
 
 strings_column_view::chars_iterator strings_column_view::chars_begin(rmm::cuda_stream_view) const

From 1f5fcf679ee6052ab320220ee7218fcad51d99f2 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 5 Mar 2024 08:17:53 -0800
Subject: [PATCH 17/18] Improvements for `__cuda_array_interface__` tests
 (#15188)

This PR contains a few minor improvements for `__cuda_array_interface__` and its tests. Found while working on #15111.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15188
---
 python/cudf/cudf/core/single_column_frame.py  |  5 ++++-
 .../cudf/tests/test_cuda_array_interface.py   | 20 ++++++++++++-------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 97779522b8b..19dde2e51b9 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -242,7 +242,10 @@ def __cuda_array_interface__(self):
         try:
             return self._column.__cuda_array_interface__
         except NotImplementedError:
-            raise AttributeError
+            raise AttributeError(
+                f"'{type(self).__name__}' object has no attribute "
+                "'__cuda_array_interface__'"
+            )
 
     @_cudf_nvtx_annotate
     def factorize(self, sort=False, use_na_sentinel=True):
diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py
index 1f20152172b..213c6c2c1f9 100644
--- a/python/cudf/cudf/tests/test_cuda_array_interface.py
+++ b/python/cudf/cudf/tests/test_cuda_array_interface.py
@@ -4,10 +4,10 @@
 from contextlib import ExitStack as does_not_raise
 
 import cupy
+import numba.cuda
 import numpy as np
 import pandas as pd
 import pytest
-from numba import cuda
 
 import cudf
 from cudf.core.buffer.spill_manager import get_global_manager
@@ -25,7 +25,7 @@ def test_cuda_array_interface_interop_in(dtype, module):
         if dtype in DATETIME_TYPES:
             expectation = pytest.raises(ValueError)
     elif module == "numba":
-        module_constructor = cuda.to_device
+        module_constructor = numba.cuda.to_device
 
     with expectation:
         module_data = module_constructor(np_data)
@@ -55,7 +55,7 @@ def to_host_function(x):
             return cupy.asnumpy(x)
 
     elif module == "numba":
-        module_constructor = cuda.as_cuda_array
+        module_constructor = numba.cuda.as_cuda_array
 
         def to_host_function(x):
             return x.copy_to_host()
@@ -89,7 +89,7 @@ def to_host_function(x):
 
     elif module == "numba":
         expectation = pytest.raises(NotImplementedError)
-        module_constructor = cuda.as_cuda_array
+        module_constructor = numba.cuda.as_cuda_array
 
         def to_host_function(x):
             return x.copy_to_host()
@@ -135,9 +135,11 @@ def test_cuda_array_interface_as_column(dtype, nulls, mask_type):
 
     if mask_type == "bools":
         if nulls == "some":
-            obj.__cuda_array_interface__["mask"] = cuda.to_device(mask)
+            obj.__cuda_array_interface__["mask"] = numba.cuda.to_device(mask)
         elif nulls == "all":
-            obj.__cuda_array_interface__["mask"] = cuda.to_device([False] * 10)
+            obj.__cuda_array_interface__["mask"] = numba.cuda.to_device(
+                [False] * 10
+            )
 
     expect = sr
     got = cudf.Series(obj)
@@ -193,7 +195,11 @@ def test_cuda_array_interface_pytorch():
 
     assert_eq(got, cudf.Series(buffer, dtype=np.bool_))
 
-    # TODO: This test fails with PyTorch 2. Is it still expected to be valid?
+    # TODO: This test fails with PyTorch 2. It appears that PyTorch
+    # checks that the pointer is device-accessible even when the
+    # size is zero. See
+    # https://github.com/pytorch/pytorch/issues/98133
+    #
     # index = cudf.Index([], dtype="float64")
     # tensor = torch.tensor(index)
     # got = cudf.Index(tensor)

From d4368e98a4b92ade651a5f5df98035a297658f16 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 5 Mar 2024 16:45:18 +0000
Subject: [PATCH 18/18] Fix GroupBy.get_group and GroupBy.indices (#15143)

These are supposed to index based on row indices, not row labels.

- Closes https://github.com/rapidsai/cudf/issues/14955

Authors:
  - Lawrence Mitchell (https://github.com/wence-)
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15143
---
 python/cudf/cudf/core/groupby/groupby.py      | 22 +++++++++++++------
 .../cudf/tests/groupby/test_groupby_obj.py    | 15 +++++++++++++
 2 files changed, 30 insertions(+), 7 deletions(-)
 create mode 100644 python/cudf/cudf/tests/groupby/test_groupby_obj.py

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index e4370be304a..caf5ac5928f 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -363,13 +363,22 @@ def indices(self):
         >>> df.groupby(by=["a"]).indices
         {10: array([0, 1]), 40: array([2])}
         """
-        group_names, offsets, _, grouped_values = self._grouped()
+        offsets, group_keys, (indices,) = self._groupby.groups(
+            [
+                cudf.core.column.as_column(
+                    range(len(self.obj)), dtype=size_type_dtype
+                )
+            ]
+        )
 
+        group_keys = libcudf.stream_compaction.drop_duplicates(group_keys)
+        if len(group_keys) > 1:
+            index = cudf.MultiIndex.from_arrays(group_keys)
+        else:
+            (group_keys,) = group_keys
+            index = cudf.Index(group_keys)
         return dict(
-            zip(
-                group_names.to_pandas(),
-                np.split(grouped_values.index.values, offsets[1:-1]),
-            )
+            zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1]))
         )
 
     @_cudf_nvtx_annotate
@@ -414,8 +423,7 @@ def get_group(self, name, obj=None):
                 "instead of ``gb.get_group(name, obj=df)``.",
                 FutureWarning,
             )
-
-        return obj.loc[self.groups[name].drop_duplicates()]
+        return obj.iloc[self.indices[name]]
 
     @_cudf_nvtx_annotate
     def size(self):
diff --git a/python/cudf/cudf/tests/groupby/test_groupby_obj.py b/python/cudf/cudf/tests/groupby/test_groupby_obj.py
new file mode 100644
index 00000000000..04b483e08dc
--- /dev/null
+++ b/python/cudf/cudf/tests/groupby/test_groupby_obj.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from numpy.testing import assert_array_equal
+
+import cudf
+from cudf.testing._utils import assert_eq
+
+
+def test_groupby_14955():
+    # https://github.com/rapidsai/cudf/issues/14955
+    df = cudf.DataFrame({"a": [1, 2] * 2}, index=[0] * 4)
+    agg = df.groupby("a")
+    pagg = df.to_pandas().groupby("a")
+    for key in agg.groups:
+        assert_array_equal(pagg.indices[key], agg.indices[key].get())
+        assert_eq(pagg.get_group(key), agg.get_group(key))