From c4bc2331eeb5d350e9effe987f4379a7cecfe8e2 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 15 Nov 2024 13:54:37 -0800
Subject: [PATCH 1/2] Remove cudf._lib.concat in favor of inlining pylibcudf calls

---
 python/cudf/cudf/_lib/CMakeLists.txt        |  1 -
 python/cudf/cudf/_lib/__init__.py           |  1 -
 python/cudf/cudf/_lib/column.pyi            |  9 ++++++
 python/cudf/cudf/_lib/concat.pyx            | 35 ---------------------
 python/cudf/cudf/_lib/utils.pxd             |  2 +-
 python/cudf/cudf/_lib/utils.pyx             |  2 +-
 python/cudf/cudf/core/column/categorical.py |  4 +--
 python/cudf/cudf/core/column/column.py      |  9 +++++-
 python/cudf/cudf/core/dataframe.py          | 23 +++++++++++---
 9 files changed, 39 insertions(+), 47 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/concat.pyx

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 41a7db2285a..7969cc4c13c 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -16,7 +16,6 @@ set(cython_sources
     aggregation.pyx
     binaryop.pyx
     column.pyx
-    concat.pyx
     copying.pyx
     csv.pyx
     datetime.pyx
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 57df6899a22..0dd1c369afd 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -3,7 +3,6 @@
 
 from . import (
     binaryop,
-    concat,
     copying,
     csv,
     datetime,
diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi
index bb38488eefb..bdd90be45b8 100644
--- a/python/cudf/cudf/_lib/column.pyi
+++ b/python/cudf/cudf/_lib/column.pyi
@@ -2,8 +2,12 @@
 
 from __future__ import annotations
 
+from typing import Literal
+
 from typing_extensions import Self
 
+import pylibcudf as plc
+
 from cudf._typing import Dtype, DtypeObj, ScalarLike
 from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase
@@ -71,3 +75,8 @@ class Column:
     # TODO: The val parameter should be Scalar, not ScalarLike
     @staticmethod
     def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ...
+    @staticmethod
+    def from_pylibcudf(
+        col: plc.Column, data_ptr_exposed: bool = False
+    ) -> ColumnBase: ...
+    def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column: ...
diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx
deleted file mode 100644
index e6c2d136f0d..00000000000
--- a/python/cudf/cudf/_lib/concat.pyx
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from libcpp cimport bool
-
-from cudf._lib.column cimport Column
-from cudf._lib.utils cimport data_from_pylibcudf_table
-
-import pylibcudf
-
-from cudf.core.buffer import acquire_spill_lock
-
-
-@acquire_spill_lock()
-def concat_columns(object columns):
-    return Column.from_pylibcudf(
-        pylibcudf.concatenate.concatenate(
-            [col.to_pylibcudf(mode="read") for col in columns]
-        )
-    )
-
-
-@acquire_spill_lock()
-def concat_tables(object tables, bool ignore_index=False):
-    plc_tables = []
-    for table in tables:
-        cols = table._columns
-        if not ignore_index:
-            cols = table._index._columns + cols
-        plc_tables.append(pylibcudf.Table([c.to_pylibcudf(mode="read") for c in cols]))
-
-    return data_from_pylibcudf_table(
-        pylibcudf.concatenate.concatenate(plc_tables),
-        column_names=tables[0]._column_names,
-        index_names=None if ignore_index else tables[0]._index_names
-    )
diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd
index 623c5064a1a..f273aeb4270 100644
--- a/python/cudf/cudf/_lib/utils.pxd
+++ b/python/cudf/cudf/_lib/utils.pxd
@@ -10,7 +10,7 @@ from pylibcudf.libcudf.table.table cimport table, table_view
 
 cdef data_from_unique_ptr(
     unique_ptr[table] c_tbl, column_names, index_names=*)
-cdef data_from_pylibcudf_table(tbl, column_names, index_names=*)
+cpdef data_from_pylibcudf_table(tbl, column_names, index_names=*)
 cpdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *)
 cdef data_from_table_view(
     table_view tv, object owner, object column_names, object index_names=*)
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 292de82e4c4..2ccc6ca34dc 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -309,7 +309,7 @@ cdef data_from_unique_ptr(
     )
 
 
-cdef data_from_pylibcudf_table(tbl, column_names, index_names=None):
+cpdef data_from_pylibcudf_table(tbl, column_names, index_names=None):
     return _data_from_columns(
         columns_from_pylibcudf_table(tbl),
         column_names,
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 087d0ed65f5..0afcbb256c0 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1203,9 +1203,7 @@ def _concat(
         elif newsize == 0:
             codes_col = column.column_empty(0, head.codes.dtype, masked=True)
         else:
-            # Filter out inputs that have 0 length, then concatenate.
-            codes = [o for o in codes if len(o)]
-            codes_col = libcudf.concat.concat_columns(objs)
+            codes_col = column.concat_columns(codes)  # type: ignore[arg-type]
 
         codes_col = as_unsigned_codes(
             len(cats),
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index d2f9d208c77..b3c17c626b6 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -19,6 +19,7 @@
 from pandas.core.arrays.arrow.extension_types import ArrowIntervalType
 from typing_extensions import Self
 
+import pylibcudf as plc
 import rmm
 
 import cudf
@@ -2299,4 +2300,10 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
         return column_empty(0, head.dtype, masked=True)
 
     # Filter out inputs that have 0 length, then concatenate.
-    return libcudf.concat.concat_columns([o for o in objs if len(o)])
+    objs_with_len = [o for o in objs if len(o)]
+    with acquire_spill_lock():
+        return Column.from_pylibcudf(
+            plc.concatenate.concatenate(
+                [col.to_pylibcudf(mode="read") for col in objs_with_len]
+            )
+        )
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index bf1c39b23da..aba3254a660 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -26,6 +26,8 @@
 from pandas.io.formats.printing import pprint_thing
 from typing_extensions import Self, assert_never
 
+import pylibcudf as plc
+
 import cudf
 import cudf.core.common
 from cudf import _lib as libcudf
@@ -43,6 +45,7 @@
 from cudf.core import column, df_protocol, indexing_utils, reshape
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.abc import Serializable
+from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import (
     CategoricalColumn,
     ColumnBase,
@@ -1784,11 +1787,23 @@ def _concat(
             )
 
         # Concatenate the Tables
-        out = cls._from_data(
-            *libcudf.concat.concat_tables(
-                tables, ignore_index=ignore_index or are_all_range_index
+        ignore = ignore_index or are_all_range_index
+        with acquire_spill_lock():
+            plc_tables = []
+            for table in tables:
+                cols = table._columns
+                if not ignore:
+                    cols = table._index._columns + cols
+                plc_tables.append(
+                    plc.Table([c.to_pylibcudf(mode="read") for c in cols])
+                )
+
+            concatted = libcudf.utils.data_from_pylibcudf_table(
+                plc.concatenate.concatenate(plc_tables),
+                column_names=tables[0]._column_names,
+                index_names=None if ignore else tables[0]._index_names,
             )
-        )
+        out = cls._from_data(*concatted)
 
         # If ignore_index is True, all input frames are empty, and at
         # least one input frame has an index, assign a new RangeIndex

From da9c68df7532bb61865f01290330e3d14908c2b5 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 15 Nov 2024 14:06:57 -0800
Subject: [PATCH 2/2] Use list comprehension to build plc_tables in dataframe _concat

---
 python/cudf/cudf/core/dataframe.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index aba3254a660..61fcaaec419 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1788,20 +1788,29 @@ def _concat(
 
         # Concatenate the Tables
         ignore = ignore_index or are_all_range_index
+        index_names = None if ignore else tables[0]._index_names
+        column_names = tables[0]._column_names
         with acquire_spill_lock():
-            plc_tables = []
-            for table in tables:
-                cols = table._columns
-                if not ignore:
-                    cols = table._index._columns + cols
-                plc_tables.append(
-                    plc.Table([c.to_pylibcudf(mode="read") for c in cols])
+            plc_tables = [
+                plc.Table(
+                    [
+                        c.to_pylibcudf(mode="read")
+                        for c in (
+                            table._columns
+                            if ignore
+                            else itertools.chain(
+                                table._index._columns, table._columns
+                            )
+                        )
+                    ]
                 )
+                for table in tables
+            ]
 
             concatted = libcudf.utils.data_from_pylibcudf_table(
                 plc.concatenate.concatenate(plc_tables),
-                column_names=tables[0]._column_names,
-                index_names=None if ignore else tables[0]._index_names,
+                column_names=column_names,
+                index_names=index_names,
             )
         out = cls._from_data(*concatted)