From c4bc2331eeb5d350e9effe987f4379a7cecfe8e2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Nov 2024 13:54:37 -0800 Subject: [PATCH 1/2] Remove cudf._lib.concat in favor of inlining pylibcudf --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/column.pyi | 9 ++++++ python/cudf/cudf/_lib/concat.pyx | 35 --------------------- python/cudf/cudf/_lib/utils.pxd | 2 +- python/cudf/cudf/_lib/utils.pyx | 2 +- python/cudf/cudf/core/column/categorical.py | 4 +-- python/cudf/cudf/core/column/column.py | 9 +++++- python/cudf/cudf/core/dataframe.py | 23 +++++++++++--- 9 files changed, 39 insertions(+), 47 deletions(-) delete mode 100644 python/cudf/cudf/_lib/concat.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 41a7db2285a..7969cc4c13c 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -16,7 +16,6 @@ set(cython_sources aggregation.pyx binaryop.pyx column.pyx - concat.pyx copying.pyx csv.pyx datetime.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 57df6899a22..0dd1c369afd 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -3,7 +3,6 @@ from . import ( binaryop, - concat, copying, csv, datetime, diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index bb38488eefb..bdd90be45b8 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -2,8 +2,12 @@ from __future__ import annotations +from typing import Literal + from typing_extensions import Self +import pylibcudf as plc + from cudf._typing import Dtype, DtypeObj, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase @@ -71,3 +75,8 @@ class Column: # TODO: The val parameter should be Scalar, not ScalarLike @staticmethod def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ... + @staticmethod + def from_pylibcudf( + col: plc.Column, data_ptr_exposed: bool = False + ) -> ColumnBase: ... + def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column: ... diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx deleted file mode 100644 index e6c2d136f0d..00000000000 --- a/python/cudf/cudf/_lib/concat.pyx +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp cimport bool - -from cudf._lib.column cimport Column -from cudf._lib.utils cimport data_from_pylibcudf_table - -import pylibcudf - -from cudf.core.buffer import acquire_spill_lock - - -@acquire_spill_lock() -def concat_columns(object columns): - return Column.from_pylibcudf( - pylibcudf.concatenate.concatenate( - [col.to_pylibcudf(mode="read") for col in columns] - ) - ) - - -@acquire_spill_lock() -def concat_tables(object tables, bool ignore_index=False): - plc_tables = [] - for table in tables: - cols = table._columns - if not ignore_index: - cols = table._index._columns + cols - plc_tables.append(pylibcudf.Table([c.to_pylibcudf(mode="read") for c in cols])) - - return data_from_pylibcudf_table( - pylibcudf.concatenate.concatenate(plc_tables), - column_names=tables[0]._column_names, - index_names=None if ignore_index else tables[0]._index_names - ) diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index 623c5064a1a..f273aeb4270 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -10,7 +10,7 @@ from pylibcudf.libcudf.table.table cimport table, table_view cdef data_from_unique_ptr( unique_ptr[table] c_tbl, column_names, index_names=*) -cdef data_from_pylibcudf_table(tbl, column_names, index_names=*) +cpdef data_from_pylibcudf_table(tbl, column_names, index_names=*) cpdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *) cdef data_from_table_view( table_view tv, object owner, object column_names, object index_names=*) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 292de82e4c4..2ccc6ca34dc 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -309,7 +309,7 @@ cdef data_from_unique_ptr( ) -cdef data_from_pylibcudf_table(tbl, column_names, index_names=None): +cpdef data_from_pylibcudf_table(tbl, column_names, index_names=None): return _data_from_columns( columns_from_pylibcudf_table(tbl), column_names, diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 087d0ed65f5..0afcbb256c0 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1203,9 +1203,7 @@ def _concat( elif newsize == 0: codes_col = column.column_empty(0, head.codes.dtype, masked=True) else: - # Filter out inputs that have 0 length, then concatenate. - codes = [o for o in codes if len(o)] - codes_col = libcudf.concat.concat_columns(objs) + codes_col = column.concat_columns(codes) # type: ignore[arg-type] codes_col = as_unsigned_codes( len(cats), diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d2f9d208c77..b3c17c626b6 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -19,6 +19,7 @@ from pandas.core.arrays.arrow.extension_types import ArrowIntervalType from typing_extensions import Self +import pylibcudf as plc import rmm import cudf @@ -2299,4 +2300,10 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: return column_empty(0, head.dtype, masked=True) # Filter out inputs that have 0 length, then concatenate. - return libcudf.concat.concat_columns([o for o in objs if len(o)]) + objs_with_len = [o for o in objs if len(o)] + with acquire_spill_lock(): + return Column.from_pylibcudf( + plc.concatenate.concatenate( + [col.to_pylibcudf(mode="read") for col in objs_with_len] + ) + ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bf1c39b23da..aba3254a660 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -26,6 +26,8 @@ from pandas.io.formats.printing import pprint_thing from typing_extensions import Self, assert_never +import pylibcudf as plc + import cudf import cudf.core.common from cudf import _lib as libcudf @@ -43,6 +45,7 @@ from cudf.core import column, df_protocol, indexing_utils, reshape from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable +from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( CategoricalColumn, ColumnBase, @@ -1784,11 +1787,23 @@ def _concat( ) # Concatenate the Tables - out = cls._from_data( - *libcudf.concat.concat_tables( - tables, ignore_index=ignore_index or are_all_range_index + ignore = ignore_index or are_all_range_index + with acquire_spill_lock(): + plc_tables = [] + for table in tables: + cols = table._columns + if not ignore: + cols = table._index._columns + cols + plc_tables.append( + plc.Table([c.to_pylibcudf(mode="read") for c in cols]) + ) + + concatted = libcudf.utils.data_from_pylibcudf_table( + plc.concatenate.concatenate(plc_tables), + column_names=tables[0]._column_names, + index_names=None if ignore else tables[0]._index_names, ) - ) + out = cls._from_data(*concatted) # If ignore_index is True, all input frames are empty, and at # least one input frame has an index, assign a new RangeIndex From da9c68df7532bb61865f01290330e3d14908c2b5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Nov 2024 14:06:57 -0800 Subject: [PATCH 2/2] Use list comprehension --- python/cudf/cudf/core/dataframe.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index aba3254a660..61fcaaec419 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1788,20 +1788,29 @@ def _concat( # Concatenate the Tables ignore = ignore_index or are_all_range_index + index_names = None if ignore else tables[0]._index_names + column_names = tables[0]._column_names with acquire_spill_lock(): - plc_tables = [] - for table in tables: - cols = table._columns - if not ignore: - cols = table._index._columns + cols - plc_tables.append( - plc.Table([c.to_pylibcudf(mode="read") for c in cols]) + plc_tables = [ + plc.Table( + [ + c.to_pylibcudf(mode="read") + for c in ( + table._columns + if ignore + else itertools.chain( + table._index._columns, table._columns + ) + ) + ] ) + for table in tables + ] concatted = libcudf.utils.data_from_pylibcudf_table( plc.concatenate.concatenate(plc_tables), - column_names=tables[0]._column_names, - index_names=None if ignore else tables[0]._index_names, + column_names=column_names, + index_names=index_names, ) out = cls._from_data(*concatted)