diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 9806ae11339..d1765f55c90 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -22,7 +22,6 @@ set(cython_sources filling.pyx groupby.pyx interop.pyx - merge.pyx orc.pyx parquet.pyx reduce.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 9af1dc976a6..096f9e5b4b4 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -9,7 +9,6 @@ filling, groupby, interop, - merge, nvtext, orc, parquet, diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx deleted file mode 100644 index 9372acdab44..00000000000 --- a/python/cudf/cudf/_lib/merge.pyx +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp cimport bool - -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf - - -def merge_sorted( - list input_columns, - list key_columns_indices, - bool ascending=True, - str na_position="last", -): - """Merge multiple lists of lexicographically sorted columns into one list - of sorted columns. `input_columns` is a list of lists of columns to be - merged. - """ - c_input_tables = [ - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ) for source_columns in input_columns - ] - - num_keys = len(key_columns_indices) - - column_order = ( - pylibcudf.types.Order.ASCENDING if ascending - else pylibcudf.types.Order.DESCENDING - ) - - if not ascending: - na_position = "last" if na_position == "first" else "first" - null_precedence = ( - pylibcudf.types.NullOrder.BEFORE if na_position == "first" - else pylibcudf.types.NullOrder.AFTER - ) - - return columns_from_pylibcudf_table( - pylibcudf.merge.merge( - c_input_tables, - key_columns_indices, - [column_order] * num_keys, - [null_precedence] * num_keys, - ) - ) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 016bd1225cd..2e02fe91fa0 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -8,7 +8,10 @@ import numpy as np import pandas as pd +import pylibcudf as plc + import cudf +from cudf._lib.column import Column from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default @@ -941,21 +944,46 @@ def _merge_sorted( idx + objs[0].index.nlevels for idx in key_columns_indices ] - columns = [ - [ - *(obj.index._columns if not ignore_index else ()), - *obj._columns, - ] + columns = ( + itertools.chain(obj.index._columns, obj._columns) + if not ignore_index + else obj._columns for obj in objs + ) + + input_tables = [ + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]) + for source_columns in columns + ] + + num_keys = len(key_columns_indices) + + column_order = ( + plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING + ) + + if not ascending: + na_position = "last" if na_position == "first" else "first" + + null_precedence = ( + plc.types.NullOrder.BEFORE + if na_position == "first" + else plc.types.NullOrder.AFTER + ) + + plc_table = plc.merge.merge( + input_tables, + key_columns_indices, + [column_order] * num_keys, + [null_precedence] * num_keys, + ) + + result_columns = [ + Column.from_pylibcudf(col) for col in plc_table.columns() ] return objs[0]._from_columns_like_self( - cudf._lib.merge.merge_sorted( - input_columns=columns, - key_columns_indices=key_columns_indices, - ascending=ascending, - na_position=na_position, - ), + result_columns, column_names=objs[0]._column_names, index_names=None if ignore_index else objs[0]._index_names, )