From 658a6334d8b1e12d9b0d48daf6914fe543945a23 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 18 Nov 2024 13:57:40 -0800 Subject: [PATCH] Remove cudf._lib.merge in favor of inlining pylibcudf --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/merge.pyx | 47 -------------------------- python/cudf/cudf/core/reshape.py | 50 ++++++++++++++++++++++------ 4 files changed, 39 insertions(+), 60 deletions(-) delete mode 100644 python/cudf/cudf/_lib/merge.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 2958c286d20..d465469d685 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -25,7 +25,6 @@ set(cython_sources join.pyx json.pyx lists.pyx - merge.pyx null_mask.pyx orc.pyx parquet.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 19dc4488560..17667eceeeb 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -11,7 +11,6 @@ interop, join, json, - merge, null_mask, nvtext, orc, diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx deleted file mode 100644 index 9372acdab44..00000000000 --- a/python/cudf/cudf/_lib/merge.pyx +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp cimport bool - -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf - - -def merge_sorted( - list input_columns, - list key_columns_indices, - bool ascending=True, - str na_position="last", -): - """Merge multiple lists of lexicographically sorted columns into one list - of sorted columns. `input_columns` is a list of lists of columns to be - merged. - """ - c_input_tables = [ - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ) for source_columns in input_columns - ] - - num_keys = len(key_columns_indices) - - column_order = ( - pylibcudf.types.Order.ASCENDING if ascending - else pylibcudf.types.Order.DESCENDING - ) - - if not ascending: - na_position = "last" if na_position == "first" else "first" - null_precedence = ( - pylibcudf.types.NullOrder.BEFORE if na_position == "first" - else pylibcudf.types.NullOrder.AFTER - ) - - return columns_from_pylibcudf_table( - pylibcudf.merge.merge( - c_input_tables, - key_columns_indices, - [column_order] * num_keys, - [null_precedence] * num_keys, - ) - ) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 3d132c92d54..4e92d0043da 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -8,7 +8,10 @@ import numpy as np import pandas as pd +import pylibcudf as plc + import cudf +from cudf._lib.column import Column from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default @@ -941,21 +944,46 @@ def _merge_sorted( idx + objs[0].index.nlevels for idx in key_columns_indices ] - columns = [ - [ - *(obj.index._columns if not ignore_index else ()), - *obj._columns, - ] + columns = ( + itertools.chain(obj.index._columns, obj._columns) + if not ignore_index + else obj._columns for obj in objs + ) + + input_tables = [ + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]) + for source_columns in columns + ] + + num_keys = len(key_columns_indices) + + column_order = ( + plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING + ) + + if not ascending: + na_position = "last" if na_position == "first" else "first" + + null_precedence = ( + plc.types.NullOrder.BEFORE + if na_position == "first" + else plc.types.NullOrder.AFTER + ) + + plc_table = plc.merge.merge( + input_tables, + key_columns_indices, + [column_order] * num_keys, + [null_precedence] * num_keys, + ) + + result_columns = [ + Column.from_pylibcudf(col) for col in plc_table.columns() ] return objs[0]._from_columns_like_self( - cudf._lib.merge.merge_sorted( - input_columns=columns, - key_columns_indices=key_columns_indices, - ascending=ascending, - na_position=na_position, - ), + result_columns, column_names=objs[0]._column_names, index_names=None if ignore_index else objs[0]._index_names, )