From d6e30681aab3167db8eb3fa38fb3be05fde18627 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 21 Apr 2022 19:43:31 -0700 Subject: [PATCH] Cython API refactor: `merge.pyx` (#10698) This PR refactors `merge_sorted` in `merge.pyx` to accept a list of columns. Contributes to #10153. Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/10698 --- python/cudf/cudf/_lib/merge.pyx | 88 +++++++++----------------------- python/cudf/cudf/core/reshape.py | 42 +++++++++++---- 2 files changed, 55 insertions(+), 75 deletions(-) diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx index 915b46c5691..dae2c466266 100644 --- a/python/cudf/cudf/_lib/merge.pyx +++ b/python/cudf/cudf/_lib/merge.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -10,79 +10,43 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.merge cimport merge as cpp_merge from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def merge_sorted( - object tables, - object keys=None, - bool by_index=False, - bool ignore_index=False, + list input_columns, + list key_columns_indices, bool ascending=True, - object na_position="last", + str na_position="last", ): - cdef vector[libcudf_types.size_type] c_column_keys + """Merge multiple lists of lexicographically sorted columns into one list + of sorted columns. `input_columns` is a list of lists of columns to be + merged.
+ """ + cdef vector[libcudf_types.size_type] c_column_keys = key_columns_indices cdef vector[table_view] c_input_tables cdef vector[libcudf_types.order] c_column_order cdef vector[libcudf_types.null_order] c_null_precedence - cdef libcudf_types.order column_order - cdef libcudf_types.null_order null_precedence - cdef source_table - # Create vector of tables - # Use metadata from 0th table for names, etc - c_input_tables.reserve(len(tables)) - for source_table in tables: + c_input_tables.reserve(len(input_columns)) + for source_columns in input_columns: c_input_tables.push_back( - table_view_from_table(source_table, ignore_index)) - source_table = tables[0] + table_view_from_columns(source_columns)) - # Define sorting order and null precedence - column_order = (libcudf_types.order.ASCENDING - if ascending - else libcudf_types.order.DESCENDING) + num_keys = len(key_columns_indices) - if ascending is False: - if na_position == "last": - na_position = "first" - else: - na_position = "last" - null_precedence = ( + cdef libcudf_types.order column_order = ( + libcudf_types.order.ASCENDING if ascending + else libcudf_types.order.DESCENDING + ) + c_column_order = vector[libcudf_types.order](num_keys, column_order) + + if not ascending: + na_position = "last" if na_position == "first" else "first" + cdef libcudf_types.null_order null_precedence = ( libcudf_types.null_order.BEFORE if na_position == "first" else libcudf_types.null_order.AFTER ) - - # Determine index-column offset and index names - if ignore_index: - num_index_columns = 0 - index_names = None - else: - num_index_columns = ( - 0 if source_table._index is None - else source_table._index._num_columns - ) - index_names = source_table._index_names - - # Define C vectors for each key column - if not by_index and keys is not None: - num_keys = len(keys) - c_column_keys.reserve(num_keys) - for name in keys: - c_column_keys.push_back( - num_index_columns + source_table._column_names.index(name) - ) - else: - if 
by_index: - start = 0 - stop = num_index_columns - else: - start = num_index_columns - stop = num_index_columns + source_table._num_columns - num_keys = stop - start - c_column_keys.reserve(num_keys) - for key in range(start, stop): - c_column_keys.push_back(key) - c_column_order = vector[libcudf_types.order](num_keys, column_order) c_null_precedence = vector[libcudf_types.null_order]( num_keys, null_precedence @@ -100,8 +64,4 @@ def merge_sorted( ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=index_names, - ) + return columns_from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index f58c93aa0dc..5977b63777f 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -772,10 +772,10 @@ def merge_sorted( Parameters ---------- - objs : list of DataFrame, Series, or Index + objs : list of DataFrame or Series keys : list, default None List of Column names to sort by. If None, all columns used - (Ignored if `index=True`) + (Ignored if `by_index=True`) by_index : bool, default False Use index for sorting. 
`keys` input will be ignored if True ignore_index : bool, default False @@ -806,18 +806,38 @@ def merge_sorted( if by_index and ignore_index: raise ValueError("`by_index` and `ignore_index` cannot both be True") - result = objs[0].__class__._from_data( - *cudf._lib.merge.merge_sorted( - objs, - keys=keys, - by_index=by_index, - ignore_index=ignore_index, + if by_index: + key_columns_indices = list(range(0, objs[0]._index.nlevels)) + else: + if keys is None: + key_columns_indices = list(range(0, objs[0]._num_columns)) + else: + key_columns_indices = [ + objs[0]._column_names.index(key) for key in keys + ] + if not ignore_index: + key_columns_indices = [ + idx + objs[0]._index.nlevels for idx in key_columns_indices + ] + + columns = [ + [ + *(obj._index._data.columns if not ignore_index else ()), + *obj._columns, + ] + for obj in objs + ] + + return objs[0]._from_columns_like_self( + cudf._lib.merge.merge_sorted( + input_columns=columns, + key_columns_indices=key_columns_indices, ascending=ascending, na_position=na_position, - ) + ), + column_names=objs[0]._column_names, + index_names=None if ignore_index else objs[0]._index_names, ) - result._copy_type_metadata(objs[0]) - return result def _pivot(df, index, columns):