Skip to content

Commit

Permalink
Cython API refactor: merge.pyx (#10698)
Browse files Browse the repository at this point in the history
This PR refactors `merge_sorted` in `merge.pyx` to accept a list of columns. It contributes to #10153.

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)

URL: #10698
  • Loading branch information
isVoid authored Apr 22, 2022
1 parent 5053a1a commit d6e3068
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 75 deletions.
88 changes: 24 additions & 64 deletions python/cudf/cudf/_lib/merge.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
Expand All @@ -10,79 +10,43 @@ from cudf._lib.column cimport Column
from cudf._lib.cpp.merge cimport merge as cpp_merge
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns


def merge_sorted(
object tables,
object keys=None,
bool by_index=False,
bool ignore_index=False,
list input_columns,
list key_columns_indices,
bool ascending=True,
object na_position="last",
str na_position="last",
):
cdef vector[libcudf_types.size_type] c_column_keys
"""Merge multiple lists of lexicographically sorted columns into one list
of sorted columns. `input_columns` is a list of lists of columns to be
merged.
"""
cdef vector[libcudf_types.size_type] c_column_keys = key_columns_indices
cdef vector[table_view] c_input_tables
cdef vector[libcudf_types.order] c_column_order
cdef vector[libcudf_types.null_order] c_null_precedence
cdef libcudf_types.order column_order
cdef libcudf_types.null_order null_precedence
cdef source_table

# Create vector of tables
# Use metadata from 0th table for names, etc
c_input_tables.reserve(len(tables))
for source_table in tables:
c_input_tables.reserve(len(input_columns))
for source_columns in input_columns:
c_input_tables.push_back(
table_view_from_table(source_table, ignore_index))
source_table = tables[0]
table_view_from_columns(source_columns))

# Define sorting order and null precedence
column_order = (libcudf_types.order.ASCENDING
if ascending
else libcudf_types.order.DESCENDING)
num_keys = len(key_columns_indices)

if ascending is False:
if na_position == "last":
na_position = "first"
else:
na_position = "last"
null_precedence = (
cdef libcudf_types.order column_order = (
libcudf_types.order.ASCENDING if ascending
else libcudf_types.order.DESCENDING
)
c_column_order = vector[libcudf_types.order](num_keys, column_order)

if not ascending:
na_position = "last" if na_position == "first" else "first"
cdef libcudf_types.null_order null_precedence = (
libcudf_types.null_order.BEFORE if na_position == "first"
else libcudf_types.null_order.AFTER
)

# Determine index-column offset and index names
if ignore_index:
num_index_columns = 0
index_names = None
else:
num_index_columns = (
0 if source_table._index is None
else source_table._index._num_columns
)
index_names = source_table._index_names

# Define C vectors for each key column
if not by_index and keys is not None:
num_keys = len(keys)
c_column_keys.reserve(num_keys)
for name in keys:
c_column_keys.push_back(
num_index_columns + source_table._column_names.index(name)
)
else:
if by_index:
start = 0
stop = num_index_columns
else:
start = num_index_columns
stop = num_index_columns + source_table._num_columns
num_keys = stop - start
c_column_keys.reserve(num_keys)
for key in range(start, stop):
c_column_keys.push_back(key)
c_column_order = vector[libcudf_types.order](num_keys, column_order)
c_null_precedence = vector[libcudf_types.null_order](
num_keys,
null_precedence
Expand All @@ -100,8 +64,4 @@ def merge_sorted(
)
)

return data_from_unique_ptr(
move(c_result),
column_names=source_table._column_names,
index_names=index_names,
)
return columns_from_unique_ptr(move(c_result))
42 changes: 31 additions & 11 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,10 +772,10 @@ def merge_sorted(
Parameters
----------
objs : list of DataFrame, Series, or Index
objs : list of DataFrame or Series
keys : list, default None
List of Column names to sort by. If None, all columns used
(Ignored if `index=True`)
(Ignored if `by_index=True`)
by_index : bool, default False
Use index for sorting. `keys` input will be ignored if True
ignore_index : bool, default False
Expand Down Expand Up @@ -806,18 +806,38 @@ def merge_sorted(
if by_index and ignore_index:
raise ValueError("`by_index` and `ignore_index` cannot both be True")

result = objs[0].__class__._from_data(
*cudf._lib.merge.merge_sorted(
objs,
keys=keys,
by_index=by_index,
ignore_index=ignore_index,
if by_index:
key_columns_indices = list(range(0, objs[0]._index.nlevels))
else:
if keys is None:
key_columns_indices = list(range(0, objs[0]._num_columns))
else:
key_columns_indices = [
objs[0]._column_names.index(key) for key in keys
]
if not ignore_index:
key_columns_indices = [
idx + objs[0]._index.nlevels for idx in key_columns_indices
]

columns = [
[
*(obj._index._data.columns if not ignore_index else ()),
*obj._columns,
]
for obj in objs
]

return objs[0]._from_columns_like_self(
cudf._lib.merge.merge_sorted(
input_columns=columns,
key_columns_indices=key_columns_indices,
ascending=ascending,
na_position=na_position,
)
),
column_names=objs[0]._column_names,
index_names=None if ignore_index else objs[0]._index_names,
)
result._copy_type_metadata(objs[0])
return result


def _pivot(df, index, columns):
Expand Down

0 comments on commit d6e3068

Please sign in to comment.