Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cython API refactor: merge.pyx #10698

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 20 additions & 64 deletions python/cudf/cudf/_lib/merge.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
Expand All @@ -10,79 +10,39 @@ from cudf._lib.column cimport Column
from cudf._lib.cpp.merge cimport merge as cpp_merge
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns


def merge_sorted(
object tables,
object keys=None,
bool by_index=False,
bool ignore_index=False,
list list_of_columns,
isVoid marked this conversation as resolved.
Show resolved Hide resolved
list key_columns_indices,
bool ascending=True,
object na_position="last",
str na_position="last",
):
cdef vector[libcudf_types.size_type] c_column_keys
cdef vector[libcudf_types.size_type] c_column_keys = key_columns_indices
cdef vector[table_view] c_input_tables
cdef vector[libcudf_types.order] c_column_order
cdef vector[libcudf_types.null_order] c_null_precedence
cdef libcudf_types.order column_order
cdef libcudf_types.null_order null_precedence
cdef source_table

# Create vector of tables
# Use metadata from 0th table for names, etc
c_input_tables.reserve(len(tables))
for source_table in tables:
c_input_tables.reserve(len(list_of_columns))
for source_columns in list_of_columns:
c_input_tables.push_back(
table_view_from_table(source_table, ignore_index))
source_table = tables[0]
table_view_from_columns(source_columns))

# Define sorting order and null precedence
column_order = (libcudf_types.order.ASCENDING
if ascending
else libcudf_types.order.DESCENDING)
num_keys = len(key_columns_indices)

if ascending is False:
if na_position == "last":
na_position = "first"
else:
na_position = "last"
null_precedence = (
cdef libcudf_types.order column_order = (
libcudf_types.order.ASCENDING if ascending
else libcudf_types.order.DESCENDING
)
c_column_order = vector[libcudf_types.order](num_keys, column_order)

if not ascending:
na_position = "last" if na_position == "first" else "first"
cdef libcudf_types.null_order null_precedence = (
libcudf_types.null_order.BEFORE if na_position == "first"
else libcudf_types.null_order.AFTER
)

# Determine index-column offset and index names
if ignore_index:
num_index_columns = 0
index_names = None
else:
num_index_columns = (
0 if source_table._index is None
else source_table._index._num_columns
)
index_names = source_table._index_names

# Define C vectors for each key column
if not by_index and keys is not None:
num_keys = len(keys)
c_column_keys.reserve(num_keys)
for name in keys:
c_column_keys.push_back(
num_index_columns + source_table._column_names.index(name)
)
else:
if by_index:
start = 0
stop = num_index_columns
else:
start = num_index_columns
stop = num_index_columns + source_table._num_columns
num_keys = stop - start
c_column_keys.reserve(num_keys)
for key in range(start, stop):
c_column_keys.push_back(key)
c_column_order = vector[libcudf_types.order](num_keys, column_order)
c_null_precedence = vector[libcudf_types.null_order](
num_keys,
null_precedence
Expand All @@ -100,8 +60,4 @@ def merge_sorted(
)
)

return data_from_unique_ptr(
move(c_result),
column_names=source_table._column_names,
index_names=index_names,
)
return columns_from_unique_ptr(move(c_result))
42 changes: 31 additions & 11 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,10 +772,10 @@ def merge_sorted(

Parameters
----------
objs : list of DataFrame, Series, or Index
objs : list of DataFrame or Series
keys : list, default None
List of column names to sort by. If None, all columns are used.
(Ignored if `index=True`)
(Ignored if `by_index=True`)
by_index : bool, default False
Use index for sorting. `keys` input will be ignored if True
ignore_index : bool, default False
Expand Down Expand Up @@ -806,18 +806,38 @@ def merge_sorted(
if by_index and ignore_index:
raise ValueError("`by_index` and `ignore_index` cannot both be True")

result = objs[0].__class__._from_data(
*cudf._lib.merge.merge_sorted(
objs,
keys=keys,
by_index=by_index,
ignore_index=ignore_index,
if by_index:
key_columns_indices = list(range(0, objs[0]._index.nlevels))
else:
if keys is None:
key_columns_indices = list(range(0, objs[0]._num_columns))
else:
key_columns_indices = [
objs[0]._column_names.index(key) for key in keys
]
if not ignore_index:
key_columns_indices = [
idx + objs[0]._index.nlevels for idx in key_columns_indices
]

columns = [
[
*(obj._index._data.columns if not ignore_index else ()),
*obj._columns,
]
for obj in objs
]

return objs[0]._from_columns_like_self(
cudf._lib.merge.merge_sorted(
list_of_columns=columns,
key_columns_indices=key_columns_indices,
ascending=ascending,
na_position=na_position,
)
),
column_names=objs[0]._column_names,
index_names=None if ignore_index else objs[0]._index_names,
)
result._copy_type_metadata(objs[0])
return result


def _pivot(df, index, columns):
Expand Down