Cython API Refactor: transpose.pyx, sort.pyx #10675

Merged
101 changes: 40 additions & 61 deletions python/cudf/cudf/_lib/sort.pyx
@@ -1,6 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.

import pandas as pd
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
@@ -23,19 +21,19 @@ from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport null_order, null_policy, order
from cudf._lib.sort cimport underlying_type_t_rank_method
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns


def is_sorted(
source_table, object ascending=None, object null_position=None
list source_columns, object ascending=None, object null_position=None
):
"""
Checks whether the rows of a `table` are sorted in lexicographical order.

Parameters
----------
source_table : Frame
Frame whose columns are to be checked for sort order
source_columns : list of columns
columns to be checked for sort order
ascending : None or list-like of booleans
None or list-like of boolean values indicating expected sort order of
each column. If list-like, size of list-like must be len(columns). If
@@ -58,51 +56,39 @@ def is_sorted(
cdef vector[null_order] null_precedence

if ascending is None:
column_order = vector[order](
source_table._num_columns, order.ASCENDING
)
elif pd.api.types.is_list_like(ascending):
if len(ascending) != source_table._num_columns:
column_order = vector[order](len(source_columns), order.ASCENDING)
else:
if len(ascending) != len(source_columns):
raise ValueError(
f"Expected a list-like of length {source_table._num_columns}, "
f"Expected a list-like of length {len(source_columns)}, "
f"got length {len(ascending)} for `ascending`"
)
column_order = vector[order](
source_table._num_columns, order.DESCENDING
len(source_columns), order.DESCENDING
)
for idx, val in enumerate(ascending):
if val:
column_order[idx] = order.ASCENDING
else:
raise TypeError(
f"Expected a list-like or None for `ascending`, got "
f"{type(ascending)}"
)

if null_position is None:
null_precedence = vector[null_order](
source_table._num_columns, null_order.AFTER
len(source_columns), null_order.AFTER
)
elif pd.api.types.is_list_like(null_position):
if len(null_position) != source_table._num_columns:
else:
if len(null_position) != len(source_columns):
raise ValueError(
f"Expected a list-like of length {source_table._num_columns}, "
f"Expected a list-like of length {len(source_columns)}, "
f"got length {len(null_position)} for `null_position`"
)
null_precedence = vector[null_order](
source_table._num_columns, null_order.AFTER
len(source_columns), null_order.AFTER
)
for idx, val in enumerate(null_position):
if val:
null_precedence[idx] = null_order.BEFORE
else:
raise TypeError(
f"Expected a list-like or None for `null_position`, got "
f"{type(null_position)}"
)

cdef bool c_result
cdef table_view source_table_view = table_view_from_table(source_table)
cdef table_view source_table_view = table_view_from_columns(source_columns)
with nogil:
c_result = cpp_is_sorted(
source_table_view,
@@ -113,34 +99,34 @@ def is_sorted(
return c_result
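
For reference, a minimal usage sketch of the new calling convention (an assumption-laden example, not part of the diff): the Cython layer now receives a plain list of column objects instead of a Frame, and the length checks above run against that list. `_columns` is an internal Frame attribute, used here only for illustration.

# Sketch, assuming cuDF with this PR applied; not part of the diff.
import cudf
from cudf._lib import sort as libcudf_sort

df = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 2.0, 1.0]})
result = libcudf_sort.is_sorted(
    [*df._columns],           # plain list of Column objects, no Frame
    ascending=[True, False],  # one expected order flag per column
    null_position=None,       # None means nulls expected AFTER everywhere
)
# result is True: "a" is ascending and "b" is descending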


def order_by(source_table, object ascending, str na_position):
def order_by(list columns_from_table, object ascending, str na_position):
"""
Sorting the table ascending/descending
Get index to sort the table in ascending/descending order.

Parameters
----------
source_table : table which will be sorted
ascending : list of boolean values which correspond to each column
columns_from_table : columns from the table which will be sorted
ascending : sequence of boolean values which correspond to each column
in source_table signifying order of each column
True - Ascending and False - Descending
na_position : whether null value should show up at the "first" or "last"
position of **all** sorted column.
"""
cdef table_view source_table_view = table_view_from_table(
source_table, ignore_index=True
cdef table_view source_table_view = table_view_from_columns(
columns_from_table
)
cdef vector[order] column_order
column_order.reserve(len(ascending))
cdef vector[null_order] null_precedence
null_precedence.reserve(len(ascending))

for i in ascending:
if i is True:
for asc in ascending:
if asc:
column_order.push_back(order.ASCENDING)
else:
column_order.push_back(order.DESCENDING)

if i ^ (na_position == "first"):
if asc ^ (na_position == "first"):
null_precedence.push_back(null_order.AFTER)
else:
null_precedence.push_back(null_order.BEFORE)
@@ -154,21 +140,21 @@ def order_by(source_table, object ascending, str na_position):
return Column.from_unique_ptr(move(c_result))
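
Likewise, a hedged sketch of calling the refactored order_by: it takes the key columns as a list and returns a gather map column rather than a sorted table.

# Sketch, reusing df and libcudf_sort from the example above.
keys = [*df._columns]
sort_map = libcudf_sort.order_by(keys, [False, True], "last")
# sort_map is an integer Column of row indices, [2, 1, 0] for this data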


def digitize(source_values_table, bins, bool right=False):
def digitize(list source_columns, list bins, bool right=False):
"""
Return the indices of the bins to which each value in source_table belongs.

Parameters
----------
source_table : Input table to be binned.
bins : Frame containing columns of bins
source_columns : Input columns to be binned.
bins : List containing columns of bins
right : Indicating whether the intervals include the
right or the left bin edge.
"""

cdef table_view bins_view = table_view_from_table(bins)
cdef table_view source_values_table_view = table_view_from_table(
source_values_table
cdef table_view bins_view = table_view_from_columns(bins)
cdef table_view source_table_view = table_view_from_columns(
source_columns
)
cdef vector[order] column_order = (
vector[order](
@@ -184,19 +170,19 @@ def digitize(source_values_table, bins, bool right=False):
)

cdef unique_ptr[column] c_result
if right is True:
if right:
with nogil:
c_result = move(lower_bound(
bins_view,
source_values_table_view,
source_table_view,
column_order,
null_precedence)
)
else:
with nogil:
c_result = move(upper_bound(
bins_view,
source_values_table_view,
source_table_view,
column_order,
null_precedence)
)
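
A hedged sketch of the new digitize signature, which now takes lists of columns for both the values and the bin edges; as_column is cuDF's internal column constructor and appears here purely for illustration.

# Sketch, assuming cuDF with this PR applied.
from cudf.core.column import as_column

values = as_column([0.5, 2.5, 7.0])
bins = as_column([1.0, 3.0, 5.0])
bin_ids = libcudf_sort.digitize([values], [bins], right=False)
# bin_ids is a Column of bin indices, [0, 1, 3] for this data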
@@ -212,15 +198,13 @@ class RankMethod(IntEnum):
DENSE = < underlying_type_t_rank_method > rank_method.DENSE


def rank_columns(source_table, object method, str na_option,
def rank_columns(list source_columns, object method, str na_option,
bool ascending, bool pct
):
"""
Compute numerical data ranks (1 through n) of each column in the dataframe
"""
cdef table_view source_table_view = table_view_from_table(
source_table, ignore_index=True
)
cdef table_view source_table_view = table_view_from_columns(source_columns)

cdef rank_method c_rank_method = < rank_method > (
< underlying_type_t_rank_method > method
@@ -260,7 +244,7 @@ def rank_columns(source_table, object method, str na_option,
cdef vector[unique_ptr[column]] c_results
cdef column_view c_view
cdef Column col
for col in source_table._columns:
for col in source_columns:
c_view = col.view()
with nogil:
c_results.push_back(move(
@@ -274,11 +258,6 @@ def rank_columns(source_table, object method, str na_option,
)
))

cdef unique_ptr[table] c_result
c_result.reset(new table(move(c_results)))
data, _ = data_from_unique_ptr(
move(c_result),
column_names=source_table._column_names,
index_names=None
)
return data, source_table._index
return [Column.from_unique_ptr(
move(c_results[i])
) for i in range(c_results.size())]
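
A sketch of the new return shape: rank_columns now hands back a bare list of rank columns, and reattaching column names and the index is left to the Python Frame layer.

# Sketch, reusing df and libcudf_sort from the examples above; the
# RankMethod enum is defined in this module.
from cudf._lib.sort import RankMethod

ranked = libcudf_sort.rank_columns(
    [*df._columns], RankMethod.AVERAGE, "keep", True, False
)
# ranked is a plain list with one rank Column per input column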
60 changes: 7 additions & 53 deletions python/cudf/cudf/_lib/transpose.pyx
@@ -1,73 +1,27 @@
# Copyright (c) 2020, NVIDIA CORPORATION.

import cudf
from cudf.api.types import is_categorical_dtype
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.pair cimport pair
from libcpp.utility cimport move

from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.transpose cimport transpose as cpp_transpose
from cudf._lib.utils cimport data_from_table_view, table_view_from_table

from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns

def transpose(source):
"""Transpose index and columns.

See Also
--------
cudf.core.DataFrame.transpose
def transpose(list source_columns):
Review comment from @isVoid (Contributor, Author), Apr 16, 2022:

This change to transpose converts the categorical columns into numerical code columns. These calls depend on higher-level/external APIs, which I would like to avoid in Cython, so I moved them to the Python API.

"""Transpose m n-row columns into n m-row columns
"""

if source._num_columns == 0:
return source

cats = None
columns = source._columns
dtype = columns[0].dtype

if is_categorical_dtype(dtype):
if any(not is_categorical_dtype(c.dtype) for c in columns):
raise ValueError('Columns must all have the same dtype')
cats = list(c.categories for c in columns)
cats = cudf.core.column.concat_columns(cats).unique()
source = cudf.core.frame.Frame(index=source._index, data=[
(name, col._set_categories(cats, is_unique=True).codes)
for name, col in source._data.items()
])
elif any(c.dtype != dtype for c in columns):
raise ValueError('Columns must all have the same dtype')

cdef pair[unique_ptr[column], table_view] c_result
cdef table_view c_input = table_view_from_table(
source, ignore_index=True)
cdef table_view c_input = table_view_from_columns(source_columns)

with nogil:
c_result = move(cpp_transpose(c_input))

result_owner = Column.from_unique_ptr(move(c_result.first))
data, _ = data_from_table_view(
return columns_from_table_view(
c_result.second,
owner=result_owner,
column_names=range(c_input.num_rows())
owners=[result_owner] * c_result.second.num_columns()
)

if cats is not None:
data= [
(name, cudf.core.column.column.build_categorical_column(
codes=cudf.core.column.column.build_column(
col.base_data, dtype=col.dtype),
mask=col.base_mask,
size=col.size,
categories=cats,
offset=col.offset,
))
for name, col in data.items()
]

return data
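
A hedged sketch of the slimmed-down Cython entry point: it now just transposes m n-row columns of a single dtype into n m-row columns and returns them as a list; all categorical handling has moved up into DataFrame.transpose (see the dataframe.py change below).

# Sketch, assuming cuDF with this PR applied.
from cudf._lib import transpose as libcudf_transpose
from cudf.core.column import as_column

cols = [as_column([1, 2, 3]), as_column([4, 5, 6])]   # two 3-row columns
flipped = libcudf_transpose.transpose(cols)
# flipped is a list of three 2-row columns: [1, 4], [2, 5], [3, 6]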
8 changes: 4 additions & 4 deletions python/cudf/cudf/_lib/utils.pyx
@@ -317,10 +317,10 @@ cdef columns_from_table_view(
):
"""
Given a ``cudf::table_view``, constructs a list of columns from it,
along with referencing an ``owner`` Python object that owns the memory
lifetime. ``owner`` must be either None or a list of column. If ``owner``
is a list of columns, the owner of the `i`th ``cudf::column_view`` in the
table view is ``owners[i]``. For more about memory ownership,
along with referencing an owner Python object that owns the memory
lifetime. owner must be either None or a list of column. If owner
is a list of columns, the owner of the `i`th ``cudf::column_view``
in the table view is ``owners[i]``. For more about memory ownership,
Comment on lines +320 to +323, from @isVoid (Contributor, Author), Apr 16, 2022:

The current docstring refers to ``owner`` as an argument with backticks, but ``owners`` (plural) is the actual argument here.

see ``Column.from_column_view``.
"""

4 changes: 1 addition & 3 deletions python/cudf/cudf/core/column/numerical.py
@@ -774,6 +774,4 @@ def digitize(
if bin_col.nullable:
raise ValueError("`bins` cannot contain null entries.")

return as_column(
libcudf.sort.digitize(column.as_frame(), bin_col.as_frame(), right)
)
return as_column(libcudf.sort.digitize([column], [bin_col], right))
37 changes: 31 additions & 6 deletions python/cudf/cudf/core/dataframe.py
@@ -3193,17 +3193,42 @@ def transpose(self):
Difference from pandas:
Not supporting *copy* because default and only behavior is copy=True
"""
# Never transpose a MultiIndex - remove the existing columns and
# replace with a RangeIndex. Afterward, reassign.
columns = self.index.copy(deep=False)

index = self._data.to_pandas_index()
columns = self.index.copy(deep=False)
if self._num_columns == 0 or self._num_rows == 0:
return DataFrame(index=index, columns=columns)

# No column from index is transposed with libcudf.
source_columns = [*self._columns]
source_dtype = source_columns[0].dtype
if is_categorical_dtype(source_dtype):
if any(not is_categorical_dtype(c.dtype) for c in source_columns):
raise ValueError("Columns must all have the same dtype")
cats = list(c.categories for c in source_columns)
cats = cudf.core.column.concat_columns(cats).unique()
source_columns = [
col._set_categories(cats, is_unique=True).codes
for col in source_columns
]

if any(c.dtype != source_columns[0].dtype for c in source_columns):
raise ValueError("Columns must all have the same dtype")

result_columns = libcudf.transpose.transpose(source_columns)

if is_categorical_dtype(source_dtype):
result_columns = [
codes._with_type_metadata(
cudf.core.dtypes.CategoricalDtype(categories=cats)
)
for codes in result_columns
]

# Set the old column names as the new index
result = self.__class__._from_data(
# Cython renames the columns to the range [0...ncols]
libcudf.transpose.transpose(self),
as_index(index),
{i: col for i, col in enumerate(result_columns)},
index=as_index(index),
)
# Set the old index as the new column names
result.columns = columns
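
For completeness, a short end-to-end example of the behaviour this Python-side block now owns: categorical columns are unified onto one category set, transposed as integer codes, and the categorical dtype is restored on the result (assuming cuDF with this PR).

# Sketch: the category unification and dtype restoration happen here in
# DataFrame.transpose, not in the Cython layer.
import cudf

gdf = cudf.DataFrame({
    "a": cudf.Series(["x", "y"], dtype="category"),
    "b": cudf.Series(["y", "z"], dtype="category"),
})
tdf = gdf.T
# every column of tdf is categorical over the unified categories {x, y, z}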