Skip to content

Commit

Permalink
Cython API Refactor: transpose.pyx, sort.pyx (#10675)
Browse files Browse the repository at this point in the history
This PR contributes to #10153 by refactoring all Cython APIs in `transpose.pyx` and `sort.pyx` to accept a list of columns as input.

This PR also includes several minor improvements to the code base; see the comments below for details.

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)

URL: #10675
  • Loading branch information
isVoid authored Apr 19, 2022
1 parent 304711a commit 31a5f44
Show file tree
Hide file tree
Showing 9 changed files with 222 additions and 246 deletions.
101 changes: 40 additions & 61 deletions python/cudf/cudf/_lib/sort.pyx
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.

import pandas as pd
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
Expand All @@ -23,19 +21,19 @@ from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport null_order, null_policy, order
from cudf._lib.sort cimport underlying_type_t_rank_method
from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns


def is_sorted(
source_table, object ascending=None, object null_position=None
list source_columns, object ascending=None, object null_position=None
):
"""
Checks whether the rows of a `table` are sorted in lexicographical order.
Parameters
----------
source_table : Frame
Frame whose columns are to be checked for sort order
source_columns : list of columns
columns to be checked for sort order
ascending : None or list-like of booleans
None or list-like of boolean values indicating expected sort order of
each column. If list-like, size of list-like must be len(columns). If
Expand All @@ -58,51 +56,39 @@ def is_sorted(
cdef vector[null_order] null_precedence

if ascending is None:
column_order = vector[order](
source_table._num_columns, order.ASCENDING
)
elif pd.api.types.is_list_like(ascending):
if len(ascending) != source_table._num_columns:
column_order = vector[order](len(source_columns), order.ASCENDING)
else:
if len(ascending) != len(source_columns):
raise ValueError(
f"Expected a list-like of length {source_table._num_columns}, "
f"Expected a list-like of length {len(source_columns)}, "
f"got length {len(ascending)} for `ascending`"
)
column_order = vector[order](
source_table._num_columns, order.DESCENDING
len(source_columns), order.DESCENDING
)
for idx, val in enumerate(ascending):
if val:
column_order[idx] = order.ASCENDING
else:
raise TypeError(
f"Expected a list-like or None for `ascending`, got "
f"{type(ascending)}"
)

if null_position is None:
null_precedence = vector[null_order](
source_table._num_columns, null_order.AFTER
len(source_columns), null_order.AFTER
)
elif pd.api.types.is_list_like(null_position):
if len(null_position) != source_table._num_columns:
else:
if len(null_position) != len(source_columns):
raise ValueError(
f"Expected a list-like of length {source_table._num_columns}, "
f"Expected a list-like of length {len(source_columns)}, "
f"got length {len(null_position)} for `null_position`"
)
null_precedence = vector[null_order](
source_table._num_columns, null_order.AFTER
len(source_columns), null_order.AFTER
)
for idx, val in enumerate(null_position):
if val:
null_precedence[idx] = null_order.BEFORE
else:
raise TypeError(
f"Expected a list-like or None for `null_position`, got "
f"{type(null_position)}"
)

cdef bool c_result
cdef table_view source_table_view = table_view_from_table(source_table)
cdef table_view source_table_view = table_view_from_columns(source_columns)
with nogil:
c_result = cpp_is_sorted(
source_table_view,
Expand All @@ -113,34 +99,34 @@ def is_sorted(
return c_result


def order_by(source_table, object ascending, str na_position):
def order_by(list columns_from_table, object ascending, str na_position):
"""
Sorting the table ascending/descending
Get index to sort the table in ascending/descending order.
Parameters
----------
source_table : table which will be sorted
ascending : list of boolean values which correspond to each column
columns_from_table : columns from the table which will be sorted
ascending : sequence of boolean values which correspond to each column
in source_table signifying order of each column
True - Ascending and False - Descending
na_position : whether null value should show up at the "first" or "last"
position of **all** sorted column.
"""
cdef table_view source_table_view = table_view_from_table(
source_table, ignore_index=True
cdef table_view source_table_view = table_view_from_columns(
columns_from_table
)
cdef vector[order] column_order
column_order.reserve(len(ascending))
cdef vector[null_order] null_precedence
null_precedence.reserve(len(ascending))

for i in ascending:
if i is True:
for asc in ascending:
if asc:
column_order.push_back(order.ASCENDING)
else:
column_order.push_back(order.DESCENDING)

if i ^ (na_position == "first"):
if asc ^ (na_position == "first"):
null_precedence.push_back(null_order.AFTER)
else:
null_precedence.push_back(null_order.BEFORE)
Expand All @@ -154,21 +140,21 @@ def order_by(source_table, object ascending, str na_position):
return Column.from_unique_ptr(move(c_result))


def digitize(source_values_table, bins, bool right=False):
def digitize(list source_columns, list bins, bool right=False):
"""
Return the indices of the bins to which each value in source_table belongs.
Parameters
----------
source_table : Input table to be binned.
bins : Frame containing columns of bins
source_columns : Input columns to be binned.
bins : List containing columns of bins
right : Indicating whether the intervals include the
right or the left bin edge.
"""

cdef table_view bins_view = table_view_from_table(bins)
cdef table_view source_values_table_view = table_view_from_table(
source_values_table
cdef table_view bins_view = table_view_from_columns(bins)
cdef table_view source_table_view = table_view_from_columns(
source_columns
)
cdef vector[order] column_order = (
vector[order](
Expand All @@ -184,19 +170,19 @@ def digitize(source_values_table, bins, bool right=False):
)

cdef unique_ptr[column] c_result
if right is True:
if right:
with nogil:
c_result = move(lower_bound(
bins_view,
source_values_table_view,
source_table_view,
column_order,
null_precedence)
)
else:
with nogil:
c_result = move(upper_bound(
bins_view,
source_values_table_view,
source_table_view,
column_order,
null_precedence)
)
Expand All @@ -212,15 +198,13 @@ class RankMethod(IntEnum):
DENSE = < underlying_type_t_rank_method > rank_method.DENSE


def rank_columns(source_table, object method, str na_option,
def rank_columns(list source_columns, object method, str na_option,
bool ascending, bool pct
):
"""
Compute numerical data ranks (1 through n) of each column in the dataframe
"""
cdef table_view source_table_view = table_view_from_table(
source_table, ignore_index=True
)
cdef table_view source_table_view = table_view_from_columns(source_columns)

cdef rank_method c_rank_method = < rank_method > (
< underlying_type_t_rank_method > method
Expand Down Expand Up @@ -260,7 +244,7 @@ def rank_columns(source_table, object method, str na_option,
cdef vector[unique_ptr[column]] c_results
cdef column_view c_view
cdef Column col
for col in source_table._columns:
for col in source_columns:
c_view = col.view()
with nogil:
c_results.push_back(move(
Expand All @@ -274,11 +258,6 @@ def rank_columns(source_table, object method, str na_option,
)
))

cdef unique_ptr[table] c_result
c_result.reset(new table(move(c_results)))
data, _ = data_from_unique_ptr(
move(c_result),
column_names=source_table._column_names,
index_names=None
)
return data, source_table._index
return [Column.from_unique_ptr(
move(c_results[i])
) for i in range(c_results.size())]
60 changes: 7 additions & 53 deletions python/cudf/cudf/_lib/transpose.pyx
Original file line number Diff line number Diff line change
@@ -1,73 +1,27 @@
# Copyright (c) 2020, NVIDIA CORPORATION.

import cudf
from cudf.api.types import is_categorical_dtype
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.pair cimport pair
from libcpp.utility cimport move

from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.transpose cimport transpose as cpp_transpose
from cudf._lib.utils cimport data_from_table_view, table_view_from_table

from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns

def transpose(source):
"""Transpose index and columns.

See Also
--------
cudf.core.DataFrame.transpose
def transpose(list source_columns):
"""Transpose m n-row columns into n m-row columns
"""

if source._num_columns == 0:
return source

cats = None
columns = source._columns
dtype = columns[0].dtype

if is_categorical_dtype(dtype):
if any(not is_categorical_dtype(c.dtype) for c in columns):
raise ValueError('Columns must all have the same dtype')
cats = list(c.categories for c in columns)
cats = cudf.core.column.concat_columns(cats).unique()
source = cudf.core.frame.Frame(index=source._index, data=[
(name, col._set_categories(cats, is_unique=True).codes)
for name, col in source._data.items()
])
elif any(c.dtype != dtype for c in columns):
raise ValueError('Columns must all have the same dtype')

cdef pair[unique_ptr[column], table_view] c_result
cdef table_view c_input = table_view_from_table(
source, ignore_index=True)
cdef table_view c_input = table_view_from_columns(source_columns)

with nogil:
c_result = move(cpp_transpose(c_input))

result_owner = Column.from_unique_ptr(move(c_result.first))
data, _ = data_from_table_view(
return columns_from_table_view(
c_result.second,
owner=result_owner,
column_names=range(c_input.num_rows())
owners=[result_owner] * c_result.second.num_columns()
)

if cats is not None:
data= [
(name, cudf.core.column.column.build_categorical_column(
codes=cudf.core.column.column.build_column(
col.base_data, dtype=col.dtype),
mask=col.base_mask,
size=col.size,
categories=cats,
offset=col.offset,
))
for name, col in data.items()
]

return data
8 changes: 4 additions & 4 deletions python/cudf/cudf/_lib/utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -317,10 +317,10 @@ cdef columns_from_table_view(
):
"""
Given a ``cudf::table_view``, constructs a list of columns from it,
along with referencing an ``owner`` Python object that owns the memory
lifetime. ``owner`` must be either None or a list of column. If ``owner``
is a list of columns, the owner of the `i`th ``cudf::column_view`` in the
table view is ``owners[i]``. For more about memory ownership,
along with referencing an owner Python object that owns the memory
lifetime. owner must be either None or a list of column. If owner
is a list of columns, the owner of the `i`th ``cudf::column_view``
in the table view is ``owners[i]``. For more about memory ownership,
see ``Column.from_column_view``.
"""

Expand Down
4 changes: 1 addition & 3 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,4 @@ def digitize(
if bin_col.nullable:
raise ValueError("`bins` cannot contain null entries.")

return as_column(
libcudf.sort.digitize(column.as_frame(), bin_col.as_frame(), right)
)
return as_column(libcudf.sort.digitize([column], [bin_col], right))
37 changes: 31 additions & 6 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3194,17 +3194,42 @@ def transpose(self):
Difference from pandas:
Not supporting *copy* because default and only behavior is copy=True
"""
# Never transpose a MultiIndex - remove the existing columns and
# replace with a RangeIndex. Afterward, reassign.
columns = self.index.copy(deep=False)

index = self._data.to_pandas_index()
columns = self.index.copy(deep=False)
if self._num_columns == 0 or self._num_rows == 0:
return DataFrame(index=index, columns=columns)

# No column from index is transposed with libcudf.
source_columns = [*self._columns]
source_dtype = source_columns[0].dtype
if is_categorical_dtype(source_dtype):
if any(not is_categorical_dtype(c.dtype) for c in source_columns):
raise ValueError("Columns must all have the same dtype")
cats = list(c.categories for c in source_columns)
cats = cudf.core.column.concat_columns(cats).unique()
source_columns = [
col._set_categories(cats, is_unique=True).codes
for col in source_columns
]

if any(c.dtype != source_columns[0].dtype for c in source_columns):
raise ValueError("Columns must all have the same dtype")

result_columns = libcudf.transpose.transpose(source_columns)

if is_categorical_dtype(source_dtype):
result_columns = [
codes._with_type_metadata(
cudf.core.dtypes.CategoricalDtype(categories=cats)
)
for codes in result_columns
]

# Set the old column names as the new index
result = self.__class__._from_data(
# Cython renames the columns to the range [0...ncols]
libcudf.transpose.transpose(self),
as_index(index),
{i: col for i, col in enumerate(result_columns)},
index=as_index(index),
)
# Set the old index as the new column names
result.columns = columns
Expand Down
Loading

0 comments on commit 31a5f44

Please sign in to comment.