Cython API Refactor: transpose.pyx, sort.pyx (#10675)

This PR contributes to #10153, refactors all cython APIs in `transpose.pyx`, `sort.pyx` to accept a list of columns as input. This PR also includes several minor improvements in the code base, see comments below for detail. Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: #10675
rapidsai · Apr 19, 2022 · 31a5f44 · 31a5f44
1 parent 304711a
commit 31a5f44
Show file tree

Hide file tree

Showing 9 changed files with 222 additions and 246 deletions.
diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx
@@ -1,6 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
-
-import pandas as pd
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
@@ -23,19 +21,19 @@ from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.cpp.types cimport null_order, null_policy, order
 from cudf._lib.sort cimport underlying_type_t_rank_method
-from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
+from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
 
 
 def is_sorted(
-    source_table, object ascending=None, object null_position=None
+    list source_columns, object ascending=None, object null_position=None
 ):
     """
     Checks whether the rows of a `table` are sorted in lexicographical order.
 
     Parameters
     ----------
-    source_table : Frame
-        Frame whose columns are to be checked for sort order
+    source_columns : list of columns
+        columns to be checked for sort order
     ascending : None or list-like of booleans
         None or list-like of boolean values indicating expected sort order of
         each column. If list-like, size of list-like must be len(columns). If
@@ -58,51 +56,39 @@ def is_sorted(
     cdef vector[null_order] null_precedence
 
     if ascending is None:
-        column_order = vector[order](
-            source_table._num_columns, order.ASCENDING
-        )
-    elif pd.api.types.is_list_like(ascending):
-        if len(ascending) != source_table._num_columns:
+        column_order = vector[order](len(source_columns), order.ASCENDING)
+    else:
+        if len(ascending) != len(source_columns):
             raise ValueError(
-                f"Expected a list-like of length {source_table._num_columns}, "
+                f"Expected a list-like of length {len(source_columns)}, "
                 f"got length {len(ascending)} for `ascending`"
             )
         column_order = vector[order](
-            source_table._num_columns, order.DESCENDING
+            len(source_columns), order.DESCENDING
         )
         for idx, val in enumerate(ascending):
             if val:
                 column_order[idx] = order.ASCENDING
-    else:
-        raise TypeError(
-            f"Expected a list-like or None for `ascending`, got "
-            f"{type(ascending)}"
-        )
 
     if null_position is None:
         null_precedence = vector[null_order](
-            source_table._num_columns, null_order.AFTER
+            len(source_columns), null_order.AFTER
         )
-    elif pd.api.types.is_list_like(null_position):
-        if len(null_position) != source_table._num_columns:
+    else:
+        if len(null_position) != len(source_columns):
             raise ValueError(
-                f"Expected a list-like of length {source_table._num_columns}, "
+                f"Expected a list-like of length {len(source_columns)}, "
                 f"got length {len(null_position)} for `null_position`"
             )
         null_precedence = vector[null_order](
-            source_table._num_columns, null_order.AFTER
+            len(source_columns), null_order.AFTER
         )
         for idx, val in enumerate(null_position):
             if val:
                 null_precedence[idx] = null_order.BEFORE
-    else:
-        raise TypeError(
-            f"Expected a list-like or None for `null_position`, got "
-            f"{type(null_position)}"
-        )
 
     cdef bool c_result
-    cdef table_view source_table_view = table_view_from_table(source_table)
+    cdef table_view source_table_view = table_view_from_columns(source_columns)
     with nogil:
         c_result = cpp_is_sorted(
             source_table_view,
@@ -113,34 +99,34 @@ def is_sorted(
     return c_result
 
 
-def order_by(source_table, object ascending, str na_position):
+def order_by(list columns_from_table, object ascending, str na_position):
     """
-    Sorting the table ascending/descending
+    Get index to sort the table in ascending/descending order.
 
     Parameters
     ----------
-    source_table : table which will be sorted
-    ascending : list of boolean values which correspond to each column
+    columns_from_table : columns from the table which will be sorted
+    ascending : sequence of boolean values which correspond to each column
                 in source_table signifying order of each column
                 True - Ascending and False - Descending
     na_position : whether null value should show up at the "first" or "last"
                 position of **all** sorted column.
     """
-    cdef table_view source_table_view = table_view_from_table(
-        source_table, ignore_index=True
+    cdef table_view source_table_view = table_view_from_columns(
+        columns_from_table
     )
     cdef vector[order] column_order
     column_order.reserve(len(ascending))
     cdef vector[null_order] null_precedence
     null_precedence.reserve(len(ascending))
 
-    for i in ascending:
-        if i is True:
+    for asc in ascending:
+        if asc:
             column_order.push_back(order.ASCENDING)
         else:
             column_order.push_back(order.DESCENDING)
 
-        if i ^ (na_position == "first"):
+        if asc ^ (na_position == "first"):
             null_precedence.push_back(null_order.AFTER)
         else:
             null_precedence.push_back(null_order.BEFORE)
@@ -154,21 +140,21 @@ def order_by(source_table, object ascending, str na_position):
     return Column.from_unique_ptr(move(c_result))
 
 
-def digitize(source_values_table, bins, bool right=False):
+def digitize(list source_columns, list bins, bool right=False):
     """
     Return the indices of the bins to which each value in source_table belongs.
 
     Parameters
     ----------
-    source_table : Input table to be binned.
-    bins : Frame containing columns of bins
+    source_columns : Input columns to be binned.
+    bins : List containing columns of bins
     right : Indicating whether the intervals include the
             right or the left bin edge.
     """
 
-    cdef table_view bins_view = table_view_from_table(bins)
-    cdef table_view source_values_table_view = table_view_from_table(
-        source_values_table
+    cdef table_view bins_view = table_view_from_columns(bins)
+    cdef table_view source_table_view = table_view_from_columns(
+        source_columns
     )
     cdef vector[order] column_order = (
         vector[order](
@@ -184,19 +170,19 @@ def digitize(source_values_table, bins, bool right=False):
     )
 
     cdef unique_ptr[column] c_result
-    if right is True:
+    if right:
         with nogil:
             c_result = move(lower_bound(
                 bins_view,
-                source_values_table_view,
+                source_table_view,
                 column_order,
                 null_precedence)
             )
     else:
         with nogil:
             c_result = move(upper_bound(
                 bins_view,
-                source_values_table_view,
+                source_table_view,
                 column_order,
                 null_precedence)
             )
@@ -212,15 +198,13 @@ class RankMethod(IntEnum):
     DENSE = < underlying_type_t_rank_method > rank_method.DENSE
 
 
-def rank_columns(source_table, object method, str na_option,
+def rank_columns(list source_columns, object method, str na_option,
                  bool ascending, bool pct
                  ):
     """
     Compute numerical data ranks (1 through n) of each column in the dataframe
     """
-    cdef table_view source_table_view = table_view_from_table(
-        source_table, ignore_index=True
-    )
+    cdef table_view source_table_view = table_view_from_columns(source_columns)
 
     cdef rank_method c_rank_method = < rank_method > (
         < underlying_type_t_rank_method > method
@@ -260,7 +244,7 @@ def rank_columns(source_table, object method, str na_option,
     cdef vector[unique_ptr[column]] c_results
     cdef column_view c_view
     cdef Column col
-    for col in source_table._columns:
+    for col in source_columns:
         c_view = col.view()
         with nogil:
             c_results.push_back(move(
@@ -274,11 +258,6 @@ def rank_columns(source_table, object method, str na_option,
                 )
             ))
 
-    cdef unique_ptr[table] c_result
-    c_result.reset(new table(move(c_results)))
-    data, _ = data_from_unique_ptr(
-        move(c_result),
-        column_names=source_table._column_names,
-        index_names=None
-    )
-    return data, source_table._index
+    return [Column.from_unique_ptr(
+        move(c_results[i])
+    ) for i in range(c_results.size())]
diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx
@@ -1,73 +1,27 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
-
-import cudf
-from cudf.api.types import is_categorical_dtype
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
 from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.cpp.transpose cimport transpose as cpp_transpose
-from cudf._lib.utils cimport data_from_table_view, table_view_from_table
-
+from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns
 
-def transpose(source):
-    """Transpose index and columns.
 
-    See Also
-    --------
-    cudf.core.DataFrame.transpose
+def transpose(list source_columns):
+    """Transpose m n-row columns into n m-row columns
     """
-
-    if source._num_columns == 0:
-        return source
-
-    cats = None
-    columns = source._columns
-    dtype = columns[0].dtype
-
-    if is_categorical_dtype(dtype):
-        if any(not is_categorical_dtype(c.dtype) for c in columns):
-            raise ValueError('Columns must all have the same dtype')
-        cats = list(c.categories for c in columns)
-        cats = cudf.core.column.concat_columns(cats).unique()
-        source = cudf.core.frame.Frame(index=source._index, data=[
-            (name, col._set_categories(cats, is_unique=True).codes)
-            for name, col in source._data.items()
-        ])
-    elif any(c.dtype != dtype for c in columns):
-        raise ValueError('Columns must all have the same dtype')
-
     cdef pair[unique_ptr[column], table_view] c_result
-    cdef table_view c_input = table_view_from_table(
-        source, ignore_index=True)
+    cdef table_view c_input = table_view_from_columns(source_columns)
 
     with nogil:
         c_result = move(cpp_transpose(c_input))
 
     result_owner = Column.from_unique_ptr(move(c_result.first))
-    data, _ = data_from_table_view(
+    return columns_from_table_view(
         c_result.second,
-        owner=result_owner,
-        column_names=range(c_input.num_rows())
+        owners=[result_owner] * c_result.second.num_columns()
     )
-
-    if cats is not None:
-        data= [
-            (name, cudf.core.column.column.build_categorical_column(
-                codes=cudf.core.column.column.build_column(
-                    col.base_data, dtype=col.dtype),
-                mask=col.base_mask,
-                size=col.size,
-                categories=cats,
-                offset=col.offset,
-            ))
-            for name, col in data.items()
-        ]
-
-    return data
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
@@ -317,10 +317,10 @@ cdef columns_from_table_view(
 ):
     """
     Given a ``cudf::table_view``, construsts a list of columns from it,
-    along with referencing an ``owner`` Python object that owns the memory
-    lifetime. ``owner`` must be either None or a list of column. If ``owner``
-    is a list of columns, the owner of the `i`th ``cudf::column_view`` in the
-    table view is ``owners[i]``. For more about memory ownership,
+    along with referencing an owner Python object that owns the memory
+    lifetime. owner must be either None or a list of column. If owner
+    is a list of columns, the owner of the `i`th ``cudf::column_view``
+    in the table view is ``owners[i]``. For more about memory ownership,
     see ``Column.from_column_view``.
     """
 

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -774,6 +774,4 @@ def digitize(
     if bin_col.nullable:
         raise ValueError("`bins` cannot contain null entries.")
 
-    return as_column(
-        libcudf.sort.digitize(column.as_frame(), bin_col.as_frame(), right)
-    )
+    return as_column(libcudf.sort.digitize([column], [bin_col], right))
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -3194,17 +3194,42 @@ def transpose(self):
         Difference from pandas:
         Not supporting *copy* because default and only behavior is copy=True
         """
-        # Never transpose a MultiIndex - remove the existing columns and
-        # replace with a RangeIndex. Afterward, reassign.
-        columns = self.index.copy(deep=False)
+
         index = self._data.to_pandas_index()
+        columns = self.index.copy(deep=False)
         if self._num_columns == 0 or self._num_rows == 0:
             return DataFrame(index=index, columns=columns)
+
+        # No column from index is transposed with libcudf.
+        source_columns = [*self._columns]
+        source_dtype = source_columns[0].dtype
+        if is_categorical_dtype(source_dtype):
+            if any(not is_categorical_dtype(c.dtype) for c in source_columns):
+                raise ValueError("Columns must all have the same dtype")
+            cats = list(c.categories for c in source_columns)
+            cats = cudf.core.column.concat_columns(cats).unique()
+            source_columns = [
+                col._set_categories(cats, is_unique=True).codes
+                for col in source_columns
+            ]
+
+        if any(c.dtype != source_columns[0].dtype for c in source_columns):
+            raise ValueError("Columns must all have the same dtype")
+
+        result_columns = libcudf.transpose.transpose(source_columns)
+
+        if is_categorical_dtype(source_dtype):
+            result_columns = [
+                codes._with_type_metadata(
+                    cudf.core.dtypes.CategoricalDtype(categories=cats)
+                )
+                for codes in result_columns
+            ]
+
         # Set the old column names as the new index
         result = self.__class__._from_data(
-            # Cython renames the columns to the range [0...ncols]
-            libcudf.transpose.transpose(self),
-            as_index(index),
+            {i: col for i, col in enumerate(result_columns)},
+            index=as_index(index),
         )
         # Set the old index as the new column names
         result.columns = columns