diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst new file mode 100644 index 00000000000..739305af5d4 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst @@ -0,0 +1,6 @@ +=========== +aggregation +=========== + +.. automodule:: cudf._lib.pylibcudf.aggregation + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst new file mode 100644 index 00000000000..d6e994f7dbc --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst @@ -0,0 +1,6 @@ +======= +groupby +======= + +.. automodule:: cudf._lib.pylibcudf.groupby + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 7504295de92..4735b0d9414 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -8,10 +8,12 @@ This page provides API documentation for pylibcudf. :maxdepth: 1 :caption: API Documentation + aggregation binaryop column copying gpumemoryview + groupby scalar table types diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 72c5e288f0b..b202d08ac2e 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from enum import Enum, IntEnum @@ -51,7 +51,7 @@ class AggregationKind(Enum): NUNIQUE = libcudf_aggregation.aggregation.Kind.NUNIQUE NTH = libcudf_aggregation.aggregation.Kind.NTH_ELEMENT RANK = libcudf_aggregation.aggregation.Kind.RANK - COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT + COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT_LIST UNIQUE = libcudf_aggregation.aggregation.Kind.COLLECT_SET PTX = libcudf_aggregation.aggregation.Kind.PTX CUDA = libcudf_aggregation.aggregation.Kind.CUDA @@ -191,7 +191,7 @@ cdef class RollingAggregation: cdef RollingAggregation agg = cls() agg.c_obj = move( libcudf_aggregation.make_collect_list_aggregation[ - rolling_aggregation]()) + rolling_aggregation](libcudf_types.null_policy.INCLUDE)) return agg @classmethod @@ -335,7 +335,9 @@ cdef class GroupbyAggregation: cdef GroupbyAggregation agg = cls() agg.c_obj = move( libcudf_aggregation. - make_collect_list_aggregation[groupby_aggregation]()) + make_collect_list_aggregation[groupby_aggregation]( + libcudf_types.null_policy.INCLUDE + )) return agg @classmethod @@ -343,7 +345,9 @@ cdef class GroupbyAggregation: cdef GroupbyAggregation agg = cls() agg.c_obj = move( libcudf_aggregation. - make_nunique_aggregation[groupby_aggregation]()) + make_nunique_aggregation[groupby_aggregation]( + libcudf_types.null_policy.EXCLUDE + )) return agg @classmethod @@ -422,7 +426,11 @@ cdef class GroupbyAggregation: cdef GroupbyAggregation agg = cls() agg.c_obj = move( libcudf_aggregation. 
- make_collect_set_aggregation[groupby_aggregation]()) + make_collect_set_aggregation[groupby_aggregation]( + libcudf_types.null_policy.INCLUDE, + libcudf_types.null_equality.EQUAL, + libcudf_types.nan_equality.ALL_EQUAL, + )) return agg @classmethod @@ -724,7 +732,9 @@ cdef class ReduceAggregation: def nunique(cls): cdef ReduceAggregation agg = cls() agg.c_obj = move( - libcudf_aggregation.make_nunique_aggregation[reduce_aggregation]()) + libcudf_aggregation.make_nunique_aggregation[reduce_aggregation]( + libcudf_types.null_policy.EXCLUDE + )) return agg @classmethod diff --git a/python/cudf/cudf/_lib/cpp/CMakeLists.txt b/python/cudf/cudf/_lib/cpp/CMakeLists.txt index 764f28add0e..316541c9bc5 100644 --- a/python/cudf/cudf/_lib/cpp/CMakeLists.txt +++ b/python/cudf/cudf/_lib/cpp/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources binaryop.pyx copying.pyx types.pyx) +set(cython_sources aggregation.pyx binaryop.pyx copying.pyx types.pyx) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index a1d1485e1e8..16f48b30a50 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -8,6 +8,8 @@ from libcpp.vector cimport vector from cudf._lib.cpp.types cimport ( data_type, interpolation, + nan_equality, + null_equality, null_order, null_policy, order, @@ -19,71 +21,74 @@ ctypedef int32_t underlying_type_t_rank_method cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: - cdef cppclass aggregation: - ctypedef enum Kind: - SUM 'cudf::aggregation::SUM' - PRODUCT 'cudf::aggregation::PRODUCT' - MIN 'cudf::aggregation::MIN' - MAX 'cudf::aggregation::MAX' - COUNT_VALID 'cudf::aggregation::COUNT_VALID' - COUNT_ALL 'cudf::aggregation::COUNT_ALL' - ANY 'cudf::aggregation::ANY' - ALL 'cudf::aggregation::ALL' - SUM_OF_SQUARES 'cudf::aggregation::SUM_OF_SQUARES' - MEAN 'cudf::aggregation::MEAN' - VARIANCE 'cudf::aggregation::VARIANCE' - STD 'cudf::aggregation::STD' - MEDIAN 'cudf::aggregation::MEDIAN' - QUANTILE 'cudf::aggregation::QUANTILE' - ARGMAX 'cudf::aggregation::ARGMAX' - ARGMIN 'cudf::aggregation::ARGMIN' - NUNIQUE 'cudf::aggregation::NUNIQUE' - NTH_ELEMENT 'cudf::aggregation::NTH_ELEMENT' - RANK 'cudf::aggregation::RANK' - COLLECT 'cudf::aggregation::COLLECT_LIST' - COLLECT_SET 'cudf::aggregation::COLLECT_SET' - PTX 'cudf::aggregation::PTX' - CUDA 'cudf::aggregation::CUDA' - CORRELATION 'cudf::aggregation::CORRELATION' - COVARIANCE 'cudf::aggregation::COVARIANCE' + # Cython doesn't appear to support enum class nested inside a class, so + # have to namespace it manually + cpdef enum class Kind "cudf::aggregation::Kind": + SUM + PRODUCT + MIN + MAX + COUNT_VALID + COUNT_ALL + ANY + ALL + SUM_OF_SQUARES + MEAN + VARIANCE + STD + MEDIAN + QUANTILE + ARGMAX + ARGMIN + NUNIQUE + NTH_ELEMENT + RANK + COLLECT_LIST + COLLECT_SET + PTX + CUDA + CORRELATION + COVARIANCE + cdef cppclass aggregation: Kind kind + unique_ptr[aggregation] clone() - cdef cppclass rolling_aggregation: - aggregation.Kind kind + cdef cppclass 
rolling_aggregation(aggregation): + pass - cdef cppclass groupby_aggregation: - aggregation.Kind kind + cdef cppclass groupby_aggregation(aggregation): + pass - cdef cppclass groupby_scan_aggregation: - aggregation.Kind kind + cdef cppclass groupby_scan_aggregation(aggregation): + pass - cdef cppclass reduce_aggregation: - aggregation.Kind kind + cdef cppclass reduce_aggregation(aggregation): + pass - cdef cppclass scan_aggregation: - aggregation.Kind kind + cdef cppclass scan_aggregation(aggregation): + pass - ctypedef enum udf_type: - CUDA 'cudf::udf_type::CUDA' - PTX 'cudf::udf_type::PTX' + cpdef enum class udf_type(bool): + CUDA + PTX - ctypedef enum correlation_type: - PEARSON 'cudf::correlation_type::PEARSON' - KENDALL 'cudf::correlation_type::KENDALL' - SPEARMAN 'cudf::correlation_type::SPEARMAN' + cpdef enum class correlation_type(int32_t): + PEARSON + KENDALL + SPEARMAN - ctypedef enum rank_method: - FIRST "cudf::rank_method::FIRST" - AVERAGE "cudf::rank_method::AVERAGE" - MIN "cudf::rank_method::MIN" - MAX "cudf::rank_method::MAX" - DENSE "cudf::rank_method::DENSE" + cpdef enum class rank_method(int32_t): + FIRST + AVERAGE + MIN + MAX + DENSE - ctypedef enum rank_percentage: - NONE "cudf::rank_percentage::NONE" - ZERO_NORMALIZED "cudf::rank_percentage::ZERO_NORMALIZED" - ONE_NORMALIZED "cudf::rank_percentage::ONE_NORMALIZED" + cpdef enum class rank_percentage(int32_t): + NONE + ZERO_NORMALIZED + ONE_NORMALIZED cdef unique_ptr[T] make_sum_aggregation[T]() except + @@ -93,8 +98,6 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: cdef unique_ptr[T] make_max_aggregation[T]() except + - cdef unique_ptr[T] make_count_aggregation[T]() except + - cdef unique_ptr[T] make_count_aggregation[T](null_policy) except + cdef unique_ptr[T] make_any_aggregation[T]() except + @@ -119,20 +122,20 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: cdef unique_ptr[T] make_argmin_aggregation[T]() except + - cdef unique_ptr[T] 
make_nunique_aggregation[T]() except + - - cdef unique_ptr[T] make_nth_element_aggregation[T]( - size_type n - ) except + + cdef unique_ptr[T] make_nunique_aggregation[T](null_policy null_handling) except + cdef unique_ptr[T] make_nth_element_aggregation[T]( size_type n, null_policy null_handling ) except + - cdef unique_ptr[T] make_collect_list_aggregation[T]() except + + cdef unique_ptr[T] make_collect_list_aggregation[T]( + null_policy null_handling + ) except + - cdef unique_ptr[T] make_collect_set_aggregation[T]() except + + cdef unique_ptr[T] make_collect_set_aggregation[T]( + null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal + ) except + cdef unique_ptr[T] make_udf_aggregation[T]( udf_type type, diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pyx b/python/cudf/cudf/_lib/cpp/aggregation.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/cpp/groupby.pxd b/python/cudf/cudf/_lib/cpp/groupby.pxd index 0266404fc50..8bbefcde0dd 100644 --- a/python/cudf/cudf/_lib/cpp/groupby.pxd +++ b/python/cudf/cudf/_lib/cpp/groupby.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.functional cimport reference_wrapper @@ -16,7 +16,13 @@ from cudf._lib.cpp.replace cimport replace_policy from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.cpp.types cimport null_order, null_policy, order, size_type +from cudf._lib.cpp.types cimport ( + null_order, + null_policy, + order, + size_type, + sorted, +) from cudf._lib.cpp.utilities.host_span cimport host_span # workaround for https://github.com/cython/cython/issues/3885 @@ -55,20 +61,20 @@ cdef extern from "cudf/groupby.hpp" \ groupby( const table_view& keys, null_policy include_null_keys, - bool keys_are_sorted, + sorted keys_are_sorted, ) except + groupby( const table_view& keys, null_policy include_null_keys, - bool keys_are_sorted, + sorted keys_are_sorted, const vector[order]& column_order, ) except + groupby( const table_view& keys, null_policy include_null_keys, - bool keys_are_sorted, + sorted keys_are_sorted, const vector[order]& column_order, const vector[null_order]& null_precedence ) except + @@ -100,6 +106,6 @@ cdef extern from "cudf/groupby.hpp" \ groups get_groups(table_view values) except + pair[unique_ptr[table], unique_ptr[table]] replace_nulls( - const table_view& value, + const table_view& values, const vector[replace_policy] replace_policy ) except + diff --git a/python/cudf/cudf/_lib/cpp/replace.pxd b/python/cudf/cudf/_lib/cpp/replace.pxd index c1ec89a6233..74bc9c2bb4c 100644 --- a/python/cudf/cudf/_lib/cpp/replace.pxd +++ b/python/cudf/cudf/_lib/cpp/replace.pxd @@ -1,5 +1,6 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+from libcpp cimport bool from libcpp.memory cimport unique_ptr from cudf._lib.types import cudf_to_np_types, np_to_cudf_types @@ -11,9 +12,9 @@ from cudf._lib.cpp.scalar.scalar cimport scalar cdef extern from "cudf/replace.hpp" namespace "cudf" nogil: - ctypedef enum replace_policy: - PRECEDING 'cudf::replace_policy::PRECEDING', - FOLLOWING 'cudf::replace_policy::FOLLOWING' + cdef enum class replace_policy(bool): + PRECEDING + FOLLOWING cdef unique_ptr[column] replace_nulls( column_view source_column, diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index acb013c8b8c..0ca0c122c38 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. # ============================================================================= -set(cython_sources binaryop.pyx column.pyx copying.pyx gpumemoryview.pyx interop.pyx scalar.pyx - table.pyx types.pyx utils.pyx +set(cython_sources aggregation.pyx binaryop.pyx column.pyx copying.pyx gpumemoryview.pyx + groupby.pyx interop.pyx scalar.pyx table.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index f4b8c50eecc..14c98af3fff 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -1,22 +1,25 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. # TODO: Verify consistent usage of relative/absolute imports in pylibcudf. -from . cimport binaryop, copying, interop +from . 
cimport aggregation, binaryop, copying, groupby, interop from .column cimport Column from .gpumemoryview cimport gpumemoryview from .scalar cimport Scalar from .table cimport Table # TODO: cimport type_id once # https://github.com/cython/cython/issues/5609 is resolved -from .types cimport DataType +from .types cimport DataType, type_id __all__ = [ "Column", "DataType", "Scalar", "Table", + "aggregation", "binaryop", "copying", "gpumemoryview", + "groupby", "interop", + "types", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index a27d80fc5a2..07612d76540 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. -from . import binaryop, copying, interop +from . import aggregation, binaryop, copying, groupby, interop from .column import Column from .gpumemoryview import gpumemoryview from .scalar import Scalar @@ -13,8 +13,11 @@ "Scalar", "Table", "TypeId", + "aggregation", "binaryop", "copying", "gpumemoryview", + "groupby", "interop", + "types", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd new file mode 100644 index 00000000000..8eda16c4165 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd @@ -0,0 +1,89 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.aggregation cimport ( + Kind as kind_t, + aggregation, + correlation_type, + groupby_aggregation, + groupby_scan_aggregation, + rank_method, + rank_percentage, +) +from cudf._lib.cpp.types cimport ( + interpolation, + nan_equality, + null_equality, + null_order, + null_policy, + order, + size_type, +) + +from .types cimport DataType + + +cdef class Aggregation: + cdef unique_ptr[aggregation] c_obj + cpdef kind(self) + cdef unique_ptr[groupby_aggregation] clone_underlying_as_groupby(self) except * + cdef unique_ptr[groupby_scan_aggregation] clone_underlying_as_groupby_scan( + self + ) except * + + @staticmethod + cdef Aggregation from_libcudf(unique_ptr[aggregation] agg) + + +cpdef Aggregation sum() + +cpdef Aggregation product() + +cpdef Aggregation min() + +cpdef Aggregation max() + +cpdef Aggregation count(null_policy null_handling = *) + +cpdef Aggregation any() + +cpdef Aggregation all() + +cpdef Aggregation sum_of_squares() + +cpdef Aggregation mean() + +cpdef Aggregation variance(size_type ddof = *) + +cpdef Aggregation std(size_type ddof = *) + +cpdef Aggregation median() + +cpdef Aggregation quantile(list quantiles, interpolation interp = *) + +cpdef Aggregation argmax() + +cpdef Aggregation argmin() + +cpdef Aggregation nunique(null_policy null_handling = *) + +cpdef Aggregation nth_element(size_type n, null_policy null_handling = *) + +cpdef Aggregation collect_list(null_policy null_handling = *) + +cpdef Aggregation collect_set(null_handling = *, nulls_equal = *, nans_equal = *) + +cpdef Aggregation udf(str operation, DataType output_type) + +cpdef Aggregation correlation(correlation_type type, size_type min_periods) + +cpdef Aggregation covariance(size_type min_periods, size_type ddof) + +cpdef Aggregation rank( + rank_method method, + order column_order = *, + null_policy null_handling = *, + null_order null_precedence = *, + rank_percentage percentage = *, +) diff --git 
a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx new file mode 100644 index 00000000000..0b91263d720 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -0,0 +1,513 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp.cast cimport dynamic_cast +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp.aggregation cimport ( + aggregation, + correlation_type, + groupby_aggregation, + groupby_scan_aggregation, + make_all_aggregation, + make_any_aggregation, + make_argmax_aggregation, + make_argmin_aggregation, + make_collect_list_aggregation, + make_collect_set_aggregation, + make_correlation_aggregation, + make_count_aggregation, + make_covariance_aggregation, + make_max_aggregation, + make_mean_aggregation, + make_median_aggregation, + make_min_aggregation, + make_nth_element_aggregation, + make_nunique_aggregation, + make_product_aggregation, + make_quantile_aggregation, + make_rank_aggregation, + make_std_aggregation, + make_sum_aggregation, + make_sum_of_squares_aggregation, + make_udf_aggregation, + make_variance_aggregation, + rank_method, + rank_percentage, +) +from cudf._lib.cpp.types cimport ( + interpolation, + nan_equality, + null_equality, + null_order, + null_policy, + order, + size_type, +) + +from cudf._lib.cpp.aggregation import Kind # no-cython-lint +from cudf._lib.cpp.aggregation import \ + correlation_type as CorrelationType # no-cython-lint +from cudf._lib.cpp.aggregation import \ + rank_method as RankMethod # no-cython-lint +from cudf._lib.cpp.aggregation import \ + rank_percentage as RankPercentage # no-cython-lint +from cudf._lib.cpp.aggregation import udf_type as UdfType # no-cython-lint + +from .types cimport DataType + +# workaround for https://github.com/cython/cython/issues/3885 +ctypedef groupby_aggregation * gba_ptr +ctypedef groupby_scan_aggregation * gbsa_ptr + + +cdef class 
Aggregation: + """A type of aggregation to perform. + + Aggregations are passed to APIs like + :py:meth:`~cudf._lib.pylibcudf.groupby.GroupBy.aggregate` to indicate what + operations to perform. Using a class for aggregations provides a unified + API for handling parametrizable aggregations. This class should never be + instantiated directly, only via one of the factory functions. + """ + def __init__(self): + raise ValueError( + "Aggregations should not be constructed directly. Use one of the factories." + ) + + # TODO: Ideally we would include the return type here, but we need to do so + # in a way that Sphinx understands (currently have issues due to + # https://github.com/cython/cython/issues/5609). + cpdef kind(self): + """Get the kind of the aggregation.""" + return dereference(self.c_obj).kind + + cdef unique_ptr[groupby_aggregation] clone_underlying_as_groupby(self) except *: + """Make a copy of the underlying aggregation that can be used in a groupby. + + This function will raise an exception if the aggregation is not supported as a + groupby aggregation. This failure to cast translates the per-algorithm + aggregation logic encoded in libcudf's type hierarchy into Python. + """ + cdef unique_ptr[aggregation] agg = dereference(self.c_obj).clone() + cdef groupby_aggregation *agg_cast = dynamic_cast[gba_ptr](agg.get()) + if agg_cast is NULL: + agg_repr = str(self.kind()).split(".")[1].title() + raise TypeError(f"{agg_repr} aggregations are not supported by groupby") + agg.release() + return unique_ptr[groupby_aggregation](agg_cast) + + # Ideally this function could reuse the code above, but Cython lacks the + # first-class support for type-aliasing and templates that would make it possible. + cdef unique_ptr[groupby_scan_aggregation] clone_underlying_as_groupby_scan( + self + ) except *: + """Make a copy of the underlying aggregation that can be used in a groupby scan. 
+ + This function will raise an exception if the aggregation is not supported as a + groupby scan aggregation. This failure to cast translates the per-algorithm + aggregation logic encoded in libcudf's type hierarchy into Python. + """ + cdef unique_ptr[aggregation] agg = dereference(self.c_obj).clone() + cdef groupby_scan_aggregation *agg_cast = dynamic_cast[gbsa_ptr](agg.get()) + if agg_cast is NULL: + agg_repr = str(self.kind()).split(".")[1].title() + raise TypeError(f"{agg_repr} scans are not supported by groupby") + agg.release() + return unique_ptr[groupby_scan_aggregation](agg_cast) + + @staticmethod + cdef Aggregation from_libcudf(unique_ptr[aggregation] agg): + """Create a Python Aggregation from a libcudf aggregation.""" + cdef Aggregation out = Aggregation.__new__(Aggregation) + out.c_obj = move(agg) + return out + + +cpdef Aggregation sum(): + """Create a sum aggregation. + + Returns + ------- + Aggregation + The sum aggregation. + """ + return Aggregation.from_libcudf(move(make_sum_aggregation[aggregation]())) + + +cpdef Aggregation product(): + """Create a product aggregation. + + Returns + ------- + Aggregation + The product aggregation. + """ + return Aggregation.from_libcudf(move(make_product_aggregation[aggregation]())) + + +cpdef Aggregation min(): + """Create a min aggregation. + + Returns + ------- + Aggregation + The min aggregation. + """ + return Aggregation.from_libcudf(move(make_min_aggregation[aggregation]())) + + +cpdef Aggregation max(): + """Create a max aggregation. + + Returns + ------- + Aggregation + The max aggregation. + """ + return Aggregation.from_libcudf(move(make_max_aggregation[aggregation]())) + + +cpdef Aggregation count(null_policy null_handling = null_policy.EXCLUDE): + """Create a count aggregation. + + Parameters + ---------- + null_handling : null_policy, default EXCLUDE + Whether or not nulls should be included. + + Returns + ------- + Aggregation + The count aggregation. 
+ """ + return Aggregation.from_libcudf( + move(make_count_aggregation[aggregation](null_handling)) + ) + + +cpdef Aggregation any(): + """Create an any aggregation. + + Returns + ------- + Aggregation + The any aggregation. + """ + return Aggregation.from_libcudf(move(make_any_aggregation[aggregation]())) + + +cpdef Aggregation all(): + """Create an all aggregation. + + Returns + ------- + Aggregation + The all aggregation. + """ + return Aggregation.from_libcudf(move(make_all_aggregation[aggregation]())) + + +cpdef Aggregation sum_of_squares(): + """Create a sum_of_squares aggregation. + + Returns + ------- + Aggregation + The sum_of_squares aggregation. + """ + return Aggregation.from_libcudf( + move(make_sum_of_squares_aggregation[aggregation]()) + ) + + +cpdef Aggregation mean(): + """Create a mean aggregation. + + Returns + ------- + Aggregation + The mean aggregation. + """ + return Aggregation.from_libcudf(move(make_mean_aggregation[aggregation]())) + + +cpdef Aggregation variance(size_type ddof=1): + """Create a variance aggregation. + + Parameters + ---------- + ddof : int, default 1 + Delta degrees of freedom. + + Returns + ------- + Aggregation + The variance aggregation. + """ + return Aggregation.from_libcudf(move(make_variance_aggregation[aggregation](ddof))) + + +cpdef Aggregation std(size_type ddof=1): + """Create a std aggregation. + + Parameters + ---------- + ddof : int, default 1 + Delta degrees of freedom. The default value is 1. + + Returns + ------- + Aggregation + The std aggregation. + """ + return Aggregation.from_libcudf(move(make_std_aggregation[aggregation](ddof))) + + +cpdef Aggregation median(): + """Create a median aggregation. + + Returns + ------- + Aggregation + The median aggregation. + """ + return Aggregation.from_libcudf(move(make_median_aggregation[aggregation]())) + + +cpdef Aggregation quantile(list quantiles, interpolation interp = interpolation.LINEAR): + """Create a quantile aggregation. 
+ + Parameters + ---------- + quantiles : list + List of quantiles to compute, should be between 0 and 1. + interp : interpolation, default LINEAR + Interpolation technique to use when the desired quantile lies between + two data points. + + Returns + ------- + Aggregation + The quantile aggregation. + """ + return Aggregation.from_libcudf( + move(make_quantile_aggregation[aggregation](quantiles, interp)) + ) + + +cpdef Aggregation argmax(): + """Create an argmax aggregation. + + Returns + ------- + Aggregation + The argmax aggregation. + """ + return Aggregation.from_libcudf(move(make_argmax_aggregation[aggregation]())) + + +cpdef Aggregation argmin(): + """Create an argmin aggregation. + + Returns + ------- + Aggregation + The argmin aggregation. + """ + return Aggregation.from_libcudf(move(make_argmin_aggregation[aggregation]())) + + +cpdef Aggregation nunique(null_policy null_handling = null_policy.EXCLUDE): + """Create a nunique aggregation. + + Parameters + ---------- + null_handling : null_policy, default EXCLUDE + Whether or not nulls should be included. + + Returns + ------- + Aggregation + The nunique aggregation. + """ + return Aggregation.from_libcudf( + move(make_nunique_aggregation[aggregation](null_handling)) + ) + + +cpdef Aggregation nth_element( + size_type n, null_policy null_handling = null_policy.INCLUDE +): + """Create an nth_element aggregation that selects the ``n``-th element. + + Parameters + ---------- + null_handling : null_policy, default INCLUDE + Whether or not nulls should be included. + + Returns + ------- + Aggregation + The nth_element aggregation. + """ + return Aggregation.from_libcudf( + move(make_nth_element_aggregation[aggregation](n, null_handling)) + ) + + +cpdef Aggregation collect_list(null_policy null_handling = null_policy.INCLUDE): + """Create a collect_list aggregation. + + Parameters + ---------- + null_handling : null_policy, default INCLUDE + Whether or not nulls should be included. + + Returns + ------- + Aggregation + The collect_list aggregation. 
+ """ + return Aggregation.from_libcudf( + move(make_collect_list_aggregation[aggregation](null_handling)) + ) + + +cpdef Aggregation collect_set( + null_handling = null_policy.INCLUDE, + nulls_equal = null_equality.EQUAL, + nans_equal = nan_equality.ALL_EQUAL, +): + """Create a collect_set aggregation. + + Parameters + ---------- + null_handling : null_policy, default INCLUDE + Whether or not nulls should be included. + nulls_equal : null_equality, default EQUAL + Whether or not nulls should be considered equal. + nans_equal : nan_equality, default ALL_EQUAL + Whether or not NaNs should be considered equal. + + Returns + ------- + Aggregation + The collect_set aggregation. + """ + return Aggregation.from_libcudf( + move( + make_collect_set_aggregation[aggregation]( + null_handling, nulls_equal, nans_equal + ) + ) + ) + +cpdef Aggregation udf(str operation, DataType output_type): + """Create a udf aggregation. + + Parameters + ---------- + operation : str + The operation to perform as a string of PTX code. + output_type : DataType + The output type of the aggregation. + + Returns + ------- + Aggregation + The udf aggregation. + """ + return Aggregation.from_libcudf( + move( + make_udf_aggregation[aggregation]( + UdfType.PTX, + operation.encode("utf-8"), + output_type.c_obj, + ) + ) + ) + + +cpdef Aggregation correlation(correlation_type type, size_type min_periods): + """Create a correlation aggregation. + + Parameters + ---------- + type : correlation_type + The type of correlation to compute. + min_periods : int + The minimum number of observations to consider for computing the + correlation. + + Returns + ------- + Aggregation + The correlation aggregation. + """ + return Aggregation.from_libcudf( + move(make_correlation_aggregation[aggregation](type, min_periods)) + ) + + +cpdef Aggregation covariance(size_type min_periods, size_type ddof): + """Create a covariance aggregation. 
+ + Parameters + ---------- + min_periods : int + The minimum number of observations to consider for computing the + covariance. + ddof : int + Delta degrees of freedom. + + Returns + ------- + Aggregation + The covariance aggregation. + """ + return Aggregation.from_libcudf( + move(make_covariance_aggregation[aggregation](min_periods, ddof)) + ) + + +cpdef Aggregation rank( + rank_method method, + order column_order = order.ASCENDING, + null_policy null_handling = null_policy.EXCLUDE, + null_order null_precedence = null_order.AFTER, + rank_percentage percentage = rank_percentage.NONE, +): + """Create a rank aggregation. + + Parameters + ---------- + method : rank_method + The method to use for ranking. + column_order : order, default ASCENDING + The order in which to sort the column. + null_handling : null_policy, default EXCLUDE + Whether or not nulls should be included. + null_precedence : null_order, default AFTER + Whether nulls should come before or after non-nulls. + percentage : rank_percentage, default NONE + Whether or not ranks should be converted to percentages, and if so, + the type of normalization to use. + + Returns + ------- + Aggregation + The rank aggregation. 
+ """ + return Aggregation.from_libcudf( + move( + make_rank_aggregation[aggregation]( + method, + column_order, + null_handling, + null_precedence, + percentage, + ) + ) + ) diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pyx b/python/cudf/cudf/_lib/pylibcudf/copying.pyx index 65f8c7a1854..12e592f3a92 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pyx @@ -26,23 +26,9 @@ from cudf._lib.cpp.copying import \ out_of_bounds_policy as OutOfBoundsPolicy # no-cython-lint from .column cimport Column +from .scalar cimport Scalar from .table cimport Table - -# This is a workaround for -# https://github.com/cython/cython/issues/4180 -# when creating reference_wrapper[constscalar] in the constructor -ctypedef const scalar constscalar - - -cdef vector[reference_wrapper[const scalar]] _as_vector(list source): - """Make a vector of reference_wrapper[const scalar] from a list of scalars.""" - cdef vector[reference_wrapper[const scalar]] c_scalars - c_scalars.reserve(len(source)) - cdef Scalar slr - for slr in source: - c_scalars.push_back( - reference_wrapper[constscalar](dereference((slr).c_obj))) - return c_scalars +from .utils cimport _as_vector cpdef Table gather( diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd new file mode 100644 index 00000000000..ce472e3c990 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd @@ -0,0 +1,47 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from libcpp.pair cimport pair +from libcpp.vector cimport vector + +from cudf._lib.cpp.aggregation cimport ( + aggregation, + groupby_aggregation, + groupby_scan_aggregation, +) +from cudf._lib.cpp.groupby cimport ( + aggregation_request, + aggregation_result, + groupby, + scan_request, +) +from cudf._lib.cpp.table.table cimport table + +from .column cimport Column +from .table cimport Table + + +cdef class GroupByRequest: + # The groupby APIs accept vectors of unique_ptrs to aggregation requests. + # This ownership model means that if GroupByRequest owned the + # corresponding C++ object, that object would have to be copied by e.g. + # each groupby.aggregate call to avoid invalidating this object. Therefore, + # this class instead stores only Python/Cython objects and constructs the + # C++ object on the fly as requested. + cdef Column _values + cdef list _aggregations + + cdef aggregation_request _to_libcudf_agg_request(self) except * + cdef scan_request _to_libcudf_scan_request(self) except * + + +cdef class GroupBy: + cdef unique_ptr[groupby] c_obj + cpdef tuple aggregate(self, list requests) + cpdef tuple scan(self, list requests) + cpdef tuple shift(self, Table values, list offset, list fill_values) + cpdef tuple replace_nulls(self, Table values, list replace_policy) + cpdef tuple get_groups(self, Table values=*) + + @staticmethod + cdef tuple _parse_outputs(pair[unique_ptr[table], vector[aggregation_result]] c_res) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx new file mode 100644 index 00000000000..f442aafa4bd --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -0,0 +1,251 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
from cython.operator cimport dereference
from libcpp.functional cimport reference_wrapper
from libcpp.memory cimport unique_ptr
from libcpp.pair cimport pair
from libcpp.utility cimport move
from libcpp.vector cimport vector

from cudf._lib.cpp.groupby cimport (
    aggregation_request,
    aggregation_result,
    groupby,
    groups,
    scan_request,
)
from cudf._lib.cpp.scalar.scalar cimport scalar
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.types cimport size_type

from .aggregation cimport Aggregation
from .column cimport Column
from .table cimport Table
from .types cimport null_policy, sorted
from .utils cimport _as_vector


cdef class GroupByRequest:
    """A request for a groupby aggregation or scan.

    Parameters
    ----------
    values : Column
        The column to aggregate.
    aggregations : List[Aggregation]
        The list of aggregations to perform.
    """
    def __init__(self, Column values, list aggregations):
        self._values = values
        self._aggregations = aggregations

    cdef aggregation_request _to_libcudf_agg_request(self) except *:
        """Convert to a libcudf aggregation_request object.

        This method is for internal use only. It creates a new libcudf
        :cpp:class:`cudf::groupby::aggregation_request` object each time it is
        called.
        """
        cdef aggregation_request c_obj
        c_obj.values = self._values.view()

        cdef Aggregation agg
        for agg in self._aggregations:
            # Each request gets its own clone of the aggregation, so the
            # Python-side Aggregation objects remain usable afterwards.
            c_obj.aggregations.push_back(move(agg.clone_underlying_as_groupby()))
        return move(c_obj)

    cdef scan_request _to_libcudf_scan_request(self) except *:
        """Convert to a libcudf scan_request object.

        This method is for internal use only. It creates a new libcudf
        :cpp:class:`cudf::groupby::scan_request` object each time it is
        called.
        """
        cdef scan_request c_obj
        c_obj.values = self._values.view()

        cdef Aggregation agg
        for agg in self._aggregations:
            c_obj.aggregations.push_back(move(agg.clone_underlying_as_groupby_scan()))
        return move(c_obj)


cdef class GroupBy:
    """Group values by keys and compute various aggregate quantities.

    Parameters
    ----------
    keys : Table
        The columns to group by.
    null_handling : null_policy, optional
        Whether or not to include null rows in ``keys``.
        Default is null_policy.EXCLUDE.
    keys_are_sorted : sorted, optional
        Whether the keys are already sorted. Default is sorted.NO.
    """
    def __init__(
        self,
        Table keys,
        null_policy null_handling=null_policy.EXCLUDE,
        sorted keys_are_sorted=sorted.NO
    ):
        self.c_obj.reset(new groupby(keys.view(), null_handling, keys_are_sorted))

    @staticmethod
    cdef tuple _parse_outputs(
        pair[unique_ptr[table], vector[aggregation_result]] c_res
    ):
        # Convert libcudf aggregation/scan outputs into pylibcudf objects.
        # This function is for internal use only.
        cdef Table group_keys = Table.from_libcudf(move(c_res.first))

        # size() is unsigned; use unsigned counters to avoid a signed/unsigned
        # comparison in the generated loops.
        cdef size_t i, j
        cdef list results = []
        cdef list inner_results
        for i in range(c_res.second.size()):
            inner_results = []
            for j in range(c_res.second[i].results.size()):
                inner_results.append(
                    Column.from_libcudf(move(c_res.second[i].results[j]))
                )
            # One Table per request, one column per aggregation in it.
            results.append(Table(inner_results))
        return group_keys, results

    cpdef tuple aggregate(self, list requests):
        """Compute aggregations on columns.

        Parameters
        ----------
        requests : List[GroupByRequest]
            The list of
            :class:`~cudf._lib.pylibcudf.groupby.GroupByRequest`, each
            representing a set of aggregations to perform on a given column
            of values.

        Returns
        -------
        Tuple[Table, List[Table]]
            A tuple whose first element is the unique keys and whose second
            element is a list of tables of aggregation results. One table is
            returned for each aggregation request, with the columns
            corresponding to the sequence of aggregations in the request.
        """
        cdef GroupByRequest request
        cdef vector[aggregation_request] c_requests
        for request in requests:
            c_requests.push_back(move(request._to_libcudf_agg_request()))

        cdef pair[unique_ptr[table], vector[aggregation_result]] c_res = move(
            dereference(self.c_obj).aggregate(c_requests)
        )
        return GroupBy._parse_outputs(move(c_res))

    cpdef tuple scan(self, list requests):
        """Compute scans on columns.

        Parameters
        ----------
        requests : List[GroupByRequest]
            The list of
            :class:`~cudf._lib.pylibcudf.groupby.GroupByRequest`, each
            representing a set of scans to perform on a given column of
            values.

        Returns
        -------
        Tuple[Table, List[Table]]
            A tuple whose first element is the keys and whose second element
            is a list of tables of scan results. One table is returned for
            each request, with the columns corresponding to the sequence of
            aggregations in the request.
        """
        cdef GroupByRequest request
        cdef vector[scan_request] c_requests
        for request in requests:
            c_requests.push_back(move(request._to_libcudf_scan_request()))

        cdef pair[unique_ptr[table], vector[aggregation_result]] c_res = move(
            dereference(self.c_obj).scan(c_requests)
        )
        return GroupBy._parse_outputs(move(c_res))

    cpdef tuple shift(self, Table values, list offset, list fill_values):
        """Compute shifts on columns.

        Parameters
        ----------
        values : Table
            The columns to shift.
        offset : List[int]
            The offsets to shift by.
        fill_values : List[Scalar]
            The values to use to fill in missing values.

        Returns
        -------
        Tuple[Table, Table]
            A tuple whose first element is the group's keys and whose second
            element is a table of shifted values.
        """
        cdef vector[reference_wrapper[const scalar]] c_fill_values = \
            _as_vector(fill_values)

        cdef vector[size_type] c_offset = offset
        cdef pair[unique_ptr[table], unique_ptr[table]] c_res = move(
            dereference(self.c_obj).shift(values.view(), c_offset, c_fill_values)
        )

        return (
            Table.from_libcudf(move(c_res.first)),
            Table.from_libcudf(move(c_res.second)),
        )

    cpdef tuple replace_nulls(self, Table value, list replace_policies):
        """Replace nulls in columns.

        Parameters
        ----------
        value : Table
            The columns to replace nulls in.
        replace_policies : List[replace_policy]
            The policies to use to replace nulls.

        Returns
        -------
        Tuple[Table, Table]
            A tuple whose first element is the group's keys and whose second
            element is a table of values with nulls replaced.
        """
        cdef pair[unique_ptr[table], unique_ptr[table]] c_res = move(
            dereference(self.c_obj).replace_nulls(value.view(), replace_policies)
        )

        return (
            Table.from_libcudf(move(c_res.first)),
            Table.from_libcudf(move(c_res.second)),
        )

    cpdef tuple get_groups(self, Table values=None):
        """Get the grouped keys and values labels for each row.

        Parameters
        ----------
        values : Table, optional
            The columns to get group labels for. If not specified, only the
            grouped keys and the group offsets are computed.

        Returns
        -------
        Tuple[Table, Optional[Table], List[int]]
            A tuple containing three items:
                - A table of group keys
                - A table of group values, or None if ``values`` was not
                  provided
                - A list of integer offsets into the tables
        """
        cdef groups c_groups
        if values is None:
            # Without a values table libcudf leaves ``groups.values`` unset
            # (a null pointer), so it must not be wrapped in a Table.
            c_groups = dereference(self.c_obj).get_groups()
            return (
                Table.from_libcudf(move(c_groups.keys)),
                None,
                c_groups.offsets,
            )

        c_groups = dereference(self.c_obj).get_groups(values.view())
        return (
            Table.from_libcudf(move(c_groups.keys)),
            Table.from_libcudf(move(c_groups.values)),
            c_groups.offsets,
        )
cudf._lib.cpp.types import null_order as NullOrder # no-cython-lint, isort:skip +from cudf._lib.cpp.types import order as Order # no-cython-lint, isort:skip +from cudf._lib.cpp.types import sorted as Sorted # no-cython-lint, isort:skip cdef class DataType: diff --git a/python/cudf/cudf/_lib/pylibcudf/utils.pxd b/python/cudf/cudf/_lib/pylibcudf/utils.pxd index 18bcd9cc91a..7efeaaf7e24 100644 --- a/python/cudf/cudf/_lib/pylibcudf/utils.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/utils.pxd @@ -1,7 +1,12 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +from libcpp.functional cimport reference_wrapper +from libcpp.vector cimport vector + +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport bitmask_type cdef void * int_to_void_ptr(Py_ssize_t ptr) nogil cdef bitmask_type * int_to_bitmask_ptr(Py_ssize_t ptr) nogil +cdef vector[reference_wrapper[const scalar]] _as_vector(list source) diff --git a/python/cudf/cudf/_lib/pylibcudf/utils.pyx b/python/cudf/cudf/_lib/pylibcudf/utils.pyx index ccf9ea2bd70..ea34a87a72a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/utils.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/utils.pyx @@ -1,9 +1,21 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
from cython.operator import dereference

from libc.stdint cimport uintptr_t
from libcpp.functional cimport reference_wrapper
from libcpp.vector cimport vector

from cudf._lib.cpp.scalar.scalar cimport scalar
from cudf._lib.cpp.types cimport bitmask_type

from .scalar cimport Scalar

# This is a workaround for
# https://github.com/cython/cython/issues/4180
# when creating reference_wrapper[constscalar] in the constructor
ctypedef const scalar constscalar


cdef void * int_to_void_ptr(Py_ssize_t ptr) nogil:
    # Reinterpret an integer address as an untyped pointer. The intermediate
    # uintptr_t cast is the integer type meant to round-trip a pointer value.
    return <void*><uintptr_t>(ptr)


cdef bitmask_type * int_to_bitmask_ptr(Py_ssize_t ptr) nogil:
    # Reinterpret an integer address as a pointer to bitmask words.
    return <bitmask_type*><uintptr_t>(ptr)


cdef vector[reference_wrapper[const scalar]] _as_vector(list source):
    """Make a vector of reference_wrapper[const scalar] from a list of scalars.

    The returned wrappers refer to the C++ objects held by the elements of
    ``source``; they do not copy them, so ``source`` must stay alive for as
    long as the vector is used.
    """
    cdef vector[reference_wrapper[const scalar]] c_scalars
    c_scalars.reserve(len(source))
    cdef Scalar slr
    for slr in source:
        # constscalar (rather than ``const scalar``) is required here because
        # of the Cython issue referenced at the ctypedef.
        c_scalars.push_back(
            reference_wrapper[constscalar](dereference(slr.c_obj)))
    return c_scalars