From acbf97f544964edff5ff7ef0a4080f19052fd0c2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 24 Jan 2024 22:37:33 +0000 Subject: [PATCH 01/34] Move to enum classes --- python/cudf/cudf/_lib/aggregation.pyx | 4 +- python/cudf/cudf/_lib/cpp/CMakeLists.txt | 2 +- python/cudf/cudf/_lib/cpp/aggregation.pxd | 110 +++++++++++----------- python/cudf/cudf/_lib/cpp/aggregation.pyx | 0 4 files changed, 59 insertions(+), 57 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/aggregation.pyx diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 72c5e288f0b..7714676cf65 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from enum import Enum, IntEnum @@ -51,7 +51,7 @@ class AggregationKind(Enum): NUNIQUE = libcudf_aggregation.aggregation.Kind.NUNIQUE NTH = libcudf_aggregation.aggregation.Kind.NTH_ELEMENT RANK = libcudf_aggregation.aggregation.Kind.RANK - COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT + COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT_LIST UNIQUE = libcudf_aggregation.aggregation.Kind.COLLECT_SET PTX = libcudf_aggregation.aggregation.Kind.PTX CUDA = libcudf_aggregation.aggregation.Kind.CUDA diff --git a/python/cudf/cudf/_lib/cpp/CMakeLists.txt b/python/cudf/cudf/_lib/cpp/CMakeLists.txt index 764f28add0e..316541c9bc5 100644 --- a/python/cudf/cudf/_lib/cpp/CMakeLists.txt +++ b/python/cudf/cudf/_lib/cpp/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources binaryop.pyx copying.pyx types.pyx) +set(cython_sources aggregation.pyx binaryop.pyx copying.pyx types.pyx) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index a1d1485e1e8..6bc67858c42 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -19,71 +19,73 @@ ctypedef int32_t underlying_type_t_rank_method cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: - cdef cppclass aggregation: - ctypedef enum Kind: - SUM 'cudf::aggregation::SUM' - PRODUCT 'cudf::aggregation::PRODUCT' - MIN 'cudf::aggregation::MIN' - MAX 'cudf::aggregation::MAX' - COUNT_VALID 'cudf::aggregation::COUNT_VALID' - COUNT_ALL 'cudf::aggregation::COUNT_ALL' - ANY 'cudf::aggregation::ANY' - ALL 'cudf::aggregation::ALL' - SUM_OF_SQUARES 'cudf::aggregation::SUM_OF_SQUARES' - MEAN 'cudf::aggregation::MEAN' - VARIANCE 'cudf::aggregation::VARIANCE' - STD 'cudf::aggregation::STD' - MEDIAN 'cudf::aggregation::MEDIAN' - QUANTILE 'cudf::aggregation::QUANTILE' - ARGMAX 'cudf::aggregation::ARGMAX' - ARGMIN 'cudf::aggregation::ARGMIN' - NUNIQUE 'cudf::aggregation::NUNIQUE' - NTH_ELEMENT 'cudf::aggregation::NTH_ELEMENT' - RANK 'cudf::aggregation::RANK' - COLLECT 'cudf::aggregation::COLLECT_LIST' - COLLECT_SET 'cudf::aggregation::COLLECT_SET' - PTX 'cudf::aggregation::PTX' - CUDA 'cudf::aggregation::CUDA' - CORRELATION 'cudf::aggregation::CORRELATION' - COVARIANCE 'cudf::aggregation::COVARIANCE' + # Cython doesn't appear to support enum class nested inside a class, so + # have to namespace it manually + cpdef enum class Kind "cudf::aggregation::Kind": + SUM + PRODUCT + MIN + MAX + COUNT_VALID + COUNT_ALL + ANY + ALL + SUM_OF_SQUARES + MEAN + VARIANCE + STD + MEDIAN + QUANTILE + ARGMAX + ARGMIN + NUNIQUE + NTH_ELEMENT + RANK + COLLECT_LIST + COLLECT_SET + PTX + CUDA + CORRELATION + COVARIANCE + cdef cppclass aggregation: Kind kind cdef cppclass rolling_aggregation: - aggregation.Kind kind + Kind kind cdef cppclass groupby_aggregation: - aggregation.Kind kind + Kind kind cdef cppclass groupby_scan_aggregation: - aggregation.Kind kind + Kind kind cdef cppclass reduce_aggregation: - aggregation.Kind kind + Kind kind cdef cppclass scan_aggregation: - aggregation.Kind kind - - ctypedef enum udf_type: - CUDA 'cudf::udf_type::CUDA' - PTX 'cudf::udf_type::PTX' - - ctypedef enum correlation_type: - PEARSON 'cudf::correlation_type::PEARSON' - KENDALL 'cudf::correlation_type::KENDALL' - SPEARMAN 'cudf::correlation_type::SPEARMAN' - - ctypedef enum rank_method: - FIRST "cudf::rank_method::FIRST" - AVERAGE "cudf::rank_method::AVERAGE" - MIN "cudf::rank_method::MIN" - MAX "cudf::rank_method::MAX" - DENSE "cudf::rank_method::DENSE" - - ctypedef enum rank_percentage: - NONE "cudf::rank_percentage::NONE" - ZERO_NORMALIZED "cudf::rank_percentage::ZERO_NORMALIZED" - ONE_NORMALIZED "cudf::rank_percentage::ONE_NORMALIZED" + Kind kind + + cpdef enum class udf_type: + CUDA + PTX + + cpdef enum class correlation_type: + PEARSON + KENDALL + SPEARMAN + + cpdef enum class rank_method: + FIRST + AVERAGE + MIN + MAX + DENSE + + cpdef enum class rank_percentage: + NONE + ZERO_NORMALIZED + ONE_NORMALIZED cdef unique_ptr[T] make_sum_aggregation[T]() except + diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pyx b/python/cudf/cudf/_lib/cpp/aggregation.pyx new file mode 100644 index 00000000000..e69de29bb2d From 2757384776a0f13071f4a9ddbd42a516e7e39759 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 24 Jan 2024 22:41:59 +0000 Subject: [PATCH 02/34] Some more minor improvements to cpp exports --- python/cudf/cudf/_lib/cpp/aggregation.pxd | 28 +++++++++++------------ 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 6bc67858c42..d1864cc30fd 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -51,38 +51,38 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: cdef cppclass aggregation: Kind kind - cdef cppclass rolling_aggregation: - Kind kind + cdef cppclass rolling_aggregation(aggregation): + pass - cdef cppclass groupby_aggregation: - Kind kind + cdef cppclass groupby_aggregation(aggregation): + pass - cdef cppclass groupby_scan_aggregation: - Kind kind + cdef cppclass groupby_scan_aggregation(aggregation): + pass - cdef cppclass reduce_aggregation: - Kind kind + cdef cppclass reduce_aggregation(aggregation): + pass - cdef cppclass scan_aggregation: - Kind kind + cdef cppclass scan_aggregation(aggregation): + pass - cpdef enum class udf_type: + cpdef enum class udf_type(bool): CUDA PTX - cpdef enum class correlation_type: + cpdef enum class correlation_type(int32_t): PEARSON KENDALL SPEARMAN - cpdef enum class rank_method: + cpdef enum class rank_method(int32_t): FIRST AVERAGE MIN MAX DENSE - cpdef enum class rank_percentage: + cpdef enum class rank_percentage(int32_t): NONE ZERO_NORMALIZED ONE_NORMALIZED From 0c60be24f2070c23e46cfa7d803645045224b97b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 25 Jan 2024 01:23:40 +0000 Subject: [PATCH 03/34] Add scaffolding for aggregation classes --- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 4 +- .../cudf/cudf/_lib/pylibcudf/aggregation.pxd | 28 +++++++++++ .../cudf/cudf/_lib/pylibcudf/aggregation.pyx | 48 +++++++++++++++++++ 3 files changed, 78 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/aggregation.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/aggregation.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index acb013c8b8c..a2047247bfd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. # ============================================================================= -set(cython_sources binaryop.pyx column.pyx copying.pyx gpumemoryview.pyx interop.pyx scalar.pyx - table.pyx types.pyx utils.pyx +set(cython_sources aggregation.pyx binaryop.pyx column.pyx copying.pyx gpumemoryview.pyx + interop.pyx scalar.pyx table.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd new file mode 100644 index 00000000000..c0db2cdaafe --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd @@ -0,0 +1,28 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.cpp.aggregation cimport Kind as kind_t, aggregation + + +cdef class Aggregation: + cdef aggregation *c_ptr + cpdef kind_t kind(self) + + +cdef class RollingAggregation(Aggregation): + pass + + +cdef class GroupbyAggregation(Aggregation): + pass + + +cdef class GroupbyScanAggregation(Aggregation): + pass + + +cdef class ReduceAggregation(Aggregation): + pass + + +cdef class ScanAggregation(Aggregation): + pass diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx new file mode 100644 index 00000000000..a8ddb895d40 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -0,0 +1,48 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference + +# Have to alias this so that the Pythonic name gets capitalized +from cudf._lib.cpp.aggregation cimport Kind as kind_t + + +# TODO: If/when https://github.com/cython/cython/issues/1271 is resolved, we +# should migrate the various factories to cpdef classmethods instead of Python +# def methods. +cdef class Aggregation: + """A wrapper for aggregations. + + **Neither this class nor any of its subclasses should ever be instantiated + using a standard constructor, only using one of its many factories.** The + factory approach matches the libcudf approach, which is necessary because + some aggregations require additional arguments beyond the kind. + """ + def __init__(self): + raise ValueError( + "Aggregation types should not be constructed directly. " + "Use one of the factory functions." + ) + + cpdef kind_t kind(self): + """Get the kind of the aggregation.""" + return dereference(self.c_parent_obj).kind + + +cdef class RollingAggregation(Aggregation): + pass + + +cdef class GroupbyAggregation(Aggregation): + pass + + +cdef class GroupbyScanAggregation(Aggregation): + pass + + +cdef class ReduceAggregation(Aggregation): + pass + + +cdef class ScanAggregation(Aggregation): + pass From 46d412bbed1bbd2397cbd501b038366292b5b7a8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 26 Jan 2024 06:46:04 +0000 Subject: [PATCH 04/34] Implement new strategy via multiple inheritance --- .../cudf/cudf/_lib/pylibcudf/aggregation.pxd | 15 ++++++++++-- .../cudf/cudf/_lib/pylibcudf/aggregation.pyx | 24 +++++++++++++++++-- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd index c0db2cdaafe..c6923d5f666 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd @@ -1,6 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from cudf._lib.cpp.aggregation cimport Kind as kind_t, aggregation +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp cimport aggregation as cpp_aggregation +from cudf._lib.cpp.aggregation cimport ( + Kind as kind_t, + aggregation, + rolling_aggregation, +) cdef class Aggregation: @@ -8,7 +15,11 @@ cdef class Aggregation: cpdef kind_t kind(self) -cdef class RollingAggregation(Aggregation): +cdef class SumAggregation(Aggregation): + pass + + +cdef class RollingAggregation(SumAggregation): pass diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index a8ddb895d40..6a31bdd3e44 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from cython.operator cimport dereference +from libcpp.utility cimport move # Have to alias this so that the Pythonic name gets capitalized from cudf._lib.cpp.aggregation cimport Kind as kind_t @@ -23,12 +24,31 @@ cdef class Aggregation: "Use one of the factory functions." ) + def __dealloc__(self): + # Deletion is always handled by child classes. + if type(self) is not Aggregation and self.c_ptr is not NULL: + del self.c_ptr + cpdef kind_t kind(self): """Get the kind of the aggregation.""" - return dereference(self.c_parent_obj).kind + return dereference(self.c_ptr).kind + + +cdef class SumAggregation(Aggregation): + @classmethod + def sum(cls): + # We construct using cls but cdef as the base class because Cython + # objects are effectively pointers so this will polymorphically store + # the derived object. This allows us to construct the desired derived + # object without needing to override this method in the base class. + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[rolling_aggregation] ptr = move( + cpp_aggregation.make_sum_aggregation[rolling_aggregation]()) + out.c_ptr = ptr.release() + return out -cdef class RollingAggregation(Aggregation): +cdef class RollingAggregation(SumAggregation): pass From bdc92e89138be7a3ec71002f5e7a38a145128007 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 26 Jan 2024 08:42:54 +0000 Subject: [PATCH 05/34] Implement all aggregations --- python/cudf/cudf/_lib/cpp/aggregation.pxd | 6 - .../cudf/cudf/_lib/pylibcudf/aggregation.pxd | 32 +- .../cudf/cudf/_lib/pylibcudf/aggregation.pyx | 346 +++++++++++++++++- 3 files changed, 334 insertions(+), 50 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index d1864cc30fd..b9588d64053 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -95,8 +95,6 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: cdef unique_ptr[T] make_max_aggregation[T]() except + - cdef unique_ptr[T] make_count_aggregation[T]() except + - cdef unique_ptr[T] make_count_aggregation[T](null_policy) except + cdef unique_ptr[T] make_any_aggregation[T]() except + @@ -123,10 +121,6 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: cdef unique_ptr[T] make_nunique_aggregation[T]() except + - cdef unique_ptr[T] make_nth_element_aggregation[T]( - size_type n - ) except + - cdef unique_ptr[T] make_nth_element_aggregation[T]( size_type n, null_policy null_handling diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd index c6923d5f666..5df9489b106 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd @@ -1,39 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr - from cudf._lib.cpp cimport aggregation as cpp_aggregation -from cudf._lib.cpp.aggregation cimport ( - Kind as kind_t, - aggregation, - rolling_aggregation, -) +from cudf._lib.cpp.aggregation cimport Kind as kind_t, aggregation cdef class Aggregation: cdef aggregation *c_ptr cpdef kind_t kind(self) - - -cdef class SumAggregation(Aggregation): - pass - - -cdef class RollingAggregation(SumAggregation): - pass - - -cdef class GroupbyAggregation(Aggregation): - pass - - -cdef class GroupbyScanAggregation(Aggregation): - pass - - -cdef class ReduceAggregation(Aggregation): - pass - - -cdef class ScanAggregation(Aggregation): - pass diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index 6a31bdd3e44..9e50ec1844e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -1,10 +1,25 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr from libcpp.utility cimport move # Have to alias this so that the Pythonic name gets capitalized -from cudf._lib.cpp.aggregation cimport Kind as kind_t +# TODO: Expose the enums to Python +from cudf._lib.cpp.aggregation cimport ( + Kind as kind_t, + aggregation, + correlation_type, + rank_method, + rank_percentage, +) +from cudf._lib.cpp.types cimport ( + interpolation, + null_order, + null_policy, + order, + size_type, +) # TODO: If/when https://github.com/cython/cython/issues/1271 is resolved, we @@ -34,35 +49,340 @@ cdef class Aggregation: return dereference(self.c_ptr).kind -cdef class SumAggregation(Aggregation): +# The below aggregation types are effectively mixins used to implement specific +# types of aggregations. They function like mixins when combined with the +# algorithm-specific classes below. The factory methods all construct an object +# of type cls but save that to a variable of the base class Aggregation, +# allowing the logic of the function to be generic regardless of which concrete +# algorithm-type the mixin is mixed into. Since the methods are pure Python, +# the end result is an object that Python will dynamically resolve to the +# correct type at the call site. +class SumAggregation(Aggregation): @classmethod def sum(cls): - # We construct using cls but cdef as the base class because Cython - # objects are effectively pointers so this will polymorphically store - # the derived object. This allows us to construct the desired derived - # object without needing to override this method in the base class. cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[rolling_aggregation] ptr = move( - cpp_aggregation.make_sum_aggregation[rolling_aggregation]()) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_sum_aggregation[aggregation]()) out.c_ptr = ptr.release() return out -cdef class RollingAggregation(SumAggregation): +class ProductAggregation(Aggregation): + @classmethod + def product(cls): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_product_aggregation[aggregation]()) + out.c_ptr = ptr.release() + return out + + +class MinAggregation(Aggregation): + @classmethod + def min(cls): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_min_aggregation[aggregation]()) + out.c_ptr = ptr.release() + return out + + +class MaxAggregation(Aggregation): + @classmethod + def max(cls): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_max_aggregation[aggregation]()) + out.c_ptr = ptr.release() + return out + + +class CountAggregation(Aggregation): + @classmethod + def count(cls, null_policy null_handling): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_count_aggregation[aggregation](null_handling)) + out.c_ptr = ptr.release() + return out + + +class AnyAggregation(Aggregation): + @classmethod + def any(cls): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_any_aggregation[aggregation]()) + out.c_ptr = ptr.release() + return out + + +class AllAggregation(Aggregation): + @classmethod + def all(cls): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_any_aggregation[aggregation]()) + out.c_ptr = ptr.release() + return out + + +class SumOfSquaresAggregation(Aggregation): + @classmethod + def sum_of_squares(cls): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_product_aggregation[aggregation]()) + out.c_ptr = ptr.release() + return out + + +class MeanAggregation(Aggregation): + @classmethod + def mean(cls): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_mean_aggregation[aggregation]()) + out.c_ptr = ptr.release() + return out + + +class VarianceAggregation(Aggregation): + @classmethod + def variance(cls, size_type ddof): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_variance_aggregation[aggregation](ddof)) + out.c_ptr = ptr.release() + return out + + +class StdAggregation(Aggregation): + @classmethod + def std(cls, size_type ddof): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_std_aggregation[aggregation](ddof)) + out.c_ptr = ptr.release() + return out + + +class MedianAggregation(Aggregation): + @classmethod + def median(cls): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_median_aggregation[aggregation]()) + out.c_ptr = ptr.release() + return out + + +class QuantileAggregation(Aggregation): + @classmethod + def quantile(cls, list quantiles, interpolation interp): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_quantile_aggregation[aggregation](quantiles, interp)) + out.c_ptr = ptr.release() + return out + + +class ArgmaxAggregation(Aggregation): + @classmethod + def argmax(cls): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_argmax_aggregation[aggregation]()) + out.c_ptr = ptr.release() + return out + + +class ArgminAggregation(Aggregation): + @classmethod + def argmin(cls): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_argmin_aggregation[aggregation]()) + out.c_ptr = ptr.release() + return out + + +class NuniqueAggregation(Aggregation): + @classmethod + def nunique(cls): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_nunique_aggregation[aggregation]()) + out.c_ptr = ptr.release() + return out + + +class NthElementAggregation(Aggregation): + @classmethod + def nth_element(cls, size_type n, null_policy null_handling): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_nth_element_aggregation[aggregation](n, null_handling)) + out.c_ptr = ptr.release() + return out + + +class CollectListAggregation(Aggregation): + @classmethod + def collect_list(cls): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_collect_list_aggregation[aggregation]()) + out.c_ptr = ptr.release() + return out + + +class CollectSetAggregation(Aggregation): + @classmethod + def collect_set(cls): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_collect_set_aggregation[aggregation]()) + out.c_ptr = ptr.release() + return out + + +# class UdfAggregation(Aggregation): +# @classmethod +# def udf( +# cls, +# udf_type type, +# string user_defined_aggregator, +# data_type output_type +# ): + + +class CorrelationAggregation(Aggregation): + @classmethod + def correlation(cls, correlation_type type, size_type min_periods): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_correlation_aggregation[aggregation]( + type, min_periods + ) + ) + out.c_ptr = ptr.release() + return out + + +class CovarianceAggregation(Aggregation): + @classmethod + def covariance(cls, size_type min_periods, size_type ddof): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_covariance_aggregation[aggregation]( + min_periods, ddof + ) + ) + out.c_ptr = ptr.release() + return out + + +class RankAggregation(Aggregation): + @classmethod + def rank( + cls, + rank_method method, + order column_order, + null_policy null_handling, + null_order null_precedence, + rank_percentage percentage + ): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_rank_aggregation[aggregation]( + method, column_order, null_handling, null_precedence, percentage + ) + ) + out.c_ptr = ptr.release() + return out + + +# The following are the concrete aggregation types corresponding to aggregation +# algorithms. +class RollingAggregation( + SumAggregation, + MinAggregation, + MaxAggregation, + CountAggregation, + MeanAggregation, + StdAggregation, + VarianceAggregation, + ArgmaxAggregation, + ArgminAggregation, + NthElementAggregation, + RankAggregation, + CollectListAggregation, + CollectSetAggregation, +): pass -cdef class GroupbyAggregation(Aggregation): +class GroupbyAggregation( + SumAggregation, + ProductAggregation, + MinAggregation, + MaxAggregation, + CountAggregation, + SumOfSquaresAggregation, + MeanAggregation, + StdAggregation, + VarianceAggregation, + MedianAggregation, + QuantileAggregation, + ArgmaxAggregation, + ArgminAggregation, + NuniqueAggregation, + NthElementAggregation, + CollectListAggregation, + CollectSetAggregation, + CovarianceAggregation, + CorrelationAggregation, +): pass -cdef class GroupbyScanAggregation(Aggregation): +class GroupbyScanAggregation( + SumAggregation, + MinAggregation, + MaxAggregation, + CountAggregation, + RankAggregation, +): pass -cdef class ReduceAggregation(Aggregation): +class ReduceAggregation( + SumAggregation, + ProductAggregation, + MinAggregation, + MaxAggregation, + AnyAggregation, + AllAggregation, + SumOfSquaresAggregation, + MeanAggregation, + StdAggregation, + VarianceAggregation, + MedianAggregation, + QuantileAggregation, + NuniqueAggregation, + NthElementAggregation, + CollectListAggregation, + CollectSetAggregation, +): pass -cdef class ScanAggregation(Aggregation): +class ScanAggregation( + SumAggregation, + ProductAggregation, + MinAggregation, + MaxAggregation, + RankAggregation, +): pass From cdadd4a3b19b242df969bc4958caf1a4106bcad0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 26 Jan 2024 10:08:25 +0000 Subject: [PATCH 06/34] Consolidate logic into some simple helpers --- python/cudf/cudf/_lib/aggregation.pyx | 20 +- python/cudf/cudf/_lib/cpp/aggregation.pxd | 12 +- .../cudf/cudf/_lib/pylibcudf/aggregation.pxd | 17 ++ .../cudf/cudf/_lib/pylibcudf/aggregation.pyx | 177 +++++++++--------- 4 files changed, 133 insertions(+), 93 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 7714676cf65..b202d08ac2e 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -191,7 +191,7 @@ cdef class RollingAggregation: cdef RollingAggregation agg = cls() agg.c_obj = move( libcudf_aggregation.make_collect_list_aggregation[ - rolling_aggregation]()) + rolling_aggregation](libcudf_types.null_policy.INCLUDE)) return agg @classmethod @@ -335,7 +335,9 @@ cdef class GroupbyAggregation: cdef GroupbyAggregation agg = cls() agg.c_obj = move( libcudf_aggregation. - make_collect_list_aggregation[groupby_aggregation]()) + make_collect_list_aggregation[groupby_aggregation]( + libcudf_types.null_policy.INCLUDE + )) return agg @classmethod @@ -343,7 +345,9 @@ cdef class GroupbyAggregation: cdef GroupbyAggregation agg = cls() agg.c_obj = move( libcudf_aggregation. - make_nunique_aggregation[groupby_aggregation]()) + make_nunique_aggregation[groupby_aggregation]( + libcudf_types.null_policy.EXCLUDE + )) return agg @classmethod @@ -422,7 +426,11 @@ cdef class GroupbyAggregation: cdef GroupbyAggregation agg = cls() agg.c_obj = move( libcudf_aggregation. - make_collect_set_aggregation[groupby_aggregation]()) + make_collect_set_aggregation[groupby_aggregation]( + libcudf_types.null_policy.INCLUDE, + libcudf_types.null_equality.EQUAL, + libcudf_types.nan_equality.ALL_EQUAL, + )) return agg @classmethod @@ -724,7 +732,9 @@ cdef class ReduceAggregation: def nunique(cls): cdef ReduceAggregation agg = cls() agg.c_obj = move( - libcudf_aggregation.make_nunique_aggregation[reduce_aggregation]()) + libcudf_aggregation.make_nunique_aggregation[reduce_aggregation]( + libcudf_types.null_policy.EXCLUDE + )) return agg @classmethod diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index b9588d64053..7e8bb4e7775 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -8,6 +8,8 @@ from libcpp.vector cimport vector from cudf._lib.cpp.types cimport ( data_type, interpolation, + nan_equality, + null_equality, null_order, null_policy, order, @@ -119,16 +121,20 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: cdef unique_ptr[T] make_argmin_aggregation[T]() except + - cdef unique_ptr[T] make_nunique_aggregation[T]() except + + cdef unique_ptr[T] make_nunique_aggregation[T](null_policy null_handling) except + cdef unique_ptr[T] make_nth_element_aggregation[T]( size_type n, null_policy null_handling ) except + - cdef unique_ptr[T] make_collect_list_aggregation[T]() except + + cdef unique_ptr[T] make_collect_list_aggregation[T]( + null_policy null_handling + ) except + - cdef unique_ptr[T] make_collect_set_aggregation[T]() except + + cdef unique_ptr[T] make_collect_set_aggregation[T]( + null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal + ) except + cdef unique_ptr[T] make_udf_aggregation[T]( udf_type type, diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd index 5df9489b106..86e9f64ef32 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd @@ -1,9 +1,26 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr + from cudf._lib.cpp cimport aggregation as cpp_aggregation from cudf._lib.cpp.aggregation cimport Kind as kind_t, aggregation +from cudf._lib.cpp.types cimport null_policy cdef class Aggregation: cdef aggregation *c_ptr cpdef kind_t kind(self) + + +ctypedef unique_ptr[aggregation](*nullary_factory_type)() except + +ctypedef unique_ptr[aggregation](*unary_null_handling_factory_type)( + null_policy +) except + + + +cdef Aggregation _create_nullary_agg(cls, nullary_factory_type cpp_agg_factory) +cdef Aggregation _create_unary_null_handling_agg( + cls, + unary_null_handling_factory_type cpp_agg_factory, + null_policy null_handling, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index 9e50ec1844e..545f259e678 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -15,6 +15,8 @@ from cudf._lib.cpp.aggregation cimport ( ) from cudf._lib.cpp.types cimport ( interpolation, + nan_equality, + null_equality, null_order, null_policy, order, @@ -22,9 +24,6 @@ from cudf._lib.cpp.types cimport ( ) -# TODO: If/when https://github.com/cython/cython/issues/1271 is resolved, we -# should migrate the various factories to cpdef classmethods instead of Python -# def methods. cdef class Aggregation: """A wrapper for aggregations. @@ -49,6 +48,28 @@ cdef class Aggregation: return dereference(self.c_ptr).kind +cdef Aggregation _create_nullary_agg(cls, nullary_factory_type cpp_agg_factory): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move(cpp_agg_factory()) + out.c_ptr = ptr.release() + return out + + +cdef Aggregation _create_unary_null_handling_agg( + cls, + unary_null_handling_factory_type cpp_agg_factory, + null_policy null_handling, +): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move(cpp_agg_factory(null_handling)) + out.c_ptr = ptr.release() + return out + + +# TODO: If/when https://github.com/cython/cython/issues/1271 is resolved, we +# should migrate the various factories to cpdef classmethods instead of Python +# def methods. + # The below aggregation types are effectively mixins used to implement specific # types of aggregations. They function like mixins when combined with the # algorithm-specific classes below. The factory methods all construct an object @@ -60,91 +81,76 @@ cdef class Aggregation: class SumAggregation(Aggregation): @classmethod def sum(cls): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_sum_aggregation[aggregation]()) - out.c_ptr = ptr.release() - return out + return _create_nullary_agg( + cls, cpp_aggregation.make_sum_aggregation[aggregation] + ) class ProductAggregation(Aggregation): @classmethod def product(cls): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_product_aggregation[aggregation]()) - out.c_ptr = ptr.release() - return out + return _create_nullary_agg( + cls, cpp_aggregation.make_product_aggregation[aggregation] + ) class MinAggregation(Aggregation): @classmethod def min(cls): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_min_aggregation[aggregation]()) - out.c_ptr = ptr.release() - return out + return _create_nullary_agg( + cls, cpp_aggregation.make_min_aggregation[aggregation] + ) class MaxAggregation(Aggregation): @classmethod def max(cls): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_max_aggregation[aggregation]()) - out.c_ptr = ptr.release() - return out + return _create_nullary_agg( + cls, cpp_aggregation.make_max_aggregation[aggregation] + ) class CountAggregation(Aggregation): @classmethod def count(cls, null_policy null_handling): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_count_aggregation[aggregation](null_handling)) - out.c_ptr = ptr.release() - return out + return _create_unary_null_handling_agg( + cls, + cpp_aggregation.make_count_aggregation[aggregation], + null_handling, + ) class AnyAggregation(Aggregation): @classmethod def any(cls): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_any_aggregation[aggregation]()) - out.c_ptr = ptr.release() - return out + return _create_nullary_agg( + cls, cpp_aggregation.make_any_aggregation[aggregation], + ) class AllAggregation(Aggregation): @classmethod def all(cls): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_any_aggregation[aggregation]()) - out.c_ptr = ptr.release() - return out + return _create_nullary_agg( + cls, cpp_aggregation.make_all_aggregation[aggregation] + ) class SumOfSquaresAggregation(Aggregation): @classmethod def sum_of_squares(cls): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_product_aggregation[aggregation]()) - out.c_ptr = ptr.release() - return out + return _create_nullary_agg( + cls, cpp_aggregation.make_sum_of_squares_aggregation[aggregation], + ) class MeanAggregation(Aggregation): @classmethod def mean(cls): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_mean_aggregation[aggregation]()) - out.c_ptr = ptr.release() - return out + return _create_nullary_agg( + cls, + cpp_aggregation.make_mean_aggregation[aggregation], + ) class VarianceAggregation(Aggregation): @@ -170,16 +176,14 @@ class StdAggregation(Aggregation): class MedianAggregation(Aggregation): @classmethod def median(cls): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_median_aggregation[aggregation]()) - out.c_ptr = ptr.release() - return out + return _create_nullary_agg( + cls, cpp_aggregation.make_median_aggregation[aggregation] + ) class QuantileAggregation(Aggregation): @classmethod - def quantile(cls, list quantiles, interpolation interp): + def quantile(cls, list quantiles, interpolation interp = interpolation.LINEAR): cdef Aggregation out = cls.__new__(cls) cdef unique_ptr[aggregation] ptr = move( cpp_aggregation.make_quantile_aggregation[aggregation](quantiles, interp)) @@ -190,36 +194,32 @@ class QuantileAggregation(Aggregation): class ArgmaxAggregation(Aggregation): @classmethod def argmax(cls): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_argmax_aggregation[aggregation]()) - out.c_ptr = ptr.release() - return out + return _create_nullary_agg( + cls, cpp_aggregation.make_argmax_aggregation[aggregation] + ) class ArgminAggregation(Aggregation): @classmethod def argmin(cls): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_argmin_aggregation[aggregation]()) - out.c_ptr = ptr.release() - return out + return _create_nullary_agg( + cls, cpp_aggregation.make_argmin_aggregation[aggregation] + ) class NuniqueAggregation(Aggregation): @classmethod - def nunique(cls): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_nunique_aggregation[aggregation]()) - out.c_ptr = ptr.release() - return out + def nunique(cls, null_policy null_handling = null_policy.EXCLUDE): + return _create_unary_null_handling_agg( + cls, + cpp_aggregation.make_nunique_aggregation[aggregation], + null_handling, + ) class NthElementAggregation(Aggregation): @classmethod - def nth_element(cls, size_type n, null_policy null_handling): + def nth_element(cls, size_type n, null_policy null_handling = null_policy.INCLUDE): cdef Aggregation out = cls.__new__(cls) cdef unique_ptr[aggregation] ptr = move( cpp_aggregation.make_nth_element_aggregation[aggregation](n, null_handling)) @@ -229,20 +229,27 @@ class NthElementAggregation(Aggregation): class CollectListAggregation(Aggregation): @classmethod - def collect_list(cls): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_collect_list_aggregation[aggregation]()) - out.c_ptr = ptr.release() - return out + def collect_list(cls, null_policy null_handling = null_policy.INCLUDE): + return _create_unary_null_handling_agg( + cls, + cpp_aggregation.make_collect_list_aggregation[aggregation], + null_handling, + ) class CollectSetAggregation(Aggregation): @classmethod - def collect_set(cls): + def collect_set( + cls, + null_handling = null_policy.INCLUDE, + nulls_equal = null_equality.EQUAL, + nans_equal = nan_equality.ALL_EQUAL, + ): cdef Aggregation out = cls.__new__(cls) cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_collect_set_aggregation[aggregation]()) + cpp_aggregation.make_collect_set_aggregation[aggregation]( + null_handling, nulls_equal, nans_equal + )) out.c_ptr = ptr.release() return out @@ -286,12 +293,12 @@ class CovarianceAggregation(Aggregation): class RankAggregation(Aggregation): @classmethod def rank( - cls, - rank_method method, - order column_order, - null_policy null_handling, - null_order null_precedence, - rank_percentage percentage + cls, + rank_method method, + order column_order = order.ASCENDING, + null_policy null_handling = null_policy.EXCLUDE, + null_order null_precedence = null_order.AFTER, + rank_percentage percentage = rank_percentage.NONE, ): cdef Aggregation out = cls.__new__(cls) cdef unique_ptr[aggregation] ptr = move( From daf971c14ad294a4099db30c0a6d8b42b6cb7331 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 26 Jan 2024 10:18:47 +0000 Subject: [PATCH 07/34] Add some notes --- python/cudf/cudf/_lib/pylibcudf/aggregation.pyx | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index 545f259e678..127a2f0ab20 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -23,6 +23,20 @@ from cudf._lib.cpp.types cimport ( size_type, ) +# Notes: +# - We use raw pointers because Cython does not understand converting between +# compatible unique_ptr types +# - We invert the inheritance hierarchy from the C++ side. In C++, the +# operation classes inherit from the algorithm classes, which are effectively +# used as tags to determine which operations are supported. This pattern is +# useful in C++ because it provides compile-time checks on what can be +# constructed. In Python we cannot leverage this information though since we +# work with a precompiled libcudf library so the symbols are fully +# predetermined and the template definitions are not available. Moreover, +# Cython's knowledge of templating is insufficient. Therefore, we have to +# manage the set of supported operations by algorithm. The inverted hierarchy +# and use of mixins is the cleanest approach for doing that. + cdef class Aggregation: """A wrapper for aggregations. @@ -148,8 +162,7 @@ class MeanAggregation(Aggregation): @classmethod def mean(cls): return _create_nullary_agg( - cls, - cpp_aggregation.make_mean_aggregation[aggregation], + cls, cpp_aggregation.make_mean_aggregation[aggregation], ) From 48c3a853cbb3251f818be1d06497cfd61b819db2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 28 Jan 2024 20:35:59 +0000 Subject: [PATCH 08/34] Move all aggregations into the same class. --- .../cudf/cudf/_lib/pylibcudf/aggregation.pyx | 182 ++---------------- 1 file changed, 21 insertions(+), 161 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index 127a2f0ab20..0d4745be4e4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -38,6 +38,24 @@ from cudf._lib.cpp.types cimport ( # and use of mixins is the cleanest approach for doing that. +cdef Aggregation _create_nullary_agg(cls, nullary_factory_type cpp_agg_factory): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move(cpp_agg_factory()) + out.c_ptr = ptr.release() + return out + + +cdef Aggregation _create_unary_null_handling_agg( + cls, + unary_null_handling_factory_type cpp_agg_factory, + null_policy null_handling, +): + cdef Aggregation out = cls.__new__(cls) + cdef unique_ptr[aggregation] ptr = move(cpp_agg_factory(null_handling)) + out.c_ptr = ptr.release() + return out + + cdef class Aggregation: """A wrapper for aggregations. @@ -61,70 +79,33 @@ cdef class Aggregation: """Get the kind of the aggregation.""" return dereference(self.c_ptr).kind - -cdef Aggregation _create_nullary_agg(cls, nullary_factory_type cpp_agg_factory): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move(cpp_agg_factory()) - out.c_ptr = ptr.release() - return out - - -cdef Aggregation _create_unary_null_handling_agg( - cls, - unary_null_handling_factory_type cpp_agg_factory, - null_policy null_handling, -): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move(cpp_agg_factory(null_handling)) - out.c_ptr = ptr.release() - return out - - -# TODO: If/when https://github.com/cython/cython/issues/1271 is resolved, we -# should migrate the various factories to cpdef classmethods instead of Python -# def methods. - -# The below aggregation types are effectively mixins used to implement specific -# types of aggregations. They function like mixins when combined with the -# algorithm-specific classes below. The factory methods all construct an object -# of type cls but save that to a variable of the base class Aggregation, -# allowing the logic of the function to be generic regardless of which concrete -# algorithm-type the mixin is mixed into. Since the methods are pure Python, -# the end result is an object that Python will dynamically resolve to the -# correct type at the call site. -class SumAggregation(Aggregation): + # TODO: If/when https://github.com/cython/cython/issues/1271 is resolved, we + # should migrate the various factories to cpdef classmethods instead of Python + # def methods. @classmethod def sum(cls): return _create_nullary_agg( cls, cpp_aggregation.make_sum_aggregation[aggregation] ) - -class ProductAggregation(Aggregation): @classmethod def product(cls): return _create_nullary_agg( cls, cpp_aggregation.make_product_aggregation[aggregation] ) - -class MinAggregation(Aggregation): @classmethod def min(cls): return _create_nullary_agg( cls, cpp_aggregation.make_min_aggregation[aggregation] ) - -class MaxAggregation(Aggregation): @classmethod def max(cls): return _create_nullary_agg( cls, cpp_aggregation.make_max_aggregation[aggregation] ) - -class CountAggregation(Aggregation): @classmethod def count(cls, null_policy null_handling): return _create_unary_null_handling_agg( @@ -133,40 +114,30 @@ class CountAggregation(Aggregation): null_handling, ) - -class AnyAggregation(Aggregation): @classmethod def any(cls): return _create_nullary_agg( cls, cpp_aggregation.make_any_aggregation[aggregation], ) - -class AllAggregation(Aggregation): @classmethod def all(cls): return _create_nullary_agg( cls, cpp_aggregation.make_all_aggregation[aggregation] ) - -class SumOfSquaresAggregation(Aggregation): @classmethod def sum_of_squares(cls): return _create_nullary_agg( cls, cpp_aggregation.make_sum_of_squares_aggregation[aggregation], ) - -class MeanAggregation(Aggregation): @classmethod def mean(cls): return _create_nullary_agg( cls, cpp_aggregation.make_mean_aggregation[aggregation], ) - -class VarianceAggregation(Aggregation): @classmethod def variance(cls, size_type ddof): cdef Aggregation out = cls.__new__(cls) @@ -175,8 +146,6 @@ class VarianceAggregation(Aggregation): out.c_ptr = ptr.release() return out - -class StdAggregation(Aggregation): @classmethod def std(cls, size_type ddof): cdef Aggregation out = cls.__new__(cls) @@ -185,16 +154,12 @@ class StdAggregation(Aggregation): out.c_ptr = ptr.release() return out - -class MedianAggregation(Aggregation): @classmethod def median(cls): return _create_nullary_agg( cls, cpp_aggregation.make_median_aggregation[aggregation] ) - -class QuantileAggregation(Aggregation): @classmethod def quantile(cls, list quantiles, interpolation interp = interpolation.LINEAR): cdef Aggregation out = cls.__new__(cls) @@ -203,24 +168,18 @@ class QuantileAggregation(Aggregation): out.c_ptr = ptr.release() return out - -class ArgmaxAggregation(Aggregation): @classmethod def argmax(cls): return _create_nullary_agg( cls, cpp_aggregation.make_argmax_aggregation[aggregation] ) - -class ArgminAggregation(Aggregation): @classmethod def argmin(cls): return _create_nullary_agg( cls, cpp_aggregation.make_argmin_aggregation[aggregation] ) - -class NuniqueAggregation(Aggregation): @classmethod def nunique(cls, null_policy null_handling = null_policy.EXCLUDE): return _create_unary_null_handling_agg( @@ -229,8 +188,6 @@ class NuniqueAggregation(Aggregation): null_handling, ) - -class NthElementAggregation(Aggregation): @classmethod def nth_element(cls, size_type n, null_policy null_handling = null_policy.INCLUDE): cdef Aggregation out = cls.__new__(cls) @@ -239,8 +196,6 @@ class NthElementAggregation(Aggregation): out.c_ptr = ptr.release() return out - -class CollectListAggregation(Aggregation): @classmethod def collect_list(cls, null_policy null_handling = null_policy.INCLUDE): return _create_unary_null_handling_agg( @@ -249,8 +204,6 @@ class CollectListAggregation(Aggregation): null_handling, ) - -class CollectSetAggregation(Aggregation): @classmethod def collect_set( cls, @@ -266,8 +219,6 @@ class CollectSetAggregation(Aggregation): out.c_ptr = ptr.release() return out - -# class UdfAggregation(Aggregation): # @classmethod # def udf( # cls, @@ -276,8 +227,6 @@ class CollectSetAggregation(Aggregation): # data_type output_type # ): - -class CorrelationAggregation(Aggregation): @classmethod def correlation(cls, correlation_type type, size_type min_periods): cdef Aggregation out = cls.__new__(cls) @@ -289,8 +238,6 @@ class CorrelationAggregation(Aggregation): out.c_ptr = ptr.release() return out - -class CovarianceAggregation(Aggregation): @classmethod def covariance(cls, size_type min_periods, size_type ddof): cdef Aggregation out = cls.__new__(cls) @@ -302,8 +249,6 @@ class CovarianceAggregation(Aggregation): out.c_ptr = ptr.release() return out - -class RankAggregation(Aggregation): @classmethod def rank( cls, @@ -321,88 +266,3 @@ class RankAggregation(Aggregation): ) out.c_ptr = ptr.release() return out - - -# The following are the concrete aggregation types corresponding to aggregation -# algorithms. -class RollingAggregation( - SumAggregation, - MinAggregation, - MaxAggregation, - CountAggregation, - MeanAggregation, - StdAggregation, - VarianceAggregation, - ArgmaxAggregation, - ArgminAggregation, - NthElementAggregation, - RankAggregation, - CollectListAggregation, - CollectSetAggregation, -): - pass - - -class GroupbyAggregation( - SumAggregation, - ProductAggregation, - MinAggregation, - MaxAggregation, - CountAggregation, - SumOfSquaresAggregation, - MeanAggregation, - StdAggregation, - VarianceAggregation, - MedianAggregation, - QuantileAggregation, - ArgmaxAggregation, - ArgminAggregation, - NuniqueAggregation, - NthElementAggregation, - CollectListAggregation, - CollectSetAggregation, - CovarianceAggregation, - CorrelationAggregation, -): - pass - - -class GroupbyScanAggregation( - SumAggregation, - MinAggregation, - MaxAggregation, - CountAggregation, - RankAggregation, -): - pass - - -class ReduceAggregation( - SumAggregation, - ProductAggregation, - MinAggregation, - MaxAggregation, - AnyAggregation, - AllAggregation, - SumOfSquaresAggregation, - MeanAggregation, - StdAggregation, - VarianceAggregation, - MedianAggregation, - QuantileAggregation, - NuniqueAggregation, - NthElementAggregation, - CollectListAggregation, - CollectSetAggregation, -): - pass - - -class ScanAggregation( - SumAggregation, - ProductAggregation, - MinAggregation, - MaxAggregation, - RankAggregation, -): - pass From e57a7550cd16ec17fa04065d49b731760c708642 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 28 Jan 2024 20:58:32 +0000 Subject: [PATCH 09/34] Implement helper for generating groupby aggregations --- python/cudf/cudf/_lib/cpp/aggregation.pxd | 1 + .../cudf/cudf/_lib/pylibcudf/aggregation.pxd | 8 ++++++- .../cudf/cudf/_lib/pylibcudf/aggregation.pyx | 21 +++++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 7e8bb4e7775..16f48b30a50 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -52,6 +52,7 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: cdef cppclass aggregation: Kind kind + unique_ptr[aggregation] clone() cdef cppclass rolling_aggregation(aggregation): pass diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd index 86e9f64ef32..ea72ab6be1c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd @@ -3,13 +3,19 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp cimport aggregation as cpp_aggregation -from cudf._lib.cpp.aggregation cimport Kind as kind_t, aggregation +from cudf._lib.cpp.aggregation cimport ( + Kind as kind_t, + aggregation, + groupby_aggregation, +) from cudf._lib.cpp.types cimport null_policy +ctypedef groupby_aggregation * gba_ptr cdef class Aggregation: cdef aggregation *c_ptr cpdef kind_t kind(self) + cdef unique_ptr[groupby_aggregation] make_groupby_copy(self) except * ctypedef unique_ptr[aggregation](*nullary_factory_type)() except + diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index 0d4745be4e4..c78c02e2b90 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from cython.operator cimport dereference +from libcpp.cast cimport dynamic_cast from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -10,6 +11,7 @@ from cudf._lib.cpp.aggregation cimport ( Kind as kind_t, aggregation, correlation_type, + groupby_aggregation, rank_method, rank_percentage, ) @@ -79,6 +81,25 @@ cdef class Aggregation: """Get the kind of the aggregation.""" return dereference(self.c_ptr).kind + cdef unique_ptr[groupby_aggregation] make_groupby_copy(self) except *: + """Make a copy of the aggregation that can be used in a groupby. + + This function will raise an exception if the aggregation is not supported + as a groupby aggregation. + """ + cdef unique_ptr[aggregation] agg = self.c_ptr.clone() + cdef aggregation *raw_agg = agg.release() + cdef groupby_aggregation *agg_cast = dynamic_cast[gba_ptr](raw_agg) + if agg_cast is NULL: + del raw_agg + raise TypeError( + "Aggregation type {} is not supported as a groupby aggregation".format( + type(self) + ) + ) + return unique_ptr[groupby_aggregation](NULL) + return unique_ptr[groupby_aggregation](agg_cast) + # TODO: If/when https://github.com/cython/cython/issues/1271 is resolved, we # should migrate the various factories to cpdef classmethods instead of Python # def methods. From e1d4f4c162661f832c3d748ea76a7a50ad4e7982 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 28 Jan 2024 21:03:14 +0000 Subject: [PATCH 10/34] Reduce manual ownership handling and use f-string --- python/cudf/cudf/_lib/pylibcudf/aggregation.pyx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index c78c02e2b90..fc366192803 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -88,16 +88,14 @@ cdef class Aggregation: as a groupby aggregation. """ cdef unique_ptr[aggregation] agg = self.c_ptr.clone() - cdef aggregation *raw_agg = agg.release() + cdef aggregation *raw_agg = agg.get() cdef groupby_aggregation *agg_cast = dynamic_cast[gba_ptr](raw_agg) if agg_cast is NULL: - del raw_agg raise TypeError( - "Aggregation type {} is not supported as a groupby aggregation".format( - type(self) - ) + f"{self.kind().title()} aggregations are not supported by groupby" ) return unique_ptr[groupby_aggregation](NULL) + agg.release() return unique_ptr[groupby_aggregation](agg_cast) # TODO: If/when https://github.com/cython/cython/issues/1271 is resolved, we From fe5014c85da586d1ab6720cfe5c39a3d517225ca Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 28 Jan 2024 21:25:42 +0000 Subject: [PATCH 11/34] Implement groupby --- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 2 +- .../cudf/cudf/_lib/pylibcudf/aggregation.pyx | 2 +- python/cudf/cudf/_lib/pylibcudf/groupby.pxd | 27 ++++++++ python/cudf/cudf/_lib/pylibcudf/groupby.pyx | 65 +++++++++++++++++++ 4 files changed, 94 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/groupby.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/groupby.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index a2047247bfd..0ca0c122c38 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources aggregation.pyx binaryop.pyx column.pyx copying.pyx gpumemoryview.pyx - interop.pyx scalar.pyx table.pyx types.pyx utils.pyx + groupby.pyx interop.pyx scalar.pyx table.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index fc366192803..70c6c7730f2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -92,7 +92,7 @@ cdef class Aggregation: cdef groupby_aggregation *agg_cast = dynamic_cast[gba_ptr](raw_agg) if agg_cast is NULL: raise TypeError( - f"{self.kind().title()} aggregations are not supported by groupby" + f"{str(self.kind()).title()} aggregations are not supported by groupby" ) return unique_ptr[groupby_aggregation](NULL) agg.release() diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd new file mode 100644 index 00000000000..71a974fcedc --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.aggregation cimport aggregation, groupby_aggregation +from cudf._lib.cpp.groupby cimport aggregation_request, groupby + +from .column cimport Column + + +cdef class AggregationRequest: + # The groupby APIs accept vectors of unique_ptrs to aggregation requests. + # This ownership model means that if AggregationRequest owned the + # corresponding C++ object, that object would have to be copied by e.g. + # each groupby.aggregate call to avoid invalidating this object. Therefore, + # this class instead stores only Python/Cython objects and constructs the + # C++ object on the fly as requested. + cdef Column values + cdef list aggregations + cdef aggregation_request to_libcudf(self) except * + + +cdef class GroupBy: + # groupby is not default constructible, so we must allocate it on the heap. + cdef unique_ptr[groupby] c_obj + cpdef aggregate(self, list requests) + cdef groupby * get(self) nogil diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx new file mode 100644 index 00000000000..d4d6609c6c0 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -0,0 +1,65 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.pair cimport pair +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.cpp.groupby cimport ( + aggregation_request, + aggregation_result, + groupby, +) +from cudf._lib.cpp.table.table cimport table + +from .aggregation cimport Aggregation +from .column cimport Column +from .table cimport Table + + +# TODO: This belongs in a separate groupby module eventually. +cdef class AggregationRequest: + def __init__(self, Column values, list aggregations): + self.values = values + self.aggregations = aggregations + + cdef aggregation_request to_libcudf(self) except *: + cdef aggregation_request c_obj + c_obj.values = self.values.view() + + cdef Aggregation agg + for agg in self.aggregations: + c_obj.aggregations.push_back(move(agg.make_groupby_copy())) + return move(c_obj) + + +cdef class GroupBy: + def __init__(self, Table keys): + self.c_obj.reset(new groupby(keys.view())) + + cpdef aggregate(self, list requests): + cdef AggregationRequest request + cdef vector[aggregation_request] c_requests + for request in requests: + c_requests.push_back(move(request.to_libcudf())) + + cdef pair[unique_ptr[table], vector[aggregation_result]] c_res = move( + self.get().aggregate(c_requests) + ) + cdef Table group_keys = Table.from_libcudf(move(c_res.first)) + + # TODO: For now, I'm assuming that all aggregations produce a single + # column. I'm not sure what the exceptions are, but I know there must + # be some. I expect that to be obvious when I start replacing the + # existing libcudf code. + cdef int i + cdef list results = [] + for i in range(c_res.second.size()): + results.append( + Column.from_libcudf(move(c_res.second[i].results[0])) + ) + return group_keys, results + + cdef groupby * get(self) nogil: + """Get the underlying groupby object.""" + return self.c_obj.get() From 366c092e63040a0c238f3a92f35893864545c10f Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 28 Jan 2024 21:25:59 +0000 Subject: [PATCH 12/34] Add to init --- python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 4 +++- python/cudf/cudf/_lib/pylibcudf/__init__.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index f4b8c50eecc..d463e5bd0a1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. # TODO: Verify consistent usage of relative/absolute imports in pylibcudf. -from . cimport binaryop, copying, interop +from . cimport aggregation, binaryop, copying, groupby, interop from .column cimport Column from .gpumemoryview cimport gpumemoryview from .scalar cimport Scalar @@ -15,8 +15,10 @@ __all__ = [ "DataType", "Scalar", "Table", + "aggregation", "binaryop", "copying", "gpumemoryview", + "groupby", "interop", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index a27d80fc5a2..73e2d98f545 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. -from . import binaryop, copying, interop +from . import aggregation, binaryop, copying, groupby, interop from .column import Column from .gpumemoryview import gpumemoryview from .scalar import Scalar @@ -13,8 +13,10 @@ "Scalar", "Table", "TypeId", + "aggregation", "binaryop", "copying", "gpumemoryview", + "groupby", "interop", ] From 0b4a334436c5c6086753ff9e09e79b403f0f45e6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 28 Jan 2024 22:27:03 +0000 Subject: [PATCH 13/34] Fix some bugs --- python/cudf/cudf/_lib/pylibcudf/aggregation.pyx | 8 ++++---- python/cudf/cudf/_lib/pylibcudf/groupby.pyx | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index 70c6c7730f2..ca3aea57fb5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -88,13 +88,13 @@ cdef class Aggregation: as a groupby aggregation. """ cdef unique_ptr[aggregation] agg = self.c_ptr.clone() + # This roundabout casting is required because Cython does not understand + # that unique_ptrs can be cast along class hierarchies. cdef aggregation *raw_agg = agg.get() cdef groupby_aggregation *agg_cast = dynamic_cast[gba_ptr](raw_agg) if agg_cast is NULL: - raise TypeError( - f"{str(self.kind()).title()} aggregations are not supported by groupby" - ) - return unique_ptr[groupby_aggregation](NULL) + agg_repr = str(self.kind()).split(".")[1].title() + raise TypeError(f"{agg_repr} aggregations are not supported by groupby") agg.release() return unique_ptr[groupby_aggregation](agg_cast) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index d4d6609c6c0..bf33ffbe8d3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -48,16 +48,16 @@ cdef class GroupBy: ) cdef Table group_keys = Table.from_libcudf(move(c_res.first)) - # TODO: For now, I'm assuming that all aggregations produce a single - # column. I'm not sure what the exceptions are, but I know there must - # be some. I expect that to be obvious when I start replacing the - # existing libcudf code. - cdef int i + cdef int i, j cdef list results = [] + cdef list inner_results for i in range(c_res.second.size()): - results.append( - Column.from_libcudf(move(c_res.second[i].results[0])) - ) + inner_results = [] + for j in range(c_res.second[i].results.size()): + inner_results.append( + Column.from_libcudf(move(c_res.second[i].results[j])) + ) + results.append(inner_results) return group_keys, results cdef groupby * get(self) nogil: From f60a316a48ae786ec1ff93917ef5cf6199049eff Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 29 Jan 2024 03:41:09 +0000 Subject: [PATCH 14/34] Make all factories cpdef free functions --- .../cudf/cudf/_lib/pylibcudf/aggregation.pxd | 3 +- .../cudf/cudf/_lib/pylibcudf/aggregation.pyx | 370 +++++++++--------- 2 files changed, 181 insertions(+), 192 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd index ea72ab6be1c..78aa7d9de1c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd @@ -24,9 +24,8 @@ ctypedef unique_ptr[aggregation](*unary_null_handling_factory_type)( ) except + -cdef Aggregation _create_nullary_agg(cls, nullary_factory_type cpp_agg_factory) +cdef Aggregation _create_nullary_agg(nullary_factory_type cpp_agg_factory) cdef Aggregation _create_unary_null_handling_agg( - cls, unary_null_handling_factory_type cpp_agg_factory, null_policy null_handling, ) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index ca3aea57fb5..0395ee6532a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -40,24 +40,6 @@ from cudf._lib.cpp.types cimport ( # and use of mixins is the cleanest approach for doing that. -cdef Aggregation _create_nullary_agg(cls, nullary_factory_type cpp_agg_factory): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move(cpp_agg_factory()) - out.c_ptr = ptr.release() - return out - - -cdef Aggregation _create_unary_null_handling_agg( - cls, - unary_null_handling_factory_type cpp_agg_factory, - null_policy null_handling, -): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move(cpp_agg_factory(null_handling)) - out.c_ptr = ptr.release() - return out - - cdef class Aggregation: """A wrapper for aggregations. @@ -73,8 +55,7 @@ cdef class Aggregation: ) def __dealloc__(self): - # Deletion is always handled by child classes. - if type(self) is not Aggregation and self.c_ptr is not NULL: + if self.c_ptr is not NULL: del self.c_ptr cpdef kind_t kind(self): @@ -98,190 +79,199 @@ cdef class Aggregation: agg.release() return unique_ptr[groupby_aggregation](agg_cast) - # TODO: If/when https://github.com/cython/cython/issues/1271 is resolved, we - # should migrate the various factories to cpdef classmethods instead of Python - # def methods. - @classmethod - def sum(cls): - return _create_nullary_agg( - cls, cpp_aggregation.make_sum_aggregation[aggregation] - ) - @classmethod - def product(cls): - return _create_nullary_agg( - cls, cpp_aggregation.make_product_aggregation[aggregation] - ) +cdef Aggregation _create_nullary_agg(nullary_factory_type cpp_agg_factory): + cdef Aggregation out = Aggregation.__new__(Aggregation) + cdef unique_ptr[aggregation] ptr = move(cpp_agg_factory()) + out.c_ptr = ptr.release() + return out - @classmethod - def min(cls): - return _create_nullary_agg( - cls, cpp_aggregation.make_min_aggregation[aggregation] - ) - @classmethod - def max(cls): - return _create_nullary_agg( - cls, cpp_aggregation.make_max_aggregation[aggregation] - ) +cdef Aggregation _create_unary_null_handling_agg( + unary_null_handling_factory_type cpp_agg_factory, + null_policy null_handling, +): + cdef Aggregation out = Aggregation.__new__(Aggregation) + cdef unique_ptr[aggregation] ptr = move(cpp_agg_factory(null_handling)) + out.c_ptr = ptr.release() + return out - @classmethod - def count(cls, null_policy null_handling): - return _create_unary_null_handling_agg( - cls, - cpp_aggregation.make_count_aggregation[aggregation], - null_handling, - ) - @classmethod - def any(cls): - return _create_nullary_agg( - cls, cpp_aggregation.make_any_aggregation[aggregation], - ) +cpdef Aggregation sum(): + return _create_nullary_agg( + cpp_aggregation.make_sum_aggregation[aggregation] + ) - @classmethod - def all(cls): - return _create_nullary_agg( - cls, cpp_aggregation.make_all_aggregation[aggregation] - ) - @classmethod - def sum_of_squares(cls): - return _create_nullary_agg( - cls, cpp_aggregation.make_sum_of_squares_aggregation[aggregation], - ) +cpdef Aggregation product(): + return _create_nullary_agg( + cpp_aggregation.make_product_aggregation[aggregation] + ) - @classmethod - def mean(cls): - return _create_nullary_agg( - cls, cpp_aggregation.make_mean_aggregation[aggregation], - ) - @classmethod - def variance(cls, size_type ddof): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_variance_aggregation[aggregation](ddof)) - out.c_ptr = ptr.release() - return out - - @classmethod - def std(cls, size_type ddof): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_std_aggregation[aggregation](ddof)) - out.c_ptr = ptr.release() - return out - - @classmethod - def median(cls): - return _create_nullary_agg( - cls, cpp_aggregation.make_median_aggregation[aggregation] - ) +cpdef Aggregation min(): + return _create_nullary_agg( + cpp_aggregation.make_min_aggregation[aggregation] + ) - @classmethod - def quantile(cls, list quantiles, interpolation interp = interpolation.LINEAR): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_quantile_aggregation[aggregation](quantiles, interp)) - out.c_ptr = ptr.release() - return out - - @classmethod - def argmax(cls): - return _create_nullary_agg( - cls, cpp_aggregation.make_argmax_aggregation[aggregation] - ) - @classmethod - def argmin(cls): - return _create_nullary_agg( - cls, cpp_aggregation.make_argmin_aggregation[aggregation] - ) +cpdef Aggregation max(): + return _create_nullary_agg( + cpp_aggregation.make_max_aggregation[aggregation] + ) - @classmethod - def nunique(cls, null_policy null_handling = null_policy.EXCLUDE): - return _create_unary_null_handling_agg( - cls, - cpp_aggregation.make_nunique_aggregation[aggregation], - null_handling, - ) - @classmethod - def nth_element(cls, size_type n, null_policy null_handling = null_policy.INCLUDE): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_nth_element_aggregation[aggregation](n, null_handling)) - out.c_ptr = ptr.release() - return out - - @classmethod - def collect_list(cls, null_policy null_handling = null_policy.INCLUDE): - return _create_unary_null_handling_agg( - cls, - cpp_aggregation.make_collect_list_aggregation[aggregation], - null_handling, - ) +cpdef Aggregation count(null_policy null_handling): + return _create_unary_null_handling_agg( + cpp_aggregation.make_count_aggregation[aggregation], + null_handling, + ) + + +cpdef Aggregation any(): + return _create_nullary_agg( + cpp_aggregation.make_any_aggregation[aggregation], + ) + + +cpdef Aggregation all(): + return _create_nullary_agg( + cpp_aggregation.make_all_aggregation[aggregation] + ) + + +cpdef Aggregation sum_of_squares(): + return _create_nullary_agg( + cpp_aggregation.make_sum_of_squares_aggregation[aggregation], + ) + + +cpdef Aggregation mean(): + return _create_nullary_agg( + cpp_aggregation.make_mean_aggregation[aggregation], + ) - @classmethod - def collect_set( - cls, - null_handling = null_policy.INCLUDE, - nulls_equal = null_equality.EQUAL, - nans_equal = nan_equality.ALL_EQUAL, - ): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_collect_set_aggregation[aggregation]( - null_handling, nulls_equal, nans_equal - )) - out.c_ptr = ptr.release() - return out - -# @classmethod -# def udf( -# cls, -# udf_type type, -# string user_defined_aggregator, -# data_type output_type -# ): - - @classmethod - def correlation(cls, correlation_type type, size_type min_periods): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_correlation_aggregation[aggregation]( - type, min_periods - ) + +cpdef Aggregation variance(size_type ddof): + cdef Aggregation out = Aggregation.__new__(Aggregation) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_variance_aggregation[aggregation](ddof)) + out.c_ptr = ptr.release() + return out + + +cpdef Aggregation std(size_type ddof): + cdef Aggregation out = Aggregation.__new__(Aggregation) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_std_aggregation[aggregation](ddof)) + out.c_ptr = ptr.release() + return out + + +cpdef Aggregation median(): + return _create_nullary_agg( + cpp_aggregation.make_median_aggregation[aggregation] + ) + + +cpdef Aggregation quantile(list quantiles, interpolation interp = interpolation.LINEAR): + cdef Aggregation out = Aggregation.__new__(Aggregation) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_quantile_aggregation[aggregation](quantiles, interp)) + out.c_ptr = ptr.release() + return out + + +cpdef Aggregation argmax(): + return _create_nullary_agg( + cpp_aggregation.make_argmax_aggregation[aggregation] + ) + + +cpdef Aggregation argmin(): + return _create_nullary_agg( + cpp_aggregation.make_argmin_aggregation[aggregation] + ) + + +cpdef Aggregation nunique(null_policy null_handling = null_policy.EXCLUDE): + return _create_unary_null_handling_agg( + cpp_aggregation.make_nunique_aggregation[aggregation], + null_handling, + ) + + +cpdef Aggregation nth_element( + size_type n, null_policy null_handling = null_policy.INCLUDE +): + cdef Aggregation out = Aggregation.__new__(Aggregation) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_nth_element_aggregation[aggregation](n, null_handling)) + out.c_ptr = ptr.release() + return out + + +cpdef Aggregation collect_list(null_policy null_handling = null_policy.INCLUDE): + return _create_unary_null_handling_agg( + cpp_aggregation.make_collect_list_aggregation[aggregation], + null_handling, + ) + + +cpdef Aggregation collect_set( + null_handling = null_policy.INCLUDE, + nulls_equal = null_equality.EQUAL, + nans_equal = nan_equality.ALL_EQUAL, +): + cdef Aggregation out = Aggregation.__new__(Aggregation) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_collect_set_aggregation[aggregation]( + null_handling, nulls_equal, nans_equal + )) + out.c_ptr = ptr.release() + return out + +# cpdef Aggregation udf( +# udf_type type, +# string user_defined_aggregator, +# data_type output_type +# ): + + +cpdef Aggregation correlation(correlation_type type, size_type min_periods): + cdef Aggregation out = Aggregation.__new__(Aggregation) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_correlation_aggregation[aggregation]( + type, min_periods ) - out.c_ptr = ptr.release() - return out - - @classmethod - def covariance(cls, size_type min_periods, size_type ddof): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_covariance_aggregation[aggregation]( - min_periods, ddof - ) + ) + out.c_ptr = ptr.release() + return out + + +cpdef Aggregation covariance(size_type min_periods, size_type ddof): + cdef Aggregation out = Aggregation.__new__(Aggregation) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_covariance_aggregation[aggregation]( + min_periods, ddof ) - out.c_ptr = ptr.release() - return out - - @classmethod - def rank( - cls, - rank_method method, - order column_order = order.ASCENDING, - null_policy null_handling = null_policy.EXCLUDE, - null_order null_precedence = null_order.AFTER, - rank_percentage percentage = rank_percentage.NONE, - ): - cdef Aggregation out = cls.__new__(cls) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_rank_aggregation[aggregation]( - method, column_order, null_handling, null_precedence, percentage - ) + ) + out.c_ptr = ptr.release() + return out + + +cpdef Aggregation rank( + rank_method method, + order column_order = order.ASCENDING, + null_policy null_handling = null_policy.EXCLUDE, + null_order null_precedence = null_order.AFTER, + rank_percentage percentage = rank_percentage.NONE, +): + cdef Aggregation out = Aggregation.__new__(Aggregation) + cdef unique_ptr[aggregation] ptr = move( + cpp_aggregation.make_rank_aggregation[aggregation]( + method, column_order, null_handling, null_precedence, percentage ) - out.c_ptr = ptr.release() - return out + ) + out.c_ptr = ptr.release() + return out From acbb5b6c35e1f808bc79ecdfe0f680cba6086987 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 29 Jan 2024 03:54:47 +0000 Subject: [PATCH 15/34] Simplify and unify logic for factories --- .../cudf/cudf/_lib/pylibcudf/aggregation.pxd | 16 +- .../cudf/cudf/_lib/pylibcudf/aggregation.pyx | 172 ++++++++---------- 2 files changed, 78 insertions(+), 110 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd index 78aa7d9de1c..9aeaa3d3130 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd @@ -13,19 +13,9 @@ from cudf._lib.cpp.types cimport null_policy ctypedef groupby_aggregation * gba_ptr cdef class Aggregation: - cdef aggregation *c_ptr + cdef unique_ptr[aggregation] c_ptr cpdef kind_t kind(self) cdef unique_ptr[groupby_aggregation] make_groupby_copy(self) except * - -ctypedef unique_ptr[aggregation](*nullary_factory_type)() except + -ctypedef unique_ptr[aggregation](*unary_null_handling_factory_type)( - null_policy -) except + - - -cdef Aggregation _create_nullary_agg(nullary_factory_type cpp_agg_factory) -cdef Aggregation _create_unary_null_handling_agg( - unary_null_handling_factory_type cpp_agg_factory, - null_policy null_handling, -) + @staticmethod + cdef Aggregation from_libcudf(unique_ptr[aggregation] agg) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index 0395ee6532a..1d47a8b8b79 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -54,10 +54,6 @@ cdef class Aggregation: "Use one of the factory functions." ) - def __dealloc__(self): - if self.c_ptr is not NULL: - del self.c_ptr - cpdef kind_t kind(self): """Get the kind of the aggregation.""" return dereference(self.c_ptr).kind @@ -68,7 +64,7 @@ cdef class Aggregation: This function will raise an exception if the aggregation is not supported as a groupby aggregation. """ - cdef unique_ptr[aggregation] agg = self.c_ptr.clone() + cdef unique_ptr[aggregation] agg = dereference(self.c_ptr).clone() # This roundabout casting is required because Cython does not understand # that unique_ptrs can be cast along class hierarchies. cdef aggregation *raw_agg = agg.get() @@ -79,142 +75,123 @@ cdef class Aggregation: agg.release() return unique_ptr[groupby_aggregation](agg_cast) - -cdef Aggregation _create_nullary_agg(nullary_factory_type cpp_agg_factory): - cdef Aggregation out = Aggregation.__new__(Aggregation) - cdef unique_ptr[aggregation] ptr = move(cpp_agg_factory()) - out.c_ptr = ptr.release() - return out - - -cdef Aggregation _create_unary_null_handling_agg( - unary_null_handling_factory_type cpp_agg_factory, - null_policy null_handling, -): - cdef Aggregation out = Aggregation.__new__(Aggregation) - cdef unique_ptr[aggregation] ptr = move(cpp_agg_factory(null_handling)) - out.c_ptr = ptr.release() - return out + @staticmethod + cdef Aggregation from_libcudf(unique_ptr[aggregation] agg): + """Create a Python Aggregation from a libcudf aggregation.""" + cdef Aggregation out = Aggregation.__new__(Aggregation) + out.c_ptr = move(agg) + return out cpdef Aggregation sum(): - return _create_nullary_agg( - cpp_aggregation.make_sum_aggregation[aggregation] + return Aggregation.from_libcudf( + move(cpp_aggregation.make_sum_aggregation[aggregation]()) ) cpdef Aggregation product(): - return _create_nullary_agg( - cpp_aggregation.make_product_aggregation[aggregation] + return Aggregation.from_libcudf( + move(cpp_aggregation.make_product_aggregation[aggregation]()) ) cpdef Aggregation min(): - return _create_nullary_agg( - cpp_aggregation.make_min_aggregation[aggregation] + return Aggregation.from_libcudf( + move(cpp_aggregation.make_min_aggregation[aggregation]()) ) cpdef Aggregation max(): - return _create_nullary_agg( - cpp_aggregation.make_max_aggregation[aggregation] + return Aggregation.from_libcudf( + move(cpp_aggregation.make_max_aggregation[aggregation]()) ) cpdef Aggregation count(null_policy null_handling): - return _create_unary_null_handling_agg( - cpp_aggregation.make_count_aggregation[aggregation], - null_handling, + return Aggregation.from_libcudf( + move(cpp_aggregation.make_count_aggregation[aggregation](null_handling)) ) cpdef Aggregation any(): - return _create_nullary_agg( - cpp_aggregation.make_any_aggregation[aggregation], + return Aggregation.from_libcudf( + move(cpp_aggregation.make_any_aggregation[aggregation]()) ) cpdef Aggregation all(): - return _create_nullary_agg( - cpp_aggregation.make_all_aggregation[aggregation] + return Aggregation.from_libcudf( + move(cpp_aggregation.make_all_aggregation[aggregation]()) ) cpdef Aggregation sum_of_squares(): - return _create_nullary_agg( - cpp_aggregation.make_sum_of_squares_aggregation[aggregation], + return Aggregation.from_libcudf( + move(cpp_aggregation.make_sum_of_squares_aggregation[aggregation]()) ) cpdef Aggregation mean(): - return _create_nullary_agg( - cpp_aggregation.make_mean_aggregation[aggregation], + return Aggregation.from_libcudf( + move(cpp_aggregation.make_mean_aggregation[aggregation]()) ) cpdef Aggregation variance(size_type ddof): - cdef Aggregation out = Aggregation.__new__(Aggregation) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_variance_aggregation[aggregation](ddof)) - out.c_ptr = ptr.release() - return out + return Aggregation.from_libcudf( + move(cpp_aggregation.make_variance_aggregation[aggregation](ddof)) + ) cpdef Aggregation std(size_type ddof): - cdef Aggregation out = Aggregation.__new__(Aggregation) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_std_aggregation[aggregation](ddof)) - out.c_ptr = ptr.release() - return out + return Aggregation.from_libcudf( + move(cpp_aggregation.make_std_aggregation[aggregation](ddof)) + ) cpdef Aggregation median(): - return _create_nullary_agg( - cpp_aggregation.make_median_aggregation[aggregation] + return Aggregation.from_libcudf( + move(cpp_aggregation.make_median_aggregation[aggregation]()) ) cpdef Aggregation quantile(list quantiles, interpolation interp = interpolation.LINEAR): - cdef Aggregation out = Aggregation.__new__(Aggregation) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_quantile_aggregation[aggregation](quantiles, interp)) - out.c_ptr = ptr.release() - return out + return Aggregation.from_libcudf( + move(cpp_aggregation.make_quantile_aggregation[aggregation](quantiles, interp)) + ) cpdef Aggregation argmax(): - return _create_nullary_agg( - cpp_aggregation.make_argmax_aggregation[aggregation] + return Aggregation.from_libcudf( + move(cpp_aggregation.make_argmax_aggregation[aggregation]()) ) cpdef Aggregation argmin(): - return _create_nullary_agg( - cpp_aggregation.make_argmin_aggregation[aggregation] + return Aggregation.from_libcudf( + move(cpp_aggregation.make_argmin_aggregation[aggregation]()) ) cpdef Aggregation nunique(null_policy null_handling = null_policy.EXCLUDE): - return _create_unary_null_handling_agg( - cpp_aggregation.make_nunique_aggregation[aggregation], - null_handling, + return Aggregation.from_libcudf( + move(cpp_aggregation.make_nunique_aggregation[aggregation](null_handling)) ) cpdef Aggregation nth_element( size_type n, null_policy null_handling = null_policy.INCLUDE ): - cdef Aggregation out = Aggregation.__new__(Aggregation) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_nth_element_aggregation[aggregation](n, null_handling)) - out.c_ptr = ptr.release() - return out + return Aggregation.from_libcudf( + move( + cpp_aggregation.make_nth_element_aggregation[aggregation](n, null_handling) + ) + ) cpdef Aggregation collect_list(null_policy null_handling = null_policy.INCLUDE): - return _create_unary_null_handling_agg( - cpp_aggregation.make_collect_list_aggregation[aggregation], - null_handling, + return Aggregation.from_libcudf( + move(cpp_aggregation.make_collect_list_aggregation[aggregation](null_handling)) ) @@ -223,13 +200,13 @@ cpdef Aggregation collect_set( nulls_equal = null_equality.EQUAL, nans_equal = nan_equality.ALL_EQUAL, ): - cdef Aggregation out = Aggregation.__new__(Aggregation) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_collect_set_aggregation[aggregation]( - null_handling, nulls_equal, nans_equal - )) - out.c_ptr = ptr.release() - return out + return Aggregation.from_libcudf( + move( + cpp_aggregation.make_collect_set_aggregation[aggregation]( + null_handling, nulls_equal, nans_equal + ) + ) + ) # cpdef Aggregation udf( # udf_type type, @@ -239,25 +216,23 @@ cpdef Aggregation collect_set( cpdef Aggregation correlation(correlation_type type, size_type min_periods): - cdef Aggregation out = Aggregation.__new__(Aggregation) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_correlation_aggregation[aggregation]( - type, min_periods + return Aggregation.from_libcudf( + move( + cpp_aggregation.make_correlation_aggregation[aggregation]( + type, min_periods + ) ) ) - out.c_ptr = ptr.release() - return out cpdef Aggregation covariance(size_type min_periods, size_type ddof): - cdef Aggregation out = Aggregation.__new__(Aggregation) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_covariance_aggregation[aggregation]( - min_periods, ddof + return Aggregation.from_libcudf( + move( + cpp_aggregation.make_covariance_aggregation[aggregation]( + min_periods, ddof + ) ) ) - out.c_ptr = ptr.release() - return out cpdef Aggregation rank( @@ -267,11 +242,14 @@ cpdef Aggregation rank( null_order null_precedence = null_order.AFTER, rank_percentage percentage = rank_percentage.NONE, ): - cdef Aggregation out = Aggregation.__new__(Aggregation) - cdef unique_ptr[aggregation] ptr = move( - cpp_aggregation.make_rank_aggregation[aggregation]( - method, column_order, null_handling, null_precedence, percentage + return Aggregation.from_libcudf( + move( + cpp_aggregation.make_rank_aggregation[aggregation]( + method, + column_order, + null_handling, + null_precedence, + percentage, + ) ) ) - out.c_ptr = ptr.release() - return out From 1bef3f1f2dda0da4c83e5c74a1b4cdb526c3e8b0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 29 Jan 2024 03:56:39 +0000 Subject: [PATCH 16/34] Rename method --- python/cudf/cudf/_lib/pylibcudf/aggregation.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/aggregation.pyx | 2 +- python/cudf/cudf/_lib/pylibcudf/groupby.pyx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd index 9aeaa3d3130..4d65a8562cb 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd @@ -15,7 +15,7 @@ ctypedef groupby_aggregation * gba_ptr cdef class Aggregation: cdef unique_ptr[aggregation] c_ptr cpdef kind_t kind(self) - cdef unique_ptr[groupby_aggregation] make_groupby_copy(self) except * + cdef unique_ptr[groupby_aggregation] clone_as_groupby(self) except * @staticmethod cdef Aggregation from_libcudf(unique_ptr[aggregation] agg) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index 1d47a8b8b79..36d7f0d259f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -58,7 +58,7 @@ cdef class Aggregation: """Get the kind of the aggregation.""" return dereference(self.c_ptr).kind - cdef unique_ptr[groupby_aggregation] make_groupby_copy(self) except *: + cdef unique_ptr[groupby_aggregation] clone_as_groupby(self) except *: """Make a copy of the aggregation that can be used in a groupby. This function will raise an exception if the aggregation is not supported diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index bf33ffbe8d3..1c50d8f1a35 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -29,7 +29,7 @@ cdef class AggregationRequest: cdef Aggregation agg for agg in self.aggregations: - c_obj.aggregations.push_back(move(agg.make_groupby_copy())) + c_obj.aggregations.push_back(move(agg.clone_as_groupby())) return move(c_obj) From 898573372af8841f6733809a51b532e79bdc33fe Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 29 Jan 2024 04:19:19 +0000 Subject: [PATCH 17/34] Lots of cleanup --- .../cudf/cudf/_lib/pylibcudf/aggregation.pxd | 6 +- .../cudf/cudf/_lib/pylibcudf/aggregation.pyx | 138 +++++++----------- python/cudf/cudf/_lib/pylibcudf/groupby.pxd | 4 +- python/cudf/cudf/_lib/pylibcudf/groupby.pyx | 12 +- 4 files changed, 63 insertions(+), 97 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd index 4d65a8562cb..7f316de2568 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd @@ -2,7 +2,6 @@ from libcpp.memory cimport unique_ptr -from cudf._lib.cpp cimport aggregation as cpp_aggregation from cudf._lib.cpp.aggregation cimport ( Kind as kind_t, aggregation, @@ -10,12 +9,11 @@ from cudf._lib.cpp.aggregation cimport ( ) from cudf._lib.cpp.types cimport null_policy -ctypedef groupby_aggregation * gba_ptr cdef class Aggregation: - cdef unique_ptr[aggregation] c_ptr + cdef unique_ptr[aggregation] c_obj cpdef kind_t kind(self) - cdef unique_ptr[groupby_aggregation] clone_as_groupby(self) except * + cdef unique_ptr[groupby_aggregation] clone_underlying_as_groupby(self) except * @staticmethod cdef Aggregation from_libcudf(unique_ptr[aggregation] agg) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index 36d7f0d259f..f39684b6979 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -12,6 +12,28 @@ from cudf._lib.cpp.aggregation cimport ( aggregation, correlation_type, groupby_aggregation, + make_all_aggregation, + make_any_aggregation, + make_argmax_aggregation, + make_argmin_aggregation, + make_collect_list_aggregation, + make_collect_set_aggregation, + make_correlation_aggregation, + make_count_aggregation, + make_covariance_aggregation, + make_max_aggregation, + make_mean_aggregation, + make_median_aggregation, + make_min_aggregation, + make_nth_element_aggregation, + make_nunique_aggregation, + make_product_aggregation, + make_quantile_aggregation, + make_rank_aggregation, + make_std_aggregation, + make_sum_aggregation, + make_sum_of_squares_aggregation, + make_variance_aggregation, rank_method, rank_percentage, ) @@ -25,19 +47,8 @@ from cudf._lib.cpp.types cimport ( size_type, ) -# Notes: -# - We use raw pointers because Cython does not understand converting between -# compatible unique_ptr types -# - We invert the inheritance hierarchy from the C++ side. In C++, the -# operation classes inherit from the algorithm classes, which are effectively -# used as tags to determine which operations are supported. This pattern is -# useful in C++ because it provides compile-time checks on what can be -# constructed. In Python we cannot leverage this information though since we -# work with a precompiled libcudf library so the symbols are fully -# predetermined and the template definitions are not available. Moreover, -# Cython's knowledge of templating is insufficient. Therefore, we have to -# manage the set of supported operations by algorithm. The inverted hierarchy -# and use of mixins is the cleanest approach for doing that. +# workaround for https://github.com/cython/cython/issues/3885 +ctypedef groupby_aggregation * gba_ptr cdef class Aggregation: @@ -50,25 +61,22 @@ cdef class Aggregation: """ def __init__(self): raise ValueError( - "Aggregation types should not be constructed directly. " - "Use one of the factory functions." + "Aggregations should not be constructed directly. Use one of the factories." ) cpdef kind_t kind(self): """Get the kind of the aggregation.""" - return dereference(self.c_ptr).kind + return dereference(self.c_obj).kind - cdef unique_ptr[groupby_aggregation] clone_as_groupby(self) except *: - """Make a copy of the aggregation that can be used in a groupby. + cdef unique_ptr[groupby_aggregation] clone_underlying_as_groupby(self) except *: + """Make a copy of the underlying aggregation that can be used in a groupby. - This function will raise an exception if the aggregation is not supported - as a groupby aggregation. + This function will raise an exception if the aggregation is not supported as a + groupby aggregation. This failure to cast translates the per-algorithm + aggregation logic encoded in libcudf's type hierarchy into Python. """ - cdef unique_ptr[aggregation] agg = dereference(self.c_ptr).clone() - # This roundabout casting is required because Cython does not understand - # that unique_ptrs can be cast along class hierarchies. - cdef aggregation *raw_agg = agg.get() - cdef groupby_aggregation *agg_cast = dynamic_cast[gba_ptr](raw_agg) + cdef unique_ptr[aggregation] agg = dereference(self.c_obj).clone() + cdef groupby_aggregation *agg_cast = dynamic_cast[gba_ptr](agg.get()) if agg_cast is NULL: agg_repr = str(self.kind()).split(".")[1].title() raise TypeError(f"{agg_repr} aggregations are not supported by groupby") @@ -79,103 +87,79 @@ cdef class Aggregation: cdef Aggregation from_libcudf(unique_ptr[aggregation] agg): """Create a Python Aggregation from a libcudf aggregation.""" cdef Aggregation out = Aggregation.__new__(Aggregation) - out.c_ptr = move(agg) + out.c_obj = move(agg) return out cpdef Aggregation sum(): - return Aggregation.from_libcudf( - move(cpp_aggregation.make_sum_aggregation[aggregation]()) - ) + return Aggregation.from_libcudf(move(make_sum_aggregation[aggregation]())) cpdef Aggregation product(): - return Aggregation.from_libcudf( - move(cpp_aggregation.make_product_aggregation[aggregation]()) - ) + return Aggregation.from_libcudf(move(make_product_aggregation[aggregation]())) cpdef Aggregation min(): - return Aggregation.from_libcudf( - move(cpp_aggregation.make_min_aggregation[aggregation]()) - ) + return Aggregation.from_libcudf(move(make_min_aggregation[aggregation]())) cpdef Aggregation max(): - return Aggregation.from_libcudf( - move(cpp_aggregation.make_max_aggregation[aggregation]()) - ) + return Aggregation.from_libcudf(move(make_max_aggregation[aggregation]())) cpdef Aggregation count(null_policy null_handling): return Aggregation.from_libcudf( - move(cpp_aggregation.make_count_aggregation[aggregation](null_handling)) + move(make_count_aggregation[aggregation](null_handling)) ) cpdef Aggregation any(): - return Aggregation.from_libcudf( - move(cpp_aggregation.make_any_aggregation[aggregation]()) - ) + return Aggregation.from_libcudf(move(make_any_aggregation[aggregation]())) cpdef Aggregation all(): - return Aggregation.from_libcudf( - move(cpp_aggregation.make_all_aggregation[aggregation]()) - ) + return Aggregation.from_libcudf(move(make_all_aggregation[aggregation]())) cpdef Aggregation sum_of_squares(): return Aggregation.from_libcudf( - move(cpp_aggregation.make_sum_of_squares_aggregation[aggregation]()) + move(make_sum_of_squares_aggregation[aggregation]()) ) cpdef Aggregation mean(): - return Aggregation.from_libcudf( - move(cpp_aggregation.make_mean_aggregation[aggregation]()) - ) + return Aggregation.from_libcudf(move(make_mean_aggregation[aggregation]())) cpdef Aggregation variance(size_type ddof): - return Aggregation.from_libcudf( - move(cpp_aggregation.make_variance_aggregation[aggregation](ddof)) - ) + return Aggregation.from_libcudf(move(make_variance_aggregation[aggregation](ddof))) cpdef Aggregation std(size_type ddof): - return Aggregation.from_libcudf( - move(cpp_aggregation.make_std_aggregation[aggregation](ddof)) - ) + return Aggregation.from_libcudf(move(make_std_aggregation[aggregation](ddof))) cpdef Aggregation median(): - return Aggregation.from_libcudf( - move(cpp_aggregation.make_median_aggregation[aggregation]()) - ) + return Aggregation.from_libcudf(move(make_median_aggregation[aggregation]())) cpdef Aggregation quantile(list quantiles, interpolation interp = interpolation.LINEAR): return Aggregation.from_libcudf( - move(cpp_aggregation.make_quantile_aggregation[aggregation](quantiles, interp)) + move(make_quantile_aggregation[aggregation](quantiles, interp)) ) cpdef Aggregation argmax(): - return Aggregation.from_libcudf( - move(cpp_aggregation.make_argmax_aggregation[aggregation]()) - ) + return Aggregation.from_libcudf(move(make_argmax_aggregation[aggregation]())) cpdef Aggregation argmin(): - return Aggregation.from_libcudf( - move(cpp_aggregation.make_argmin_aggregation[aggregation]()) - ) + return Aggregation.from_libcudf(move(make_argmin_aggregation[aggregation]())) cpdef Aggregation nunique(null_policy null_handling = null_policy.EXCLUDE): return Aggregation.from_libcudf( - move(cpp_aggregation.make_nunique_aggregation[aggregation](null_handling)) + move(make_nunique_aggregation[aggregation](null_handling)) ) @@ -183,15 +167,13 @@ cpdef Aggregation nth_element( size_type n, null_policy null_handling = null_policy.INCLUDE ): return Aggregation.from_libcudf( - move( - cpp_aggregation.make_nth_element_aggregation[aggregation](n, null_handling) - ) + move(make_nth_element_aggregation[aggregation](n, null_handling)) ) cpdef Aggregation collect_list(null_policy null_handling = null_policy.INCLUDE): return Aggregation.from_libcudf( - move(cpp_aggregation.make_collect_list_aggregation[aggregation](null_handling)) + move(make_collect_list_aggregation[aggregation](null_handling)) ) @@ -202,7 +184,7 @@ cpdef Aggregation collect_set( ): return Aggregation.from_libcudf( move( - cpp_aggregation.make_collect_set_aggregation[aggregation]( + make_collect_set_aggregation[aggregation]( null_handling, nulls_equal, nans_equal ) ) @@ -217,21 +199,13 @@ cpdef Aggregation collect_set( cpdef Aggregation correlation(correlation_type type, size_type min_periods): return Aggregation.from_libcudf( - move( - cpp_aggregation.make_correlation_aggregation[aggregation]( - type, min_periods - ) - ) + move(make_correlation_aggregation[aggregation](type, min_periods)) ) cpdef Aggregation covariance(size_type min_periods, size_type ddof): return Aggregation.from_libcudf( - move( - cpp_aggregation.make_covariance_aggregation[aggregation]( - min_periods, ddof - ) - ) + move(make_covariance_aggregation[aggregation](min_periods, ddof)) ) @@ -244,7 +218,7 @@ cpdef Aggregation rank( ): return Aggregation.from_libcudf( move( - cpp_aggregation.make_rank_aggregation[aggregation]( + make_rank_aggregation[aggregation]( method, column_order, null_handling, diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd index 71a974fcedc..b661eec17a4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd @@ -21,7 +21,5 @@ cdef class AggregationRequest: cdef class GroupBy: - # groupby is not default constructible, so we must allocate it on the heap. cdef unique_ptr[groupby] c_obj - cpdef aggregate(self, list requests) - cdef groupby * get(self) nogil + cpdef tuple aggregate(self, list requests) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index 1c50d8f1a35..c43aecaebf0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from cython.operator cimport dereference from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move @@ -17,7 +18,6 @@ from .column cimport Column from .table cimport Table -# TODO: This belongs in a separate groupby module eventually. cdef class AggregationRequest: def __init__(self, Column values, list aggregations): self.values = values @@ -29,7 +29,7 @@ cdef class AggregationRequest: cdef Aggregation agg for agg in self.aggregations: - c_obj.aggregations.push_back(move(agg.clone_as_groupby())) + c_obj.aggregations.push_back(move(agg.clone_underlying_as_groupby())) return move(c_obj) @@ -37,14 +37,14 @@ cdef class GroupBy: def __init__(self, Table keys): self.c_obj.reset(new groupby(keys.view())) - cpdef aggregate(self, list requests): + cpdef tuple aggregate(self, list requests): cdef AggregationRequest request cdef vector[aggregation_request] c_requests for request in requests: c_requests.push_back(move(request.to_libcudf())) cdef pair[unique_ptr[table], vector[aggregation_result]] c_res = move( - self.get().aggregate(c_requests) + dereference(self.c_obj).aggregate(c_requests) ) cdef Table group_keys = Table.from_libcudf(move(c_res.first)) @@ -59,7 +59,3 @@ cdef class GroupBy: ) results.append(inner_results) return group_keys, results - - cdef groupby * get(self) nogil: - """Get the underlying groupby object.""" - return self.c_obj.get() From b25442e0c1b4929699ccafb2cab017c8814e4148 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 29 Jan 2024 04:37:30 +0000 Subject: [PATCH 18/34] Add declarations to pxd --- .../cudf/cudf/_lib/pylibcudf/aggregation.pxd | 70 ++++++++++++++++++- .../cudf/cudf/_lib/pylibcudf/aggregation.pyx | 1 - 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd index 7f316de2568..6ac3813504c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd @@ -5,9 +5,20 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp.aggregation cimport ( Kind as kind_t, aggregation, + correlation_type, groupby_aggregation, + rank_method, + rank_percentage, +) +from cudf._lib.cpp.types cimport ( + interpolation, + nan_equality, + null_equality, + null_order, + null_policy, + order, + size_type, ) -from cudf._lib.cpp.types cimport null_policy cdef class Aggregation: @@ -17,3 +28,60 @@ cdef class Aggregation: @staticmethod cdef Aggregation from_libcudf(unique_ptr[aggregation] agg) + + +cpdef Aggregation sum() + +cpdef Aggregation product() + +cpdef Aggregation min() + +cpdef Aggregation max() + +cpdef Aggregation count(null_policy null_handling) + +cpdef Aggregation any() + +cpdef Aggregation all() + +cpdef Aggregation sum_of_squares() + +cpdef Aggregation mean() + +cpdef Aggregation variance(size_type ddof) + +cpdef Aggregation std(size_type ddof) + +cpdef Aggregation median() + +cpdef Aggregation quantile(list quantiles, interpolation interp = *) + +cpdef Aggregation argmax() + +cpdef Aggregation argmin() + +cpdef Aggregation nunique(null_policy null_handling = *) + +cpdef Aggregation nth_element(size_type n, null_policy null_handling = *) + +cpdef Aggregation collect_list(null_policy null_handling = *) + +cpdef Aggregation collect_set(null_handling = *, nulls_equal = *, nans_equal = *) + +# cpdef Aggregation udf( +# udf_type type, +# string user_defined_aggregator, +# data_type output_type +# ) + +cpdef Aggregation correlation(correlation_type type, size_type min_periods) + +cpdef Aggregation covariance(size_type min_periods, size_type ddof) + +cpdef Aggregation rank( + rank_method method, + order column_order = *, + null_policy null_handling = *, + null_order null_precedence = *, + rank_percentage percentage = *, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index f39684b6979..1aa2ee9f890 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -5,7 +5,6 @@ from libcpp.cast cimport dynamic_cast from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -# Have to alias this so that the Pythonic name gets capitalized # TODO: Expose the enums to Python from cudf._lib.cpp.aggregation cimport ( Kind as kind_t, From 3b2e75c8edd9738427682b7a20a0a9e14567ba2e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 29 Jan 2024 19:04:11 +0000 Subject: [PATCH 19/34] Return list of tables --- python/cudf/cudf/_lib/pylibcudf/groupby.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index c43aecaebf0..663edb9dcb4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -57,5 +57,5 @@ cdef class GroupBy: inner_results.append( Column.from_libcudf(move(c_res.second[i].results[j])) ) - results.append(inner_results) + results.append(Table(inner_results)) return group_keys, results From 31b8d4c05fea759380c69fc714bb6258e272a45d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 29 Jan 2024 20:29:42 +0000 Subject: [PATCH 20/34] Expose enums --- python/cudf/cudf/_lib/pylibcudf/aggregation.pyx | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index 1aa2ee9f890..01b32ba3d41 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -46,6 +46,15 @@ from cudf._lib.cpp.types cimport ( size_type, ) +from cudf._lib.cpp.aggregation import Kind # no-cython-lint +from cudf._lib.cpp.aggregation import \ + correlation_type as CorrelationType # no-cython-lint +from cudf._lib.cpp.aggregation import \ + rank_method as RankMethod # no-cython-lint +from cudf._lib.cpp.aggregation import \ + rank_percentage as RankPercentage # no-cython-lint +from cudf._lib.cpp.aggregation import udf_type as UdfType # no-cython-lint + # workaround for https://github.com/cython/cython/issues/3885 ctypedef groupby_aggregation * gba_ptr From bc2cefb07be807448ce410f5bfb8a79f77840d54 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 29 Jan 2024 22:29:11 +0000 Subject: [PATCH 21/34] Implement udf aggregation --- .../cudf/cudf/_lib/pylibcudf/aggregation.pxd | 8 +++----- .../cudf/cudf/_lib/pylibcudf/aggregation.pyx | 18 +++++++++++++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd index 6ac3813504c..7c707a3e97e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd @@ -20,6 +20,8 @@ from cudf._lib.cpp.types cimport ( size_type, ) +from .types cimport DataType + cdef class Aggregation: cdef unique_ptr[aggregation] c_obj @@ -68,11 +70,7 @@ cpdef Aggregation collect_list(null_policy null_handling = *) cpdef Aggregation collect_set(null_handling = *, nulls_equal = *, nans_equal = *) -# cpdef Aggregation udf( -# udf_type type, -# string user_defined_aggregator, -# data_type output_type -# ) +cpdef Aggregation udf(str operation, DataType output_type) cpdef Aggregation correlation(correlation_type type, size_type min_periods) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index 01b32ba3d41..36bbb615f36 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -32,6 +32,7 @@ from cudf._lib.cpp.aggregation cimport ( make_std_aggregation, make_sum_aggregation, make_sum_of_squares_aggregation, + make_udf_aggregation, make_variance_aggregation, rank_method, rank_percentage, @@ -55,6 +56,8 @@ from cudf._lib.cpp.aggregation import \ rank_percentage as RankPercentage # no-cython-lint from cudf._lib.cpp.aggregation import udf_type as UdfType # no-cython-lint +from .types cimport DataType + # workaround for https://github.com/cython/cython/issues/3885 ctypedef groupby_aggregation * gba_ptr @@ -198,11 +201,16 @@ cpdef Aggregation collect_set( ) ) -# cpdef Aggregation udf( -# udf_type type, -# string user_defined_aggregator, -# data_type output_type -# ): +cpdef Aggregation udf(str operation, DataType output_type): + return Aggregation.from_libcudf( + move( + make_udf_aggregation[aggregation]( + UdfType.PTX, + operation.encode("utf-8"), + output_type.c_obj, + ) + ) + ) cpdef Aggregation correlation(correlation_type type, size_type min_periods): From 391f074849ab1b23423339f1dfd72eec9a5b6fa6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 29 Jan 2024 23:30:57 +0000 Subject: [PATCH 22/34] Add docs and fix a few function defaults --- .../cudf/cudf/_lib/pylibcudf/aggregation.pxd | 6 +- .../cudf/cudf/_lib/pylibcudf/aggregation.pyx | 263 +++++++++++++++++- python/cudf/cudf/_lib/pylibcudf/groupby.pyx | 38 +++ 3 files changed, 295 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd index 7c707a3e97e..801dc8011e8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd @@ -40,7 +40,7 @@ cpdef Aggregation min() cpdef Aggregation max() -cpdef Aggregation count(null_policy null_handling) +cpdef Aggregation count(null_policy null_handling = *) cpdef Aggregation any() @@ -50,9 +50,9 @@ cpdef Aggregation sum_of_squares() cpdef Aggregation mean() -cpdef Aggregation variance(size_type ddof) +cpdef Aggregation variance(size_type ddof = *) -cpdef Aggregation std(size_type ddof) +cpdef Aggregation std(size_type ddof = *) cpdef Aggregation median() diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index 36bbb615f36..b2a319e45fa 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -5,7 +5,6 @@ from libcpp.cast cimport dynamic_cast from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -# TODO: Expose the enums to Python from cudf._lib.cpp.aggregation cimport ( Kind as kind_t, aggregation, @@ -63,12 +62,13 @@ ctypedef groupby_aggregation * gba_ptr cdef class Aggregation: - """A wrapper for aggregations. + """A type of aggregation to perform. - **Neither this class nor any of its subclasses should ever be instantiated - using a standard constructor, only using one of its many factories.** The - factory approach matches the libcudf approach, which is necessary because - some aggregations require additional arguments beyond the kind. + Aggregations are passed to APIs like + :py:func:`~cudf._lib.pylibcudf.groupby.Groupby.aggregate` to indicate what + operations to perform. Using a class for aggregations provides a unified + API for handling parametrizable aggregations. This class should never be + instantiated directly, only via one of the factory functions. """ def __init__(self): raise ValueError( @@ -103,72 +103,212 @@ cdef class Aggregation: cpdef Aggregation sum(): + """Create a sum aggregation. + + Returns + ------- + Aggregation + The sum aggregation. + """ return Aggregation.from_libcudf(move(make_sum_aggregation[aggregation]())) cpdef Aggregation product(): + """Create a product aggregation. + + Returns + ------- + Aggregation + The product aggregation. + """ return Aggregation.from_libcudf(move(make_product_aggregation[aggregation]())) cpdef Aggregation min(): + """Create a min aggregation. + + Returns + ------- + Aggregation + The min aggregation. + """ return Aggregation.from_libcudf(move(make_min_aggregation[aggregation]())) cpdef Aggregation max(): + """Create a max aggregation. + + Returns + ------- + Aggregation + The max aggregation. + """ return Aggregation.from_libcudf(move(make_max_aggregation[aggregation]())) -cpdef Aggregation count(null_policy null_handling): +cpdef Aggregation count(null_policy null_handling = null_policy.EXCLUDE): + """Create a count aggregation. + + Parameters + ---------- + null_handling : null_policy, default EXCLUDE + Whether or not nulls should be included. + + Returns + ------- + Aggregation + The count aggregation. + """ return Aggregation.from_libcudf( move(make_count_aggregation[aggregation](null_handling)) ) cpdef Aggregation any(): + """Create an any aggregation. + + Returns + ------- + Aggregation + The any aggregation. + """ return Aggregation.from_libcudf(move(make_any_aggregation[aggregation]())) cpdef Aggregation all(): + """Create an all aggregation. + + Returns + ------- + Aggregation + The all aggregation. + """ return Aggregation.from_libcudf(move(make_all_aggregation[aggregation]())) cpdef Aggregation sum_of_squares(): + """Create a sum_of_squares aggregation. + + Returns + ------- + Aggregation + The sum_of_squares aggregation. + """ return Aggregation.from_libcudf( move(make_sum_of_squares_aggregation[aggregation]()) ) cpdef Aggregation mean(): + """Create a mean aggregation. + + Returns + ------- + Aggregation + The mean aggregation. + """ return Aggregation.from_libcudf(move(make_mean_aggregation[aggregation]())) -cpdef Aggregation variance(size_type ddof): +cpdef Aggregation variance(size_type ddof=1): + """Create a variance aggregation. + + Parameters + ---------- + ddof : int, default 1 + Delta degrees of freedom. + + Returns + ------- + Aggregation + The variance aggregation. + """ return Aggregation.from_libcudf(move(make_variance_aggregation[aggregation](ddof))) -cpdef Aggregation std(size_type ddof): +cpdef Aggregation std(size_type ddof=1): + """Create a std aggregation. + + Parameters + ---------- + ddof : int, default 1 + Delta degrees of freedom. The default value is 1. + + Returns + ------- + Aggregation + The std aggregation. + """ return Aggregation.from_libcudf(move(make_std_aggregation[aggregation](ddof))) cpdef Aggregation median(): + """Create a median aggregation. + + Returns + ------- + Aggregation + The median aggregation. + """ return Aggregation.from_libcudf(move(make_median_aggregation[aggregation]())) cpdef Aggregation quantile(list quantiles, interpolation interp = interpolation.LINEAR): + """Create a quantile aggregation. + + Parameters + ---------- + quantiles : list + List of quantiles to compute, should be between 0 and 1. + interp : interpolation, default LINEAR + Interpolation technique to use when the desired quantile lies between + two data points. + + Returns + ------- + Aggregation + The quantile aggregation. + """ return Aggregation.from_libcudf( move(make_quantile_aggregation[aggregation](quantiles, interp)) ) cpdef Aggregation argmax(): + """Create an argmax aggregation. + + Returns + ------- + Aggregation + The argmax aggregation. + """ return Aggregation.from_libcudf(move(make_argmax_aggregation[aggregation]())) cpdef Aggregation argmin(): + """Create an argmin aggregation. + + Returns + ------- + Aggregation + The argmin aggregation. + """ return Aggregation.from_libcudf(move(make_argmin_aggregation[aggregation]())) cpdef Aggregation nunique(null_policy null_handling = null_policy.EXCLUDE): + """Create a nunique aggregation. + + Parameters + ---------- + null_handling : null_policy, default EXCLUDE + Whether or not nulls should be included. + + Returns + ------- + Aggregation + The nunique aggregation. + """ return Aggregation.from_libcudf( move(make_nunique_aggregation[aggregation](null_handling)) ) @@ -177,12 +317,36 @@ cpdef Aggregation nunique(null_policy null_handling = null_policy.EXCLUDE): cpdef Aggregation nth_element( size_type n, null_policy null_handling = null_policy.INCLUDE ): + """Create a nth_element aggregation. + + Parameters + ---------- + null_handling : null_policy, default INCLUDE + Whether or not nulls should be included. + + Returns + ------- + Aggregation + The nth_element aggregation. + """ return Aggregation.from_libcudf( move(make_nth_element_aggregation[aggregation](n, null_handling)) ) cpdef Aggregation collect_list(null_policy null_handling = null_policy.INCLUDE): + """Create a collect_list aggregation. + + Parameters + ---------- + null_handling : null_policy, default INCLUDE + Whether or not nulls should be included. + + Returns + ------- + Aggregation + The collect_list aggregation. + """ return Aggregation.from_libcudf( move(make_collect_list_aggregation[aggregation](null_handling)) ) @@ -193,6 +357,22 @@ cpdef Aggregation collect_set( nulls_equal = null_equality.EQUAL, nans_equal = nan_equality.ALL_EQUAL, ): + """Create a collect_set aggregation. + + Parameters + ---------- + null_handling : null_policy, default INCLUDE + Whether or not nulls should be included. + nulls_equal : null_equality, default EQUAL + Whether or not nulls should be considered equal. + nans_equal : nan_equality, default ALL_EQUAL + Whether or not NaNs should be considered equal. + + Returns + ------- + Aggregation + The collect_set aggregation. + """ return Aggregation.from_libcudf( move( make_collect_set_aggregation[aggregation]( @@ -202,6 +382,20 @@ cpdef Aggregation collect_set( ) cpdef Aggregation udf(str operation, DataType output_type): + """Create a udf aggregation. + + Parameters + ---------- + operation : str + The operation to perform as a string of PTX code. + output_type : DataType + The output type of the aggregation. + + Returns + ------- + Aggregation + The udf aggregation. + """ return Aggregation.from_libcudf( move( make_udf_aggregation[aggregation]( @@ -214,12 +408,42 @@ cpdef Aggregation udf(str operation, DataType output_type): cpdef Aggregation correlation(correlation_type type, size_type min_periods): + """Create a correlation aggregation. + + Parameters + ---------- + type : correlation_type + The type of correlation to compute. + min_periods : int + The minimum number of observations to consider for computing the + correlation. + + Returns + ------- + Aggregation + The correlation aggregation. + """ return Aggregation.from_libcudf( move(make_correlation_aggregation[aggregation](type, min_periods)) ) cpdef Aggregation covariance(size_type min_periods, size_type ddof): + """Create a covariance aggregation. + + Parameters + ---------- + min_periods : int + The minimum number of observations to consider for computing the + covariance. + ddof : int + Delta degrees of freedom. + + Returns + ------- + Aggregation + The covariance aggregation. + """ return Aggregation.from_libcudf( move(make_covariance_aggregation[aggregation](min_periods, ddof)) ) @@ -232,6 +456,27 @@ cpdef Aggregation rank( null_order null_precedence = null_order.AFTER, rank_percentage percentage = rank_percentage.NONE, ): + """Create a rank aggregation. + + Parameters + ---------- + method : rank_method + The method to use for ranking. + column_order : order, default ASCENDING + The order in which to sort the column. + null_handling : null_policy, default EXCLUDE + Whether or not nulls should be included. + null_precedence : null_order, default AFTER + Whether nulls should come before or after non-nulls. + percentage : rank_percentage, default NONE + Whether or not ranks should be converted to percentages, and if so, + the type of normalization to use. + + Returns + ------- + Aggregation + The rank aggregation. + """ return Aggregation.from_libcudf( move( make_rank_aggregation[aggregation]( diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index 663edb9dcb4..bc5f19a839c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -19,11 +19,26 @@ from .table cimport Table cdef class AggregationRequest: + """A request for a groupby aggregation. + + Parameters + ---------- + values : Column + The column to aggregate. + aggregations : list + The list of aggregations to perform. + """ def __init__(self, Column values, list aggregations): self.values = values self.aggregations = aggregations cdef aggregation_request to_libcudf(self) except *: + """Convert to a libcudf aggregation_request object. + + This method is for internal use only. It creates a new libcudf + :cpp:class:`cudf::groupby::aggregation_request` object each time it is + called. + """ cdef aggregation_request c_obj c_obj.values = self.values.view() @@ -34,10 +49,33 @@ cdef class AggregationRequest: cdef class GroupBy: + """Group values by keys and compute various aggregate quantities. + + Parameters + ---------- + keys : Table + The columns to group by. + """ def __init__(self, Table keys): self.c_obj.reset(new groupby(keys.view())) cpdef tuple aggregate(self, list requests): + """Compute aggregations on columns. + + Parameters + ---------- + requests : list + The list of aggregation requests, each representing a set of + aggregations to perform on a given column of values. + + Returns + ------- + Tuple[Table, List[Table, ...]] + A tuple whose first element is the unique keys and whose second + element is a table of aggregation results. One table is returned + for each aggregation request, with the columns corresponding to the + sequence of aggregations in the request. + """ cdef AggregationRequest request cdef vector[aggregation_request] c_requests for request in requests: From eb49091ad425ad9a9aaa66909bed54995c6d06c4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 00:46:57 +0000 Subject: [PATCH 23/34] Add to Sphinx and fix some issues --- .../source/user_guide/api_docs/pylibcudf/aggregation.rst | 6 ++++++ .../cudf/source/user_guide/api_docs/pylibcudf/groupby.rst | 6 ++++++ docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst | 2 ++ python/cudf/cudf/_lib/pylibcudf/aggregation.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/aggregation.pyx | 8 +++++--- 5 files changed, 20 insertions(+), 4 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst new file mode 100644 index 00000000000..739305af5d4 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst @@ -0,0 +1,6 @@ +=========== +aggregation +=========== + +.. automodule:: cudf._lib.pylibcudf.aggregation + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst new file mode 100644 index 00000000000..d6e994f7dbc --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst @@ -0,0 +1,6 @@ +======= +groupby +======= + +.. automodule:: cudf._lib.pylibcudf.groupby + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 7504295de92..4735b0d9414 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -8,10 +8,12 @@ This page provides API documentation for pylibcudf. :maxdepth: 1 :caption: API Documentation + aggregation binaryop column copying gpumemoryview + groupby scalar table types diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd index 801dc8011e8..8ef38615f0f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd @@ -25,7 +25,7 @@ from .types cimport DataType cdef class Aggregation: cdef unique_ptr[aggregation] c_obj - cpdef kind_t kind(self) + cpdef kind(self) cdef unique_ptr[groupby_aggregation] clone_underlying_as_groupby(self) except * @staticmethod diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index b2a319e45fa..2da5f92c77f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -6,7 +6,6 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from cudf._lib.cpp.aggregation cimport ( - Kind as kind_t, aggregation, correlation_type, groupby_aggregation, @@ -65,7 +64,7 @@ cdef class Aggregation: """A type of aggregation to perform. Aggregations are passed to APIs like - :py:func:`~cudf._lib.pylibcudf.groupby.Groupby.aggregate` to indicate what + :py:func:`~cudf._lib.pylibcudf.groupby.GroupBy.aggregate` to indicate what operations to perform. Using a class for aggregations provides a unified API for handling parametrizable aggregations. This class should never be instantiated directly, only via one of the factory functions. @@ -75,7 +74,10 @@ cdef class Aggregation: "Aggregations should not be constructed directly. Use one of the factories." ) - cpdef kind_t kind(self): + # TODO: Ideally we would include the return type here, but we need to do so + # in a way that Sphinx understands (currently have issues due to + # https://github.com/cython/cython/issues/5609). + cpdef kind(self): """Get the kind of the aggregation.""" return dereference(self.c_obj).kind From d9de5648095fa1777f14d55123ca4667a74595b5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 02:13:17 +0000 Subject: [PATCH 24/34] Enable scans --- .../cudf/cudf/_lib/pylibcudf/aggregation.pxd | 4 + .../cudf/cudf/_lib/pylibcudf/aggregation.pyx | 21 +++++ python/cudf/cudf/_lib/pylibcudf/groupby.pxd | 26 +++++- python/cudf/cudf/_lib/pylibcudf/groupby.pyx | 87 +++++++++++++++---- 4 files changed, 115 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd index 8ef38615f0f..8eda16c4165 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd @@ -7,6 +7,7 @@ from cudf._lib.cpp.aggregation cimport ( aggregation, correlation_type, groupby_aggregation, + groupby_scan_aggregation, rank_method, rank_percentage, ) @@ -27,6 +28,9 @@ cdef class Aggregation: cdef unique_ptr[aggregation] c_obj cpdef kind(self) cdef unique_ptr[groupby_aggregation] clone_underlying_as_groupby(self) except * + cdef unique_ptr[groupby_scan_aggregation] clone_underlying_as_groupby_scan( + self + ) except * @staticmethod cdef Aggregation from_libcudf(unique_ptr[aggregation] agg) diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx index 2da5f92c77f..0b91263d720 100644 --- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx @@ -9,6 +9,7 @@ from cudf._lib.cpp.aggregation cimport ( aggregation, correlation_type, groupby_aggregation, + groupby_scan_aggregation, make_all_aggregation, make_any_aggregation, make_argmax_aggregation, @@ -58,6 +59,7 @@ from .types cimport DataType # workaround for https://github.com/cython/cython/issues/3885 ctypedef groupby_aggregation * gba_ptr +ctypedef groupby_scan_aggregation * gbsa_ptr cdef class Aggregation: @@ -96,6 +98,25 @@ cdef class Aggregation: agg.release() return unique_ptr[groupby_aggregation](agg_cast) + # Ideally this function could reuse the code above, but Cython lacks the + # first-class support for type-aliasing and templates that would make it possible. + cdef unique_ptr[groupby_scan_aggregation] clone_underlying_as_groupby_scan( + self + ) except *: + """Make a copy of the underlying aggregation that can be used in a groupby scan. + + This function will raise an exception if the aggregation is not supported as a + groupby scan aggregation. This failure to cast translates the per-algorithm + aggregation logic encoded in libcudf's type hierarchy into Python. + """ + cdef unique_ptr[aggregation] agg = dereference(self.c_obj).clone() + cdef groupby_scan_aggregation *agg_cast = dynamic_cast[gbsa_ptr](agg.get()) + if agg_cast is NULL: + agg_repr = str(self.kind()).split(".")[1].title() + raise TypeError(f"{agg_repr} scans are not supported by groupby") + agg.release() + return unique_ptr[groupby_scan_aggregation](agg_cast) + @staticmethod cdef Aggregation from_libcudf(unique_ptr[aggregation] agg): """Create a Python Aggregation from a libcudf aggregation.""" diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd index b661eec17a4..6affd6f1f5d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd @@ -1,14 +1,26 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from libcpp.pair cimport pair +from libcpp.vector cimport vector -from cudf._lib.cpp.aggregation cimport aggregation, groupby_aggregation -from cudf._lib.cpp.groupby cimport aggregation_request, groupby +from cudf._lib.cpp.aggregation cimport ( + aggregation, + groupby_aggregation, + groupby_scan_aggregation, +) +from cudf._lib.cpp.groupby cimport ( + aggregation_request, + aggregation_result, + groupby, + scan_request, +) +from cudf._lib.cpp.table.table cimport table from .column cimport Column -cdef class AggregationRequest: +cdef class GroupByRequest: # The groupby APIs accept vectors of unique_ptrs to aggregation requests. # This ownership model means that if AggregationRequest owned the # corresponding C++ object, that object would have to be copied by e.g. @@ -17,9 +29,15 @@ cdef class AggregationRequest: # C++ object on the fly as requested. cdef Column values cdef list aggregations - cdef aggregation_request to_libcudf(self) except * + + cdef aggregation_request to_libcudf_agg_request(self) except * + cdef scan_request to_libcudf_scan_request(self) except * cdef class GroupBy: cdef unique_ptr[groupby] c_obj cpdef tuple aggregate(self, list requests) + cpdef tuple scan(self, list requests) + + @staticmethod + cdef tuple _parse_outputs(pair[unique_ptr[table], vector[aggregation_result]] c_res) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index bc5f19a839c..c00b540435a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -10,6 +10,7 @@ from cudf._lib.cpp.groupby cimport ( aggregation_request, aggregation_result, groupby, + scan_request, ) from cudf._lib.cpp.table.table cimport table @@ -18,7 +19,7 @@ from .column cimport Column from .table cimport Table -cdef class AggregationRequest: +cdef class GroupByRequest: """A request for a groupby aggregation. Parameters @@ -32,7 +33,7 @@ cdef class AggregationRequest: self.values = values self.aggregations = aggregations - cdef aggregation_request to_libcudf(self) except *: + cdef aggregation_request to_libcudf_agg_request(self) except *: """Convert to a libcudf aggregation_request object. This method is for internal use only. It creates a new libcudf @@ -47,6 +48,21 @@ cdef class AggregationRequest: c_obj.aggregations.push_back(move(agg.clone_underlying_as_groupby())) return move(c_obj) + cdef scan_request to_libcudf_scan_request(self) except *: + """Convert to a libcudf scan_request object. + + This method is for internal use only. It creates a new libcudf + :cpp:class:`cudf::groupby::scan_request` object each time it is + called. + """ + cdef scan_request c_obj + c_obj.values = self.values.view() + + cdef Aggregation agg + for agg in self.aggregations: + c_obj.aggregations.push_back(move(agg.clone_underlying_as_groupby_scan())) + return move(c_obj) + cdef class GroupBy: """Group values by keys and compute various aggregate quantities. @@ -59,14 +75,32 @@ cdef class GroupBy: def __init__(self, Table keys): self.c_obj.reset(new groupby(keys.view())) + @staticmethod + cdef tuple _parse_outputs( + pair[unique_ptr[table], vector[aggregation_result]] c_res + ): + cdef Table group_keys = Table.from_libcudf(move(c_res.first)) + + cdef int i, j + cdef list results = [] + cdef list inner_results + for i in range(c_res.second.size()): + inner_results = [] + for j in range(c_res.second[i].results.size()): + inner_results.append( + Column.from_libcudf(move(c_res.second[i].results[j])) + ) + results.append(Table(inner_results)) + return group_keys, results + cpdef tuple aggregate(self, list requests): """Compute aggregations on columns. Parameters ---------- - requests : list - The list of aggregation requests, each representing a set of - aggregations to perform on a given column of values. + requests : List[GroupByRequest] + The list of `~.cudf._lib.pylibcudf.groupby.GroupByRequest` , each + representing a set of aggregations to perform on a given column of values. Returns ------- @@ -76,24 +110,39 @@ cdef class GroupBy: for each aggregation request, with the columns corresponding to the sequence of aggregations in the request. """ - cdef AggregationRequest request + cdef GroupByRequest request cdef vector[aggregation_request] c_requests for request in requests: - c_requests.push_back(move(request.to_libcudf())) + c_requests.push_back(move(request.to_libcudf_agg_request())) cdef pair[unique_ptr[table], vector[aggregation_result]] c_res = move( dereference(self.c_obj).aggregate(c_requests) ) - cdef Table group_keys = Table.from_libcudf(move(c_res.first)) + return GroupBy._parse_outputs(move(c_res)) - cdef int i, j - cdef list results = [] - cdef list inner_results - for i in range(c_res.second.size()): - inner_results = [] - for j in range(c_res.second[i].results.size()): - inner_results.append( - Column.from_libcudf(move(c_res.second[i].results[j])) - ) - results.append(Table(inner_results)) - return group_keys, results + cpdef tuple scan(self, list requests): + """Compute scans on columns. + + Parameters + ---------- + requests : List[GroupByRequest] + The list of `~.cudf._lib.pylibcudf.groupby.GroupByRequest` , each + representing a set of aggregations to perform on a given column of values. + + Returns + ------- + Tuple[Table, List[Table, ...]] + A tuple whose first element is the unique keys and whose second + element is a table of aggregation results. One table is returned + for each aggregation request, with the columns corresponding to the + sequence of aggregations in the request. + """ + cdef GroupByRequest request + cdef vector[scan_request] c_requests + for request in requests: + c_requests.push_back(move(request.to_libcudf_scan_request())) + + cdef pair[unique_ptr[table], vector[aggregation_result]] c_res = move( + dereference(self.c_obj).scan(c_requests) + ) + return GroupBy._parse_outputs(move(c_res)) From f79f340b6fe310fc7f7f0fb626178ad32789cbd2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 08:04:45 +0000 Subject: [PATCH 25/34] Some minor cleanup --- python/cudf/cudf/_lib/pylibcudf/groupby.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/groupby.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd index 6affd6f1f5d..d0768c1ba48 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd @@ -22,7 +22,7 @@ from .column cimport Column cdef class GroupByRequest: # The groupby APIs accept vectors of unique_ptrs to aggregation requests. - # This ownership model means that if AggregationRequest owned the + # This ownership model means that if GroupByRequest owned the # corresponding C++ object, that object would have to be copied by e.g. # each groupby.aggregate call to avoid invalidating this object. Therefore, # this class instead stores only Python/Cython objects and constructs the diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index c00b540435a..22ca5c8b1e0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -20,7 +20,7 @@ from .table cimport Table cdef class GroupByRequest: - """A request for a groupby aggregation. + """A request for a groupby aggregation or scan. Parameters ---------- From 5c8c44f4df070e1d112650c7c615c09228709078 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 30 Jan 2024 09:08:45 +0000 Subject: [PATCH 26/34] Add some missing imports to the package --- python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 3 ++- python/cudf/cudf/_lib/pylibcudf/__init__.py | 1 + python/cudf/cudf/_lib/pylibcudf/types.pxd | 13 +++++++++++-- python/cudf/cudf/_lib/pylibcudf/types.pyx | 8 +++++++- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index d463e5bd0a1..14c98af3fff 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -8,7 +8,7 @@ from .scalar cimport Scalar from .table cimport Table # TODO: cimport type_id once # https://github.com/cython/cython/issues/5609 is resolved -from .types cimport DataType +from .types cimport DataType, type_id __all__ = [ "Column", @@ -21,4 +21,5 @@ __all__ = [ "gpumemoryview", "groupby", "interop", + "types", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 73e2d98f545..07612d76540 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -19,4 +19,5 @@ "gpumemoryview", "groupby", "interop", + "types", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/types.pxd index 80baa484be7..95452721260 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/types.pxd @@ -1,9 +1,18 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t from libcpp cimport bool as cbool -from cudf._lib.cpp.types cimport data_type, type_id +from cudf._lib.cpp.types cimport ( + data_type, + interpolation, + nan_equality, + null_equality, + null_order, + null_policy, + order, + type_id, +) cdef class DataType: diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index 931ab9fde39..d6e94329fe4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -4,7 +4,13 @@ from libc.stdint cimport int32_t from cudf._lib.cpp.types cimport data_type, type_id -from cudf._lib.cpp.types import type_id as TypeId # no-cython-lint +from cudf._lib.cpp.types import type_id as TypeId # no-cython-lint, isort:skip +from cudf._lib.cpp.types import null_policy as NullPolicy # no-cython-lint, isort:skip +from cudf._lib.cpp.types import interpolation as Interpolation # no-cython-lint, isort:skip +from cudf._lib.cpp.types import nan_equality as NanEquality # no-cython-lint, isort:skip +from cudf._lib.cpp.types import null_equality as NullEquality # no-cython-lint, isort:skip +from cudf._lib.cpp.types import null_order as NullOrder # no-cython-lint, isort:skip +from cudf._lib.cpp.types import order as Order # no-cython-lint, isort:skip cdef class DataType: From 28c455edf57b82218870f80a5dab0e241ed0b093 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 31 Jan 2024 18:07:15 +0000 Subject: [PATCH 27/34] Add some more missing imports and fix constructor of GroupByRequest --- python/cudf/cudf/_lib/cpp/groupby.pxd | 16 +++++++++++----- python/cudf/cudf/_lib/pylibcudf/groupby.pyx | 10 ++++++++-- python/cudf/cudf/_lib/pylibcudf/types.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/types.pyx | 1 + 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/groupby.pxd b/python/cudf/cudf/_lib/cpp/groupby.pxd index 0266404fc50..45923320a59 100644 --- a/python/cudf/cudf/_lib/cpp/groupby.pxd +++ b/python/cudf/cudf/_lib/cpp/groupby.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.functional cimport reference_wrapper @@ -16,7 +16,13 @@ from cudf._lib.cpp.replace cimport replace_policy from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.cpp.types cimport null_order, null_policy, order, size_type +from cudf._lib.cpp.types cimport ( + null_order, + null_policy, + order, + size_type, + sorted, +) from cudf._lib.cpp.utilities.host_span cimport host_span # workaround for https://github.com/cython/cython/issues/3885 @@ -55,20 +61,20 @@ cdef extern from "cudf/groupby.hpp" \ groupby( const table_view& keys, null_policy include_null_keys, - bool keys_are_sorted, + sorted keys_are_sorted, ) except + groupby( const table_view& keys, null_policy include_null_keys, - bool keys_are_sorted, + sorted keys_are_sorted, const vector[order]& column_order, ) except + groupby( const table_view& keys, null_policy include_null_keys, - bool keys_are_sorted, + sorted keys_are_sorted, const vector[order]& column_order, const vector[null_order]& null_precedence ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index 22ca5c8b1e0..bb5e006c2b5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -17,6 +17,7 @@ from cudf._lib.cpp.table.table cimport table from .aggregation cimport Aggregation from .column cimport Column from .table cimport Table +from .types cimport null_policy, sorted cdef class GroupByRequest: @@ -72,8 +73,13 @@ cdef class GroupBy: keys : Table The columns to group by. """ - def __init__(self, Table keys): - self.c_obj.reset(new groupby(keys.view())) + def __init__( + self, + Table keys, + null_policy null_handling=null_policy.EXCLUDE, + sorted keys_are_sorted=sorted.NO + ): + self.c_obj.reset(new groupby(keys.view(), null_handling, keys_are_sorted)) @staticmethod cdef tuple _parse_outputs( diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/types.pxd index 95452721260..1ad3d19f15c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/types.pxd @@ -11,6 +11,7 @@ from cudf._lib.cpp.types cimport ( null_order, null_policy, order, + sorted, type_id, ) diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index d6e94329fe4..5b25e7674e2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -11,6 +11,7 @@ from cudf._lib.cpp.types import nan_equality as NanEquality # no-cython-lint, i from cudf._lib.cpp.types import null_equality as NullEquality # no-cython-lint, isort:skip from cudf._lib.cpp.types import null_order as NullOrder # no-cython-lint, isort:skip from cudf._lib.cpp.types import order as Order # no-cython-lint, isort:skip +from cudf._lib.cpp.types import sorted as Sorted # no-cython-lint, isort:skip cdef class DataType: From bd2ab536f1557ba9a3d83e869bab383ee065d2f4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 31 Jan 2024 18:57:52 +0000 Subject: [PATCH 28/34] Move _as_vector to utils --- python/cudf/cudf/_lib/pylibcudf/copying.pyx | 18 ++------------- python/cudf/cudf/_lib/pylibcudf/utils.pxd | 7 +++++- python/cudf/cudf/_lib/pylibcudf/utils.pyx | 25 ++++++++++++++++++++- 3 files changed, 32 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pyx b/python/cudf/cudf/_lib/pylibcudf/copying.pyx index 65f8c7a1854..12e592f3a92 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pyx @@ -26,23 +26,9 @@ from cudf._lib.cpp.copying import \ out_of_bounds_policy as OutOfBoundsPolicy # no-cython-lint from .column cimport Column +from .scalar cimport Scalar from .table cimport Table - -# This is a workaround for -# https://github.com/cython/cython/issues/4180 -# when creating reference_wrapper[constscalar] in the constructor -ctypedef const scalar constscalar - - -cdef vector[reference_wrapper[const scalar]] _as_vector(list source): - """Make a vector of reference_wrapper[const scalar] from a list of scalars.""" - cdef vector[reference_wrapper[const scalar]] c_scalars - c_scalars.reserve(len(source)) - cdef Scalar slr - for slr in source: - c_scalars.push_back( - reference_wrapper[constscalar](dereference((slr).c_obj))) - return c_scalars +from .utils cimport _as_vector cpdef Table gather( diff --git a/python/cudf/cudf/_lib/pylibcudf/utils.pxd b/python/cudf/cudf/_lib/pylibcudf/utils.pxd index 18bcd9cc91a..7efeaaf7e24 100644 --- a/python/cudf/cudf/_lib/pylibcudf/utils.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/utils.pxd @@ -1,7 +1,12 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +from libcpp.functional cimport reference_wrapper +from libcpp.vector cimport vector + +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport bitmask_type cdef void * int_to_void_ptr(Py_ssize_t ptr) nogil cdef bitmask_type * int_to_bitmask_ptr(Py_ssize_t ptr) nogil +cdef vector[reference_wrapper[const scalar]] _as_vector(list source) diff --git a/python/cudf/cudf/_lib/pylibcudf/utils.pyx b/python/cudf/cudf/_lib/pylibcudf/utils.pyx index ccf9ea2bd70..ea34a87a72a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/utils.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/utils.pyx @@ -1,9 +1,21 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. + +from cython.operator import dereference from libc.stdint cimport uintptr_t +from libcpp.functional cimport reference_wrapper +from libcpp.vector cimport vector +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport bitmask_type +from .scalar cimport Scalar + +# This is a workaround for +# https://github.com/cython/cython/issues/4180 +# when creating reference_wrapper[constscalar] in the constructor +ctypedef const scalar constscalar + cdef void * int_to_void_ptr(Py_ssize_t ptr) nogil: return (ptr) @@ -11,3 +23,14 @@ cdef void * int_to_void_ptr(Py_ssize_t ptr) nogil: cdef bitmask_type * int_to_bitmask_ptr(Py_ssize_t ptr) nogil: return (ptr) + + +cdef vector[reference_wrapper[const scalar]] _as_vector(list source): + """Make a vector of reference_wrapper[const scalar] from a list of scalars.""" + cdef vector[reference_wrapper[const scalar]] c_scalars + c_scalars.reserve(len(source)) + cdef Scalar slr + for slr in source: + c_scalars.push_back( + reference_wrapper[constscalar](dereference((slr).c_obj))) + return c_scalars From 6b14aba42b15123fd9cb7b917edbc941349dd067 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 31 Jan 2024 20:16:02 +0000 Subject: [PATCH 29/34] Implement shift in pylibcudf --- python/cudf/cudf/_lib/pylibcudf/groupby.pxd | 2 ++ python/cudf/cudf/_lib/pylibcudf/groupby.pyx | 37 +++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd index d0768c1ba48..5fb28153268 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd @@ -18,6 +18,7 @@ from cudf._lib.cpp.groupby cimport ( from cudf._lib.cpp.table.table cimport table from .column cimport Column +from .table cimport Table cdef class GroupByRequest: @@ -38,6 +39,7 @@ cdef class GroupBy: cdef unique_ptr[groupby] c_obj cpdef tuple aggregate(self, list requests) cpdef tuple scan(self, list requests) + cpdef tuple shift(self, Table values, list offset, list fill_values) @staticmethod cdef tuple _parse_outputs(pair[unique_ptr[table], vector[aggregation_result]] c_res) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index bb5e006c2b5..ba19dc6e321 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from cython.operator cimport dereference +from libcpp.functional cimport reference_wrapper from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move @@ -12,12 +13,15 @@ from cudf._lib.cpp.groupby cimport ( groupby, scan_request, ) +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table +from cudf._lib.cpp.types cimport size_type from .aggregation cimport Aggregation from .column cimport Column from .table cimport Table from .types cimport null_policy, sorted +from .utils cimport _as_vector cdef class GroupByRequest: @@ -152,3 +156,36 @@ cdef class GroupBy: dereference(self.c_obj).scan(c_requests) ) return GroupBy._parse_outputs(move(c_res)) + + cpdef tuple shift(self, Table values, list offset, list fill_values): + """Compute shifts on columns. + + Parameters + ---------- + values : Table + The columns to shift. + offset : List[int] + The offsets to shift by. + fill_values : List[Scalar] + The values to use to fill in missing values. + + Returns + ------- + Tuple[Table, Table] + A tuple whose first element is the group's keys and whose second + element is a table of shifted values. + """ + + # cdef vector[size_type] c_offset = offset + cdef vector[reference_wrapper[const scalar]] c_fill_values = \ + _as_vector(fill_values) + + cdef vector[size_type] c_offset = offset + cdef pair[unique_ptr[table], unique_ptr[table]] c_res = move( + dereference(self.c_obj).shift(values.view(), c_offset, c_fill_values) + ) + + return ( + Table.from_libcudf(move(c_res.first)), + Table.from_libcudf(move(c_res.second)), + ) From 527a516db73d68e92aa6b7a52ea61ac0fae01ce0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 31 Jan 2024 21:16:32 +0000 Subject: [PATCH 30/34] Implement replace_nulls in pylibcudf --- python/cudf/cudf/_lib/cpp/replace.pxd | 9 +++++---- python/cudf/cudf/_lib/pylibcudf/groupby.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/groupby.pyx | 12 ++++++++++-- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/replace.pxd b/python/cudf/cudf/_lib/cpp/replace.pxd index c1ec89a6233..74bc9c2bb4c 100644 --- a/python/cudf/cudf/_lib/cpp/replace.pxd +++ b/python/cudf/cudf/_lib/cpp/replace.pxd @@ -1,5 +1,6 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from libcpp cimport bool from libcpp.memory cimport unique_ptr from cudf._lib.types import cudf_to_np_types, np_to_cudf_types @@ -11,9 +12,9 @@ from cudf._lib.cpp.scalar.scalar cimport scalar cdef extern from "cudf/replace.hpp" namespace "cudf" nogil: - ctypedef enum replace_policy: - PRECEDING 'cudf::replace_policy::PRECEDING', - FOLLOWING 'cudf::replace_policy::FOLLOWING' + cdef enum class replace_policy(bool): + PRECEDING + FOLLOWING cdef unique_ptr[column] replace_nulls( column_view source_column, diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd index 5fb28153268..b14a6b557e4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd @@ -40,6 +40,7 @@ cdef class GroupBy: cpdef tuple aggregate(self, list requests) cpdef tuple scan(self, list requests) cpdef tuple shift(self, Table values, list offset, list fill_values) + cpdef tuple replace_nulls(self, Table value, list replace_policy) @staticmethod cdef tuple _parse_outputs(pair[unique_ptr[table], vector[aggregation_result]] c_res) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index ba19dc6e321..4e12f5b37c8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -175,8 +175,6 @@ cdef class GroupBy: A tuple whose first element is the group's keys and whose second element is a table of shifted values. """ - - # cdef vector[size_type] c_offset = offset cdef vector[reference_wrapper[const scalar]] c_fill_values = \ _as_vector(fill_values) @@ -189,3 +187,13 @@ cdef class GroupBy: Table.from_libcudf(move(c_res.first)), Table.from_libcudf(move(c_res.second)), ) + + cpdef tuple replace_nulls(self, Table value, list replace_policies): + cdef pair[unique_ptr[table], unique_ptr[table]] c_res = move( + dereference(self.c_obj).replace_nulls(value.view(), replace_policies) + ) + + return ( + Table.from_libcudf(move(c_res.first)), + Table.from_libcudf(move(c_res.second)), + ) From c44cc9f7e5cd2f7aa0b6e0cb93420a4943c2d010 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 31 Jan 2024 21:32:27 +0000 Subject: [PATCH 31/34] Fix argument name in cpp interface --- python/cudf/cudf/_lib/cpp/groupby.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/groupby.pxd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/groupby.pxd b/python/cudf/cudf/_lib/cpp/groupby.pxd index 45923320a59..8bbefcde0dd 100644 --- a/python/cudf/cudf/_lib/cpp/groupby.pxd +++ b/python/cudf/cudf/_lib/cpp/groupby.pxd @@ -106,6 +106,6 @@ cdef extern from "cudf/groupby.hpp" \ groups get_groups(table_view values) except + pair[unique_ptr[table], unique_ptr[table]] replace_nulls( - const table_view& value, + const table_view& values, const vector[replace_policy] replace_policy ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd index b14a6b557e4..bf61faca19b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd @@ -40,7 +40,7 @@ cdef class GroupBy: cpdef tuple aggregate(self, list requests) cpdef tuple scan(self, list requests) cpdef tuple shift(self, Table values, list offset, list fill_values) - cpdef tuple replace_nulls(self, Table value, list replace_policy) + cpdef tuple replace_nulls(self, Table values, list replace_policy) @staticmethod cdef tuple _parse_outputs(pair[unique_ptr[table], vector[aggregation_result]] c_res) From 54f0a6772e67dc2a5bfae90d6d9149732c78b384 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 31 Jan 2024 22:13:24 +0000 Subject: [PATCH 32/34] Implement get_groups in pylibcudf --- python/cudf/cudf/_lib/pylibcudf/groupby.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/groupby.pyx | 46 +++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd index bf61faca19b..735d5784226 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd @@ -41,6 +41,7 @@ cdef class GroupBy: cpdef tuple scan(self, list requests) cpdef tuple shift(self, Table values, list offset, list fill_values) cpdef tuple replace_nulls(self, Table values, list replace_policy) + cpdef tuple get_groups(self, Table values=*) @staticmethod cdef tuple _parse_outputs(pair[unique_ptr[table], vector[aggregation_result]] c_res) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index 4e12f5b37c8..e11a6b30860 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -11,6 +11,7 @@ from cudf._lib.cpp.groupby cimport ( aggregation_request, aggregation_result, groupby, + groups, scan_request, ) from cudf._lib.cpp.scalar.scalar cimport scalar @@ -189,6 +190,21 @@ cdef class GroupBy: ) cpdef tuple replace_nulls(self, Table value, list replace_policies): + """Replace nulls in columns. + + Parameters + ---------- + values : Table + The columns to replace nulls in. + replace_policies : List[replace_policy] + The policies to use to replace nulls. + + Returns + ------- + Tuple[Table, Table] + A tuple whose first element is the group's keys and whose second + element is a table of values with nulls replaced. + """ cdef pair[unique_ptr[table], unique_ptr[table]] c_res = move( dereference(self.c_obj).replace_nulls(value.view(), replace_policies) ) @@ -197,3 +213,33 @@ cdef class GroupBy: Table.from_libcudf(move(c_res.first)), Table.from_libcudf(move(c_res.second)), ) + + cpdef tuple get_groups(self, Table values=None): + """Get the grouped keys and values labels for each row. + + Parameters + ---------- + values : Table, optional + The columns to get group labels for. If not specified, the group + labels for the group keys are returned. + + Returns + ------- + Tuple[Table, Table, List[int]] + A tuple of tables containing three items: + - A table of group keys + - A table of group values + - A list of integer offsets into the tables + """ + + cdef groups c_groups + if values: + c_groups = dereference(self.c_obj).get_groups(values.view()) + else: + c_groups = dereference(self.c_obj).get_groups() + + return ( + Table.from_libcudf(move(c_groups.keys)), + Table.from_libcudf(move(c_groups.values)), + c_groups.offsets, + ) From f4e5acf894dcf58de0ee8ffe776c8b3e22575e98 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 1 Feb 2024 22:12:43 +0000 Subject: [PATCH 33/34] Address reviews --- python/cudf/cudf/_lib/pylibcudf/groupby.pxd | 8 +++---- python/cudf/cudf/_lib/pylibcudf/groupby.pyx | 24 +++++++++++---------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd index 735d5784226..ce472e3c990 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd @@ -28,11 +28,11 @@ cdef class GroupByRequest: # each groupby.aggregate call to avoid invalidating this object. Therefore, # this class instead stores only Python/Cython objects and constructs the # C++ object on the fly as requested. - cdef Column values - cdef list aggregations + cdef Column _values + cdef list _aggregations - cdef aggregation_request to_libcudf_agg_request(self) except * - cdef scan_request to_libcudf_scan_request(self) except * + cdef aggregation_request _to_libcudf_agg_request(self) except * + cdef scan_request _to_libcudf_scan_request(self) except * cdef class GroupBy: diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index e11a6b30860..b9b4f49473a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -32,14 +32,14 @@ cdef class GroupByRequest: ---------- values : Column The column to aggregate. - aggregations : list + aggregations : List[Aggregation] The list of aggregations to perform. """ def __init__(self, Column values, list aggregations): - self.values = values - self.aggregations = aggregations + self._values = values + self._aggregations = aggregations - cdef aggregation_request to_libcudf_agg_request(self) except *: + cdef aggregation_request _to_libcudf_agg_request(self) except *: """Convert to a libcudf aggregation_request object. This method is for internal use only. It creates a new libcudf @@ -47,14 +47,14 @@ cdef class GroupByRequest: called. """ cdef aggregation_request c_obj - c_obj.values = self.values.view() + c_obj.values = self._values.view() cdef Aggregation agg - for agg in self.aggregations: + for agg in self._aggregations: c_obj.aggregations.push_back(move(agg.clone_underlying_as_groupby())) return move(c_obj) - cdef scan_request to_libcudf_scan_request(self) except *: + cdef scan_request _to_libcudf_scan_request(self) except *: """Convert to a libcudf scan_request object. This method is for internal use only. It creates a new libcudf @@ -62,10 +62,10 @@ cdef class GroupByRequest: called. """ cdef scan_request c_obj - c_obj.values = self.values.view() + c_obj.values = self._values.view() cdef Aggregation agg - for agg in self.aggregations: + for agg in self._aggregations: c_obj.aggregations.push_back(move(agg.clone_underlying_as_groupby_scan())) return move(c_obj) @@ -90,6 +90,8 @@ cdef class GroupBy: cdef tuple _parse_outputs( pair[unique_ptr[table], vector[aggregation_result]] c_res ): + # Convert libcudf aggregation/scan outputs into pylibcudf objects. + # This function is for internal use only. cdef Table group_keys = Table.from_libcudf(move(c_res.first)) cdef int i, j @@ -124,7 +126,7 @@ cdef class GroupBy: cdef GroupByRequest request cdef vector[aggregation_request] c_requests for request in requests: - c_requests.push_back(move(request.to_libcudf_agg_request())) + c_requests.push_back(move(request._to_libcudf_agg_request())) cdef pair[unique_ptr[table], vector[aggregation_result]] c_res = move( dereference(self.c_obj).aggregate(c_requests) @@ -151,7 +153,7 @@ cdef class GroupBy: cdef GroupByRequest request cdef vector[scan_request] c_requests for request in requests: - c_requests.push_back(move(request.to_libcudf_scan_request())) + c_requests.push_back(move(request._to_libcudf_scan_request())) cdef pair[unique_ptr[table], vector[aggregation_result]] c_res = move( dereference(self.c_obj).scan(c_requests) From 35bec862b602374332bc4d12eaa2fe411336da90 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 1 Feb 2024 22:14:10 +0000 Subject: [PATCH 34/34] Add some missing docs --- python/cudf/cudf/_lib/pylibcudf/groupby.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index b9b4f49473a..f442aafa4bd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -77,6 +77,10 @@ cdef class GroupBy: ---------- keys : Table The columns to group by. + null_handling : null_policy, optional + Whether or not to include null rows in ``keys``. Default is null_policy.EXCLUDE. + keys_are_sorted : sorted, optional + Whether the keys are already sorted. Default is sorted.NO. """ def __init__( self,