Skip to content

Commit

Permalink
Merge branch 'branch-22.08' into list_label
Browse files Browse the repository at this point in the history
  • Loading branch information
ttnghia committed May 28, 2022
2 parents bfe0bf0 + f367863 commit 74a33d4
Show file tree
Hide file tree
Showing 29 changed files with 1,107 additions and 723 deletions.
126 changes: 103 additions & 23 deletions cpp/include/cudf/aggregation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,18 +120,56 @@ class aggregation {
};

aggregation() = delete;

/**
* @brief Construct a new aggregation object
*
* @param a aggregation::Kind enum value
*/
aggregation(aggregation::Kind a) : kind{a} {}
Kind kind; ///< The aggregation to perform
virtual ~aggregation() = default;

/**
* @brief Compares two aggregation objects for equality
*
* @param other The other aggregation to compare with
* @return True if the two aggregations are equal
*/
[[nodiscard]] virtual bool is_equal(aggregation const& other) const { return kind == other.kind; }

/**
* @brief Computes the hash value of the aggregation
*
* @return The hash value of the aggregation
*/
[[nodiscard]] virtual size_t do_hash() const { return std::hash<int>{}(kind); }

/**
* @pure @brief Clones the aggregation object
*
* @return A copy of the aggregation object
*/
[[nodiscard]] virtual std::unique_ptr<aggregation> clone() const = 0;

// override functions for compound aggregations
/**
* @pure @brief Get the simple aggregations that this aggregation requires to compute.
*
* @param col_type The type of the column to aggregate
* @param collector The collector visitor pattern to use to collect the simple aggregations
* @return Vector of pre-requisite simple aggregations
*/
virtual std::vector<std::unique_ptr<aggregation>> get_simple_aggregations(
data_type col_type, cudf::detail::simple_aggregations_collector& collector) const = 0;
virtual void finalize(cudf::detail::aggregation_finalizer& finalizer) const = 0;

/**
* @pure @brief Compute the aggregation after pre-requisite simple aggregations have been
* computed.
*
* @param finalizer The finalizer visitor pattern to use to compute the aggregation
*/
virtual void finalize(cudf::detail::aggregation_finalizer& finalizer) const = 0;
};

/**
Expand All @@ -147,7 +185,8 @@ class rolling_aggregation : public virtual aggregation {

protected:
rolling_aggregation() {}
rolling_aggregation(aggregation::Kind a) : aggregation{a} {}
/// constructor inherited from cudf::aggregation
using aggregation::aggregation;
};

/**
Expand Down Expand Up @@ -205,46 +244,57 @@ class segmented_reduce_aggregation : public virtual aggregation {
segmented_reduce_aggregation() {}
};

/// Type of code in the user defined function string.
enum class udf_type : bool { CUDA, PTX };
/// Type of correlation method.
enum class correlation_type : int32_t { PEARSON, KENDALL, SPEARMAN };

/// Factory to create a SUM aggregation
/// @return A SUM aggregation object
template <typename Base = aggregation>
std::unique_ptr<Base> make_sum_aggregation();

/// Factory to create a PRODUCT aggregation
/// @return A PRODUCT aggregation object
template <typename Base = aggregation>
std::unique_ptr<Base> make_product_aggregation();

/// Factory to create a MIN aggregation
/// @return A MIN aggregation object
template <typename Base = aggregation>
std::unique_ptr<Base> make_min_aggregation();

/// Factory to create a MAX aggregation
/// @return A MAX aggregation object
template <typename Base = aggregation>
std::unique_ptr<Base> make_max_aggregation();

/**
* @brief Factory to create a COUNT aggregation
*
* @param null_handling Indicates if null values will be counted.
* @param null_handling Indicates if null values will be counted
* @return A COUNT aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_count_aggregation(null_policy null_handling = null_policy::EXCLUDE);

/// Factory to create an ANY aggregation
/// @return A ANY aggregation object
template <typename Base = aggregation>
std::unique_ptr<Base> make_any_aggregation();

/// Factory to create a ALL aggregation
/// @return A ALL aggregation object
template <typename Base = aggregation>
std::unique_ptr<Base> make_all_aggregation();

/// Factory to create a SUM_OF_SQUARES aggregation
/// @return A SUM_OF_SQUARES aggregation object
template <typename Base = aggregation>
std::unique_ptr<Base> make_sum_of_squares_aggregation();

/// Factory to create a MEAN aggregation
/// @return A MEAN aggregation object
template <typename Base = aggregation>
std::unique_ptr<Base> make_mean_aggregation();

Expand All @@ -258,6 +308,7 @@ std::unique_ptr<Base> make_mean_aggregation();
* deviation across multiple discrete sets. See
* `https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm` for more
* detail.
* @return A M2 aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_m2_aggregation();
Expand All @@ -269,6 +320,7 @@ std::unique_ptr<Base> make_m2_aggregation();
* `variance` is `N - ddof`, where `N` is the population size.
*
* @throw cudf::logic_error if input type is chrono or compound types.
* @return A VARIANCE aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_variance_aggregation(size_type ddof = 1);
Expand All @@ -280,11 +332,13 @@ std::unique_ptr<Base> make_variance_aggregation(size_type ddof = 1);
* `std` is `N - ddof`, where `N` is the population size.
*
* @throw cudf::logic_error if input type is chrono or compound types.
* @return A STD aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_std_aggregation(size_type ddof = 1);

/// Factory to create a MEDIAN aggregation
/// @return A MEDIAN aggregation object
template <typename Base = aggregation>
std::unique_ptr<Base> make_median_aggregation();

Expand All @@ -293,54 +347,60 @@ std::unique_ptr<Base> make_median_aggregation();
*
* @param quantiles The desired quantiles
* @param interp The desired interpolation
* @return A QUANTILE aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_quantile_aggregation(std::vector<double> const& quantiles,
interpolation interp = interpolation::LINEAR);

/**
* @brief Factory to create an `argmax` aggregation
* @brief Factory to create an ARGMAX aggregation
*
* `argmax` returns the index of the maximum element.
* ARGMAX returns the index of the maximum element.
* @return A ARGMAX aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_argmax_aggregation();

/**
* @brief Factory to create an `argmin` aggregation
* @brief Factory to create an ARGMIN aggregation
*
* `argmin` returns the index of the minimum element.
* @return A ARGMIN aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_argmin_aggregation();

/**
* @brief Factory to create a `nunique` aggregation
* @brief Factory to create a NUNIQUE aggregation
*
* `nunique` returns the number of unique elements.
* @param null_handling Indicates if null values will be counted.
* NUNIQUE returns the number of unique elements.
* @param null_handling Indicates if null values will be counted
* @return A NUNIQUE aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_nunique_aggregation(null_policy null_handling = null_policy::EXCLUDE);

/**
* @brief Factory to create a `nth_element` aggregation
* @brief Factory to create a NTH_ELEMENT aggregation
*
* `nth_element` returns the n'th element of the group/series.
* NTH_ELEMENT returns the n'th element of the group/series.
*
* If @p n is not within the range `[-group_size, group_size)`, the result of
* the respective group will be null. Negative indices `[-group_size, -1]`
* corresponds to `[0, group_size-1]` indices respectively where `group_size` is
* the size of each group.
*
* @param n index of nth element in each group.
* @param null_handling Indicates to include/exclude nulls during indexing.
* @param n index of nth element in each group
* @param null_handling Indicates to include/exclude nulls during indexing
* @return A NTH_ELEMENT aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_nth_element_aggregation(
size_type n, null_policy null_handling = null_policy::INCLUDE);

/// Factory to create a ROW_NUMBER aggregation
/// @return A ROW_NUMBER aggregation object
template <typename Base = aggregation>
std::unique_ptr<Base> make_row_number_aggregation();

Expand Down Expand Up @@ -408,12 +468,13 @@ std::unique_ptr<Base> make_row_number_aggregation();
*
* @endcode
*
* @param method The ranking method used for tie breaking (same values).
* @param method The ranking method used for tie breaking (same values)
* @param column_order The desired sort order for ranking
* @param null_handling flag to include nulls during ranking. If nulls are not included,
* @param null_handling flag to include nulls during ranking If nulls are not included,
* the corresponding rank will be null.
* @param null_precedence The desired order of null compared to other elements for column
* @param percentage enum to denote the type of conversion of ranks to percentage in range (0,1]
* @return A RANK aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_rank_aggregation(rank_method method,
Expand All @@ -430,7 +491,8 @@ std::unique_ptr<Base> make_rank_aggregation(rank_method method,
* If `null_handling` is set to `EXCLUDE`, null elements are dropped from each
* of the list rows.
*
* @param null_handling Indicates whether to include/exclude nulls in list elements.
* @param null_handling Indicates whether to include/exclude nulls in list elements
* @return A COLLECT_LIST aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_collect_list_aggregation(
Expand All @@ -450,17 +512,28 @@ std::unique_ptr<Base> make_collect_list_aggregation(
* equal.
* @param nans_equal Flag to specify whether NaN values in floating point column should be
* considered equal.
* @return A COLLECT_SET aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE,
null_equality nulls_equal = null_equality::EQUAL,
nan_equality nans_equal = nan_equality::UNEQUAL);

/// Factory to create a LAG aggregation
/**
* @brief Factory to create a LAG aggregation
*
* @param offset The number of rows to lag the input
* @return A LAG aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_lag_aggregation(size_type offset);

/// Factory to create a LEAD aggregation
/**
* @brief Factory to create a LEAD aggregation
*
* @param offset The number of rows to lead the input
* @return A LEAD aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_lead_aggregation(size_type offset);

Expand All @@ -471,7 +544,7 @@ std::unique_ptr<Base> make_lead_aggregation(size_type offset);
* @param[in] user_defined_aggregator A string containing the aggregator code
* @param[in] output_type expected output type
*
* @return aggregation unique pointer housing user_defined_aggregator string.
* @return An aggregation containing a user-defined aggregator string
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_udf_aggregation(udf_type type,
Expand All @@ -486,6 +559,8 @@ std::unique_ptr<Base> make_udf_aggregation(udf_type type,
* groupby `COLLECT_LIST` aggregations into a final `COLLECT_LIST` result. As such, it requires the
* input lists column to be non-nullable (the child column containing list entries is not subjected
* to this requirement).
*
* @return A MERGE_LISTS aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_merge_lists_aggregation();
Expand All @@ -510,6 +585,7 @@ std::unique_ptr<Base> make_merge_lists_aggregation();
* during dropping duplicate list entries.
* @param nans_equal Flag to specify whether NaN values in floating point column should be
* considered equal during dropping duplicate list entries.
* @return A MERGE_SETS aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_merge_sets_aggregation(null_equality nulls_equal = null_equality::EQUAL,
Expand All @@ -526,6 +602,8 @@ std::unique_ptr<Base> make_merge_sets_aggregation(null_equality nulls_equal = nu
*
* The input `M2` aggregation values are expected to be all non-negative numbers, since they
* were output from `M2` aggregation.
*
* @return A MERGE_M2 aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_merge_m2_aggregation();
Expand All @@ -535,9 +613,10 @@ std::unique_ptr<Base> make_merge_m2_aggregation();
*
* Compute covariance between two columns.
* The input columns are child columns of a non-nullable struct columns.
* @param min_periods Minimum number of non-null observations required to produce a result.
* @param min_periods Minimum number of non-null observations required to produce a result
* @param ddof Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N is
* the number of non-null observations.
* @return A COVARIANCE aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_covariance_aggregation(size_type min_periods = 1, size_type ddof = 1);
Expand All @@ -549,7 +628,8 @@ std::unique_ptr<Base> make_covariance_aggregation(size_type min_periods = 1, siz
* The input columns are child columns of a non-nullable struct columns.
*
* @param type correlation_type
* @param min_periods Minimum number of non-null observations required to produce a result.
* @param min_periods Minimum number of non-null observations required to produce a result
* @return A CORRELATION aggregation object
*/
template <typename Base = aggregation>
std::unique_ptr<Base> make_correlation_aggregation(correlation_type type,
Expand Down Expand Up @@ -587,7 +667,7 @@ std::unique_ptr<Base> make_correlation_aggregation(correlation_type type,
* the computed tdigests: A value of 1000 will result in a tdigest containing no
* more than 1000 centroids (32 bytes each). Higher result in more accurate tdigest information.
*
* @returns A TDIGEST aggregation object.
* @return A TDIGEST aggregation object
*/
template <typename Base>
std::unique_ptr<Base> make_tdigest_aggregation(int max_centroids = 1000);
Expand Down Expand Up @@ -625,7 +705,7 @@ std::unique_ptr<Base> make_tdigest_aggregation(int max_centroids = 1000);
* the computed tdigests: A value of 1000 will result in a tdigest containing no
* more than 1000 centroids (32 bytes each). Higher result in more accurate tdigest information.
*
* @returns A MERGE_TDIGEST aggregation object.
* @return A MERGE_TDIGEST aggregation object
*/
template <typename Base>
std::unique_ptr<Base> make_merge_tdigest_aggregation(int max_centroids = 1000);
Expand Down
14 changes: 7 additions & 7 deletions cpp/include/cudf/concatenate.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -52,9 +52,9 @@ rmm::device_buffer concatenate_masks(
* If types of the input columns mismatch
*
* @param columns_to_concat host_span of column views to be concatenated into a single column
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return Unique pointer to a single table having all the rows from the
* elements of `columns_to_concat` respectively in the same order.
* @param mr Device memory resource used to allocate the returned column's device memory
* @return A single column having all the rows from the elements of `columns_to_concat` respectively
* in the same order.
*/
std::unique_ptr<column> concatenate(
host_span<column_view const> columns_to_concat,
Expand Down Expand Up @@ -82,9 +82,9 @@ std::unique_ptr<column> concatenate(
* If number of columns mismatch
*
* @param tables_to_concat host_span of table views to be concatenated into a single table
* @param mr Device memory resource used to allocate the returned table's device memory.
* @return Unique pointer to a single table having all the rows from the
* elements of `tables_to_concat` respectively in the same order.
* @param mr Device memory resource used to allocate the returned table's device memory
* @return A single table having all the rows from the elements of `tables_to_concat` respectively
* in the same order.
*/
std::unique_ptr<table> concatenate(
host_span<table_view const> tables_to_concat,
Expand Down
Loading

0 comments on commit 74a33d4

Please sign in to comment.