Skip to content

Commit

Permalink
Fix quantile gtests coded in namespace cudf::test (#12049)
Browse files Browse the repository at this point in the history
Fixes `cpp/tests/quantiles` gtests source files coded in namespace `cudf::test`.
The `tdigest_utilities.cu` was moved to `cpp/tests/utilities` since it is used by the quantiles, groupby, and reductions tests. Also, the header for the functions defined in this source file is in `cpp/include/cudf_tests/`.

The `cpp/include/cudf_tests/tdigest_utilities.cuh` was also including a source file header from `cudf/tests/groupby` which seemed odd and was corrected by moving the code it needed directly into the `tdigest_utilities.cuh` header. These functions were used by quantiles, groupby, reductions, etc so it made sense for them to be moved into this utility header.

Simply reworking some of the code in `percentile_approx_test.cu` allowed it to become a `.cpp` file as well.
Also made some minor changes to the `tdigest_column_view` class to isolate a functor inside the class instead of the namespace scope.

No function or test has changed; the source code was just reworked or moved around.

Reference #11734

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #12049
  • Loading branch information
davidwendt authored Nov 7, 2022
1 parent 17b6b2e commit f9a2512
Show file tree
Hide file tree
Showing 10 changed files with 323 additions and 343 deletions.
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -690,10 +690,10 @@ add_library(cudf::cudf ALIAS cudf)
add_library(
cudftestutil STATIC
tests/io/metadata_utilities.cpp
tests/quantiles/tdigest_utilities.cu
tests/utilities/base_fixture.cpp
tests/utilities/column_utilities.cu
tests/utilities/table_utilities.cu
tests/utilities/tdigest_utilities.cu
)

set_target_properties(
Expand Down
36 changes: 17 additions & 19 deletions cpp/include/cudf/tdigest/tdigest_column_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -22,24 +22,6 @@
namespace cudf {
namespace tdigest {

/**
 * @brief Functor that reports how many entries each tdigest in a column holds.
 *
 * The count for a tdigest is the difference between two consecutive entries
 * of the offsets array.
 */
struct tdigest_size {
  size_type const* offsets;  ///< Offsets of the t-digest column

  /**
   * @brief Returns the size of one tdigest in the column
   *
   * @param tdigest_index Index of the tdigest in the column
   * @return Size of the tdigest
   */
  __device__ size_type operator()(size_type tdigest_index)
  {
    auto const first = offsets[tdigest_index];
    auto const last  = offsets[tdigest_index + 1];
    return last - first;
  }
};

/**
* @brief Given a column_view containing tdigest data, an instance of this class
* provides a wrapper on the compound column for tdigest operations.
Expand Down Expand Up @@ -127,6 +109,22 @@ class tdigest_column_view : private column_view {
*/
[[nodiscard]] column_view weights() const;

/**
 * @brief Functor that reports how many entries each tdigest in a column holds.
 *
 * The count for a tdigest is the difference between two consecutive entries
 * of the offsets array.
 */
struct tdigest_size_fn {
  size_type const* offsets;  ///< Offsets of the t-digest column

  /**
   * @brief Returns the size of one tdigest in the column
   *
   * @param tdigest_index Index of the tdigest in the column
   * @return Size of the tdigest
   */
  __device__ size_type operator()(size_type tdigest_index)
  {
    auto const first = offsets[tdigest_index];
    auto const last  = offsets[tdigest_index + 1];
    return last - first;
  }
};
/**
* @brief Returns an iterator that returns the size of each tdigest
* in the column (each row is 1 digest)
Expand All @@ -136,7 +134,7 @@ class tdigest_column_view : private column_view {
[[nodiscard]] auto size_begin() const
{
return cudf::detail::make_counting_transform_iterator(
0, tdigest_size{centroids().offsets_begin()});
0, tdigest_size_fn{centroids().offsets_begin()});
}

/**
Expand Down
58 changes: 54 additions & 4 deletions cpp/include/cudf_test/tdigest_utilities.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,14 @@

#pragma once

#include <cudf_test/column_wrapper.hpp>

#include <cudf/detail/tdigest/tdigest.hpp>
#include <cudf/detail/unary.hpp>
#include <cudf/groupby.hpp>
#include <cudf/tdigest/tdigest_column_view.cuh>
#include <cudf/utilities/default_stream.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <tests/groupby/groupby_test_util.hpp>

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/extrema.h>
Expand Down Expand Up @@ -102,6 +100,58 @@ struct tdigest_gen {
// @endcond
};

/**
 * @brief Returns the next `rand()` value scaled by `RAND_MAX`.
 *
 * Both operands are converted to `T` before the division, so the division is
 * carried out in `T`.
 *
 * @tparam T Type used for the division and for the result
 * @return `rand()` divided by `RAND_MAX`, computed in `T`
 */
template <typename T>
inline T frand()
{
  T const numerator   = static_cast<T>(rand());
  T const denominator = static_cast<T>(RAND_MAX);
  return numerator / denominator;
}

/**
 * @brief Returns a pseudo-random value offset from `min` by a fraction of `max - min`.
 *
 * The fraction comes from `frand<T>()`; the scaled offset is converted back to
 * `T` before being added to `min`.
 *
 * @tparam T Type of the bounds and of the result
 * @param min Value the offset is added to
 * @param max Value that, together with `min`, determines the span being scaled
 * @return `min` plus the scaled offset
 */
template <typename T>
inline T rand_range(T min, T max)
{
  auto const span = max - min;
  T const offset  = static_cast<T>(frand<T>() * span);
  return min + offset;
}

/**
 * @brief Builds a column of pseudo-random values laid out bucket by bucket.
 *
 * For every entry `i` of `sizes`, `sizes[i]` values are generated with
 * `rand_range` between the previous bucket bound (0 for the first bucket) and
 * `buckets[i]`. `buckets` is indexed with the indices of `sizes`, so it needs
 * at least as many entries. The generator is re-seeded with `srand(0)` on each
 * call.
 *
 * @param buckets Upper bound of each bucket
 * @param sizes Number of values to generate for each bucket
 * @param t Type the generated double values are cast to
 * @param sorted If true, the values are sorted before the column is built
 * @return Column produced by casting the generated values to `t`
 */
inline std::unique_ptr<column> generate_typed_percentile_distribution(
  std::vector<double> const& buckets,
  std::vector<int> const& sizes,
  data_type t,
  bool sorted = false)
{
  srand(0);

  std::vector<double> samples;
  samples.reserve(std::reduce(sizes.begin(), sizes.end(), 0));

  // The lower bound of a bucket is the upper bound of the one before it.
  double lower = 0.0;
  for (size_t bucket = 0; bucket < sizes.size(); ++bucket) {
    double const upper = buckets[bucket];
    for (int remaining = sizes[bucket]; remaining > 0; --remaining) {
      samples.push_back(rand_range(lower, upper));
    }
    lower = upper;
  }

  if (sorted) { std::sort(samples.begin(), samples.end()); }

  cudf::test::fixed_width_column_wrapper<double> src(samples.begin(), samples.end());
  return cudf::cast(src, t);
}

// "Standardized" refers to the parameters passed to generate_typed_percentile_distribution.
// The intent is to provide one fixed set of inputs for use with tdigest generation tests and
// percentile_approx tests:
//   std::vector<double> buckets{10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0};
//   std::vector<int> sizes{50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000};
inline std::unique_ptr<column> generate_standardized_percentile_distribution(
  data_type t = data_type{type_id::FLOAT64}, bool sorted = false)
{
  std::vector<double> const bucket_bounds{
    10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0};
  std::vector<int> const bucket_sizes{
    50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000};
  return generate_typed_percentile_distribution(bucket_bounds, bucket_sizes, t, sorted);
}

/**
* @brief Compare a tdigest column against a sampling of expected values.
*/
Expand Down
4 changes: 1 addition & 3 deletions cpp/src/quantiles/tdigest/tdigest_column_view.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -22,8 +22,6 @@
namespace cudf {
namespace tdigest {

using namespace cudf;

tdigest_column_view::tdigest_column_view(column_view const& col) : column_view(col)
{
// sanity check that this is actually tdigest data
Expand Down
2 changes: 1 addition & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ ConfigureTest(HASH_MAP_TEST hash_map/map_test.cu)
# ##################################################################################################
# * quantiles tests -------------------------------------------------------------------------------
ConfigureTest(
QUANTILES_TEST quantiles/percentile_approx_test.cu quantiles/quantile_test.cpp
QUANTILES_TEST quantiles/percentile_approx_test.cpp quantiles/quantile_test.cpp
quantiles/quantiles_test.cpp
)

Expand Down
54 changes: 1 addition & 53 deletions cpp/tests/groupby/groupby_test_util.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -131,57 +131,5 @@ inline void test_single_scan(column_view const& keys,
expect_vals, *result.second[0].results[0], debug_output_level::ALL_ERRORS);
}

/**
 * @brief Returns the next `rand()` value scaled by `RAND_MAX`.
 *
 * Both operands are converted to `T` before the division, so the division is
 * carried out in `T`.
 *
 * @tparam T Type used for the division and for the result
 * @return `rand()` divided by `RAND_MAX`, computed in `T`
 */
template <typename T>
inline T frand()
{
  T const numerator   = static_cast<T>(rand());
  T const denominator = static_cast<T>(RAND_MAX);
  return numerator / denominator;
}

/**
 * @brief Returns a pseudo-random value offset from `min` by a fraction of `max - min`.
 *
 * The fraction comes from `frand<T>()`; the scaled offset is converted back to
 * `T` before being added to `min`.
 *
 * @tparam T Type of the bounds and of the result
 * @param min Value the offset is added to
 * @param max Value that, together with `min`, determines the span being scaled
 * @return `min` plus the scaled offset
 */
template <typename T>
inline T rand_range(T min, T max)
{
  auto const span = max - min;
  T const offset  = static_cast<T>(frand<T>() * span);
  return min + offset;
}

/**
 * @brief Builds a column of pseudo-random values laid out bucket by bucket.
 *
 * For every entry `i` of `sizes`, `sizes[i]` values are generated with
 * `rand_range` between the previous bucket bound (0 for the first bucket) and
 * `buckets[i]`. `buckets` is indexed with the indices of `sizes`, so it needs
 * at least as many entries. The generator is re-seeded with `srand(0)` on each
 * call.
 *
 * @param buckets Upper bound of each bucket
 * @param sizes Number of values to generate for each bucket
 * @param t Type the generated double values are cast to
 * @param sorted If true, the values are sorted before the column is built
 * @return Column produced by casting the generated values to `t`
 */
inline std::unique_ptr<column> generate_typed_percentile_distribution(
  std::vector<double> const& buckets,
  std::vector<int> const& sizes,
  data_type t,
  bool sorted = false)
{
  srand(0);

  std::vector<double> samples;
  samples.reserve(std::reduce(sizes.begin(), sizes.end(), 0));

  // The lower bound of a bucket is the upper bound of the one before it.
  double lower = 0.0;
  for (size_t bucket = 0; bucket < sizes.size(); ++bucket) {
    double const upper = buckets[bucket];
    for (int remaining = sizes[bucket]; remaining > 0; --remaining) {
      samples.push_back(rand_range(lower, upper));
    }
    lower = upper;
  }

  if (sorted) { std::sort(samples.begin(), samples.end()); }

  cudf::test::fixed_width_column_wrapper<double> src(samples.begin(), samples.end());
  return cudf::cast(src, t);
}

// "Standardized" refers to the parameters passed to generate_typed_percentile_distribution.
// The intent is to provide one fixed set of inputs for use with tdigest generation tests and
// percentile_approx tests:
//   std::vector<double> buckets{10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0};
//   std::vector<int> sizes{50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000};
inline std::unique_ptr<column> generate_standardized_percentile_distribution(
  data_type t = data_type{type_id::FLOAT64}, bool sorted = false)
{
  std::vector<double> const bucket_bounds{
    10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0};
  std::vector<int> const bucket_sizes{
    50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000};
  return generate_typed_percentile_distribution(bucket_bounds, bucket_sizes, t, sorted);
}

} // namespace test
} // namespace cudf
Loading

0 comments on commit f9a2512

Please sign in to comment.