From 1ef722d690bddfc1df48577dada44afe5f5d5aa0 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 7 Oct 2022 02:23:36 +0530 Subject: [PATCH] Fix decimal benchmark input data generation (#11863) closes https://github.com/rapidsai/cudf/issues/11850 Fixes decimal benchmark input data generation. The generated data alternated between two values because each element of `device_uvector<T>` holds both a value and a scale. The scale is fixed for a column, so when this data was copied to a `cudf::column`, the column's values alternated between the value and the scale. The fix is to use `device_storage_type_t<T>` instead of `T`. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/11863 --- cpp/benchmarks/common/generate_input.cu | 52 +++++++++++++------------ 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index 890a78bb9bf..2bcdaa6760c 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -247,40 +247,33 @@ struct random_value_fn()>> { */ template struct random_value_fn()>> { - using rep = typename T::rep; - rep const lower_bound; - rep const upper_bound; - distribution_fn dist; + using DeviceType = cudf::device_storage_type_t; + DeviceType const lower_bound; + DeviceType const upper_bound; + distribution_fn dist; std::optional scale; - random_value_fn(distribution_params const& desc) + random_value_fn(distribution_params const& desc) : lower_bound{desc.lower_bound}, upper_bound{desc.upper_bound}, - dist{make_distribution(desc.id, desc.lower_bound, desc.upper_bound)} + dist{make_distribution(desc.id, desc.lower_bound, desc.upper_bound)} { } - rmm::device_uvector operator()(thrust::minstd_rand& engine, unsigned size) + [[nodiscard]] 
numeric::scale_type get_scale(thrust::minstd_rand& engine) { if (not scale.has_value()) { - int const max_scale = std::numeric_limits::digits10; + constexpr int max_scale = std::numeric_limits::digits10; std::uniform_int_distribution scale_dist{-max_scale, max_scale}; std::mt19937 engine_scale(engine()); scale = numeric::scale_type{scale_dist(engine_scale)}; } - auto const ints = dist(engine, size); - rmm::device_uvector result(size, cudf::default_stream_value); - // Clamp the generated random value to the specified range - thrust::transform(thrust::device, - ints.begin(), - ints.end(), - result.begin(), - [scale = *(this->scale), - upper_bound = this->upper_bound, - lower_bound = this->lower_bound] __device__(auto int_value) { - return T{std::clamp(int_value, lower_bound, upper_bound), scale}; - }); - return result; + return scale.value_or(numeric::scale_type{0}); + } + + rmm::device_uvector operator()(thrust::minstd_rand& engine, unsigned size) + { + return dist(engine, size); } }; @@ -398,9 +391,17 @@ std::unique_ptr create_random_column(data_profile const& profile, distribution_params{1. 
- profile.get_null_probability().value_or(0)}); auto value_dist = random_value_fn{profile.get_distribution_params()}; + using DeviceType = cudf::device_storage_type_t; + cudf::data_type const dtype = [&]() { + if constexpr (cudf::is_fixed_point()) + return cudf::data_type{cudf::type_to_id(), value_dist.get_scale(engine)}; + else + return cudf::data_type{cudf::type_to_id()}; + }(); + // Distribution for picking elements from the array of samples auto const avg_run_len = profile.get_avg_run_length(); - rmm::device_uvector data(0, cudf::default_stream_value); + rmm::device_uvector data(0, cudf::default_stream_value); rmm::device_uvector null_mask(0, cudf::default_stream_value); if (profile.get_cardinality() == 0 and avg_run_len == 1) { @@ -412,11 +413,12 @@ std::unique_ptr create_random_column(data_profile const& profile, : profile_cardinality; }(); rmm::device_uvector samples_null_mask = valid_dist(engine, cardinality); - rmm::device_uvector samples = value_dist(engine, cardinality); + rmm::device_uvector samples = value_dist(engine, cardinality); + // generate n samples and gather. auto const sample_indices = sample_indices_with_run_length(avg_run_len, cardinality, num_rows, engine); - data = rmm::device_uvector(num_rows, cudf::default_stream_value); + data = rmm::device_uvector(num_rows, cudf::default_stream_value); null_mask = rmm::device_uvector(num_rows, cudf::default_stream_value); thrust::gather( thrust::device, sample_indices.begin(), sample_indices.end(), samples.begin(), data.begin()); @@ -431,7 +433,7 @@ std::unique_ptr create_random_column(data_profile const& profile, cudf::detail::valid_if(null_mask.begin(), null_mask.end(), thrust::identity{}); return std::make_unique( - cudf::data_type{cudf::type_to_id()}, + dtype, num_rows, data.release(), profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{});