Skip to content

Commit

Permalink
Fix the data generator element size for decimal types (#10225)
Browse files Browse the repository at this point in the history
Data generator uses `sizeof` for all fixed width types and thus gets incorrect size for the decimal types. This leads to random tables that are actually half the requested size, causing incorrect benchmark bandwidth results.
This PR changes the `avg_element_size` to use `data_type::size_of`, which accounts for decimal types.

Note: expected to effectively half the BW of decimal benchmarks that specify the table size in bytes.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - MithunR (https://github.com/mythrocks)

URL: #10225
  • Loading branch information
vuule authored Feb 8, 2022
1 parent 8014add commit 6502cea
Showing 1 changed file with 14 additions and 13 deletions.
27 changes: 14 additions & 13 deletions cpp/benchmarks/common/generate_input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,45 +65,46 @@ T get_distribution_mean(distribution_params<T> const& dist)
}

// Utilities to determine the mean size of an element, given the data profile
template <typename T>
std::enable_if_t<cudf::is_fixed_width<T>(), size_t> avg_element_size(data_profile const& profile)
template <typename T, CUDF_ENABLE_IF(cudf::is_fixed_width<T>())>
size_t non_fixed_width_size(data_profile const& profile)
{
return sizeof(T);
CUDF_FAIL("Should not be called, use `size_of` for this type instead");
}

template <typename T>
std::enable_if_t<!cudf::is_fixed_width<T>(), size_t> avg_element_size(data_profile const& profile)
template <typename T, CUDF_ENABLE_IF(!cudf::is_fixed_width<T>())>
size_t non_fixed_width_size(data_profile const& profile)
{
CUDF_FAIL("not implemented!");
}

template <>
size_t avg_element_size<cudf::string_view>(data_profile const& profile)
size_t non_fixed_width_size<cudf::string_view>(data_profile const& profile)
{
auto const dist = profile.get_distribution_params<cudf::string_view>().length_params;
return get_distribution_mean(dist);
}

template <>
size_t avg_element_size<cudf::list_view>(data_profile const& profile)
size_t non_fixed_width_size<cudf::list_view>(data_profile const& profile)
{
auto const dist_params = profile.get_distribution_params<cudf::list_view>();
auto const single_level_mean = get_distribution_mean(dist_params.length_params);
auto const element_size = cudf::size_of(cudf::data_type{dist_params.element_type});
return element_size * pow(single_level_mean, dist_params.max_depth);
}

struct avg_element_size_fn {
struct non_fixed_width_size_fn {
template <typename T>
size_t operator()(data_profile const& profile)
{
return avg_element_size<T>(profile);
return non_fixed_width_size<T>(profile);
}
};

size_t avg_element_bytes(data_profile const& profile, cudf::type_id tid)
size_t avg_element_size(data_profile const& profile, cudf::data_type dtype)
{
return cudf::type_dispatcher(cudf::data_type(tid), avg_element_size_fn{}, profile);
if (cudf::is_fixed_width(dtype)) { return cudf::size_of(dtype); }
return cudf::type_dispatcher(dtype, non_fixed_width_size_fn{}, profile);
}

/**
Expand Down Expand Up @@ -419,7 +420,7 @@ std::unique_ptr<cudf::column> create_random_column<cudf::string_view>(data_profi
random_value_fn<uint32_t>{profile.get_distribution_params<cudf::string_view>().length_params};
auto valid_dist = std::bernoulli_distribution{1. - profile.get_null_frequency()};

auto const avg_string_len = avg_element_size<cudf::string_view>(profile);
auto const avg_string_len = non_fixed_width_size<cudf::string_view>(profile);
auto const cardinality = std::min(profile.get_cardinality(), num_rows);
string_column_data samples(cardinality, cardinality * avg_string_len);
for (cudf::size_type si = 0; si < cardinality; ++si) {
Expand Down Expand Up @@ -593,7 +594,7 @@ std::unique_ptr<cudf::table> create_random_table(std::vector<cudf::type_id> cons
auto const out_dtype_ids = repeat_dtypes(dtype_ids, num_cols);
size_t const avg_row_bytes =
std::accumulate(out_dtype_ids.begin(), out_dtype_ids.end(), 0ul, [&](size_t sum, auto tid) {
return sum + avg_element_bytes(profile, tid);
return sum + avg_element_size(profile, cudf::data_type(tid));
});
cudf::size_type const num_rows = table_bytes.size / avg_row_bytes;

Expand Down

0 comments on commit 6502cea

Please sign in to comment.