
Commit

Merge remote-tracking branch 'upstream/branch-22.10' into mwilson/remove_parquet_assertions
hyperbolic2346 committed Aug 16, 2022
2 parents ff22800 + 4178a51 commit b1a9d41
Showing 56 changed files with 506 additions and 499 deletions.
14 changes: 11 additions & 3 deletions cpp/benchmarks/common/generate_input.cu
@@ -785,13 +785,21 @@ std::unique_ptr<cudf::table> create_random_table(std::vector<cudf::type_id> cons
columns_vector output_columns;
std::transform(
dtype_ids.begin(), dtype_ids.end(), std::back_inserter(output_columns), [&](auto tid) mutable {
auto engine = deterministic_engine(seed_dist(seed_engine));
return cudf::type_dispatcher(
cudf::data_type(tid), create_rand_col_fn{}, profile, engine, num_rows.count);
return create_random_column(tid, num_rows, profile, seed_dist(seed_engine));
});
return std::make_unique<cudf::table>(std::move(output_columns));
}

std::unique_ptr<cudf::column> create_random_column(cudf::type_id dtype_id,
row_count num_rows,
data_profile const& profile,
unsigned seed)
{
auto engine = deterministic_engine(seed);
return cudf::type_dispatcher(
cudf::data_type(dtype_id), create_rand_col_fn{}, profile, engine, num_rows.count);
}

std::unique_ptr<cudf::table> create_sequence_table(std::vector<cudf::type_id> const& dtype_ids,
row_count num_rows,
std::optional<double> null_probability,
27 changes: 20 additions & 7 deletions cpp/benchmarks/common/generate_input.hpp
@@ -606,8 +606,8 @@ struct row_count {
* @param dtype_ids Vector of requested column types
* @param table_bytes Target size of the output table, in bytes. Some type may not produce columns
* of exact size
* @param data_params optional, set of data parameters describing the data profile for each type
* @param seed optional, seed for the pseudo-random engine
* @param data_params Optional, set of data parameters describing the data profile for each type
* @param seed Optional, seed for the pseudo-random engine
*/
std::unique_ptr<cudf::table> create_random_table(std::vector<cudf::type_id> const& dtype_ids,
table_size_bytes table_bytes,
@@ -619,23 +619,36 @@ std::unique_ptr<cudf::table> create_random_table(std::vector<cudf::type_id> cons
*
* @param dtype_ids Vector of requested column types
* @param num_rows Number of rows in the output table
* @param data_params optional, set of data parameters describing the data profile for each type
* @param seed optional, seed for the pseudo-random engine
* @param data_params Optional, set of data parameters describing the data profile for each type
* @param seed Optional, seed for the pseudo-random engine
*/
std::unique_ptr<cudf::table> create_random_table(std::vector<cudf::type_id> const& dtype_ids,
row_count num_rows,
data_profile const& data_params = data_profile{},
unsigned seed = 1);

/**
* @brief Deterministically generates a column filled with data with the given parameters.
*
* @param dtype_id Requested column type
* @param num_rows Number of rows in the output column
* @param data_params Optional, set of data parameters describing the data profile
* @param seed Optional, seed for the pseudo-random engine
*/
std::unique_ptr<cudf::column> create_random_column(cudf::type_id dtype_id,
row_count num_rows,
data_profile const& data_params = data_profile{},
unsigned seed = 1);

/**
* @brief Generate sequence columns starting with value 0 in first row and increasing by 1 in
* subsequent rows.
*
* @param dtype_ids Vector of requested column types
* @param num_rows Number of rows in the output table
* @param null_probability optional, probability of a null value
* @param null_probability Optional, probability of a null value
* no value implies no null mask, =0 implies all valids, >=1 implies all nulls
* @param seed optional, seed for the pseudo-random engine
* @param seed Optional, seed for the pseudo-random engine
* @return A table with the sequence columns.
*/
std::unique_ptr<cudf::table> create_sequence_table(
@@ -660,7 +673,7 @@ std::vector<cudf::type_id> cycle_dtypes(std::vector<cudf::type_id> const& dtype_
* @param size number of rows
* @param null_probability probability of a null value
* no value implies no null mask, =0 implies all valids, >=1 implies all nulls
* @param seed optional, seed for the pseudo-random engine
* @param seed Optional, seed for the pseudo-random engine
* @return null mask device buffer with random null mask data and null count
*/
std::pair<rmm::device_buffer, cudf::size_type> create_random_null_mask(
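
As a rough usage sketch of the new single-column helper (not part of this diff; the bounds, row count, and seed below are illustrative assumptions), create_random_column replaces the previous pattern of building a one-column table and extracting column 0:

// Illustrative sketch only -- not from this commit. Assumes generate_input.hpp
// is reachable on the include path of a cuDF benchmark target.
#include "generate_input.hpp"

#include <cudf/column/column.hpp>

#include <memory>

std::unique_ptr<cudf::column> make_demo_keys()
{
  // Same builder pattern used elsewhere in this diff; bounds 0..100 are arbitrary.
  data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
    cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, 100);

  // Previously: create_random_table({cudf::type_to_id<int32_t>()}, row_count{1000}, profile)
  // followed by ->get_column(0); now the column is produced directly.
  return create_random_column(cudf::type_to_id<int32_t>(), row_count{1000}, profile, /*seed=*/42);
}

Within create_random_table, each column now draws its own seed via seed_dist(seed_engine) before delegating to create_random_column, as shown in the generate_input.cu hunk above, so multi-column tables remain deterministic for a given seed.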
6 changes: 3 additions & 3 deletions cpp/benchmarks/copying/contiguous_split.cu
@@ -120,10 +120,10 @@ void BM_contiguous_split_strings(benchmark::State& state)
cudf::test::strings_column_wrapper one_col(h_strings.begin(), h_strings.end());
std::vector<std::unique_ptr<cudf::column>> src_cols(num_cols);
for (int64_t idx = 0; idx < num_cols; idx++) {
auto random_indices = create_random_table(
{cudf::type_id::INT32}, row_count{static_cast<cudf::size_type>(num_rows)}, profile);
auto random_indices = create_random_column(
cudf::type_id::INT32, row_count{static_cast<cudf::size_type>(num_rows)}, profile);
auto str_table = cudf::gather(cudf::table_view{{one_col}},
random_indices->get_column(0),
*random_indices,
(include_validity ? cudf::out_of_bounds_policy::NULLIFY
: cudf::out_of_bounds_policy::DONT_CHECK));
src_cols[idx] = std::move(str_table->release()[0]);
7 changes: 3 additions & 4 deletions cpp/benchmarks/filling/repeat.cpp
@@ -40,15 +40,14 @@ void BM_repeat(benchmark::State& state)
using sizeT = cudf::size_type;
data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
cudf::type_to_id<sizeT>(), distribution_id::UNIFORM, 0, 3);
auto repeat_table = create_random_table({cudf::type_to_id<sizeT>()}, row_count{n_rows}, profile);
cudf::column_view repeat_count{repeat_table->get_column(0)};
auto repeat_count = create_random_column(cudf::type_to_id<sizeT>(), row_count{n_rows}, profile);

// warm up
auto output = cudf::repeat(input, repeat_count);
auto output = cudf::repeat(input, *repeat_count);

for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
cudf::repeat(input, repeat_count);
cudf::repeat(input, *repeat_count);
}

auto data_bytes =
33 changes: 12 additions & 21 deletions cpp/benchmarks/groupby/group_max.cpp
@@ -27,38 +27,29 @@ void bench_groupby_max(nvbench::state& state, nvbench::type_list<Type>)
cudf::rmm_pool_raii pool_raii;
const auto size = static_cast<cudf::size_type>(state.get_int64("num_rows"));

auto const keys_table = [&] {
data_profile profile;
profile.set_null_probability(std::nullopt);
profile.set_cardinality(0);
profile.set_distribution_params<int32_t>(
auto const keys = [&] {
data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, 100);
return create_random_table({cudf::type_to_id<int32_t>()}, row_count{size}, profile);
return create_random_column(cudf::type_to_id<int32_t>(), row_count{size}, profile);
}();

auto const vals_table = [&] {
data_profile profile;
auto const vals = [&] {
auto builder = data_profile_builder().cardinality(0).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 1000);
if (const auto null_freq = state.get_float64("null_probability"); null_freq > 0) {
profile.set_null_probability({null_freq});
builder.null_probability(null_freq);
} else {
profile.set_null_probability(std::nullopt);
builder.no_validity();
}
profile.set_cardinality(0);
profile.set_distribution_params<Type>(cudf::type_to_id<Type>(),
distribution_id::UNIFORM,
static_cast<Type>(0),
static_cast<Type>(1000));
return create_random_table({cudf::type_to_id<Type>()}, row_count{size}, profile);
return create_random_column(cudf::type_to_id<Type>(), row_count{size}, data_profile{builder});
}();

auto const& keys = keys_table->get_column(0);
auto const& vals = vals_table->get_column(0);

auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys, keys, keys}));
auto keys_view = keys->view();
auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view}));

std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back(cudf::groupby::aggregation_request());
requests[0].values = vals;
requests[0].values = vals->view();
requests[0].aggregations.push_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value()));
6 changes: 2 additions & 4 deletions cpp/benchmarks/groupby/group_nth.cpp
@@ -36,10 +36,8 @@ void BM_pre_sorted_nth(benchmark::State& state)
cudf::type_to_id<int64_t>(), distribution_id::UNIFORM, 0, 100);
auto keys_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);
auto vals_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);
auto vals = create_random_column(cudf::type_to_id<int64_t>(), row_count{column_size}, profile);

cudf::column_view vals(vals_table->get_column(0));
auto sort_order = cudf::sorted_order(*keys_table);
auto sorted_keys = cudf::gather(*keys_table, *sort_order);
// No need to sort values using sort_order because they were generated randomly
@@ -48,7 +46,7 @@

std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back(cudf::groupby::aggregation_request());
requests[0].values = vals;
requests[0].values = vals->view();
requests[0].aggregations.push_back(
cudf::make_nth_element_aggregation<cudf::groupby_aggregation>(-1));

31 changes: 11 additions & 20 deletions cpp/benchmarks/groupby/group_nunique.cpp
@@ -43,36 +43,27 @@ void bench_groupby_nunique(nvbench::state& state, nvbench::type_list<Type>)
cudf::rmm_pool_raii pool_raii;
const auto size = static_cast<cudf::size_type>(state.get_int64("num_rows"));

auto const keys_table = [&] {
data_profile profile;
profile.set_null_probability(std::nullopt);
profile.set_cardinality(0);
profile.set_distribution_params<int32_t>(
auto const keys = [&] {
data_profile profile = data_profile_builder().cardinality(0).no_validity().distribution(
cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, 100);
return create_random_table({cudf::type_to_id<int32_t>()}, row_count{size}, profile);
return create_random_column(cudf::type_to_id<int32_t>(), row_count{size}, profile);
}();

auto const vals_table = [&] {
data_profile profile;
auto const vals = [&] {
data_profile profile = data_profile_builder().cardinality(0).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 1000);
if (const auto null_freq = state.get_float64("null_probability"); null_freq > 0) {
profile.set_null_probability({null_freq});
profile.set_null_probability(null_freq);
} else {
profile.set_null_probability(std::nullopt);
}
profile.set_cardinality(0);
profile.set_distribution_params<Type>(cudf::type_to_id<Type>(),
distribution_id::UNIFORM,
static_cast<Type>(0),
static_cast<Type>(1000));
return create_random_table({cudf::type_to_id<Type>()}, row_count{size}, profile);
return create_random_column(cudf::type_to_id<Type>(), row_count{size}, profile);
}();

auto const& keys = keys_table->get_column(0);
auto const& vals = vals_table->get_column(0);

auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys, keys, keys}));
auto gb_obj =
cudf::groupby::groupby(cudf::table_view({keys->view(), keys->view(), keys->view()}));
auto const requests = make_aggregation_request_vector(
vals, cudf::make_nunique_aggregation<cudf::groupby_aggregation>());
*vals, cudf::make_nunique_aggregation<cudf::groupby_aggregation>());

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value()));
state.exec(nvbench::exec_tag::sync,
20 changes: 6 additions & 14 deletions cpp/benchmarks/groupby/group_scan.cpp
@@ -34,19 +34,14 @@ void BM_basic_sum_scan(benchmark::State& state)

data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
cudf::type_to_id<int64_t>(), distribution_id::UNIFORM, 0, 100);
auto keys_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);
auto vals_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);
auto keys = create_random_column(cudf::type_to_id<int64_t>(), row_count{column_size}, profile);
auto vals = create_random_column(cudf::type_to_id<int64_t>(), row_count{column_size}, profile);

cudf::column_view keys(keys_table->get_column(0));
cudf::column_view vals(vals_table->get_column(0));

cudf::groupby::groupby gb_obj(cudf::table_view({keys, keys, keys}));
cudf::groupby::groupby gb_obj(cudf::table_view({keys->view(), keys->view(), keys->view()}));

std::vector<cudf::groupby::scan_request> requests;
requests.emplace_back(cudf::groupby::scan_request());
requests[0].values = vals;
requests[0].values = vals->view();
requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_scan_aggregation>());

for (auto _ : state) {
@@ -74,10 +69,7 @@ void BM_pre_sorted_sum_scan(benchmark::State& state)
auto keys_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);
profile.set_null_probability(0.1);
auto vals_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);

cudf::column_view vals(vals_table->get_column(0));
auto vals = create_random_column(cudf::type_to_id<int64_t>(), row_count{column_size}, profile);

auto sort_order = cudf::sorted_order(*keys_table);
auto sorted_keys = cudf::gather(*keys_table, *sort_order);
@@ -87,7 +79,7 @@

std::vector<cudf::groupby::scan_request> requests;
requests.emplace_back(cudf::groupby::scan_request());
requests[0].values = vals;
requests[0].values = vals->view();
requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_scan_aggregation>());

for (auto _ : state) {
5 changes: 2 additions & 3 deletions cpp/benchmarks/groupby/group_struct_keys.cpp
@@ -73,14 +73,13 @@ void bench_groupby_struct_keys(nvbench::state& state)
cudf::type_to_id<int64_t>(), distribution_id::UNIFORM, 0, 100);

auto const keys_table = cudf::table(std::move(child_cols));
auto const vals_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{n_rows}, profile);
auto const vals = create_random_column(cudf::type_to_id<int64_t>(), row_count{n_rows}, profile);

cudf::groupby::groupby gb_obj(keys_table.view());

std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back(cudf::groupby::aggregation_request());
requests[0].values = vals_table->get_column(0).view();
requests[0].values = vals->view();
requests[0].aggregations.push_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());

// Set up nvbench default stream
20 changes: 6 additions & 14 deletions cpp/benchmarks/groupby/group_sum.cpp
@@ -33,19 +33,14 @@ void BM_basic_sum(benchmark::State& state)

data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
cudf::type_to_id<int64_t>(), distribution_id::UNIFORM, 0, 100);
auto keys_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);
auto vals_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);
auto keys = create_random_column(cudf::type_to_id<int64_t>(), row_count{column_size}, profile);
auto vals = create_random_column(cudf::type_to_id<int64_t>(), row_count{column_size}, profile);

cudf::column_view keys(keys_table->get_column(0));
cudf::column_view vals(vals_table->get_column(0));

cudf::groupby::groupby gb_obj(cudf::table_view({keys, keys, keys}));
cudf::groupby::groupby gb_obj(cudf::table_view({keys->view(), keys->view(), keys->view()}));

std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back(cudf::groupby::aggregation_request());
requests[0].values = vals;
requests[0].values = vals->view();
requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());

for (auto _ : state) {
@@ -74,10 +69,7 @@ void BM_pre_sorted_sum(benchmark::State& state)
auto keys_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);
profile.set_null_probability(0.1);
auto vals_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);

cudf::column_view vals(vals_table->get_column(0));
auto vals = create_random_column(cudf::type_to_id<int64_t>(), row_count{column_size}, profile);

auto sort_order = cudf::sorted_order(*keys_table);
auto sorted_keys = cudf::gather(*keys_table, *sort_order);
@@ -87,7 +79,7 @@

std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back(cudf::groupby::aggregation_request());
requests[0].values = vals;
requests[0].values = vals->view();
requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());

for (auto _ : state) {
10 changes: 4 additions & 6 deletions cpp/benchmarks/io/text/multibyte_split.cpp
@@ -60,15 +60,13 @@ static cudf::string_scalar create_random_input(int32_t num_chars,
data_profile const table_profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, value_size_min, value_size_max);

auto const values_table = create_random_table( //
{cudf::type_id::STRING},
row_count{num_rows},
table_profile);
auto const values =
create_random_column(cudf::type_id::STRING, row_count{num_rows}, table_profile);

auto delim_scalar = cudf::make_string_scalar(delim);
auto delims_column = cudf::make_column_from_scalar(*delim_scalar, num_rows);
auto input_table = cudf::table_view({values_table->get_column(0).view(), delims_column->view()});
auto input_column = cudf::strings::concatenate(input_table);
auto input_table = cudf::table_view({values->view(), delims_column->view()});
auto input_column = cudf::strings::concatenate(input_table);

// extract the chars from the returned strings column.
auto input_column_contents = input_column->release();
8 changes: 3 additions & 5 deletions cpp/benchmarks/reduction/anyall.cpp
@@ -33,17 +33,15 @@ void BM_reduction_anyall(benchmark::State& state,
{
const cudf::size_type column_size{static_cast<cudf::size_type>(state.range(0))};
auto const dtype = cudf::type_to_id<type>();
data_profile const profile = data_profile_builder().distribution(
data_profile const profile = data_profile_builder().no_validity().distribution(
dtype, distribution_id::UNIFORM, 0, agg->kind == cudf::aggregation::ANY ? 0 : 100);
auto const table = create_random_table({dtype}, row_count{column_size}, profile);
table->get_column(0).set_null_mask(rmm::device_buffer{}, 0);
cudf::column_view values(table->view().column(0));
auto const values = create_random_column(dtype, row_count{column_size}, profile);

cudf::data_type output_dtype{cudf::type_id::BOOL8};

for (auto _ : state) {
cuda_event_timer timer(state, true);
auto result = cudf::reduce(values, agg, output_dtype);
auto result = cudf::reduce(*values, agg, output_dtype);
}
}
