
Commit

Merge remote-tracking branch 'upstream/branch-22.10' into mwilson/remove_parquet_assertions
hyperbolic2346 committed Aug 16, 2022
2 parents ff22800 + 4178a51 commit b1a9d41
Showing 56 changed files with 506 additions and 499 deletions.
14 changes: 11 additions & 3 deletions cpp/benchmarks/common/generate_input.cu
@@ -785,13 +785,21 @@ std::unique_ptr<cudf::table> create_random_table(std::vector<cudf::type_id> cons
columns_vector output_columns;
std::transform(
dtype_ids.begin(), dtype_ids.end(), std::back_inserter(output_columns), [&](auto tid) mutable {
auto engine = deterministic_engine(seed_dist(seed_engine));
return cudf::type_dispatcher(
cudf::data_type(tid), create_rand_col_fn{}, profile, engine, num_rows.count);
return create_random_column(tid, num_rows, profile, seed_dist(seed_engine));
});
return std::make_unique<cudf::table>(std::move(output_columns));
}

std::unique_ptr<cudf::column> create_random_column(cudf::type_id dtype_id,
row_count num_rows,
data_profile const& profile,
unsigned seed)
{
auto engine = deterministic_engine(seed);
return cudf::type_dispatcher(
cudf::data_type(dtype_id), create_rand_col_fn{}, profile, engine, num_rows.count);
}

std::unique_ptr<cudf::table> create_sequence_table(std::vector<cudf::type_id> const& dtype_ids,
row_count num_rows,
std::optional<double> null_probability,
27 changes: 20 additions & 7 deletions cpp/benchmarks/common/generate_input.hpp
@@ -606,8 +606,8 @@ struct row_count {
* @param dtype_ids Vector of requested column types
* @param table_bytes Target size of the output table, in bytes. Some type may not produce columns
* of exact size
* @param data_params optional, set of data parameters describing the data profile for each type
* @param seed optional, seed for the pseudo-random engine
* @param data_params Optional, set of data parameters describing the data profile for each type
* @param seed Optional, seed for the pseudo-random engine
*/
std::unique_ptr<cudf::table> create_random_table(std::vector<cudf::type_id> const& dtype_ids,
table_size_bytes table_bytes,
@@ -619,23 +619,36 @@ std::unique_ptr<cudf::table> create_random_table(std::vector<cudf::type_id> cons
*
* @param dtype_ids Vector of requested column types
* @param num_rows Number of rows in the output table
* @param data_params optional, set of data parameters describing the data profile for each type
* @param seed optional, seed for the pseudo-random engine
* @param data_params Optional, set of data parameters describing the data profile for each type
* @param seed Optional, seed for the pseudo-random engine
*/
std::unique_ptr<cudf::table> create_random_table(std::vector<cudf::type_id> const& dtype_ids,
row_count num_rows,
data_profile const& data_params = data_profile{},
unsigned seed = 1);

/**
* @brief Deterministically generates a column filled with data with the given parameters.
*
* @param dtype_id Requested column type
* @param num_rows Number of rows in the output column
* @param data_params Optional, set of data parameters describing the data profile
* @param seed Optional, seed for the pseudo-random engine
*/
std::unique_ptr<cudf::column> create_random_column(cudf::type_id dtype_id,
row_count num_rows,
data_profile const& data_params = data_profile{},
unsigned seed = 1);

/**
* @brief Generate sequence columns starting with value 0 in first row and increasing by 1 in
* subsequent rows.
*
* @param dtype_ids Vector of requested column types
* @param num_rows Number of rows in the output table
* @param null_probability optional, probability of a null value
* @param null_probability Optional, probability of a null value
* no value implies no null mask, =0 implies all valids, >=1 implies all nulls
* @param seed optional, seed for the pseudo-random engine
* @param seed Optional, seed for the pseudo-random engine
* @return A table with the sequence columns.
*/
std::unique_ptr<cudf::table> create_sequence_table(
@@ -660,7 +673,7 @@ std::vector<cudf::type_id> cycle_dtypes(std::vector<cudf::type_id> const& dtype_
* @param size number of rows
* @param null_probability probability of a null value
* no value implies no null mask, =0 implies all valids, >=1 implies all nulls
* @param seed optional, seed for the pseudo-random engine
* @param seed Optional, seed for the pseudo-random engine
* @return null mask device buffer with random null mask data and null count
*/
std::pair<rmm::device_buffer, cudf::size_type> create_random_null_mask(
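
As a rough usage sketch of the new single-column helper (not part of this diff; the bounds, row count, and seed below are illustrative assumptions), create_random_column replaces the previous pattern of building a one-column table and extracting column 0:

// Illustrative sketch only -- not from this commit. Assumes generate_input.hpp
// is reachable on the include path of a cuDF benchmark target.
#include "generate_input.hpp"

#include <cudf/column/column.hpp>

#include <memory>

std::unique_ptr<cudf::column> make_demo_keys()
{
  // Same builder pattern used elsewhere in this diff; bounds 0..100 are arbitrary.
  data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
    cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, 100);

  // Previously: create_random_table({cudf::type_to_id<int32_t>()}, row_count{1000}, profile)
  // followed by ->get_column(0); now the column is produced directly.
  return create_random_column(cudf::type_to_id<int32_t>(), row_count{1000}, profile, /*seed=*/42);
}

Within create_random_table, each column now draws its own seed via seed_dist(seed_engine) before delegating to create_random_column, as shown in the generate_input.cu hunk above, so multi-column tables remain deterministic for a given seed.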
6 changes: 3 additions & 3 deletions cpp/benchmarks/copying/contiguous_split.cu
@@ -120,10 +120,10 @@ void BM_contiguous_split_strings(benchmark::State& state)
cudf::test::strings_column_wrapper one_col(h_strings.begin(), h_strings.end());
std::vector<std::unique_ptr<cudf::column>> src_cols(num_cols);
for (int64_t idx = 0; idx < num_cols; idx++) {
auto random_indices = create_random_table(
{cudf::type_id::INT32}, row_count{static_cast<cudf::size_type>(num_rows)}, profile);
auto random_indices = create_random_column(
cudf::type_id::INT32, row_count{static_cast<cudf::size_type>(num_rows)}, profile);
auto str_table = cudf::gather(cudf::table_view{{one_col}},
random_indices->get_column(0),
*random_indices,
(include_validity ? cudf::out_of_bounds_policy::NULLIFY
: cudf::out_of_bounds_policy::DONT_CHECK));
src_cols[idx] = std::move(str_table->release()[0]);
7 changes: 3 additions & 4 deletions cpp/benchmarks/filling/repeat.cpp
@@ -40,15 +40,14 @@ void BM_repeat(benchmark::State& state)
using sizeT = cudf::size_type;
data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
cudf::type_to_id<sizeT>(), distribution_id::UNIFORM, 0, 3);
auto repeat_table = create_random_table({cudf::type_to_id<sizeT>()}, row_count{n_rows}, profile);
cudf::column_view repeat_count{repeat_table->get_column(0)};
auto repeat_count = create_random_column(cudf::type_to_id<sizeT>(), row_count{n_rows}, profile);

// warm up
auto output = cudf::repeat(input, repeat_count);
auto output = cudf::repeat(input, *repeat_count);

for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
cudf::repeat(input, repeat_count);
cudf::repeat(input, *repeat_count);
}

auto data_bytes =
33 changes: 12 additions & 21 deletions cpp/benchmarks/groupby/group_max.cpp
@@ -27,38 +27,29 @@ void bench_groupby_max(nvbench::state& state, nvbench::type_list<Type>)
cudf::rmm_pool_raii pool_raii;
const auto size = static_cast<cudf::size_type>(state.get_int64("num_rows"));

auto const keys_table = [&] {
data_profile profile;
profile.set_null_probability(std::nullopt);
profile.set_cardinality(0);
profile.set_distribution_params<int32_t>(
auto const keys = [&] {
data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, 100);
return create_random_table({cudf::type_to_id<int32_t>()}, row_count{size}, profile);
return create_random_column(cudf::type_to_id<int32_t>(), row_count{size}, profile);
}();

auto const vals_table = [&] {
data_profile profile;
auto const vals = [&] {
auto builder = data_profile_builder().cardinality(0).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 1000);
if (const auto null_freq = state.get_float64("null_probability"); null_freq > 0) {
profile.set_null_probability({null_freq});
builder.null_probability(null_freq);
} else {
profile.set_null_probability(std::nullopt);
builder.no_validity();
}
profile.set_cardinality(0);
profile.set_distribution_params<Type>(cudf::type_to_id<Type>(),
distribution_id::UNIFORM,
static_cast<Type>(0),
static_cast<Type>(1000));
return create_random_table({cudf::type_to_id<Type>()}, row_count{size}, profile);
return create_random_column(cudf::type_to_id<Type>(), row_count{size}, data_profile{builder});
}();

auto const& keys = keys_table->get_column(0);
auto const& vals = vals_table->get_column(0);

auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys, keys, keys}));
auto keys_view = keys->view();
auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view}));

std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back(cudf::groupby::aggregation_request());
requests[0].values = vals;
requests[0].values = vals->view();
requests[0].aggregations.push_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value()));
6 changes: 2 additions & 4 deletions cpp/benchmarks/groupby/group_nth.cpp
@@ -36,10 +36,8 @@ void BM_pre_sorted_nth(benchmark::State& state)
cudf::type_to_id<int64_t>(), distribution_id::UNIFORM, 0, 100);
auto keys_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);
auto vals_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);
auto vals = create_random_column(cudf::type_to_id<int64_t>(), row_count{column_size}, profile);

cudf::column_view vals(vals_table->get_column(0));
auto sort_order = cudf::sorted_order(*keys_table);
auto sorted_keys = cudf::gather(*keys_table, *sort_order);
// No need to sort values using sort_order because they were generated randomly
@@ -48,7 +46,7 @@

std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back(cudf::groupby::aggregation_request());
requests[0].values = vals;
requests[0].values = vals->view();
requests[0].aggregations.push_back(
cudf::make_nth_element_aggregation<cudf::groupby_aggregation>(-1));

31 changes: 11 additions & 20 deletions cpp/benchmarks/groupby/group_nunique.cpp
@@ -43,36 +43,27 @@ void bench_groupby_nunique(nvbench::state& state, nvbench::type_list<Type>)
cudf::rmm_pool_raii pool_raii;
const auto size = static_cast<cudf::size_type>(state.get_int64("num_rows"));

auto const keys_table = [&] {
data_profile profile;
profile.set_null_probability(std::nullopt);
profile.set_cardinality(0);
profile.set_distribution_params<int32_t>(
auto const keys = [&] {
data_profile profile = data_profile_builder().cardinality(0).no_validity().distribution(
cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, 100);
return create_random_table({cudf::type_to_id<int32_t>()}, row_count{size}, profile);
return create_random_column(cudf::type_to_id<int32_t>(), row_count{size}, profile);
}();

auto const vals_table = [&] {
data_profile profile;
auto const vals = [&] {
data_profile profile = data_profile_builder().cardinality(0).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 1000);
if (const auto null_freq = state.get_float64("null_probability"); null_freq > 0) {
profile.set_null_probability({null_freq});
profile.set_null_probability(null_freq);
} else {
profile.set_null_probability(std::nullopt);
}
profile.set_cardinality(0);
profile.set_distribution_params<Type>(cudf::type_to_id<Type>(),
distribution_id::UNIFORM,
static_cast<Type>(0),
static_cast<Type>(1000));
return create_random_table({cudf::type_to_id<Type>()}, row_count{size}, profile);
return create_random_column(cudf::type_to_id<Type>(), row_count{size}, profile);
}();

auto const& keys = keys_table->get_column(0);
auto const& vals = vals_table->get_column(0);

auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys, keys, keys}));
auto gb_obj =
cudf::groupby::groupby(cudf::table_view({keys->view(), keys->view(), keys->view()}));
auto const requests = make_aggregation_request_vector(
vals, cudf::make_nunique_aggregation<cudf::groupby_aggregation>());
*vals, cudf::make_nunique_aggregation<cudf::groupby_aggregation>());

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value()));
state.exec(nvbench::exec_tag::sync,
20 changes: 6 additions & 14 deletions cpp/benchmarks/groupby/group_scan.cpp
@@ -34,19 +34,14 @@ void BM_basic_sum_scan(benchmark::State& state)

data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
cudf::type_to_id<int64_t>(), distribution_id::UNIFORM, 0, 100);
auto keys_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);
auto vals_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);
auto keys = create_random_column(cudf::type_to_id<int64_t>(), row_count{column_size}, profile);
auto vals = create_random_column(cudf::type_to_id<int64_t>(), row_count{column_size}, profile);

cudf::column_view keys(keys_table->get_column(0));
cudf::column_view vals(vals_table->get_column(0));

cudf::groupby::groupby gb_obj(cudf::table_view({keys, keys, keys}));
cudf::groupby::groupby gb_obj(cudf::table_view({keys->view(), keys->view(), keys->view()}));

std::vector<cudf::groupby::scan_request> requests;
requests.emplace_back(cudf::groupby::scan_request());
requests[0].values = vals;
requests[0].values = vals->view();
requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_scan_aggregation>());

for (auto _ : state) {
@@ -74,10 +69,7 @@ void BM_pre_sorted_sum_scan(benchmark::State& state)
auto keys_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);
profile.set_null_probability(0.1);
auto vals_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);

cudf::column_view vals(vals_table->get_column(0));
auto vals = create_random_column(cudf::type_to_id<int64_t>(), row_count{column_size}, profile);

auto sort_order = cudf::sorted_order(*keys_table);
auto sorted_keys = cudf::gather(*keys_table, *sort_order);
@@ -87,7 +79,7 @@

std::vector<cudf::groupby::scan_request> requests;
requests.emplace_back(cudf::groupby::scan_request());
requests[0].values = vals;
requests[0].values = vals->view();
requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_scan_aggregation>());

for (auto _ : state) {
5 changes: 2 additions & 3 deletions cpp/benchmarks/groupby/group_struct_keys.cpp
@@ -73,14 +73,13 @@ void bench_groupby_struct_keys(nvbench::state& state)
cudf::type_to_id<int64_t>(), distribution_id::UNIFORM, 0, 100);

auto const keys_table = cudf::table(std::move(child_cols));
auto const vals_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{n_rows}, profile);
auto const vals = create_random_column(cudf::type_to_id<int64_t>(), row_count{n_rows}, profile);

cudf::groupby::groupby gb_obj(keys_table.view());

std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back(cudf::groupby::aggregation_request());
requests[0].values = vals_table->get_column(0).view();
requests[0].values = vals->view();
requests[0].aggregations.push_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());

// Set up nvbench default stream
20 changes: 6 additions & 14 deletions cpp/benchmarks/groupby/group_sum.cpp
@@ -33,19 +33,14 @@ void BM_basic_sum(benchmark::State& state)

data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
cudf::type_to_id<int64_t>(), distribution_id::UNIFORM, 0, 100);
auto keys_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);
auto vals_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);
auto keys = create_random_column(cudf::type_to_id<int64_t>(), row_count{column_size}, profile);
auto vals = create_random_column(cudf::type_to_id<int64_t>(), row_count{column_size}, profile);

cudf::column_view keys(keys_table->get_column(0));
cudf::column_view vals(vals_table->get_column(0));

cudf::groupby::groupby gb_obj(cudf::table_view({keys, keys, keys}));
cudf::groupby::groupby gb_obj(cudf::table_view({keys->view(), keys->view(), keys->view()}));

std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back(cudf::groupby::aggregation_request());
requests[0].values = vals;
requests[0].values = vals->view();
requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());

for (auto _ : state) {
@@ -74,10 +69,7 @@ void BM_pre_sorted_sum(benchmark::State& state)
auto keys_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);
profile.set_null_probability(0.1);
auto vals_table =
create_random_table({cudf::type_to_id<int64_t>()}, row_count{column_size}, profile);

cudf::column_view vals(vals_table->get_column(0));
auto vals = create_random_column(cudf::type_to_id<int64_t>(), row_count{column_size}, profile);

auto sort_order = cudf::sorted_order(*keys_table);
auto sorted_keys = cudf::gather(*keys_table, *sort_order);
@@ -87,7 +79,7 @@

std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back(cudf::groupby::aggregation_request());
requests[0].values = vals;
requests[0].values = vals->view();
requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());

for (auto _ : state) {
10 changes: 4 additions & 6 deletions cpp/benchmarks/io/text/multibyte_split.cpp
@@ -60,15 +60,13 @@ static cudf::string_scalar create_random_input(int32_t num_chars,
data_profile const table_profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, value_size_min, value_size_max);

auto const values_table = create_random_table( //
{cudf::type_id::STRING},
row_count{num_rows},
table_profile);
auto const values =
create_random_column(cudf::type_id::STRING, row_count{num_rows}, table_profile);

auto delim_scalar = cudf::make_string_scalar(delim);
auto delims_column = cudf::make_column_from_scalar(*delim_scalar, num_rows);
auto input_table = cudf::table_view({values_table->get_column(0).view(), delims_column->view()});
auto input_column = cudf::strings::concatenate(input_table);
auto input_table = cudf::table_view({values->view(), delims_column->view()});
auto input_column = cudf::strings::concatenate(input_table);

// extract the chars from the returned strings column.
auto input_column_contents = input_column->release();
8 changes: 3 additions & 5 deletions cpp/benchmarks/reduction/anyall.cpp
@@ -33,17 +33,15 @@ void BM_reduction_anyall(benchmark::State& state,
{
const cudf::size_type column_size{static_cast<cudf::size_type>(state.range(0))};
auto const dtype = cudf::type_to_id<type>();
data_profile const profile = data_profile_builder().distribution(
data_profile const profile = data_profile_builder().no_validity().distribution(
dtype, distribution_id::UNIFORM, 0, agg->kind == cudf::aggregation::ANY ? 0 : 100);
auto const table = create_random_table({dtype}, row_count{column_size}, profile);
table->get_column(0).set_null_mask(rmm::device_buffer{}, 0);
cudf::column_view values(table->view().column(0));
auto const values = create_random_column(dtype, row_count{column_size}, profile);

cudf::data_type output_dtype{cudf::type_id::BOOL8};

for (auto _ : state) {
cuda_event_timer timer(state, true);
auto result = cudf::reduce(values, agg, output_dtype);
auto result = cudf::reduce(*values, agg, output_dtype);
}
}
