Merge remote-tracking branch 'upstream/branch-22.02' into refactor/misc_cleanup
rapidsai · Dec 23, 2021 · 3637574 · 3637574
2 parents 63e3896 + 04f4219
commit 3637574
Show file tree

Hide file tree

Showing 40 changed files with 2,228 additions and 906 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
@@ -219,7 +219,7 @@ else
     KAFKA_CONDA_FILE=${KAFKA_CONDA_FILE//-/=} #convert to conda install
 
     gpuci_logger "Installing $CUDF_CONDA_FILE & $KAFKA_CONDA_FILE"
-    conda install -c ${CONDA_ARTIFACT_PATH} "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE"
+    gpuci_mamba_retry install -c ${CONDA_ARTIFACT_PATH} "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE"
 
     install_dask
 

diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp
@@ -161,8 +161,29 @@ struct random_value_fn<T, typename std::enable_if_t<cudf::is_chrono<T>()>> {
  */
 template <typename T>
 struct random_value_fn<T, typename std::enable_if_t<cudf::is_fixed_point<T>()>> {
-  random_value_fn(distribution_params<T> const&) {}
-  T operator()(std::mt19937& engine) { CUDF_FAIL("Not implemented"); }
+  using rep = typename T::rep;
+  rep const lower_bound;
+  rep const upper_bound;
+  distribution_fn<rep> dist;
+  std::optional<numeric::scale_type> scale;
+
+  random_value_fn(distribution_params<rep> const& desc)
+    : lower_bound{desc.lower_bound},
+      upper_bound{desc.upper_bound},
+      dist{make_distribution<rep>(desc.id, desc.lower_bound, desc.upper_bound)}
+  {
+  }
+
+  T operator()(std::mt19937& engine)
+  {
+    if (not scale.has_value()) {
+      int const max_scale = std::numeric_limits<rep>::digits10;
+      auto scale_dist     = make_distribution<int>(distribution_id::NORMAL, -max_scale, max_scale);
+      scale = numeric::scale_type{std::max(std::min(scale_dist(engine), max_scale), -max_scale)};
+    }
+    // Clamp the generated random value to the specified range
+    return T{std::max(std::min(dist(engine), upper_bound), lower_bound), *scale};
+  }
 };
 
 /**

diff --git a/cpp/benchmarks/common/generate_benchmark_input.hpp b/cpp/benchmarks/common/generate_benchmark_input.hpp
@@ -216,6 +216,7 @@ class data_profile {
   distribution_params<cudf::string_view> string_dist_desc{{distribution_id::NORMAL, 0, 32}};
   distribution_params<cudf::list_view> list_dist_desc{
     cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 100}, 2};
+  std::map<cudf::type_id, distribution_params<__uint128_t>> decimal_params;
 
   double bool_probability        = 0.5;
   double null_frequency          = 0.01;
@@ -284,9 +285,17 @@ class data_profile {
   }
 
   template <typename T, typename std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
-  distribution_params<T> get_distribution_params() const
+  distribution_params<typename T::rep> get_distribution_params() const
   {
-    CUDF_FAIL("Not implemented");
+    using rep = typename T::rep;
+    auto it   = decimal_params.find(cudf::type_to_id<T>());
+    if (it == decimal_params.end()) {
+      auto const range = default_range<rep>();
+      return distribution_params<rep>{default_distribution_id<rep>(), range.first, range.second};
+    } else {
+      auto& desc = it->second;
+      return {desc.id, static_cast<rep>(desc.lower_bound), static_cast<rep>(desc.upper_bound)};
+    }
   }
 
   auto get_bool_probability() const { return bool_probability; }

diff --git a/cpp/benchmarks/common/random_distribution_factory.hpp b/cpp/benchmarks/common/random_distribution_factory.hpp
@@ -21,19 +21,24 @@
 #include <memory>
 #include <random>
 
+/**
+ * @brief Generates a normal(binomial) distribution between zero and upper_bound.
+ */
 template <typename T, typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
-auto make_normal_dist(T range_start, T range_end)
+auto make_normal_dist(T upper_bound)
 {
-  using uT            = typename std::make_unsigned<T>::type;
-  uT const range_size = range_end - range_start;
-  return std::binomial_distribution<uT>(range_size, 0.5);
+  using uT = typename std::make_unsigned<T>::type;
+  return std::binomial_distribution<uT>(upper_bound, 0.5);
 }
 
+/**
+ * @brief Generates a normal distribution between zero and upper_bound.
+ */
 template <typename T, std::enable_if_t<cudf::is_floating_point<T>()>* = nullptr>
-auto make_normal_dist(T range_start, T range_end)
+auto make_normal_dist(T upper_bound)
 {
-  T const mean   = range_start / 2 + range_end / 2;
-  T const stddev = range_end / 6 - range_start / 6;
+  T const mean   = upper_bound / 2;
+  T const stddev = upper_bound / 6;
   return std::normal_distribution<T>(mean, stddev);
 }
 
@@ -82,8 +87,8 @@ distribution_fn<T> make_distribution(distribution_id did, T lower_bound, T upper
 {
   switch (did) {
     case distribution_id::NORMAL:
-      return [lower_bound, dist = make_normal_dist(lower_bound, upper_bound)](
-               std::mt19937& engine) mutable -> T { return dist(engine) - lower_bound; };
+      return [lower_bound, dist = make_normal_dist(upper_bound - lower_bound)](
+               std::mt19937& engine) mutable -> T { return dist(engine) + lower_bound; };
     case distribution_id::UNIFORM:
       return [dist = make_uniform_dist(lower_bound, upper_bound)](
                std::mt19937& engine) mutable -> T { return dist(engine); };
@@ -104,8 +109,8 @@ distribution_fn<T> make_distribution(distribution_id dist_id, T lower_bound, T u
 {
   switch (dist_id) {
     case distribution_id::NORMAL:
-      return [dist = make_normal_dist(lower_bound, upper_bound)](
-               std::mt19937& engine) mutable -> T { return dist(engine); };
+      return [lower_bound, dist = make_normal_dist(upper_bound - lower_bound)](
+               std::mt19937& engine) mutable -> T { return dist(engine) + lower_bound; };
     case distribution_id::UNIFORM:
       return [dist = make_uniform_dist(lower_bound, upper_bound)](
                std::mt19937& engine) mutable -> T { return dist(engine); };

diff --git a/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp b/cpp/benchmarks/io/csv/csv_reader_benchmark.cpp
@@ -70,6 +70,7 @@ void BM_csv_read_varying_options(benchmark::State& state)
   auto const data_types =
     dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL),
                                                    int32_t(type_group_id::FLOATING_POINT),
+                                                   int32_t(type_group_id::FIXED_POINT),
                                                    int32_t(type_group_id::TIMESTAMP),
                                                    int32_t(cudf::type_id::STRING)}),
                                 col_sel);
@@ -143,6 +144,7 @@ void BM_csv_read_varying_options(benchmark::State& state)
 
 RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL);
 RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
+RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
 RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
 RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING);
 

diff --git a/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp b/cpp/benchmarks/io/csv/csv_writer_benchmark.cpp
@@ -63,6 +63,7 @@ void BM_csv_write_varying_options(benchmark::State& state)
 
   auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL),
                                              int32_t(type_group_id::FLOATING_POINT),
+                                             int32_t(type_group_id::FIXED_POINT),
                                              int32_t(type_group_id::TIMESTAMP),
                                              int32_t(cudf::type_id::STRING)});
 
@@ -96,6 +97,7 @@ void BM_csv_write_varying_options(benchmark::State& state)
 
 WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL);
 WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
+WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
 WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
 WR_BENCHMARK_DEFINE_ALL_SINKS(CSV_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING);
 

diff --git a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp
@@ -91,8 +91,10 @@ void BM_orc_read_varying_options(benchmark::State& state)
   auto const data_types =
     dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
                                                    int32_t(type_group_id::FLOATING_POINT),
+                                                   int32_t(type_group_id::FIXED_POINT),
                                                    int32_t(type_group_id::TIMESTAMP),
-                                                   int32_t(cudf::type_id::STRING)}),
+                                                   int32_t(cudf::type_id::STRING),
+                                                   int32_t(cudf::type_id::LIST)}),
                                 col_sel);
   auto const tbl  = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
   auto const view = tbl->view();
@@ -158,6 +160,7 @@ void BM_orc_read_varying_options(benchmark::State& state)
 
 RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL_SIGNED);
 RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
+RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
 RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
 RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING);
 RD_BENCHMARK_DEFINE_ALL_SOURCES(ORC_RD_BM_INPUTS_DEFINE, list, cudf::type_id::LIST);

diff --git a/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp b/cpp/benchmarks/io/orc/orc_writer_benchmark.cpp
@@ -70,8 +70,10 @@ void BM_orc_write_varying_options(benchmark::State& state)
 
   auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
                                              int32_t(type_group_id::FLOATING_POINT),
+                                             int32_t(type_group_id::FIXED_POINT),
                                              int32_t(type_group_id::TIMESTAMP),
-                                             int32_t(cudf::type_id::STRING)});
+                                             int32_t(cudf::type_id::STRING),
+                                             int32_t(cudf::type_id::LIST)});
 
   auto const tbl  = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
   auto const view = tbl->view();
@@ -101,6 +103,7 @@ void BM_orc_write_varying_options(benchmark::State& state)
 
 WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL_SIGNED);
 WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
+WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
 WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
 WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING);
 WR_BENCHMARK_DEFINE_ALL_SINKS(ORC_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST);

diff --git a/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_reader_benchmark.cpp
@@ -92,8 +92,10 @@ void BM_parq_read_varying_options(benchmark::State& state)
   auto const data_types =
     dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL),
                                                    int32_t(type_group_id::FLOATING_POINT),
+                                                   int32_t(type_group_id::FIXED_POINT),
                                                    int32_t(type_group_id::TIMESTAMP),
-                                                   int32_t(cudf::type_id::STRING)}),
+                                                   int32_t(cudf::type_id::STRING),
+                                                   int32_t(cudf::type_id::LIST)}),
                                 col_sel);
   auto const tbl  = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
   auto const view = tbl->view();
@@ -160,6 +162,7 @@ void BM_parq_read_varying_options(benchmark::State& state)
 
 RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL);
 RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
+RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
 RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
 RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING);
 RD_BENCHMARK_DEFINE_ALL_SOURCES(PARQ_RD_BM_INPUTS_DEFINE, list, cudf::type_id::LIST);

diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp
@@ -71,8 +71,10 @@ void BM_parq_write_varying_options(benchmark::State& state)
 
   auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
                                              int32_t(type_group_id::FLOATING_POINT),
+                                             int32_t(type_group_id::FIXED_POINT),
                                              int32_t(type_group_id::TIMESTAMP),
-                                             int32_t(cudf::type_id::STRING)});
+                                             int32_t(cudf::type_id::STRING),
+                                             int32_t(cudf::type_id::LIST)});
 
   auto const tbl  = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
   auto const view = tbl->view();
@@ -103,6 +105,7 @@ void BM_parq_write_varying_options(benchmark::State& state)
 
 WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL);
 WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
+WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, decimal, type_group_id::FIXED_POINT);
 WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
 WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING);
 WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST);

diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp
@@ -27,7 +27,7 @@ namespace lists {
  */
 
 /**
- * @brief Create a column of bool values indicating whether the specified scalar
+ * @brief Create a column of `bool` values indicating whether the specified scalar
  * is an element of each row of a list column.
  *
  * The output column has as many elements as the input `lists` column.
@@ -51,7 +51,7 @@ std::unique_ptr<column> contains(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @brief Create a column of bool values indicating whether the list rows of the first
+ * @brief Create a column of `bool` values indicating whether the list rows of the first
  * column contain the corresponding values in the second column
  *
  * The output column has as many elements as the input `lists` column.
@@ -74,6 +74,104 @@ std::unique_ptr<column> contains(
   cudf::column_view const& search_keys,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Create a column of `bool` values indicating whether each row in the `lists` column
+ * contains at least one null element.
+ *
+ * The output column has as many elements as the input `lists` column.
+ * Output `column[i]` is set to null if the list row `lists[i]` is null.
+ * Otherwise, `column[i]` is set to a non-null boolean value, depending on whether that list
+ * contains a null element.
+ * (Empty list rows are considered *NOT* to contain a null element.)
+ *
+ * @param lists Lists column whose `n` rows are to be searched
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return std::unique_ptr<column> BOOL8 column of `n` rows with the result of the lookup
+ */
+std::unique_ptr<column> contains_nulls(
+  cudf::lists_column_view const& lists,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Option to choose whether `index_of()` returns the first or last match
+ * of a search key in a list row
+ */
+enum class duplicate_find_option : int32_t {
+  FIND_FIRST = 0,  ///< Finds first instance of a search key in a list row.
+  FIND_LAST        ///< Finds last instance of a search key in a list row.
+};
+
+/**
+ * @brief Create a column of `size_type` values indicating the position of a search key
+ * within each list row in the `lists` column
+ *
+ * The output column has as many elements as there are rows in the input `lists` column.
+ * Output `column[i]` contains a 0-based index indicating the position of the search key
+ * in each list, counting from the beginning of the list.
+ * Note:
+ *   1. If the `search_key` is null, all output rows are set to null.
+ *   2. If the row `lists[i]` is null, `output[i]` is also null.
+ *   3. If the row `lists[i]` does not contain the `search_key`, `output[i]` is set to `-1`.
+ *   4. In all other cases, `output[i]` is set to a non-negative `size_type` index.
+ *
+ * If the `find_option` is set to `FIND_FIRST`, the position of the first match for
+ * `search_key` is returned.
+ * If `find_option == FIND_LAST`, the position of the last match in the list row is
+ * returned.
+ *
+ * @param lists Lists column whose `n` rows are to be searched
+ * @param search_key The scalar key to be looked up in each list row
+ * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or
+ * last (`FIND_LAST`)
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return std::unique_ptr<column> INT32 column of `n` rows with the location of the `search_key`
+ *
+ * @throw cudf::logic_error If `search_key` type does not match the element type in `lists`
+ * @throw cudf::logic_error If `search_key` is of a nested type, or `lists` contains nested
+ * elements (LIST, STRUCT)
+ */
+std::unique_ptr<column> index_of(
+  cudf::lists_column_view const& lists,
+  cudf::scalar const& search_key,
+  duplicate_find_option find_option   = duplicate_find_option::FIND_FIRST,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Create a column of `size_type` values indicating the position of a search key
+ * row within the corresponding list row in the `lists` column
+ *
+ * The output column has as many elements as there are rows in the input `lists` column.
+ * Output `column[i]` contains a 0-based index indicating the position of each search key
+ * row in its corresponding list row, counting from the beginning of the list.
+ * Note:
+ *   1. If `search_keys[i]` is null, `output[i]` is also null.
+ *   2. If the row `lists[i]` is null, `output[i]` is also null.
+ *   3. If the row `lists[i]` does not contain `search_keys[i]`, `output[i]` is set to `-1`.
+ *   4. In all other cases, `output[i]` is set to a non-negative `size_type` index.
+ *
+ * If the `find_option` is set to `FIND_FIRST`, the position of the first match for
+ * `search_key` is returned.
+ * If `find_option == FIND_LAST`, the position of the last match in the list row is
+ * returned.
+ *
+ * @param lists Lists column whose `n` rows are to be searched
+ * @param search_keys A column of search keys to be looked up in each corresponding row of
+ * `lists`
+ * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or
+ * last (`FIND_LAST`)
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return std::unique_ptr<column> INT32 column of `n` rows with the location of the `search_key`
+ *
+ * @throw cudf::logic_error If `search_keys` does not match `lists` in its number of rows
+ * @throw cudf::logic_error If `search_keys` type does not match the element type in `lists`
+ * @throw cudf::logic_error If `lists` or `search_keys` contains nested elements (LIST, STRUCT)
+ */
+std::unique_ptr<column> index_of(
+  cudf::lists_column_view const& lists,
+  cudf::column_view const& search_keys,
+  duplicate_find_option find_option   = duplicate_find_option::FIND_FIRST,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 }  // namespace lists
 }  // namespace cudf