Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-24.04' into ref/to_pandas_multiindex
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke committed Mar 5, 2024
2 parents 04c4af7 + 13d807e commit ee07c5e
Show file tree
Hide file tree
Showing 51 changed files with 457 additions and 649 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ jobs:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2")))
# This selects "ARCH=amd64 + the latest supported Python + CUDA".
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
sha: ${{ inputs.sha }}
Expand Down
9 changes: 6 additions & 3 deletions .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -128,15 +128,17 @@ jobs:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2")))
# This selects "ARCH=amd64 + the latest supported Python + CUDA".
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: pull-request
script: "ci/build_wheel_dask_cudf.sh"
wheel-tests-dask-cudf:
needs: wheel-build-dask-cudf
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2")))
# This selects "ARCH=amd64 + the latest supported Python + CUDA".
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: pull-request
script: ci/test_wheel_dask_cudf.sh
devcontainer:
Expand All @@ -154,7 +156,8 @@ jobs:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2")))
# This selects "ARCH=amd64 + the latest supported Python + CUDA".
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: pull-request
script: ci/cudf_pandas_scripts/run_tests.sh
# pandas-tests:
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ jobs:
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.11" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.2.2")))
# This selects "ARCH=amd64 + the latest supported Python + CUDA".
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: nightly
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
Expand Down
138 changes: 85 additions & 53 deletions cpp/benchmarks/common/generate_input.cu
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ auto deterministic_engine(unsigned seed) { return thrust::minstd_rand{seed}; }
* Computes the mean value for a distribution of given type and value bounds.
*/
template <typename T>
T get_distribution_mean(distribution_params<T> const& dist)
double get_distribution_mean(distribution_params<T> const& dist)
{
switch (dist.id) {
case distribution_id::NORMAL:
Expand All @@ -90,33 +90,67 @@ T get_distribution_mean(distribution_params<T> const& dist)
}
}

/**
 * @brief Calculates the number of direct parents needed to generate a struct column hierarchy with
 * lowest maximum number of children in any nested column.
 *
 * Used to generate an "evenly distributed" struct column hierarchy with the given number of leaf
 * columns and nesting levels. The column tree is considered evenly distributed if all columns have
 * nearly the same number of child columns (difference not larger than one).
 */
int num_direct_parents(int num_lvls, int num_leaf_columns)
{
  // Average branching factor of an even tree with this many leaves spread over `num_lvls` levels
  auto const avg_fanout = std::pow(num_leaf_columns, 1. / num_lvls);
  // In an even tree every column has either floor(avg) or floor(avg) + 1 children
  auto const fanout_lo = static_cast<int>(std::floor(avg_fanout));
  auto const fanout_hi = fanout_lo + 1;

  // Fewest parents such that no parent needs more than `fanout_hi` children at this level
  auto const parents_for_leaves =
    static_cast<int>(std::ceil(static_cast<double>(num_leaf_columns) / fanout_hi));
  // Fewest parents such that the levels above can still give each column at least `fanout_lo`
  // children
  auto const parents_for_ancestors = static_cast<int>(std::pow(fanout_lo, num_lvls - 1));
  // Both constraints must hold simultaneously
  return std::max(parents_for_leaves, parents_for_ancestors);
}

// Average null-mask bytes contributed per row: one validity bit (1/8 byte) when the profile
// enables nulls, zero bytes when no null mask is generated
[[nodiscard]] double row_null_mask_size(data_profile const& profile)
{
  if (profile.get_null_probability().has_value()) { return 1. / 8; }
  return 0.;
}

/**
* @brief Computes the average element size in a column, given the data profile.
*
* Random distribution parameters like average string length and maximum list nesting level affect
* the element size of non-fixed-width columns. For lists and structs, `avg_element_size` is called
* recursively to determine the size of nested columns.
*/
size_t avg_element_size(data_profile const& profile, cudf::data_type dtype);
double avg_element_size(data_profile const& profile, cudf::data_type dtype);

// Utilities to determine the mean size of an element, given the data profile
template <typename T, CUDF_ENABLE_IF(cudf::is_fixed_width<T>())>
size_t non_fixed_width_size(data_profile const& profile)
double non_fixed_width_size(data_profile const& profile)
{
CUDF_FAIL("Should not be called, use `size_of` for this type instead");
}

template <typename T, CUDF_ENABLE_IF(!cudf::is_fixed_width<T>())>
size_t non_fixed_width_size(data_profile const& profile)
double non_fixed_width_size(data_profile const& profile)
{
CUDF_FAIL("not implemented!");
}

template <>
size_t non_fixed_width_size<cudf::string_view>(data_profile const& profile)
double non_fixed_width_size<cudf::string_view>(data_profile const& profile)
{
auto const dist = profile.get_distribution_params<cudf::string_view>().length_params;
return get_distribution_mean(dist);
return get_distribution_mean(dist) * profile.get_valid_probability() + sizeof(cudf::size_type) +
row_null_mask_size(profile);
}

double geometric_sum(size_t n, double p)
Expand All @@ -126,45 +160,65 @@ double geometric_sum(size_t n, double p)
}

template <>
size_t non_fixed_width_size<cudf::list_view>(data_profile const& profile)
double non_fixed_width_size<cudf::list_view>(data_profile const& profile)
{
auto const dist_params = profile.get_distribution_params<cudf::list_view>();
auto const single_level_mean = get_distribution_mean(dist_params.length_params);
auto const dist_params = profile.get_distribution_params<cudf::list_view>();
auto const single_level_mean =
get_distribution_mean(dist_params.length_params) * profile.get_valid_probability();

// Leaf column size
auto const element_size = avg_element_size(profile, cudf::data_type{dist_params.element_type});
auto const element_count = std::pow(single_level_mean, dist_params.max_depth);

auto const offset_size = avg_element_size(profile, cudf::data_type{cudf::type_id::INT32});
// Each nesting level includes offsets, this is the sum of all levels
// Also include an additional offset per level for the size of the last element
auto const total_offset_count =
geometric_sum(dist_params.max_depth, single_level_mean) + dist_params.max_depth;
auto const total_offset_count = geometric_sum(dist_params.max_depth, single_level_mean);

return sizeof(cudf::size_type) * total_offset_count + element_size * element_count;
return element_size * element_count + offset_size * total_offset_count;
}

// Total number of struct (non-leaf) columns in the generated hierarchy, derived from the
// profile's struct distribution parameters (leaf count and maximum nesting depth)
[[nodiscard]] cudf::size_type num_struct_columns(data_profile const& profile)
{
  auto const params = profile.get_distribution_params<cudf::struct_view>();

  // Walk from the deepest level upward; at each level the previous level's columns become the
  // children and `num_direct_parents` gives how many parent columns that level needs
  cudf::size_type level_width  = params.leaf_types.size();
  cudf::size_type parent_total = 0;
  cudf::size_type lvl          = params.max_depth;
  while (lvl > 0) {
    level_width = num_direct_parents(lvl, level_width);
    parent_total += level_width;
    --lvl;
  }
  return parent_total;
}

template <>
size_t non_fixed_width_size<cudf::struct_view>(data_profile const& profile)
double non_fixed_width_size<cudf::struct_view>(data_profile const& profile)
{
auto const dist_params = profile.get_distribution_params<cudf::struct_view>();
return std::accumulate(dist_params.leaf_types.cbegin(),
dist_params.leaf_types.cend(),
0ul,
[&](auto& sum, auto type_id) {
return sum + avg_element_size(profile, cudf::data_type{type_id});
});
auto const total_children_size =
std::accumulate(dist_params.leaf_types.cbegin(),
dist_params.leaf_types.cend(),
0ul,
[&](auto& sum, auto type_id) {
return sum + avg_element_size(profile, cudf::data_type{type_id});
});

// struct columns have a null mask for each row
auto const structs_null_mask_size = num_struct_columns(profile) * row_null_mask_size(profile);

return total_children_size + structs_null_mask_size;
}

struct non_fixed_width_size_fn {
template <typename T>
size_t operator()(data_profile const& profile)
double operator()(data_profile const& profile)
{
return non_fixed_width_size<T>(profile);
}
};

size_t avg_element_size(data_profile const& profile, cudf::data_type dtype)
double avg_element_size(data_profile const& profile, cudf::data_type dtype)
{
if (cudf::is_fixed_width(dtype)) { return cudf::size_of(dtype); }
if (cudf::is_fixed_width(dtype)) { return cudf::size_of(dtype) + row_null_mask_size(profile); }
return cudf::type_dispatcher(dtype, non_fixed_width_size_fn{}, profile);
}

Expand Down Expand Up @@ -596,32 +650,6 @@ struct create_rand_col_fn {
}
};

/**
* @brief Calculates the number of direct parents needed to generate a struct column hierarchy with
* lowest maximum number of children in any nested column.
*
* Used to generate an "evenly distributed" struct column hierarchy with the given number of leaf
* columns and nesting levels. The column tree is considered evenly distributed if all columns have
* nearly the same number of child columns (difference not larger than one).
*/
int num_direct_parents(int num_lvls, int num_leaf_columns)
{
// Estimated average number of children in the hierarchy;
auto const num_children_avg = std::pow(num_leaf_columns, 1. / num_lvls);
// Minimum number of children columns for any column in the hierarchy
int const num_children_min = std::floor(num_children_avg);
// Maximum number of children columns for any column in the hierarchy
int const num_children_max = num_children_min + 1;

// Minimum number of columns needed so that their number of children does not exceed the maximum
int const min_for_current_nesting = std::ceil((double)num_leaf_columns / num_children_max);
// Minimum number of columns needed so that columns at the higher levels have at least the minimum
// number of children
int const min_for_upper_nesting = std::pow(num_children_min, num_lvls - 1);
// Both conditions need to be satisfied
return std::max(min_for_current_nesting, min_for_upper_nesting);
}

template <>
std::unique_ptr<cudf::column> create_random_column<cudf::struct_view>(data_profile const& profile,
thrust::minstd_rand& engine,
Expand Down Expand Up @@ -825,13 +853,17 @@ std::unique_ptr<cudf::table> create_random_table(std::vector<cudf::type_id> cons
data_profile const& profile,
unsigned seed)
{
size_t const avg_row_bytes =
std::accumulate(dtype_ids.begin(), dtype_ids.end(), 0ul, [&](size_t sum, auto tid) {
auto const avg_row_bytes =
std::accumulate(dtype_ids.begin(), dtype_ids.end(), 0., [&](size_t sum, auto tid) {
return sum + avg_element_size(profile, cudf::data_type(tid));
});
cudf::size_type const num_rows = table_bytes.size / avg_row_bytes;
std::size_t const num_rows = std::lround(table_bytes.size / avg_row_bytes);
CUDF_EXPECTS(num_rows > 0, "Table size is too small for the given data types");
CUDF_EXPECTS(num_rows < std::numeric_limits<cudf::size_type>::max(),
"Table size is too large for the given data types");

return create_random_table(dtype_ids, row_count{num_rows}, profile, seed);
return create_random_table(
dtype_ids, row_count{static_cast<cudf::size_type>(num_rows)}, profile, seed);
}

std::unique_ptr<cudf::table> create_random_table(std::vector<cudf::type_id> const& dtype_ids,
Expand Down
7 changes: 4 additions & 3 deletions cpp/benchmarks/common/generate_input.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -313,8 +313,9 @@ class data_profile {
}
}

auto get_bool_probability_true() const { return bool_probability_true; }
auto get_null_probability() const { return null_probability; };
[[nodiscard]] auto get_bool_probability_true() const { return bool_probability_true; }
[[nodiscard]] auto get_null_probability() const { return null_probability; };
[[nodiscard]] auto get_valid_probability() const { return 1. - null_probability.value_or(0.); };
[[nodiscard]] auto get_cardinality() const { return cardinality; };
[[nodiscard]] auto get_avg_run_length() const { return avg_run_length; };

Expand Down
4 changes: 3 additions & 1 deletion cpp/benchmarks/string/case.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ void bench_case(nvbench::state& state)
cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126); // nice ASCII range
auto input = cudf::strings_column_view(col_view);
auto ascii_column = create_random_column(
cudf::type_id::INT8, row_count{input.chars_size(cudf::get_default_stream())}, ascii_profile);
cudf::type_id::INT8,
row_count{static_cast<cudf::size_type>(input.chars_size(cudf::get_default_stream()))},
ascii_profile);
auto ascii_data = ascii_column->view();

col_view = cudf::column_view(col_view.type(),
Expand Down
3 changes: 0 additions & 3 deletions cpp/include/cudf/detail/utilities/device_atomics.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,6 @@ struct genericAtomicOperationImpl<float, DeviceSum, 4> {
}
};

#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
// `atomicAdd(double)` is supported after cuda architecture 6.0
template <>
struct genericAtomicOperationImpl<double, DeviceSum, 8> {
using T = double;
Expand All @@ -180,7 +178,6 @@ struct genericAtomicOperationImpl<double, DeviceSum, 8> {
return atomicAdd(addr, update_value);
}
};
#endif

template <>
struct genericAtomicOperationImpl<int32_t, DeviceSum, 4> {
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/cudf/strings/strings_column_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ class strings_column_view : private column_view {
* @param stream CUDA stream used for device memory operations and kernel launches
* @return Number of bytes in the chars child column
*/
[[nodiscard]] size_type chars_size(rmm::cuda_stream_view stream) const noexcept;
[[nodiscard]] int64_t chars_size(rmm::cuda_stream_view stream) const noexcept;

/**
* @brief Return an iterator for the chars child column.
Expand Down
9 changes: 2 additions & 7 deletions cpp/src/filling/repeat.cu
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,8 @@ struct count_accessor {
std::enable_if_t<std::is_integral_v<T>, cudf::size_type> operator()(rmm::cuda_stream_view stream)
{
using ScalarType = cudf::scalar_type_t<T>;
#if 1
// TODO: temporary till cudf::scalar's value() function is marked as const
auto p_count = const_cast<ScalarType*>(static_cast<ScalarType const*>(this->p_scalar));
#else
auto p_count = static_cast<ScalarType const*>(this->p_scalar);
#endif
auto count = p_count->value(stream);
auto p_count = static_cast<ScalarType const*>(this->p_scalar);
auto count = p_count->value(stream);
// static_cast is necessary due to bool
CUDF_EXPECTS(static_cast<int64_t>(count) <= std::numeric_limits<cudf::size_type>::max(),
"count should not exceed the column size limit",
Expand Down
4 changes: 0 additions & 4 deletions cpp/src/hash/managed.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,5 @@ struct managed {

inline bool isPtrManaged(cudaPointerAttributes attr)
{
#if CUDART_VERSION >= 10000
return (attr.type == cudaMemoryTypeManaged);
#else
return attr.isManaged;
#endif
}
10 changes: 0 additions & 10 deletions cpp/src/io/comp/snap.cu
Original file line number Diff line number Diff line change
Expand Up @@ -153,17 +153,7 @@ static __device__ uint8_t* StoreCopy(uint8_t* dst,
*/
static inline __device__ uint32_t HashMatchAny(uint32_t v, uint32_t t)
{
#if (__CUDA_ARCH__ >= 700)
return __match_any_sync(~0, v);
#else
uint32_t err_map = 0;
for (uint32_t i = 0; i < hash_bits; i++, v >>= 1) {
uint32_t b = v & 1;
uint32_t match_b = ballot(b);
err_map |= match_b ^ -(int32_t)b;
}
return ~err_map;
#endif
}

/**
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/fst/agent_dfa.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ class DFASimulationCallbackWrapper {
{
uint32_t const count = transducer_table(old_state, symbol_id, read_symbol);
if (write) {
#if __CUDA_ARCH__ > 0
#if defined(__CUDA_ARCH__)
#pragma unroll 1
#endif
for (uint32_t out_char = 0; out_char < count; out_char++) {
Expand Down
Loading

0 comments on commit ee07c5e

Please sign in to comment.