Merge branch 'branch-24.04' into verify-copyright

rapidsai · Mar 4, 2024 · bf5950c · bf5950c
2 parents c640477 + 903dcac
commit bf5950c
Show file tree

Hide file tree

Showing 90 changed files with 1,402 additions and 671 deletions.
diff --git a/README.md b/README.md
@@ -93,7 +93,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects
 
 ```bash
 conda install -c rapidsai -c conda-forge -c nvidia \
-    cudf=24.04 python=3.10 cuda-version=11.8
+    cudf=24.04 python=3.11 cuda-version=12.2
 ```
 
 We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -4,7 +4,6 @@ channels:
 - rapidsai
 - rapidsai-nightly
 - dask/label/dev
-- pytorch
 - conda-forge
 - nvidia
 dependencies:
@@ -59,7 +58,7 @@ dependencies:
 - ninja
 - notebook
 - numba>=0.57
-- numpy>=1.21
+- numpy>=1.23
 - numpydoc
 - nvcc_linux-64=11.8
 - nvcomp==3.0.6
@@ -79,8 +78,8 @@ dependencies:
 - pytest-xdist
 - pytest<8
 - python-confluent-kafka>=1.9.0,<1.10.0a0
-- python>=3.9,<3.11
-- pytorch<1.12.0
+- python>=3.9,<3.12
+- pytorch>=2.1.0
 - rapids-dask-dependency==24.4.*
 - rich
 - rmm==24.4.*
@@ -96,8 +95,8 @@ dependencies:
 - sphinxcontrib-websupport
 - streamz
 - sysroot_linux-64==2.17
-- tokenizers==0.13.1
-- transformers==4.24.0
+- tokenizers==0.15.2
+- transformers==4.38.1
 - typing_extensions>=4.0.0
 - zlib>=1.2.13
 - pip:

diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -4,7 +4,6 @@ channels:
 - rapidsai
 - rapidsai-nightly
 - dask/label/dev
-- pytorch
 - conda-forge
 - nvidia
 dependencies:
@@ -58,7 +57,7 @@ dependencies:
 - ninja
 - notebook
 - numba>=0.57
-- numpy>=1.21
+- numpy>=1.23
 - numpydoc
 - nvcomp==3.0.6
 - nvtx>=0.2.1
@@ -77,8 +76,8 @@ dependencies:
 - pytest-xdist
 - pytest<8
 - python-confluent-kafka>=1.9.0,<1.10.0a0
-- python>=3.9,<3.11
-- pytorch<1.12.0
+- python>=3.9,<3.12
+- pytorch>=2.1.0
 - rapids-dask-dependency==24.4.*
 - rich
 - rmm==24.4.*
@@ -94,8 +93,8 @@ dependencies:
 - sphinxcontrib-websupport
 - streamz
 - sysroot_linux-64==2.17
-- tokenizers==0.13.1
-- transformers==4.24.0
+- tokenizers==0.15.2
+- transformers==4.38.1
 - typing_extensions>=4.0.0
 - zlib>=1.2.13
 - pip:

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
@@ -65,6 +65,7 @@ requirements:
     - scikit-build-core >=0.7.0
     - setuptools
     - dlpack >=0.5,<0.6.0a0
+    - numpy 1.23
     - pyarrow ==14.0.2.*
     - libcudf ={{ version }}
     - rmm ={{ minor_version }}
@@ -83,7 +84,7 @@ requirements:
     - pandas >=2.0,<2.2.2dev0
     - cupy >=12.0.0
     - numba >=0.57
-    - numpy >=1.21
+    - {{ pin_compatible('numpy', max_pin='x') }}
     - {{ pin_compatible('pyarrow', max_pin='x') }}
     - libcudf ={{ version }}
     - {{ pin_compatible('rmm', max_pin='x.x') }}

diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
 
 #include <cudf/groupby.hpp>
 
@@ -50,9 +51,13 @@ void bench_groupby_max(nvbench::state& state, nvbench::type_list<Type>)
   requests[0].values = vals->view();
   requests[0].aggregations.push_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
 
+  auto const mem_stats_logger = cudf::memory_stats_logger();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });
+
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
 }
 
 NVBENCH_BENCH_TYPES(bench_groupby_max,

diff --git a/cpp/benchmarks/groupby/group_struct_keys.cpp b/cpp/benchmarks/groupby/group_struct_keys.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
 
 #include <cudf_test/column_wrapper.hpp>
 
@@ -80,11 +81,15 @@ void bench_groupby_struct_keys(nvbench::state& state)
   requests[0].aggregations.push_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());
 
   // Set up nvbench default stream
-  auto stream = cudf::get_default_stream();
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  auto stream                 = cudf::get_default_stream();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
 
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });
+
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
 }
 
 NVBENCH_BENCH(bench_groupby_struct_keys)

diff --git a/cpp/benchmarks/json/json.cu b/cpp/benchmarks/json/json.cu
@@ -179,8 +179,7 @@ auto build_json_string_column(int desired_bytes, int num_rows)
     desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order};
   auto [offsets, chars] = cudf::strings::detail::make_strings_children(
     jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
-  return cudf::make_strings_column(
-    num_rows, std::move(offsets), std::move(chars->release().data.release()[0]), 0, {});
+  return cudf::make_strings_column(num_rows, std::move(offsets), chars.release(), 0, {});
 }
 
 void BM_case(benchmark::State& state, std::string query_arg)

diff --git a/cpp/include/cudf/detail/cuco_helpers.hpp b/cpp/include/cudf/detail/cuco_helpers.hpp
@@ -16,11 +16,16 @@
 
 #pragma once
 
+#include <cudf/types.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/polymorphic_allocator.hpp>
 
 namespace cudf::detail {
 
+/// Sentinel value for `cudf::size_type`
+static cudf::size_type constexpr CUDF_SIZE_TYPE_SENTINEL = -1;
+
 /// Default load factor for cuco data structures
 static double constexpr CUCO_DESIRED_LOAD_FACTOR = 0.5;
 

diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -100,5 +100,15 @@ std::unique_ptr<column> row_bit_count(table_view const& t,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr);
 
+/**
+ * @copydoc cudf::segmented_row_bit_count
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<column> segmented_row_bit_count(table_view const& t,
+                                                size_type segment_length,
+                                                rmm::cuda_stream_view stream,
+                                                rmm::mr::device_memory_resource* mr);
+
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -34,7 +34,7 @@ namespace strings {
 namespace detail {
 
 /**
- * @brief Creates child offsets and chars columns by applying the template function that
+ * @brief Creates child offsets and chars data by applying the template function that
  * can be used for computing the output size of each string as well as create the output
  *
  * @throws std::overflow_error if the output strings column exceeds the column size limit
@@ -49,7 +49,7 @@ namespace detail {
  * @param strings_count Number of strings.
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned columns' device memory.
- * @return offsets child column and chars child column for a strings column
+ * @return Offsets child column and chars data for a strings column
  */
 template <typename SizeAndExecuteFunction>
 auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
@@ -84,18 +84,17 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
                std::overflow_error);
 
   // Now build the chars column
-  std::unique_ptr<column> chars_column =
-    create_chars_child_column(static_cast<size_type>(bytes), stream, mr);
+  rmm::device_uvector<char> chars(bytes, stream, mr);
 
   // Execute the function fn again to fill the chars column.
   // Note that if the output chars column has zero size, the function fn should not be called to
   // avoid accidentally overwriting the offsets.
   if (bytes > 0) {
-    size_and_exec_fn.d_chars = chars_column->mutable_view().template data<char>();
+    size_and_exec_fn.d_chars = chars.data();
     for_each_fn(size_and_exec_fn);
   }
 
-  return std::pair(std::move(offsets_column), std::move(chars_column));
+  return std::pair(std::move(offsets_column), std::move(chars));
 }
 
 /**

diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,19 +20,16 @@
 #include <cudf/detail/utilities/assert.cuh>
 #include <cudf/hashing/detail/hash_functions.cuh>
 #include <cudf/hashing/detail/hashing.hpp>
-#include <cudf/sorting.hpp>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
+#include <cuda/std/limits>
 #include <thrust/equal.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/swap.h>
 #include <thrust/transform_reduce.h>
 
-#include <limits>
-
 namespace cudf {
 
 /**
@@ -470,7 +467,9 @@ class element_hasher {
   template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
   __device__ hash_value_type operator()(column_device_view col, size_type row_index) const
   {
-    if (has_nulls && col.is_null(row_index)) { return std::numeric_limits<hash_value_type>::max(); }
+    if (has_nulls && col.is_null(row_index)) {
+      return cuda::std::numeric_limits<hash_value_type>::max();
+    }
     return hash_function<T>{}(col.element<T>(row_index));
   }
 
@@ -554,7 +553,7 @@ class element_hasher_with_seed {
 
  private:
   uint32_t _seed{DEFAULT_HASH_SEED};
-  hash_value_type _null_hash{std::numeric_limits<hash_value_type>::max()};
+  hash_value_type _null_hash{cuda::std::numeric_limits<hash_value_type>::max()};
   Nullate _has_nulls;
 };
 

diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -224,5 +224,28 @@ std::unique_ptr<column> row_bit_count(
   table_view const& t,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Returns an approximate cumulative size in bits of all columns in the `table_view` for
+ * each segment of rows.
+ *
+ * This is similar to counting bit size per row for the input table in `cudf::row_bit_count`,
+ * except that row sizes are accumulated by segments.
+ *
+ * Currently, only fixed-length segments are supported. In case the input table has number of rows
+ * not divisible by `segment_length`, its last segment is considered as shorter than the others.
+ *
+ * @throw std::invalid_argument if the input `segment_length` is non-positive or larger than the
+ * number of rows in the input table.
+ *
+ * @param t The table view to perform the computation on
+ * @param segment_length The number of rows in each segment for which the total size is computed
+ * @param mr Device memory resource used to allocate the returned columns' device memory
+ * @return A 32-bit integer column containing the bit counts for each segment of rows
+ */
+std::unique_ptr<column> segmented_row_bit_count(
+  table_view const& t,
+  size_type segment_length,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp
@@ -194,23 +194,7 @@ std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view
  *  `column_view`'s data, and second is the column's bitmask.
  */
 template <typename T, std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
-std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view c)
-{
-  using namespace numeric;
-  using Rep = typename T::rep;
-
-  auto host_rep_types = thrust::host_vector<Rep>(c.size());
-
-  CUDF_CUDA_TRY(
-    cudaMemcpy(host_rep_types.data(), c.begin<Rep>(), c.size() * sizeof(Rep), cudaMemcpyDefault));
-
-  auto to_fp = [&](Rep val) { return T{scaled_integer<Rep>{val, scale_type{c.type().scale()}}}; };
-  auto begin = thrust::make_transform_iterator(std::cbegin(host_rep_types), to_fp);
-  auto const host_fixed_points = thrust::host_vector<T>(begin, begin + c.size());
-
-  return {host_fixed_points, bitmask_to_host(c)};
-}
-//! @endcond
+std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view c);
 
 /**
  * @brief Copies the data and bitmask of a `column_view` of strings
@@ -223,29 +207,8 @@ std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view
  * and second is the column's bitmask.
  */
 template <>
-inline std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(column_view c)
-{
-  thrust::host_vector<std::string> host_data(c.size());
-  auto stream = cudf::get_default_stream();
-  if (c.size() > c.null_count()) {
-    auto const scv     = strings_column_view(c);
-    auto const h_chars = cudf::detail::make_std_vector_sync<char>(
-      cudf::device_span<char const>(scv.chars_begin(stream), scv.chars_size(stream)), stream);
-    auto const h_offsets = cudf::detail::make_std_vector_sync(
-      cudf::device_span<cudf::size_type const>(scv.offsets().data<cudf::size_type>() + scv.offset(),
-                                               scv.size() + 1),
-      stream);
-
-    // build std::string vector from chars and offsets
-    std::transform(
-      std::begin(h_offsets),
-      std::end(h_offsets) - 1,
-      std::begin(h_offsets) + 1,
-      host_data.begin(),
-      [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); });
-  }
-  return {std::move(host_data), bitmask_to_host(c)};
-}
+std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(column_view c);
+//! @endcond
 
 }  // namespace cudf::test