rapidsai · rapids-bot · Jul 20, 2024 · Jun 11, 2024 · Jun 13, 2024 · Jun 13, 2024
@@ -672,6 +672,7 @@ add_library(
   src/utilities/linked_column.cpp
   src/utilities/logger.cpp
   src/utilities/pinned_memory.cpp
+  src/utilities/prefetch.cpp
   src/utilities/stacktrace.cpp
   src/utilities/stream_pool.cpp
   src/utilities/traits.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,9 @@
 #pragma once
 
 #include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/prefetch.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
@@ -72,7 +74,7 @@ class column_view_base {
             CUDF_ENABLE_IF(std::is_same_v<T, void> or is_rep_layout_compatible<T>())>
   T const* head() const noexcept
   {
-    return static_cast<T const*>(_data);
+    return static_cast<T const*>(get_data());
   }
 
   /**
@@ -225,6 +227,17 @@ class column_view_base {
   [[nodiscard]] size_type offset() const noexcept { return _offset; }
 
  protected:
+  /**
+   * @brief Returns pointer to the base device memory allocation.
+   *
+   * The primary purpose of this function is to allow derived classes to
+   * override the fundamental properties of memory accesses without needing to
+   * change all of the different accessors for the underlying pointer.
+   *
+   * @return Typed pointer to underlying data
+   */
+  virtual void const* get_data() const noexcept { return _data; }
+
   data_type _type{type_id::EMPTY};   ///< Element type
   size_type _size{};                 ///< Number of elements
   void const* _data{};               ///< Pointer to device memory containing elements
@@ -236,7 +249,7 @@ class column_view_base {
                                      ///< Enables zero-copy slicing
 
   column_view_base()                        = default;
-  ~column_view_base()                       = default;
+  virtual ~column_view_base()               = default;
   column_view_base(column_view_base const&) = default;  ///< Copy constructor
   column_view_base(column_view_base&&)      = default;  ///< Move constructor
   /**
@@ -283,11 +296,6 @@ class column_view_base {
                    size_type null_count,
                    size_type offset = 0);
 };
-
-class mutable_column_view_base : public column_view_base {
- public:
- protected:
-};
 }  // namespace detail
 
 /**
@@ -323,7 +331,7 @@ class column_view : public detail::column_view_base {
 #ifdef __CUDACC__
 #pragma nv_exec_check_disable
 #endif
-  ~column_view() = default;
+  ~column_view() override = default;
 #ifdef __CUDACC__
 #pragma nv_exec_check_disable
 #endif
@@ -447,6 +455,18 @@ class column_view : public detail::column_view_base {
     return device_span<T const>(data<T>(), size());
   }
 
+ protected:
+  /**
+   * @brief Returns pointer to the base device memory allocation.
+   *
+   * The primary purpose of this function is to allow derived classes to
+   * override the fundamental properties of memory accesses without needing to
+   * change all of the different accessors for the underlying pointer.
+   *
+   * @return Typed pointer to underlying data
+   */
+  void const* get_data() const noexcept override;
+
  private:
   friend column_view bit_cast(column_view const& input, data_type type);
 
@@ -478,7 +498,7 @@ class mutable_column_view : public detail::column_view_base {
  public:
   mutable_column_view() = default;
 
-  ~mutable_column_view(){
+  ~mutable_column_view() override{
     // Needed so that the first instance of the implicit destructor for any TU isn't 'constructed'
     // from a host+device function marking the implicit version also as host+device
   };
@@ -572,7 +592,7 @@ class mutable_column_view : public detail::column_view_base {
   }
 
   /**
-   * @brief Return first element (accounting for offset) when underlying data is
+   * @brief Return first element (accounting for offset) after underlying data is
    * casted to the specified type.
    *
    * This function does not participate in overload resolution if `is_rep_layout_compatible<T>` is
@@ -665,6 +685,18 @@ class mutable_column_view : public detail::column_view_base {
    */
   operator column_view() const;
 
+ protected:
+  /**
+   * @brief Returns pointer to the base device memory allocation.
+   *
+   * The primary purpose of this function is to allow derived classes to
+   * override the fundamental properties of memory accesses without needing to
+   * change all of the different accessors for the underlying pointer.
+   *
+   * @return Typed pointer to underlying data
+   */
+  void const* get_data() const noexcept override;
+
  private:
   friend mutable_column_view bit_cast(mutable_column_view const& input, data_type type);
 

@@ -40,9 +40,6 @@ class preprocessed_table;
 namespace cudf {
 namespace detail {
 
-// Forward declaration
-class cuco_allocator;
-
 constexpr int DEFAULT_JOIN_CG_SIZE = 2;
 
 enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN };

@@ -18,11 +18,13 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/copying.hpp>
 #include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/prefetch.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -230,7 +232,8 @@ rmm::device_uvector<char> gather_chars(StringIterator strings_begin,
   if (output_count == 0) return rmm::device_uvector<char>(0, stream, mr);
 
   auto chars_data = rmm::device_uvector<char>(chars_bytes, stream, mr);
-  auto d_chars    = chars_data.data();
+  cudf::experimental::prefetch::detail::prefetch("manual_prefetch", chars_data, stream);
+  auto d_chars = chars_data.data();
 
   constexpr int warps_per_threadblock = 4;
   // String parallel strategy will be used if average string length is above this threshold.
@@ -312,6 +315,8 @@ std::unique_ptr<cudf::column> gather(strings_column_view const& strings,
   // build chars column
   auto const offsets_view =
     cudf::detail::offsetalator_factory::make_input_iterator(out_offsets_column->view());
+  cudf::experimental::prefetch::detail::prefetch(
+    "manual_prefetch", strings.chars_begin(stream), strings.chars_size(stream), stream);
   auto out_chars_data = gather_chars(
     d_strings->begin<string_view>(), begin, end, offsets_view, total_bytes, stream, mr);
 

@@ -23,6 +23,7 @@
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/utilities.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/prefetch.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -186,6 +187,7 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
 
   // Now build the chars column
   rmm::device_uvector<char> chars(bytes, stream, mr);
+  cudf::experimental::prefetch::detail::prefetch("manual_prefetch", chars, stream);
   size_and_exec_fn.d_chars = chars.data();
 
   // Execute the function fn again to fill in the chars data.

@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/utilities/default_stream.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <map>
+#include <string>
+#include <string_view>
+
+namespace cudf::experimental::prefetch {
+
+namespace detail {
+
+/**
+ * @brief A singleton class that manages the prefetching configuration.
+ */
+class PrefetchConfig {
+ public:
+  PrefetchConfig& operator=(const PrefetchConfig&) = delete;
+  PrefetchConfig(const PrefetchConfig&)            = delete;
+
+  /**
+   * @brief Get the singleton instance of the prefetching configuration.
+   *
+   * @return The singleton instance of the prefetching configuration.
+   */
+  static PrefetchConfig& instance();
+
+  /**
+   * @brief Get the value of a configuration key.
+   *
+   * @param key The configuration key.
+   * @return The value of the configuration key.
+   */
+  bool get(std::string_view key);
+  /**
+   * @brief Set the value of a configuration key.
+   *
+   * @param key The configuration key.
+   * @param value The value to set.
+   */
+  void set(std::string_view key, bool value);
+  /**
+   * @brief Enable or disable debug mode.
+   *
+   * In debug mode, the pointers being prefetched are printed to stderr.
+   */
+  bool debug{false};
+
+ private:
+  PrefetchConfig() = default;                 //< Private constructor to enforce singleton pattern
+  std::map<std::string, bool> config_values;  //< Map of configuration keys to values
+};
+
+/**
+ * @brief Enable prefetching for a particular structure or algorithm.
+ *
+ * @param key The key to enable prefetching for.
+ * @param ptr The pointer to prefetch.
+ * @param size The size of the memory region to prefetch.
+ * @param stream The stream to prefetch on.
+ * @param device_id The device to prefetch on.
+ */
+void prefetch(std::string_view key,
+              void const* ptr,
+              std::size_t size,
+              rmm::cuda_stream_view stream,
+              rmm::cuda_device_id device_id = rmm::get_current_cuda_device());
+
+/**
+ * @brief Enable prefetching for a particular structure or algorithm.
+ *
+ * @note This function will not throw exceptions, so it is safe to call in
+ * noexcept contexts. If an error occurs, the error code is returned. This
+ * function primarily exists for [mutable_]column_view::get_data and should be
+ * removed once an method for stream-ordered data pointer access is added to
+ * those data structures.
+ *
+ * @param key The key to enable prefetching for.
+ * @param ptr The pointer to prefetch.
+ * @param size The size of the memory region to prefetch.
+ * @param stream The stream to prefetch on.
+ * @param device_id The device to prefetch on.
+ */
+cudaError_t prefetch_noexcept(
+  std::string_view key,
+  void const* ptr,
+  std::size_t size,
+  rmm::cuda_stream_view stream,
+  rmm::cuda_device_id device_id = rmm::get_current_cuda_device()) noexcept;
+
+/**
+ * @brief Prefetch the data in a device_uvector.
+ *
+ * @note At present this function does not support stream-ordered execution. Prefetching always
+ * occurs on the default stream.
+ *
+ * @param key The key to enable prefetching for.
+ * @param v The device_uvector to prefetch.
+ * @param stream The stream to prefetch on.
+ * @param device_id The device to prefetch on.
+ */
+template <typename T>
+void prefetch(std::string_view key,
+              rmm::device_uvector<T> const& v,
+              rmm::cuda_stream_view stream,
+              rmm::cuda_device_id device_id = rmm::get_current_cuda_device())
+{
+  if (v.is_empty()) { return; }
+  prefetch(key, v.data(), v.size(), stream, device_id);
+}
+
+}  // namespace detail
+
+/**
+ * @brief Enable prefetching for a particular structure or algorithm.
+ *
+ * @param key The key to enable prefetching for.
+ */
+void enable_prefetching(std::string_view key);
+
+/**
+ * @brief Disable prefetching for a particular structure or algorithm.
+ *
+ * @param key The key to disable prefetching for.
+ */
+void disable_prefetching(std::string_view key);
+
+/**
+ * @brief Enable or disable debug mode.
+ *
+ * In debug mode, the pointers being prefetched are printed to stderr.
+ *
+ * @param enable Whether to enable or disable debug mode.
+ */
+void prefetch_debugging(bool enable);
+
+}  // namespace cudf::experimental::prefetch