Use hostdevice_vector in kernel_error to avoid the pageable copy (#…

…15140) Issue #15122 The addition of kernel error checking introduced a 5% performance regression in Spark-RAPIDS. It was determined that the pageable copy of the error back to host caused this overhead, presumably because of the CUDA's bounce buffer bottleneck. This PR aims to eliminate most of the error checking overhead by using `hostdevice_vector` in the `kernel_error` class. The `hostdevice_vector` uses pinned memory so the copy is no longer pageable. The PR also removes the redundant sync after we read the error. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Paul Mattione (https://github.com/pmattione-nvidia) URL: #15140
rapidsai · Mar 5, 2024 · 3ea947a · 3ea947a
1 parent 176f75b
commit 3ea947a
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 22 deletions.
diff --git a/cpp/src/io/parquet/error.hpp b/cpp/src/io/parquet/error.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,8 @@
 #pragma once
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_scalar.hpp>
+
+#include <io/utilities/hostdevice_vector.hpp>
 
 #include <cstdint>
 #include <sstream>
@@ -37,7 +38,7 @@ class kernel_error {
   using pointer    = value_type*;
 
  private:
-  rmm::device_scalar<value_type> _error_code;
+  mutable cudf::detail::hostdevice_vector<value_type> _error_code;
 
  public:
   /**
@@ -50,30 +51,39 @@ class kernel_error {
    *
    * @param CUDA stream to use
    */
-  kernel_error(rmm::cuda_stream_view stream) : _error_code{0, stream} {}
+  kernel_error(rmm::cuda_stream_view stream) : _error_code(1, stream)
+  {
+    _error_code[0] = 0;
+    _error_code.host_to_device_async(stream);
+  }
 
   /**
    * @brief Return a pointer to the device memory for the error
    */
-  [[nodiscard]] auto data() { return _error_code.data(); }
+  [[nodiscard]] auto data() { return _error_code.device_ptr(); }
 
   /**
    * @brief Return the current value of the error
    *
-   * This uses the stream used to create this instance. This does a synchronize on the stream
-   * this object was instantiated with.
+   * @param stream The CUDA stream to synchronize with
    */
-  [[nodiscard]] auto value() const { return _error_code.value(_error_code.stream()); }
+  [[nodiscard]] auto value_sync(rmm::cuda_stream_view stream) const
+  {
+    _error_code.device_to_host_sync(stream);
+    return _error_code[0];
+  }
 
   /**
-   * @brief Return a hexadecimal string representation of the current error code
+   * @brief Return a hexadecimal string representation of an error code
    *
    * Returned string will have "0x" prepended.
+   *
+   * @param value The error code to convert to a string
    */
-  [[nodiscard]] std::string str() const
+  [[nodiscard]] static std::string to_string(value_type value)
   {
     std::stringstream sstream;
-    sstream << std::hex << value();
+    sstream << std::hex << value;
     return "0x" + sstream.str();
   }
 };

diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
@@ -246,11 +246,9 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
   page_nesting.device_to_host_async(_stream);
   page_nesting_decode.device_to_host_async(_stream);
 
-  if (error_code.value() != 0) {
-    CUDF_FAIL("Parquet data decode failed with code(s) " + error_code.str());
+  if (auto const error = error_code.value_sync(_stream); error != 0) {
+    CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error));
   }
-  // error_code.value() is synchronous; explicitly sync here for better visibility
-  _stream.synchronize();
 
   // for list columns, add the final offset to every offset buffer.
   // TODO : make this happen in more efficiently. Maybe use thrust::for_each

diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -296,10 +296,10 @@ void generate_depth_remappings(std::map<int, std::pair<std::vector<int>, std::ve
   // so that we can actually compile a list of all the unsupported encodings found
   // in the pages. That cannot be done here since we do not have the pages vector here.
   // see https://github.com/rapidsai/cudf/pull/14453#pullrequestreview-1778346688
-  if (error_code.value() != 0 and
-      error_code.value() != static_cast<uint32_t>(decode_error::UNSUPPORTED_ENCODING)) {
+  if (auto const error = error_code.value_sync(stream);
+      error != 0 and error != static_cast<uint32_t>(decode_error::UNSUPPORTED_ENCODING)) {
     CUDF_FAIL("Parquet header parsing failed with code(s) while counting page headers " +
-              error_code.str());
+              kernel_error::to_string(error));
   }
 
   for (size_t c = 0; c < chunks.size(); c++) {
@@ -480,13 +480,14 @@ void decode_page_headers(pass_intermediate_data& pass,
                     error_code.data(),
                     stream);
 
-  if (error_code.value() != 0) {
-    if (BitAnd(error_code.value(), decode_error::UNSUPPORTED_ENCODING) != 0) {
+  if (auto const error = error_code.value_sync(stream); error != 0) {
+    if (BitAnd(error, decode_error::UNSUPPORTED_ENCODING) != 0) {
       auto const unsupported_str =
         ". With unsupported encodings found: " + list_unsupported_encodings(pass.pages, stream);
-      CUDF_FAIL("Parquet header parsing failed with code(s) " + error_code.str() + unsupported_str);
+      CUDF_FAIL("Parquet header parsing failed with code(s) " + kernel_error::to_string(error) +
+                unsupported_str);
     } else {
-      CUDF_FAIL("Parquet header parsing failed with code(s) " + error_code.str());
+      CUDF_FAIL("Parquet header parsing failed with code(s) " + kernel_error::to_string(error));
     }
   }
 

diff --git a/cpp/src/io/utilities/hostdevice_span.hpp b/cpp/src/io/utilities/hostdevice_span.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>