Skip to content

Commit

Permalink
Use hostdevice_vector in kernel_error to avoid the pageable copy (#…
Browse files Browse the repository at this point in the history
…15140)

Issue #15122

The addition of kernel error checking introduced a 5% performance regression in Spark-RAPIDS. It was determined that the pageable copy of the error back to host caused this overhead, presumably because of the CUDA's bounce buffer bottleneck.

This PR aims to eliminate most of the error checking overhead by using `hostdevice_vector` in the `kernel_error` class. The `hostdevice_vector` uses pinned memory so the copy is no longer pageable. The PR also removes the redundant sync after we read the error.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Paul Mattione (https://github.com/pmattione-nvidia)

URL: #15140
  • Loading branch information
vuule authored Mar 5, 2024
1 parent 176f75b commit 3ea947a
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 22 deletions.
32 changes: 21 additions & 11 deletions cpp/src/io/parquet/error.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -17,7 +17,8 @@
#pragma once

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>

#include <io/utilities/hostdevice_vector.hpp>

#include <cstdint>
#include <sstream>
Expand All @@ -37,7 +38,7 @@ class kernel_error {
using pointer = value_type*;

private:
rmm::device_scalar<value_type> _error_code;
mutable cudf::detail::hostdevice_vector<value_type> _error_code;

public:
/**
Expand All @@ -50,30 +51,39 @@ class kernel_error {
*
* @param CUDA stream to use
*/
kernel_error(rmm::cuda_stream_view stream) : _error_code{0, stream} {}
kernel_error(rmm::cuda_stream_view stream) : _error_code(1, stream)
{
_error_code[0] = 0;
_error_code.host_to_device_async(stream);
}

/**
* @brief Return a pointer to the device memory for the error
*/
[[nodiscard]] auto data() { return _error_code.data(); }
[[nodiscard]] auto data() { return _error_code.device_ptr(); }

/**
* @brief Return the current value of the error
*
* This uses the stream used to create this instance. This does a synchronize on the stream
* this object was instantiated with.
* @param stream The CUDA stream to synchronize with
*/
[[nodiscard]] auto value() const { return _error_code.value(_error_code.stream()); }
[[nodiscard]] auto value_sync(rmm::cuda_stream_view stream) const
{
_error_code.device_to_host_sync(stream);
return _error_code[0];
}

/**
* @brief Return a hexadecimal string representation of the current error code
* @brief Return a hexadecimal string representation of an error code
*
* Returned string will have "0x" prepended.
*
* @param value The error code to convert to a string
*/
[[nodiscard]] std::string str() const
[[nodiscard]] static std::string to_string(value_type value)
{
std::stringstream sstream;
sstream << std::hex << value();
sstream << std::hex << value;
return "0x" + sstream.str();
}
};
Expand Down
6 changes: 2 additions & 4 deletions cpp/src/io/parquet/reader_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,11 +246,9 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
page_nesting.device_to_host_async(_stream);
page_nesting_decode.device_to_host_async(_stream);

if (error_code.value() != 0) {
CUDF_FAIL("Parquet data decode failed with code(s) " + error_code.str());
if (auto const error = error_code.value_sync(_stream); error != 0) {
CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error));
}
// error_code.value() is synchronous; explicitly sync here for better visibility
_stream.synchronize();

// for list columns, add the final offset to every offset buffer.
// TODO : make this happen in more efficiently. Maybe use thrust::for_each
Expand Down
15 changes: 8 additions & 7 deletions cpp/src/io/parquet/reader_impl_preprocess.cu
Original file line number Diff line number Diff line change
Expand Up @@ -296,10 +296,10 @@ void generate_depth_remappings(std::map<int, std::pair<std::vector<int>, std::ve
// so that we can actually compile a list of all the unsupported encodings found
// in the pages. That cannot be done here since we do not have the pages vector here.
// see https://github.com/rapidsai/cudf/pull/14453#pullrequestreview-1778346688
if (error_code.value() != 0 and
error_code.value() != static_cast<uint32_t>(decode_error::UNSUPPORTED_ENCODING)) {
if (auto const error = error_code.value_sync(stream);
error != 0 and error != static_cast<uint32_t>(decode_error::UNSUPPORTED_ENCODING)) {
CUDF_FAIL("Parquet header parsing failed with code(s) while counting page headers " +
error_code.str());
kernel_error::to_string(error));
}

for (size_t c = 0; c < chunks.size(); c++) {
Expand Down Expand Up @@ -480,13 +480,14 @@ void decode_page_headers(pass_intermediate_data& pass,
error_code.data(),
stream);

if (error_code.value() != 0) {
if (BitAnd(error_code.value(), decode_error::UNSUPPORTED_ENCODING) != 0) {
if (auto const error = error_code.value_sync(stream); error != 0) {
if (BitAnd(error, decode_error::UNSUPPORTED_ENCODING) != 0) {
auto const unsupported_str =
". With unsupported encodings found: " + list_unsupported_encodings(pass.pages, stream);
CUDF_FAIL("Parquet header parsing failed with code(s) " + error_code.str() + unsupported_str);
CUDF_FAIL("Parquet header parsing failed with code(s) " + kernel_error::to_string(error) +
unsupported_str);
} else {
CUDF_FAIL("Parquet header parsing failed with code(s) " + error_code.str());
CUDF_FAIL("Parquet header parsing failed with code(s) " + kernel_error::to_string(error));
}
}

Expand Down
1 change: 1 addition & 0 deletions cpp/src/io/utilities/hostdevice_span.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#pragma once

#include <cudf/utilities/error.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
Expand Down

0 comments on commit 3ea947a

Please sign in to comment.