Skip to content

Commit

Permalink
Fix ingest_raw_input performance issue in Nested JSON reader due to RVO (
Browse files Browse the repository at this point in the history
#12070)

The issue is that `json::experimental::ingest_raw_input` took twice as long as `json::ingest_raw_input` for the same data.

After replacing the ternary operator with `if`/`else`, the runtime for a 500 MB file is the same as that of `json::ingest_raw_input`.
I suspect that RVO (copy elision) is skipped when the ternary operator is used.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Elias Stehle (https://github.com/elstehle)
  - MithunR (https://github.com/mythrocks)

URL: #12070
  • Loading branch information
karthikeyann authored Nov 7, 2022
1 parent 52dbb63 commit 262631b
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 10 deletions.
46 changes: 36 additions & 10 deletions cpp/src/io/json/experimental/read_json.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,38 +19,64 @@
#include <io/comp/io_uncomp.hpp>
#include <io/json/nested_json.hpp>

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/utilities/error.hpp>

#include <numeric>

namespace cudf::io::detail::json::experimental {

// NOTE(review): this hunk is shown without +/- markers, so removed and added lines are
// interleaved. The next two lines look like the pre-change `ingest_raw_input` signature
// (no byte-range parameters) -- confirm against the raw diff.
std::vector<uint8_t> ingest_raw_input(host_span<std::unique_ptr<datasource>> sources,
compression_type compression)
// Returns the total number of bytes selected by the byte range across all `sources`.
//
// Each source contributes `range_size` bytes when `range_size` is non-zero and
// `range_offset + range_size` does not exceed the source's size; otherwise it contributes
// `size - range_offset` bytes.
//
// sources      Data sources whose sizes are accumulated.
// range_offset Offset applied to every source.
// range_size   Number of bytes requested per source; 0 selects the "otherwise" branch above.
size_t sources_size(host_span<std::unique_ptr<datasource>> const sources,
size_t range_offset,
size_t range_size)
{
// NOTE(review): the next five lines appear to be the removed pre-change body, which summed the
// full size of every source and then allocated the buffer -- confirm against the raw diff.
auto const total_source_size =
std::accumulate(sources.begin(), sources.end(), 0ul, [](size_t sum, auto& source) {
return sum + source->size();
});
auto buffer = std::vector<uint8_t>(total_source_size);
// Fold over the sources, adding each source's in-range byte count to the running sum.
// The lambda captures `range_offset` and `range_size` by value.
return std::accumulate(sources.begin(), sources.end(), 0ul, [=](size_t sum, auto& source) {
auto const size = source->size();
// TODO take care of 0, 0, or *, 0 case.
// NOTE(review): `range_offset` is a size_t, so `size - range_offset` is evaluated in unsigned
// arithmetic and wraps around if `range_offset` exceeds `size`; nothing visible here guards
// against that -- confirm whether callers rule it out.
return sum +
(range_size == 0 or range_offset + range_size > size ? size - range_offset : range_size);
});
}

// Reads the requested byte range of every source into one contiguous host buffer and returns
// it, passing it through `decompress` unless `compression` is `compression_type::NONE`.
//
// sources      Data sources to read from, in order.
// compression  Compression type of the input; NONE returns the raw bytes as read.
// range_offset Offset passed to each source's `host_read`.
// range_size   Number of bytes requested from each source; 0 requests `source->size()` bytes.
std::vector<uint8_t> ingest_raw_input(host_span<std::unique_ptr<datasource>> const& sources,
compression_type compression,
size_t range_offset,
size_t range_size)
{
CUDF_FUNC_RANGE();
// Iterate through the user defined sources and read the contents into the local buffer
auto const total_source_size = sources_size(sources, range_offset, range_size);
auto buffer = std::vector<uint8_t>(total_source_size);

// Running count of bytes written so far; doubles as the write offset into `buffer`.
size_t bytes_read = 0;
for (const auto& source : sources) {
// NOTE(review): the next line appears to be the removed pre-change read (whole source from
// offset 0); this page shows the diff without +/- markers -- confirm against the raw diff.
bytes_read += source->host_read(0, source->size(), buffer.data() + bytes_read);
// Sources reporting `is_empty()` are skipped.
if (!source->is_empty()) {
// NOTE(review): when `range_size` is 0 this requests `source->size()` bytes starting at
// `range_offset`, while `sources_size` budgets `size - range_offset` bytes for that case;
// presumably `host_read` clamps the read to the bytes available -- confirm.
auto data_size = (range_size != 0) ? range_size : source->size();
auto destination = buffer.data() + bytes_read;
bytes_read += source->host_read(range_offset, data_size, destination);
}
}

// NOTE(review): the next line appears to be the removed pre-change return. Per the commit
// message, the ternary form was replaced because it was suspected of skipping RVO (copy
// elision) for `buffer`.
return (compression == compression_type::NONE) ? buffer : decompress(compression, buffer);
// Separate return statements: `buffer` is returned directly by name in the uncompressed case.
if (compression == compression_type::NONE) {
return buffer;
} else {
return decompress(compression, buffer);
}
}

table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
json_reader_options const& reader_opts,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(reader_opts.get_byte_range_offset() == 0 and reader_opts.get_byte_range_size() == 0,
"specifying a byte range is not yet supported");

auto const buffer = ingest_raw_input(sources, reader_opts.get_compression());
auto const buffer = ingest_raw_input(sources,
reader_opts.get_compression(),
reader_opts.get_byte_range_offset(),
reader_opts.get_byte_range_size());
auto data = host_span<char const>(reinterpret_cast<char const*>(buffer.data()), buffer.size());

try {
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/io/json/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <io/utilities/type_conversion.hpp>

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/detail/utilities/visitor_overload.hpp>
#include <cudf/groupby.hpp>
Expand Down Expand Up @@ -222,6 +223,7 @@ std::vector<uint8_t> ingest_raw_input(std::vector<std::unique_ptr<datasource>> c
size_t range_size,
size_t range_size_padded)
{
CUDF_FUNC_RANGE();
// Iterate through the user defined sources and read the contents into the local buffer
size_t total_source_size = 0;
for (const auto& source : sources) {
Expand Down Expand Up @@ -313,6 +315,7 @@ rmm::device_uvector<char> upload_data_to_device(json_reader_options const& reade
rmm::device_uvector<uint64_t>& rec_starts,
rmm::cuda_stream_view stream)
{
CUDF_FUNC_RANGE();
size_t end_offset = h_data.size();

// Trim lines that are outside range
Expand Down Expand Up @@ -592,6 +595,7 @@ table_with_metadata read_json(std::vector<std::unique_ptr<datasource>>& sources,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
if (reader_opts.is_enabled_experimental()) {
return experimental::read_json(sources, reader_opts, stream, mr);
}
Expand Down

0 comments on commit 262631b

Please sign in to comment.