Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimizing multi-source byte range reading in JSON reader #15396

Merged
merged 48 commits into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from 46 commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
697cf65
byte range reader improvement
shrshi Mar 26, 2024
115c2c6
subchunk size heuristic; multistream d2d copy; small logic fix
shrshi Mar 27, 2024
c99e4ef
Merge branch 'branch-24.06' into byte-range-improvement
shrshi Mar 27, 2024
7f97196
Merge branch 'branch-24.06' into byte-range-improvement
shrshi Mar 27, 2024
615f005
Merge branch 'branch-24.06' into byte-range-improvement
shrshi Apr 1, 2024
0ac251d
overhaul commit
shrshi Apr 4, 2024
5c21ee4
Merge branch 'branch-24.06' into byte-range-improvement
shrshi Apr 4, 2024
09641db
format fix
shrshi Apr 5, 2024
c186435
Merge branch 'byte-range-improvement' of github.com:shrshi/cudf into …
shrshi Apr 5, 2024
e912671
more fixes
shrshi Apr 5, 2024
8557cf9
cleanup
shrshi Apr 5, 2024
16f7e7f
Merge branch 'branch-24.06' into byte-range-improvement
shrshi Apr 5, 2024
3582358
fixes
shrshi Apr 5, 2024
90b5246
Merge branch 'byte-range-improvement' of github.com:shrshi/cudf into …
shrshi Apr 5, 2024
02a556d
cleanup
shrshi Apr 5, 2024
c7a2799
Merge branch 'branch-24.06' into byte-range-improvement
shrshi Apr 5, 2024
9901f97
Merge branch 'branch-24.06' into byte-range-improvement
shrshi Apr 9, 2024
a535136
Merge branch 'branch-24.06' into byte-range-improvement
shrshi Apr 10, 2024
7f44bf4
Merge branch 'branch-24.06' into byte-range-improvement
shrshi Apr 10, 2024
685127c
logic fix
shrshi Apr 15, 2024
ad9af4d
Merge branch 'byte-range-improvement' of github.com:shrshi/cudf into …
shrshi Apr 15, 2024
458bc67
fix to initial allocation for compressed file input
shrshi Apr 16, 2024
d080a5f
Merge branch 'branch-24.06' into byte-range-improvement
shrshi Apr 16, 2024
0e30c85
Merge branch 'branch-24.06' into byte-range-improvement
shrshi Apr 16, 2024
28fedd1
removed uniq ptrs, passing device uvecs directly
shrshi Apr 17, 2024
f1bf818
cleanup; so many fixes
shrshi Apr 17, 2024
f29f223
Merge branch 'byte-range-improvement' of github.com:shrshi/cudf into …
shrshi Apr 17, 2024
21f07c2
merge
shrshi Apr 17, 2024
0066f2e
Merge branch 'branch-24.06' of github.com:rapidsai/cudf into byte-ran…
shrshi Apr 18, 2024
6032949
partially addressing PR reviews
shrshi Apr 18, 2024
c503e33
addressing pr reviews
shrshi Apr 18, 2024
5004f0c
formatting
shrshi Apr 18, 2024
a1fe36b
reducing memalloc for whole file read
shrshi Apr 22, 2024
6c99591
merge
shrshi Apr 22, 2024
e4c04cd
addressing PR reviews
shrshi Apr 22, 2024
79fa4f3
partially address PR reviews
shrshi Apr 22, 2024
1b0c8f8
docs fix
shrshi Apr 23, 2024
5dc53d8
addressing PR reviews
shrshi Apr 23, 2024
d29fdf8
Merge branch 'branch-24.06' into byte-range-improvement
shrshi Apr 23, 2024
bd18397
fix
shrshi Apr 24, 2024
a5e49af
Merge branch 'byte-range-improvement' of github.com:shrshi/cudf into …
shrshi Apr 24, 2024
54daff2
Merge branch 'branch-24.06' into byte-range-improvement
shrshi Apr 25, 2024
9075159
partially addressing reviews
shrshi Apr 29, 2024
7d826af
PR reviews
shrshi Apr 29, 2024
fb9fdae
Merge branch 'byte-range-improvement' of github.com:shrshi/cudf into …
shrshi Apr 29, 2024
032a5da
Merge branch 'branch-24.06' into byte-range-improvement
shrshi Apr 29, 2024
329f9ae
adding consts
shrshi Apr 30, 2024
df68938
Merge branch 'byte-range-improvement' of github.com:shrshi/cudf into …
shrshi Apr 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions cpp/include/cudf/io/detail/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#pragma once

#include <cudf/io/datasource.hpp>
#include <cudf/io/json.hpp>

#include <rmm/cuda_stream_view.hpp>
Expand Down Expand Up @@ -56,22 +57,22 @@ void write_json(data_sink* sink,
/**
* @brief Normalize single quotes to double quotes using FST
*
* @param inbuf Input device buffer
* @param indata Input device buffer; the overload taking this parameter returns void, and its
* definition swaps the normalized output into this reference, so on return it holds the result
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource to use for device memory allocation
*/
rmm::device_uvector<char> normalize_single_quotes(rmm::device_uvector<char>&& inbuf,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);
void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<char>>& indata,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

/**
* @brief Normalize unquoted whitespace (space and tab characters) using FST
*
* @param inbuf Input device buffer
* @param indata Input device buffer; the overload taking this parameter returns void, and its
* definition swaps the normalized output into this reference, so on return it holds the result
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource to use for device memory allocation
*/
rmm::device_uvector<char> normalize_whitespace(rmm::device_uvector<char>&& inbuf,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);
void normalize_whitespace(datasource::owning_buffer<rmm::device_uvector<char>>& indata,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);
} // namespace cudf::io::json::detail
30 changes: 16 additions & 14 deletions cpp/src/io/json/json_normalization.cu
Original file line number Diff line number Diff line change
Expand Up @@ -298,52 +298,54 @@ struct TransduceToNormalizedWS {

namespace detail {

// Runs the quote-normalization FST over a device buffer.
// NOTE(review): this is a diff hunk shown without +/- markers; the signature returning
// rmm::device_uvector and the `inbuf` lines appear to be the removed side, and the void
// signature and `indata` lines the added side -- confirm against the PR.
rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT>&& inbuf,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<SymbolT>>& indata,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
// Assemble the transducer from the normalize_quotes symbol groups, transition table and
// translation functor.
auto parser = fst::detail::make_fst(
fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs),
fst::detail::make_transition_table(normalize_quotes::qna_state_tt),
fst::detail::make_translation_functor(normalize_quotes::TransduceToNormalizedQuotes{}),
stream);

// The output buffer is sized at twice the input length.
// NOTE(review): presumably the worst-case growth of the quote translation -- confirm.
rmm::device_uvector<SymbolT> outbuf(inbuf.size() * 2, stream, mr);
rmm::device_uvector<SymbolT> outbuf(indata.size() * 2, stream, mr);
rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
// Transduce the whole input starting from normalize_quotes::start_state. Symbols are written to
// outbuf, the second output goes to a discard iterator, and outbuf_size is passed as the
// destination for (presumably) the number of symbols produced.
parser.Transduce(inbuf.data(),
static_cast<SymbolOffsetT>(inbuf.size()),
parser.Transduce(indata.data(),
static_cast<SymbolOffsetT>(indata.size()),
outbuf.data(),
thrust::make_discard_iterator(),
outbuf_size.data(),
normalize_quotes::start_state,
stream);

// Shrink outbuf to the size reported in outbuf_size, read back via value(stream).
outbuf.resize(outbuf_size.value(stream), stream);
return outbuf;
// Wrap the result in an owning_buffer and swap it into the caller's reference. The previous
// contents of indata end up in outdata, which goes out of scope at the end of the function.
datasource::owning_buffer<rmm::device_uvector<SymbolT>> outdata(std::move(outbuf));
std::swap(indata, outdata);
}

// Runs the whitespace-normalization FST over a device buffer.
// NOTE(review): this is a diff hunk shown without +/- markers; the signature returning
// rmm::device_uvector and the `inbuf` lines appear to be the removed side, and the void
// signature and `indata` lines the added side -- confirm against the PR.
rmm::device_uvector<SymbolT> normalize_whitespace(rmm::device_uvector<SymbolT>&& inbuf,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
void normalize_whitespace(datasource::owning_buffer<rmm::device_uvector<SymbolT>>& indata,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
// Assemble the transducer from the normalize_whitespace symbol groups, transition table and
// translation functor.
auto parser = fst::detail::make_fst(
fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs),
fst::detail::make_transition_table(normalize_whitespace::wna_state_tt),
fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}),
stream);

// The output buffer is sized equal to the input length.
// NOTE(review): presumably this normalization never produces more symbols than it reads --
// confirm.
rmm::device_uvector<SymbolT> outbuf(inbuf.size(), stream, mr);
rmm::device_uvector<SymbolT> outbuf(indata.size(), stream, mr);
rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
// Transduce the whole input starting from normalize_whitespace::start_state. Symbols are written
// to outbuf, the second output goes to a discard iterator, and outbuf_size is passed as the
// destination for (presumably) the number of symbols produced.
parser.Transduce(inbuf.data(),
static_cast<SymbolOffsetT>(inbuf.size()),
parser.Transduce(indata.data(),
static_cast<SymbolOffsetT>(indata.size()),
outbuf.data(),
thrust::make_discard_iterator(),
outbuf_size.data(),
normalize_whitespace::start_state,
stream);

// Shrink outbuf to the size reported in outbuf_size, read back via value(stream).
outbuf.resize(outbuf_size.value(stream), stream);
return outbuf;
// Wrap the result in an owning_buffer and swap it into the caller's reference. The previous
// contents of indata end up in outdata, which goes out of scope at the end of the function.
datasource::owning_buffer<rmm::device_uvector<SymbolT>> outdata(std::move(outbuf));
std::swap(indata, outdata);
}

} // namespace detail
Expand Down
Loading
Loading