From c83e5b3fdd7f9fe8a08c4f6874fbf847bba70c53 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar
Date: Tue, 18 Jun 2024 16:22:44 -0400
Subject: [PATCH] Fix JSON multi-source reading when total source size exceeds
 `INT_MAX` bytes (#15930)

Fixes #15917.

- [x] Batched read and parse operations
- [x] Fail when any single source file exceeds `INT_MAX` bytes. This case will
  be handled with a chunked reader later.

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/15930
---
 cpp/include/cudf/io/types.hpp          |  13 +++
 cpp/src/io/json/read_json.cu           | 121 +++++++++++++++++++++----
 cpp/tests/CMakeLists.txt               |   1 +
 cpp/tests/large_strings/json_tests.cpp |  58 ++++++++++++
 4 files changed, 177 insertions(+), 16 deletions(-)
 create mode 100644 cpp/tests/large_strings/json_tests.cpp

diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 0dab1c606de..0c96268f6c7 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -256,6 +256,19 @@ struct column_name_info {
   }
 
   column_name_info() = default;
+
+  /**
+   * @brief Compares two column name info structs for equality
+   *
+   * @param rhs column name info struct to compare against
+   * @return boolean indicating if this and rhs are equal
+   */
+  bool operator==(column_name_info const& rhs) const
+  {
+    return ((name == rhs.name) && (is_nullable == rhs.is_nullable) &&
+            (is_binary == rhs.is_binary) && (type_length == rhs.type_length) &&
+            (children == rhs.children));
+  };
 };
 
 /**
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index e999be8f83a..74001e5e01a 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -18,7 +18,9 @@
 #include "io/json/nested_json.hpp"
 #include "read_json.hpp"
 
+#include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -76,7 +78,7 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
   auto constexpr num_delimiter_chars = 1;
 
   if (compression == compression_type::NONE) {
-    std::vector delimiter_map{};
+    std::vector<std::size_t> delimiter_map{};
     std::vector<std::size_t> prefsum_source_sizes(sources.size());
     std::vector<std::unique_ptr<datasource::buffer>> h_buffers;
     delimiter_map.reserve(sources.size());
@@ -84,7 +86,7 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
     std::transform_inclusive_scan(sources.begin(),
                                   sources.end(),
                                   prefsum_source_sizes.begin(),
-                                  std::plus{},
+                                  std::plus<std::size_t>{},
                                   [](std::unique_ptr<datasource> const& s) { return s->size(); });
     auto upper =
       std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset);
@@ -259,6 +261,33 @@ datasource::owning_buffer<std::vector<std::uint8_t>> get_record_range_raw_input(
     readbufspan.size() - first_delim_pos - shift_for_nonzero_offset);
 }
 
+table_with_metadata read_batch(host_span<std::unique_ptr<datasource>> sources,
+                               json_reader_options const& reader_opts,
+                               rmm::cuda_stream_view stream,
+                               rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  datasource::owning_buffer<std::vector<std::uint8_t>> bufview =
+    get_record_range_raw_input(sources, reader_opts, stream);
+
+  // If input JSON buffer has single quotes and option to normalize single quotes is enabled,
+  // invoke pre-processing FST
+  if (reader_opts.is_enabled_normalize_single_quotes()) {
+    normalize_single_quotes(bufview, stream, rmm::mr::get_current_device_resource());
+  }
+
+  // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is
+  // enabled, invoke pre-processing FST
+  if (reader_opts.is_enabled_normalize_whitespace()) {
+    normalize_whitespace(bufview, stream, rmm::mr::get_current_device_resource());
+  }
+
+  auto buffer =
+    cudf::device_span<char const>(reinterpret_cast<char const*>(bufview.data()), bufview.size());
+  stream.synchronize();
+  return device_parse_nested_json(buffer, reader_opts, stream, mr);
+}
+
 table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                               json_reader_options const& reader_opts,
                               rmm::cuda_stream_view stream,
@@ -278,25 +307,85 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                  "Multiple inputs are supported only for JSON Lines format");
   }
 
-  datasource::owning_buffer<std::vector<std::uint8_t>> bufview =
-    get_record_range_raw_input(sources, reader_opts, stream);
+  std::for_each(sources.begin(), sources.end(), [](auto const& source) {
+    CUDF_EXPECTS(source->size() < std::numeric_limits<int>::max(),
+                 "The size of each source file must be less than INT_MAX bytes");
+  });
 
-  // If input JSON buffer has single quotes and option to normalize single quotes is enabled,
-  // invoke pre-processing FST
-  if (reader_opts.is_enabled_normalize_single_quotes()) {
-    normalize_single_quotes(bufview, stream, rmm::mr::get_current_device_resource());
+  constexpr size_t batch_size_ub = std::numeric_limits<int>::max();
+  size_t const chunk_offset = reader_opts.get_byte_range_offset();
+  size_t chunk_size = reader_opts.get_byte_range_size();
+  chunk_size = !chunk_size ? sources_size(sources, 0, 0) : chunk_size;
+
+  // Identify the position of starting source file from which to begin batching based on
+  // byte range offset. If the offset is larger than the sum of all source
+  // sizes, then start_source is total number of source files i.e. no file is read
+  size_t const start_source = [&]() {
+    size_t sum = 0;
+    for (size_t src_idx = 0; src_idx < sources.size(); ++src_idx) {
+      if (sum + sources[src_idx]->size() > chunk_offset) return src_idx;
+      sum += sources[src_idx]->size();
+    }
+    return sources.size();
+  }();
+
+  // Construct batches of source files, with starting position of batches indicated by
+  // batch_positions. The size of each batch i.e. the sum of sizes of the source files in the batch
+  // is capped at INT_MAX bytes.
+  size_t cur_size = 0;
+  std::vector<size_t> batch_positions;
+  std::vector<size_t> batch_sizes;
+  batch_positions.push_back(0);
+  for (size_t i = start_source; i < sources.size(); i++) {
+    cur_size += sources[i]->size();
+    if (cur_size >= batch_size_ub) {
+      batch_positions.push_back(i);
+      batch_sizes.push_back(cur_size - sources[i]->size());
+      cur_size = sources[i]->size();
+    }
   }
+  batch_positions.push_back(sources.size());
+  batch_sizes.push_back(cur_size);
 
-  // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is
-  // enabled, invoke pre-processing FST
-  if (reader_opts.is_enabled_normalize_whitespace()) {
-    normalize_whitespace(bufview, stream, rmm::mr::get_current_device_resource());
+  // If there is a single batch, then we can directly return the table without the
+  // unnecessary concatenate
+  if (batch_sizes.size() == 1) return read_batch(sources, reader_opts, stream, mr);
+
+  std::vector<table_with_metadata> partial_tables;
+  json_reader_options batched_reader_opts{reader_opts};
+
+  // Dispatch individual batches to read_batch and push the resulting table into
+  // partial_tables array. Note that the reader options need to be updated for each
+  // batch to adjust byte range offset and byte range size.
+  for (size_t i = 0; i < batch_sizes.size(); i++) {
+    batched_reader_opts.set_byte_range_size(std::min(batch_sizes[i], chunk_size));
+    partial_tables.emplace_back(read_batch(
+      host_span<std::unique_ptr<datasource>>(sources.begin() + batch_positions[i],
+                                             batch_positions[i + 1] - batch_positions[i]),
+      batched_reader_opts,
+      stream,
+      rmm::mr::get_current_device_resource()));
+    if (chunk_size <= batch_sizes[i]) break;
+    chunk_size -= batch_sizes[i];
+    batched_reader_opts.set_byte_range_offset(0);
   }
 
-  auto buffer =
-    cudf::device_span<char const>(reinterpret_cast<char const*>(bufview.data()), bufview.size());
-  stream.synchronize();
-  return device_parse_nested_json(buffer, reader_opts, stream, mr);
+  auto expects_schema_equality =
+    std::all_of(partial_tables.begin() + 1,
+                partial_tables.end(),
+                [gt = partial_tables[0].metadata.schema_info](auto& ptbl) {
+                  return ptbl.metadata.schema_info == gt;
+                });
+  CUDF_EXPECTS(expects_schema_equality,
+               "Mismatch in JSON schema across batches in multi-source multi-batch reading");
+
+  auto partial_table_views = std::vector<cudf::table_view>(partial_tables.size());
+  std::transform(partial_tables.begin(),
+                 partial_tables.end(),
+                 partial_table_views.begin(),
+                 [](auto const& table) { return table.tbl->view(); });
+  return table_with_metadata{cudf::concatenate(partial_table_views, stream, mr),
+                             {partial_tables[0].metadata.schema_info}};
 }
 
 }  // namespace cudf::io::json::detail
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 329edbe4d36..eda470d2309 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -570,6 +570,7 @@ ConfigureTest(
   LARGE_STRINGS_TEST
   large_strings/concatenate_tests.cpp
   large_strings/case_tests.cpp
+  large_strings/json_tests.cpp
   large_strings/large_strings_fixture.cpp
   large_strings/merge_tests.cpp
   large_strings/parquet_tests.cpp
diff --git a/cpp/tests/large_strings/json_tests.cpp b/cpp/tests/large_strings/json_tests.cpp
new file mode 100644
index 00000000000..bf16d131ba7
--- /dev/null
+++ b/cpp/tests/large_strings/json_tests.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "large_strings_fixture.hpp"
+
+#include 
+#include 
+
+struct JsonLargeReaderTest : public cudf::test::StringsLargeTest {};
+
+TEST_F(JsonLargeReaderTest, MultiBatch)
+{
+  std::string json_string = R"(
+    { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 }
+    { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 }
+    { "a": { "y" : 6}, "b" : [6 ], "c": 13 }
+    { "a": { "y" : 6}, "b" : [7 ], "c": 14 })";
+  constexpr size_t expected_file_size = std::numeric_limits<int>::max() / 2;
+  std::size_t const log_repetitions =
+    static_cast<std::size_t>(std::ceil(std::log2(expected_file_size / json_string.size())));
+
+  json_string.reserve(json_string.size() * (1UL << log_repetitions));
+  std::size_t numrows = 4;
+  for (std::size_t i = 0; i < log_repetitions; i++) {
+    json_string += json_string;
+    numrows <<= 1;
+  }
+
+  constexpr int num_sources = 2;
+  std::vector<cudf::host_span<char const>> hostbufs(
+    num_sources, cudf::host_span<char const>(json_string.data(), json_string.size()));
+
+  // Initialize parsing options (reading json lines)
+  cudf::io::json_reader_options json_lines_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{
+        cudf::host_span<cudf::host_span<char const>>(hostbufs.data(), hostbufs.size())})
+      .lines(true)
+      .compression(cudf::io::compression_type::NONE)
+      .recovery_mode(cudf::io::json_recovery_mode_t::FAIL);
+
+  // Read full test data via existing, nested JSON lines reader
+  cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options);
+  ASSERT_EQ(current_reader_table.tbl->num_rows(), numrows * num_sources);
+}
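
For reference, below is a minimal standalone sketch of the batching strategy this patch introduces in read_json: source files are grouped greedily into batches whose combined size stays below INT_MAX bytes, each batch is read separately (with the byte range options adjusted per batch), and the partial tables are concatenated. The sketch only mirrors the grouping loop (batch_positions/batch_sizes); the struct source_batch and the helper split_into_batches are illustrative names, not part of libcudf, and it assumes, as the patch enforces with CUDF_EXPECTS, that every individual source is smaller than INT_MAX bytes.

#include <cstddef>
#include <cstdio>
#include <limits>
#include <vector>

// One batch of consecutive source files whose combined size stays below INT_MAX bytes.
struct source_batch {
  std::size_t first;  // index of the first source file in the batch
  std::size_t last;   // one past the index of the last source file in the batch
  std::size_t bytes;  // combined size of the batch in bytes
};

// Greedily group per-source byte counts into INT_MAX-capped batches, mirroring the
// batch_positions/batch_sizes loop added to read_json above.
std::vector<source_batch> split_into_batches(std::vector<std::size_t> const& source_sizes)
{
  constexpr std::size_t batch_size_ub = std::numeric_limits<int>::max();
  std::vector<source_batch> batches;
  std::size_t first = 0;
  std::size_t bytes = 0;
  for (std::size_t i = 0; i < source_sizes.size(); ++i) {
    bytes += source_sizes[i];
    if (bytes >= batch_size_ub) {
      // Adding source i reaches the INT_MAX bound: close the batch before it and
      // start a new batch whose first member is source i.
      batches.push_back({first, i, bytes - source_sizes[i]});
      first = i;
      bytes = source_sizes[i];
    }
  }
  batches.push_back({first, source_sizes.size(), bytes});
  return batches;
}

int main()
{
  // Four 600 MiB sources: the first three stay below INT_MAX together, so the
  // fourth one starts a second batch.
  std::vector<std::size_t> const sizes(4, std::size_t{600} * 1024 * 1024);
  for (auto const& b : split_into_batches(sizes)) {
    std::printf("sources [%zu, %zu) -> %zu bytes\n", b.first, b.last, b.bytes);
  }
  return 0;
}

As in the patch, a single resulting batch can be returned directly; only the multi-batch path needs the per-batch schema equality check and the final cudf::concatenate of the partial tables.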