From 82a1913c97d8eeb6c4826474cef05fcac9266a0c Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 25 Jul 2023 15:48:04 -0700 Subject: [PATCH 1/3] experimental legacy update --- cpp/CMakeLists.txt | 8 ++-- cpp/include/cudf/io/detail/data_casting.cuh | 4 +- cpp/include/cudf/io/detail/json.hpp | 2 +- .../{experimental => }/byte_range_info.cu | 6 +-- cpp/src/io/json/json_column.cu | 38 +++++++-------- cpp/src/io/json/{ => legacy}/json_gpu.cu | 17 ++----- cpp/src/io/json/{ => legacy}/json_gpu.hpp | 12 ++--- cpp/src/io/json/legacy/read_json.hpp | 33 +++++++++++++ cpp/src/io/json/{ => legacy}/reader_impl.cu | 26 ++++------ cpp/src/io/json/nested_json_gpu.cu | 16 +++---- .../io/json/{experimental => }/read_json.cu | 14 ++++-- .../io/json/{experimental => }/read_json.hpp | 6 +-- cpp/src/io/json/write_json.cu | 2 +- cpp/tests/io/json_chunked_reader.cpp | 4 +- cpp/tests/io/json_type_cast_test.cu | 48 +++++++++---------- 15 files changed, 125 insertions(+), 111 deletions(-) rename cpp/src/io/json/{experimental => }/byte_range_info.cu (89%) rename cpp/src/io/json/{ => legacy}/json_gpu.cu (98%) rename cpp/src/io/json/{ => legacy}/json_gpu.hpp (95%) create mode 100644 cpp/src/io/json/legacy/read_json.hpp rename cpp/src/io/json/{ => legacy}/reader_impl.cu (96%) rename cpp/src/io/json/{experimental => }/read_json.cu (96%) rename cpp/src/io/json/{experimental => }/read_json.hpp (91%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8a19af31bf5..740c2f593c3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -371,12 +371,12 @@ add_library( src/io/csv/writer_impl.cu src/io/functions.cpp src/io/json/json_column.cu - src/io/json/json_gpu.cu + src/io/json/legacy/json_gpu.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu - src/io/json/reader_impl.cu - src/io/json/experimental/byte_range_info.cu - src/io/json/experimental/read_json.cu + src/io/json/read_json.cu + src/io/json/byte_range_info.cu + src/io/json/legacy/reader_impl.cu src/io/json/write_json.cu src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index d764e8533c6..b7ee5e05e96 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -32,7 +32,7 @@ #include -namespace cudf::io::json::experimental::detail { +namespace cudf::io::json::detail { // Unicode code point escape sequence static constexpr char UNICODE_SEQ = 0x7F; @@ -428,4 +428,4 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, return out_col; } -} // namespace cudf::io::json::experimental::detail +} // namespace cudf::io::json::detail diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 7b0350e9bc8..6930a4fdb25 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -33,7 +33,7 @@ namespace cudf::io::json::detail { * * @return cudf::table object that contains the array of cudf::column. */ -table_with_metadata read_json(std::vector>& sources, +table_with_metadata read_json(host_span> sources, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/src/io/json/experimental/byte_range_info.cu b/cpp/src/io/json/byte_range_info.cu similarity index 89% rename from cpp/src/io/json/experimental/byte_range_info.cu rename to cpp/src/io/json/byte_range_info.cu index d6e30d090a5..d359e917dfa 100644 --- a/cpp/src/io/json/experimental/byte_range_info.cu +++ b/cpp/src/io/json/byte_range_info.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include #include -namespace cudf::io::detail::json::experimental { +namespace cudf::io::json::detail { // Extract the first character position in the string. size_type find_first_delimiter(device_span d_data, @@ -33,4 +33,4 @@ size_type find_first_delimiter(device_span d_data, return first_delimiter_position != d_data.end() ? first_delimiter_position - d_data.begin() : -1; } -} // namespace cudf::io::detail::json::experimental +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 3a79d832d06..7e599daa27c 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -52,8 +52,7 @@ #include #include -namespace cudf::io::json { -namespace detail { +namespace cudf::io::json::detail { // DEBUG prints auto to_cat = [](auto v) -> std::string { @@ -348,14 +347,14 @@ std::vector copy_strings_to_host(device_span input, cudf::io::parse_options_view options_view{}; options_view.quotechar = '\0'; // no quotes options_view.keepquotes = true; - auto d_column_names = experimental::detail::parse_data(string_views.begin(), - num_strings, - data_type{type_id::STRING}, - rmm::device_buffer{}, - 0, - options_view, - stream, - rmm::mr::get_current_device_resource()); + auto d_column_names = parse_data(string_views.begin(), + num_strings, + data_type{type_id::STRING}, + rmm::device_buffer{}, + 0, + options_view, + stream, + rmm::mr::get_current_device_resource()); auto to_host = [](auto const& col) { if (col.is_empty()) return std::vector{}; auto const scv = cudf::strings_column_view(col); @@ -796,14 +795,14 @@ std::pair, std::vector> device_json_co auto [result_bitmask, null_count] = make_validity(json_col); // Convert strings to the inferred data type - auto col = experimental::detail::parse_data(string_spans_it, - col_size, - target_type, - std::move(result_bitmask), - null_count, - options.view(), - stream, - mr); + auto col = parse_data(string_spans_it, + col_size, + target_type, + std::move(result_bitmask), + null_count, + options.view(), + stream, + mr); // Reset nullable if we do not have nulls // This is to match the existing JSON reader's behaviour: @@ -1044,5 +1043,4 @@ table_with_metadata device_parse_nested_json(device_span d_input, return table_with_metadata{std::make_unique(std::move(out_columns)), {out_column_names}}; } -} // namespace detail -} // namespace cudf::io::json +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/legacy/json_gpu.cu similarity index 98% rename from cpp/src/io/json/json_gpu.cu rename to cpp/src/io/json/legacy/json_gpu.cu index 167ae332ac7..d28d5614591 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/legacy/json_gpu.cu @@ -45,11 +45,7 @@ using cudf::device_span; -namespace cudf { -namespace io { -namespace json { -namespace gpu { -using namespace ::cudf; +namespace cudf::io::json::detail::legacy { namespace { /** @@ -515,7 +511,7 @@ __global__ void collect_keys_info_kernel(parse_options_view const options, } // namespace /** - * @copydoc cudf::io::json::gpu::convert_json_to_columns + * @copydoc cudf::io::json::detail::legacy::convert_json_to_columns */ void convert_json_to_columns(parse_options_view const& opts, device_span const data, @@ -547,7 +543,7 @@ void convert_json_to_columns(parse_options_view const& opts, } /** - * @copydoc cudf::io::gpu::detect_data_types + * @copydoc cudf::io::json::detail::legacy::detect_data_types */ std::vector detect_data_types( @@ -592,7 +588,7 @@ std::vector detect_data_types( } /** - * @copydoc cudf::io::json::gpu::gpu_collect_keys_info + * @copydoc cudf::io::json::detail::legacy::collect_keys_info */ void collect_keys_info(parse_options_view const& options, device_span const data, @@ -615,7 +611,4 @@ void collect_keys_info(parse_options_view const& options, CUDF_CHECK_CUDA(stream.value()); } -} // namespace gpu -} // namespace json -} // namespace io -} // namespace cudf +} // namespace cudf::io::json::detail::legacy diff --git a/cpp/src/io/json/json_gpu.hpp b/cpp/src/io/json/legacy/json_gpu.hpp similarity index 95% rename from cpp/src/io/json/json_gpu.hpp rename to cpp/src/io/json/legacy/json_gpu.hpp index 46bc2dd95a3..48fe6c69390 100644 --- a/cpp/src/io/json/json_gpu.hpp +++ b/cpp/src/io/json/legacy/json_gpu.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,10 +31,7 @@ using cudf::device_span; -namespace cudf { -namespace io { -namespace json { -namespace gpu { +namespace cudf::io::json::detail::legacy { using col_map_type = concurrent_unordered_map; /** @@ -100,7 +97,4 @@ void collect_keys_info(parse_options_view const& options, thrust::optional keys_info, rmm::cuda_stream_view stream); -} // namespace gpu -} // namespace json -} // namespace io -} // namespace cudf +} // namespace cudf::io::json::detail::legacy diff --git a/cpp/src/io/json/legacy/read_json.hpp b/cpp/src/io/json/legacy/read_json.hpp new file mode 100644 index 00000000000..e3fa010e08e --- /dev/null +++ b/cpp/src/io/json/legacy/read_json.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include + +#include +#include + +namespace cudf::io::json::detail::legacy { + +table_with_metadata read_json(host_span> sources, + json_reader_options const& reader_opts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace cudf::io::json::detail::legacy diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/legacy/reader_impl.cu similarity index 96% rename from cpp/src/io/json/reader_impl.cu rename to cpp/src/io/json/legacy/reader_impl.cu index c7b46813909..c524c041df7 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/legacy/reader_impl.cu @@ -16,8 +16,6 @@ #include "json_gpu.hpp" -#include "experimental/read_json.hpp" - #include #include @@ -56,9 +54,8 @@ using cudf::host_span; -namespace cudf::io::json::detail { +namespace cudf::io::json::detail::legacy { -using col_map_type = cudf::io::json::gpu::col_map_type; using col_map_ptr_type = std::unique_ptr>; /** @@ -129,8 +126,7 @@ std::unique_ptr
create_json_keys_info_table(parse_options_view const& par { // Count keys rmm::device_scalar key_counter(0, stream); - cudf::io::json::gpu::collect_keys_info( - parse_opts, data, row_offsets, key_counter.data(), {}, stream); + collect_keys_info(parse_opts, data, row_offsets, key_counter.data(), {}, stream); // Allocate columns to store hash value, length, and offset of each JSON object key in the input auto const num_keys = key_counter.value(stream); @@ -148,8 +144,7 @@ std::unique_ptr
create_json_keys_info_table(parse_options_view const& par // Reset the key counter - now used for indexing key_counter.set_value_to_zero_async(stream); // Fill the allocated columns - cudf::io::json::gpu::collect_keys_info( - parse_opts, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream); + collect_keys_info(parse_opts, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream); return info_table; } @@ -213,7 +208,7 @@ std::pair, col_map_ptr_type> get_json_object_keys_hashe create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } -std::vector ingest_raw_input(std::vector> const& sources, +std::vector ingest_raw_input(host_span> sources, compression_type compression, size_t range_offset, size_t range_size, @@ -447,7 +442,7 @@ std::vector get_data_types(json_reader_options const& reader_opts, auto const num_columns = column_names.size(); auto const do_set_null_count = column_map->capacity() > 0; - auto const h_column_infos = cudf::io::json::gpu::detect_data_types( + auto const h_column_infos = detect_data_types( parse_opts, data, rec_starts, do_set_null_count, num_columns, column_map, stream); auto get_type_id = [&](auto const& cinfo) { @@ -523,7 +518,7 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async( num_columns, stream, rmm::mr::get_current_device_resource()); - cudf::io::json::gpu::convert_json_to_columns( + convert_json_to_columns( parse_opts, data, rec_starts, d_dtypes, column_map, d_data, d_valid, d_valid_counts, stream); stream.synchronize(); @@ -591,16 +586,11 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, * * @return Table and its metadata */ -table_with_metadata read_json(std::vector>& sources, +table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_FUNC_RANGE(); - if (not reader_opts.is_enabled_legacy()) { - return cudf::io::detail::json::experimental::read_json(sources, reader_opts, stream, mr); - } - CUDF_EXPECTS(not sources.empty(), "No sources were defined"); CUDF_EXPECTS(sources.size() == 1 or reader_opts.get_compression() == compression_type::NONE, "Multiple compressed inputs are not supported"); @@ -664,4 +654,4 @@ table_with_metadata read_json(std::vector>& sources, mr); } -} // namespace cudf::io::json::detail +} // namespace cudf::io::json::detail::legacy diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 3b6c2b18250..0629ceb95c6 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1993,14 +1993,14 @@ std::pair, std::vector> json_column_to auto [result_bitmask, null_count] = make_validity(json_col); // Convert strings to the inferred data type - auto col = experimental::detail::parse_data(string_spans_it, - col_size, - target_type, - std::move(result_bitmask), - null_count, - parsing_options(options).view(), - stream, - mr); + auto col = parse_data(string_spans_it, + col_size, + target_type, + std::move(result_bitmask), + null_count, + parsing_options(options).view(), + stream, + mr); // Reset nullable if we do not have nulls // This is to match the existing JSON reader's behaviour: diff --git a/cpp/src/io/json/experimental/read_json.cu b/cpp/src/io/json/read_json.cu similarity index 96% rename from cpp/src/io/json/experimental/read_json.cu rename to cpp/src/io/json/read_json.cu index dbb4a628c44..cf1aaa211b4 100644 --- a/cpp/src/io/json/experimental/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -17,6 +17,7 @@ #include "read_json.hpp" #include +#include #include #include @@ -30,7 +31,7 @@ #include -namespace cudf::io::detail::json::experimental { +namespace cudf::io::json::detail { size_t sources_size(host_span> const sources, size_t range_offset, @@ -44,7 +45,7 @@ size_t sources_size(host_span> const sources, }); } -rmm::device_uvector ingest_raw_input(host_span> const& sources, +rmm::device_uvector ingest_raw_input(host_span> sources, compression_type compression, size_t range_offset, size_t range_size, @@ -197,6 +198,11 @@ table_with_metadata read_json(host_span> sources, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); + + if (reader_opts.is_enabled_legacy()) { + return cudf::io::json::detail::legacy::read_json(sources, reader_opts, stream, mr); + } + if (not should_load_whole_source(reader_opts)) { CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Specifying a byte range is supported only for JSON Lines"); @@ -213,8 +219,8 @@ table_with_metadata read_json(host_span> sources, auto const buffer = get_record_range_raw_input(sources, reader_opts, stream); - return cudf::io::json::detail::device_parse_nested_json(buffer, reader_opts, stream, mr); + return device_parse_nested_json(buffer, reader_opts, stream, mr); // For debug purposes, use host_parse_nested_json() } -} // namespace cudf::io::detail::json::experimental +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/experimental/read_json.hpp b/cpp/src/io/json/read_json.hpp similarity index 91% rename from cpp/src/io/json/experimental/read_json.hpp rename to cpp/src/io/json/read_json.hpp index 48e104c4254..db37e7abcdb 100644 --- a/cpp/src/io/json/experimental/read_json.hpp +++ b/cpp/src/io/json/read_json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ #include -namespace cudf::io::detail::json::experimental { +namespace cudf::io::json::detail { table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, @@ -42,4 +42,4 @@ size_type find_first_delimiter_in_chunk(host_span #include -#include +#include /** * @brief Base test fixture for JSON reader tests @@ -37,7 +37,7 @@ std::vector skeleton_for_parellel_chunk_reader( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - using namespace cudf::io::detail::json::experimental; + using namespace cudf::io::json::detail; using cudf::size_type; // assuming single source. size_t total_source_size = 0; diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index a7710036125..5c32131114d 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -79,14 +79,14 @@ TEST_F(JSONTypeCastTest, String) auto null_mask = std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - auto str_col = cudf::io::json::experimental::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + auto str_col = cudf::io::json::detail::parse_data(svs.data(), + svs.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto out_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 and i != 4; }); @@ -115,14 +115,14 @@ TEST_F(JSONTypeCastTest, Int) auto null_mask = std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - auto col = cudf::io::json::experimental::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + auto col = cudf::io::json::detail::parse_data(svs.data(), + svs.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::fixed_width_column_wrapper{{1, 2, 3, 1, 5, 0}, {1, 0, 1, 1, 1, 1}}; @@ -158,14 +158,14 @@ TEST_F(JSONTypeCastTest, StringEscapes) auto null_mask = std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size())); - auto col = cudf::io::json::experimental::detail::parse_data(svs.data(), - svs.size(), - type, - std::move(null_mask), - 0, - default_json_options().view(), - stream, - mr); + auto col = cudf::io::json::detail::parse_data(svs.data(), + svs.size(), + type, + std::move(null_mask), + 0, + default_json_options().view(), + stream, + mr); auto expected = cudf::test::strings_column_wrapper{ {"🚀", "A🚀AA", "", "", "", "\\", "➩", "", "\"\\/\b\f\n\r\t"}, From 8c71dcca985aaf3d885cb07e5158f16ce0001306 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 31 Jul 2023 15:59:21 -0700 Subject: [PATCH 2/3] sort --- cpp/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f56d08190d8..d6b2fb10c23 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -370,12 +370,12 @@ add_library( src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu src/io/functions.cpp + src/io/json/byte_range_info.cu src/io/json/json_column.cu - src/io/json/legacy/json_gpu.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu src/io/json/read_json.cu - src/io/json/byte_range_info.cu + src/io/json/legacy/json_gpu.cu src/io/json/legacy/reader_impl.cu src/io/json/write_json.cu src/io/orc/aggregate_orc_metadata.cpp From 018dd11abebef1597b0122175693cf09ac4cbf6c Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 31 Jul 2023 16:10:26 -0700 Subject: [PATCH 3/3] clean up --- cpp/src/io/json/read_json.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index cf1aaa211b4..080da7800f4 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -200,7 +200,7 @@ table_with_metadata read_json(host_span> sources, CUDF_FUNC_RANGE(); if (reader_opts.is_enabled_legacy()) { - return cudf::io::json::detail::legacy::read_json(sources, reader_opts, stream, mr); + return legacy::read_json(sources, reader_opts, stream, mr); } if (not should_load_whole_source(reader_opts)) {