From 9f2247f4d3e4a035b141251ea4cf4f1b6a6ef8ec Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 22 Jul 2022 11:51:44 -0700 Subject: [PATCH 01/40] add placeholder experimental JSON reader --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/io/json.hpp | 28 +++++++++++++++++ cpp/src/io/json/experimental/read_json.cpp | 31 +++++++++++++++++++ cpp/src/io/json/experimental/read_json.hpp | 36 ++++++++++++++++++++++ cpp/src/io/json/reader_impl.cu | 6 ++++ python/cudf/cudf/_lib/cpp/io/json.pxd | 5 +++ python/cudf/cudf/_lib/json.pyx | 4 ++- python/cudf/cudf/io/json.py | 9 ++++-- python/cudf/cudf/utils/ioutils.py | 2 +- 9 files changed, 118 insertions(+), 4 deletions(-) create mode 100644 cpp/src/io/json/experimental/read_json.cpp create mode 100644 cpp/src/io/json/experimental/read_json.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4819d1c2f5c..104e731c470 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -322,6 +322,7 @@ add_library( src/io/functions.cpp src/io/json/json_gpu.cu src/io/json/reader_impl.cu + src/io/json/experimental/read_json.cpp src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu src/io/orc/orc.cpp diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 9ccb5ec4d58..01334060063 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -80,6 +80,9 @@ class json_reader_options { // Whether to parse dates as DD/MM versus MM/DD bool _dayfirst = false; + // Whether to parse dates as DD/MM versus MM/DD + bool _experimental = false; + /** * @brief Constructor from source info. * @@ -193,6 +196,13 @@ class json_reader_options { */ bool is_enabled_dayfirst() const { return _dayfirst; } + /** + * @brief Whether the experimental reader should be used. + * + * @returns true if the experimental reader will be used, false otherwise + */ + bool is_enabled_experimental() const { return _experimental; } + /** * @brief Set data types for columns to be read. 
* @@ -241,6 +251,13 @@ class json_reader_options { * @param val Boolean value to enable/disable day first parsing format */ void enable_dayfirst(bool val) { _dayfirst = val; } + + /** + * @brief Set whether to use the experimental reader. + * + * @param val Boolean value to enable/disable the experimental readers + */ + void enable_experimental(bool val) { _experimental = val; } }; /** @@ -347,6 +364,17 @@ class json_reader_options_builder { options._dayfirst = val; return *this; } + /** + * @brief Set whether to use the experimental reader. + * + * @param val Boolean value to enable/disable experimental parsing + * @return this for chaining + */ + json_reader_options_builder& experimental(bool val) + { + options._experimental = val; + return *this; + } /** * @brief move json_reader_options member once it's built. diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp new file mode 100644 index 00000000000..fef5aa7d794 --- /dev/null +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "read_json.hpp" + +#include + +namespace cudf::io::detail::json::experimental { + +table_with_metadata read_json(std::vector>& sources, + json_reader_options const& reader_opts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FAIL("Not implemented"); +} + +} // namespace cudf::io::detail::json::experimental diff --git a/cpp/src/io/json/experimental/read_json.hpp b/cpp/src/io/json/experimental/read_json.hpp new file mode 100644 index 00000000000..9c39315da30 --- /dev/null +++ b/cpp/src/io/json/experimental/read_json.hpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::io::detail::json::experimental { + +table_with_metadata read_json(std::vector>& sources, + json_reader_options const& reader_opts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 052c51351a1..7e6be190acb 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -16,6 +16,8 @@ #include "json_gpu.hpp" +#include "experimental/read_json.hpp" + #include #include @@ -571,6 +573,10 @@ table_with_metadata read_json(std::vector>& sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + if (reader_opts.is_enabled_experimental()) { + return experimental::read_json(sources, reader_opts, stream, mr); + } + CUDF_EXPECTS(not sources.empty(), "No sources were defined"); CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index 2c65e329bb0..6e240d00349 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -24,6 +24,7 @@ cdef extern from "cudf/io/json.hpp" \ size_type get_byte_range_size() except+ bool is_enabled_lines() except+ bool is_enabled_dayfirst() except+ + bool is_enabled_experimental() except+ # setter void set_dtypes(vector[data_type] types) except+ @@ -35,6 +36,7 @@ cdef extern from "cudf/io/json.hpp" \ void set_byte_range_size(size_type size) except+ void enable_lines(bool val) except+ void enable_dayfirst(bool val) except+ + void enable_experimental(bool val) except+ @staticmethod json_reader_options_builder builder( @@ -70,6 +72,9 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& dayfirst( bool val ) except+ + json_reader_options_builder& experimental( + bool val + ) except+ json_reader_options 
build() except+ diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 263d70afe26..89057e61b6b 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -31,7 +31,8 @@ cpdef read_json(object filepaths_or_buffers, object dtype, bool lines, object compression, - object byte_range): + object byte_range, + bool experimental): """ Cython function to call into libcudf API, see `read_json`. @@ -98,6 +99,7 @@ cpdef read_json(object filepaths_or_buffers, .lines(c_lines) .byte_range_offset(c_range_offset) .byte_range_size(c_range_size) + .experimental(experimental) .build() ) if is_list_like_dtypes: diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 869e055decf..f7c5c36edc5 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -27,7 +27,7 @@ def read_json( raise ValueError("cudf engine only supports JSON Lines format") if engine == "auto": engine = "cudf" if lines else "pandas" - if engine == "cudf": + if engine == "cudf" or engine == "cudf_experimental": # Multiple sources are passed as a list. If a single source is passed, # wrap it in a list for unified processing downstream. if not is_list_like(path_or_buf): @@ -56,7 +56,12 @@ def read_json( return cudf.DataFrame._from_data( *libjson.read_json( - filepaths_or_buffers, dtype, lines, compression, byte_range + filepaths_or_buffers, + dtype, + lines, + compression, + byte_range, + engine == "cudf_experimental", ) ) else: diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 3771587eb47..d3c41de842a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -463,7 +463,7 @@ function or `StringIO`). Multiple inputs may be provided as a list. If a list is specified each list entry may be of a different input type as long as each input is of a valid type and all input JSON schema(s) match. 
-engine : {{ 'auto', 'cudf', 'pandas' }}, default 'auto' +engine : {{ 'auto', 'cudf', 'cudf_experimental', 'pandas' }}, default 'auto' Parser engine to use. If 'auto' is passed, the engine will be automatically selected based on the other parameters. orient : string, From 76b283475bb2ded9622c8e9f1cae63a562db969b Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 22 Jul 2022 11:58:07 -0700 Subject: [PATCH 02/40] doc fix --- cpp/include/cudf/io/json.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 01334060063..72d5fc9c4a6 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -80,7 +80,7 @@ class json_reader_options { // Whether to parse dates as DD/MM versus MM/DD bool _dayfirst = false; - // Whether to parse dates as DD/MM versus MM/DD + // Whether to use the experimental reader bool _experimental = false; /** @@ -255,7 +255,7 @@ class json_reader_options { /** * @brief Set whether to use the experimental reader. * - * @param val Boolean value to enable/disable the experimental readers + * @param val Boolean value to enable/disable the experimental reader */ void enable_experimental(bool val) { _experimental = val; } }; From f5464f654f606566ca3701cbf9ec949cf4c1e6ce Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 22 Jul 2022 12:10:15 -0700 Subject: [PATCH 03/40] copyright year --- python/cudf/cudf/_lib/cpp/io/json.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index 6e240d00349..bc9d87a5cbf 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from libc.stdint cimport uint8_t from libcpp cimport bool From 2ca0ac0442c26a10ccce4e4bef42abaec016c0d1 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 25 Jul 2022 13:27:38 -0700 Subject: [PATCH 04/40] newline Co-authored-by: Bradley Dice --- cpp/include/cudf/io/json.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 72d5fc9c4a6..73724b99589 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -364,6 +364,7 @@ class json_reader_options_builder { options._dayfirst = val; return *this; } + /** * @brief Set whether to use the experimental reader. * From 3ee7a5accaccc005445f7564c4d03df97eebb4d1 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 25 Jul 2022 13:32:45 -0700 Subject: [PATCH 05/40] use span --- cpp/src/io/json/experimental/read_json.cpp | 2 +- cpp/src/io/json/experimental/read_json.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index fef5aa7d794..146eaf203e4 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -20,7 +20,7 @@ namespace cudf::io::detail::json::experimental { -table_with_metadata read_json(std::vector>& sources, +table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/json/experimental/read_json.hpp b/cpp/src/io/json/experimental/read_json.hpp index 9c39315da30..c9f74b2cc41 100644 --- a/cpp/src/io/json/experimental/read_json.hpp +++ b/cpp/src/io/json/experimental/read_json.hpp @@ -19,16 +19,16 @@ #include #include #include +#include #include #include #include -#include namespace cudf::io::detail::json::experimental { -table_with_metadata read_json(std::vector>& sources, +table_with_metadata read_json(host_span> sources, json_reader_options const& 
reader_opts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From fcc90c5a3a390165daea1f19d588f8c2134a7c55 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 26 Jul 2022 19:08:19 -0700 Subject: [PATCH 06/40] options check + decompression --- cpp/include/cudf/io/types.hpp | 1 + cpp/src/io/json/experimental/read_json.cpp | 46 +++++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index e9a93894f7d..7520ca107cc 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -21,6 +21,7 @@ #pragma once +#include #include #include diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index 146eaf203e4..fbe9b5f6112 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -17,15 +17,59 @@ #include "read_json.hpp" #include +#include namespace cudf::io::detail::json::experimental { +table_with_metadata read_nested_json(host_span input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FAIL("Not implemented"); +} + +std::vector ingest_raw_input(host_span> sources, + compression_type compression) +{ + // Iterate through the user defined sources and read the contents into the local buffer + size_t total_source_size = 0; + for (const auto& source : sources) { + total_source_size += source->size(); + } + + auto buffer = std::vector(total_source_size); + + size_t bytes_read = 0; + for (const auto& source : sources) { + if (not source->is_empty()) { + auto const destination = buffer.data() + bytes_read; + bytes_read += source->host_read(0, source->size(), destination); + } + } + + if (compression == compression_type::NONE) { + return buffer; + } else { + return decompress(compression, buffer); + } +} + table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, 
rmm::mr::device_memory_resource* mr) { - CUDF_FAIL("Not implemented"); + auto const dtypes_empty = + std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); + CUDF_EXPECTS(dtypes_empty, "user specified dtypes are not yet supported"); + CUDF_EXPECTS(not reader_opts.is_enabled_lines(), "JSON Lines format is not yet supported"); + CUDF_EXPECTS(reader_opts.get_byte_range_offset() == 0 and reader_opts.get_byte_range_size() == 0, + "specifying a byte range is not yet supported"); + + auto const buffer = ingest_raw_input(sources, reader_opts.get_compression()); + auto data = host_span(reinterpret_cast(buffer.data()), buffer.size()); + + return read_nested_json(data, stream, mr); } } // namespace cudf::io::detail::json::experimental From 22b5a46c8a8bb6dfbdaad9827452dbc1792be375 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 15 Aug 2022 07:29:48 -0700 Subject: [PATCH 07/40] adds support for ndjson --- cpp/src/io/json/nested_json.hpp | 14 +- cpp/src/io/json/nested_json_gpu.cu | 371 +++++++++++++++++++---------- cpp/tests/io/nested_json_test.cpp | 61 ++++- 3 files changed, 316 insertions(+), 130 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 03acd393594..d8886bc0928 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -267,14 +268,14 @@ namespace detail { * At this stage, we do not perform bracket matching, i.e., we do not verify whether a closing * bracket would actually pop a the corresponding opening brace. 
* - * @param[in] d_json_in The string of input characters + * @param[in] json_in The string of input characters * @param[out] d_top_of_stack Will be populated with what-is-on-top-of-the-stack for any given input * character of \p d_json_in, where a '{' represents that the corresponding input character is * within the context of a struct, a '[' represents that it is within the context of an array, and a * '_' symbol that it is at the root of the JSON. * @param[in] stream The cuda stream to dispatch GPU kernels to */ -void get_stack_context(device_span d_json_in, +void get_stack_context(device_span json_in, SymbolT* d_top_of_stack, rmm::cuda_stream_view stream); @@ -282,14 +283,17 @@ void get_stack_context(device_span d_json_in, * @brief Parses the given JSON string and emits a sequence of tokens that demarcate relevant * sections from the input. * - * @param[in] d_json_in The JSON input + * @param[in] json_in The JSON input + * @param[in] options Parsing options specifying the parsing behaviour * @param[out] d_tokens Device memory to which the parsed tokens are written * @param[out] d_tokens_indices Device memory to which the indices are written, where each index * represents the offset within \p d_json_in that cause the input being written * @param[out] d_num_written_tokens The total number of tokens that were parsed * @param[in] stream The CUDA stream to which kernels are dispatched */ -void get_token_stream(device_span d_json_in, + +void get_token_stream(device_span json_in, + cudf::io::json_reader_options const& options, PdaTokenT* d_tokens, SymbolOffsetT* d_tokens_indices, SymbolOffsetT* d_num_written_tokens, @@ -299,12 +303,14 @@ void get_token_stream(device_span d_json_in, * @brief Parses the given JSON string and generates table from the given input. 
* * @param input The JSON input + * @param options Parsing options specifying the parsing behaviour * @param stream The CUDA stream to which kernels are dispatched * @param mr Optional, resource with which to allocate. * @return The data parsed from the given JSON input */ table_with_metadata parse_nested_json( host_span input, + cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 5e293f8a750..f8a862e2c65 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -164,6 +165,8 @@ enum class symbol_group_id : PdaSymbolGroupIdT { COLON, /// Whitespace WHITE_SPACE, + /// Linebreak + LINE_BREAK, /// Other (any input symbol not assigned to one of the above symbol groups) OTHER, /// Total number of symbol groups amongst which to differentiate @@ -206,7 +209,7 @@ static __constant__ PdaSymbolGroupIdT tos_sg_to_pda_sgid[] = { static_cast(symbol_group_id::OTHER), static_cast(symbol_group_id::OTHER), static_cast(symbol_group_id::WHITE_SPACE), - static_cast(symbol_group_id::WHITE_SPACE), + static_cast(symbol_group_id::LINE_BREAK), static_cast(symbol_group_id::OTHER), static_cast(symbol_group_id::OTHER), static_cast(symbol_group_id::WHITE_SPACE), @@ -403,62 +406,62 @@ constexpr auto PD_NUM_STATES = static_cast(pda_state_t::PD_NUM_STATES); // The starting state of the pushdown automaton constexpr auto start_state = static_cast(pda_state_t::PD_BOV); -// Identity symbol to symbol group lookup table -std::vector> const pda_sgids{ - {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12}, {13}, {14}, - {15}, {16}, {17}, {18}, {19}, {20}, {21}, {22}, {23}, {24}, {25}, {26}, {27}, {28}, {29}}; - /** * @brief Getting the transition table */ 
-auto get_transition_table() +auto get_transition_table(bool newline_delimited_json) { + static_assert(static_cast(stack_symbol_group_id::STACK_ROOT) == 0); + static_assert(static_cast(stack_symbol_group_id::STACK_LIST) == 1); + static_assert(static_cast(stack_symbol_group_id::STACK_STRUCT) == 2); + + auto const PD_ANL = newline_delimited_json ? PD_BOV : PD_PVL; std::array, PD_NUM_STATES> pda_tt; - // { [ } ] " \ , : space other + // { [ } ] " \ , : space newline other pda_tt[static_cast(pda_state_t::PD_BOV)] = { - PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON, - PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON, - PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON}; + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON}; pda_tt[static_cast(pda_state_t::PD_BOA)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_LON, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_ERR}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOA, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOA, PD_ERR}; pda_tt[static_cast(pda_state_t::PD_LON)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_LON, - PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_LON, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_LON}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_PVL, PD_LON, + PD_ERR, PD_ERR, 
PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_PVL, PD_LON}; pda_tt[static_cast(pda_state_t::PD_STR)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR}; + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; pda_tt[static_cast(pda_state_t::PD_SCE)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; pda_tt[static_cast(pda_state_t::PD_PVL)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_ERR, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_ERR}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ANL, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_PVL, PD_ERR, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_PVL, PD_ERR}; pda_tt[static_cast(pda_state_t::PD_BFN)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, 
PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_BFN, PD_ERR}; pda_tt[static_cast(pda_state_t::PD_FLN)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; pda_tt[static_cast(pda_state_t::PD_FNE)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; pda_tt[static_cast(pda_state_t::PD_PFN)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_PFN, PD_ERR}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, 
PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_PFN, PD_PFN, PD_ERR}; pda_tt[static_cast(pda_state_t::PD_ERR)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR}; return pda_tt; } @@ -468,7 +471,8 @@ auto get_transition_table() auto get_translation_table() { std::array, NUM_PDA_SGIDS>, PD_NUM_STATES> pda_tlt; - pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{{token_t::StructBegin}, + pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{/*ROOT*/ + {token_t::StructBegin}, {token_t::ListBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -477,7 +481,9 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {}, + {}, {token_t::ValueBegin}, + /*LIST*/ {token_t::StructBegin}, {token_t::ListBegin}, {token_t::ErrorBegin}, @@ -487,7 +493,9 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {}, + {}, {token_t::ValueBegin}, + /*STRUCT*/ {token_t::StructBegin}, {token_t::ListBegin}, {token_t::ErrorBegin}, @@ -497,8 +505,10 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {}, + {}, {token_t::ValueBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_BOA)] = {{{token_t::ErrorBegin}, + pda_tlt[static_cast(pda_state_t::PD_BOA)] = {{/*ROOT*/ + {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -508,6 +518,8 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + /*LIST*/ {token_t::StructBegin}, 
{token_t::ListBegin}, {token_t::ErrorBegin}, @@ -517,7 +529,9 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {}, + {}, {token_t::ValueBegin}, + /*STRUCT*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::StructEnd}, @@ -527,8 +541,10 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {}, + {}, {token_t::ErrorBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_LON)] = {{{token_t::ErrorBegin}, + pda_tlt[static_cast(pda_state_t::PD_LON)] = {{/*ROOT*/ + {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -537,7 +553,9 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ValueEnd}, + {token_t::ValueEnd}, {}, + /*LIST*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -547,7 +565,9 @@ auto get_translation_table() {token_t::ValueEnd}, {token_t::ErrorBegin}, {token_t::ValueEnd}, + {token_t::ValueEnd}, {}, + /*STRUCT*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ValueEnd, token_t::StructEnd}, @@ -557,15 +577,82 @@ auto get_translation_table() {token_t::ValueEnd}, {token_t::ErrorBegin}, {token_t::ValueEnd}, + {token_t::ValueEnd}, + {}}}; + pda_tlt[static_cast(pda_state_t::PD_STR)] = {{/*ROOT*/ + {}, + {}, + {}, + {}, + {token_t::StringEnd}, + {}, + {}, + {}, + {}, + {}, + {}, + /*LIST*/ + {}, + {}, + {}, + {}, + {token_t::StringEnd}, + {}, + {}, + {}, + {}, + {}, + {}, + /*STRUCT*/ + {}, + {}, + {}, + {}, + {token_t::StringEnd}, + {}, + {}, + {}, + {}, + {}, + {}}}; + pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{/*ROOT*/ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + /*LIST*/ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + /*STRUCT*/ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, {}}}; - pda_tlt[static_cast(pda_state_t::PD_STR)] = { - {{}, {}, {}, {}, {token_t::StringEnd}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {token_t::StringEnd}, {}, {}, {}, {}, 
{}, - {}, {}, {}, {}, {token_t::StringEnd}, {}, {}, {}, {}, {}}}; - pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {}, {}, {}, {}, {}, {}}}; - pda_tlt[static_cast(pda_state_t::PD_PVL)] = {{{token_t::ErrorBegin}, + pda_tlt[static_cast(pda_state_t::PD_PVL)] = {{/*ROOT*/ + {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -574,7 +661,9 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {}, + {}, {token_t::ErrorBegin}, + /*LIST*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -584,7 +673,9 @@ auto get_translation_table() {}, {token_t::ErrorBegin}, {}, + {}, {token_t::ErrorBegin}, + /*STRUCT*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::StructEnd}, @@ -594,8 +685,11 @@ auto get_translation_table() {}, {token_t::ErrorBegin}, {}, + {}, {token_t::ErrorBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_BFN)] = {{{token_t::ErrorBegin}, + pda_tlt[static_cast(pda_state_t::PD_BFN)] = {{/*ROOT*/ + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -605,6 +699,7 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + /*LIST*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -616,6 +711,8 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + /*STRUCT*/ + {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -624,8 +721,9 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {}, + {}, {token_t::ErrorBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{{token_t::ErrorBegin}, + pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{/*ROOT*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -637,6 +735,7 @@ auto 
get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + /*LIST*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -645,6 +744,10 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + /*STRUCT*/ {}, {}, {}, @@ -654,8 +757,10 @@ auto get_translation_table() {}, {}, {}, + {}, {}}}; - pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{{token_t::ErrorBegin}, + pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{/*ROOT*/ + {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -666,6 +771,7 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + /*LIST*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -675,6 +781,10 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + /*STRUCT*/ + {}, {}, {}, {}, @@ -685,7 +795,8 @@ auto get_translation_table() {}, {}, {}}}; - pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{{token_t::ErrorBegin}, + pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{/*ROOT*/ + {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -696,6 +807,7 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + /*LIST*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, @@ -707,17 +819,54 @@ auto get_translation_table() {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + /*STRUCT*/ {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {}, {}, {}, {token_t::ErrorBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, - {}, 
{}, {}, {}, {}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {}, {}, {}, {}, {}, {}}}; + pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{/*ROOT*/ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + /*LIST*/ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + /*STRUCT*/ + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}}}; return pda_tlt; } @@ -792,11 +941,14 @@ void get_stack_context(device_span json_in, // TODO: return pair of device_uvector instead of passing pre-allocated pointers. void get_token_stream(device_span json_in, + cudf::io::json_reader_options const& options, PdaTokenT* d_tokens, SymbolOffsetT* d_tokens_indices, SymbolOffsetT* d_num_written_tokens, rmm::cuda_stream_view stream) { + auto const new_line_delimited_json = options.is_enabled_lines(); + // Memory holding the top-of-stack stack context for the input rmm::device_uvector stack_op_indices{json_in.size(), stream}; @@ -820,8 +972,12 @@ void get_token_stream(device_span json_in, tokenizer_pda::pda_state_t::PD_NUM_STATES)>; // Instantiating PDA transducer - ToTokenStreamFstT json_to_tokens_fst{tokenizer_pda::pda_sgids, - tokenizer_pda::get_transition_table(), + std::vector> pda_sgid_identity{tokenizer_pda::NUM_PDA_SGIDS}; + std::generate(std::begin(pda_sgid_identity), std::end(pda_sgid_identity), [i = 0]() mutable { + return std::vector{static_cast(i++)}; + }); + ToTokenStreamFstT json_to_tokens_fst{pda_sgid_identity, + tokenizer_pda::get_transition_table(new_line_delimited_json), tokenizer_pda::get_translation_table(), stream}; @@ -850,6 +1006,7 @@ void make_json_column(json_column& root_column, std::stack& current_data_path, host_span input, device_span d_input, + cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream) { // Default name for a list's child column @@ -862,6 +1019,7 @@ void make_json_column(json_column& root_column, // Parse the JSON and get the token stream get_token_stream(d_input, + options, tokens_gpu.device_ptr(), 
token_indices_gpu.device_ptr(), num_tokens_out.device_ptr(), @@ -897,15 +1055,6 @@ void make_json_column(json_column& root_column, }; }; - // Whether this token is a beginning-of-list or beginning-of-struct token - auto is_nested_token = [](PdaTokenT const token) { - switch (token) { - case token_t::StructBegin: - case token_t::ListBegin: return true; - default: return false; - }; - }; - // Skips the quote char if the token is a beginning-of-string or beginning-of-field-name token auto get_token_index = [](PdaTokenT const token, SymbolOffsetT const token_index) { constexpr SymbolOffsetT skip_quote_char = 1; @@ -1061,7 +1210,6 @@ void make_json_column(json_column& root_column, std::size_t offset = 0; // Giving names to magic constants - constexpr uint32_t row_offset_zero = 0; constexpr uint32_t zero_child_count = 0; //-------------------------------------------------------------------------------- @@ -1071,51 +1219,6 @@ void make_json_column(json_column& root_column, CUDF_EXPECTS(num_tokens_out[0] > 0, "Empty JSON input not supported"); CUDF_EXPECTS(is_valid_root_token(tokens_gpu[offset]), "Invalid beginning of JSON document"); - // The JSON root is either a struct or list - if (is_nested_token(tokens_gpu[offset])) { - // Initialize the root column and append this row to it - root_column.append_row(row_offset_zero, - token_to_column_type(tokens_gpu[offset]), - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), - 0); - - // Push the root node onto the stack for the data path - current_data_path.push({&root_column, row_offset_zero, nullptr, zero_child_count}); - - // Continue with the next token from the token stream - offset++; - } - // The JSON is a simple scalar value -> create simple table and return - else { - constexpr SymbolOffsetT max_tokens_for_scalar_value = 2; - CUDF_EXPECTS(num_tokens_out[0] <= max_tokens_for_scalar_value, - "Invalid JSON format. 
Expected just a scalar value."); - - // If this isn't the only token, verify the subsequent token is the correct end-of-* partner - if ((offset + 1) < num_tokens_out[0]) { - CUDF_EXPECTS(tokens_gpu[offset + 1] == end_of_partner(tokens_gpu[offset]), - "Invalid JSON token sequence"); - } - - // The offset to the first symbol from the JSON input associated with the current token - auto const& token_begin_offset = get_token_index(tokens_gpu[offset], token_indices_gpu[offset]); - - // The offset to one past the last symbol associated with the current token - // Literals without trailing space are missing the corresponding end-of-* counterpart. - auto const& token_end_offset = - (offset + 1 < num_tokens_out[0]) - ? get_token_index(tokens_gpu[offset + 1], token_indices_gpu[offset + 1]) - : input.size(); - - root_column.append_row(row_offset_zero, - json_col_t::StringColumn, - token_begin_offset, - token_end_offset, - zero_child_count); - return; - } - while (offset < num_tokens_out[0]) { // Verify there's at least the JSON root node left on the stack to which we can append data CUDF_EXPECTS(current_data_path.size() > 0, "Invalid JSON structure"); @@ -1215,6 +1318,7 @@ void make_json_column(json_column& root_column, else if (token == token_t::ErrorBegin) { #ifdef NJP_DEBUG_PRINT std::cout << "[ErrorBegin]\n"; + std::cout << "@" << get_token_index(tokens_gpu[offset], token_indices_gpu[offset]); #endif CUDF_FAIL("Parser encountered an invalid format."); } @@ -1371,26 +1475,51 @@ std::pair, std::vector> json_column_to } table_with_metadata parse_nested_json(host_span input, + cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + auto const new_line_delimited_json = options.is_enabled_lines(); + // Allocate device memory for the JSON input & copy over to device rmm::device_uvector d_input = cudf::detail::make_device_uvector_async(input, stream); // Get internal JSON column json_column root_column{}; std::stack 
data_path{}; - make_json_column(root_column, data_path, input, d_input, stream); + + constexpr uint32_t row_offset_zero = 0; + constexpr uint32_t token_begin_offset_zero = 0; + constexpr uint32_t token_end_offset_zero = 0; + constexpr uint32_t node_init_child_count_zero = 0; + + // We initialize the very root node and root column that represents a list column that contains + // all the values found at the root "level" of the given JSON string Initialize the root column + // For JSON lines: we expect to find a list of values that all will be inserted into this list + // column. + // For regular JSON: we expect to have only a single value (single row) that will be inserted into + // this column + root_column.append_row( + row_offset_zero, json_col_t::ListColumn, token_begin_offset_zero, token_end_offset_zero, 1); + + // Push the root node onto the stack for the data path + data_path.push({&root_column, row_offset_zero, nullptr, node_init_child_count_zero}); + + make_json_column(root_column, data_path, input, d_input, options, stream); + + // data_root refers to the root column of the data represented by the given JSON string + auto const& data_root = + new_line_delimited_json ? 
root_column : root_column.child_columns.begin()->second; // Verify that we were in fact given a list of structs (or in JSON speech: an array of objects) auto constexpr single_child_col_count = 1; - CUDF_EXPECTS(root_column.type == json_col_t::ListColumn and - root_column.child_columns.size() == single_child_col_count and - root_column.child_columns.begin()->second.type == json_col_t::StructColumn, + CUDF_EXPECTS(data_root.type == json_col_t::ListColumn and + data_root.child_columns.size() == single_child_col_count and + data_root.child_columns.begin()->second.type == json_col_t::StructColumn, "Currently the nested JSON parser only supports an array of (nested) objects"); // Slice off the root list column, which has only a single row that contains all the structs - auto const& root_struct_col = root_column.child_columns.begin()->second; + auto const& root_struct_col = data_root.child_columns.begin()->second; // Initialize meta data to be populated while recursing through the tree of columns std::vector> out_columns; diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index d426acf26f9..a217b2f7d18 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -251,6 +252,9 @@ TEST_F(JsonTest, TokenStream) rmm::cuda_stream stream{}; rmm::cuda_stream_view stream_view(stream); + // Default parsing options + cudf::io::json_reader_options default_options{}; + // Test input std::string input = R"( [{)" R"("category": "reference",)" @@ -282,6 +286,7 @@ TEST_F(JsonTest, TokenStream) // Parse the JSON and get the token stream cuio_json::detail::get_token_stream(d_input, + default_options, tokens_gpu.device_ptr(), token_indices_gpu.device_ptr(), num_tokens_out.device_ptr(), @@ -342,10 +347,13 @@ TEST_F(JsonTest, ExtractColumn) rmm::cuda_stream stream{}; rmm::cuda_stream_view stream_view(stream); + // Default parsing options + cudf::io::json_reader_options 
default_options{}; + std::string input = R"( [{"a":0.0, "b":1.0}, {"a":0.1, "b":1.1}, {"a":0.2, "b":1.2}] )"; // Get the JSON's tree representation auto const cudf_table = cuio_json::detail::parse_nested_json( - cudf::host_span{input.data(), input.size()}, stream_view); + cudf::host_span{input.data(), input.size()}, default_options, stream_view); auto const expected_col_count = 2; auto const first_column_index = 0; @@ -366,6 +374,9 @@ TEST_F(JsonTest, UTF_JSON) rmm::cuda_stream stream{}; rmm::cuda_stream_view stream_view(stream); + // Default parsing options + cudf::io::json_reader_options default_options{}; + // Only ASCII string std::string ascii_pass = R"([ {"a":1,"b":2,"c":[3], "d": {}}, @@ -375,7 +386,8 @@ TEST_F(JsonTest, UTF_JSON) {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}])"; - CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(ascii_pass, stream_view)); + CUDF_EXPECT_NO_THROW( + cuio_json::detail::parse_nested_json(ascii_pass, default_options, stream_view)); // utf-8 string that fails parsing. std::string utf_failed = R"([ @@ -385,7 +397,8 @@ TEST_F(JsonTest, UTF_JSON) {"a":1,"b":8.0,"c":null, "d": {}}, {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "filip ʒakotɛ"}}])"; - CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(utf_failed, stream_view)); + CUDF_EXPECT_NO_THROW( + cuio_json::detail::parse_nested_json(utf_failed, default_options, stream_view)); // utf-8 string that passes parsing. 
std::string utf_pass = R"([ @@ -396,7 +409,8 @@ TEST_F(JsonTest, UTF_JSON) {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}, {"a":1,"b":NaN,"c":[null, null], "d": {"year": 2, "author": "filip ʒakotɛ"}}])"; - CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(utf_pass, stream_view)); + CUDF_EXPECT_NO_THROW( + cuio_json::detail::parse_nested_json(utf_pass, default_options, stream_view)); } TEST_F(JsonTest, FromParquet) @@ -410,6 +424,9 @@ TEST_F(JsonTest, FromParquet) rmm::cuda_stream stream{}; rmm::cuda_stream_view stream_view(stream); + // Default parsing options + cudf::io::json_reader_options default_options{}; + // Binary parquet data containing the same data as the data represented by the JSON string. // We could add a dataset to include this file, but we don't want tests in cudf to have data. const unsigned char parquet_data[] = { @@ -496,7 +513,7 @@ TEST_F(JsonTest, FromParquet) // Read in the data via the JSON parser auto const cudf_table = cuio_json::detail::parse_nested_json( - cudf::host_span{input.data(), input.size()}, stream_view); + cudf::host_span{input.data(), input.size()}, default_options, stream_view); // Verify that the data read via parquet matches the data read via JSON CUDF_TEST_EXPECT_TABLES_EQUAL(cudf_table.tbl->view(), result.tbl->view()); @@ -504,3 +521,37 @@ TEST_F(JsonTest, FromParquet) // Verify that the schema read via parquet matches the schema read via JSON cudf::test::expect_metadata_equal(cudf_table.metadata, result.metadata); } + +TEST_F(JsonTest, JsonLines) +{ + // Prepare cuda stream for data transfers & kernels + rmm::cuda_stream stream{}; + rmm::cuda_stream_view stream_view(stream); + + // Default parsing options + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options_builder{}.lines(true); + + using cuio_json::SymbolT; + + std::string json_string = + R"({"a":"a0"} + {"a":"a1"} + {"a":"a2", "b":"b2"} + {"a":"a3", "c":"c3"} + {"a":"a4"})"; + 
+ cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.c_str(), json_string.size()}) + .lines(true); + cudf::io::table_with_metadata old_reader_table = cudf::io::read_json(in_options); + + auto const new_reader_table = cuio_json::detail::parse_nested_json( + cudf::host_span{json_string.data(), json_string.size()}, + json_lines_options, + stream_view); + + // Verify that the data read via parquet matches the data read via JSON + CUDF_TEST_EXPECT_TABLES_EQUAL(old_reader_table.tbl->view(), new_reader_table.tbl->view()); +} From 87fce7d05c3a5377a2a68b2a374db1ebd127ee54 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 15 Aug 2022 10:48:12 -0700 Subject: [PATCH 08/40] addresses outstanding todo --- cpp/src/io/json/nested_json.hpp | 26 ++++++------- cpp/src/io/json/nested_json_gpu.cu | 61 ++++++++++++++++-------------- cpp/tests/io/nested_json_test.cpp | 26 +++++-------- 3 files changed, 53 insertions(+), 60 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index d8886bc0928..1048f9fcedd 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -285,27 +285,25 @@ void get_stack_context(device_span json_in, * * @param[in] json_in The JSON input * @param[in] options Parsing options specifying the parsing behaviour - * @param[out] d_tokens Device memory to which the parsed tokens are written - * @param[out] d_tokens_indices Device memory to which the indices are written, where each index * represents the offset within \p d_json_in that cause the input being written - * @param[out] d_num_written_tokens The total number of tokens that were parsed * @param[in] stream The CUDA stream to which kernels are dispatched + * @param[in] mr Optional, resource with which to allocate + * @return Pair of device vectors, where the first vector represents the token types and the second + * vector represents 
the index within the input corresponding to each token */ - -void get_token_stream(device_span json_in, - cudf::io::json_reader_options const& options, - PdaTokenT* d_tokens, - SymbolOffsetT* d_tokens_indices, - SymbolOffsetT* d_num_written_tokens, - rmm::cuda_stream_view stream); +std::pair, rmm::device_uvector> get_token_stream( + device_span json_in, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Parses the given JSON string and generates table from the given input. * - * @param input The JSON input - * @param options Parsing options specifying the parsing behaviour - * @param stream The CUDA stream to which kernels are dispatched - * @param mr Optional, resource with which to allocate. + * @param[in] input The JSON input + * @param[in] options Parsing options specifying the parsing behaviour + * @param[in] stream The CUDA stream to which kernels are dispatched + * @param[in] mr Optional, resource with which to allocate * @return The data parsed from the given JSON input */ table_with_metadata parse_nested_json( diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index f8a862e2c65..34c85402284 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -939,14 +939,17 @@ void get_stack_context(device_span json_in, stream); } -// TODO: return pair of device_uvector instead of passing pre-allocated pointers. 
-void get_token_stream(device_span json_in, - cudf::io::json_reader_options const& options, - PdaTokenT* d_tokens, - SymbolOffsetT* d_tokens_indices, - SymbolOffsetT* d_num_written_tokens, - rmm::cuda_stream_view stream) +std::pair, rmm::device_uvector> get_token_stream( + device_span json_in, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { + constexpr std::size_t single_item_count = 1ULL; + rmm::device_uvector tokens{json_in.size(), stream, mr}; + rmm::device_uvector tokens_indices{json_in.size(), stream, mr}; + rmm::device_uvector num_written_tokens{single_item_count, stream}; + auto const new_line_delimited_json = options.is_enabled_lines(); // Memory holding the top-of-stack stack context for the input @@ -984,11 +987,17 @@ void get_token_stream(device_span json_in, // Perform a PDA-transducer pass json_to_tokens_fst.Transduce(pda_sgids.begin(), static_cast(json_in.size()), - d_tokens, - d_tokens_indices, - d_num_written_tokens, + tokens.data(), + tokens_indices.data(), + num_written_tokens.data(), tokenizer_pda::start_state, stream); + + auto num_total_tokens = num_written_tokens.front_element(stream); + tokens.resize(num_total_tokens, stream); + tokens_indices.resize(num_total_tokens, stream); + + return std::make_pair(std::move(tokens), std::move(tokens_indices)); } /** @@ -1007,28 +1016,20 @@ void make_json_column(json_column& root_column, host_span input, device_span d_input, cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { // Default name for a list's child column std::string const list_child_name = "element"; - constexpr std::size_t single_item = 1; - hostdevice_vector tokens_gpu{input.size(), stream}; - hostdevice_vector token_indices_gpu{input.size(), stream}; - hostdevice_vector num_tokens_out{single_item, stream}; - // Parse the JSON and 
get the token stream - get_token_stream(d_input, - options, - tokens_gpu.device_ptr(), - token_indices_gpu.device_ptr(), - num_tokens_out.device_ptr(), - stream); + const auto [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr); // Copy the JSON tokens to the host - token_indices_gpu.device_to_host(stream); - tokens_gpu.device_to_host(stream); - num_tokens_out.device_to_host(stream); + thrust::host_vector tokens_gpu = + cudf::detail::make_host_vector_async(d_tokens_gpu, stream); + thrust::host_vector token_indices_gpu = + cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); // Make sure tokens have been copied to the host stream.synchronize(); @@ -1216,10 +1217,12 @@ void make_json_column(json_column& root_column, // INITIALIZE JSON ROOT NODE //-------------------------------------------------------------------------------- // The JSON root may only be a struct, list, string, or value node - CUDF_EXPECTS(num_tokens_out[0] > 0, "Empty JSON input not supported"); + CUDF_EXPECTS(tokens_gpu.size() == token_indices_gpu.size(), + "Unexpected mismatch in number of token types and token indices"); + CUDF_EXPECTS(tokens_gpu.size() > 0, "Empty JSON input not supported"); CUDF_EXPECTS(is_valid_root_token(tokens_gpu[offset]), "Invalid beginning of JSON document"); - while (offset < num_tokens_out[0]) { + while (offset < tokens_gpu.size()) { // Verify there's at least the JSON root node left on the stack to which we can append data CUDF_EXPECTS(current_data_path.size() > 0, "Invalid JSON structure"); @@ -1327,7 +1330,7 @@ void make_json_column(json_column& root_column, else if (token == token_t::FieldNameBegin or token == token_t::StringBegin or token == token_t::ValueBegin) { // Verify that this token has the right successor to build a correct (being, end) token pair - CUDF_EXPECTS((offset + 1) < num_tokens_out[0], "Invalid JSON token sequence"); + CUDF_EXPECTS((offset + 1) < tokens_gpu.size(), "Invalid JSON token sequence"); 
CUDF_EXPECTS(tokens_gpu[offset + 1] == end_of_partner(token), "Invalid JSON token sequence"); // The offset to the first symbol from the JSON input associated with the current token @@ -1505,7 +1508,7 @@ table_with_metadata parse_nested_json(host_span input, // Push the root node onto the stack for the data path data_path.push({&root_column, row_offset_zero, nullptr, node_init_child_count_zero}); - make_json_column(root_column, data_path, input, d_input, options, stream); + make_json_column(root_column, data_path, input, d_input, options, stream, mr); // data_root refers to the root column of the data represented by the given JSON string auto const& data_root = diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index a217b2f7d18..cae0083daed 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -246,8 +246,6 @@ TEST_F(JsonTest, TokenStream) using cuio_json::SymbolOffsetT; using cuio_json::SymbolT; - constexpr std::size_t single_item = 1; - // Prepare cuda stream for data transfers & kernels rmm::cuda_stream stream{}; rmm::cuda_stream_view stream_view(stream); @@ -280,22 +278,15 @@ TEST_F(JsonTest, TokenStream) cudaMemcpyHostToDevice, stream.value())); - hostdevice_vector tokens_gpu{input.size(), stream_view}; - hostdevice_vector token_indices_gpu{input.size(), stream_view}; - hostdevice_vector num_tokens_out{single_item, stream_view}; - // Parse the JSON and get the token stream - cuio_json::detail::get_token_stream(d_input, - default_options, - tokens_gpu.device_ptr(), - token_indices_gpu.device_ptr(), - num_tokens_out.device_ptr(), - stream_view); + const auto [d_tokens_gpu, d_token_indices_gpu] = + cuio_json::detail::get_token_stream(d_input, default_options, stream_view); // Copy back the number of tokens that were written - num_tokens_out.device_to_host(stream_view); - tokens_gpu.device_to_host(stream_view); - token_indices_gpu.device_to_host(stream_view); + thrust::host_vector tokens_gpu = + 
cudf::detail::make_host_vector_async(d_tokens_gpu, stream); + thrust::host_vector token_indices_gpu = + cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); // Make sure we copied back all relevant data stream_view.synchronize(); @@ -328,9 +319,10 @@ TEST_F(JsonTest, TokenStream) {267, token_t::StructEnd}, {268, token_t::ListEnd}}; // Verify the number of tokens matches - ASSERT_EQ(golden_token_stream.size(), num_tokens_out[0]); + ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size()); + ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size()); - for (std::size_t i = 0; i < num_tokens_out[0]; i++) { + for (std::size_t i = 0; i < tokens_gpu.size(); i++) { // Ensure the index the tokens are pointing to do match EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i; From 9669c6a1f49eebee8c7c7a5c251ff2e0a54afa98 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 15 Aug 2022 16:39:06 -0700 Subject: [PATCH 09/40] C++ side changes + test --- cpp/src/io/json/experimental/read_json.cpp | 10 ++------ cpp/tests/io/json_test.cpp | 27 ++++++++++++++++++---- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index fbe9b5f6112..0c579cbf035 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -15,19 +15,13 @@ */ #include "read_json.hpp" +#include #include #include namespace cudf::io::detail::json::experimental { -table_with_metadata read_nested_json(host_span input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FAIL("Not implemented"); -} - std::vector ingest_raw_input(host_span> sources, compression_type compression) { @@ -69,7 +63,7 @@ table_with_metadata read_json(host_span> sources, auto const buffer = ingest_raw_input(sources, reader_opts.get_compression()); auto data = host_span(reinterpret_cast(buffer.data()), buffer.size()); - return 
read_nested_json(data, stream, mr); + return cudf::io::json::detail::parse_nested_json(data, stream, mr); } } // namespace cudf::io::detail::json::experimental diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index c8aefece94f..3866def2cdf 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -915,13 +915,30 @@ TEST_F(JsonReaderTest, BadDtypeParams) EXPECT_THROW(cudf_io::read_json(options_map), cudf::logic_error); } -TEST_F(JsonReaderTest, ExperimentalParam) +TEST_F(JsonReaderTest, JsonRecordsBasic) { - cudf_io::json_reader_options const options = - cudf_io::json_reader_options::builder(cudf_io::source_info{nullptr, 0}).experimental(true); + const std::string fname = temp_env->get_temp_dir() + "JsonLinesFileTest.json"; + std::ofstream outfile(fname, std::ofstream::out); + outfile << "[{\"a\":\"11\", \"b\":\"1.1\"},{\"a\":\"22\", \"b\":\"2.2\"}]"; + outfile.close(); + + cudf_io::json_reader_options options = + cudf_io::json_reader_options::builder(cudf_io::source_info{fname}).experimental(true); + auto result = cudf_io::read_json(options); + + EXPECT_EQ(result.tbl->num_columns(), 2); + EXPECT_EQ(result.tbl->num_rows(), 2); - // should throw for now - EXPECT_THROW(cudf_io::read_json(options), cudf::logic_error); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRING); + + EXPECT_EQ(std::string(result.metadata.column_names[0]), "a"); + EXPECT_EQ(std::string(result.metadata.column_names[1]), "b"); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), + cudf::test::strings_column_wrapper({"11", "22"})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), + cudf::test::strings_column_wrapper({"1.1", "2.2"})); } CUDF_TEST_PROGRAM_MAIN() From c9fb5b28d158ff198b042684f4a9d69803c7cb91 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 15 Aug 2022 16:51:58 -0700 Subject: [PATCH 10/40] working Python + test --- 
cpp/src/io/json/nested_json_gpu.cu | 4 +++- python/cudf/cudf/tests/test_json.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 5e293f8a750..bffc8891020 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1395,11 +1395,13 @@ table_with_metadata parse_nested_json(host_span input, // Initialize meta data to be populated while recursing through the tree of columns std::vector> out_columns; std::vector out_column_names; + std::vector out_root_column_names; // Iterate over the struct's child columns and convert to cudf column for (auto const& [col_name, json_col] : root_struct_col.child_columns) { // Insert this columns name into the schema out_column_names.emplace_back(col_name); + out_root_column_names.emplace_back(col_name); // Get this JSON column's cudf column and schema info auto [cudf_col, col_name_info] = json_column_to_cudf_column(json_col, d_input, stream, mr); @@ -1408,7 +1410,7 @@ table_with_metadata parse_nested_json(host_span input, } return table_with_metadata{std::make_unique(std::move(out_columns)), - {{}, out_column_names}}; + {out_root_column_names, out_column_names}}; } } // namespace detail diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 800ed68e8a4..5122c976f27 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -579,3 +579,22 @@ def test_json_experimental(): # should raise an exception, for now with pytest.raises(RuntimeError): cudf.read_json("", engine="cudf_experimental") + + +def test_json_nested_basic(tmpdir): + fname = tmpdir.mkdir("gdf_json").join("tmp_json_nested_basic") + data = { + "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}], + "c2": [["l11", "l21"], ["l12", "l22"]], + } + pdf = pd.DataFrame(data) + pdf.to_json(fname, orient="records") + + with open(fname, "r") as f: + 
print(f.read()) + print(pdf) + + df = cudf.read_json(fname, engine="cudf_experimental", orient="records") + pdf = pd.read_json(fname, orient="records") + + assert_eq(cudf.DataFrame(pdf), df) From 2de91e18bedb8a97c3cd23648f733eaa170849c7 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 15 Aug 2022 17:12:03 -0700 Subject: [PATCH 11/40] clean up --- cpp/src/io/json/experimental/read_json.cpp | 23 ++++++++-------------- cpp/tests/io/json_test.cpp | 2 +- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index 0c579cbf035..2259fcc839a 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -20,32 +20,25 @@ #include #include +#include + namespace cudf::io::detail::json::experimental { std::vector ingest_raw_input(host_span> sources, compression_type compression) { - // Iterate through the user defined sources and read the contents into the local buffer - size_t total_source_size = 0; - for (const auto& source : sources) { - total_source_size += source->size(); - } - + auto const total_source_size = + std::accumulate(sources.begin(), sources.end(), 0ul, [](size_t sum, auto& source) { + return sum + source->size(); + }); auto buffer = std::vector(total_source_size); size_t bytes_read = 0; for (const auto& source : sources) { - if (not source->is_empty()) { - auto const destination = buffer.data() + bytes_read; - bytes_read += source->host_read(0, source->size(), destination); - } + bytes_read += source->host_read(0, source->size(), buffer.data() + bytes_read); } - if (compression == compression_type::NONE) { - return buffer; - } else { - return decompress(compression, buffer); - } + return (compression == compression_type::NONE) ? 
buffer : decompress(compression, buffer); } table_with_metadata read_json(host_span> sources, diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 3866def2cdf..4f98dc54a73 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -915,7 +915,7 @@ TEST_F(JsonReaderTest, BadDtypeParams) EXPECT_THROW(cudf_io::read_json(options_map), cudf::logic_error); } -TEST_F(JsonReaderTest, JsonRecordsBasic) +TEST_F(JsonReaderTest, JsonExperimentalBasic) { const std::string fname = temp_env->get_temp_dir() + "JsonLinesFileTest.json"; std::ofstream outfile(fname, std::ofstream::out); From 70dd9b1c0df226809b84788a133f5f0974b88315 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 15 Aug 2022 23:43:20 -0700 Subject: [PATCH 12/40] stop using column_names --- cpp/src/io/json/nested_json_gpu.cu | 4 +-- cpp/src/io/json/reader_impl.cu | 19 +++++++---- cpp/tests/io/json_test.cpp | 52 +++++++++++++++--------------- python/cudf/cudf/_lib/json.pyx | 2 +- 4 files changed, 40 insertions(+), 37 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index bffc8891020..5e293f8a750 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1395,13 +1395,11 @@ table_with_metadata parse_nested_json(host_span input, // Initialize meta data to be populated while recursing through the tree of columns std::vector> out_columns; std::vector out_column_names; - std::vector out_root_column_names; // Iterate over the struct's child columns and convert to cudf column for (auto const& [col_name, json_col] : root_struct_col.child_columns) { // Insert this columns name into the schema out_column_names.emplace_back(col_name); - out_root_column_names.emplace_back(col_name); // Get this JSON column's cudf column and schema info auto [cudf_col, col_name_info] = json_column_to_cudf_column(json_col, d_input, stream, mr); @@ -1410,7 +1408,7 @@ table_with_metadata parse_nested_json(host_span input, } return 
table_with_metadata{std::make_unique
(std::move(out_columns)), - {out_root_column_names, out_column_names}}; + {{}, out_column_names}}; } } // namespace detail diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 6b12b462dd9..3be0ff318a1 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -480,7 +480,7 @@ std::vector get_data_types(json_reader_options const& reader_opts, table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, std::vector const& dtypes, - std::vector const& column_names, + std::vector&& column_names, col_map_type* column_map, device_span rec_starts, device_span data, @@ -552,8 +552,8 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, std::vector column_infos; column_infos.reserve(column_names.size()); - std::transform(column_names.cbegin(), - column_names.cend(), + std::transform(std::make_move_iterator(column_names.begin()), + std::make_move_iterator(column_names.end()), std::back_inserter(column_infos), [](auto const& col_name) { return column_name_info{col_name}; }); @@ -563,8 +563,7 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, CUDF_EXPECTS(!out_columns.empty(), "No columns created from json input"); - return table_with_metadata{std::make_unique
(std::move(out_columns)), - {column_names, column_infos}}; + return table_with_metadata{std::make_unique
(std::move(out_columns)), {{}, column_infos}}; } /** @@ -636,8 +635,14 @@ table_with_metadata read_json(std::vector>& sources, CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); - return convert_data_to_table( - parse_opts.view(), dtypes, column_names, column_map.get(), rec_starts, d_data, stream, mr); + return convert_data_to_table(parse_opts.view(), + dtypes, + std::move(column_names), + column_map.get(), + rec_starts, + d_data, + stream, + mr); } } // namespace json diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 4f98dc54a73..adf97bf3e2a 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -171,8 +171,8 @@ TEST_F(JsonReaderTest, BasicJsonLines) EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT32); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); - EXPECT_EQ(result.metadata.column_names[0], "0"); - EXPECT_EQ(result.metadata.column_names[1], "1"); + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[1].name, "1"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -228,9 +228,9 @@ TEST_F(JsonReaderTest, JsonLinesStrings) EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::STRING); - EXPECT_EQ(result.metadata.column_names[0], "0"); - EXPECT_EQ(result.metadata.column_names[1], "1"); - EXPECT_EQ(result.metadata.column_names[2], "2"); + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[1].name, "1"); + EXPECT_EQ(result.metadata.schema_info[2].name, "2"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -414,9 +414,9 @@ TEST_F(JsonReaderTest, JsonLinesDtypeInference) EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); EXPECT_EQ(result.tbl->get_column(2).type().id(), 
cudf::type_id::STRING); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "0"); - EXPECT_EQ(std::string(result.metadata.column_names[1]), "1"); - EXPECT_EQ(std::string(result.metadata.column_names[2]), "2"); + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[1].name, "1"); + EXPECT_EQ(result.metadata.schema_info[2].name, "2"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -444,8 +444,8 @@ TEST_F(JsonReaderTest, JsonLinesFileInput) EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "0"); - EXPECT_EQ(std::string(result.metadata.column_names[1]), "1"); + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[1].name, "1"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -472,7 +472,7 @@ TEST_F(JsonReaderTest, JsonLinesByteRange) EXPECT_EQ(result.tbl->num_rows(), 3); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "0"); + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -496,9 +496,9 @@ TEST_F(JsonReaderTest, JsonLinesObjects) EXPECT_EQ(result.tbl->num_rows(), 1); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "co\\\"l1"); + EXPECT_EQ(result.metadata.schema_info[0].name, "co\\\"l1"); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); - EXPECT_EQ(std::string(result.metadata.column_names[1]), "col2"); + EXPECT_EQ(result.metadata.schema_info[1].name, "col2"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 
true; }); @@ -522,9 +522,9 @@ TEST_F(JsonReaderTest, JsonLinesObjectsStrings) EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::STRING); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "col1"); - EXPECT_EQ(std::string(result.metadata.column_names[1]), "col2"); - EXPECT_EQ(std::string(result.metadata.column_names[2]), "col3"); + EXPECT_EQ(result.metadata.schema_info[0].name, "col1"); + EXPECT_EQ(result.metadata.schema_info[1].name, "col2"); + EXPECT_EQ(result.metadata.schema_info[2].name, "col3"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -563,9 +563,9 @@ TEST_F(JsonReaderTest, JsonLinesObjectsMissingData) EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRING); EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::FLOAT64); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "col2"); - EXPECT_EQ(std::string(result.metadata.column_names[1]), "col3"); - EXPECT_EQ(std::string(result.metadata.column_names[2]), "col1"); + EXPECT_EQ(result.metadata.schema_info[0].name, "col2"); + EXPECT_EQ(result.metadata.schema_info[1].name, "col3"); + EXPECT_EQ(result.metadata.schema_info[2].name, "col1"); auto col1_validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0; }); @@ -598,9 +598,9 @@ TEST_F(JsonReaderTest, JsonLinesObjectsOutOfOrder) EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "col1"); - EXPECT_EQ(std::string(result.metadata.column_names[1]), "col2"); - EXPECT_EQ(std::string(result.metadata.column_names[2]), "col3"); + EXPECT_EQ(result.metadata.schema_info[0].name, "col1"); + EXPECT_EQ(result.metadata.schema_info[1].name, "col2"); + EXPECT_EQ(result.metadata.schema_info[2].name, "col3"); auto validity 
= cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -881,8 +881,8 @@ TEST_F(JsonReaderTest, JsonLinesMultipleFileInputs) EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "0"); - EXPECT_EQ(std::string(result.metadata.column_names[1]), "1"); + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[1].name, "1"); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -932,8 +932,8 @@ TEST_F(JsonReaderTest, JsonExperimentalBasic) EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRING); - EXPECT_EQ(std::string(result.metadata.column_names[0]), "a"); - EXPECT_EQ(std::string(result.metadata.column_names[1]), "b"); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), cudf::test::strings_column_wrapper({"11", "22"})); diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 0ee6062e7f2..376850b7b1b 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -113,7 +113,7 @@ cpdef read_json(object filepaths_or_buffers, with nogil: c_result = move(libcudf_read_json(opts)) - meta_names = [name.decode() for name in c_result.metadata.column_names] + meta_names = [info.name.decode() for info in c_result.metadata.schema_info] df = cudf.DataFrame._from_data(*data_from_unique_ptr( move(c_result.tbl), column_names=meta_names From b1afef0c3ea332c62c4e5fd5e49fa15b14e8a705 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 16 Aug 2022 02:01:20 -0700 Subject: [PATCH 13/40] adds documentation for mr parameter --- 
cpp/src/io/json/nested_json_gpu.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 34c85402284..4c21b9a78a8 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1009,6 +1009,7 @@ std::pair, rmm::device_uvector> ge * @param[in] input The JSON input in host memory * @param[in] d_input The JSON input in device memory * @param[in] stream The CUDA stream to which kernels are dispatched + * @param[in] mr Optional, resource with which to allocate * @return The columnar representation of the data from the given JSON input */ void make_json_column(json_column& root_column, From 8409214ead5b150122a60b3c1b1db5fcecc59c9e Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 16 Aug 2022 02:04:44 -0700 Subject: [PATCH 14/40] minor documentation fixes --- cpp/src/io/json/nested_json.hpp | 1 - cpp/src/io/json/nested_json_gpu.cu | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 1048f9fcedd..47ce1edafaf 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -285,7 +285,6 @@ void get_stack_context(device_span json_in, * * @param[in] json_in The JSON input * @param[in] options Parsing options specifying the parsing behaviour - * represents the offset within \p d_json_in that cause the input being written * @param[in] stream The CUDA stream to which kernels are dispatched * @param[in] mr Optional, resource with which to allocate * @return Pair of device vectors, where the first vector represents the token types and the second diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 4c21b9a78a8..b51d1270f22 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1009,6 +1009,7 @@ std::pair, rmm::device_uvector> ge * @param[in] input The 
JSON input in host memory * @param[in] d_input The JSON input in device memory * @param[in] stream The CUDA stream to which kernels are dispatched + * @param[in] options Parsing options specifying the parsing behaviour * @param[in] mr Optional, resource with which to allocate * @return The columnar representation of the data from the given JSON input */ From d0e0defcdaf3c50da6cc13f174a4d55846ec23d2 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 16 Aug 2022 02:06:08 -0700 Subject: [PATCH 15/40] fixes parameter order --- cpp/src/io/json/nested_json_gpu.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index b51d1270f22..26d7aaf3b2b 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1008,8 +1008,8 @@ std::pair, rmm::device_uvector> ge * first node encountered in \p input * @param[in] input The JSON input in host memory * @param[in] d_input The JSON input in device memory - * @param[in] stream The CUDA stream to which kernels are dispatched * @param[in] options Parsing options specifying the parsing behaviour + * @param[in] stream The CUDA stream to which kernels are dispatched * @param[in] mr Optional, resource with which to allocate * @return The columnar representation of the data from the given JSON input */ From 574ac4397e25f05cbb7bdf1d12a1c673c5ecb543 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 16 Aug 2022 09:07:52 -0700 Subject: [PATCH 16/40] fix copy-paste error --- cpp/tests/io/json_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index adf97bf3e2a..c3af9fc2eb0 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -917,7 +917,7 @@ TEST_F(JsonReaderTest, BadDtypeParams) TEST_F(JsonReaderTest, JsonExperimentalBasic) { - const std::string fname = temp_env->get_temp_dir() + 
"JsonLinesFileTest.json"; + std::string const fname = temp_env->get_temp_dir() + "JsonExperimentalBasic.json"; std::ofstream outfile(fname, std::ofstream::out); outfile << "[{\"a\":\"11\", \"b\":\"1.1\"},{\"a\":\"22\", \"b\":\"2.2\"}]"; outfile.close(); From 2de80b74a42c21a4b4b738e013df22c403125e96 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 16 Aug 2022 09:11:04 -0700 Subject: [PATCH 17/40] raw string Co-authored-by: Elias Stehle <3958403+elstehle@users.noreply.github.com> --- cpp/tests/io/json_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index adf97bf3e2a..79d7bf241f6 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -919,7 +919,7 @@ TEST_F(JsonReaderTest, JsonExperimentalBasic) { const std::string fname = temp_env->get_temp_dir() + "JsonLinesFileTest.json"; std::ofstream outfile(fname, std::ofstream::out); - outfile << "[{\"a\":\"11\", \"b\":\"1.1\"},{\"a\":\"22\", \"b\":\"2.2\"}]"; + outfile << R"([{"a":"11", "b":"1.1"},{"a":"22", "b":"2.2"}])" outfile.close(); cudf_io::json_reader_options options = From bc14a1dd71a12ca11b1964b4ae10f4d3932f374e Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 16 Aug 2022 09:12:23 -0700 Subject: [PATCH 18/40] remove print in Python test --- python/cudf/cudf/tests/test_json.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 5122c976f27..6beb050d920 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -590,10 +590,6 @@ def test_json_nested_basic(tmpdir): pdf = pd.DataFrame(data) pdf.to_json(fname, orient="records") - with open(fname, "r") as f: - print(f.read()) - print(pdf) - df = cudf.read_json(fname, engine="cudf_experimental", orient="records") pdf = pd.read_json(fname, orient="records") From bca2e839d3db8ea70abea83b6d281be613bad9cb Mon Sep 17 00:00:00 2001 From: vuule Date: 
Tue, 16 Aug 2022 09:21:48 -0700 Subject: [PATCH 19/40] addressing reviews --- cpp/src/io/json/experimental/read_json.cpp | 3 ++- python/cudf/cudf/tests/test_json.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index 2259fcc839a..e070aacaca2 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -15,10 +15,11 @@ */ #include "read_json.hpp" + +#include #include #include -#include #include diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 6beb050d920..368015cf563 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -593,4 +593,4 @@ def test_json_nested_basic(tmpdir): df = cudf.read_json(fname, engine="cudf_experimental", orient="records") pdf = pd.read_json(fname, orient="records") - assert_eq(cudf.DataFrame(pdf), df) + assert_eq(pdf, df) From ba28571ca2492f5edcb5b6f76c08ce751146a94f Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 16 Aug 2022 10:57:10 -0700 Subject: [PATCH 20/40] Java fix --- java/src/main/native/src/TableJni.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 44c08aec110..857fac7df2b 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1459,7 +1459,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readJSON( cudf::io::table_with_metadata result = cudf::io::read_json(opts.build()); // there is no need to re-order columns when inferring schema - if (result.metadata.column_names.empty() || n_col_names.size() <= 0) { + if (result.metadata.schema_info.empty() || n_col_names.size() <= 0) { return convert_table_for_return(env, result.tbl); } else { // json reader will not return the correct column order, @@ -1467,10 +1467,10 @@ JNIEXPORT jlongArray 
JNICALL Java_ai_rapids_cudf_Table_readJSON( // turn name and its index in table into map std::map m; - std::transform(result.metadata.column_names.begin(), result.metadata.column_names.end(), + std::transform(result.metadata.schema_info.cbegin(), result.metadata.schema_info.cend(), thrust::make_counting_iterator(0), std::inserter(m, m.end()), - [](auto const &column_name, auto const &index) { - return std::make_pair(column_name, index); + [](auto const &column_info, auto const &index) { + return std::make_pair(column_info.name, index); }); auto col_names_vec = n_col_names.as_cpp_vector(); From a6d5ab732f6cc474289a088ecb1e1c9287fee728 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 16 Aug 2022 11:59:42 -0700 Subject: [PATCH 21/40] style --- cpp/tests/io/json_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index fcecea8e7e0..67f0542ace2 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -919,7 +919,7 @@ TEST_F(JsonReaderTest, JsonExperimentalBasic) { std::string const fname = temp_env->get_temp_dir() + "JsonExperimentalBasic.json"; std::ofstream outfile(fname, std::ofstream::out); - outfile << R"([{"a":"11", "b":"1.1"},{"a":"22", "b":"2.2"}])" + outfile << R"([{"a":"11", "b":"1.1"},{"a":"22", "b":"2.2"}])"; outfile.close(); cudf_io::json_reader_options options = From a0bd2292f1dcfca9d4b6470c17c0f4b07d85d93f Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 16 Aug 2022 23:28:54 -0700 Subject: [PATCH 22/40] integrates upstream interface changes --- cpp/src/io/json/experimental/read_json.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index e070aacaca2..cc154d5f325 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -57,7 +57,7 @@ table_with_metadata 
read_json(host_span> sources, auto const buffer = ingest_raw_input(sources, reader_opts.get_compression()); auto data = host_span(reinterpret_cast(buffer.data()), buffer.size()); - return cudf::io::json::detail::parse_nested_json(data, stream, mr); + return cudf::io::json::detail::parse_nested_json(data, reader_opts, stream, mr); } } // namespace cudf::io::detail::json::experimental From f3bba9d4181822704a917b199591ea452bfd46ef Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 17 Aug 2022 10:14:26 -0700 Subject: [PATCH 23/40] enables lines option in the nested reader --- cpp/src/io/json/experimental/read_json.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index cc154d5f325..ceac40ba4f9 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -50,7 +50,6 @@ table_with_metadata read_json(host_span> sources, auto const dtypes_empty = std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); CUDF_EXPECTS(dtypes_empty, "user specified dtypes are not yet supported"); - CUDF_EXPECTS(not reader_opts.is_enabled_lines(), "JSON Lines format is not yet supported"); CUDF_EXPECTS(reader_opts.get_byte_range_offset() == 0 and reader_opts.get_byte_range_size() == 0, "specifying a byte range is not yet supported"); From 21b40231e5c6e6e05c548519b5419df84ffb9a83 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 17 Aug 2022 11:32:15 -0700 Subject: [PATCH 24/40] migrates test from details api to reader api --- cpp/tests/io/json_test.cpp | 27 ++++++++++++++++++++++++ cpp/tests/io/nested_json_test.cpp | 34 ------------------------------- 2 files changed, 27 insertions(+), 34 deletions(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 67f0542ace2..af72edce91b 100644 --- 
a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -941,4 +942,30 @@ TEST_F(JsonReaderTest, JsonExperimentalBasic) cudf::test::strings_column_wrapper({"1.1", "2.2"})); } +TEST_F(JsonReaderTest, JsonExperimentalLines) +{ + std::string json_string = + R"({"a":"a0"} + {"a":"a1"} + {"a":"a2", "b":"b2"} + {"a":"a3", "c":"c3"} + {"a":"a4"})"; + + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.c_str(), json_string.size()}) + .lines(true); + + // Read test data via existing, non-nested json lines reader + cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options); + + // Read test data via new, nested json reader + json_lines_options.enable_experimental(true); + cudf::io::table_with_metadata new_reader_table = cudf::io::read_json(json_lines_options); + + // Verify that the data read via parquet matches the data read via JSON + CUDF_TEST_EXPECT_TABLES_EQUAL(current_reader_table.tbl->view(), new_reader_table.tbl->view()); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index cae0083daed..7ba7e0a4a03 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -513,37 +513,3 @@ TEST_F(JsonTest, FromParquet) // Verify that the schema read via parquet matches the schema read via JSON cudf::test::expect_metadata_equal(cudf_table.metadata, result.metadata); } - -TEST_F(JsonTest, JsonLines) -{ - // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); - - // Default parsing options - cudf::io::json_reader_options json_lines_options = - cudf::io::json_reader_options_builder{}.lines(true); - - using cuio_json::SymbolT; - - std::string json_string = - R"({"a":"a0"} - 
{"a":"a1"} - {"a":"a2", "b":"b2"} - {"a":"a3", "c":"c3"} - {"a":"a4"})"; - - cudf::io::json_reader_options in_options = - cudf::io::json_reader_options::builder( - cudf::io::source_info{json_string.c_str(), json_string.size()}) - .lines(true); - cudf::io::table_with_metadata old_reader_table = cudf::io::read_json(in_options); - - auto const new_reader_table = cuio_json::detail::parse_nested_json( - cudf::host_span{json_string.data(), json_string.size()}, - json_lines_options, - stream_view); - - // Verify that the data read via parquet matches the data read via JSON - CUDF_TEST_EXPECT_TABLES_EQUAL(old_reader_table.tbl->view(), new_reader_table.tbl->view()); -} From cdc44411a385fb71da6954a98ebd2a59944fcf0a Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 17 Aug 2022 11:58:45 -0700 Subject: [PATCH 25/40] improves code comment --- cpp/src/io/json/nested_json_gpu.cu | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 26d7aaf3b2b..07348e67b6c 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1498,12 +1498,13 @@ table_with_metadata parse_nested_json(host_span input, constexpr uint32_t token_end_offset_zero = 0; constexpr uint32_t node_init_child_count_zero = 0; - // We initialize the very root node and root column that represents a list column that contains - // all the values found at the root "level" of the given JSON string Initialize the root column - // For JSON lines: we expect to find a list of values that all will be inserted into this list + // We initialize the very root node and root column, which represent the JSON document being + // parsed. That root node is a list node and that root column is a list column. The column has the + // root node as its only row. 
The values parsed from the JSON input will be treated as follows: + // (1) For JSON lines: we expect to find a list of JSON values that all + // will be inserted into this root list column. (2) For regular JSON: we expect to have only a + // single value (list, struct, string, number, literal) that will be inserted into this root // column. - // For regular JSON: we expect to have only a single value (single row) that will be inserted into - // this column root_column.append_row( row_offset_zero, json_col_t::ListColumn, token_begin_offset_zero, token_end_offset_zero, 1); From ea6959fc6e51c92924ed07680d3b89a27b648144 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 22 Aug 2022 00:59:31 -0700 Subject: [PATCH 26/40] removes in/out specification on params --- cpp/src/io/json/nested_json.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 47ce1edafaf..8fa4d82a499 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -283,10 +283,10 @@ void get_stack_context(device_span json_in, * @brief Parses the given JSON string and emits a sequence of tokens that demarcate relevant * sections from the input. 
* - * @param[in] json_in The JSON input - * @param[in] options Parsing options specifying the parsing behaviour - * @param[in] stream The CUDA stream to which kernels are dispatched - * @param[in] mr Optional, resource with which to allocate + * @param json_in The JSON input + * @param options Parsing options specifying the parsing behaviour + * @param stream The CUDA stream to which kernels are dispatched + * @param mr Optional, resource with which to allocate * @return Pair of device vectors, where the first vector represents the token types and the second * vector represents the index within the input corresponding to each token */ @@ -299,10 +299,10 @@ std::pair, rmm::device_uvector> ge /** * @brief Parses the given JSON string and generates table from the given input. * - * @param[in] input The JSON input - * @param[in] options Parsing options specifying the parsing behaviour - * @param[in] stream The CUDA stream to which kernels are dispatched - * @param[in] mr Optional, resource with which to allocate + * @param input The JSON input + * @param options Parsing options specifying the parsing behaviour + * @param stream The CUDA stream to which kernels are dispatched + * @param mr Optional, resource with which to allocate * @return The data parsed from the given JSON input */ table_with_metadata parse_nested_json( From 00be9159aa81e508d42c79abbe7fbf3ef29a6744 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 22 Aug 2022 01:00:29 -0700 Subject: [PATCH 27/40] removes _gpu suffix from tokens --- cpp/src/io/json/nested_json_gpu.cu | 39 +++++++++++++++--------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 07348e67b6c..09dcce6ddd7 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1028,7 +1028,7 @@ void make_json_column(json_column& root_column, const auto [d_tokens_gpu, 
d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr); // Copy the JSON tokens to the host - thrust::host_vector tokens_gpu = + thrust::host_vector tokens = cudf::detail::make_host_vector_async(d_tokens_gpu, stream); thrust::host_vector token_indices_gpu = cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); @@ -1219,12 +1219,12 @@ void make_json_column(json_column& root_column, // INITIALIZE JSON ROOT NODE //-------------------------------------------------------------------------------- // The JSON root may only be a struct, list, string, or value node - CUDF_EXPECTS(tokens_gpu.size() == token_indices_gpu.size(), + CUDF_EXPECTS(tokens.size() == token_indices_gpu.size(), "Unexpected mismatch in number of token types and token indices"); - CUDF_EXPECTS(tokens_gpu.size() > 0, "Empty JSON input not supported"); - CUDF_EXPECTS(is_valid_root_token(tokens_gpu[offset]), "Invalid beginning of JSON document"); + CUDF_EXPECTS(tokens.size() > 0, "Empty JSON input not supported"); + CUDF_EXPECTS(is_valid_root_token(tokens[offset]), "Invalid beginning of JSON document"); - while (offset < tokens_gpu.size()) { + while (offset < tokens.size()) { // Verify there's at least the JSON root node left on the stack to which we can append data CUDF_EXPECTS(current_data_path.size() > 0, "Invalid JSON structure"); @@ -1234,7 +1234,7 @@ void make_json_column(json_column& root_column, "Invalid JSON structure"); // The token we're currently parsing - auto const& token = tokens_gpu[offset]; + auto const& token = tokens[offset]; #ifdef NJP_DEBUG_PRINT std::cout << "[" << token_to_string(token) << "]\n"; @@ -1257,9 +1257,9 @@ void make_json_column(json_column& root_column, // Add this struct node to the current column selected_col->append_row(target_row_index, - token_to_column_type(tokens_gpu[offset]), - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), + 
token_to_column_type(tokens[offset]), + get_token_index(tokens[offset], token_indices_gpu[offset]), + get_token_index(tokens[offset], token_indices_gpu[offset]), zero_child_count); } @@ -1274,7 +1274,7 @@ void make_json_column(json_column& root_column, // Update row to account for string offset update_row(current_data_path.top().column, current_data_path.top().row_index, - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), + get_token_index(tokens[offset], token_indices_gpu[offset]), current_data_path.top().num_children); // Pop struct from the path stack @@ -1295,9 +1295,9 @@ void make_json_column(json_column& root_column, // Add this struct node to the current column selected_col->append_row(target_row_index, - token_to_column_type(tokens_gpu[offset]), - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), + token_to_column_type(tokens[offset]), + get_token_index(tokens[offset], token_indices_gpu[offset]), + get_token_index(tokens[offset], token_indices_gpu[offset]), zero_child_count); } @@ -1312,7 +1312,7 @@ void make_json_column(json_column& root_column, // Update row to account for string offset update_row(current_data_path.top().column, current_data_path.top().row_index, - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), + get_token_index(tokens[offset], token_indices_gpu[offset]), current_data_path.top().num_children); // Pop list from the path stack @@ -1323,7 +1323,7 @@ void make_json_column(json_column& root_column, else if (token == token_t::ErrorBegin) { #ifdef NJP_DEBUG_PRINT std::cout << "[ErrorBegin]\n"; - std::cout << "@" << get_token_index(tokens_gpu[offset], token_indices_gpu[offset]); + std::cout << "@" << get_token_index(tokens[offset], token_indices_gpu[offset]); #endif CUDF_FAIL("Parser encountered an invalid format."); } @@ -1332,16 +1332,15 @@ void make_json_column(json_column& root_column, else if (token == token_t::FieldNameBegin or 
token == token_t::StringBegin or token == token_t::ValueBegin) { // Verify that this token has the right successor to build a correct (being, end) token pair - CUDF_EXPECTS((offset + 1) < tokens_gpu.size(), "Invalid JSON token sequence"); - CUDF_EXPECTS(tokens_gpu[offset + 1] == end_of_partner(token), "Invalid JSON token sequence"); + CUDF_EXPECTS((offset + 1) < tokens.size(), "Invalid JSON token sequence"); + CUDF_EXPECTS(tokens[offset + 1] == end_of_partner(token), "Invalid JSON token sequence"); // The offset to the first symbol from the JSON input associated with the current token - auto const& token_begin_offset = - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]); + auto const& token_begin_offset = get_token_index(tokens[offset], token_indices_gpu[offset]); // The offset to one past the last symbol associated with the current token auto const& token_end_offset = - get_token_index(tokens_gpu[offset + 1], token_indices_gpu[offset + 1]); + get_token_index(tokens[offset + 1], token_indices_gpu[offset + 1]); // FieldNameBegin // For the current struct node in the tree, select the child column corresponding to this From 73ff3075e63fefc9025ad34be82ad7db89ed21a4 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 22 Aug 2022 14:41:15 -0700 Subject: [PATCH 28/40] better translation table comments thx @upsj --- cpp/src/io/json/nested_json_gpu.cu | 761 +++++++++++++++-------------- 1 file changed, 385 insertions(+), 376 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 09dcce6ddd7..826553c0a03 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -471,402 +471,411 @@ auto get_transition_table(bool newline_delimited_json) auto get_translation_table() { std::array, NUM_PDA_SGIDS>, PD_NUM_STATES> pda_tlt; - pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{/*ROOT*/ - {token_t::StructBegin}, - {token_t::ListBegin}, - 
{token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::StringBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {}, - {token_t::ValueBegin}, + pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ + {token_t::StructBegin}, // OPENING_BRACE + {token_t::ListBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::StringBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {token_t::ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {token_t::ValueBegin}, // OTHER /*LIST*/ - {token_t::StructBegin}, - {token_t::ListBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::StringBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {}, - {token_t::ValueBegin}, + {token_t::StructBegin}, // OPENING_BRACE + {token_t::ListBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::StringBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {token_t::ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {token_t::ValueBegin}, // OTHER /*STRUCT*/ - {token_t::StructBegin}, - {token_t::ListBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::StringBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {}, - {token_t::ValueBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_BOA)] = {{/*ROOT*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, + {token_t::StructBegin}, // OPENING_BRACE + {token_t::ListBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ErrorBegin}, // 
CLOSING_BRACKET + {token_t::StringBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {token_t::ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {token_t::ValueBegin}}}; // OTHER + pda_tlt[static_cast(pda_state_t::PD_BOA)] = {{ /*ROOT*/ + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::ErrorBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {token_t::ErrorBegin}, // COLON + {token_t::ErrorBegin}, // WHITE_SPACE + {token_t::ErrorBegin}, // LINE_BREAK + {token_t::ErrorBegin}, // OTHER /*LIST*/ - {token_t::StructBegin}, - {token_t::ListBegin}, - {token_t::ErrorBegin}, - {token_t::ListEnd}, - {token_t::StringBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {}, - {token_t::ValueBegin}, + {token_t::StructBegin}, // OPENING_BRACE + {token_t::ListBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ListEnd}, // CLOSING_BRACKET + {token_t::StringBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {token_t::ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {token_t::ValueBegin}, // OTHER /*STRUCT*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::StructEnd}, - {token_t::ErrorBegin}, - {token_t::FieldNameBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {}, - {token_t::ErrorBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_LON)] = {{/*ROOT*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ValueEnd}, - {token_t::ValueEnd}, - {}, + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + 
{token_t::StructEnd}, // CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::FieldNameBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {token_t::ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {token_t::ErrorBegin}}}; // OTHER + pda_tlt[static_cast(pda_state_t::PD_LON)] = { + { /*ROOT*/ + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::ErrorBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {token_t::ErrorBegin}, // COLON + {token_t::ValueEnd}, // WHITE_SPACE + {token_t::ValueEnd}, // LINE_BREAK + {}, // OTHER + /*LIST*/ + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ValueEnd, token_t::ListEnd}, // CLOSING_BRACKET + {token_t::ErrorBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ValueEnd}, // COMMA + {token_t::ErrorBegin}, // COLON + {token_t::ValueEnd}, // WHITE_SPACE + {token_t::ValueEnd}, // LINE_BREAK + {}, // OTHER + /*STRUCT*/ + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::ValueEnd, token_t::StructEnd}, // CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::ErrorBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ValueEnd}, // COMMA + {token_t::ErrorBegin}, // COLON + {token_t::ValueEnd}, // WHITE_SPACE + {token_t::ValueEnd}, // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {token_t::StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}, // OTHER /*LIST*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - 
{token_t::ValueEnd, token_t::ListEnd}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ValueEnd}, - {token_t::ErrorBegin}, - {token_t::ValueEnd}, - {token_t::ValueEnd}, - {}, + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {token_t::StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ValueEnd, token_t::StructEnd}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ValueEnd}, - {token_t::ErrorBegin}, - {token_t::ValueEnd}, - {token_t::ValueEnd}, - {}}}; - pda_tlt[static_cast(pda_state_t::PD_STR)] = {{/*ROOT*/ - {}, - {}, - {}, - {}, - {token_t::StringEnd}, - {}, - {}, - {}, - {}, - {}, - {}, + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {token_t::StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, - {}, - {}, - {}, - {token_t::StringEnd}, - {}, - {}, - {}, - {}, - {}, - {}, + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, - {}, - {}, - {}, - {token_t::StringEnd}, - {}, - {}, - {}, - {}, - {}, - {}}}; - pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{/*ROOT*/ - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // 
ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_PVL)] = {{ /*ROOT*/ + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::ErrorBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {token_t::ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {token_t::ErrorBegin}, // OTHER /*LIST*/ - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ListEnd}, // CLOSING_BRACKET + {token_t::ErrorBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {}, // COMMA + {token_t::ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {token_t::ErrorBegin}, // OTHER /*STRUCT*/ - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}}}; - pda_tlt[static_cast(pda_state_t::PD_PVL)] = {{/*ROOT*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {}, - {token_t::ErrorBegin}, + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::StructEnd}, // CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::ErrorBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {}, // COMMA + {token_t::ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {token_t::ErrorBegin}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_BFN)] = {{ /*ROOT*/ + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::ErrorBegin}, // QUOTE + 
{token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {token_t::ErrorBegin}, // COLON + {token_t::ErrorBegin}, // WHITE_SPACE + {token_t::ErrorBegin}, // LINE_BREAK + {token_t::ErrorBegin}, // OTHER /*LIST*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ListEnd}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {token_t::ErrorBegin}, - {}, - {}, - {token_t::ErrorBegin}, + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::ErrorBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {token_t::ErrorBegin}, // COLON + {token_t::ErrorBegin}, // WHITE_SPACE + {token_t::ErrorBegin}, // LINE_BREAK + {token_t::ErrorBegin}, // OTHER /*STRUCT*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::StructEnd}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {token_t::ErrorBegin}, - {}, - {}, - {token_t::ErrorBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_BFN)] = {{/*ROOT*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::FieldNameBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {token_t::ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {token_t::ErrorBegin}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{ /*ROOT*/ + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE 
+ {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::ErrorBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {token_t::ErrorBegin}, // COLON + {token_t::ErrorBegin}, // WHITE_SPACE + {token_t::ErrorBegin}, // LINE_BREAK + {token_t::ErrorBegin}, // OTHER /*LIST*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::ErrorBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {token_t::ErrorBegin}, // COLON + {token_t::ErrorBegin}, // WHITE_SPACE + {token_t::ErrorBegin}, // LINE_BREAK + {token_t::ErrorBegin}, // OTHER /*STRUCT*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::FieldNameBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {}, - {token_t::ErrorBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{/*ROOT*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {token_t::FieldNameEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{ /*ROOT*/ + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // 
CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::ErrorBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {token_t::ErrorBegin}, // COLON + {token_t::ErrorBegin}, // WHITE_SPACE + {token_t::ErrorBegin}, // LINE_BREAK + {token_t::ErrorBegin}, // OTHER /*LIST*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::ErrorBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {token_t::ErrorBegin}, // COLON + {token_t::ErrorBegin}, // WHITE_SPACE + {token_t::ErrorBegin}, // LINE_BREAK + {token_t::ErrorBegin}, // OTHER /*STRUCT*/ - {}, - {}, - {}, - {}, - {token_t::FieldNameEnd}, - {}, - {}, - {}, - {}, - {}, - {}}}; - pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{/*ROOT*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{ /*ROOT*/ + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::ErrorBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + 
{token_t::ErrorBegin}, // COLON + {token_t::ErrorBegin}, // WHITE_SPACE + {token_t::ErrorBegin}, // LINE_BREAK + {token_t::ErrorBegin}, // OTHER /*LIST*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::ErrorBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {token_t::ErrorBegin}, // COLON + {token_t::ErrorBegin}, // WHITE_SPACE + {token_t::ErrorBegin}, // LINE_BREAK + {token_t::ErrorBegin}, // OTHER /*STRUCT*/ - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}}}; - pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{/*ROOT*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, + {token_t::ErrorBegin}, // OPENING_BRACE + {token_t::ErrorBegin}, // OPENING_BRACKET + {token_t::ErrorBegin}, // CLOSING_BRACE + {token_t::ErrorBegin}, // CLOSING_BRACKET + {token_t::ErrorBegin}, // QUOTE + {token_t::ErrorBegin}, // ESCAPE + {token_t::ErrorBegin}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {token_t::ErrorBegin}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}, // OTHER /*LIST*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - 
{token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {}, - {}, - {token_t::ErrorBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{/*ROOT*/ - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - /*LIST*/ - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - /*STRUCT*/ - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}}}; + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}}}; // OTHER return pda_tlt; } From 39243f343dcf1ddd0a758cf3c0e1f7e841bd706a Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Mon, 22 Aug 2022 14:49:24 -0700 Subject: [PATCH 29/40] uses device_scalar and better generator --- cpp/src/io/json/nested_json_gpu.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 826553c0a03..03e66f14c43 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -957,7 +957,7 @@ std::pair, rmm::device_uvector> ge constexpr std::size_t single_item_count = 1ULL; rmm::device_uvector tokens{json_in.size(), stream, mr}; rmm::device_uvector tokens_indices{json_in.size(), stream, mr}; - rmm::device_uvector num_written_tokens{single_item_count, stream}; + 
rmm::device_scalar num_written_tokens{stream, mr}; auto const new_line_delimited_json = options.is_enabled_lines(); @@ -985,9 +985,9 @@ std::pair, rmm::device_uvector> ge // Instantiating PDA transducer std::vector> pda_sgid_identity{tokenizer_pda::NUM_PDA_SGIDS}; - std::generate(std::begin(pda_sgid_identity), std::end(pda_sgid_identity), [i = 0]() mutable { - return std::vector{static_cast(i++)}; - }); + std::generate(std::begin(pda_sgid_identity), + std::end(pda_sgid_identity), + [i = char{0}]() mutable { return std::vector{i++}; }); ToTokenStreamFstT json_to_tokens_fst{pda_sgid_identity, tokenizer_pda::get_transition_table(new_line_delimited_json), tokenizer_pda::get_translation_table(), @@ -1002,7 +1002,7 @@ std::pair, rmm::device_uvector> ge tokenizer_pda::start_state, stream); - auto num_total_tokens = num_written_tokens.front_element(stream); + auto num_total_tokens = num_written_tokens.value(stream); tokens.resize(num_total_tokens, stream); tokens_indices.resize(num_total_tokens, stream); From 722017477d519a18132e5d71917793b7d94d8168 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 23 Aug 2022 08:56:36 -0700 Subject: [PATCH 30/40] removes code comment banner --- cpp/src/io/json/nested_json_gpu.cu | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 03e66f14c43..21833800063 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1224,13 +1224,11 @@ void make_json_column(json_column& root_column, // Giving names to magic constants constexpr uint32_t zero_child_count = 0; - //-------------------------------------------------------------------------------- - // INITIALIZE JSON ROOT NODE - //-------------------------------------------------------------------------------- - // The JSON root may only be a struct, list, string, or value node CUDF_EXPECTS(tokens.size() == 
token_indices_gpu.size(), "Unexpected mismatch in number of token types and token indices"); CUDF_EXPECTS(tokens.size() > 0, "Empty JSON input not supported"); + + // The JSON root may only be a struct, list, string, or value node CUDF_EXPECTS(is_valid_root_token(tokens[offset]), "Invalid beginning of JSON document"); while (offset < tokens.size()) { From c6f8d0ed36da905760d64b2ba8c0baee8f924e33 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 23 Aug 2022 21:58:57 -0700 Subject: [PATCH 31/40] fixes code comments --- cpp/tests/io/json_test.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index af72edce91b..b44780314b7 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -964,7 +964,8 @@ TEST_F(JsonReaderTest, JsonExperimentalLines) json_lines_options.enable_experimental(true); cudf::io::table_with_metadata new_reader_table = cudf::io::read_json(json_lines_options); - // Verify that the data read via parquet matches the data read via JSON + // Verify that the data read via non-nested JSON lines reader matches the data read via nested + // JSON reader CUDF_TEST_EXPECT_TABLES_EQUAL(current_reader_table.tbl->view(), new_reader_table.tbl->view()); } From e38f3d891c3b55fcf0d5118c265e6849cc1d915e Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 23 Aug 2022 22:30:11 -0700 Subject: [PATCH 32/40] adds more tests for json lines --- cpp/tests/io/json_test.cpp | 41 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index b44780314b7..77efb0c4d76 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -957,10 +957,10 @@ TEST_F(JsonReaderTest, JsonExperimentalLines) cudf::io::source_info{json_string.c_str(), json_string.size()}) .lines(true); - // Read test 
data via existing, non-nested json lines reader + // Read test data via existing, non-nested JSON lines reader cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options); - // Read test data via new, nested json reader + // Read test data via new, nested JSON reader json_lines_options.enable_experimental(true); cudf::io::table_with_metadata new_reader_table = cudf::io::read_json(json_lines_options); @@ -969,4 +969,41 @@ TEST_F(JsonReaderTest, JsonExperimentalLines) CUDF_TEST_EXPECT_TABLES_EQUAL(current_reader_table.tbl->view(), new_reader_table.tbl->view()); } +TEST_F(JsonReaderTest, ExperimentalLinesNoOmissions) +{ + std::vector json_inputs = + // single column + {R"({"a":"a0"} + {"a":"a1"} + {"a":"a2"} + {"a":"a3"} + {"a":"a4"})", + // single column, single row + R"({"a":"a0"})", + // single row + R"({"a":"a0", "b":"b0"})", + // two column, two rows + R"({"a":"a0", "b":"b0"} + {"a":"a1", "b":"b1"})"}; + + for (auto const& json_string : json_inputs) { + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.c_str(), json_string.size()}) + .lines(true); + + // Read test data via existing, non-nested JSON lines reader + cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options); + + // Read test data via new, nested JSON reader + json_lines_options.enable_experimental(true); + cudf::io::table_with_metadata new_reader_table = cudf::io::read_json(json_lines_options); + + // Verify that the data read via non-nested JSON lines reader matches the data read via nested + // JSON reader + CUDF_TEST_EXPECT_TABLES_EQUAL(current_reader_table.tbl->view(), new_reader_table.tbl->view()); + } +} + CUDF_TEST_PROGRAM_MAIN() From cdb743d0e0a8906fa05c6dfee41616251d99089c Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 24 Aug 2022 11:20:09 
-0700 Subject: [PATCH 33/40] adds json lines test for experimental nested json reader --- python/cudf/cudf/tests/test_json.py | 42 ++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 368015cf563..338c38df272 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest import cudf @@ -575,12 +576,6 @@ def test_default_float_bitwidth(default_float_bitwidth): assert df["b"].dtype == np.dtype(f"f{default_float_bitwidth//8}") -def test_json_experimental(): - # should raise an exception, for now - with pytest.raises(RuntimeError): - cudf.read_json("", engine="cudf_experimental") - - def test_json_nested_basic(tmpdir): fname = tmpdir.mkdir("gdf_json").join("tmp_json_nested_basic") data = { @@ -594,3 +589,38 @@ def test_json_nested_basic(tmpdir): pdf = pd.read_json(fname, orient="records") assert_eq(pdf, df) + + +def test_json_nested_lines(tmpdir): + fname = tmpdir.mkdir("gdf_json").join("tmp_json_nested_lines") + data = { + "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}], + "c2": [["l11", "l21"], ["l12", "l22"]], + } + pdf = pd.DataFrame(data) + pdf.to_json(fname, orient="records", lines=True) + + df = cudf.read_json( + fname, engine="cudf_experimental", orient="records", lines=True + ) + pdf = pd.read_json(fname, orient="records", lines=True) + + assert_eq(pdf, df) + + +def test_json_nested_lines_with_omissions(tmpdir): + fname = tmpdir.mkdir("gdf_json").join("tmp_json_nested_lines_omissions") + data = { + "c1": [{"f2": "sf21"}, {"f1": "sf12"}], + "c2": [["l11", "l21"], []], + } + pdf = pd.DataFrame(data) + pdf.to_json(fname, orient="records", lines=True) + + df = cudf.read_json( + fname, engine="cudf_experimental", orient="records", lines=True + ) + pdf = pd.read_json(fname, orient="records", lines=True) + + # 
Pandas just just omits "f1" in first row, so we have to enforce a common schema + assert df.to_arrow().equals(pa.Table.from_pandas(pdf)) From 713260f622f371ce7d4852e6020a83736173489f Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 24 Aug 2022 11:36:05 -0700 Subject: [PATCH 34/40] fixes style --- python/cudf/cudf/tests/test_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 338c38df272..d5e40f5a829 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -622,5 +622,5 @@ def test_json_nested_lines_with_omissions(tmpdir): ) pdf = pd.read_json(fname, orient="records", lines=True) - # Pandas just just omits "f1" in first row, so we have to enforce a common schema + # Pandas omits "f1" in first row, so we have to enforce a common schema assert df.to_arrow().equals(pa.Table.from_pandas(pdf)) From 14749f7685e3866f733b8417027ec10f80e0d1eb Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 24 Aug 2022 12:59:49 -0700 Subject: [PATCH 35/40] parametrizes test and uses bytesio --- python/cudf/cudf/tests/test_json.py | 49 +++++++++++++---------------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index d5e40f5a829..80ccbd64130 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -591,36 +591,29 @@ def test_json_nested_basic(tmpdir): assert_eq(pdf, df) -def test_json_nested_lines(tmpdir): - fname = tmpdir.mkdir("gdf_json").join("tmp_json_nested_lines") - data = { - "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}], - "c2": [["l11", "l21"], ["l12", "l22"]], - } - pdf = pd.DataFrame(data) - pdf.to_json(fname, orient="records", lines=True) - - df = cudf.read_json( - fname, engine="cudf_experimental", 
orient="records", lines=True - ) - pdf = pd.read_json(fname, orient="records", lines=True) - - assert_eq(pdf, df) - - -def test_json_nested_lines_with_omissions(tmpdir): - fname = tmpdir.mkdir("gdf_json").join("tmp_json_nested_lines_omissions") - data = { - "c1": [{"f2": "sf21"}, {"f1": "sf12"}], - "c2": [["l11", "l21"], []], - } +@pytest.mark.parametrize( + "data", + [ + { + "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}], + "c2": [["l11", "l21"], ["l12", "l22"]], + }, + # Essential test case to handle omissions + { + "c1": [{"f2": "sf21"}, {"f1": "sf12"}], + "c2": [["l11", "l21"], []], + }, + ], +) +def test_json_nested_lines(data): + bytes = BytesIO() pdf = pd.DataFrame(data) - pdf.to_json(fname, orient="records", lines=True) - + pdf.to_json(bytes, orient="records", lines=True) + bytes.seek(0) df = cudf.read_json( - fname, engine="cudf_experimental", orient="records", lines=True + bytes, engine="cudf_experimental", orient="records", lines=True ) - pdf = pd.read_json(fname, orient="records", lines=True) - + pdf = pd.read_json(bytes, orient="records", lines=True) + # In the second test-case: # Pandas omits "f1" in first row, so we have to enforce a common schema assert df.to_arrow().equals(pa.Table.from_pandas(pdf)) From 94daa4fffa6318c22b9c86f003fdf1113e40a510 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 24 Aug 2022 13:01:38 -0700 Subject: [PATCH 36/40] adds seek before reads --- python/cudf/cudf/tests/test_json.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 80ccbd64130..f3d9180d44d 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -613,6 +613,7 @@ def test_json_nested_lines(data): df = cudf.read_json( bytes, engine="cudf_experimental", orient="records", lines=True ) + bytes.seek(0) pdf = pd.read_json(bytes, orient="records", lines=True) # In the second test-case: 
# Pandas omits "f1" in first row, so we have to enforce a common schema From c09c4afe2b1f2065360da4c1374858cc893f3360 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 25 Aug 2022 06:26:08 -0700 Subject: [PATCH 37/40] prettifies translation table --- cpp/src/io/json/nested_json_gpu.cu | 615 +++++++++++++++-------------- 1 file changed, 313 insertions(+), 302 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 21833800063..fe5f00318b9 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -470,153 +470,164 @@ auto get_transition_table(bool newline_delimited_json) */ auto get_translation_table() { + constexpr auto StructBegin = token_t::StructBegin; + constexpr auto StructEnd = token_t::StructEnd; + constexpr auto ListBegin = token_t::ListBegin; + constexpr auto ListEnd = token_t::ListEnd; + constexpr auto FieldNameBegin = token_t::FieldNameBegin; + constexpr auto FieldNameEnd = token_t::FieldNameEnd; + constexpr auto StringBegin = token_t::StringBegin; + constexpr auto StringEnd = token_t::StringEnd; + constexpr auto ValueBegin = token_t::ValueBegin; + constexpr auto ValueEnd = token_t::ValueEnd; + constexpr auto ErrorBegin = token_t::ErrorBegin; + std::array, NUM_PDA_SGIDS>, PD_NUM_STATES> pda_tlt; - pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ - {token_t::StructBegin}, // OPENING_BRACE - {token_t::ListBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::StringBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {token_t::ValueBegin}, // OTHER + pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + 
{StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ValueBegin}, // OTHER + /*LIST*/ + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ValueBegin}, // OTHER + /*STRUCT*/ + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ValueBegin}}}; // OTHER + pda_tlt[static_cast(pda_state_t::PD_BOA)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {token_t::StructBegin}, // OPENING_BRACE - {token_t::ListBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::StringBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {token_t::ValueBegin}, // OTHER + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ListEnd}, // CLOSING_BRACKET + {StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ValueBegin}, // OTHER /*STRUCT*/ - {token_t::StructBegin}, // OPENING_BRACE - {token_t::ListBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, 
// CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::StringBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {token_t::ValueBegin}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_BOA)] = {{ /*ROOT*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::ErrorBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {token_t::ErrorBegin}, // WHITE_SPACE - {token_t::ErrorBegin}, // LINE_BREAK - {token_t::ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {StructEnd}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {FieldNameBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}}}; // OTHER + pda_tlt[static_cast(pda_state_t::PD_LON)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + {ValueEnd}, // LINE_BREAK + {}, // OTHER /*LIST*/ - {token_t::StructBegin}, // OPENING_BRACE - {token_t::ListBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ListEnd}, // CLOSING_BRACKET - {token_t::StringBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {token_t::ValueBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ValueEnd, ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + 
{ErrorBegin}, // ESCAPE + {ValueEnd}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + {ValueEnd}, // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::StructEnd}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::FieldNameBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {token_t::ErrorBegin}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_LON)] = { - { /*ROOT*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::ErrorBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {token_t::ValueEnd}, // WHITE_SPACE - {token_t::ValueEnd}, // LINE_BREAK - {}, // OTHER - /*LIST*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ValueEnd, token_t::ListEnd}, // CLOSING_BRACKET - {token_t::ErrorBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ValueEnd}, // COMMA - {token_t::ErrorBegin}, // COLON - {token_t::ValueEnd}, // WHITE_SPACE - {token_t::ValueEnd}, // LINE_BREAK - {}, // OTHER - /*STRUCT*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::ValueEnd, token_t::StructEnd}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::ErrorBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ValueEnd}, // COMMA - {token_t::ErrorBegin}, // COLON - {token_t::ValueEnd}, // WHITE_SPACE - {token_t::ValueEnd}, // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // 
CLOSING_BRACKET - {token_t::StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ValueEnd, StructEnd}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ValueEnd}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + {ValueEnd}, // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {token_t::StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {token_t::StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{ /*ROOT*/ {}, // OPENING_BRACE @@ -655,141 +666,141 @@ auto get_translation_table() {}, // LINE_BREAK {}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_PVL)] = {{ /*ROOT*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - 
{token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::ErrorBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {token_t::ErrorBegin}, // OTHER + pda_tlt[static_cast(pda_state_t::PD_PVL)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ListEnd}, // CLOSING_BRACKET - {token_t::ErrorBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {}, // COMMA - {token_t::ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {token_t::ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::StructEnd}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::ErrorBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {}, // COMMA - {token_t::ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {token_t::ErrorBegin}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_BFN)] = {{ /*ROOT*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::ErrorBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - 
{token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {token_t::ErrorBegin}, // WHITE_SPACE - {token_t::ErrorBegin}, // LINE_BREAK - {token_t::ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {StructEnd}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_BFN)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::ErrorBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {token_t::ErrorBegin}, // WHITE_SPACE - {token_t::ErrorBegin}, // LINE_BREAK - {token_t::ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::FieldNameBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - 
{token_t::ErrorBegin}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{ /*ROOT*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::ErrorBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {token_t::ErrorBegin}, // WHITE_SPACE - {token_t::ErrorBegin}, // LINE_BREAK - {token_t::ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {FieldNameBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::ErrorBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {token_t::ErrorBegin}, // WHITE_SPACE - {token_t::ErrorBegin}, // LINE_BREAK - {token_t::ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - 
{}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {token_t::FieldNameEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{ /*ROOT*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::ErrorBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {token_t::ErrorBegin}, // WHITE_SPACE - {token_t::ErrorBegin}, // LINE_BREAK - {token_t::ErrorBegin}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {FieldNameEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::ErrorBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {token_t::ErrorBegin}, // WHITE_SPACE - {token_t::ErrorBegin}, // LINE_BREAK - {token_t::ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // 
WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ {}, // OPENING_BRACE {}, // OPENING_BRACKET @@ -803,42 +814,42 @@ auto get_translation_table() {}, // LINE_BREAK {}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{ /*ROOT*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::ErrorBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {token_t::ErrorBegin}, // WHITE_SPACE - {token_t::ErrorBegin}, // LINE_BREAK - {token_t::ErrorBegin}, // OTHER + pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ErrorBegin}, // CLOSING_BRACKET - {token_t::ErrorBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {token_t::ErrorBegin}, // COLON - {token_t::ErrorBegin}, // WHITE_SPACE - {token_t::ErrorBegin}, // LINE_BREAK - {token_t::ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ - {token_t::ErrorBegin}, // OPENING_BRACE - {token_t::ErrorBegin}, // OPENING_BRACKET - {token_t::ErrorBegin}, // CLOSING_BRACE - {token_t::ErrorBegin}, // 
CLOSING_BRACKET - {token_t::ErrorBegin}, // QUOTE - {token_t::ErrorBegin}, // ESCAPE - {token_t::ErrorBegin}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {token_t::ErrorBegin}}}; // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ {}, // OPENING_BRACE From 6efecf49a2daf534e283c3f32694fadb5be68a9e Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 25 Aug 2022 06:27:16 -0700 Subject: [PATCH 38/40] default_stream and more constness --- cpp/tests/io/json_test.cpp | 4 +- cpp/tests/io/nested_json_test.cpp | 129 ++++++++++++++---------------- 2 files changed, 61 insertions(+), 72 deletions(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 77efb0c4d76..232aaa51ef3 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -944,7 +944,7 @@ TEST_F(JsonReaderTest, JsonExperimentalBasic) TEST_F(JsonReaderTest, JsonExperimentalLines) { - std::string json_string = + std::string const json_string = R"({"a":"a0"} {"a":"a1"} {"a":"a2", "b":"b2"} @@ -971,7 +971,7 @@ TEST_F(JsonReaderTest, JsonExperimentalLines) TEST_F(JsonReaderTest, ExperimentalLinesNoOmissions) { - std::vector json_inputs = + std::array const json_inputs // single column {R"({"a":"a0"} {"a":"a1"} diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 7ba7e0a4a03..f0ececaf4eb 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -29,9 +30,6 @@ #include #include -#include -#include - #include namespace cuio_json = cudf::io::json; @@ -139,28 +137,27 @@ 
TEST_F(JsonTest, StackContext) using StackSymbolT = char; // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); + constexpr auto stream = cudf::default_stream_value; // Test input - std::string input = R"( [{)" - R"("category": "reference",)" - R"("index:": [4,12,42],)" - R"("author": "Nigel Rees",)" - R"("title": "[Sayings of the Century]",)" - R"("price": 8.95)" - R"(}, )" - R"({)" - R"("category": "reference",)" - R"("index": [4,{},null,{"a":[{ }, {}] } ],)" - R"("author": "Nigel Rees",)" - R"("title": "{}\\\"[], <=semantic-symbols-string\\\\",)" - R"("price": 8.95)" - R"(}] )"; + std::string const input = R"( [{)" + R"("category": "reference",)" + R"("index:": [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "[Sayings of the Century]",)" + R"("price": 8.95)" + R"(}, )" + R"({)" + R"("category": "reference",)" + R"("index": [4,{},null,{"a":[{ }, {}] } ],)" + R"("author": "Nigel Rees",)" + R"("title": "{}\\\"[], <=semantic-symbols-string\\\\",)" + R"("price": 8.95)" + R"(}] )"; // Prepare input & output buffers - rmm::device_uvector d_input(input.size(), stream_view); - hostdevice_vector stack_context(input.size(), stream_view); + rmm::device_uvector d_input(input.size(), stream); + hostdevice_vector stack_context(input.size(), stream); ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_input.data(), input.data(), @@ -169,13 +166,13 @@ TEST_F(JsonTest, StackContext) stream.value())); // Run algorithm - cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stream_view); + cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stream); // Copy back the results - stack_context.device_to_host(stream_view); + stack_context.device_to_host(stream); // Make sure we copied back the stack context - stream_view.synchronize(); + stream.synchronize(); std::vector golden_stack_context{ '_', '_', '_', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', 
'{', '{', @@ -205,15 +202,14 @@ TEST_F(JsonTest, StackContextUtf8) using StackSymbolT = char; // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); + constexpr auto stream = cudf::default_stream_value; // Test input - std::string input = R"([{"a":{"year":1882,"author": "Bharathi"}, {"a":"filip ʒakotɛ"}}])"; + std::string const input = R"([{"a":{"year":1882,"author": "Bharathi"}, {"a":"filip ʒakotɛ"}}])"; // Prepare input & output buffers - rmm::device_uvector d_input(input.size(), stream_view); - hostdevice_vector stack_context(input.size(), stream_view); + rmm::device_uvector d_input(input.size(), stream); + hostdevice_vector stack_context(input.size(), stream); ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_input.data(), input.data(), @@ -222,13 +218,13 @@ TEST_F(JsonTest, StackContextUtf8) stream.value())); // Run algorithm - cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stream_view); + cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stream); // Copy back the results - stack_context.device_to_host(stream_view); + stack_context.device_to_host(stream); // Make sure we copied back the stack context - stream_view.synchronize(); + stream.synchronize(); std::vector golden_stack_context{ '_', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', @@ -247,30 +243,29 @@ TEST_F(JsonTest, TokenStream) using cuio_json::SymbolT; // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); + constexpr auto stream = cudf::default_stream_value; // Default parsing options cudf::io::json_reader_options default_options{}; // Test input - std::string input = R"( [{)" - R"("category": "reference",)" - R"("index:": [4,12,42],)" - R"("author": "Nigel Rees",)" - R"("title": "[Sayings of the Century]",)" - R"("price": 8.95)" - R"(}, )" - R"({)" - R"("category": "reference",)" - 
R"("index": [4,{},null,{"a":[{ }, {}] } ],)" - R"("author": "Nigel Rees",)" - R"("title": "{}[], <=semantic-symbols-string",)" - R"("price": 8.95)" - R"(}] )"; + std::string const input = R"( [{)" + R"("category": "reference",)" + R"("index:": [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "[Sayings of the Century]",)" + R"("price": 8.95)" + R"(}, )" + R"({)" + R"("category": "reference",)" + R"("index": [4,{},null,{"a":[{ }, {}] } ],)" + R"("author": "Nigel Rees",)" + R"("title": "{}[], <=semantic-symbols-string",)" + R"("price": 8.95)" + R"(}] )"; // Prepare input & output buffers - rmm::device_uvector d_input(input.size(), stream_view); + rmm::device_uvector d_input(input.size(), stream); ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_input.data(), input.data(), @@ -280,7 +275,7 @@ TEST_F(JsonTest, TokenStream) // Parse the JSON and get the token stream const auto [d_tokens_gpu, d_token_indices_gpu] = - cuio_json::detail::get_token_stream(d_input, default_options, stream_view); + cuio_json::detail::get_token_stream(d_input, default_options, stream); // Copy back the number of tokens that were written thrust::host_vector tokens_gpu = @@ -289,7 +284,7 @@ TEST_F(JsonTest, TokenStream) cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); // Make sure we copied back all relevant data - stream_view.synchronize(); + stream.synchronize(); // Golden token stream sample using token_t = cuio_json::token_t; @@ -336,16 +331,15 @@ TEST_F(JsonTest, ExtractColumn) using cuio_json::SymbolT; // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); + constexpr auto stream = cudf::default_stream_value; // Default parsing options cudf::io::json_reader_options default_options{}; - std::string input = R"( [{"a":0.0, "b":1.0}, {"a":0.1, "b":1.1}, {"a":0.2, "b":1.2}] )"; + std::string const input = R"( [{"a":0.0, "b":1.0}, {"a":0.1, "b":1.1}, {"a":0.2, "b":1.2}] )"; // Get the JSON's tree 
representation auto const cudf_table = cuio_json::detail::parse_nested_json( - cudf::host_span{input.data(), input.size()}, default_options, stream_view); + cudf::host_span{input.data(), input.size()}, default_options, stream); auto const expected_col_count = 2; auto const first_column_index = 0; @@ -363,14 +357,13 @@ TEST_F(JsonTest, ExtractColumn) TEST_F(JsonTest, UTF_JSON) { // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); + constexpr auto stream = cudf::default_stream_value; // Default parsing options cudf::io::json_reader_options default_options{}; // Only ASCII string - std::string ascii_pass = R"([ + std::string const ascii_pass = R"([ {"a":1,"b":2,"c":[3], "d": {}}, {"a":1,"b":4.0,"c":[], "d": {"year":1882,"author": "Bharathi"}}, {"a":1,"b":6.0,"c":[5, 7], "d": null}, @@ -378,22 +371,20 @@ TEST_F(JsonTest, UTF_JSON) {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}])"; - CUDF_EXPECT_NO_THROW( - cuio_json::detail::parse_nested_json(ascii_pass, default_options, stream_view)); + CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(ascii_pass, default_options, stream)); // utf-8 string that fails parsing. - std::string utf_failed = R"([ + std::string const utf_failed = R"([ {"a":1,"b":2,"c":[3], "d": {}}, {"a":1,"b":4.0,"c":[], "d": {"year":1882,"author": "Bharathi"}}, {"a":1,"b":6.0,"c":[5, 7], "d": null}, {"a":1,"b":8.0,"c":null, "d": {}}, {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "filip ʒakotɛ"}}])"; - CUDF_EXPECT_NO_THROW( - cuio_json::detail::parse_nested_json(utf_failed, default_options, stream_view)); + CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(utf_failed, default_options, stream)); // utf-8 string that passes parsing. 
- std::string utf_pass = R"([ + std::string const utf_pass = R"([ {"a":1,"b":2,"c":[3], "d": {}}, {"a":1,"b":4.0,"c":[], "d": {"year":1882,"author": "Bharathi"}}, {"a":1,"b":6.0,"c":[5, 7], "d": null}, @@ -401,20 +392,18 @@ TEST_F(JsonTest, UTF_JSON) {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}, {"a":1,"b":NaN,"c":[null, null], "d": {"year": 2, "author": "filip ʒakotɛ"}}])"; - CUDF_EXPECT_NO_THROW( - cuio_json::detail::parse_nested_json(utf_pass, default_options, stream_view)); + CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(utf_pass, default_options, stream)); } TEST_F(JsonTest, FromParquet) { using cuio_json::SymbolT; - std::string input = + std::string const input = R"([{"0":{},"1":[],"2":{}},{"1":[[""],[]],"2":{"2":""}},{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}])"; // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); + constexpr auto stream = cudf::default_stream_value; // Default parsing options cudf::io::json_reader_options default_options{}; @@ -505,7 +494,7 @@ TEST_F(JsonTest, FromParquet) // Read in the data via the JSON parser auto const cudf_table = cuio_json::detail::parse_nested_json( - cudf::host_span{input.data(), input.size()}, default_options, stream_view); + cudf::host_span{input.data(), input.size()}, default_options, stream); // Verify that the data read via parquet matches the data read via JSON CUDF_TEST_EXPECT_TABLES_EQUAL(cudf_table.tbl->view(), result.tbl->view()); From 272bc164fd5a84a301c497ad3bb1d681d05a6a7e Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 25 Aug 2022 06:42:58 -0700 Subject: [PATCH 39/40] add TODO for stack ctx interface --- cpp/src/io/json/nested_json.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 8fa4d82a499..4e930f86591 100644 --- 
a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -262,6 +262,8 @@ enum token_t : PdaTokenT { }; namespace detail { + +// TODO: return device_uvector instead of passing pre-allocated memory /** * @brief Identifies the stack context for each character from a JSON input. Specifically, we * identify brackets and braces outside of quoted fields (e.g., field names, strings). From 9822ecb01f0eaa5c88fd5638f7b235dd16c3707a Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 25 Aug 2022 06:54:41 -0700 Subject: [PATCH 40/40] clarifies treatment of empty lines for ndjson --- cpp/src/io/json/nested_json_gpu.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index fe5f00318b9..3bc7bd89692 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -415,6 +415,8 @@ auto get_transition_table(bool newline_delimited_json) static_assert(static_cast(stack_symbol_group_id::STACK_LIST) == 1); static_assert(static_cast(stack_symbol_group_id::STACK_STRUCT) == 2); + // In case of newline-delimited JSON, multiple newlines are ignored, similar to whitespace. + // That is, empty lines are ignored auto const PD_ANL = newline_delimited_json ? PD_BOV : PD_PVL; std::array, PD_NUM_STATES> pda_tt; // { [ } ] " \ , : space newline other