diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index e070aacaca2..ceac40ba4f9 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -50,14 +50,13 @@ table_with_metadata read_json(host_span> sources, auto const dtypes_empty = std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); CUDF_EXPECTS(dtypes_empty, "user specified dtypes are not yet supported"); - CUDF_EXPECTS(not reader_opts.is_enabled_lines(), "JSON Lines format is not yet supported"); CUDF_EXPECTS(reader_opts.get_byte_range_offset() == 0 and reader_opts.get_byte_range_size() == 0, "specifying a byte range is not yet supported"); auto const buffer = ingest_raw_input(sources, reader_opts.get_compression()); auto data = host_span(reinterpret_cast(buffer.data()), buffer.size()); - return cudf::io::json::detail::parse_nested_json(data, stream, mr); + return cudf::io::json::detail::parse_nested_json(data, reader_opts, stream, mr); } } // namespace cudf::io::detail::json::experimental diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 03acd393594..4e930f86591 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -261,20 +262,22 @@ enum token_t : PdaTokenT { }; namespace detail { + +// TODO: return device_uvector instead of passing pre-allocated memory /** * @brief Identifies the stack context for each character from a JSON input. Specifically, we * identify brackets and braces outside of quoted fields (e.g., field names, strings). * At this stage, we do not perform bracket matching, i.e., we do not verify whether a closing * bracket would actually pop a the corresponding opening brace. 
* - * @param[in] d_json_in The string of input characters + * @param[in] json_in The string of input characters * @param[out] d_top_of_stack Will be populated with what-is-on-top-of-the-stack for any given input * character of \p d_json_in, where a '{' represents that the corresponding input character is * within the context of a struct, a '[' represents that it is within the context of an array, and a * '_' symbol that it is at the root of the JSON. * @param[in] stream The cuda stream to dispatch GPU kernels to */ -void get_stack_context(device_span d_json_in, +void get_stack_context(device_span json_in, SymbolT* d_top_of_stack, rmm::cuda_stream_view stream); @@ -282,29 +285,31 @@ void get_stack_context(device_span d_json_in, * @brief Parses the given JSON string and emits a sequence of tokens that demarcate relevant * sections from the input. * - * @param[in] d_json_in The JSON input - * @param[out] d_tokens Device memory to which the parsed tokens are written - * @param[out] d_tokens_indices Device memory to which the indices are written, where each index - * represents the offset within \p d_json_in that cause the input being written - * @param[out] d_num_written_tokens The total number of tokens that were parsed - * @param[in] stream The CUDA stream to which kernels are dispatched + * @param json_in The JSON input + * @param options Parsing options specifying the parsing behaviour + * @param stream The CUDA stream to which kernels are dispatched + * @param mr Optional, resource with which to allocate + * @return Pair of device vectors, where the first vector represents the token types and the second + * vector represents the index within the input corresponding to each token */ -void get_token_stream(device_span d_json_in, - PdaTokenT* d_tokens, - SymbolOffsetT* d_tokens_indices, - SymbolOffsetT* d_num_written_tokens, - rmm::cuda_stream_view stream); +std::pair, rmm::device_uvector> get_token_stream( + device_span json_in, + cudf::io::json_reader_options 
const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Parses the given JSON string and generates table from the given input. * * @param input The JSON input + * @param options Parsing options specifying the parsing behaviour * @param stream The CUDA stream to which kernels are dispatched - * @param mr Optional, resource with which to allocate. + * @param mr Optional, resource with which to allocate * @return The data parsed from the given JSON input */ table_with_metadata parse_nested_json( host_span input, + cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 5e293f8a750..3bc7bd89692 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -164,6 +165,8 @@ enum class symbol_group_id : PdaSymbolGroupIdT { COLON, /// Whitespace WHITE_SPACE, + /// Linebreak + LINE_BREAK, /// Other (any input symbol not assigned to one of the above symbol groups) OTHER, /// Total number of symbol groups amongst which to differentiate @@ -206,7 +209,7 @@ static __constant__ PdaSymbolGroupIdT tos_sg_to_pda_sgid[] = { static_cast(symbol_group_id::OTHER), static_cast(symbol_group_id::OTHER), static_cast(symbol_group_id::WHITE_SPACE), - static_cast(symbol_group_id::WHITE_SPACE), + static_cast(symbol_group_id::LINE_BREAK), static_cast(symbol_group_id::OTHER), static_cast(symbol_group_id::OTHER), static_cast(symbol_group_id::WHITE_SPACE), @@ -403,62 +406,64 @@ constexpr auto PD_NUM_STATES = static_cast(pda_state_t::PD_NUM_STATES); // The starting state of the pushdown automaton constexpr auto start_state = static_cast(pda_state_t::PD_BOV); -// Identity symbol to symbol 
group lookup table -std::vector> const pda_sgids{ - {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12}, {13}, {14}, - {15}, {16}, {17}, {18}, {19}, {20}, {21}, {22}, {23}, {24}, {25}, {26}, {27}, {28}, {29}}; - /** * @brief Getting the transition table */ -auto get_transition_table() +auto get_transition_table(bool newline_delimited_json) { + static_assert(static_cast(stack_symbol_group_id::STACK_ROOT) == 0); + static_assert(static_cast(stack_symbol_group_id::STACK_LIST) == 1); + static_assert(static_cast(stack_symbol_group_id::STACK_STRUCT) == 2); + + // In case of newline-delimited JSON, multiple newlines are ignored, similar to whitespace. + // That is, empty lines are ignored + auto const PD_ANL = newline_delimited_json ? PD_BOV : PD_PVL; std::array, PD_NUM_STATES> pda_tt; - // { [ } ] " \ , : space other + // { [ } ] " \ , : space newline other pda_tt[static_cast(pda_state_t::PD_BOV)] = { - PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON, - PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON, - PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON}; + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON}; pda_tt[static_cast(pda_state_t::PD_BOA)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_LON, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_ERR}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOA, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOA, 
PD_ERR}; pda_tt[static_cast(pda_state_t::PD_LON)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_LON, - PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_LON, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_LON}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_PVL, PD_LON}; pda_tt[static_cast(pda_state_t::PD_STR)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR}; + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; pda_tt[static_cast(pda_state_t::PD_SCE)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; pda_tt[static_cast(pda_state_t::PD_PVL)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_ERR, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_ERR}; + PD_ERR, 
PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ANL, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_PVL, PD_ERR, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_PVL, PD_ERR}; pda_tt[static_cast(pda_state_t::PD_BFN)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_BFN, PD_ERR}; pda_tt[static_cast(pda_state_t::PD_FLN)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; pda_tt[static_cast(pda_state_t::PD_FNE)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; pda_tt[static_cast(pda_state_t::PD_PFN)] = { - 
PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_PFN, PD_ERR}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_PFN, PD_PFN, PD_ERR}; pda_tt[static_cast(pda_state_t::PD_ERR)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR}; return pda_tt; } @@ -467,257 +472,423 @@ auto get_transition_table() */ auto get_translation_table() { + constexpr auto StructBegin = token_t::StructBegin; + constexpr auto StructEnd = token_t::StructEnd; + constexpr auto ListBegin = token_t::ListBegin; + constexpr auto ListEnd = token_t::ListEnd; + constexpr auto FieldNameBegin = token_t::FieldNameBegin; + constexpr auto FieldNameEnd = token_t::FieldNameEnd; + constexpr auto StringBegin = token_t::StringBegin; + constexpr auto StringEnd = token_t::StringEnd; + constexpr auto ValueBegin = token_t::ValueBegin; + constexpr auto ValueEnd = token_t::ValueEnd; + constexpr auto ErrorBegin = token_t::ErrorBegin; + std::array, NUM_PDA_SGIDS>, PD_NUM_STATES> pda_tlt; - pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{{token_t::StructBegin}, - {token_t::ListBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::StringBegin}, - {token_t::ErrorBegin}, - 
{token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {token_t::ValueBegin}, - {token_t::StructBegin}, - {token_t::ListBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::StringBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {token_t::ValueBegin}, - {token_t::StructBegin}, - {token_t::ListBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::StringBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {token_t::ValueBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_BOA)] = {{{token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::StructBegin}, - {token_t::ListBegin}, - {token_t::ErrorBegin}, - {token_t::ListEnd}, - {token_t::StringBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {token_t::ValueBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::StructEnd}, - {token_t::ErrorBegin}, - {token_t::FieldNameBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {token_t::ErrorBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_LON)] = {{{token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ValueEnd}, - {}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ValueEnd, token_t::ListEnd}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ValueEnd}, - {token_t::ErrorBegin}, - {token_t::ValueEnd}, - {}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ValueEnd, token_t::StructEnd}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - 
{token_t::ValueEnd}, - {token_t::ErrorBegin}, - {token_t::ValueEnd}, - {}}}; - pda_tlt[static_cast(pda_state_t::PD_STR)] = { - {{}, {}, {}, {}, {token_t::StringEnd}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {token_t::StringEnd}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {token_t::StringEnd}, {}, {}, {}, {}, {}}}; - pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {}, {}, {}, {}, {}, {}}}; - pda_tlt[static_cast(pda_state_t::PD_PVL)] = {{{token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ListEnd}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {token_t::ErrorBegin}, - {}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::StructEnd}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {token_t::ErrorBegin}, - {}, - {token_t::ErrorBegin}}}; - pda_tlt[static_cast(pda_state_t::PD_BFN)] = {{{token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::FieldNameBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {token_t::ErrorBegin}}}; - 
pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{{token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {}, - {}, - {}, - {token_t::FieldNameEnd}, - {}, - {}, - {}, - {}, - {}}}; - pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{{token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}}}; - pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{{token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {token_t::ErrorBegin}, - {}, - {}, - {token_t::ErrorBegin}}}; - 
pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, - {}, {}, {}, {}, {}, {}, {}, {}, {}, {}}}; + pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ValueBegin}, // OTHER + /*LIST*/ + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ValueBegin}, // OTHER + /*STRUCT*/ + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ValueBegin}}}; // OTHER + pda_tlt[static_cast(pda_state_t::PD_BOA)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ListEnd}, // CLOSING_BRACKET + {StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ValueBegin}, // OTHER + /*STRUCT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {StructEnd}, // CLOSING_BRACE + {ErrorBegin}, // 
CLOSING_BRACKET + {FieldNameBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}}}; // OTHER + pda_tlt[static_cast(pda_state_t::PD_LON)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + {ValueEnd}, // LINE_BREAK + {}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ValueEnd, ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ValueEnd}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + {ValueEnd}, // LINE_BREAK + {}, // OTHER + /*STRUCT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ValueEnd, StructEnd}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ValueEnd}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + {ValueEnd}, // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}, // OTHER + /*LIST*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}, // OTHER + /*STRUCT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}}}; // OTHER + + 
pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}, // OTHER + /*LIST*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}, // OTHER + /*STRUCT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_PVL)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {StructEnd}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_BFN)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + 
{ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {FieldNameBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {FieldNameEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + 
{ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}, // OTHER + /*LIST*/ + {}, // OPENING_BRACE + {}, // 
OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}, // OTHER + /*STRUCT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {}}}; // OTHER return pda_tlt; } @@ -790,13 +961,19 @@ void get_stack_context(device_span json_in, stream); } -// TODO: return pair of device_uvector instead of passing pre-allocated pointers. -void get_token_stream(device_span json_in, - PdaTokenT* d_tokens, - SymbolOffsetT* d_tokens_indices, - SymbolOffsetT* d_num_written_tokens, - rmm::cuda_stream_view stream) +std::pair, rmm::device_uvector> get_token_stream( + device_span json_in, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { + constexpr std::size_t single_item_count = 1ULL; + rmm::device_uvector tokens{json_in.size(), stream, mr}; + rmm::device_uvector tokens_indices{json_in.size(), stream, mr}; + rmm::device_scalar num_written_tokens{stream, mr}; + + auto const new_line_delimited_json = options.is_enabled_lines(); + // Memory holding the top-of-stack stack context for the input rmm::device_uvector stack_op_indices{json_in.size(), stream}; @@ -820,19 +997,29 @@ void get_token_stream(device_span json_in, tokenizer_pda::pda_state_t::PD_NUM_STATES)>; // Instantiating PDA transducer - ToTokenStreamFstT json_to_tokens_fst{tokenizer_pda::pda_sgids, - tokenizer_pda::get_transition_table(), + std::vector> pda_sgid_identity{tokenizer_pda::NUM_PDA_SGIDS}; + std::generate(std::begin(pda_sgid_identity), + std::end(pda_sgid_identity), + [i = char{0}]() mutable { return std::vector{i++}; }); + ToTokenStreamFstT json_to_tokens_fst{pda_sgid_identity, + tokenizer_pda::get_transition_table(new_line_delimited_json), tokenizer_pda::get_translation_table(), stream}; // 
Perform a PDA-transducer pass json_to_tokens_fst.Transduce(pda_sgids.begin(), static_cast(json_in.size()), - d_tokens, - d_tokens_indices, - d_num_written_tokens, + tokens.data(), + tokens_indices.data(), + num_written_tokens.data(), tokenizer_pda::start_state, stream); + + auto num_total_tokens = num_written_tokens.value(stream); + tokens.resize(num_total_tokens, stream); + tokens_indices.resize(num_total_tokens, stream); + + return std::make_pair(std::move(tokens), std::move(tokens_indices)); } /** @@ -843,34 +1030,30 @@ void get_token_stream(device_span json_in, * first node encountered in \p input * @param[in] input The JSON input in host memory * @param[in] d_input The JSON input in device memory + * @param[in] options Parsing options specifying the parsing behaviour * @param[in] stream The CUDA stream to which kernels are dispatched + * @param[in] mr Optional, resource with which to allocate * @return The columnar representation of the data from the given JSON input */ void make_json_column(json_column& root_column, std::stack& current_data_path, host_span input, device_span d_input, - rmm::cuda_stream_view stream) + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { // Default name for a list's child column std::string const list_child_name = "element"; - constexpr std::size_t single_item = 1; - hostdevice_vector tokens_gpu{input.size(), stream}; - hostdevice_vector token_indices_gpu{input.size(), stream}; - hostdevice_vector num_tokens_out{single_item, stream}; - // Parse the JSON and get the token stream - get_token_stream(d_input, - tokens_gpu.device_ptr(), - token_indices_gpu.device_ptr(), - num_tokens_out.device_ptr(), - stream); + const auto [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr); // Copy the JSON tokens to the host - token_indices_gpu.device_to_host(stream); - tokens_gpu.device_to_host(stream); - 
num_tokens_out.device_to_host(stream); + thrust::host_vector tokens = + cudf::detail::make_host_vector_async(d_tokens_gpu, stream); + thrust::host_vector token_indices_gpu = + cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); // Make sure tokens have been copied to the host stream.synchronize(); @@ -897,15 +1080,6 @@ void make_json_column(json_column& root_column, }; }; - // Whether this token is a beginning-of-list or beginning-of-struct token - auto is_nested_token = [](PdaTokenT const token) { - switch (token) { - case token_t::StructBegin: - case token_t::ListBegin: return true; - default: return false; - }; - }; - // Skips the quote char if the token is a beginning-of-string or beginning-of-field-name token auto get_token_index = [](PdaTokenT const token, SymbolOffsetT const token_index) { constexpr SymbolOffsetT skip_quote_char = 1; @@ -1061,62 +1235,16 @@ void make_json_column(json_column& root_column, std::size_t offset = 0; // Giving names to magic constants - constexpr uint32_t row_offset_zero = 0; constexpr uint32_t zero_child_count = 0; - //-------------------------------------------------------------------------------- - // INITIALIZE JSON ROOT NODE - //-------------------------------------------------------------------------------- - // The JSON root may only be a struct, list, string, or value node - CUDF_EXPECTS(num_tokens_out[0] > 0, "Empty JSON input not supported"); - CUDF_EXPECTS(is_valid_root_token(tokens_gpu[offset]), "Invalid beginning of JSON document"); - - // The JSON root is either a struct or list - if (is_nested_token(tokens_gpu[offset])) { - // Initialize the root column and append this row to it - root_column.append_row(row_offset_zero, - token_to_column_type(tokens_gpu[offset]), - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), - 0); - - // Push the root node onto the stack for the data path - current_data_path.push({&root_column, 
row_offset_zero, nullptr, zero_child_count}); - - // Continue with the next token from the token stream - offset++; - } - // The JSON is a simple scalar value -> create simple table and return - else { - constexpr SymbolOffsetT max_tokens_for_scalar_value = 2; - CUDF_EXPECTS(num_tokens_out[0] <= max_tokens_for_scalar_value, - "Invalid JSON format. Expected just a scalar value."); - - // If this isn't the only token, verify the subsequent token is the correct end-of-* partner - if ((offset + 1) < num_tokens_out[0]) { - CUDF_EXPECTS(tokens_gpu[offset + 1] == end_of_partner(tokens_gpu[offset]), - "Invalid JSON token sequence"); - } + CUDF_EXPECTS(tokens.size() == token_indices_gpu.size(), + "Unexpected mismatch in number of token types and token indices"); + CUDF_EXPECTS(tokens.size() > 0, "Empty JSON input not supported"); - // The offset to the first symbol from the JSON input associated with the current token - auto const& token_begin_offset = get_token_index(tokens_gpu[offset], token_indices_gpu[offset]); - - // The offset to one past the last symbol associated with the current token - // Literals without trailing space are missing the corresponding end-of-* counterpart. - auto const& token_end_offset = - (offset + 1 < num_tokens_out[0]) - ? 
get_token_index(tokens_gpu[offset + 1], token_indices_gpu[offset + 1]) - : input.size(); - - root_column.append_row(row_offset_zero, - json_col_t::StringColumn, - token_begin_offset, - token_end_offset, - zero_child_count); - return; - } + // The JSON root may only be a struct, list, string, or value node + CUDF_EXPECTS(is_valid_root_token(tokens[offset]), "Invalid beginning of JSON document"); - while (offset < num_tokens_out[0]) { + while (offset < tokens.size()) { // Verify there's at least the JSON root node left on the stack to which we can append data CUDF_EXPECTS(current_data_path.size() > 0, "Invalid JSON structure"); @@ -1126,7 +1254,7 @@ void make_json_column(json_column& root_column, "Invalid JSON structure"); // The token we're currently parsing - auto const& token = tokens_gpu[offset]; + auto const& token = tokens[offset]; #ifdef NJP_DEBUG_PRINT std::cout << "[" << token_to_string(token) << "]\n"; @@ -1149,9 +1277,9 @@ void make_json_column(json_column& root_column, // Add this struct node to the current column selected_col->append_row(target_row_index, - token_to_column_type(tokens_gpu[offset]), - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), + token_to_column_type(tokens[offset]), + get_token_index(tokens[offset], token_indices_gpu[offset]), + get_token_index(tokens[offset], token_indices_gpu[offset]), zero_child_count); } @@ -1166,7 +1294,7 @@ void make_json_column(json_column& root_column, // Update row to account for string offset update_row(current_data_path.top().column, current_data_path.top().row_index, - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), + get_token_index(tokens[offset], token_indices_gpu[offset]), current_data_path.top().num_children); // Pop struct from the path stack @@ -1187,9 +1315,9 @@ void make_json_column(json_column& root_column, // Add this struct node to the current column selected_col->append_row(target_row_index, - 
token_to_column_type(tokens_gpu[offset]), - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), + token_to_column_type(tokens[offset]), + get_token_index(tokens[offset], token_indices_gpu[offset]), + get_token_index(tokens[offset], token_indices_gpu[offset]), zero_child_count); } @@ -1204,7 +1332,7 @@ void make_json_column(json_column& root_column, // Update row to account for string offset update_row(current_data_path.top().column, current_data_path.top().row_index, - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]), + get_token_index(tokens[offset], token_indices_gpu[offset]), current_data_path.top().num_children); // Pop list from the path stack @@ -1215,6 +1343,7 @@ void make_json_column(json_column& root_column, else if (token == token_t::ErrorBegin) { #ifdef NJP_DEBUG_PRINT std::cout << "[ErrorBegin]\n"; + std::cout << "@" << get_token_index(tokens[offset], token_indices_gpu[offset]); #endif CUDF_FAIL("Parser encountered an invalid format."); } @@ -1223,16 +1352,15 @@ void make_json_column(json_column& root_column, else if (token == token_t::FieldNameBegin or token == token_t::StringBegin or token == token_t::ValueBegin) { // Verify that this token has the right successor to build a correct (being, end) token pair - CUDF_EXPECTS((offset + 1) < num_tokens_out[0], "Invalid JSON token sequence"); - CUDF_EXPECTS(tokens_gpu[offset + 1] == end_of_partner(token), "Invalid JSON token sequence"); + CUDF_EXPECTS((offset + 1) < tokens.size(), "Invalid JSON token sequence"); + CUDF_EXPECTS(tokens[offset + 1] == end_of_partner(token), "Invalid JSON token sequence"); // The offset to the first symbol from the JSON input associated with the current token - auto const& token_begin_offset = - get_token_index(tokens_gpu[offset], token_indices_gpu[offset]); + auto const& token_begin_offset = get_token_index(tokens[offset], token_indices_gpu[offset]); // The offset to one past the last 
symbol associated with the current token auto const& token_end_offset = - get_token_index(tokens_gpu[offset + 1], token_indices_gpu[offset + 1]); + get_token_index(tokens[offset + 1], token_indices_gpu[offset + 1]); // FieldNameBegin // For the current struct node in the tree, select the child column corresponding to this @@ -1371,26 +1499,52 @@ std::pair, std::vector> json_column_to } table_with_metadata parse_nested_json(host_span input, + cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + auto const new_line_delimited_json = options.is_enabled_lines(); + // Allocate device memory for the JSON input & copy over to device rmm::device_uvector d_input = cudf::detail::make_device_uvector_async(input, stream); // Get internal JSON column json_column root_column{}; std::stack data_path{}; - make_json_column(root_column, data_path, input, d_input, stream); + + constexpr uint32_t row_offset_zero = 0; + constexpr uint32_t token_begin_offset_zero = 0; + constexpr uint32_t token_end_offset_zero = 0; + constexpr uint32_t node_init_child_count_zero = 0; + + // We initialize the very root node and root column, which represent the JSON document being + // parsed. That root node is a list node and that root column is a list column. The column has the + // root node as its only row. The values parsed from the JSON input will be treated as follows: + // (1) For JSON lines: we expect to find a list of JSON values that all + // will be inserted into this root list column. (2) For regular JSON: we expect to have only a + // single value (list, struct, string, number, literal) that will be inserted into this root + // column. 
+ root_column.append_row( + row_offset_zero, json_col_t::ListColumn, token_begin_offset_zero, token_end_offset_zero, 1); + + // Push the root node onto the stack for the data path + data_path.push({&root_column, row_offset_zero, nullptr, node_init_child_count_zero}); + + make_json_column(root_column, data_path, input, d_input, options, stream, mr); + + // data_root refers to the root column of the data represented by the given JSON string + auto const& data_root = + new_line_delimited_json ? root_column : root_column.child_columns.begin()->second; // Verify that we were in fact given a list of structs (or in JSON speech: an array of objects) auto constexpr single_child_col_count = 1; - CUDF_EXPECTS(root_column.type == json_col_t::ListColumn and - root_column.child_columns.size() == single_child_col_count and - root_column.child_columns.begin()->second.type == json_col_t::StructColumn, + CUDF_EXPECTS(data_root.type == json_col_t::ListColumn and + data_root.child_columns.size() == single_child_col_count and + data_root.child_columns.begin()->second.type == json_col_t::StructColumn, "Currently the nested JSON parser only supports an array of (nested) objects"); // Slice off the root list column, which has only a single row that contains all the structs - auto const& root_struct_col = root_column.child_columns.begin()->second; + auto const& root_struct_col = data_root.child_columns.begin()->second; // Initialize meta data to be populated while recursing through the tree of columns std::vector> out_columns; diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 67f0542ace2..232aaa51ef3 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -941,4 +942,68 @@ TEST_F(JsonReaderTest, JsonExperimentalBasic) cudf::test::strings_column_wrapper({"1.1", "2.2"})); } +TEST_F(JsonReaderTest, JsonExperimentalLines) +{ + std::string const json_string = + R"({"a":"a0"} + 
{"a":"a1"} + {"a":"a2", "b":"b2"} + {"a":"a3", "c":"c3"} + {"a":"a4"})"; + + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.c_str(), json_string.size()}) + .lines(true); + + // Read test data via existing, non-nested JSON lines reader + cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options); + + // Read test data via new, nested JSON reader + json_lines_options.enable_experimental(true); + cudf::io::table_with_metadata new_reader_table = cudf::io::read_json(json_lines_options); + + // Verify that the data read via non-nested JSON lines reader matches the data read via nested + // JSON reader + CUDF_TEST_EXPECT_TABLES_EQUAL(current_reader_table.tbl->view(), new_reader_table.tbl->view()); +} + +TEST_F(JsonReaderTest, ExperimentalLinesNoOmissions) +{ + std::array const json_inputs + // single column + {R"({"a":"a0"} + {"a":"a1"} + {"a":"a2"} + {"a":"a3"} + {"a":"a4"})", + // single column, single row + R"({"a":"a0"})", + // single row + R"({"a":"a0", "b":"b0"})", + // two column, two rows + R"({"a":"a0", "b":"b0"} + {"a":"a1", "b":"b1"})"}; + + for (auto const& json_string : json_inputs) { + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.c_str(), json_string.size()}) + .lines(true); + + // Read test data via existing, non-nested JSON lines reader + cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options); + + // Read test data via new, nested JSON reader + json_lines_options.enable_experimental(true); + cudf::io::table_with_metadata new_reader_table = cudf::io::read_json(json_lines_options); + + // Verify that the data read via non-nested JSON lines reader matches the data read via nested + // JSON reader + 
CUDF_TEST_EXPECT_TABLES_EQUAL(current_reader_table.tbl->view(), new_reader_table.tbl->view()); + } +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index d426acf26f9..f0ececaf4eb 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -18,8 +18,10 @@ #include #include +#include #include #include +#include #include #include @@ -28,9 +30,6 @@ #include #include -#include -#include - #include namespace cuio_json = cudf::io::json; @@ -138,28 +137,27 @@ TEST_F(JsonTest, StackContext) using StackSymbolT = char; // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); + constexpr auto stream = cudf::default_stream_value; // Test input - std::string input = R"( [{)" - R"("category": "reference",)" - R"("index:": [4,12,42],)" - R"("author": "Nigel Rees",)" - R"("title": "[Sayings of the Century]",)" - R"("price": 8.95)" - R"(}, )" - R"({)" - R"("category": "reference",)" - R"("index": [4,{},null,{"a":[{ }, {}] } ],)" - R"("author": "Nigel Rees",)" - R"("title": "{}\\\"[], <=semantic-symbols-string\\\\",)" - R"("price": 8.95)" - R"(}] )"; + std::string const input = R"( [{)" + R"("category": "reference",)" + R"("index:": [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "[Sayings of the Century]",)" + R"("price": 8.95)" + R"(}, )" + R"({)" + R"("category": "reference",)" + R"("index": [4,{},null,{"a":[{ }, {}] } ],)" + R"("author": "Nigel Rees",)" + R"("title": "{}\\\"[], <=semantic-symbols-string\\\\",)" + R"("price": 8.95)" + R"(}] )"; // Prepare input & output buffers - rmm::device_uvector d_input(input.size(), stream_view); - hostdevice_vector stack_context(input.size(), stream_view); + rmm::device_uvector d_input(input.size(), stream); + hostdevice_vector stack_context(input.size(), stream); ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_input.data(), input.data(), @@ -168,13 +166,13 @@ TEST_F(JsonTest, 
StackContext) stream.value())); // Run algorithm - cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stream_view); + cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stream); // Copy back the results - stack_context.device_to_host(stream_view); + stack_context.device_to_host(stream); // Make sure we copied back the stack context - stream_view.synchronize(); + stream.synchronize(); std::vector golden_stack_context{ '_', '_', '_', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', @@ -204,15 +202,14 @@ TEST_F(JsonTest, StackContextUtf8) using StackSymbolT = char; // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); + constexpr auto stream = cudf::default_stream_value; // Test input - std::string input = R"([{"a":{"year":1882,"author": "Bharathi"}, {"a":"filip ʒakotɛ"}}])"; + std::string const input = R"([{"a":{"year":1882,"author": "Bharathi"}, {"a":"filip ʒakotɛ"}}])"; // Prepare input & output buffers - rmm::device_uvector d_input(input.size(), stream_view); - hostdevice_vector stack_context(input.size(), stream_view); + rmm::device_uvector d_input(input.size(), stream); + hostdevice_vector stack_context(input.size(), stream); ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_input.data(), input.data(), @@ -221,13 +218,13 @@ TEST_F(JsonTest, StackContextUtf8) stream.value())); // Run algorithm - cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stream_view); + cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stream); // Copy back the results - stack_context.device_to_host(stream_view); + stack_context.device_to_host(stream); // Make sure we copied back the stack context - stream_view.synchronize(); + stream.synchronize(); std::vector golden_stack_context{ '_', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', @@ -245,30 +242,30 @@ TEST_F(JsonTest, 
TokenStream) using cuio_json::SymbolOffsetT; using cuio_json::SymbolT; - constexpr std::size_t single_item = 1; - // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); + constexpr auto stream = cudf::default_stream_value; + + // Default parsing options + cudf::io::json_reader_options default_options{}; // Test input - std::string input = R"( [{)" - R"("category": "reference",)" - R"("index:": [4,12,42],)" - R"("author": "Nigel Rees",)" - R"("title": "[Sayings of the Century]",)" - R"("price": 8.95)" - R"(}, )" - R"({)" - R"("category": "reference",)" - R"("index": [4,{},null,{"a":[{ }, {}] } ],)" - R"("author": "Nigel Rees",)" - R"("title": "{}[], <=semantic-symbols-string",)" - R"("price": 8.95)" - R"(}] )"; + std::string const input = R"( [{)" + R"("category": "reference",)" + R"("index:": [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "[Sayings of the Century]",)" + R"("price": 8.95)" + R"(}, )" + R"({)" + R"("category": "reference",)" + R"("index": [4,{},null,{"a":[{ }, {}] } ],)" + R"("author": "Nigel Rees",)" + R"("title": "{}[], <=semantic-symbols-string",)" + R"("price": 8.95)" + R"(}] )"; // Prepare input & output buffers - rmm::device_uvector d_input(input.size(), stream_view); + rmm::device_uvector d_input(input.size(), stream); ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_input.data(), input.data(), @@ -276,24 +273,18 @@ TEST_F(JsonTest, TokenStream) cudaMemcpyHostToDevice, stream.value())); - hostdevice_vector tokens_gpu{input.size(), stream_view}; - hostdevice_vector token_indices_gpu{input.size(), stream_view}; - hostdevice_vector num_tokens_out{single_item, stream_view}; - // Parse the JSON and get the token stream - cuio_json::detail::get_token_stream(d_input, - tokens_gpu.device_ptr(), - token_indices_gpu.device_ptr(), - num_tokens_out.device_ptr(), - stream_view); + const auto [d_tokens_gpu, d_token_indices_gpu] = + cuio_json::detail::get_token_stream(d_input, 
default_options, stream); // Copy back the number of tokens that were written - num_tokens_out.device_to_host(stream_view); - tokens_gpu.device_to_host(stream_view); - token_indices_gpu.device_to_host(stream_view); + thrust::host_vector tokens_gpu = + cudf::detail::make_host_vector_async(d_tokens_gpu, stream); + thrust::host_vector token_indices_gpu = + cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); // Make sure we copied back all relevant data - stream_view.synchronize(); + stream.synchronize(); // Golden token stream sample using token_t = cuio_json::token_t; @@ -323,9 +314,10 @@ TEST_F(JsonTest, TokenStream) {267, token_t::StructEnd}, {268, token_t::ListEnd}}; // Verify the number of tokens matches - ASSERT_EQ(golden_token_stream.size(), num_tokens_out[0]); + ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size()); + ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size()); - for (std::size_t i = 0; i < num_tokens_out[0]; i++) { + for (std::size_t i = 0; i < tokens_gpu.size(); i++) { // Ensure the index the tokens are pointing to do match EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i; @@ -339,13 +331,15 @@ TEST_F(JsonTest, ExtractColumn) using cuio_json::SymbolT; // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); + constexpr auto stream = cudf::default_stream_value; - std::string input = R"( [{"a":0.0, "b":1.0}, {"a":0.1, "b":1.1}, {"a":0.2, "b":1.2}] )"; + // Default parsing options + cudf::io::json_reader_options default_options{}; + + std::string const input = R"( [{"a":0.0, "b":1.0}, {"a":0.1, "b":1.1}, {"a":0.2, "b":1.2}] )"; // Get the JSON's tree representation auto const cudf_table = cuio_json::detail::parse_nested_json( - cudf::host_span{input.data(), input.size()}, stream_view); + cudf::host_span{input.data(), input.size()}, default_options, stream); auto const expected_col_count = 2; auto const 
first_column_index = 0; @@ -363,11 +357,13 @@ TEST_F(JsonTest, ExtractColumn) TEST_F(JsonTest, UTF_JSON) { // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); + constexpr auto stream = cudf::default_stream_value; + + // Default parsing options + cudf::io::json_reader_options default_options{}; // Only ASCII string - std::string ascii_pass = R"([ + std::string const ascii_pass = R"([ {"a":1,"b":2,"c":[3], "d": {}}, {"a":1,"b":4.0,"c":[], "d": {"year":1882,"author": "Bharathi"}}, {"a":1,"b":6.0,"c":[5, 7], "d": null}, @@ -375,20 +371,20 @@ TEST_F(JsonTest, UTF_JSON) {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}])"; - CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(ascii_pass, stream_view)); + CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(ascii_pass, default_options, stream)); // utf-8 string that fails parsing. - std::string utf_failed = R"([ + std::string const utf_failed = R"([ {"a":1,"b":2,"c":[3], "d": {}}, {"a":1,"b":4.0,"c":[], "d": {"year":1882,"author": "Bharathi"}}, {"a":1,"b":6.0,"c":[5, 7], "d": null}, {"a":1,"b":8.0,"c":null, "d": {}}, {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "filip ʒakotɛ"}}])"; - CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(utf_failed, stream_view)); + CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(utf_failed, default_options, stream)); // utf-8 string that passes parsing. 
- std::string utf_pass = R"([ + std::string const utf_pass = R"([ {"a":1,"b":2,"c":[3], "d": {}}, {"a":1,"b":4.0,"c":[], "d": {"year":1882,"author": "Bharathi"}}, {"a":1,"b":6.0,"c":[5, 7], "d": null}, @@ -396,19 +392,21 @@ TEST_F(JsonTest, UTF_JSON) {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}, {"a":1,"b":NaN,"c":[null, null], "d": {"year": 2, "author": "filip ʒakotɛ"}}])"; - CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(utf_pass, stream_view)); + CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(utf_pass, default_options, stream)); } TEST_F(JsonTest, FromParquet) { using cuio_json::SymbolT; - std::string input = + std::string const input = R"([{"0":{},"1":[],"2":{}},{"1":[[""],[]],"2":{"2":""}},{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}])"; // Prepare cuda stream for data transfers & kernels - rmm::cuda_stream stream{}; - rmm::cuda_stream_view stream_view(stream); + constexpr auto stream = cudf::default_stream_value; + + // Default parsing options + cudf::io::json_reader_options default_options{}; // Binary parquet data containing the same data as the data represented by the JSON string. // We could add a dataset to include this file, but we don't want tests in cudf to have data. 
@@ -496,7 +494,7 @@ TEST_F(JsonTest, FromParquet) // Read in the data via the JSON parser auto const cudf_table = cuio_json::detail::parse_nested_json( - cudf::host_span{input.data(), input.size()}, stream_view); + cudf::host_span{input.data(), input.size()}, default_options, stream); // Verify that the data read via parquet matches the data read via JSON CUDF_TEST_EXPECT_TABLES_EQUAL(cudf_table.tbl->view(), result.tbl->view()); diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 368015cf563..f3d9180d44d 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest import cudf @@ -575,12 +576,6 @@ def test_default_float_bitwidth(default_float_bitwidth): assert df["b"].dtype == np.dtype(f"f{default_float_bitwidth//8}") -def test_json_experimental(): - # should raise an exception, for now - with pytest.raises(RuntimeError): - cudf.read_json("", engine="cudf_experimental") - - def test_json_nested_basic(tmpdir): fname = tmpdir.mkdir("gdf_json").join("tmp_json_nested_basic") data = { @@ -594,3 +589,32 @@ def test_json_nested_basic(tmpdir): pdf = pd.read_json(fname, orient="records") assert_eq(pdf, df) + + +@pytest.mark.parametrize( + "data", + [ + { + "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}], + "c2": [["l11", "l21"], ["l12", "l22"]], + }, + # Essential test case to handle omissions + { + "c1": [{"f2": "sf21"}, {"f1": "sf12"}], + "c2": [["l11", "l21"], []], + }, + ], +) +def test_json_nested_lines(data): + bytes = BytesIO() + pdf = pd.DataFrame(data) + pdf.to_json(bytes, orient="records", lines=True) + bytes.seek(0) + df = cudf.read_json( + bytes, engine="cudf_experimental", orient="records", lines=True + ) + bytes.seek(0) + pdf = pd.read_json(bytes, orient="records", lines=True) + # In the second test-case: + # Pandas omits "f1" in first row, so we have to enforce a common schema + 
assert df.to_arrow().equals(pa.Table.from_pandas(pdf))