diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 90dc898c552..1838459b16d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -328,6 +328,7 @@ add_library( src/io/csv/writer_impl.cu src/io/functions.cpp src/io/json/json_gpu.cu + src/io/json/nested_json_gpu.cu src/io/json/reader_impl.cu src/io/json/experimental/read_json.cpp src/io/orc/aggregate_orc_metadata.cpp diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index c5033868925..04a8418dcc2 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -142,7 +142,8 @@ class SingleSymbolSmemLUT { constexpr CUDF_HOST_DEVICE int32_t operator()(SymbolT const symbol) const { // Look up the symbol group for given symbol - return temp_storage.sym_to_sgid[min(symbol, num_valid_entries - 1)]; + return temp_storage + .sym_to_sgid[min(static_cast(symbol), num_valid_entries - 1U)]; } }; @@ -170,19 +171,21 @@ class TransitionTable { ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; }; - template ()})>> - static void InitDeviceTransitionTable(hostdevice_vector& transition_table_init, - std::vector> const& translation_table, - rmm::cuda_stream_view stream) + template + static void InitDeviceTransitionTable( + hostdevice_vector& transition_table_init, + std::array, MAX_NUM_STATES> const& translation_table, + rmm::cuda_stream_view stream) { // translation_table[state][symbol] -> new state for (std::size_t state = 0; state < translation_table.size(); ++state) { for (std::size_t symbol = 0; symbol < translation_table[state].size(); ++symbol) { CUDF_EXPECTS( - translation_table[state][symbol] <= std::numeric_limits::max(), + static_cast(translation_table[state][symbol]) <= + std::numeric_limits::max(), "Target state index value exceeds value representable by the transition table's type"); transition_table_init.host_ptr()->transitions[symbol * MAX_NUM_STATES + state] = - translation_table[state][symbol]; + static_cast(translation_table[state][symbol]); } } @@ 
-319,7 +322,8 @@ class TransducerLookupTable { */ static void InitDeviceTranslationTable( hostdevice_vector& translation_table_init, - std::vector>> const& translation_table, + std::array, MAX_NUM_SYMBOLS>, MAX_NUM_STATES> const& + translation_table, rmm::cuda_stream_view stream) { std::vector out_symbols; @@ -476,8 +480,8 @@ class Dfa { */ template Dfa(SymbolGroupIdItT const& symbol_vec, - std::vector> const& tt_vec, - std::vector>> const& out_tt_vec, + std::array, NUM_STATES> const& tt_vec, + std::array, NUM_SYMBOLS>, NUM_STATES> const& out_tt_vec, cudaStream_t stream) { constexpr std::size_t single_item = 1; diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp new file mode 100644 index 00000000000..3f7d73fb931 --- /dev/null +++ b/cpp/src/io/json/nested_json.hpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +namespace cudf::io::json { + +/// Type used to represent the atomic symbol type used within the finite-state machine +using SymbolT = char; + +/// Type used to represent the stack alphabet (i.e.: empty-stack, struct, list) +using StackSymbolT = char; + +/// Type used to index into the symbols within the JSON input +using SymbolOffsetT = uint32_t; + +/// Type large enough to support indexing up to max nesting level (must be signed) +using StackLevelT = int8_t; + +/// Type used to represent a symbol group id of the input alphabet in the pushdown automaton +using PdaInputSymbolGroupIdT = char; + +/// Type used to represent a symbol group id of the stack alphabet in the pushdown automaton +using PdaStackSymbolGroupIdT = char; + +/// Type used to represent a (input-symbol, stack-symbol)-tuple in stack-symbol-major order +using PdaSymbolGroupIdT = char; + +/// Type being emitted by the pushdown automaton transducer +using PdaTokenT = char; + +/** + * @brief Tokens emitted while parsing a JSON input + */ +enum token_t : PdaTokenT { + /// Beginning-of-struct token (on encounter of semantic '{') + StructBegin, + /// End-of-struct token (on encounter of semantic '}') + StructEnd, + /// Beginning-of-list token (on encounter of semantic '[') + ListBegin, + /// End-of-list token (on encounter of semantic ']') + ListEnd, + /// Beginning-of-field-name token (on encounter of first quote) + FieldNameBegin, + /// End-of-field-name token (on encounter of a field name's second quote) + FieldNameEnd, + /// Beginning-of-string-value token (on encounter of the string's first quote) + StringBegin, + /// End-of-string token (on encounter of a string's second quote) + StringEnd, + /// Beginning-of-value token (first character of literal or numeric) + ValueBegin, + /// Post-value token (first character after a literal or numeric string) + ValueEnd, + /// Beginning-of-error token (on first encounter of a parsing error) + ErrorBegin, + /// Total 
number of tokens + NUM_TOKENS +}; + +namespace detail { +/** + * @brief Identifies the stack context for each character from a JSON input. Specifically, we + * identify brackets and braces outside of quoted fields (e.g., field names, strings). + * At this stage, we do not perform bracket matching, i.e., we do not verify whether a closing + * bracket would actually pop the corresponding opening brace. + * + * @param[in] d_json_in The string of input characters + * @param[out] d_top_of_stack Will be populated with what-is-on-top-of-the-stack for any given input + * character of \p d_json_in, where a '{' represents that the corresponding input character is + * within the context of a struct, a '[' represents that it is within the context of an array, and a + * '_' symbol that it is at the root of the JSON. + * @param[in] stream The CUDA stream to dispatch GPU kernels to + */ +void get_stack_context(device_span d_json_in, + SymbolT* d_top_of_stack, + rmm::cuda_stream_view stream); + +/** + * @brief Parses the given JSON string and emits a sequence of tokens that demarcate relevant + * sections from the input. 
+ * + * @param[in] d_json_in The JSON input + * @param[out] d_tokens Device memory to which the parsed tokens are written + * @param[out] d_tokens_indices Device memory to which the indices are written, where each index + * represents the offset within \p d_json_in that caused the token to be written + * @param[out] d_num_written_tokens The total number of tokens that were parsed + * @param[in] stream The CUDA stream to which kernels are dispatched + */ +void get_token_stream(device_span d_json_in, + PdaTokenT* d_tokens, + SymbolOffsetT* d_tokens_indices, + SymbolOffsetT* d_num_written_tokens, + rmm::cuda_stream_view stream); +} // namespace detail + +} // namespace cudf::io::json diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu new file mode 100644 index 00000000000..b8e05054e11 --- /dev/null +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -0,0 +1,801 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nested_json.hpp" + +#include +#include +#include + +#include +#include + +#include + +namespace cudf::io::json { + +// JSON to stack operator DFA (Deterministic Finite Automaton) +namespace to_stack_op { + +// Type used to represent the target state in the transition table +using StateT = char; + +/** + * @brief Definition of the DFA's states + */ +enum class dfa_states : StateT { + // The active state while outside of a string. 
When encountering an opening bracket or curly + // brace, we push it onto the stack. When encountering a closing bracket or brace, we pop from the + // stack. + TT_OOS = 0U, + + // The active state while within a string (e.g., field name or a string value). We do not push or + // pop from the stack while in this state. + TT_STR, + + // The active state after encountering an escape symbol (e.g., '\'), while in the TT_STR state. + TT_ESC, + + // Total number of states + TT_NUM_STATES +}; + +// Aliases for readability of the transition table +constexpr auto TT_OOS = dfa_states::TT_OOS; +constexpr auto TT_STR = dfa_states::TT_STR; +constexpr auto TT_ESC = dfa_states::TT_ESC; + +/** + * @brief Definition of the symbol groups + */ +enum class dfa_symbol_group_id : uint8_t { + OPENING_BRACE, ///< Opening brace SG: { + OPENING_BRACKET, ///< Opening bracket SG: [ + CLOSING_BRACE, ///< Closing brace SG: } + CLOSING_BRACKET, ///< Closing bracket SG: ] + QUOTE_CHAR, ///< Quote character SG: " + ESCAPE_CHAR, ///< Escape character SG: '\' + OTHER_SYMBOLS, ///< SG implicitly matching all other characters + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; + +constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); +constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); + +// The i-th string representing all the characters of a symbol group +std::array const symbol_groups{ + {{"{"}, {"["}, {"}"}, {"]"}, {"\""}, {"\\"}}}; + +// Transition table +std::array, TT_NUM_STATES> const transition_table{ + {/* IN_STATE { [ } ] " \ OTHER */ + /* TT_OOS */ {{TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS}}, + /* TT_STR */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_STR}}, + /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}}; + +// Translation table (i.e., for each transition, what are the symbols that we output) +std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const translation_table{ + {/* IN_STATE { 
[ } ] " \ OTHER */ + /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}}, + /* TT_STR */ {{{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}, + /* TT_ESC */ {{{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}}}; + +// The DFA's starting state +constexpr auto start_state = static_cast(TT_OOS); +} // namespace to_stack_op + +// JSON tokenizer pushdown automaton +namespace tokenizer_pda { + +// Type used to represent the target state in the transition table +using StateT = char; + +/** + * @brief Symbol groups for the input alphabet for the pushdown automaton + */ +enum class symbol_group_id : PdaSymbolGroupIdT { + /// Opening brace + OPENING_BRACE, + /// Opening bracket + OPENING_BRACKET, + /// Closing brace + CLOSING_BRACE, + /// Closing bracket + CLOSING_BRACKET, + /// Quote + QUOTE, + /// Escape + ESCAPE, + /// Comma + COMMA, + /// Colon + COLON, + /// Whitespace + WHITE_SPACE, + /// Other (any input symbol not assigned to one of the above symbol groups) + OTHER, + /// Total number of symbol groups amongst which to differentiate + NUM_PDA_INPUT_SGS +}; + +/** + * @brief Symbols in the stack alphabet + */ +enum class stack_symbol_group_id : PdaStackSymbolGroupIdT { + /// Symbol representing that we're at the JSON root (nesting level 0) + STACK_ROOT, + + /// Symbol representing that we're currently within a list object + STACK_LIST, + + /// Symbol representing that we're currently within a struct object + STACK_STRUCT, + + /// Total number of symbols in the stack alphabet + NUM_STACK_SGS +}; +constexpr auto NUM_PDA_INPUT_SGS = + static_cast(symbol_group_id::NUM_PDA_INPUT_SGS); +constexpr auto NUM_STACK_SGS = + static_cast(stack_symbol_group_id::NUM_STACK_SGS); + +/// Total number of symbol groups to differentiate amongst (stack alphabet * input alphabet) +constexpr PdaSymbolGroupIdT NUM_PDA_SGIDS = NUM_PDA_INPUT_SGS * NUM_STACK_SGS; + +/// Mapping a input symbol to the symbol group id +static __constant__ PdaSymbolGroupIdT tos_sg_to_pda_sgid[] = { + 
static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::WHITE_SPACE), + static_cast(symbol_group_id::WHITE_SPACE), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::WHITE_SPACE), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::WHITE_SPACE), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::QUOTE), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::COMMA), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + 
static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::COLON), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OPENING_BRACKET), + static_cast(symbol_group_id::ESCAPE), + static_cast(symbol_group_id::CLOSING_BRACKET), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + 
static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::OPENING_BRACE), + static_cast(symbol_group_id::OTHER), + static_cast(symbol_group_id::CLOSING_BRACE), + static_cast(symbol_group_id::OTHER)}; + +/** + * @brief Maps a (top-of-stack symbol, input symbol)-pair to a symbol group id of the deterministic + * visibly pushdown automaton (DVPA) + */ +struct PdaSymbolToSymbolGroupId { + template + __device__ __forceinline__ PdaSymbolGroupIdT + operator()(thrust::tuple symbol_pair) + { + // The symbol read from the input + auto symbol = thrust::get<0>(symbol_pair); + + // The stack symbol (i.e., what is on top of the stack at the time the input symbol was read) + // I.e., whether we're reading in something within a struct, a list, or the JSON root + auto stack_symbol = thrust::get<1>(symbol_pair); + + // The stack symbol offset: '_' is the root group (0), '[' is the list group (1), '{' is the + // struct group (2) + int32_t stack_idx = static_cast( + (stack_symbol == '_') ? stack_symbol_group_id::STACK_ROOT + : ((stack_symbol == '[') ? 
stack_symbol_group_id::STACK_LIST + : stack_symbol_group_id::STACK_STRUCT)); + + // The relative symbol group id of the current input symbol + constexpr auto pda_sgid_lookup_size = + static_cast(sizeof(tos_sg_to_pda_sgid) / sizeof(tos_sg_to_pda_sgid[0])); + PdaSymbolGroupIdT symbol_gid = + tos_sg_to_pda_sgid[min(static_cast(symbol), pda_sgid_lookup_size - 1)]; + return stack_idx * static_cast(symbol_group_id::NUM_PDA_INPUT_SGS) + + symbol_gid; + } +}; + +// The states defined by the pushdown automaton +enum class pda_state_t : StateT { + // Beginning of value + PD_BOV, + // Beginning of array + PD_BOA, + // Literal or number + PD_LON, + // String + PD_STR, + // After escape char when within string + PD_SCE, + // After having parsed a value + PD_PVL, + // Before the next field name + PD_BFN, + // Field name + PD_FLN, + // After escape char when within field name + PD_FNE, + // After a field name inside a struct + PD_PFN, + // Error state (trap state) + PD_ERR, + // Total number of PDA states + PD_NUM_STATES +}; + +// Aliases for readability of the transition table +constexpr auto PD_BOV = pda_state_t::PD_BOV; +constexpr auto PD_BOA = pda_state_t::PD_BOA; +constexpr auto PD_LON = pda_state_t::PD_LON; +constexpr auto PD_STR = pda_state_t::PD_STR; +constexpr auto PD_SCE = pda_state_t::PD_SCE; +constexpr auto PD_PVL = pda_state_t::PD_PVL; +constexpr auto PD_BFN = pda_state_t::PD_BFN; +constexpr auto PD_FLN = pda_state_t::PD_FLN; +constexpr auto PD_FNE = pda_state_t::PD_FNE; +constexpr auto PD_PFN = pda_state_t::PD_PFN; +constexpr auto PD_ERR = pda_state_t::PD_ERR; + +constexpr auto PD_NUM_STATES = static_cast(pda_state_t::PD_NUM_STATES); + +// The starting state of the pushdown automaton +constexpr auto start_state = static_cast(pda_state_t::PD_BOV); + +// Identity symbol to symbol group lookup table +std::vector> const pda_sgids{ + {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12}, {13}, {14}, + {15}, {16}, {17}, {18}, {19}, {20}, {21}, {22}, {23}, 
{24}, {25}, {26}, {27}, {28}, {29}}; + +/** + * @brief Getting the transition table + */ +auto get_transition_table() +{ + std::array, PD_NUM_STATES> pda_tt; + // { [ } ] " \ , : space other + pda_tt[static_cast(pda_state_t::PD_BOV)] = { + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON}; + pda_tt[static_cast(pda_state_t::PD_BOA)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_LON)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_LON}; + pda_tt[static_cast(pda_state_t::PD_STR)] = { + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR}; + pda_tt[static_cast(pda_state_t::PD_SCE)] = { + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + pda_tt[static_cast(pda_state_t::PD_PVL)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_ERR, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_BFN)] = 
{ + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_FLN)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + pda_tt[static_cast(pda_state_t::PD_FNE)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + pda_tt[static_cast(pda_state_t::PD_PFN)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_PFN, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_ERR)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR}; + return pda_tt; +} + +/** + * @brief Getting the translation table + */ +auto get_translation_table() +{ + std::array, NUM_PDA_SGIDS>, PD_NUM_STATES> pda_tlt; + pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{{token_t::StructBegin}, + {token_t::ListBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::StringBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {}, + {token_t::ValueBegin}, + {token_t::StructBegin}, + {token_t::ListBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::StringBegin}, + 
{token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {}, + {token_t::ValueBegin}, + {token_t::StructBegin}, + {token_t::ListBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::StringBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {}, + {token_t::ValueBegin}}}; + pda_tlt[static_cast(pda_state_t::PD_BOA)] = {{{token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::StructBegin}, + {token_t::ListBegin}, + {token_t::ErrorBegin}, + {token_t::ListEnd}, + {token_t::StringBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {}, + {token_t::ValueBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::StructEnd}, + {token_t::ErrorBegin}, + {token_t::FieldNameBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {}, + {token_t::ErrorBegin}}}; + pda_tlt[static_cast(pda_state_t::PD_LON)] = {{{token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ValueEnd}, + {}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ValueEnd, token_t::ListEnd}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ValueEnd}, + {token_t::ErrorBegin}, + {token_t::ValueEnd}, + {}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ValueEnd, token_t::StructEnd}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ValueEnd}, + {token_t::ErrorBegin}, + {token_t::ValueEnd}, + {}}}; + pda_tlt[static_cast(pda_state_t::PD_STR)] = { + {{}, {}, {}, {}, {token_t::StringEnd}, {}, {}, {}, {}, {}, + {}, {}, {}, {}, 
{token_t::StringEnd}, {}, {}, {}, {}, {}, + {}, {}, {}, {}, {token_t::StringEnd}, {}, {}, {}, {}, {}}}; + pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, + {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, + {}, {}, {}, {}, {}, {}, {}, {}, {}, {}}}; + pda_tlt[static_cast(pda_state_t::PD_PVL)] = {{{token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ListEnd}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {}, + {token_t::ErrorBegin}, + {}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::StructEnd}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {}, + {token_t::ErrorBegin}, + {}, + {token_t::ErrorBegin}}}; + pda_tlt[static_cast(pda_state_t::PD_BFN)] = {{{token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::FieldNameBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {}, + {token_t::ErrorBegin}}}; + pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{{token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + 
{token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {}, + {}, + {}, + {}, + {token_t::FieldNameEnd}, + {}, + {}, + {}, + {}, + {}}}; + pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{{token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}}}; + pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{{token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {token_t::ErrorBegin}, + {}, + {}, + {token_t::ErrorBegin}}}; + pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, + {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, + {}, {}, {}, {}, {}, {}, {}, {}, {}, {}}}; + return pda_tlt; +} + +} // namespace 
tokenizer_pda + +/** + * @brief Function object used to filter for brackets and braces that represent push and pop + * operations + */ +struct JSONToStackOp { + template + constexpr CUDF_HOST_DEVICE fst::stack_op_type operator()(StackSymbolT const& stack_symbol) const + { + return (stack_symbol == '{' || stack_symbol == '[') ? fst::stack_op_type::PUSH + : (stack_symbol == '}' || stack_symbol == ']') ? fst::stack_op_type::POP + : fst::stack_op_type::READ; + } +}; + +namespace detail { + +void get_stack_context(device_span json_in, + SymbolT* d_top_of_stack, + rmm::cuda_stream_view stream) +{ + constexpr std::size_t single_item = 1; + + // Symbol representing the JSON-root (i.e., we're at nesting level '0') + constexpr StackSymbolT root_symbol = '_'; + // This can be any stack symbol from the stack alphabet that does not push onto stack + constexpr StackSymbolT read_symbol = 'x'; + + // Number of stack operations in the input (i.e., number of '{', '}', '[', ']' outside of quotes) + hostdevice_vector num_stack_ops(single_item, stream); + + // Sequence of stack symbols and their position in the original input (sparse representation) + rmm::device_uvector stack_ops{json_in.size(), stream}; + rmm::device_uvector stack_op_indices{json_in.size(), stream}; + + // Prepare finite-state transducer that only selects '{', '}', '[', ']' outside of quotes + using ToStackOpFstT = + cudf::io::fst::detail::Dfa( + to_stack_op::dfa_symbol_group_id::NUM_SYMBOL_GROUPS), + static_cast(to_stack_op::dfa_states::TT_NUM_STATES)>; + ToStackOpFstT json_to_stack_ops_fst{to_stack_op::symbol_groups, + to_stack_op::transition_table, + to_stack_op::translation_table, + stream}; + + // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end + // of structs/lists + json_to_stack_ops_fst.Transduce(json_in.begin(), + static_cast(json_in.size()), + stack_ops.data(), + stack_op_indices.data(), + num_stack_ops.device_ptr(), + to_stack_op::start_state, + stream); + + // stack 
operations with indices are converted to top of the stack for each character in the input + fst::sparse_stack_op_to_top_of_stack( + stack_ops.data(), + device_span{stack_op_indices.data(), stack_op_indices.size()}, + JSONToStackOp{}, + d_top_of_stack, + root_symbol, + read_symbol, + json_in.size(), + stream); +} + +// TODO: return pair of device_uvector instead of passing pre-allocated pointers. +void get_token_stream(device_span json_in, + PdaTokenT* d_tokens, + SymbolOffsetT* d_tokens_indices, + SymbolOffsetT* d_num_written_tokens, + rmm::cuda_stream_view stream) +{ + // Memory holding the top-of-stack stack context for the input + rmm::device_uvector stack_op_indices{json_in.size(), stream}; + + // Identify what is the stack context for each input character (is it: JSON-root, struct, or list) + get_stack_context(json_in, stack_op_indices.data(), stream); + + // Prepare for PDA transducer pass, merging input symbols with stack symbols + rmm::device_uvector pda_sgids{json_in.size(), stream}; + auto zip_in = thrust::make_zip_iterator(json_in.data(), stack_op_indices.data()); + thrust::transform(rmm::exec_policy(stream), + zip_in, + zip_in + json_in.size(), + pda_sgids.data(), + tokenizer_pda::PdaSymbolToSymbolGroupId{}); + + // PDA transducer alias + using ToTokenStreamFstT = + cudf::io::fst::detail::Dfa( + tokenizer_pda::pda_state_t::PD_NUM_STATES)>; + + // Instantiating PDA transducer + ToTokenStreamFstT json_to_tokens_fst{tokenizer_pda::pda_sgids, + tokenizer_pda::get_transition_table(), + tokenizer_pda::get_translation_table(), + stream}; + + // Perform a PDA-transducer pass + json_to_tokens_fst.Transduce(pda_sgids.begin(), + static_cast(json_in.size()), + d_tokens, + d_tokens_indices, + d_num_written_tokens, + tokenizer_pda::start_state, + stream); +} + +} // namespace detail + +} // namespace cudf::io::json diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4a3eb1b9aef..be610d33b1b 100644 --- a/cpp/tests/CMakeLists.txt +++ 
b/cpp/tests/CMakeLists.txt @@ -222,6 +222,7 @@ ConfigureTest(FILE_IO_TEST io/file_io_test.cpp) ConfigureTest(ORC_TEST io/orc_test.cpp) ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp) +ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cu) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) diff --git a/cpp/tests/io/fst/common.hpp b/cpp/tests/io/fst/common.hpp index bf19a9e8f6a..ce09c810e88 100644 --- a/cpp/tests/io/fst/common.hpp +++ b/cpp/tests/io/fst/common.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -24,7 +25,7 @@ namespace cudf::test::io::json { // TEST FST SPECIFICATIONS //------------------------------------------------------------------------------ // FST to check for brackets and braces outside of pairs of quotes -enum DFA_STATES : char { +enum class dfa_states : char { // The state being active while being outside of a string. When encountering an opening bracket or // curly brace, we push it onto the stack. When encountering a closing bracket or brace, we pop it // from the stack. 
@@ -39,36 +40,46 @@ enum DFA_STATES : char { TT_NUM_STATES }; -// Definition of the symbol groups -enum PDA_SG_ID { - OBC = 0U, ///< Opening brace SG: { - OBT, ///< Opening bracket SG: [ - CBC, ///< Closing brace SG: } - CBT, ///< Closing bracket SG: ] - QTE, ///< Quote character SG: " - ESC, ///< Escape character SG: '\' - OTR, ///< SG implicitly matching all other characters +/** + * @brief Definition of the symbol groups + */ +enum class dfa_symbol_group_id : uint32_t { + OPENING_BRACE, ///< Opening brace SG: { + OPENING_BRACKET, ///< Opening bracket SG: [ + CLOSING_BRACE, ///< Closing brace SG: } + CLOSING_BRACKET, ///< Closing bracket SG: ] + QUOTE_CHAR, ///< Quote character SG: " + ESCAPE_CHAR, ///< Escape character SG: '\' + OTHER_SYMBOLS, ///< SG implicitly matching all other characters NUM_SYMBOL_GROUPS ///< Total number of symbol groups }; +// Aliases for readability of the transition table +constexpr auto TT_OOS = dfa_states::TT_OOS; +constexpr auto TT_STR = dfa_states::TT_STR; +constexpr auto TT_ESC = dfa_states::TT_ESC; + +constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); +constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); + // Transition table -const std::vector> pda_state_tt = { - /* IN_STATE { [ } ] " \ OTHER */ - /* TT_OOS */ {TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS}, - /* TT_STR */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_STR}, - /* TT_ESC */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}; +std::array, TT_NUM_STATES> const pda_state_tt{ + {/* IN_STATE { [ } ] " \ OTHER */ + /* TT_OOS */ {{TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS}}, + /* TT_STR */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_STR}}, + /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}}; // Translation table (i.e., for each transition, what are the symbols that we output) -const std::vector>> pda_out_tt = { - /* IN_STATE { [ } ] " \ OTHER */ - /* 
TT_OOS */ {{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}, - /* TT_STR */ {{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}, - /* TT_ESC */ {{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}; +std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const pda_out_tt{ + {/* IN_STATE { [ } ] " \ OTHER */ + /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}}, + /* TT_STR */ {{{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}, + /* TT_ESC */ {{{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}}}; // The i-th string representing all the characters of a symbol group -const std::vector pda_sgs = {"{", "[", "}", "]", "\"", "\\"}; +std::array const pda_sgs{"{", "[", "}", "]", "\"", "\\"}; // The DFA's starting state -constexpr DFA_STATES start_state = TT_OOS; +constexpr char start_state = static_cast(dfa_states::TT_OOS); } // namespace cudf::test::io::json diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index c472b6851b0..64ecf1f7329 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -109,7 +109,7 @@ static std::pair fst_baseline(InputItT begin, out_index_tape = std::fill_n(out_index_tape, out_size, in_offset); // Transition the state of the finite-state machine - state = transition_table[state][symbol_group]; + state = static_cast(transition_table[state][symbol_group]); // Continue with next symbol from input tape in_offset++; diff --git a/cpp/tests/io/nested_json_test.cu b/cpp/tests/io/nested_json_test.cu new file mode 100644 index 00000000000..0b7e2bb82f8 --- /dev/null +++ b/cpp/tests/io/nested_json_test.cu @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include +#include + +namespace cuio_json = cudf::io::json; + +// Base test fixture for tests +struct JsonTest : public cudf::test::BaseFixture { +}; + +TEST_F(JsonTest, StackContext) +{ + // Type used to represent the atomic symbol type used within the finite-state machine + using SymbolT = char; + using StackSymbolT = char; + + // Prepare cuda stream for data transfers & kernels + rmm::cuda_stream stream{}; + rmm::cuda_stream_view stream_view(stream); + + // Test input + std::string input = R"( [{)" + R"("category": "reference",)" + R"("index:": [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "[Sayings of the Century]",)" + R"("price": 8.95)" + R"(}, )" + R"({)" + R"("category": "reference",)" + R"("index": [4,{},null,{"a":[{ }, {}] } ],)" + R"("author": "Nigel Rees",)" + R"("title": "{}\\\"[], <=semantic-symbols-string\\\\",)" + R"("price": 8.95)" + R"(}] )"; + + // Prepare input & output buffers + rmm::device_uvector d_input(input.size(), stream_view); + hostdevice_vector stack_context(input.size(), stream_view); + + ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_input.data(), + input.data(), + input.size() * sizeof(SymbolT), + cudaMemcpyHostToDevice, + stream.value())); + + // Run algorithm + cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stream_view); + + // Copy back the results + stack_context.device_to_host(stream_view); + + // Make sure we copied back the stack context + stream_view.synchronize(); + + std::vector golden_stack_context{ + '_', '_', '_', '[', 
'{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '[', '[', '[', '[', '[', '[', '[', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '[', '[', '[', '[', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '[', '[', '[', '{', '[', '[', '[', '[', '[', '[', '[', '{', + '{', '{', '{', '{', '[', '{', '{', '[', '[', '[', '{', '[', '{', '{', '[', '[', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '[', '_'}; + + ASSERT_EQ(golden_stack_context.size(), stack_context.size()); + CUDF_TEST_EXPECT_VECTOR_EQUAL(golden_stack_context, stack_context, stack_context.size()); +} + +TEST_F(JsonTest, StackContextUtf8) +{ + // Type used to represent the atomic symbol type used within the finite-state machine + using SymbolT = char; + using StackSymbolT = char; + + // Prepare cuda stream for data transfers & kernels + rmm::cuda_stream stream{}; + rmm::cuda_stream_view stream_view(stream); + + // Test input + std::string input = R"([{"a":{"year":1882,"author": "Bharathi"}, {"a":"filip ʒakotɛ"}}])"; + + // Prepare 
input & output buffers + rmm::device_uvector d_input(input.size(), stream_view); + hostdevice_vector stack_context(input.size(), stream_view); + + ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_input.data(), + input.data(), + input.size() * sizeof(SymbolT), + cudaMemcpyHostToDevice, + stream.value())); + + // Run algorithm + cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stream_view); + + // Copy back the results + stack_context.device_to_host(stream_view); + + // Make sure we copied back the stack context + stream_view.synchronize(); + + std::vector golden_stack_context{ + '_', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '['}; + + ASSERT_EQ(golden_stack_context.size(), stack_context.size()); + CUDF_TEST_EXPECT_VECTOR_EQUAL(golden_stack_context, stack_context, stack_context.size()); +} + +TEST_F(JsonTest, TokenStream) +{ + using cuio_json::PdaTokenT; + using cuio_json::SymbolOffsetT; + using cuio_json::SymbolT; + + constexpr std::size_t single_item = 1; + + // Prepare cuda stream for data transfers & kernels + rmm::cuda_stream stream{}; + rmm::cuda_stream_view stream_view(stream); + + // Test input + std::string input = R"( [{)" + R"("category": "reference",)" + R"("index:": [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "[Sayings of the Century]",)" + R"("price": 8.95)" + R"(}, )" + R"({)" + R"("category": "reference",)" + R"("index": [4,{},null,{"a":[{ }, {}] } ],)" + R"("author": "Nigel Rees",)" + R"("title": "{}[], <=semantic-symbols-string",)" + R"("price": 8.95)" + R"(}] )"; + + // Prepare input & output buffers + rmm::device_uvector d_input(input.size(), stream_view); + + ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_input.data(), + input.data(), + 
input.size() * sizeof(SymbolT), + cudaMemcpyHostToDevice, + stream.value())); + + hostdevice_vector tokens_gpu{input.size(), stream_view}; + hostdevice_vector token_indices_gpu{input.size(), stream_view}; + hostdevice_vector num_tokens_out{single_item, stream_view}; + + // Parse the JSON and get the token stream + cuio_json::detail::get_token_stream(d_input, + tokens_gpu.device_ptr(), + token_indices_gpu.device_ptr(), + num_tokens_out.device_ptr(), + stream_view); + + // Copy back the number of tokens that were written + num_tokens_out.device_to_host(stream_view); + tokens_gpu.device_to_host(stream_view); + token_indices_gpu.device_to_host(stream_view); + + // Make sure we copied back all relevant data + stream_view.synchronize(); + + // Golden token stream sample + using token_t = cuio_json::token_t; + std::vector> golden_token_stream = { + {2, token_t::ListBegin}, {3, token_t::StructBegin}, {4, token_t::FieldNameBegin}, + {13, token_t::FieldNameEnd}, {16, token_t::StringBegin}, {26, token_t::StringEnd}, + {28, token_t::FieldNameBegin}, {35, token_t::FieldNameEnd}, {38, token_t::ListBegin}, + {39, token_t::ValueBegin}, {40, token_t::ValueEnd}, {41, token_t::ValueBegin}, + {43, token_t::ValueEnd}, {44, token_t::ValueBegin}, {46, token_t::ValueEnd}, + {46, token_t::ListEnd}, {48, token_t::FieldNameBegin}, {55, token_t::FieldNameEnd}, + {58, token_t::StringBegin}, {69, token_t::StringEnd}, {71, token_t::FieldNameBegin}, + {77, token_t::FieldNameEnd}, {80, token_t::StringBegin}, {105, token_t::StringEnd}, + {107, token_t::FieldNameBegin}, {113, token_t::FieldNameEnd}, {116, token_t::ValueBegin}, + {120, token_t::ValueEnd}, {120, token_t::StructEnd}, {124, token_t::StructBegin}, + {125, token_t::FieldNameBegin}, {134, token_t::FieldNameEnd}, {137, token_t::StringBegin}, + {147, token_t::StringEnd}, {149, token_t::FieldNameBegin}, {155, token_t::FieldNameEnd}, + {158, token_t::ListBegin}, {159, token_t::ValueBegin}, {160, token_t::ValueEnd}, + {161, 
token_t::StructBegin}, {162, token_t::StructEnd}, {164, token_t::ValueBegin}, + {168, token_t::ValueEnd}, {169, token_t::StructBegin}, {170, token_t::FieldNameBegin}, + {172, token_t::FieldNameEnd}, {174, token_t::ListBegin}, {175, token_t::StructBegin}, + {177, token_t::StructEnd}, {180, token_t::StructBegin}, {181, token_t::StructEnd}, + {182, token_t::ListEnd}, {184, token_t::StructEnd}, {186, token_t::ListEnd}, + {188, token_t::FieldNameBegin}, {195, token_t::FieldNameEnd}, {198, token_t::StringBegin}, + {209, token_t::StringEnd}, {211, token_t::FieldNameBegin}, {217, token_t::FieldNameEnd}, + {220, token_t::StringBegin}, {252, token_t::StringEnd}, {254, token_t::FieldNameBegin}, + {260, token_t::FieldNameEnd}, {263, token_t::ValueBegin}, {267, token_t::ValueEnd}, + {267, token_t::StructEnd}, {268, token_t::ListEnd}}; + + // Verify the number of tokens matches + ASSERT_EQ(golden_token_stream.size(), num_tokens_out[0]); + + for (std::size_t i = 0; i < num_tokens_out[0]; i++) { + // Ensure the index the tokens are pointing to do match + EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i; + + // Ensure the token category is correct + EXPECT_EQ(golden_token_stream[i].second, tokens_gpu[i]) << "Mismatch at #" << i; + } +}