From 7fc8619aa6f1f990da53e5f211c5382b7fb89322 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 3 May 2022 07:05:44 -0700 Subject: [PATCH] rebase on latest FST --- cpp/CMakeLists.txt | 1 + cpp/src/io/json/nested_json.h | 116 ++++++++ cpp/src/io/json/nested_json_gpu.cu | 410 +++++++++++++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/io/nested_json_test.cu | 189 +++++++++++++ 5 files changed, 717 insertions(+) create mode 100644 cpp/src/io/json/nested_json.h create mode 100644 cpp/src/io/json/nested_json_gpu.cu create mode 100644 cpp/tests/io/nested_json_test.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 86bfdc1444b..5db5ad61f90 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -321,6 +321,7 @@ add_library( src/io/csv/writer_impl.cu src/io/functions.cpp src/io/json/json_gpu.cu + src/io/json/nested_json_gpu.cu src/io/json/reader_impl.cu src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu diff --git a/cpp/src/io/json/nested_json.h b/cpp/src/io/json/nested_json.h new file mode 100644 index 00000000000..58f30c7b9ac --- /dev/null +++ b/cpp/src/io/json/nested_json.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace io {
+namespace json {
+namespace gpu {
+
+/// Type used to represent the atomic symbol type used within the finite-state machine
+using SymbolT = char;
+
+/// Type used to represent the stack alphabet (i.e.: empty-stack, struct, list)
+using StackSymbolT = char;
+
+/// Type used to index into the symbols within the JSON input
+using SymbolOffsetT = uint32_t;
+
+/// Type large enough to support indexing up to max nesting level (must be signed)
+using StackLevelT = int8_t;
+
+/// Type used to represent a symbol group id of the input alphabet in the pushdown automaton
+using PdaInputSymbolGroupIdT = char;
+
+/// Type used to represent a symbol group id of the stack alphabet in the pushdown automaton
+using PdaStackSymbolGroupIdT = char;
+
+/// Type used to represent an (input-symbol, stack-symbol)-tuple in stack-symbol-major order
+using PdaSymbolGroupIdT = char;
+
+/// Type emitted by the pushdown automaton transducer
+using PdaTokenT = char;
+
+/**
+ * @brief Tokens emitted while parsing a JSON input
+ */
+enum token_t : PdaTokenT {
+  /// Beginning-of-struct token (on encounter of semantic '{')
+  TK_BOS,
+  /// Beginning-of-list token (on encounter of semantic '[')
+  TK_BOL,
+  /// Beginning-of-error token (on first encounter of a parsing error)
+  TK_ERR,
+  /// Beginning-of-string-value token (on encounter of the string's first quote)
+  TK_BST,
+  /// Beginning-of-value token (first character of a literal or numeric value)
+  TK_BOV,
+  /// End-of-list token (on encounter of semantic ']')
+  TK_EOL,
+  /// End-of-struct token (on encounter of semantic '}')
+  TK_EOS,
+  /// Beginning-of-field-name token (on encounter of the field name's first quote)
+  TK_BFN,
+  /// Post-value token (first character after a literal or numeric value)
+  TK_POV,
+  /// End-of-string token (on encounter of a string's second quote)
+  TK_EST,
+  /// End-of-field-name token (on encounter of a field name's second quote)
+  TK_EFN,
+  /// Total number of tokens
+  NUM_TOKENS
+};
+
+/**
+ * @brief Identifies the stack context for each character of a JSON input. Specifically, we
+ * identify brackets and braces outside of quoted fields (e.g., field names, strings).
+ * At this stage, we do not perform bracket matching, i.e., we do not verify whether a closing
+ * bracket would actually pop the corresponding opening bracket or brace.
+ *
+ * @param d_json_in The string of input characters
+ * @param d_top_of_stack Device memory to which the top-of-stack symbol for each input character is written
+ * @param stream The CUDA stream to which kernels are dispatched
+ */
+void get_stack_context(device_span<SymbolT const> d_json_in,
+                       device_span<StackSymbolT> d_top_of_stack,
+                       rmm::cuda_stream_view stream);
+
+/**
+ * @brief Parses the given JSON string and emits a sequence of tokens that demarcate relevant
+ * sections from the input.
+ *
+ * @param d_json_in The JSON input
+ * @param d_tokens Device memory to which the parsed tokens are written
+ * @param d_tokens_indices Device memory to which the token indices are written, where each index
+ * represents the offset within \p d_json_in that caused the corresponding token to be emitted
+ * @param d_num_written_tokens The total number of tokens that were parsed
+ * @param stream The CUDA stream to which kernels are dispatched
+ */
+void get_token_stream(device_span<SymbolT const> d_json_in,
+                      device_span<PdaTokenT> d_tokens,
+                      device_span<SymbolOffsetT> d_tokens_indices,
+                      SymbolOffsetT* d_num_written_tokens,
+                      rmm::cuda_stream_view stream);
+
+}  // namespace gpu
+}  // namespace json
+}  // namespace io
+}  // namespace cudf
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
new file mode 100644
index 00000000000..ae1767bf63a
--- /dev/null
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nested_json.h"
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+namespace cudf {
+namespace io {
+namespace json {
+namespace gpu {
+
+//------------------------------------------------------------------------------
+// JSON-TO-STACK-OP DFA
+//------------------------------------------------------------------------------
+namespace to_stack_op {
+
+/**
+ * @brief Definition of the DFA's states
+ */
+enum DFA_STATES {
+  // The state being active while being outside of a string. When encountering an opening bracket
+  // or curly brace, we push it onto the stack. When encountering a closing bracket or brace, we
+  // pop from the stack.
+  TT_OOS = 0U,
+
+  // The state being active while being within a string (e.g., field name or a string value). We do
+  // not push or pop from the stack while being in this state.
+  TT_STR,
+
+  // The state being active after encountering an escape symbol (e.g., '\'), while being in the
+  // TT_STR state.
+ TT_ESC, + + // Total number of states + TT_NUM_STATES +}; + +/** + * @brief Definition of the symbol groups + */ +enum DFA_SGID { + OBC = 0U, ///< Opening brace SG: { + OBT, ///< Opening bracket SG: [ + CBC, ///< Closing brace SG: } + CBT, ///< Closing bracket SG: ] + QTE, ///< Quote character SG: " + ESC, ///< Escape character SG: '\' + OTR, ///< SG implicitly matching all other characters + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; + +// The i-th string representing all the characters of a symbol group +const std::vector symbol_groups = {"{", "[", "}", "]", "\"", "\\"}; + +// Transition table +const std::vector> transition_table = { + /* IN_STATE { [ } ] " \ OTHER */ + /* TT_OOS */ {TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS}, + /* TT_STR */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_STR, TT_STR}, + /* TT_ESC */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}; + +// Translation table (i.e., for each transition, what are the symbols that we output) +const std::vector>> translation_table = { + /* IN_STATE { [ } ] " \ OTHER */ + /* TT_OOS */ {{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}, + /* TT_STR */ {{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}, + /* TT_ESC */ {{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}; + +// The DFA's starting state +constexpr int32_t start_state = TT_OOS; +} // namespace to_stack_op + +//------------------------------------------------------------------------------ +// JSON TOKENIZER PUSHDOWN AUTOMATON +//------------------------------------------------------------------------------ +namespace tokenizer_pda { + +/** + * @brief Symbol groups for the input alphabet for the pushdown automaton + */ +enum SGID : PdaSymbolGroupIdT { + /// Opening brace + OBC, + /// Opening bracket + OBT, + /// Closing brace + CBC, + /// Closing bracket + CBT, + /// Quote + QTE, + /// Escape + ESC, + /// Comma + CMA, + /// Colon + CLN, + /// Whitespace + WSP, + /// Other (any input symbol not assigned to one of the above symbol groups) + OTR, + /// Total number of symbol groups amongst which to differentiate + NUM_PDA_INPUT_SGS +}; + +/** + * @brief Symbols in the stack alphabet + */ +enum STACK_SGID : PdaStackSymbolGroupIdT { + /// Symbol representing the JSON-root (i.e., we're at nesting level '0') + STACK_ROOT = 0, + + /// Symbol representing that we're currently within a list object + STACK_LIST = 1, + + /// Symbol representing that we're currently within a struct object + STACK_STRUCT = 2, + + /// Total number of symbols in the stack alphabet + NUM_STACK_SGS +}; + +/// Total number of symbol groups to differentiate amongst (stack alphabet * input alphabet) +constexpr PdaSymbolGroupIdT NUM_PDA_SGIDS = NUM_PDA_INPUT_SGS * NUM_STACK_SGS; + +/// Mapping a input symbol to the symbol group id +static __constant__ PdaSymbolGroupIdT tos_sg_to_pda_sgid[] = { + OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, WSP, WSP, OTR, OTR, WSP, OTR, OTR, OTR, OTR, OTR, + OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, WSP, OTR, QTE, OTR, OTR, OTR, + OTR, OTR, OTR, OTR, OTR, OTR, CMA, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, + OTR, CLN, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, + OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OBT, ESC, CBT, OTR, + OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, + OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OBC, OTR, CBC, OTR}; + +/** + * @brief Maps a (top-of-stack symbol, 
input symbol)-pair to a symbol group id of the DVPA + */ +struct PdaSymbolToSymbolGroupId { + template + __device__ __forceinline__ PdaSymbolGroupIdT + operator()(thrust::tuple symbol_pair) + { + // The symbol read from the input + auto symbol = thrust::get<0>(symbol_pair); + + // The stack symbol (i.e., what is on top of the stack at the time the input symbol was read) + // I.e., whether we're reading in something within a struct, a list, or the JSON root + auto stack_symbol = thrust::get<1>(symbol_pair); + + // The stack symbol offset: '_' is the root group (0), '[' is the list group (1), '{' is the + // struct group (2) + int32_t stack_idx = + (stack_symbol == '_') ? STACK_ROOT : ((stack_symbol == '[') ? STACK_LIST : STACK_STRUCT); + + // The relative symbol group id of the current input symbol + PdaSymbolGroupIdT symbol_gid = tos_sg_to_pda_sgid[min( + static_cast(symbol), + static_cast(sizeof(tos_sg_to_pda_sgid) / sizeof(tos_sg_to_pda_sgid[0])) - 1)]; + return stack_idx * NUM_PDA_INPUT_SGS + symbol_gid; + } +}; + +// The states defined by the pushdown automaton +enum pda_state_t : int32_t { + PD_BOV, + PD_BOA, + PD_LON, + PD_STR, + PD_SCE, + PD_PVL, + PD_BFN, + PD_FLN, + PD_FNE, + PD_PFN, + PD_ERR, + PD_NUM_STATES +}; + +// The starting state of the pushdown automaton +constexpr int32_t start_state = PD_BOV; + +// Identity symbol to symbol group lookup table +const std::vector> pda_sgids{ + {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12}, {13}, {14}, + {15}, {16}, {17}, {18}, {19}, {20}, {21}, {22}, {23}, {24}, {25}, {26}, {27}, {28}, {29}}; + +/** + * @brief Getting the transition table + */ +std::vector> get_transition_table() +{ + std::vector> pda_tt(PD_NUM_STATES); + pda_tt[PD_BOV] = {PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON}; + pda_tt[PD_BOA] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_ERR}; + pda_tt[PD_LON] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_LON}; + pda_tt[PD_STR] = {PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR}; + pda_tt[PD_SCE] = {PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + pda_tt[PD_PVL] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_ERR, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_ERR}; + pda_tt[PD_BFN] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_FLN, PD_ERR, PD_ERR, 
PD_ERR, PD_BFN, PD_ERR}; + pda_tt[PD_FLN] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + pda_tt[PD_FNE] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + pda_tt[PD_PFN] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_PFN, PD_ERR}; + pda_tt[PD_ERR] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR}; + return pda_tt; +} + +/** + * @brief Getting the translation table + */ +std::vector>> get_translation_table() +{ + std::vector>> pda_tlt(PD_NUM_STATES); + pda_tlt[PD_BOV] = {{TK_BOS}, {TK_BOL}, {TK_ERR}, {TK_ERR}, {TK_BST}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {}, {TK_BOV}, {TK_BOS}, {TK_BOL}, {TK_ERR}, {TK_ERR}, {TK_BST}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {}, {TK_BOV}, {TK_BOS}, {TK_BOL}, {TK_ERR}, {TK_ERR}, + {TK_BST}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {TK_BOV}}; + pda_tlt[PD_BOA] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_BOS}, {TK_BOL}, {TK_ERR}, {TK_EOL}, {TK_BST}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {}, {TK_BOV}, {TK_ERR}, {TK_ERR}, {TK_EOS}, {TK_ERR}, + {TK_BFN}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {TK_ERR}}; + pda_tlt[PD_LON] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_POV}, {}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_POV, TK_EOL}, {TK_ERR}, + {TK_ERR}, {TK_POV}, {TK_ERR}, {TK_POV}, {}, + {TK_ERR}, {TK_ERR}, {TK_POV, TK_EOS}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_POV}, {TK_ERR}, {TK_POV}, {}}; + pda_tlt[PD_STR] = {{}, {}, {}, {}, {TK_EST}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {TK_EST}, + {}, {}, {}, {}, {}, {}, {}, {}, {}, {TK_EST}, {}, {}, {}, {}, {}}; + pda_tlt[PD_SCE] = {{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, + {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}}; + pda_tlt[PD_PVL] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_EOL}, {TK_ERR}, {TK_ERR}, + {}, {TK_ERR}, {}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_EOS}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {}, {TK_ERR}, {}, {TK_ERR}}; + pda_tlt[PD_BFN] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_BFN}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {TK_ERR}}; + pda_tlt[PD_FLN] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {}, {}, {}, + {TK_EFN}, {}, {}, {}, {}, {}}; + pda_tlt[PD_FNE] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, 
{TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {}, {}, {}, + {}, {}, {}, {}, {}, {}}; + pda_tlt[PD_PFN] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {}, {TK_ERR}}; + pda_tlt[PD_ERR] = {{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, + {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}}; + return pda_tlt; +} + +} // namespace tokenizer_pda + +/** + * @brief Function object used to filter for brackets and braces that represent push and pop + * operations + * + */ +struct JSONToStackOp { + template + constexpr CUDF_HOST_DEVICE fst::stack_op_type operator()(StackSymbolT const& stack_symbol) const + { + return (stack_symbol == '{' || stack_symbol == '[') ? fst::stack_op_type::PUSH + : (stack_symbol == '}' || stack_symbol == ']') ? fst::stack_op_type::POP + : fst::stack_op_type::READ; + } +}; + +void get_stack_context(device_span d_json_in, + device_span d_top_of_stack, + rmm::cuda_stream_view stream) +{ + constexpr std::size_t single_item = 1; + + // Symbol that will represent empty-stack (i.e., that we're at the DOM root) + constexpr StackSymbolT root_symbol = '_'; + // This can be any stack symbol from the stack alphabet that does not push onto stack + constexpr StackSymbolT read_symbol = 'x'; + + // Number of stack operations in the input (i.e., number of '{', '}', '[', ']' outside of quotes) + hostdevice_vector d_num_stack_ops(single_item, stream); + + // Sequence of stack symbols and their position in the original input (sparse representation) + rmm::device_uvector d_stack_ops{d_json_in.size(), stream}; + rmm::device_uvector d_stack_op_indices{d_json_in.size(), stream}; + + // Prepare finite-state transducer that only selects '{', '}', '[', ']' outside of quotes + using ToStackOpFstT = cudf::io::fst::detail::Dfa; + ToStackOpFstT json_to_stack_ops_fst{to_stack_op::symbol_groups, + to_stack_op::transition_table, + to_stack_op::translation_table, + stream}; + + // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end + // structs/lists + json_to_stack_ops_fst.Transduce(d_json_in.begin(), + static_cast(d_json_in.size()), + d_stack_ops.data(), + d_stack_op_indices.data(), + d_num_stack_ops.device_ptr(), + to_stack_op::start_state, + stream); + + // Request temporary storage requirements + fst::sparse_stack_op_to_top_of_stack( + d_stack_ops.data(), + device_span{d_stack_op_indices.data(), d_stack_op_indices.size()}, + JSONToStackOp{}, + d_top_of_stack.data(), + root_symbol, + read_symbol, + d_json_in.size(), + stream); +} + +void get_token_stream(device_span d_json_in, + device_span d_tokens, + device_span d_tokens_indices, + SymbolOffsetT* d_num_written_tokens, + rmm::cuda_stream_view stream) +{ + // Memory holding the top-of-stack stack context for the input + rmm::device_uvector d_top_of_stack{d_json_in.size(), stream}; + + // Identify what is the stack context for each input character (is it: JSON-root, struct, or list) + get_stack_context(d_json_in, d_top_of_stack, stream); + + // Prepare for PDA transducer pass, merging input symbols with stack symbols + rmm::device_uvector d_pda_sgids{d_json_in.size(), stream}; + auto zip_in = thrust::make_zip_iterator(d_json_in.data(), d_top_of_stack.data()); + thrust::transform(rmm::exec_policy(stream), + zip_in, + zip_in + 
d_json_in.size(), + d_pda_sgids.data(), + tokenizer_pda::PdaSymbolToSymbolGroupId{}); + + // PDA transducer alias + using ToTokenStreamFstT = cudf::io::fst::detail:: + Dfa; + + // Instantiating PDA transducer + ToTokenStreamFstT json_to_tokens_fst{tokenizer_pda::pda_sgids, + tokenizer_pda::get_transition_table(), + tokenizer_pda::get_translation_table(), + stream}; + + // Perform a PDA-transducer pass + json_to_tokens_fst.Transduce(d_pda_sgids.begin(), + static_cast(d_json_in.size()), + d_tokens.data(), + d_tokens_indices.data(), + d_num_written_tokens, + tokenizer_pda::start_state, + stream); +} + +} // namespace gpu +} // namespace json +} // namespace io +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 587f317692f..6c3d2353693 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -222,6 +222,7 @@ ConfigureTest(FILE_IO_TEST io/file_io_test.cpp) ConfigureTest(ORC_TEST io/orc_test.cpp) ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp) +ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cu) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) diff --git a/cpp/tests/io/nested_json_test.cu b/cpp/tests/io/nested_json_test.cu new file mode 100644 index 00000000000..6336f493c17 --- /dev/null +++ b/cpp/tests/io/nested_json_test.cu @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +namespace nested_json = cudf::io::json::gpu; + +// Base test fixture for tests +struct JsonTest : public cudf::test::BaseFixture { +}; + +TEST_F(JsonTest, StackContext) +{ + // Type used to represent the atomic symbol type used within the finite-state machine + using SymbolT = char; + using StackSymbolT = char; + + // Prepare cuda stream for data transfers & kernels + cudaStream_t stream = nullptr; + cudaStreamCreate(&stream); + rmm::cuda_stream_view stream_view(stream); + + // Test input + std::string input = R"( [{)" + R"("category": "reference",)" + R"("index:": [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "[Sayings of the Century]",)" + R"("price": 8.95)" + R"(}, )" + R"({)" + R"("category": "reference",)" + R"("index": [4,{},null,{"a":[{ }, {}] } ],)" + R"("author": "Nigel Rees",)" + R"("title": "{}[], <=semantic-symbols-string",)" + R"("price": 8.95)" + R"(}] )"; + + // Prepare input & output buffers + rmm::device_uvector d_input(input.size(), stream_view); + hostdevice_vector stack_context(input.size(), stream_view); + + ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( + d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream)); + + // Run algorithm + cudf::io::json::gpu::get_stack_context( + d_input, + cudf::device_span{stack_context.device_ptr(), stack_context.size()}, + stream); + + // Copy back the results + stack_context.device_to_host(stream); + + // Make sure we copied back the stack context + stream_view.synchronize(); + + std::vector golden_stack_context{ + '_', '_', '_', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '[', '[', '[', '[', '[', '[', '[', '[', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '[', '[', '[', '[', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '[', '[', '[', + '{', '[', '[', '[', '[', '[', '[', '[', '{', '{', '{', '{', '{', '[', '{', '{', '[', '[', + '[', '{', '[', '{', '{', '[', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '[', '_'}; + + ASSERT_EQ(golden_stack_context.size(), stack_context.size()); + for (std::size_t i = 0; i < stack_context.size() && i < 1000; i++) { + ASSERT_EQ(golden_stack_context[i], stack_context[i]); + } +} + +TEST_F(JsonTest, TokenStream) +{ + using cudf::io::json::gpu::PdaTokenT; + using cudf::io::json::gpu::SymbolOffsetT; + using cudf::io::json::gpu::SymbolT; + + constexpr std::size_t single_item = 1; + + // Prepare cuda stream for data transfers & kernels + cudaStream_t stream = nullptr; + cudaStreamCreate(&stream); + rmm::cuda_stream_view stream_view(stream); + + // Test input + 
std::string input = R"( [{)" + R"("category": "reference",)" + R"("index:": [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "[Sayings of the Century]",)" + R"("price": 8.95)" + R"(}, )" + R"({)" + R"("category": "reference",)" + R"("index": [4,{},null,{"a":[{ }, {}] } ],)" + R"("author": "Nigel Rees",)" + R"("title": "{}[], <=semantic-symbols-string",)" + R"("price": 8.95)" + R"(}] )"; + + // Prepare input & output buffers + rmm::device_uvector d_input(input.size(), stream_view); + + ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( + d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream)); + + + hostdevice_vector tokens_gpu{input.size(), stream}; + hostdevice_vector token_indices_gpu{input.size(), stream}; + hostdevice_vector num_tokens_out{single_item, stream}; + + // Parse the JSON and get the token stream + cudf::io::json::gpu::get_token_stream( + d_input, + cudf::device_span{tokens_gpu.device_ptr(), tokens_gpu.size()}, + cudf::device_span{token_indices_gpu.device_ptr(), token_indices_gpu.size()}, + num_tokens_out.device_ptr(), + stream); + + // Copy back the number of tokens that were written + num_tokens_out.device_to_host(stream); + tokens_gpu.device_to_host(stream); + token_indices_gpu.device_to_host(stream); + + // Make sure we copied back all relevant data + stream_view.synchronize(); + + // Golden token stream sample + std::vector> golden_token_stream = { + {2, nested_json::TK_BOL}, {3, nested_json::TK_BOS}, {4, nested_json::TK_BFN}, + {13, nested_json::TK_EFN}, {16, nested_json::TK_BST}, {26, nested_json::TK_EST}, + {28, nested_json::TK_BFN}, {35, nested_json::TK_EFN}, {38, nested_json::TK_BOL}, + {39, nested_json::TK_BOV}, {40, nested_json::TK_POV}, {41, nested_json::TK_BOV}, + {43, nested_json::TK_POV}, {44, nested_json::TK_BOV}, {46, nested_json::TK_POV}, + {46, nested_json::TK_EOL}, {48, nested_json::TK_BFN}, {55, nested_json::TK_EFN}, + {58, nested_json::TK_BST}, {69, nested_json::TK_EST}, {71, nested_json::TK_BFN}, + {77, nested_json::TK_EFN}, {80, nested_json::TK_BST}, {105, nested_json::TK_EST}, + {107, nested_json::TK_BFN}, {113, nested_json::TK_EFN}, {116, nested_json::TK_BOV}, + {120, nested_json::TK_POV}, {120, nested_json::TK_EOS}, {124, nested_json::TK_BOS}, + {125, nested_json::TK_BFN}, {134, nested_json::TK_EFN}, {137, nested_json::TK_BST}, + {147, nested_json::TK_EST}, {149, nested_json::TK_BFN}, {155, nested_json::TK_EFN}, + {158, nested_json::TK_BOL}, {159, nested_json::TK_BOV}, {160, nested_json::TK_POV}, + {161, nested_json::TK_BOS}, {162, nested_json::TK_EOS}, {164, nested_json::TK_BOV}, + {168, nested_json::TK_POV}, {169, nested_json::TK_BOS}, {170, nested_json::TK_BFN}, + {172, nested_json::TK_EFN}, {174, nested_json::TK_BOL}, {175, nested_json::TK_BOS}, + {177, nested_json::TK_EOS}, {180, nested_json::TK_BOS}, {181, nested_json::TK_EOS}, + {182, nested_json::TK_EOL}, {184, nested_json::TK_EOS}, {186, nested_json::TK_EOL}, + {188, nested_json::TK_BFN}, {195, nested_json::TK_EFN}, {198, nested_json::TK_BST}, + {209, nested_json::TK_EST}, {211, nested_json::TK_BFN}, {217, nested_json::TK_EFN}, + {220, nested_json::TK_BST}, {252, nested_json::TK_EST}, {254, nested_json::TK_BFN}, + {260, nested_json::TK_EFN}, {263, nested_json::TK_BOV}, {267, nested_json::TK_POV}, + {267, nested_json::TK_EOS}, {268, nested_json::TK_EOL}}; + + // Verify the number of tokens matches + ASSERT_EQ(golden_token_stream.size(), num_tokens_out[0]); + + for (std::size_t i = 0; i < num_tokens_out[0]; i++) { + // Ensure the index the tokens are 
pointing to matches the golden token stream
+    ASSERT_EQ(golden_token_stream[i].first, token_indices_gpu[i]);
+    // Ensure the token category is correct
+    ASSERT_EQ(golden_token_stream[i].second, tokens_gpu[i]);
+  }
+}
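
For anyone trying out the new API, below is a minimal, standalone host-side sketch of how get_token_stream() can be driven, loosely following the TokenStream test above. It is illustrative only: the main() scaffold, the sample input, the buffer sizing, and the chosen includes are assumptions for the sketch and are not part of this patch.

// Illustrative driver: tokenizes a small JSON string with the API added by this patch and
// copies the resulting (token, input-offset) stream back to the host.
#include "nested_json.h"

#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cuda_runtime.h>

#include <cstdio>
#include <string>
#include <vector>

int main()
{
  using namespace cudf::io::json::gpu;

  // Prepare a CUDA stream for data transfers & kernels, mirroring the tests above
  cudaStream_t stream = nullptr;
  cudaStreamCreate(&stream);
  rmm::cuda_stream_view stream_view(stream);

  // Hypothetical sample input; any JSON string works
  std::string const input = R"([{"a": [1, 2], "b": "str"}])";

  // Device buffers, sized like in the test above (one slot per input character)
  rmm::device_uvector<SymbolT> d_input(input.size(), stream_view);
  rmm::device_uvector<PdaTokenT> d_tokens(input.size(), stream_view);
  rmm::device_uvector<SymbolOffsetT> d_token_indices(input.size(), stream_view);
  rmm::device_uvector<SymbolOffsetT> d_num_tokens(1, stream_view);

  cudaMemcpyAsync(
    d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream);

  // Parse the JSON input and emit the token stream
  get_token_stream(
    cudf::device_span<SymbolT const>{d_input.data(), d_input.size()},
    cudf::device_span<PdaTokenT>{d_tokens.data(), d_tokens.size()},
    cudf::device_span<SymbolOffsetT>{d_token_indices.data(), d_token_indices.size()},
    d_num_tokens.data(),
    stream_view);

  // Copy back the number of tokens that were written
  SymbolOffsetT num_tokens = 0;
  cudaMemcpyAsync(
    &num_tokens, d_num_tokens.data(), sizeof(SymbolOffsetT), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);

  // Copy back the tokens and their indices
  std::vector<PdaTokenT> tokens(num_tokens);
  std::vector<SymbolOffsetT> indices(num_tokens);
  cudaMemcpyAsync(
    tokens.data(), d_tokens.data(), num_tokens * sizeof(PdaTokenT), cudaMemcpyDeviceToHost, stream);
  cudaMemcpyAsync(indices.data(),
                  d_token_indices.data(),
                  num_tokens * sizeof(SymbolOffsetT),
                  cudaMemcpyDeviceToHost,
                  stream);
  cudaStreamSynchronize(stream);

  // Print each (token category, input offset) pair
  for (SymbolOffsetT i = 0; i < num_tokens; i++) {
    std::printf("token %d at offset %u\n", static_cast<int>(tokens[i]), indices[i]);
  }

  cudaStreamDestroy(stream);
  return 0;
}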