Adds JSON tokenizer (#11264)

This PR builds on the _Finite-State Transducer_ (_FST_) algorithm and the _Logical Stack_ to implement a tokenizer that demarcates sections from the JSON input and assigns a category to each such section. **This PR builds on:** ⛓️ #11242 ⛓️ #11078 Specifically, the tokenizer comprises the following processing steps: 1. FST to emit a sequence of stack operations (i.e., emit push(LIST), push(STRUCT), pop(), read()). This FST transduces each occurrence of an opening semantic bracket or brace to a push(LIST) or push(STRUCT) operation, respectively. Each semantic closing bracket or brace is transduced to a pop() operation. All other input is transduced to a read() operation. 2. The sequence of stack operations from (1) is fed into the logical stack that resolves what is on top of the stack before each operation from (1) (i.e., STRUCT, LIST). After this stage, for every input character we know what is on top of the stack: either a STRUCT or LIST or ROOT, if there is no symbol on top of the stack. 3. We use the top-of-stack information from (2) for a second FST. This part can be considered a full pushdown or DVPA (because now, we also have stack context). State transitions are caused by the combination of the input character + the top-of-stack for that character. The output of this stage is the token stream: ({beginning-of, end-of}x{struct, list}, field name, value, etc.). Authors: - Elias Stehle (https://github.com/elstehle) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Tobias Ribizel (https://github.com/upsj) - Karthikeyan (https://github.com/karthikeyann) - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: #11264
rapidsai · Aug 6, 2022 · e1a4e03 · e1a4e03
1 parent d695129
commit e1a4e03
Show file tree

Hide file tree

Showing 8 changed files with 1,199 additions and 33 deletions.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -328,6 +328,7 @@ add_library(
   src/io/csv/writer_impl.cu
   src/io/functions.cpp
   src/io/json/json_gpu.cu
+  src/io/json/nested_json_gpu.cu
   src/io/json/reader_impl.cu
   src/io/json/experimental/read_json.cpp
   src/io/orc/aggregate_orc_metadata.cpp

diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh
@@ -142,7 +142,8 @@ class SingleSymbolSmemLUT {
   constexpr CUDF_HOST_DEVICE int32_t operator()(SymbolT const symbol) const
   {
     // Look up the symbol group for given symbol
-    return temp_storage.sym_to_sgid[min(symbol, num_valid_entries - 1)];
+    return temp_storage
+      .sym_to_sgid[min(static_cast<SymbolGroupIdT>(symbol), num_valid_entries - 1U)];
   }
 };
 
@@ -170,19 +171,21 @@ class TransitionTable {
     ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS];
   };
 
-  template <typename StateIdT, typename = std::void_t<decltype(ItemT{std::declval<StateIdT>()})>>
-  static void InitDeviceTransitionTable(hostdevice_vector<KernelParameter>& transition_table_init,
-                                        std::vector<std::vector<StateIdT>> const& translation_table,
-                                        rmm::cuda_stream_view stream)
+  template <typename StateIdT>
+  static void InitDeviceTransitionTable(
+    hostdevice_vector<KernelParameter>& transition_table_init,
+    std::array<std::array<StateIdT, MAX_NUM_SYMBOLS>, MAX_NUM_STATES> const& translation_table,
+    rmm::cuda_stream_view stream)
   {
     // translation_table[state][symbol] -> new state
     for (std::size_t state = 0; state < translation_table.size(); ++state) {
       for (std::size_t symbol = 0; symbol < translation_table[state].size(); ++symbol) {
         CUDF_EXPECTS(
-          translation_table[state][symbol] <= std::numeric_limits<ItemT>::max(),
+          static_cast<int64_t>(translation_table[state][symbol]) <=
+            std::numeric_limits<ItemT>::max(),
           "Target state index value exceeds value representable by the transition table's type");
         transition_table_init.host_ptr()->transitions[symbol * MAX_NUM_STATES + state] =
-          translation_table[state][symbol];
+          static_cast<ItemT>(translation_table[state][symbol]);
       }
     }
 
@@ -319,7 +322,8 @@ class TransducerLookupTable {
    */
   static void InitDeviceTranslationTable(
     hostdevice_vector<KernelParameter>& translation_table_init,
-    std::vector<std::vector<std::vector<OutSymbolT>>> const& translation_table,
+    std::array<std::array<std::vector<OutSymbolT>, MAX_NUM_SYMBOLS>, MAX_NUM_STATES> const&
+      translation_table,
     rmm::cuda_stream_view stream)
   {
     std::vector<OutSymbolT> out_symbols;
@@ -476,8 +480,8 @@ class Dfa {
    */
   template <typename StateIdT, typename SymbolGroupIdItT>
   Dfa(SymbolGroupIdItT const& symbol_vec,
-      std::vector<std::vector<StateIdT>> const& tt_vec,
-      std::vector<std::vector<std::vector<OutSymbolT>>> const& out_tt_vec,
+      std::array<std::array<StateIdT, NUM_SYMBOLS>, NUM_STATES> const& tt_vec,
+      std::array<std::array<std::vector<OutSymbolT>, NUM_SYMBOLS>, NUM_STATES> const& out_tt_vec,
       cudaStream_t stream)
   {
     constexpr std::size_t single_item = 1;

diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf::io::json {
+
+/// Type used to represent the atomic symbol type used within the finite-state machine
+using SymbolT = char;
+
+/// Type used to represent the stack alphabet (i.e.: empty-stack, struct, list)
+using StackSymbolT = char;
+
+/// Type used to index into the symbols within the JSON input
+using SymbolOffsetT = uint32_t;
+
+/// Type large enough to support indexing up to max nesting level (must be signed)
+using StackLevelT = int8_t;
+
+/// Type used to represent a symbol group id of the input alphabet in the pushdown automaton
+using PdaInputSymbolGroupIdT = char;
+
+/// Type used to represent a symbol group id of the stack alphabet in the pushdown automaton
+using PdaStackSymbolGroupIdT = char;
+
+/// Type used to represent an (input-symbol, stack-symbol)-tuple in stack-symbol-major order
+using PdaSymbolGroupIdT = char;
+
+/// Type being emitted by the pushdown automaton transducer
+using PdaTokenT = char;
+
+/**
+ * @brief Tokens emitted while parsing a JSON input
+ */
+enum token_t : PdaTokenT {
+  /// Beginning-of-struct token (on encounter of semantic '{')
+  StructBegin,
+  /// End-of-struct token (on encounter of semantic '}')
+  StructEnd,
+  /// Beginning-of-list token (on encounter of semantic '[')
+  ListBegin,
+  /// End-of-list token (on encounter of semantic ']')
+  ListEnd,
+  /// Beginning-of-field-name token (on encounter of first quote)
+  FieldNameBegin,
+  /// End-of-field-name token (on encounter of a field name's second quote)
+  FieldNameEnd,
+  /// Beginning-of-string-value token (on encounter of the string's first quote)
+  StringBegin,
+  /// End-of-string token (on encounter of a string's second quote)
+  StringEnd,
+  /// Beginning-of-value token (first character of literal or numeric)
+  ValueBegin,
+  /// Post-value token (first character after a literal or numeric string)
+  ValueEnd,
+  /// Beginning-of-error token (on first encounter of a parsing error)
+  ErrorBegin,
+  /// Total number of tokens
+  NUM_TOKENS
+};
+
+namespace detail {
+/**
+ * @brief Identifies the stack context for each character from a JSON input. Specifically, we
+ * identify brackets and braces outside of quoted fields (e.g., field names, strings).
+ * At this stage, we do not perform bracket matching, i.e., we do not verify whether a closing
+ * bracket or brace would actually pop the corresponding opening bracket or brace.
+ *
+ * @param[in] d_json_in The string of input characters
+ * @param[out] d_top_of_stack Will be populated with what-is-on-top-of-the-stack for any given input
+ * character of \p d_json_in, where a '{' represents that the corresponding input character is
+ * within the context of a struct, a '[' represents that it is within the context of an array, and a
+ * '_' represents that it is at the root of the JSON.
+ * @param[in] stream The cuda stream to dispatch GPU kernels to
+ */
+void get_stack_context(device_span<SymbolT const> d_json_in,
+                       SymbolT* d_top_of_stack,
+                       rmm::cuda_stream_view stream);
+
+/**
+ * @brief Parses the given JSON string and emits a sequence of tokens that demarcate relevant
+ * sections from the input.
+ *
+ * @param[in] d_json_in The JSON input
+ * @param[out] d_tokens Device memory to which the parsed tokens are written
+ * @param[out] d_tokens_indices Device memory to which the indices are written, where each index
+ * represents the offset within \p d_json_in that caused the corresponding token to be written
+ * @param[out] d_num_written_tokens The total number of tokens that were parsed
+ * @param[in] stream The CUDA stream to which kernels are dispatched
+ */
+void get_token_stream(device_span<SymbolT const> d_json_in,
+                      PdaTokenT* d_tokens,
+                      SymbolOffsetT* d_tokens_indices,
+                      SymbolOffsetT* d_num_written_tokens,
+                      rmm::cuda_stream_view stream);
+}  // namespace detail
+
+}  // namespace cudf::io::json