From 7fc8619aa6f1f990da53e5f211c5382b7fb89322 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 3 May 2022 07:05:44 -0700 Subject: [PATCH] rebase on latest FST --- cpp/CMakeLists.txt | 1 + cpp/src/io/json/nested_json.h | 116 ++++++++ cpp/src/io/json/nested_json_gpu.cu | 410 +++++++++++++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/io/nested_json_test.cu | 189 +++++++++++++ 5 files changed, 717 insertions(+) create mode 100644 cpp/src/io/json/nested_json.h create mode 100644 cpp/src/io/json/nested_json_gpu.cu create mode 100644 cpp/tests/io/nested_json_test.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 86bfdc1444b..5db5ad61f90 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -321,6 +321,7 @@ add_library( src/io/csv/writer_impl.cu src/io/functions.cpp src/io/json/json_gpu.cu + src/io/json/nested_json_gpu.cu src/io/json/reader_impl.cu src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu diff --git a/cpp/src/io/json/nested_json.h b/cpp/src/io/json/nested_json.h new file mode 100644 index 00000000000..58f30c7b9ac --- /dev/null +++ b/cpp/src/io/json/nested_json.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace io {
+namespace json {
+namespace gpu {
+
+/// Type used to represent the atomic symbol type used within the finite-state machine
+using SymbolT = char;
+
+/// Type used to represent the stack alphabet (i.e.: empty-stack, struct, list)
+using StackSymbolT = char;
+
+/// Type used to index into the symbols within the JSON input
+using SymbolOffsetT = uint32_t;
+
+/// Type large enough to support indexing up to max nesting level (must be signed)
+using StackLevelT = int8_t;
+
+/// Type used to represent a symbol group id of the input alphabet in the pushdown automaton
+using PdaInputSymbolGroupIdT = char;
+
+/// Type used to represent a symbol group id of the stack alphabet in the pushdown automaton
+using PdaStackSymbolGroupIdT = char;
+
+/// Type used to represent an (input-symbol, stack-symbol)-tuple in stack-symbol-major order
+using PdaSymbolGroupIdT = char;
+
+/// Type emitted by the pushdown automaton transducer
+using PdaTokenT = char;
+
+/**
+ * @brief Tokens emitted while parsing a JSON input
+ */
+enum token_t : PdaTokenT {
+  /// Beginning-of-struct token (on encounter of semantic '{')
+  TK_BOS,
+  /// Beginning-of-list token (on encounter of semantic '[')
+  TK_BOL,
+  /// Beginning-of-error token (on first encounter of a parsing error)
+  TK_ERR,
+  /// Beginning-of-string-value token (on encounter of the string's first quote)
+  TK_BST,
+  /// Beginning-of-value token (first character of a literal or numeric value)
+  TK_BOV,
+  /// End-of-list token (on encounter of semantic ']')
+  TK_EOL,
+  /// End-of-struct token (on encounter of semantic '}')
+  TK_EOS,
+  /// Beginning-of-field-name token (on encounter of the field name's first quote)
+  TK_BFN,
+  /// Post-value token (first character after a literal or numeric value)
+  TK_POV,
+  /// End-of-string token (on encounter of a string's second quote)
+  TK_EST,
+  /// End-of-field-name token (on encounter of a field name's second quote)
+  TK_EFN,
+  /// Total number of tokens
+  NUM_TOKENS
+};
+
+/**
+ * @brief Identifies the stack context for each character of a JSON input. Specifically, we
+ * identify brackets and braces outside of quoted fields (e.g., field names, strings).
+ * At this stage, we do not perform bracket matching, i.e., we do not verify whether a closing
+ * bracket would actually pop the corresponding opening bracket or brace.
+ *
+ * @param d_json_in The string of input characters
+ * @param d_top_of_stack Device memory to which the top-of-stack symbol for each input character is written
+ * @param stream The CUDA stream to which kernels are dispatched
+ */
+void get_stack_context(device_span<SymbolT const> d_json_in,
+                       device_span<StackSymbolT> d_top_of_stack,
+                       rmm::cuda_stream_view stream);
+
+/**
+ * @brief Parses the given JSON string and emits a sequence of tokens that demarcate relevant
+ * sections from the input.
+ *
+ * @param d_json_in The JSON input
+ * @param d_tokens Device memory to which the parsed tokens are written
+ * @param d_tokens_indices Device memory to which the token indices are written, where each index
+ * represents the offset within \p d_json_in that caused the corresponding token to be emitted
+ * @param d_num_written_tokens The total number of tokens that were parsed
+ * @param stream The CUDA stream to which kernels are dispatched
+ */
+void get_token_stream(device_span<SymbolT const> d_json_in,
+                      device_span<PdaTokenT> d_tokens,
+                      device_span<SymbolOffsetT> d_tokens_indices,
+                      SymbolOffsetT* d_num_written_tokens,
+                      rmm::cuda_stream_view stream);
+
+}  // namespace gpu
+}  // namespace json
+}  // namespace io
+}  // namespace cudf
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
new file mode 100644
index 00000000000..ae1767bf63a
--- /dev/null
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nested_json.h"
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+namespace cudf {
+namespace io {
+namespace json {
+namespace gpu {
+
+//------------------------------------------------------------------------------
+// JSON-TO-STACK-OP DFA
+//------------------------------------------------------------------------------
+namespace to_stack_op {
+
+/**
+ * @brief Definition of the DFA's states
+ */
+enum DFA_STATES {
+  // The state being active while being outside of a string. When encountering an opening bracket
+  // or curly brace, we push it onto the stack. When encountering a closing bracket or brace, we
+  // pop from the stack.
+  TT_OOS = 0U,
+
+  // The state being active while being within a string (e.g., field name or a string value). We do
+  // not push or pop from the stack while being in this state.
+  TT_STR,
+
+  // The state being active after encountering an escape symbol (e.g., '\'), while being in the
+  // TT_STR state.
+ TT_ESC, + + // Total number of states + TT_NUM_STATES +}; + +/** + * @brief Definition of the symbol groups + */ +enum DFA_SGID { + OBC = 0U, ///< Opening brace SG: { + OBT, ///< Opening bracket SG: [ + CBC, ///< Closing brace SG: } + CBT, ///< Closing bracket SG: ] + QTE, ///< Quote character SG: " + ESC, ///< Escape character SG: '\' + OTR, ///< SG implicitly matching all other characters + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; + +// The i-th string representing all the characters of a symbol group +const std::vector symbol_groups = {"{", "[", "}", "]", "\"", "\\"}; + +// Transition table +const std::vector> transition_table = { + /* IN_STATE { [ } ] " \ OTHER */ + /* TT_OOS */ {TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS}, + /* TT_STR */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_STR, TT_STR}, + /* TT_ESC */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}; + +// Translation table (i.e., for each transition, what are the symbols that we output) +const std::vector>> translation_table = { + /* IN_STATE { [ } ] " \ OTHER */ + /* TT_OOS */ {{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}, + /* TT_STR */ {{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}, + /* TT_ESC */ {{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}; + +// The DFA's starting state +constexpr int32_t start_state = TT_OOS; +} // namespace to_stack_op + +//------------------------------------------------------------------------------ +// JSON TOKENIZER PUSHDOWN AUTOMATON +//------------------------------------------------------------------------------ +namespace tokenizer_pda { + +/** + * @brief Symbol groups for the input alphabet for the pushdown automaton + */ +enum SGID : PdaSymbolGroupIdT { + /// Opening brace + OBC, + /// Opening bracket + OBT, + /// Closing brace + CBC, + /// Closing bracket + CBT, + /// Quote + QTE, + /// Escape + ESC, + /// Comma + CMA, + /// Colon + CLN, + /// Whitespace + WSP, + /// Other (any input symbol not assigned to one of the above symbol groups) + OTR, + /// Total number of symbol groups amongst which to differentiate + NUM_PDA_INPUT_SGS +}; + +/** + * @brief Symbols in the stack alphabet + */ +enum STACK_SGID : PdaStackSymbolGroupIdT { + /// Symbol representing the JSON-root (i.e., we're at nesting level '0') + STACK_ROOT = 0, + + /// Symbol representing that we're currently within a list object + STACK_LIST = 1, + + /// Symbol representing that we're currently within a struct object + STACK_STRUCT = 2, + + /// Total number of symbols in the stack alphabet + NUM_STACK_SGS +}; + +/// Total number of symbol groups to differentiate amongst (stack alphabet * input alphabet) +constexpr PdaSymbolGroupIdT NUM_PDA_SGIDS = NUM_PDA_INPUT_SGS * NUM_STACK_SGS; + +/// Mapping a input symbol to the symbol group id +static __constant__ PdaSymbolGroupIdT tos_sg_to_pda_sgid[] = { + OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, WSP, WSP, OTR, OTR, WSP, OTR, OTR, OTR, OTR, OTR, + OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, WSP, OTR, QTE, OTR, OTR, OTR, + OTR, OTR, OTR, OTR, OTR, OTR, CMA, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, + OTR, CLN, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, + OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OBT, ESC, CBT, OTR, + OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, + OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OTR, OBC, OTR, CBC, OTR}; + +/** + * @brief Maps a (top-of-stack symbol, 
input symbol)-pair to a symbol group id of the DVPA + */ +struct PdaSymbolToSymbolGroupId { + template + __device__ __forceinline__ PdaSymbolGroupIdT + operator()(thrust::tuple symbol_pair) + { + // The symbol read from the input + auto symbol = thrust::get<0>(symbol_pair); + + // The stack symbol (i.e., what is on top of the stack at the time the input symbol was read) + // I.e., whether we're reading in something within a struct, a list, or the JSON root + auto stack_symbol = thrust::get<1>(symbol_pair); + + // The stack symbol offset: '_' is the root group (0), '[' is the list group (1), '{' is the + // struct group (2) + int32_t stack_idx = + (stack_symbol == '_') ? STACK_ROOT : ((stack_symbol == '[') ? STACK_LIST : STACK_STRUCT); + + // The relative symbol group id of the current input symbol + PdaSymbolGroupIdT symbol_gid = tos_sg_to_pda_sgid[min( + static_cast(symbol), + static_cast(sizeof(tos_sg_to_pda_sgid) / sizeof(tos_sg_to_pda_sgid[0])) - 1)]; + return stack_idx * NUM_PDA_INPUT_SGS + symbol_gid; + } +}; + +// The states defined by the pushdown automaton +enum pda_state_t : int32_t { + PD_BOV, + PD_BOA, + PD_LON, + PD_STR, + PD_SCE, + PD_PVL, + PD_BFN, + PD_FLN, + PD_FNE, + PD_PFN, + PD_ERR, + PD_NUM_STATES +}; + +// The starting state of the pushdown automaton +constexpr int32_t start_state = PD_BOV; + +// Identity symbol to symbol group lookup table +const std::vector> pda_sgids{ + {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12}, {13}, {14}, + {15}, {16}, {17}, {18}, {19}, {20}, {21}, {22}, {23}, {24}, {25}, {26}, {27}, {28}, {29}}; + +/** + * @brief Getting the transition table + */ +std::vector> get_transition_table() +{ + std::vector> pda_tt(PD_NUM_STATES); + pda_tt[PD_BOV] = {PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_LON}; + pda_tt[PD_BOA] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_ERR}; + pda_tt[PD_LON] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_LON}; + pda_tt[PD_STR] = {PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR}; + pda_tt[PD_SCE] = {PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + pda_tt[PD_PVL] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_ERR, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_ERR}; + pda_tt[PD_BFN] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_FLN, PD_ERR, PD_ERR, 
PD_ERR, PD_BFN, PD_ERR}; + pda_tt[PD_FLN] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + pda_tt[PD_FNE] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + pda_tt[PD_PFN] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_PFN, PD_ERR}; + pda_tt[PD_ERR] = {PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR}; + return pda_tt; +} + +/** + * @brief Getting the translation table + */ +std::vector>> get_translation_table() +{ + std::vector>> pda_tlt(PD_NUM_STATES); + pda_tlt[PD_BOV] = {{TK_BOS}, {TK_BOL}, {TK_ERR}, {TK_ERR}, {TK_BST}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {}, {TK_BOV}, {TK_BOS}, {TK_BOL}, {TK_ERR}, {TK_ERR}, {TK_BST}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {}, {TK_BOV}, {TK_BOS}, {TK_BOL}, {TK_ERR}, {TK_ERR}, + {TK_BST}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {TK_BOV}}; + pda_tlt[PD_BOA] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_BOS}, {TK_BOL}, {TK_ERR}, {TK_EOL}, {TK_BST}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {}, {TK_BOV}, {TK_ERR}, {TK_ERR}, {TK_EOS}, {TK_ERR}, + {TK_BFN}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {TK_ERR}}; + pda_tlt[PD_LON] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_POV}, {}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_POV, TK_EOL}, {TK_ERR}, + {TK_ERR}, {TK_POV}, {TK_ERR}, {TK_POV}, {}, + {TK_ERR}, {TK_ERR}, {TK_POV, TK_EOS}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_POV}, {TK_ERR}, {TK_POV}, {}}; + pda_tlt[PD_STR] = {{}, {}, {}, {}, {TK_EST}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {TK_EST}, + {}, {}, {}, {}, {}, {}, {}, {}, {}, {TK_EST}, {}, {}, {}, {}, {}}; + pda_tlt[PD_SCE] = {{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, + {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}}; + pda_tlt[PD_PVL] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_EOL}, {TK_ERR}, {TK_ERR}, + {}, {TK_ERR}, {}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_EOS}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {}, {TK_ERR}, {}, {TK_ERR}}; + pda_tlt[PD_BFN] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_BFN}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {TK_ERR}}; + pda_tlt[PD_FLN] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {}, {}, {}, + {TK_EFN}, {}, {}, {}, {}, {}}; + pda_tlt[PD_FNE] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, 
{TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {}, {}, {}, + {}, {}, {}, {}, {}, {}}; + pda_tlt[PD_PFN] = {{TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, {TK_ERR}, + {TK_ERR}, {TK_ERR}, {TK_ERR}, {}, {}, {TK_ERR}}; + pda_tlt[PD_ERR] = {{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, + {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}}; + return pda_tlt; +} + +} // namespace tokenizer_pda + +/** + * @brief Function object used to filter for brackets and braces that represent push and pop + * operations + * + */ +struct JSONToStackOp { + template + constexpr CUDF_HOST_DEVICE fst::stack_op_type operator()(StackSymbolT const& stack_symbol) const + { + return (stack_symbol == '{' || stack_symbol == '[') ? fst::stack_op_type::PUSH + : (stack_symbol == '}' || stack_symbol == ']') ? fst::stack_op_type::POP + : fst::stack_op_type::READ; + } +}; + +void get_stack_context(device_span d_json_in, + device_span d_top_of_stack, + rmm::cuda_stream_view stream) +{ + constexpr std::size_t single_item = 1; + + // Symbol that will represent empty-stack (i.e., that we're at the DOM root) + constexpr StackSymbolT root_symbol = '_'; + // This can be any stack symbol from the stack alphabet that does not push onto stack + constexpr StackSymbolT read_symbol = 'x'; + + // Number of stack operations in the input (i.e., number of '{', '}', '[', ']' outside of quotes) + hostdevice_vector d_num_stack_ops(single_item, stream); + + // Sequence of stack symbols and their position in the original input (sparse representation) + rmm::device_uvector d_stack_ops{d_json_in.size(), stream}; + rmm::device_uvector d_stack_op_indices{d_json_in.size(), stream}; + + // Prepare finite-state transducer that only selects '{', '}', '[', ']' outside of quotes + using ToStackOpFstT = cudf::io::fst::detail::Dfa; + ToStackOpFstT json_to_stack_ops_fst{to_stack_op::symbol_groups, + to_stack_op::transition_table, + to_stack_op::translation_table, + stream}; + + // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end + // structs/lists + json_to_stack_ops_fst.Transduce(d_json_in.begin(), + static_cast(d_json_in.size()), + d_stack_ops.data(), + d_stack_op_indices.data(), + d_num_stack_ops.device_ptr(), + to_stack_op::start_state, + stream); + + // Request temporary storage requirements + fst::sparse_stack_op_to_top_of_stack( + d_stack_ops.data(), + device_span{d_stack_op_indices.data(), d_stack_op_indices.size()}, + JSONToStackOp{}, + d_top_of_stack.data(), + root_symbol, + read_symbol, + d_json_in.size(), + stream); +} + +void get_token_stream(device_span d_json_in, + device_span d_tokens, + device_span d_tokens_indices, + SymbolOffsetT* d_num_written_tokens, + rmm::cuda_stream_view stream) +{ + // Memory holding the top-of-stack stack context for the input + rmm::device_uvector d_top_of_stack{d_json_in.size(), stream}; + + // Identify what is the stack context for each input character (is it: JSON-root, struct, or list) + get_stack_context(d_json_in, d_top_of_stack, stream); + + // Prepare for PDA transducer pass, merging input symbols with stack symbols + rmm::device_uvector d_pda_sgids{d_json_in.size(), stream}; + auto zip_in = thrust::make_zip_iterator(d_json_in.data(), d_top_of_stack.data()); + thrust::transform(rmm::exec_policy(stream), + zip_in, + zip_in + 
d_json_in.size(), + d_pda_sgids.data(), + tokenizer_pda::PdaSymbolToSymbolGroupId{}); + + // PDA transducer alias + using ToTokenStreamFstT = cudf::io::fst::detail:: + Dfa; + + // Instantiating PDA transducer + ToTokenStreamFstT json_to_tokens_fst{tokenizer_pda::pda_sgids, + tokenizer_pda::get_transition_table(), + tokenizer_pda::get_translation_table(), + stream}; + + // Perform a PDA-transducer pass + json_to_tokens_fst.Transduce(d_pda_sgids.begin(), + static_cast(d_json_in.size()), + d_tokens.data(), + d_tokens_indices.data(), + d_num_written_tokens, + tokenizer_pda::start_state, + stream); +} + +} // namespace gpu +} // namespace json +} // namespace io +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 587f317692f..6c3d2353693 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -222,6 +222,7 @@ ConfigureTest(FILE_IO_TEST io/file_io_test.cpp) ConfigureTest(ORC_TEST io/orc_test.cpp) ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp) +ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cu) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) diff --git a/cpp/tests/io/nested_json_test.cu b/cpp/tests/io/nested_json_test.cu new file mode 100644 index 00000000000..6336f493c17 --- /dev/null +++ b/cpp/tests/io/nested_json_test.cu @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +namespace nested_json = cudf::io::json::gpu; + +// Base test fixture for tests +struct JsonTest : public cudf::test::BaseFixture { +}; + +TEST_F(JsonTest, StackContext) +{ + // Type used to represent the atomic symbol type used within the finite-state machine + using SymbolT = char; + using StackSymbolT = char; + + // Prepare cuda stream for data transfers & kernels + cudaStream_t stream = nullptr; + cudaStreamCreate(&stream); + rmm::cuda_stream_view stream_view(stream); + + // Test input + std::string input = R"( [{)" + R"("category": "reference",)" + R"("index:": [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "[Sayings of the Century]",)" + R"("price": 8.95)" + R"(}, )" + R"({)" + R"("category": "reference",)" + R"("index": [4,{},null,{"a":[{ }, {}] } ],)" + R"("author": "Nigel Rees",)" + R"("title": "{}[], <=semantic-symbols-string",)" + R"("price": 8.95)" + R"(}] )"; + + // Prepare input & output buffers + rmm::device_uvector d_input(input.size(), stream_view); + hostdevice_vector stack_context(input.size(), stream_view); + + ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( + d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream)); + + // Run algorithm + cudf::io::json::gpu::get_stack_context( + d_input, + cudf::device_span{stack_context.device_ptr(), stack_context.size()}, + stream); + + // Copy back the results + stack_context.device_to_host(stream); + + // Make sure we copied back the stack context + stream_view.synchronize(); + + std::vector golden_stack_context{ + '_', '_', '_', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '[', '[', '[', '[', '[', '[', '[', '[', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '[', '[', '[', '[', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '[', '[', '[', + '{', '[', '[', '[', '[', '[', '[', '[', '{', '{', '{', '{', '{', '[', '{', '{', '[', '[', + '[', '{', '[', '{', '{', '[', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', + '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '[', '_'}; + + ASSERT_EQ(golden_stack_context.size(), stack_context.size()); + for (std::size_t i = 0; i < stack_context.size() && i < 1000; i++) { + ASSERT_EQ(golden_stack_context[i], stack_context[i]); + } +} + +TEST_F(JsonTest, TokenStream) +{ + using cudf::io::json::gpu::PdaTokenT; + using cudf::io::json::gpu::SymbolOffsetT; + using cudf::io::json::gpu::SymbolT; + + constexpr std::size_t single_item = 1; + + // Prepare cuda stream for data transfers & kernels + cudaStream_t stream = nullptr; + cudaStreamCreate(&stream); + rmm::cuda_stream_view stream_view(stream); + + // Test input + 
std::string input = R"( [{)" + R"("category": "reference",)" + R"("index:": [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "[Sayings of the Century]",)" + R"("price": 8.95)" + R"(}, )" + R"({)" + R"("category": "reference",)" + R"("index": [4,{},null,{"a":[{ }, {}] } ],)" + R"("author": "Nigel Rees",)" + R"("title": "{}[], <=semantic-symbols-string",)" + R"("price": 8.95)" + R"(}] )"; + + // Prepare input & output buffers + rmm::device_uvector d_input(input.size(), stream_view); + + ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( + d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream)); + + + hostdevice_vector tokens_gpu{input.size(), stream}; + hostdevice_vector token_indices_gpu{input.size(), stream}; + hostdevice_vector num_tokens_out{single_item, stream}; + + // Parse the JSON and get the token stream + cudf::io::json::gpu::get_token_stream( + d_input, + cudf::device_span{tokens_gpu.device_ptr(), tokens_gpu.size()}, + cudf::device_span{token_indices_gpu.device_ptr(), token_indices_gpu.size()}, + num_tokens_out.device_ptr(), + stream); + + // Copy back the number of tokens that were written + num_tokens_out.device_to_host(stream); + tokens_gpu.device_to_host(stream); + token_indices_gpu.device_to_host(stream); + + // Make sure we copied back all relevant data + stream_view.synchronize(); + + // Golden token stream sample + std::vector> golden_token_stream = { + {2, nested_json::TK_BOL}, {3, nested_json::TK_BOS}, {4, nested_json::TK_BFN}, + {13, nested_json::TK_EFN}, {16, nested_json::TK_BST}, {26, nested_json::TK_EST}, + {28, nested_json::TK_BFN}, {35, nested_json::TK_EFN}, {38, nested_json::TK_BOL}, + {39, nested_json::TK_BOV}, {40, nested_json::TK_POV}, {41, nested_json::TK_BOV}, + {43, nested_json::TK_POV}, {44, nested_json::TK_BOV}, {46, nested_json::TK_POV}, + {46, nested_json::TK_EOL}, {48, nested_json::TK_BFN}, {55, nested_json::TK_EFN}, + {58, nested_json::TK_BST}, {69, nested_json::TK_EST}, {71, nested_json::TK_BFN}, + {77, nested_json::TK_EFN}, {80, nested_json::TK_BST}, {105, nested_json::TK_EST}, + {107, nested_json::TK_BFN}, {113, nested_json::TK_EFN}, {116, nested_json::TK_BOV}, + {120, nested_json::TK_POV}, {120, nested_json::TK_EOS}, {124, nested_json::TK_BOS}, + {125, nested_json::TK_BFN}, {134, nested_json::TK_EFN}, {137, nested_json::TK_BST}, + {147, nested_json::TK_EST}, {149, nested_json::TK_BFN}, {155, nested_json::TK_EFN}, + {158, nested_json::TK_BOL}, {159, nested_json::TK_BOV}, {160, nested_json::TK_POV}, + {161, nested_json::TK_BOS}, {162, nested_json::TK_EOS}, {164, nested_json::TK_BOV}, + {168, nested_json::TK_POV}, {169, nested_json::TK_BOS}, {170, nested_json::TK_BFN}, + {172, nested_json::TK_EFN}, {174, nested_json::TK_BOL}, {175, nested_json::TK_BOS}, + {177, nested_json::TK_EOS}, {180, nested_json::TK_BOS}, {181, nested_json::TK_EOS}, + {182, nested_json::TK_EOL}, {184, nested_json::TK_EOS}, {186, nested_json::TK_EOL}, + {188, nested_json::TK_BFN}, {195, nested_json::TK_EFN}, {198, nested_json::TK_BST}, + {209, nested_json::TK_EST}, {211, nested_json::TK_BFN}, {217, nested_json::TK_EFN}, + {220, nested_json::TK_BST}, {252, nested_json::TK_EST}, {254, nested_json::TK_BFN}, + {260, nested_json::TK_EFN}, {263, nested_json::TK_BOV}, {267, nested_json::TK_POV}, + {267, nested_json::TK_EOS}, {268, nested_json::TK_EOL}}; + + // Verify the number of tokens matches + ASSERT_EQ(golden_token_stream.size(), num_tokens_out[0]); + + for (std::size_t i = 0; i < num_tokens_out[0]; i++) { + // Ensure the index the tokens are 
pointing to matches the golden token stream
+    ASSERT_EQ(golden_token_stream[i].first, token_indices_gpu[i]);
+    // Ensure the token category is correct
+    ASSERT_EQ(golden_token_stream[i].second, tokens_gpu[i]);
+  }
+}
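
For anyone trying out the new API, below is a minimal, standalone host-side sketch of how get_token_stream() can be driven, loosely following the TokenStream test above. It is illustrative only: the main() scaffold, the sample input, the buffer sizing, and the chosen includes are assumptions for the sketch and are not part of this patch.

// Illustrative driver: tokenizes a small JSON string with the API added by this patch and
// copies the resulting (token, input-offset) stream back to the host.
#include "nested_json.h"

#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cuda_runtime.h>

#include <cstdio>
#include <string>
#include <vector>

int main()
{
  using namespace cudf::io::json::gpu;

  // Prepare a CUDA stream for data transfers & kernels, mirroring the tests above
  cudaStream_t stream = nullptr;
  cudaStreamCreate(&stream);
  rmm::cuda_stream_view stream_view(stream);

  // Hypothetical sample input; any JSON string works
  std::string const input = R"([{"a": [1, 2], "b": "str"}])";

  // Device buffers, sized like in the test above (one slot per input character)
  rmm::device_uvector<SymbolT> d_input(input.size(), stream_view);
  rmm::device_uvector<PdaTokenT> d_tokens(input.size(), stream_view);
  rmm::device_uvector<SymbolOffsetT> d_token_indices(input.size(), stream_view);
  rmm::device_uvector<SymbolOffsetT> d_num_tokens(1, stream_view);

  cudaMemcpyAsync(
    d_input.data(), input.data(), input.size() * sizeof(SymbolT), cudaMemcpyHostToDevice, stream);

  // Parse the JSON input and emit the token stream
  get_token_stream(
    cudf::device_span<SymbolT const>{d_input.data(), d_input.size()},
    cudf::device_span<PdaTokenT>{d_tokens.data(), d_tokens.size()},
    cudf::device_span<SymbolOffsetT>{d_token_indices.data(), d_token_indices.size()},
    d_num_tokens.data(),
    stream_view);

  // Copy back the number of tokens that were written
  SymbolOffsetT num_tokens = 0;
  cudaMemcpyAsync(
    &num_tokens, d_num_tokens.data(), sizeof(SymbolOffsetT), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);

  // Copy back the tokens and their indices
  std::vector<PdaTokenT> tokens(num_tokens);
  std::vector<SymbolOffsetT> indices(num_tokens);
  cudaMemcpyAsync(
    tokens.data(), d_tokens.data(), num_tokens * sizeof(PdaTokenT), cudaMemcpyDeviceToHost, stream);
  cudaMemcpyAsync(indices.data(),
                  d_token_indices.data(),
                  num_tokens * sizeof(SymbolOffsetT),
                  cudaMemcpyDeviceToHost,
                  stream);
  cudaStreamSynchronize(stream);

  // Print each (token category, input offset) pair
  for (SymbolOffsetT i = 0; i < num_tokens; i++) {
    std::printf("token %d at offset %u\n", static_cast<int>(tokens[i]), indices[i]);
  }

  cudaStreamDestroy(stream);
  return 0;
}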