From b6548797d5ac43585fdfeb6762df6ac7b8d18c8b Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Tue, 2 Feb 2021 17:03:04 -0600
Subject: [PATCH 01/33] Extremely rough draft.

---
 cpp/include/cudf/strings/detail/substring.hpp |  42 ++
 cpp/include/cudf/strings/substring.hpp        |  21 +-
 cpp/src/strings/json/json_path.cu             | 460 ++++++++++++++++++
 cpp/tests/CMakeLists.txt                      |   3 +-
 cpp/tests/strings/json_tests.cpp              | 162 ++++++
 5 files changed, 686 insertions(+), 2 deletions(-)
 create mode 100644 cpp/include/cudf/strings/detail/substring.hpp
 create mode 100644 cpp/src/strings/json/json_path.cu
 create mode 100644 cpp/tests/strings/json_tests.cpp
diff --git a/cpp/include/cudf/strings/detail/substring.hpp b/cpp/include/cudf/strings/detail/substring.hpp
new file mode 100644
index 00000000000..a646d93e2b8
--- /dev/null
+++ b/cpp/include/cudf/strings/detail/substring.hpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <memory>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+
+/**
+ * @copydoc cudf::get_json_object
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+std::unique_ptr<cudf::column> get_json_object(
+  cudf::strings_column_view const& col,
+  cudf::string_scalar const& json_path,
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+}  // namespace detail
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/include/cudf/strings/substring.hpp b/cpp/include/cudf/strings/substring.hpp
index 6941615a0c2..06a38360fdb 100644
--- a/cpp/include/cudf/strings/substring.hpp
+++ b/cpp/include/cudf/strings/substring.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -197,6 +197,25 @@ std::unique_ptr<column> slice_strings(
   size_type count,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Apply a JSONPath string to all rows in an input strings column.
+ *
+ * Applies a JSONPath string to an incoming strings column where each row in the column
+ * is a valid json string.  The output is returned by row as a strings column.
+ *
+ * https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html
+ * Implements only the operators: $ . [] *
+ *
+ * @param col The input strings column. Each row must contain a valid json string
+ * @param json_path The JSONPath string to be applied to each row
+ * @param mr Resource for allocating device memory.
+ * @return New strings column containing the retrieved json object strings
+ */
+std::unique_ptr<cudf::column> get_json_object(
+  cudf::strings_column_view const& col,
+  cudf::string_scalar const& json_path,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of doxygen group
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
new file mode 100644
index 00000000000..b2b317829e8
--- /dev/null
+++ b/cpp/src/strings/json/json_path.cu
@@ -0,0 +1,460 @@
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/get_value.cuh>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
+
+#include <rmm/exec_policy.hpp>
+
+#include <cudf_test/column_wrapper.hpp>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+
+namespace {
+
+using namespace cudf;
+
+CUDA_HOST_DEVICE_CALLABLE bool device_strncmp(const char* str1, const char* str2, size_t num_chars)
+{
+  for (size_t idx = 0; idx < num_chars; idx++) {
+    if (str1[idx] != str2[idx]) { return false; }
+  }
+  return true;
+}
+
+CUDA_HOST_DEVICE_CALLABLE char const* device_strpbrk(const char* str,
+                                                     size_t str_size,
+                                                     const char* tok,
+                                                     size_t tok_size)
+{
+  size_t pos = 0;
+  while (pos < str_size) {
+    size_t tpos = 0;
+    char c      = str[pos];
+    while (tpos < tok_size) {
+      if (c == tok[tpos]) { return str + pos; }
+      tpos++;
+    }
+    pos++;
+  }
+  return nullptr;
+}
+
+struct json_string {
+  const char* str;
+  int64_t len;
+
+  CUDA_HOST_DEVICE_CALLABLE bool operator==(json_string const& cmp)
+  {
+    return len == cmp.len && str != nullptr && cmp.str != nullptr &&
+           device_strncmp(str, cmp.str, static_cast<size_t>(len));
+  }
+};
+
+enum json_element_type {
+  NONE,
+  OBJECT,
+  ARRAY,
+};
+
+class parser {
+ protected:
+  CUDA_HOST_DEVICE_CALLABLE parser() : input(nullptr), input_len(0), pos(nullptr) {}
+  CUDA_HOST_DEVICE_CALLABLE parser(const char* _input, int64_t _input_len)
+    : input(_input), input_len(_input_len), pos(_input)
+  {
+    parse_whitespace();
+  }
+
+  CUDA_HOST_DEVICE_CALLABLE bool parse_whitespace()
+  {
+    while (!eof()) {
+      char c = *pos;
+      if (c == ' ' || c == '\r' || c == '\n' || c == '\t') {
+        pos++;
+      } else {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  CUDA_HOST_DEVICE_CALLABLE bool eof(const char* p) { return p - input >= input_len; }
+
+  CUDA_HOST_DEVICE_CALLABLE bool eof() { return eof(pos); }
+
+  CUDA_HOST_DEVICE_CALLABLE bool parse_name(json_string& name, json_string& terminators)
+  {
+    char c = *pos;
+    switch (c) {
+      case '*':
+        name.str = pos;
+        name.len = 1;
+        pos++;
+        return true;
+
+      default: {
+        size_t const chars_left = input_len - (pos - input);
+        char const* end         = device_strpbrk(pos, chars_left, terminators.str, terminators.len);
+        if (end) {
+          name.str = pos;
+          name.len = end - pos;
+          pos      = end;
+        } else {
+          name.str = pos;
+          name.len = chars_left;
+          pos      = input + input_len;
+        }
+        return true;
+      } break;
+    }
+
+    return false;
+  }
+
+ protected:
+  char const* input;
+  int64_t input_len;
+  char const* pos;
+};
+
+class json_state : private parser {
+ public:
+  CUDA_HOST_DEVICE_CALLABLE json_state()
+    : parser(), element(json_element_type::NONE), cur_el_start(nullptr)
+  {
+  }
+  CUDA_HOST_DEVICE_CALLABLE json_state(const char* _input, int64_t _input_len)
+    : parser(_input, _input_len), element(json_element_type::NONE), cur_el_start(nullptr)
+  {
+  }
+
+  CUDA_HOST_DEVICE_CALLABLE bool next_match(json_string& str, json_state& child)
+  {
+    json_string name;
+    if (!parse_string(name, true)) { return false; }
+    if ((str.len == 1 && str.str[0] == '*') || str == name) {
+      // if this isn't an empty string, parse out the :
+      if (name.len > 0) {
+        if (!parse_whitespace() || *pos != ':') { return false; }
+        pos++;
+      }
+
+      // we have a match on the name, so advance to the beginning of the next element
+      if (parse_whitespace()) {
+        switch (*pos) {
+          case '[': element = ARRAY; break;
+
+          case '{': element = OBJECT; break;
+
+          default: return false;
+        }
+        cur_el_start = pos++;
+
+        // success
+        child = *this;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  CUDA_HOST_DEVICE_CALLABLE json_string extract_element()
+  {
+    // collapse the current element into a json_string
+    int obj_count = 0;
+    int arr_count = 0;
+
+    char const* start = cur_el_start;
+    char const* end   = start;
+    while (!eof(end)) {
+      char c = *end++;
+      switch (c) {
+        case '{': obj_count++; break;
+        case '}': obj_count--; break;
+        case '[': arr_count++; break;
+        case ']': arr_count--; break;
+        default: break;
+      }
+      if (obj_count == 0 && arr_count == 0) { break; }
+    }
+    pos = end;
+
+    return {start, end - start};
+  }
+
+  json_element_type element;
+
+ private:
+  CUDA_HOST_DEVICE_CALLABLE bool parse_string(json_string& str, bool can_be_empty)
+  {
+    str.str = nullptr;
+    str.len = 0;
+
+    if (parse_whitespace()) {
+      if (*pos == '\"') {
+        const char* start = ++pos;
+        while (!eof()) {
+          if (*pos == '\"') {
+            str.str = start;
+            str.len = pos - start;
+            pos++;
+            return true;
+          }
+          pos++;
+        }
+      }
+    }
+
+    return can_be_empty ? true : false;
+  }
+  const char* cur_el_start;
+};
+
+enum path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END };
+
+// constexpr max_name_len    (63)
+struct path_operator {
+  path_operator_type type;
+  json_string name;
+  int index;
+};
+
+// current state of the JSONPath
+class path_state : private parser {
+ public:
+  CUDA_HOST_DEVICE_CALLABLE path_state() : parser() {}
+  CUDA_HOST_DEVICE_CALLABLE path_state(const char* _path, size_t _path_len)
+    : parser(_path, _path_len)
+  {
+  }
+
+  CUDA_HOST_DEVICE_CALLABLE path_operator get_next_operator()
+  {
+    if (eof()) { return {END}; }
+
+    char c = parse_char();
+    switch (c) {
+      case '$': return {ROOT};
+
+      case '.': {
+        path_operator op;
+        json_string term{".[", 2};
+        if (parse_name(op.name, term)) {
+          if (op.name.len == 1 && op.name.str[0] == '*') {
+            op.type = CHILD_WILDCARD;
+          } else {
+            op.type = CHILD;
+          }
+          return op;
+        }
+      } break;
+
+      // 3 ways this can be used
+      // indices:   [0]
+      // name:      ['book']
+      // wildcard:  [*]
+      case '[': {
+        path_operator op;
+        json_string term{"]", 1};
+        if (parse_name(op.name, term)) {
+          pos++;
+          if (op.name.len == 1 && op.name.str[0] == '*') {
+            op.type = CHILD_WILDCARD;
+          } else {
+            // unhandled cases
+            break;
+          }
+          return op;
+        }
+      } break;
+
+      default: break;
+    }
+    return {ERROR};
+  }
+
+ private:
+  CUDA_HOST_DEVICE_CALLABLE char parse_char() { return *pos++; }
+};
+
+struct json_output {
+  size_t output_max_len;
+  size_t output_len;
+  char* output;
+
+  CUDA_HOST_DEVICE_CALLABLE void add_output(const char* str, size_t len)
+  {
+    if (output != nullptr) {
+      // assert output_len + len < output_max_len
+      memcpy(output + output_len, str, len);
+    }
+    output_len += len;
+  }
+
+  CUDA_HOST_DEVICE_CALLABLE void add_output(json_string str) { add_output(str.str, str.len); }
+};
+
+CUDA_HOST_DEVICE_CALLABLE void parse_json_path(json_state& j_state,
+                                               path_state p_state,
+                                               json_output& output)
+{
+  path_operator op = p_state.get_next_operator();
+
+  switch (op.type) {
+    // whatever the first object is
+    case ROOT: {
+      json_state child;
+      json_string wildcard{"*", 1};
+      if (j_state.next_match(wildcard, child)) { parse_json_path(child, p_state, output); }
+    } break;
+
+    // .name
+    // ['name']
+    // [1]
+    // will return a single thing
+    case CHILD: {
+      json_state child;
+      if (j_state.next_match(op.name, child)) { parse_json_path(child, p_state, output); }
+    } break;
+
+    // .*
+    // [*]
+    // will return an array of things
+    case CHILD_WILDCARD: {
+      output.add_output("[\n", 2);
+
+      json_state child;
+      int count = 0;
+      while (j_state.next_match(op.name, child)) {
+        if (count > 0) { output.add_output(",\n", 2); }
+        parse_json_path(child, p_state, output);
+        j_state = child;
+        count++;
+      }
+      output.add_output("]\n", 2);
+    } break;
+
+    // some sort of error.
+    case ERROR: break;
+
+    // END case
+    default: output.add_output(j_state.extract_element()); break;
+  }
+}
+
+CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input,
+                                                             size_t input_len,
+                                                             char const* path,
+                                                             size_t path_len,
+                                                             char* out_buf,
+                                                             size_t out_buf_size)
+{
+  // TODO: add host-side code to verify path is a valid string.
+  json_state j_state(input, input_len);
+  path_state p_state(path, path_len);
+  json_output output{out_buf_size, 0, out_buf};
+
+  parse_json_path(j_state, p_state, output);
+
+  return output;
+}
+
+__global__ void get_json_object_kernel(char const* chars,
+                                       size_type const* offsets,
+                                       char const* json_path,
+                                       size_t json_path_len,
+                                       size_type* output_offsets,
+                                       char* out_buf,
+                                       size_t out_buf_size)
+{
+  uint64_t const tid = threadIdx.x + (blockDim.x * blockIdx.x);
+
+  json_output out = get_json_object_single(chars + offsets[tid],
+                                           offsets[tid + 1] - offsets[tid],
+                                           json_path,
+                                           json_path_len,
+                                           out_buf,
+                                           out_buf_size);
+
+  // filled in only during the precompute step
+  if (output_offsets != nullptr) { output_offsets[tid] = static_cast<size_type>(out.output_len); }
+}
+
+std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& col,
+                                              cudf::string_scalar const& json_path,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource* mr)
+{
+  size_t stack_size;
+  cudaDeviceGetLimit(&stack_size, cudaLimitStackSize);
+  cudaDeviceSetLimit(cudaLimitStackSize, 2048);
+
+  auto offsets = cudf::make_fixed_width_column(
+    data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr);
+  cudf::mutable_column_view offsets_view(*offsets);
+
+  cudf::detail::grid_1d const grid{1, col.size()};
+
+  // preprocess sizes
+  get_json_object_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+    col.chars().head<char>(),
+    col.offsets().head<size_type>(),
+    json_path.data(),
+    json_path.size(),
+    offsets_view.head<size_type>(),
+    nullptr,
+    0);
+
+  // convert sizes to offsets
+  thrust::exclusive_scan(rmm::exec_policy(stream),
+                         offsets_view.head<size_type>(),
+                         offsets_view.head<size_type>() + col.size() + 1,
+                         offsets_view.head<size_type>(),
+                         0);
+  size_type output_size = cudf::detail::get_value<size_type>(offsets_view, col.size(), stream);
+
+  // allocate output string column
+  auto chars = cudf::make_fixed_width_column(
+    data_type{type_id::INT8}, output_size, mask_state::UNALLOCATED, stream, mr);
+
+  // compute results
+  cudf::mutable_column_view chars_view(*chars);
+  get_json_object_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+    col.chars().head<char>(),
+    col.offsets().head<size_type>(),
+    json_path.data(),
+    json_path.size(),
+    nullptr,
+    chars_view.head<char>(),
+    output_size);
+
+  // reset back to original stack size
+  cudaDeviceSetLimit(cudaLimitStackSize, stack_size);
+
+  return make_strings_column(col.size(),
+                             std::move(offsets),
+                             std::move(chars),
+                             UNKNOWN_NULL_COUNT,
+                             rmm::device_buffer{},
+                             stream,
+                             mr);
+}
+
+}  // namespace
+}  // namespace detail
+
+std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& col,
+                                              cudf::string_scalar const& json_path,
+                                              rmm::mr::device_memory_resource* mr)
+{
+  return detail::get_json_object(col, json_path, 0, mr);
+}
+
+}  // namespace strings
+}  // namespace cudf
\ No newline at end of file
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 8395a3cc1f2..08899c09d5c 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -585,7 +585,8 @@ set(STRINGS_TEST_SRC
     "${CMAKE_CURRENT_SOURCE_DIR}/strings/strip_tests.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/strings/substring_tests.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/strings/translate_tests.cpp"
-    "${CMAKE_CURRENT_SOURCE_DIR}/strings/urls_tests.cpp")
+    "${CMAKE_CURRENT_SOURCE_DIR}/strings/urls_tests.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/strings/json_tests.cpp")
 
 ConfigureTest(STRINGS_TEST "${STRINGS_TEST_SRC}")
 
diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp
new file mode 100644
index 00000000000..f114738b345
--- /dev/null
+++ b/cpp/tests/strings/json_tests.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/strings/substring.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+/*
+const char* json_string = "{
+    "store": {
+        "book": [
+            {
+                "category": "reference",
+                "author": "Nigel Rees",
+                "title": "Sayings of the Century",
+                "price": 8.95
+            },
+            {
+                "category": "fiction",
+                "author": "Evelyn Waugh",
+                "title": "Sword of Honour",
+                "price": 12.99
+            },
+            {
+                "category": "fiction",
+                "author": "Herman Melville",
+                "title": "Moby Dick",
+                "isbn": "0-553-21311-3",
+                "price": 8.99
+            },
+            {
+                "category": "fiction",
+                "author": "J. R. R. Tolkien",
+                "title": "The Lord of the Rings",
+                "isbn": "0-395-19395-8",
+                "price": 22.99
+            }
+        ],
+        "bicycle": {
+            "color": "red",
+            "price": 19.95
+        }
+    },
+}";
+*/
+
+struct JsonTests : public cudf::test::BaseFixture {
+};
+
+TEST_F(JsonTests, GetJsonObject)
+{
+  // reference:  https://jsonpath.herokuapp.com/
+  // clang-format off
+   /*
+   {
+      "store": {
+         "book": [
+               {
+                  "category": "reference",
+                  "author": "Nigel Rees",
+                  "title": "Sayings of the Century",
+                  "price": 8.95
+               },
+               {
+                  "category": "fiction",
+                  "author": "Evelyn Waugh",
+                  "title": "Sword of Honour",
+                  "price": 12.99
+               },
+               {
+                  "category": "fiction",
+                  "author": "Herman Melville",
+                  "title": "Moby Dick",
+                  "isbn": "0-553-21311-3",
+                  "price": 8.99
+               },
+               {
+                  "category": "fiction",
+                  "author": "J. R. R. Tolkien",
+                  "title": "The Lord of the Rings",
+                  "isbn": "0-395-19395-8",
+                  "price": 22.99
+               }
+         ],
+         "bicycle": {
+               "color": "red",
+               "price": 19.95
+         }
+      },
+      "expensive": 10
+   }
+   */
+  // clang-format on
+  // this string is formatted to result in a reasonably readable debug printf
+  const char* json_string =
+    "{\n\"store\": {\n\t\"book\": [\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": "
+    "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": "
+    "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn "
+    "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": "
+    "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman "
+    "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": "
+    "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": "
+    "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the "
+    "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": "
+    "22.99\n\t\t}\n\t],\n\t\"bicycle\": {\n\t\t\"color\": \"red\",\n\t\t\"price\": "
+    "19.95\n\t}\n},\n\"expensive\": 10\n}";
+
+  {
+    cudf::test::strings_column_wrapper input{json_string};
+    cudf::string_scalar json_path("$");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::print(*result);
+  }
+
+  {
+    cudf::test::strings_column_wrapper input{json_string};
+    cudf::string_scalar json_path("$.store");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::print(*result);
+  }
+
+  {
+    cudf::test::strings_column_wrapper input{json_string};
+    cudf::string_scalar json_path("$.store.book");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::print(*result);
+  }
+
+  {
+    cudf::test::strings_column_wrapper input{json_string};
+    cudf::string_scalar json_path("$.*");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::print(*result);
+  }
+
+  {
+    cudf::test::strings_column_wrapper input{json_string};
+    cudf::string_scalar json_path("$[*]");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::print(*result);
+  }
+}

From 36cd4c1356045472048d8591a5455365ea9b4255 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Wed, 10 Feb 2021 17:31:13 -0600
Subject: [PATCH 02/33] Add support for the full set of operators I believe we
 will need. The code is still purely naive and probably doesn't handle all
 possible error conditions well.

---
 cpp/src/io/utilities/parsing_utils.cuh |   2 +
 cpp/src/strings/json/json_path.cu      | 567 +++++++++++++++++++------
 cpp/tests/strings/json_tests.cpp       |  44 +-
 3 files changed, 475 insertions(+), 138 deletions(-)

diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index c7f405e1cc0..d8a6cb70a10 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -20,6 +20,8 @@
 #include <cudf/io/types.hpp>
 #include <cudf/utilities/span.hpp>
 
+#include <io/utilities/column_type_histogram.hpp>
+
 #include <rmm/device_vector.hpp>
 
 using cudf::detail::device_span;
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index b2b317829e8..9267d933368 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -7,6 +7,8 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
+#include <io/utilities/parsing_utils.cuh>
+
 #include <rmm/exec_policy.hpp>
 
 #include <cudf_test/column_wrapper.hpp>
@@ -19,6 +21,108 @@ namespace {
 
 using namespace cudf;
 
+CUDA_HOST_DEVICE_CALLABLE char to_lower(char const c)
+{
+  return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c;
+}
+
+template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+CUDA_HOST_DEVICE_CALLABLE uint8_t decode_digit(char c, bool* valid_flag)
+{
+  if (c >= '0' && c <= '9') return c - '0';
+  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+
+  *valid_flag = false;
+  return 0;
+}
+
+template <typename T, typename std::enable_if_t<!std::is_integral<T>::value>* = nullptr>
+CUDA_HOST_DEVICE_CALLABLE uint8_t decode_digit(char c, bool* valid_flag)
+{
+  if (c >= '0' && c <= '9') return c - '0';
+
+  *valid_flag = false;
+  return 0;
+}
+
+CUDA_HOST_DEVICE_CALLABLE bool is_infinity(char const* begin, char const* end)
+{
+  if (*begin == '-' || *begin == '+') begin++;
+  char const* cinf = "infinity";
+  auto index       = begin;
+  while (index < end) {
+    if (*cinf != to_lower(*index)) break;
+    index++;
+    cinf++;
+  }
+  return ((index == begin + 3 || index == begin + 8) && index >= end);
+}
+
+template <typename T, int base = 10>
+CUDA_HOST_DEVICE_CALLABLE T parse_numeric(const char* begin,
+                                          const char* end,
+                                          cudf::io::parse_options_view const& opts)
+{
+  T value{};
+  bool all_digits_valid = true;
+
+  // Handle negative values if necessary
+  int32_t sign = (*begin == '-') ? -1 : 1;
+
+  // Handle infinity
+  if (std::is_floating_point<T>::value && is_infinity(begin, end)) {
+    return sign * std::numeric_limits<T>::infinity();
+  }
+  if (*begin == '-' || *begin == '+') begin++;
+
+  // Skip over the "0x" prefix for hex notation
+  if (base == 16 && begin + 2 < end && *begin == '0' && *(begin + 1) == 'x') { begin += 2; }
+
+  // Handle the whole part of the number
+  // auto index = begin;
+  while (begin < end) {
+    if (*begin == opts.decimal) {
+      ++begin;
+      break;
+    } else if (base == 10 && (*begin == 'e' || *begin == 'E')) {
+      break;
+    } else if (*begin != opts.thousands && *begin != '+') {
+      value = (value * base) + decode_digit<T>(*begin, &all_digits_valid);
+    }
+    ++begin;
+  }
+
+  if (std::is_floating_point<T>::value) {
+    // Handle fractional part of the number if necessary
+    double divisor = 1;
+    while (begin < end) {
+      if (*begin == 'e' || *begin == 'E') {
+        ++begin;
+        break;
+      } else if (*begin != opts.thousands && *begin != '+') {
+        divisor /= base;
+        value += decode_digit<T>(*begin, &all_digits_valid) * divisor;
+      }
+      ++begin;
+    }
+
+    // Handle exponential part of the number if necessary
+    if (begin < end) {
+      const int32_t exponent_sign = *begin == '-' ? -1 : 1;
+      if (*begin == '-' || *begin == '+') { ++begin; }
+      int32_t exponent = 0;
+      while (begin < end) {
+        exponent = (exponent * 10) + decode_digit<T>(*(begin++), &all_digits_valid);
+      }
+      if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); }
+    }
+  }
+  if (!all_digits_valid) { return std::numeric_limits<T>::quiet_NaN(); }
+
+  return value * sign;
+}
+
 CUDA_HOST_DEVICE_CALLABLE bool device_strncmp(const char* str1, const char* str2, size_t num_chars)
 {
   for (size_t idx = 0; idx < num_chars; idx++) {
@@ -56,12 +160,36 @@ struct json_string {
   }
 };
 
-enum json_element_type {
-  NONE,
-  OBJECT,
-  ARRAY,
+enum class parse_result {
+  ERROR,
+  SUCCESS,
+  EMPTY,
+};
+
+enum json_element_type { NONE, OBJECT, ARRAY, VALUE };
+
+struct json_output {
+  size_t output_max_len;
+  size_t output_len;
+  char* output;
+
+  CUDA_HOST_DEVICE_CALLABLE void add_output(const char* str, size_t len)
+  {
+    if (output != nullptr) {
+      // assert output_len + len < output_max_len
+      memcpy(output + output_len, str, len);
+    }
+    output_len += len;
+  }
+
+  CUDA_HOST_DEVICE_CALLABLE void add_output(json_string str) { add_output(str.str, str.len); }
 };
 
+CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c)
+{
+  return c == ' ' || c == '\r' || c == '\n' || c == '\t' ? true : false;
+}
+
 class parser {
  protected:
   CUDA_HOST_DEVICE_CALLABLE parser() : input(nullptr), input_len(0), pos(nullptr) {}
@@ -71,11 +199,13 @@ class parser {
     parse_whitespace();
   }
 
+  CUDA_HOST_DEVICE_CALLABLE bool eof(const char* p) { return p - input >= input_len; }
+  CUDA_HOST_DEVICE_CALLABLE bool eof() { return eof(pos); }
+
   CUDA_HOST_DEVICE_CALLABLE bool parse_whitespace()
   {
     while (!eof()) {
-      char c = *pos;
-      if (c == ' ' || c == '\r' || c == '\n' || c == '\t') {
+      if (is_whitespace(*pos)) {
         pos++;
       } else {
         return true;
@@ -84,37 +214,73 @@ class parser {
     return false;
   }
 
-  CUDA_HOST_DEVICE_CALLABLE bool eof(const char* p) { return p - input >= input_len; }
+  CUDA_HOST_DEVICE_CALLABLE parse_result parse_string(json_string& str,
+                                                      bool can_be_empty,
+                                                      char quote)
+  {
+    str.str = nullptr;
+    str.len = 0;
 
-  CUDA_HOST_DEVICE_CALLABLE bool eof() { return eof(pos); }
+    if (parse_whitespace()) {
+      if (*pos == quote) {
+        const char* start = ++pos;
+        while (!eof()) {
+          if (*pos == quote) {
+            str.str = start;
+            str.len = pos - start;
+            pos++;
+            return parse_result::SUCCESS;
+          }
+          pos++;
+        }
+      }
+    }
 
-  CUDA_HOST_DEVICE_CALLABLE bool parse_name(json_string& name, json_string& terminators)
+    return can_be_empty ? parse_result::EMPTY : parse_result::ERROR;
+  }
+
+  // a name means:
+  // - a string followed by a :
+  // - no string
+  CUDA_HOST_DEVICE_CALLABLE parse_result parse_name(json_string& name,
+                                                    bool can_be_empty,
+                                                    char quote)
   {
-    char c = *pos;
-    switch (c) {
-      case '*':
-        name.str = pos;
-        name.len = 1;
+    if (parse_string(name, can_be_empty, quote) == parse_result::ERROR) {
+      return parse_result::ERROR;
+    }
+
+    // if we got a real string, the next char must be a :
+    if (name.len > 0) {
+      if (!parse_whitespace()) { return parse_result::ERROR; }
+      if (*pos == ':') {
         pos++;
-        return true;
+        return parse_result::SUCCESS;
+      }
+    }
+    return parse_result::EMPTY;
+  }
 
-      default: {
-        size_t const chars_left = input_len - (pos - input);
-        char const* end         = device_strpbrk(pos, chars_left, terminators.str, terminators.len);
-        if (end) {
-          name.str = pos;
-          name.len = end - pos;
-          pos      = end;
-        } else {
-          name.str = pos;
-          name.len = chars_left;
-          pos      = input + input_len;
-        }
-        return true;
-      } break;
+  // this function is not particularly strong
+  CUDA_HOST_DEVICE_CALLABLE parse_result parse_number(json_string& val)
+  {
+    if (!parse_whitespace()) { return parse_result::ERROR; }
+
+    // parse to the end of the number (does not do any error checking on whether
+    // the number is reasonably formed or not)
+    char const* start = pos;
+    char const* end   = start;
+    while (!eof(end)) {
+      char c = *end;
+      if (c == ',' || is_whitespace(c)) { break; }
+      end++;
     }
+    pos = end;
 
-    return false;
+    val.str = start;
+    val.len = {end - start};
+
+    return parse_result::SUCCESS;
   }
 
  protected:
@@ -134,89 +300,153 @@ class json_state : private parser {
   {
   }
 
-  CUDA_HOST_DEVICE_CALLABLE bool next_match(json_string& str, json_state& child)
-  {
-    json_string name;
-    if (!parse_string(name, true)) { return false; }
-    if ((str.len == 1 && str.str[0] == '*') || str == name) {
-      // if this isn't an empty string, parse out the :
-      if (name.len > 0) {
-        if (!parse_whitespace() || *pos != ':') { return false; }
-        pos++;
-      }
+  CUDA_HOST_DEVICE_CALLABLE json_state(json_state const& j) { *this = j; }
 
-      // we have a match on the name, so advance to the beginning of the next element
-      if (parse_whitespace()) {
-        switch (*pos) {
-          case '[': element = ARRAY; break;
+  CUDA_HOST_DEVICE_CALLABLE parse_result extract_element(json_output* output)
+  {
+    // collapse the current element into a json_string
 
-          case '{': element = OBJECT; break;
+    char const* start = cur_el_start;
+    char const* end   = start;
 
-          default: return false;
+    // if we're a value type, do a simple value parse.
+    if (cur_el_type == VALUE) {
+      pos = cur_el_start;
+      if (parse_value() != parse_result::SUCCESS) { return parse_result::ERROR; }
+      end = pos;
+    }
+    // otherwise, march through everything inside
+    else {
+      int obj_count = 0;
+      int arr_count = 0;
+
+      while (!eof(end)) {
+        char c = *end++;
+        // could do some additional checks here. we know our current
+        // element type, so we could be more strict on what kinds of
+        // characters we expect to see.
+        switch (c) {
+          case '{': obj_count++; break;
+          case '}': obj_count--; break;
+          case '[': arr_count++; break;
+          case ']': arr_count--; break;
+          default: break;
         }
-        cur_el_start = pos++;
-
-        // success
-        child = *this;
-        return true;
+        if (obj_count == 0 && arr_count == 0) { break; }
       }
+      pos = end;
     }
-    return false;
-  }
 
-  CUDA_HOST_DEVICE_CALLABLE json_string extract_element()
-  {
-    // collapse the current element into a json_string
-    int obj_count = 0;
-    int arr_count = 0;
+    // parse trailing ,
+    if (parse_whitespace()) {
+      if (*pos == ',') { pos++; }
+    }
 
-    char const* start = cur_el_start;
-    char const* end   = start;
-    while (!eof(end)) {
-      char c = *end++;
-      switch (c) {
-        case '{': obj_count++; break;
-        case '}': obj_count--; break;
-        case '[': arr_count++; break;
-        case ']': arr_count--; break;
-        default: break;
+    if (output != nullptr) {
+      // seems like names are never included with JSONPath unless
+      // they are nested within the element being returned.
+      /*
+      if(cur_el_name.len > 0){
+        output->add_output({"\"", 1});
+        output->add_output(cur_el_name);
+        output->add_output({"\"", 1});
+        output->add_output({":", 1});
       }
-      if (obj_count == 0 && arr_count == 0) { break; }
+      */
+      output->add_output({start, end - start});
     }
-    pos = end;
-
-    return {start, end - start};
+    return parse_result::SUCCESS;
   }
 
+  CUDA_HOST_DEVICE_CALLABLE parse_result skip_element() { return extract_element(nullptr); }
+
   json_element_type element;
 
+  CUDA_HOST_DEVICE_CALLABLE parse_result next_element() { return next_element_internal(false); }
+
+  CUDA_HOST_DEVICE_CALLABLE parse_result child_element() { return next_element_internal(true); }
+
+  CUDA_HOST_DEVICE_CALLABLE parse_result next_matching_element(json_string const& name,
+                                                               bool inclusive)
+  {
+    // if we're not including the current element, skip it
+    if (!inclusive) {
+      parse_result result = next_element_internal(false);
+      if (result != parse_result::SUCCESS) { return result; }
+    }
+    // loop until we find a match or there's nothing left
+    do {
+      // wildcard matches anything
+      if (name.len == 1 && name.str[0] == '*') {
+        return parse_result::SUCCESS;
+      } else if (cur_el_name == name) {
+        return parse_result::SUCCESS;
+      }
+
+      // next
+      parse_result result = next_element_internal(false);
+      if (result != parse_result::SUCCESS) { return result; }
+    } while (1);
+
+    return parse_result::ERROR;
+  }
+
  private:
-  CUDA_HOST_DEVICE_CALLABLE bool parse_string(json_string& str, bool can_be_empty)
+  CUDA_HOST_DEVICE_CALLABLE parse_result parse_value()
   {
-    str.str = nullptr;
-    str.len = 0;
+    if (!parse_whitespace()) { return parse_result::ERROR; }
 
-    if (parse_whitespace()) {
-      if (*pos == '\"') {
-        const char* start = ++pos;
-        while (!eof()) {
-          if (*pos == '\"') {
-            str.str = start;
-            str.len = pos - start;
-            pos++;
-            return true;
-          }
-          pos++;
-        }
-      }
+    // string or number?
+    json_string unused;
+    return *pos == '\"' ? parse_string(unused, false, '\"') : parse_number(unused);
+  }
+
+  CUDA_HOST_DEVICE_CALLABLE parse_result next_element_internal(bool child)
+  {
+    // if we're not getting a child element, skip the current element.
+    // this will leave pos as the first character -after- the close of
+    // the current element
+    if (!child && cur_el_start != nullptr) {
+      if (skip_element() == parse_result::ERROR) { return parse_result::ERROR; }
+      cur_el_start = nullptr;
     }
+    // otherwise pos will be at the first character within the current element
+
+    // what's next
+    if (!parse_whitespace()) { return parse_result::EMPTY; }
+    // if we're closing off a parent element, we're done
+    char c = *pos;
+    if (c == ']' || c == '}') { return parse_result::EMPTY; }
+
+    // element name, if any
+    if (parse_name(cur_el_name, true, '\"') == parse_result::ERROR) { return parse_result::ERROR; }
+
+    // element type
+    if (!parse_whitespace()) { return parse_result::EMPTY; }
+    switch (*pos) {
+      case '[': cur_el_type = ARRAY; break;
+      case '{': cur_el_type = OBJECT; break;
+
+      case ',':
+      case ':':
+      case '\'': return parse_result::ERROR;
 
-    return can_be_empty ? true : false;
+      // value type
+      default: cur_el_type = VALUE; break;
+    }
+    pos++;
+
+    // the start of the current element is always at the value, not the name
+    cur_el_start = pos - 1;
+    return parse_result::SUCCESS;
   }
+
   const char* cur_el_start;
+  json_string cur_el_name;
+  json_element_type cur_el_type;
 };
 
-enum path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END };
+enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END };
 
 // constexpr max_name_len    (63)
 struct path_operator {
@@ -233,23 +463,24 @@ class path_state : private parser {
     : parser(_path, _path_len)
   {
   }
+  CUDA_HOST_DEVICE_CALLABLE path_state(path_state const& p) { *this = p; }
 
   CUDA_HOST_DEVICE_CALLABLE path_operator get_next_operator()
   {
-    if (eof()) { return {END}; }
+    if (eof()) { return {path_operator_type::END}; }
 
-    char c = parse_char();
+    char c = *pos++;
     switch (c) {
-      case '$': return {ROOT};
+      case '$': return {path_operator_type::ROOT};
 
       case '.': {
         path_operator op;
         json_string term{".[", 2};
-        if (parse_name(op.name, term)) {
+        if (parse_path_name(op.name, term)) {
           if (op.name.len == 1 && op.name.str[0] == '*') {
-            op.type = CHILD_WILDCARD;
+            op.type = path_operator_type::CHILD_WILDCARD;
           } else {
-            op.type = CHILD;
+            op.type = path_operator_type::CHILD;
           }
           return op;
         }
@@ -262,13 +493,18 @@ class path_state : private parser {
       case '[': {
         path_operator op;
         json_string term{"]", 1};
-        if (parse_name(op.name, term)) {
+        bool is_string = *pos == '\'' ? true : false;
+        if (parse_path_name(op.name, term)) {
           pos++;
           if (op.name.len == 1 && op.name.str[0] == '*') {
-            op.type = CHILD_WILDCARD;
+            op.type = path_operator_type::CHILD_WILDCARD;
           } else {
-            // unhandled cases
-            break;
+            if (is_string) {
+              op.type = path_operator_type::CHILD;
+            } else {
+              op.type  = path_operator_type::CHILD_INDEX;
+              op.index = parse_numeric<int>(op.name.str, op.name.str + op.name.len, json_opts);
+            }
           }
           return op;
         }
@@ -276,76 +512,135 @@ class path_state : private parser {
 
       default: break;
     }
-    return {ERROR};
+    return {path_operator_type::ERROR};
   }
 
  private:
-  CUDA_HOST_DEVICE_CALLABLE char parse_char() { return *pos++; }
-};
+  cudf::io::parse_options_view json_opts{',', '\n', '\"', '.'};
 
-struct json_output {
-  size_t output_max_len;
-  size_t output_len;
-  char* output;
-
-  CUDA_HOST_DEVICE_CALLABLE void add_output(const char* str, size_t len)
+  CUDA_HOST_DEVICE_CALLABLE bool parse_path_name(json_string& name, json_string& terminators)
   {
-    if (output != nullptr) {
-      // assert output_len + len < output_max_len
-      memcpy(output + output_len, str, len);
+    char c = *pos;
+    switch (c) {
+      case '*':
+        name.str = pos;
+        name.len = 1;
+        pos++;
+        break;
+
+      case '\'':
+        if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; }
+        break;
+
+      default: {
+        size_t const chars_left = input_len - (pos - input);
+        char const* end         = device_strpbrk(pos, chars_left, terminators.str, terminators.len);
+        if (end) {
+          name.str = pos;
+          name.len = end - pos;
+          pos      = end;
+        } else {
+          name.str = pos;
+          name.len = chars_left;
+          pos      = input + input_len;
+        }
+        return true;
+      }
     }
-    output_len += len;
-  }
 
-  CUDA_HOST_DEVICE_CALLABLE void add_output(json_string str) { add_output(str.str, str.len); }
+    // must end in one of the terminators
+    size_t const chars_left = input_len - (pos - input);
+    char const* end         = device_strpbrk(pos, chars_left, terminators.str, terminators.len);
+    if (!end) { return false; }
+    pos = end;
+    return true;
+  }
 };
 
-CUDA_HOST_DEVICE_CALLABLE void parse_json_path(json_state& j_state,
-                                               path_state p_state,
-                                               json_output& output)
+CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state,
+                                                       path_state p_state,
+                                                       json_output& output,
+                                                       bool list_element = false)
 {
   path_operator op = p_state.get_next_operator();
 
   switch (op.type) {
     // whatever the first object is
-    case ROOT: {
-      json_state child;
-      json_string wildcard{"*", 1};
-      if (j_state.next_match(wildcard, child)) { parse_json_path(child, p_state, output); }
-    } break;
+    case path_operator_type::ROOT:
+      if (j_state.next_element() != parse_result::ERROR) {
+        return parse_json_path(j_state, p_state, output);
+      }
+      break;
 
     // .name
     // ['name']
     // [1]
     // will return a single thing
-    case CHILD: {
-      json_state child;
-      if (j_state.next_match(op.name, child)) { parse_json_path(child, p_state, output); }
+    case path_operator_type::CHILD: {
+      parse_result res = j_state.child_element();
+      if (res != parse_result::SUCCESS) { return res; }
+      res = j_state.next_matching_element(op.name, true);
+      if (res != parse_result::SUCCESS) { return res; }
+      return parse_json_path(j_state, p_state, output, list_element);
     } break;
 
     // .*
     // [*]
     // will return an array of things
-    case CHILD_WILDCARD: {
+    case path_operator_type::CHILD_WILDCARD: {
       output.add_output("[\n", 2);
 
-      json_state child;
+      parse_result res = j_state.child_element();
+      if (res == parse_result::ERROR) { return parse_result::ERROR; }
+      if (res == parse_result::EMPTY) {
+        output.add_output("]\n", 2);
+        return parse_result::SUCCESS;
+      }
+
+      res       = j_state.next_matching_element(op.name, true);
       int count = 0;
-      while (j_state.next_match(op.name, child)) {
-        if (count > 0) { output.add_output(",\n", 2); }
-        parse_json_path(child, p_state, output);
-        j_state = child;
-        count++;
+      while (res == parse_result::SUCCESS) {
+        json_state j_sub(j_state);
+        path_state p_sub(p_state);
+        parse_result sub_res = parse_json_path(j_sub, p_sub, output, count > 0 ? true : false);
+        if (sub_res == parse_result::ERROR) { return parse_result::ERROR; }
+        if (sub_res != parse_result::EMPTY) { count++; }
+        res = j_state.next_matching_element(op.name, false);
       }
+
+      if (res == parse_result::ERROR) { return parse_result::ERROR; }
+
       output.add_output("]\n", 2);
+      return parse_result::SUCCESS;
+    } break;
+
+    // [0]
+    // [1]
+    // etc
+    // returns a single thing
+    case path_operator_type::CHILD_INDEX: {
+      parse_result res = j_state.child_element();
+      if (res != parse_result::SUCCESS) { return res; }
+      json_string any{"*", 1};
+      res = j_state.next_matching_element(any, true);
+      if (res != parse_result::SUCCESS) { return res; }
+      for (int idx = 1; idx <= op.index; idx++) {
+        res = j_state.next_matching_element(any, false);
+        if (res != parse_result::SUCCESS) { return res; }
+      }
+      return parse_json_path(j_state, p_state, output, list_element);
     } break;
 
     // some sort of error.
-    case ERROR: break;
+    case path_operator_type::ERROR: return parse_result::ERROR; break;
 
     // END case
-    default: output.add_output(j_state.extract_element()); break;
+    default: {
+      if (list_element) { output.add_output({",\n", 2}); }
+      if (j_state.extract_element(&output) == parse_result::ERROR) { return parse_result::ERROR; }
+    } break;
   }
+  return parse_result::SUCCESS;
 }
 
 CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input,
@@ -393,7 +688,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
 {
   size_t stack_size;
   cudaDeviceGetLimit(&stack_size, cudaLimitStackSize);
-  cudaDeviceSetLimit(cudaLimitStackSize, 2048);
+  cudaDeviceSetLimit(cudaLimitStackSize, 4096);
 
   auto offsets = cudf::make_fixed_width_column(
     data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr);
diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp
index f114738b345..015e50f910a 100644
--- a/cpp/tests/strings/json_tests.cpp
+++ b/cpp/tests/strings/json_tests.cpp
@@ -146,7 +146,7 @@ TEST_F(JsonTests, GetJsonObject)
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    cudf::string_scalar json_path("$.*");
+    cudf::string_scalar json_path("$.store.*");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
 
     cudf::test::print(*result);
@@ -154,7 +154,47 @@ TEST_F(JsonTests, GetJsonObject)
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    cudf::string_scalar json_path("$[*]");
+    cudf::string_scalar json_path("$.store.book[*]");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::print(*result);
+  }
+
+  {
+    cudf::test::strings_column_wrapper input{json_string};
+    cudf::string_scalar json_path("$.store.book[*].category");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::print(*result);
+  }
+
+  {
+    cudf::test::strings_column_wrapper input{json_string};
+    cudf::string_scalar json_path("$.store.book[*].title");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::print(*result);
+  }
+
+  {
+    cudf::test::strings_column_wrapper input{json_string};
+    cudf::string_scalar json_path("$.store['bicycle']");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::print(*result);
+  }
+
+  {
+    cudf::test::strings_column_wrapper input{json_string};
+    cudf::string_scalar json_path("$.store.book[*]['isbn']");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::print(*result);
+  }
+
+  {
+    cudf::test::strings_column_wrapper input{json_string};
+    cudf::string_scalar json_path("$.store.book[2]");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
 
     cudf::test::print(*result);

From adb572425fb5ed5b647bb4ee2c2e3e36f41f88fe Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Thu, 11 Feb 2021 13:40:04 -0600
Subject: [PATCH 03/33] Optimization: preprocess the json path into a simple
 command buffer instead of doing the parsing on the gpu.

---
 cpp/include/cudf/strings/substring.hpp |  2 +-
 cpp/src/strings/json/json_path.cu      | 88 +++++++++++++++++---------
 cpp/tests/strings/json_tests.cpp       | 20 +++---
 3 files changed, 68 insertions(+), 42 deletions(-)

diff --git a/cpp/include/cudf/strings/substring.hpp b/cpp/include/cudf/strings/substring.hpp
index 06a38360fdb..100dd80b3c2 100644
--- a/cpp/include/cudf/strings/substring.hpp
+++ b/cpp/include/cudf/strings/substring.hpp
@@ -213,7 +213,7 @@ std::unique_ptr<column> slice_strings(
  */
 std::unique_ptr<cudf::column> get_json_object(
   cudf::strings_column_view const& col,
-  cudf::string_scalar const& json_path,
+  std::string const& json_path,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index 9267d933368..fc98ee25ee6 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -9,6 +9,7 @@
 
 #include <io/utilities/parsing_utils.cuh>
 
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <cudf_test/column_wrapper.hpp>
@@ -448,12 +449,20 @@ class json_state : private parser {
 
 enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END };
 
-// constexpr max_name_len    (63)
 struct path_operator {
+  CUDA_HOST_DEVICE_CALLABLE path_operator() : type(path_operator_type::ERROR), index(-1) {}
+  CUDA_HOST_DEVICE_CALLABLE path_operator(path_operator_type _type) : type(_type), index(-1) {}
+
   path_operator_type type;
   json_string name;
   int index;
 };
+struct command_buffer {
+  rmm::device_uvector<path_operator> commands;
+  // used as backing memory for the name fields inside the
+  // path_operator objects
+  string_scalar json_path;
+};
 
 // current state of the JSONPath
 class path_state : private parser {
@@ -548,27 +557,50 @@ class path_state : private parser {
       }
     }
 
-    // must end in one of the terminators
-    size_t const chars_left = input_len - (pos - input);
-    char const* end         = device_strpbrk(pos, chars_left, terminators.str, terminators.len);
-    if (!end) { return false; }
-    pos = end;
     return true;
   }
 };
 
+command_buffer build_command_buffer(std::string const& json_path, rmm::cuda_stream_view stream)
+{
+  path_state p_state(json_path.data(), static_cast<size_type>(json_path.size()));
+
+  std::vector<path_operator> h_operators;
+  cudf::string_scalar d_json_path(json_path);
+
+  path_operator op;
+  do {
+    op = p_state.get_next_operator();
+    if (op.type == path_operator_type::ERROR) {
+      CUDF_FAIL("Encountered invalid JSONPath input string");
+    }
+    // convert pointer to device pointer
+    if (op.name.len > 0) { op.name.str = d_json_path.data() + (op.name.str - json_path.data()); }
+    h_operators.push_back(op);
+  } while (op.type != path_operator_type::END);
+
+  rmm::device_uvector<path_operator> d_operators(h_operators.size(), stream);
+  cudaMemcpyAsync(d_operators.data(),
+                  h_operators.data(),
+                  sizeof(path_operator) * h_operators.size(),
+                  cudaMemcpyHostToDevice,
+                  stream.value());
+
+  return {std::move(d_operators), std::move(d_json_path)};
+}
+
 CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state,
-                                                       path_state p_state,
+                                                       path_operator const* commands,
                                                        json_output& output,
                                                        bool list_element = false)
 {
-  path_operator op = p_state.get_next_operator();
+  path_operator op = *commands;
 
   switch (op.type) {
     // whatever the first object is
     case path_operator_type::ROOT:
       if (j_state.next_element() != parse_result::ERROR) {
-        return parse_json_path(j_state, p_state, output);
+        return parse_json_path(j_state, commands + 1, output);
       }
       break;
 
@@ -581,7 +613,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state,
       if (res != parse_result::SUCCESS) { return res; }
       res = j_state.next_matching_element(op.name, true);
       if (res != parse_result::SUCCESS) { return res; }
-      return parse_json_path(j_state, p_state, output, list_element);
+      return parse_json_path(j_state, commands + 1, output, list_element);
     } break;
 
     // .*
@@ -601,8 +633,8 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state,
       int count = 0;
       while (res == parse_result::SUCCESS) {
         json_state j_sub(j_state);
-        path_state p_sub(p_state);
-        parse_result sub_res = parse_json_path(j_sub, p_sub, output, count > 0 ? true : false);
+        parse_result sub_res =
+          parse_json_path(j_sub, commands + 1, output, count > 0 ? true : false);
         if (sub_res == parse_result::ERROR) { return parse_result::ERROR; }
         if (sub_res != parse_result::EMPTY) { count++; }
         res = j_state.next_matching_element(op.name, false);
@@ -628,7 +660,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state,
         res = j_state.next_matching_element(any, false);
         if (res != parse_result::SUCCESS) { return res; }
       }
-      return parse_json_path(j_state, p_state, output, list_element);
+      return parse_json_path(j_state, commands + 1, output, list_element);
     } break;
 
     // some sort of error.
@@ -645,47 +677,43 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state,
 
 CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input,
                                                              size_t input_len,
-                                                             char const* path,
-                                                             size_t path_len,
+                                                             path_operator const* commands,
                                                              char* out_buf,
                                                              size_t out_buf_size)
 {
   // TODO: add host-side code to verify path is a valid string.
   json_state j_state(input, input_len);
-  path_state p_state(path, path_len);
   json_output output{out_buf_size, 0, out_buf};
 
-  parse_json_path(j_state, p_state, output);
+  parse_json_path(j_state, commands, output);
 
   return output;
 }
 
 __global__ void get_json_object_kernel(char const* chars,
                                        size_type const* offsets,
-                                       char const* json_path,
-                                       size_t json_path_len,
+                                       path_operator const* commands,
                                        size_type* output_offsets,
                                        char* out_buf,
                                        size_t out_buf_size)
 {
   uint64_t const tid = threadIdx.x + (blockDim.x * blockIdx.x);
 
-  json_output out = get_json_object_single(chars + offsets[tid],
-                                           offsets[tid + 1] - offsets[tid],
-                                           json_path,
-                                           json_path_len,
-                                           out_buf,
-                                           out_buf_size);
+  json_output out = get_json_object_single(
+    chars + offsets[tid], offsets[tid + 1] - offsets[tid], commands, out_buf, out_buf_size);
 
   // filled in only during the precompute step
   if (output_offsets != nullptr) { output_offsets[tid] = static_cast<size_type>(out.output_len); }
 }
 
 std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& col,
-                                              cudf::string_scalar const& json_path,
+                                              std::string const& json_path,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
+  // preprocess the json_path into a command buffer
+  command_buffer cmd_buf = build_command_buffer(json_path, stream);
+
   size_t stack_size;
   cudaDeviceGetLimit(&stack_size, cudaLimitStackSize);
   cudaDeviceSetLimit(cudaLimitStackSize, 4096);
@@ -700,8 +728,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
   get_json_object_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
     col.chars().head<char>(),
     col.offsets().head<size_type>(),
-    json_path.data(),
-    json_path.size(),
+    cmd_buf.commands.data(),
     offsets_view.head<size_type>(),
     nullptr,
     0);
@@ -723,8 +750,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
   get_json_object_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
     col.chars().head<char>(),
     col.offsets().head<size_type>(),
-    json_path.data(),
-    json_path.size(),
+    cmd_buf.commands.data(),
     nullptr,
     chars_view.head<char>(),
     output_size);
@@ -745,7 +771,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
 }  // namespace detail
 
 std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& col,
-                                              cudf::string_scalar const& json_path,
+                                              std::string const& json_path,
                                               rmm::mr::device_memory_resource* mr)
 {
   return detail::get_json_object(col, json_path, 0, mr);
diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp
index 015e50f910a..b00fb39158c 100644
--- a/cpp/tests/strings/json_tests.cpp
+++ b/cpp/tests/strings/json_tests.cpp
@@ -122,7 +122,7 @@ TEST_F(JsonTests, GetJsonObject)
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    cudf::string_scalar json_path("$");
+    std::string json_path("$");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
 
     cudf::test::print(*result);
@@ -130,7 +130,7 @@ TEST_F(JsonTests, GetJsonObject)
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    cudf::string_scalar json_path("$.store");
+    std::string json_path("$.store");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
 
     cudf::test::print(*result);
@@ -138,7 +138,7 @@ TEST_F(JsonTests, GetJsonObject)
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    cudf::string_scalar json_path("$.store.book");
+    std::string json_path("$.store.book");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
 
     cudf::test::print(*result);
@@ -146,7 +146,7 @@ TEST_F(JsonTests, GetJsonObject)
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    cudf::string_scalar json_path("$.store.*");
+    std::string json_path("$.store.*");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
 
     cudf::test::print(*result);
@@ -154,7 +154,7 @@ TEST_F(JsonTests, GetJsonObject)
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    cudf::string_scalar json_path("$.store.book[*]");
+    std::string json_path("$.store.book[*]");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
 
     cudf::test::print(*result);
@@ -162,7 +162,7 @@ TEST_F(JsonTests, GetJsonObject)
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    cudf::string_scalar json_path("$.store.book[*].category");
+    std::string json_path("$.store.book[*].category");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
 
     cudf::test::print(*result);
@@ -170,7 +170,7 @@ TEST_F(JsonTests, GetJsonObject)
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    cudf::string_scalar json_path("$.store.book[*].title");
+    std::string json_path("$.store.book[*].title");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
 
     cudf::test::print(*result);
@@ -178,7 +178,7 @@ TEST_F(JsonTests, GetJsonObject)
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    cudf::string_scalar json_path("$.store['bicycle']");
+    std::string json_path("$.store['bicycle']");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
 
     cudf::test::print(*result);
@@ -186,7 +186,7 @@ TEST_F(JsonTests, GetJsonObject)
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    cudf::string_scalar json_path("$.store.book[*]['isbn']");
+    std::string json_path("$.store.book[*]['isbn']");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
 
     cudf::test::print(*result);
@@ -194,7 +194,7 @@ TEST_F(JsonTests, GetJsonObject)
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    cudf::string_scalar json_path("$.store.book[2]");
+    std::string json_path("$.store.book[2]");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
 
     cudf::test::print(*result);

From ec7ab4a07c6854678a382bdc34913b1e7133965f Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Wed, 17 Feb 2021 10:13:10 -0600
Subject: [PATCH 04/33] Fix incorrect interface in detail header.

---
 cpp/include/cudf/strings/detail/substring.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/strings/detail/substring.hpp b/cpp/include/cudf/strings/detail/substring.hpp
index a646d93e2b8..3e7f6fea0d0 100644
--- a/cpp/include/cudf/strings/detail/substring.hpp
+++ b/cpp/include/cudf/strings/detail/substring.hpp
@@ -33,7 +33,7 @@ namespace detail {
  */
 std::unique_ptr<cudf::column> get_json_object(
   cudf::strings_column_view const& col,
-  cudf::string_scalar const& json_path,
+  std::string const& json_path,
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 

From 6d94a73cbe6a70640e6bcb82ff85d5a7c681dd0d Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Thu, 18 Feb 2021 14:20:48 -0600
Subject: [PATCH 05/33] Add benchmarks for get_json_object().  Couple of bug
 fixes.

---
 cpp/benchmarks/CMakeLists.txt            |   8 ++
 cpp/benchmarks/string/json_benchmark.cpp | 140 +++++++++++++++++++++++
 cpp/src/strings/json/json_path.cu        |  27 +++--
 3 files changed, 165 insertions(+), 10 deletions(-)
 create mode 100644 cpp/benchmarks/string/json_benchmark.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 7b5c092f9c6..fe3ad177e9e 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -309,3 +309,11 @@ set(STRINGS_BENCH_SRC
   "${CMAKE_CURRENT_SOURCE_DIR}/string/convert_durations_benchmark.cpp")
 
 ConfigureBench(STRINGS_BENCH "${STRINGS_BENCH_SRC}")
+
+###################################################################################################
+# - json benchmark -------------------------------------------------------------------
+
+set(JSON_BENCH_SRC
+  "${CMAKE_CURRENT_SOURCE_DIR}/string/json_benchmark.cpp")
+
+ConfigureBench(JSON_BENCH "${JSON_BENCH_SRC}")
diff --git a/cpp/benchmarks/string/json_benchmark.cpp b/cpp/benchmarks/string/json_benchmark.cpp
new file mode 100644
index 00000000000..df1aadef404
--- /dev/null
+++ b/cpp/benchmarks/string/json_benchmark.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/strings/substring.hpp>
+
+class JsonPath : public cudf::benchmark {
+};
+
+float frand() { return static_cast<float>(rand()) / static_cast<float>(RAND_MAX); }  // in [0, 1]
+// rand_range: min plus frand() * (max - min), with the scaled part truncated toward zero
+int rand_range(int min, int max) { return min + static_cast<int>(frand() * (max - min)); }
+
+std::vector<std::string> Books{
+  "{\n\"category\": \"reference\",\n\"author\": \"Nigel Rees\",\n\"title\": \"Sayings of the "
+  "Century\",\n\"price\": 8.95\n}",
+  "{\n\"category\": \"fiction\",\n\"author\": \"Evelyn Waugh\",\n\"title\": \"Sword of "
+  "Honour\",\n\"price\": 12.99\n}",
+  "{\n\"category\": \"fiction\",\n\"author\": \"Herman Melville\",\n\"title\": \"Moby "
+  "Dick\",\n\"isbn\": \"0-553-21311-3\",\n\"price\": 8.99\n}",
+  "{\n\"category\": \"fiction\",\n\"author\": \"J. R. R. Tolkien\",\n\"title\": \"The Lord of the "
+  "Rings\",\n\"isbn\": \"0-395-19395-8\",\n\"price\": 22.99\n}"};
+constexpr int Approx_book_size = 110;
+std::vector<std::string> Bicycles{
+  "{\"color\": \"red\", \"price\": 9.95}",
+  "{\"color\": \"green\", \"price\": 29.95}",
+  "{\"color\": \"blue\", \"price\": 399.95}",
+  "{\"color\": \"yellow\", \"price\": 99.95}",
+  "{\"color\": \"mauve\", \"price\": 199.95}",
+};
+constexpr int Approx_bicycle_size = 33;
+std::string Misc{"\n\"expensive\": 10\n"};
+std::string generate_field(std::vector<std::string> const& values, int num_values)
+{  // joins num_values randomly picked entries of `values`, separated by ",\n"
+  std::string joined;
+  int const last_index = static_cast<int>(values.size() - 1);
+  for (int n = 0; n < num_values; ++n) {
+    if (n != 0) { joined.append(",\n"); }
+    // frand() may return exactly 1.0, so the pick is clamped to the last valid index
+    joined += values[std::min(static_cast<int>(floor(frand() * values.size())), last_index)];
+  }
+  return joined;
+}
+
+std::string build_row(int desired_bytes)
+{
+  // build a JSON row of ~desired_bytes chars; always have at least 2 books and 2 bikes
+  int num_books    = 2;
+  int num_bicycles = 2;
+  int remaining_bytes = std::max(
+    0, desired_bytes - ((num_books * Approx_book_size) + (num_bicycles * Approx_bicycle_size)));
+  // (clamped at 0 so a small desired_bytes can never subtract from the minimum counts)
+  // divide up the remainder between books and bikes
+  float book_pct    = frand();
+  float bicycle_pct = 1.0f - book_pct;
+  num_books += (remaining_bytes * book_pct) / Approx_book_size;
+  num_bicycles += (remaining_bytes * bicycle_pct) / Approx_bicycle_size;
+
+  std::string books    = "\"book\": [\n" + generate_field(Books, num_books) + "]\n";
+  std::string bicycles = "\"bicycle\": [\n" + generate_field(Bicycles, num_bicycles) + "]\n";
+
+  std::string store = "\"store\": {\n";
+  if (frand() <= 0.5f) {
+    store += books + std::string(",\n") + bicycles;
+  } else {
+    store += bicycles + std::string(",\n") + books;
+  }
+  store += std::string("}\n");
+
+  std::string row = std::string("{\n");
+  if (frand() <= 0.5f) {
+    row += store + std::string(",\n") + Misc;
+  } else {
+    row += Misc + std::string(",\n") + store;
+  }
+  row += std::string("}\n");
+  return row;
+}
+
+template <class... QueryArg>
+static void BM_case(benchmark::State& state, QueryArg&&... query_arg)
+{  // state.range(0) is the number of rows, state.range(1) the desired bytes per generated row
+  srand(5236);  // fixed seed before generating the input rows
+  auto iter = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(0),
+    [desired_bytes = state.range(1)](int index) { return build_row(desired_bytes); });
+  int num_rows = state.range(0);
+  cudf::test::strings_column_wrapper input(iter, iter + num_rows);
+  cudf::strings_column_view scv(input);
+  size_t num_chars = scv.chars().size();
+  // the captured benchmark argument(s) construct the JSONPath query string
+  std::string json_path(query_arg...);
+  // NOTE(review): cuda_event_timer presumably times each iteration's scope on stream 0 - confirm
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    auto result = cudf::strings::get_json_object(scv, json_path);
+    cudaStreamSynchronize(0);
+  }
+
+  // this isn't strictly 100% accurate. a given query isn't necessarily
+  // going to visit every single incoming character.  but in spirit it does.
+  state.SetBytesProcessed(state.iterations() * num_chars);
+}
+
+#define JSON_BENCHMARK_DEFINE(name, query)                         \
+  BENCHMARK_CAPTURE(BM_case, name, query)                          \
+    ->ArgsProduct({{100, 1000, 100000, 400000}, {300, 600, 4096}}) \
+    ->UseManualTime()                                              \
+    ->Unit(benchmark::kMillisecond);
+
+JSON_BENCHMARK_DEFINE(query0, "$");
+JSON_BENCHMARK_DEFINE(query1, "$.store");
+JSON_BENCHMARK_DEFINE(query2, "$.store.book");
+JSON_BENCHMARK_DEFINE(query3, "$.store.*");
+JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]");
+JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category");
+JSON_BENCHMARK_DEFINE(query6, "$.store.['bicycle']");
+JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']");
+JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]");
\ No newline at end of file
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index fc98ee25ee6..226143cb6a2 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -14,6 +14,8 @@
 
 #include <cudf_test/column_wrapper.hpp>
 
+// #include "db_test.cuh"
+
 namespace cudf {
 namespace strings {
 namespace detail {
@@ -590,7 +592,7 @@ command_buffer build_command_buffer(std::string const& json_path, rmm::cuda_stre
 }
 
 CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state,
-                                                       path_operator const* commands,
+                                                       path_operator const* const commands,
                                                        json_output& output,
                                                        bool list_element = false)
 {
@@ -677,7 +679,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state,
 
 CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input,
                                                              size_t input_len,
-                                                             path_operator const* commands,
+                                                             path_operator const* const commands,
                                                              char* out_buf,
                                                              size_t out_buf_size)
 {
@@ -692,18 +694,23 @@ CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input,
 
 __global__ void get_json_object_kernel(char const* chars,
                                        size_type const* offsets,
-                                       path_operator const* commands,
+                                       path_operator const* const commands,
                                        size_type* output_offsets,
                                        char* out_buf,
-                                       size_t out_buf_size)
+                                       size_type num_rows)
 {
   uint64_t const tid = threadIdx.x + (blockDim.x * blockIdx.x);
 
+  if (tid >= num_rows) { return; }
+
+  char* dst       = out_buf ? out_buf + output_offsets[tid] : nullptr;
+  size_t dst_size = out_buf ? output_offsets[tid + 1] - output_offsets[tid] : 0;
+
   json_output out = get_json_object_single(
-    chars + offsets[tid], offsets[tid + 1] - offsets[tid], commands, out_buf, out_buf_size);
+    chars + offsets[tid], offsets[tid + 1] - offsets[tid], commands, dst, dst_size);
 
   // filled in only during the precompute step
-  if (output_offsets != nullptr) { output_offsets[tid] = static_cast<size_type>(out.output_len); }
+  if (!out_buf) { output_offsets[tid] = static_cast<size_type>(out.output_len); }
 }
 
 std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& col,
@@ -722,7 +729,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
     data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr);
   cudf::mutable_column_view offsets_view(*offsets);
 
-  cudf::detail::grid_1d const grid{1, col.size()};
+  cudf::detail::grid_1d const grid{col.size(), 512};
 
   // preprocess sizes
   get_json_object_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
@@ -731,7 +738,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
     cmd_buf.commands.data(),
     offsets_view.head<size_type>(),
     nullptr,
-    0);
+    col.size());
 
   // convert sizes to offsets
   thrust::exclusive_scan(rmm::exec_policy(stream),
@@ -751,9 +758,9 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
     col.chars().head<char>(),
     col.offsets().head<size_type>(),
     cmd_buf.commands.data(),
-    nullptr,
+    offsets_view.head<size_type>(),
     chars_view.head<char>(),
-    output_size);
+    col.size());
 
   // reset back to original stack size
   cudaDeviceSetLimit(cudaLimitStackSize, stack_size);

From 05ad3fccd4951bef7de6bc99a211748bd7eb9c02 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Fri, 19 Feb 2021 15:08:22 -0600
Subject: [PATCH 06/33] Make get_json_object() non-recursive.

---
 cpp/src/strings/json/json_path.cu | 244 +++++++++++++++++++-----------
 1 file changed, 152 insertions(+), 92 deletions(-)

diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index 226143cb6a2..84e09d7673b 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -202,6 +202,8 @@ class parser {
     parse_whitespace();
   }
 
+  CUDA_HOST_DEVICE_CALLABLE parser(parser const& p) { *this = p; }
+
   CUDA_HOST_DEVICE_CALLABLE bool eof(const char* p) { return p - input >= input_len; }
   CUDA_HOST_DEVICE_CALLABLE bool eof() { return eof(pos); }
 
@@ -303,7 +305,7 @@ class json_state : private parser {
   {
   }
 
-  CUDA_HOST_DEVICE_CALLABLE json_state(json_state const& j) { *this = j; }
+  CUDA_HOST_DEVICE_CALLABLE json_state(json_state const& j) : parser(j) { *this = j; }
 
   CUDA_HOST_DEVICE_CALLABLE parse_result extract_element(json_output* output)
   {
@@ -469,12 +471,10 @@ struct command_buffer {
 // current state of the JSONPath
 class path_state : private parser {
  public:
-  CUDA_HOST_DEVICE_CALLABLE path_state() : parser() {}
   CUDA_HOST_DEVICE_CALLABLE path_state(const char* _path, size_t _path_len)
     : parser(_path, _path_len)
   {
   }
-  CUDA_HOST_DEVICE_CALLABLE path_state(path_state const& p) { *this = p; }
 
   CUDA_HOST_DEVICE_CALLABLE path_operator get_next_operator()
   {
@@ -563,7 +563,8 @@ class path_state : private parser {
   }
 };
 
-command_buffer build_command_buffer(std::string const& json_path, rmm::cuda_stream_view stream)
+std::pair<command_buffer, int> build_command_buffer(std::string const& json_path,
+                                                    rmm::cuda_stream_view stream)
 {
   path_state p_state(json_path.data(), static_cast<size_type>(json_path.size()));
 
@@ -571,11 +572,13 @@ command_buffer build_command_buffer(std::string const& json_path, rmm::cuda_stre
   cudf::string_scalar d_json_path(json_path);
 
   path_operator op;
+  int max_stack_depth = 1;
   do {
     op = p_state.get_next_operator();
     if (op.type == path_operator_type::ERROR) {
       CUDF_FAIL("Encountered invalid JSONPath input string");
     }
+    if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; }
     // convert pointer to device pointer
     if (op.name.len > 0) { op.name.str = d_json_path.data() + (op.name.str - json_path.data()); }
     h_operators.push_back(op);
@@ -588,106 +591,168 @@ command_buffer build_command_buffer(std::string const& json_path, rmm::cuda_stre
                   cudaMemcpyHostToDevice,
                   stream.value());
 
-  return {std::move(d_operators), std::move(d_json_path)};
+  return {command_buffer{std::move(d_operators), std::move(d_json_path)}, max_stack_depth};
 }
 
-CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state,
-                                                       path_operator const* const commands,
+#define PARSE_TRY(_x)                                                       \
+  do {                                                                      \
+    last_result = _x;                                                       \
+    if (last_result == parse_result::ERROR) { return parse_result::ERROR; } \
+  } while (0)
+
+template <int max_command_stack_depth>
+CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
+                                                       path_operator const* _commands,
                                                        json_output& output,
-                                                       bool list_element = false)
+                                                       bool _list_element = false)
 {
-  path_operator op = *commands;
+  // manually maintained context stack in lieu of calling parse_json_path recursively.
+  struct context {
+    json_state j_state;
+    path_operator const* commands;
+    bool list_element;
+    bool state_flag;
+    int count;
+  };
+  context stack[max_command_stack_depth];
+  int stack_pos     = 0;
+  auto push_context = [&stack, &stack_pos](json_state const& _j_state,
+                                           path_operator const* _commands,
+                                           bool _list_element = false,
+                                           bool _state_flag   = false,
+                                           int _count         = 0) {
+    if (stack_pos >= max_command_stack_depth) { return false; }  // every slot already in use
+    stack[stack_pos++] = context{_j_state, _commands, _list_element, _state_flag, _count};
+    return true;
+  };
+  auto pop_context = [&stack, &stack_pos](context& c) {
+    if (stack_pos > 0) {
+      c = stack[--stack_pos];
+      return true;
+    }
+    return false;
+  };
+  push_context(_j_state, _commands, _list_element);
+
+  parse_result last_result = parse_result::SUCCESS;
+  context ctx;
+  while (pop_context(ctx)) {
+    path_operator op = *ctx.commands;
+
+    switch (op.type) {
+      // whatever the first object is
+      case path_operator_type::ROOT:
+        PARSE_TRY(ctx.j_state.next_element());
+        push_context(ctx.j_state, ctx.commands + 1);
+        break;
 
-  switch (op.type) {
-    // whatever the first object is
-    case path_operator_type::ROOT:
-      if (j_state.next_element() != parse_result::ERROR) {
-        return parse_json_path(j_state, commands + 1, output);
-      }
-      break;
+      // .name
+      // ['name']
+      // [1]
+      // will return a single thing
+      case path_operator_type::CHILD: {
+        PARSE_TRY(ctx.j_state.child_element());
+        if (last_result == parse_result::SUCCESS) {
+          PARSE_TRY(ctx.j_state.next_matching_element(op.name, true));
+          if (last_result == parse_result::SUCCESS) {
+            push_context(ctx.j_state, ctx.commands + 1, ctx.list_element);
+          }
+        }
+      } break;
 
-    // .name
-    // ['name']
-    // [1]
-    // will return a single thing
-    case path_operator_type::CHILD: {
-      parse_result res = j_state.child_element();
-      if (res != parse_result::SUCCESS) { return res; }
-      res = j_state.next_matching_element(op.name, true);
-      if (res != parse_result::SUCCESS) { return res; }
-      return parse_json_path(j_state, commands + 1, output, list_element);
-    } break;
-
-    // .*
-    // [*]
-    // will return an array of things
-    case path_operator_type::CHILD_WILDCARD: {
-      output.add_output("[\n", 2);
-
-      parse_result res = j_state.child_element();
-      if (res == parse_result::ERROR) { return parse_result::ERROR; }
-      if (res == parse_result::EMPTY) {
-        output.add_output("]\n", 2);
-        return parse_result::SUCCESS;
-      }
+      // .*
+      // [*]
+      // will return an array of things
+      case path_operator_type::CHILD_WILDCARD: {
+        // if we're on the first element of this wildcard
+        if (!ctx.state_flag) {
+          output.add_output("[\n", 2);
+
+          // step into the child element
+          PARSE_TRY(ctx.j_state.child_element());
+          if (last_result == parse_result::EMPTY) {
+            output.add_output("]\n", 2);
+            last_result = parse_result::SUCCESS;
+            break;
+          }
 
-      res       = j_state.next_matching_element(op.name, true);
-      int count = 0;
-      while (res == parse_result::SUCCESS) {
-        json_state j_sub(j_state);
-        parse_result sub_res =
-          parse_json_path(j_sub, commands + 1, output, count > 0 ? true : false);
-        if (sub_res == parse_result::ERROR) { return parse_result::ERROR; }
-        if (sub_res != parse_result::EMPTY) { count++; }
-        res = j_state.next_matching_element(op.name, false);
-      }
+          // first element
+          PARSE_TRY(ctx.j_state.next_matching_element(op.name, true));
+          if (last_result == parse_result::EMPTY) {
+            output.add_output("]\n", 2);
+            last_result = parse_result::SUCCESS;
+            break;
+          }
 
-      if (res == parse_result::ERROR) { return parse_result::ERROR; }
-
-      output.add_output("]\n", 2);
-      return parse_result::SUCCESS;
-    } break;
-
-    // [0]
-    // [1]
-    // etc
-    // returns a single thing
-    case path_operator_type::CHILD_INDEX: {
-      parse_result res = j_state.child_element();
-      if (res != parse_result::SUCCESS) { return res; }
-      json_string any{"*", 1};
-      res = j_state.next_matching_element(any, true);
-      if (res != parse_result::SUCCESS) { return res; }
-      for (int idx = 1; idx <= op.index; idx++) {
-        res = j_state.next_matching_element(any, false);
-        if (res != parse_result::SUCCESS) { return res; }
-      }
-      return parse_json_path(j_state, commands + 1, output, list_element);
-    } break;
+          // re-push ourselves
+          push_context(ctx.j_state, ctx.commands, false, true);
+          // push the next command
+          push_context(ctx.j_state, ctx.commands + 1);
+        } else {
+          // if we actually processed something to the output, increment count
+          if (last_result != parse_result::EMPTY) { ctx.count++; }
+
+          // next element
+          PARSE_TRY(ctx.j_state.next_matching_element(op.name, false));
+          if (last_result == parse_result::EMPTY) {
+            output.add_output("]\n", 2);
+            last_result = parse_result::SUCCESS;
+            break;
+          }
+
+          // re-push ourselves
+          push_context(ctx.j_state, ctx.commands, false, true);
+          // push the next command
+          push_context(ctx.j_state, ctx.commands + 1, ctx.count > 0 ? true : false);
+        }
+      } break;
+
+      // [0]
+      // [1]
+      // etc
+      // returns a single thing
+      case path_operator_type::CHILD_INDEX: {
+        PARSE_TRY(ctx.j_state.child_element());
+        if (last_result == parse_result::SUCCESS) {
+          json_string any{"*", 1};
+          PARSE_TRY(ctx.j_state.next_matching_element(any, true));
+          for (int idx = 1; idx <= op.index && last_result == parse_result::SUCCESS; idx++) {
+            PARSE_TRY(ctx.j_state.next_matching_element(any, false));
+          }
+          // out-of-range index (EMPTY): emit nothing rather than continue from a stale position
+          if (last_result == parse_result::SUCCESS) {
+            push_context(ctx.j_state, ctx.commands + 1, ctx.list_element);
+          }
+        }
+      } break;
 
-    // some sort of error.
-    case path_operator_type::ERROR: return parse_result::ERROR; break;
+      // some sort of error.
+      case path_operator_type::ERROR: return parse_result::ERROR; break;
 
-    // END case
-    default: {
-      if (list_element) { output.add_output({",\n", 2}); }
-      if (j_state.extract_element(&output) == parse_result::ERROR) { return parse_result::ERROR; }
-    } break;
+      // END case
+      default: {
+        if (ctx.list_element) { output.add_output({",\n", 2}); }
+        PARSE_TRY(ctx.j_state.extract_element(&output));
+      } break;
+    }
   }
   return parse_result::SUCCESS;
 }
 
+// hardcoding this for now. to reach a stack depth of 8 would require
+// a jsonpath containing 7 nested wildcards so this is probably reasonable.
+constexpr int max_command_stack_depth = 8;
+
 CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input,
                                                              size_t input_len,
                                                              path_operator const* const commands,
                                                              char* out_buf,
                                                              size_t out_buf_size)
 {
-  // TODO: add host-side code to verify path is a valid string.
   json_state j_state(input, input_len);
   json_output output{out_buf_size, 0, out_buf};
 
-  parse_json_path(j_state, commands, output);
+  parse_json_path<max_command_stack_depth>(j_state, commands, output);
 
   return output;
 }
@@ -719,11 +784,9 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
                                               rmm::mr::device_memory_resource* mr)
 {
   // preprocess the json_path into a command buffer
-  command_buffer cmd_buf = build_command_buffer(json_path, stream);
-
-  size_t stack_size;
-  cudaDeviceGetLimit(&stack_size, cudaLimitStackSize);
-  cudaDeviceSetLimit(cudaLimitStackSize, 4096);
+  std::pair<command_buffer, int> preprocess = build_command_buffer(json_path, stream);
+  CUDF_EXPECTS(preprocess.second <= max_command_stack_depth,
+               "Encountered json_path string that is too complex");
 
   auto offsets = cudf::make_fixed_width_column(
     data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr);
@@ -735,7 +798,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
   get_json_object_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
     col.chars().head<char>(),
     col.offsets().head<size_type>(),
-    cmd_buf.commands.data(),
+    preprocess.first.commands.data(),
     offsets_view.head<size_type>(),
     nullptr,
     col.size());
@@ -757,14 +820,11 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
   get_json_object_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
     col.chars().head<char>(),
     col.offsets().head<size_type>(),
-    cmd_buf.commands.data(),
+    preprocess.first.commands.data(),
     offsets_view.head<size_type>(),
     chars_view.head<char>(),
     col.size());
 
-  // reset back to original stack size
-  cudaDeviceSetLimit(cudaLimitStackSize, stack_size);
-
   return make_strings_column(col.size(),
                              std::move(offsets),
                              std::move(chars),
@@ -785,4 +845,4 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
 }
 
 }  // namespace strings
-}  // namespace cudf
\ No newline at end of file
+}  // namespace cudf

From 9411f29e94d2d55ee72f85d4b6d5f3abbb32ef91 Mon Sep 17 00:00:00 2001
From: Raza Jafri <rjafri@nvidia.com>
Date: Mon, 22 Feb 2021 09:48:30 -0800
Subject: [PATCH 07/33] Java bindings for get_json_object

---
 .../main/java/ai/rapids/cudf/ColumnView.java  | 19 +++++++++
 java/src/main/native/src/ColumnViewJni.cpp    | 23 ++++++++++
 .../java/ai/rapids/cudf/ColumnVectorTest.java | 42 +++++++++++++++++++
 3 files changed, 84 insertions(+)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 1dce52f7105..8be1f035187 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -1883,6 +1883,23 @@ public final ColumnVector substring(ColumnView start, ColumnView end) {
     return new ColumnVector(substringColumn(getNativeView(), start.getNativeView(), end.getNativeView()));
   }
 
+  /**
+   * Apply a JSONPath string to all rows in an input strings column.
+   *
+   * Applies a JSONPath string to an incoming strings column where each row in the column
+   * is a valid JSON string.  The output is returned by row as a strings column.
+   *
+   * For reference, https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html
+   * Note: Only implements the operators: $ . [] *
+   *
+   * @param path The JSONPath string to be applied to each row; must not be null
+   * @return new strings ColumnVector containing the retrieved JSON object strings
+   */
+  public final ColumnVector getJSONObject(String path) {
+    assert(type.equals(DType.STRING)) : "column type must be a String";
+    return new ColumnVector(getJSONObject(getNativeView(), path));
+  }
+
   /**
    * Returns a new strings column where target string within each string is replaced with the specified
    * replacement string.
@@ -2406,6 +2423,8 @@ static DeviceMemoryBufferView getOffsetsBuffer(long viewHandle) {
    */
   private static native long stringTimestampToTimestamp(long viewHandle, int unit, String format);
 
+  private static native long getJSONObject(long viewHandle, String path) throws CudfException;
+
   /**
    * Native method to parse and convert a timestamp column vector to string column vector. A unix
    * timestamp is a long value representing how many units since 1970-01-01 00:00:00:000 in either
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 82e71b04a2f..aebe56ee857 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -60,9 +60,12 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <map_lookup.hpp>
+#include "cudf/strings/strings_column_view.hpp"
 
 #include "cudf_jni_apis.hpp"
 #include "dtype_utils.hpp"
+#include "jni.h"
+#include "jni_utils.hpp"
 
 namespace {
 
@@ -1760,4 +1763,24 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv
   }
   CATCH_STD(env, 0)
 }
+
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env, jobject j_object, 
+                                                                     jlong j_view_handle, jstring j_path) {
+  // Native side of ColumnView.getJSONObject: runs the JSONPath in j_path over the strings view.
+   JNI_NULL_CHECK(env, j_view_handle, "view cannot be null", 0);
+   JNI_NULL_CHECK(env, j_path, "path cannot be null", 0);
+
+  try {
+    // j_view_handle is reinterpreted as a cudf::column_view pointer and wrapped as a strings view
+    cudf::column_view* n_column_view = reinterpret_cast<cudf::column_view*>(j_view_handle);
+    cudf::strings_column_view n_strings_col_view(*n_column_view);
+
+    cudf::jni::native_jstring n_path(env, j_path);
+    auto result = cudf::strings::get_json_object(n_strings_col_view, std::string(n_path.get()));
+    // release() gives up ownership; the raw column pointer is returned to Java as a jlong handle
+    return reinterpret_cast<jlong>(result.release());
+  }
+  CATCH_STD(env, 0)
+
+}
 } // extern "C"
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index cb1f792b99e..5139269553e 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -3875,6 +3875,48 @@ void testCopyToColumnVector() {
     }
   }
 
+  @Test
+  void testGetJSONObject() {
+    String jsonString = "{ \"store\": {\n" +
+        "    \"book\": [\n" +
+        "      { \"category\": \"reference\",\n" +
+        "        \"author\": \"Nigel Rees\",\n" +
+        "        \"title\": \"Sayings of the Century\",\n" +
+        "        \"price\": 8.95\n" +
+        "      },\n" +
+        "      { \"category\": \"fiction\",\n" +
+        "        \"author\": \"Evelyn Waugh\",\n" +
+        "        \"title\": \"Sword of Honour\",\n" +
+        "        \"price\": 12.99\n" +
+        "      },\n" +
+        "      { \"category\": \"fiction\",\n" +
+        "        \"author\": \"Herman Melville\",\n" +
+        "        \"title\": \"Moby Dick\",\n" +
+        "        \"isbn\": \"0-553-21311-3\",\n" +
+        "        \"price\": 8.99\n" +
+        "      },\n" +
+        "      { \"category\": \"fiction\",\n" +
+        "        \"author\": \"J. R. R. Tolkien\",\n" +
+        "        \"title\": \"The Lord of the Rings\",\n" +
+        "        \"isbn\": \"0-395-19395-8\",\n" +
+        "        \"price\": 22.99\n" +
+        "      }\n" +
+        "    ],\n" +
+        "    \"bicycle\": {\n" +
+        "      \"color\": \"red\",\n" +
+        "      \"price\": 19.95\n" +
+        "    }\n" +
+        "  }\n" +
+        "}";
+    // NOTE(review): expected text assumes the native output keeps its "\n" list framing - confirm
+    try (ColumnVector json = ColumnVector.fromStrings(jsonString);
+         ColumnVector expectedAuthors = ColumnVector.fromStrings("[\n\"Nigel Rees\",\n\"Evelyn " +
+             "Waugh\",\n\"Herman Melville\",\n\"J. R. R. Tolkien\"]\n");
+         ColumnVector gotAuthors = json.getJSONObject("$.store.book[*].author")) {
+      assertColumnsAreEqual(expectedAuthors, gotAuthors);
+    }
+  }
+
   @Test
   void testMakeStructEmpty() {
     final int numRows = 10;

From ff3544ce920ff85ff3409f0b80201ccc88cb5183 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Mon, 22 Feb 2021 13:13:05 -0600
Subject: [PATCH 08/33] Make debug readability formatting of output off by
 default.

---
 cpp/src/strings/json/json_path.cu | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index 84e09d7673b..c9d43c22dfd 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -22,6 +22,10 @@ namespace detail {
 
 namespace {
 
+// temporary. for debugging purposes
+#define DEBUG_NEWLINE
+// #define DEBUG_NEWLINE  "\n"
+
 using namespace cudf;
 
 CUDA_HOST_DEVICE_CALLABLE char to_lower(char const c)
@@ -666,12 +670,12 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
       case path_operator_type::CHILD_WILDCARD: {
         // if we're on the first element of this wildcard
         if (!ctx.state_flag) {
-          output.add_output("[\n", 2);
+          output.add_output("[" DEBUG_NEWLINE, 2);
 
           // step into the child element
           PARSE_TRY(ctx.j_state.child_element());
           if (last_result == parse_result::EMPTY) {
-            output.add_output("]\n", 2);
+            output.add_output("]" DEBUG_NEWLINE, 2);
             last_result = parse_result::SUCCESS;
             break;
           }
@@ -679,7 +683,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
           // first element
           PARSE_TRY(ctx.j_state.next_matching_element(op.name, true));
           if (last_result == parse_result::EMPTY) {
-            output.add_output("]\n", 2);
+            output.add_output("]" DEBUG_NEWLINE, 2);
             last_result = parse_result::SUCCESS;
             break;
           }
@@ -695,7 +699,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
           // next element
           PARSE_TRY(ctx.j_state.next_matching_element(op.name, false));
           if (last_result == parse_result::EMPTY) {
-            output.add_output("]\n", 2);
+            output.add_output("]" DEBUG_NEWLINE, 2);
             last_result = parse_result::SUCCESS;
             break;
           }
@@ -731,7 +735,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
 
       // END case
       default: {
-        if (ctx.list_element) { output.add_output({",\n", 2}); }
+        if (ctx.list_element) { output.add_output({"," DEBUG_NEWLINE, 2}); }
         PARSE_TRY(ctx.j_state.extract_element(&output));
       } break;
     }

From 78d3dd8a09fdb6d592766d5893f8863980dd689d Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Mon, 22 Feb 2021 15:29:17 -0600
Subject: [PATCH 09/33] Change interface to get_json_object() to take a
 cudf::string_scalar instead of a std::string

---
 cpp/include/cudf/strings/detail/substring.hpp |  2 +-
 cpp/include/cudf/strings/substring.hpp        |  2 +-
 cpp/src/strings/json/json_path.cu             | 39 +++++++++++--------
 3 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/substring.hpp b/cpp/include/cudf/strings/detail/substring.hpp
index 3e7f6fea0d0..a646d93e2b8 100644
--- a/cpp/include/cudf/strings/detail/substring.hpp
+++ b/cpp/include/cudf/strings/detail/substring.hpp
@@ -33,7 +33,7 @@ namespace detail {
  */
 std::unique_ptr<cudf::column> get_json_object(
   cudf::strings_column_view const& col,
-  std::string const& json_path,
+  cudf::string_scalar const& json_path,
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
diff --git a/cpp/include/cudf/strings/substring.hpp b/cpp/include/cudf/strings/substring.hpp
index 100dd80b3c2..06a38360fdb 100644
--- a/cpp/include/cudf/strings/substring.hpp
+++ b/cpp/include/cudf/strings/substring.hpp
@@ -213,7 +213,7 @@ std::unique_ptr<column> slice_strings(
  */
 std::unique_ptr<cudf::column> get_json_object(
   cudf::strings_column_view const& col,
-  std::string const& json_path,
+  cudf::string_scalar const& json_path,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index c9d43c22dfd..b06f1c5b32d 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -24,7 +24,10 @@ namespace {
 
 // temporary. for debugging purposes
 #define DEBUG_NEWLINE
+#define DEBUG_NEWLINE_LEN (0)
+
 // #define DEBUG_NEWLINE  "\n"
+// #define DEBUG_NEWLINE_LEN (1)
 
 using namespace cudf;
 
@@ -160,6 +163,9 @@ struct json_string {
   const char* str;
   int64_t len;
 
+  CUDA_HOST_DEVICE_CALLABLE json_string() : str(nullptr), len(-1) {}
+  CUDA_HOST_DEVICE_CALLABLE json_string(const char* _str, int64_t _len) : str(_str), len(_len) {}
+
   CUDA_HOST_DEVICE_CALLABLE bool operator==(json_string const& cmp)
   {
     return len == cmp.len && str != nullptr && cmp.str != nullptr &&
@@ -567,13 +573,13 @@ class path_state : private parser {
   }
 };
 
-std::pair<command_buffer, int> build_command_buffer(std::string const& json_path,
-                                                    rmm::cuda_stream_view stream)
+std::pair<rmm::device_uvector<path_operator>, int> build_command_buffer(
+  cudf::string_scalar const& json_path, rmm::cuda_stream_view stream)
 {
-  path_state p_state(json_path.data(), static_cast<size_type>(json_path.size()));
+  std::string h_json_path = json_path.to_string(stream);
+  path_state p_state(h_json_path.data(), static_cast<size_type>(h_json_path.size()));
 
   std::vector<path_operator> h_operators;
-  cudf::string_scalar d_json_path(json_path);
 
   path_operator op;
   int max_stack_depth = 1;
@@ -584,7 +590,7 @@ std::pair<command_buffer, int> build_command_buffer(std::string const& json_path
     }
     if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; }
     // convert pointer to device pointer
-    if (op.name.len > 0) { op.name.str = d_json_path.data() + (op.name.str - json_path.data()); }
+    if (op.name.len > 0) { op.name.str = json_path.data() + (op.name.str - h_json_path.data()); }
     h_operators.push_back(op);
   } while (op.type != path_operator_type::END);
 
@@ -595,7 +601,7 @@ std::pair<command_buffer, int> build_command_buffer(std::string const& json_path
                   cudaMemcpyHostToDevice,
                   stream.value());
 
-  return {command_buffer{std::move(d_operators), std::move(d_json_path)}, max_stack_depth};
+  return {std::move(d_operators), max_stack_depth};
 }
 
 #define PARSE_TRY(_x)                                                       \
@@ -670,12 +676,12 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
       case path_operator_type::CHILD_WILDCARD: {
         // if we're on the first element of this wildcard
         if (!ctx.state_flag) {
-          output.add_output("[" DEBUG_NEWLINE, 2);
+          output.add_output("[" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN);
 
           // step into the child element
           PARSE_TRY(ctx.j_state.child_element());
           if (last_result == parse_result::EMPTY) {
-            output.add_output("]" DEBUG_NEWLINE, 2);
+            output.add_output("]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN);
             last_result = parse_result::SUCCESS;
             break;
           }
@@ -683,7 +689,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
           // first element
           PARSE_TRY(ctx.j_state.next_matching_element(op.name, true));
           if (last_result == parse_result::EMPTY) {
-            output.add_output("]" DEBUG_NEWLINE, 2);
+            output.add_output("]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN);
             last_result = parse_result::SUCCESS;
             break;
           }
@@ -699,7 +705,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
           // next element
           PARSE_TRY(ctx.j_state.next_matching_element(op.name, false));
           if (last_result == parse_result::EMPTY) {
-            output.add_output("]" DEBUG_NEWLINE, 2);
+            output.add_output("]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN);
             last_result = parse_result::SUCCESS;
             break;
           }
@@ -735,7 +741,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
 
       // END case
       default: {
-        if (ctx.list_element) { output.add_output({"," DEBUG_NEWLINE, 2}); }
+        if (ctx.list_element) { output.add_output({"," DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); }
         PARSE_TRY(ctx.j_state.extract_element(&output));
       } break;
     }
@@ -783,12 +789,13 @@ __global__ void get_json_object_kernel(char const* chars,
 }
 
 std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& col,
-                                              std::string const& json_path,
+                                              cudf::string_scalar const& json_path,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
   // preprocess the json_path into a command buffer
-  std::pair<command_buffer, int> preprocess = build_command_buffer(json_path, stream);
+  std::pair<rmm::device_uvector<path_operator>, int> preprocess =
+    build_command_buffer(json_path, stream);
   CUDF_EXPECTS(preprocess.second <= max_command_stack_depth,
                "Encountered json_path string that is too complex");
 
@@ -802,7 +809,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
   get_json_object_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
     col.chars().head<char>(),
     col.offsets().head<size_type>(),
-    preprocess.first.commands.data(),
+    preprocess.first.data(),
     offsets_view.head<size_type>(),
     nullptr,
     col.size());
@@ -824,7 +831,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
   get_json_object_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
     col.chars().head<char>(),
     col.offsets().head<size_type>(),
-    preprocess.first.commands.data(),
+    preprocess.first.data(),
     offsets_view.head<size_type>(),
     chars_view.head<char>(),
     col.size());
@@ -842,7 +849,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
 }  // namespace detail
 
 std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& col,
-                                              std::string const& json_path,
+                                              cudf::string_scalar const& json_path,
                                               rmm::mr::device_memory_resource* mr)
 {
   return detail::get_json_object(col, json_path, 0, mr);

From e124cc503712d6de4ef83e1ab30763291755dbfa Mon Sep 17 00:00:00 2001
From: Raza Jafri <rjafri@nvidia.com>
Date: Mon, 22 Feb 2021 16:00:47 -0800
Subject: [PATCH 10/33] Update testGetJSONObject to use a two-row input column

---
 java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 5139269553e..1eb2b9a2af2 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -3909,8 +3909,9 @@ void testGetJSONObject() {
         "  }\n" +
         "}";
 
-    try (ColumnVector json = ColumnVector.fromStrings(jsonString);
+    try (ColumnVector json = ColumnVector.fromStrings(jsonString, jsonString);
          ColumnVector expectedAuthors = ColumnVector.fromStrings("[\n\"Nigel Rees\",\n\"Evelyn " +
+             "Waugh\",\n\"Herman Melville\",\n\"J. R. R. Tolkien\"]\n", "[\n\"Nigel Rees\",\n\"Evelyn " +
              "Waugh\",\n\"Herman Melville\",\n\"J. R. R. Tolkien\"]\n");
          ColumnVector gotAuthors = json.getJSONObject("$.store.book[*].author")) {
       assertColumnsAreEqual(expectedAuthors, gotAuthors);

From efb767e39aec7f4e8829247598afc6907c8f3e3f Mon Sep 17 00:00:00 2001
From: Raza Jafri <rjafri@nvidia.com>
Date: Mon, 22 Feb 2021 16:12:23 -0800
Subject: [PATCH 11/33] Change Java/JNI getJSONObject to take the path as a Scalar

---
 java/src/main/java/ai/rapids/cudf/ColumnView.java | 6 +++---
 java/src/main/native/src/ColumnViewJni.cpp        | 7 ++++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 8be1f035187..4453d7bb1f1 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -1895,9 +1895,9 @@ public final ColumnVector substring(ColumnView start, ColumnView end) {
    * @param path The JSONPath string to be applied to each row
    * @return new strings ColumnVector containing the retrieved json object strings
    */
-  public final ColumnVector getJSONObject(String path) {
+  public final ColumnVector getJSONObject(Scalar path) {
     assert(type.equals(DType.STRING)) : "column type must be a String";
-    return new ColumnVector(getJSONObject(getNativeView(), path));
+    return new ColumnVector(getJSONObject(getNativeView(), path.getScalarHandle()));
   }
 
   /**
@@ -2423,7 +2423,7 @@ static DeviceMemoryBufferView getOffsetsBuffer(long viewHandle) {
    */
   private static native long stringTimestampToTimestamp(long viewHandle, int unit, String format);
 
-  private static native long getJSONObject(long viewHandle, String path) throws CudfException;
+  private static native long getJSONObject(long viewHandle, long scalarHandle) throws CudfException;
 
   /**
    * Native method to parse and convert a timestamp column vector to string column vector. A unix
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index aebe56ee857..3bac1673764 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -61,6 +61,7 @@
 #include <cudf/structs/structs_column_view.hpp>
 #include <map_lookup.hpp>
 #include "cudf/strings/strings_column_view.hpp"
+#include "cudf/types.hpp"
 
 #include "cudf_jni_apis.hpp"
 #include "dtype_utils.hpp"
@@ -1765,7 +1766,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env, jobject j_object, 
-                                                                     jlong j_view_handle, jstring j_path) {
+                                                                     jlong j_view_handle, jlong j_scalar_handle) {
 
    JNI_NULL_CHECK(env, j_view_handle, "view cannot be null", 0);
    JNI_NULL_CHECK(env, j_path, "path cannot be null", 0);
@@ -1774,9 +1775,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env
 
     cudf::column_view* n_column_view = reinterpret_cast<cudf::column_view*>(j_view_handle);
     cudf::strings_column_view n_strings_col_view(*n_column_view);
+    cudf::scalar* n_scalar_path = reinterpret_cast<cudf::scalar*>(j_scalar_handle);
 
-    cudf::jni::native_jstring n_path(env, j_path);
-    auto result = cudf::strings::get_json_object(n_strings_col_view, std::string(n_path.get()));
+    auto result = cudf::strings::get_json_object(n_strings_col_view, n_scalar_path);
 
     return reinterpret_cast<jlong>(result.release());
   }

From 6127b7c45ee173ce041aae16e3ab8e0c6a0b02bf Mon Sep 17 00:00:00 2001
From: Raza Jafri <rjafri@nvidia.com>
Date: Tue, 23 Feb 2021 09:25:30 -0800
Subject: [PATCH 12/33] Fix JNI string_scalar handling and update Java test to match cudf output

---
 java/src/main/native/src/ColumnViewJni.cpp              | 6 +++---
 java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 9 +++++----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 3bac1673764..f3043777242 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -1769,15 +1769,15 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env
                                                                      jlong j_view_handle, jlong j_scalar_handle) {
 
    JNI_NULL_CHECK(env, j_view_handle, "view cannot be null", 0);
-   JNI_NULL_CHECK(env, j_path, "path cannot be null", 0);
+   JNI_NULL_CHECK(env, j_scalar_handle, "path cannot be null", 0);
 
   try {
 
     cudf::column_view* n_column_view = reinterpret_cast<cudf::column_view*>(j_view_handle);
     cudf::strings_column_view n_strings_col_view(*n_column_view);
-    cudf::scalar* n_scalar_path = reinterpret_cast<cudf::scalar*>(j_scalar_handle);
+    cudf::string_scalar *n_scalar_path = reinterpret_cast<cudf::string_scalar *>(j_scalar_handle);
 
-    auto result = cudf::strings::get_json_object(n_strings_col_view, n_scalar_path);
+    auto result = cudf::strings::get_json_object(n_strings_col_view, *n_scalar_path);
 
     return reinterpret_cast<jlong>(result.release());
   }
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 1eb2b9a2af2..f32cd9e8e9b 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -3910,10 +3910,11 @@ void testGetJSONObject() {
         "}";
 
     try (ColumnVector json = ColumnVector.fromStrings(jsonString, jsonString);
-         ColumnVector expectedAuthors = ColumnVector.fromStrings("[\n\"Nigel Rees\",\n\"Evelyn " +
-             "Waugh\",\n\"Herman Melville\",\n\"J. R. R. Tolkien\"]\n", "[\n\"Nigel Rees\",\n\"Evelyn " +
-             "Waugh\",\n\"Herman Melville\",\n\"J. R. R. Tolkien\"]\n");
-         ColumnVector gotAuthors = json.getJSONObject("$.store.book[*].author")) {
+         ColumnVector expectedAuthors = ColumnVector.fromStrings("[\"Nigel Rees\",\"Evelyn " +
+             "Waugh\",\"Herman Melville\",\"J. R. R. Tolkien\"]", "[\"Nigel Rees\",\"Evelyn " +
+             "Waugh\",\"Herman Melville\",\"J. R. R. Tolkien\"]");
+         Scalar path = Scalar.fromString("$.store.book[*].author");
+         ColumnVector gotAuthors = json.getJSONObject(path)) {
       assertColumnsAreEqual(expectedAuthors, gotAuthors);
     }
   }

From d6602bdd62cd9abee30d6e57c3cf694f33478698 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Tue, 23 Feb 2021 18:05:27 -0600
Subject: [PATCH 13/33] Strip quotes from singular returned string values. 
 Propagate validity vector to output.

---
 cpp/src/strings/json/json_path.cu | 78 +++++++++++++++++++++++--------
 cpp/tests/strings/json_tests.cpp  | 21 +++++++++
 2 files changed, 80 insertions(+), 19 deletions(-)

diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index b06f1c5b32d..51a16122b45 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -1,5 +1,6 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/get_value.cuh>
+#include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
@@ -7,6 +8,7 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
+#include <io/utilities/column_type_histogram.hpp>
 #include <io/utilities/parsing_utils.cuh>
 
 #include <rmm/device_uvector.hpp>
@@ -14,8 +16,6 @@
 
 #include <cudf_test/column_wrapper.hpp>
 
-// #include "db_test.cuh"
-
 namespace cudf {
 namespace strings {
 namespace detail {
@@ -29,6 +29,30 @@ namespace {
 // #define DEBUG_NEWLINE  "\n"
 // #define DEBUG_NEWLINE_LEN (1)
 
+// temporary. spark doesn't strictly follow the JSONPath spec.
+// I think this probably should be a configurable enum to control
+// the kind of output you get and what features are supported.
+//
+// Current known differences:
+// - When returning a string value as a single element, Spark strips the quotes.
+//   standard:   "whee"
+//   spark:      whee
+//
+// - Spark only supports the wildcard operator when in a subscript, eg  [*]
+//   It does not handle .*
+//
+#define __SPARK_BEHAVIORS
+
+// Other, non-spark known differences:
+//
+// - In jsonpath_ng, name subscripts can use double quotes instead of the standard
+//   single quotes in the query string.
+//   standard:      $.thing['subscript']
+//   jsonpath_ng:   $.thing["subscript"]
+//
+//  Currently, this code only allows single-quotes but that can be easily expanded.
+//
+
 using namespace cudf;
 
 CUDA_HOST_DEVICE_CALLABLE char to_lower(char const c)
@@ -317,7 +341,7 @@ class json_state : private parser {
 
   CUDA_HOST_DEVICE_CALLABLE json_state(json_state const& j) : parser(j) { *this = j; }
 
-  CUDA_HOST_DEVICE_CALLABLE parse_result extract_element(json_output* output)
+  CUDA_HOST_DEVICE_CALLABLE parse_result extract_element(json_output* output, bool list_element)
   {
     // collapse the current element into a json_string
 
@@ -329,6 +353,15 @@ class json_state : private parser {
       pos = cur_el_start;
       if (parse_value() != parse_result::SUCCESS) { return parse_result::ERROR; }
       end = pos;
+
+#if defined(__SPARK_BEHAVIORS)
+      // spark/hive-specific behavior.  if this is a non-list-element wrapped in quotes,
+      // strip them
+      if (!list_element && *start == '\"' && *(end - 1) == '\"') {
+        start++;
+        end--;
+      }
+#endif
     }
     // otherwise, march through everything inside
     else {
@@ -373,7 +406,7 @@ class json_state : private parser {
     return parse_result::SUCCESS;
   }
 
-  CUDA_HOST_DEVICE_CALLABLE parse_result skip_element() { return extract_element(nullptr); }
+  CUDA_HOST_DEVICE_CALLABLE parse_result skip_element() { return extract_element(nullptr, false); }
 
   json_element_type element;
 
@@ -498,6 +531,9 @@ class path_state : private parser {
         path_operator op;
         json_string term{".[", 2};
         if (parse_path_name(op.name, term)) {
+          // this is another potential use case for __SPARK_BEHAVIORS / configurability
+          // Spark currently only handles the wildcard operator inside [*], it does
+          // not handle .*
           if (op.name.len == 1 && op.name.str[0] == '*') {
             op.type = path_operator_type::CHILD_WILDCARD;
           } else {
@@ -621,18 +657,18 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
     json_state j_state;
     path_operator const* commands;
     bool list_element;
+    int element_count;
     bool state_flag;
-    int count;
   };
   context stack[max_command_stack_depth];
   int stack_pos     = 0;
   auto push_context = [&stack, &stack_pos](json_state const& _j_state,
                                            path_operator const* _commands,
                                            bool _list_element = false,
-                                           bool _state_flag   = false,
-                                           int _count         = 0) {
+                                           int _element_count = 0,
+                                           bool _state_flag   = false) {
     if (stack_pos == max_command_stack_depth - 1) { return false; }
-    stack[stack_pos++] = context{_j_state, _commands, _list_element, _state_flag, _count};
+    stack[stack_pos++] = context{_j_state, _commands, _list_element, _element_count, _state_flag};
     return true;
   };
   auto pop_context = [&stack, &stack_pos](context& c) {
@@ -665,7 +701,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
         if (last_result == parse_result::SUCCESS) {
           PARSE_TRY(ctx.j_state.next_matching_element(op.name, true));
           if (last_result == parse_result::SUCCESS) {
-            push_context(ctx.j_state, ctx.commands + 1, ctx.list_element);
+            push_context(ctx.j_state, ctx.commands + 1, ctx.list_element, ctx.element_count);
           }
         }
       } break;
@@ -695,12 +731,12 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
           }
 
           // re-push ourselves
-          push_context(ctx.j_state, ctx.commands, false, true);
+          push_context(ctx.j_state, ctx.commands, false, 0, true);
           // push the next command
-          push_context(ctx.j_state, ctx.commands + 1);
+          push_context(ctx.j_state, ctx.commands + 1, true, 0);
         } else {
           // if we actually processed something to the output, increment count
-          if (last_result != parse_result::EMPTY) { ctx.count++; }
+          if (last_result != parse_result::EMPTY) { ctx.element_count++; }
 
           // next element
           PARSE_TRY(ctx.j_state.next_matching_element(op.name, false));
@@ -711,9 +747,9 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
           }
 
           // re-push ourselves
-          push_context(ctx.j_state, ctx.commands, false, true);
+          push_context(ctx.j_state, ctx.commands, false, 0, true);
           // push the next command
-          push_context(ctx.j_state, ctx.commands + 1, ctx.count > 0 ? true : false);
+          push_context(ctx.j_state, ctx.commands + 1, true, ctx.element_count);
         }
       } break;
 
@@ -731,7 +767,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
               PARSE_TRY(ctx.j_state.next_matching_element(any, false));
               if (last_result == parse_result::EMPTY) { break; }
             }
-            push_context(ctx.j_state, ctx.commands + 1, ctx.list_element);
+            push_context(ctx.j_state, ctx.commands + 1, ctx.list_element, ctx.element_count);
           }
         }
       } break;
@@ -741,8 +777,10 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
 
       // END case
       default: {
-        if (ctx.list_element) { output.add_output({"," DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); }
-        PARSE_TRY(ctx.j_state.extract_element(&output));
+        if (ctx.list_element && ctx.element_count > 0) {
+          output.add_output({"," DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN});
+        }
+        PARSE_TRY(ctx.j_state.extract_element(&output, ctx.list_element));
       } break;
     }
   }
@@ -759,6 +797,8 @@ CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input,
                                                              char* out_buf,
                                                              size_t out_buf_size)
 {
+  if (input_len == 0) { return json_output{0, 0, out_buf}; }
+
   json_state j_state(input, input_len);
   json_output output{out_buf_size, 0, out_buf};
 
@@ -839,8 +879,8 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
   return make_strings_column(col.size(),
                              std::move(offsets),
                              std::move(chars),
-                             UNKNOWN_NULL_COUNT,
-                             rmm::device_buffer{},
+                             col.null_count(),
+                             cudf::detail::copy_bitmask(col.parent(), stream, mr),
                              stream,
                              mr);
 }
diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp
index b00fb39158c..cf53d7f54e2 100644
--- a/cpp/tests/strings/json_tests.cpp
+++ b/cpp/tests/strings/json_tests.cpp
@@ -199,4 +199,25 @@ TEST_F(JsonTests, GetJsonObject)
 
     cudf::test::print(*result);
   }
+
+  {
+    // spark behavioral difference.
+    //  standard:     "fiction"
+    //  spark:        fiction
+    cudf::test::strings_column_wrapper input{json_string};
+    std::string json_path("$.store.book[2].category");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::print(*result);
+  }
+
+  {
+    char const* str = "{\"a\" : \"b\"}";
+    cudf::test::strings_column_wrapper input{{str, str, str, str}, {1, 0, 1, 0}};
+
+    std::string json_path("$.a");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::print(*result);
+  }
 }

From f4196368d17b1018415f981b93887cf7becf2fd8 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Wed, 24 Feb 2021 15:54:20 -0600
Subject: [PATCH 14/33] Return null rows for queries with no result instead of
 just empty strings.

---
 cpp/src/strings/json/json_path.cu | 82 ++++++++++++++++++++++++-------
 cpp/tests/strings/json_tests.cpp  | 38 ++++++++++++++
 2 files changed, 101 insertions(+), 19 deletions(-)

diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index 51a16122b45..953e30098ec 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -6,6 +6,7 @@
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <io/utilities/column_type_histogram.hpp>
@@ -208,6 +209,7 @@ enum json_element_type { NONE, OBJECT, ARRAY, VALUE };
 struct json_output {
   size_t output_max_len;
   size_t output_len;
+  int element_count;
   char* output;
 
   CUDA_HOST_DEVICE_CALLABLE void add_output(const char* str, size_t len)
@@ -402,6 +404,7 @@ class json_state : private parser {
       }
       */
       output->add_output({start, end - start});
+      output->element_count++;
     }
     return parse_result::SUCCESS;
   }
@@ -609,7 +612,7 @@ class path_state : private parser {
   }
 };
 
-std::pair<rmm::device_uvector<path_operator>, int> build_command_buffer(
+std::tuple<rmm::device_uvector<path_operator>, int, bool> build_command_buffer(
   cudf::string_scalar const& json_path, rmm::cuda_stream_view stream)
 {
   std::string h_json_path = json_path.to_string(stream);
@@ -637,7 +640,9 @@ std::pair<rmm::device_uvector<path_operator>, int> build_command_buffer(
                   cudaMemcpyHostToDevice,
                   stream.value());
 
-  return {std::move(d_operators), max_stack_depth};
+  return {std::move(d_operators),
+          max_stack_depth,
+          h_operators.size() == 1 && h_operators[0].type == path_operator_type::END ? true : false};
 }
 
 #define PARSE_TRY(_x)                                                       \
@@ -797,10 +802,8 @@ CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input,
                                                              char* out_buf,
                                                              size_t out_buf_size)
 {
-  if (input_len == 0) { return json_output{0, 0, out_buf}; }
-
   json_state j_state(input, input_len);
-  json_output output{out_buf_size, 0, out_buf};
+  json_output output{out_buf_size, 0, 0, out_buf};
 
   parse_json_path<max_command_stack_depth>(j_state, commands, output);
 
@@ -812,20 +815,37 @@ __global__ void get_json_object_kernel(char const* chars,
                                        path_operator const* const commands,
                                        size_type* output_offsets,
                                        char* out_buf,
+                                       bitmask_type* out_validity,
                                        size_type num_rows)
 {
   uint64_t const tid = threadIdx.x + (blockDim.x * blockIdx.x);
 
-  if (tid >= num_rows) { return; }
-
-  char* dst       = out_buf ? out_buf + output_offsets[tid] : nullptr;
-  size_t dst_size = out_buf ? output_offsets[tid + 1] - output_offsets[tid] : 0;
+  bool is_valid = false;
+  if (tid < num_rows) {
+    size_type src_size    = offsets[tid + 1] - offsets[tid];
+    size_type output_size = 0;
+    if (src_size > 0) {
+      char* dst       = out_buf ? out_buf + output_offsets[tid] : nullptr;
+      size_t dst_size = out_buf ? output_offsets[tid + 1] - output_offsets[tid] : 0;
+
+      json_output out =
+        get_json_object_single(chars + offsets[tid], src_size, commands, dst, dst_size);
+      output_size = out.output_len;
+      if (out.element_count > 0) { is_valid = true; }
+    }
 
-  json_output out = get_json_object_single(
-    chars + offsets[tid], offsets[tid + 1] - offsets[tid], commands, dst, dst_size);
+    // filled in only during the precompute step
+    if (!out_buf) { output_offsets[tid] = static_cast<size_type>(output_size); }
+  }
 
-  // filled in only during the precompute step
-  if (!out_buf) { output_offsets[tid] = static_cast<size_type>(out.output_len); }
+  // validity filled in only during the output step
+  if (out_validity) {
+    uint32_t mask = __ballot_sync(0xffffffff, is_valid);
+    // 0th lane of the warp writes the validity
+    if (!(tid % cudf::detail::warp_size) && tid < num_rows) {
+      out_validity[cudf::word_index(tid)] = mask;
+    }
+  }
 }
 
 std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& col,
@@ -834,24 +854,42 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
                                               rmm::mr::device_memory_resource* mr)
 {
   // preprocess the json_path into a command buffer
-  std::pair<rmm::device_uvector<path_operator>, int> preprocess =
+  std::tuple<rmm::device_uvector<path_operator>, int, bool> preprocess =
     build_command_buffer(json_path, stream);
-  CUDF_EXPECTS(preprocess.second <= max_command_stack_depth,
+  CUDF_EXPECTS(std::get<1>(preprocess) <= max_command_stack_depth,
                "Encountered json_path string that is too complex");
 
   auto offsets = cudf::make_fixed_width_column(
     data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr);
   cudf::mutable_column_view offsets_view(*offsets);
 
+  // if the query is empty, return a string column containing all nulls
+  if (std::get<2>(preprocess)) {
+    thrust::generate(rmm::exec_policy(stream),
+                     offsets_view.head<size_type>(),
+                     offsets_view.head<size_type>() + offsets_view.size(),
+                     [] __device__() { return 0; });
+    return cudf::make_strings_column(
+      col.size(),
+      std::move(offsets),
+      cudf::make_fixed_width_column(
+        data_type{type_id::INT8}, 0, mask_state::UNALLOCATED, stream, mr),
+      col.size(),
+      cudf::detail::create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr),
+      stream,
+      mr);
+  }
+
   cudf::detail::grid_1d const grid{col.size(), 512};
 
   // preprocess sizes
   get_json_object_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
     col.chars().head<char>(),
     col.offsets().head<size_type>(),
-    preprocess.first.data(),
+    std::get<0>(preprocess).data(),
     offsets_view.head<size_type>(),
     nullptr,
+    nullptr,
     col.size());
 
   // convert sizes to offsets
@@ -866,21 +904,27 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
   auto chars = cudf::make_fixed_width_column(
     data_type{type_id::INT8}, output_size, mask_state::UNALLOCATED, stream, mr);
 
+  // potential optimization : if we know that all outputs are valid, we could skip creating
+  // the validity mask altogether
+  rmm::device_buffer validity =
+    cudf::detail::create_null_mask(col.size(), mask_state::UNINITIALIZED, stream, mr);
+
   // compute results
   cudf::mutable_column_view chars_view(*chars);
   get_json_object_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
     col.chars().head<char>(),
     col.offsets().head<size_type>(),
-    preprocess.first.data(),
+    std::get<0>(preprocess).data(),
     offsets_view.head<size_type>(),
     chars_view.head<char>(),
+    static_cast<bitmask_type*>(validity.data()),
     col.size());
 
   return make_strings_column(col.size(),
                              std::move(offsets),
                              std::move(chars),
-                             col.null_count(),
-                             cudf::detail::copy_bitmask(col.parent(), stream, mr),
+                             UNKNOWN_NULL_COUNT,
+                             std::move(validity),
                              stream,
                              mr);
 }
diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp
index cf53d7f54e2..bb33b0d8289 100644
--- a/cpp/tests/strings/json_tests.cpp
+++ b/cpp/tests/strings/json_tests.cpp
@@ -220,4 +220,42 @@ TEST_F(JsonTests, GetJsonObject)
 
     cudf::test::print(*result);
   }
+
+  // empty query -> null
+  {
+    cudf::test::strings_column_wrapper input{""};
+    std::string json_path("");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::print(*result);
+  }
+
+  // empty input -> null
+  {
+    cudf::test::strings_column_wrapper input{""};
+    std::string json_path("$");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::print(*result);
+  }
+
+  // empty output -> null
+  {
+    cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"};
+    std::string json_path("$[*].c");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::print(*result);
+  }
+
+  // slightly different from "empty output". in this case, we're
+  // returning something, but it happens to be empty. so we expect
+  // a valid, but empty row
+  {
+    cudf::test::strings_column_wrapper input{"{\"store\": { \"bicycle\" : \"\" } }"};
+    std::string json_path("$.store.bicycle");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::print(*result);
+  }
 }

From 4cd0e2d7633f5c7175dcb7c6898bac8882acd9d7 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Wed, 24 Mar 2021 20:55:03 -0500
Subject: [PATCH 15/33] get_json_path() cleaned up and ready for review.

---
 cpp/src/io/csv/csv_gpu.cu              |   6 +-
 cpp/src/io/json/json_gpu.cu            |   4 +-
 cpp/src/io/utilities/parsing_utils.cuh | 142 ++++---
 cpp/src/strings/json/json_path.cu      | 532 +++++++++++--------------
 cpp/tests/strings/json_tests.cpp       | 479 ++++++++++++++++------
 5 files changed, 664 insertions(+), 499 deletions(-)

diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu
index 86e5f1fdcae..44acc7fc55f 100644
--- a/cpp/src/io/csv/csv_gpu.cu
+++ b/cpp/src/io/csv/csv_gpu.cu
@@ -196,7 +196,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
       } else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) ||
                  serialized_trie_contains(opts.trie_false, {field_start, field_len})) {
         atomicAdd(&d_columnData[actual_col].bool_count, 1);
-      } else if (cudf::io::gpu::is_infinity(field_start, next_delimiter)) {
+      } else if (cudf::io::is_infinity(field_start, next_delimiter)) {
         atomicAdd(&d_columnData[actual_col].float_count, 1);
       } else {
         long countNumber   = 0;
@@ -277,7 +277,7 @@ __inline__ __device__ T decode_value(char const *begin,
                                      char const *end,
                                      parse_options_view const &opts)
 {
-  return cudf::io::gpu::parse_numeric<T, base>(begin, end, opts);
+  return cudf::io::parse_numeric<T, base>(begin, end, opts);
 }
 
 template <typename T>
@@ -285,7 +285,7 @@ __inline__ __device__ T decode_value(char const *begin,
                                      char const *end,
                                      parse_options_view const &opts)
 {
-  return cudf::io::gpu::parse_numeric<T>(begin, end, opts);
+  return cudf::io::parse_numeric<T>(begin, end, opts);
 }
 
 template <>
diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu
index 5efb64fd4d5..75910ae6b5b 100644
--- a/cpp/src/io/json/json_gpu.cu
+++ b/cpp/src/io/json/json_gpu.cu
@@ -114,7 +114,7 @@ __inline__ __device__ T decode_value(const char *begin,
                                      uint64_t end,
                                      parse_options_view const &opts)
 {
-  return cudf::io::gpu::parse_numeric<T, base>(begin, end, opts);
+  return cudf::io::parse_numeric<T, base>(begin, end, opts);
 }
 
 /**
@@ -131,7 +131,7 @@ __inline__ __device__ T decode_value(const char *begin,
                                      const char *end,
                                      parse_options_view const &opts)
 {
-  return cudf::io::gpu::parse_numeric<T>(begin, end, opts);
+  return cudf::io::parse_numeric<T>(begin, end, opts);
 }
 
 /**
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 375874ea11e..b7719cba580 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -84,67 +84,6 @@ struct parse_options {
   }
 };
 
-namespace gpu {
-/**
- * @brief CUDA kernel iterates over the data until the end of the current field
- *
- * Also iterates over (one or more) delimiter characters after the field.
- * Function applies to formats with field delimiters and line terminators.
- *
- * @param begin Pointer to the first element of the string
- * @param end Pointer to the first element after the string
- * @param opts A set of parsing options
- * @param escape_char A boolean value to signify whether to consider `\` as escape character or
- * just a character.
- *
- * @return Pointer to the last character in the field, including the
- *  delimiter(s) following the field data
- */
-__device__ __inline__ char const* seek_field_end(char const* begin,
-                                                 char const* end,
-                                                 parse_options_view const& opts,
-                                                 bool escape_char = false)
-{
-  bool quotation   = false;
-  auto current     = begin;
-  bool escape_next = false;
-  while (true) {
-    // Use simple logic to ignore control chars between any quote seq
-    // Handles nominal cases including doublequotes within quotes, but
-    // may not output exact failures as PANDAS for malformed fields.
-    // Check for instances such as "a2\"bc" and "\\" if `escape_char` is true.
-
-    if (*current == opts.quotechar and not escape_next) {
-      quotation = !quotation;
-    } else if (!quotation) {
-      if (*current == opts.delimiter) {
-        while (opts.multi_delimiter && current < end && *(current + 1) == opts.delimiter) {
-          ++current;
-        }
-        break;
-      } else if (*current == opts.terminator) {
-        break;
-      } else if (*current == '\r' && (current + 1 < end && *(current + 1) == '\n')) {
-        --end;
-        break;
-      }
-    }
-
-    if (escape_char == true) {
-      // If a escape character is encountered, escape next character in next loop.
-      if (escape_next == false and *current == '\\') {
-        escape_next = true;
-      } else {
-        escape_next = false;
-      }
-    }
-
-    if (current >= end) break;
-    current++;
-  }
-  return current;
-}
-
 /**
  * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization
  * for integral types. Handles hexadecimal digits, both uppercase and lowercase.
@@ -157,7 +96,7 @@ __device__ __inline__ char const* seek_field_end(char const* begin,
  * @return uint8_t Numeric value of the character, or `0`
  */
 template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
-__device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag)
+constexpr uint8_t decode_digit(char c, bool* valid_flag)
 {
   if (c >= '0' && c <= '9') return c - '0';
   if (c >= 'a' && c <= 'f') return c - 'a' + 10;
@@ -178,7 +117,7 @@ __device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag)
  * @return uint8_t Numeric value of the character, or `0`
  */
 template <typename T, typename std::enable_if_t<!std::is_integral<T>::value>* = nullptr>
-__device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag)
+constexpr uint8_t decode_digit(char c, bool* valid_flag)
 {
   if (c >= '0' && c <= '9') return c - '0';
 
@@ -187,10 +126,7 @@ __device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag)
 }
 
 // Converts character to lowercase.
-__inline__ __device__ char to_lower(char const c)
-{
-  return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c;
-}
+constexpr char to_lower(char const c) { return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; }
 
 /**
  * @brief Checks if string is infinity, case insensitive with/without sign
@@ -201,7 +137,7 @@ __inline__ __device__ char to_lower(char const c)
  * @param end Pointer to the first element after the string
  * @return true if string is valid infinity, else false.
  */
-__inline__ __device__ bool is_infinity(char const* begin, char const* end)
+constexpr bool is_infinity(char const* begin, char const* end)
 {
   if (*begin == '-' || *begin == '+') begin++;
   char const* cinf = "infinity";
@@ -225,9 +161,10 @@ __inline__ __device__ bool is_infinity(char const* begin, char const* end)
  * @return The parsed and converted value
  */
 template <typename T, int base = 10>
-__inline__ __device__ T parse_numeric(const char* begin,
-                                      const char* end,
-                                      parse_options_view const& opts)
+constexpr T parse_numeric(const char* begin,
+                          const char* end,
+                          parse_options_view const& opts,
+                          T error_result = std::numeric_limits<T>::quiet_NaN())
 {
   T value{};
   bool all_digits_valid = true;
@@ -283,11 +220,72 @@ __inline__ __device__ T parse_numeric(const char* begin,
       if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); }
     }
   }
-  if (!all_digits_valid) { return std::numeric_limits<T>::quiet_NaN(); }
+  if (!all_digits_valid) { return error_result; }
 
   return value * sign;
 }
 
+namespace gpu {
+/**
+ * @brief Device function that iterates over the data until the end of the current field
+ *
+ * Also iterates over (one or more) delimiter characters after the field.
+ * Function applies to formats with field delimiters and line terminators.
+ *
+ * @param begin Pointer to the first element of the string
+ * @param end Pointer to the first element after the string
+ * @param opts A set of parsing options
+ * @param escape_char A boolean value to signify whether to consider `\` as escape character or
+ * just a character.
+ *
+ * @return Pointer to the last character in the field, including the
+ *  delimiter(s) following the field data
+ */
+__device__ __inline__ char const* seek_field_end(char const* begin,
+                                                 char const* end,
+                                                 parse_options_view const& opts,
+                                                 bool escape_char = false)
+{
+  bool quotation   = false;
+  auto current     = begin;
+  bool escape_next = false;
+  while (true) {
+    // Use simple logic to ignore control chars between any quote seq
+    // Handles nominal cases including doublequotes within quotes, but
+    // may not output exact failures as PANDAS for malformed fields.
+    // Check for instances such as "a2\"bc" and "\\" if `escape_char` is true.
+
+    if (*current == opts.quotechar and not escape_next) {
+      quotation = !quotation;
+    } else if (!quotation) {
+      if (*current == opts.delimiter) {
+        while (opts.multi_delimiter && current < end && *(current + 1) == opts.delimiter) {
+          ++current;
+        }
+        break;
+      } else if (*current == opts.terminator) {
+        break;
+      } else if (*current == '\r' && (current + 1 < end && *(current + 1) == '\n')) {
+        --end;
+        break;
+      }
+    }
+
+    if (escape_char == true) {
+      // If an escape character is encountered, escape the next character in the next iteration.
+      if (escape_next == false and *current == '\\') {
+        escape_next = true;
+      } else {
+        escape_next = false;
+      }
+    }
+
+    if (current >= end) break;
+    current++;
+  }
+  return current;
+}
+
 /**
  * @brief Lexicographically compare digits in input against string
  * representing an integer
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index 65503dfa1a3..8cb3c7bb25b 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -21,15 +21,14 @@ namespace detail {
 
 namespace {
 
-// temporary. for debugging purposes
-#define DEBUG_NEWLINE
-#define DEBUG_NEWLINE_LEN (0)
+// debug accessibility
 
-// #define DEBUG_NEWLINE  "\n"
-// #define DEBUG_NEWLINE_LEN (1)
+// change DEBUG_NEWLINE to "\n" and DEBUG_NEWLINE_LEN to 1 to make output more readable
+#define DEBUG_NEWLINE
+constexpr int DEBUG_NEWLINE_LEN = 0;
 
-// temporary. spark doesn't strictly follow the JSONPath spec.
-// I think this probably should be a configurable enum to control
+// temporary? spark doesn't strictly follow the JSONPath spec.
+// I think this probably could be a configurable enum to control
 // the kind of output you get and what features are supported.
 //
 // Current known differences:
@@ -40,8 +39,6 @@ namespace {
 // - Spark only supports the wildcard operator when in a subscript, eg  [*]
 //   It does not handle .*
 //
-#define __SPARK_BEHAVIORS
-
 // Other, non-spark known differences:
 //
 // - In jsonpath_ng, name subscripts can use double quotes instead of the standard
@@ -49,199 +46,63 @@ namespace {
 //   standard:      $.thing['subscript']
 //   jsonpath_ng:   $.thing["subscript"]
 //
-//  Currently, this code only allows single-quotes but that can be easily expanded.
+//  Currently, this code only allows single-quotes but that could be expanded if necessary.
 //
+#define SPARK_BEHAVIORS
 
 using namespace cudf;
 
-CUDA_HOST_DEVICE_CALLABLE char to_lower(char const c)
-{
-  return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c;
-}
-
-template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
-CUDA_HOST_DEVICE_CALLABLE uint8_t decode_digit(char c, bool* valid_flag)
-{
-  if (c >= '0' && c <= '9') return c - '0';
-  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
-  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
-
-  *valid_flag = false;
-  return 0;
-}
-
-template <typename T, typename std::enable_if_t<!std::is_integral<T>::value>* = nullptr>
-CUDA_HOST_DEVICE_CALLABLE uint8_t decode_digit(char c, bool* valid_flag)
-{
-  if (c >= '0' && c <= '9') return c - '0';
-
-  *valid_flag = false;
-  return 0;
-}
-
-CUDA_HOST_DEVICE_CALLABLE bool is_infinity(char const* begin, char const* end)
-{
-  if (*begin == '-' || *begin == '+') begin++;
-  char const* cinf = "infinity";
-  auto index       = begin;
-  while (index < end) {
-    if (*cinf != to_lower(*index)) break;
-    index++;
-    cinf++;
-  }
-  return ((index == begin + 3 || index == begin + 8) && index >= end);
-}
-
-template <typename T, int base = 10>
-CUDA_HOST_DEVICE_CALLABLE T parse_numeric(const char* begin,
-                                          const char* end,
-                                          cudf::io::parse_options_view const& opts)
-{
-  T value{};
-  bool all_digits_valid = true;
-
-  // Handle negative values if necessary
-  int32_t sign = (*begin == '-') ? -1 : 1;
-
-  // Handle infinity
-  if (std::is_floating_point<T>::value && is_infinity(begin, end)) {
-    return sign * std::numeric_limits<T>::infinity();
-  }
-  if (*begin == '-' || *begin == '+') begin++;
-
-  // Skip over the "0x" prefix for hex notation
-  if (base == 16 && begin + 2 < end && *begin == '0' && *(begin + 1) == 'x') { begin += 2; }
-
-  // Handle the whole part of the number
-  // auto index = begin;
-  while (begin < end) {
-    if (*begin == opts.decimal) {
-      ++begin;
-      break;
-    } else if (base == 10 && (*begin == 'e' || *begin == 'E')) {
-      break;
-    } else if (*begin != opts.thousands && *begin != '+') {
-      value = (value * base) + decode_digit<T>(*begin, &all_digits_valid);
-    }
-    ++begin;
-  }
-
-  if (std::is_floating_point<T>::value) {
-    // Handle fractional part of the number if necessary
-    double divisor = 1;
-    while (begin < end) {
-      if (*begin == 'e' || *begin == 'E') {
-        ++begin;
-        break;
-      } else if (*begin != opts.thousands && *begin != '+') {
-        divisor /= base;
-        value += decode_digit<T>(*begin, &all_digits_valid) * divisor;
-      }
-      ++begin;
-    }
-
-    // Handle exponential part of the number if necessary
-    if (begin < end) {
-      const int32_t exponent_sign = *begin == '-' ? -1 : 1;
-      if (*begin == '-' || *begin == '+') { ++begin; }
-      int32_t exponent = 0;
-      while (begin < end) {
-        exponent = (exponent * 10) + decode_digit<T>(*(begin++), &all_digits_valid);
-      }
-      if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); }
-    }
-  }
-  if (!all_digits_valid) { return std::numeric_limits<T>::quiet_NaN(); }
-
-  return value * sign;
-}
-
-CUDA_HOST_DEVICE_CALLABLE bool device_strncmp(const char* str1, const char* str2, size_t num_chars)
-{
-  for (size_t idx = 0; idx < num_chars; idx++) {
-    if (str1[idx] != str2[idx]) { return false; }
-  }
-  return true;
-}
-
-CUDA_HOST_DEVICE_CALLABLE char const* device_strpbrk(const char* str,
-                                                     size_t str_size,
-                                                     const char* tok,
-                                                     size_t tok_size)
-{
-  size_t pos = 0;
-  while (pos < str_size) {
-    size_t tpos = 0;
-    char c      = str[pos];
-    while (tpos < tok_size) {
-      if (c == tok[tpos]) { return str + pos; }
-      tpos++;
-    }
-    pos++;
-  }
-  return nullptr;
-}
+/**
+ * @brief Result of calling a parse type function.
+ *
+ * The primary use of this is to distinguish between "success" and
+ * "success but no data" return cases.  For example, if you are reading the
+ * values of an array you might call a parse function in a while loop. You
+ * would want to continue doing this until you either encounter an error (parse_result::ERROR)
+ * or you get nothing back (parse_result::EMPTY)
+ */
+enum class parse_result {
+  ERROR,    // failure
+  SUCCESS,  // success
+  EMPTY,    // success, but no data
+};
 
 struct json_string {
   const char* str;
   int64_t len;
 
-  CUDA_HOST_DEVICE_CALLABLE json_string() : str(nullptr), len(-1) {}
-  CUDA_HOST_DEVICE_CALLABLE json_string(const char* _str, int64_t _len) : str(_str), len(_len) {}
+  constexpr json_string() : str(nullptr), len(-1) {}
+  constexpr json_string(const char* _str, int64_t _len) : str(_str), len(_len) {}
 
-  CUDA_HOST_DEVICE_CALLABLE bool operator==(json_string const& cmp)
+  constexpr bool operator==(json_string const& cmp)
   {
     return len == cmp.len && str != nullptr && cmp.str != nullptr &&
-           device_strncmp(str, cmp.str, static_cast<size_t>(len));
+           thrust::equal(thrust::seq, str, str + len, cmp.str);
   }
 };
 
-enum class parse_result {
-  ERROR,
-  SUCCESS,
-  EMPTY,
-};
-
-enum json_element_type { NONE, OBJECT, ARRAY, VALUE };
-
-struct json_output {
-  size_t output_max_len;
-  size_t output_len;
-  int element_count;
-  char* output;
-
-  CUDA_HOST_DEVICE_CALLABLE void add_output(const char* str, size_t len)
-  {
-    if (output != nullptr) {
-      // assert output_len + len < output_max_len
-      memcpy(output + output_len, str, len);
-    }
-    output_len += len;
-  }
-
-  CUDA_HOST_DEVICE_CALLABLE void add_output(json_string str) { add_output(str.str, str.len); }
-};
-
-CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c)
-{
-  return c == ' ' || c == '\r' || c == '\n' || c == '\t' ? true : false;
-}
-
+/**
+ * @brief Base parser class inherited by the (device-side) json_state class and
+ * (host-side) path_state class.
+ *
+ * Contains a number of useful utility functions common to parsing json and
+ * JSONPath strings.
+ */
 class parser {
  protected:
-  CUDA_HOST_DEVICE_CALLABLE parser() : input(nullptr), input_len(0), pos(nullptr) {}
-  CUDA_HOST_DEVICE_CALLABLE parser(const char* _input, int64_t _input_len)
+  constexpr parser() : input(nullptr), input_len(0), pos(nullptr) {}
+  constexpr parser(const char* _input, int64_t _input_len)
     : input(_input), input_len(_input_len), pos(_input)
   {
     parse_whitespace();
   }
 
-  CUDA_HOST_DEVICE_CALLABLE parser(parser const& p) { *this = p; }
+  constexpr parser(parser const& p) : input(p.input), input_len(p.input_len), pos(p.pos) {}
 
-  CUDA_HOST_DEVICE_CALLABLE bool eof(const char* p) { return p - input >= input_len; }
-  CUDA_HOST_DEVICE_CALLABLE bool eof() { return eof(pos); }
+  constexpr bool eof(const char* p) { return p - input >= input_len; }
+  constexpr bool eof() { return eof(pos); }
 
-  CUDA_HOST_DEVICE_CALLABLE bool parse_whitespace()
+  constexpr bool parse_whitespace()
   {
     while (!eof()) {
       if (is_whitespace(*pos)) {
@@ -253,25 +114,21 @@ class parser {
     return false;
   }
 
-  CUDA_HOST_DEVICE_CALLABLE parse_result parse_string(json_string& str,
-                                                      bool can_be_empty,
-                                                      char quote)
+  constexpr parse_result parse_string(json_string& str, bool can_be_empty, char quote)
   {
     str.str = nullptr;
     str.len = 0;
 
-    if (parse_whitespace()) {
-      if (*pos == quote) {
-        const char* start = ++pos;
-        while (!eof()) {
-          if (*pos == quote) {
-            str.str = start;
-            str.len = pos - start;
-            pos++;
-            return parse_result::SUCCESS;
-          }
+    if (parse_whitespace() && *pos == quote) {
+      const char* start = ++pos;
+      while (!eof()) {
+        if (*pos == quote) {
+          str.str = start;
+          str.len = pos - start;
           pos++;
+          return parse_result::SUCCESS;
         }
+        pos++;
       }
     }
 
@@ -281,9 +138,7 @@ class parser {
   // a name means:
   // - a string followed by a :
   // - no string
-  CUDA_HOST_DEVICE_CALLABLE parse_result parse_name(json_string& name,
-                                                    bool can_be_empty,
-                                                    char quote)
+  constexpr parse_result parse_name(json_string& name, bool can_be_empty, char quote)
   {
     if (parse_string(name, can_be_empty, quote) == parse_result::ERROR) {
       return parse_result::ERROR;
@@ -300,18 +155,22 @@ class parser {
     return parse_result::EMPTY;
   }
 
-  // this function is not particularly strong
-  CUDA_HOST_DEVICE_CALLABLE parse_result parse_number(json_string& val)
+  // numbers, true, false, null.
+  // this function only does minimal validation. apart from a few illegal characters,
+  // badly formed values will get consumed without reporting any errors
+  constexpr parse_result parse_non_string_value(json_string& val)
   {
     if (!parse_whitespace()) { return parse_result::ERROR; }
 
-    // parse to the end of the number (does not do any error checking on whether
-    // the number is reasonably formed or not)
+    // parse to the end of the value
     char const* start = pos;
     char const* end   = start;
     while (!eof(end)) {
-      char c = *end;
-      if (c == ',' || is_whitespace(c)) { break; }
+      char const c = *end;
+      if (c == ',' || c == '}' || c == ']' || is_whitespace(c)) { break; }
+
+      // illegal chars
+      if (c == '[' || c == '{' || c == ':' || c == '\"') { return parse_result::ERROR; }
       end++;
     }
     pos = end;
@@ -326,25 +185,56 @@ class parser {
   char const* input;
   int64_t input_len;
   char const* pos;
+
+ private:
+  constexpr bool is_whitespace(char c)
+  {
+    return c == ' ' || c == '\r' || c == '\n' || c == '\t' ? true : false;
+  }
 };
 
+struct json_output {
+  size_t output_max_len;
+  size_t output_len;
+  int element_count;
+  char* output;
+
+  constexpr void add_output(const char* str, size_t len)
+  {
+    if (output != nullptr) { memcpy(output + output_len, str, len); }
+    output_len += len;
+  }
+
+  constexpr void add_output(json_string str) { add_output(str.str, str.len); }
+};
+
+enum json_element_type { NONE, OBJECT, ARRAY, VALUE };
+
 class json_state : private parser {
  public:
-  CUDA_HOST_DEVICE_CALLABLE json_state()
-    : parser(), element(json_element_type::NONE), cur_el_start(nullptr)
+  constexpr json_state()
+    : parser(),
+      element(json_element_type::NONE),
+      cur_el_start(nullptr),
+      cur_el_type(json_element_type::NONE)
   {
   }
-  CUDA_HOST_DEVICE_CALLABLE json_state(const char* _input, int64_t _input_len)
-    : parser(_input, _input_len), element(json_element_type::NONE), cur_el_start(nullptr)
+  constexpr json_state(const char* _input, int64_t _input_len)
+    : parser(_input, _input_len),
+      element(json_element_type::NONE),
+      cur_el_start(nullptr),
+      cur_el_type(json_element_type::NONE)
   {
   }
 
-  CUDA_HOST_DEVICE_CALLABLE json_state(json_state const& j) : parser(j) { *this = j; }
-
-  CUDA_HOST_DEVICE_CALLABLE parse_result extract_element(json_output* output, bool list_element)
+  constexpr json_state(json_state const& j)
+    : parser(j), element(j.element), cur_el_start(j.cur_el_start), cur_el_type(j.cur_el_type)
   {
-    // collapse the current element into a json_string
+  }
 
+  // retrieve the entire current element as a json_string
+  constexpr parse_result extract_element(json_output* output, bool list_element)
+  {
     char const* start = cur_el_start;
     char const* end   = start;
 
@@ -354,7 +244,7 @@ class json_state : private parser {
       if (parse_value() != parse_result::SUCCESS) { return parse_result::ERROR; }
       end = pos;
 
-#if defined(__SPARK_BEHAVIORS)
+#if defined(SPARK_BEHAVIORS)
       // spark/hive-specific behavior.  if this is a non-list-element wrapped in quotes,
       // strip them
       if (!list_element && *start == '\"' && *(end - 1) == '\"') {
@@ -369,11 +259,10 @@ class json_state : private parser {
       int arr_count = 0;
 
       while (!eof(end)) {
-        char c = *end++;
         // could do some additional checks here. we know our current
         // element type, so we could be more strict on what kinds of
         // characters we expect to see.
-        switch (c) {
+        switch (*end++) {
           case '{': obj_count++; break;
           case '}': obj_count--; break;
           case '[': arr_count++; break;
@@ -382,6 +271,7 @@ class json_state : private parser {
         }
         if (obj_count == 0 && arr_count == 0) { break; }
       }
+      if (obj_count > 0 || arr_count > 0) { return parse_result::ERROR; }
       pos = end;
     }
 
@@ -391,32 +281,24 @@ class json_state : private parser {
     }
 
     if (output != nullptr) {
-      // seems like names are never included with JSONPath unless
-      // they are nested within the element being returned.
-      /*
-      if(cur_el_name.len > 0){
-        output->add_output({"\"", 1});
-        output->add_output(cur_el_name);
-        output->add_output({"\"", 1});
-        output->add_output({":", 1});
-      }
-      */
       output->add_output({start, end - start});
       output->element_count++;
     }
     return parse_result::SUCCESS;
   }
 
-  CUDA_HOST_DEVICE_CALLABLE parse_result skip_element() { return extract_element(nullptr, false); }
-
-  json_element_type element;
+  constexpr parse_result skip_element() { return extract_element(nullptr, false); }
 
-  CUDA_HOST_DEVICE_CALLABLE parse_result next_element() { return next_element_internal(false); }
+  constexpr parse_result next_element() { return next_element_internal(false); }
 
-  CUDA_HOST_DEVICE_CALLABLE parse_result child_element() { return next_element_internal(true); }
+  constexpr parse_result child_element(bool as_field = false)
+  {
+    // cannot retrieve a field from an array
+    if (as_field && cur_el_type == json_element_type::ARRAY) { return parse_result::ERROR; }
+    return next_element_internal(true);
+  }
 
-  CUDA_HOST_DEVICE_CALLABLE parse_result next_matching_element(json_string const& name,
-                                                               bool inclusive)
+  constexpr parse_result next_matching_element(json_string const& name, bool inclusive)
   {
     // if we're not including the current element, skip it
     if (!inclusive) {
@@ -441,16 +323,16 @@ class json_state : private parser {
   }
 
  private:
-  CUDA_HOST_DEVICE_CALLABLE parse_result parse_value()
+  constexpr parse_result parse_value()
   {
     if (!parse_whitespace()) { return parse_result::ERROR; }
 
     // string or number?
     json_string unused;
-    return *pos == '\"' ? parse_string(unused, false, '\"') : parse_number(unused);
+    return *pos == '\"' ? parse_string(unused, false, '\"') : parse_non_string_value(unused);
   }
 
-  CUDA_HOST_DEVICE_CALLABLE parse_result next_element_internal(bool child)
+  constexpr parse_result next_element_internal(bool child)
   {
     // if we're not getting a child element, skip the current element.
     // this will leave pos as the first character -after- the close of
@@ -461,10 +343,15 @@ class json_state : private parser {
     }
     // otherwise pos will be at the first character within the current element
 
+    // can only get the child of an object or array.
+    // this could theoretically be handled as an error, but the evaluators I've found
+    // seem to treat this as "it's nothing"
+    if (child && (cur_el_type == VALUE || cur_el_type == NONE)) { return parse_result::EMPTY; }
+
     // what's next
     if (!parse_whitespace()) { return parse_result::EMPTY; }
     // if we're closing off a parent element, we're done
-    char c = *pos;
+    char const c = *pos;
     if (c == ']' || c == '}') { return parse_result::EMPTY; }
 
     // element name, if any
@@ -472,7 +359,7 @@ class json_state : private parser {
 
     // element type
     if (!parse_whitespace()) { return parse_result::EMPTY; }
-    switch (*pos) {
+    switch (*pos++) {
       case '[': cur_el_type = ARRAY; break;
       case '{': cur_el_type = OBJECT; break;
 
@@ -483,13 +370,13 @@ class json_state : private parser {
       // value type
       default: cur_el_type = VALUE; break;
     }
-    pos++;
 
     // the start of the current element is always at the value, not the name
     cur_el_start = pos - 1;
     return parse_result::SUCCESS;
   }
 
+  json_element_type element;
   const char* cur_el_start;
   json_string cur_el_name;
   json_element_type cur_el_type;
@@ -498,34 +385,24 @@ class json_state : private parser {
 enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END };
 
 struct path_operator {
-  CUDA_HOST_DEVICE_CALLABLE path_operator() : type(path_operator_type::ERROR), index(-1) {}
-  CUDA_HOST_DEVICE_CALLABLE path_operator(path_operator_type _type) : type(_type), index(-1) {}
+  constexpr path_operator() : type(path_operator_type::ERROR), index(-1) {}
+  constexpr path_operator(path_operator_type _type) : type(_type), index(-1) {}
 
   path_operator_type type;
   json_string name;
   int index;
 };
-struct command_buffer {
-  rmm::device_uvector<path_operator> commands;
-  // used as backing memory for the name fields inside the
-  // path_operator objects
-  string_scalar json_path;
-};
 
 // current state of the JSONPath
 class path_state : private parser {
  public:
-  CUDA_HOST_DEVICE_CALLABLE path_state(const char* _path, size_t _path_len)
-    : parser(_path, _path_len)
-  {
-  }
+  path_state(const char* _path, size_t _path_len) : parser(_path, _path_len) {}
 
-  CUDA_HOST_DEVICE_CALLABLE path_operator get_next_operator()
+  path_operator get_next_operator()
   {
     if (eof()) { return {path_operator_type::END}; }
 
-    char c = *pos++;
-    switch (c) {
+    switch (*pos++) {
       case '$': return {path_operator_type::ROOT};
 
       case '.': {
@@ -551,7 +428,7 @@ class path_state : private parser {
       case '[': {
         path_operator op;
         json_string term{"]", 1};
-        bool is_string = *pos == '\'' ? true : false;
+        bool const is_string = *pos == '\'' ? true : false;
         if (parse_path_name(op.name, term)) {
           pos++;
           if (op.name.len == 1 && op.name.str[0] == '*') {
@@ -560,15 +437,23 @@ class path_state : private parser {
             if (is_string) {
               op.type = path_operator_type::CHILD;
             } else {
-              op.type  = path_operator_type::CHILD_INDEX;
-              op.index = parse_numeric<int>(op.name.str, op.name.str + op.name.len, json_opts);
+              op.type = path_operator_type::CHILD_INDEX;
+              op.index =
+                cudf::io::parse_numeric<int>(op.name.str, op.name.str + op.name.len, json_opts, -1);
+              CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath");
             }
           }
           return op;
         }
       } break;
 
-      default: break;
+      // wildcard operator
+      case '*': {
+        pos++;
+        return path_operator{path_operator_type::CHILD_WILDCARD};
+      } break;
+
+      default: CUDF_FAIL("Unrecognized JSONPath operator"); break;
     }
     return {path_operator_type::ERROR};
   }
@@ -576,10 +461,9 @@ class path_state : private parser {
  private:
   cudf::io::parse_options_view json_opts{',', '\n', '\"', '.'};
 
-  CUDA_HOST_DEVICE_CALLABLE bool parse_path_name(json_string& name, json_string& terminators)
+  bool parse_path_name(json_string& name, json_string const& terminators)
   {
-    char c = *pos;
-    switch (c) {
+    switch (*pos) {
       case '*':
         name.str = pos;
         name.len = 1;
@@ -592,7 +476,8 @@ class path_state : private parser {
 
       default: {
         size_t const chars_left = input_len - (pos - input);
-        char const* end         = device_strpbrk(pos, chars_left, terminators.str, terminators.len);
+        char const* end         = std::find_first_of(
+          pos, pos + chars_left, terminators.str, terminators.str + terminators.len);
         if (end) {
           name.str = pos;
           name.len = end - pos;
@@ -602,10 +487,13 @@ class path_state : private parser {
           name.len = chars_left;
           pos      = input + input_len;
         }
-        return true;
+        break;
       }
     }
 
+    // an empty name is not valid
+    CUDF_EXPECTS(name.len > 0, "Invalid empty name in JSONpath query string");
+
     return true;
   }
 };
@@ -628,15 +516,25 @@ std::tuple<rmm::device_uvector<path_operator>, int, bool> build_command_buffer(
     if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; }
     // convert pointer to device pointer
     if (op.name.len > 0) { op.name.str = json_path.data() + (op.name.str - h_json_path.data()); }
+    if (op.type == path_operator_type::ROOT) {
+      CUDF_EXPECTS(h_operators.size() == 0, "Root operator ($) can only exist at the root");
+    }
+    // if we haven't gotten a root operator to start, and we're not empty, quietly push a
+    // root operator now.
+    if (h_operators.size() == 0 && op.type != path_operator_type::ROOT &&
+        op.type != path_operator_type::END) {
+      h_operators.push_back(path_operator{path_operator_type::ROOT});
+    }
     h_operators.push_back(op);
   } while (op.type != path_operator_type::END);
 
   rmm::device_uvector<path_operator> d_operators(h_operators.size(), stream);
-  cudaMemcpyAsync(d_operators.data(),
-                  h_operators.data(),
-                  sizeof(path_operator) * h_operators.size(),
-                  cudaMemcpyHostToDevice,
-                  stream.value());
+  CUDA_TRY(cudaMemcpyAsync(d_operators.data(),
+                           h_operators.data(),
+                           sizeof(path_operator) * h_operators.size(),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
+  stream.synchronize();
 
   return {std::move(d_operators),
           max_stack_depth,
@@ -650,17 +548,17 @@ std::tuple<rmm::device_uvector<path_operator>, int, bool> build_command_buffer(
   } while (0)
 
 template <int max_command_stack_depth>
-CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
-                                                       path_operator const* _commands,
-                                                       json_output& output,
-                                                       bool _list_element = false)
+__device__ parse_result parse_json_path(json_state& _j_state,
+                                        path_operator const* _commands,
+                                        json_output& output,
+                                        bool _list_element = false)
 {
   // manually maintained context stack in lieu of calling parse_json_path recursively.
   struct context {
     json_state j_state;
     path_operator const* commands;
     bool list_element;
-    int element_count;
+    // int element_count;
     bool state_flag;
   };
   context stack[max_command_stack_depth];
@@ -668,10 +566,11 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
   auto push_context = [&stack, &stack_pos](json_state const& _j_state,
                                            path_operator const* _commands,
                                            bool _list_element = false,
-                                           int _element_count = 0,
-                                           bool _state_flag   = false) {
+                                           /* int _element_count = 0,*/
+                                           bool _state_flag = false) {
     if (stack_pos == max_command_stack_depth - 1) { return false; }
-    stack[stack_pos++] = context{_j_state, _commands, _list_element, _element_count, _state_flag};
+    stack[stack_pos++] =
+      context{_j_state, _commands, _list_element, /*_element_count,*/ _state_flag};
     return true;
   };
   auto pop_context = [&stack, &stack_pos](context& c) {
@@ -685,6 +584,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
 
   parse_result last_result = parse_result::SUCCESS;
   context ctx;
+  int element_count = 0;
   while (pop_context(ctx)) {
     path_operator op = *ctx.commands;
 
@@ -700,11 +600,11 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
       // [1]
       // will return a single thing
       case path_operator_type::CHILD: {
-        PARSE_TRY(ctx.j_state.child_element());
+        PARSE_TRY(ctx.j_state.child_element(true));
         if (last_result == parse_result::SUCCESS) {
           PARSE_TRY(ctx.j_state.next_matching_element(op.name, true));
           if (last_result == parse_result::SUCCESS) {
-            push_context(ctx.j_state, ctx.commands + 1, ctx.list_element, ctx.element_count);
+            push_context(ctx.j_state, ctx.commands + 1, ctx.list_element);
           }
         }
       } break;
@@ -715,44 +615,48 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
       case path_operator_type::CHILD_WILDCARD: {
         // if we're on the first element of this wildcard
         if (!ctx.state_flag) {
-          output.add_output("[" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN);
+          // we will only ever be returning 1 array
+          if (!ctx.list_element) { output.add_output({"[" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); }
 
           // step into the child element
           PARSE_TRY(ctx.j_state.child_element());
           if (last_result == parse_result::EMPTY) {
-            output.add_output("]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN);
+            if (!ctx.list_element) {
+              output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN});
+            }
             last_result = parse_result::SUCCESS;
             break;
           }
 
           // first element
-          PARSE_TRY(ctx.j_state.next_matching_element(op.name, true));
+          PARSE_TRY(ctx.j_state.next_matching_element({"*", 1}, true));
           if (last_result == parse_result::EMPTY) {
-            output.add_output("]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN);
+            if (!ctx.list_element) {
+              output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN});
+            }
             last_result = parse_result::SUCCESS;
             break;
           }
 
           // re-push ourselves
-          push_context(ctx.j_state, ctx.commands, false, 0, true);
+          push_context(ctx.j_state, ctx.commands, ctx.list_element, true);
           // push the next command
-          push_context(ctx.j_state, ctx.commands + 1, true, 0);
+          push_context(ctx.j_state, ctx.commands + 1, true);
         } else {
-          // if we actually processed something to the output, increment count
-          if (last_result != parse_result::EMPTY) { ctx.element_count++; }
-
           // next element
-          PARSE_TRY(ctx.j_state.next_matching_element(op.name, false));
+          PARSE_TRY(ctx.j_state.next_matching_element({"*", 1}, false));
           if (last_result == parse_result::EMPTY) {
-            output.add_output("]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN);
+            if (!ctx.list_element) {
+              output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN});
+            }
             last_result = parse_result::SUCCESS;
             break;
           }
 
           // re-push ourselves
-          push_context(ctx.j_state, ctx.commands, false, 0, true);
+          push_context(ctx.j_state, ctx.commands, ctx.list_element, true);
           // push the next command
-          push_context(ctx.j_state, ctx.commands + 1, true, ctx.element_count);
+          push_context(ctx.j_state, ctx.commands + 1, true);
         }
       } break;
 
@@ -763,14 +667,17 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
       case path_operator_type::CHILD_INDEX: {
         PARSE_TRY(ctx.j_state.child_element());
         if (last_result == parse_result::SUCCESS) {
-          json_string any{"*", 1};
+          json_string const any{"*", 1};
           PARSE_TRY(ctx.j_state.next_matching_element(any, true));
           if (last_result == parse_result::SUCCESS) {
-            for (int idx = 1; idx <= op.index; idx++) {
+            int idx;
+            for (idx = 1; idx <= op.index; idx++) {
               PARSE_TRY(ctx.j_state.next_matching_element(any, false));
               if (last_result == parse_result::EMPTY) { break; }
             }
-            push_context(ctx.j_state, ctx.commands + 1, ctx.list_element, ctx.element_count);
+            // if we didn't end up at the index we requested, this is an invalid index
+            if (idx - 1 != op.index) { return parse_result::ERROR; }
+            push_context(ctx.j_state, ctx.commands + 1, ctx.list_element);
           }
         }
       } break;
@@ -780,13 +687,15 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
 
       // END case
       default: {
-        if (ctx.list_element && ctx.element_count > 0) {
+        if (ctx.list_element && element_count > 0) {
           output.add_output({"," DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN});
         }
         PARSE_TRY(ctx.j_state.extract_element(&output, ctx.list_element));
+        if (ctx.list_element && last_result != parse_result::EMPTY) { element_count++; }
       } break;
     }
   }
+
   return parse_result::SUCCESS;
 }
 
@@ -794,20 +703,29 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state,
 // a jsonpath containing 7 nested wildcards so this is probably reasonable.
 constexpr int max_command_stack_depth = 8;
 
-CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input,
-                                                             size_t input_len,
-                                                             path_operator const* const commands,
-                                                             char* out_buf,
-                                                             size_t out_buf_size)
+__device__ thrust::pair<parse_result, json_output> get_json_object_single(
+  char const* input,
+  size_t input_len,
+  path_operator const* const commands,
+  char* out_buf,
+  size_t out_buf_size)
 {
   json_state j_state(input, input_len);
   json_output output{out_buf_size, 0, 0, out_buf};
 
-  parse_json_path<max_command_stack_depth>(j_state, commands, output);
+  auto const result = parse_json_path<max_command_stack_depth>(j_state, commands, output);
 
-  return output;
+  return {result, output};
 }
 
+/**
+ * @brief Kernel for running the JSONPath query.
+ *
+ * This kernel operates in a 2-pass way.  On the first pass, it computes
+ * output sizes.  On the second pass it fills in the provided output buffers
+ * (chars and validity)
+ *
+ */
 __global__ void get_json_object_kernel(char const* chars,
                                        size_type const* offsets,
                                        path_operator const* const commands,
@@ -820,16 +738,18 @@ __global__ void get_json_object_kernel(char const* chars,
 
   bool is_valid = false;
   if (tid < num_rows) {
-    size_type src_size    = offsets[tid + 1] - offsets[tid];
-    size_type output_size = 0;
+    size_type const src_size = offsets[tid + 1] - offsets[tid];
+    size_type output_size    = 0;
     if (src_size > 0) {
-      char* dst       = out_buf ? out_buf + output_offsets[tid] : nullptr;
-      size_t dst_size = out_buf ? output_offsets[tid + 1] - output_offsets[tid] : 0;
+      char* dst             = out_buf ? out_buf + output_offsets[tid] : nullptr;
+      size_t const dst_size = out_buf ? output_offsets[tid + 1] - output_offsets[tid] : 0;
 
-      json_output out =
+      parse_result result;
+      json_output out;
+      thrust::tie(result, out) =
         get_json_object_single(chars + offsets[tid], src_size, commands, dst, dst_size);
       output_size = out.output_len;
-      if (out.element_count > 0) { is_valid = true; }
+      if (out.element_count > 0 && result == parse_result::SUCCESS) { is_valid = true; }
     }
 
     // filled in only during the precompute step
@@ -857,6 +777,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
   CUDF_EXPECTS(std::get<1>(preprocess) <= max_command_stack_depth,
                "Encountered json_path string that is too complex");
 
+  // allocate output offsets buffer.
   auto offsets = cudf::make_fixed_width_column(
     data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr);
   cudf::mutable_column_view offsets_view(*offsets);
@@ -880,7 +801,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
 
   cudf::detail::grid_1d const grid{col.size(), 512};
 
-  // preprocess sizes
+  // preprocess sizes (returned in the offsets buffer)
   get_json_object_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
     col.chars().head<char>(),
     col.offsets().head<size_type>(),
@@ -896,7 +817,8 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
                          offsets_view.head<size_type>() + col.size() + 1,
                          offsets_view.head<size_type>(),
                          0);
-  size_type output_size = cudf::detail::get_value<size_type>(offsets_view, col.size(), stream);
+  size_type const output_size =
+    cudf::detail::get_value<size_type>(offsets_view, col.size(), stream);
 
   // allocate output string column
   auto chars = cudf::make_fixed_width_column(
diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp
index bb33b0d8289..3c9aff72337 100644
--- a/cpp/tests/strings/json_tests.cpp
+++ b/cpp/tests/strings/json_tests.cpp
@@ -15,189 +15,299 @@
  */
 
 #include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/strings/replace.hpp>
+#include <cudf/strings/strings_column_view.hpp>
 #include <cudf/strings/substring.hpp>
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 
+// reference:  https://jsonpath.herokuapp.com/
 /*
-const char* json_string = "{
+  {
     "store": {
         "book": [
-            {
+              {
                 "category": "reference",
                 "author": "Nigel Rees",
                 "title": "Sayings of the Century",
                 "price": 8.95
-            },
-            {
+              },
+              {
                 "category": "fiction",
                 "author": "Evelyn Waugh",
                 "title": "Sword of Honour",
                 "price": 12.99
-            },
-            {
+              },
+              {
                 "category": "fiction",
                 "author": "Herman Melville",
                 "title": "Moby Dick",
                 "isbn": "0-553-21311-3",
                 "price": 8.99
-            },
-            {
+              },
+              {
                 "category": "fiction",
                 "author": "J. R. R. Tolkien",
                 "title": "The Lord of the Rings",
                 "isbn": "0-395-19395-8",
                 "price": 22.99
-            }
+              }
         ],
         "bicycle": {
-            "color": "red",
-            "price": 19.95
+              "color": "red",
+              "price": 19.95
         }
     },
-}";
+    "expensive": 10
+  }
 */
 
+// this string is formatted to result in a reasonably readable debug printf
+std::string json_string{
+  "{\n\"store\": {\n\t\"book\": [\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": "
+  "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": "
+  "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn "
+  "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": "
+  "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman "
+  "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": "
+  "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": "
+  "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the "
+  "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": "
+  "22.99\n\t\t}\n\t],\n\t\"bicycle\": {\n\t\t\"color\": \"red\",\n\t\t\"price\": "
+  "19.95\n\t}\n},\n\"expensive\": 10\n}"};
+
+std::unique_ptr<cudf::column> drop_whitespace(cudf::column_view const& col)
+{
+  cudf::test::strings_column_wrapper whitespace{"\n", "\r", "\t"};
+  cudf::test::strings_column_wrapper repl{"", "", ""};
+
+  cudf::strings_column_view strings(col);
+  cudf::strings_column_view targets(whitespace);
+  cudf::strings_column_view replacements(repl);
+  return cudf::strings::replace(strings, targets, replacements);
+}
+
 struct JsonTests : public cudf::test::BaseFixture {
 };
 
-TEST_F(JsonTests, GetJsonObject)
+TEST_F(JsonTests, GetJsonObjectRootOp)
 {
-  // reference:  https://jsonpath.herokuapp.com/
-  // clang-format off
-   /*
-   {
-      "store": {
-         "book": [
-               {
-                  "category": "reference",
-                  "author": "Nigel Rees",
-                  "title": "Sayings of the Century",
-                  "price": 8.95
-               },
-               {
-                  "category": "fiction",
-                  "author": "Evelyn Waugh",
-                  "title": "Sword of Honour",
-                  "price": 12.99
-               },
-               {
-                  "category": "fiction",
-                  "author": "Herman Melville",
-                  "title": "Moby Dick",
-                  "isbn": "0-553-21311-3",
-                  "price": 8.99
-               },
-               {
-                  "category": "fiction",
-                  "author": "J. R. R. Tolkien",
-                  "title": "The Lord of the Rings",
-                  "isbn": "0-395-19395-8",
-                  "price": 22.99
-               }
-         ],
-         "bicycle": {
-               "color": "red",
-               "price": 19.95
-         }
-      },
-      "expensive": 10
-   }
-   */
-  // clang-format on
-  // this string is formatted to result in a reasonably readable debug printf
-  const char* json_string =
-    "{\n\"store\": {\n\t\"book\": [\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": "
-    "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": "
-    "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn "
-    "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": "
-    "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman "
-    "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": "
-    "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": "
-    "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the "
-    "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": "
-    "22.99\n\t\t}\n\t],\n\t\"bicycle\": {\n\t\t\"color\": \"red\",\n\t\t\"price\": "
-    "19.95\n\t}\n},\n\"expensive\": 10\n}";
+  // root
+  cudf::test::strings_column_wrapper input{json_string};
+  std::string json_path("$");
+  auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+  auto result     = drop_whitespace(*result_raw);
 
-  {
-    cudf::test::strings_column_wrapper input{json_string};
-    std::string json_path("$");
-    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+  auto expected = drop_whitespace(input);
 
-    cudf::test::print(*result);
-  }
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
+}
 
+TEST_F(JsonTests, GetJsonObjectChildOp)
+{
   {
     cudf::test::strings_column_wrapper input{json_string};
     std::string json_path("$.store");
-    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
-
-    cudf::test::print(*result);
+    auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result     = drop_whitespace(*result_raw);
+
+    cudf::test::strings_column_wrapper expected_raw{
+      "{\n\t\"book\": [\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": "
+      "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": "
+      "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn "
+      "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": "
+      "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman "
+      "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": "
+      "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": "
+      "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the "
+      "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": "
+      "22.99\n\t\t}\n\t],\n\t\"bicycle\": {\n\t\t\"color\": \"red\",\n\t\t\"price\": "
+      "19.95\n\t}\n}"};
+    auto expected = drop_whitespace(expected_raw);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
   }
 
   {
     cudf::test::strings_column_wrapper input{json_string};
     std::string json_path("$.store.book");
-    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
-
-    cudf::test::print(*result);
+    auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result     = drop_whitespace(*result_raw);
+
+    cudf::test::strings_column_wrapper expected_raw{
+      "[\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": "
+      "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": "
+      "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn "
+      "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": "
+      "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman "
+      "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": "
+      "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": "
+      "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the "
+      "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": "
+      "22.99\n\t\t}\n\t]"};
+    auto expected = drop_whitespace(expected_raw);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
   }
+}
 
+TEST_F(JsonTests, GetJsonObjectWildcardOp)
+{
   {
     cudf::test::strings_column_wrapper input{json_string};
     std::string json_path("$.store.*");
-    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result     = drop_whitespace(*result_raw);
+
+    cudf::test::strings_column_wrapper expected_raw{
+      "[[\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": "
+      "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": "
+      "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn "
+      "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": "
+      "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman "
+      "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": "
+      "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": "
+      "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the "
+      "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": "
+      "22.99\n\t\t}\n\t],\n\t{\n\t\t\"color\": \"red\",\n\t\t\"price\": "
+      "19.95\n\t}]"};
+    auto expected = drop_whitespace(expected_raw);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
+  }
 
-    cudf::test::print(*result);
+  {
+    cudf::test::strings_column_wrapper input{json_string};
+    std::string json_path("*");
+    auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result     = drop_whitespace(*result_raw);
+
+    cudf::test::strings_column_wrapper expected_raw{
+      "[{\"book\": [\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": "
+      "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": "
+      "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn "
+      "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": "
+      "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman "
+      "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": "
+      "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": "
+      "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the "
+      "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": "
+      "22.99\n\t\t}\n\t],\n\t\"bicycle\": {\n\t\t\"color\": \"red\",\n\t\t\"price\": "
+      "19.95\n\t}\n},10]"};
+    auto expected = drop_whitespace(expected_raw);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
   }
+}
 
+TEST_F(JsonTests, GetJsonObjectSubscriptOp)
+{
   {
     cudf::test::strings_column_wrapper input{json_string};
-    std::string json_path("$.store.book[*]");
-    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    std::string json_path("$.store.book[2]");
+    auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result     = drop_whitespace(*result_raw);
 
-    cudf::test::print(*result);
+    cudf::test::strings_column_wrapper expected_raw{
+      "{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman "
+      "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": "
+      "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t}"};
+    auto expected = drop_whitespace(expected_raw);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
   }
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    std::string json_path("$.store.book[*].category");
-    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    std::string json_path("$.store['bicycle']");
+    auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result     = drop_whitespace(*result_raw);
 
-    cudf::test::print(*result);
+    cudf::test::strings_column_wrapper expected_raw{
+      "{\n\t\t\"color\": \"red\",\n\t\t\"price\": "
+      "19.95\n\t}"};
+    auto expected = drop_whitespace(expected_raw);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
   }
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    std::string json_path("$.store.book[*].title");
-    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    std::string json_path("$.store.book[*]");
+    auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result     = drop_whitespace(*result_raw);
+
+    cudf::test::strings_column_wrapper expected_raw{
+      "[\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": "
+      "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": "
+      "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn "
+      "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": "
+      "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman "
+      "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": "
+      "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": "
+      "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the "
+      "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": "
+      "22.99\n\t\t}\n\t]"};
+    auto expected = drop_whitespace(expected_raw);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
+  }
+}
 
-    cudf::test::print(*result);
+TEST_F(JsonTests, GetJsonObjectFilter)
+{
+  // queries that result in filtering/collating results (mostly meaning - generates new
+  // json instead of just returning parts of the existing string)
+
+  {
+    cudf::test::strings_column_wrapper input{json_string};
+    std::string json_path("$.store.book[*]['isbn']");
+    auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result     = drop_whitespace(*result_raw);
+
+    cudf::test::strings_column_wrapper expected_raw{"[\"0-553-21311-3\",\"0-395-19395-8\"]"};
+    auto expected = drop_whitespace(expected_raw);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
   }
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    std::string json_path("$.store['bicycle']");
-    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    std::string json_path("$.store.book[*].category");
+    auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result     = drop_whitespace(*result_raw);
+
+    cudf::test::strings_column_wrapper expected_raw{
+      "[\"reference\",\"fiction\",\"fiction\",\"fiction\"]"};
+    auto expected = drop_whitespace(expected_raw);
 
-    cudf::test::print(*result);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
   }
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    std::string json_path("$.store.book[*]['isbn']");
-    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    std::string json_path("$.store.book[*].title");
+    auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result     = drop_whitespace(*result_raw);
+
+    cudf::test::strings_column_wrapper expected_raw{
+      "[\"Sayings of the Century\",\"Sword of Honour\",\"Moby Dick\",\"The Lord of the Rings\"]"};
+    auto expected = drop_whitespace(expected_raw);
 
-    cudf::test::print(*result);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
   }
 
   {
     cudf::test::strings_column_wrapper input{json_string};
-    std::string json_path("$.store.book[2]");
-    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    std::string json_path("$.store.book.*.price");
+    auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result     = drop_whitespace(*result_raw);
 
-    cudf::test::print(*result);
+    cudf::test::strings_column_wrapper expected_raw{"[8.95,12.99,8.99,22.99]"};
+    auto expected = drop_whitespace(expected_raw);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
   }
 
   {
@@ -206,56 +316,191 @@ TEST_F(JsonTests, GetJsonObject)
     //  spark:        fiction
     cudf::test::strings_column_wrapper input{json_string};
     std::string json_path("$.store.book[2].category");
-    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result     = drop_whitespace(*result_raw);
 
-    cudf::test::print(*result);
+    cudf::test::strings_column_wrapper expected_raw{"fiction"};
+    auto expected = drop_whitespace(expected_raw);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
   }
+}
 
+TEST_F(JsonTests, GetJsonObjectNullInputs)
+{
   {
-    char const* str = "{\"a\" : \"b\"}";
-    cudf::test::strings_column_wrapper input{{str, str, str, str}, {1, 0, 1, 0}};
+    std::string str("{\"a\" : \"b\"}");
+    cudf::test::strings_column_wrapper input({str, str, str, str}, {1, 0, 1, 0});
 
     std::string json_path("$.a");
-    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    auto result     = drop_whitespace(*result_raw);
 
-    cudf::test::print(*result);
+    cudf::test::strings_column_wrapper expected_raw({"b", "", "b", ""}, {1, 0, 1, 0});
+    auto expected = drop_whitespace(expected_raw);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
   }
+}
 
+TEST_F(JsonTests, GetJsonObjectEmptyQuery)
+{
   // empty query -> null
   {
-    cudf::test::strings_column_wrapper input{""};
+    cudf::test::strings_column_wrapper input{"{\"a\" : \"b\"}"};
     std::string json_path("");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
 
-    cudf::test::print(*result);
+    cudf::test::strings_column_wrapper expected({""}, {0});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
   }
+}
 
+TEST_F(JsonTests, GetJsonObjectEmptyInputsAndOutputs)
+{
   // empty input -> null
   {
     cudf::test::strings_column_wrapper input{""};
     std::string json_path("$");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
 
-    cudf::test::print(*result);
+    cudf::test::strings_column_wrapper expected({""}, {0});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  }
+
+  // slightly different from "empty output". in this case, we're
+  // returning something, but it happens to be empty. so we expect
+  // a valid, but empty row
+  {
+    cudf::test::strings_column_wrapper input{"{\"store\": { \"bicycle\" : \"\" } }"};
+    std::string json_path("$.store.bicycle");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::strings_column_wrapper expected({""}, {1});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  }
+}
+
+// badly formed JSONpath strings
+TEST_F(JsonTests, GetJsonObjectIllegalQuery)
+{
+  // can't have more than one root operator, or a root operator anywhere other
+  // than the beginning
+  {
+    cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"};
+    std::string json_path("$$");
+    auto query = [&]() {
+      auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    };
+    EXPECT_THROW(query(), cudf::logic_error);
+  }
+
+  // invalid index
+  {
+    cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"};
+    std::string json_path("$[auh46h-]");
+    auto query = [&]() {
+      auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    };
+    EXPECT_THROW(query(), cudf::logic_error);
   }
 
-  // empty output -> null
+  // invalid index
+  {
+    cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"};
+    std::string json_path("$[[]]");
+    auto query = [&]() {
+      auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    };
+    EXPECT_THROW(query(), cudf::logic_error);
+  }
+
+  // negative index
+  {
+    cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"};
+    std::string json_path("$[-1]");
+    auto query = [&]() {
+      auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    };
+    EXPECT_THROW(query(), cudf::logic_error);
+  }
+
+  // child operator with no name specified
+  {
+    cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"};
+    std::string json_path(".");
+    auto query = [&]() {
+      auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    };
+    EXPECT_THROW(query(), cudf::logic_error);
+  }
+
+  {
+    cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"};
+    std::string json_path("][");
+    auto query = [&]() {
+      auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    };
+    EXPECT_THROW(query(), cudf::logic_error);
+  }
+
+  {
+    cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"};
+    std::string json_path("6hw6,56i3");
+    auto query = [&]() {
+      auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+    };
+    EXPECT_THROW(query(), cudf::logic_error);
+  }
+}
+
+// queries that are legal, but reference invalid parts of the input
+TEST_F(JsonTests, GetJsonObjectInvalidQuery)
+{
+  // non-existent field
   {
     cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"};
     std::string json_path("$[*].c");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
 
-    cudf::test::print(*result);
+    cudf::test::strings_column_wrapper expected({""}, {0});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
   }
 
-  // slightly different from "empty output". in this case, we're
-  // returning something, but it happens to be empty. so we expect
-  // a valid, but empty row
+  // non-existent field
   {
-    cudf::test::strings_column_wrapper input{"{\"store\": { \"bicycle\" : \"\" } }"};
-    std::string json_path("$.store.bicycle");
+    cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"};
+    std::string json_path("$[*].c[2]");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::strings_column_wrapper expected({""}, {0});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  }
+
+  // non-existent field
+  {
+    cudf::test::strings_column_wrapper input{json_string};
+    std::string json_path("$.store.book.price");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
 
-    cudf::test::print(*result);
+    cudf::test::strings_column_wrapper expected({""}, {0});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  }
+
+  // out of bounds index
+  {
+    cudf::test::strings_column_wrapper input{json_string};
+    std::string json_path("$.store.book[4]");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    cudf::test::strings_column_wrapper expected({""}, {0});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
   }
 }

From b1a2b0980816d094cd7099240cf8c9a11fa8fe55 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Wed, 24 Mar 2021 21:12:39 -0500
Subject: [PATCH 16/33] Update meta.yaml

---
 conda/recipes/libcudf/meta.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 5657d21889f..7188065eca0 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -176,6 +176,7 @@ test:
     - test -f $PREFIX/include/cudf/strings/detail/copying.hpp
     - test -f $PREFIX/include/cudf/strings/detail/fill.hpp
     - test -f $PREFIX/include/cudf/strings/detail/replace.hpp
+    - test -f $PREFIX/include/cudf/strings/detail/substring.hpp
     - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp
     - test -f $PREFIX/include/cudf/strings/extract.hpp
     - test -f $PREFIX/include/cudf/strings/findall.hpp

From 02e20b7904fb09f5fd1047b860d2dcfb5f2e8776 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Wed, 24 Mar 2021 21:42:55 -0500
Subject: [PATCH 17/33] Additional docs and cleanup

---
 cpp/benchmarks/string/json_benchmark.cpp      |   2 +-
 cpp/include/cudf/strings/detail/substring.hpp |   2 +-
 cpp/src/strings/json/json_path.cu             | 116 +++++++++++++-----
 3 files changed, 87 insertions(+), 33 deletions(-)

diff --git a/cpp/benchmarks/string/json_benchmark.cpp b/cpp/benchmarks/string/json_benchmark.cpp
index df1aadef404..dbb33927716 100644
--- a/cpp/benchmarks/string/json_benchmark.cpp
+++ b/cpp/benchmarks/string/json_benchmark.cpp
@@ -137,4 +137,4 @@ JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]");
 JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category");
 JSON_BENCHMARK_DEFINE(query6, "$.store.['bicycle']");
 JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']");
-JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]");
\ No newline at end of file
+JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]");
diff --git a/cpp/include/cudf/strings/detail/substring.hpp b/cpp/include/cudf/strings/detail/substring.hpp
index a646d93e2b8..35b7dec6449 100644
--- a/cpp/include/cudf/strings/detail/substring.hpp
+++ b/cpp/include/cudf/strings/detail/substring.hpp
@@ -27,7 +27,7 @@ namespace strings {
 namespace detail {
 
 /**
- * @copydoc cudf::get_json_object
+ * @copydoc cudf::strings::get_json_object
  *
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index 8cb3c7bb25b..880fe167d02 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -53,7 +53,7 @@ constexpr int DEBUG_NEWLINE_LEN = 0;
 using namespace cudf;
 
 /**
- * @brief Result of calling a parse type function.
+ * @brief Result of calling a parse function.
  *
  * The primary use of this is to distinguish between "success" and
  * "success but no data" return cases.  For example, if you are reading the
@@ -193,6 +193,10 @@ class parser {
   }
 };
 
+/**
+ * @brief Output buffer object.  Used during the preprocess/size-computation step
+ * and the actual output step.
+ */
 struct json_output {
   size_t output_max_len;
   size_t output_len;
@@ -210,25 +214,20 @@ struct json_output {
 
 enum json_element_type { NONE, OBJECT, ARRAY, VALUE };
 
+/**
+ * @brief Parsing class that holds the current state of the json to be parsed and provides
+ * functions for navigating through it.
+ */
 class json_state : private parser {
  public:
-  constexpr json_state()
-    : parser(),
-      element(json_element_type::NONE),
-      cur_el_start(nullptr),
-      cur_el_type(json_element_type::NONE)
-  {
-  }
+  constexpr json_state() : parser(), cur_el_start(nullptr), cur_el_type(json_element_type::NONE) {}
   constexpr json_state(const char* _input, int64_t _input_len)
-    : parser(_input, _input_len),
-      element(json_element_type::NONE),
-      cur_el_start(nullptr),
-      cur_el_type(json_element_type::NONE)
+    : parser(_input, _input_len), cur_el_start(nullptr), cur_el_type(json_element_type::NONE)
   {
   }
 
   constexpr json_state(json_state const& j)
-    : parser(j), element(j.element), cur_el_start(j.cur_el_start), cur_el_type(j.cur_el_type)
+    : parser(j), cur_el_start(j.cur_el_start), cur_el_type(j.cur_el_type)
   {
   }
 
@@ -287,10 +286,13 @@ class json_state : private parser {
     return parse_result::SUCCESS;
   }
 
+  // skip the next element
   constexpr parse_result skip_element() { return extract_element(nullptr, false); }
 
+  // advance to the next element
   constexpr parse_result next_element() { return next_element_internal(false); }
 
+  // advance inside the current element
   constexpr parse_result child_element(bool as_field = false)
   {
     // cannot retrieve a field from an array
@@ -298,6 +300,7 @@ class json_state : private parser {
     return next_element_internal(true);
   }
 
+  // return the next element that matches the specified name.
   constexpr parse_result next_matching_element(json_string const& name, bool inclusive)
   {
     // if we're not including the current element, skip it
@@ -323,6 +326,7 @@ class json_state : private parser {
   }
 
  private:
+  // parse a value - either a string or a number/null/bool
   constexpr parse_result parse_value()
   {
     if (!parse_whitespace()) { return parse_result::ERROR; }
@@ -376,28 +380,37 @@ class json_state : private parser {
     return parse_result::SUCCESS;
   }
 
-  json_element_type element;
-  const char* cur_el_start;
-  json_string cur_el_name;
-  json_element_type cur_el_type;
+  const char* cur_el_start;       // pointer to the first character of the -value- of the current
+                                  // element - not the name
+  json_string cur_el_name;        // name of the current element (if applicable)
+  json_element_type cur_el_type;  // type of the current element
 };
 
 enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END };
 
+/**
+ * @brief A "command" operator used to query a json string.  A full query is
+ * an array of these operators applied to the incoming json string.
+ */
 struct path_operator {
   constexpr path_operator() : type(path_operator_type::ERROR), index(-1) {}
   constexpr path_operator(path_operator_type _type) : type(_type), index(-1) {}
 
-  path_operator_type type;
-  json_string name;
-  int index;
+  path_operator_type type;  // operator type
+  json_string name;         // name to match against (if applicable)
+  int index;                // index for subscript operator
 };
 
-// current state of the JSONPath
+/**
+ * @brief Parsing class that holds the current state of the JSONPath string to be parsed
+ * and provides functions for navigating through it. This is only called on the host
+ * during the preprocess step which builds a command buffer that the gpu uses.
+ */
 class path_state : private parser {
  public:
   path_state(const char* _path, size_t _path_len) : parser(_path, _path_len) {}
 
+  // get the next operator in the JSONPath string
   path_operator get_next_operator()
   {
     if (eof()) { return {path_operator_type::END}; }
@@ -498,6 +511,15 @@ class path_state : private parser {
   }
 };
 
+/**
+ * @brief Preprocess the incoming JSONPath string on the host to generate a
+ * command buffer for use by the GPU.
+ *
+ * @param json_path The incoming json path
+ * @param stream Cuda stream to perform any gpu actions on
+ * @returns A tuple containing the command buffer, the maximum stack depth required and whether or
+ * not the command buffer is empty.
+ */
 std::tuple<rmm::device_uvector<path_operator>, int, bool> build_command_buffer(
   cudf::string_scalar const& json_path, rmm::cuda_stream_view stream)
 {
@@ -547,18 +569,25 @@ std::tuple<rmm::device_uvector<path_operator>, int, bool> build_command_buffer(
     if (last_result == parse_result::ERROR) { return parse_result::ERROR; } \
   } while (0)
 
+/**
+ * @brief Parse a single json string using the provided command buffer
+ *
+ * @param j_state The incoming json string and associated parser
+ * @param commands The command buffer to be applied to the string. Always ends with a
+ * path_operator_type::END
+ * @param output Buffer used to store the results of the query
+ * @returns A result code indicating success/fail/empty.
+ */
 template <int max_command_stack_depth>
-__device__ parse_result parse_json_path(json_state& _j_state,
-                                        path_operator const* _commands,
-                                        json_output& output,
-                                        bool _list_element = false)
+__device__ parse_result parse_json_path(json_state& j_state,
+                                        path_operator const* commands,
+                                        json_output& output)
 {
   // manually maintained context stack in lieu of calling parse_json_path recursively.
   struct context {
     json_state j_state;
     path_operator const* commands;
     bool list_element;
-    // int element_count;
     bool state_flag;
   };
   context stack[max_command_stack_depth];
@@ -566,11 +595,9 @@ __device__ parse_result parse_json_path(json_state& _j_state,
   auto push_context = [&stack, &stack_pos](json_state const& _j_state,
                                            path_operator const* _commands,
                                            bool _list_element = false,
-                                           /* int _element_count = 0,*/
-                                           bool _state_flag = false) {
+                                           bool _state_flag   = false) {
     if (stack_pos == max_command_stack_depth - 1) { return false; }
-    stack[stack_pos++] =
-      context{_j_state, _commands, _list_element, /*_element_count,*/ _state_flag};
+    stack[stack_pos++] = context{_j_state, _commands, _list_element, _state_flag};
     return true;
   };
   auto pop_context = [&stack, &stack_pos](context& c) {
@@ -580,7 +607,7 @@ __device__ parse_result parse_json_path(json_state& _j_state,
     }
     return false;
   };
-  push_context(_j_state, _commands, _list_element);
+  push_context(j_state, commands, false);
 
   parse_result last_result = parse_result::SUCCESS;
   context ctx;
@@ -703,6 +730,20 @@ __device__ parse_result parse_json_path(json_state& _j_state,
 // a jsonpath containing 7 nested wildcards so this is probably reasonable.
 constexpr int max_command_stack_depth = 8;
 
+/**
+ * @brief Parse a single json string using the provided command buffer
+ *
+ * This function exists primarily as a shim for debugging purposes.
+ *
+ * @param input The incoming json string
+ * @param input_len Size of the incoming json string
+ * @param commands The command buffer to be applied to the string. Always ends with a
+ * path_operator_type::END
+ * @param out_buf Buffer used to store the results of the query (nullptr in the size computation
+ * step)
+ * @param out_buf_size Size of the output buffer
+ * @returns A pair containing the result code and the output buffer.
+ */
 __device__ thrust::pair<parse_result, json_output> get_json_object_single(
   char const* input,
   size_t input_len,
@@ -725,6 +766,13 @@ __device__ thrust::pair<parse_result, json_output> get_json_object_single(
  * output sizes.  On the second pass it fills in the provided output buffers
  * (chars and validity)
  *
+ * @param chars The chars child column of the incoming strings column
+ * @param offsets The offsets of the incoming strings column
+ * @param commands JSONPath command buffer
+ * @param out_buf Buffer used to store the results of the query (nullptr in the size computation
+ * step)
+ * @param out_validity Output validity buffer (nullptr in the size computation step)
+ * @param num_rows Number of rows in the input column
  */
 __global__ void get_json_object_kernel(char const* chars,
                                        size_type const* offsets,
@@ -766,6 +814,9 @@ __global__ void get_json_object_kernel(char const* chars,
   }
 }
 
+/**
+ * @copydoc cudf::strings::detail::get_json_object
+ */
 std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& col,
                                               cudf::string_scalar const& json_path,
                                               rmm::cuda_stream_view stream,
@@ -852,6 +903,9 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
 }  // namespace
 }  // namespace detail
 
+/**
+ * @copydoc cudf::strings::get_json_object
+ */
 std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& col,
                                               cudf::string_scalar const& json_path,
                                               rmm::mr::device_memory_resource* mr)

From fd330feed0b416adcbb33df1c30d72f3effc52aa Mon Sep 17 00:00:00 2001
From: Raza Jafri <razajafri@users.noreply.github.com>
Date: Thu, 25 Mar 2021 09:36:56 -0700
Subject: [PATCH 18/33] Update java/src/main/native/src/ColumnViewJni.cpp

Co-authored-by: Jason Lowe <jlowe@nvidia.com>
---
 java/src/main/native/src/ColumnViewJni.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index c57d4c1fbd9..028e9f5e640 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -1803,7 +1803,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env
    JNI_NULL_CHECK(env, j_scalar_handle, "path cannot be null", 0);
 
   try {
-
+    cudf::jni::auto_set_device(env);
     cudf::column_view* n_column_view = reinterpret_cast<cudf::column_view*>(j_view_handle);
     cudf::strings_column_view n_strings_col_view(*n_column_view);
     cudf::string_scalar *n_scalar_path = reinterpret_cast<cudf::string_scalar *>(j_scalar_handle);

From 5229790ede1beb4783ed68b31be934009e6e241c Mon Sep 17 00:00:00 2001
From: Raza Jafri <razajafri@users.noreply.github.com>
Date: Thu, 25 Mar 2021 09:37:05 -0700
Subject: [PATCH 19/33] Update java/src/main/native/src/ColumnViewJni.cpp

Co-authored-by: Jason Lowe <jlowe@nvidia.com>
---
 java/src/main/native/src/ColumnViewJni.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 028e9f5e640..fdfbee116cd 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -1796,7 +1796,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env, jobject j_object, 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env, jclass, 
                                                                      jlong j_view_handle, jlong j_scalar_handle) {
 
    JNI_NULL_CHECK(env, j_view_handle, "view cannot be null", 0);

From 94658646b86578b1794744574019f56e44dbcbc0 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Thu, 25 Mar 2021 16:14:39 -0500
Subject: [PATCH 20/33] Fix spelling.

---
 cpp/src/strings/json/json_path.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index 880fe167d02..a93dfc46f72 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -702,7 +702,7 @@ __device__ parse_result parse_json_path(json_state& j_state,
               PARSE_TRY(ctx.j_state.next_matching_element(any, false));
               if (last_result == parse_result::EMPTY) { break; }
             }
-            // if we didn't end up at the index we requested, this is an invalid indexe
+            // if we didn't end up at the index we requested, this is an invalid index
             if (idx - 1 != op.index) { return parse_result::ERROR; }
             push_context(ctx.j_state, ctx.commands + 1, ctx.list_element);
           }

From 4e4865b28714522050e0a4606458f5ae18ef43fe Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Fri, 26 Mar 2021 11:15:29 -0500
Subject: [PATCH 21/33] Make larger test strings more human readable.

---
 cpp/tests/strings/json_tests.cpp | 346 +++++++++++++++++++++----------
 1 file changed, 235 insertions(+), 111 deletions(-)

diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp
index 3c9aff72337..e745b97de2d 100644
--- a/cpp/tests/strings/json_tests.cpp
+++ b/cpp/tests/strings/json_tests.cpp
@@ -23,59 +23,48 @@
 #include <cudf_test/column_wrapper.hpp>
 
 // reference:  https://jsonpath.herokuapp.com/
-/*
-  {
-    "store": {
-        "book": [
-              {
-                "category": "reference",
-                "author": "Nigel Rees",
-                "title": "Sayings of the Century",
-                "price": 8.95
-              },
-              {
-                "category": "fiction",
-                "author": "Evelyn Waugh",
-                "title": "Sword of Honour",
-                "price": 12.99
-              },
-              {
-                "category": "fiction",
-                "author": "Herman Melville",
-                "title": "Moby Dick",
-                "isbn": "0-553-21311-3",
-                "price": 8.99
-              },
-              {
-                "category": "fiction",
-                "author": "J. R. R. Tolkien",
-                "title": "The Lord of the Rings",
-                "isbn": "0-395-19395-8",
-                "price": 22.99
-              }
-        ],
-        "bicycle": {
-              "color": "red",
-              "price": 19.95
-        }
-    },
-    "expensive": 10
-  }
-*/
 
-// this string is formatted to result in a reasonably readable debug printf
+// clang-format off
 std::string json_string{
-  "{\n\"store\": {\n\t\"book\": [\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": "
-  "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": "
-  "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn "
-  "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": "
-  "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman "
-  "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": "
-  "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": "
-  "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the "
-  "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": "
-  "22.99\n\t\t}\n\t],\n\t\"bicycle\": {\n\t\t\"color\": \"red\",\n\t\t\"price\": "
-  "19.95\n\t}\n},\n\"expensive\": 10\n}"};
+  "{" 
+    "\"store\": {"
+      "\"book\": ["
+        "{"
+          "\"category\": \"reference\","
+          "\"author\": \"Nigel Rees\","
+          "\"title\": \"Sayings of the Century\","
+          "\"price\": 8.95"
+        "},"
+        "{"
+          "\"category\": \"fiction\","
+          "\"author\": \"Evelyn Waugh\","
+          "\"title\": \"Sword of Honour\","
+          "\"price\": 12.99"
+        "},"
+        "{"
+          "\"category\": \"fiction\","
+          "\"author\": \"Herman Melville\","
+          "\"title\": \"Moby Dick\","
+          "\"isbn\": \"0-553-21311-3\","
+          "\"price\": 8.99"
+        "},"
+        "{"
+          "\"category\": \"fiction\","
+          "\"author\": \"J. R. R. Tolkien\","
+          "\"title\": \"The Lord of the Rings\","
+          "\"isbn\": \"0-395-19395-8\","
+          "\"price\": 22.99"
+        "}"
+      "],"
+      "\"bicycle\": {"
+        "\"color\": \"red\","
+        "\"price\": 19.95"
+      "}"
+    "},"
+    "\"expensive\": 10"
+  "}"
+};
+// clang-format on
 
 std::unique_ptr<cudf::column> drop_whitespace(cudf::column_view const& col)
 {
@@ -112,18 +101,44 @@ TEST_F(JsonTests, GetJsonObjectChildOp)
     auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
     auto result     = drop_whitespace(*result_raw);
 
-    cudf::test::strings_column_wrapper expected_raw{
-      "{\n\t\"book\": [\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": "
-      "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": "
-      "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn "
-      "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": "
-      "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman "
-      "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": "
-      "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": "
-      "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the "
-      "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": "
-      "22.99\n\t\t}\n\t],\n\t\"bicycle\": {\n\t\t\"color\": \"red\",\n\t\t\"price\": "
-      "19.95\n\t}\n}"};
+    // clang-format off
+    cudf::test::strings_column_wrapper expected_raw{     
+      "{"
+        "\"book\": ["
+          "{"
+            "\"category\": \"reference\","
+            "\"author\": \"Nigel Rees\","
+            "\"title\": \"Sayings of the Century\","
+            "\"price\": 8.95"
+          "},"
+          "{"
+            "\"category\": \"fiction\","
+            "\"author\": \"Evelyn Waugh\","
+            "\"title\": \"Sword of Honour\","
+            "\"price\": 12.99"
+          "},"
+          "{"
+            "\"category\": \"fiction\","
+            "\"author\": \"Herman Melville\","
+            "\"title\": \"Moby Dick\","
+            "\"isbn\": \"0-553-21311-3\","
+            "\"price\": 8.99"
+          "},"
+          "{"
+            "\"category\": \"fiction\","
+            "\"author\": \"J. R. R. Tolkien\","
+            "\"title\": \"The Lord of the Rings\","
+            "\"isbn\": \"0-395-19395-8\","
+            "\"price\": 22.99"
+          "}"
+        "],"
+        "\"bicycle\": {"
+          "\"color\": \"red\","
+          "\"price\": 19.95"
+        "}"
+      "}"
+    };
+    // clang-format on
     auto expected = drop_whitespace(expected_raw);
 
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
@@ -135,17 +150,38 @@ TEST_F(JsonTests, GetJsonObjectChildOp)
     auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
     auto result     = drop_whitespace(*result_raw);
 
+    // clang-format off
     cudf::test::strings_column_wrapper expected_raw{
-      "[\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": "
-      "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": "
-      "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn "
-      "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": "
-      "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman "
-      "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": "
-      "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": "
-      "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the "
-      "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": "
-      "22.99\n\t\t}\n\t]"};
+      "["
+        "{"
+          "\"category\": \"reference\","
+          "\"author\": \"Nigel Rees\","
+          "\"title\": \"Sayings of the Century\","
+          "\"price\": 8.95"
+        "},"
+        "{"
+          "\"category\": \"fiction\","
+          "\"author\": \"Evelyn Waugh\","
+          "\"title\": \"Sword of Honour\","
+          "\"price\": 12.99"
+        "},"
+        "{"
+          "\"category\": \"fiction\","
+          "\"author\": \"Herman Melville\","
+          "\"title\": \"Moby Dick\","
+          "\"isbn\": \"0-553-21311-3\","
+          "\"price\": 8.99"
+        "},"
+        "{"
+          "\"category\": \"fiction\","
+          "\"author\": \"J. R. R. Tolkien\","
+          "\"title\": \"The Lord of the Rings\","
+          "\"isbn\": \"0-395-19395-8\","
+          "\"price\": 22.99"
+        "}"
+      "]"
+    };
+    // clang-format on
     auto expected = drop_whitespace(expected_raw);
 
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
@@ -160,18 +196,44 @@ TEST_F(JsonTests, GetJsonObjectWildcardOp)
     auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
     auto result     = drop_whitespace(*result_raw);
 
+    // clang-format off
     cudf::test::strings_column_wrapper expected_raw{
-      "[[\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": "
-      "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": "
-      "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn "
-      "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": "
-      "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman "
-      "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": "
-      "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": "
-      "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the "
-      "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": "
-      "22.99\n\t\t}\n\t],\n\t{\n\t\t\"color\": \"red\",\n\t\t\"price\": "
-      "19.95\n\t}]"};
+      "["
+        "["
+          "{"
+            "\"category\": \"reference\","
+            "\"author\": \"Nigel Rees\","
+            "\"title\": \"Sayings of the Century\","
+            "\"price\": 8.95"
+          "},"
+          "{"
+            "\"category\": \"fiction\","
+            "\"author\": \"Evelyn Waugh\","
+            "\"title\": \"Sword of Honour\","
+            "\"price\": 12.99"
+          "},"
+          "{"
+            "\"category\": \"fiction\","
+            "\"author\": \"Herman Melville\","
+            "\"title\": \"Moby Dick\","
+            "\"isbn\": \"0-553-21311-3\","
+            "\"price\": 8.99"
+          "},"
+          "{"
+            "\"category\": \"fiction\","
+            "\"author\": \"J. R. R. Tolkien\","
+            "\"title\": \"The Lord of the Rings\","
+            "\"isbn\": \"0-395-19395-8\","
+            "\"price\": 22.99"
+          "}"
+        "],"
+        "{"
+          "\"color\": \"red\","
+          "\"price\": 19.95"
+        "}"
+      "]"
+    };
+    // clang-format on
     auto expected = drop_whitespace(expected_raw);
 
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
@@ -183,18 +245,47 @@ TEST_F(JsonTests, GetJsonObjectWildcardOp)
     auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
     auto result     = drop_whitespace(*result_raw);
 
+    // clang-format off
     cudf::test::strings_column_wrapper expected_raw{
-      "[{\"book\": [\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": "
-      "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": "
-      "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn "
-      "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": "
-      "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman "
-      "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": "
-      "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": "
-      "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the "
-      "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": "
-      "22.99\n\t\t}\n\t],\n\t\"bicycle\": {\n\t\t\"color\": \"red\",\n\t\t\"price\": "
-      "19.95\n\t}\n},10]"};
+      "[" 
+        "{"
+          "\"book\": ["
+            "{"
+              "\"category\": \"reference\","
+              "\"author\": \"Nigel Rees\","
+              "\"title\": \"Sayings of the Century\","
+              "\"price\": 8.95"
+            "},"
+            "{"
+              "\"category\": \"fiction\","
+              "\"author\": \"Evelyn Waugh\","
+              "\"title\": \"Sword of Honour\","
+              "\"price\": 12.99"
+            "},"
+            "{"
+              "\"category\": \"fiction\","
+              "\"author\": \"Herman Melville\","
+              "\"title\": \"Moby Dick\","
+              "\"isbn\": \"0-553-21311-3\","
+              "\"price\": 8.99"
+            "},"
+            "{"
+              "\"category\": \"fiction\","
+              "\"author\": \"J. R. R. Tolkien\","
+              "\"title\": \"The Lord of the Rings\","
+              "\"isbn\": \"0-395-19395-8\","
+              "\"price\": 22.99"
+            "}"
+          "],"
+          "\"bicycle\": {"
+            "\"color\": \"red\","
+            "\"price\": 19.95"
+          "}"
+        "},"
+        "10"
+      "]"
+    };
+    // clang-format on
     auto expected = drop_whitespace(expected_raw);
 
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
@@ -209,10 +300,17 @@ TEST_F(JsonTests, GetJsonObjectSubscriptOp)
     auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
     auto result     = drop_whitespace(*result_raw);
 
+    // clang-format off
     cudf::test::strings_column_wrapper expected_raw{
-      "{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman "
-      "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": "
-      "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t}"};
+      "{"
+        "\"category\": \"fiction\","
+        "\"author\": \"Herman Melville\","
+        "\"title\": \"Moby Dick\","
+        "\"isbn\": \"0-553-21311-3\","
+        "\"price\": 8.99"
+      "}"
+    };
+    // clang-format on
     auto expected = drop_whitespace(expected_raw);
 
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
@@ -224,9 +322,14 @@ TEST_F(JsonTests, GetJsonObjectSubscriptOp)
     auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
     auto result     = drop_whitespace(*result_raw);
 
+    // clang-format off
     cudf::test::strings_column_wrapper expected_raw{
-      "{\n\t\t\"color\": \"red\",\n\t\t\"price\": "
-      "19.95\n\t}"};
+      "{"
+        "\"color\": \"red\","
+        "\"price\": 19.95"
+      "}"
+    };
+    // clang-format on
     auto expected = drop_whitespace(expected_raw);
 
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
@@ -238,17 +341,38 @@ TEST_F(JsonTests, GetJsonObjectSubscriptOp)
     auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
     auto result     = drop_whitespace(*result_raw);
 
+    // clang-format off
     cudf::test::strings_column_wrapper expected_raw{
-      "[\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": "
-      "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": "
-      "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn "
-      "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": "
-      "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman "
-      "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": "
-      "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": "
-      "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the "
-      "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": "
-      "22.99\n\t\t}\n\t]"};
+      "["
+        "{"
+          "\"category\": \"reference\","
+          "\"author\": \"Nigel Rees\","
+          "\"title\": \"Sayings of the Century\","
+          "\"price\": 8.95"
+        "},"
+        "{"
+          "\"category\": \"fiction\","
+          "\"author\": \"Evelyn Waugh\","
+          "\"title\": \"Sword of Honour\","
+          "\"price\": 12.99"
+        "},"
+        "{"
+          "\"category\": \"fiction\","
+          "\"author\": \"Herman Melville\","
+          "\"title\": \"Moby Dick\","
+          "\"isbn\": \"0-553-21311-3\","
+          "\"price\": 8.99"
+        "},"
+        "{"
+          "\"category\": \"fiction\","
+          "\"author\": \"J. R. R. Tolkien\","
+          "\"title\": \"The Lord of the Rings\","
+          "\"isbn\": \"0-395-19395-8\","
+          "\"price\": 22.99"
+        "}"
+      "]"
+    };
+    // clang-format on
     auto expected = drop_whitespace(expected_raw);
 
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected);
@@ -503,4 +627,4 @@ TEST_F(JsonTests, GetJsonObjectInvalidQuery)
 
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
   }
-}
+}
\ No newline at end of file

From 3653d0d99f27b2a6d8a976cadda6805fb8f30f28 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Fri, 26 Mar 2021 14:24:04 -0500
Subject: [PATCH 22/33] PR review changes.  Changed get_json_object_kernel() to
 take a column_device_view instead of raw pointers for input data.

---
 cpp/src/strings/json/json_path.cu       | 96 +++++++++++++++----------
 cpp/tests/utilities/column_utilities.cu |  2 +-
 2 files changed, 59 insertions(+), 39 deletions(-)

diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index a93dfc46f72..a67ef865cb1 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -1,9 +1,27 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/bit.hpp>
@@ -50,8 +68,6 @@ constexpr int DEBUG_NEWLINE_LEN = 0;
 //
 #define SPARK_BEHAVIORS
 
-using namespace cudf;
-
 /**
  * @brief Result of calling a parse function.
  *
@@ -67,6 +83,25 @@ enum class parse_result {
   EMPTY,    // success, but no data
 };
 
+/**
+ * @brief A struct which represents a string.
+ *
+ * Typically used to point into a substring of a larger string, such as
+ * the input json itself.
+ *
+ * @code
+ * // where cur_pos is a pointer to the beginning of a name string in the
+ * // input json and name_size is the computed size.
+ * json_string name{cur_pos, name_size};
+ * @endcode
+ *
+ * Also used for parameter passing in a few cases:
+ *
+ * @code
+ * json_string wildcard{"*", 1};
+ * func(wildcard);
+ * @endcode
+ */
 struct json_string {
   const char* str;
   int64_t len;
@@ -766,36 +801,34 @@ __device__ thrust::pair<parse_result, json_output> get_json_object_single(
  * output sizes.  On the second pass it fills in the provided output buffers
  * (chars and validity)
  *
- * @param chars The chars child column of the incoming strings column
- * @param offsets The offsets of the incoming strings column
+ * @param col Device view of the incoming strings column
  * @param commands JSONPath command buffer
- * @param out_buf Buffer user to store the results of the query (nullptr in the size computation
+ * @param output_offsets Buffer used to store the string offsets for the results of the query
+ * (nullptr in the size computation step)
+ * @param out_buf Buffer used to store the results of the query (nullptr in the size computation
  * step)
  * @param out_validity Output validity buffer (nullptr in the size computation step)
- * @param num_rows Number of rows in the input column
  */
-__global__ void get_json_object_kernel(char const* chars,
-                                       size_type const* offsets,
+__global__ void get_json_object_kernel(column_device_view col,
                                        path_operator const* const commands,
                                        size_type* output_offsets,
                                        char* out_buf,
-                                       bitmask_type* out_validity,
-                                       size_type num_rows)
+                                       bitmask_type* out_validity)
 {
   uint64_t const tid = threadIdx.x + (blockDim.x * blockIdx.x);
 
   bool is_valid = false;
-  if (tid < num_rows) {
-    size_type const src_size = offsets[tid + 1] - offsets[tid];
-    size_type output_size    = 0;
-    if (src_size > 0) {
+  if (tid < col.size()) {
+    string_view const str = col.element<string_view>(tid);
+    size_type output_size = 0;
+    if (str.size_bytes() > 0) {
       char* dst             = out_buf ? out_buf + output_offsets[tid] : nullptr;
       size_t const dst_size = out_buf ? output_offsets[tid + 1] - output_offsets[tid] : 0;
 
       parse_result result;
       json_output out;
       thrust::tie(result, out) =
-        get_json_object_single(chars + offsets[tid], src_size, commands, dst, dst_size);
+        get_json_object_single(str.data(), str.size_bytes(), commands, dst, dst_size);
       output_size = out.output_len;
       if (out.element_count > 0 && result == parse_result::SUCCESS) { is_valid = true; }
     }
@@ -808,7 +841,7 @@ __global__ void get_json_object_kernel(char const* chars,
   if (out_validity) {
     uint32_t mask = __ballot_sync(0xffffffff, is_valid);
     // 0th lane of the warp writes the validity
-    if (!(tid % cudf::detail::warp_size) && tid < num_rows) {
+    if (!(tid % cudf::detail::warp_size) && tid < col.size()) {
       out_validity[cudf::word_index(tid)] = mask;
     }
   }
@@ -835,32 +868,21 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
 
   // if the query is empty, return a string column containing all nulls
   if (std::get<2>(preprocess)) {
-    thrust::generate(rmm::exec_policy(stream),
-                     offsets_view.head<size_type>(),
-                     offsets_view.head<size_type>() + offsets_view.size(),
-                     [] __device__() { return 0; });
-    return cudf::make_strings_column(
-      col.size(),
-      std::move(offsets),
-      cudf::make_fixed_width_column(
-        data_type{type_id::INT8}, 0, mask_state::UNALLOCATED, stream, mr),
+    return std::make_unique<column>(
+      data_type{type_id::STRING},
       col.size(),
+      rmm::device_buffer{0, stream, mr},  // no data
       cudf::detail::create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr),
-      stream,
-      mr);
+      col.size());  // null count
   }
 
   cudf::detail::grid_1d const grid{col.size(), 512};
 
+  auto cdv = column_device_view::create(col.parent(), stream);
+
   // preprocess sizes (returned in the offsets buffer)
   get_json_object_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-    col.chars().head<char>(),
-    col.offsets().head<size_type>(),
-    std::get<0>(preprocess).data(),
-    offsets_view.head<size_type>(),
-    nullptr,
-    nullptr,
-    col.size());
+    *cdv, std::get<0>(preprocess).data(), offsets_view.head<size_type>(), nullptr, nullptr);
 
   // convert sizes to offsets
   thrust::exclusive_scan(rmm::exec_policy(stream),
@@ -883,13 +905,11 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
   // compute results
   cudf::mutable_column_view chars_view(*chars);
   get_json_object_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-    col.chars().head<char>(),
-    col.offsets().head<size_type>(),
+    *cdv,
     std::get<0>(preprocess).data(),
     offsets_view.head<size_type>(),
     chars_view.head<char>(),
-    static_cast<bitmask_type*>(validity.data()),
-    col.size());
+    static_cast<bitmask_type*>(validity.data()));
 
   return make_strings_column(col.size(),
                              std::move(offsets),
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index cea66eced11..ddb3730b039 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -71,7 +71,7 @@ struct column_property_comparator {
 
     // equivalent, but not exactly equal columns can have a different number of children if their
     // sizes are both 0. Specifically, empty string columns may or may not have children.
-    if (check_exact_equality || lhs.size() > 0) {
+    if (check_exact_equality || (lhs.size() > 0 && lhs.null_count() < lhs.size())) {
       EXPECT_EQ(lhs.num_children(), rhs.num_children());
     }
   }

From 9c761b8262aa4043552cbd910d569bca4327815e Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Fri, 26 Mar 2021 14:30:36 -0500
Subject: [PATCH 23/33] Fixed missing newline.

---
 cpp/tests/strings/json_tests.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp
index e745b97de2d..682a9a0689f 100644
--- a/cpp/tests/strings/json_tests.cpp
+++ b/cpp/tests/strings/json_tests.cpp
@@ -627,4 +627,4 @@ TEST_F(JsonTests, GetJsonObjectInvalidQuery)
 
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
   }
-}
\ No newline at end of file
+}

From e47b0882a6d7513ac5fd54f0960b4552c8962d17 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Sun, 28 Mar 2021 14:53:23 -0500
Subject: [PATCH 24/33] Handle additional disallowed cases when indexing into
 child elements. Make sure to never parse names when dealing with elements in
 an array.  Add more tests.

---
 cpp/src/strings/json/json_path.cu |  84 +++++++++++++++------
 cpp/tests/strings/json_tests.cpp  | 117 ++++++++++++++++++++++++++++++
 2 files changed, 177 insertions(+), 24 deletions(-)

diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index a67ef865cb1..5a0a684d232 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -255,14 +255,26 @@ enum json_element_type { NONE, OBJECT, ARRAY, VALUE };
  */
 class json_state : private parser {
  public:
-  constexpr json_state() : parser(), cur_el_start(nullptr), cur_el_type(json_element_type::NONE) {}
+  constexpr json_state()
+    : parser(),
+      cur_el_start(nullptr),
+      cur_el_type(json_element_type::NONE),
+      parent_el_type(json_element_type::NONE)
+  {
+  }
   constexpr json_state(const char* _input, int64_t _input_len)
-    : parser(_input, _input_len), cur_el_start(nullptr), cur_el_type(json_element_type::NONE)
+    : parser(_input, _input_len),
+      cur_el_start(nullptr),
+      cur_el_type(json_element_type::NONE),
+      parent_el_type(json_element_type::NONE)
   {
   }
 
   constexpr json_state(json_state const& j)
-    : parser(j), cur_el_start(j.cur_el_start), cur_el_type(j.cur_el_type)
+    : parser(j),
+      cur_el_start(j.cur_el_start),
+      cur_el_type(j.cur_el_type),
+      parent_el_type(j.parent_el_type)
   {
   }
 
@@ -328,11 +340,15 @@ class json_state : private parser {
   constexpr parse_result next_element() { return next_element_internal(false); }
 
   // advance inside the current element
-  constexpr parse_result child_element(bool as_field = false)
+  constexpr parse_result child_element(json_element_type expected_type)
   {
-    // cannot retrieve a field from an array
-    if (as_field && cur_el_type == json_element_type::ARRAY) { return parse_result::ERROR; }
-    return next_element_internal(true);
+    if (expected_type != NONE && cur_el_type != expected_type) { return parse_result::ERROR; }
+
+    // if we succeed, record our parent element type.
+    auto const prev_el_type = cur_el_type;
+    auto const result       = next_element_internal(true);
+    if (result == parse_result::SUCCESS) { parent_el_type = prev_el_type; }
+    return result;
   }
 
   // return the next element that matches the specified name.
@@ -393,8 +409,12 @@ class json_state : private parser {
     char const c = *pos;
     if (c == ']' || c == '}') { return parse_result::EMPTY; }
 
-    // element name, if any
-    if (parse_name(cur_el_name, true, '\"') == parse_result::ERROR) { return parse_result::ERROR; }
+    // if we're not accessing elements of an array, check for name.
+    bool const array_access =
+      (cur_el_type == ARRAY && child) || (parent_el_type == ARRAY && !child);
+    if (!array_access && parse_name(cur_el_name, true, '\"') == parse_result::ERROR) {
+      return parse_result::ERROR;
+    }
 
     // element type
     if (!parse_whitespace()) { return parse_result::EMPTY; }
@@ -415,10 +435,11 @@ class json_state : private parser {
     return parse_result::SUCCESS;
   }
 
-  const char* cur_el_start;       // pointer to the first character of the -value- of the current
-                                  // element - not the name
-  json_string cur_el_name;        // name of the current element (if applicable)
-  json_element_type cur_el_type;  // type of the current element
+  const char* cur_el_start;          // pointer to the first character of the -value- of the current
+                                     // element - not the name
+  json_string cur_el_name;           // name of the current element (if applicable)
+  json_element_type cur_el_type;     // type of the current element
+  json_element_type parent_el_type;  // parent element type
 };
 
 enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END };
@@ -428,12 +449,22 @@ enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR,
  * an array of these operators applied to the incoming json string,
  */
 struct path_operator {
-  constexpr path_operator() : type(path_operator_type::ERROR), index(-1) {}
-  constexpr path_operator(path_operator_type _type) : type(_type), index(-1) {}
+  constexpr path_operator() : type(path_operator_type::ERROR), index(-1), expected_type{NONE} {}
+  constexpr path_operator(path_operator_type _type, json_element_type _expected_type = NONE)
+    : type(_type), index(-1), expected_type{_expected_type}
+  {
+  }
 
   path_operator_type type;  // operator type
-  json_string name;         // name to match against (if applicable)
-  int index;                // index for subscript operator
+  // the expected element type we're applying this operation to.
+  // for example:
+  //    - you cannot retrieve a subscripted field (eg [5]) from an object.
+  //    - you cannot retrieve a field by name (eg  .book) from an array.
+  //    - you -can- use .* for both arrays and objects
+  // a value of NONE implies any type accepted
+  json_element_type expected_type;  // the expected type of the element we're working with
+  json_string name;                 // name to match against (if applicable)
+  int index;                        // index for subscript operator
 };
 
 /**
@@ -461,9 +492,11 @@ class path_state : private parser {
           // Spark currently only handles the wildcard operator inside [*], it does
           // not handle .*
           if (op.name.len == 1 && op.name.str[0] == '*') {
-            op.type = path_operator_type::CHILD_WILDCARD;
+            op.type          = path_operator_type::CHILD_WILDCARD;
+            op.expected_type = NONE;
           } else {
-            op.type = path_operator_type::CHILD;
+            op.type          = path_operator_type::CHILD;
+            op.expected_type = OBJECT;
           }
           return op;
         }
@@ -480,15 +513,18 @@ class path_state : private parser {
         if (parse_path_name(op.name, term)) {
           pos++;
           if (op.name.len == 1 && op.name.str[0] == '*') {
-            op.type = path_operator_type::CHILD_WILDCARD;
+            op.type          = path_operator_type::CHILD_WILDCARD;
+            op.expected_type = NONE;
           } else {
             if (is_string) {
-              op.type = path_operator_type::CHILD;
+              op.type          = path_operator_type::CHILD;
+              op.expected_type = OBJECT;
             } else {
               op.type = path_operator_type::CHILD_INDEX;
               op.index =
                 cudf::io::parse_numeric<int>(op.name.str, op.name.str + op.name.len, json_opts, -1);
               CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath");
+              op.expected_type = ARRAY;
             }
           }
           return op;
@@ -662,7 +698,7 @@ __device__ parse_result parse_json_path(json_state& j_state,
       // [1]
       // will return a single thing
       case path_operator_type::CHILD: {
-        PARSE_TRY(ctx.j_state.child_element(true));
+        PARSE_TRY(ctx.j_state.child_element(op.expected_type));
         if (last_result == parse_result::SUCCESS) {
           PARSE_TRY(ctx.j_state.next_matching_element(op.name, true));
           if (last_result == parse_result::SUCCESS) {
@@ -681,7 +717,7 @@ __device__ parse_result parse_json_path(json_state& j_state,
           if (!ctx.list_element) { output.add_output({"[" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); }
 
           // step into the child element
-          PARSE_TRY(ctx.j_state.child_element());
+          PARSE_TRY(ctx.j_state.child_element(op.expected_type));
           if (last_result == parse_result::EMPTY) {
             if (!ctx.list_element) {
               output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN});
@@ -727,7 +763,7 @@ __device__ parse_result parse_json_path(json_state& j_state,
       // etc
       // returns a single thing
       case path_operator_type::CHILD_INDEX: {
-        PARSE_TRY(ctx.j_state.child_element());
+        PARSE_TRY(ctx.j_state.child_element(op.expected_type));
         if (last_result == parse_result::SUCCESS) {
           json_string const any{"*", 1};
           PARSE_TRY(ctx.j_state.next_matching_element(any, true));
diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp
index 682a9a0689f..9fdc4af7e87 100644
--- a/cpp/tests/strings/json_tests.cpp
+++ b/cpp/tests/strings/json_tests.cpp
@@ -628,3 +628,120 @@ TEST_F(JsonTests, GetJsonObjectInvalidQuery)
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
   }
 }
+
+TEST_F(JsonTests, MixedOutput)
+{
+  // various queries on:
+  // clang-format off
+  std::vector<std::string> input_strings {
+    "{\"a\": {\"b\" : \"c\"}}",
+
+    "{"
+      "\"a\": {\"b\" : \"c\"},"
+      "\"d\": [{\"e\":123}, {\"f\":-10}]"
+    "}",
+
+    "{"
+      "\"b\": 123"
+    "}",
+
+    "{"
+      "\"a\": [\"y\",500]"
+    "}",
+
+    "{"
+      "\"a\": \"\""
+    "}",
+
+    "{"
+      "\"a\": {"
+                "\"z\": {\"i\": 10, \"j\": 100},"
+                "\"b\": [\"c\", null, true, -1]"
+              "}"
+    "}"
+  };
+  // clang-format on
+  cudf::test::strings_column_wrapper input(input_strings.begin(), input_strings.end());
+
+  {
+    std::string json_path("$.a");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    // clang-format off
+    cudf::test::strings_column_wrapper expected({
+      "{\"b\" : \"c\"}",
+      "{\"b\" : \"c\"}",
+      "",
+      "[\"y\",500]",
+      "",
+      "{"
+         "\"z\": {\"i\": 10, \"j\": 100},"
+         "\"b\": [\"c\", null, true, -1]"
+      "}"
+      }, 
+      {1, 1, 0, 1, 1, 1});
+    // clang-format on
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  }
+
+  {
+    std::string json_path("$.a[1]");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    // clang-format off
+    cudf::test::strings_column_wrapper expected({
+        "",
+        "",
+        "",
+        "500",
+        "",
+        "",
+      },
+      {0, 0, 0, 1, 0, 0});
+    // clang-format on
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  }
+
+  {
+    std::string json_path("$.a.b");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    // clang-format off
+    cudf::test::strings_column_wrapper expected({
+      "c", 
+      "c", 
+      "", 
+      "", 
+      "", 
+      "[\"c\", null, true, -1]"},
+      {1, 1, 0, 0, 0, 1});
+    // clang-format on
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  }
+
+  {
+    std::string json_path("$.a[*]");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    // clang-format off
+    cudf::test::strings_column_wrapper expected({
+      "[\"c\"]", 
+      "[\"c\"]", 
+      "", 
+      "[\"y\",500]", 
+      "", 
+      "["
+        "{\"i\": 10, \"j\": 100},"
+        "[\"c\", null, true, -1]"
+      "]" },
+      {1, 1, 0, 1, 0, 1});
+    // clang-format on
+
+    cudf::test::print(*result);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  }
+}
\ No newline at end of file

From f898ca65cb993075c78dd5bd17d3d58f5c0dd9b4 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Sun, 28 Mar 2021 15:36:11 -0500
Subject: [PATCH 25/33] Distinguish between "no output" (null result) and
 "empty output" (valid, but empty string) cases more clearly. Additional test.

---
 cpp/src/strings/json/json_path.cu | 24 +++++++++++++++-------
 cpp/tests/strings/json_tests.cpp  | 34 ++++++++++++++++++++++---------
 2 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index 5a0a684d232..d4d180339ef 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -231,17 +231,30 @@ class parser {
 /**
  * @brief Output buffer object.  Used during the preprocess/size-computation step
  * and the actual output step.
+ *
+ * There is an important distinction between two cases:
+ *
+ * - producing no output at all. that is, the query matched nothing in the input.
+ * - producing empty output. the query matched something in the input, but the
+ *   value of the result is an empty string.
+ *
+ * The `has_output` field is the flag which indicates whether or not the output
+ * from the query should be considered empty or null.
+ *
  */
 struct json_output {
   size_t output_max_len;
   size_t output_len;
-  int element_count;
+  bool has_output;
   char* output;
 
   constexpr void add_output(const char* str, size_t len)
   {
     if (output != nullptr) { memcpy(output + output_len, str, len); }
     output_len += len;
+    // set this to true even if the string is empty. it implies we have
+    // seen actual output
+    has_output = true;
   }
 
   constexpr void add_output(json_string str) { add_output(str.str, str.len); }
@@ -326,10 +339,7 @@ class json_state : private parser {
       if (*pos == ',') { pos++; }
     }
 
-    if (output != nullptr) {
-      output->add_output({start, end - start});
-      output->element_count++;
-    }
+    if (output != nullptr) { output->add_output({start, end - start}); }
     return parse_result::SUCCESS;
   }
 
@@ -823,7 +833,7 @@ __device__ thrust::pair<parse_result, json_output> get_json_object_single(
   size_t out_buf_size)
 {
   json_state j_state(input, input_len);
-  json_output output{out_buf_size, 0, 0, out_buf};
+  json_output output{out_buf_size, 0, false, out_buf};
 
   auto const result = parse_json_path<max_command_stack_depth>(j_state, commands, output);
 
@@ -866,7 +876,7 @@ __global__ void get_json_object_kernel(column_device_view col,
       thrust::tie(result, out) =
         get_json_object_single(str.data(), str.size_bytes(), commands, dst, dst_size);
       output_size = out.output_len;
-      if (out.element_count > 0 && result == parse_result::SUCCESS) { is_valid = true; }
+      if (out.has_output > 0 && result == parse_result::SUCCESS) { is_valid = true; }
     }
 
     // filled in only during the precompute step
diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp
index 9fdc4af7e87..c2dbd649832 100644
--- a/cpp/tests/strings/json_tests.cpp
+++ b/cpp/tests/strings/json_tests.cpp
@@ -27,8 +27,7 @@
 // clang-format off
 std::string json_string{
   "{" 
-    "\"store\": {"
-      "\"book\": ["
+    "\"store\": {""\"book\": ["
         "{"
           "\"category\": \"reference\","
           "\"author\": \"Nigel Rees\","
@@ -656,13 +655,12 @@ TEST_F(JsonTests, MixedOutput)
     "{"
       "\"a\": {"
                 "\"z\": {\"i\": 10, \"j\": 100},"
-                "\"b\": [\"c\", null, true, -1]"
+                "\"b\": [\"c\",null,true,-1]"
               "}"
     "}"
   };
   // clang-format on
   cudf::test::strings_column_wrapper input(input_strings.begin(), input_strings.end());
-
   {
     std::string json_path("$.a");
     auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
@@ -676,7 +674,7 @@ TEST_F(JsonTests, MixedOutput)
       "",
       "{"
          "\"z\": {\"i\": 10, \"j\": 100},"
-         "\"b\": [\"c\", null, true, -1]"
+         "\"b\": [\"c\",null,true,-1]"
       "}"
       }, 
       {1, 1, 0, 1, 1, 1});
@@ -715,7 +713,7 @@ TEST_F(JsonTests, MixedOutput)
       "", 
       "", 
       "", 
-      "[\"c\", null, true, -1]"},
+      "[\"c\",null,true,-1]"},
       {1, 1, 0, 0, 0, 1});
     // clang-format on
 
@@ -732,15 +730,31 @@ TEST_F(JsonTests, MixedOutput)
       "[\"c\"]", 
       "", 
       "[\"y\",500]", 
-      "", 
+      "[]", 
       "["
         "{\"i\": 10, \"j\": 100},"
-        "[\"c\", null, true, -1]"
+        "[\"c\",null,true,-1]"
       "]" },
-      {1, 1, 0, 1, 0, 1});
+      {1, 1, 0, 1, 1, 1});
     // clang-format on
 
-    cudf::test::print(*result);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
+  }
+
+  {
+    std::string json_path("$.a.b[*]");
+    auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path);
+
+    // clang-format off
+    cudf::test::strings_column_wrapper expected({
+      "[]", 
+      "[]", 
+      "", 
+      "",
+      "",      
+      "[\"c\",null,true,-1]"},
+      {1, 1, 0, 0, 0, 1});
+    // clang-format on
 
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
   }

From 6829f46195eb2059d8a308afaa5de2d3544303ae Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Mon, 29 Mar 2021 11:11:36 -0500
Subject: [PATCH 26/33] Moved get_json_object() declarations out of
 strings/substring.hpp to strings/json.hpp

---
 conda/recipes/libcudf/meta.yaml               |  5 +-
 .../detail/{substring.hpp => json.hpp}        |  0
 cpp/include/cudf/strings/json.hpp             | 50 +++++++++++++++++++
 cpp/include/cudf/strings/substring.hpp        | 19 -------
 cpp/tests/strings/json_tests.cpp              |  4 +-
 5 files changed, 55 insertions(+), 23 deletions(-)
 rename cpp/include/cudf/strings/detail/{substring.hpp => json.hpp} (100%)
 create mode 100644 cpp/include/cudf/strings/json.hpp

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 7188065eca0..60200869ff8 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -175,13 +175,14 @@ test:
     - test -f $PREFIX/include/cudf/strings/detail/converters.hpp
     - test -f $PREFIX/include/cudf/strings/detail/copying.hpp
     - test -f $PREFIX/include/cudf/strings/detail/fill.hpp
-    - test -f $PREFIX/include/cudf/strings/detail/replace.hpp
-    - test -f $PREFIX/include/cudf/strings/detail/substring.hpp
+    - test -f $PREFIX/include/cudf/strings/detail/json.hpp
+    - test -f $PREFIX/include/cudf/strings/detail/replace.hpp    
     - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp
     - test -f $PREFIX/include/cudf/strings/extract.hpp
     - test -f $PREFIX/include/cudf/strings/findall.hpp
     - test -f $PREFIX/include/cudf/strings/find.hpp
     - test -f $PREFIX/include/cudf/strings/find_multiple.hpp
+    - test -f $PREFIX/include/cudf/strings/json.hpp
     - test -f $PREFIX/include/cudf/strings/padding.hpp
     - test -f $PREFIX/include/cudf/strings/replace.hpp
     - test -f $PREFIX/include/cudf/strings/replace_re.hpp
diff --git a/cpp/include/cudf/strings/detail/substring.hpp b/cpp/include/cudf/strings/detail/json.hpp
similarity index 100%
rename from cpp/include/cudf/strings/detail/substring.hpp
rename to cpp/include/cudf/strings/detail/json.hpp
diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp
new file mode 100644
index 00000000000..b39e4a2027c
--- /dev/null
+++ b/cpp/include/cudf/strings/json.hpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/strings/strings_column_view.hpp>
+
+namespace cudf {
+namespace strings {
+
+/**
+ * @addtogroup strings_json
+ * @{
+ * @file
+ */
+
+/**
+ * @brief Apply a JSONPath string to all rows in an input strings column.
+ *
+ * Applies a JSONPath string to an incoming strings column where each row in the column
+ * is a valid json string.  The output is returned by row as a strings column.
+ *
+ * https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html
+ * Implements only the operators: $ . [] *
+ *
+ * @param col The input strings column. Each row must contain a valid json string
+ * @param json_path The JSONPath string to be applied to each row
+ * @param mr Resource for allocating device memory.
+ * @return New strings column containing the retrieved json object strings
+ */
+std::unique_ptr<cudf::column> get_json_object(
+  cudf::strings_column_view const& col,
+  cudf::string_scalar const& json_path,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/** @} */  // end of doxygen group
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/include/cudf/strings/substring.hpp b/cpp/include/cudf/strings/substring.hpp
index 06a38360fdb..d1431060792 100644
--- a/cpp/include/cudf/strings/substring.hpp
+++ b/cpp/include/cudf/strings/substring.hpp
@@ -197,25 +197,6 @@ std::unique_ptr<column> slice_strings(
   size_type count,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
-/**
- * @brief Apply a JSONPath string to all rows in an input strings column.
- *
- * Applies a JSONPath string to an incoming strings column where each row in the column
- * is a valid json string.  The output is returned by row as a strings column.
- *
- * https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html
- * Implements only the operators: $ . [] *
- *
- * @param col The input strings column. Each row must contain a valid json string
- * @param json_path The JSONPath string to be applied to each row
- * @param mr Resource for allocating device memory.
- * @return New strings column containing the retrieved json object strings
- */
-std::unique_ptr<cudf::column> get_json_object(
-  cudf::strings_column_view const& col,
-  cudf::string_scalar const& json_path,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
 /** @} */  // end of doxygen group
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp
index c2dbd649832..feaa69e4364 100644
--- a/cpp/tests/strings/json_tests.cpp
+++ b/cpp/tests/strings/json_tests.cpp
@@ -17,7 +17,7 @@
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/strings/replace.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-#include <cudf/strings/substring.hpp>
+#include <cudf/strings/json.hpp>
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
@@ -758,4 +758,4 @@ TEST_F(JsonTests, MixedOutput)
 
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
   }
-}
\ No newline at end of file
+}

From c0743b455133cbdd78b2b3fdcd574e73b616c1b7 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Mon, 29 Mar 2021 11:14:18 -0500
Subject: [PATCH 27/33] Clang format

---
 cpp/tests/strings/json_tests.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp
index feaa69e4364..44eb35d4163 100644
--- a/cpp/tests/strings/json_tests.cpp
+++ b/cpp/tests/strings/json_tests.cpp
@@ -15,9 +15,9 @@
  */
 
 #include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/strings/json.hpp>
 #include <cudf/strings/replace.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-#include <cudf/strings/json.hpp>
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>

From 285ed92208eb4a6c1a901393dfaa85e04da7bde4 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Mon, 29 Mar 2021 12:11:26 -0500
Subject: [PATCH 28/33] Use string_view instead of json_string struct.
 Clean up benchmark CMakeLists.txt

---
 cpp/benchmarks/CMakeLists.txt     |   3 +-
 cpp/src/strings/json/json_path.cu | 155 +++++++++++++-----------------
 2 files changed, 66 insertions(+), 92 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 26a1cf95ab6..a331be5b611 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -197,5 +197,4 @@ ConfigureBench(STRINGS_BENCH
 ###################################################################################################
 # - json benchmark -------------------------------------------------------------------
 ConfigureBench(JSON_BENCH
-  string/json_benchmark.cpp)
-  
\ No newline at end of file
+  string/json_benchmark.cpp)
\ No newline at end of file
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index d4d180339ef..e3f73d376e7 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -83,39 +83,6 @@ enum class parse_result {
   EMPTY,    // success, but no data
 };
 
-/**
- * @brief A struct which represents a string.
- *
- * Typically used to point into a substring of a larger string, such as
- * the input json itself.
- *
- * @code
- * // where cur_pos is a pointer to the beginning of a name string in the
- * // input json and name_size is the computed size.
- * json_string name{cur_pos, name_size};
- * @endcode
- *
- * Also used for parameter passing in a few cases:
- *
- * @code
- * json_string wildcard{"*", 1};
- * func(wildcard);
- * @endcode
- */
-struct json_string {
-  const char* str;
-  int64_t len;
-
-  constexpr json_string() : str(nullptr), len(-1) {}
-  constexpr json_string(const char* _str, int64_t _len) : str(_str), len(_len) {}
-
-  constexpr bool operator==(json_string const& cmp)
-  {
-    return len == cmp.len && str != nullptr && cmp.str != nullptr &&
-           thrust::equal(thrust::seq, str, str + len, cmp.str);
-  }
-};
-
 /**
  * @brief Base parser class inherited by the (device-side) json_state class and
  * (host-side) path_state class.
@@ -125,19 +92,22 @@ struct json_string {
  */
 class parser {
  protected:
-  constexpr parser() : input(nullptr), input_len(0), pos(nullptr) {}
-  constexpr parser(const char* _input, int64_t _input_len)
+  CUDA_HOST_DEVICE_CALLABLE parser() : input(nullptr), input_len(0), pos(nullptr) {}
+  CUDA_HOST_DEVICE_CALLABLE parser(const char* _input, int64_t _input_len)
     : input(_input), input_len(_input_len), pos(_input)
   {
     parse_whitespace();
   }
 
-  constexpr parser(parser const& p) : input(p.input), input_len(p.input_len), pos(p.pos) {}
+  CUDA_HOST_DEVICE_CALLABLE parser(parser const& p)
+    : input(p.input), input_len(p.input_len), pos(p.pos)
+  {
+  }
 
-  constexpr bool eof(const char* p) { return p - input >= input_len; }
-  constexpr bool eof() { return eof(pos); }
+  CUDA_HOST_DEVICE_CALLABLE bool eof(const char* p) { return p - input >= input_len; }
+  CUDA_HOST_DEVICE_CALLABLE bool eof() { return eof(pos); }
 
-  constexpr bool parse_whitespace()
+  CUDA_HOST_DEVICE_CALLABLE bool parse_whitespace()
   {
     while (!eof()) {
       if (is_whitespace(*pos)) {
@@ -149,17 +119,17 @@ class parser {
     return false;
   }
 
-  constexpr parse_result parse_string(json_string& str, bool can_be_empty, char quote)
+  CUDA_HOST_DEVICE_CALLABLE parse_result parse_string(string_view& str,
+                                                      bool can_be_empty,
+                                                      char quote)
   {
-    str.str = nullptr;
-    str.len = 0;
+    str = string_view(nullptr, 0);
 
     if (parse_whitespace() && *pos == quote) {
       const char* start = ++pos;
       while (!eof()) {
         if (*pos == quote) {
-          str.str = start;
-          str.len = pos - start;
+          str = string_view(start, pos - start);
           pos++;
           return parse_result::SUCCESS;
         }
@@ -173,14 +143,16 @@ class parser {
   // a name means:
   // - a string followed by a :
   // - no string
-  constexpr parse_result parse_name(json_string& name, bool can_be_empty, char quote)
+  CUDA_HOST_DEVICE_CALLABLE parse_result parse_name(string_view& name,
+                                                    bool can_be_empty,
+                                                    char quote)
   {
     if (parse_string(name, can_be_empty, quote) == parse_result::ERROR) {
       return parse_result::ERROR;
     }
 
     // if we got a real string, the next char must be a :
-    if (name.len > 0) {
+    if (name.size_bytes() > 0) {
       if (!parse_whitespace()) { return parse_result::ERROR; }
       if (*pos == ':') {
         pos++;
@@ -193,7 +165,7 @@ class parser {
   // numbers, true, false, null.
   // this function is not particularly strong. badly formed values will get
   // consumed without throwing any errors
-  constexpr parse_result parse_non_string_value(json_string& val)
+  CUDA_HOST_DEVICE_CALLABLE parse_result parse_non_string_value(string_view& val)
   {
     if (!parse_whitespace()) { return parse_result::ERROR; }
 
@@ -210,8 +182,7 @@ class parser {
     }
     pos = end;
 
-    val.str = start;
-    val.len = {end - start};
+    val = string_view(start, end - start);
 
     return parse_result::SUCCESS;
   }
@@ -222,7 +193,7 @@ class parser {
   char const* pos;
 
  private:
-  constexpr bool is_whitespace(char c)
+  CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c)
   {
     return c == ' ' || c == '\r' || c == '\n' || c == '\t' ? true : false;
   }
@@ -248,7 +219,7 @@ struct json_output {
   bool has_output;
   char* output;
 
-  constexpr void add_output(const char* str, size_t len)
+  __device__ void add_output(const char* str, size_t len)
   {
     if (output != nullptr) { memcpy(output + output_len, str, len); }
     output_len += len;
@@ -257,7 +228,7 @@ struct json_output {
     has_output = true;
   }
 
-  constexpr void add_output(json_string str) { add_output(str.str, str.len); }
+  __device__ void add_output(string_view const& str) { add_output(str.data(), str.size_bytes()); }
 };
 
 enum json_element_type { NONE, OBJECT, ARRAY, VALUE };
@@ -268,14 +239,14 @@ enum json_element_type { NONE, OBJECT, ARRAY, VALUE };
  */
 class json_state : private parser {
  public:
-  constexpr json_state()
+  __device__ json_state()
     : parser(),
       cur_el_start(nullptr),
       cur_el_type(json_element_type::NONE),
       parent_el_type(json_element_type::NONE)
   {
   }
-  constexpr json_state(const char* _input, int64_t _input_len)
+  __device__ json_state(const char* _input, int64_t _input_len)
     : parser(_input, _input_len),
       cur_el_start(nullptr),
       cur_el_type(json_element_type::NONE),
@@ -283,7 +254,7 @@ class json_state : private parser {
   {
   }
 
-  constexpr json_state(json_state const& j)
+  __device__ json_state(json_state const& j)
     : parser(j),
       cur_el_start(j.cur_el_start),
       cur_el_type(j.cur_el_type),
@@ -291,8 +262,8 @@ class json_state : private parser {
   {
   }
 
-  // retrieve the entire current element as a json_string
-  constexpr parse_result extract_element(json_output* output, bool list_element)
+  // retrieve the entire current element into the output
+  __device__ parse_result extract_element(json_output* output, bool list_element)
   {
     char const* start = cur_el_start;
     char const* end   = start;
@@ -339,18 +310,18 @@ class json_state : private parser {
       if (*pos == ',') { pos++; }
     }
 
-    if (output != nullptr) { output->add_output({start, end - start}); }
+    if (output != nullptr) { output->add_output({start, static_cast<size_type>(end - start)}); }
     return parse_result::SUCCESS;
   }
 
   // skip the next element
-  constexpr parse_result skip_element() { return extract_element(nullptr, false); }
+  __device__ parse_result skip_element() { return extract_element(nullptr, false); }
 
   // advance to the next element
-  constexpr parse_result next_element() { return next_element_internal(false); }
+  __device__ parse_result next_element() { return next_element_internal(false); }
 
   // advance inside the current element
-  constexpr parse_result child_element(json_element_type expected_type)
+  __device__ parse_result child_element(json_element_type expected_type)
   {
     if (expected_type != NONE && cur_el_type != expected_type) { return parse_result::ERROR; }
 
@@ -362,7 +333,7 @@ class json_state : private parser {
   }
 
   // return the next element that matches the specified name.
-  constexpr parse_result next_matching_element(json_string const& name, bool inclusive)
+  __device__ parse_result next_matching_element(string_view const& name, bool inclusive)
   {
     // if we're not including the current element, skip it
     if (!inclusive) {
@@ -372,7 +343,7 @@ class json_state : private parser {
     // loop until we find a match or there's nothing left
     do {
       // wildcard matches anything
-      if (name.len == 1 && name.str[0] == '*') {
+      if (name.size_bytes() == 1 && name.data()[0] == '*') {
         return parse_result::SUCCESS;
       } else if (cur_el_name == name) {
         return parse_result::SUCCESS;
@@ -388,16 +359,16 @@ class json_state : private parser {
 
  private:
   // parse a value - either a string or a number/null/bool
-  constexpr parse_result parse_value()
+  __device__ parse_result parse_value()
   {
     if (!parse_whitespace()) { return parse_result::ERROR; }
 
     // string or number?
-    json_string unused;
+    string_view unused;
     return *pos == '\"' ? parse_string(unused, false, '\"') : parse_non_string_value(unused);
   }
 
-  constexpr parse_result next_element_internal(bool child)
+  __device__ parse_result next_element_internal(bool child)
   {
     // if we're not getting a child element, skip the current element.
     // this will leave pos as the first character -after- the close of
@@ -447,7 +418,7 @@ class json_state : private parser {
 
   const char* cur_el_start;          // pointer to the first character of the -value- of the current
                                      // element - not the name
-  json_string cur_el_name;           // name of the current element (if applicable)
+  string_view cur_el_name;           // name of the current element (if applicable)
   json_element_type cur_el_type;     // type of the current element
   json_element_type parent_el_type;  // parent element type
 };
@@ -459,8 +430,12 @@ enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR,
  * an array of these operators applied to the incoming json string,
  */
 struct path_operator {
-  constexpr path_operator() : type(path_operator_type::ERROR), index(-1), expected_type{NONE} {}
-  constexpr path_operator(path_operator_type _type, json_element_type _expected_type = NONE)
+  CUDA_HOST_DEVICE_CALLABLE path_operator()
+    : type(path_operator_type::ERROR), index(-1), expected_type{NONE}
+  {
+  }
+  CUDA_HOST_DEVICE_CALLABLE path_operator(path_operator_type _type,
+                                          json_element_type _expected_type = NONE)
     : type(_type), index(-1), expected_type{_expected_type}
   {
   }
@@ -473,7 +448,7 @@ struct path_operator {
   //    - you -can- use .* for both arrays and objects
   // a value of NONE imples any type accepted
   json_element_type expected_type;  // the expected type of the element we're working with
-  json_string name;                 // name to match against (if applicable)
+  string_view name;                 // name to match against (if applicable)
   int index;                        // index for subscript operator
 };
 
@@ -496,12 +471,12 @@ class path_state : private parser {
 
       case '.': {
         path_operator op;
-        json_string term{".[", 2};
+        string_view term{".[", 2};
         if (parse_path_name(op.name, term)) {
           // this is another potential use case for __SPARK_BEHAVIORS / configurability
           // Spark currently only handles the wildcard operator inside [*], it does
           // not handle .*
-          if (op.name.len == 1 && op.name.str[0] == '*') {
+          if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') {
             op.type          = path_operator_type::CHILD_WILDCARD;
             op.expected_type = NONE;
           } else {
@@ -518,11 +493,11 @@ class path_state : private parser {
       // wildcard:  [*]
       case '[': {
         path_operator op;
-        json_string term{"]", 1};
+        string_view term{"]", 1};
         bool const is_string = *pos == '\'' ? true : false;
         if (parse_path_name(op.name, term)) {
           pos++;
-          if (op.name.len == 1 && op.name.str[0] == '*') {
+          if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') {
             op.type          = path_operator_type::CHILD_WILDCARD;
             op.expected_type = NONE;
           } else {
@@ -530,9 +505,9 @@ class path_state : private parser {
               op.type          = path_operator_type::CHILD;
               op.expected_type = OBJECT;
             } else {
-              op.type = path_operator_type::CHILD_INDEX;
-              op.index =
-                cudf::io::parse_numeric<int>(op.name.str, op.name.str + op.name.len, json_opts, -1);
+              op.type  = path_operator_type::CHILD_INDEX;
+              op.index = cudf::io::parse_numeric<int>(
+                op.name.data(), op.name.data() + op.name.size_bytes(), json_opts, -1);
               CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath");
               op.expected_type = ARRAY;
             }
@@ -555,12 +530,11 @@ class path_state : private parser {
  private:
   cudf::io::parse_options_view json_opts{',', '\n', '\"', '.'};
 
-  bool parse_path_name(json_string& name, json_string const& terminators)
+  bool parse_path_name(string_view& name, string_view const& terminators)
   {
     switch (*pos) {
       case '*':
-        name.str = pos;
-        name.len = 1;
+        name = string_view(pos, 1);
         pos++;
         break;
 
@@ -571,22 +545,20 @@ class path_state : private parser {
       default: {
         size_t const chars_left = input_len - (pos - input);
         char const* end         = std::find_first_of(
-          pos, pos + chars_left, terminators.str, terminators.str + terminators.len);
+          pos, pos + chars_left, terminators.data(), terminators.data() + terminators.size_bytes());
         if (end) {
-          name.str = pos;
-          name.len = end - pos;
-          pos      = end;
+          name = string_view(pos, end - pos);
+          pos  = end;
         } else {
-          name.str = pos;
-          name.len = chars_left;
-          pos      = input + input_len;
+          name = string_view(pos, chars_left);
+          pos  = input + input_len;
         }
         break;
       }
     }
 
     // an empty name is not valid
-    CUDF_EXPECTS(name.len > 0, "Invalid empty name in JSONpath query string");
+    CUDF_EXPECTS(name.size_bytes() > 0, "Invalid empty name in JSONpath query string");
 
     return true;
   }
@@ -618,7 +590,10 @@ std::tuple<rmm::device_uvector<path_operator>, int, bool> build_command_buffer(
     }
     if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; }
     // convert pointer to device pointer
-    if (op.name.len > 0) { op.name.str = json_path.data() + (op.name.str - h_json_path.data()); }
+    if (op.name.size_bytes() > 0) {
+      op.name =
+        string_view(json_path.data() + (op.name.data() - h_json_path.data()), op.name.size_bytes());
+    }
     if (op.type == path_operator_type::ROOT) {
       CUDF_EXPECTS(h_operators.size() == 0, "Root operator ($) can only exist at the root");
     }
@@ -775,7 +750,7 @@ __device__ parse_result parse_json_path(json_state& j_state,
       case path_operator_type::CHILD_INDEX: {
         PARSE_TRY(ctx.j_state.child_element(op.expected_type));
         if (last_result == parse_result::SUCCESS) {
-          json_string const any{"*", 1};
+          string_view const any{"*", 1};
           PARSE_TRY(ctx.j_state.next_matching_element(any, true));
           if (last_result == parse_result::SUCCESS) {
             int idx;

From ef03e30eb56ba39d4720d518d540d9263e8be302 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Mon, 29 Mar 2021 15:11:15 -0500
Subject: [PATCH 29/33] Fix errant whitespace in meta.yaml. Update benchmarks
 and JNI bindings to point to new location for get_json_object(). Use a grid
 stride loop in core kernel.  Use some thrust_optionals where appropriate.
 Compute and return null count instead of just leaving it unknown.

---
 conda/recipes/libcudf/meta.yaml            |   2 +-
 cpp/benchmarks/string/json_benchmark.cpp   |   4 +-
 cpp/src/strings/json/json_path.cu          | 125 ++++++++++++---------
 java/src/main/native/src/ColumnViewJni.cpp |   1 +
 4 files changed, 77 insertions(+), 55 deletions(-)

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 60200869ff8..4ab222a3c85 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -176,7 +176,7 @@ test:
     - test -f $PREFIX/include/cudf/strings/detail/copying.hpp
     - test -f $PREFIX/include/cudf/strings/detail/fill.hpp
     - test -f $PREFIX/include/cudf/strings/detail/json.hpp
-    - test -f $PREFIX/include/cudf/strings/detail/replace.hpp    
+    - test -f $PREFIX/include/cudf/strings/detail/replace.hpp
     - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp
     - test -f $PREFIX/include/cudf/strings/extract.hpp
     - test -f $PREFIX/include/cudf/strings/findall.hpp
diff --git a/cpp/benchmarks/string/json_benchmark.cpp b/cpp/benchmarks/string/json_benchmark.cpp
index dbb33927716..6fb6a07a8d0 100644
--- a/cpp/benchmarks/string/json_benchmark.cpp
+++ b/cpp/benchmarks/string/json_benchmark.cpp
@@ -22,8 +22,8 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 
+#include <cudf/strings/json.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-#include <cudf/strings/substring.hpp>
 
 class JsonPath : public cudf::benchmark {
 };
@@ -135,6 +135,6 @@ JSON_BENCHMARK_DEFINE(query2, "$.store.book");
 JSON_BENCHMARK_DEFINE(query3, "$.store.*");
 JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]");
 JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category");
-JSON_BENCHMARK_DEFINE(query6, "$.store.['bicycle']");
+JSON_BENCHMARK_DEFINE(query6, "$.store['bicycle']");
 JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']");
 JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]");
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index e3f73d376e7..44a43b7f095 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -19,6 +19,7 @@
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/strings/string_view.cuh>
@@ -215,17 +216,13 @@ class parser {
  */
 struct json_output {
   size_t output_max_len;
-  size_t output_len;
-  bool has_output;
   char* output;
+  thrust::optional<size_t> output_len;
 
   __device__ void add_output(const char* str, size_t len)
   {
-    if (output != nullptr) { memcpy(output + output_len, str, len); }
-    output_len += len;
-    // set this to true even if the string is empty. it implies we have
-    // seen actual output
-    has_output = true;
+    if (output != nullptr) { memcpy(output + output_len.value_or(0), str, len); }
+    output_len = output_len.value_or(0) + len;
   }
 
   __device__ void add_output(string_view const& str) { add_output(str.data(), str.size_bytes()); }
@@ -570,10 +567,9 @@ class path_state : private parser {
  *
  * @param json_path The incoming json path
  * @param stream Cuda stream to perform any gpu actions on
- * @returns A tuple containing the command buffer, the maximum stack depth required and whether or
- * not the command buffer is empty.
+ * @returns A pair of the command buffer (nullopt for an empty path) and the max stack depth.
  */
-std::tuple<rmm::device_uvector<path_operator>, int, bool> build_command_buffer(
+std::pair<thrust::optional<rmm::device_uvector<path_operator>>, int> build_command_buffer(
   cudf::string_scalar const& json_path, rmm::cuda_stream_view stream)
 {
   std::string h_json_path = json_path.to_string(stream);
@@ -606,17 +602,12 @@ std::tuple<rmm::device_uvector<path_operator>, int, bool> build_command_buffer(
     h_operators.push_back(op);
   } while (op.type != path_operator_type::END);
 
-  rmm::device_uvector<path_operator> d_operators(h_operators.size(), stream);
-  CUDA_TRY(cudaMemcpyAsync(d_operators.data(),
-                           h_operators.data(),
-                           sizeof(path_operator) * h_operators.size(),
-                           cudaMemcpyHostToDevice,
-                           stream.value()));
-  stream.synchronize();
-
-  return {std::move(d_operators),
-          max_stack_depth,
-          h_operators.size() == 1 && h_operators[0].type == path_operator_type::END ? true : false};
+  auto const is_empty = h_operators.size() == 1 && h_operators[0].type == path_operator_type::END;
+  return is_empty
+           ? std::make_pair(thrust::nullopt, 0)
+           : std::make_pair(
+               thrust::make_optional(cudf::detail::make_device_uvector_sync(h_operators, stream)),
+               max_stack_depth);
 }
 
 #define PARSE_TRY(_x)                                                       \
@@ -808,7 +799,7 @@ __device__ thrust::pair<parse_result, json_output> get_json_object_single(
   size_t out_buf_size)
 {
   json_state j_state(input, input_len);
-  json_output output{out_buf_size, 0, false, out_buf};
+  json_output output{out_buf_size, out_buf};
 
   auto const result = parse_json_path<max_command_stack_depth>(j_state, commands, output);
 
@@ -829,17 +820,26 @@ __device__ thrust::pair<parse_result, json_output> get_json_object_single(
  * @param out_buf Buffer used to store the results of the query (nullptr in the size computation
  * step)
  * @param out_validity Output validity buffer (nullptr in the size computation step)
+ * @param out_valid_count Output count of # of valid bits (nullptr in the size computation step)
  */
-__global__ void get_json_object_kernel(column_device_view col,
-                                       path_operator const* const commands,
-                                       size_type* output_offsets,
-                                       char* out_buf,
-                                       bitmask_type* out_validity)
+template <int block_size>
+__launch_bounds__(block_size) __global__
+  void get_json_object_kernel(column_device_view col,
+                              path_operator const* const commands,
+                              size_type* output_offsets,
+                              char* out_buf,
+                              bitmask_type* out_validity,
+                              size_type* out_valid_count)
 {
-  uint64_t const tid = threadIdx.x + (blockDim.x * blockIdx.x);
+  size_type tid    = threadIdx.x + (blockDim.x * blockIdx.x);
+  size_type stride = blockDim.x * gridDim.x;
 
-  bool is_valid = false;
-  if (tid < col.size()) {
+  if (out_valid_count) { *out_valid_count = 0; }
+  size_type warp_valid_count{0};
+
+  auto active_threads = __ballot_sync(0xffffffff, tid < col.size());
+  while (tid < col.size()) {
+    bool is_valid         = false;
     string_view const str = col.element<string_view>(tid);
     size_type output_size = 0;
     if (str.size_bytes() > 0) {
@@ -850,21 +850,32 @@ __global__ void get_json_object_kernel(column_device_view col,
       json_output out;
       thrust::tie(result, out) =
         get_json_object_single(str.data(), str.size_bytes(), commands, dst, dst_size);
-      output_size = out.output_len;
-      if (out.has_output > 0 && result == parse_result::SUCCESS) { is_valid = true; }
+      output_size = out.output_len.value_or(0);
+      if (out.output_len.has_value() && result == parse_result::SUCCESS) { is_valid = true; }
     }
 
     // filled in only during the precompute step
     if (!out_buf) { output_offsets[tid] = static_cast<size_type>(output_size); }
-  }
 
-  // validity filled in only during the output step
-  if (out_validity) {
-    uint32_t mask = __ballot_sync(0xffffffff, is_valid);
-    // 0th lane of the warp writes the validity
-    if (!(tid % cudf::detail::warp_size) && tid < col.size()) {
-      out_validity[cudf::word_index(tid)] = mask;
+    // validity filled in only during the output step
+    if (out_validity) {
+      uint32_t mask = __ballot_sync(active_threads, is_valid);
+      // 0th lane of the warp writes the validity
+      if (!(tid % cudf::detail::warp_size)) {
+        out_validity[cudf::word_index(tid)] = mask;
+        warp_valid_count += __popc(mask);
+      }
     }
+
+    tid += stride;
+    active_threads = __ballot_sync(active_threads, tid < col.size());
+  }
+
+  // sum the valid counts across the whole block
+  if (out_valid_count) {
+    size_type block_valid_count =
+      cudf::detail::single_lane_block_sum_reduce<block_size, 0>(warp_valid_count);
+    if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); }
   }
 }
 
@@ -877,8 +888,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
                                               rmm::mr::device_memory_resource* mr)
 {
   // preprocess the json_path into a command buffer
-  std::tuple<rmm::device_uvector<path_operator>, int, bool> preprocess =
-    build_command_buffer(json_path, stream);
+  auto preprocess = build_command_buffer(json_path, stream);
   CUDF_EXPECTS(std::get<1>(preprocess) <= max_command_stack_depth,
                "Encountered json_path string that is too complex");
 
@@ -888,7 +898,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
   cudf::mutable_column_view offsets_view(*offsets);
 
   // if the query is empty, return a string column containing all nulls
-  if (std::get<2>(preprocess)) {
+  if (!std::get<0>(preprocess).has_value()) {
     return std::make_unique<column>(
       data_type{type_id::STRING},
       col.size(),
@@ -897,13 +907,20 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
       col.size());  // null count
   }
 
-  cudf::detail::grid_1d const grid{col.size(), 512};
+  constexpr int block_size = 512;
+  cudf::detail::grid_1d const grid{col.size(), block_size};
 
   auto cdv = column_device_view::create(col.parent(), stream);
 
   // preprocess sizes (returned in the offsets buffer)
-  get_json_object_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-    *cdv, std::get<0>(preprocess).data(), offsets_view.head<size_type>(), nullptr, nullptr);
+  get_json_object_kernel<block_size>
+    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+      *cdv,
+      std::get<0>(preprocess).value().data(),
+      offsets_view.head<size_type>(),
+      nullptr,
+      nullptr,
+      nullptr);
 
   // convert sizes to offsets
   thrust::exclusive_scan(rmm::exec_policy(stream),
@@ -925,17 +942,20 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
 
   // compute results
   cudf::mutable_column_view chars_view(*chars);
-  get_json_object_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-    *cdv,
-    std::get<0>(preprocess).data(),
-    offsets_view.head<size_type>(),
-    chars_view.head<char>(),
-    static_cast<bitmask_type*>(validity.data()));
+  rmm::device_scalar<size_type> d_valid_count{0, stream};
+  get_json_object_kernel<block_size>
+    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+      *cdv,
+      std::get<0>(preprocess).value().data(),
+      offsets_view.head<size_type>(),
+      chars_view.head<char>(),
+      static_cast<bitmask_type*>(validity.data()),
+      d_valid_count.data());
 
   return make_strings_column(col.size(),
                              std::move(offsets),
                              std::move(chars),
-                             UNKNOWN_NULL_COUNT,
+                             col.size() - d_valid_count.value(),
                              std::move(validity),
                              stream,
                              mr);
@@ -951,6 +971,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
                                               cudf::string_scalar const& json_path,
                                               rmm::mr::device_memory_resource* mr)
 {
+  CUDF_FUNC_RANGE();
   return detail::get_json_object(col, json_path, 0, mr);
 }
 
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index fdfbee116cd..0023ebd879e 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -53,6 +53,7 @@
 #include <cudf/strings/split/split.hpp>
 #include <cudf/strings/strip.hpp>
 #include <cudf/strings/substring.hpp>
+#include <cudf/strings/json.hpp>
 #include <cudf/transform.hpp>
 #include <cudf/unary.hpp>
 #include <cudf/utilities/bit.hpp>

From 975ee51d5ef2ef9e0fb4c60b9df09b26114eb7c2 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Mon, 29 Mar 2021 16:41:19 -0500
Subject: [PATCH 30/33] Remove SPARK_BEHAVIORS #define.  Use thrust::optional
 for more kernel parameters.

---
 cpp/src/strings/json/json_path.cu | 70 +++++++++++--------------------
 1 file changed, 24 insertions(+), 46 deletions(-)

diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index 44a43b7f095..ad598574044 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -34,6 +34,8 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <thrust/optional.h>
+
 namespace cudf {
 namespace strings {
 namespace detail {
@@ -46,29 +48,6 @@ namespace {
 #define DEBUG_NEWLINE
 constexpr int DEBUG_NEWLINE_LEN = 0;
 
-// temporary? spark doesn't strictly follow the JSONPath spec.
-// I think this probably could be a configurable enum to control
-// the kind of output you get and what features are supported.
-//
-// Current known differences:
-// - When returning a string value as a single element, Spark strips the quotes.
-//   standard:   "whee"
-//   spark:      whee
-//
-// - Spark only supports the wildcard operator when in a subscript, eg  [*]
-//   It does not handle .*
-//
-// Other, non-spark known differences:
-//
-// - In jsonpath_ng, name subscripts can use double quotes instead of the standard
-//   single quotes in the query string.
-//   standard:      $.thing['subscript']
-//   jsonpath_ng:   $.thing["subscript"]
-//
-//  Currently, this code only allows single-quotes but that could be expanded if necessary.
-//
-#define SPARK_BEHAVIORS
-
 /**
  * @brief Result of calling a parse function.
  *
@@ -271,14 +250,13 @@ class json_state : private parser {
       if (parse_value() != parse_result::SUCCESS) { return parse_result::ERROR; }
       end = pos;
 
-#if defined(SPARK_BEHAVIORS)
-      // spark/hive-specific behavior.  if this is a non-list-element wrapped in quotes,
-      // strip them
+      // SPARK-specific behavior.  if this is a non-list-element wrapped in quotes,
+      // strip them. we may need to make this behavior configurable in some way
+      // later on.
       if (!list_element && *start == '\"' && *(end - 1) == '\"') {
         start++;
         end--;
       }
-#endif
     }
     // otherwise, march through everything inside
     else {
@@ -816,25 +794,23 @@ __device__ thrust::pair<parse_result, json_output> get_json_object_single(
  * @param col Device view of the incoming string
  * @param commands JSONPath command buffer
  * @param output_offsets Buffer used to store the string offsets for the results of the query
- * (nullptr in the size computation step)
- * @param out_buf Buffer used to store the results of the query (nullptr in the size computation
- * step)
- * @param out_validity Output validity buffer (nullptr in the size computation step)
- * @param out_valid_count Output count of # of valid bits (nullptr in the size computation step)
+ * @param out_buf Buffer used to store the results of the query (nullopt during size computation)
+ * @param out_validity Output validity buffer (nullopt during size computation)
+ * @param out_valid_count Output count of valid bits (nullopt during size computation)
  */
 template <int block_size>
 __launch_bounds__(block_size) __global__
   void get_json_object_kernel(column_device_view col,
                               path_operator const* const commands,
                               size_type* output_offsets,
-                              char* out_buf,
-                              bitmask_type* out_validity,
-                              size_type* out_valid_count)
+                              thrust::optional<char*> out_buf,
+                              thrust::optional<bitmask_type*> out_validity,
+                              thrust::optional<size_type*> out_valid_count)
 {
   size_type tid    = threadIdx.x + (blockDim.x * blockIdx.x);
   size_type stride = blockDim.x * gridDim.x;
 
-  if (out_valid_count) { *out_valid_count = 0; }
+  if (out_valid_count.has_value()) { *(out_valid_count.value()) = 0; }
   size_type warp_valid_count{0};
 
   auto active_threads = __ballot_sync(0xffffffff, tid < col.size());
@@ -843,8 +819,9 @@ __launch_bounds__(block_size) __global__
     string_view const str = col.element<string_view>(tid);
     size_type output_size = 0;
     if (str.size_bytes() > 0) {
-      char* dst             = out_buf ? out_buf + output_offsets[tid] : nullptr;
-      size_t const dst_size = out_buf ? output_offsets[tid + 1] - output_offsets[tid] : 0;
+      char* dst = out_buf.has_value() ? out_buf.value() + output_offsets[tid] : nullptr;
+      size_t const dst_size =
+        out_buf.has_value() ? output_offsets[tid + 1] - output_offsets[tid] : 0;
 
       parse_result result;
       json_output out;
@@ -854,15 +831,16 @@ __launch_bounds__(block_size) __global__
       if (out.output_len.has_value() && result == parse_result::SUCCESS) { is_valid = true; }
     }
 
-    // filled in only during the precompute step
-    if (!out_buf) { output_offsets[tid] = static_cast<size_type>(output_size); }
+    // filled in only during the precompute step. during the compute step, the offsets
+    // are fed back in so we do -not- want to write them out
+    if (!out_buf.has_value()) { output_offsets[tid] = static_cast<size_type>(output_size); }
 
     // validity filled in only during the output step
-    if (out_validity) {
+    if (out_validity.has_value()) {
       uint32_t mask = __ballot_sync(active_threads, is_valid);
       // 0th lane of the warp writes the validity
       if (!(tid % cudf::detail::warp_size)) {
-        out_validity[cudf::word_index(tid)] = mask;
+        out_validity.value()[cudf::word_index(tid)] = mask;
         warp_valid_count += __popc(mask);
       }
     }
@@ -875,7 +853,7 @@ __launch_bounds__(block_size) __global__
   if (out_valid_count) {
     size_type block_valid_count =
       cudf::detail::single_lane_block_sum_reduce<block_size, 0>(warp_valid_count);
-    if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); }
+    if (threadIdx.x == 0) { atomicAdd(out_valid_count.value(), block_valid_count); }
   }
 }
 
@@ -918,9 +896,9 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
       *cdv,
       std::get<0>(preprocess).value().data(),
       offsets_view.head<size_type>(),
-      nullptr,
-      nullptr,
-      nullptr);
+      thrust::nullopt,
+      thrust::nullopt,
+      thrust::nullopt);
 
   // convert sizes to offsets
   thrust::exclusive_scan(rmm::exec_policy(stream),

From bc649d88f9fe8fc99061271446130f223900093e Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Tue, 30 Mar 2021 16:59:33 -0500
Subject: [PATCH 31/33] Clean up includes in detail/json.hpp. Change copyright
 date back to 2020 in substring.hpp.  Add strings_json doxygen group. Make
 sure JSONPath terminology is used consistently. Other small PR review
 cleanup.

---
 cpp/include/cudf/strings/detail/json.hpp |  4 +---
 cpp/include/cudf/strings/substring.hpp   |  2 +-
 cpp/include/doxygen_groups.h             |  1 +
 cpp/src/strings/json/json_path.cu        | 11 ++++-------
 cpp/tests/CMakeLists.txt                 |  4 ++--
 5 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/strings/detail/json.hpp
index 35b7dec6449..e6a0b49f102 100644
--- a/cpp/include/cudf/strings/detail/json.hpp
+++ b/cpp/include/cudf/strings/detail/json.hpp
@@ -16,12 +16,10 @@
 
 #pragma once
 
-#include <cudf/types.hpp>
+#include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
-#include <memory>
-
 namespace cudf {
 namespace strings {
 namespace detail {
diff --git a/cpp/include/cudf/strings/substring.hpp b/cpp/include/cudf/strings/substring.hpp
index d1431060792..6941615a0c2 100644
--- a/cpp/include/cudf/strings/substring.hpp
+++ b/cpp/include/cudf/strings/substring.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h
index 65dd5c73475..f78ff98d49d 100644
--- a/cpp/include/doxygen_groups.h
+++ b/cpp/include/doxygen_groups.h
@@ -127,6 +127,7 @@
  *   @defgroup strings_modify Modifying
  *   @defgroup strings_replace Replacing
  *   @defgroup strings_split Splitting
+ *   @defgroup strings_json JSON
  * @}
  * @defgroup dictionary_apis Dictionary
  * @{
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index ad598574044..d6ae72fba12 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -173,10 +173,7 @@ class parser {
   char const* pos;
 
  private:
-  CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c)
-  {
-    return c == ' ' || c == '\r' || c == '\n' || c == '\t' ? true : false;
-  }
+  CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c) { return c <= ' ' ? true : false; }
 };
 
 /**
@@ -533,7 +530,7 @@ class path_state : private parser {
     }
 
     // an empty name is not valid
-    CUDF_EXPECTS(name.size_bytes() > 0, "Invalid empty name in JSONpath query string");
+    CUDF_EXPECTS(name.size_bytes() > 0, "Invalid empty name in JSONPath query string");
 
     return true;
   }
@@ -752,7 +749,7 @@ __device__ parse_result parse_json_path(json_state& j_state,
 }
 
 // hardcoding this for now. to reach a stack depth of 8 would require
-// a jsonpath containing 7 nested wildcards so this is probably reasonable.
+// a JSONPath containing 7 nested wildcards so this is probably reasonable.
 constexpr int max_command_stack_depth = 8;
 
 /**
@@ -868,7 +865,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
   // preprocess the json_path into a command buffer
   auto preprocess = build_command_buffer(json_path, stream);
   CUDF_EXPECTS(std::get<1>(preprocess) <= max_command_stack_depth,
-               "Encountered json_path string that is too complex");
+               "Encountered JSONPath string that is too complex");
 
   // allocate output offsets buffer.
   auto offsets = cudf::make_fixed_width_column(
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index dcbba9b7220..79666a3325f 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -333,6 +333,7 @@ ConfigureTest(STRINGS_TEST
     strings/hash_string.cu
     strings/integers_tests.cu
     strings/ipv4_tests.cpp
+    strings/json_tests.cpp
     strings/pad_tests.cpp
     strings/replace_regex_tests.cpp
     strings/replace_tests.cpp
@@ -340,8 +341,7 @@ ConfigureTest(STRINGS_TEST
     strings/strip_tests.cpp
     strings/substring_tests.cpp
     strings/translate_tests.cpp
-    strings/urls_tests.cpp
-    strings/json_tests.cpp)
+    strings/urls_tests.cpp)
 
 ###################################################################################################
 # - structs test ----------------------------------------------------------------------------------

From e69e6bb99fa7da654d3559cdb1c260a5192f05e1 Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Tue, 30 Mar 2021 17:49:00 -0500
Subject: [PATCH 32/33] Use offset_type when dealing with output offsets view.

---
 cpp/src/strings/json/json_path.cu | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index d6ae72fba12..4958848633a 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -799,7 +799,7 @@ template <int block_size>
 __launch_bounds__(block_size) __global__
   void get_json_object_kernel(column_device_view col,
                               path_operator const* const commands,
-                              size_type* output_offsets,
+                              offset_type* output_offsets,
                               thrust::optional<char*> out_buf,
                               thrust::optional<bitmask_type*> out_validity,
                               thrust::optional<size_type*> out_valid_count)
@@ -830,7 +830,7 @@ __launch_bounds__(block_size) __global__
 
     // filled in only during the precompute step. during the compute step, the offsets
     // are fed back in so we do -not- want to write them out
-    if (!out_buf.has_value()) { output_offsets[tid] = static_cast<size_type>(output_size); }
+    if (!out_buf.has_value()) { output_offsets[tid] = static_cast<offset_type>(output_size); }
 
     // validity filled in only during the output step
     if (out_validity.has_value()) {
@@ -892,19 +892,19 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
     <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
       *cdv,
       std::get<0>(preprocess).value().data(),
-      offsets_view.head<size_type>(),
+      offsets_view.head<offset_type>(),
       thrust::nullopt,
       thrust::nullopt,
       thrust::nullopt);
 
   // convert sizes to offsets
   thrust::exclusive_scan(rmm::exec_policy(stream),
-                         offsets_view.head<size_type>(),
-                         offsets_view.head<size_type>() + col.size() + 1,
-                         offsets_view.head<size_type>(),
+                         offsets_view.head<offset_type>(),
+                         offsets_view.head<offset_type>() + col.size() + 1,
+                         offsets_view.head<offset_type>(),
                          0);
   size_type const output_size =
-    cudf::detail::get_value<size_type>(offsets_view, col.size(), stream);
+    cudf::detail::get_value<offset_type>(offsets_view, col.size(), stream);
 
   // allocate output string column
   auto chars = cudf::make_fixed_width_column(
@@ -922,7 +922,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
     <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
       *cdv,
       std::get<0>(preprocess).value().data(),
-      offsets_view.head<size_type>(),
+      offsets_view.head<offset_type>(),
       chars_view.head<char>(),
       static_cast<bitmask_type*>(validity.data()),
       d_valid_count.data());

From 74a71542099822e9211a823ac02d02fee6534a9f Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Tue, 30 Mar 2021 20:16:29 -0500
Subject: [PATCH 33/33] Newline in benchmark CMakeLists.txt.  Remove more
 includes.  Remove unneeded ternary.

---
 cpp/benchmarks/CMakeLists.txt     | 2 +-
 cpp/src/strings/json/json_path.cu | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 4d5914fe1b8..11af408f1c5 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -206,4 +206,4 @@ ConfigureBench(STRINGS_BENCH
 ###################################################################################################
 # - json benchmark -------------------------------------------------------------------
 ConfigureBench(JSON_BENCH
-  string/json_benchmark.cpp)
\ No newline at end of file
+  string/json_benchmark.cpp)
diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu
index 4958848633a..cd8aae12070 100644
--- a/cpp/src/strings/json/json_path.cu
+++ b/cpp/src/strings/json/json_path.cu
@@ -21,14 +21,12 @@
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
 
-#include <io/utilities/column_type_histogram.hpp>
 #include <io/utilities/parsing_utils.cuh>
 
 #include <rmm/device_uvector.hpp>
@@ -173,7 +171,7 @@ class parser {
   char const* pos;
 
  private:
-  CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c) { return c <= ' ' ? true : false; }
+  CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c) { return c <= ' '; }
 };
 
 /**