From b6548797d5ac43585fdfeb6762df6ac7b8d18c8b Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Tue, 2 Feb 2021 17:03:04 -0600 Subject: [PATCH 01/33] Extremely rough draft. --- cpp/include/cudf/strings/detail/substring.hpp | 42 ++ cpp/include/cudf/strings/substring.hpp | 21 +- cpp/src/strings/json/json_path.cu | 460 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 3 +- cpp/tests/strings/json_tests.cpp | 162 ++++++ 5 files changed, 686 insertions(+), 2 deletions(-) create mode 100644 cpp/include/cudf/strings/detail/substring.hpp create mode 100644 cpp/src/strings/json/json_path.cu create mode 100644 cpp/tests/strings/json_tests.cpp diff --git a/cpp/include/cudf/strings/detail/substring.hpp b/cpp/include/cudf/strings/detail/substring.hpp new file mode 100644 index 00000000000..a646d93e2b8 --- /dev/null +++ b/cpp/include/cudf/strings/detail/substring.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +/** + * @copydoc cudf::get_json_object + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr get_json_object( + cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/substring.hpp b/cpp/include/cudf/strings/substring.hpp index 6941615a0c2..06a38360fdb 100644 --- a/cpp/include/cudf/strings/substring.hpp +++ b/cpp/include/cudf/strings/substring.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -197,6 +197,25 @@ std::unique_ptr slice_strings( size_type count, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Apply a JSONPath string to all rows in an input strings column. + * + * Applies a JSONPath string to an incoming strings column where each row in the column + * is a valid json string. The output is returned by row as a strings column. + * + * https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html + * Implements only the operators: $ . [] * + * + * @param col The input strings column. Each row must contain a valid json string + * @param json_path The JSONPath string to be applied to each row + * @param mr Resource for allocating device memory. + * @return New strings column containing the retrieved json object strings + */ +std::unique_ptr get_json_object( + cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu new file mode 100644 index 00000000000..b2b317829e8 --- /dev/null +++ b/cpp/src/strings/json/json_path.cu @@ -0,0 +1,460 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +namespace { + +using namespace cudf; + +CUDA_HOST_DEVICE_CALLABLE bool device_strncmp(const char* str1, const char* str2, size_t num_chars) +{ + for (size_t idx = 0; idx < num_chars; idx++) { + if (str1[idx] != str2[idx]) { return false; } + } + return true; +} + +CUDA_HOST_DEVICE_CALLABLE char const* device_strpbrk(const char* str, + size_t str_size, + const char* tok, + size_t tok_size) +{ + size_t pos = 0; + while (pos < str_size) { + size_t tpos = 0; + char c = str[pos]; + while (tpos < tok_size) { + if (c == tok[tpos]) { return str + pos; } + tpos++; + } + pos++; + } + return nullptr; +} + +struct json_string { + const char* str; + int64_t len; + + CUDA_HOST_DEVICE_CALLABLE bool operator==(json_string const& cmp) + { + return len == cmp.len && str != nullptr && cmp.str != nullptr && + device_strncmp(str, cmp.str, static_cast(len)); + } +}; + +enum json_element_type { + NONE, + OBJECT, + ARRAY, +}; + +class parser { + protected: + CUDA_HOST_DEVICE_CALLABLE parser() : input(nullptr), input_len(0), pos(nullptr) {} + CUDA_HOST_DEVICE_CALLABLE parser(const char* _input, int64_t _input_len) + : input(_input), input_len(_input_len), pos(_input) + { + parse_whitespace(); + } + + CUDA_HOST_DEVICE_CALLABLE bool parse_whitespace() + { + while (!eof()) { + char c = *pos; + if (c == ' ' || c == '\r' || c == '\n' || c == '\t') { + pos++; + } else { + return true; + } + } + return false; + } + + CUDA_HOST_DEVICE_CALLABLE bool eof(const char* p) { return p - input >= input_len; } + + CUDA_HOST_DEVICE_CALLABLE bool eof() { return eof(pos); } + + CUDA_HOST_DEVICE_CALLABLE bool parse_name(json_string& name, json_string& terminators) + { + char c = *pos; + switch (c) { + case '*': + name.str = pos; + name.len = 1; + pos++; + return true; + + default: { + size_t const chars_left = input_len - (pos - input); + char const* end = device_strpbrk(pos, chars_left, terminators.str, terminators.len); + if (end) { + name.str = pos; + name.len = end - pos; + pos = end; + } else { + name.str = pos; + name.len = chars_left; + pos = input + input_len; + } + return true; + } break; + } + + return false; + } + + protected: + char const* input; + int64_t input_len; + char const* pos; +}; + +class json_state : private parser { + public: + CUDA_HOST_DEVICE_CALLABLE json_state() + : parser(), element(json_element_type::NONE), cur_el_start(nullptr) + { + } + CUDA_HOST_DEVICE_CALLABLE json_state(const char* _input, int64_t _input_len) + : parser(_input, _input_len), element(json_element_type::NONE), cur_el_start(nullptr) + { + } + + CUDA_HOST_DEVICE_CALLABLE bool next_match(json_string& str, json_state& child) + { + json_string name; + if (!parse_string(name, true)) { return false; } + if ((str.len == 1 && str.str[0] == '*') || str == name) { + // if this isn't an empty string, parse out the : + if (name.len > 0) { + if (!parse_whitespace() || *pos != ':') { return false; } + pos++; + } + + // we have a match on the name, so advance to the beginning of the next element + if (parse_whitespace()) { + switch (*pos) { + case '[': element = ARRAY; break; + + case '{': element = OBJECT; break; + + default: return false; + } + cur_el_start = pos++; + + // success + child = *this; + return true; + } + } + return false; + } + + CUDA_HOST_DEVICE_CALLABLE json_string extract_element() + { + // collapse the current element into a json_string + int obj_count = 0; + int arr_count = 0; + + char const* start = cur_el_start; + char const* end = start; + while (!eof(end)) { + char c = *end++; + switch (c) { + case '{': obj_count++; break; + case '}': obj_count--; break; + case '[': arr_count++; break; + case ']': arr_count--; break; + default: break; + } + if (obj_count == 0 && arr_count == 0) { break; } + } + pos = end; + + return {start, end - start}; + } + + json_element_type element; + + private: + CUDA_HOST_DEVICE_CALLABLE bool parse_string(json_string& str, bool can_be_empty) + { + str.str = nullptr; + str.len = 0; + + if (parse_whitespace()) { + if (*pos == '\"') { + const char* start = ++pos; + while (!eof()) { + if (*pos == '\"') { + str.str = start; + str.len = pos - start; + pos++; + return true; + } + pos++; + } + } + } + + return can_be_empty ? true : false; + } + const char* cur_el_start; +}; + +enum path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END }; + +// constexpr max_name_len (63) +struct path_operator { + path_operator_type type; + json_string name; + int index; +}; + +// current state of the JSONPath +class path_state : private parser { + public: + CUDA_HOST_DEVICE_CALLABLE path_state() : parser() {} + CUDA_HOST_DEVICE_CALLABLE path_state(const char* _path, size_t _path_len) + : parser(_path, _path_len) + { + } + + CUDA_HOST_DEVICE_CALLABLE path_operator get_next_operator() + { + if (eof()) { return {END}; } + + char c = parse_char(); + switch (c) { + case '$': return {ROOT}; + + case '.': { + path_operator op; + json_string term{".[", 2}; + if (parse_name(op.name, term)) { + if (op.name.len == 1 && op.name.str[0] == '*') { + op.type = CHILD_WILDCARD; + } else { + op.type = CHILD; + } + return op; + } + } break; + + // 3 ways this can be used + // indices: [0] + // name: ['book'] + // wildcard: [*] + case '[': { + path_operator op; + json_string term{"]", 1}; + if (parse_name(op.name, term)) { + pos++; + if (op.name.len == 1 && op.name.str[0] == '*') { + op.type = CHILD_WILDCARD; + } else { + // unhandled cases + break; + } + return op; + } + } break; + + default: break; + } + return {ERROR}; + } + + private: + CUDA_HOST_DEVICE_CALLABLE char parse_char() { return *pos++; } +}; + +struct json_output { + size_t output_max_len; + size_t output_len; + char* output; + + CUDA_HOST_DEVICE_CALLABLE void add_output(const char* str, size_t len) + { + if (output != nullptr) { + // assert output_len + len < output_max_len + memcpy(output + output_len, str, len); + } + output_len += len; + } + + CUDA_HOST_DEVICE_CALLABLE void add_output(json_string str) { add_output(str.str, str.len); } +}; + +CUDA_HOST_DEVICE_CALLABLE void parse_json_path(json_state& j_state, + path_state p_state, + json_output& output) +{ + path_operator op = p_state.get_next_operator(); + + switch (op.type) { + // whatever the first object is + case ROOT: { + json_state child; + json_string wildcard{"*", 1}; + if (j_state.next_match(wildcard, child)) { parse_json_path(child, p_state, output); } + } break; + + // .name + // ['name'] + // [1] + // will return a single thing + case CHILD: { + json_state child; + if (j_state.next_match(op.name, child)) { parse_json_path(child, p_state, output); } + } break; + + // .* + // [*] + // will return an array of things + case CHILD_WILDCARD: { + output.add_output("[\n", 2); + + json_state child; + int count = 0; + while (j_state.next_match(op.name, child)) { + if (count > 0) { output.add_output(",\n", 2); } + parse_json_path(child, p_state, output); + j_state = child; + count++; + } + output.add_output("]\n", 2); + } break; + + // some sort of error. + case ERROR: break; + + // END case + default: output.add_output(j_state.extract_element()); break; + } +} + +CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input, + size_t input_len, + char const* path, + size_t path_len, + char* out_buf, + size_t out_buf_size) +{ + // TODO: add host-side code to verify path is a valid string. + json_state j_state(input, input_len); + path_state p_state(path, path_len); + json_output output{out_buf_size, 0, out_buf}; + + parse_json_path(j_state, p_state, output); + + return output; +} + +__global__ void get_json_object_kernel(char const* chars, + size_type const* offsets, + char const* json_path, + size_t json_path_len, + size_type* output_offsets, + char* out_buf, + size_t out_buf_size) +{ + uint64_t const tid = threadIdx.x + (blockDim.x * blockIdx.x); + + json_output out = get_json_object_single(chars + offsets[tid], + offsets[tid + 1] - offsets[tid], + json_path, + json_path_len, + out_buf, + out_buf_size); + + // filled in only during the precompute step + if (output_offsets != nullptr) { output_offsets[tid] = static_cast(out.output_len); } +} + +std::unique_ptr get_json_object(cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + size_t stack_size; + cudaDeviceGetLimit(&stack_size, cudaLimitStackSize); + cudaDeviceSetLimit(cudaLimitStackSize, 2048); + + auto offsets = cudf::make_fixed_width_column( + data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr); + cudf::mutable_column_view offsets_view(*offsets); + + cudf::detail::grid_1d const grid{1, col.size()}; + + // preprocess sizes + get_json_object_kernel<<>>( + col.chars().head(), + col.offsets().head(), + json_path.data(), + json_path.size(), + offsets_view.head(), + nullptr, + 0); + + // convert sizes to offsets + thrust::exclusive_scan(rmm::exec_policy(stream), + offsets_view.head(), + offsets_view.head() + col.size() + 1, + offsets_view.head(), + 0); + size_type output_size = cudf::detail::get_value(offsets_view, col.size(), stream); + + // allocate output string column + auto chars = cudf::make_fixed_width_column( + data_type{type_id::INT8}, output_size, mask_state::UNALLOCATED, stream, mr); + + // compute results + cudf::mutable_column_view chars_view(*chars); + get_json_object_kernel<<>>( + col.chars().head(), + col.offsets().head(), + json_path.data(), + json_path.size(), + nullptr, + chars_view.head(), + output_size); + + // reset back to original stack size + cudaDeviceSetLimit(cudaLimitStackSize, stack_size); + + return make_strings_column(col.size(), + std::move(offsets), + std::move(chars), + UNKNOWN_NULL_COUNT, + rmm::device_buffer{}, + stream, + mr); +} + +} // namespace +} // namespace detail + +std::unique_ptr get_json_object(cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::mr::device_memory_resource* mr) +{ + return detail::get_json_object(col, json_path, 0, mr); +} + +} // namespace strings +} // namespace cudf \ No newline at end of file diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 8395a3cc1f2..08899c09d5c 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -585,7 +585,8 @@ set(STRINGS_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/strings/strip_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/strings/substring_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/strings/translate_tests.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/strings/urls_tests.cpp") + "${CMAKE_CURRENT_SOURCE_DIR}/strings/urls_tests.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/strings/json_tests.cpp") ConfigureTest(STRINGS_TEST "${STRINGS_TEST_SRC}") diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp new file mode 100644 index 00000000000..f114738b345 --- /dev/null +++ b/cpp/tests/strings/json_tests.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +/* +const char* json_string = "{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + }, + { + "category": "fiction", + "author": "Herman Melville", + "title": "Moby Dick", + "isbn": "0-553-21311-3", + "price": 8.99 + }, + { + "category": "fiction", + "author": "J. R. R. Tolkien", + "title": "The Lord of the Rings", + "isbn": "0-395-19395-8", + "price": 22.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, +}"; +*/ + +struct JsonTests : public cudf::test::BaseFixture { +}; + +TEST_F(JsonTests, GetJsonObject) +{ + // reference: https://jsonpath.herokuapp.com/ + // clang-format off + /* + { + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Evelyn Waugh", + "title": "Sword of Honour", + "price": 12.99 + }, + { + "category": "fiction", + "author": "Herman Melville", + "title": "Moby Dick", + "isbn": "0-553-21311-3", + "price": 8.99 + }, + { + "category": "fiction", + "author": "J. R. R. Tolkien", + "title": "The Lord of the Rings", + "isbn": "0-395-19395-8", + "price": 22.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 + } + */ + // clang-format on + // this string is formatted to result in a reasonably readable debug printf + const char* json_string = + "{\n\"store\": {\n\t\"book\": [\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": " + "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": " + "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn " + "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": " + "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman " + "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": " + "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": " + "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the " + "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": " + "22.99\n\t\t}\n\t],\n\t\"bicycle\": {\n\t\t\"color\": \"red\",\n\t\t\"price\": " + "19.95\n\t}\n},\n\"expensive\": 10\n}"; + + { + cudf::test::strings_column_wrapper input{json_string}; + cudf::string_scalar json_path("$"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::print(*result); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + cudf::string_scalar json_path("$.store"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::print(*result); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + cudf::string_scalar json_path("$.store.book"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::print(*result); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + cudf::string_scalar json_path("$.*"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::print(*result); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + cudf::string_scalar json_path("$[*]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::print(*result); + } +} From 36cd4c1356045472048d8591a5455365ea9b4255 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Wed, 10 Feb 2021 17:31:13 -0600 Subject: [PATCH 02/33] Add support for full set of operators I believe we will need to support. Code is still purely naive and probably doesn't handle all possible error conditions well. --- cpp/src/io/utilities/parsing_utils.cuh | 2 + cpp/src/strings/json/json_path.cu | 567 +++++++++++++++++++------ cpp/tests/strings/json_tests.cpp | 44 +- 3 files changed, 475 insertions(+), 138 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index c7f405e1cc0..d8a6cb70a10 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -20,6 +20,8 @@ #include #include +#include + #include using cudf::detail::device_span; diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index b2b317829e8..9267d933368 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -7,6 +7,8 @@ #include #include +#include + #include #include @@ -19,6 +21,108 @@ namespace { using namespace cudf; +CUDA_HOST_DEVICE_CALLABLE char to_lower(char const c) +{ + return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; +} + +template ::value>* = nullptr> +CUDA_HOST_DEVICE_CALLABLE uint8_t decode_digit(char c, bool* valid_flag) +{ + if (c >= '0' && c <= '9') return c - '0'; + if (c >= 'a' && c <= 'f') return c - 'a' + 10; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + + *valid_flag = false; + return 0; +} + +template ::value>* = nullptr> +CUDA_HOST_DEVICE_CALLABLE uint8_t decode_digit(char c, bool* valid_flag) +{ + if (c >= '0' && c <= '9') return c - '0'; + + *valid_flag = false; + return 0; +} + +CUDA_HOST_DEVICE_CALLABLE bool is_infinity(char const* begin, char const* end) +{ + if (*begin == '-' || *begin == '+') begin++; + char const* cinf = "infinity"; + auto index = begin; + while (index < end) { + if (*cinf != to_lower(*index)) break; + index++; + cinf++; + } + return ((index == begin + 3 || index == begin + 8) && index >= end); +} + +template +CUDA_HOST_DEVICE_CALLABLE T parse_numeric(const char* begin, + const char* end, + cudf::io::parse_options_view const& opts) +{ + T value{}; + bool all_digits_valid = true; + + // Handle negative values if necessary + int32_t sign = (*begin == '-') ? -1 : 1; + + // Handle infinity + if (std::is_floating_point::value && is_infinity(begin, end)) { + return sign * std::numeric_limits::infinity(); + } + if (*begin == '-' || *begin == '+') begin++; + + // Skip over the "0x" prefix for hex notation + if (base == 16 && begin + 2 < end && *begin == '0' && *(begin + 1) == 'x') { begin += 2; } + + // Handle the whole part of the number + // auto index = begin; + while (begin < end) { + if (*begin == opts.decimal) { + ++begin; + break; + } else if (base == 10 && (*begin == 'e' || *begin == 'E')) { + break; + } else if (*begin != opts.thousands && *begin != '+') { + value = (value * base) + decode_digit(*begin, &all_digits_valid); + } + ++begin; + } + + if (std::is_floating_point::value) { + // Handle fractional part of the number if necessary + double divisor = 1; + while (begin < end) { + if (*begin == 'e' || *begin == 'E') { + ++begin; + break; + } else if (*begin != opts.thousands && *begin != '+') { + divisor /= base; + value += decode_digit(*begin, &all_digits_valid) * divisor; + } + ++begin; + } + + // Handle exponential part of the number if necessary + if (begin < end) { + const int32_t exponent_sign = *begin == '-' ? -1 : 1; + if (*begin == '-' || *begin == '+') { ++begin; } + int32_t exponent = 0; + while (begin < end) { + exponent = (exponent * 10) + decode_digit(*(begin++), &all_digits_valid); + } + if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); } + } + } + if (!all_digits_valid) { return std::numeric_limits::quiet_NaN(); } + + return value * sign; +} + CUDA_HOST_DEVICE_CALLABLE bool device_strncmp(const char* str1, const char* str2, size_t num_chars) { for (size_t idx = 0; idx < num_chars; idx++) { @@ -56,12 +160,36 @@ struct json_string { } }; -enum json_element_type { - NONE, - OBJECT, - ARRAY, +enum class parse_result { + ERROR, + SUCCESS, + EMPTY, +}; + +enum json_element_type { NONE, OBJECT, ARRAY, VALUE }; + +struct json_output { + size_t output_max_len; + size_t output_len; + char* output; + + CUDA_HOST_DEVICE_CALLABLE void add_output(const char* str, size_t len) + { + if (output != nullptr) { + // assert output_len + len < output_max_len + memcpy(output + output_len, str, len); + } + output_len += len; + } + + CUDA_HOST_DEVICE_CALLABLE void add_output(json_string str) { add_output(str.str, str.len); } }; +CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c) +{ + return c == ' ' || c == '\r' || c == '\n' || c == '\t' ? true : false; +} + class parser { protected: CUDA_HOST_DEVICE_CALLABLE parser() : input(nullptr), input_len(0), pos(nullptr) {} @@ -71,11 +199,13 @@ class parser { parse_whitespace(); } + CUDA_HOST_DEVICE_CALLABLE bool eof(const char* p) { return p - input >= input_len; } + CUDA_HOST_DEVICE_CALLABLE bool eof() { return eof(pos); } + CUDA_HOST_DEVICE_CALLABLE bool parse_whitespace() { while (!eof()) { - char c = *pos; - if (c == ' ' || c == '\r' || c == '\n' || c == '\t') { + if (is_whitespace(*pos)) { pos++; } else { return true; @@ -84,37 +214,73 @@ class parser { return false; } - CUDA_HOST_DEVICE_CALLABLE bool eof(const char* p) { return p - input >= input_len; } + CUDA_HOST_DEVICE_CALLABLE parse_result parse_string(json_string& str, + bool can_be_empty, + char quote) + { + str.str = nullptr; + str.len = 0; - CUDA_HOST_DEVICE_CALLABLE bool eof() { return eof(pos); } + if (parse_whitespace()) { + if (*pos == quote) { + const char* start = ++pos; + while (!eof()) { + if (*pos == quote) { + str.str = start; + str.len = pos - start; + pos++; + return parse_result::SUCCESS; + } + pos++; + } + } + } - CUDA_HOST_DEVICE_CALLABLE bool parse_name(json_string& name, json_string& terminators) + return can_be_empty ? parse_result::EMPTY : parse_result::ERROR; + } + + // a name means: + // - a string followed by a : + // - no string + CUDA_HOST_DEVICE_CALLABLE parse_result parse_name(json_string& name, + bool can_be_empty, + char quote) { - char c = *pos; - switch (c) { - case '*': - name.str = pos; - name.len = 1; + if (parse_string(name, can_be_empty, quote) == parse_result::ERROR) { + return parse_result::ERROR; + } + + // if we got a real string, the next char must be a : + if (name.len > 0) { + if (!parse_whitespace()) { return parse_result::ERROR; } + if (*pos == ':') { pos++; - return true; + return parse_result::SUCCESS; + } + } + return parse_result::EMPTY; + } - default: { - size_t const chars_left = input_len - (pos - input); - char const* end = device_strpbrk(pos, chars_left, terminators.str, terminators.len); - if (end) { - name.str = pos; - name.len = end - pos; - pos = end; - } else { - name.str = pos; - name.len = chars_left; - pos = input + input_len; - } - return true; - } break; + // this function is not particularly strong + CUDA_HOST_DEVICE_CALLABLE parse_result parse_number(json_string& val) + { + if (!parse_whitespace()) { return parse_result::ERROR; } + + // parse to the end of the number (does not do any error checking on whether + // the number is reasonably formed or not) + char const* start = pos; + char const* end = start; + while (!eof(end)) { + char c = *end; + if (c == ',' || is_whitespace(c)) { break; } + end++; } + pos = end; - return false; + val.str = start; + val.len = {end - start}; + + return parse_result::SUCCESS; } protected: @@ -134,89 +300,153 @@ class json_state : private parser { { } - CUDA_HOST_DEVICE_CALLABLE bool next_match(json_string& str, json_state& child) - { - json_string name; - if (!parse_string(name, true)) { return false; } - if ((str.len == 1 && str.str[0] == '*') || str == name) { - // if this isn't an empty string, parse out the : - if (name.len > 0) { - if (!parse_whitespace() || *pos != ':') { return false; } - pos++; - } + CUDA_HOST_DEVICE_CALLABLE json_state(json_state const& j) { *this = j; } - // we have a match on the name, so advance to the beginning of the next element - if (parse_whitespace()) { - switch (*pos) { - case '[': element = ARRAY; break; + CUDA_HOST_DEVICE_CALLABLE parse_result extract_element(json_output* output) + { + // collapse the current element into a json_string - case '{': element = OBJECT; break; + char const* start = cur_el_start; + char const* end = start; - default: return false; + // if we're a value type, do a simple value parse. + if (cur_el_type == VALUE) { + pos = cur_el_start; + if (parse_value() != parse_result::SUCCESS) { return parse_result::ERROR; } + end = pos; + } + // otherwise, march through everything inside + else { + int obj_count = 0; + int arr_count = 0; + + while (!eof(end)) { + char c = *end++; + // could do some additional checks here. we know our current + // element type, so we could be more strict on what kinds of + // characters we expect to see. + switch (c) { + case '{': obj_count++; break; + case '}': obj_count--; break; + case '[': arr_count++; break; + case ']': arr_count--; break; + default: break; } - cur_el_start = pos++; - - // success - child = *this; - return true; + if (obj_count == 0 && arr_count == 0) { break; } } + pos = end; } - return false; - } - CUDA_HOST_DEVICE_CALLABLE json_string extract_element() - { - // collapse the current element into a json_string - int obj_count = 0; - int arr_count = 0; + // parse trailing , + if (parse_whitespace()) { + if (*pos == ',') { pos++; } + } - char const* start = cur_el_start; - char const* end = start; - while (!eof(end)) { - char c = *end++; - switch (c) { - case '{': obj_count++; break; - case '}': obj_count--; break; - case '[': arr_count++; break; - case ']': arr_count--; break; - default: break; + if (output != nullptr) { + // seems like names are never included with JSONPath unless + // they are nested within the element being returned. + /* + if(cur_el_name.len > 0){ + output->add_output({"\"", 1}); + output->add_output(cur_el_name); + output->add_output({"\"", 1}); + output->add_output({":", 1}); } - if (obj_count == 0 && arr_count == 0) { break; } + */ + output->add_output({start, end - start}); } - pos = end; - - return {start, end - start}; + return parse_result::SUCCESS; } + CUDA_HOST_DEVICE_CALLABLE parse_result skip_element() { return extract_element(nullptr); } + json_element_type element; + CUDA_HOST_DEVICE_CALLABLE parse_result next_element() { return next_element_internal(false); } + + CUDA_HOST_DEVICE_CALLABLE parse_result child_element() { return next_element_internal(true); } + + CUDA_HOST_DEVICE_CALLABLE parse_result next_matching_element(json_string const& name, + bool inclusive) + { + // if we're not including the current element, skip it + if (!inclusive) { + parse_result result = next_element_internal(false); + if (result != parse_result::SUCCESS) { return result; } + } + // loop until we find a match or there's nothing left + do { + // wildcard matches anything + if (name.len == 1 && name.str[0] == '*') { + return parse_result::SUCCESS; + } else if (cur_el_name == name) { + return parse_result::SUCCESS; + } + + // next + parse_result result = next_element_internal(false); + if (result != parse_result::SUCCESS) { return result; } + } while (1); + + return parse_result::ERROR; + } + private: - CUDA_HOST_DEVICE_CALLABLE bool parse_string(json_string& str, bool can_be_empty) + CUDA_HOST_DEVICE_CALLABLE parse_result parse_value() { - str.str = nullptr; - str.len = 0; + if (!parse_whitespace()) { return parse_result::ERROR; } - if (parse_whitespace()) { - if (*pos == '\"') { - const char* start = ++pos; - while (!eof()) { - if (*pos == '\"') { - str.str = start; - str.len = pos - start; - pos++; - return true; - } - pos++; - } - } + // string or number? + json_string unused; + return *pos == '\"' ? parse_string(unused, false, '\"') : parse_number(unused); + } + + CUDA_HOST_DEVICE_CALLABLE parse_result next_element_internal(bool child) + { + // if we're not getting a child element, skip the current element. + // this will leave pos as the first character -after- the close of + // the current element + if (!child && cur_el_start != nullptr) { + if (skip_element() == parse_result::ERROR) { return parse_result::ERROR; } + cur_el_start = nullptr; } + // otherwise pos will be at the first character within the current element + + // what's next + if (!parse_whitespace()) { return parse_result::EMPTY; } + // if we're closing off a parent element, we're done + char c = *pos; + if (c == ']' || c == '}') { return parse_result::EMPTY; } + + // element name, if any + if (parse_name(cur_el_name, true, '\"') == parse_result::ERROR) { return parse_result::ERROR; } + + // element type + if (!parse_whitespace()) { return parse_result::EMPTY; } + switch (*pos) { + case '[': cur_el_type = ARRAY; break; + case '{': cur_el_type = OBJECT; break; + + case ',': + case ':': + case '\'': return parse_result::ERROR; - return can_be_empty ? true : false; + // value type + default: cur_el_type = VALUE; break; + } + pos++; + + // the start of the current element is always at the value, not the name + cur_el_start = pos - 1; + return parse_result::SUCCESS; } + const char* cur_el_start; + json_string cur_el_name; + json_element_type cur_el_type; }; -enum path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END }; +enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END }; // constexpr max_name_len (63) struct path_operator { @@ -233,23 +463,24 @@ class path_state : private parser { : parser(_path, _path_len) { } + CUDA_HOST_DEVICE_CALLABLE path_state(path_state const& p) { *this = p; } CUDA_HOST_DEVICE_CALLABLE path_operator get_next_operator() { - if (eof()) { return {END}; } + if (eof()) { return {path_operator_type::END}; } - char c = parse_char(); + char c = *pos++; switch (c) { - case '$': return {ROOT}; + case '$': return {path_operator_type::ROOT}; case '.': { path_operator op; json_string term{".[", 2}; - if (parse_name(op.name, term)) { + if (parse_path_name(op.name, term)) { if (op.name.len == 1 && op.name.str[0] == '*') { - op.type = CHILD_WILDCARD; + op.type = path_operator_type::CHILD_WILDCARD; } else { - op.type = CHILD; + op.type = path_operator_type::CHILD; } return op; } @@ -262,13 +493,18 @@ class path_state : private parser { case '[': { path_operator op; json_string term{"]", 1}; - if (parse_name(op.name, term)) { + bool is_string = *pos == '\'' ? true : false; + if (parse_path_name(op.name, term)) { pos++; if (op.name.len == 1 && op.name.str[0] == '*') { - op.type = CHILD_WILDCARD; + op.type = path_operator_type::CHILD_WILDCARD; } else { - // unhandled cases - break; + if (is_string) { + op.type = path_operator_type::CHILD; + } else { + op.type = path_operator_type::CHILD_INDEX; + op.index = parse_numeric(op.name.str, op.name.str + op.name.len, json_opts); + } } return op; } @@ -276,76 +512,135 @@ class path_state : private parser { default: break; } - return {ERROR}; + return {path_operator_type::ERROR}; } private: - CUDA_HOST_DEVICE_CALLABLE char parse_char() { return *pos++; } -}; + cudf::io::parse_options_view json_opts{',', '\n', '\"', '.'}; -struct json_output { - size_t output_max_len; - size_t output_len; - char* output; - - CUDA_HOST_DEVICE_CALLABLE void add_output(const char* str, size_t len) + CUDA_HOST_DEVICE_CALLABLE bool parse_path_name(json_string& name, json_string& terminators) { - if (output != nullptr) { - // assert output_len + len < output_max_len - memcpy(output + output_len, str, len); + char c = *pos; + switch (c) { + case '*': + name.str = pos; + name.len = 1; + pos++; + break; + + case '\'': + if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; } + break; + + default: { + size_t const chars_left = input_len - (pos - input); + char const* end = device_strpbrk(pos, chars_left, terminators.str, terminators.len); + if (end) { + name.str = pos; + name.len = end - pos; + pos = end; + } else { + name.str = pos; + name.len = chars_left; + pos = input + input_len; + } + return true; + } } - output_len += len; - } - CUDA_HOST_DEVICE_CALLABLE void add_output(json_string str) { add_output(str.str, str.len); } + // must end in one of the terminators + size_t const chars_left = input_len - (pos - input); + char const* end = device_strpbrk(pos, chars_left, terminators.str, terminators.len); + if (!end) { return false; } + pos = end; + return true; + } }; -CUDA_HOST_DEVICE_CALLABLE void parse_json_path(json_state& j_state, - path_state p_state, - json_output& output) +CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state, + path_state p_state, + json_output& output, + bool list_element = false) { path_operator op = p_state.get_next_operator(); switch (op.type) { // whatever the first object is - case ROOT: { - json_state child; - json_string wildcard{"*", 1}; - if (j_state.next_match(wildcard, child)) { parse_json_path(child, p_state, output); } - } break; + case path_operator_type::ROOT: + if (j_state.next_element() != parse_result::ERROR) { + return parse_json_path(j_state, p_state, output); + } + break; // .name // ['name'] // [1] // will return a single thing - case CHILD: { - json_state child; - if (j_state.next_match(op.name, child)) { parse_json_path(child, p_state, output); } + case path_operator_type::CHILD: { + parse_result res = j_state.child_element(); + if (res != parse_result::SUCCESS) { return res; } + res = j_state.next_matching_element(op.name, true); + if (res != parse_result::SUCCESS) { return res; } + return parse_json_path(j_state, p_state, output, list_element); } break; // .* // [*] // will return an array of things - case CHILD_WILDCARD: { + case path_operator_type::CHILD_WILDCARD: { output.add_output("[\n", 2); - json_state child; + parse_result res = j_state.child_element(); + if (res == parse_result::ERROR) { return parse_result::ERROR; } + if (res == parse_result::EMPTY) { + output.add_output("]\n", 2); + return parse_result::SUCCESS; + } + + res = j_state.next_matching_element(op.name, true); int count = 0; - while (j_state.next_match(op.name, child)) { - if (count > 0) { output.add_output(",\n", 2); } - parse_json_path(child, p_state, output); - j_state = child; - count++; + while (res == parse_result::SUCCESS) { + json_state j_sub(j_state); + path_state p_sub(p_state); + parse_result sub_res = parse_json_path(j_sub, p_sub, output, count > 0 ? true : false); + if (sub_res == parse_result::ERROR) { return parse_result::ERROR; } + if (sub_res != parse_result::EMPTY) { count++; } + res = j_state.next_matching_element(op.name, false); } + + if (res == parse_result::ERROR) { return parse_result::ERROR; } + output.add_output("]\n", 2); + return parse_result::SUCCESS; + } break; + + // [0] + // [1] + // etc + // returns a single thing + case path_operator_type::CHILD_INDEX: { + parse_result res = j_state.child_element(); + if (res != parse_result::SUCCESS) { return res; } + json_string any{"*", 1}; + res = j_state.next_matching_element(any, true); + if (res != parse_result::SUCCESS) { return res; } + for (int idx = 1; idx <= op.index; idx++) { + res = j_state.next_matching_element(any, false); + if (res != parse_result::SUCCESS) { return res; } + } + return parse_json_path(j_state, p_state, output, list_element); } break; // some sort of error. - case ERROR: break; + case path_operator_type::ERROR: return parse_result::ERROR; break; // END case - default: output.add_output(j_state.extract_element()); break; + default: { + if (list_element) { output.add_output({",\n", 2}); } + if (j_state.extract_element(&output) == parse_result::ERROR) { return parse_result::ERROR; } + } break; } + return parse_result::SUCCESS; } CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input, @@ -393,7 +688,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c { size_t stack_size; cudaDeviceGetLimit(&stack_size, cudaLimitStackSize); - cudaDeviceSetLimit(cudaLimitStackSize, 2048); + cudaDeviceSetLimit(cudaLimitStackSize, 4096); auto offsets = cudf::make_fixed_width_column( data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index f114738b345..015e50f910a 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -146,7 +146,7 @@ TEST_F(JsonTests, GetJsonObject) { cudf::test::strings_column_wrapper input{json_string}; - cudf::string_scalar json_path("$.*"); + cudf::string_scalar json_path("$.store.*"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::print(*result); @@ -154,7 +154,47 @@ TEST_F(JsonTests, GetJsonObject) { cudf::test::strings_column_wrapper input{json_string}; - cudf::string_scalar json_path("$[*]"); + cudf::string_scalar json_path("$.store.book[*]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::print(*result); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + cudf::string_scalar json_path("$.store.book[*].category"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::print(*result); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + cudf::string_scalar json_path("$.store.book[*].title"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::print(*result); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + cudf::string_scalar json_path("$.store['bicycle']"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::print(*result); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + cudf::string_scalar json_path("$.store.book[*]['isbn']"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::print(*result); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + cudf::string_scalar json_path("$.store.book[2]"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::print(*result); From adb572425fb5ed5b647bb4ee2c2e3e36f41f88fe Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Thu, 11 Feb 2021 13:40:04 -0600 Subject: [PATCH 03/33] Optimization: preprocess the json path into a simple command buffer instead of doing the parsing on the gpu. --- cpp/include/cudf/strings/substring.hpp | 2 +- cpp/src/strings/json/json_path.cu | 88 +++++++++++++++++--------- cpp/tests/strings/json_tests.cpp | 20 +++--- 3 files changed, 68 insertions(+), 42 deletions(-) diff --git a/cpp/include/cudf/strings/substring.hpp b/cpp/include/cudf/strings/substring.hpp index 06a38360fdb..100dd80b3c2 100644 --- a/cpp/include/cudf/strings/substring.hpp +++ b/cpp/include/cudf/strings/substring.hpp @@ -213,7 +213,7 @@ std::unique_ptr slice_strings( */ std::unique_ptr get_json_object( cudf::strings_column_view const& col, - cudf::string_scalar const& json_path, + std::string const& json_path, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 9267d933368..fc98ee25ee6 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -9,6 +9,7 @@ #include +#include #include #include @@ -448,12 +449,20 @@ class json_state : private parser { enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END }; -// constexpr max_name_len (63) struct path_operator { + CUDA_HOST_DEVICE_CALLABLE path_operator() : type(path_operator_type::ERROR), index(-1) {} + CUDA_HOST_DEVICE_CALLABLE path_operator(path_operator_type _type) : type(_type), index(-1) {} + path_operator_type type; json_string name; int index; }; +struct command_buffer { + rmm::device_uvector commands; + // used as backing memory for the name fields inside the + // path_operator objects + string_scalar json_path; +}; // current state of the JSONPath class path_state : private parser { @@ -548,27 +557,50 @@ class path_state : private parser { } } - // must end in one of the terminators - size_t const chars_left = input_len - (pos - input); - char const* end = device_strpbrk(pos, chars_left, terminators.str, terminators.len); - if (!end) { return false; } - pos = end; return true; } }; +command_buffer build_command_buffer(std::string const& json_path, rmm::cuda_stream_view stream) +{ + path_state p_state(json_path.data(), static_cast(json_path.size())); + + std::vector h_operators; + cudf::string_scalar d_json_path(json_path); + + path_operator op; + do { + op = p_state.get_next_operator(); + if (op.type == path_operator_type::ERROR) { + CUDF_FAIL("Encountered invalid JSONPath input string"); + } + // convert pointer to device pointer + if (op.name.len > 0) { op.name.str = d_json_path.data() + (op.name.str - json_path.data()); } + h_operators.push_back(op); + } while (op.type != path_operator_type::END); + + rmm::device_uvector d_operators(h_operators.size(), stream); + cudaMemcpyAsync(d_operators.data(), + h_operators.data(), + sizeof(path_operator) * h_operators.size(), + cudaMemcpyHostToDevice, + stream.value()); + + return {std::move(d_operators), std::move(d_json_path)}; +} + CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state, - path_state p_state, + path_operator const* commands, json_output& output, bool list_element = false) { - path_operator op = p_state.get_next_operator(); + path_operator op = *commands; switch (op.type) { // whatever the first object is case path_operator_type::ROOT: if (j_state.next_element() != parse_result::ERROR) { - return parse_json_path(j_state, p_state, output); + return parse_json_path(j_state, commands + 1, output); } break; @@ -581,7 +613,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state, if (res != parse_result::SUCCESS) { return res; } res = j_state.next_matching_element(op.name, true); if (res != parse_result::SUCCESS) { return res; } - return parse_json_path(j_state, p_state, output, list_element); + return parse_json_path(j_state, commands + 1, output, list_element); } break; // .* @@ -601,8 +633,8 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state, int count = 0; while (res == parse_result::SUCCESS) { json_state j_sub(j_state); - path_state p_sub(p_state); - parse_result sub_res = parse_json_path(j_sub, p_sub, output, count > 0 ? true : false); + parse_result sub_res = + parse_json_path(j_sub, commands + 1, output, count > 0 ? true : false); if (sub_res == parse_result::ERROR) { return parse_result::ERROR; } if (sub_res != parse_result::EMPTY) { count++; } res = j_state.next_matching_element(op.name, false); @@ -628,7 +660,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state, res = j_state.next_matching_element(any, false); if (res != parse_result::SUCCESS) { return res; } } - return parse_json_path(j_state, p_state, output, list_element); + return parse_json_path(j_state, commands + 1, output, list_element); } break; // some sort of error. @@ -645,47 +677,43 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state, CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input, size_t input_len, - char const* path, - size_t path_len, + path_operator const* commands, char* out_buf, size_t out_buf_size) { // TODO: add host-side code to verify path is a valid string. json_state j_state(input, input_len); - path_state p_state(path, path_len); json_output output{out_buf_size, 0, out_buf}; - parse_json_path(j_state, p_state, output); + parse_json_path(j_state, commands, output); return output; } __global__ void get_json_object_kernel(char const* chars, size_type const* offsets, - char const* json_path, - size_t json_path_len, + path_operator const* commands, size_type* output_offsets, char* out_buf, size_t out_buf_size) { uint64_t const tid = threadIdx.x + (blockDim.x * blockIdx.x); - json_output out = get_json_object_single(chars + offsets[tid], - offsets[tid + 1] - offsets[tid], - json_path, - json_path_len, - out_buf, - out_buf_size); + json_output out = get_json_object_single( + chars + offsets[tid], offsets[tid + 1] - offsets[tid], commands, out_buf, out_buf_size); // filled in only during the precompute step if (output_offsets != nullptr) { output_offsets[tid] = static_cast(out.output_len); } } std::unique_ptr get_json_object(cudf::strings_column_view const& col, - cudf::string_scalar const& json_path, + std::string const& json_path, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + // preprocess the json_path into a command buffer + command_buffer cmd_buf = build_command_buffer(json_path, stream); + size_t stack_size; cudaDeviceGetLimit(&stack_size, cudaLimitStackSize); cudaDeviceSetLimit(cudaLimitStackSize, 4096); @@ -700,8 +728,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c get_json_object_kernel<<>>( col.chars().head(), col.offsets().head(), - json_path.data(), - json_path.size(), + cmd_buf.commands.data(), offsets_view.head(), nullptr, 0); @@ -723,8 +750,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c get_json_object_kernel<<>>( col.chars().head(), col.offsets().head(), - json_path.data(), - json_path.size(), + cmd_buf.commands.data(), nullptr, chars_view.head(), output_size); @@ -745,7 +771,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c } // namespace detail std::unique_ptr get_json_object(cudf::strings_column_view const& col, - cudf::string_scalar const& json_path, + std::string const& json_path, rmm::mr::device_memory_resource* mr) { return detail::get_json_object(col, json_path, 0, mr); diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index 015e50f910a..b00fb39158c 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -122,7 +122,7 @@ TEST_F(JsonTests, GetJsonObject) { cudf::test::strings_column_wrapper input{json_string}; - cudf::string_scalar json_path("$"); + std::string json_path("$"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::print(*result); @@ -130,7 +130,7 @@ TEST_F(JsonTests, GetJsonObject) { cudf::test::strings_column_wrapper input{json_string}; - cudf::string_scalar json_path("$.store"); + std::string json_path("$.store"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::print(*result); @@ -138,7 +138,7 @@ TEST_F(JsonTests, GetJsonObject) { cudf::test::strings_column_wrapper input{json_string}; - cudf::string_scalar json_path("$.store.book"); + std::string json_path("$.store.book"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::print(*result); @@ -146,7 +146,7 @@ TEST_F(JsonTests, GetJsonObject) { cudf::test::strings_column_wrapper input{json_string}; - cudf::string_scalar json_path("$.store.*"); + std::string json_path("$.store.*"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::print(*result); @@ -154,7 +154,7 @@ TEST_F(JsonTests, GetJsonObject) { cudf::test::strings_column_wrapper input{json_string}; - cudf::string_scalar json_path("$.store.book[*]"); + std::string json_path("$.store.book[*]"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::print(*result); @@ -162,7 +162,7 @@ TEST_F(JsonTests, GetJsonObject) { cudf::test::strings_column_wrapper input{json_string}; - cudf::string_scalar json_path("$.store.book[*].category"); + std::string json_path("$.store.book[*].category"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::print(*result); @@ -170,7 +170,7 @@ TEST_F(JsonTests, GetJsonObject) { cudf::test::strings_column_wrapper input{json_string}; - cudf::string_scalar json_path("$.store.book[*].title"); + std::string json_path("$.store.book[*].title"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::print(*result); @@ -178,7 +178,7 @@ TEST_F(JsonTests, GetJsonObject) { cudf::test::strings_column_wrapper input{json_string}; - cudf::string_scalar json_path("$.store['bicycle']"); + std::string json_path("$.store['bicycle']"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::print(*result); @@ -186,7 +186,7 @@ TEST_F(JsonTests, GetJsonObject) { cudf::test::strings_column_wrapper input{json_string}; - cudf::string_scalar json_path("$.store.book[*]['isbn']"); + std::string json_path("$.store.book[*]['isbn']"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::print(*result); @@ -194,7 +194,7 @@ TEST_F(JsonTests, GetJsonObject) { cudf::test::strings_column_wrapper input{json_string}; - cudf::string_scalar json_path("$.store.book[2]"); + std::string json_path("$.store.book[2]"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::print(*result); From ec7ab4a07c6854678a382bdc34913b1e7133965f Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Wed, 17 Feb 2021 10:13:10 -0600 Subject: [PATCH 04/33] Fix incorrect interface in detail header. --- cpp/include/cudf/strings/detail/substring.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/strings/detail/substring.hpp b/cpp/include/cudf/strings/detail/substring.hpp index a646d93e2b8..3e7f6fea0d0 100644 --- a/cpp/include/cudf/strings/detail/substring.hpp +++ b/cpp/include/cudf/strings/detail/substring.hpp @@ -33,7 +33,7 @@ namespace detail { */ std::unique_ptr get_json_object( cudf::strings_column_view const& col, - cudf::string_scalar const& json_path, + std::string const& json_path, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); From 6d94a73cbe6a70640e6bcb82ff85d5a7c681dd0d Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Thu, 18 Feb 2021 14:20:48 -0600 Subject: [PATCH 05/33] Add benchmarks for get_json_object(). Couple of bug fixes. --- cpp/benchmarks/CMakeLists.txt | 8 ++ cpp/benchmarks/string/json_benchmark.cpp | 140 +++++++++++++++++++++++ cpp/src/strings/json/json_path.cu | 27 +++-- 3 files changed, 165 insertions(+), 10 deletions(-) create mode 100644 cpp/benchmarks/string/json_benchmark.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 7b5c092f9c6..fe3ad177e9e 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -309,3 +309,11 @@ set(STRINGS_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/string/convert_durations_benchmark.cpp") ConfigureBench(STRINGS_BENCH "${STRINGS_BENCH_SRC}") + +################################################################################################### +# - json benchmark ------------------------------------------------------------------- + +set(JSON_BENCH_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/string/json_benchmark.cpp") + +ConfigureBench(JSON_BENCH "${JSON_BENCH_SRC}") diff --git a/cpp/benchmarks/string/json_benchmark.cpp b/cpp/benchmarks/string/json_benchmark.cpp new file mode 100644 index 00000000000..df1aadef404 --- /dev/null +++ b/cpp/benchmarks/string/json_benchmark.cpp @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +#include +#include + +class JsonPath : public cudf::benchmark { +}; + +float frand() { return static_cast(rand()) / static_cast(RAND_MAX); } + +int rand_range(int min, int max) { return min + static_cast(frand() * (max - min)); } + +std::vector Books{ + "{\n\"category\": \"reference\",\n\"author\": \"Nigel Rees\",\n\"title\": \"Sayings of the " + "Century\",\n\"price\": 8.95\n}", + "{\n\"category\": \"fiction\",\n\"author\": \"Evelyn Waugh\",\n\"title\": \"Sword of " + "Honour\",\n\"price\": 12.99\n}", + "{\n\"category\": \"fiction\",\n\"author\": \"Herman Melville\",\n\"title\": \"Moby " + "Dick\",\n\"isbn\": \"0-553-21311-3\",\n\"price\": 8.99\n}", + "{\n\"category\": \"fiction\",\n\"author\": \"J. R. R. Tolkien\",\n\"title\": \"The Lord of the " + "Rings\",\n\"isbn\": \"0-395-19395-8\",\n\"price\": 22.99\n}"}; +constexpr int Approx_book_size = 110; +std::vector Bicycles{ + "{\"color\": \"red\", \"price\": 9.95}", + "{\"color\": \"green\", \"price\": 29.95}", + "{\"color\": \"blue\", \"price\": 399.95}", + "{\"color\": \"yellow\", \"price\": 99.95}", + "{\"color\": \"mauve\", \"price\": 199.95}", +}; +constexpr int Approx_bicycle_size = 33; +std::string Misc{"\n\"expensive\": 10\n"}; +std::string generate_field(std::vector const& values, int num_values) +{ + std::string res; + for (int idx = 0; idx < num_values; idx++) { + if (idx > 0) { res += std::string(",\n"); } + int vindex = std::min(static_cast(floor(frand() * values.size())), + static_cast(values.size() - 1)); + res += values[vindex]; + } + return res; +} + +std::string build_row(int desired_bytes) +{ + // always have at least 2 books and 2 bikes + int num_books = 2; + int num_bicycles = 2; + int remaining_bytes = + desired_bytes - ((num_books * Approx_book_size) + (num_bicycles * Approx_bicycle_size)); + + // divide up the remainder between books and bikes + float book_pct = frand(); + float bicycle_pct = 1.0f - book_pct; + num_books += (remaining_bytes * book_pct) / Approx_book_size; + num_bicycles += (remaining_bytes * bicycle_pct) / Approx_bicycle_size; + + std::string books = "\"book\": [\n" + generate_field(Books, num_books) + "]\n"; + std::string bicycles = "\"bicycle\": [\n" + generate_field(Bicycles, num_bicycles) + "]\n"; + + std::string store = "\"store\": {\n"; + if (frand() <= 0.5f) { + store += books + std::string(",\n") + bicycles; + } else { + store += bicycles + std::string(",\n") + books; + } + store += std::string("}\n"); + + std::string row = std::string("{\n"); + if (frand() <= 0.5f) { + row += store + std::string(",\n") + Misc; + } else { + row += Misc + std::string(",\n") + store; + } + row += std::string("}\n"); + return row; +} + +template +static void BM_case(benchmark::State& state, QueryArg&&... query_arg) +{ + srand(5236); + auto iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [desired_bytes = state.range(1)](int index) { return build_row(desired_bytes); }); + int num_rows = state.range(0); + cudf::test::strings_column_wrapper input(iter, iter + num_rows); + cudf::strings_column_view scv(input); + size_t num_chars = scv.chars().size(); + + std::string json_path(query_arg...); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + auto result = cudf::strings::get_json_object(scv, json_path); + cudaStreamSynchronize(0); + } + + // this isn't strictly 100% accurate. a given query isn't necessarily + // going to visit every single incoming character. but in spirit it does. + state.SetBytesProcessed(state.iterations() * num_chars); +} + +#define JSON_BENCHMARK_DEFINE(name, query) \ + BENCHMARK_CAPTURE(BM_case, name, query) \ + ->ArgsProduct({{100, 1000, 100000, 400000}, {300, 600, 4096}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +JSON_BENCHMARK_DEFINE(query0, "$"); +JSON_BENCHMARK_DEFINE(query1, "$.store"); +JSON_BENCHMARK_DEFINE(query2, "$.store.book"); +JSON_BENCHMARK_DEFINE(query3, "$.store.*"); +JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]"); +JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category"); +JSON_BENCHMARK_DEFINE(query6, "$.store.['bicycle']"); +JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']"); +JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]"); \ No newline at end of file diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index fc98ee25ee6..226143cb6a2 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -14,6 +14,8 @@ #include +// #include "db_test.cuh" + namespace cudf { namespace strings { namespace detail { @@ -590,7 +592,7 @@ command_buffer build_command_buffer(std::string const& json_path, rmm::cuda_stre } CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state, - path_operator const* commands, + path_operator const* const commands, json_output& output, bool list_element = false) { @@ -677,7 +679,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state, CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input, size_t input_len, - path_operator const* commands, + path_operator const* const commands, char* out_buf, size_t out_buf_size) { @@ -692,18 +694,23 @@ CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input, __global__ void get_json_object_kernel(char const* chars, size_type const* offsets, - path_operator const* commands, + path_operator const* const commands, size_type* output_offsets, char* out_buf, - size_t out_buf_size) + size_type num_rows) { uint64_t const tid = threadIdx.x + (blockDim.x * blockIdx.x); + if (tid >= num_rows) { return; } + + char* dst = out_buf ? out_buf + output_offsets[tid] : nullptr; + size_t dst_size = out_buf ? output_offsets[tid + 1] - output_offsets[tid] : 0; + json_output out = get_json_object_single( - chars + offsets[tid], offsets[tid + 1] - offsets[tid], commands, out_buf, out_buf_size); + chars + offsets[tid], offsets[tid + 1] - offsets[tid], commands, dst, dst_size); // filled in only during the precompute step - if (output_offsets != nullptr) { output_offsets[tid] = static_cast(out.output_len); } + if (!out_buf) { output_offsets[tid] = static_cast(out.output_len); } } std::unique_ptr get_json_object(cudf::strings_column_view const& col, @@ -722,7 +729,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr); cudf::mutable_column_view offsets_view(*offsets); - cudf::detail::grid_1d const grid{1, col.size()}; + cudf::detail::grid_1d const grid{col.size(), 512}; // preprocess sizes get_json_object_kernel<<>>( @@ -731,7 +738,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cmd_buf.commands.data(), offsets_view.head(), nullptr, - 0); + col.size()); // convert sizes to offsets thrust::exclusive_scan(rmm::exec_policy(stream), @@ -751,9 +758,9 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c col.chars().head(), col.offsets().head(), cmd_buf.commands.data(), - nullptr, + offsets_view.head(), chars_view.head(), - output_size); + col.size()); // reset back to original stack size cudaDeviceSetLimit(cudaLimitStackSize, stack_size); From 05ad3fccd4951bef7de6bc99a211748bd7eb9c02 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Fri, 19 Feb 2021 15:08:22 -0600 Subject: [PATCH 06/33] Make get_json_object() non-recursive. --- cpp/src/strings/json/json_path.cu | 244 +++++++++++++++++++----------- 1 file changed, 152 insertions(+), 92 deletions(-) diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 226143cb6a2..84e09d7673b 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -202,6 +202,8 @@ class parser { parse_whitespace(); } + CUDA_HOST_DEVICE_CALLABLE parser(parser const& p) { *this = p; } + CUDA_HOST_DEVICE_CALLABLE bool eof(const char* p) { return p - input >= input_len; } CUDA_HOST_DEVICE_CALLABLE bool eof() { return eof(pos); } @@ -303,7 +305,7 @@ class json_state : private parser { { } - CUDA_HOST_DEVICE_CALLABLE json_state(json_state const& j) { *this = j; } + CUDA_HOST_DEVICE_CALLABLE json_state(json_state const& j) : parser(j) { *this = j; } CUDA_HOST_DEVICE_CALLABLE parse_result extract_element(json_output* output) { @@ -469,12 +471,10 @@ struct command_buffer { // current state of the JSONPath class path_state : private parser { public: - CUDA_HOST_DEVICE_CALLABLE path_state() : parser() {} CUDA_HOST_DEVICE_CALLABLE path_state(const char* _path, size_t _path_len) : parser(_path, _path_len) { } - CUDA_HOST_DEVICE_CALLABLE path_state(path_state const& p) { *this = p; } CUDA_HOST_DEVICE_CALLABLE path_operator get_next_operator() { @@ -563,7 +563,8 @@ class path_state : private parser { } }; -command_buffer build_command_buffer(std::string const& json_path, rmm::cuda_stream_view stream) +std::pair build_command_buffer(std::string const& json_path, + rmm::cuda_stream_view stream) { path_state p_state(json_path.data(), static_cast(json_path.size())); @@ -571,11 +572,13 @@ command_buffer build_command_buffer(std::string const& json_path, rmm::cuda_stre cudf::string_scalar d_json_path(json_path); path_operator op; + int max_stack_depth = 1; do { op = p_state.get_next_operator(); if (op.type == path_operator_type::ERROR) { CUDF_FAIL("Encountered invalid JSONPath input string"); } + if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; } // convert pointer to device pointer if (op.name.len > 0) { op.name.str = d_json_path.data() + (op.name.str - json_path.data()); } h_operators.push_back(op); @@ -588,106 +591,168 @@ command_buffer build_command_buffer(std::string const& json_path, rmm::cuda_stre cudaMemcpyHostToDevice, stream.value()); - return {std::move(d_operators), std::move(d_json_path)}; + return {command_buffer{std::move(d_operators), std::move(d_json_path)}, max_stack_depth}; } -CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& j_state, - path_operator const* const commands, +#define PARSE_TRY(_x) \ + do { \ + last_result = _x; \ + if (last_result == parse_result::ERROR) { return parse_result::ERROR; } \ + } while (0) + +template +CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, + path_operator const* _commands, json_output& output, - bool list_element = false) + bool _list_element = false) { - path_operator op = *commands; + // manually maintained context stack in lieu of calling parse_json_path recursively. + struct context { + json_state j_state; + path_operator const* commands; + bool list_element; + bool state_flag; + int count; + }; + context stack[max_command_stack_depth]; + int stack_pos = 0; + auto push_context = [&stack, &stack_pos](json_state const& _j_state, + path_operator const* _commands, + bool _list_element = false, + bool _state_flag = false, + int _count = 0) { + if (stack_pos == max_command_stack_depth - 1) { return false; } + stack[stack_pos++] = context{_j_state, _commands, _list_element, _state_flag, _count}; + return true; + }; + auto pop_context = [&stack, &stack_pos](context& c) { + if (stack_pos > 0) { + c = stack[--stack_pos]; + return true; + } + return false; + }; + push_context(_j_state, _commands, _list_element); + + parse_result last_result = parse_result::SUCCESS; + context ctx; + while (pop_context(ctx)) { + path_operator op = *ctx.commands; + + switch (op.type) { + // whatever the first object is + case path_operator_type::ROOT: + PARSE_TRY(ctx.j_state.next_element()); + push_context(ctx.j_state, ctx.commands + 1); + break; - switch (op.type) { - // whatever the first object is - case path_operator_type::ROOT: - if (j_state.next_element() != parse_result::ERROR) { - return parse_json_path(j_state, commands + 1, output); - } - break; + // .name + // ['name'] + // [1] + // will return a single thing + case path_operator_type::CHILD: { + PARSE_TRY(ctx.j_state.child_element()); + if (last_result == parse_result::SUCCESS) { + PARSE_TRY(ctx.j_state.next_matching_element(op.name, true)); + if (last_result == parse_result::SUCCESS) { + push_context(ctx.j_state, ctx.commands + 1, ctx.list_element); + } + } + } break; - // .name - // ['name'] - // [1] - // will return a single thing - case path_operator_type::CHILD: { - parse_result res = j_state.child_element(); - if (res != parse_result::SUCCESS) { return res; } - res = j_state.next_matching_element(op.name, true); - if (res != parse_result::SUCCESS) { return res; } - return parse_json_path(j_state, commands + 1, output, list_element); - } break; - - // .* - // [*] - // will return an array of things - case path_operator_type::CHILD_WILDCARD: { - output.add_output("[\n", 2); - - parse_result res = j_state.child_element(); - if (res == parse_result::ERROR) { return parse_result::ERROR; } - if (res == parse_result::EMPTY) { - output.add_output("]\n", 2); - return parse_result::SUCCESS; - } + // .* + // [*] + // will return an array of things + case path_operator_type::CHILD_WILDCARD: { + // if we're on the first element of this wildcard + if (!ctx.state_flag) { + output.add_output("[\n", 2); + + // step into the child element + PARSE_TRY(ctx.j_state.child_element()); + if (last_result == parse_result::EMPTY) { + output.add_output("]\n", 2); + last_result = parse_result::SUCCESS; + break; + } - res = j_state.next_matching_element(op.name, true); - int count = 0; - while (res == parse_result::SUCCESS) { - json_state j_sub(j_state); - parse_result sub_res = - parse_json_path(j_sub, commands + 1, output, count > 0 ? true : false); - if (sub_res == parse_result::ERROR) { return parse_result::ERROR; } - if (sub_res != parse_result::EMPTY) { count++; } - res = j_state.next_matching_element(op.name, false); - } + // first element + PARSE_TRY(ctx.j_state.next_matching_element(op.name, true)); + if (last_result == parse_result::EMPTY) { + output.add_output("]\n", 2); + last_result = parse_result::SUCCESS; + break; + } - if (res == parse_result::ERROR) { return parse_result::ERROR; } - - output.add_output("]\n", 2); - return parse_result::SUCCESS; - } break; - - // [0] - // [1] - // etc - // returns a single thing - case path_operator_type::CHILD_INDEX: { - parse_result res = j_state.child_element(); - if (res != parse_result::SUCCESS) { return res; } - json_string any{"*", 1}; - res = j_state.next_matching_element(any, true); - if (res != parse_result::SUCCESS) { return res; } - for (int idx = 1; idx <= op.index; idx++) { - res = j_state.next_matching_element(any, false); - if (res != parse_result::SUCCESS) { return res; } - } - return parse_json_path(j_state, commands + 1, output, list_element); - } break; + // re-push ourselves + push_context(ctx.j_state, ctx.commands, false, true); + // push the next command + push_context(ctx.j_state, ctx.commands + 1); + } else { + // if we actually processed something to the output, increment count + if (last_result != parse_result::EMPTY) { ctx.count++; } + + // next element + PARSE_TRY(ctx.j_state.next_matching_element(op.name, false)); + if (last_result == parse_result::EMPTY) { + output.add_output("]\n", 2); + last_result = parse_result::SUCCESS; + break; + } + + // re-push ourselves + push_context(ctx.j_state, ctx.commands, false, true); + // push the next command + push_context(ctx.j_state, ctx.commands + 1, ctx.count > 0 ? true : false); + } + } break; + + // [0] + // [1] + // etc + // returns a single thing + case path_operator_type::CHILD_INDEX: { + PARSE_TRY(ctx.j_state.child_element()); + if (last_result == parse_result::SUCCESS) { + json_string any{"*", 1}; + PARSE_TRY(ctx.j_state.next_matching_element(any, true)); + if (last_result == parse_result::SUCCESS) { + for (int idx = 1; idx <= op.index; idx++) { + PARSE_TRY(ctx.j_state.next_matching_element(any, false)); + if (last_result == parse_result::EMPTY) { break; } + } + push_context(ctx.j_state, ctx.commands + 1, ctx.list_element); + } + } + } break; - // some sort of error. - case path_operator_type::ERROR: return parse_result::ERROR; break; + // some sort of error. + case path_operator_type::ERROR: return parse_result::ERROR; break; - // END case - default: { - if (list_element) { output.add_output({",\n", 2}); } - if (j_state.extract_element(&output) == parse_result::ERROR) { return parse_result::ERROR; } - } break; + // END case + default: { + if (ctx.list_element) { output.add_output({",\n", 2}); } + PARSE_TRY(ctx.j_state.extract_element(&output)); + } break; + } } return parse_result::SUCCESS; } +// hardcoding this for now. to reach a stack depth of 8 would require +// a jsonpath containing 7 nested wildcards so this is probably reasonable. +constexpr int max_command_stack_depth = 8; + CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input, size_t input_len, path_operator const* const commands, char* out_buf, size_t out_buf_size) { - // TODO: add host-side code to verify path is a valid string. json_state j_state(input, input_len); json_output output{out_buf_size, 0, out_buf}; - parse_json_path(j_state, commands, output); + parse_json_path(j_state, commands, output); return output; } @@ -719,11 +784,9 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c rmm::mr::device_memory_resource* mr) { // preprocess the json_path into a command buffer - command_buffer cmd_buf = build_command_buffer(json_path, stream); - - size_t stack_size; - cudaDeviceGetLimit(&stack_size, cudaLimitStackSize); - cudaDeviceSetLimit(cudaLimitStackSize, 4096); + std::pair preprocess = build_command_buffer(json_path, stream); + CUDF_EXPECTS(preprocess.second <= max_command_stack_depth, + "Encountered json_path string that is too complex"); auto offsets = cudf::make_fixed_width_column( data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr); @@ -735,7 +798,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c get_json_object_kernel<<>>( col.chars().head(), col.offsets().head(), - cmd_buf.commands.data(), + preprocess.first.commands.data(), offsets_view.head(), nullptr, col.size()); @@ -757,14 +820,11 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c get_json_object_kernel<<>>( col.chars().head(), col.offsets().head(), - cmd_buf.commands.data(), + preprocess.first.commands.data(), offsets_view.head(), chars_view.head(), col.size()); - // reset back to original stack size - cudaDeviceSetLimit(cudaLimitStackSize, stack_size); - return make_strings_column(col.size(), std::move(offsets), std::move(chars), @@ -785,4 +845,4 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c } } // namespace strings -} // namespace cudf \ No newline at end of file +} // namespace cudf From 9411f29e94d2d55ee72f85d4b6d5f3abbb32ef91 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Mon, 22 Feb 2021 09:48:30 -0800 Subject: [PATCH 07/33] Java bindings for get_json_object --- .../main/java/ai/rapids/cudf/ColumnView.java | 19 +++++++++ java/src/main/native/src/ColumnViewJni.cpp | 23 ++++++++++ .../java/ai/rapids/cudf/ColumnVectorTest.java | 42 +++++++++++++++++++ 3 files changed, 84 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 1dce52f7105..8be1f035187 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -1883,6 +1883,23 @@ public final ColumnVector substring(ColumnView start, ColumnView end) { return new ColumnVector(substringColumn(getNativeView(), start.getNativeView(), end.getNativeView())); } + /** + * Apply a JSONPath string to all rows in an input strings column. + * + * Applies a JSONPath string to an incoming strings column where each row in the column + * is a valid json string. The output is returned by row as a strings column. + * + * For reference, https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html + * Note: Only implements the operators: $ . [] * + * + * @param path The JSONPath string to be applied to each row + * @return new strings ColumnVector containing the retrieved json object strings + */ + public final ColumnVector getJSONObject(String path) { + assert(type.equals(DType.STRING)) : "column type must be a String"; + return new ColumnVector(getJSONObject(getNativeView(), path)); + } + /** * Returns a new strings column where target string within each string is replaced with the specified * replacement string. @@ -2406,6 +2423,8 @@ static DeviceMemoryBufferView getOffsetsBuffer(long viewHandle) { */ private static native long stringTimestampToTimestamp(long viewHandle, int unit, String format); + private static native long getJSONObject(long viewHandle, String path) throws CudfException; + /** * Native method to parse and convert a timestamp column vector to string column vector. A unix * timestamp is a long value representing how many units since 1970-01-01 00:00:00:000 in either diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 82e71b04a2f..aebe56ee857 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -60,9 +60,12 @@ #include #include #include +#include "cudf/strings/strings_column_view.hpp" #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" +#include "jni.h" +#include "jni_utils.hpp" namespace { @@ -1760,4 +1763,24 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv } CATCH_STD(env, 0) } + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env, jobject j_object, + jlong j_view_handle, jstring j_path) { + + JNI_NULL_CHECK(env, j_view_handle, "view cannot be null", 0); + JNI_NULL_CHECK(env, j_path, "path cannot be null", 0); + + try { + + cudf::column_view* n_column_view = reinterpret_cast(j_view_handle); + cudf::strings_column_view n_strings_col_view(*n_column_view); + + cudf::jni::native_jstring n_path(env, j_path); + auto result = cudf::strings::get_json_object(n_strings_col_view, std::string(n_path.get())); + + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0) + +} } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index cb1f792b99e..5139269553e 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3875,6 +3875,48 @@ void testCopyToColumnVector() { } } + @Test + void testGetJSONObject() { + String jsonString = "{ \"store\": {\n" + + " \"book\": [\n" + + " { \"category\": \"reference\",\n" + + " \"author\": \"Nigel Rees\",\n" + + " \"title\": \"Sayings of the Century\",\n" + + " \"price\": 8.95\n" + + " },\n" + + " { \"category\": \"fiction\",\n" + + " \"author\": \"Evelyn Waugh\",\n" + + " \"title\": \"Sword of Honour\",\n" + + " \"price\": 12.99\n" + + " },\n" + + " { \"category\": \"fiction\",\n" + + " \"author\": \"Herman Melville\",\n" + + " \"title\": \"Moby Dick\",\n" + + " \"isbn\": \"0-553-21311-3\",\n" + + " \"price\": 8.99\n" + + " },\n" + + " { \"category\": \"fiction\",\n" + + " \"author\": \"J. R. R. Tolkien\",\n" + + " \"title\": \"The Lord of the Rings\",\n" + + " \"isbn\": \"0-395-19395-8\",\n" + + " \"price\": 22.99\n" + + " }\n" + + " ],\n" + + " \"bicycle\": {\n" + + " \"color\": \"red\",\n" + + " \"price\": 19.95\n" + + " }\n" + + " }\n" + + "}"; + + try (ColumnVector json = ColumnVector.fromStrings(jsonString); + ColumnVector expectedAuthors = ColumnVector.fromStrings("[\n\"Nigel Rees\",\n\"Evelyn " + + "Waugh\",\n\"Herman Melville\",\n\"J. R. R. Tolkien\"]\n"); + ColumnVector gotAuthors = json.getJSONObject("$.store.book[*].author")) { + assertColumnsAreEqual(expectedAuthors, gotAuthors); + } + } + @Test void testMakeStructEmpty() { final int numRows = 10; From ff3544ce920ff85ff3409f0b80201ccc88cb5183 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Mon, 22 Feb 2021 13:13:05 -0600 Subject: [PATCH 08/33] Make debug readability formatting of output off by default. --- cpp/src/strings/json/json_path.cu | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 84e09d7673b..c9d43c22dfd 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -22,6 +22,10 @@ namespace detail { namespace { +// temporary. for debugging purposes +#define DEBUG_NEWLINE +// #define DEBUG_NEWLINE "\n" + using namespace cudf; CUDA_HOST_DEVICE_CALLABLE char to_lower(char const c) @@ -666,12 +670,12 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, case path_operator_type::CHILD_WILDCARD: { // if we're on the first element of this wildcard if (!ctx.state_flag) { - output.add_output("[\n", 2); + output.add_output("[" DEBUG_NEWLINE, 2); // step into the child element PARSE_TRY(ctx.j_state.child_element()); if (last_result == parse_result::EMPTY) { - output.add_output("]\n", 2); + output.add_output("]" DEBUG_NEWLINE, 2); last_result = parse_result::SUCCESS; break; } @@ -679,7 +683,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, // first element PARSE_TRY(ctx.j_state.next_matching_element(op.name, true)); if (last_result == parse_result::EMPTY) { - output.add_output("]\n", 2); + output.add_output("]" DEBUG_NEWLINE, 2); last_result = parse_result::SUCCESS; break; } @@ -695,7 +699,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, // next element PARSE_TRY(ctx.j_state.next_matching_element(op.name, false)); if (last_result == parse_result::EMPTY) { - output.add_output("]\n", 2); + output.add_output("]" DEBUG_NEWLINE, 2); last_result = parse_result::SUCCESS; break; } @@ -731,7 +735,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, // END case default: { - if (ctx.list_element) { output.add_output({",\n", 2}); } + if (ctx.list_element) { output.add_output({"," DEBUG_NEWLINE, 2}); } PARSE_TRY(ctx.j_state.extract_element(&output)); } break; } From 78d3dd8a09fdb6d592766d5893f8863980dd689d Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Mon, 22 Feb 2021 15:29:17 -0600 Subject: [PATCH 09/33] Change interface to get_json_object() to take a cudf::string_scalar instead of a std::string --- cpp/include/cudf/strings/detail/substring.hpp | 2 +- cpp/include/cudf/strings/substring.hpp | 2 +- cpp/src/strings/json/json_path.cu | 39 +++++++++++-------- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/cpp/include/cudf/strings/detail/substring.hpp b/cpp/include/cudf/strings/detail/substring.hpp index 3e7f6fea0d0..a646d93e2b8 100644 --- a/cpp/include/cudf/strings/detail/substring.hpp +++ b/cpp/include/cudf/strings/detail/substring.hpp @@ -33,7 +33,7 @@ namespace detail { */ std::unique_ptr get_json_object( cudf::strings_column_view const& col, - std::string const& json_path, + cudf::string_scalar const& json_path, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/strings/substring.hpp b/cpp/include/cudf/strings/substring.hpp index 100dd80b3c2..06a38360fdb 100644 --- a/cpp/include/cudf/strings/substring.hpp +++ b/cpp/include/cudf/strings/substring.hpp @@ -213,7 +213,7 @@ std::unique_ptr slice_strings( */ std::unique_ptr get_json_object( cudf::strings_column_view const& col, - std::string const& json_path, + cudf::string_scalar const& json_path, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index c9d43c22dfd..b06f1c5b32d 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -24,7 +24,10 @@ namespace { // temporary. for debugging purposes #define DEBUG_NEWLINE +#define DEBUG_NEWLINE_LEN (0) + // #define DEBUG_NEWLINE "\n" +// #define DEBUG_NEWLINE_LEN (1) using namespace cudf; @@ -160,6 +163,9 @@ struct json_string { const char* str; int64_t len; + CUDA_HOST_DEVICE_CALLABLE json_string() : str(nullptr), len(-1) {} + CUDA_HOST_DEVICE_CALLABLE json_string(const char* _str, int64_t _len) : str(_str), len(_len) {} + CUDA_HOST_DEVICE_CALLABLE bool operator==(json_string const& cmp) { return len == cmp.len && str != nullptr && cmp.str != nullptr && @@ -567,13 +573,13 @@ class path_state : private parser { } }; -std::pair build_command_buffer(std::string const& json_path, - rmm::cuda_stream_view stream) +std::pair, int> build_command_buffer( + cudf::string_scalar const& json_path, rmm::cuda_stream_view stream) { - path_state p_state(json_path.data(), static_cast(json_path.size())); + std::string h_json_path = json_path.to_string(stream); + path_state p_state(h_json_path.data(), static_cast(h_json_path.size())); std::vector h_operators; - cudf::string_scalar d_json_path(json_path); path_operator op; int max_stack_depth = 1; @@ -584,7 +590,7 @@ std::pair build_command_buffer(std::string const& json_path } if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; } // convert pointer to device pointer - if (op.name.len > 0) { op.name.str = d_json_path.data() + (op.name.str - json_path.data()); } + if (op.name.len > 0) { op.name.str = json_path.data() + (op.name.str - h_json_path.data()); } h_operators.push_back(op); } while (op.type != path_operator_type::END); @@ -595,7 +601,7 @@ std::pair build_command_buffer(std::string const& json_path cudaMemcpyHostToDevice, stream.value()); - return {command_buffer{std::move(d_operators), std::move(d_json_path)}, max_stack_depth}; + return {std::move(d_operators), max_stack_depth}; } #define PARSE_TRY(_x) \ @@ -670,12 +676,12 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, case path_operator_type::CHILD_WILDCARD: { // if we're on the first element of this wildcard if (!ctx.state_flag) { - output.add_output("[" DEBUG_NEWLINE, 2); + output.add_output("[" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN); // step into the child element PARSE_TRY(ctx.j_state.child_element()); if (last_result == parse_result::EMPTY) { - output.add_output("]" DEBUG_NEWLINE, 2); + output.add_output("]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN); last_result = parse_result::SUCCESS; break; } @@ -683,7 +689,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, // first element PARSE_TRY(ctx.j_state.next_matching_element(op.name, true)); if (last_result == parse_result::EMPTY) { - output.add_output("]" DEBUG_NEWLINE, 2); + output.add_output("]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN); last_result = parse_result::SUCCESS; break; } @@ -699,7 +705,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, // next element PARSE_TRY(ctx.j_state.next_matching_element(op.name, false)); if (last_result == parse_result::EMPTY) { - output.add_output("]" DEBUG_NEWLINE, 2); + output.add_output("]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN); last_result = parse_result::SUCCESS; break; } @@ -735,7 +741,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, // END case default: { - if (ctx.list_element) { output.add_output({"," DEBUG_NEWLINE, 2}); } + if (ctx.list_element) { output.add_output({"," DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); } PARSE_TRY(ctx.j_state.extract_element(&output)); } break; } @@ -783,12 +789,13 @@ __global__ void get_json_object_kernel(char const* chars, } std::unique_ptr get_json_object(cudf::strings_column_view const& col, - std::string const& json_path, + cudf::string_scalar const& json_path, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // preprocess the json_path into a command buffer - std::pair preprocess = build_command_buffer(json_path, stream); + std::pair, int> preprocess = + build_command_buffer(json_path, stream); CUDF_EXPECTS(preprocess.second <= max_command_stack_depth, "Encountered json_path string that is too complex"); @@ -802,7 +809,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c get_json_object_kernel<<>>( col.chars().head(), col.offsets().head(), - preprocess.first.commands.data(), + preprocess.first.data(), offsets_view.head(), nullptr, col.size()); @@ -824,7 +831,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c get_json_object_kernel<<>>( col.chars().head(), col.offsets().head(), - preprocess.first.commands.data(), + preprocess.first.data(), offsets_view.head(), chars_view.head(), col.size()); @@ -842,7 +849,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c } // namespace detail std::unique_ptr get_json_object(cudf::strings_column_view const& col, - std::string const& json_path, + cudf::string_scalar const& json_path, rmm::mr::device_memory_resource* mr) { return detail::get_json_object(col, json_path, 0, mr); From e124cc503712d6de4ef83e1ab30763291755dbfa Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Mon, 22 Feb 2021 16:00:47 -0800 Subject: [PATCH 10/33] updated test --- java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 5139269553e..1eb2b9a2af2 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3909,8 +3909,9 @@ void testGetJSONObject() { " }\n" + "}"; - try (ColumnVector json = ColumnVector.fromStrings(jsonString); + try (ColumnVector json = ColumnVector.fromStrings(jsonString, jsonString); ColumnVector expectedAuthors = ColumnVector.fromStrings("[\n\"Nigel Rees\",\n\"Evelyn " + + "Waugh\",\n\"Herman Melville\",\n\"J. R. R. Tolkien\"]\n", "[\n\"Nigel Rees\",\n\"Evelyn " + "Waugh\",\n\"Herman Melville\",\n\"J. R. R. Tolkien\"]\n"); ColumnVector gotAuthors = json.getJSONObject("$.store.book[*].author")) { assertColumnsAreEqual(expectedAuthors, gotAuthors); From efb767e39aec7f4e8829247598afc6907c8f3e3f Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Mon, 22 Feb 2021 16:12:23 -0800 Subject: [PATCH 11/33] updated to scalar --- java/src/main/java/ai/rapids/cudf/ColumnView.java | 6 +++--- java/src/main/native/src/ColumnViewJni.cpp | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 8be1f035187..4453d7bb1f1 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -1895,9 +1895,9 @@ public final ColumnVector substring(ColumnView start, ColumnView end) { * @param path The JSONPath string to be applied to each row * @return new strings ColumnVector containing the retrieved json object strings */ - public final ColumnVector getJSONObject(String path) { + public final ColumnVector getJSONObject(Scalar path) { assert(type.equals(DType.STRING)) : "column type must be a String"; - return new ColumnVector(getJSONObject(getNativeView(), path)); + return new ColumnVector(getJSONObject(getNativeView(), path.getScalarHandle())); } /** @@ -2423,7 +2423,7 @@ static DeviceMemoryBufferView getOffsetsBuffer(long viewHandle) { */ private static native long stringTimestampToTimestamp(long viewHandle, int unit, String format); - private static native long getJSONObject(long viewHandle, String path) throws CudfException; + private static native long getJSONObject(long viewHandle, long scalarHandle) throws CudfException; /** * Native method to parse and convert a timestamp column vector to string column vector. A unix diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index aebe56ee857..3bac1673764 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -61,6 +61,7 @@ #include #include #include "cudf/strings/strings_column_view.hpp" +#include "cudf/types.hpp" #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" @@ -1765,7 +1766,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env, jobject j_object, - jlong j_view_handle, jstring j_path) { + jlong j_view_handle, jlong j_scalar_handle) { JNI_NULL_CHECK(env, j_view_handle, "view cannot be null", 0); JNI_NULL_CHECK(env, j_path, "path cannot be null", 0); @@ -1774,9 +1775,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env cudf::column_view* n_column_view = reinterpret_cast(j_view_handle); cudf::strings_column_view n_strings_col_view(*n_column_view); + cudf::scalar* n_scalar_path = reinterpret_cast(j_scalar_handle); - cudf::jni::native_jstring n_path(env, j_path); - auto result = cudf::strings::get_json_object(n_strings_col_view, std::string(n_path.get())); + auto result = cudf::strings::get_json_object(n_strings_col_view, n_scalar_path); return reinterpret_cast(result.release()); } From 6127b7c45ee173ce041aae16e3ab8e0c6a0b02bf Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Tue, 23 Feb 2021 09:25:30 -0800 Subject: [PATCH 12/33] changes to match the cudf --- java/src/main/native/src/ColumnViewJni.cpp | 6 +++--- java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 3bac1673764..f3043777242 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1769,15 +1769,15 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env jlong j_view_handle, jlong j_scalar_handle) { JNI_NULL_CHECK(env, j_view_handle, "view cannot be null", 0); - JNI_NULL_CHECK(env, j_path, "path cannot be null", 0); + JNI_NULL_CHECK(env, j_scalar_handle, "path cannot be null", 0); try { cudf::column_view* n_column_view = reinterpret_cast(j_view_handle); cudf::strings_column_view n_strings_col_view(*n_column_view); - cudf::scalar* n_scalar_path = reinterpret_cast(j_scalar_handle); + cudf::string_scalar *n_scalar_path = reinterpret_cast(j_scalar_handle); - auto result = cudf::strings::get_json_object(n_strings_col_view, n_scalar_path); + auto result = cudf::strings::get_json_object(n_strings_col_view, *n_scalar_path); return reinterpret_cast(result.release()); } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 1eb2b9a2af2..f32cd9e8e9b 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3910,10 +3910,11 @@ void testGetJSONObject() { "}"; try (ColumnVector json = ColumnVector.fromStrings(jsonString, jsonString); - ColumnVector expectedAuthors = ColumnVector.fromStrings("[\n\"Nigel Rees\",\n\"Evelyn " + - "Waugh\",\n\"Herman Melville\",\n\"J. R. R. Tolkien\"]\n", "[\n\"Nigel Rees\",\n\"Evelyn " + - "Waugh\",\n\"Herman Melville\",\n\"J. R. R. Tolkien\"]\n"); - ColumnVector gotAuthors = json.getJSONObject("$.store.book[*].author")) { + ColumnVector expectedAuthors = ColumnVector.fromStrings("[\"Nigel Rees\",\"Evelyn " + + "Waugh\",\"Herman Melville\",\"J. R. R. Tolkien\"]", "[\"Nigel Rees\",\"Evelyn " + + "Waugh\",\"Herman Melville\",\"J. R. R. Tolkien\"]"); + Scalar path = Scalar.fromString("$.store.book[*].author"); + ColumnVector gotAuthors = json.getJSONObject(path)) { assertColumnsAreEqual(expectedAuthors, gotAuthors); } } From d6602bdd62cd9abee30d6e57c3cf694f33478698 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Tue, 23 Feb 2021 18:05:27 -0600 Subject: [PATCH 13/33] Strip quotes from singular returned string values. Propagate validity vector to output. --- cpp/src/strings/json/json_path.cu | 78 +++++++++++++++++++++++-------- cpp/tests/strings/json_tests.cpp | 21 +++++++++ 2 files changed, 80 insertions(+), 19 deletions(-) diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index b06f1c5b32d..51a16122b45 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -7,6 +8,7 @@ #include #include +#include #include #include @@ -14,8 +16,6 @@ #include -// #include "db_test.cuh" - namespace cudf { namespace strings { namespace detail { @@ -29,6 +29,30 @@ namespace { // #define DEBUG_NEWLINE "\n" // #define DEBUG_NEWLINE_LEN (1) +// temporary. spark doesn't strictly follow the JSONPath spec. +// I think this probably should be a configurable enum to control +// the kind of output you get and what features are supported. +// +// Current known differences: +// - When returning a string value as a single element, Spark strips the quotes. +// standard: "whee" +// spark: whee +// +// - Spark only supports the wildcard operator when in a subscript, eg [*] +// It does not handle .* +// +#define __SPARK_BEHAVIORS + +// Other, non-spark known differences: +// +// - In jsonpath_ng, name subscripts can use double quotes instead of the standard +// single quotes in the query string. +// standard: $.thing['subscript'] +// jsonpath_ng: $.thing["subscript"] +// +// Currently, this code only allows single-quotes but that can be easily expanded. +// + using namespace cudf; CUDA_HOST_DEVICE_CALLABLE char to_lower(char const c) @@ -317,7 +341,7 @@ class json_state : private parser { CUDA_HOST_DEVICE_CALLABLE json_state(json_state const& j) : parser(j) { *this = j; } - CUDA_HOST_DEVICE_CALLABLE parse_result extract_element(json_output* output) + CUDA_HOST_DEVICE_CALLABLE parse_result extract_element(json_output* output, bool list_element) { // collapse the current element into a json_string @@ -329,6 +353,15 @@ class json_state : private parser { pos = cur_el_start; if (parse_value() != parse_result::SUCCESS) { return parse_result::ERROR; } end = pos; + +#if defined(__SPARK_BEHAVIORS) + // spark/hive-specific behavior. if this is a non-list-element wrapped in quotes, + // strip them + if (!list_element && *start == '\"' && *(end - 1) == '\"') { + start++; + end--; + } +#endif } // otherwise, march through everything inside else { @@ -373,7 +406,7 @@ class json_state : private parser { return parse_result::SUCCESS; } - CUDA_HOST_DEVICE_CALLABLE parse_result skip_element() { return extract_element(nullptr); } + CUDA_HOST_DEVICE_CALLABLE parse_result skip_element() { return extract_element(nullptr, false); } json_element_type element; @@ -498,6 +531,9 @@ class path_state : private parser { path_operator op; json_string term{".[", 2}; if (parse_path_name(op.name, term)) { + // this is another potential use case for __SPARK_BEHAVIORS / configurability + // Spark currently only handles the wildcard operator inside [*], it does + // not handle .* if (op.name.len == 1 && op.name.str[0] == '*') { op.type = path_operator_type::CHILD_WILDCARD; } else { @@ -621,18 +657,18 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, json_state j_state; path_operator const* commands; bool list_element; + int element_count; bool state_flag; - int count; }; context stack[max_command_stack_depth]; int stack_pos = 0; auto push_context = [&stack, &stack_pos](json_state const& _j_state, path_operator const* _commands, bool _list_element = false, - bool _state_flag = false, - int _count = 0) { + int _element_count = 0, + bool _state_flag = false) { if (stack_pos == max_command_stack_depth - 1) { return false; } - stack[stack_pos++] = context{_j_state, _commands, _list_element, _state_flag, _count}; + stack[stack_pos++] = context{_j_state, _commands, _list_element, _element_count, _state_flag}; return true; }; auto pop_context = [&stack, &stack_pos](context& c) { @@ -665,7 +701,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, if (last_result == parse_result::SUCCESS) { PARSE_TRY(ctx.j_state.next_matching_element(op.name, true)); if (last_result == parse_result::SUCCESS) { - push_context(ctx.j_state, ctx.commands + 1, ctx.list_element); + push_context(ctx.j_state, ctx.commands + 1, ctx.list_element, ctx.element_count); } } } break; @@ -695,12 +731,12 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, } // re-push ourselves - push_context(ctx.j_state, ctx.commands, false, true); + push_context(ctx.j_state, ctx.commands, false, 0, true); // push the next command - push_context(ctx.j_state, ctx.commands + 1); + push_context(ctx.j_state, ctx.commands + 1, true, 0); } else { // if we actually processed something to the output, increment count - if (last_result != parse_result::EMPTY) { ctx.count++; } + if (last_result != parse_result::EMPTY) { ctx.element_count++; } // next element PARSE_TRY(ctx.j_state.next_matching_element(op.name, false)); @@ -711,9 +747,9 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, } // re-push ourselves - push_context(ctx.j_state, ctx.commands, false, true); + push_context(ctx.j_state, ctx.commands, false, 0, true); // push the next command - push_context(ctx.j_state, ctx.commands + 1, ctx.count > 0 ? true : false); + push_context(ctx.j_state, ctx.commands + 1, true, ctx.element_count); } } break; @@ -731,7 +767,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, PARSE_TRY(ctx.j_state.next_matching_element(any, false)); if (last_result == parse_result::EMPTY) { break; } } - push_context(ctx.j_state, ctx.commands + 1, ctx.list_element); + push_context(ctx.j_state, ctx.commands + 1, ctx.list_element, ctx.element_count); } } } break; @@ -741,8 +777,10 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, // END case default: { - if (ctx.list_element) { output.add_output({"," DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); } - PARSE_TRY(ctx.j_state.extract_element(&output)); + if (ctx.list_element && ctx.element_count > 0) { + output.add_output({"," DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); + } + PARSE_TRY(ctx.j_state.extract_element(&output, ctx.list_element)); } break; } } @@ -759,6 +797,8 @@ CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input, char* out_buf, size_t out_buf_size) { + if (input_len == 0) { return json_output{0, 0, out_buf}; } + json_state j_state(input, input_len); json_output output{out_buf_size, 0, out_buf}; @@ -839,8 +879,8 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c return make_strings_column(col.size(), std::move(offsets), std::move(chars), - UNKNOWN_NULL_COUNT, - rmm::device_buffer{}, + col.null_count(), + cudf::detail::copy_bitmask(col.parent(), stream, mr), stream, mr); } diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index b00fb39158c..cf53d7f54e2 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -199,4 +199,25 @@ TEST_F(JsonTests, GetJsonObject) cudf::test::print(*result); } + + { + // spark behavioral difference. + // standard: "fiction" + // spark: fiction + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[2].category"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::print(*result); + } + + { + char const* str = "{\"a\" : \"b\"}"; + cudf::test::strings_column_wrapper input{{str, str, str, str}, {1, 0, 1, 0}}; + + std::string json_path("$.a"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::print(*result); + } } From f4196368d17b1018415f981b93887cf7becf2fd8 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Wed, 24 Feb 2021 15:54:20 -0600 Subject: [PATCH 14/33] Return null rows for queries with no result instead of just empty strings. --- cpp/src/strings/json/json_path.cu | 82 ++++++++++++++++++++++++------- cpp/tests/strings/json_tests.cpp | 38 ++++++++++++++ 2 files changed, 101 insertions(+), 19 deletions(-) diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 51a16122b45..953e30098ec 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -208,6 +209,7 @@ enum json_element_type { NONE, OBJECT, ARRAY, VALUE }; struct json_output { size_t output_max_len; size_t output_len; + int element_count; char* output; CUDA_HOST_DEVICE_CALLABLE void add_output(const char* str, size_t len) @@ -402,6 +404,7 @@ class json_state : private parser { } */ output->add_output({start, end - start}); + output->element_count++; } return parse_result::SUCCESS; } @@ -609,7 +612,7 @@ class path_state : private parser { } }; -std::pair, int> build_command_buffer( +std::tuple, int, bool> build_command_buffer( cudf::string_scalar const& json_path, rmm::cuda_stream_view stream) { std::string h_json_path = json_path.to_string(stream); @@ -637,7 +640,9 @@ std::pair, int> build_command_buffer( cudaMemcpyHostToDevice, stream.value()); - return {std::move(d_operators), max_stack_depth}; + return {std::move(d_operators), + max_stack_depth, + h_operators.size() == 1 && h_operators[0].type == path_operator_type::END ? true : false}; } #define PARSE_TRY(_x) \ @@ -797,10 +802,8 @@ CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input, char* out_buf, size_t out_buf_size) { - if (input_len == 0) { return json_output{0, 0, out_buf}; } - json_state j_state(input, input_len); - json_output output{out_buf_size, 0, out_buf}; + json_output output{out_buf_size, 0, 0, out_buf}; parse_json_path(j_state, commands, output); @@ -812,20 +815,37 @@ __global__ void get_json_object_kernel(char const* chars, path_operator const* const commands, size_type* output_offsets, char* out_buf, + bitmask_type* out_validity, size_type num_rows) { uint64_t const tid = threadIdx.x + (blockDim.x * blockIdx.x); - if (tid >= num_rows) { return; } - - char* dst = out_buf ? out_buf + output_offsets[tid] : nullptr; - size_t dst_size = out_buf ? output_offsets[tid + 1] - output_offsets[tid] : 0; + bool is_valid = false; + if (tid < num_rows) { + size_type src_size = offsets[tid + 1] - offsets[tid]; + size_type output_size = 0; + if (src_size > 0) { + char* dst = out_buf ? out_buf + output_offsets[tid] : nullptr; + size_t dst_size = out_buf ? output_offsets[tid + 1] - output_offsets[tid] : 0; + + json_output out = + get_json_object_single(chars + offsets[tid], src_size, commands, dst, dst_size); + output_size = out.output_len; + if (out.element_count > 0) { is_valid = true; } + } - json_output out = get_json_object_single( - chars + offsets[tid], offsets[tid + 1] - offsets[tid], commands, dst, dst_size); + // filled in only during the precompute step + if (!out_buf) { output_offsets[tid] = static_cast(output_size); } + } - // filled in only during the precompute step - if (!out_buf) { output_offsets[tid] = static_cast(out.output_len); } + // validity filled in only during the output step + if (out_validity) { + uint32_t mask = __ballot_sync(0xffffffff, is_valid); + // 0th lane of the warp writes the validity + if (!(tid % cudf::detail::warp_size) && tid < num_rows) { + out_validity[cudf::word_index(tid)] = mask; + } + } } std::unique_ptr get_json_object(cudf::strings_column_view const& col, @@ -834,24 +854,42 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c rmm::mr::device_memory_resource* mr) { // preprocess the json_path into a command buffer - std::pair, int> preprocess = + std::tuple, int, bool> preprocess = build_command_buffer(json_path, stream); - CUDF_EXPECTS(preprocess.second <= max_command_stack_depth, + CUDF_EXPECTS(std::get<1>(preprocess) <= max_command_stack_depth, "Encountered json_path string that is too complex"); auto offsets = cudf::make_fixed_width_column( data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr); cudf::mutable_column_view offsets_view(*offsets); + // if the query is empty, return a string column containing all nulls + if (std::get<2>(preprocess)) { + thrust::generate(rmm::exec_policy(stream), + offsets_view.head(), + offsets_view.head() + offsets_view.size(), + [] __device__() { return 0; }); + return cudf::make_strings_column( + col.size(), + std::move(offsets), + cudf::make_fixed_width_column( + data_type{type_id::INT8}, 0, mask_state::UNALLOCATED, stream, mr), + col.size(), + cudf::detail::create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr), + stream, + mr); + } + cudf::detail::grid_1d const grid{col.size(), 512}; // preprocess sizes get_json_object_kernel<<>>( col.chars().head(), col.offsets().head(), - preprocess.first.data(), + std::get<0>(preprocess).data(), offsets_view.head(), nullptr, + nullptr, col.size()); // convert sizes to offsets @@ -866,21 +904,27 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c auto chars = cudf::make_fixed_width_column( data_type{type_id::INT8}, output_size, mask_state::UNALLOCATED, stream, mr); + // potential optimization : if we know that all outputs are valid, we could skip creating + // the validity mask altogether + rmm::device_buffer validity = + cudf::detail::create_null_mask(col.size(), mask_state::UNINITIALIZED, stream, mr); + // compute results cudf::mutable_column_view chars_view(*chars); get_json_object_kernel<<>>( col.chars().head(), col.offsets().head(), - preprocess.first.data(), + std::get<0>(preprocess).data(), offsets_view.head(), chars_view.head(), + static_cast(validity.data()), col.size()); return make_strings_column(col.size(), std::move(offsets), std::move(chars), - col.null_count(), - cudf::detail::copy_bitmask(col.parent(), stream, mr), + UNKNOWN_NULL_COUNT, + std::move(validity), stream, mr); } diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index cf53d7f54e2..bb33b0d8289 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -220,4 +220,42 @@ TEST_F(JsonTests, GetJsonObject) cudf::test::print(*result); } + + // empty query -> null + { + cudf::test::strings_column_wrapper input{""}; + std::string json_path(""); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::print(*result); + } + + // empty input -> null + { + cudf::test::strings_column_wrapper input{""}; + std::string json_path("$"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::print(*result); + } + + // empty output -> null + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[*].c"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::print(*result); + } + + // slightly different from "empty output". in this case, we're + // returning something, but it happens to be empty. so we expect + // a valid, but empty row + { + cudf::test::strings_column_wrapper input{"{\"store\": { \"bicycle\" : \"\" } }"}; + std::string json_path("$.store.bicycle"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::print(*result); + } } From 4cd0e2d7633f5c7175dcb7c6898bac8882acd9d7 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Wed, 24 Mar 2021 20:55:03 -0500 Subject: [PATCH 15/33] get_json_path() cleaned up and ready for review. --- cpp/src/io/csv/csv_gpu.cu | 6 +- cpp/src/io/json/json_gpu.cu | 4 +- cpp/src/io/utilities/parsing_utils.cuh | 142 ++++--- cpp/src/strings/json/json_path.cu | 532 +++++++++++-------------- cpp/tests/strings/json_tests.cpp | 479 ++++++++++++++++------ 5 files changed, 664 insertions(+), 499 deletions(-) diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 86e5f1fdcae..44acc7fc55f 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -196,7 +196,7 @@ __global__ void __launch_bounds__(csvparse_block_dim) } else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) || serialized_trie_contains(opts.trie_false, {field_start, field_len})) { atomicAdd(&d_columnData[actual_col].bool_count, 1); - } else if (cudf::io::gpu::is_infinity(field_start, next_delimiter)) { + } else if (cudf::io::is_infinity(field_start, next_delimiter)) { atomicAdd(&d_columnData[actual_col].float_count, 1); } else { long countNumber = 0; @@ -277,7 +277,7 @@ __inline__ __device__ T decode_value(char const *begin, char const *end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } template @@ -285,7 +285,7 @@ __inline__ __device__ T decode_value(char const *begin, char const *end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } template <> diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 5efb64fd4d5..75910ae6b5b 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -114,7 +114,7 @@ __inline__ __device__ T decode_value(const char *begin, uint64_t end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } /** @@ -131,7 +131,7 @@ __inline__ __device__ T decode_value(const char *begin, const char *end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } /** diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 375874ea11e..b7719cba580 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -84,67 +84,6 @@ struct parse_options { } }; -namespace gpu { -/** - * @brief CUDA kernel iterates over the data until the end of the current field - * - * Also iterates over (one or more) delimiter characters after the field. - * Function applies to formats with field delimiters and line terminators. - * - * @param begin Pointer to the first element of the string - * @param end Pointer to the first element after the string - * @param opts A set of parsing options - * @param escape_char A boolean value to signify whether to consider `\` as escape character or - * just a character. - * - * @return Pointer to the last character in the field, including the - * delimiter(s) following the field data - */ -__device__ __inline__ char const* seek_field_end(char const* begin, - char const* end, - parse_options_view const& opts, - bool escape_char = false) -{ - bool quotation = false; - auto current = begin; - bool escape_next = false; - while (true) { - // Use simple logic to ignore control chars between any quote seq - // Handles nominal cases including doublequotes within quotes, but - // may not output exact failures as PANDAS for malformed fields. - // Check for instances such as "a2\"bc" and "\\" if `escape_char` is true. - - if (*current == opts.quotechar and not escape_next) { - quotation = !quotation; - } else if (!quotation) { - if (*current == opts.delimiter) { - while (opts.multi_delimiter && current < end && *(current + 1) == opts.delimiter) { - ++current; - } - break; - } else if (*current == opts.terminator) { - break; - } else if (*current == '\r' && (current + 1 < end && *(current + 1) == '\n')) { - --end; - break; - } - } - - if (escape_char == true) { - // If a escape character is encountered, escape next character in next loop. - if (escape_next == false and *current == '\\') { - escape_next = true; - } else { - escape_next = false; - } - } - - if (current >= end) break; - current++; - } - return current; -} - /** * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization * for integral types. Handles hexadecimal digits, both uppercase and lowercase. @@ -157,7 +96,7 @@ __device__ __inline__ char const* seek_field_end(char const* begin, * @return uint8_t Numeric value of the character, or `0` */ template ::value>* = nullptr> -__device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) +constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; if (c >= 'a' && c <= 'f') return c - 'a' + 10; @@ -178,7 +117,7 @@ __device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) * @return uint8_t Numeric value of the character, or `0` */ template ::value>* = nullptr> -__device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) +constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; @@ -187,10 +126,7 @@ __device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) } // Converts character to lowercase. -__inline__ __device__ char to_lower(char const c) -{ - return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; -} +constexpr char to_lower(char const c) { return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; } /** * @brief Checks if string is infinity, case insensitive with/without sign @@ -201,7 +137,7 @@ __inline__ __device__ char to_lower(char const c) * @param end Pointer to the first element after the string * @return true if string is valid infinity, else false. */ -__inline__ __device__ bool is_infinity(char const* begin, char const* end) +constexpr bool is_infinity(char const* begin, char const* end) { if (*begin == '-' || *begin == '+') begin++; char const* cinf = "infinity"; @@ -225,9 +161,10 @@ __inline__ __device__ bool is_infinity(char const* begin, char const* end) * @return The parsed and converted value */ template -__inline__ __device__ T parse_numeric(const char* begin, - const char* end, - parse_options_view const& opts) +constexpr T parse_numeric(const char* begin, + const char* end, + parse_options_view const& opts, + T error_result = std::numeric_limits::quiet_NaN()) { T value{}; bool all_digits_valid = true; @@ -283,11 +220,72 @@ __inline__ __device__ T parse_numeric(const char* begin, if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); } } } - if (!all_digits_valid) { return std::numeric_limits::quiet_NaN(); } + if (!all_digits_valid) { return error_result; } return value * sign; } +namespace gpu { +/** + * @brief CUDA kernel iterates over the data until the end of the current field + * + * Also iterates over (one or more) delimiter characters after the field. + * Function applies to formats with field delimiters and line terminators. + * + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @param opts A set of parsing options + * @param escape_char A boolean value to signify whether to consider `\` as escape character or + * just a character. + * + * @return Pointer to the last character in the field, including the + * delimiter(s) following the field data + */ +__device__ __inline__ char const* seek_field_end(char const* begin, + char const* end, + parse_options_view const& opts, + bool escape_char = false) +{ + bool quotation = false; + auto current = begin; + bool escape_next = false; + while (true) { + // Use simple logic to ignore control chars between any quote seq + // Handles nominal cases including doublequotes within quotes, but + // may not output exact failures as PANDAS for malformed fields. + // Check for instances such as "a2\"bc" and "\\" if `escape_char` is true. + + if (*current == opts.quotechar and not escape_next) { + quotation = !quotation; + } else if (!quotation) { + if (*current == opts.delimiter) { + while (opts.multi_delimiter && current < end && *(current + 1) == opts.delimiter) { + ++current; + } + break; + } else if (*current == opts.terminator) { + break; + } else if (*current == '\r' && (current + 1 < end && *(current + 1) == '\n')) { + --end; + break; + } + } + + if (escape_char == true) { + // If a escape character is encountered, escape next character in next loop. + if (escape_next == false and *current == '\\') { + escape_next = true; + } else { + escape_next = false; + } + } + + if (current >= end) break; + current++; + } + return current; +} + /** * @brief Lexicographically compare digits in input against string * representing an integer diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 65503dfa1a3..8cb3c7bb25b 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -21,15 +21,14 @@ namespace detail { namespace { -// temporary. for debugging purposes -#define DEBUG_NEWLINE -#define DEBUG_NEWLINE_LEN (0) +// debug accessibility -// #define DEBUG_NEWLINE "\n" -// #define DEBUG_NEWLINE_LEN (1) +// change to "\n" and 1 to make output more readable +#define DEBUG_NEWLINE +constexpr int DEBUG_NEWLINE_LEN = 0; -// temporary. spark doesn't strictly follow the JSONPath spec. -// I think this probably should be a configurable enum to control +// temporary? spark doesn't strictly follow the JSONPath spec. +// I think this probably could be a configurable enum to control // the kind of output you get and what features are supported. // // Current known differences: @@ -40,8 +39,6 @@ namespace { // - Spark only supports the wildcard operator when in a subscript, eg [*] // It does not handle .* // -#define __SPARK_BEHAVIORS - // Other, non-spark known differences: // // - In jsonpath_ng, name subscripts can use double quotes instead of the standard @@ -49,199 +46,63 @@ namespace { // standard: $.thing['subscript'] // jsonpath_ng: $.thing["subscript"] // -// Currently, this code only allows single-quotes but that can be easily expanded. +// Currently, this code only allows single-quotes but that could be expanded if necessary. // +#define SPARK_BEHAVIORS using namespace cudf; -CUDA_HOST_DEVICE_CALLABLE char to_lower(char const c) -{ - return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; -} - -template ::value>* = nullptr> -CUDA_HOST_DEVICE_CALLABLE uint8_t decode_digit(char c, bool* valid_flag) -{ - if (c >= '0' && c <= '9') return c - '0'; - if (c >= 'a' && c <= 'f') return c - 'a' + 10; - if (c >= 'A' && c <= 'F') return c - 'A' + 10; - - *valid_flag = false; - return 0; -} - -template ::value>* = nullptr> -CUDA_HOST_DEVICE_CALLABLE uint8_t decode_digit(char c, bool* valid_flag) -{ - if (c >= '0' && c <= '9') return c - '0'; - - *valid_flag = false; - return 0; -} - -CUDA_HOST_DEVICE_CALLABLE bool is_infinity(char const* begin, char const* end) -{ - if (*begin == '-' || *begin == '+') begin++; - char const* cinf = "infinity"; - auto index = begin; - while (index < end) { - if (*cinf != to_lower(*index)) break; - index++; - cinf++; - } - return ((index == begin + 3 || index == begin + 8) && index >= end); -} - -template -CUDA_HOST_DEVICE_CALLABLE T parse_numeric(const char* begin, - const char* end, - cudf::io::parse_options_view const& opts) -{ - T value{}; - bool all_digits_valid = true; - - // Handle negative values if necessary - int32_t sign = (*begin == '-') ? -1 : 1; - - // Handle infinity - if (std::is_floating_point::value && is_infinity(begin, end)) { - return sign * std::numeric_limits::infinity(); - } - if (*begin == '-' || *begin == '+') begin++; - - // Skip over the "0x" prefix for hex notation - if (base == 16 && begin + 2 < end && *begin == '0' && *(begin + 1) == 'x') { begin += 2; } - - // Handle the whole part of the number - // auto index = begin; - while (begin < end) { - if (*begin == opts.decimal) { - ++begin; - break; - } else if (base == 10 && (*begin == 'e' || *begin == 'E')) { - break; - } else if (*begin != opts.thousands && *begin != '+') { - value = (value * base) + decode_digit(*begin, &all_digits_valid); - } - ++begin; - } - - if (std::is_floating_point::value) { - // Handle fractional part of the number if necessary - double divisor = 1; - while (begin < end) { - if (*begin == 'e' || *begin == 'E') { - ++begin; - break; - } else if (*begin != opts.thousands && *begin != '+') { - divisor /= base; - value += decode_digit(*begin, &all_digits_valid) * divisor; - } - ++begin; - } - - // Handle exponential part of the number if necessary - if (begin < end) { - const int32_t exponent_sign = *begin == '-' ? -1 : 1; - if (*begin == '-' || *begin == '+') { ++begin; } - int32_t exponent = 0; - while (begin < end) { - exponent = (exponent * 10) + decode_digit(*(begin++), &all_digits_valid); - } - if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); } - } - } - if (!all_digits_valid) { return std::numeric_limits::quiet_NaN(); } - - return value * sign; -} - -CUDA_HOST_DEVICE_CALLABLE bool device_strncmp(const char* str1, const char* str2, size_t num_chars) -{ - for (size_t idx = 0; idx < num_chars; idx++) { - if (str1[idx] != str2[idx]) { return false; } - } - return true; -} - -CUDA_HOST_DEVICE_CALLABLE char const* device_strpbrk(const char* str, - size_t str_size, - const char* tok, - size_t tok_size) -{ - size_t pos = 0; - while (pos < str_size) { - size_t tpos = 0; - char c = str[pos]; - while (tpos < tok_size) { - if (c == tok[tpos]) { return str + pos; } - tpos++; - } - pos++; - } - return nullptr; -} +/** + * @brief Result of calling a parse type function. + * + * The primary use of this is to distinguish between "success" and + * "success but no data" return cases. For example, if you are reading the + * values of an array you might call a parse function in a while loop. You + * would want to continue doing this until you either encounter an error (parse_result::ERROR) + * or you get nothing back (parse_result::EMPTY) + */ +enum class parse_result { + ERROR, // failure + SUCCESS, // success + EMPTY, // success, but no data +}; struct json_string { const char* str; int64_t len; - CUDA_HOST_DEVICE_CALLABLE json_string() : str(nullptr), len(-1) {} - CUDA_HOST_DEVICE_CALLABLE json_string(const char* _str, int64_t _len) : str(_str), len(_len) {} + constexpr json_string() : str(nullptr), len(-1) {} + constexpr json_string(const char* _str, int64_t _len) : str(_str), len(_len) {} - CUDA_HOST_DEVICE_CALLABLE bool operator==(json_string const& cmp) + constexpr bool operator==(json_string const& cmp) { return len == cmp.len && str != nullptr && cmp.str != nullptr && - device_strncmp(str, cmp.str, static_cast(len)); + thrust::equal(thrust::seq, str, str + len, cmp.str); } }; -enum class parse_result { - ERROR, - SUCCESS, - EMPTY, -}; - -enum json_element_type { NONE, OBJECT, ARRAY, VALUE }; - -struct json_output { - size_t output_max_len; - size_t output_len; - int element_count; - char* output; - - CUDA_HOST_DEVICE_CALLABLE void add_output(const char* str, size_t len) - { - if (output != nullptr) { - // assert output_len + len < output_max_len - memcpy(output + output_len, str, len); - } - output_len += len; - } - - CUDA_HOST_DEVICE_CALLABLE void add_output(json_string str) { add_output(str.str, str.len); } -}; - -CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c) -{ - return c == ' ' || c == '\r' || c == '\n' || c == '\t' ? true : false; -} - +/** + * @brief Base parser class inherited by the (device-side) json_state class and + * (host-side) path_state class. + * + * Contains a number of useful utility functions common to parsing json and + * JSONPath strings. + */ class parser { protected: - CUDA_HOST_DEVICE_CALLABLE parser() : input(nullptr), input_len(0), pos(nullptr) {} - CUDA_HOST_DEVICE_CALLABLE parser(const char* _input, int64_t _input_len) + constexpr parser() : input(nullptr), input_len(0), pos(nullptr) {} + constexpr parser(const char* _input, int64_t _input_len) : input(_input), input_len(_input_len), pos(_input) { parse_whitespace(); } - CUDA_HOST_DEVICE_CALLABLE parser(parser const& p) { *this = p; } + constexpr parser(parser const& p) : input(p.input), input_len(p.input_len), pos(p.pos) {} - CUDA_HOST_DEVICE_CALLABLE bool eof(const char* p) { return p - input >= input_len; } - CUDA_HOST_DEVICE_CALLABLE bool eof() { return eof(pos); } + constexpr bool eof(const char* p) { return p - input >= input_len; } + constexpr bool eof() { return eof(pos); } - CUDA_HOST_DEVICE_CALLABLE bool parse_whitespace() + constexpr bool parse_whitespace() { while (!eof()) { if (is_whitespace(*pos)) { @@ -253,25 +114,21 @@ class parser { return false; } - CUDA_HOST_DEVICE_CALLABLE parse_result parse_string(json_string& str, - bool can_be_empty, - char quote) + constexpr parse_result parse_string(json_string& str, bool can_be_empty, char quote) { str.str = nullptr; str.len = 0; - if (parse_whitespace()) { - if (*pos == quote) { - const char* start = ++pos; - while (!eof()) { - if (*pos == quote) { - str.str = start; - str.len = pos - start; - pos++; - return parse_result::SUCCESS; - } + if (parse_whitespace() && *pos == quote) { + const char* start = ++pos; + while (!eof()) { + if (*pos == quote) { + str.str = start; + str.len = pos - start; pos++; + return parse_result::SUCCESS; } + pos++; } } @@ -281,9 +138,7 @@ class parser { // a name means: // - a string followed by a : // - no string - CUDA_HOST_DEVICE_CALLABLE parse_result parse_name(json_string& name, - bool can_be_empty, - char quote) + constexpr parse_result parse_name(json_string& name, bool can_be_empty, char quote) { if (parse_string(name, can_be_empty, quote) == parse_result::ERROR) { return parse_result::ERROR; @@ -300,18 +155,22 @@ class parser { return parse_result::EMPTY; } - // this function is not particularly strong - CUDA_HOST_DEVICE_CALLABLE parse_result parse_number(json_string& val) + // numbers, true, false, null. + // this function is not particularly strong. badly formed values will get + // consumed without throwing any errors + constexpr parse_result parse_non_string_value(json_string& val) { if (!parse_whitespace()) { return parse_result::ERROR; } - // parse to the end of the number (does not do any error checking on whether - // the number is reasonably formed or not) + // parse to the end of the value char const* start = pos; char const* end = start; while (!eof(end)) { - char c = *end; - if (c == ',' || is_whitespace(c)) { break; } + char const c = *end; + if (c == ',' || c == '}' || c == ']' || is_whitespace(c)) { break; } + + // illegal chars + if (c == '[' || c == '{' || c == ':' || c == '\"') { return parse_result::ERROR; } end++; } pos = end; @@ -326,25 +185,56 @@ class parser { char const* input; int64_t input_len; char const* pos; + + private: + constexpr bool is_whitespace(char c) + { + return c == ' ' || c == '\r' || c == '\n' || c == '\t' ? true : false; + } }; +struct json_output { + size_t output_max_len; + size_t output_len; + int element_count; + char* output; + + constexpr void add_output(const char* str, size_t len) + { + if (output != nullptr) { memcpy(output + output_len, str, len); } + output_len += len; + } + + constexpr void add_output(json_string str) { add_output(str.str, str.len); } +}; + +enum json_element_type { NONE, OBJECT, ARRAY, VALUE }; + class json_state : private parser { public: - CUDA_HOST_DEVICE_CALLABLE json_state() - : parser(), element(json_element_type::NONE), cur_el_start(nullptr) + constexpr json_state() + : parser(), + element(json_element_type::NONE), + cur_el_start(nullptr), + cur_el_type(json_element_type::NONE) { } - CUDA_HOST_DEVICE_CALLABLE json_state(const char* _input, int64_t _input_len) - : parser(_input, _input_len), element(json_element_type::NONE), cur_el_start(nullptr) + constexpr json_state(const char* _input, int64_t _input_len) + : parser(_input, _input_len), + element(json_element_type::NONE), + cur_el_start(nullptr), + cur_el_type(json_element_type::NONE) { } - CUDA_HOST_DEVICE_CALLABLE json_state(json_state const& j) : parser(j) { *this = j; } - - CUDA_HOST_DEVICE_CALLABLE parse_result extract_element(json_output* output, bool list_element) + constexpr json_state(json_state const& j) + : parser(j), element(j.element), cur_el_start(j.cur_el_start), cur_el_type(j.cur_el_type) { - // collapse the current element into a json_string + } + // retrieve the entire current element as a json_string + constexpr parse_result extract_element(json_output* output, bool list_element) + { char const* start = cur_el_start; char const* end = start; @@ -354,7 +244,7 @@ class json_state : private parser { if (parse_value() != parse_result::SUCCESS) { return parse_result::ERROR; } end = pos; -#if defined(__SPARK_BEHAVIORS) +#if defined(SPARK_BEHAVIORS) // spark/hive-specific behavior. if this is a non-list-element wrapped in quotes, // strip them if (!list_element && *start == '\"' && *(end - 1) == '\"') { @@ -369,11 +259,10 @@ class json_state : private parser { int arr_count = 0; while (!eof(end)) { - char c = *end++; // could do some additional checks here. we know our current // element type, so we could be more strict on what kinds of // characters we expect to see. - switch (c) { + switch (*end++) { case '{': obj_count++; break; case '}': obj_count--; break; case '[': arr_count++; break; @@ -382,6 +271,7 @@ class json_state : private parser { } if (obj_count == 0 && arr_count == 0) { break; } } + if (obj_count > 0 || arr_count > 0) { return parse_result::ERROR; } pos = end; } @@ -391,32 +281,24 @@ class json_state : private parser { } if (output != nullptr) { - // seems like names are never included with JSONPath unless - // they are nested within the element being returned. - /* - if(cur_el_name.len > 0){ - output->add_output({"\"", 1}); - output->add_output(cur_el_name); - output->add_output({"\"", 1}); - output->add_output({":", 1}); - } - */ output->add_output({start, end - start}); output->element_count++; } return parse_result::SUCCESS; } - CUDA_HOST_DEVICE_CALLABLE parse_result skip_element() { return extract_element(nullptr, false); } - - json_element_type element; + constexpr parse_result skip_element() { return extract_element(nullptr, false); } - CUDA_HOST_DEVICE_CALLABLE parse_result next_element() { return next_element_internal(false); } + constexpr parse_result next_element() { return next_element_internal(false); } - CUDA_HOST_DEVICE_CALLABLE parse_result child_element() { return next_element_internal(true); } + constexpr parse_result child_element(bool as_field = false) + { + // cannot retrieve a field from an array + if (as_field && cur_el_type == json_element_type::ARRAY) { return parse_result::ERROR; } + return next_element_internal(true); + } - CUDA_HOST_DEVICE_CALLABLE parse_result next_matching_element(json_string const& name, - bool inclusive) + constexpr parse_result next_matching_element(json_string const& name, bool inclusive) { // if we're not including the current element, skip it if (!inclusive) { @@ -441,16 +323,16 @@ class json_state : private parser { } private: - CUDA_HOST_DEVICE_CALLABLE parse_result parse_value() + constexpr parse_result parse_value() { if (!parse_whitespace()) { return parse_result::ERROR; } // string or number? json_string unused; - return *pos == '\"' ? parse_string(unused, false, '\"') : parse_number(unused); + return *pos == '\"' ? parse_string(unused, false, '\"') : parse_non_string_value(unused); } - CUDA_HOST_DEVICE_CALLABLE parse_result next_element_internal(bool child) + constexpr parse_result next_element_internal(bool child) { // if we're not getting a child element, skip the current element. // this will leave pos as the first character -after- the close of @@ -461,10 +343,15 @@ class json_state : private parser { } // otherwise pos will be at the first character within the current element + // can only get the child of an object or array. + // this could theoretically be handled as an error, but the evaluators I've found + // seem to treat this as "it's nothing" + if (child && (cur_el_type == VALUE || cur_el_type == NONE)) { return parse_result::EMPTY; } + // what's next if (!parse_whitespace()) { return parse_result::EMPTY; } // if we're closing off a parent element, we're done - char c = *pos; + char const c = *pos; if (c == ']' || c == '}') { return parse_result::EMPTY; } // element name, if any @@ -472,7 +359,7 @@ class json_state : private parser { // element type if (!parse_whitespace()) { return parse_result::EMPTY; } - switch (*pos) { + switch (*pos++) { case '[': cur_el_type = ARRAY; break; case '{': cur_el_type = OBJECT; break; @@ -483,13 +370,13 @@ class json_state : private parser { // value type default: cur_el_type = VALUE; break; } - pos++; // the start of the current element is always at the value, not the name cur_el_start = pos - 1; return parse_result::SUCCESS; } + json_element_type element; const char* cur_el_start; json_string cur_el_name; json_element_type cur_el_type; @@ -498,34 +385,24 @@ class json_state : private parser { enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END }; struct path_operator { - CUDA_HOST_DEVICE_CALLABLE path_operator() : type(path_operator_type::ERROR), index(-1) {} - CUDA_HOST_DEVICE_CALLABLE path_operator(path_operator_type _type) : type(_type), index(-1) {} + constexpr path_operator() : type(path_operator_type::ERROR), index(-1) {} + constexpr path_operator(path_operator_type _type) : type(_type), index(-1) {} path_operator_type type; json_string name; int index; }; -struct command_buffer { - rmm::device_uvector commands; - // used as backing memory for the name fields inside the - // path_operator objects - string_scalar json_path; -}; // current state of the JSONPath class path_state : private parser { public: - CUDA_HOST_DEVICE_CALLABLE path_state(const char* _path, size_t _path_len) - : parser(_path, _path_len) - { - } + path_state(const char* _path, size_t _path_len) : parser(_path, _path_len) {} - CUDA_HOST_DEVICE_CALLABLE path_operator get_next_operator() + path_operator get_next_operator() { if (eof()) { return {path_operator_type::END}; } - char c = *pos++; - switch (c) { + switch (*pos++) { case '$': return {path_operator_type::ROOT}; case '.': { @@ -551,7 +428,7 @@ class path_state : private parser { case '[': { path_operator op; json_string term{"]", 1}; - bool is_string = *pos == '\'' ? true : false; + bool const is_string = *pos == '\'' ? true : false; if (parse_path_name(op.name, term)) { pos++; if (op.name.len == 1 && op.name.str[0] == '*') { @@ -560,15 +437,23 @@ class path_state : private parser { if (is_string) { op.type = path_operator_type::CHILD; } else { - op.type = path_operator_type::CHILD_INDEX; - op.index = parse_numeric(op.name.str, op.name.str + op.name.len, json_opts); + op.type = path_operator_type::CHILD_INDEX; + op.index = + cudf::io::parse_numeric(op.name.str, op.name.str + op.name.len, json_opts, -1); + CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath"); } } return op; } } break; - default: break; + // wildcard operator + case '*': { + pos++; + return path_operator{path_operator_type::CHILD_WILDCARD}; + } break; + + default: CUDF_FAIL("Unrecognized JSONPath operator"); break; } return {path_operator_type::ERROR}; } @@ -576,10 +461,9 @@ class path_state : private parser { private: cudf::io::parse_options_view json_opts{',', '\n', '\"', '.'}; - CUDA_HOST_DEVICE_CALLABLE bool parse_path_name(json_string& name, json_string& terminators) + bool parse_path_name(json_string& name, json_string const& terminators) { - char c = *pos; - switch (c) { + switch (*pos) { case '*': name.str = pos; name.len = 1; @@ -592,7 +476,8 @@ class path_state : private parser { default: { size_t const chars_left = input_len - (pos - input); - char const* end = device_strpbrk(pos, chars_left, terminators.str, terminators.len); + char const* end = std::find_first_of( + pos, pos + chars_left, terminators.str, terminators.str + terminators.len); if (end) { name.str = pos; name.len = end - pos; @@ -602,10 +487,13 @@ class path_state : private parser { name.len = chars_left; pos = input + input_len; } - return true; + break; } } + // an empty name is not valid + CUDF_EXPECTS(name.len > 0, "Invalid empty name in JSONpath query string"); + return true; } }; @@ -628,15 +516,25 @@ std::tuple, int, bool> build_command_buffer( if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; } // convert pointer to device pointer if (op.name.len > 0) { op.name.str = json_path.data() + (op.name.str - h_json_path.data()); } + if (op.type == path_operator_type::ROOT) { + CUDF_EXPECTS(h_operators.size() == 0, "Root operator ($) can only exist at the root"); + } + // if we havent' gotten a root operator to start, and we're not empty, quietly push a + // root operator now. + if (h_operators.size() == 0 && op.type != path_operator_type::ROOT && + op.type != path_operator_type::END) { + h_operators.push_back(path_operator{path_operator_type::ROOT}); + } h_operators.push_back(op); } while (op.type != path_operator_type::END); rmm::device_uvector d_operators(h_operators.size(), stream); - cudaMemcpyAsync(d_operators.data(), - h_operators.data(), - sizeof(path_operator) * h_operators.size(), - cudaMemcpyHostToDevice, - stream.value()); + CUDA_TRY(cudaMemcpyAsync(d_operators.data(), + h_operators.data(), + sizeof(path_operator) * h_operators.size(), + cudaMemcpyHostToDevice, + stream.value())); + stream.synchronize(); return {std::move(d_operators), max_stack_depth, @@ -650,17 +548,17 @@ std::tuple, int, bool> build_command_buffer( } while (0) template -CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, - path_operator const* _commands, - json_output& output, - bool _list_element = false) +__device__ parse_result parse_json_path(json_state& _j_state, + path_operator const* _commands, + json_output& output, + bool _list_element = false) { // manually maintained context stack in lieu of calling parse_json_path recursively. struct context { json_state j_state; path_operator const* commands; bool list_element; - int element_count; + // int element_count; bool state_flag; }; context stack[max_command_stack_depth]; @@ -668,10 +566,11 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, auto push_context = [&stack, &stack_pos](json_state const& _j_state, path_operator const* _commands, bool _list_element = false, - int _element_count = 0, - bool _state_flag = false) { + /* int _element_count = 0,*/ + bool _state_flag = false) { if (stack_pos == max_command_stack_depth - 1) { return false; } - stack[stack_pos++] = context{_j_state, _commands, _list_element, _element_count, _state_flag}; + stack[stack_pos++] = + context{_j_state, _commands, _list_element, /*_element_count,*/ _state_flag}; return true; }; auto pop_context = [&stack, &stack_pos](context& c) { @@ -685,6 +584,7 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, parse_result last_result = parse_result::SUCCESS; context ctx; + int element_count = 0; while (pop_context(ctx)) { path_operator op = *ctx.commands; @@ -700,11 +600,11 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, // [1] // will return a single thing case path_operator_type::CHILD: { - PARSE_TRY(ctx.j_state.child_element()); + PARSE_TRY(ctx.j_state.child_element(true)); if (last_result == parse_result::SUCCESS) { PARSE_TRY(ctx.j_state.next_matching_element(op.name, true)); if (last_result == parse_result::SUCCESS) { - push_context(ctx.j_state, ctx.commands + 1, ctx.list_element, ctx.element_count); + push_context(ctx.j_state, ctx.commands + 1, ctx.list_element); } } } break; @@ -715,44 +615,48 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, case path_operator_type::CHILD_WILDCARD: { // if we're on the first element of this wildcard if (!ctx.state_flag) { - output.add_output("[" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN); + // we will only ever be returning 1 array + if (!ctx.list_element) { output.add_output({"[" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); } // step into the child element PARSE_TRY(ctx.j_state.child_element()); if (last_result == parse_result::EMPTY) { - output.add_output("]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN); + if (!ctx.list_element) { + output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); + } last_result = parse_result::SUCCESS; break; } // first element - PARSE_TRY(ctx.j_state.next_matching_element(op.name, true)); + PARSE_TRY(ctx.j_state.next_matching_element({"*", 1}, true)); if (last_result == parse_result::EMPTY) { - output.add_output("]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN); + if (!ctx.list_element) { + output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); + } last_result = parse_result::SUCCESS; break; } // re-push ourselves - push_context(ctx.j_state, ctx.commands, false, 0, true); + push_context(ctx.j_state, ctx.commands, ctx.list_element, true); // push the next command - push_context(ctx.j_state, ctx.commands + 1, true, 0); + push_context(ctx.j_state, ctx.commands + 1, true); } else { - // if we actually processed something to the output, increment count - if (last_result != parse_result::EMPTY) { ctx.element_count++; } - // next element - PARSE_TRY(ctx.j_state.next_matching_element(op.name, false)); + PARSE_TRY(ctx.j_state.next_matching_element({"*", 1}, false)); if (last_result == parse_result::EMPTY) { - output.add_output("]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN); + if (!ctx.list_element) { + output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); + } last_result = parse_result::SUCCESS; break; } // re-push ourselves - push_context(ctx.j_state, ctx.commands, false, 0, true); + push_context(ctx.j_state, ctx.commands, ctx.list_element, true); // push the next command - push_context(ctx.j_state, ctx.commands + 1, true, ctx.element_count); + push_context(ctx.j_state, ctx.commands + 1, true); } } break; @@ -763,14 +667,17 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, case path_operator_type::CHILD_INDEX: { PARSE_TRY(ctx.j_state.child_element()); if (last_result == parse_result::SUCCESS) { - json_string any{"*", 1}; + json_string const any{"*", 1}; PARSE_TRY(ctx.j_state.next_matching_element(any, true)); if (last_result == parse_result::SUCCESS) { - for (int idx = 1; idx <= op.index; idx++) { + int idx; + for (idx = 1; idx <= op.index; idx++) { PARSE_TRY(ctx.j_state.next_matching_element(any, false)); if (last_result == parse_result::EMPTY) { break; } } - push_context(ctx.j_state, ctx.commands + 1, ctx.list_element, ctx.element_count); + // if we didn't end up at the index we requested, this is an invalid indexe + if (idx - 1 != op.index) { return parse_result::ERROR; } + push_context(ctx.j_state, ctx.commands + 1, ctx.list_element); } } } break; @@ -780,13 +687,15 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, // END case default: { - if (ctx.list_element && ctx.element_count > 0) { + if (ctx.list_element && element_count > 0) { output.add_output({"," DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); } PARSE_TRY(ctx.j_state.extract_element(&output, ctx.list_element)); + if (ctx.list_element && last_result != parse_result::EMPTY) { element_count++; } } break; } } + return parse_result::SUCCESS; } @@ -794,20 +703,29 @@ CUDA_HOST_DEVICE_CALLABLE parse_result parse_json_path(json_state& _j_state, // a jsonpath containing 7 nested wildcards so this is probably reasonable. constexpr int max_command_stack_depth = 8; -CUDA_HOST_DEVICE_CALLABLE json_output get_json_object_single(char const* input, - size_t input_len, - path_operator const* const commands, - char* out_buf, - size_t out_buf_size) +__device__ thrust::pair get_json_object_single( + char const* input, + size_t input_len, + path_operator const* const commands, + char* out_buf, + size_t out_buf_size) { json_state j_state(input, input_len); json_output output{out_buf_size, 0, 0, out_buf}; - parse_json_path(j_state, commands, output); + auto const result = parse_json_path(j_state, commands, output); - return output; + return {result, output}; } +/** + * @brief Kernel for running the JSONPath query. + * + * This kernel operates in a 2-pass way. On the first pass, it computes + * output sizes. On the second pass it fills in the provided output buffers + * (chars and validity) + * + */ __global__ void get_json_object_kernel(char const* chars, size_type const* offsets, path_operator const* const commands, @@ -820,16 +738,18 @@ __global__ void get_json_object_kernel(char const* chars, bool is_valid = false; if (tid < num_rows) { - size_type src_size = offsets[tid + 1] - offsets[tid]; - size_type output_size = 0; + size_type const src_size = offsets[tid + 1] - offsets[tid]; + size_type output_size = 0; if (src_size > 0) { - char* dst = out_buf ? out_buf + output_offsets[tid] : nullptr; - size_t dst_size = out_buf ? output_offsets[tid + 1] - output_offsets[tid] : 0; + char* dst = out_buf ? out_buf + output_offsets[tid] : nullptr; + size_t const dst_size = out_buf ? output_offsets[tid + 1] - output_offsets[tid] : 0; - json_output out = + parse_result result; + json_output out; + thrust::tie(result, out) = get_json_object_single(chars + offsets[tid], src_size, commands, dst, dst_size); output_size = out.output_len; - if (out.element_count > 0) { is_valid = true; } + if (out.element_count > 0 && result == parse_result::SUCCESS) { is_valid = true; } } // filled in only during the precompute step @@ -857,6 +777,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c CUDF_EXPECTS(std::get<1>(preprocess) <= max_command_stack_depth, "Encountered json_path string that is too complex"); + // allocate output offsets buffer. auto offsets = cudf::make_fixed_width_column( data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr); cudf::mutable_column_view offsets_view(*offsets); @@ -880,7 +801,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::grid_1d const grid{col.size(), 512}; - // preprocess sizes + // preprocess sizes (returned in the offsets buffer) get_json_object_kernel<<>>( col.chars().head(), col.offsets().head(), @@ -896,7 +817,8 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c offsets_view.head() + col.size() + 1, offsets_view.head(), 0); - size_type output_size = cudf::detail::get_value(offsets_view, col.size(), stream); + size_type const output_size = + cudf::detail::get_value(offsets_view, col.size(), stream); // allocate output string column auto chars = cudf::make_fixed_width_column( diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index bb33b0d8289..3c9aff72337 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -15,189 +15,299 @@ */ #include +#include +#include #include #include #include +// reference: https://jsonpath.herokuapp.com/ /* -const char* json_string = "{ + { "store": { "book": [ - { + { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 - }, - { + }, + { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 - }, - { + }, + { "category": "fiction", "author": "Herman Melville", "title": "Moby Dick", "isbn": "0-553-21311-3", "price": 8.99 - }, - { + }, + { "category": "fiction", "author": "J. R. R. Tolkien", "title": "The Lord of the Rings", "isbn": "0-395-19395-8", "price": 22.99 - } + } ], "bicycle": { - "color": "red", - "price": 19.95 + "color": "red", + "price": 19.95 } }, -}"; + "expensive": 10 + } */ +// this string is formatted to result in a reasonably readable debug printf +std::string json_string{ + "{\n\"store\": {\n\t\"book\": [\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": " + "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": " + "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn " + "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": " + "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman " + "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": " + "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": " + "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the " + "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": " + "22.99\n\t\t}\n\t],\n\t\"bicycle\": {\n\t\t\"color\": \"red\",\n\t\t\"price\": " + "19.95\n\t}\n},\n\"expensive\": 10\n}"}; + +std::unique_ptr drop_whitespace(cudf::column_view const& col) +{ + cudf::test::strings_column_wrapper whitespace{"\n", "\r", "\t"}; + cudf::test::strings_column_wrapper repl{"", "", ""}; + + cudf::strings_column_view strings(col); + cudf::strings_column_view targets(whitespace); + cudf::strings_column_view replacements(repl); + return cudf::strings::replace(strings, targets, replacements); +} + struct JsonTests : public cudf::test::BaseFixture { }; -TEST_F(JsonTests, GetJsonObject) +TEST_F(JsonTests, GetJsonObjectRootOp) { - // reference: https://jsonpath.herokuapp.com/ - // clang-format off - /* - { - "store": { - "book": [ - { - "category": "reference", - "author": "Nigel Rees", - "title": "Sayings of the Century", - "price": 8.95 - }, - { - "category": "fiction", - "author": "Evelyn Waugh", - "title": "Sword of Honour", - "price": 12.99 - }, - { - "category": "fiction", - "author": "Herman Melville", - "title": "Moby Dick", - "isbn": "0-553-21311-3", - "price": 8.99 - }, - { - "category": "fiction", - "author": "J. R. R. Tolkien", - "title": "The Lord of the Rings", - "isbn": "0-395-19395-8", - "price": 22.99 - } - ], - "bicycle": { - "color": "red", - "price": 19.95 - } - }, - "expensive": 10 - } - */ - // clang-format on - // this string is formatted to result in a reasonably readable debug printf - const char* json_string = - "{\n\"store\": {\n\t\"book\": [\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": " - "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": " - "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn " - "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": " - "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman " - "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": " - "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": " - "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the " - "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": " - "22.99\n\t\t}\n\t],\n\t\"bicycle\": {\n\t\t\"color\": \"red\",\n\t\t\"price\": " - "19.95\n\t}\n},\n\"expensive\": 10\n}"; + // root + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); - { - cudf::test::strings_column_wrapper input{json_string}; - std::string json_path("$"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto expected = drop_whitespace(input); - cudf::test::print(*result); - } + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); +} +TEST_F(JsonTests, GetJsonObjectChildOp) +{ { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); - - cudf::test::print(*result); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{ + "{\n\t\"book\": [\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": " + "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": " + "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn " + "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": " + "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman " + "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": " + "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": " + "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the " + "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": " + "22.99\n\t\t}\n\t],\n\t\"bicycle\": {\n\t\t\"color\": \"red\",\n\t\t\"price\": " + "19.95\n\t}\n}"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); } { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); - - cudf::test::print(*result); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{ + "[\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": " + "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": " + "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn " + "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": " + "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman " + "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": " + "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": " + "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the " + "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": " + "22.99\n\t\t}\n\t]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); } +} +TEST_F(JsonTests, GetJsonObjectWildcardOp) +{ { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.*"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{ + "[[\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": " + "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": " + "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn " + "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": " + "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman " + "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": " + "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": " + "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the " + "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": " + "22.99\n\t\t}\n\t],\n\t{\n\t\t\"color\": \"red\",\n\t\t\"price\": " + "19.95\n\t}]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } - cudf::test::print(*result); + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("*"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{ + "[{\"book\": [\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": " + "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": " + "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn " + "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": " + "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman " + "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": " + "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": " + "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the " + "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": " + "22.99\n\t\t}\n\t],\n\t\"bicycle\": {\n\t\t\"color\": \"red\",\n\t\t\"price\": " + "19.95\n\t}\n},10]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); } +} +TEST_F(JsonTests, GetJsonObjectSubscriptOp) +{ { cudf::test::strings_column_wrapper input{json_string}; - std::string json_path("$.store.book[*]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + std::string json_path("$.store.book[2]"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); - cudf::test::print(*result); + cudf::test::strings_column_wrapper expected_raw{ + "{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman " + "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": " + "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t}"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); } { cudf::test::strings_column_wrapper input{json_string}; - std::string json_path("$.store.book[*].category"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + std::string json_path("$.store['bicycle']"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); - cudf::test::print(*result); + cudf::test::strings_column_wrapper expected_raw{ + "{\n\t\t\"color\": \"red\",\n\t\t\"price\": " + "19.95\n\t}"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); } { cudf::test::strings_column_wrapper input{json_string}; - std::string json_path("$.store.book[*].title"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + std::string json_path("$.store.book[*]"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{ + "[\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": " + "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": " + "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn " + "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": " + "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman " + "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": " + "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": " + "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the " + "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": " + "22.99\n\t\t}\n\t]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} - cudf::test::print(*result); +TEST_F(JsonTests, GetJsonObjectFilter) +{ + // queries that result in filtering/collating results (mostly meaning - generates new + // json instead of just returning parts of the existing string + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*]['isbn']"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{"[\"0-553-21311-3\",\"0-395-19395-8\"]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); } { cudf::test::strings_column_wrapper input{json_string}; - std::string json_path("$.store['bicycle']"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + std::string json_path("$.store.book[*].category"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{ + "[\"reference\",\"fiction\",\"fiction\",\"fiction\"]"}; + auto expected = drop_whitespace(expected_raw); - cudf::test::print(*result); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); } { cudf::test::strings_column_wrapper input{json_string}; - std::string json_path("$.store.book[*]['isbn']"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + std::string json_path("$.store.book[*].title"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{ + "[\"Sayings of the Century\",\"Sword of Honour\",\"Moby Dick\",\"The Lord of the Rings\"]"}; + auto expected = drop_whitespace(expected_raw); - cudf::test::print(*result); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); } { cudf::test::strings_column_wrapper input{json_string}; - std::string json_path("$.store.book[2]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + std::string json_path("$.store.book.*.price"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); - cudf::test::print(*result); + cudf::test::strings_column_wrapper expected_raw{"[8.95,12.99,8.99,22.99]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); } { @@ -206,56 +316,191 @@ TEST_F(JsonTests, GetJsonObject) // spark: fiction cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[2].category"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); - cudf::test::print(*result); + cudf::test::strings_column_wrapper expected_raw{"fiction"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); } +} +TEST_F(JsonTests, GetJsonObjectNullInputs) +{ { - char const* str = "{\"a\" : \"b\"}"; - cudf::test::strings_column_wrapper input{{str, str, str, str}, {1, 0, 1, 0}}; + std::string str("{\"a\" : \"b\"}"); + cudf::test::strings_column_wrapper input({str, str, str, str}, {1, 0, 1, 0}); std::string json_path("$.a"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); - cudf::test::print(*result); + cudf::test::strings_column_wrapper expected_raw({"b", "", "b", ""}, {1, 0, 1, 0}); + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); } +} +TEST_F(JsonTests, GetJsonObjectEmptyQuery) +{ // empty query -> null { - cudf::test::strings_column_wrapper input{""}; + cudf::test::strings_column_wrapper input{"{\"a\" : \"b\"}"}; std::string json_path(""); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); - cudf::test::print(*result); + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); } +} +TEST_F(JsonTests, GetJsonObjectEmptyInputsAndOutputs) +{ // empty input -> null { cudf::test::strings_column_wrapper input{""}; std::string json_path("$"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); - cudf::test::print(*result); + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // slightly different from "empty output". in this case, we're + // returning something, but it happens to be empty. so we expect + // a valid, but empty row + { + cudf::test::strings_column_wrapper input{"{\"store\": { \"bicycle\" : \"\" } }"}; + std::string json_path("$.store.bicycle"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} + +// badly formed JSONpath strings +TEST_F(JsonTests, GetJsonObjectIllegalQuery) +{ + // can't have more than one root operator, or a root operator anywhere other + // than the beginning + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$$"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // invalid index + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[auh46h-]"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); } - // empty output -> null + // invalid index + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[[]]"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // negative index + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[-1]"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // child operator with no name specified + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("."); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("]["); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("6hw6,56i3"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } +} + +// queries that are legal, but reference invalid parts of the input +TEST_F(JsonTests, GetJsonObjectInvalidQuery) +{ + // non-existent field { cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; std::string json_path("$[*].c"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); - cudf::test::print(*result); + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); } - // slightly different from "empty output". in this case, we're - // returning something, but it happens to be empty. so we expect - // a valid, but empty row + // non-existent field { - cudf::test::strings_column_wrapper input{"{\"store\": { \"bicycle\" : \"\" } }"}; - std::string json_path("$.store.bicycle"); + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[*].c[2]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // non-existent field + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book.price"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); - cudf::test::print(*result); + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // out of bounds index + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[4]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); } } From b1a2b0980816d094cd7099240cf8c9a11fa8fe55 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Wed, 24 Mar 2021 21:12:39 -0500 Subject: [PATCH 16/33] Update meta.yaml --- conda/recipes/libcudf/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 5657d21889f..7188065eca0 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -176,6 +176,7 @@ test: - test -f $PREFIX/include/cudf/strings/detail/copying.hpp - test -f $PREFIX/include/cudf/strings/detail/fill.hpp - test -f $PREFIX/include/cudf/strings/detail/replace.hpp + - test -f $PREFIX/include/cudf/strings/detail/substring.hpp - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp - test -f $PREFIX/include/cudf/strings/extract.hpp - test -f $PREFIX/include/cudf/strings/findall.hpp From 02e20b7904fb09f5fd1047b860d2dcfb5f2e8776 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Wed, 24 Mar 2021 21:42:55 -0500 Subject: [PATCH 17/33] Additional docs and cleanup --- cpp/benchmarks/string/json_benchmark.cpp | 2 +- cpp/include/cudf/strings/detail/substring.hpp | 2 +- cpp/src/strings/json/json_path.cu | 116 +++++++++++++----- 3 files changed, 87 insertions(+), 33 deletions(-) diff --git a/cpp/benchmarks/string/json_benchmark.cpp b/cpp/benchmarks/string/json_benchmark.cpp index df1aadef404..dbb33927716 100644 --- a/cpp/benchmarks/string/json_benchmark.cpp +++ b/cpp/benchmarks/string/json_benchmark.cpp @@ -137,4 +137,4 @@ JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]"); JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category"); JSON_BENCHMARK_DEFINE(query6, "$.store.['bicycle']"); JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']"); -JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]"); \ No newline at end of file +JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]"); diff --git a/cpp/include/cudf/strings/detail/substring.hpp b/cpp/include/cudf/strings/detail/substring.hpp index a646d93e2b8..35b7dec6449 100644 --- a/cpp/include/cudf/strings/detail/substring.hpp +++ b/cpp/include/cudf/strings/detail/substring.hpp @@ -27,7 +27,7 @@ namespace strings { namespace detail { /** - * @copydoc cudf::get_json_object + * @copydoc cudf::strings::get_json_object * * @param stream CUDA stream used for device memory operations and kernel launches */ diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 8cb3c7bb25b..880fe167d02 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -53,7 +53,7 @@ constexpr int DEBUG_NEWLINE_LEN = 0; using namespace cudf; /** - * @brief Result of calling a parse type function. + * @brief Result of calling a parse function. * * The primary use of this is to distinguish between "success" and * "success but no data" return cases. For example, if you are reading the @@ -193,6 +193,10 @@ class parser { } }; +/** + * @brief Output buffer object. Used during the preprocess/size-computation step + * and the actual output step. + */ struct json_output { size_t output_max_len; size_t output_len; @@ -210,25 +214,20 @@ struct json_output { enum json_element_type { NONE, OBJECT, ARRAY, VALUE }; +/** + * @brief Parsing class that holds the current state of the json to be parse and provides + * functions for navigating through it. + */ class json_state : private parser { public: - constexpr json_state() - : parser(), - element(json_element_type::NONE), - cur_el_start(nullptr), - cur_el_type(json_element_type::NONE) - { - } + constexpr json_state() : parser(), cur_el_start(nullptr), cur_el_type(json_element_type::NONE) {} constexpr json_state(const char* _input, int64_t _input_len) - : parser(_input, _input_len), - element(json_element_type::NONE), - cur_el_start(nullptr), - cur_el_type(json_element_type::NONE) + : parser(_input, _input_len), cur_el_start(nullptr), cur_el_type(json_element_type::NONE) { } constexpr json_state(json_state const& j) - : parser(j), element(j.element), cur_el_start(j.cur_el_start), cur_el_type(j.cur_el_type) + : parser(j), cur_el_start(j.cur_el_start), cur_el_type(j.cur_el_type) { } @@ -287,10 +286,13 @@ class json_state : private parser { return parse_result::SUCCESS; } + // skip the next element constexpr parse_result skip_element() { return extract_element(nullptr, false); } + // advance to the next element constexpr parse_result next_element() { return next_element_internal(false); } + // advance inside the current element constexpr parse_result child_element(bool as_field = false) { // cannot retrieve a field from an array @@ -298,6 +300,7 @@ class json_state : private parser { return next_element_internal(true); } + // return the next element that matches the specified name. constexpr parse_result next_matching_element(json_string const& name, bool inclusive) { // if we're not including the current element, skip it @@ -323,6 +326,7 @@ class json_state : private parser { } private: + // parse a value - either a string or a number/null/bool constexpr parse_result parse_value() { if (!parse_whitespace()) { return parse_result::ERROR; } @@ -376,28 +380,37 @@ class json_state : private parser { return parse_result::SUCCESS; } - json_element_type element; - const char* cur_el_start; - json_string cur_el_name; - json_element_type cur_el_type; + const char* cur_el_start; // pointer to the first character of the -value- of the current + // element - not the name + json_string cur_el_name; // name of the current element (if applicable) + json_element_type cur_el_type; // type of the current element }; enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END }; +/** + * @brief A "command" operator used to query a json string. A full query is + * an array of these operators applied to the incoming json string, + */ struct path_operator { constexpr path_operator() : type(path_operator_type::ERROR), index(-1) {} constexpr path_operator(path_operator_type _type) : type(_type), index(-1) {} - path_operator_type type; - json_string name; - int index; + path_operator_type type; // operator type + json_string name; // name to match against (if applicable) + int index; // index for subscript operator }; -// current state of the JSONPath +/** + * @brief Parsing class that holds the current state of the JSONPath string to be parsed + * and provides functions for navigating through it. This is only called on the host + * during the preprocess step which builds a command buffer that the gpu uses. + */ class path_state : private parser { public: path_state(const char* _path, size_t _path_len) : parser(_path, _path_len) {} + // get the next operator in the JSONPath string path_operator get_next_operator() { if (eof()) { return {path_operator_type::END}; } @@ -498,6 +511,15 @@ class path_state : private parser { } }; +/** + * @brief Preprocess the incoming JSONPath string on the host to generate a + * command buffer for use by the GPU. + * + * @param json_path The incoming json path + * @param stream Cuda stream to perform any gpu actions on + * @returns A tuple containing the command buffer, the maximum stack depth required and whether or + * not the command buffer is empty. + */ std::tuple, int, bool> build_command_buffer( cudf::string_scalar const& json_path, rmm::cuda_stream_view stream) { @@ -547,18 +569,25 @@ std::tuple, int, bool> build_command_buffer( if (last_result == parse_result::ERROR) { return parse_result::ERROR; } \ } while (0) +/** + * @brief Parse a single json string using the provided command buffer + * + * @param j_state The incoming json string and associated parser + * @param commands The command buffer to be applied to the string. Always ends with a + * path_operator_type::END + * @param output Buffer user to store the results of the query + * @returns A result code indicating success/fail/empty. + */ template -__device__ parse_result parse_json_path(json_state& _j_state, - path_operator const* _commands, - json_output& output, - bool _list_element = false) +__device__ parse_result parse_json_path(json_state& j_state, + path_operator const* commands, + json_output& output) { // manually maintained context stack in lieu of calling parse_json_path recursively. struct context { json_state j_state; path_operator const* commands; bool list_element; - // int element_count; bool state_flag; }; context stack[max_command_stack_depth]; @@ -566,11 +595,9 @@ __device__ parse_result parse_json_path(json_state& _j_state, auto push_context = [&stack, &stack_pos](json_state const& _j_state, path_operator const* _commands, bool _list_element = false, - /* int _element_count = 0,*/ - bool _state_flag = false) { + bool _state_flag = false) { if (stack_pos == max_command_stack_depth - 1) { return false; } - stack[stack_pos++] = - context{_j_state, _commands, _list_element, /*_element_count,*/ _state_flag}; + stack[stack_pos++] = context{_j_state, _commands, _list_element, _state_flag}; return true; }; auto pop_context = [&stack, &stack_pos](context& c) { @@ -580,7 +607,7 @@ __device__ parse_result parse_json_path(json_state& _j_state, } return false; }; - push_context(_j_state, _commands, _list_element); + push_context(j_state, commands, false); parse_result last_result = parse_result::SUCCESS; context ctx; @@ -703,6 +730,20 @@ __device__ parse_result parse_json_path(json_state& _j_state, // a jsonpath containing 7 nested wildcards so this is probably reasonable. constexpr int max_command_stack_depth = 8; +/** + * @brief Parse a single json string using the provided command buffer + * + * This function exists primarily as a shim for debugging purposes. + * + * @param input The incoming json string + * @param input_len Size of the incoming json string + * @param commands The command buffer to be applied to the string. Always ends with a + * path_operator_type::END + * @param out_buf Buffer user to store the results of the query (nullptr in the size computation + * step) + * @param out_buf_size Size of the output buffer + * @returns A pair containing the result code the output buffer. + */ __device__ thrust::pair get_json_object_single( char const* input, size_t input_len, @@ -725,6 +766,13 @@ __device__ thrust::pair get_json_object_single( * output sizes. On the second pass it fills in the provided output buffers * (chars and validity) * + * @param chars The chars child column of the incoming strings column + * @param offsets The offsets of the incoming strings column + * @param commands JSONPath command buffer + * @param out_buf Buffer user to store the results of the query (nullptr in the size computation + * step) + * @param out_validity Output validity buffer (nullptr in the size computation step) + * @param num_rows Number of rows in the input column */ __global__ void get_json_object_kernel(char const* chars, size_type const* offsets, @@ -766,6 +814,9 @@ __global__ void get_json_object_kernel(char const* chars, } } +/** + * @copydoc cudf::strings::detail::get_json_object + */ std::unique_ptr get_json_object(cudf::strings_column_view const& col, cudf::string_scalar const& json_path, rmm::cuda_stream_view stream, @@ -852,6 +903,9 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c } // namespace } // namespace detail +/** + * @copydoc cudf::strings::get_json_object + */ std::unique_ptr get_json_object(cudf::strings_column_view const& col, cudf::string_scalar const& json_path, rmm::mr::device_memory_resource* mr) From fd330feed0b416adcbb33df1c30d72f3effc52aa Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Thu, 25 Mar 2021 09:36:56 -0700 Subject: [PATCH 18/33] Update java/src/main/native/src/ColumnViewJni.cpp Co-authored-by: Jason Lowe --- java/src/main/native/src/ColumnViewJni.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index c57d4c1fbd9..028e9f5e640 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1803,7 +1803,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env JNI_NULL_CHECK(env, j_scalar_handle, "path cannot be null", 0); try { - + cudf::jni::auto_set_device(env); cudf::column_view* n_column_view = reinterpret_cast(j_view_handle); cudf::strings_column_view n_strings_col_view(*n_column_view); cudf::string_scalar *n_scalar_path = reinterpret_cast(j_scalar_handle); From 5229790ede1beb4783ed68b31be934009e6e241c Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Thu, 25 Mar 2021 09:37:05 -0700 Subject: [PATCH 19/33] Update java/src/main/native/src/ColumnViewJni.cpp Co-authored-by: Jason Lowe --- java/src/main/native/src/ColumnViewJni.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 028e9f5e640..fdfbee116cd 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1796,7 +1796,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv CATCH_STD(env, 0) } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env, jobject j_object, +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env, jclass, jlong j_view_handle, jlong j_scalar_handle) { JNI_NULL_CHECK(env, j_view_handle, "view cannot be null", 0); From 94658646b86578b1794744574019f56e44dbcbc0 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Thu, 25 Mar 2021 16:14:39 -0500 Subject: [PATCH 20/33] Fix spelling. --- cpp/src/strings/json/json_path.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 880fe167d02..a93dfc46f72 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -702,7 +702,7 @@ __device__ parse_result parse_json_path(json_state& j_state, PARSE_TRY(ctx.j_state.next_matching_element(any, false)); if (last_result == parse_result::EMPTY) { break; } } - // if we didn't end up at the index we requested, this is an invalid indexe + // if we didn't end up at the index we requested, this is an invalid index if (idx - 1 != op.index) { return parse_result::ERROR; } push_context(ctx.j_state, ctx.commands + 1, ctx.list_element); } From 4e4865b28714522050e0a4606458f5ae18ef43fe Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Fri, 26 Mar 2021 11:15:29 -0500 Subject: [PATCH 21/33] Make larger test strings more human readable. --- cpp/tests/strings/json_tests.cpp | 346 +++++++++++++++++++++---------- 1 file changed, 235 insertions(+), 111 deletions(-) diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index 3c9aff72337..e745b97de2d 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -23,59 +23,48 @@ #include // reference: https://jsonpath.herokuapp.com/ -/* - { - "store": { - "book": [ - { - "category": "reference", - "author": "Nigel Rees", - "title": "Sayings of the Century", - "price": 8.95 - }, - { - "category": "fiction", - "author": "Evelyn Waugh", - "title": "Sword of Honour", - "price": 12.99 - }, - { - "category": "fiction", - "author": "Herman Melville", - "title": "Moby Dick", - "isbn": "0-553-21311-3", - "price": 8.99 - }, - { - "category": "fiction", - "author": "J. R. R. Tolkien", - "title": "The Lord of the Rings", - "isbn": "0-395-19395-8", - "price": 22.99 - } - ], - "bicycle": { - "color": "red", - "price": 19.95 - } - }, - "expensive": 10 - } -*/ -// this string is formatted to result in a reasonably readable debug printf +// clang-format off std::string json_string{ - "{\n\"store\": {\n\t\"book\": [\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": " - "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": " - "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn " - "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": " - "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman " - "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": " - "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": " - "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the " - "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": " - "22.99\n\t\t}\n\t],\n\t\"bicycle\": {\n\t\t\"color\": \"red\",\n\t\t\"price\": " - "19.95\n\t}\n},\n\"expensive\": 10\n}"}; + "{" + "\"store\": {" + "\"book\": [" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "\"bicycle\": {" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "}," + "\"expensive\": 10" + "}" +}; +// clang-format on std::unique_ptr drop_whitespace(cudf::column_view const& col) { @@ -112,18 +101,44 @@ TEST_F(JsonTests, GetJsonObjectChildOp) auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); - cudf::test::strings_column_wrapper expected_raw{ - "{\n\t\"book\": [\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": " - "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": " - "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn " - "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": " - "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman " - "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": " - "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": " - "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the " - "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": " - "22.99\n\t\t}\n\t],\n\t\"bicycle\": {\n\t\t\"color\": \"red\",\n\t\t\"price\": " - "19.95\n\t}\n}"}; + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "{" + "\"book\": [" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "\"bicycle\": {" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "}" + }; + // clang-format on auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -135,17 +150,38 @@ TEST_F(JsonTests, GetJsonObjectChildOp) auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); + // clang-format off cudf::test::strings_column_wrapper expected_raw{ - "[\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": " - "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": " - "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn " - "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": " - "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman " - "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": " - "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": " - "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the " - "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": " - "22.99\n\t\t}\n\t]"}; + "[" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]" + }; + // clang-format on auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -160,18 +196,44 @@ TEST_F(JsonTests, GetJsonObjectWildcardOp) auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); + // clang-format off cudf::test::strings_column_wrapper expected_raw{ - "[[\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": " - "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": " - "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn " - "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": " - "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman " - "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": " - "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": " - "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the " - "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": " - "22.99\n\t\t}\n\t],\n\t{\n\t\t\"color\": \"red\",\n\t\t\"price\": " - "19.95\n\t}]"}; + "[" + "[" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "{" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "]" + }; + // clang-format on auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -183,18 +245,47 @@ TEST_F(JsonTests, GetJsonObjectWildcardOp) auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); + // clang-format off cudf::test::strings_column_wrapper expected_raw{ - "[{\"book\": [\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": " - "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": " - "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn " - "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": " - "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman " - "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": " - "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": " - "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the " - "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": " - "22.99\n\t\t}\n\t],\n\t\"bicycle\": {\n\t\t\"color\": \"red\",\n\t\t\"price\": " - "19.95\n\t}\n},10]"}; + "[" + "{" + "\"book\": [" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "\"bicycle\": {" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "}," + "10" + "]" + }; + // clang-format on auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -209,10 +300,17 @@ TEST_F(JsonTests, GetJsonObjectSubscriptOp) auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); + // clang-format off cudf::test::strings_column_wrapper expected_raw{ - "{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman " - "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": " - "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t}"}; + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}" + }; + // clang-format on auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -224,9 +322,14 @@ TEST_F(JsonTests, GetJsonObjectSubscriptOp) auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); + // clang-format off cudf::test::strings_column_wrapper expected_raw{ - "{\n\t\t\"color\": \"red\",\n\t\t\"price\": " - "19.95\n\t}"}; + "{" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + }; + // clang-format on auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -238,17 +341,38 @@ TEST_F(JsonTests, GetJsonObjectSubscriptOp) auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); + // clang-format off cudf::test::strings_column_wrapper expected_raw{ - "[\n\t\t{\n\t\t\t\"category\": \"reference\",\n\t\t\t\"author\": " - "\"Nigel Rees\",\n\t\t\t\"title\": \"Sayings of the Century\",\n\t\t\t\"price\": " - "8.95\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Evelyn " - "Waugh\",\n\t\t\t\"title\": \"Sword of Honour\",\n\t\t\t\"price\": " - "12.99\n\t\t},\n\t\t{\n\t\t\t\"category\": \"fiction\",\n\t\t\t\"author\": \"Herman " - "Melville\",\n\t\t\t\"title\": \"Moby Dick\",\n\t\t\t\"isbn\": " - "\"0-553-21311-3\",\n\t\t\t\"price\": 8.99\n\t\t},\n\t\t{\n\t\t\t\"category\": " - "\"fiction\",\n\t\t\t\"author\": \"J. R. R. Tolkien\",\n\t\t\t\"title\": \"The Lord of the " - "Rings\",\n\t\t\t\"isbn\": \"0-395-19395-8\",\n\t\t\t\"price\": " - "22.99\n\t\t}\n\t]"}; + "[" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]" + }; + // clang-format on auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -503,4 +627,4 @@ TEST_F(JsonTests, GetJsonObjectInvalidQuery) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); } -} +} \ No newline at end of file From 3653d0d99f27b2a6d8a976cadda6805fb8f30f28 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Fri, 26 Mar 2021 14:24:04 -0500 Subject: [PATCH 22/33] PR review changes. Changed get_json_object_kernel() to take a column_device_view instead of raw pointers for input data. --- cpp/src/strings/json/json_path.cu | 96 +++++++++++++++---------- cpp/tests/utilities/column_utilities.cu | 2 +- 2 files changed, 59 insertions(+), 39 deletions(-) diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index a93dfc46f72..a67ef865cb1 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -1,9 +1,27 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include #include #include #include #include #include #include +#include #include #include #include @@ -50,8 +68,6 @@ constexpr int DEBUG_NEWLINE_LEN = 0; // #define SPARK_BEHAVIORS -using namespace cudf; - /** * @brief Result of calling a parse function. * @@ -67,6 +83,25 @@ enum class parse_result { EMPTY, // success, but no data }; +/** + * @brief A struct which represents a string. + * + * Typically used to point into a substring of a larger string, such as + * the input json itself. + * + * @code + * // where cur_pos is a pointer to the beginning of a name string in the + * // input json and name_size is the computed size. + * json_string name{cur_pos, name_size}; + * @endcode + * + * Also used for parameter passing in a few cases: + * + * @code + * json_string wildcard{"*", 1}; + * func(wildcard); + * @endcode + */ struct json_string { const char* str; int64_t len; @@ -766,36 +801,34 @@ __device__ thrust::pair get_json_object_single( * output sizes. On the second pass it fills in the provided output buffers * (chars and validity) * - * @param chars The chars child column of the incoming strings column - * @param offsets The offsets of the incoming strings column + * @param col Device view of the incoming string * @param commands JSONPath command buffer - * @param out_buf Buffer user to store the results of the query (nullptr in the size computation + * @param output_offsets Buffer used to store the string offsets for the results of the query + * (nullptr in the size computation step) + * @param out_buf Buffer used to store the results of the query (nullptr in the size computation * step) * @param out_validity Output validity buffer (nullptr in the size computation step) - * @param num_rows Number of rows in the input column */ -__global__ void get_json_object_kernel(char const* chars, - size_type const* offsets, +__global__ void get_json_object_kernel(column_device_view col, path_operator const* const commands, size_type* output_offsets, char* out_buf, - bitmask_type* out_validity, - size_type num_rows) + bitmask_type* out_validity) { uint64_t const tid = threadIdx.x + (blockDim.x * blockIdx.x); bool is_valid = false; - if (tid < num_rows) { - size_type const src_size = offsets[tid + 1] - offsets[tid]; - size_type output_size = 0; - if (src_size > 0) { + if (tid < col.size()) { + string_view const str = col.element(tid); + size_type output_size = 0; + if (str.size_bytes() > 0) { char* dst = out_buf ? out_buf + output_offsets[tid] : nullptr; size_t const dst_size = out_buf ? output_offsets[tid + 1] - output_offsets[tid] : 0; parse_result result; json_output out; thrust::tie(result, out) = - get_json_object_single(chars + offsets[tid], src_size, commands, dst, dst_size); + get_json_object_single(str.data(), str.size_bytes(), commands, dst, dst_size); output_size = out.output_len; if (out.element_count > 0 && result == parse_result::SUCCESS) { is_valid = true; } } @@ -808,7 +841,7 @@ __global__ void get_json_object_kernel(char const* chars, if (out_validity) { uint32_t mask = __ballot_sync(0xffffffff, is_valid); // 0th lane of the warp writes the validity - if (!(tid % cudf::detail::warp_size) && tid < num_rows) { + if (!(tid % cudf::detail::warp_size) && tid < col.size()) { out_validity[cudf::word_index(tid)] = mask; } } @@ -835,32 +868,21 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c // if the query is empty, return a string column containing all nulls if (std::get<2>(preprocess)) { - thrust::generate(rmm::exec_policy(stream), - offsets_view.head(), - offsets_view.head() + offsets_view.size(), - [] __device__() { return 0; }); - return cudf::make_strings_column( - col.size(), - std::move(offsets), - cudf::make_fixed_width_column( - data_type{type_id::INT8}, 0, mask_state::UNALLOCATED, stream, mr), + return std::make_unique( + data_type{type_id::STRING}, col.size(), + rmm::device_buffer{0, stream, mr}, // no data cudf::detail::create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr), - stream, - mr); + col.size()); // null count } cudf::detail::grid_1d const grid{col.size(), 512}; + auto cdv = column_device_view::create(col.parent(), stream); + // preprocess sizes (returned in the offsets buffer) get_json_object_kernel<<>>( - col.chars().head(), - col.offsets().head(), - std::get<0>(preprocess).data(), - offsets_view.head(), - nullptr, - nullptr, - col.size()); + *cdv, std::get<0>(preprocess).data(), offsets_view.head(), nullptr, nullptr); // convert sizes to offsets thrust::exclusive_scan(rmm::exec_policy(stream), @@ -883,13 +905,11 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c // compute results cudf::mutable_column_view chars_view(*chars); get_json_object_kernel<<>>( - col.chars().head(), - col.offsets().head(), + *cdv, std::get<0>(preprocess).data(), offsets_view.head(), chars_view.head(), - static_cast(validity.data()), - col.size()); + static_cast(validity.data())); return make_strings_column(col.size(), std::move(offsets), diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index cea66eced11..ddb3730b039 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -71,7 +71,7 @@ struct column_property_comparator { // equivalent, but not exactly equal columns can have a different number of children if their // sizes are both 0. Specifically, empty string columns may or may not have children. - if (check_exact_equality || lhs.size() > 0) { + if (check_exact_equality || (lhs.size() > 0 && lhs.null_count() < lhs.size())) { EXPECT_EQ(lhs.num_children(), rhs.num_children()); } } From 9c761b8262aa4043552cbd910d569bca4327815e Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Fri, 26 Mar 2021 14:30:36 -0500 Subject: [PATCH 23/33] Fixed missing newline. --- cpp/tests/strings/json_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index e745b97de2d..682a9a0689f 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -627,4 +627,4 @@ TEST_F(JsonTests, GetJsonObjectInvalidQuery) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); } -} \ No newline at end of file +} From e47b0882a6d7513ac5fd54f0960b4552c8962d17 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Sun, 28 Mar 2021 14:53:23 -0500 Subject: [PATCH 24/33] Handle additional disallowed cases when indexing into child elements. Make sure to never parse names when dealing with elements in an array. Add more tests. --- cpp/src/strings/json/json_path.cu | 84 +++++++++++++++------ cpp/tests/strings/json_tests.cpp | 117 ++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+), 24 deletions(-) diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index a67ef865cb1..5a0a684d232 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -255,14 +255,26 @@ enum json_element_type { NONE, OBJECT, ARRAY, VALUE }; */ class json_state : private parser { public: - constexpr json_state() : parser(), cur_el_start(nullptr), cur_el_type(json_element_type::NONE) {} + constexpr json_state() + : parser(), + cur_el_start(nullptr), + cur_el_type(json_element_type::NONE), + parent_el_type(json_element_type::NONE) + { + } constexpr json_state(const char* _input, int64_t _input_len) - : parser(_input, _input_len), cur_el_start(nullptr), cur_el_type(json_element_type::NONE) + : parser(_input, _input_len), + cur_el_start(nullptr), + cur_el_type(json_element_type::NONE), + parent_el_type(json_element_type::NONE) { } constexpr json_state(json_state const& j) - : parser(j), cur_el_start(j.cur_el_start), cur_el_type(j.cur_el_type) + : parser(j), + cur_el_start(j.cur_el_start), + cur_el_type(j.cur_el_type), + parent_el_type(j.parent_el_type) { } @@ -328,11 +340,15 @@ class json_state : private parser { constexpr parse_result next_element() { return next_element_internal(false); } // advance inside the current element - constexpr parse_result child_element(bool as_field = false) + constexpr parse_result child_element(json_element_type expected_type) { - // cannot retrieve a field from an array - if (as_field && cur_el_type == json_element_type::ARRAY) { return parse_result::ERROR; } - return next_element_internal(true); + if (expected_type != NONE && cur_el_type != expected_type) { return parse_result::ERROR; } + + // if we succeed, record our parent element type. + auto const prev_el_type = cur_el_type; + auto const result = next_element_internal(true); + if (result == parse_result::SUCCESS) { parent_el_type = prev_el_type; } + return result; } // return the next element that matches the specified name. @@ -393,8 +409,12 @@ class json_state : private parser { char const c = *pos; if (c == ']' || c == '}') { return parse_result::EMPTY; } - // element name, if any - if (parse_name(cur_el_name, true, '\"') == parse_result::ERROR) { return parse_result::ERROR; } + // if we're not accessing elements of an array, check for name. + bool const array_access = + (cur_el_type == ARRAY && child) || (parent_el_type == ARRAY && !child); + if (!array_access && parse_name(cur_el_name, true, '\"') == parse_result::ERROR) { + return parse_result::ERROR; + } // element type if (!parse_whitespace()) { return parse_result::EMPTY; } @@ -415,10 +435,11 @@ class json_state : private parser { return parse_result::SUCCESS; } - const char* cur_el_start; // pointer to the first character of the -value- of the current - // element - not the name - json_string cur_el_name; // name of the current element (if applicable) - json_element_type cur_el_type; // type of the current element + const char* cur_el_start; // pointer to the first character of the -value- of the current + // element - not the name + json_string cur_el_name; // name of the current element (if applicable) + json_element_type cur_el_type; // type of the current element + json_element_type parent_el_type; // parent element type }; enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END }; @@ -428,12 +449,22 @@ enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, * an array of these operators applied to the incoming json string, */ struct path_operator { - constexpr path_operator() : type(path_operator_type::ERROR), index(-1) {} - constexpr path_operator(path_operator_type _type) : type(_type), index(-1) {} + constexpr path_operator() : type(path_operator_type::ERROR), index(-1), expected_type{NONE} {} + constexpr path_operator(path_operator_type _type, json_element_type _expected_type = NONE) + : type(_type), index(-1), expected_type{_expected_type} + { + } path_operator_type type; // operator type - json_string name; // name to match against (if applicable) - int index; // index for subscript operator + // the expected element type we're applying this operation to. + // for example: + // - you cannot retrieve a subscripted field (eg [5]) from an object. + // - you cannot retrieve a field by name (eg .book) from an array. + // - you -can- use .* for both arrays and objects + // a value of NONE imples any type accepted + json_element_type expected_type; // the expected type of the element we're working with + json_string name; // name to match against (if applicable) + int index; // index for subscript operator }; /** @@ -461,9 +492,11 @@ class path_state : private parser { // Spark currently only handles the wildcard operator inside [*], it does // not handle .* if (op.name.len == 1 && op.name.str[0] == '*') { - op.type = path_operator_type::CHILD_WILDCARD; + op.type = path_operator_type::CHILD_WILDCARD; + op.expected_type = NONE; } else { - op.type = path_operator_type::CHILD; + op.type = path_operator_type::CHILD; + op.expected_type = OBJECT; } return op; } @@ -480,15 +513,18 @@ class path_state : private parser { if (parse_path_name(op.name, term)) { pos++; if (op.name.len == 1 && op.name.str[0] == '*') { - op.type = path_operator_type::CHILD_WILDCARD; + op.type = path_operator_type::CHILD_WILDCARD; + op.expected_type = NONE; } else { if (is_string) { - op.type = path_operator_type::CHILD; + op.type = path_operator_type::CHILD; + op.expected_type = OBJECT; } else { op.type = path_operator_type::CHILD_INDEX; op.index = cudf::io::parse_numeric(op.name.str, op.name.str + op.name.len, json_opts, -1); CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath"); + op.expected_type = ARRAY; } } return op; @@ -662,7 +698,7 @@ __device__ parse_result parse_json_path(json_state& j_state, // [1] // will return a single thing case path_operator_type::CHILD: { - PARSE_TRY(ctx.j_state.child_element(true)); + PARSE_TRY(ctx.j_state.child_element(op.expected_type)); if (last_result == parse_result::SUCCESS) { PARSE_TRY(ctx.j_state.next_matching_element(op.name, true)); if (last_result == parse_result::SUCCESS) { @@ -681,7 +717,7 @@ __device__ parse_result parse_json_path(json_state& j_state, if (!ctx.list_element) { output.add_output({"[" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); } // step into the child element - PARSE_TRY(ctx.j_state.child_element()); + PARSE_TRY(ctx.j_state.child_element(op.expected_type)); if (last_result == parse_result::EMPTY) { if (!ctx.list_element) { output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); @@ -727,7 +763,7 @@ __device__ parse_result parse_json_path(json_state& j_state, // etc // returns a single thing case path_operator_type::CHILD_INDEX: { - PARSE_TRY(ctx.j_state.child_element()); + PARSE_TRY(ctx.j_state.child_element(op.expected_type)); if (last_result == parse_result::SUCCESS) { json_string const any{"*", 1}; PARSE_TRY(ctx.j_state.next_matching_element(any, true)); diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index 682a9a0689f..9fdc4af7e87 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -628,3 +628,120 @@ TEST_F(JsonTests, GetJsonObjectInvalidQuery) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); } } + +TEST_F(JsonTests, MixedOutput) +{ + // various queries on: + // clang-format off + std::vector input_strings { + "{\"a\": {\"b\" : \"c\"}}", + + "{" + "\"a\": {\"b\" : \"c\"}," + "\"d\": [{\"e\":123}, {\"f\":-10}]" + "}", + + "{" + "\"b\": 123" + "}", + + "{" + "\"a\": [\"y\",500]" + "}", + + "{" + "\"a\": \"\"" + "}", + + "{" + "\"a\": {" + "\"z\": {\"i\": 10, \"j\": 100}," + "\"b\": [\"c\", null, true, -1]" + "}" + "}" + }; + // clang-format on + cudf::test::strings_column_wrapper input(input_strings.begin(), input_strings.end()); + + { + std::string json_path("$.a"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "{\"b\" : \"c\"}", + "{\"b\" : \"c\"}", + "", + "[\"y\",500]", + "", + "{" + "\"z\": {\"i\": 10, \"j\": 100}," + "\"b\": [\"c\", null, true, -1]" + "}" + }, + {1, 1, 0, 1, 1, 1}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + { + std::string json_path("$.a[1]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "", + "", + "", + "500", + "", + "", + }, + {0, 0, 0, 1, 0, 0}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + { + std::string json_path("$.a.b"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "c", + "c", + "", + "", + "", + "[\"c\", null, true, -1]"}, + {1, 1, 0, 0, 0, 1}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + { + std::string json_path("$.a[*]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "[\"c\"]", + "[\"c\"]", + "", + "[\"y\",500]", + "", + "[" + "{\"i\": 10, \"j\": 100}," + "[\"c\", null, true, -1]" + "]" }, + {1, 1, 0, 1, 0, 1}); + // clang-format on + + cudf::test::print(*result); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} \ No newline at end of file From f898ca65cb993075c78dd5bd17d3d58f5c0dd9b4 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Sun, 28 Mar 2021 15:36:11 -0500 Subject: [PATCH 25/33] Distinguish between "no output" (null result) and "empty output" (valid, but empty string) case more clearly. Additional test. --- cpp/src/strings/json/json_path.cu | 24 +++++++++++++++------- cpp/tests/strings/json_tests.cpp | 34 ++++++++++++++++++++++--------- 2 files changed, 41 insertions(+), 17 deletions(-) diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 5a0a684d232..d4d180339ef 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -231,17 +231,30 @@ class parser { /** * @brief Output buffer object. Used during the preprocess/size-computation step * and the actual output step. + * + * There is an important distinction between two cases: + * + * - producing no output at all. that is, the query matched nothing in the input. + * - producing empty output. the query matched something in the input, but the + * value of the result is an empty string. + * + * The `has_output` field is the flag which indicates whether or not the output + * from the query should be considered empty or null. + * */ struct json_output { size_t output_max_len; size_t output_len; - int element_count; + bool has_output; char* output; constexpr void add_output(const char* str, size_t len) { if (output != nullptr) { memcpy(output + output_len, str, len); } output_len += len; + // set this to true even if the string is empty. it implies we have + // seen actual output + has_output = true; } constexpr void add_output(json_string str) { add_output(str.str, str.len); } @@ -326,10 +339,7 @@ class json_state : private parser { if (*pos == ',') { pos++; } } - if (output != nullptr) { - output->add_output({start, end - start}); - output->element_count++; - } + if (output != nullptr) { output->add_output({start, end - start}); } return parse_result::SUCCESS; } @@ -823,7 +833,7 @@ __device__ thrust::pair get_json_object_single( size_t out_buf_size) { json_state j_state(input, input_len); - json_output output{out_buf_size, 0, 0, out_buf}; + json_output output{out_buf_size, 0, false, out_buf}; auto const result = parse_json_path(j_state, commands, output); @@ -866,7 +876,7 @@ __global__ void get_json_object_kernel(column_device_view col, thrust::tie(result, out) = get_json_object_single(str.data(), str.size_bytes(), commands, dst, dst_size); output_size = out.output_len; - if (out.element_count > 0 && result == parse_result::SUCCESS) { is_valid = true; } + if (out.has_output > 0 && result == parse_result::SUCCESS) { is_valid = true; } } // filled in only during the precompute step diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index 9fdc4af7e87..c2dbd649832 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -27,8 +27,7 @@ // clang-format off std::string json_string{ "{" - "\"store\": {" - "\"book\": [" + "\"store\": {""\"book\": [" "{" "\"category\": \"reference\"," "\"author\": \"Nigel Rees\"," @@ -656,13 +655,12 @@ TEST_F(JsonTests, MixedOutput) "{" "\"a\": {" "\"z\": {\"i\": 10, \"j\": 100}," - "\"b\": [\"c\", null, true, -1]" + "\"b\": [\"c\",null,true,-1]" "}" "}" }; // clang-format on cudf::test::strings_column_wrapper input(input_strings.begin(), input_strings.end()); - { std::string json_path("$.a"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); @@ -676,7 +674,7 @@ TEST_F(JsonTests, MixedOutput) "", "{" "\"z\": {\"i\": 10, \"j\": 100}," - "\"b\": [\"c\", null, true, -1]" + "\"b\": [\"c\",null,true,-1]" "}" }, {1, 1, 0, 1, 1, 1}); @@ -715,7 +713,7 @@ TEST_F(JsonTests, MixedOutput) "", "", "", - "[\"c\", null, true, -1]"}, + "[\"c\",null,true,-1]"}, {1, 1, 0, 0, 0, 1}); // clang-format on @@ -732,15 +730,31 @@ TEST_F(JsonTests, MixedOutput) "[\"c\"]", "", "[\"y\",500]", - "", + "[]", "[" "{\"i\": 10, \"j\": 100}," - "[\"c\", null, true, -1]" + "[\"c\",null,true,-1]" "]" }, - {1, 1, 0, 1, 0, 1}); + {1, 1, 0, 1, 1, 1}); // clang-format on - cudf::test::print(*result); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + { + std::string json_path("$.a.b[*]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "[]", + "[]", + "", + "", + "", + "[\"c\",null,true,-1]"}, + {1, 1, 0, 0, 0, 1}); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); } From 6829f46195eb2059d8a308afaa5de2d3544303ae Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Mon, 29 Mar 2021 11:11:36 -0500 Subject: [PATCH 26/33] Moved get_json_object() declarations out of strings/substring.hpp to strings/json.hpp --- conda/recipes/libcudf/meta.yaml | 5 +- .../detail/{substring.hpp => json.hpp} | 0 cpp/include/cudf/strings/json.hpp | 50 +++++++++++++++++++ cpp/include/cudf/strings/substring.hpp | 19 ------- cpp/tests/strings/json_tests.cpp | 4 +- 5 files changed, 55 insertions(+), 23 deletions(-) rename cpp/include/cudf/strings/detail/{substring.hpp => json.hpp} (100%) create mode 100644 cpp/include/cudf/strings/json.hpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 7188065eca0..60200869ff8 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -175,13 +175,14 @@ test: - test -f $PREFIX/include/cudf/strings/detail/converters.hpp - test -f $PREFIX/include/cudf/strings/detail/copying.hpp - test -f $PREFIX/include/cudf/strings/detail/fill.hpp - - test -f $PREFIX/include/cudf/strings/detail/replace.hpp - - test -f $PREFIX/include/cudf/strings/detail/substring.hpp + - test -f $PREFIX/include/cudf/strings/detail/json.hpp + - test -f $PREFIX/include/cudf/strings/detail/replace.hpp - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp - test -f $PREFIX/include/cudf/strings/extract.hpp - test -f $PREFIX/include/cudf/strings/findall.hpp - test -f $PREFIX/include/cudf/strings/find.hpp - test -f $PREFIX/include/cudf/strings/find_multiple.hpp + - test -f $PREFIX/include/cudf/strings/json.hpp - test -f $PREFIX/include/cudf/strings/padding.hpp - test -f $PREFIX/include/cudf/strings/replace.hpp - test -f $PREFIX/include/cudf/strings/replace_re.hpp diff --git a/cpp/include/cudf/strings/detail/substring.hpp b/cpp/include/cudf/strings/detail/json.hpp similarity index 100% rename from cpp/include/cudf/strings/detail/substring.hpp rename to cpp/include/cudf/strings/detail/json.hpp diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp new file mode 100644 index 00000000000..b39e4a2027c --- /dev/null +++ b/cpp/include/cudf/strings/json.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cudf { +namespace strings { + +/** + * @addtogroup strings_json + * @{ + * @file + */ + +/** + * @brief Apply a JSONPath string to all rows in an input strings column. + * + * Applies a JSONPath string to an incoming strings column where each row in the column + * is a valid json string. The output is returned by row as a strings column. + * + * https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html + * Implements only the operators: $ . [] * + * + * @param col The input strings column. Each row must contain a valid json string + * @param json_path The JSONPath string to be applied to each row + * @param mr Resource for allocating device memory. + * @return New strings column containing the retrieved json object strings + */ +std::unique_ptr get_json_object( + cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/substring.hpp b/cpp/include/cudf/strings/substring.hpp index 06a38360fdb..d1431060792 100644 --- a/cpp/include/cudf/strings/substring.hpp +++ b/cpp/include/cudf/strings/substring.hpp @@ -197,25 +197,6 @@ std::unique_ptr slice_strings( size_type count, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Apply a JSONPath string to all rows in an input strings column. - * - * Applies a JSONPath string to an incoming strings column where each row in the column - * is a valid json string. The output is returned by row as a strings column. - * - * https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html - * Implements only the operators: $ . [] * - * - * @param col The input strings column. Each row must contain a valid json string - * @param json_path The JSONPath string to be applied to each row - * @param mr Resource for allocating device memory. - * @return New strings column containing the retrieved json object strings - */ -std::unique_ptr get_json_object( - cudf::strings_column_view const& col, - cudf::string_scalar const& json_path, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index c2dbd649832..feaa69e4364 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include @@ -758,4 +758,4 @@ TEST_F(JsonTests, MixedOutput) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); } -} \ No newline at end of file +} From c0743b455133cbdd78b2b3fdcd574e73b616c1b7 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Mon, 29 Mar 2021 11:14:18 -0500 Subject: [PATCH 27/33] Clang format --- cpp/tests/strings/json_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index feaa69e4364..44eb35d4163 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -15,9 +15,9 @@ */ #include +#include #include #include -#include #include #include From 285ed92208eb4a6c1a901393dfaa85e04da7bde4 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Mon, 29 Mar 2021 12:11:26 -0500 Subject: [PATCH 28/33] Use string_view instead of json_string struct. Cleanup benchmark CMakeLists.txt --- cpp/benchmarks/CMakeLists.txt | 3 +- cpp/src/strings/json/json_path.cu | 155 +++++++++++++----------------- 2 files changed, 66 insertions(+), 92 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 26a1cf95ab6..a331be5b611 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -197,5 +197,4 @@ ConfigureBench(STRINGS_BENCH ################################################################################################### # - json benchmark ------------------------------------------------------------------- ConfigureBench(JSON_BENCH - string/json_benchmark.cpp) - \ No newline at end of file + string/json_benchmark.cpp) \ No newline at end of file diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index d4d180339ef..e3f73d376e7 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -83,39 +83,6 @@ enum class parse_result { EMPTY, // success, but no data }; -/** - * @brief A struct which represents a string. - * - * Typically used to point into a substring of a larger string, such as - * the input json itself. - * - * @code - * // where cur_pos is a pointer to the beginning of a name string in the - * // input json and name_size is the computed size. - * json_string name{cur_pos, name_size}; - * @endcode - * - * Also used for parameter passing in a few cases: - * - * @code - * json_string wildcard{"*", 1}; - * func(wildcard); - * @endcode - */ -struct json_string { - const char* str; - int64_t len; - - constexpr json_string() : str(nullptr), len(-1) {} - constexpr json_string(const char* _str, int64_t _len) : str(_str), len(_len) {} - - constexpr bool operator==(json_string const& cmp) - { - return len == cmp.len && str != nullptr && cmp.str != nullptr && - thrust::equal(thrust::seq, str, str + len, cmp.str); - } -}; - /** * @brief Base parser class inherited by the (device-side) json_state class and * (host-side) path_state class. @@ -125,19 +92,22 @@ struct json_string { */ class parser { protected: - constexpr parser() : input(nullptr), input_len(0), pos(nullptr) {} - constexpr parser(const char* _input, int64_t _input_len) + CUDA_HOST_DEVICE_CALLABLE parser() : input(nullptr), input_len(0), pos(nullptr) {} + CUDA_HOST_DEVICE_CALLABLE parser(const char* _input, int64_t _input_len) : input(_input), input_len(_input_len), pos(_input) { parse_whitespace(); } - constexpr parser(parser const& p) : input(p.input), input_len(p.input_len), pos(p.pos) {} + CUDA_HOST_DEVICE_CALLABLE parser(parser const& p) + : input(p.input), input_len(p.input_len), pos(p.pos) + { + } - constexpr bool eof(const char* p) { return p - input >= input_len; } - constexpr bool eof() { return eof(pos); } + CUDA_HOST_DEVICE_CALLABLE bool eof(const char* p) { return p - input >= input_len; } + CUDA_HOST_DEVICE_CALLABLE bool eof() { return eof(pos); } - constexpr bool parse_whitespace() + CUDA_HOST_DEVICE_CALLABLE bool parse_whitespace() { while (!eof()) { if (is_whitespace(*pos)) { @@ -149,17 +119,17 @@ class parser { return false; } - constexpr parse_result parse_string(json_string& str, bool can_be_empty, char quote) + CUDA_HOST_DEVICE_CALLABLE parse_result parse_string(string_view& str, + bool can_be_empty, + char quote) { - str.str = nullptr; - str.len = 0; + str = string_view(nullptr, 0); if (parse_whitespace() && *pos == quote) { const char* start = ++pos; while (!eof()) { if (*pos == quote) { - str.str = start; - str.len = pos - start; + str = string_view(start, pos - start); pos++; return parse_result::SUCCESS; } @@ -173,14 +143,16 @@ class parser { // a name means: // - a string followed by a : // - no string - constexpr parse_result parse_name(json_string& name, bool can_be_empty, char quote) + CUDA_HOST_DEVICE_CALLABLE parse_result parse_name(string_view& name, + bool can_be_empty, + char quote) { if (parse_string(name, can_be_empty, quote) == parse_result::ERROR) { return parse_result::ERROR; } // if we got a real string, the next char must be a : - if (name.len > 0) { + if (name.size_bytes() > 0) { if (!parse_whitespace()) { return parse_result::ERROR; } if (*pos == ':') { pos++; @@ -193,7 +165,7 @@ class parser { // numbers, true, false, null. // this function is not particularly strong. badly formed values will get // consumed without throwing any errors - constexpr parse_result parse_non_string_value(json_string& val) + CUDA_HOST_DEVICE_CALLABLE parse_result parse_non_string_value(string_view& val) { if (!parse_whitespace()) { return parse_result::ERROR; } @@ -210,8 +182,7 @@ class parser { } pos = end; - val.str = start; - val.len = {end - start}; + val = string_view(start, end - start); return parse_result::SUCCESS; } @@ -222,7 +193,7 @@ class parser { char const* pos; private: - constexpr bool is_whitespace(char c) + CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c) { return c == ' ' || c == '\r' || c == '\n' || c == '\t' ? true : false; } @@ -248,7 +219,7 @@ struct json_output { bool has_output; char* output; - constexpr void add_output(const char* str, size_t len) + __device__ void add_output(const char* str, size_t len) { if (output != nullptr) { memcpy(output + output_len, str, len); } output_len += len; @@ -257,7 +228,7 @@ struct json_output { has_output = true; } - constexpr void add_output(json_string str) { add_output(str.str, str.len); } + __device__ void add_output(string_view const& str) { add_output(str.data(), str.size_bytes()); } }; enum json_element_type { NONE, OBJECT, ARRAY, VALUE }; @@ -268,14 +239,14 @@ enum json_element_type { NONE, OBJECT, ARRAY, VALUE }; */ class json_state : private parser { public: - constexpr json_state() + __device__ json_state() : parser(), cur_el_start(nullptr), cur_el_type(json_element_type::NONE), parent_el_type(json_element_type::NONE) { } - constexpr json_state(const char* _input, int64_t _input_len) + __device__ json_state(const char* _input, int64_t _input_len) : parser(_input, _input_len), cur_el_start(nullptr), cur_el_type(json_element_type::NONE), @@ -283,7 +254,7 @@ class json_state : private parser { { } - constexpr json_state(json_state const& j) + __device__ json_state(json_state const& j) : parser(j), cur_el_start(j.cur_el_start), cur_el_type(j.cur_el_type), @@ -291,8 +262,8 @@ class json_state : private parser { { } - // retrieve the entire current element as a json_string - constexpr parse_result extract_element(json_output* output, bool list_element) + // retrieve the entire current element into the output + __device__ parse_result extract_element(json_output* output, bool list_element) { char const* start = cur_el_start; char const* end = start; @@ -339,18 +310,18 @@ class json_state : private parser { if (*pos == ',') { pos++; } } - if (output != nullptr) { output->add_output({start, end - start}); } + if (output != nullptr) { output->add_output({start, static_cast(end - start)}); } return parse_result::SUCCESS; } // skip the next element - constexpr parse_result skip_element() { return extract_element(nullptr, false); } + __device__ parse_result skip_element() { return extract_element(nullptr, false); } // advance to the next element - constexpr parse_result next_element() { return next_element_internal(false); } + __device__ parse_result next_element() { return next_element_internal(false); } // advance inside the current element - constexpr parse_result child_element(json_element_type expected_type) + __device__ parse_result child_element(json_element_type expected_type) { if (expected_type != NONE && cur_el_type != expected_type) { return parse_result::ERROR; } @@ -362,7 +333,7 @@ class json_state : private parser { } // return the next element that matches the specified name. - constexpr parse_result next_matching_element(json_string const& name, bool inclusive) + __device__ parse_result next_matching_element(string_view const& name, bool inclusive) { // if we're not including the current element, skip it if (!inclusive) { @@ -372,7 +343,7 @@ class json_state : private parser { // loop until we find a match or there's nothing left do { // wildcard matches anything - if (name.len == 1 && name.str[0] == '*') { + if (name.size_bytes() == 1 && name.data()[0] == '*') { return parse_result::SUCCESS; } else if (cur_el_name == name) { return parse_result::SUCCESS; @@ -388,16 +359,16 @@ class json_state : private parser { private: // parse a value - either a string or a number/null/bool - constexpr parse_result parse_value() + __device__ parse_result parse_value() { if (!parse_whitespace()) { return parse_result::ERROR; } // string or number? - json_string unused; + string_view unused; return *pos == '\"' ? parse_string(unused, false, '\"') : parse_non_string_value(unused); } - constexpr parse_result next_element_internal(bool child) + __device__ parse_result next_element_internal(bool child) { // if we're not getting a child element, skip the current element. // this will leave pos as the first character -after- the close of @@ -447,7 +418,7 @@ class json_state : private parser { const char* cur_el_start; // pointer to the first character of the -value- of the current // element - not the name - json_string cur_el_name; // name of the current element (if applicable) + string_view cur_el_name; // name of the current element (if applicable) json_element_type cur_el_type; // type of the current element json_element_type parent_el_type; // parent element type }; @@ -459,8 +430,12 @@ enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, * an array of these operators applied to the incoming json string, */ struct path_operator { - constexpr path_operator() : type(path_operator_type::ERROR), index(-1), expected_type{NONE} {} - constexpr path_operator(path_operator_type _type, json_element_type _expected_type = NONE) + CUDA_HOST_DEVICE_CALLABLE path_operator() + : type(path_operator_type::ERROR), index(-1), expected_type{NONE} + { + } + CUDA_HOST_DEVICE_CALLABLE path_operator(path_operator_type _type, + json_element_type _expected_type = NONE) : type(_type), index(-1), expected_type{_expected_type} { } @@ -473,7 +448,7 @@ struct path_operator { // - you -can- use .* for both arrays and objects // a value of NONE imples any type accepted json_element_type expected_type; // the expected type of the element we're working with - json_string name; // name to match against (if applicable) + string_view name; // name to match against (if applicable) int index; // index for subscript operator }; @@ -496,12 +471,12 @@ class path_state : private parser { case '.': { path_operator op; - json_string term{".[", 2}; + string_view term{".[", 2}; if (parse_path_name(op.name, term)) { // this is another potential use case for __SPARK_BEHAVIORS / configurability // Spark currently only handles the wildcard operator inside [*], it does // not handle .* - if (op.name.len == 1 && op.name.str[0] == '*') { + if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') { op.type = path_operator_type::CHILD_WILDCARD; op.expected_type = NONE; } else { @@ -518,11 +493,11 @@ class path_state : private parser { // wildcard: [*] case '[': { path_operator op; - json_string term{"]", 1}; + string_view term{"]", 1}; bool const is_string = *pos == '\'' ? true : false; if (parse_path_name(op.name, term)) { pos++; - if (op.name.len == 1 && op.name.str[0] == '*') { + if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') { op.type = path_operator_type::CHILD_WILDCARD; op.expected_type = NONE; } else { @@ -530,9 +505,9 @@ class path_state : private parser { op.type = path_operator_type::CHILD; op.expected_type = OBJECT; } else { - op.type = path_operator_type::CHILD_INDEX; - op.index = - cudf::io::parse_numeric(op.name.str, op.name.str + op.name.len, json_opts, -1); + op.type = path_operator_type::CHILD_INDEX; + op.index = cudf::io::parse_numeric( + op.name.data(), op.name.data() + op.name.size_bytes(), json_opts, -1); CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath"); op.expected_type = ARRAY; } @@ -555,12 +530,11 @@ class path_state : private parser { private: cudf::io::parse_options_view json_opts{',', '\n', '\"', '.'}; - bool parse_path_name(json_string& name, json_string const& terminators) + bool parse_path_name(string_view& name, string_view const& terminators) { switch (*pos) { case '*': - name.str = pos; - name.len = 1; + name = string_view(pos, 1); pos++; break; @@ -571,22 +545,20 @@ class path_state : private parser { default: { size_t const chars_left = input_len - (pos - input); char const* end = std::find_first_of( - pos, pos + chars_left, terminators.str, terminators.str + terminators.len); + pos, pos + chars_left, terminators.data(), terminators.data() + terminators.size_bytes()); if (end) { - name.str = pos; - name.len = end - pos; - pos = end; + name = string_view(pos, end - pos); + pos = end; } else { - name.str = pos; - name.len = chars_left; - pos = input + input_len; + name = string_view(pos, chars_left); + pos = input + input_len; } break; } } // an empty name is not valid - CUDF_EXPECTS(name.len > 0, "Invalid empty name in JSONpath query string"); + CUDF_EXPECTS(name.size_bytes() > 0, "Invalid empty name in JSONpath query string"); return true; } @@ -618,7 +590,10 @@ std::tuple, int, bool> build_command_buffer( } if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; } // convert pointer to device pointer - if (op.name.len > 0) { op.name.str = json_path.data() + (op.name.str - h_json_path.data()); } + if (op.name.size_bytes() > 0) { + op.name = + string_view(json_path.data() + (op.name.data() - h_json_path.data()), op.name.size_bytes()); + } if (op.type == path_operator_type::ROOT) { CUDF_EXPECTS(h_operators.size() == 0, "Root operator ($) can only exist at the root"); } @@ -775,7 +750,7 @@ __device__ parse_result parse_json_path(json_state& j_state, case path_operator_type::CHILD_INDEX: { PARSE_TRY(ctx.j_state.child_element(op.expected_type)); if (last_result == parse_result::SUCCESS) { - json_string const any{"*", 1}; + string_view const any{"*", 1}; PARSE_TRY(ctx.j_state.next_matching_element(any, true)); if (last_result == parse_result::SUCCESS) { int idx; From ef03e30eb56ba39d4720d518d540d9263e8be302 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Mon, 29 Mar 2021 15:11:15 -0500 Subject: [PATCH 29/33] Fix errant whitespace in meta.yaml. Update benchmarks and JNI bindings to point to new location for get_json_object(). Use a grid stride loop in core kernel. Use some thrust_optionals where appropriate. Compute and return null count instead of just leaving it unknown. --- conda/recipes/libcudf/meta.yaml | 2 +- cpp/benchmarks/string/json_benchmark.cpp | 4 +- cpp/src/strings/json/json_path.cu | 125 ++++++++++++--------- java/src/main/native/src/ColumnViewJni.cpp | 1 + 4 files changed, 77 insertions(+), 55 deletions(-) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 60200869ff8..4ab222a3c85 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -176,7 +176,7 @@ test: - test -f $PREFIX/include/cudf/strings/detail/copying.hpp - test -f $PREFIX/include/cudf/strings/detail/fill.hpp - test -f $PREFIX/include/cudf/strings/detail/json.hpp - - test -f $PREFIX/include/cudf/strings/detail/replace.hpp + - test -f $PREFIX/include/cudf/strings/detail/replace.hpp - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp - test -f $PREFIX/include/cudf/strings/extract.hpp - test -f $PREFIX/include/cudf/strings/findall.hpp diff --git a/cpp/benchmarks/string/json_benchmark.cpp b/cpp/benchmarks/string/json_benchmark.cpp index dbb33927716..6fb6a07a8d0 100644 --- a/cpp/benchmarks/string/json_benchmark.cpp +++ b/cpp/benchmarks/string/json_benchmark.cpp @@ -22,8 +22,8 @@ #include #include +#include #include -#include class JsonPath : public cudf::benchmark { }; @@ -135,6 +135,6 @@ JSON_BENCHMARK_DEFINE(query2, "$.store.book"); JSON_BENCHMARK_DEFINE(query3, "$.store.*"); JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]"); JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category"); -JSON_BENCHMARK_DEFINE(query6, "$.store.['bicycle']"); +JSON_BENCHMARK_DEFINE(query6, "$.store['bicycle']"); JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']"); JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]"); diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index e3f73d376e7..44a43b7f095 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -215,17 +216,13 @@ class parser { */ struct json_output { size_t output_max_len; - size_t output_len; - bool has_output; char* output; + thrust::optional output_len; __device__ void add_output(const char* str, size_t len) { - if (output != nullptr) { memcpy(output + output_len, str, len); } - output_len += len; - // set this to true even if the string is empty. it implies we have - // seen actual output - has_output = true; + if (output != nullptr) { memcpy(output + output_len.value_or(0), str, len); } + output_len = output_len.value_or(0) + len; } __device__ void add_output(string_view const& str) { add_output(str.data(), str.size_bytes()); } @@ -570,10 +567,9 @@ class path_state : private parser { * * @param json_path The incoming json path * @param stream Cuda stream to perform any gpu actions on - * @returns A tuple containing the command buffer, the maximum stack depth required and whether or - * not the command buffer is empty. + * @returns A pair containing the command buffer, and maximum stack depth required. */ -std::tuple, int, bool> build_command_buffer( +std::pair>, int> build_command_buffer( cudf::string_scalar const& json_path, rmm::cuda_stream_view stream) { std::string h_json_path = json_path.to_string(stream); @@ -606,17 +602,12 @@ std::tuple, int, bool> build_command_buffer( h_operators.push_back(op); } while (op.type != path_operator_type::END); - rmm::device_uvector d_operators(h_operators.size(), stream); - CUDA_TRY(cudaMemcpyAsync(d_operators.data(), - h_operators.data(), - sizeof(path_operator) * h_operators.size(), - cudaMemcpyHostToDevice, - stream.value())); - stream.synchronize(); - - return {std::move(d_operators), - max_stack_depth, - h_operators.size() == 1 && h_operators[0].type == path_operator_type::END ? true : false}; + auto const is_empty = h_operators.size() == 1 && h_operators[0].type == path_operator_type::END; + return is_empty + ? std::make_pair(thrust::nullopt, 0) + : std::make_pair( + thrust::make_optional(cudf::detail::make_device_uvector_sync(h_operators, stream)), + max_stack_depth); } #define PARSE_TRY(_x) \ @@ -808,7 +799,7 @@ __device__ thrust::pair get_json_object_single( size_t out_buf_size) { json_state j_state(input, input_len); - json_output output{out_buf_size, 0, false, out_buf}; + json_output output{out_buf_size, out_buf}; auto const result = parse_json_path(j_state, commands, output); @@ -829,17 +820,26 @@ __device__ thrust::pair get_json_object_single( * @param out_buf Buffer used to store the results of the query (nullptr in the size computation * step) * @param out_validity Output validity buffer (nullptr in the size computation step) + * @param out_valid_count Output count of # of valid bits (nullptr in the size computation step) */ -__global__ void get_json_object_kernel(column_device_view col, - path_operator const* const commands, - size_type* output_offsets, - char* out_buf, - bitmask_type* out_validity) +template +__launch_bounds__(block_size) __global__ + void get_json_object_kernel(column_device_view col, + path_operator const* const commands, + size_type* output_offsets, + char* out_buf, + bitmask_type* out_validity, + size_type* out_valid_count) { - uint64_t const tid = threadIdx.x + (blockDim.x * blockIdx.x); + size_type tid = threadIdx.x + (blockDim.x * blockIdx.x); + size_type stride = blockDim.x * gridDim.x; - bool is_valid = false; - if (tid < col.size()) { + if (out_valid_count) { *out_valid_count = 0; } + size_type warp_valid_count{0}; + + auto active_threads = __ballot_sync(0xffffffff, tid < col.size()); + while (tid < col.size()) { + bool is_valid = false; string_view const str = col.element(tid); size_type output_size = 0; if (str.size_bytes() > 0) { @@ -850,21 +850,32 @@ __global__ void get_json_object_kernel(column_device_view col, json_output out; thrust::tie(result, out) = get_json_object_single(str.data(), str.size_bytes(), commands, dst, dst_size); - output_size = out.output_len; - if (out.has_output > 0 && result == parse_result::SUCCESS) { is_valid = true; } + output_size = out.output_len.value_or(0); + if (out.output_len.has_value() && result == parse_result::SUCCESS) { is_valid = true; } } // filled in only during the precompute step if (!out_buf) { output_offsets[tid] = static_cast(output_size); } - } - // validity filled in only during the output step - if (out_validity) { - uint32_t mask = __ballot_sync(0xffffffff, is_valid); - // 0th lane of the warp writes the validity - if (!(tid % cudf::detail::warp_size) && tid < col.size()) { - out_validity[cudf::word_index(tid)] = mask; + // validity filled in only during the output step + if (out_validity) { + uint32_t mask = __ballot_sync(active_threads, is_valid); + // 0th lane of the warp writes the validity + if (!(tid % cudf::detail::warp_size)) { + out_validity[cudf::word_index(tid)] = mask; + warp_valid_count += __popc(mask); + } } + + tid += stride; + active_threads = __ballot_sync(active_threads, tid < col.size()); + } + + // sum the valid counts across the whole block + if (out_valid_count) { + size_type block_valid_count = + cudf::detail::single_lane_block_sum_reduce(warp_valid_count); + if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); } } } @@ -877,8 +888,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c rmm::mr::device_memory_resource* mr) { // preprocess the json_path into a command buffer - std::tuple, int, bool> preprocess = - build_command_buffer(json_path, stream); + auto preprocess = build_command_buffer(json_path, stream); CUDF_EXPECTS(std::get<1>(preprocess) <= max_command_stack_depth, "Encountered json_path string that is too complex"); @@ -888,7 +898,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::mutable_column_view offsets_view(*offsets); // if the query is empty, return a string column containing all nulls - if (std::get<2>(preprocess)) { + if (!std::get<0>(preprocess).has_value()) { return std::make_unique( data_type{type_id::STRING}, col.size(), @@ -897,13 +907,20 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c col.size()); // null count } - cudf::detail::grid_1d const grid{col.size(), 512}; + constexpr int block_size = 512; + cudf::detail::grid_1d const grid{col.size(), block_size}; auto cdv = column_device_view::create(col.parent(), stream); // preprocess sizes (returned in the offsets buffer) - get_json_object_kernel<<>>( - *cdv, std::get<0>(preprocess).data(), offsets_view.head(), nullptr, nullptr); + get_json_object_kernel + <<>>( + *cdv, + std::get<0>(preprocess).value().data(), + offsets_view.head(), + nullptr, + nullptr, + nullptr); // convert sizes to offsets thrust::exclusive_scan(rmm::exec_policy(stream), @@ -925,17 +942,20 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c // compute results cudf::mutable_column_view chars_view(*chars); - get_json_object_kernel<<>>( - *cdv, - std::get<0>(preprocess).data(), - offsets_view.head(), - chars_view.head(), - static_cast(validity.data())); + rmm::device_scalar d_valid_count{0, stream}; + get_json_object_kernel + <<>>( + *cdv, + std::get<0>(preprocess).value().data(), + offsets_view.head(), + chars_view.head(), + static_cast(validity.data()), + d_valid_count.data()); return make_strings_column(col.size(), std::move(offsets), std::move(chars), - UNKNOWN_NULL_COUNT, + col.size() - d_valid_count.value(), std::move(validity), stream, mr); @@ -951,6 +971,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::string_scalar const& json_path, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); return detail::get_json_object(col, json_path, 0, mr); } diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index fdfbee116cd..0023ebd879e 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include From 975ee51d5ef2ef9e0fb4c60b9df09b26114eb7c2 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Mon, 29 Mar 2021 16:41:19 -0500 Subject: [PATCH 30/33] Remove SPARK_BEHAVIORS #define. Use thrust::optional for more kernel parameters. --- cpp/src/strings/json/json_path.cu | 70 +++++++++++-------------------- 1 file changed, 24 insertions(+), 46 deletions(-) diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 44a43b7f095..ad598574044 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -34,6 +34,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -46,29 +48,6 @@ namespace { #define DEBUG_NEWLINE constexpr int DEBUG_NEWLINE_LEN = 0; -// temporary? spark doesn't strictly follow the JSONPath spec. -// I think this probably could be a configurable enum to control -// the kind of output you get and what features are supported. -// -// Current known differences: -// - When returning a string value as a single element, Spark strips the quotes. -// standard: "whee" -// spark: whee -// -// - Spark only supports the wildcard operator when in a subscript, eg [*] -// It does not handle .* -// -// Other, non-spark known differences: -// -// - In jsonpath_ng, name subscripts can use double quotes instead of the standard -// single quotes in the query string. -// standard: $.thing['subscript'] -// jsonpath_ng: $.thing["subscript"] -// -// Currently, this code only allows single-quotes but that could be expanded if necessary. -// -#define SPARK_BEHAVIORS - /** * @brief Result of calling a parse function. * @@ -271,14 +250,13 @@ class json_state : private parser { if (parse_value() != parse_result::SUCCESS) { return parse_result::ERROR; } end = pos; -#if defined(SPARK_BEHAVIORS) - // spark/hive-specific behavior. if this is a non-list-element wrapped in quotes, - // strip them + // SPARK-specific behavior. if this is a non-list-element wrapped in quotes, + // strip them. we may need to make this behavior configurable in some way + // later on. if (!list_element && *start == '\"' && *(end - 1) == '\"') { start++; end--; } -#endif } // otherwise, march through everything inside else { @@ -816,25 +794,23 @@ __device__ thrust::pair get_json_object_single( * @param col Device view of the incoming string * @param commands JSONPath command buffer * @param output_offsets Buffer used to store the string offsets for the results of the query - * (nullptr in the size computation step) - * @param out_buf Buffer used to store the results of the query (nullptr in the size computation - * step) - * @param out_validity Output validity buffer (nullptr in the size computation step) - * @param out_valid_count Output count of # of valid bits (nullptr in the size computation step) + * @param out_buf Buffer used to store the results of the query + * @param out_validity Output validity buffer + * @param out_valid_count Output count of # of valid bits */ template __launch_bounds__(block_size) __global__ void get_json_object_kernel(column_device_view col, path_operator const* const commands, size_type* output_offsets, - char* out_buf, - bitmask_type* out_validity, - size_type* out_valid_count) + thrust::optional out_buf, + thrust::optional out_validity, + thrust::optional out_valid_count) { size_type tid = threadIdx.x + (blockDim.x * blockIdx.x); size_type stride = blockDim.x * gridDim.x; - if (out_valid_count) { *out_valid_count = 0; } + if (out_valid_count.has_value()) { *(out_valid_count.value()) = 0; } size_type warp_valid_count{0}; auto active_threads = __ballot_sync(0xffffffff, tid < col.size()); @@ -843,8 +819,9 @@ __launch_bounds__(block_size) __global__ string_view const str = col.element(tid); size_type output_size = 0; if (str.size_bytes() > 0) { - char* dst = out_buf ? out_buf + output_offsets[tid] : nullptr; - size_t const dst_size = out_buf ? output_offsets[tid + 1] - output_offsets[tid] : 0; + char* dst = out_buf.has_value() ? out_buf.value() + output_offsets[tid] : nullptr; + size_t const dst_size = + out_buf.has_value() ? output_offsets[tid + 1] - output_offsets[tid] : 0; parse_result result; json_output out; @@ -854,15 +831,16 @@ __launch_bounds__(block_size) __global__ if (out.output_len.has_value() && result == parse_result::SUCCESS) { is_valid = true; } } - // filled in only during the precompute step - if (!out_buf) { output_offsets[tid] = static_cast(output_size); } + // filled in only during the precompute step. during the compute step, the offsets + // are fed back in so we do -not- want to write them out + if (!out_buf.has_value()) { output_offsets[tid] = static_cast(output_size); } // validity filled in only during the output step - if (out_validity) { + if (out_validity.has_value()) { uint32_t mask = __ballot_sync(active_threads, is_valid); // 0th lane of the warp writes the validity if (!(tid % cudf::detail::warp_size)) { - out_validity[cudf::word_index(tid)] = mask; + out_validity.value()[cudf::word_index(tid)] = mask; warp_valid_count += __popc(mask); } } @@ -875,7 +853,7 @@ __launch_bounds__(block_size) __global__ if (out_valid_count) { size_type block_valid_count = cudf::detail::single_lane_block_sum_reduce(warp_valid_count); - if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); } + if (threadIdx.x == 0) { atomicAdd(out_valid_count.value(), block_valid_count); } } } @@ -918,9 +896,9 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c *cdv, std::get<0>(preprocess).value().data(), offsets_view.head(), - nullptr, - nullptr, - nullptr); + thrust::nullopt, + thrust::nullopt, + thrust::nullopt); // convert sizes to offsets thrust::exclusive_scan(rmm::exec_policy(stream), From bc649d88f9fe8fc99061271446130f223900093e Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Tue, 30 Mar 2021 16:59:33 -0500 Subject: [PATCH 31/33] Clean up includes in detail/json.hpp. Change copyright date back to 2020 in substring.hpp. Add strings_json doxygen group. Make sure JSONPath terminology is used consistently. Other small PR review cleanup. --- cpp/include/cudf/strings/detail/json.hpp | 4 +--- cpp/include/cudf/strings/substring.hpp | 2 +- cpp/include/doxygen_groups.h | 1 + cpp/src/strings/json/json_path.cu | 11 ++++------- cpp/tests/CMakeLists.txt | 4 ++-- 5 files changed, 9 insertions(+), 13 deletions(-) diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/strings/detail/json.hpp index 35b7dec6449..e6a0b49f102 100644 --- a/cpp/include/cudf/strings/detail/json.hpp +++ b/cpp/include/cudf/strings/detail/json.hpp @@ -16,12 +16,10 @@ #pragma once -#include +#include #include -#include - namespace cudf { namespace strings { namespace detail { diff --git a/cpp/include/cudf/strings/substring.hpp b/cpp/include/cudf/strings/substring.hpp index d1431060792..6941615a0c2 100644 --- a/cpp/include/cudf/strings/substring.hpp +++ b/cpp/include/cudf/strings/substring.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 65dd5c73475..f78ff98d49d 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -127,6 +127,7 @@ * @defgroup strings_modify Modifying * @defgroup strings_replace Replacing * @defgroup strings_split Splitting + * @defgroup strings_json JSON * @} * @defgroup dictionary_apis Dictionary * @{ diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index ad598574044..d6ae72fba12 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -173,10 +173,7 @@ class parser { char const* pos; private: - CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c) - { - return c == ' ' || c == '\r' || c == '\n' || c == '\t' ? true : false; - } + CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c) { return c <= ' ' ? true : false; } }; /** @@ -533,7 +530,7 @@ class path_state : private parser { } // an empty name is not valid - CUDF_EXPECTS(name.size_bytes() > 0, "Invalid empty name in JSONpath query string"); + CUDF_EXPECTS(name.size_bytes() > 0, "Invalid empty name in JSONPath query string"); return true; } @@ -752,7 +749,7 @@ __device__ parse_result parse_json_path(json_state& j_state, } // hardcoding this for now. to reach a stack depth of 8 would require -// a jsonpath containing 7 nested wildcards so this is probably reasonable. +// a JSONPath containing 7 nested wildcards so this is probably reasonable. constexpr int max_command_stack_depth = 8; /** @@ -868,7 +865,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c // preprocess the json_path into a command buffer auto preprocess = build_command_buffer(json_path, stream); CUDF_EXPECTS(std::get<1>(preprocess) <= max_command_stack_depth, - "Encountered json_path string that is too complex"); + "Encountered JSONPath string that is too complex"); // allocate output offsets buffer. auto offsets = cudf::make_fixed_width_column( diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index dcbba9b7220..79666a3325f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -333,6 +333,7 @@ ConfigureTest(STRINGS_TEST strings/hash_string.cu strings/integers_tests.cu strings/ipv4_tests.cpp + strings/json_tests.cpp strings/pad_tests.cpp strings/replace_regex_tests.cpp strings/replace_tests.cpp @@ -340,8 +341,7 @@ ConfigureTest(STRINGS_TEST strings/strip_tests.cpp strings/substring_tests.cpp strings/translate_tests.cpp - strings/urls_tests.cpp - strings/json_tests.cpp) + strings/urls_tests.cpp) ################################################################################################### # - structs test ---------------------------------------------------------------------------------- From e69e6bb99fa7da654d3559cdb1c260a5192f05e1 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Tue, 30 Mar 2021 17:49:00 -0500 Subject: [PATCH 32/33] Use offset_type when dealing with output offsets view. --- cpp/src/strings/json/json_path.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index d6ae72fba12..4958848633a 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -799,7 +799,7 @@ template __launch_bounds__(block_size) __global__ void get_json_object_kernel(column_device_view col, path_operator const* const commands, - size_type* output_offsets, + offset_type* output_offsets, thrust::optional out_buf, thrust::optional out_validity, thrust::optional out_valid_count) @@ -830,7 +830,7 @@ __launch_bounds__(block_size) __global__ // filled in only during the precompute step. during the compute step, the offsets // are fed back in so we do -not- want to write them out - if (!out_buf.has_value()) { output_offsets[tid] = static_cast(output_size); } + if (!out_buf.has_value()) { output_offsets[tid] = static_cast(output_size); } // validity filled in only during the output step if (out_validity.has_value()) { @@ -892,19 +892,19 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c <<>>( *cdv, std::get<0>(preprocess).value().data(), - offsets_view.head(), + offsets_view.head(), thrust::nullopt, thrust::nullopt, thrust::nullopt); // convert sizes to offsets thrust::exclusive_scan(rmm::exec_policy(stream), - offsets_view.head(), - offsets_view.head() + col.size() + 1, - offsets_view.head(), + offsets_view.head(), + offsets_view.head() + col.size() + 1, + offsets_view.head(), 0); size_type const output_size = - cudf::detail::get_value(offsets_view, col.size(), stream); + cudf::detail::get_value(offsets_view, col.size(), stream); // allocate output string column auto chars = cudf::make_fixed_width_column( @@ -922,7 +922,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c <<>>( *cdv, std::get<0>(preprocess).value().data(), - offsets_view.head(), + offsets_view.head(), chars_view.head(), static_cast(validity.data()), d_valid_count.data()); From 74a71542099822e9211a823ac02d02fee6534a9f Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Tue, 30 Mar 2021 20:16:29 -0500 Subject: [PATCH 33/33] Newline in benchmark CMakeLists.txt. Remove more includes. Remove unneeded ternary. --- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/src/strings/json/json_path.cu | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 4d5914fe1b8..11af408f1c5 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -206,4 +206,4 @@ ConfigureBench(STRINGS_BENCH ################################################################################################### # - json benchmark ------------------------------------------------------------------- ConfigureBench(JSON_BENCH - string/json_benchmark.cpp) \ No newline at end of file + string/json_benchmark.cpp) diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 4958848633a..cd8aae12070 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -21,14 +21,12 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include @@ -173,7 +171,7 @@ class parser { char const* pos; private: - CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c) { return c <= ' ' ? true : false; } + CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c) { return c <= ' '; } }; /**