diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 39587b4bd05..75955428eab 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -178,12 +178,14 @@ test: - test -f $PREFIX/include/cudf/strings/detail/converters.hpp - test -f $PREFIX/include/cudf/strings/detail/copying.hpp - test -f $PREFIX/include/cudf/strings/detail/fill.hpp + - test -f $PREFIX/include/cudf/strings/detail/json.hpp - test -f $PREFIX/include/cudf/strings/detail/replace.hpp - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp - test -f $PREFIX/include/cudf/strings/extract.hpp - test -f $PREFIX/include/cudf/strings/findall.hpp - test -f $PREFIX/include/cudf/strings/find.hpp - test -f $PREFIX/include/cudf/strings/find_multiple.hpp + - test -f $PREFIX/include/cudf/strings/json.hpp - test -f $PREFIX/include/cudf/strings/padding.hpp - test -f $PREFIX/include/cudf/strings/replace.hpp - test -f $PREFIX/include/cudf/strings/replace_re.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5cd82e52180..61cb13d3445 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -346,6 +346,7 @@ add_library(cudf src/strings/find.cu src/strings/find_multiple.cu src/strings/padding.cu + src/strings/json/json_path.cu src/strings/regex/regcomp.cpp src/strings/regex/regexec.cu src/strings/replace/backref_re.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 5aa7e0132f8..11af408f1c5 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -202,3 +202,8 @@ ConfigureBench(STRINGS_BENCH string/substring_benchmark.cpp string/translate_benchmark.cpp string/url_decode_benchmark.cpp) + +################################################################################################### +# - json benchmark ------------------------------------------------------------------- +ConfigureBench(JSON_BENCH + string/json_benchmark.cpp) diff --git a/cpp/benchmarks/string/json_benchmark.cpp 
b/cpp/benchmarks/string/json_benchmark.cpp
new file mode 100644
index 00000000000..6fb6a07a8d0
--- /dev/null
+++ b/cpp/benchmarks/string/json_benchmark.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// NOTE(review): in this copy of the patch the #include targets and all template
+// arguments (e.g. on static_cast, std::vector, template<...>) appear to have been
+// lost; restore them from the original commit before applying.
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+class JsonPath : public cudf::benchmark {
+};
+
+// pseudo-random value in [0, 1], derived from rand() / RAND_MAX
+float frand() { return static_cast(rand()) / static_cast(RAND_MAX); }
+
+// pseudo-random value computed as min + frand() * (max - min).
+// not referenced by the rest of the benchmark code visible here.
+int rand_range(int min, int max) { return min + static_cast(frand() * (max - min)); }
+
+// sample "book" json objects that get randomly repeated to build a row
+std::vector Books{
+  "{\n\"category\": \"reference\",\n\"author\": \"Nigel Rees\",\n\"title\": \"Sayings of the "
+  "Century\",\n\"price\": 8.95\n}",
+  "{\n\"category\": \"fiction\",\n\"author\": \"Evelyn Waugh\",\n\"title\": \"Sword of "
+  "Honour\",\n\"price\": 12.99\n}",
+  "{\n\"category\": \"fiction\",\n\"author\": \"Herman Melville\",\n\"title\": \"Moby "
+  "Dick\",\n\"isbn\": \"0-553-21311-3\",\n\"price\": 8.99\n}",
+  "{\n\"category\": \"fiction\",\n\"author\": \"J. R. R. Tolkien\",\n\"title\": \"The Lord of the "
+  "Rings\",\n\"isbn\": \"0-395-19395-8\",\n\"price\": 22.99\n}"};
+// rough size in bytes of one Books entry; used by build_row() to budget a row
+constexpr int Approx_book_size = 110;
+// sample "bicycle" json objects that get randomly repeated to build a row
+std::vector Bicycles{
+  "{\"color\": \"red\", \"price\": 9.95}",
+  "{\"color\": \"green\", \"price\": 29.95}",
+  "{\"color\": \"blue\", \"price\": 399.95}",
+  "{\"color\": \"yellow\", \"price\": 99.95}",
+  "{\"color\": \"mauve\", \"price\": 199.95}",
+};
+// rough size in bytes of one Bicycles entry; used by build_row() to budget a row
+constexpr int Approx_bicycle_size = 33;
+// extra top-level field placed next to "store" in every generated row
+std::string Misc{"\n\"expensive\": 10\n"};
+// concatenates `num_values` randomly chosen entries of `values`, separated by ",\n"
+std::string generate_field(std::vector const& values, int num_values)
+{
+  std::string res;
+  for (int idx = 0; idx < num_values; idx++) {
+    if (idx > 0) { res += std::string(",\n"); }
+    // frand() can return exactly 1.0, so clamp the index to the last entry
+    int vindex = std::min(static_cast(floor(frand() * values.size())),
+                          static_cast(values.size() - 1));
+    res += values[vindex];
+  }
+  return res;
+}
+
+// builds one json document of roughly `desired_bytes` bytes: a "store" object holding a
+// "book" array and a "bicycle" array, plus the Misc field, in randomized order
+std::string build_row(int desired_bytes)
+{
+  // always have at least 2 books and 2 bikes
+  int num_books = 2;
+  int num_bicycles = 2;
+  int remaining_bytes =
+    desired_bytes - ((num_books * Approx_book_size) + (num_bicycles * Approx_bicycle_size));
+
+  // divide up the remainder between books and bikes
+  float book_pct = frand();
+  float bicycle_pct = 1.0f - book_pct;
+  num_books += (remaining_bytes * book_pct) / Approx_book_size;
+  num_bicycles += (remaining_bytes * bicycle_pct) / Approx_bicycle_size;
+
+  std::string books = "\"book\": [\n" + generate_field(Books, num_books) + "]\n";
+  std::string bicycles = "\"bicycle\": [\n" + generate_field(Bicycles, num_bicycles) + "]\n";
+
+  // randomize whether books or bicycles come first inside "store"
+  std::string store = "\"store\": {\n";
+  if (frand() <= 0.5f) {
+    store += books + std::string(",\n") + bicycles;
+  } else {
+    store += bicycles + std::string(",\n") + books;
+  }
+  store += std::string("}\n");
+
+  // randomize whether "store" or the Misc field comes first in the row
+  std::string row = std::string("{\n");
+  if (frand() <= 0.5f) {
+    row += store + std::string(",\n") + Misc;
+  } else {
+    row += Misc + std::string(",\n") + store;
+  }
+  row += std::string("}\n");
+  return row;
+}
+
+// benchmark body: state.range(0) is the number of rows, state.range(1) the approximate
+// number of bytes per row, and `query_arg` is used to construct the JSONPath string
+template
+static void BM_case(benchmark::State& state, QueryArg&&... query_arg)
+{
+  // fixed seed so every run generates the same input rows
+  srand(5236);
+  auto iter = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(0),
+    [desired_bytes = state.range(1)](int index) { return build_row(desired_bytes); });
+  int num_rows = state.range(0);
+  cudf::test::strings_column_wrapper input(iter, iter + num_rows);
+  cudf::strings_column_view scv(input);
+  size_t num_chars = scv.chars().size();
+
+  std::string json_path(query_arg...);
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    auto result = cudf::strings::get_json_object(scv, json_path);
+    cudaStreamSynchronize(0);
+  }
+
+  // this isn't strictly 100% accurate. a given query isn't necessarily
+  // going to visit every single incoming character. but in spirit it does.
+  state.SetBytesProcessed(state.iterations() * num_chars);
+}
+
+// registers BM_case for one query over the cross product of
+// row counts {100, 1000, 100000, 400000} and row sizes {300, 600, 4096}
+#define JSON_BENCHMARK_DEFINE(name, query) \
+  BENCHMARK_CAPTURE(BM_case, name, query) \
+    ->ArgsProduct({{100, 1000, 100000, 400000}, {300, 600, 4096}}) \
+    ->UseManualTime() \
+    ->Unit(benchmark::kMillisecond);
+
+JSON_BENCHMARK_DEFINE(query0, "$");
+JSON_BENCHMARK_DEFINE(query1, "$.store");
+JSON_BENCHMARK_DEFINE(query2, "$.store.book");
+JSON_BENCHMARK_DEFINE(query3, "$.store.*");
+JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]");
+JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category");
+JSON_BENCHMARK_DEFINE(query6, "$.store['bicycle']");
+JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']");
+JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]");
diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/strings/detail/json.hpp
new file mode 100644
index 00000000000..e6a0b49f102
--- /dev/null
+++ b/cpp/include/cudf/strings/detail/json.hpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +/** + * @copydoc cudf::strings::get_json_object + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr get_json_object( + cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp new file mode 100644 index 00000000000..b39e4a2027c --- /dev/null +++ b/cpp/include/cudf/strings/json.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +namespace cudf { +namespace strings { + +/** + * @addtogroup strings_json + * @{ + * @file + */ + +/** + * @brief Apply a JSONPath string to all rows in an input strings column. + * + * Applies a JSONPath string to an incoming strings column where each row in the column + * is a valid json string. The output is returned by row as a strings column. + * + * https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html + * Implements only the operators: $ . [] * + * + * @param col The input strings column. Each row must contain a valid json string + * @param json_path The JSONPath string to be applied to each row + * @param mr Resource for allocating device memory. + * @return New strings column containing the retrieved json object strings + */ +std::unique_ptr get_json_object( + cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 65dd5c73475..f78ff98d49d 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -127,6 +127,7 @@ * @defgroup strings_modify Modifying * @defgroup strings_replace Replacing * @defgroup strings_split Splitting + * @defgroup strings_json JSON * @} * @defgroup dictionary_apis Dictionary * @{ diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 86e5f1fdcae..44acc7fc55f 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -196,7 +196,7 @@ __global__ void __launch_bounds__(csvparse_block_dim) } else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) || serialized_trie_contains(opts.trie_false, {field_start, field_len})) { atomicAdd(&d_columnData[actual_col].bool_count, 1); - } else if (cudf::io::gpu::is_infinity(field_start, next_delimiter)) { + } else if 
(cudf::io::is_infinity(field_start, next_delimiter)) { atomicAdd(&d_columnData[actual_col].float_count, 1); } else { long countNumber = 0; @@ -277,7 +277,7 @@ __inline__ __device__ T decode_value(char const *begin, char const *end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } template @@ -285,7 +285,7 @@ __inline__ __device__ T decode_value(char const *begin, char const *end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } template <> diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 5efb64fd4d5..75910ae6b5b 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -114,7 +114,7 @@ __inline__ __device__ T decode_value(const char *begin, uint64_t end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } /** @@ -131,7 +131,7 @@ __inline__ __device__ T decode_value(const char *begin, const char *end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } /** diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 584d2c9a74a..b7719cba580 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -20,6 +20,8 @@ #include #include +#include + #include using cudf::device_span; @@ -82,67 +84,6 @@ struct parse_options { } }; -namespace gpu { -/** - * @brief CUDA kernel iterates over the data until the end of the current field - * - * Also iterates over (one or more) delimiter characters after the field. - * Function applies to formats with field delimiters and line terminators. 
- * - * @param begin Pointer to the first element of the string - * @param end Pointer to the first element after the string - * @param opts A set of parsing options - * @param escape_char A boolean value to signify whether to consider `\` as escape character or - * just a character. - * - * @return Pointer to the last character in the field, including the - * delimiter(s) following the field data - */ -__device__ __inline__ char const* seek_field_end(char const* begin, - char const* end, - parse_options_view const& opts, - bool escape_char = false) -{ - bool quotation = false; - auto current = begin; - bool escape_next = false; - while (true) { - // Use simple logic to ignore control chars between any quote seq - // Handles nominal cases including doublequotes within quotes, but - // may not output exact failures as PANDAS for malformed fields. - // Check for instances such as "a2\"bc" and "\\" if `escape_char` is true. - - if (*current == opts.quotechar and not escape_next) { - quotation = !quotation; - } else if (!quotation) { - if (*current == opts.delimiter) { - while (opts.multi_delimiter && current < end && *(current + 1) == opts.delimiter) { - ++current; - } - break; - } else if (*current == opts.terminator) { - break; - } else if (*current == '\r' && (current + 1 < end && *(current + 1) == '\n')) { - --end; - break; - } - } - - if (escape_char == true) { - // If a escape character is encountered, escape next character in next loop. - if (escape_next == false and *current == '\\') { - escape_next = true; - } else { - escape_next = false; - } - } - - if (current >= end) break; - current++; - } - return current; -} - /** * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization * for integral types. Handles hexadecimal digits, both uppercase and lowercase. 
@@ -155,7 +96,7 @@ __device__ __inline__ char const* seek_field_end(char const* begin, * @return uint8_t Numeric value of the character, or `0` */ template ::value>* = nullptr> -__device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) +constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; if (c >= 'a' && c <= 'f') return c - 'a' + 10; @@ -176,7 +117,7 @@ __device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) * @return uint8_t Numeric value of the character, or `0` */ template ::value>* = nullptr> -__device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) +constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; @@ -185,10 +126,7 @@ __device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) } // Converts character to lowercase. -__inline__ __device__ char to_lower(char const c) -{ - return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; -} +constexpr char to_lower(char const c) { return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; } /** * @brief Checks if string is infinity, case insensitive with/without sign @@ -199,7 +137,7 @@ __inline__ __device__ char to_lower(char const c) * @param end Pointer to the first element after the string * @return true if string is valid infinity, else false. 
*/ -__inline__ __device__ bool is_infinity(char const* begin, char const* end) +constexpr bool is_infinity(char const* begin, char const* end) { if (*begin == '-' || *begin == '+') begin++; char const* cinf = "infinity"; @@ -223,9 +161,10 @@ __inline__ __device__ bool is_infinity(char const* begin, char const* end) * @return The parsed and converted value */ template -__inline__ __device__ T parse_numeric(const char* begin, - const char* end, - parse_options_view const& opts) +constexpr T parse_numeric(const char* begin, + const char* end, + parse_options_view const& opts, + T error_result = std::numeric_limits::quiet_NaN()) { T value{}; bool all_digits_valid = true; @@ -281,11 +220,72 @@ __inline__ __device__ T parse_numeric(const char* begin, if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); } } } - if (!all_digits_valid) { return std::numeric_limits::quiet_NaN(); } + if (!all_digits_valid) { return error_result; } return value * sign; } +namespace gpu { +/** + * @brief CUDA kernel iterates over the data until the end of the current field + * + * Also iterates over (one or more) delimiter characters after the field. + * Function applies to formats with field delimiters and line terminators. + * + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @param opts A set of parsing options + * @param escape_char A boolean value to signify whether to consider `\` as escape character or + * just a character. 
+ * + * @return Pointer to the last character in the field, including the + * delimiter(s) following the field data + */ +__device__ __inline__ char const* seek_field_end(char const* begin, + char const* end, + parse_options_view const& opts, + bool escape_char = false) +{ + bool quotation = false; + auto current = begin; + bool escape_next = false; + while (true) { + // Use simple logic to ignore control chars between any quote seq + // Handles nominal cases including doublequotes within quotes, but + // may not output exact failures as PANDAS for malformed fields. + // Check for instances such as "a2\"bc" and "\\" if `escape_char` is true. + + if (*current == opts.quotechar and not escape_next) { + quotation = !quotation; + } else if (!quotation) { + if (*current == opts.delimiter) { + while (opts.multi_delimiter && current < end && *(current + 1) == opts.delimiter) { + ++current; + } + break; + } else if (*current == opts.terminator) { + break; + } else if (*current == '\r' && (current + 1 < end && *(current + 1) == '\n')) { + --end; + break; + } + } + + if (escape_char == true) { + // If a escape character is encountered, escape next character in next loop. + if (escape_next == false and *current == '\\') { + escape_next = true; + } else { + escape_next = false; + } + } + + if (current >= end) break; + current++; + } + return current; +} + /** * @brief Lexicographically compare digits in input against string * representing an integer diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu new file mode 100644 index 00000000000..cd8aae12070 --- /dev/null +++ b/cpp/src/strings/json/json_path.cu @@ -0,0 +1,952 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +namespace { + +// debug accessibility + +// change to "\n" and 1 to make output more readable +#define DEBUG_NEWLINE +constexpr int DEBUG_NEWLINE_LEN = 0; + +/** + * @brief Result of calling a parse function. + * + * The primary use of this is to distinguish between "success" and + * "success but no data" return cases. For example, if you are reading the + * values of an array you might call a parse function in a while loop. You + * would want to continue doing this until you either encounter an error (parse_result::ERROR) + * or you get nothing back (parse_result::EMPTY) + */ +enum class parse_result { + ERROR, // failure + SUCCESS, // success + EMPTY, // success, but no data +}; + +/** + * @brief Base parser class inherited by the (device-side) json_state class and + * (host-side) path_state class. + * + * Contains a number of useful utility functions common to parsing json and + * JSONPath strings. 
+ */
+class parser {
+ protected:
+  CUDA_HOST_DEVICE_CALLABLE parser() : input(nullptr), input_len(0), pos(nullptr) {}
+  // wraps `_input_len` bytes starting at `_input` and moves the cursor past any
+  // leading whitespace
+  CUDA_HOST_DEVICE_CALLABLE parser(const char* _input, int64_t _input_len)
+    : input(_input), input_len(_input_len), pos(_input)
+  {
+    parse_whitespace();
+  }
+
+  CUDA_HOST_DEVICE_CALLABLE parser(parser const& p)
+    : input(p.input), input_len(p.input_len), pos(p.pos)
+  {
+  }
+
+  // true if `p` is at or beyond the end of the input buffer
+  CUDA_HOST_DEVICE_CALLABLE bool eof(const char* p) { return p - input >= input_len; }
+  // true if the cursor is at or beyond the end of the input buffer
+  CUDA_HOST_DEVICE_CALLABLE bool eof() { return eof(pos); }
+
+  // advance the cursor past whitespace. returns true if the cursor ends up on a
+  // non-whitespace character, false if the end of the input was reached.
+  CUDA_HOST_DEVICE_CALLABLE bool parse_whitespace()
+  {
+    while (!eof()) {
+      if (is_whitespace(*pos)) {
+        pos++;
+      } else {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // parse a string delimited by `quote`.
+  // - on success, `str` refers to the raw characters between the quotes (escape
+  //   sequences are left as-is) and the cursor is just past the closing quote.
+  // - if there is no opening quote, or the string is never closed, `str` is empty and
+  //   the result is EMPTY when `can_be_empty` is true, ERROR otherwise.
+  CUDA_HOST_DEVICE_CALLABLE parse_result parse_string(string_view& str,
+                                                      bool can_be_empty,
+                                                      char quote)
+  {
+    str = string_view(nullptr, 0);
+
+    if (parse_whitespace() && *pos == quote) {
+      const char* start = ++pos;
+      while (!eof()) {
+        if (*pos == '\\') {
+          // a backslash escapes the next character, so an escaped quote (e.g. \")
+          // must not be mistaken for the end of the string
+          pos++;
+          if (eof()) { break; }
+        } else if (*pos == quote) {
+          str = string_view(start, pos - start);
+          pos++;
+          return parse_result::SUCCESS;
+        }
+        pos++;
+      }
+    }
+
+    return can_be_empty ? parse_result::EMPTY : parse_result::ERROR;
+  }
+
+  // a name means:
+  // - a string followed by a :
+  // - no string
+  // returns SUCCESS when a non-empty name and its ':' were consumed, EMPTY when no
+  // name was found, and ERROR on malformed input.
+  CUDA_HOST_DEVICE_CALLABLE parse_result parse_name(string_view& name,
+                                                    bool can_be_empty,
+                                                    char quote)
+  {
+    if (parse_string(name, can_be_empty, quote) == parse_result::ERROR) {
+      return parse_result::ERROR;
+    }
+
+    // if we got a real string, the next char must be a :
+    if (name.size_bytes() > 0) {
+      if (!parse_whitespace()) { return parse_result::ERROR; }
+      if (*pos == ':') {
+        pos++;
+        return parse_result::SUCCESS;
+      }
+    }
+    return parse_result::EMPTY;
+  }
+
+  // numbers, true, false, null.
+  // this function is not particularly strong. badly formed values will get
+  // consumed without throwing any errors
+  CUDA_HOST_DEVICE_CALLABLE parse_result parse_non_string_value(string_view& val)
+  {
+    if (!parse_whitespace()) { return parse_result::ERROR; }
+
+    // parse to the end of the value
+    char const* start = pos;
+    char const* end = start;
+    while (!eof(end)) {
+      char const c = *end;
+      if (c == ',' || c == '}' || c == ']' || is_whitespace(c)) { break; }
+
+      // illegal chars
+      if (c == '[' || c == '{' || c == ':' || c == '\"') { return parse_result::ERROR; }
+      end++;
+    }
+    pos = end;
+
+    val = string_view(start, end - start);
+
+    return parse_result::SUCCESS;
+  }
+
+ protected:
+  char const* input;  // start of the buffer being parsed
+  int64_t input_len;  // length of the buffer in bytes
+  char const* pos;    // current cursor position within the buffer
+
+ private:
+  // treats every character value up to and including ' ' as whitespace.
+  // NOTE(review): where `char` is signed, bytes >= 0x80 (e.g. UTF-8 continuation
+  // bytes) also compare <= ' ' and are treated as whitespace - confirm this is intended.
+  CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c) { return c <= ' '; }
+};
+
+/**
+ * @brief Output buffer object. Used during the preprocess/size-computation step
+ * and the actual output step.
+ *
+ * There is an important distinction between two cases:
+ *
+ * - producing no output at all. that is, the query matched nothing in the input.
+ * - producing empty output. the query matched something in the input, but the
+ *   value of the result is an empty string.
+ *
+ * `output_len` is what distinguishes the two: it only receives a value once
+ * add_output() has been called, so an `output_len` without a value means that no
+ * output was produced, while a value of 0 means that the output is an empty string.
+ *
+ */
+struct json_output {
+  size_t output_max_len;
+  char* output;
+  thrust::optional<size_t> output_len;
+
+  // append `len` bytes to the output. when `output` is null nothing is copied and only
+  // the running length is updated (the size-computation case).
+  __device__ void add_output(const char* str, size_t len)
+  {
+    if (output != nullptr) { memcpy(output + output_len.value_or(0), str, len); }
+    output_len = output_len.value_or(0) + len;
+  }
+
+  __device__ void add_output(string_view const& str) { add_output(str.data(), str.size_bytes()); }
+};
+
+enum json_element_type { NONE, OBJECT, ARRAY, VALUE };
+
+/**
+ * @brief Parsing class that holds the current state of the json to be parsed and provides
+ * functions for navigating through it.
+ */
+class json_state : private parser {
+ public:
+  __device__ json_state()
+    : parser(),
+      cur_el_start(nullptr),
+      cur_el_type(json_element_type::NONE),
+      parent_el_type(json_element_type::NONE)
+  {
+  }
+  __device__ json_state(const char* _input, int64_t _input_len)
+    : parser(_input, _input_len),
+      cur_el_start(nullptr),
+      cur_el_type(json_element_type::NONE),
+      parent_el_type(json_element_type::NONE)
+  {
+  }
+
+  // copies the complete cursor state, including the name of the current element
+  __device__ json_state(json_state const& j)
+    : parser(j),
+      cur_el_start(j.cur_el_start),
+      cur_el_name(j.cur_el_name),
+      cur_el_type(j.cur_el_type),
+      parent_el_type(j.parent_el_type)
+  {
+  }
+
+  // retrieve the entire current element into the output.
+  // - `output` may be null, in which case the element is only skipped over.
+  // - `list_element` disables the quote stripping applied to string values.
+  // on success the cursor is left after the element and an optional trailing comma.
+  __device__ parse_result extract_element(json_output* output, bool list_element)
+  {
+    char const* start = cur_el_start;
+    char const* end = start;
+
+    // if we're a value type, do a simple value parse.
+    if (cur_el_type == VALUE) {
+      pos = cur_el_start;
+      if (parse_value() != parse_result::SUCCESS) { return parse_result::ERROR; }
+      end = pos;
+
+      // SPARK-specific behavior. if this is a non-list-element wrapped in quotes,
+      // strip them. we may need to make this behavior configurable in some way
+      // later on.
+      if (!list_element && *start == '\"' && *(end - 1) == '\"') {
+        start++;
+        end--;
+      }
+    }
+    // otherwise, march through everything inside
+    else {
+      int obj_count = 0;
+      int arr_count = 0;
+
+      while (!eof(end)) {
+        // could do some additional checks here. we know our current
+        // element type, so we could be more strict on what kinds of
+        // characters we expect to see.
+        char const c = *end++;
+        if (c == '\"') {
+          // braces and brackets inside a string literal are data, not structure.
+          // jump to the closing quote, stepping over escaped characters such as \"
+          while (!eof(end) && *end != '\"') {
+            if (*end == '\\' && !eof(end + 1)) { end++; }
+            end++;
+          }
+          // the string was never closed
+          if (eof(end)) { return parse_result::ERROR; }
+          end++;
+        } else {
+          switch (c) {
+            case '{': obj_count++; break;
+            case '}': obj_count--; break;
+            case '[': arr_count++; break;
+            case ']': arr_count--; break;
+            default: break;
+          }
+        }
+        if (obj_count == 0 && arr_count == 0) { break; }
+      }
+      if (obj_count > 0 || arr_count > 0) { return parse_result::ERROR; }
+      pos = end;
+    }
+
+    // parse trailing ,
+    if (parse_whitespace()) {
+      if (*pos == ',') { pos++; }
+    }
+
+    if (output != nullptr) { output->add_output({start, static_cast<size_type>(end - start)}); }
+    return parse_result::SUCCESS;
+  }
+
+  // skip the next element
+  __device__ parse_result skip_element() { return extract_element(nullptr, false); }
+
+  // advance to the next element
+  __device__ parse_result next_element() { return next_element_internal(false); }
+
+  // advance inside the current element.
+  // fails with ERROR if `expected_type` is not NONE and differs from the type of the
+  // current element.
+  __device__ parse_result child_element(json_element_type expected_type)
+  {
+    if (expected_type != NONE && cur_el_type != expected_type) { return parse_result::ERROR; }
+
+    // if we succeed, record our parent element type.
+    auto const prev_el_type = cur_el_type;
+    auto const result = next_element_internal(true);
+    if (result == parse_result::SUCCESS) { parent_el_type = prev_el_type; }
+    return result;
+  }
+
+  // return the next element that matches the specified name.
+  // `inclusive` controls whether the current element is itself a candidate.
+  // the name "*" matches any element.
+  __device__ parse_result next_matching_element(string_view const& name, bool inclusive)
+  {
+    // if we're not including the current element, skip it
+    if (!inclusive) {
+      parse_result result = next_element_internal(false);
+      if (result != parse_result::SUCCESS) { return result; }
+    }
+    // loop until we find a match or there's nothing left
+    do {
+      // wildcard matches anything
+      if (name.size_bytes() == 1 && name.data()[0] == '*') {
+        return parse_result::SUCCESS;
+      } else if (cur_el_name == name) {
+        return parse_result::SUCCESS;
+      }
+
+      // next
+      parse_result result = next_element_internal(false);
+      if (result != parse_result::SUCCESS) { return result; }
+    } while (1);
+
+    return parse_result::ERROR;
+  }
+
+ private:
+  // parse a value - either a string or a number/null/bool
+  __device__ parse_result parse_value()
+  {
+    if (!parse_whitespace()) { return parse_result::ERROR; }
+
+    // string or number?
+    string_view unused;
+    return *pos == '\"' ? parse_string(unused, false, '\"') : parse_non_string_value(unused);
+  }
+
+  // shared implementation of next_element() (child == false) and
+  // child_element() (child == true).
+  // returns EMPTY when there is nothing further to visit at this level.
+  __device__ parse_result next_element_internal(bool child)
+  {
+    // if we're not getting a child element, skip the current element.
+    // this will leave pos as the first character -after- the close of
+    // the current element
+    if (!child && cur_el_start != nullptr) {
+      if (skip_element() == parse_result::ERROR) { return parse_result::ERROR; }
+      cur_el_start = nullptr;
+    }
+    // otherwise pos will be at the first character within the current element
+
+    // can only get the child of an object or array.
+    // this could theoretically be handled as an error, but the evaluators I've found
+    // seem to treat this as "it's nothing"
+    if (child && (cur_el_type == VALUE || cur_el_type == NONE)) { return parse_result::EMPTY; }
+
+    // what's next
+    if (!parse_whitespace()) { return parse_result::EMPTY; }
+    // if we're closing off a parent element, we're done
+    char const c = *pos;
+    if (c == ']' || c == '}') { return parse_result::EMPTY; }
+
+    // if we're not accessing elements of an array, check for name.
+    bool const array_access =
+      (cur_el_type == ARRAY && child) || (parent_el_type == ARRAY && !child);
+    if (!array_access && parse_name(cur_el_name, true, '\"') == parse_result::ERROR) {
+      return parse_result::ERROR;
+    }
+
+    // element type
+    if (!parse_whitespace()) { return parse_result::EMPTY; }
+    switch (*pos++) {
+      case '[': cur_el_type = ARRAY; break;
+      case '{': cur_el_type = OBJECT; break;
+
+      case ',':
+      case ':':
+      case '\'': return parse_result::ERROR;
+
+      // value type
+      default: cur_el_type = VALUE; break;
+    }
+
+    // the start of the current element is always at the value, not the name
+    cur_el_start = pos - 1;
+    return parse_result::SUCCESS;
+  }
+
+  const char* cur_el_start;          // pointer to the first character of the -value- of the current
+                                     // element - not the name
+  string_view cur_el_name;           // name of the current element (if applicable)
+  json_element_type cur_el_type;     // type of the current element
+  json_element_type parent_el_type;  // parent element type
+};
+
+enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END };
+
+/**
+ * @brief A "command" operator used to query a json string.
A full query is
+ * an array of these operators applied to the incoming json string.
+ */
+struct path_operator {
+  // default state: an ERROR operator with no index that accepts any element type
+  CUDA_HOST_DEVICE_CALLABLE path_operator()
+    : type{path_operator_type::ERROR}, expected_type{NONE}, index{-1}
+  {
+  }
+  // operator of the given type with no index; `_expected_type` defaults to NONE
+  CUDA_HOST_DEVICE_CALLABLE path_operator(path_operator_type _type,
+                                          json_element_type _expected_type = NONE)
+    : type{_type}, expected_type{_expected_type}, index{-1}
+  {
+  }
+
+  // which operation to perform
+  path_operator_type type;
+  // the kind of element this operator may be applied to.
+  // for example:
+  // - a subscripted field (eg [5]) cannot be retrieved from an object.
+  // - a field cannot be retrieved by name (eg .book) from an array.
+  // - .* -can- be used for both arrays and objects
+  // a value of NONE implies any type is accepted
+  json_element_type expected_type;
+  // name to match against (if applicable)
+  string_view name;
+  // index for the subscript operator
+  int index;
+};
+
+/**
+ * @brief Parsing class that holds the current state of the JSONPath string to be parsed
+ * and provides functions for navigating through it. This is only called on the host
+ * during the preprocess step which builds a command buffer that the gpu uses.
+ */ +class path_state : private parser { + public: + path_state(const char* _path, size_t _path_len) : parser(_path, _path_len) {} + + // get the next operator in the JSONPath string + path_operator get_next_operator() + { + if (eof()) { return {path_operator_type::END}; } + + switch (*pos++) { + case '$': return {path_operator_type::ROOT}; + + case '.': { + path_operator op; + string_view term{".[", 2}; + if (parse_path_name(op.name, term)) { + // this is another potential use case for __SPARK_BEHAVIORS / configurability + // Spark currently only handles the wildcard operator inside [*], it does + // not handle .* + if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') { + op.type = path_operator_type::CHILD_WILDCARD; + op.expected_type = NONE; + } else { + op.type = path_operator_type::CHILD; + op.expected_type = OBJECT; + } + return op; + } + } break; + + // 3 ways this can be used + // indices: [0] + // name: ['book'] + // wildcard: [*] + case '[': { + path_operator op; + string_view term{"]", 1}; + bool const is_string = *pos == '\'' ? 
true : false; + if (parse_path_name(op.name, term)) { + pos++; + if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') { + op.type = path_operator_type::CHILD_WILDCARD; + op.expected_type = NONE; + } else { + if (is_string) { + op.type = path_operator_type::CHILD; + op.expected_type = OBJECT; + } else { + op.type = path_operator_type::CHILD_INDEX; + op.index = cudf::io::parse_numeric( + op.name.data(), op.name.data() + op.name.size_bytes(), json_opts, -1); + CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath"); + op.expected_type = ARRAY; + } + } + return op; + } + } break; + + // wildcard operator + case '*': { + pos++; + return path_operator{path_operator_type::CHILD_WILDCARD}; + } break; + + default: CUDF_FAIL("Unrecognized JSONPath operator"); break; + } + return {path_operator_type::ERROR}; + } + + private: + cudf::io::parse_options_view json_opts{',', '\n', '\"', '.'}; + + bool parse_path_name(string_view& name, string_view const& terminators) + { + switch (*pos) { + case '*': + name = string_view(pos, 1); + pos++; + break; + + case '\'': + if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; } + break; + + default: { + size_t const chars_left = input_len - (pos - input); + char const* end = std::find_first_of( + pos, pos + chars_left, terminators.data(), terminators.data() + terminators.size_bytes()); + if (end) { + name = string_view(pos, end - pos); + pos = end; + } else { + name = string_view(pos, chars_left); + pos = input + input_len; + } + break; + } + } + + // an empty name is not valid + CUDF_EXPECTS(name.size_bytes() > 0, "Invalid empty name in JSONPath query string"); + + return true; + } +}; + +/** + * @brief Preprocess the incoming JSONPath string on the host to generate a + * command buffer for use by the GPU. 
+ * + * @param json_path The incoming json path + * @param stream Cuda stream to perform any gpu actions on + * @returns A pair containing the command buffer, and maximum stack depth required. + */ +std::pair>, int> build_command_buffer( + cudf::string_scalar const& json_path, rmm::cuda_stream_view stream) +{ + std::string h_json_path = json_path.to_string(stream); + path_state p_state(h_json_path.data(), static_cast(h_json_path.size())); + + std::vector h_operators; + + path_operator op; + int max_stack_depth = 1; + do { + op = p_state.get_next_operator(); + if (op.type == path_operator_type::ERROR) { + CUDF_FAIL("Encountered invalid JSONPath input string"); + } + if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; } + // convert pointer to device pointer + if (op.name.size_bytes() > 0) { + op.name = + string_view(json_path.data() + (op.name.data() - h_json_path.data()), op.name.size_bytes()); + } + if (op.type == path_operator_type::ROOT) { + CUDF_EXPECTS(h_operators.size() == 0, "Root operator ($) can only exist at the root"); + } + // if we havent' gotten a root operator to start, and we're not empty, quietly push a + // root operator now. + if (h_operators.size() == 0 && op.type != path_operator_type::ROOT && + op.type != path_operator_type::END) { + h_operators.push_back(path_operator{path_operator_type::ROOT}); + } + h_operators.push_back(op); + } while (op.type != path_operator_type::END); + + auto const is_empty = h_operators.size() == 1 && h_operators[0].type == path_operator_type::END; + return is_empty + ? 
std::make_pair(thrust::nullopt, 0) + : std::make_pair( + thrust::make_optional(cudf::detail::make_device_uvector_sync(h_operators, stream)), + max_stack_depth); +} + +#define PARSE_TRY(_x) \ + do { \ + last_result = _x; \ + if (last_result == parse_result::ERROR) { return parse_result::ERROR; } \ + } while (0) + +/** + * @brief Parse a single json string using the provided command buffer + * + * @param j_state The incoming json string and associated parser + * @param commands The command buffer to be applied to the string. Always ends with a + * path_operator_type::END + * @param output Buffer user to store the results of the query + * @returns A result code indicating success/fail/empty. + */ +template +__device__ parse_result parse_json_path(json_state& j_state, + path_operator const* commands, + json_output& output) +{ + // manually maintained context stack in lieu of calling parse_json_path recursively. + struct context { + json_state j_state; + path_operator const* commands; + bool list_element; + bool state_flag; + }; + context stack[max_command_stack_depth]; + int stack_pos = 0; + auto push_context = [&stack, &stack_pos](json_state const& _j_state, + path_operator const* _commands, + bool _list_element = false, + bool _state_flag = false) { + if (stack_pos == max_command_stack_depth - 1) { return false; } + stack[stack_pos++] = context{_j_state, _commands, _list_element, _state_flag}; + return true; + }; + auto pop_context = [&stack, &stack_pos](context& c) { + if (stack_pos > 0) { + c = stack[--stack_pos]; + return true; + } + return false; + }; + push_context(j_state, commands, false); + + parse_result last_result = parse_result::SUCCESS; + context ctx; + int element_count = 0; + while (pop_context(ctx)) { + path_operator op = *ctx.commands; + + switch (op.type) { + // whatever the first object is + case path_operator_type::ROOT: + PARSE_TRY(ctx.j_state.next_element()); + push_context(ctx.j_state, ctx.commands + 1); + break; + + // .name + // ['name'] + // 
[1] + // will return a single thing + case path_operator_type::CHILD: { + PARSE_TRY(ctx.j_state.child_element(op.expected_type)); + if (last_result == parse_result::SUCCESS) { + PARSE_TRY(ctx.j_state.next_matching_element(op.name, true)); + if (last_result == parse_result::SUCCESS) { + push_context(ctx.j_state, ctx.commands + 1, ctx.list_element); + } + } + } break; + + // .* + // [*] + // will return an array of things + case path_operator_type::CHILD_WILDCARD: { + // if we're on the first element of this wildcard + if (!ctx.state_flag) { + // we will only ever be returning 1 array + if (!ctx.list_element) { output.add_output({"[" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); } + + // step into the child element + PARSE_TRY(ctx.j_state.child_element(op.expected_type)); + if (last_result == parse_result::EMPTY) { + if (!ctx.list_element) { + output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); + } + last_result = parse_result::SUCCESS; + break; + } + + // first element + PARSE_TRY(ctx.j_state.next_matching_element({"*", 1}, true)); + if (last_result == parse_result::EMPTY) { + if (!ctx.list_element) { + output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); + } + last_result = parse_result::SUCCESS; + break; + } + + // re-push ourselves + push_context(ctx.j_state, ctx.commands, ctx.list_element, true); + // push the next command + push_context(ctx.j_state, ctx.commands + 1, true); + } else { + // next element + PARSE_TRY(ctx.j_state.next_matching_element({"*", 1}, false)); + if (last_result == parse_result::EMPTY) { + if (!ctx.list_element) { + output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); + } + last_result = parse_result::SUCCESS; + break; + } + + // re-push ourselves + push_context(ctx.j_state, ctx.commands, ctx.list_element, true); + // push the next command + push_context(ctx.j_state, ctx.commands + 1, true); + } + } break; + + // [0] + // [1] + // etc + // returns a single thing + case path_operator_type::CHILD_INDEX: { + 
PARSE_TRY(ctx.j_state.child_element(op.expected_type)); + if (last_result == parse_result::SUCCESS) { + string_view const any{"*", 1}; + PARSE_TRY(ctx.j_state.next_matching_element(any, true)); + if (last_result == parse_result::SUCCESS) { + int idx; + for (idx = 1; idx <= op.index; idx++) { + PARSE_TRY(ctx.j_state.next_matching_element(any, false)); + if (last_result == parse_result::EMPTY) { break; } + } + // if we didn't end up at the index we requested, this is an invalid index + if (idx - 1 != op.index) { return parse_result::ERROR; } + push_context(ctx.j_state, ctx.commands + 1, ctx.list_element); + } + } + } break; + + // some sort of error. + case path_operator_type::ERROR: return parse_result::ERROR; break; + + // END case + default: { + if (ctx.list_element && element_count > 0) { + output.add_output({"," DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); + } + PARSE_TRY(ctx.j_state.extract_element(&output, ctx.list_element)); + if (ctx.list_element && last_result != parse_result::EMPTY) { element_count++; } + } break; + } + } + + return parse_result::SUCCESS; +} + +// hardcoding this for now. to reach a stack depth of 8 would require +// a JSONPath containing 7 nested wildcards so this is probably reasonable. +constexpr int max_command_stack_depth = 8; + +/** + * @brief Parse a single json string using the provided command buffer + * + * This function exists primarily as a shim for debugging purposes. + * + * @param input The incoming json string + * @param input_len Size of the incoming json string + * @param commands The command buffer to be applied to the string. Always ends with a + * path_operator_type::END + * @param out_buf Buffer user to store the results of the query (nullptr in the size computation + * step) + * @param out_buf_size Size of the output buffer + * @returns A pair containing the result code the output buffer. 
+ */ +__device__ thrust::pair get_json_object_single( + char const* input, + size_t input_len, + path_operator const* const commands, + char* out_buf, + size_t out_buf_size) +{ + json_state j_state(input, input_len); + json_output output{out_buf_size, out_buf}; + + auto const result = parse_json_path(j_state, commands, output); + + return {result, output}; +} + +/** + * @brief Kernel for running the JSONPath query. + * + * This kernel operates in a 2-pass way. On the first pass, it computes + * output sizes. On the second pass it fills in the provided output buffers + * (chars and validity) + * + * @param col Device view of the incoming string + * @param commands JSONPath command buffer + * @param output_offsets Buffer used to store the string offsets for the results of the query + * @param out_buf Buffer used to store the results of the query + * @param out_validity Output validity buffer + * @param out_valid_count Output count of # of valid bits + */ +template +__launch_bounds__(block_size) __global__ + void get_json_object_kernel(column_device_view col, + path_operator const* const commands, + offset_type* output_offsets, + thrust::optional out_buf, + thrust::optional out_validity, + thrust::optional out_valid_count) +{ + size_type tid = threadIdx.x + (blockDim.x * blockIdx.x); + size_type stride = blockDim.x * gridDim.x; + + if (out_valid_count.has_value()) { *(out_valid_count.value()) = 0; } + size_type warp_valid_count{0}; + + auto active_threads = __ballot_sync(0xffffffff, tid < col.size()); + while (tid < col.size()) { + bool is_valid = false; + string_view const str = col.element(tid); + size_type output_size = 0; + if (str.size_bytes() > 0) { + char* dst = out_buf.has_value() ? out_buf.value() + output_offsets[tid] : nullptr; + size_t const dst_size = + out_buf.has_value() ? 
output_offsets[tid + 1] - output_offsets[tid] : 0; + + parse_result result; + json_output out; + thrust::tie(result, out) = + get_json_object_single(str.data(), str.size_bytes(), commands, dst, dst_size); + output_size = out.output_len.value_or(0); + if (out.output_len.has_value() && result == parse_result::SUCCESS) { is_valid = true; } + } + + // filled in only during the precompute step. during the compute step, the offsets + // are fed back in so we do -not- want to write them out + if (!out_buf.has_value()) { output_offsets[tid] = static_cast(output_size); } + + // validity filled in only during the output step + if (out_validity.has_value()) { + uint32_t mask = __ballot_sync(active_threads, is_valid); + // 0th lane of the warp writes the validity + if (!(tid % cudf::detail::warp_size)) { + out_validity.value()[cudf::word_index(tid)] = mask; + warp_valid_count += __popc(mask); + } + } + + tid += stride; + active_threads = __ballot_sync(active_threads, tid < col.size()); + } + + // sum the valid counts across the whole block + if (out_valid_count) { + size_type block_valid_count = + cudf::detail::single_lane_block_sum_reduce(warp_valid_count); + if (threadIdx.x == 0) { atomicAdd(out_valid_count.value(), block_valid_count); } + } +} + +/** + * @copydoc cudf::strings::detail::get_json_object + */ +std::unique_ptr get_json_object(cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // preprocess the json_path into a command buffer + auto preprocess = build_command_buffer(json_path, stream); + CUDF_EXPECTS(std::get<1>(preprocess) <= max_command_stack_depth, + "Encountered JSONPath string that is too complex"); + + // allocate output offsets buffer. 
+ auto offsets = cudf::make_fixed_width_column( + data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr); + cudf::mutable_column_view offsets_view(*offsets); + + // if the query is empty, return a string column containing all nulls + if (!std::get<0>(preprocess).has_value()) { + return std::make_unique( + data_type{type_id::STRING}, + col.size(), + rmm::device_buffer{0, stream, mr}, // no data + cudf::detail::create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr), + col.size()); // null count + } + + constexpr int block_size = 512; + cudf::detail::grid_1d const grid{col.size(), block_size}; + + auto cdv = column_device_view::create(col.parent(), stream); + + // preprocess sizes (returned in the offsets buffer) + get_json_object_kernel + <<>>( + *cdv, + std::get<0>(preprocess).value().data(), + offsets_view.head(), + thrust::nullopt, + thrust::nullopt, + thrust::nullopt); + + // convert sizes to offsets + thrust::exclusive_scan(rmm::exec_policy(stream), + offsets_view.head(), + offsets_view.head() + col.size() + 1, + offsets_view.head(), + 0); + size_type const output_size = + cudf::detail::get_value(offsets_view, col.size(), stream); + + // allocate output string column + auto chars = cudf::make_fixed_width_column( + data_type{type_id::INT8}, output_size, mask_state::UNALLOCATED, stream, mr); + + // potential optimization : if we know that all outputs are valid, we could skip creating + // the validity mask altogether + rmm::device_buffer validity = + cudf::detail::create_null_mask(col.size(), mask_state::UNINITIALIZED, stream, mr); + + // compute results + cudf::mutable_column_view chars_view(*chars); + rmm::device_scalar d_valid_count{0, stream}; + get_json_object_kernel + <<>>( + *cdv, + std::get<0>(preprocess).value().data(), + offsets_view.head(), + chars_view.head(), + static_cast(validity.data()), + d_valid_count.data()); + + return make_strings_column(col.size(), + std::move(offsets), + std::move(chars), + col.size() - 
d_valid_count.value(), + std::move(validity), + stream, + mr); +} + +} // namespace +} // namespace detail + +/** + * @copydoc cudf::strings::get_json_object + */ +std::unique_ptr get_json_object(cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::get_json_object(col, json_path, 0, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 082f039054e..f9904dda49e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -334,6 +334,7 @@ ConfigureTest(STRINGS_TEST strings/hash_string.cu strings/integers_tests.cu strings/ipv4_tests.cpp + strings/json_tests.cpp strings/pad_tests.cpp strings/replace_regex_tests.cpp strings/replace_tests.cpp diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp new file mode 100644 index 00000000000..44eb35d4163 --- /dev/null +++ b/cpp/tests/strings/json_tests.cpp @@ -0,0 +1,761 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include + +// reference: https://jsonpath.herokuapp.com/ + +// clang-format off +std::string json_string{ + "{" + "\"store\": {""\"book\": [" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "\"bicycle\": {" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "}," + "\"expensive\": 10" + "}" +}; +// clang-format on + +std::unique_ptr drop_whitespace(cudf::column_view const& col) +{ + cudf::test::strings_column_wrapper whitespace{"\n", "\r", "\t"}; + cudf::test::strings_column_wrapper repl{"", "", ""}; + + cudf::strings_column_view strings(col); + cudf::strings_column_view targets(whitespace); + cudf::strings_column_view replacements(repl); + return cudf::strings::replace(strings, targets, replacements); +} + +struct JsonTests : public cudf::test::BaseFixture { +}; + +TEST_F(JsonTests, GetJsonObjectRootOp) +{ + // root + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + auto expected = drop_whitespace(input); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); +} + +TEST_F(JsonTests, GetJsonObjectChildOp) +{ + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store"); + auto result_raw = 
cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "{" + "\"book\": [" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "\"bicycle\": {" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "}" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "[" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. 
Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectWildcardOp) +{ + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.*"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "[" + "[" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. 
Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "{" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "]" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("*"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "[" + "{" + "\"book\": [" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. 
Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "\"bicycle\": {" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "}," + "10" + "]" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectSubscriptOp) +{ + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[2]"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store['bicycle']"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "{" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*]"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "[" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," 
+ "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectFilter) +{ + // queries that result in filtering/collating results (mostly meaning - generates new + // json instead of just returning parts of the existing string + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*]['isbn']"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{"[\"0-553-21311-3\",\"0-395-19395-8\"]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*].category"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{ + "[\"reference\",\"fiction\",\"fiction\",\"fiction\"]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*].title"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), 
json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{ + "[\"Sayings of the Century\",\"Sword of Honour\",\"Moby Dick\",\"The Lord of the Rings\"]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book.*.price"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{"[8.95,12.99,8.99,22.99]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + // spark behavioral difference. + // standard: "fiction" + // spark: fiction + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[2].category"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{"fiction"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectNullInputs) +{ + { + std::string str("{\"a\" : \"b\"}"); + cudf::test::strings_column_wrapper input({str, str, str, str}, {1, 0, 1, 0}); + + std::string json_path("$.a"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw({"b", "", "b", ""}, {1, 0, 1, 0}); + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectEmptyQuery) +{ + // empty query -> null + { + cudf::test::strings_column_wrapper input{"{\"a\" : \"b\"}"}; + std::string 
json_path(""); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} + +TEST_F(JsonTests, GetJsonObjectEmptyInputsAndOutputs) +{ + // empty input -> null + { + cudf::test::strings_column_wrapper input{""}; + std::string json_path("$"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // slightly different from "empty output". in this case, we're + // returning something, but it happens to be empty. so we expect + // a valid, but empty row + { + cudf::test::strings_column_wrapper input{"{\"store\": { \"bicycle\" : \"\" } }"}; + std::string json_path("$.store.bicycle"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} + +// badly formed JSONpath strings +TEST_F(JsonTests, GetJsonObjectIllegalQuery) +{ + // can't have more than one root operator, or a root operator anywhere other + // than the beginning + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$$"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // invalid index + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[auh46h-]"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // invalid index + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string 
json_path("$[[]]"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // negative index + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[-1]"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // child operator with no name specified + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("."); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("]["); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("6hw6,56i3"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } +} + +// queries that are legal, but reference invalid parts of the input +TEST_F(JsonTests, GetJsonObjectInvalidQuery) +{ + // non-existent field + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[*].c"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // non-existent field + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[*].c[2]"); + auto result = 
cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // non-existent field (".price" requested directly on "store.book") + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book.price"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // out of bounds index + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[4]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} + +TEST_F(JsonTests, MixedOutput) +{ + // various queries on a column whose rows differ in structure, so results mix valid and null rows: + // clang-format off + std::vector input_strings { + "{\"a\": {\"b\" : \"c\"}}", + + "{" + "\"a\": {\"b\" : \"c\"}," + "\"d\": [{\"e\":123}, {\"f\":-10}]" + "}", + + "{" + "\"b\": 123" + "}", + + "{" + "\"a\": [\"y\",500]" + "}", + + "{" + "\"a\": \"\"" + "}", + + "{" + "\"a\": {" + "\"z\": {\"i\": 10, \"j\": 100}," + "\"b\": [\"c\",null,true,-1]" + "}" + "}" + }; + // clang-format on + cudf::test::strings_column_wrapper input(input_strings.begin(), input_strings.end()); + { // 0-based row 2 has no "a" (null); row 4 has "a": "" (valid but empty) + std::string json_path("$.a"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "{\"b\" : \"c\"}", + "{\"b\" : \"c\"}", + "", + "[\"y\",500]", + "", + "{" + "\"z\": {\"i\": 10, \"j\": 100}," + "\"b\": [\"c\",null,true,-1]" + "}" + }, + {1, 1, 0, 1, 1, 1}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + { // only 0-based row 3 has an array under "a", so it is the only row expected to yield a value + std::string json_path("$.a[1]"); + auto result =
cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "", + "", + "", + "500", + "", + "", + }, + {0, 0, 0, 1, 0, 0}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + { // string values are returned without their enclosing quotes; the array in the last row is returned verbatim + std::string json_path("$.a.b"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "c", + "c", + "", + "", + "", + "[\"c\",null,true,-1]"}, + {1, 1, 0, 0, 0, 1}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + { // wildcard results are wrapped in a JSON array; the empty-string "a" (0-based row 4) gives "[]" + std::string json_path("$.a[*]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "[\"c\"]", + "[\"c\"]", + "", + "[\"y\",500]", + "[]", + "[" + "{\"i\": 10, \"j\": 100}," + "[\"c\",null,true,-1]" + "]" }, + {1, 1, 0, 1, 1, 1}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + { // a wildcard applied to the plain string "c" gives an empty array "[]" + std::string json_path("$.a.b[*]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "[]", + "[]", + "", + "", + "", + "[\"c\",null,true,-1]"}, + {1, 1, 0, 0, 0, 1}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 78a67464654..a54c86405a5 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -71,7 +71,7 @@ struct column_property_comparator { // equivalent, but not exactly equal columns can have a different number of children if their // sizes are both 0. Specifically, empty string columns may or may not have children.
- if (check_exact_equality || lhs.size() > 0) { + if (check_exact_equality || (lhs.size() > 0 && lhs.null_count() < lhs.size())) { /* when only equivalence is checked, child counts are compared only if lhs has at least one non-null row */ EXPECT_EQ(lhs.num_children(), rhs.num_children()); } } } diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 5d869ab75fb..402c64dd83d 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2083,6 +2083,23 @@ public final ColumnVector substring(ColumnView start, ColumnView end) { return new ColumnVector(substringColumn(getNativeView(), start.getNativeView(), end.getNativeView())); } + /** + * Apply a JSONPath string to all rows in an input strings column. + * + * Applies a JSONPath string to an incoming strings column where each row in the column + * is a valid json string. The output is returned by row as a strings column. + * + * For reference, https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html + * Note: Only implements the operators: $ . [] * + * + * @param path Scalar holding the JSONPath string to be applied to each row + * @return new strings ColumnVector containing the retrieved json object strings, one per input row + */ + public final ColumnVector getJSONObject(Scalar path) { + assert(type.equals(DType.STRING)) : "column type must be a String"; + return new ColumnVector(getJSONObject(getNativeView(), path.getScalarHandle())); + } + /** * Returns a new strings column where target string within each string is replaced with the specified * replacement string. @@ -2649,6 +2666,8 @@ static DeviceMemoryBufferView getOffsetsBuffer(long viewHandle) { */ private static native long stringTimestampToTimestamp(long viewHandle, int unit, String format); + private static native long getJSONObject(long viewHandle, long scalarHandle) throws CudfException; /* native side: viewHandle is this column's native view, scalarHandle the native handle of the path Scalar; returns the handle of the new column */ + /** * Native method to parse and convert a timestamp column vector to string column vector.
A unix * timestamp is a long value representing how many units since 1970-01-01 00:00:00:000 in either diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index dc1acc50b5f..cec3a1a92a6 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -65,6 +66,8 @@ #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" +#include "jni.h" +#include "jni_utils.hpp" namespace { @@ -1835,4 +1838,24 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv } CATCH_STD(env, 0) } + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env, jclass, + jlong j_view_handle, jlong j_scalar_handle) { /* JNI bridge for ColumnView.getJSONObject: applies the JSONPath held in a string scalar to a strings column */ + + JNI_NULL_CHECK(env, j_view_handle, "view cannot be null", 0); + JNI_NULL_CHECK(env, j_scalar_handle, "path cannot be null", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::column_view* n_column_view = reinterpret_cast(j_view_handle); + cudf::strings_column_view n_strings_col_view(*n_column_view); + cudf::string_scalar *n_scalar_path = reinterpret_cast(j_scalar_handle); + + auto result = cudf::strings::get_json_object(n_strings_col_view, *n_scalar_path); + + return reinterpret_cast(result.release()); /* the released column pointer is handed back to Java as a jlong handle */ + } + CATCH_STD(env, 0) + +} } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index fe1cba5ceb1..ce2c287a1c8 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4132,6 +4132,50 @@ void testCopyToColumnVector() { } } + @Test + void testGetJSONObject() { + String jsonString = "{ \"store\": {\n" + + " \"book\": [\n" + + " { \"category\": \"reference\",\n" + + " \"author\": \"Nigel Rees\",\n" + + " \"title\": \"Sayings of the Century\",\n" + + " \"price\": 8.95\n" + + " },\n" + + " {
\"category\": \"fiction\",\n" + + " \"author\": \"Evelyn Waugh\",\n" + + " \"title\": \"Sword of Honour\",\n" + + " \"price\": 12.99\n" + + " },\n" + + " { \"category\": \"fiction\",\n" + + " \"author\": \"Herman Melville\",\n" + + " \"title\": \"Moby Dick\",\n" + + " \"isbn\": \"0-553-21311-3\",\n" + + " \"price\": 8.99\n" + + " },\n" + + " { \"category\": \"fiction\",\n" + + " \"author\": \"J. R. R. Tolkien\",\n" + + " \"title\": \"The Lord of the Rings\",\n" + + " \"isbn\": \"0-395-19395-8\",\n" + + " \"price\": 22.99\n" + + " }\n" + + " ],\n" + + " \"bicycle\": {\n" + + " \"color\": \"red\",\n" + + " \"price\": 19.95\n" + + " }\n" + + " }\n" + + "}"; + + try (ColumnVector json = ColumnVector.fromStrings(jsonString, jsonString); + ColumnVector expectedAuthors = ColumnVector.fromStrings("[\"Nigel Rees\",\"Evelyn " + + "Waugh\",\"Herman Melville\",\"J. R. R. Tolkien\"]", "[\"Nigel Rees\",\"Evelyn " + + "Waugh\",\"Herman Melville\",\"J. R. R. Tolkien\"]"); + Scalar path = Scalar.fromString("$.store.book[*].author"); /* the wildcard gathers every book's author into one JSON array string per row */ + ColumnVector gotAuthors = json.getJSONObject(path)) { + assertColumnsAreEqual(expectedAuthors, gotAuthors); + } + } + @Test void testMakeStructEmpty() { final int numRows = 10;