From 6123d9aeece5773f34ffa4ed75910bacfc393460 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Mon, 3 May 2021 11:44:50 -0500 Subject: [PATCH 1/7] Add a flag for allowing single quotes in JSON strings. Modify get_json_object() to take an explicit options struct for controlling behaviors. --- cpp/include/cudf/strings/detail/json.hpp | 1 + cpp/include/cudf/strings/json.hpp | 80 ++++++++++- cpp/src/strings/json/json_path.cu | 172 +++++++++++++---------- cpp/tests/strings/json_tests.cpp | 118 +++++++++++++++- 4 files changed, 291 insertions(+), 80 deletions(-) diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/strings/detail/json.hpp index e6a0b49f102..a46f9cd87b9 100644 --- a/cpp/include/cudf/strings/detail/json.hpp +++ b/cpp/include/cudf/strings/detail/json.hpp @@ -32,6 +32,7 @@ namespace detail { std::unique_ptr get_json_object( cudf::strings_column_view const& col, cudf::string_scalar const& json_path, + thrust::optional options, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp index b39e4a2027c..bf95e9396c6 100644 --- a/cpp/include/cudf/strings/json.hpp +++ b/cpp/include/cudf/strings/json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #include +#include + namespace cudf { namespace strings { @@ -26,6 +28,78 @@ namespace strings { * @file */ +/** + * @brief Settings for `get_json_object()`. + */ +class get_json_object_options { + // allow single quotes to represent strings in JSON + bool allow_single_quotes = false; + + // individual string values are returned with quotes stripped. + bool strip_quotes_from_single_strings = false; + + public: + /** + * @brief Default constructor. + * + * This has been added since Cython requires a default constructor to create objects on stack. + */ + explicit get_json_object_options() = default; + + /** + * @brief Returns true/false depending on whether single-quotes for representing strings + * are allowed. + */ + CUDA_HOST_DEVICE_CALLABLE bool get_allow_single_quotes() const { return allow_single_quotes; } + + /** + * @brief Returns true/false depending on whether individually returned string values have + * their quotes stripped. + * + * When set to true, if the return value for a given row is an individual string + * (not an object, or an array of strings), strip the quotes from the string and return only the + * contents of the string itself. Example: + * + * @code{.pseudo} + * + * With strip_quotes_from_single_strings OFF: + * Input = {"a" : "b"} + * Query = $.a + * Output = "b" + * + * With strip_quotes_from_single_strings ON: + * Input = {"a" : "b"} + * Query = $.a + * Output = b + * + * @endcode + */ + CUDA_HOST_DEVICE_CALLABLE bool get_strip_quotes_from_single_strings() const + { + return strip_quotes_from_single_strings; + } + + /** + * @brief Set whether single-quotes for strings are allowed. + * + * @param _allow_single_quotes bool indicating desired behavior. + */ + void set_allow_single_quotes(bool _allow_single_quotes) + { + allow_single_quotes = _allow_single_quotes; + } + + /** + * @brief Set whether individually returned string values have their quotes stripped. + * + * @param _strip_quotes_from_single_strings bool indicating desired behavior. + */ + void set_strip_quotes_from_single_strings(bool _strip_quotes_from_single_strings) + { + strip_quotes_from_single_strings = _strip_quotes_from_single_strings; + } +}; + /** * @brief Apply a JSONPath string to all rows in an input strings column. * @@ -37,13 +111,15 @@ namespace strings { * * @param col The input strings column. Each row must contain a valid json string * @param json_path The JSONPath string to be applied to each row + * @param options Options for controlling the behavior of the function * @param mr Resource for allocating device memory. * @return New strings column containing the retrieved json object strings */ std::unique_ptr get_json_object( cudf::strings_column_view const& col, cudf::string_scalar const& json_path, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + thrust::optional options = thrust::nullopt, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 3b0290736ae..72d8c458cc1 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -97,80 +98,39 @@ class parser { return false; } + // if quote = 0, allow either ' or " CUDA_HOST_DEVICE_CALLABLE parse_result parse_string(string_view& str, bool can_be_empty, char quote) { str = string_view(nullptr, 0); - if (parse_whitespace() && *pos == quote) { - const char* start = ++pos; - while (!eof()) { - if (*pos == quote) { - str = string_view(start, pos - start); + if (parse_whitespace()) { + // if the user specifies 0 for quote, allow either ' or ". otherwise + // use the char directly + if ((quote == 0 && (*pos == '\'' || *pos == '\"')) || (quote == *pos)) { + quote = *pos; + + const char* start = ++pos; + while (!eof()) { + if (*pos == quote) { + str = string_view(start, pos - start); + pos++; + return parse_result::SUCCESS; + } pos++; - return parse_result::SUCCESS; } - pos++; } } return can_be_empty ? parse_result::EMPTY : parse_result::ERROR; } - // a name means: - // - a string followed by a : - // - no string - CUDA_HOST_DEVICE_CALLABLE parse_result parse_name(string_view& name, - bool can_be_empty, - char quote) - { - if (parse_string(name, can_be_empty, quote) == parse_result::ERROR) { - return parse_result::ERROR; - } - - // if we got a real string, the next char must be a : - if (name.size_bytes() > 0) { - if (!parse_whitespace()) { return parse_result::ERROR; } - if (*pos == ':') { - pos++; - return parse_result::SUCCESS; - } - } - return parse_result::EMPTY; - } - - // numbers, true, false, null. - // this function is not particularly strong. badly formed values will get - // consumed without throwing any errors - CUDA_HOST_DEVICE_CALLABLE parse_result parse_non_string_value(string_view& val) - { - if (!parse_whitespace()) { return parse_result::ERROR; } - - // parse to the end of the value - char const* start = pos; - char const* end = start; - while (!eof(end)) { - char const c = *end; - if (c == ',' || c == '}' || c == ']' || is_whitespace(c)) { break; } - - // illegal chars - if (c == '[' || c == '{' || c == ':' || c == '\"') { return parse_result::ERROR; } - end++; - } - pos = end; - - val = string_view(start, end - start); - - return parse_result::SUCCESS; - } - protected: char const* input; int64_t input_len; char const* pos; - private: CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c) { return c <= ' '; } }; @@ -217,19 +177,23 @@ class json_state : private parser { parent_el_type(json_element_type::NONE) { } - __device__ json_state(const char* _input, int64_t _input_len) + __device__ json_state(const char* _input, + int64_t _input_len, + thrust::optional _options) : parser(_input, _input_len), cur_el_start(nullptr), cur_el_type(json_element_type::NONE), parent_el_type(json_element_type::NONE) { + options = _options.value_or(get_json_object_options{}); } __device__ json_state(json_state const& j) : parser(j), cur_el_start(j.cur_el_start), cur_el_type(j.cur_el_type), - parent_el_type(j.parent_el_type) + parent_el_type(j.parent_el_type), + options(j.options) { } @@ -245,10 +209,9 @@ class json_state : private parser { if (parse_value() != parse_result::SUCCESS) { return parse_result::ERROR; } end = pos; - // SPARK-specific behavior. if this is a non-list-element wrapped in quotes, - // strip them. we may need to make this behavior configurable in some way - // later on. - if (!list_element && *start == '\"' && *(end - 1) == '\"') { + // potentially strip quotes from individually returned string values. + if (options.get_strip_quotes_from_single_strings() && !list_element && is_quote(*start) && + *(end - 1) == *start) { start++; end--; } @@ -327,7 +290,54 @@ class json_state : private parser { return parse_result::ERROR; } + // a name means: + // - a string followed by a : + // - no string + CUDA_HOST_DEVICE_CALLABLE parse_result parse_name(string_view& name, bool can_be_empty) + { + char const quote = options.get_allow_single_quotes() ? 0 : '\"'; + + if (parse_string(name, can_be_empty, quote) == parse_result::ERROR) { + return parse_result::ERROR; + } + + // if we got a real string, the next char must be a : + if (name.size_bytes() > 0) { + if (!parse_whitespace()) { return parse_result::ERROR; } + if (*pos == ':') { + pos++; + return parse_result::SUCCESS; + } + } + return parse_result::EMPTY; + } + private: + // numbers, true, false, null. + // this function is not particularly strong. badly formed values will get + // consumed without throwing any errors + CUDA_HOST_DEVICE_CALLABLE parse_result parse_non_string_value(string_view& val) + { + if (!parse_whitespace()) { return parse_result::ERROR; } + + // parse to the end of the value + char const* start = pos; + char const* end = start; + while (!eof(end)) { + char const c = *end; + if (c == ',' || c == '}' || c == ']' || is_whitespace(c)) { break; } + + // illegal chars + if (c == '[' || c == '{' || c == ':' || is_quote(c)) { return parse_result::ERROR; } + end++; + } + pos = end; + + val = string_view(start, end - start); + + return parse_result::SUCCESS; + } + // parse a value - either a string or a number/null/bool __device__ parse_result parse_value() { @@ -335,7 +345,7 @@ class json_state : private parser { // string or number? string_view unused; - return *pos == '\"' ? parse_string(unused, false, '\"') : parse_non_string_value(unused); + return is_quote(*pos) ? parse_string(unused, false, *pos) : parse_non_string_value(unused); } __device__ parse_result next_element_internal(bool child) @@ -363,7 +373,7 @@ class json_state : private parser { // if we're not accessing elements of an array, check for name. bool const array_access = (cur_el_type == ARRAY && child) || (parent_el_type == ARRAY && !child); - if (!array_access && parse_name(cur_el_name, true, '\"') == parse_result::ERROR) { + if (!array_access && parse_name(cur_el_name, true) == parse_result::ERROR) { return parse_result::ERROR; } @@ -374,8 +384,12 @@ class json_state : private parser { case '{': cur_el_type = OBJECT; break; case ',': - case ':': - case '\'': return parse_result::ERROR; + case ':': return parse_result::ERROR; + + case '\'': + if (!options.get_allow_single_quotes()) { return parse_result::ERROR; } + cur_el_type = VALUE; + break; // value type default: cur_el_type = VALUE; break; @@ -386,11 +400,17 @@ class json_state : private parser { return parse_result::SUCCESS; } + CUDA_HOST_DEVICE_CALLABLE bool is_quote(char c) + { + return options.get_allow_single_quotes() ? (c == '\"' || c == '\'') : c == '\"'; + } + const char* cur_el_start; // pointer to the first character of the -value- of the current // element - not the name string_view cur_el_name; // name of the current element (if applicable) json_element_type cur_el_type; // type of the current element json_element_type parent_el_type; // parent element type + get_json_object_options options; // behavior options }; enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END }; @@ -762,6 +782,7 @@ constexpr int max_command_stack_depth = 8; * @param out_buf Buffer user to store the results of the query (nullptr in the size computation * step) * @param out_buf_size Size of the output buffer + * @param options Options controlling behavior * @returns A pair containing the result code the output buffer. */ __device__ thrust::pair get_json_object_single( @@ -769,9 +790,10 @@ __device__ thrust::pair get_json_object_single( size_t input_len, path_operator const* const commands, char* out_buf, - size_t out_buf_size) + size_t out_buf_size, + thrust::optional options) { - json_state j_state(input, input_len); + json_state j_state(input, input_len, options); json_output output{out_buf_size, out_buf}; auto const result = parse_json_path(j_state, commands, output); @@ -792,6 +814,7 @@ __device__ thrust::pair get_json_object_single( * @param out_buf Buffer used to store the results of the query * @param out_validity Output validity buffer * @param out_valid_count Output count of # of valid bits + * @param options Options controlling behavior */ template __launch_bounds__(block_size) __global__ @@ -800,7 +823,8 @@ __launch_bounds__(block_size) __global__ offset_type* output_offsets, thrust::optional out_buf, thrust::optional out_validity, - thrust::optional out_valid_count) + thrust::optional out_valid_count, + thrust::optional options) { size_type tid = threadIdx.x + (blockDim.x * blockIdx.x); size_type stride = blockDim.x * gridDim.x; @@ -821,7 +845,7 @@ __launch_bounds__(block_size) __global__ parse_result result; json_output out; thrust::tie(result, out) = - get_json_object_single(str.data(), str.size_bytes(), commands, dst, dst_size); + get_json_object_single(str.data(), str.size_bytes(), commands, dst, dst_size, options); output_size = out.output_len.value_or(0); if (out.output_len.has_value() && result == parse_result::SUCCESS) { is_valid = true; } } @@ -857,6 +881,7 @@ __launch_bounds__(block_size) __global__ */ std::unique_ptr get_json_object(cudf::strings_column_view const& col, cudf::string_scalar const& json_path, + thrust::optional options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -893,7 +918,8 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c offsets_view.head(), thrust::nullopt, thrust::nullopt, - thrust::nullopt); + thrust::nullopt, + options); // convert sizes to offsets thrust::exclusive_scan(rmm::exec_policy(stream), @@ -923,7 +949,8 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c offsets_view.head(), chars_view.head(), static_cast(validity.data()), - d_valid_count.data()); + d_valid_count.data(), + options); return make_strings_column(col.size(), std::move(offsets), @@ -942,10 +969,11 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c */ std::unique_ptr get_json_object(cudf::strings_column_view const& col, cudf::string_scalar const& json_path, + thrust::optional options, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::get_json_object(col, json_path, rmm::cuda_stream_default, mr); + return detail::get_json_object(col, json_path, options, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index 44eb35d4163..76a2515d8ed 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -442,7 +442,7 @@ TEST_F(JsonTests, GetJsonObjectFilter) auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); - cudf::test::strings_column_wrapper expected_raw{"fiction"}; + cudf::test::strings_column_wrapper expected_raw{"\"fiction\""}; auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -459,7 +459,7 @@ TEST_F(JsonTests, GetJsonObjectNullInputs) auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); - cudf::test::strings_column_wrapper expected_raw({"b", "", "b", ""}, {1, 0, 1, 0}); + cudf::test::strings_column_wrapper expected_raw({"\"b\"", "", "\"b\"", ""}, {1, 0, 1, 0}); auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -501,7 +501,7 @@ TEST_F(JsonTests, GetJsonObjectEmptyInputsAndOutputs) std::string json_path("$.store.bicycle"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); - cudf::test::strings_column_wrapper expected({""}, {1}); + cudf::test::strings_column_wrapper expected({"\"\""}, {1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); } @@ -671,7 +671,7 @@ TEST_F(JsonTests, MixedOutput) "{\"b\" : \"c\"}", "", "[\"y\",500]", - "", + "\"\"", "{" "\"z\": {\"i\": 10, \"j\": 100}," "\"b\": [\"c\",null,true,-1]" @@ -708,8 +708,8 @@ TEST_F(JsonTests, MixedOutput) // clang-format off cudf::test::strings_column_wrapper expected({ - "c", - "c", + "\"c\"", + "\"c\"", "", "", "", @@ -759,3 +759,109 @@ TEST_F(JsonTests, MixedOutput) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); } } + +TEST_F(JsonTests, StripQuotes) +{ + // we normally expect our outputs here to be + // "b" (with quotes) + // but with string_quotes_from_single_strings set, we expect + // b (no quotes) + { + std::string str("{\"a\" : \"b\"}"); + cudf::test::strings_column_wrapper input({str, str}); + + cudf::strings::get_json_object_options options; + options.set_strip_quotes_from_single_strings(true); + + std::string json_path("$.a"); + auto result_raw = + cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw({"b", "b"}); + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + // a valid, but empty row + { + cudf::test::strings_column_wrapper input{"{\"store\": { \"bicycle\" : \"\" } }"}; + std::string json_path("$.store.bicycle"); + + cudf::strings::get_json_object_options options; + options.set_strip_quotes_from_single_strings(true); + + auto result = + cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + + cudf::test::strings_column_wrapper expected({""}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} + +TEST_F(JsonTests, AllowSingleQuotes) +{ + // Tests allowing single quotes for strings. + // Note: this flag allows a mix of single and double quotes. it doesn't explicitly require + // single-quotes only. + + // various queries on: + // clang-format off + std::vector input_strings { + "{\'a\': {\'b\' : \'c\'}}", + + "{" + "\'a\': {\'b\' : \"c\"}," + "\'d\': [{\"e\":123}, {\'f\':-10}]" + "}", + + "{" + "\'b\': 123" + "}", + + "{" + "\"a\": [\'y\',500]" + "}", + + "{" + "\'a\': \"\"" + "}", + + "{" + "\"a\": {" + "\'z\': {\'i\': 10, \'j\': 100}," + "\'b\': [\'c\',null,true,-1]" + "}" + "}" + }; + // clang-format on + cudf::test::strings_column_wrapper input(input_strings.begin(), input_strings.end()); + { + std::string json_path("$.a"); + + cudf::strings::get_json_object_options options; + options.set_allow_single_quotes(true); + + auto result = + cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "{\'b\' : \'c\'}", + "{\'b\' : \"c\"}", + "", + "[\'y\',500]", + "\"\"", + "{" + "\'z\': {\'i\': 10, \'j\': 100}," + "\'b\': [\'c\',null,true,-1]" + "}" + }, + {1, 1, 0, 1, 1, 1}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} \ No newline at end of file From 7c2edaa84f92d97bd55e003f3fb13f13a2bf53e9 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Mon, 3 May 2021 11:55:18 -0500 Subject: [PATCH 2/7] Small formatting/doc tweaks. --- cpp/include/cudf/strings/json.hpp | 2 -- cpp/tests/strings/json_tests.cpp | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp index bf95e9396c6..40414def306 100644 --- a/cpp/include/cudf/strings/json.hpp +++ b/cpp/include/cudf/strings/json.hpp @@ -41,8 +41,6 @@ class get_json_object_options { public: /** * @brief Default constructor. - * - * This has been added since Cython requires a default constructor to create objects on stack. */ explicit get_json_object_options() = default; diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index 76a2515d8ed..fb6b3a0ae3f 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -864,4 +864,4 @@ TEST_F(JsonTests, AllowSingleQuotes) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); } -} \ No newline at end of file +} From c70b0b31db406dfbafbf32c270266c0b9aaf6c34 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Fri, 14 May 2021 14:52:13 -0500 Subject: [PATCH 3/7] Add some doxygen comments. Flip strip_quotes_from_single_strings back to true by default. --- cpp/include/cudf/strings/json.hpp | 2 +- cpp/src/strings/json/json_path.cu | 38 +++++++++++++++++++++++++------ cpp/tests/strings/json_tests.cpp | 24 +++++++++---------- 3 files changed, 44 insertions(+), 20 deletions(-) diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp index 40414def306..e44def7d5cd 100644 --- a/cpp/include/cudf/strings/json.hpp +++ b/cpp/include/cudf/strings/json.hpp @@ -36,7 +36,7 @@ class get_json_object_options { bool allow_single_quotes = false; // individual string values are returned with quotes stripped. - bool strip_quotes_from_single_strings = false; + bool strip_quotes_from_single_strings = true; public: /** diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 72d8c458cc1..36e0ef0c494 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -290,9 +290,27 @@ class json_state : private parser { return parse_result::ERROR; } - // a name means: - // - a string followed by a : - // - no string + /** + * @brief Parse a name field for a JSON element. + * + * When parsing JSON objects, it is not always a requirement that the name + * actually exists. For example, the outer object bounded by {} here has + * no name, while the inner element "a" does. + * + * { + * "a" : "b" + * } + * + * The user can specify whether or not the name string must be present via + * the `can_be_empty` flag. + * + * When a name is present, it must be followed by a : + * + * @param name (Output) The resulting name. + * @param can_be_empty Parameter indicating whether it is valid for the name + * to not be present. + * @returns A result code indicating success, failure or other result. + */ CUDA_HOST_DEVICE_CALLABLE parse_result parse_name(string_view& name, bool can_be_empty) { char const quote = options.get_allow_single_quotes() ? 0 : '\"'; @@ -313,9 +331,15 @@ class json_state : private parser { } private: - // numbers, true, false, null. - // this function is not particularly strong. badly formed values will get - // consumed without throwing any errors + /** + * @brief Parse a non-string JSON value. + * + * Non-string values include numbers, true, false, or null. This function does not + * do any validation of the value. + * + * @param val (Output) The string containing the parsed value + * @returns A result code indicating success, failure or other result. + */ CUDA_HOST_DEVICE_CALLABLE parse_result parse_non_string_value(string_view& val) { if (!parse_whitespace()) { return parse_result::ERROR; } @@ -402,7 +426,7 @@ class json_state : private parser { CUDA_HOST_DEVICE_CALLABLE bool is_quote(char c) { - return options.get_allow_single_quotes() ? (c == '\"' || c == '\'') : c == '\"'; + return (c == '\"') || (options.get_allow_single_quotes() && (c == '\'')); } const char* cur_el_start; // pointer to the first character of the -value- of the current diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index fb6b3a0ae3f..7d65c751613 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -442,7 +442,7 @@ TEST_F(JsonTests, GetJsonObjectFilter) auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); - cudf::test::strings_column_wrapper expected_raw{"\"fiction\""}; + cudf::test::strings_column_wrapper expected_raw{"fiction"}; auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -459,7 +459,7 @@ TEST_F(JsonTests, GetJsonObjectNullInputs) auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); - cudf::test::strings_column_wrapper expected_raw({"\"b\"", "", "\"b\"", ""}, {1, 0, 1, 0}); + cudf::test::strings_column_wrapper expected_raw({"b", "", "b", ""}, {1, 0, 1, 0}); auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -501,7 +501,7 @@ TEST_F(JsonTests, GetJsonObjectEmptyInputsAndOutputs) std::string json_path("$.store.bicycle"); auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); - cudf::test::strings_column_wrapper expected({"\"\""}, {1}); + cudf::test::strings_column_wrapper expected({""}, {1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); } @@ -671,7 +671,7 @@ TEST_F(JsonTests, MixedOutput) "{\"b\" : \"c\"}", "", "[\"y\",500]", - "\"\"", + "", "{" "\"z\": {\"i\": 10, \"j\": 100}," "\"b\": [\"c\",null,true,-1]" @@ -708,8 +708,8 @@ TEST_F(JsonTests, MixedOutput) // clang-format off cudf::test::strings_column_wrapper expected({ - "\"c\"", - "\"c\"", + "c", + "c", "", "", "", @@ -763,22 +763,22 @@ TEST_F(JsonTests, MixedOutput) TEST_F(JsonTests, StripQuotes) { // we normally expect our outputs here to be - // "b" (with quotes) - // but with string_quotes_from_single_strings set, we expect - // b (no quotes) + // b (no quotes) + // but with string_quotes_from_single_strings false, we expect + // "b" (with quotes) { std::string str("{\"a\" : \"b\"}"); cudf::test::strings_column_wrapper input({str, str}); cudf::strings::get_json_object_options options; - options.set_strip_quotes_from_single_strings(true); + options.set_strip_quotes_from_single_strings(false); std::string json_path("$.a"); auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); auto result = drop_whitespace(*result_raw); - cudf::test::strings_column_wrapper expected_raw({"b", "b"}); + cudf::test::strings_column_wrapper expected_raw({"\"b\"", "\"b\""}); auto expected = drop_whitespace(expected_raw); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); @@ -853,7 +853,7 @@ TEST_F(JsonTests, AllowSingleQuotes) "{\'b\' : \"c\"}", "", "[\'y\',500]", - "\"\"", + "", "{" "\'z\': {\'i\': 10, \'j\': 100}," "\'b\': [\'c\',null,true,-1]" From ff9b3a8d36b0c9e4c02f0e7523cb649b3aff093e Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Fri, 14 May 2021 15:06:55 -0500 Subject: [PATCH 4/7] Switch to using plain optional parameters for passing get_json_object_options. --- cpp/include/cudf/strings/detail/json.hpp | 2 +- cpp/include/cudf/strings/json.hpp | 4 ++-- cpp/src/strings/json/json_path.cu | 16 +++++++--------- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/strings/detail/json.hpp index a46f9cd87b9..f52870b7db0 100644 --- a/cpp/include/cudf/strings/detail/json.hpp +++ b/cpp/include/cudf/strings/detail/json.hpp @@ -32,7 +32,7 @@ namespace detail { std::unique_ptr get_json_object( cudf::strings_column_view const& col, cudf::string_scalar const& json_path, - thrust::optional options, + get_json_object_options options = get_json_object_options{}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp index e44def7d5cd..9081fa23eec 100644 --- a/cpp/include/cudf/strings/json.hpp +++ b/cpp/include/cudf/strings/json.hpp @@ -116,8 +116,8 @@ class get_json_object_options { std::unique_ptr get_json_object( cudf::strings_column_view const& col, cudf::string_scalar const& json_path, - thrust::optional options = thrust::nullopt, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + get_json_object_options options = get_json_object_options{}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 36e0ef0c494..53cdc64cfff 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -177,15 +177,13 @@ class json_state : private parser { parent_el_type(json_element_type::NONE) { } - __device__ json_state(const char* _input, - int64_t _input_len, - thrust::optional _options) + __device__ json_state(const char* _input, int64_t _input_len, get_json_object_options _options) : parser(_input, _input_len), cur_el_start(nullptr), cur_el_type(json_element_type::NONE), - parent_el_type(json_element_type::NONE) + parent_el_type(json_element_type::NONE), + options(_options) { - options = _options.value_or(get_json_object_options{}); } __device__ json_state(json_state const& j) @@ -815,7 +813,7 @@ __device__ thrust::pair get_json_object_single( path_operator const* const commands, char* out_buf, size_t out_buf_size, - thrust::optional options) + get_json_object_options options) { json_state j_state(input, input_len, options); json_output output{out_buf_size, out_buf}; @@ -848,7 +846,7 @@ __launch_bounds__(block_size) __global__ thrust::optional out_buf, thrust::optional out_validity, thrust::optional out_valid_count, - thrust::optional options) + get_json_object_options options) { size_type tid = threadIdx.x + (blockDim.x * blockIdx.x); size_type stride = blockDim.x * gridDim.x; @@ -905,7 +903,7 @@ __launch_bounds__(block_size) __global__ */ std::unique_ptr get_json_object(cudf::strings_column_view const& col, cudf::string_scalar const& json_path, - thrust::optional options, + get_json_object_options options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -993,7 +991,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c */ std::unique_ptr get_json_object(cudf::strings_column_view const& col, cudf::string_scalar const& json_path, - thrust::optional options, + get_json_object_options options, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); From 14ca29a6e10444fc82c3d5efb8499132bf68b488 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Mon, 17 May 2021 11:35:33 -0500 Subject: [PATCH 5/7] Add additional string/quotes test. Small other changes. --- cpp/tests/strings/json_tests.cpp | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index 7d65c751613..5c81057b6d7 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -808,8 +808,8 @@ TEST_F(JsonTests, AllowSingleQuotes) // single-quotes only. // various queries on: - // clang-format off - std::vector input_strings { + std::vector input_strings{ + // clang-format off "{\'a\': {\'b\' : \'c\'}}", "{" @@ -834,9 +834,18 @@ TEST_F(JsonTests, AllowSingleQuotes) "\'z\': {\'i\': 10, \'j\': 100}," "\'b\': [\'c\',null,true,-1]" "}" - "}" + "}", + + "{" + "\'a\': \"abc'def\"" + "}", + + "{" + "\'a\': \"'abc'def'\"" + "}", + // clang-format on }; - // clang-format on + cudf::test::strings_column_wrapper input(input_strings.begin(), input_strings.end()); { std::string json_path("$.a"); @@ -857,9 +866,11 @@ TEST_F(JsonTests, AllowSingleQuotes) "{" "\'z\': {\'i\': 10, \'j\': 100}," "\'b\': [\'c\',null,true,-1]" - "}" + "}", + "abc'def", + "'abc'def'" }, - {1, 1, 0, 1, 1, 1}); + {1, 1, 0, 1, 1, 1, 1, 1}); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); From f6e09a9cd14a59deca26bd8f952e28bfdf31e0b3 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Mon, 17 May 2021 15:59:17 -0500 Subject: [PATCH 6/7] More doxygen comments and tweaks. --- cpp/src/strings/json/json_path.cu | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 53cdc64cfff..20d4aa2a307 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -98,7 +98,16 @@ class parser { return false; } - // if quote = 0, allow either ' or " + /** + * @brief Parse a quote-enclosed JSON string. + * + * @param[out] str The resulting string. + * @param can_be_empty Parameter indicating whether it is valid for the string + * to not be present. + * @param quote Character expected as the surrounding quotes. A value of 0 + * indicates allowing either single or double quotes (but not a mixture of both). + * @returns A result code indicating success, failure or other result. + */ CUDA_HOST_DEVICE_CALLABLE parse_result parse_string(string_view& str, bool can_be_empty, char quote) @@ -295,16 +304,18 @@ class json_state : private parser { * actually exists. For example, the outer object bounded by {} here has * no name, while the inner element "a" does. * + * ``` * { * "a" : "b" * } + * ``` * * The user can specify whether or not the name string must be present via * the `can_be_empty` flag. * - * When a name is present, it must be followed by a : + * When a name is present, it must be followed by a colon `:` * - * @param name (Output) The resulting name. + * @param[out] name The resulting name. * @param can_be_empty Parameter indicating whether it is valid for the name * to not be present. * @returns A result code indicating success, failure or other result. From a2b6005efd073761c7cc7f705460cfc054e5b081 Mon Sep 17 00:00:00 2001 From: Dave Baranec Date: Wed, 19 May 2021 09:45:44 -0500 Subject: [PATCH 7/7] Remove default parameter for get_json_object_options struct. --- cpp/include/cudf/strings/detail/json.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/strings/detail/json.hpp index f52870b7db0..85094175572 100644 --- a/cpp/include/cudf/strings/detail/json.hpp +++ b/cpp/include/cudf/strings/detail/json.hpp @@ -32,7 +32,7 @@ namespace detail { std::unique_ptr get_json_object( cudf::strings_column_view const& col, cudf::string_scalar const& json_path, - get_json_object_options options = get_json_object_options{}, + get_json_object_options options, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());