Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolve path parsing issues in get_json_object #15082

Merged
16 changes: 10 additions & 6 deletions cpp/src/json/json_path.cu
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,7 @@ class path_state : private parser {
case '.': {
path_operator op;
string_view term{".[", 2};
if (parse_path_name(op.name, term)) {
if (parse_path_name(op.name, term, false /*outside_brackets*/)) {
// this is another potential use case for __SPARK_BEHAVIORS / configurability
// Spark currently only handles the wildcard operator inside [*], it does
// not handle .*
Expand All @@ -564,7 +564,7 @@ class path_state : private parser {
path_operator op;
string_view term{"]", 1};
bool const is_string = *pos == '\'';
if (parse_path_name(op.name, term)) {
if (parse_path_name(op.name, term, true /*inside_brackets*/)) {
pos++;
if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') {
op.type = path_operator_type::CHILD_WILDCARD;
Expand Down Expand Up @@ -600,7 +600,8 @@ class path_state : private parser {
private:
cudf::io::parse_options_view json_opts{',', '\n', '\"', '.'};

bool parse_path_name(string_view& name, string_view const& terminators)
// inside_brackets is set to true while parsing values enclosed within [ ] example: ['book']
bool parse_path_name(string_view& name, string_view const& terminators, bool inside_brackets)
SurajAralihalli marked this conversation as resolved.
Show resolved Hide resolved
{
switch (*pos) {
case '*':
Expand All @@ -609,8 +610,11 @@ class path_state : private parser {
break;

case '\'':
if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; }
break;
if (inside_brackets) {
if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; }
break;
}
// if not inside the [ ] -> go to default

default: {
size_t const chars_left = input_len - (pos - input);
Expand Down Expand Up @@ -656,7 +660,7 @@ std::pair<thrust::optional<rmm::device_uvector<path_operator>>, int> build_comma
do {
op = p_state.get_next_operator();
if (op.type == path_operator_type::ERROR) {
CUDF_FAIL("Encountered invalid JSONPath input string");
CUDF_FAIL("Encountered invalid JSONPath input string", std::invalid_argument);
}
if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; }
// convert pointer to device pointer
Expand Down
38 changes: 38 additions & 0 deletions cpp/tests/json/json_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,15 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery)
};
EXPECT_THROW(query(), std::invalid_argument);
}

{
auto const input = cudf::test::strings_column_wrapper{R"({"a": "b"})"};
auto const json_path = std::string{"${a}"};
auto const query = [&]() {
auto const result = cudf::get_json_object(cudf::strings_column_view(input), json_path);
};
EXPECT_THROW(query(), std::invalid_argument);
}
}

// queries that are legal, but reference invalid parts of the input
Expand Down Expand Up @@ -1018,4 +1027,33 @@ TEST_F(JsonPathTests, MissingFieldsAsNulls)
do_test("$.tup[*].a.x", "[\"5\"]", "[null,null,null,\"5\"]");
}

TEST_F(JsonPathTests, QueriesContainingQuotes)
SurajAralihalli marked this conversation as resolved.
Show resolved Hide resolved
{
std::string input_string = R"({"AB": 1, "A.B": 2, "'A": {"B'": 3}, "A": {"B": 4} })";

auto do_test = [&input_string](auto const& json_path_string,
auto const& expected_string,
bool const& expect_null = false) {
auto const input = cudf::test::strings_column_wrapper{input_string};
auto const json_path = std::string{json_path_string};
cudf::get_json_object_options options;
options.set_allow_single_quotes(true);
auto const result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options);
auto const expected =
cudf::test::strings_column_wrapper{std::initializer_list<std::string>{expected_string},
std::initializer_list<bool>{!expect_null}};

CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
};

// Set 1
do_test(R"($.AB)", "1");
do_test(R"($['A.B'])", "2");
do_test(R"($.'A.B')", "3");
do_test(R"($.A.B)", "4");

// Set 2
do_test(R"($.'A)", R"({"B'": 3})");
}

CUDF_TEST_PROGRAM_MAIN()
Loading