Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolve path parsing issues in get_json_object #15082

Merged
24 changes: 18 additions & 6 deletions cpp/src/json/json_path.cu
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,14 @@ struct path_operator {
int index{-1}; // index for subscript operator
};

/**
* @brief Enum to specify whether parsing values enclosed within brackets, like `['book']`
ttnghia marked this conversation as resolved.
Show resolved Hide resolved
*/
enum class bracket_state : bool {
INSIDE, ///< Parsing inside brackets
OUTSIDE ///< Parsing outside brackets
};
ttnghia marked this conversation as resolved.
Show resolved Hide resolved

/**
* @brief Parsing class that holds the current state of the JSONPath string to be parsed
* and provides functions for navigating through it. This is only called on the host
Expand All @@ -541,7 +549,7 @@ class path_state : private parser {
case '.': {
path_operator op;
string_view term{".[", 2};
if (parse_path_name(op.name, term)) {
if (parse_path_name(op.name, term, bracket_state::OUTSIDE)) {
// this is another potential use case for __SPARK_BEHAVIORS / configurability
// Spark currently only handles the wildcard operator inside [*], it does
// not handle .*
Expand All @@ -564,7 +572,7 @@ class path_state : private parser {
path_operator op;
string_view term{"]", 1};
bool const is_string = *pos == '\'';
if (parse_path_name(op.name, term)) {
if (parse_path_name(op.name, term, bracket_state::INSIDE)) {
pos++;
if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') {
op.type = path_operator_type::CHILD_WILDCARD;
Expand Down Expand Up @@ -600,7 +608,8 @@ class path_state : private parser {
private:
cudf::io::parse_options_view json_opts{',', '\n', '\"', '.'};

bool parse_path_name(string_view& name, string_view const& terminators)
// b_state is set to INSIDE while parsing values enclosed within [ ], otherwise OUTSIDE
bool parse_path_name(string_view& name, string_view const& terminators, bracket_state b_state)
{
switch (*pos) {
case '*':
Expand All @@ -609,8 +618,11 @@ class path_state : private parser {
break;

case '\'':
if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; }
break;
if (b_state == bracket_state::INSIDE) {
if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; }
break;
}
// if not inside the [ ] -> go to default

default: {
size_t const chars_left = input_len - (pos - input);
Expand Down Expand Up @@ -656,7 +668,7 @@ std::pair<thrust::optional<rmm::device_uvector<path_operator>>, int> build_comma
do {
op = p_state.get_next_operator();
if (op.type == path_operator_type::ERROR) {
CUDF_FAIL("Encountered invalid JSONPath input string");
CUDF_FAIL("Encountered invalid JSONPath input string", std::invalid_argument);
}
if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; }
// convert pointer to device pointer
Expand Down
38 changes: 38 additions & 0 deletions cpp/tests/json/json_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,15 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery)
};
EXPECT_THROW(query(), std::invalid_argument);
}

{
auto const input = cudf::test::strings_column_wrapper{R"({"a": "b"})"};
auto const json_path = std::string{"${a}"};
auto const query = [&]() {
auto const result = cudf::get_json_object(cudf::strings_column_view(input), json_path);
};
EXPECT_THROW(query(), std::invalid_argument);
}
}

// queries that are legal, but reference invalid parts of the input
Expand Down Expand Up @@ -1018,4 +1027,33 @@ TEST_F(JsonPathTests, MissingFieldsAsNulls)
do_test("$.tup[*].a.x", "[\"5\"]", "[null,null,null,\"5\"]");
}

TEST_F(JsonPathTests, QueriesContainingQuotes)
SurajAralihalli marked this conversation as resolved.
Show resolved Hide resolved
{
std::string input_string = R"({"AB": 1, "A.B": 2, "'A": {"B'": 3}, "A": {"B": 4} })";

auto do_test = [&input_string](auto const& json_path_string,
auto const& expected_string,
bool const& expect_null = false) {
auto const input = cudf::test::strings_column_wrapper{input_string};
auto const json_path = std::string{json_path_string};
cudf::get_json_object_options options;
options.set_allow_single_quotes(true);
auto const result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options);
auto const expected =
cudf::test::strings_column_wrapper{std::initializer_list<std::string>{expected_string},
std::initializer_list<bool>{!expect_null}};

CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
};

// Set 1
do_test(R"($.AB)", "1");
do_test(R"($['A.B'])", "2");
do_test(R"($.'A.B')", "3");
do_test(R"($.A.B)", "4");

// Set 2
do_test(R"($.'A)", R"({"B'": 3})");
}

CUDF_TEST_PROGRAM_MAIN()
10 changes: 9 additions & 1 deletion java/src/main/native/src/ColumnViewJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2452,7 +2452,15 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(
options.set_allow_single_quotes(allow_single_quotes);
options.set_strip_quotes_from_single_strings(strip_quotes_from_single_strings);
options.set_missing_fields_as_nulls(missing_fields_as_nulls);
return release_as_jlong(cudf::get_json_object(n_strings_col_view, *n_scalar_path, options));
auto result_col_ptr = [&]() {
SurajAralihalli marked this conversation as resolved.
Show resolved Hide resolved
try {
return cudf::get_json_object(n_strings_col_view, *n_scalar_path, options);
} catch (std::invalid_argument const &err) {
auto const null_scalar = cudf::string_scalar(std::string(""), false);
return cudf::make_column_from_scalar(null_scalar, n_strings_col_view.size());
} catch (...) { throw; }
}();
return release_as_jlong(result_col_ptr);
}
CATCH_STD(env, 0)
}
Expand Down
16 changes: 16 additions & 0 deletions java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -6405,6 +6405,22 @@ void testGetJSONObjectWithSingleQuotes() {
}
}

@Test
void testGetJSONObjectWithInvalidQueries() {
String jsonString = "{" +
"\'a\': \'A\"\'" +
"}";

GetJsonObjectOptions options = GetJsonObjectOptions.builder().allowSingleQuotes(true).build();
try (ColumnVector json = ColumnVector.fromStrings(jsonString, jsonString);
Scalar nullString = Scalar.fromString(null);
ColumnVector expectedAuthors = ColumnVector.fromScalar(nullString, 2);
Scalar path = Scalar.fromString(".");
ColumnVector gotAuthors = json.getJSONObject(path, options)) {
assertColumnsAreEqual(expectedAuthors, gotAuthors);
}
}

@Test
void testMakeStructEmpty() {
final int numRows = 10;
Expand Down
Loading