Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Address inconsistency in single quote normalization in JSON reader #15324

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 26 additions & 15 deletions cpp/src/io/json/json_normalization.cu
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,11 @@ struct TransduceToNormalizedQuotes {
// SQS | {'} -> {"}
// SQS | {"} -> {\"}
// SQS | {\} -> <nop>
// DQS | {\} -> <nop>
// SEC | {'} -> {'}
// SEC | Sigma\{'} -> {\*}
// DEC | {'} -> {'}
// DEC | Sigma\{'} -> {\*}

// Whether this transition translates to the escape sequence: \"
bool const outputs_escape_sequence =
Expand All @@ -119,20 +122,23 @@ struct TransduceToNormalizedQuotes {
return '"';
}
// Case when the read symbol is an escape character - the actual translation for \<s> for some
// symbol <s> is handled by transitions from SEC. For now, there is no output for this
// transition
if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::ESCAPE_CHAR)) &&
((state_id == static_cast<StateT>(dfa_states::TT_SQS)))) {
// symbol <s> is handled by transitions from SEC. The same logic applies for the transition from
// DEC. For now, there is no output for this transition
if (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::ESCAPE_CHAR) &&
(state_id == static_cast<StateT>(dfa_states::TT_SQS) ||
state_id == static_cast<StateT>(dfa_states::TT_DQS))) {
return 0;
}
// Case when an escaped single quote in an input single-quoted string needs to be replaced by an
// unescaped single quote
if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) &&
((state_id == static_cast<StateT>(dfa_states::TT_SEC)))) {
// Case when an escaped single quote in an input single-quoted or double-quoted string needs
// to be replaced by an unescaped single quote
if (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR) &&
(state_id == static_cast<StateT>(dfa_states::TT_SEC) ||
state_id == static_cast<StateT>(dfa_states::TT_DEC))) {
return '\'';
}
// Case when an escaped symbol <s> that is not a single-quote needs to be replaced with \<s>
if (state_id == static_cast<StateT>(dfa_states::TT_SEC)) {
if (state_id == static_cast<StateT>(dfa_states::TT_SEC) ||
state_id == static_cast<StateT>(dfa_states::TT_DEC)) {
return (relative_offset == 0) ? '\\' : read_symbol;
}
// In all other cases we simply output the input symbol
Expand All @@ -156,18 +162,23 @@ struct TransduceToNormalizedQuotes {
(match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR));
// Number of characters to output on this transition
if (sqs_outputs_escape_sequence) { return 2; }

// Whether this transition translates to the two-character escape sequence \<s>, i.e. an escaped symbol <s> other than ' (an escaped ' is emitted unescaped, as a single character)
bool const sec_outputs_escape_sequence =
(state_id == static_cast<StateT>(dfa_states::TT_SEC)) &&
bool const sec_dec_outputs_escape_sequence =
(state_id == static_cast<StateT>(dfa_states::TT_SEC) ||
state_id == static_cast<StateT>(dfa_states::TT_DEC)) &&
(match_id != static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR));
// Number of characters to output on this transition
if (sec_outputs_escape_sequence) { return 2; }
if (sec_dec_outputs_escape_sequence) { return 2; }

// Whether this transition translates to no output <nop>
bool const sqs_outputs_nop =
(state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
bool const sqs_dqs_outputs_nop =
(state_id == static_cast<StateT>(dfa_states::TT_SQS) ||
state_id == static_cast<StateT>(dfa_states::TT_DQS)) &&
(match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::ESCAPE_CHAR));
// Number of characters to output on this transition
if (sqs_outputs_nop) { return 0; }
if (sqs_dqs_outputs_nop) { return 0; }

return 1;
}
};
Expand Down
63 changes: 35 additions & 28 deletions cpp/tests/io/json_quote_normalization_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,106 +60,113 @@ void run_test(const std::string& host_input, const std::string& expected_host_ou
preprocessed_host_output, expected_host_output, preprocessed_host_output.size());
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization1)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Single)
{
std::string input = R"({"A":'TEST"'})";
std::string output = R"({"A":"TEST\""})";
std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])";
std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization2)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_MoreSingle)
{
std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])";
std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])";
std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])";
std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization3)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleInSingle)
{
std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])";
std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])";
std::string input = R"({"A":'TEST"'})";
std::string output = R"({"A":"TEST\""})";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization4)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_MoreDoubleInSingle)
{
std::string input = R"({"ain't ain't a word and you ain't supposed to say it":'"""""""""""'})";
std::string output =
R"({"ain't ain't a word and you ain't supposed to say it":"\"\"\"\"\"\"\"\"\"\"\""})";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization5)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_StillMoreDoubleInSingle)
{
std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})";
std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})";
std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])";
std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization6)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleInSingleAndViceVersa)
{
std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])";
std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])";
std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])";
std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleAndSingleInSingle)
{
std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})";
std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization7)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_EscapedSingleInDouble)
{
std::string input = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])";
std::string output = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])";
std::string output = R"(["\t","\\t","\\","\\'\"\\\\","\n","\b"])";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization8)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_EscapedDoubleInSingle)
{
std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])";
std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])";
std::string input = R"(["\t","\\t","\\",'\\\'\"\\\\',"\n","\b"])";
std::string output = R"(["\t","\\t","\\","\\'\"\\\\","\n","\b"])";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid1)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MismatchedQuotes)
{
std::string input = R"(["THIS IS A TEST'])";
std::string output = R"(["THIS IS A TEST'])";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid2)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MismatchedQuotesEscapedOutput)
{
std::string input = R"(['THIS IS A TEST"])";
std::string output = R"(["THIS IS A TEST\"])";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid3)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MoreMismatchedQuotes)
{
std::string input = R"({"MORE TEST'N":'RESUL})";
std::string output = R"({"MORE TEST'N":"RESUL})";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid4)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_NoEndQuote)
{
std::string input = R"({"NUMBER":100'0,'STRING':'SOMETHING'})";
std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid5)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_InvalidJSON)
{
std::string input = R"({'NUMBER':100"0,"STRING":"SOMETHING"})";
std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid6)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBackslash)
{
std::string input = R"({'a':'\\''})";
std::string output = R"({"a":"\\""})";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid7)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBraces)
{
std::string input = R"(}'a': 'b'{)";
std::string output = R"(}"a": "b"{)";
Expand Down
Loading