diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 86e4da664a8..b3a029224d7 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -103,8 +103,11 @@ struct TransduceToNormalizedQuotes { // SQS | {'} -> {"} // SQS | {"} -> {\"} // SQS | {\} -> + // DQS | {\} -> // SEC | {'} -> {'} // SEC | Sigma\{'} -> {\*} + // DEC | {'} -> {'} + // DEC | Sigma\{'} -> {\*} // Whether this transition translates to the escape sequence: \" bool const outputs_escape_sequence = @@ -119,20 +122,23 @@ struct TransduceToNormalizedQuotes { return '"'; } // Case when the read symbol is an escape character - the actual translation for \ for some - // symbol is handled by transitions from SEC. For now, there is no output for this - // transition - if ((match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)) && - ((state_id == static_cast(dfa_states::TT_SQS)))) { + // symbol is handled by transitions from SEC. The same logic applies for the transition from + // DEC. For now, there is no output for this transition + if (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR) && + (state_id == static_cast(dfa_states::TT_SQS) || + state_id == static_cast(dfa_states::TT_DQS))) { return 0; } - // Case when an escaped single quote in an input single-quoted string needs to be replaced by an - // unescaped single quote - if ((match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) && - ((state_id == static_cast(dfa_states::TT_SEC)))) { + // Case when an escaped single quote in an input single-quoted or double-quoted string needs + // to be replaced by an unescaped single quote + if (match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR) && + (state_id == static_cast(dfa_states::TT_SEC) || + state_id == static_cast(dfa_states::TT_DEC))) { return '\''; } // Case when an escaped symbol that is not a single-quote needs to be replaced with \ - if (state_id == static_cast(dfa_states::TT_SEC)) { + if (state_id == static_cast(dfa_states::TT_SEC) || + state_id == static_cast(dfa_states::TT_DEC)) { return (relative_offset == 0) ? '\\' : read_symbol; } // In all other cases we simply output the input symbol @@ -156,18 +162,23 @@ struct TransduceToNormalizedQuotes { (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); // Number of characters to output on this transition if (sqs_outputs_escape_sequence) { return 2; } + // Whether this transition translates to the escape sequence \ or unescaped ' - bool const sec_outputs_escape_sequence = - (state_id == static_cast(dfa_states::TT_SEC)) && + bool const sec_dec_outputs_escape_sequence = + (state_id == static_cast(dfa_states::TT_SEC) || + state_id == static_cast(dfa_states::TT_DEC)) && (match_id != static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)); // Number of characters to output on this transition - if (sec_outputs_escape_sequence) { return 2; } + if (sec_dec_outputs_escape_sequence) { return 2; } + // Whether this transition translates to no output - bool const sqs_outputs_nop = - (state_id == static_cast(dfa_states::TT_SQS)) && + bool const sqs_dqs_outputs_nop = + (state_id == static_cast(dfa_states::TT_SQS) || + state_id == static_cast(dfa_states::TT_DQS)) && (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)); // Number of characters to output on this transition - if (sqs_outputs_nop) { return 0; } + if (sqs_dqs_outputs_nop) { return 0; } + return 1; } }; diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp index b13e5bd4177..593c8136e6a 100644 --- a/cpp/tests/io/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -60,28 +60,28 @@ void run_test(const std::string& host_input, const std::string& expected_host_ou preprocessed_host_output, expected_host_output, preprocessed_host_output.size()); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization1) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Single) { - std::string input = R"({"A":'TEST"'})"; - std::string output = R"({"A":"TEST\""})"; + std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])"; + std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization2) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_MoreSingle) { - std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])"; - std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])"; + std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])"; + std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization3) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleInSingle) { - std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])"; - std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])"; + std::string input = R"({"A":'TEST"'})"; + std::string output = R"({"A":"TEST\""})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization4) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_MoreDoubleInSingle) { std::string input = R"({"ain't ain't a word and you ain't supposed to say it":'"""""""""""'})"; std::string output = @@ -89,77 +89,84 @@ TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization4) run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization5) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_StillMoreDoubleInSingle) { - std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})"; - std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})"; + std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])"; + std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization6) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleInSingleAndViceVersa) { - std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])"; - std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])"; + std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])"; + std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleAndSingleInSingle) +{ + std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})"; + std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization7) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_EscapedSingleInDouble) { std::string input = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; - std::string output = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; + std::string output = R"(["\t","\\t","\\","\\'\"\\\\","\n","\b"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization8) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_EscapedDoubleInSingle) { - std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])"; - std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])"; + std::string input = R"(["\t","\\t","\\",'\\\'\"\\\\',"\n","\b"])"; + std::string output = R"(["\t","\\t","\\","\\'\"\\\\","\n","\b"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid1) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MismatchedQuotes) { std::string input = R"(["THIS IS A TEST'])"; std::string output = R"(["THIS IS A TEST'])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid2) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MismatchedQuotesEscapedOutput) { std::string input = R"(['THIS IS A TEST"])"; std::string output = R"(["THIS IS A TEST\"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid3) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MoreMismatchedQuotes) { std::string input = R"({"MORE TEST'N":'RESUL})"; std::string output = R"({"MORE TEST'N":"RESUL})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid4) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_NoEndQuote) { std::string input = R"({"NUMBER":100'0,'STRING':'SOMETHING'})"; std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid5) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_InvalidJSON) { std::string input = R"({'NUMBER':100"0,"STRING":"SOMETHING"})"; std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid6) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBackslash) { std::string input = R"({'a':'\\''})"; std::string output = R"({"a":"\\""})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid7) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBraces) { std::string input = R"(}'a': 'b'{)"; std::string output = R"(}"a": "b"{)";