From 3db250b8a42c1568816c289d56f6d288c3dfac64 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 15 Mar 2024 23:12:23 +0000 Subject: [PATCH 1/4] inconsistency fix --- cpp/src/io/json/json_normalization.cu | 41 +++++++----- .../io/json_quote_normalization_test.cpp | 64 +++++++++++-------- 2 files changed, 62 insertions(+), 43 deletions(-) diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 86e4da664a8..2f0d4847582 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -103,8 +103,11 @@ struct TransduceToNormalizedQuotes { // SQS | {'} -> {"} // SQS | {"} -> {\"} // SQS | {\} -> + // DQS | {\} -> // SEC | {'} -> {'} // SEC | Sigma\{'} -> {\*} + // DEC | {'} -> {'} + // DEC | Sigma\{'} -> {\*} // Whether this transition translates to the escape sequence: \" bool const outputs_escape_sequence = @@ -119,20 +122,25 @@ struct TransduceToNormalizedQuotes { return '"'; } // Case when the read symbol is an escape character - the actual translation for \ for some - // symbol is handled by transitions from SEC. For now, there is no output for this - // transition - if ((match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)) && - ((state_id == static_cast(dfa_states::TT_SQS)))) { + // symbol is handled by transitions from SEC. The same logic applies for the transition from + // DEC. For now, there is no output for this transition + if ((match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR) && + state_id == static_cast(dfa_states::TT_SQS)) || + (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR) && + state_id == static_cast(dfa_states::TT_DQS))) { return 0; } - // Case when an escaped single quote in an input single-quoted string needs to be replaced by an - // unescaped single quote - if ((match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) && - ((state_id == static_cast(dfa_states::TT_SEC)))) { + // Case when an escaped single quote in an input single-quoted or double-quoted string needs + // to be replaced by an unescaped single quote + if ((match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR) && + state_id == static_cast(dfa_states::TT_SEC)) || + (match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR) && + state_id == static_cast(dfa_states::TT_DEC))) { return '\''; } // Case when an escaped symbol that is not a single-quote needs to be replaced with \ - if (state_id == static_cast(dfa_states::TT_SEC)) { + if (state_id == static_cast(dfa_states::TT_SEC) || + state_id == static_cast(dfa_states::TT_DEC)) { return (relative_offset == 0) ? '\\' : read_symbol; } // In all other cases we simply output the input symbol @@ -156,18 +164,21 @@ struct TransduceToNormalizedQuotes { (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); // Number of characters to output on this transition if (sqs_outputs_escape_sequence) { return 2; } + // Whether this transition translates to the escape sequence \ or unescaped ' - bool const sec_outputs_escape_sequence = - (state_id == static_cast(dfa_states::TT_SEC)) && + bool const sec_dec_outputs_escape_sequence = + (state_id == static_cast(dfa_states::TT_SEC) || state_id == static_cast(dfa_states::TT_DEC)) && (match_id != static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)); // Number of characters to output on this transition - if (sec_outputs_escape_sequence) { return 2; } + if (sec_dec_outputs_escape_sequence) { return 2; } + // Whether this transition translates to no output - bool const sqs_outputs_nop = - (state_id == static_cast(dfa_states::TT_SQS)) && + bool const sqs_dqs_outputs_nop = + (state_id == static_cast(dfa_states::TT_SQS) || state_id == static_cast(dfa_states::TT_DQS)) && (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)); // Number of characters to output on this transition - if (sqs_outputs_nop) { return 0; } + if (sqs_dqs_outputs_nop) { return 0; } + return 1; } }; diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp index b13e5bd4177..ede19411c6e 100644 --- a/cpp/tests/io/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -60,28 +60,29 @@ void run_test(const std::string& host_input, const std::string& expected_host_ou preprocessed_host_output, expected_host_output, preprocessed_host_output.size()); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization1) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Single) { - std::string input = R"({"A":'TEST"'})"; - std::string output = R"({"A":"TEST\""})"; + std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])"; + std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization2) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_MoreSingle) { - std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])"; - std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])"; + std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])"; + std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization3) + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleInSingle) { - std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])"; - std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])"; + std::string input = R"({"A":'TEST"'})"; + std::string output = R"({"A":"TEST\""})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization4) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_MoreDoubleInSingle) { std::string input = R"({"ain't ain't a word and you ain't supposed to say it":'"""""""""""'})"; std::string output = @@ -89,77 +90,84 @@ TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization4) run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization5) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_StillMoreDoubleInSingle) { - std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})"; - std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})"; + std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])"; + std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization6) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleInSingleAndViceVersa) { - std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])"; - std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])"; + std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])"; + std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleAndSingleInSingle) +{ + std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})"; + std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization7) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_EscapedSingleInDouble) { std::string input = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; - std::string output = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; + std::string output = R"(["\t","\\t","\\","\\'\"\\\\","\n","\b"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization8) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_EscapedDoubleInSingle) { - std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])"; - std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])"; + std::string input = R"(["\t","\\t","\\",'\\\'\"\\\\',"\n","\b"])"; + std::string output = R"(["\t","\\t","\\","\\'\"\\\\","\n","\b"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid1) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MismatchedQuotes) { std::string input = R"(["THIS IS A TEST'])"; std::string output = R"(["THIS IS A TEST'])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid2) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MismatchedQuotesEscapedOutput) { std::string input = R"(['THIS IS A TEST"])"; std::string output = R"(["THIS IS A TEST\"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid3) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MoreMismatchedQuotes) { std::string input = R"({"MORE TEST'N":'RESUL})"; std::string output = R"({"MORE TEST'N":"RESUL})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid4) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_NoEndQuote) { std::string input = R"({"NUMBER":100'0,'STRING':'SOMETHING'})"; std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid5) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_InvalidJSON) { std::string input = R"({'NUMBER':100"0,"STRING":"SOMETHING"})"; std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid6) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBackslash) { std::string input = R"({'a':'\\''})"; std::string output = R"({"a":"\\""})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid7) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBraces) { std::string input = R"(}'a': 'b'{)"; std::string output = R"(}"a": "b"{)"; From c80f4731ac77af2eae9248e68f9cb619b445612e Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 15 Mar 2024 23:13:42 +0000 Subject: [PATCH 2/4] formatting --- cpp/src/io/json/json_normalization.cu | 20 ++++++++++--------- .../io/json_quote_normalization_test.cpp | 1 - 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 2f0d4847582..ac3f8a753f6 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -125,21 +125,21 @@ struct TransduceToNormalizedQuotes { // symbol is handled by transitions from SEC. The same logic applies for the transition from // DEC. For now, there is no output for this transition if ((match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR) && - state_id == static_cast(dfa_states::TT_SQS)) || - (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR) && - state_id == static_cast(dfa_states::TT_DQS))) { + state_id == static_cast(dfa_states::TT_SQS)) || + (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR) && + state_id == static_cast(dfa_states::TT_DQS))) { return 0; } - // Case when an escaped single quote in an input single-quoted or double-quoted string needs + // Case when an escaped single quote in an input single-quoted or double-quoted string needs // to be replaced by an unescaped single quote if ((match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR) && - state_id == static_cast(dfa_states::TT_SEC)) || + state_id == static_cast(dfa_states::TT_SEC)) || (match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR) && - state_id == static_cast(dfa_states::TT_DEC))) { + state_id == static_cast(dfa_states::TT_DEC))) { return '\''; } // Case when an escaped symbol that is not a single-quote needs to be replaced with \ - if (state_id == static_cast(dfa_states::TT_SEC) || + if (state_id == static_cast(dfa_states::TT_SEC) || state_id == static_cast(dfa_states::TT_DEC)) { return (relative_offset == 0) ? '\\' : read_symbol; } @@ -167,14 +167,16 @@ struct TransduceToNormalizedQuotes { // Whether this transition translates to the escape sequence \ or unescaped ' bool const sec_dec_outputs_escape_sequence = - (state_id == static_cast(dfa_states::TT_SEC) || state_id == static_cast(dfa_states::TT_DEC)) && + (state_id == static_cast(dfa_states::TT_SEC) || + state_id == static_cast(dfa_states::TT_DEC)) && (match_id != static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)); // Number of characters to output on this transition if (sec_dec_outputs_escape_sequence) { return 2; } // Whether this transition translates to no output bool const sqs_dqs_outputs_nop = - (state_id == static_cast(dfa_states::TT_SQS) || state_id == static_cast(dfa_states::TT_DQS)) && + (state_id == static_cast(dfa_states::TT_SQS) || + state_id == static_cast(dfa_states::TT_DQS)) && (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)); // Number of characters to output on this transition if (sqs_dqs_outputs_nop) { return 0; } diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp index ede19411c6e..593c8136e6a 100644 --- a/cpp/tests/io/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -74,7 +74,6 @@ TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_MoreSingle) run_test(input, output); } - TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleInSingle) { std::string input = R"({"A":'TEST"'})"; From ce75b30114b4d168c1255e2d9a4e61a6604d3f1d Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 18 Mar 2024 18:49:22 +0000 Subject: [PATCH 3/4] PR reviews --- cpp/src/io/json/json_column.patch | 34 +++++++++++++++++++++++++++ cpp/src/io/json/json_normalization.cu | 10 ++++---- 2 files changed, 38 insertions(+), 6 deletions(-) create mode 100644 cpp/src/io/json/json_column.patch diff --git a/cpp/src/io/json/json_column.patch b/cpp/src/io/json/json_column.patch new file mode 100644 index 00000000000..2ee16d6ea3a --- /dev/null +++ b/cpp/src/io/json/json_column.patch @@ -0,0 +1,34 @@ +diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu +index 10646fad35..437c27d7a5 100644 +--- a/cpp/src/io/json/json_column.cu ++++ b/cpp/src/io/json/json_column.cu +@@ -674,6 +674,7 @@ void make_device_json_column(device_span input, + reinitialize_as_string(old_col_id, col); + // all its children (which are already inserted) are ignored later. + } ++ col.forced_as_string_column = true; + columns.try_emplace(this_col_id, columns.at(old_col_id)); + continue; + } +@@ -915,6 +916,8 @@ std::pair, std::vector> device_json_co + : "n/a"); + #endif + target_type = schema.value().type; ++ } else if(json_col.forced_as_string_column) { ++ target_type = data_type{type_id::STRING}; + } + // Infer column type, if we don't have an explicit type for it + else { +diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp +index f41b024bb1..64fffdb27f 100644 +--- a/cpp/src/io/json/nested_json.hpp ++++ b/cpp/src/io/json/nested_json.hpp +@@ -160,6 +160,8 @@ struct device_json_column { + std::vector column_order; + // Counting the current number of items in this column + row_offset_t num_rows = 0; ++ // Force as string column ++ bool forced_as_string_column{false}; + + /** + * @brief Construct a new d json column object diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index ac3f8a753f6..b3a029224d7 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -124,17 +124,15 @@ struct TransduceToNormalizedQuotes { // Case when the read symbol is an escape character - the actual translation for \ for some // symbol is handled by transitions from SEC. The same logic applies for the transition from // DEC. For now, there is no output for this transition - if ((match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR) && - state_id == static_cast(dfa_states::TT_SQS)) || - (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR) && + if (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR) && + (state_id == static_cast(dfa_states::TT_SQS) || state_id == static_cast(dfa_states::TT_DQS))) { return 0; } // Case when an escaped single quote in an input single-quoted or double-quoted string needs // to be replaced by an unescaped single quote - if ((match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR) && - state_id == static_cast(dfa_states::TT_SEC)) || - (match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR) && + if (match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR) && + (state_id == static_cast(dfa_states::TT_SEC) || state_id == static_cast(dfa_states::TT_DEC))) { return '\''; } From bc61565abf5a782c6853dbb2a3d06af4e0579cb1 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 18 Mar 2024 23:18:20 +0000 Subject: [PATCH 4/4] oops, accidental push --- cpp/src/io/json/json_column.patch | 34 ------------------------------- 1 file changed, 34 deletions(-) delete mode 100644 cpp/src/io/json/json_column.patch diff --git a/cpp/src/io/json/json_column.patch b/cpp/src/io/json/json_column.patch deleted file mode 100644 index 2ee16d6ea3a..00000000000 --- a/cpp/src/io/json/json_column.patch +++ /dev/null @@ -1,34 +0,0 @@ -diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu -index 10646fad35..437c27d7a5 100644 ---- a/cpp/src/io/json/json_column.cu -+++ b/cpp/src/io/json/json_column.cu -@@ -674,6 +674,7 @@ void make_device_json_column(device_span input, - reinitialize_as_string(old_col_id, col); - // all its children (which are already inserted) are ignored later. - } -+ col.forced_as_string_column = true; - columns.try_emplace(this_col_id, columns.at(old_col_id)); - continue; - } -@@ -915,6 +916,8 @@ std::pair, std::vector> device_json_co - : "n/a"); - #endif - target_type = schema.value().type; -+ } else if(json_col.forced_as_string_column) { -+ target_type = data_type{type_id::STRING}; - } - // Infer column type, if we don't have an explicit type for it - else { -diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp -index f41b024bb1..64fffdb27f 100644 ---- a/cpp/src/io/json/nested_json.hpp -+++ b/cpp/src/io/json/nested_json.hpp -@@ -160,6 +160,8 @@ struct device_json_column { - std::vector column_order; - // Counting the current number of items in this column - row_offset_t num_rows = 0; -+ // Force as string column -+ bool forced_as_string_column{false}; - - /** - * @brief Construct a new d json column object