diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 86e4da664a8..b3a029224d7 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -103,8 +103,11 @@ struct TransduceToNormalizedQuotes { // SQS | {'} -> {"} // SQS | {"} -> {\"} // SQS | {\} -> <nop> + // DQS | {\} -> <nop> // SEC | {'} -> {'} // SEC | Sigma\{'} -> {\*} + // DEC | {'} -> {'} + // DEC | Sigma\{'} -> {\*} // Whether this transition translates to the escape sequence: \" bool const outputs_escape_sequence = @@ -119,20 +122,23 @@ struct TransduceToNormalizedQuotes { return '"'; } // Case when the read symbol is an escape character - the actual translation for \ for some - // symbol is handled by transitions from SEC. For now, there is no output for this - // transition - if ((match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)) && - ((state_id == static_cast(dfa_states::TT_SQS)))) { + // symbol is handled by transitions from SEC. The same logic applies for the transition from + // DEC. 
For now, there is no output for this transition + if (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR) && + (state_id == static_cast(dfa_states::TT_SQS) || + state_id == static_cast(dfa_states::TT_DQS))) { return 0; } - // Case when an escaped single quote in an input single-quoted string needs to be replaced by an - // unescaped single quote - if ((match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) && - ((state_id == static_cast(dfa_states::TT_SEC)))) { + // Case when an escaped single quote in an input single-quoted or double-quoted string needs + // to be replaced by an unescaped single quote + if (match_id == static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR) && + (state_id == static_cast(dfa_states::TT_SEC) || + state_id == static_cast(dfa_states::TT_DEC))) { return '\''; } // Case when an escaped symbol that is not a single-quote needs to be replaced with \ - if (state_id == static_cast(dfa_states::TT_SEC)) { + if (state_id == static_cast(dfa_states::TT_SEC) || + state_id == static_cast(dfa_states::TT_DEC)) { return (relative_offset == 0) ? 
'\\' : read_symbol; } // In all other cases we simply output the input symbol @@ -156,18 +162,23 @@ struct TransduceToNormalizedQuotes { (match_id == static_cast(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR)); // Number of characters to output on this transition if (sqs_outputs_escape_sequence) { return 2; } + // Whether this transition translates to the escape sequence \ or unescaped ' - bool const sec_outputs_escape_sequence = - (state_id == static_cast(dfa_states::TT_SEC)) && + bool const sec_dec_outputs_escape_sequence = + (state_id == static_cast(dfa_states::TT_SEC) || + state_id == static_cast(dfa_states::TT_DEC)) && (match_id != static_cast(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)); // Number of characters to output on this transition - if (sec_outputs_escape_sequence) { return 2; } + if (sec_dec_outputs_escape_sequence) { return 2; } + // Whether this transition translates to no output - bool const sqs_outputs_nop = - (state_id == static_cast(dfa_states::TT_SQS)) && + bool const sqs_dqs_outputs_nop = + (state_id == static_cast(dfa_states::TT_SQS) || + state_id == static_cast(dfa_states::TT_DQS)) && (match_id == static_cast(dfa_symbol_group_id::ESCAPE_CHAR)); // Number of characters to output on this transition - if (sqs_outputs_nop) { return 0; } + if (sqs_dqs_outputs_nop) { return 0; } + return 1; } }; diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index fb17545875a..d881ab6f9b7 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -2896,9 +2896,9 @@ __device__ std::pair get_extremum(statistics_val const* s return {scratch, sizeof(float)}; } case dtype_int64: + case dtype_decimal64: case dtype_timestamp64: case dtype_float64: return {stats_val, sizeof(int64_t)}; - case dtype_decimal64: case dtype_decimal128: byte_reverse128(stats_val->d128_val, scratch); return {scratch, sizeof(__int128_t)}; diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp index 
b13e5bd4177..593c8136e6a 100644 --- a/cpp/tests/io/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json_quote_normalization_test.cpp @@ -60,28 +60,28 @@ void run_test(const std::string& host_input, const std::string& expected_host_ou preprocessed_host_output, expected_host_output, preprocessed_host_output.size()); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization1) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Single) { - std::string input = R"({"A":'TEST"'})"; - std::string output = R"({"A":"TEST\""})"; + std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])"; + std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization2) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_MoreSingle) { - std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])"; - std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])"; + std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])"; + std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization3) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleInSingle) { - std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])"; - std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])"; + std::string input = R"({"A":'TEST"'})"; + std::string output = R"({"A":"TEST\""})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization4) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_MoreDoubleInSingle) { std::string input = R"({"ain't ain't a word and you ain't supposed to say it":'"""""""""""'})"; std::string output = @@ -89,77 +89,84 @@ TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization4) run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization5) +TEST_F(JsonNormalizationTest, 
GroundTruth_QuoteNormalization_StillMoreDoubleInSingle) { - std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})"; - std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})"; + std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])"; + std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization6) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleInSingleAndViceVersa) { - std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])"; - std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])"; + std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])"; + std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])"; + run_test(input, output); +} + +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleAndSingleInSingle) +{ + std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})"; + std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization7) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_EscapedSingleInDouble) { std::string input = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; - std::string output = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])"; + std::string output = R"(["\t","\\t","\\","\\'\"\\\\","\n","\b"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization8) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_EscapedDoubleInSingle) { - std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])"; - std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])"; + std::string input = R"(["\t","\\t","\\",'\\\'\"\\\\',"\n","\b"])"; + std::string output = R"(["\t","\\t","\\","\\'\"\\\\","\n","\b"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid1) +TEST_F(JsonNormalizationTest, 
GroundTruth_QuoteNormalization_Invalid_MismatchedQuotes) { std::string input = R"(["THIS IS A TEST'])"; std::string output = R"(["THIS IS A TEST'])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid2) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MismatchedQuotesEscapedOutput) { std::string input = R"(['THIS IS A TEST"])"; std::string output = R"(["THIS IS A TEST\"])"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid3) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MoreMismatchedQuotes) { std::string input = R"({"MORE TEST'N":'RESUL})"; std::string output = R"({"MORE TEST'N":"RESUL})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid4) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_NoEndQuote) { std::string input = R"({"NUMBER":100'0,'STRING':'SOMETHING'})"; std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid5) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_InvalidJSON) { std::string input = R"({'NUMBER':100"0,"STRING":"SOMETHING"})"; std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid6) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBackslash) { std::string input = R"({'a':'\\''})"; std::string output = R"({"a":"\\""})"; run_test(input, output); } -TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid7) +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBraces) { std::string input = R"(}'a': 'b'{)"; std::string output = R"(}"a": "b"{)"; diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 200c58bb9aa..ffa672fb564 
100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -719,6 +719,64 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) EXPECT_EQ(ph.data_page_header.num_values, num_rows); } +TEST_F(ParquetWriterTest, Decimal32Stats) +{ + // check that decimal32 min and max statistics are written properly + std::vector expected_min{0, 0, 0xb2, 0xa1}; + std::vector expected_max{0xb2, 0xa1, 0, 0}; + + int32_t val0 = 0xa1b2; + int32_t val1 = val0 << 16; + column_wrapper col0{{numeric::decimal32(val0, numeric::scale_type{0}), + numeric::decimal32(val1, numeric::scale_type{0})}}; + + auto expected = table_view{{col0}}; + + auto const filepath = temp_env->get_temp_filepath("Decimal32Stats.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + + auto const stats = get_statistics(fmd.row_groups[0].columns[0]); + + EXPECT_EQ(expected_min, stats.min_value); + EXPECT_EQ(expected_max, stats.max_value); +} + +TEST_F(ParquetWriterTest, Decimal64Stats) +{ + // check that decimal64 min and max statistics are written properly + std::vector expected_min{0, 0, 0, 0, 0xd4, 0xc3, 0xb2, 0xa1}; + std::vector expected_max{0xd4, 0xc3, 0xb2, 0xa1, 0, 0, 0, 0}; + + int64_t val0 = 0xa1b2'c3d4UL; + int64_t val1 = val0 << 32; + column_wrapper col0{{numeric::decimal64(val0, numeric::scale_type{0}), + numeric::decimal64(val1, numeric::scale_type{0})}}; + + auto expected = table_view{{col0}}; + + auto const filepath = temp_env->get_temp_filepath("Decimal64Stats.parquet"); + const cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_parquet(out_opts); + + auto const source = 
cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + + read_footer(source, &fmd); + + auto const stats = get_statistics(fmd.row_groups[0].columns[0]); + + EXPECT_EQ(expected_min, stats.min_value); + EXPECT_EQ(expected_max, stats.max_value); +} + TEST_F(ParquetWriterTest, Decimal128Stats) { // check that decimal128 min and max statistics are written in network byte order