Skip to content

Commit

Permalink
Merge branch 'branch-24.04' into lumpy_strings
Browse files Browse the repository at this point in the history
  • Loading branch information
ttnghia authored Mar 19, 2024
2 parents bd87566 + 4a5fab7 commit 9e80e84
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 44 deletions.
41 changes: 26 additions & 15 deletions cpp/src/io/json/json_normalization.cu
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,11 @@ struct TransduceToNormalizedQuotes {
// SQS | {'} -> {"}
// SQS | {"} -> {\"}
// SQS | {\} -> <nop>
// DQS | {\} -> <nop>
// SEC | {'} -> {'}
// SEC | Sigma\{'} -> {\*}
// DEC | {'} -> {'}
// DEC | Sigma\{'} -> {\*}

// Whether this transition translates to the escape sequence: \"
bool const outputs_escape_sequence =
Expand All @@ -119,20 +122,23 @@ struct TransduceToNormalizedQuotes {
return '"';
}
// Case when the read symbol is an escape character - the actual translation for \<s> for some
// symbol <s> is handled by transitions from SEC. For now, there is no output for this
// transition
if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::ESCAPE_CHAR)) &&
((state_id == static_cast<StateT>(dfa_states::TT_SQS)))) {
// symbol <s> is handled by transitions from SEC. The same logic applies for the transition from
// DEC. For now, there is no output for this transition
if (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::ESCAPE_CHAR) &&
(state_id == static_cast<StateT>(dfa_states::TT_SQS) ||
state_id == static_cast<StateT>(dfa_states::TT_DQS))) {
return 0;
}
// Case when an escaped single quote in an input single-quoted string needs to be replaced by an
// unescaped single quote
if ((match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR)) &&
((state_id == static_cast<StateT>(dfa_states::TT_SEC)))) {
// Case when an escaped single quote in an input single-quoted or double-quoted string needs
// to be replaced by an unescaped single quote
if (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR) &&
(state_id == static_cast<StateT>(dfa_states::TT_SEC) ||
state_id == static_cast<StateT>(dfa_states::TT_DEC))) {
return '\'';
}
// Case when an escaped symbol <s> that is not a single-quote needs to be replaced with \<s>
if (state_id == static_cast<StateT>(dfa_states::TT_SEC)) {
if (state_id == static_cast<StateT>(dfa_states::TT_SEC) ||
state_id == static_cast<StateT>(dfa_states::TT_DEC)) {
return (relative_offset == 0) ? '\\' : read_symbol;
}
// In all other cases we simply output the input symbol
Expand All @@ -156,18 +162,23 @@ struct TransduceToNormalizedQuotes {
(match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR));
// Number of characters to output on this transition
if (sqs_outputs_escape_sequence) { return 2; }

// Whether this transition translates to the escape sequence \<s> or unescaped '
bool const sec_outputs_escape_sequence =
(state_id == static_cast<StateT>(dfa_states::TT_SEC)) &&
bool const sec_dec_outputs_escape_sequence =
(state_id == static_cast<StateT>(dfa_states::TT_SEC) ||
state_id == static_cast<StateT>(dfa_states::TT_DEC)) &&
(match_id != static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR));
// Number of characters to output on this transition
if (sec_outputs_escape_sequence) { return 2; }
if (sec_dec_outputs_escape_sequence) { return 2; }

// Whether this transition translates to no output <nop>
bool const sqs_outputs_nop =
(state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
bool const sqs_dqs_outputs_nop =
(state_id == static_cast<StateT>(dfa_states::TT_SQS) ||
state_id == static_cast<StateT>(dfa_states::TT_DQS)) &&
(match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::ESCAPE_CHAR));
// Number of characters to output on this transition
if (sqs_outputs_nop) { return 0; }
if (sqs_dqs_outputs_nop) { return 0; }

return 1;
}
};
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/page_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2896,9 +2896,9 @@ __device__ std::pair<void const*, uint32_t> get_extremum(statistics_val const* s
return {scratch, sizeof(float)};
}
case dtype_int64:
case dtype_decimal64:
case dtype_timestamp64:
case dtype_float64: return {stats_val, sizeof(int64_t)};
case dtype_decimal64:
case dtype_decimal128:
byte_reverse128(stats_val->d128_val, scratch);
return {scratch, sizeof(__int128_t)};
Expand Down
63 changes: 35 additions & 28 deletions cpp/tests/io/json_quote_normalization_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,106 +60,113 @@ void run_test(const std::string& host_input, const std::string& expected_host_ou
preprocessed_host_output, expected_host_output, preprocessed_host_output.size());
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization1)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Single)
{
std::string input = R"({"A":'TEST"'})";
std::string output = R"({"A":"TEST\""})";
std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])";
std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization2)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_MoreSingle)
{
std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])";
std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])";
std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])";
std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization3)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleInSingle)
{
std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])";
std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])";
std::string input = R"({"A":'TEST"'})";
std::string output = R"({"A":"TEST\""})";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization4)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_MoreDoubleInSingle)
{
  // A single-quoted value made up solely of double quotes: the expected text has the enclosing
  // quotes switched to double quotes and every inner double quote escaped as \". The
  // double-quoted key, apostrophes included, is expected to come through unchanged.
  std::string const raw_json =
    R"({"ain't ain't a word and you ain't supposed to say it":'"""""""""""'})";
  std::string const normalized_json =
    R"({"ain't ain't a word and you ain't supposed to say it":"\"\"\"\"\"\"\"\"\"\"\""})";
  run_test(raw_json, normalized_json);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization5)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_StillMoreDoubleInSingle)
{
std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})";
std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})";
std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])";
std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization6)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleInSingleAndViceVersa)
{
std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])";
std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])";
std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])";
std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_DoubleAndSingleInSingle)
{
std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})";
std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization7)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_EscapedSingleInDouble)
{
std::string input = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])";
std::string output = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])";
std::string output = R"(["\t","\\t","\\","\\'\"\\\\","\n","\b"])";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization8)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_EscapedDoubleInSingle)
{
std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])";
std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])";
std::string input = R"(["\t","\\t","\\",'\\\'\"\\\\',"\n","\b"])";
std::string output = R"(["\t","\\t","\\","\\'\"\\\\","\n","\b"])";
run_test(input, output);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid1)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MismatchedQuotes)
{
  // The string opens with a double quote but "closes" with a single quote. The expected text is
  // identical to the input, i.e. the stray single quote is left as-is.
  std::string const raw_json        = R"(["THIS IS A TEST'])";
  std::string const normalized_json = R"(["THIS IS A TEST'])";
  run_test(raw_json, normalized_json);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid2)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MismatchedQuotesEscapedOutput)
{
  // The string opens with a single quote but "closes" with a double quote. The expected text has
  // the opening quote switched to a double quote and the trailing double quote escaped as \".
  std::string const raw_json        = R"(['THIS IS A TEST"])";
  std::string const normalized_json = R"(["THIS IS A TEST\"])";
  run_test(raw_json, normalized_json);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid3)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_MoreMismatchedQuotes)
{
  // The value starts with a single quote that is never closed. The expected text only differs in
  // that opening quote, which becomes a double quote; the apostrophe inside the double-quoted key
  // is expected to stay untouched.
  std::string const raw_json        = R"({"MORE TEST'N":'RESUL})";
  std::string const normalized_json = R"({"MORE TEST'N":"RESUL})";
  run_test(raw_json, normalized_json);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid4)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_NoEndQuote)
{
  // A single quote appears in the middle of a number. The expected text has every single quote
  // in the input replaced by a double quote, including that stray one.
  std::string const raw_json        = R"({"NUMBER":100'0,'STRING':'SOMETHING'})";
  std::string const normalized_json = R"({"NUMBER":100"0,"STRING":"SOMETHING"})";
  run_test(raw_json, normalized_json);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid5)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_InvalidJSON)
{
  // A double quote appears in the middle of a number. The expected text only differs in the
  // single-quoted key, whose quotes become double quotes; the stray double quote is kept.
  std::string const raw_json        = R"({'NUMBER':100"0,"STRING":"SOMETHING"})";
  std::string const normalized_json = R"({"NUMBER":100"0,"STRING":"SOMETHING"})";
  run_test(raw_json, normalized_json);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid6)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBackslash)
{
  // The value holds an escaped backslash followed by one single quote too many. The expected
  // text keeps the escaped backslash and has every single quote replaced by a double quote.
  std::string const raw_json        = R"({'a':'\\''})";
  std::string const normalized_json = R"({"a":"\\""})";
  run_test(raw_json, normalized_json);
}

TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid7)
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBraces)
{
std::string input = R"(}'a': 'b'{)";
std::string output = R"(}"a": "b"{)";
Expand Down
58 changes: 58 additions & 0 deletions cpp/tests/io/parquet_writer_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -719,6 +719,64 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall)
EXPECT_EQ(ph.data_page_header.num_values, num_rows);
}

TEST_F(ParquetWriterTest, Decimal32Stats)
{
  // Check that decimal32 min and max statistics are written properly.
  //
  // val1 carries val0's bit pattern in its upper 16 bits, which sets the sign bit; on
  // two's-complement targets it is therefore negative and is the column minimum, while val0 is
  // the maximum. The expected vectors are those two values laid out least-significant byte first.
  std::vector<uint8_t> expected_min{0, 0, 0xb2, 0xa1};
  std::vector<uint8_t> expected_max{0xb2, 0xa1, 0, 0};

  int32_t val0 = 0xa1b2;
  // Shift in the unsigned domain so that moving a bit into the sign position is explicit rather
  // than relying on signed left-shift semantics.
  int32_t val1 = static_cast<int32_t>(static_cast<uint32_t>(val0) << 16);
  column_wrapper<numeric::decimal32> col0{{numeric::decimal32(val0, numeric::scale_type{0}),
                                           numeric::decimal32(val1, numeric::scale_type{0})}};

  auto const expected = table_view{{col0}};

  // Write the single-column table to a temporary Parquet file.
  auto const filepath = temp_env->get_temp_filepath("Decimal32Stats.parquet");
  cudf::io::parquet_writer_options const out_opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected);
  cudf::io::write_parquet(out_opts);

  // Read the file footer back and pull the statistics of the first column chunk.
  auto const source = cudf::io::datasource::create(filepath);
  cudf::io::parquet::detail::FileMetaData fmd;

  read_footer(source, &fmd);

  auto const stats = get_statistics(fmd.row_groups[0].columns[0]);

  EXPECT_EQ(expected_min, stats.min_value);
  EXPECT_EQ(expected_max, stats.max_value);
}

TEST_F(ParquetWriterTest, Decimal64Stats)
{
  // Check that decimal64 min and max statistics are written properly.
  //
  // val1 carries val0's bit pattern in its upper 32 bits, which sets the sign bit; on
  // two's-complement targets it is therefore negative and is the column minimum, while val0 is
  // the maximum. The expected vectors are those two values laid out least-significant byte first.
  std::vector<uint8_t> expected_min{0, 0, 0, 0, 0xd4, 0xc3, 0xb2, 0xa1};
  std::vector<uint8_t> expected_max{0xd4, 0xc3, 0xb2, 0xa1, 0, 0, 0, 0};

  int64_t val0 = 0xa1b2'c3d4UL;
  // Shift in the unsigned domain so that moving a bit into the sign position is explicit rather
  // than relying on signed left-shift semantics.
  int64_t val1 = static_cast<int64_t>(static_cast<uint64_t>(val0) << 32);
  column_wrapper<numeric::decimal64> col0{{numeric::decimal64(val0, numeric::scale_type{0}),
                                           numeric::decimal64(val1, numeric::scale_type{0})}};

  auto const expected = table_view{{col0}};

  // Write the single-column table to a temporary Parquet file.
  auto const filepath = temp_env->get_temp_filepath("Decimal64Stats.parquet");
  cudf::io::parquet_writer_options const out_opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected);
  cudf::io::write_parquet(out_opts);

  // Read the file footer back and pull the statistics of the first column chunk.
  auto const source = cudf::io::datasource::create(filepath);
  cudf::io::parquet::detail::FileMetaData fmd;

  read_footer(source, &fmd);

  auto const stats = get_statistics(fmd.row_groups[0].columns[0]);

  EXPECT_EQ(expected_min, stats.min_value);
  EXPECT_EQ(expected_max, stats.max_value);
}

TEST_F(ParquetWriterTest, Decimal128Stats)
{
// check that decimal128 min and max statistics are written in network byte order
Expand Down

0 comments on commit 9e80e84

Please sign in to comment.