From b614fe82662840af0adf7ded71aa698a5718941e Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 10 Aug 2023 15:51:04 -0700 Subject: [PATCH 01/25] sum w/o minmax; double sum, string sum --- cpp/src/io/orc/stats_enc.cu | 36 ++++++++++++------- .../io/statistics/typed_statistics_chunk.cuh | 5 +-- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 8fada7d5d72..d14708268a8 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -265,11 +265,14 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional double maximum = 2; // optional double sum = 3; // } - if (s->chunk.has_minmax) { + if (s->chunk.has_minmax || s->chunk.has_sum) { *cur = 3 * 8 + ProtofType::FIXEDLEN; cur += 2; - cur = pb_put_fixed64(cur, 1, &s->chunk.min_value.fp_val); - cur = pb_put_fixed64(cur, 2, &s->chunk.max_value.fp_val); + if (s->chunk.has_minmax) { + cur = pb_put_fixed64(cur, 1, &s->chunk.min_value.fp_val); + cur = pb_put_fixed64(cur, 2, &s->chunk.max_value.fp_val); + } + if (s->chunk.has_sum) { cur = pb_put_fixed64(cur, 3, &s->chunk.sum.fp_val); } fld_start[1] = cur - (fld_start + 2); } break; @@ -280,18 +283,25 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional string maximum = 2; // optional sint64 sum = 3; // sum will store the total length of all strings // } - if (s->chunk.has_minmax && s->chunk.has_sum) { - uint32_t sz = (pb_put_int(cur, 3, s->chunk.sum.i_val) - cur) + - (pb_put_uint(cur, 1, s->chunk.min_value.str_val.length) - cur) + - (pb_put_uint(cur, 2, s->chunk.max_value.str_val.length) - cur) + - s->chunk.min_value.str_val.length + s->chunk.max_value.str_val.length; + if (s->chunk.has_minmax || s->chunk.has_sum) { + uint32_t sz = 0; + if (s->chunk.has_minmax) { + sz += (pb_put_uint(cur, 1, s->chunk.min_value.str_val.length) - cur) + + (pb_put_uint(cur, 2, s->chunk.max_value.str_val.length) - cur) + + s->chunk.min_value.str_val.length + s->chunk.max_value.str_val.length; + } + if (s->chunk.has_sum) { sz += pb_put_int(cur, 3, s->chunk.sum.i_val) - cur; } + cur[0] = 4 * 8 + ProtofType::FIXEDLEN; cur = pb_encode_uint(cur + 1, sz); - cur = pb_put_binary( - cur, 1, s->chunk.min_value.str_val.ptr, s->chunk.min_value.str_val.length); - cur = pb_put_binary( - cur, 2, s->chunk.max_value.str_val.ptr, s->chunk.max_value.str_val.length); - cur = pb_put_int(cur, 3, s->chunk.sum.i_val); + + if (s->chunk.has_minmax) { + cur = pb_put_binary( + cur, 1, s->chunk.min_value.str_val.ptr, s->chunk.min_value.str_val.length); + cur = pb_put_binary( + cur, 2, s->chunk.max_value.str_val.ptr, s->chunk.max_value.str_val.length); + } + if (s->chunk.has_sum) { cur = pb_put_int(cur, 3, s->chunk.sum.i_val); } } break; case dtype_bool: diff --git a/cpp/src/io/statistics/typed_statistics_chunk.cuh b/cpp/src/io/statistics/typed_statistics_chunk.cuh index d007209a12a..cd784247ebf 100644 --- a/cpp/src/io/statistics/typed_statistics_chunk.cuh +++ b/cpp/src/io/statistics/typed_statistics_chunk.cuh @@ -103,7 +103,8 @@ class union_member { * @tparam is_aggregation_supported Set to true if input type is meant to be aggregated */ template -struct typed_statistics_chunk {}; +struct typed_statistics_chunk { +}; template struct typed_statistics_chunk { @@ -244,9 +245,9 @@ get_untyped_chunk(typed_statistics_chunk const& chunk) stat.null_count = chunk.null_count; stat.has_minmax = chunk.has_minmax; stat.has_sum = [&]() { - if (!chunk.has_minmax) return false; // invalidate the sum if overflow or underflow is possible if constexpr (std::is_floating_point_v or std::is_integral_v) { + if (!chunk.has_minmax) { return true; } return std::numeric_limits::max() / chunk.non_nulls >= static_cast(chunk.maximum_value) and std::numeric_limits::lowest() / chunk.non_nulls <= From 9a7988a79966356cc23eaa6e4210f9319fbf63f7 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 10 Aug 2023 16:29:33 -0700 Subject: [PATCH 02/25] write hasNull! --- cpp/src/io/orc/stats_enc.cu | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index d14708268a8..4d96c69c13b 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -228,12 +228,14 @@ __global__ void __launch_bounds__(encode_threads_per_block) // Encode and update actual bfr size if (idx < statistics_count && t == 0) { - s->chunk = chunks[idx]; - s->group = groups[idx]; - s->stats_dtype = s->group.stats_dtype; - s->base = blob_bfr + s->group.start_chunk; - s->end = blob_bfr + s->group.start_chunk + s->group.num_chunks; - uint8_t* cur = pb_put_uint(s->base, 1, s->chunk.non_nulls); + s->chunk = chunks[idx]; + s->group = groups[idx]; + s->stats_dtype = s->group.stats_dtype; + s->base = blob_bfr + s->group.start_chunk; + s->end = blob_bfr + s->group.start_chunk + s->group.num_chunks; + uint8_t* cur = pb_put_uint(s->base, 1, s->chunk.non_nulls); + cur = pb_put_uint(cur, 10, s->chunk.null_count != 0); // hasNull (bool) + uint8_t* fld_start = cur; switch (s->stats_dtype) { case dtype_int8: From 70c3b28f239ca59fadb972bcb439b44249407ef2 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 10 Aug 2023 16:29:50 -0700 Subject: [PATCH 03/25] tests --- cpp/tests/io/orc_test.cpp | 49 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index cff7b1cf081..f4d646fde47 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1010,6 +1010,7 @@ TEST_F(OrcStatisticsTest, Basic) auto& s1 = stats[1]; EXPECT_EQ(*s1.number_of_values, 4ul); + EXPECT_TRUE(*s1.has_null); auto& ts1 = std::get(s1.type_specific_stats); EXPECT_EQ(*ts1.minimum, 1); EXPECT_EQ(*ts1.maximum, 7); @@ -1017,14 +1018,15 @@ TEST_F(OrcStatisticsTest, Basic) auto& s2 = stats[2]; EXPECT_EQ(*s2.number_of_values, 4ul); + EXPECT_TRUE(*s2.has_null); auto& ts2 = std::get(s2.type_specific_stats); EXPECT_EQ(*ts2.minimum, 1.); EXPECT_EQ(*ts2.maximum, 7.); - // No sum ATM, filed #7087 - ASSERT_FALSE(ts2.sum); + EXPECT_EQ(*ts2.sum, 16.); auto& s3 = stats[3]; EXPECT_EQ(*s3.number_of_values, 9ul); + EXPECT_FALSE(*s3.has_null); auto& ts3 = std::get(s3.type_specific_stats); EXPECT_EQ(*ts3.minimum, "Friday"); EXPECT_EQ(*ts3.maximum, "Wednesday"); @@ -1032,10 +1034,12 @@ TEST_F(OrcStatisticsTest, Basic) auto& s4 = stats[4]; EXPECT_EQ(*s4.number_of_values, 9ul); + EXPECT_FALSE(*s4.has_null); EXPECT_EQ(std::get(s4.type_specific_stats).count[0], 8ul); auto& s5 = stats[5]; EXPECT_EQ(*s5.number_of_values, 4ul); + EXPECT_TRUE(*s5.has_null); auto& ts5 = std::get(s5.type_specific_stats); EXPECT_EQ(*ts5.minimum_utc, 1000); EXPECT_EQ(*ts5.maximum_utc, 7000); @@ -1861,4 +1865,45 @@ TEST_F(OrcWriterTest, EmptyChildStringColumn) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } +template +void check_all_null_stats(cudf::io::column_statistics const& stats) +{ + EXPECT_EQ(stats.number_of_values, 0); + EXPECT_TRUE(stats.has_null); + + auto const ts = std::get(stats.type_specific_stats); + EXPECT_FALSE(ts.minimum.has_value()); + EXPECT_FALSE(ts.maximum.has_value()); + EXPECT_TRUE(ts.sum.has_value()); + EXPECT_EQ(*ts.sum, 0); +} + +TEST_F(OrcStatisticsTest, AllNulls) +{ + bool all_null[] = {false, false, false}; // all null + + std::vector doubles{1.1, 2.2, 3.3}; + float64_col double_col(doubles.begin(), doubles.end(), all_null); + + std::vector ints{1, 2, 3}; + int32_col int_col(ints.begin(), ints.end(), all_null); + + std::vector strings{"1", "2", "3"}; + str_col string_col(strings.begin(), strings.end(), all_null); + + cudf::table_view expected({int_col, double_col, string_col}); + + std::vector out_buffer; + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected); + cudf::io::write_orc(out_opts); + + auto const stats = cudf::io::read_parsed_orc_statistics( + cudf::io::source_info{out_buffer.data(), out_buffer.size()}); + + check_all_null_stats(stats.file_stats[1]); + check_all_null_stats(stats.file_stats[2]); + check_all_null_stats(stats.file_stats[3]); +} + CUDF_TEST_PROGRAM_MAIN() From b39237605e4f57907b410130b410f1a2207a507a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 10 Aug 2023 16:30:52 -0700 Subject: [PATCH 04/25] style --- cpp/src/io/statistics/typed_statistics_chunk.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/io/statistics/typed_statistics_chunk.cuh b/cpp/src/io/statistics/typed_statistics_chunk.cuh index cd784247ebf..e6ec1471cb7 100644 --- a/cpp/src/io/statistics/typed_statistics_chunk.cuh +++ b/cpp/src/io/statistics/typed_statistics_chunk.cuh @@ -103,8 +103,7 @@ class union_member { * @tparam is_aggregation_supported Set to true if input type is meant to be aggregated */ template -struct typed_statistics_chunk { -}; +struct typed_statistics_chunk {}; template struct typed_statistics_chunk { From eb6cdae9f26b4f7c606f480a0773d39910ded5c2 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 10 Aug 2023 16:39:53 -0700 Subject: [PATCH 05/25] clean up --- cpp/src/io/orc/stats_enc.cu | 3 --- cpp/tests/io/orc_test.cpp | 5 ++--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 4d96c69c13b..8dc237dcefe 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -126,9 +126,6 @@ struct stats_state_s { statistics_chunk chunk; statistics_merge_group group; statistics_dtype stats_dtype; //!< Statistics data type for this column - // ORC stats - uint64_t numberOfValues; - uint8_t hasNull; }; /* diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index f4d646fde47..5f1eb59a8d8 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1263,9 +1263,8 @@ TEST_F(OrcStatisticsTest, Overflow) TEST_F(OrcStatisticsTest, HasNull) { - // cudf's ORC writer doesn't yet support the ability to encode the hasNull value in statistics so - // we're embedding a file created using pyorc - // + // This test can now be implemented with libcudf; keeping the pyorc version to keep the test + // inputs diversified // Method to create file: // >>> import pyorc // >>> output = open("./temp.orc", "wb") From 9fbe6c5809f577ce239e108f9a85c98fc2441470 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 14 Aug 2023 11:24:13 -0700 Subject: [PATCH 06/25] fix python tests --- python/cudf/cudf/tests/test_orc.py | 60 ++++++++++++++++++------------ 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index aafc8831bf4..07aa5430f4f 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -633,16 +633,19 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): for col in gdf: if "minimum" in file_stats[0][col]: stats_min = file_stats[0][col]["minimum"] - actual_min = gdf[col].min() - assert normalized_equals(actual_min, stats_min) + if stats_min is not None: + actual_min = gdf[col].min() + assert normalized_equals(actual_min, stats_min) if "maximum" in file_stats[0][col]: stats_max = file_stats[0][col]["maximum"] - actual_max = gdf[col].max() - assert normalized_equals(actual_max, stats_max) + if stats_max is not None: + actual_max = gdf[col].max() + assert normalized_equals(actual_max, stats_max) if "number_of_values" in file_stats[0][col]: stats_num_vals = file_stats[0][col]["number_of_values"] - actual_num_vals = gdf[col].count() - assert stats_num_vals == actual_num_vals + if stats_num_vals is not None: + actual_num_vals = gdf[col].count() + assert stats_num_vals == actual_num_vals # compare stripe statistics with actual min/max for stripe_idx in range(0, orc_file.nstripes): @@ -651,21 +654,24 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): stripe_df = cudf.DataFrame(stripe.to_pandas()) for col in stripe_df: if "minimum" in stripes_stats[stripe_idx][col]: - actual_min = stripe_df[col].min() stats_min = stripes_stats[stripe_idx][col]["minimum"] - assert normalized_equals(actual_min, stats_min) + if stats_min is not None: + actual_min = stripe_df[col].min() + assert normalized_equals(actual_min, stats_min) if "maximum" in stripes_stats[stripe_idx][col]: - actual_max = stripe_df[col].max() stats_max = stripes_stats[stripe_idx][col]["maximum"] - assert normalized_equals(actual_max, stats_max) + if stats_max is not None: + actual_max = stripe_df[col].max() + assert normalized_equals(actual_max, stats_max) if "number_of_values" in stripes_stats[stripe_idx][col]: stats_num_vals = stripes_stats[stripe_idx][col][ "number_of_values" ] - actual_num_vals = stripe_df[col].count() - assert stats_num_vals == actual_num_vals + if stats_num_vals is not None: + actual_num_vals = stripe_df[col].count() + assert stats_num_vals == actual_num_vals @pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) @@ -733,16 +739,19 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): for col in expect: if "minimum" in file_stats[0][col]: stats_min = file_stats[0][col]["minimum"] - actual_min = expect[col].min() - assert normalized_equals(actual_min, stats_min) + if stats_min is not None: + actual_min = expect[col].min() + assert normalized_equals(actual_min, stats_min) if "maximum" in file_stats[0][col]: stats_max = file_stats[0][col]["maximum"] - actual_max = expect[col].max() - assert normalized_equals(actual_max, stats_max) + if stats_max is not None: + actual_max = expect[col].max() + assert normalized_equals(actual_max, stats_max) if "number_of_values" in file_stats[0][col]: stats_num_vals = file_stats[0][col]["number_of_values"] - actual_num_vals = expect[col].count() - assert stats_num_vals == actual_num_vals + if stats_num_vals is not None: + actual_num_vals = expect[col].count() + assert stats_num_vals == actual_num_vals # compare stripe statistics with actual min/max for stripe_idx in range(0, orc_file.nstripes): @@ -751,21 +760,24 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): stripe_df = cudf.DataFrame(stripe.to_pandas()) for col in stripe_df: if "minimum" in stripes_stats[stripe_idx][col]: - actual_min = stripe_df[col].min() stats_min = stripes_stats[stripe_idx][col]["minimum"] - assert normalized_equals(actual_min, stats_min) + if stats_min is not None: + actual_min = stripe_df[col].min() + assert normalized_equals(actual_min, stats_min) if "maximum" in stripes_stats[stripe_idx][col]: - actual_max = stripe_df[col].max() stats_max = stripes_stats[stripe_idx][col]["maximum"] - assert normalized_equals(actual_max, stats_max) + if stats_max is not None: + actual_max = stripe_df[col].max() + assert normalized_equals(actual_max, stats_max) if "number_of_values" in stripes_stats[stripe_idx][col]: stats_num_vals = stripes_stats[stripe_idx][col][ "number_of_values" ] - actual_num_vals = stripe_df[col].count() - assert stats_num_vals == actual_num_vals + if stats_num_vals is not None: + actual_num_vals = stripe_df[col].count() + assert stats_num_vals == actual_num_vals @pytest.mark.parametrize("nrows", [1, 100, 6000000]) From d833ff09b0ade4e87259ee53208def018746862e Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 14 Aug 2023 11:24:29 -0700 Subject: [PATCH 07/25] remove incorrect bucket stats --- cpp/src/io/orc/stats_enc.cu | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 8dc237dcefe..6609a5db46b 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -308,11 +308,7 @@ __global__ void __launch_bounds__(encode_threads_per_block) // message BucketStatistics { // repeated uint64 count = 1 [packed=true]; // } - if (s->chunk.has_sum) { // Sum is equal to the number of 'true' values - cur[0] = 5 * 8 + ProtofType::FIXEDLEN; - cur = pb_put_packed_uint(cur + 2, 1, s->chunk.sum.u_val); - fld_start[1] = cur - (fld_start + 2); - } + // Not implemented, see https://github.com/rapidsai/cudf/issues/7087 break; case dtype_decimal64: case dtype_decimal128: From 1db56c58ea765b944c08dbac711152f87f170d6f Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 14 Aug 2023 14:34:05 -0700 Subject: [PATCH 08/25] remove bool column from C++ stats tests --- cpp/tests/io/orc_test.cpp | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 5f1eb59a8d8..e07d67fe119 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -987,10 +987,9 @@ TEST_F(OrcStatisticsTest, Basic) column_wrapper col2( sequence, sequence + num_rows, validity); column_wrapper col3{strings.begin(), strings.end()}; - column_wrapper col4(sequence, sequence + num_rows); - column_wrapper col5( + column_wrapper col4( sequence, sequence + num_rows, validity); - table_view expected({col1, col2, col3, col4, col5}); + table_view expected({col1, col2, col3, col4}); auto filepath = temp_env->get_temp_filepath("OrcStatsMerge.orc"); @@ -1001,7 +1000,7 @@ TEST_F(OrcStatisticsTest, Basic) auto const stats = cudf::io::read_parsed_orc_statistics(cudf::io::source_info{filepath}); auto const expected_column_names = - std::vector{"", "_col0", "_col1", "_col2", "_col3", "_col4"}; + std::vector{"", "_col0", "_col1", "_col2", "_col3"}; EXPECT_EQ(stats.column_names, expected_column_names); auto validate_statistics = [&](std::vector const& stats) { @@ -1033,18 +1032,13 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*ts3.sum, 58ul); auto& s4 = stats[4]; - EXPECT_EQ(*s4.number_of_values, 9ul); - EXPECT_FALSE(*s4.has_null); - EXPECT_EQ(std::get(s4.type_specific_stats).count[0], 8ul); - - auto& s5 = stats[5]; - EXPECT_EQ(*s5.number_of_values, 4ul); - EXPECT_TRUE(*s5.has_null); - auto& ts5 = std::get(s5.type_specific_stats); - EXPECT_EQ(*ts5.minimum_utc, 1000); - EXPECT_EQ(*ts5.maximum_utc, 7000); - ASSERT_FALSE(ts5.minimum); - ASSERT_FALSE(ts5.maximum); + EXPECT_EQ(*s4.number_of_values, 4ul); + EXPECT_TRUE(*s4.has_null); + auto& ts4 = std::get(s4.type_specific_stats); + EXPECT_EQ(*ts4.minimum_utc, 1000); + EXPECT_EQ(*ts4.maximum_utc, 7000); + ASSERT_FALSE(ts4.minimum); + ASSERT_FALSE(ts4.maximum); }; validate_statistics(stats.file_stats); From 2f35d5a35951a5a2f7cbb72fc3df57dadc1327cd Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 21 Aug 2023 15:54:59 -0700 Subject: [PATCH 09/25] test clean up --- cpp/tests/io/orc_test.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index e07d67fe119..efe4a917400 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1873,16 +1873,9 @@ void check_all_null_stats(cudf::io::column_statistics const& stats) TEST_F(OrcStatisticsTest, AllNulls) { - bool all_null[] = {false, false, false}; // all null - - std::vector doubles{1.1, 2.2, 3.3}; - float64_col double_col(doubles.begin(), doubles.end(), all_null); - - std::vector ints{1, 2, 3}; - int32_col int_col(ints.begin(), ints.end(), all_null); - - std::vector strings{"1", "2", "3"}; - str_col string_col(strings.begin(), strings.end(), all_null); + float64_col double_col({0., 0., 0.}, cudf::test::iterators::all_nulls()); + int32_col int_col({0, 0, 0}, cudf::test::iterators::all_nulls()); + str_col string_col({"", "", ""}, cudf::test::iterators::all_nulls()); cudf::table_view expected({int_col, double_col, string_col}); From ee1347f77db7991d6b3a6445c269d1a842f345ec Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 24 Aug 2023 13:55:01 -0700 Subject: [PATCH 10/25] restore bool stats --- cpp/src/io/orc/stats_enc.cu | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 5ba2a66527d..8ff4468b728 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -308,7 +308,12 @@ __global__ void __launch_bounds__(encode_threads_per_block) // message BucketStatistics { // repeated uint64 count = 1 [packed=true]; // } - // Not implemented, see https://github.com/rapidsai/cudf/issues/7087 + if (s->chunk.has_sum) { + cur[0] = 5 * 8 + ProtofType::FIXEDLEN; + // count is equal to the number of 'true' values, despite what specs say + cur = pb_put_packed_uint(cur + 2, 1, s->chunk.sum.u_val); + fld_start[1] = cur - (fld_start + 2); + } break; case dtype_decimal64: case dtype_decimal128: From d7facdab1c8b8bb9d93f981e160a1c9e512533d1 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 24 Aug 2023 14:46:35 -0700 Subject: [PATCH 11/25] add bool test; fix docs --- cpp/include/cudf/io/orc_metadata.hpp | 4 ++-- cpp/tests/io/orc_test.cpp | 14 +++++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 623ee2e49fc..d225d04161b 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -111,10 +111,10 @@ struct string_statistics : minmax_statistics, sum_statistics count; ///< Count of `false` and `true` values + std::vector count; ///< number of `true` values }; /** diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index efe4a917400..63eeb4183cd 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -986,10 +986,11 @@ TEST_F(OrcStatisticsTest, Basic) sequence, sequence + num_rows, validity); column_wrapper col2( sequence, sequence + num_rows, validity); - column_wrapper col3{strings.begin(), strings.end()}; + str_col col3{strings.begin(), strings.end()}; column_wrapper col4( sequence, sequence + num_rows, validity); - table_view expected({col1, col2, col3, col4}); + bool_col col5({true, true, true, true, true, false, false, false, false}, validity); + table_view expected({col1, col2, col3, col4, col5}); auto filepath = temp_env->get_temp_filepath("OrcStatsMerge.orc"); @@ -1000,10 +1001,11 @@ TEST_F(OrcStatisticsTest, Basic) auto const stats = cudf::io::read_parsed_orc_statistics(cudf::io::source_info{filepath}); auto const expected_column_names = - std::vector{"", "_col0", "_col1", "_col2", "_col3"}; + std::vector{"", "_col0", "_col1", "_col2", "_col3", "_col4"}; EXPECT_EQ(stats.column_names, expected_column_names); auto validate_statistics = [&](std::vector const& stats) { + ASSERT_EQ(stats.size(), 6ul); auto& s0 = stats[0]; EXPECT_EQ(*s0.number_of_values, 9ul); @@ -1039,6 +1041,12 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*ts4.maximum_utc, 7000); ASSERT_FALSE(ts4.minimum); ASSERT_FALSE(ts4.maximum); + + auto& s5 = stats[5]; + EXPECT_EQ(*s5.number_of_values, 4ul); + EXPECT_TRUE(*s5.has_null); + auto& ts5 = std::get(s5.type_specific_stats); + EXPECT_EQ(ts5.count[0], 2); }; validate_statistics(stats.file_stats); From bfa0d8ba17f2b343cd2dbf1670c3717e8f9fb5ad Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 24 Aug 2023 14:56:28 -0700 Subject: [PATCH 12/25] add timestamp min/max --- cpp/src/io/orc/stats_enc.cu | 3 +++ cpp/tests/io/orc_test.cpp | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 8ff4468b728..38c4724e0b3 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -352,6 +352,9 @@ __global__ void __launch_bounds__(encode_threads_per_block) if (s->chunk.has_minmax) { cur[0] = 9 * 8 + ProtofType::FIXEDLEN; cur += 2; + // minimum/maximum are the same as minimumUtc/maximumUtc as we always write files in UTC + cur = pb_put_int(cur, 1, s->chunk.min_value.i_val); // minimum + cur = pb_put_int(cur, 2, s->chunk.max_value.i_val); // maximum cur = pb_put_int(cur, 3, s->chunk.min_value.i_val); // minimumUtc cur = pb_put_int(cur, 4, s->chunk.max_value.i_val); // maximumUtc fld_start[1] = cur - (fld_start + 2); diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 63eeb4183cd..86df456c7f4 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1037,10 +1037,10 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*s4.number_of_values, 4ul); EXPECT_TRUE(*s4.has_null); auto& ts4 = std::get(s4.type_specific_stats); + EXPECT_EQ(*ts4.minimum, 1000); + EXPECT_EQ(*ts4.maximum, 7000); EXPECT_EQ(*ts4.minimum_utc, 1000); EXPECT_EQ(*ts4.maximum_utc, 7000); - ASSERT_FALSE(ts4.minimum); - ASSERT_FALSE(ts4.maximum); auto& s5 = stats[5]; EXPECT_EQ(*s5.number_of_values, 4ul); From cb71541b2b0f57acd1900eed4afaf4b2557b15aa Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 24 Aug 2023 15:17:57 -0700 Subject: [PATCH 13/25] fix init buffersize --- cpp/src/io/orc/stats_enc.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 38c4724e0b3..e7154705576 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -87,11 +87,15 @@ __global__ void __launch_bounds__(block_size, 1) case dtype_int8: case dtype_int16: case dtype_int32: - case dtype_date32: case dtype_int64: - case dtype_timestamp64: stats_len = pb_fldlen_common + pb_fld_hdrlen + 3 * (pb_fld_hdrlen + pb_fldlen_int64); break; + case dtype_date32: + stats_len = pb_fldlen_common + pb_fld_hdrlen + 2 * (pb_fld_hdrlen + pb_fldlen_int64); + break; + case dtype_timestamp64: + stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64); + break; case dtype_float32: case dtype_float64: stats_len = pb_fldlen_common + pb_fld_hdrlen + 3 * (pb_fld_hdrlen + pb_fldlen_float64); From 227a9c17f5ee282fdb2947e54717c6128aaf67a0 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 25 Aug 2023 14:59:12 -0700 Subject: [PATCH 14/25] use int128 for all decimal columns --- cpp/src/io/orc/stats_enc.cu | 2 +- cpp/src/io/parquet/page_enc.cu | 4 ++-- cpp/src/io/statistics/statistics_type_identification.cuh | 5 ++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index e7154705576..256199ccc4d 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -327,7 +327,7 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional string maximum = 2; // optional string sum = 3; // } - if (s->chunk.has_minmax) { + if (s->chunk.has_minmax or s->chunk.has_sum) { // TODO: Decimal support (decimal min/max stored as strings) } break; diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index d066b454840..e0509aa62e4 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -1837,8 +1837,8 @@ __device__ std::pair get_extremum(statistics_val const* s } case dtype_int64: case dtype_timestamp64: - case dtype_float64: - case dtype_decimal64: return {stats_val, sizeof(int64_t)}; + case dtype_float64: return {stats_val, sizeof(int64_t)}; + case dtype_decimal64: case dtype_decimal128: byte_reverse128(stats_val->d128_val, scratch); return {scratch, sizeof(__int128_t)}; diff --git a/cpp/src/io/statistics/statistics_type_identification.cuh b/cpp/src/io/statistics/statistics_type_identification.cuh index 32931d7d34d..63bfdfaaf4e 100644 --- a/cpp/src/io/statistics/statistics_type_identification.cuh +++ b/cpp/src/io/statistics/statistics_type_identification.cuh @@ -125,7 +125,7 @@ class extrema_type { using non_arithmetic_extrema_type = typename std::conditional_t< cudf::is_fixed_point() or cudf::is_duration() or cudf::is_timestamp(), - typename std::conditional_t, __int128_t, int64_t>, + typename std::conditional_t(), __int128_t, int64_t>, typename std::conditional_t< std::is_same_v, string_view, @@ -134,8 +134,7 @@ class extrema_type { // unsigned int/bool -> uint64_t // signed int -> int64_t // float/double -> double - // decimal32/64 -> int64_t - // decimal128 -> __int128_t + // decimal32/64/128 -> __int128_t // duration_[T] -> int64_t // string_view -> string_view // byte_array_view -> byte_array_view From f2f60909a87820e1f3e64453ac48d5dc3ef6ca9c Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 29 Aug 2023 16:27:48 -0700 Subject: [PATCH 15/25] actual dec stats size --- cpp/src/io/orc/stats_enc.cu | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 256199ccc4d..8477cd4363a 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -16,9 +16,11 @@ #include "orc_gpu.hpp" -#include #include +#include +#include + #include namespace cudf { @@ -49,6 +51,21 @@ __global__ void __launch_bounds__(init_threads_per_block) } } +__device__ int32_t compute_output_size(__int128_t const& value, int32_t scale) +{ + if (scale >= 0) return strings::detail::count_digits(value) + scale; + + auto const abs_value = numeric::detail::abs(value); + auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); + auto const fraction = strings::detail::count_digits(abs_value % exp_ten); + auto const num_zeros = std::max(0, (-scale - fraction)); + return static_cast(value < 0) + // sign if negative + strings::detail::count_digits(abs_value / exp_ten) + // integer + 1 + // decimal point + num_zeros + // zeros padding + fraction; // size of fraction +} + /** * @brief Get the buffer size and offsets of encoded statistics * @@ -101,9 +118,14 @@ __global__ void __launch_bounds__(block_size, 1) stats_len = pb_fldlen_common + pb_fld_hdrlen + 3 * (pb_fld_hdrlen + pb_fldlen_float64); break; case dtype_decimal64: - case dtype_decimal128: - stats_len = pb_fldlen_common + pb_fld_hdrlen16 + 3 * (pb_fld_hdrlen + pb_fldlen_decimal); - break; + case dtype_decimal128: { + auto const scale = groups[idx].col_dtype.scale(); + auto const min_size = compute_output_size(chunks[idx].min_value.d128_val, scale)); + auto const max_size = compute_output_size(chunks[idx].max_value.d128_val, scale)); + auto const sum_size = compute_output_size(chunks[idx].sum.d128_val, scale)); + stats_len = + pb_fldlen_common + pb_fld_hdrlen16 + 3 * pb_fld_hdrlen + min_size + max_size + sum_size; + } break; case dtype_string: stats_len = pb_fldlen_common + pb_fld_hdrlen32 + 3 * (pb_fld_hdrlen + pb_fldlen_int64) + chunks[idx].min_value.str_val.length + chunks[idx].max_value.str_val.length; From 64636f923188aeb42c58cc604ec18d902b18f38f Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 30 Aug 2023 15:33:05 -0700 Subject: [PATCH 16/25] add decimal stats --- cpp/src/io/orc/stats_enc.cu | 79 ++++++++++++++++++++++++++++++++----- cpp/tests/io/orc_test.cpp | 19 +++++++-- 2 files changed, 86 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 8477cd4363a..cb7f1b56a75 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -66,6 +66,33 @@ __device__ int32_t compute_output_size(__int128_t const& value, int32_t scale) fraction; // size of fraction } +__device__ void decimal_to_string(__int128_t value, int32_t scale, char* out_ptr) +{ + if (scale >= 0) { + out_ptr += strings::detail::integer_to_string(value, out_ptr); + thrust::generate_n(thrust::seq, out_ptr, scale, []() { return '0'; }); // add zeros + return; + } + + // scale < 0 + // write format: [-]integer.fraction + // where integer = abs(value) / (10^abs(scale)) + // fraction = abs(value) % (10^abs(scale)) + if (value < 0) *out_ptr++ = '-'; // add sign + auto const abs_value = numeric::detail::abs(value); + auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); + auto const num_zeros = std::max(0, (-scale - strings::detail::count_digits(abs_value % exp_ten))); + + out_ptr += + strings::detail::integer_to_string(abs_value / exp_ten, out_ptr); // add the integer part + *out_ptr++ = '.'; // add decimal point + + thrust::generate_n(thrust::seq, out_ptr, num_zeros, []() { return '0'; }); // add zeros + out_ptr += num_zeros; + + strings::detail::integer_to_string(abs_value % exp_ten, out_ptr); // add the fraction part +} + /** * @brief Get the buffer size and offsets of encoded statistics * @@ -79,9 +106,10 @@ constexpr unsigned int pb_fld_hdrlen16 = 2; // > 127-byte length constexpr unsigned int pb_fld_hdrlen32 = 5; // > 16KB length constexpr unsigned int pb_fldlen_int64 = 10; constexpr unsigned int pb_fldlen_float64 = 8; -constexpr unsigned int pb_fldlen_decimal = 40; // Assume decimal2string fits in 40 characters constexpr unsigned int pb_fldlen_bucket1 = 1 + pb_fldlen_int64; -constexpr unsigned int pb_fldlen_common = 2 * pb_fld_hdrlen + pb_fldlen_int64; +// statistics field number + number of values + has null +constexpr unsigned int pb_fldlen_common = + pb_fld_hdrlen + (pb_fld_hdrlen + pb_fldlen_int64) + 2 * pb_fld_hdrlen; template __global__ void __launch_bounds__(block_size, 1) @@ -120,14 +148,14 @@ __global__ void __launch_bounds__(block_size, 1) case dtype_decimal64: case dtype_decimal128: { auto const scale = groups[idx].col_dtype.scale(); - auto const min_size = compute_output_size(chunks[idx].min_value.d128_val, scale)); - auto const max_size = compute_output_size(chunks[idx].max_value.d128_val, scale)); - auto const sum_size = compute_output_size(chunks[idx].sum.d128_val, scale)); - stats_len = - pb_fldlen_common + pb_fld_hdrlen16 + 3 * pb_fld_hdrlen + min_size + max_size + sum_size; + auto const min_size = compute_output_size(chunks[idx].min_value.d128_val, scale); + auto const max_size = compute_output_size(chunks[idx].max_value.d128_val, scale); + auto const sum_size = compute_output_size(chunks[idx].sum.d128_val, scale); + stats_len = pb_fldlen_common + pb_fld_hdrlen16 + 3 * (pb_fld_hdrlen + pb_fld_hdrlen16) + + min_size + max_size + sum_size; } break; case dtype_string: - stats_len = pb_fldlen_common + pb_fld_hdrlen32 + 3 * (pb_fld_hdrlen + pb_fldlen_int64) + + stats_len = pb_fldlen_common + pb_fld_hdrlen32 + 3 * (pb_fld_hdrlen + pb_fld_hdrlen32) + chunks[idx].min_value.str_val.length + chunks[idx].max_value.str_val.length; break; case dtype_none: stats_len = pb_fldlen_common; @@ -201,6 +229,15 @@ __device__ inline uint8_t* pb_put_binary(uint8_t* p, uint32_t id, void const* by return p + len; } +__device__ inline uint8_t* pb_put_decimal( + uint8_t* p, uint32_t id, __int128_t value, int32_t scale, int32_t len) +{ + p[0] = id * 8 + ProtofType::FIXEDLEN; + p = pb_encode_uint(p + 1, len); + decimal_to_string(value, scale, reinterpret_cast(p)); + return p + len; +} + // Protobuf field encoding for 64-bit raw encoding (double) __device__ inline uint8_t* pb_put_fixed64(uint8_t* p, uint32_t id, void const* raw64) { @@ -350,7 +387,31 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional string sum = 3; // } if (s->chunk.has_minmax or s->chunk.has_sum) { - // TODO: Decimal support (decimal min/max stored as strings) + auto const scale = s->group.col_dtype.scale(); + + uint32_t sz = 0; + auto const min_size = + s->chunk.has_minmax ? compute_output_size(s->chunk.min_value.d128_val, scale) : 0; + auto const max_size = + s->chunk.has_minmax ? compute_output_size(s->chunk.max_value.d128_val, scale) : 0; + if (s->chunk.has_minmax) { + // two bytes per string for length, plus the strings themselves + sz += 2 + min_size + 2 + max_size; + } + auto const sum_size = + s->chunk.has_sum ? compute_output_size(s->chunk.sum.d128_val, scale) : 0; + if (s->chunk.has_sum) { sz += 2 + sum_size; } + + cur[0] = 6 * 8 + ProtofType::FIXEDLEN; + cur = pb_encode_uint(cur + 1, sz); + + if (s->chunk.has_minmax) { + cur = pb_put_decimal(cur, 1, s->chunk.min_value.d128_val, scale, min_size); // minimum + cur = pb_put_decimal(cur, 2, s->chunk.max_value.d128_val, scale, max_size); // maximum + } + if (s->chunk.has_sum) { + cur = pb_put_decimal(cur, 3, s->chunk.sum.d128_val, scale, sum_size); // sum + } } break; case dtype_date32: diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 86df456c7f4..2535ee7f885 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -990,7 +990,12 @@ TEST_F(OrcStatisticsTest, Basic) column_wrapper col4( sequence, sequence + num_rows, validity); bool_col col5({true, true, true, true, true, false, false, false, false}, validity); - table_view expected({col1, col2, col3, col4, col5}); + + auto data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i * 1000; }); + cudf::test::fixed_point_column_wrapper col6( + data, data + num_rows, numeric::scale_type{-1}); + + table_view expected({col1, col2, col3, col4, col5, col6}); auto filepath = temp_env->get_temp_filepath("OrcStatsMerge.orc"); @@ -1001,11 +1006,11 @@ TEST_F(OrcStatisticsTest, Basic) auto const stats = cudf::io::read_parsed_orc_statistics(cudf::io::source_info{filepath}); auto const expected_column_names = - std::vector{"", "_col0", "_col1", "_col2", "_col3", "_col4"}; + std::vector{"", "_col0", "_col1", "_col2", "_col3", "_col4", "_col5"}; EXPECT_EQ(stats.column_names, expected_column_names); auto validate_statistics = [&](std::vector const& stats) { - ASSERT_EQ(stats.size(), 6ul); + ASSERT_EQ(stats.size(), 7ul); auto& s0 = stats[0]; EXPECT_EQ(*s0.number_of_values, 9ul); @@ -1047,6 +1052,14 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_TRUE(*s5.has_null); auto& ts5 = std::get(s5.type_specific_stats); EXPECT_EQ(ts5.count[0], 2); + + auto& s6 = stats[6]; + EXPECT_EQ(*s6.number_of_values, 9ul); + EXPECT_FALSE(*s6.has_null); + auto& ts6 = std::get(s6.type_specific_stats); + EXPECT_EQ(*ts6.minimum, "0.0"); + EXPECT_EQ(*ts6.maximum, "800.0"); + EXPECT_EQ(*ts6.sum, "3600.0"); }; validate_statistics(stats.file_stats); From e79496cc2b150808561a61007588692b2c0e93e0 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 31 Aug 2023 13:52:37 -0700 Subject: [PATCH 17/25] add timestamp nanoseconds --- cpp/include/cudf/io/orc_metadata.hpp | 6 +++-- cpp/src/io/orc/orc.cpp | 4 +++- cpp/src/io/orc/stats_enc.cu | 23 +++++++++++++++---- .../statistics_type_identification.cuh | 17 +++++++------- cpp/tests/io/orc_test.cpp | 16 ++++++++----- 5 files changed, 45 insertions(+), 21 deletions(-) diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index d225d04161b..e62bd6413b1 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -141,8 +141,10 @@ using binary_statistics = sum_statistics; * the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values adjusted to UTC. */ struct timestamp_statistics : minmax_statistics { - std::optional minimum_utc; ///< minimum in milliseconds - std::optional maximum_utc; ///< maximum in milliseconds + std::optional minimum_utc; ///< minimum in milliseconds + std::optional maximum_utc; ///< maximum in milliseconds + std::optional minimum_nanos; ///< nanoseconds part of the minimum + std::optional maximum_nanos; ///< nanoseconds part of the maximum }; namespace orc { diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index fc50b7118be..bc399b75ef9 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -178,7 +178,9 @@ void ProtobufReader::read(timestamp_statistics& s, size_t maxlen) auto op = std::tuple(field_reader(1, s.minimum), field_reader(2, s.maximum), field_reader(3, s.minimum_utc), - field_reader(4, s.maximum_utc)); + field_reader(4, s.maximum_utc), + field_reader(5, s.minimum_nanos), + field_reader(6, s.maximum_nanos)); function_builder(s, maxlen, op); } diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index cb7f1b56a75..b18aa8781c6 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -246,6 +246,14 @@ __device__ inline uint8_t* pb_put_fixed64(uint8_t* p, uint32_t id, void const* r return p + 9; } +__device__ std::pair split_nanosecond_timestamp(int64_t nano_count) +{ + auto const ns = cuda::std::chrono::nanoseconds(nano_count); + auto const ms_floor = cuda::std::chrono::floor(ns); + auto const ns_remainder = ns - ms_floor; + return {ms_floor.count(), ns_remainder.count()}; +} + /** * @brief Encode statistics in ORC protobuf format * @@ -435,15 +443,22 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional sint64 maximum = 2; // optional sint64 minimumUtc = 3; // min,max values saved as milliseconds since UNIX epoch // optional sint64 maximumUtc = 4; + // optional int32 minimumNanos = 5; // lower 6 TS digits for min/max to achieve nanosecond + // precision optional int32 maximumNanos = 6; // } if (s->chunk.has_minmax) { cur[0] = 9 * 8 + ProtofType::FIXEDLEN; cur += 2; + auto [min_ms, min_ns_remainder] = split_nanosecond_timestamp(s->chunk.min_value.i_val); + auto [max_ms, max_ns_remainder] = split_nanosecond_timestamp(s->chunk.max_value.i_val); + // minimum/maximum are the same as minimumUtc/maximumUtc as we always write files in UTC - cur = pb_put_int(cur, 1, s->chunk.min_value.i_val); // minimum - cur = pb_put_int(cur, 2, s->chunk.max_value.i_val); // maximum - cur = pb_put_int(cur, 3, s->chunk.min_value.i_val); // minimumUtc - cur = pb_put_int(cur, 4, s->chunk.max_value.i_val); // maximumUtc + cur = pb_put_int(cur, 1, min_ms); // minimum + cur = pb_put_int(cur, 2, max_ms); // maximum + cur = pb_put_int(cur, 3, min_ms); // minimumUtc + cur = pb_put_int(cur, 4, max_ms); // maximumUtc + cur = pb_put_int(cur, 5, min_ns_remainder); // minimumNanos + cur = pb_put_int(cur, 6, max_ns_remainder); // maximumNanos fld_start[1] = cur - (fld_start + 2); } break; diff --git a/cpp/src/io/statistics/statistics_type_identification.cuh b/cpp/src/io/statistics/statistics_type_identification.cuh index 63bfdfaaf4e..9fda971063c 100644 --- a/cpp/src/io/statistics/statistics_type_identification.cuh +++ b/cpp/src/io/statistics/statistics_type_identification.cuh @@ -49,15 +49,15 @@ enum class is_int96_timestamp { YES, NO }; template struct conversion_map; -// Every timestamp or duration type is converted to milliseconds in ORC statistics +// Every timestamp or duration type is converted to nanoseconds in ORC statistics template struct conversion_map { - using types = std::tuple, - std::pair, - std::pair, - std::pair, - std::pair, - std::pair>; + using types = std::tuple, + std::pair, + std::pair, + std::pair, + std::pair, + std::pair>; }; // In Parquet timestamps and durations with second resolution are converted to @@ -108,7 +108,8 @@ class type_conversion { }; template -struct dependent_false : std::false_type {}; +struct dependent_false : std::false_type { +}; /** * @brief Utility class to convert a leaf column element into its extrema type diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 2535ee7f885..85dff735145 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -976,6 +976,8 @@ TEST_F(OrcReaderTest, CombinedSkipRowTest) TEST_F(OrcStatisticsTest, Basic) { auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + auto ns_sequence = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i - 4) * 1000002; }); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); std::vector strings{ @@ -987,8 +989,8 @@ TEST_F(OrcStatisticsTest, Basic) column_wrapper col2( sequence, sequence + num_rows, validity); str_col col3{strings.begin(), strings.end()}; - column_wrapper col4( - sequence, sequence + num_rows, validity); + column_wrapper col4( + ns_sequence, ns_sequence + num_rows, validity); bool_col col5({true, true, true, true, true, false, false, false, false}, validity); auto data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i * 1000; }); @@ -1042,10 +1044,12 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*s4.number_of_values, 4ul); EXPECT_TRUE(*s4.has_null); auto& ts4 = std::get(s4.type_specific_stats); - EXPECT_EQ(*ts4.minimum, 1000); - EXPECT_EQ(*ts4.maximum, 7000); - EXPECT_EQ(*ts4.minimum_utc, 1000); - EXPECT_EQ(*ts4.maximum_utc, 7000); + EXPECT_EQ(*ts4.minimum, -4); + EXPECT_EQ(*ts4.maximum, 3); + EXPECT_EQ(*ts4.minimum_utc, -4); + EXPECT_EQ(*ts4.maximum_utc, 3); + EXPECT_EQ(*ts4.minimum_nanos, 999994); + EXPECT_EQ(*ts4.maximum_nanos, 6); auto& s5 = stats[5]; EXPECT_EQ(*s5.number_of_values, 4ul); From b5dbea18a52eb743f12c743bae09db5cde6a8bb0 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 31 Aug 2023 14:49:56 -0700 Subject: [PATCH 18/25] de-duplicate decimal to string code --- .../detail/convert/fixed_point_to_string.cuh | 63 +++++++++++++++++ cpp/src/io/orc/stats_enc.cu | 70 ++++--------------- .../strings/convert/convert_fixed_point.cu | 54 ++------------ 3 files changed, 81 insertions(+), 106 deletions(-) create mode 100644 cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh new file mode 100644 index 00000000000..624af7ef9f6 --- /dev/null +++ b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cudf::strings::detail { + +__device__ inline int32_t fixed_point_string_size(__int128_t const& value, int32_t scale) +{ + if (scale >= 0) return count_digits(value) + scale; + + auto const abs_value = numeric::detail::abs(value); + auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); + auto const fraction = count_digits(abs_value % exp_ten); + auto const num_zeros = std::max(0, (-scale - fraction)); + return static_cast(value < 0) + // sign if negative + count_digits(abs_value / exp_ten) + // integer + 1 + // decimal point + num_zeros + // zeros padding + fraction; // size of fraction +} + +__device__ inline void fixed_point_to_string(__int128_t value, int32_t scale, char* out_ptr) +{ + if (scale >= 0) { + out_ptr += integer_to_string(value, out_ptr); + thrust::generate_n(thrust::seq, out_ptr, scale, []() { return '0'; }); // add zeros + return; + } + + // scale < 0 + // write format: [-]integer.fraction + // where integer = abs(value) / (10^abs(scale)) + // fraction = abs(value) % (10^abs(scale)) + if (value < 0) *out_ptr++ = '-'; // add sign + auto const abs_value = numeric::detail::abs(value); + auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); + auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); + + out_ptr += integer_to_string(abs_value / exp_ten, out_ptr); // add the integer part + *out_ptr++ = '.'; // add decimal point + + thrust::generate_n(thrust::seq, out_ptr, num_zeros, []() { return '0'; }); // add zeros + out_ptr += num_zeros; + + integer_to_string(abs_value % exp_ten, out_ptr); // add the fraction part +} + +} // namespace cudf::strings::detail \ No newline at end of file diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index b18aa8781c6..2c8c4a5cde9 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -19,14 +19,13 @@ #include #include -#include +#include #include -namespace cudf { -namespace io { -namespace orc { -namespace gpu { +namespace cudf::io::orc::gpu { + +using strings::detail::fixed_point_string_size; constexpr unsigned int init_threads_per_group = 32; constexpr unsigned int init_groups_per_block = 4; @@ -51,48 +50,6 @@ __global__ void __launch_bounds__(init_threads_per_block) } } -__device__ int32_t compute_output_size(__int128_t const& value, int32_t scale) -{ - if (scale >= 0) return strings::detail::count_digits(value) + scale; - - auto const abs_value = numeric::detail::abs(value); - auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); - auto const fraction = strings::detail::count_digits(abs_value % exp_ten); - auto const num_zeros = std::max(0, (-scale - fraction)); - return static_cast(value < 0) + // sign if negative - strings::detail::count_digits(abs_value / exp_ten) + // integer - 1 + // decimal point - num_zeros + // zeros padding - fraction; // size of fraction -} - -__device__ void decimal_to_string(__int128_t value, int32_t scale, char* out_ptr) -{ - if (scale >= 0) { - out_ptr += strings::detail::integer_to_string(value, out_ptr); - thrust::generate_n(thrust::seq, out_ptr, scale, []() { return '0'; }); // add zeros - return; - } - - // scale < 0 - // write format: [-]integer.fraction - // where integer = abs(value) / (10^abs(scale)) - // fraction = abs(value) % (10^abs(scale)) - if (value < 0) *out_ptr++ = '-'; // add sign - auto const abs_value = numeric::detail::abs(value); - auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); - auto const num_zeros = std::max(0, (-scale - strings::detail::count_digits(abs_value % exp_ten))); - - out_ptr += - strings::detail::integer_to_string(abs_value / exp_ten, out_ptr); // add the integer part - *out_ptr++ = '.'; // add decimal point - - thrust::generate_n(thrust::seq, out_ptr, num_zeros, []() { return '0'; }); // add zeros - out_ptr += num_zeros; - - strings::detail::integer_to_string(abs_value % exp_ten, out_ptr); // add the fraction part -} - /** * @brief Get the buffer size and offsets of encoded statistics * @@ -148,9 +105,9 @@ __global__ void __launch_bounds__(block_size, 1) case dtype_decimal64: case dtype_decimal128: { auto const scale = groups[idx].col_dtype.scale(); - auto const min_size = compute_output_size(chunks[idx].min_value.d128_val, scale); - auto const max_size = compute_output_size(chunks[idx].max_value.d128_val, scale); - auto const sum_size = compute_output_size(chunks[idx].sum.d128_val, scale); + auto const min_size = fixed_point_string_size(chunks[idx].min_value.d128_val, scale); + auto const max_size = fixed_point_string_size(chunks[idx].max_value.d128_val, scale); + auto const sum_size = fixed_point_string_size(chunks[idx].sum.d128_val, scale); stats_len = pb_fldlen_common + pb_fld_hdrlen16 + 3 * (pb_fld_hdrlen + pb_fld_hdrlen16) + min_size + max_size + sum_size; } break; @@ -234,7 +191,7 @@ __device__ inline uint8_t* pb_put_decimal( { p[0] = id * 8 + ProtofType::FIXEDLEN; p = pb_encode_uint(p + 1, len); - decimal_to_string(value, scale, reinterpret_cast(p)); + strings::detail::fixed_point_to_string(value, scale, reinterpret_cast(p)); return p + len; } @@ -399,15 +356,15 @@ __global__ void __launch_bounds__(encode_threads_per_block) uint32_t sz = 0; auto const min_size = - s->chunk.has_minmax ? compute_output_size(s->chunk.min_value.d128_val, scale) : 0; + s->chunk.has_minmax ? fixed_point_string_size(s->chunk.min_value.d128_val, scale) : 0; auto const max_size = - s->chunk.has_minmax ? compute_output_size(s->chunk.max_value.d128_val, scale) : 0; + s->chunk.has_minmax ? fixed_point_string_size(s->chunk.max_value.d128_val, scale) : 0; if (s->chunk.has_minmax) { // two bytes per string for length, plus the strings themselves sz += 2 + min_size + 2 + max_size; } auto const sum_size = - s->chunk.has_sum ? compute_output_size(s->chunk.sum.d128_val, scale) : 0; + s->chunk.has_sum ? fixed_point_string_size(s->chunk.sum.d128_val, scale) : 0; if (s->chunk.has_sum) { sz += 2 + sum_size; } cur[0] = 6 * 8 + ProtofType::FIXEDLEN; @@ -518,7 +475,4 @@ void orc_encode_statistics(uint8_t* blob_bfr, blob_bfr, groups, chunks, statistics_count); } -} // namespace gpu -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc::gpu diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index a3336258d3e..51aab9faeba 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -200,62 +200,19 @@ struct from_fixed_point_fn { size_type* d_offsets{}; char* d_chars{}; - /** - * @brief Calculates the size of the string required to convert the element, in base-10 format. - * - * Output format is [-]integer.fraction - */ - __device__ int32_t compute_output_size(DecimalType value) - { - auto const scale = d_decimals.type().scale(); - - if (scale >= 0) return count_digits(value) + scale; - - auto const abs_value = numeric::detail::abs(value); - auto const exp_ten = numeric::detail::exp10(-scale); - auto const fraction = count_digits(abs_value % exp_ten); - auto const num_zeros = std::max(0, (-scale - fraction)); - return static_cast(value < 0) + // sign if negative - count_digits(abs_value / exp_ten) + // integer - 1 + // decimal point - num_zeros + // zeros padding - fraction; // size of fraction - } - /** * @brief Converts a decimal element into a string. * * The value is converted into base-10 digits [0-9] * plus the decimal point and a negative sign prefix. */ - __device__ void decimal_to_string(size_type idx) + __device__ void fixed_point_element_to_string(size_type idx) { auto const value = d_decimals.element(idx); auto const scale = d_decimals.type().scale(); char* d_buffer = d_chars + d_offsets[idx]; - if (scale >= 0) { - d_buffer += integer_to_string(value, d_buffer); - thrust::generate_n(thrust::seq, d_buffer, scale, []() { return '0'; }); // add zeros - return; - } - - // scale < 0 - // write format: [-]integer.fraction - // where integer = abs(value) / (10^abs(scale)) - // fraction = abs(value) % (10^abs(scale)) - if (value < 0) *d_buffer++ = '-'; // add sign - auto const abs_value = numeric::detail::abs(value); - auto const exp_ten = numeric::detail::exp10(-scale); - auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); - - d_buffer += integer_to_string(abs_value / exp_ten, d_buffer); // add the integer part - *d_buffer++ = '.'; // add decimal point - - thrust::generate_n(thrust::seq, d_buffer, num_zeros, []() { return '0'; }); // add zeros - d_buffer += num_zeros; - - integer_to_string(abs_value % exp_ten, d_buffer); // add the fraction part + fixed_point_to_string(value, scale, d_buffer); } __device__ void operator()(size_type idx) @@ -265,9 +222,10 @@ struct from_fixed_point_fn { return; } if (d_chars != nullptr) { - decimal_to_string(idx); + fixed_point_element_to_string(idx); } else { - d_offsets[idx] = compute_output_size(d_decimals.element(idx)); + d_offsets[idx] = + fixed_point_string_size(d_decimals.element(idx), d_decimals.type().scale()); } } }; From 7b27f56c086c5a6986da879d2a509048b7d42e75 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 31 Aug 2023 15:54:09 -0700 Subject: [PATCH 19/25] don't assume dec len --- cpp/src/io/orc/stats_enc.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 2c8c4a5cde9..5ec45934453 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -59,8 +59,7 @@ __global__ void __launch_bounds__(init_threads_per_block) constexpr unsigned int buffersize_reduction_dim = 32; constexpr unsigned int block_size = buffersize_reduction_dim * buffersize_reduction_dim; constexpr unsigned int pb_fld_hdrlen = 1; -constexpr unsigned int pb_fld_hdrlen16 = 2; // > 127-byte length -constexpr unsigned int pb_fld_hdrlen32 = 5; // > 16KB length +constexpr unsigned int pb_fld_hdrlen32 = 5; constexpr unsigned int pb_fldlen_int64 = 10; constexpr unsigned int pb_fldlen_float64 = 8; constexpr unsigned int pb_fldlen_bucket1 = 1 + pb_fldlen_int64; @@ -108,7 +107,7 @@ __global__ void __launch_bounds__(block_size, 1) auto const min_size = fixed_point_string_size(chunks[idx].min_value.d128_val, scale); auto const max_size = fixed_point_string_size(chunks[idx].max_value.d128_val, scale); auto const sum_size = fixed_point_string_size(chunks[idx].sum.d128_val, scale); - stats_len = pb_fldlen_common + pb_fld_hdrlen16 + 3 * (pb_fld_hdrlen + pb_fld_hdrlen16) + + stats_len = pb_fldlen_common + pb_fld_hdrlen32 + 3 * (pb_fld_hdrlen + pb_fld_hdrlen32) + min_size + max_size + sum_size; } break; case dtype_string: @@ -360,12 +359,13 @@ __global__ void __launch_bounds__(encode_threads_per_block) auto const max_size = s->chunk.has_minmax ? fixed_point_string_size(s->chunk.max_value.d128_val, scale) : 0; if (s->chunk.has_minmax) { - // two bytes per string for length, plus the strings themselves - sz += 2 + min_size + 2 + max_size; + // encoded string lengths, plus the strings + sz += (pb_put_uint(cur, 1, min_size) - cur) + min_size + + (pb_put_uint(cur, 1, max_size) - cur) + max_size; } auto const sum_size = s->chunk.has_sum ? fixed_point_string_size(s->chunk.sum.d128_val, scale) : 0; - if (s->chunk.has_sum) { sz += 2 + sum_size; } + if (s->chunk.has_sum) { sz += (pb_put_uint(cur, 1, sum_size) - cur) + sum_size; } cur[0] = 6 * 8 + ProtofType::FIXEDLEN; cur = pb_encode_uint(cur + 1, sz); From e65310da15e796f6c98726748bf9fcb0607e83b0 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 31 Aug 2023 15:54:15 -0700 Subject: [PATCH 20/25] expand tests --- cpp/tests/io/orc_test.cpp | 53 ++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 85dff735145..770c098d064 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -976,8 +976,10 @@ TEST_F(OrcReaderTest, CombinedSkipRowTest) TEST_F(OrcStatisticsTest, Basic) { auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); - auto ns_sequence = + auto ts_sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i - 4) * 1000002; }); + auto dec_sequence = + cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i * 1001; }); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); std::vector strings{ @@ -990,14 +992,15 @@ TEST_F(OrcStatisticsTest, Basic) sequence, sequence + num_rows, validity); str_col col3{strings.begin(), strings.end()}; column_wrapper col4( - ns_sequence, ns_sequence + num_rows, validity); - bool_col col5({true, true, true, true, true, false, false, false, false}, validity); + ts_sequence, ts_sequence + num_rows, validity); + column_wrapper col5( + ts_sequence, ts_sequence + num_rows, validity); + bool_col col6({true, true, true, true, true, false, false, false, false}, validity); - auto data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i * 1000; }); - cudf::test::fixed_point_column_wrapper col6( - data, data + num_rows, numeric::scale_type{-1}); + cudf::test::fixed_point_column_wrapper col7( + dec_sequence, dec_sequence + num_rows, numeric::scale_type{120}); - table_view expected({col1, col2, col3, col4, col5, col6}); + table_view expected({col1, col2, col3, col4, col5, col6, col7}); auto filepath = temp_env->get_temp_filepath("OrcStatsMerge.orc"); @@ -1007,12 +1010,15 @@ TEST_F(OrcStatisticsTest, Basic) auto const stats = cudf::io::read_parsed_orc_statistics(cudf::io::source_info{filepath}); - auto const expected_column_names = - std::vector{"", "_col0", "_col1", "_col2", "_col3", "_col4", "_col5"}; + auto expected_column_names = std::vector{""}; + std::generate_n(std::back_inserter(expected_column_names), expected.num_columns(), []() mutable { + static int starting_index = 0; + return "_col" + std::to_string(starting_index++); + }); EXPECT_EQ(stats.column_names, expected_column_names); auto validate_statistics = [&](std::vector const& stats) { - ASSERT_EQ(stats.size(), 7ul); + ASSERT_EQ(stats.size(), expected.num_columns() + 1); auto& s0 = stats[0]; EXPECT_EQ(*s0.number_of_values, 9ul); @@ -1054,16 +1060,27 @@ TEST_F(OrcStatisticsTest, Basic) auto& s5 = stats[5]; EXPECT_EQ(*s5.number_of_values, 4ul); EXPECT_TRUE(*s5.has_null); - auto& ts5 = std::get(s5.type_specific_stats); - EXPECT_EQ(ts5.count[0], 2); + auto& ts5 = std::get(s5.type_specific_stats); + EXPECT_EQ(*ts5.minimum, -3001); + EXPECT_EQ(*ts5.maximum, 3000); + EXPECT_EQ(*ts5.minimum_utc, -3001); + EXPECT_EQ(*ts5.maximum_utc, 3000); + EXPECT_EQ(*ts5.minimum_nanos, 994000); + EXPECT_EQ(*ts5.maximum_nanos, 6000); auto& s6 = stats[6]; - EXPECT_EQ(*s6.number_of_values, 9ul); - EXPECT_FALSE(*s6.has_null); - auto& ts6 = std::get(s6.type_specific_stats); - EXPECT_EQ(*ts6.minimum, "0.0"); - EXPECT_EQ(*ts6.maximum, "800.0"); - EXPECT_EQ(*ts6.sum, "3600.0"); + EXPECT_EQ(*s6.number_of_values, 4ul); + EXPECT_TRUE(*s6.has_null); + auto& ts6 = std::get(s6.type_specific_stats); + EXPECT_EQ(ts6.count[0], 2); + + auto& s7 = stats[7]; + EXPECT_EQ(*s7.number_of_values, 9ul); + EXPECT_FALSE(*s7.has_null); + auto& ts7 = std::get(s7.type_specific_stats); + EXPECT_EQ(*ts7.minimum, "0.0"); + EXPECT_EQ(*ts7.maximum, "800.8"); + EXPECT_EQ(*ts7.sum, "3603.6"); }; validate_statistics(stats.file_stats); From 8c58c6f8e58fa1fe5d7bf4e845c36ecce42cfb6a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 1 Sep 2023 11:35:29 -0700 Subject: [PATCH 21/25] mostly docs --- cpp/include/cudf/io/orc_metadata.hpp | 4 ++-- .../detail/convert/fixed_point_to_string.cuh | 19 ++++++++++++++++++- cpp/src/io/orc/stats_enc.cu | 6 +++++- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index e62bd6413b1..82d59803c25 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -111,10 +111,10 @@ struct string_statistics : minmax_statistics, sum_statistics count; ///< number of `true` values + std::vector count; ///< count of `true` values }; /** diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh index 624af7ef9f6..3ae887eb99d 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh @@ -19,6 +19,13 @@ namespace cudf::strings::detail { +/** + * @brief Returns the number of digits in the given fixed point number. + * + * @param value The value of the fixed point number + * @param scale The scale of the fixed point number + * @return int32_t The number of digits required to represent the fixed point number + */ __device__ inline int32_t fixed_point_string_size(__int128_t const& value, int32_t scale) { if (scale >= 0) return count_digits(value) + scale; @@ -34,7 +41,17 @@ __device__ inline int32_t fixed_point_string_size(__int128_t const& value, int32 fraction; // size of fraction } -__device__ inline void fixed_point_to_string(__int128_t value, int32_t scale, char* out_ptr) +/** + * @brief Converts the given fixed point number to a string. + * + * Caller is responsible for ensuring that the output buffer is large enough. The required output + * buffer size can be obtained by calling `fixed_point_string_size`. + * + * @param value The value of the fixed point number + * @param scale The scale of the fixed point number + * @param out_ptr The pointer to the output string + */ +__device__ inline void fixed_point_to_string(__int128_t const& value, int32_t scale, char* out_ptr) { if (scale >= 0) { out_ptr += integer_to_string(value, out_ptr); diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 5ec45934453..c61356cb617 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -60,6 +60,7 @@ constexpr unsigned int buffersize_reduction_dim = 32; constexpr unsigned int block_size = buffersize_reduction_dim * buffersize_reduction_dim; constexpr unsigned int pb_fld_hdrlen = 1; constexpr unsigned int pb_fld_hdrlen32 = 5; +constexpr unsigned int pb_fldlen_int32 = 5; constexpr unsigned int pb_fldlen_int64 = 10; constexpr unsigned int pb_fldlen_float64 = 8; constexpr unsigned int pb_fldlen_bucket1 = 1 + pb_fldlen_int64; @@ -95,7 +96,8 @@ __global__ void __launch_bounds__(block_size, 1) stats_len = pb_fldlen_common + pb_fld_hdrlen + 2 * (pb_fld_hdrlen + pb_fldlen_int64); break; case dtype_timestamp64: - stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64); + stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64) + + 2 * (pb_fld_hdrlen + pb_fldlen_int32); break; case dtype_float32: case dtype_float64: @@ -107,6 +109,7 @@ __global__ void __launch_bounds__(block_size, 1) auto const min_size = fixed_point_string_size(chunks[idx].min_value.d128_val, scale); auto const max_size = fixed_point_string_size(chunks[idx].max_value.d128_val, scale); auto const sum_size = fixed_point_string_size(chunks[idx].sum.d128_val, scale); + // common + total field length + encoded string lengths + strings stats_len = pb_fldlen_common + pb_fld_hdrlen32 + 3 * (pb_fld_hdrlen + pb_fld_hdrlen32) + min_size + max_size + sum_size; } break; @@ -202,6 +205,7 @@ __device__ inline uint8_t* pb_put_fixed64(uint8_t* p, uint32_t id, void const* r return p + 9; } +// Splits a nanosecond timestamp into milliseconds and nanoseconds __device__ std::pair split_nanosecond_timestamp(int64_t nano_count) { auto const ns = cuda::std::chrono::nanoseconds(nano_count); From eacb5783b79c618319fcb7ce9e4693f7f82dafb2 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 1 Sep 2023 15:03:06 -0700 Subject: [PATCH 22/25] style --- .../cudf/strings/detail/convert/fixed_point_to_string.cuh | 2 +- cpp/src/io/statistics/statistics_type_identification.cuh | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh index 3ae887eb99d..0ee26ec9ee2 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh @@ -77,4 +77,4 @@ __device__ inline void fixed_point_to_string(__int128_t const& value, int32_t sc integer_to_string(abs_value % exp_ten, out_ptr); // add the fraction part } -} // namespace cudf::strings::detail \ No newline at end of file +} // namespace cudf::strings::detail diff --git a/cpp/src/io/statistics/statistics_type_identification.cuh b/cpp/src/io/statistics/statistics_type_identification.cuh index 9fda971063c..ea8c71f0dcb 100644 --- a/cpp/src/io/statistics/statistics_type_identification.cuh +++ b/cpp/src/io/statistics/statistics_type_identification.cuh @@ -108,8 +108,7 @@ class type_conversion { }; template -struct dependent_false : std::false_type { -}; +struct dependent_false : std::false_type {}; /** * @brief Utility class to convert a leaf column element into its extrema type From 7808cb3bd5277fa502eaf740269a10681ed7a0d3 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 5 Sep 2023 11:40:39 -0700 Subject: [PATCH 23/25] test fix --- cpp/tests/io/orc_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 770c098d064..663e21e0af8 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -998,7 +998,7 @@ TEST_F(OrcStatisticsTest, Basic) bool_col col6({true, true, true, true, true, false, false, false, false}, validity); cudf::test::fixed_point_column_wrapper col7( - dec_sequence, dec_sequence + num_rows, numeric::scale_type{120}); + dec_sequence, dec_sequence + num_rows, numeric::scale_type{-1}); table_view expected({col1, col2, col3, col4, col5, col6, col7}); From d0270198ee21b7505e5850b6b9b685c4be51474f Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 18 Sep 2023 09:27:16 -0700 Subject: [PATCH 24/25] simplify lambda --- cpp/tests/io/orc_test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 663e21e0af8..890ef914713 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1011,10 +1011,10 @@ TEST_F(OrcStatisticsTest, Basic) auto const stats = cudf::io::read_parsed_orc_statistics(cudf::io::source_info{filepath}); auto expected_column_names = std::vector{""}; - std::generate_n(std::back_inserter(expected_column_names), expected.num_columns(), []() mutable { - static int starting_index = 0; - return "_col" + std::to_string(starting_index++); - }); + std::generate_n( + std::back_inserter(expected_column_names), + expected.num_columns(), + [starting_index = 0]() mutable { return "_col" + std::to_string(starting_index++); }); EXPECT_EQ(stats.column_names, expected_column_names); auto validate_statistics = [&](std::vector const& stats) { From 21361096d3a8a903b3ed54bfd5fd99e221ee8a4b Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 18 Sep 2023 09:27:23 -0700 Subject: [PATCH 25/25] const --- cpp/src/io/orc/stats_enc.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index c61356cb617..69d7ec95acd 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -410,8 +410,10 @@ __global__ void __launch_bounds__(encode_threads_per_block) if (s->chunk.has_minmax) { cur[0] = 9 * 8 + ProtofType::FIXEDLEN; cur += 2; - auto [min_ms, min_ns_remainder] = split_nanosecond_timestamp(s->chunk.min_value.i_val); - auto [max_ms, max_ns_remainder] = split_nanosecond_timestamp(s->chunk.max_value.i_val); + auto const [min_ms, min_ns_remainder] = + split_nanosecond_timestamp(s->chunk.min_value.i_val); + auto const [max_ms, max_ns_remainder] = + split_nanosecond_timestamp(s->chunk.max_value.i_val); // minimum/maximum are the same as minimumUtc/maximumUtc as we always write files in UTC cur = pb_put_int(cur, 1, min_ms); // minimum