From 15563a124f71f1367088a44073cea3d8169b3310 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 1 Sep 2021 14:41:32 -0700 Subject: [PATCH 1/3] detect verflow using min/max; cpp test --- .../io/statistics/typed_statistics_chunk.cuh | 14 +++++-- cpp/tests/io/orc_test.cpp | 38 +++++++++++++++++++ 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/statistics/typed_statistics_chunk.cuh b/cpp/src/io/statistics/typed_statistics_chunk.cuh index 597bb34a189..18e3c3e3b2a 100644 --- a/cpp/src/io/statistics/typed_statistics_chunk.cuh +++ b/cpp/src/io/statistics/typed_statistics_chunk.cuh @@ -237,14 +237,22 @@ template __inline__ __device__ statistics_chunk get_untyped_chunk(const typed_statistics_chunk& chunk) { + using E = typename detail::extrema_type::type; statistics_chunk stat; stat.non_nulls = chunk.non_nulls; stat.null_count = chunk.num_rows - chunk.non_nulls; stat.has_minmax = chunk.has_minmax; - stat.has_sum = - chunk.has_minmax; // If a valid input was encountered we assume that the sum is valid + stat.has_sum = [&]() { + if (!chunk.has_minmax) return false; + if constexpr (std::is_floating_point_v or std::is_integral_v) { + return std::numeric_limits::max() / chunk.non_nulls >= + static_cast(chunk.maximum_value) and + std::numeric_limits::lowest() / chunk.non_nulls <= + static_cast(chunk.minimum_value); + } + return true; + }(); if (chunk.has_minmax) { - using E = typename detail::extrema_type::type; if constexpr (std::is_floating_point_v) { union_member::get(stat.min_value) = (chunk.minimum_value != 0.0) ? chunk.minimum_value : CUDART_NEG_ZERO; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index ba8f1ec1e12..5323f79767f 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1197,4 +1197,42 @@ TEST_F(OrcWriterTest, Decimal32) CUDF_TEST_EXPECT_COLUMNS_EQUAL(col64, result.tbl->view().column(0)); } +TEST_F(OrcStatisticsTest, Overflow) +{ + int num_rows = 10; + auto too_large_seq = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * (std::numeric_limits::max()/20); }); + auto too_small_seq = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * (std::numeric_limits::min()/20); }); + auto not_too_large_seq = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * (std::numeric_limits::max()/200); }); + auto not_too_small_seq = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * (std::numeric_limits::min()/200); }); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + + column_wrapper col1( + too_large_seq, too_large_seq + num_rows, validity); + column_wrapper col2( + too_small_seq, too_small_seq + num_rows, validity); + column_wrapper col3( + not_too_large_seq, not_too_large_seq + num_rows, validity); + column_wrapper col4( + not_too_small_seq, not_too_small_seq + num_rows, validity); + table_view tbl({col1, col2, col3, col4}); + + auto filepath = temp_env->get_temp_filepath("OrcStatsMerge.orc"); + + cudf_io::orc_writer_options out_opts = + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl); + cudf_io::write_orc(out_opts); + + auto const stats = cudf_io::read_parsed_orc_statistics(cudf_io::source_info{filepath}); + + auto check_sum_exist = [&](int idx, bool expected){ + auto const& s = stats.file_stats[idx]; + auto const& ts = std::get(s.type_specific_stats); + EXPECT_EQ(ts.sum.has_value(), expected); + }; + check_sum_exist(1, false); + check_sum_exist(2, false); + check_sum_exist(3, true); + check_sum_exist(4, true); +} + CUDF_TEST_PROGRAM_MAIN() From 766ac181abe413cd824c178db20b40b8c7507472 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 1 Sep 2021 14:47:30 -0700 Subject: [PATCH 2/3] style fix --- cpp/tests/io/orc_test.cpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 5323f79767f..fbeba925f1b 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1199,12 +1199,16 @@ TEST_F(OrcWriterTest, Decimal32) TEST_F(OrcStatisticsTest, Overflow) { - int num_rows = 10; - auto too_large_seq = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * (std::numeric_limits::max()/20); }); - auto too_small_seq = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * (std::numeric_limits::min()/20); }); - auto not_too_large_seq = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * (std::numeric_limits::max()/200); }); - auto not_too_small_seq = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * (std::numeric_limits::min()/200); }); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + int num_rows = 10; + auto too_large_seq = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i * (std::numeric_limits::max() / 20); }); + auto too_small_seq = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i * (std::numeric_limits::min() / 20); }); + auto not_too_large_seq = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i * (std::numeric_limits::max() / 200); }); + auto not_too_small_seq = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i * (std::numeric_limits::min() / 200); }); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); column_wrapper col1( too_large_seq, too_large_seq + num_rows, validity); @@ -1224,11 +1228,11 @@ TEST_F(OrcStatisticsTest, Overflow) auto const stats = cudf_io::read_parsed_orc_statistics(cudf_io::source_info{filepath}); - auto check_sum_exist = [&](int idx, bool expected){ - auto const& s = stats.file_stats[idx]; + auto check_sum_exist = [&](int idx, bool expected) { + auto const& s = stats.file_stats[idx]; auto const& ts = std::get(s.type_specific_stats); EXPECT_EQ(ts.sum.has_value(), expected); - }; + }; check_sum_exist(1, false); check_sum_exist(2, false); check_sum_exist(3, true); From f0a228c2bff3c7c9e595a13344b4bc9a03f5f2ea Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 1 Sep 2021 16:42:53 -0700 Subject: [PATCH 3/3] add comment --- cpp/src/io/statistics/typed_statistics_chunk.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/io/statistics/typed_statistics_chunk.cuh b/cpp/src/io/statistics/typed_statistics_chunk.cuh index 18e3c3e3b2a..d727b43f56e 100644 --- a/cpp/src/io/statistics/typed_statistics_chunk.cuh +++ b/cpp/src/io/statistics/typed_statistics_chunk.cuh @@ -244,6 +244,7 @@ get_untyped_chunk(const typed_statistics_chunk& chunk) stat.has_minmax = chunk.has_minmax; stat.has_sum = [&]() { if (!chunk.has_minmax) return false; + // invalidate the sum if overlow or underflow is possible if constexpr (std::is_floating_point_v or std::is_integral_v) { return std::numeric_limits::max() / chunk.non_nulls >= static_cast(chunk.maximum_value) and