Skip to content

Commit

Permalink
target multiple fragments per page to attempt to stay under page size…
Browse files Browse the repository at this point in the history
… limit
  • Loading branch information
etseidl committed Feb 4, 2023
1 parent 18ee7c1 commit cd9bbcf
Showing 1 changed file with 11 additions and 3 deletions.
14 changes: 11 additions & 3 deletions cpp/src/io/parquet/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1463,10 +1463,18 @@ void writer::impl::write(table_view const& table, std::vector<partition_info> co
auto iter = thrust::make_counting_iterator<size_type>(0);
if (table.num_rows() > 0 && max_page_fragment_size_ == cudf::io::default_max_page_fragment_size) {
std::for_each(iter, iter + single_streams_table.num_columns(), [&](size_type index) {
auto const avg_len = util::div_rounding_up_safe(
column_size(single_streams_table.column(index), stream), table.num_rows());
// Dividing the page size by the average row length tends to overshoot the desired page
// size when the row lengths are highly variable. Instead, aim for multiple fragments per
// page to smooth things out. Using 2 left the final page sizes too unbalanced, so use 4,
// which seems to be a good compromise: it smooths things out without making the fragment
// sizes too small.
constexpr int target_frags_per_page = 4;
auto const avg_len =
target_frags_per_page *
util::div_rounding_up_safe(column_size(single_streams_table.column(index), stream),
table.num_rows());
if (avg_len > 0) {
auto const frag_size = util::div_rounding_up_safe<size_type>(max_page_size_bytes, avg_len);
auto const frag_size = util::div_rounding_up_safe<size_type>(max_page_size_bytes, avg_len);
column_frag_size[index] = std::min<size_type>(max_page_fragment_size_, frag_size);
}
});
Expand Down

0 comments on commit cd9bbcf

Please sign in to comment.