From 7a1b60a9a6bb16c148794fbb1d2adab5182daede Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Wed, 16 Aug 2023 21:48:21 -0700
Subject: [PATCH 01/10] list correct encodings in chunk metadata

---
 cpp/src/io/parquet/page_enc.cu     | 20 ++++++++++++++
 cpp/src/io/parquet/parquet_gpu.hpp | 27 +++++++++++++++++--
 cpp/src/io/parquet/writer_impl.cu  | 43 +++++++++++++++++++++---------
 3 files changed, 76 insertions(+), 14 deletions(-)

diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 20993d12af8..43a5aa569a0 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -229,6 +229,15 @@ Encoding __device__ determine_encoding(PageType page_type,
   }
 }
 
+struct Or {
+  /// Binary sum operator, returns a + b
+  template <typename T>
+  __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const
+  {
+    return a | b;
+  }
+};
+
 }  // anonymous namespace
 
 // blockDim {512,1,1}
@@ -1445,6 +1454,7 @@ __global__ void __launch_bounds__(decide_compression_block_size)
 
   uint32_t uncompressed_data_size = 0;
   uint32_t compressed_data_size   = 0;
+  uint32_t encodings              = 0;
   auto const num_pages = ck_g[warp_id].num_pages;
   for (auto page_id = lane_id; page_id < num_pages; page_id += cudf::detail::warp_size) {
     auto const& curr_page = ck_g[warp_id].pages[page_id];
@@ -1457,10 +1467,14 @@ __global__ void __launch_bounds__(decide_compression_block_size)
         atomicOr(&compression_error[warp_id], 1);
       }
     }
+    // collect encoding info for the chunk metadata
+    encodings |= encoding_to_mask(curr_page.encoding);
   }
   uncompressed_data_size = warp_reduce(temp_storage[warp_id][0]).Sum(uncompressed_data_size);
   compressed_data_size   = warp_reduce(temp_storage[warp_id][1]).Sum(compressed_data_size);
   __syncwarp();
+  encodings = warp_reduce(temp_storage[warp_id][0]).Reduce(encodings, Or{});
+  __syncwarp();
 
   if (lane_id == 0) {
     auto const write_compressed = compressed_data_size != 0 and compression_error[warp_id] == 0 and
@@ -1469,6 +1483,12 @@ __global__ void __launch_bounds__(decide_compression_block_size)
     chunks[chunk_id].bfr_size = uncompressed_data_size;
     chunks[chunk_id].compressed_size =
       write_compressed ? compressed_data_size : uncompressed_data_size;
+
+    // if there is repetition or definition level data add RLE encoding
+    auto const rle_bits =
+      ck_g[warp_id].col_desc->num_def_level_bits() + ck_g[warp_id].col_desc->num_rep_level_bits();
+    if (rle_bits > 0) { encodings |= EncodingMask::RLE; }
+    chunks[chunk_id].encodings = encodings;
   }
 }
 
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index fc4ad026b61..897fbefe5dc 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -326,8 +326,8 @@ struct parquet_column_device_view : stats_column_desc {
   ConvertedType converted_type;  //!< logical data type
   uint8_t level_bits;  //!< bits to encode max definition (lower nibble) & repetition (upper nibble)
                        //!< levels
-  constexpr uint8_t num_def_level_bits() { return level_bits & 0xf; }
-  constexpr uint8_t num_rep_level_bits() { return level_bits >> 4; }
+  constexpr uint8_t num_def_level_bits() const { return level_bits & 0xf; }
+  constexpr uint8_t num_rep_level_bits() const { return level_bits >> 4; }
 
   size_type const* const* nesting_offsets;  //!< If column is a nested type, contains offset array
                                             //!< of each nesting level
@@ -365,6 +365,28 @@ constexpr size_t kDictScratchSize = (1 << kDictHashBits) * sizeof(uint32_t);
 struct EncPage;
 struct slot_type;
 
+// convert Encoding to a mask value
+constexpr uint32_t encoding_to_mask(Encoding encoding)
+{
+  return 1 << static_cast<uint32_t>(encoding);
+}
+
+/**
+ * @brief Encoding values as mask bits
+ */
+enum EncodingMask {
+  PLAIN                   = encoding_to_mask(Encoding::PLAIN),
+  GROUP_VAR_INT           = encoding_to_mask(Encoding::GROUP_VAR_INT),
+  PLAIN_DICTIONARY        = encoding_to_mask(Encoding::PLAIN_DICTIONARY),
+  RLE                     = encoding_to_mask(Encoding::RLE),
+  BIT_PACKED              = encoding_to_mask(Encoding::BIT_PACKED),
+  DELTA_BINARY_PACKED     = encoding_to_mask(Encoding::DELTA_BINARY_PACKED),
+  DELTA_LENGTH_BYTE_ARRAY = encoding_to_mask(Encoding::DELTA_LENGTH_BYTE_ARRAY),
+  DELTA_BYTE_ARRAY        = encoding_to_mask(Encoding::DELTA_BYTE_ARRAY),
+  RLE_DICTIONARY          = encoding_to_mask(Encoding::RLE_DICTIONARY),
+  BYTE_STREAM_SPLIT       = encoding_to_mask(Encoding::BYTE_STREAM_SPLIT)
+};
+
 /**
  * @brief Struct describing an encoder column chunk
  */
@@ -401,6 +423,7 @@ struct EncColumnChunk {
   bool use_dictionary;         //!< True if the chunk uses dictionary encoding
   uint8_t* column_index_blob;  //!< Binary blob containing encoded column index for this chunk
   uint32_t column_index_size;  //!< Size of column index blob
+  uint32_t encodings;          //!< Mask representing the set of encodings used for this chunk
 };
 
 /**
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 021b6cffa5a..f23826d35a6 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -193,6 +193,34 @@ parquet::Compression to_parquet_compression(compression_type compression)
   }
 }
 
+/**
+ * @brief Convert a mask of encodings to a vector.
+ *
+ * @param encodings Vector of `Encoding`s to populate
+ * @param enc_mask Mask of encodings used
+ */
+void update_chunk_encodings(std::vector<Encoding>& encodings, uint32_t enc_mask)
+{
+  // only convert the encodings we support
+  if ((enc_mask & gpu::EncodingMask::PLAIN) != 0) { encodings.push_back(Encoding::PLAIN); }
+  if ((enc_mask & gpu::EncodingMask::PLAIN_DICTIONARY) != 0) {
+    encodings.push_back(Encoding::PLAIN_DICTIONARY);
+  }
+  if ((enc_mask & gpu::EncodingMask::RLE) != 0) { encodings.push_back(Encoding::RLE); }
+  if ((enc_mask & gpu::EncodingMask::DELTA_BINARY_PACKED) != 0) {
+    encodings.push_back(Encoding::DELTA_BINARY_PACKED);
+  }
+  if ((enc_mask & gpu::EncodingMask::DELTA_LENGTH_BYTE_ARRAY) != 0) {
+    encodings.push_back(Encoding::DELTA_LENGTH_BYTE_ARRAY);
+  }
+  if ((enc_mask & gpu::EncodingMask::DELTA_BYTE_ARRAY) != 0) {
+    encodings.push_back(Encoding::DELTA_BYTE_ARRAY);
+  }
+  if ((enc_mask & gpu::EncodingMask::RLE_DICTIONARY) != 0) {
+    encodings.push_back(Encoding::RLE_DICTIONARY);
+  }
+}
+
 /**
  * @brief Compute size (in bytes) of the data stored in the given column.
  *
@@ -1671,6 +1699,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
           ck.start_row         = start_row;
           ck.num_rows          = (uint32_t)row_group.num_rows;
           ck.first_fragment    = c * num_fragments + f;
+          ck.encodings         = 0;
           auto chunk_fragments = row_group_fragments[c].subspan(f, fragments_in_chunk);
           // In fragment struct, add a pointer to the chunk it belongs to
           // In each fragment in chunk_fragments, update the chunk pointer here.
@@ -1687,7 +1716,6 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
             });
           auto& column_chunk_meta          = row_group.columns[c].meta_data;
           column_chunk_meta.type           = parquet_columns[c].physical_type();
-          column_chunk_meta.encodings      = {Encoding::PLAIN, Encoding::RLE};
           column_chunk_meta.path_in_schema = parquet_columns[c].get_path_in_schema();
           column_chunk_meta.codec          = UNCOMPRESSED;
           column_chunk_meta.num_values     = ck.num_values;
@@ -1703,17 +1731,6 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
   row_group_fragments.host_to_device_async(stream);
   [[maybe_unused]] auto dict_info_owner = build_chunk_dictionaries(
     chunks, col_desc, row_group_fragments, compression, dict_policy, max_dictionary_size, stream);
-  for (size_t p = 0; p < partitions.size(); p++) {
-    for (int rg = 0; rg < num_rg_in_part[p]; rg++) {
-      size_t global_rg = global_rowgroup_base[p] + rg;
-      for (int col = 0; col < num_columns; col++) {
-        if (chunks.host_view()[rg][col].use_dictionary) {
-          agg_meta->file(p).row_groups[global_rg].columns[col].meta_data.encodings.push_back(
-            Encoding::PLAIN_DICTIONARY);
-        }
-      }
-    }
-  }
 
   // The code preceding this used a uniform fragment size for all columns.  Now recompute
   // fragments with a (potentially) varying number of fragments per column.
@@ -1949,6 +1966,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
         }
         max_write_size = std::max(max_write_size, ck.compressed_size);
 
+        update_chunk_encodings(column_chunk_meta.encodings, ck.encodings);
+
        if (ck.ck_stat_size != 0) {
           std::vector<uint8_t> const stats_blob = cudf::detail::make_std_vector_sync(
             device_span<uint8_t const>(dev_bfr, ck.ck_stat_size), stream);
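To illustrate the bookkeeping this patch introduces end to end: each page ORs `encoding_to_mask(page.encoding)` into a per-chunk 32-bit mask on the device, and `update_chunk_encodings` later expands that mask into the chunk's footer `encodings` list on the host. Below is a minimal standalone sketch of that round trip; it uses a mocked subset of the `Encoding` enum rather than the cudf headers, so names and values here are illustrative only.

  #include <cstdint>
  #include <vector>

  // mocked subset of cudf::io::parquet::Encoding (values follow the Parquet spec)
  enum class Encoding : uint8_t { PLAIN = 0, RLE = 3, RLE_DICTIONARY = 8 };

  constexpr uint32_t encoding_to_mask(Encoding e) { return 1u << static_cast<uint32_t>(e); }

  int main()
  {
    // per-page contributions, as collected in the decide_compression kernel
    uint32_t mask = 0;
    mask |= encoding_to_mask(Encoding::PLAIN);           // dictionary page
    mask |= encoding_to_mask(Encoding::RLE_DICTIONARY);  // data pages
    mask |= encoding_to_mask(Encoding::RLE);             // rep/def level data

    // mask -> vector expansion, as update_chunk_encodings does on the host
    std::vector<Encoding> encodings;
    for (uint32_t bit = 0; bit < 32; bit++) {
      if ((mask & (1u << bit)) != 0) { encodings.push_back(static_cast<Encoding>(bit)); }
    }
    return encodings.size() == 3 ? 0 : 1;  // expect PLAIN, RLE, RLE_DICTIONARY
  }

A 32-bit mask comfortably covers the enum, which tops out at BYTE_STREAM_SPLIT = 9, and ORing masks makes the per-chunk set cheap to combine across pages.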
From 8004f64b52e5228a1a9fbe5ab1ce1e24f67ea221 Mon Sep 17 00:00:00 2001
From: seidl
Date: Thu, 17 Aug 2023 09:52:07 -0700
Subject: [PATCH 02/10] update docstring for DecideCompression

---
 cpp/src/io/parquet/parquet_gpu.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 897fbefe5dc..66019d143ae 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -716,6 +716,8 @@ void EncodePages(device_span<EncPage> pages,
 /**
  * @brief Launches kernel to make the compressed vs uncompressed chunk-level decision
  *
+ * Also calculates the set of page encodings used for each chunk.
+ *
  * @param[in,out] chunks Column chunks (updated with actual compressed/uncompressed sizes)
  * @param[in] stream CUDA stream to use
  */
From 99f76b335b9e833d53a8d2ec883df414a2ed8b90 Mon Sep 17 00:00:00 2001
From: seidl
Date: Thu, 17 Aug 2023 13:57:16 -0700
Subject: [PATCH 03/10] add test

---
 cpp/tests/io/parquet_test.cpp | 73 +++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 8c7d598d33f..5227a2c5bcf 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -6599,4 +6599,77 @@ TEST_F(ParquetWriterTest, TimestampMicrosINT96NoOverflow)
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
 }
 
+TEST_P(ParquetV2Test, CheckEncodings)
+{
+  using cudf::io::parquet::Encoding;
+  constexpr auto num_rows = 100'000;
+  auto const is_v2        = GetParam();
+
+  auto const validity = cudf::test::iterators::no_nulls();
+  // data should be PLAIN for v1, RLE for V2
+  auto col0_data =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 2 == 0; });
+  // data should be PLAIN for both
+  auto col1_data = random_values<int32_t>(num_rows);
+  // data should be PLAIN_DICTIONARY for v1, PLAIN and RLE_DICTIONARY for v2
+  auto col2_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; });
+
+  cudf::test::fixed_width_column_wrapper<bool> col0{col0_data, col0_data + num_rows, validity};
+  column_wrapper<int32_t> col1{col1_data.begin(), col1_data.end(), validity};
+  column_wrapper<int32_t> col2{col2_data, col2_data + num_rows, validity};
+
+  auto expected = table_view{{col0, col1, col2}};
+
+  auto const filename = is_v2 ? "CheckEncodingsV2.parquet" : "CheckEncodingsV1.parquet";
+  auto filepath       = temp_env->get_temp_filepath(filename);
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
+      .max_page_size_rows(num_rows)
+      .write_v2_headers(is_v2);
+  cudf::io::write_parquet(out_opts);
+
+  // make sure the expected encodings are present
+  auto contains = [](auto vec, auto enc) {
+    for (size_t i = 0; i < vec.size(); i++) {
+      if (vec[i] == enc) return true;
+    }
+    return false;
+  };
+
+  auto const source = cudf::io::datasource::create(filepath);
+  cudf::io::parquet::FileMetaData fmd;
+
+  read_footer(source, &fmd);
+  auto const& chunk0_enc = fmd.row_groups[0].columns[0].meta_data.encodings;
+  auto const& chunk1_enc = fmd.row_groups[0].columns[1].meta_data.encodings;
+  auto const& chunk2_enc = fmd.row_groups[0].columns[2].meta_data.encodings;
+  if (is_v2) {
+    // col0 should have RLE for rep/def and data
+    EXPECT_TRUE(chunk0_enc.size() == 1);
+    EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE));
+    // col1 should have RLE for rep/def and PLAIN for data
+    EXPECT_TRUE(chunk1_enc.size() == 2);
+    EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE));
+    EXPECT_TRUE(contains(chunk1_enc, Encoding::PLAIN));
+    // col2 should have RLE for rep/def, PLAIN for dict, and PLAIN_DICTIONARY for data
+    EXPECT_TRUE(chunk2_enc.size() == 3);
+    EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE));
+    EXPECT_TRUE(contains(chunk2_enc, Encoding::PLAIN));
+    EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE_DICTIONARY));
+  } else {
+    // col0 should have RLE for rep/def and PLAIN for data
+    EXPECT_TRUE(chunk0_enc.size() == 2);
+    EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE));
+    EXPECT_TRUE(contains(chunk0_enc, Encoding::PLAIN));
+    // col1 should have RLE for rep/def and PLAIN for data
+    EXPECT_TRUE(chunk1_enc.size() == 2);
+    EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE));
+    EXPECT_TRUE(contains(chunk1_enc, Encoding::PLAIN));
+    // col2 should have RLE for rep/def and PLAIN_DICTIONARY for data and dict
+    EXPECT_TRUE(chunk2_enc.size() == 2);
+    EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE));
+    EXPECT_TRUE(contains(chunk2_enc, Encoding::PLAIN_DICTIONARY));
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()
From 61a79d46db79274219cedd35c6f92901b0990d03 Mon Sep 17 00:00:00 2001
From: seidl
Date: Thu, 17 Aug 2023 14:15:06 -0700
Subject: [PATCH 04/10] fix comments for Or

---
 cpp/src/io/parquet/page_enc.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 43a5aa569a0..b780e27f36c 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -229,8 +229,9 @@ Encoding __device__ determine_encoding(PageType page_type,
   }
 }
 
+// operator to use with warp_reduce. stolen from cub::Sum
 struct Or {
-  /// Binary sum operator, returns a + b
+  /// Binary OR operator, returns a | b
   template <typename T>
   __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const
   {
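For context, the `Or` functor documented above slots into CUB's warp reduction exactly where `cub::Sum` normally would. A standalone sketch of that pattern follows; the kernel and its names are hypothetical, for illustration only, not the cudf sources.

  #include <cub/cub.cuh>
  #include <cstdint>

  struct Or {
    /// Binary OR operator, returns a | b
    template <typename T>
    __host__ __device__ __forceinline__ T operator()(T const& a, T const& b) const
    {
      return a | b;
    }
  };

  // each lane holds one page's encoding mask; lane 0 receives the OR of all 32
  __global__ void or_reduce(uint32_t const* page_masks, uint32_t* chunk_mask)
  {
    using warp_reduce = cub::WarpReduce<uint32_t>;
    __shared__ typename warp_reduce::TempStorage temp_storage;

    auto const combined = warp_reduce(temp_storage).Reduce(page_masks[threadIdx.x], Or{});
    if (threadIdx.x == 0) { *chunk_mask = combined; }
  }

Launched as `or_reduce<<<1, 32>>>(...)`, only lane 0 receives the defined aggregate from `Reduce`, which is why the kernel in page_enc.cu reads `encodings` under `if (lane_id == 0)`.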
"CheckEncodingsV2.parquet" : "CheckEncodingsV1.parquet"; + auto filepath = temp_env->get_temp_filepath(filename); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .max_page_size_rows(num_rows) + .write_v2_headers(is_v2); + cudf::io::write_parquet(out_opts); + + // make sure the expected encodings are present + auto contains = [](auto vec, auto enc) { + for (size_t i = 0; i < vec.size(); i++) { + if (vec[i] == enc) return true; + } + return false; + }; + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::FileMetaData fmd; + + read_footer(source, &fmd); + auto const& chunk0_enc = fmd.row_groups[0].columns[0].meta_data.encodings; + auto const& chunk1_enc = fmd.row_groups[0].columns[1].meta_data.encodings; + auto const& chunk2_enc = fmd.row_groups[0].columns[2].meta_data.encodings; + if (is_v2) { + // col0 should have RLE for rep/def and data + EXPECT_TRUE(chunk0_enc.size() == 1); + EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE)); + // col1 should have RLE for rep/def and PLAIN for data + EXPECT_TRUE(chunk1_enc.size() == 2); + EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE)); + EXPECT_TRUE(contains(chunk1_enc, Encoding::PLAIN)); + // col2 should have RLE for rep/def, PLAIN for dict, and PLAIN_DICTIONARY for data + EXPECT_TRUE(chunk2_enc.size() == 3); + EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE)); + EXPECT_TRUE(contains(chunk2_enc, Encoding::PLAIN)); + EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE_DICTIONARY)); + } else { + // col0 should have RLE for rep/def and PLAIN for data + EXPECT_TRUE(chunk0_enc.size() == 2); + EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE)); + EXPECT_TRUE(contains(chunk0_enc, Encoding::PLAIN)); + // col1 should have RLE for rep/def and PLAIN for data + EXPECT_TRUE(chunk1_enc.size() == 2); + EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE)); + EXPECT_TRUE(contains(chunk1_enc, Encoding::PLAIN)); + // col2 should have RLE for rep/def and PLAIN_DICTIONARY for data and dict + EXPECT_TRUE(chunk2_enc.size() == 2); + EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE)); + EXPECT_TRUE(contains(chunk2_enc, Encoding::PLAIN_DICTIONARY)); + } +} + CUDF_TEST_PROGRAM_MAIN() From 61a79d46db79274219cedd35c6f92901b0990d03 Mon Sep 17 00:00:00 2001 From: seidl Date: Thu, 17 Aug 2023 14:15:06 -0700 Subject: [PATCH 04/10] fix comments for Or --- cpp/src/io/parquet/page_enc.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 43a5aa569a0..b780e27f36c 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -229,8 +229,9 @@ Encoding __device__ determine_encoding(PageType page_type, } } +// operator to use with warp_reduce. 
From d9e9ab611dc630ec7c34409b758f4b82b0ae1505 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Thu, 17 Aug 2023 19:41:41 -0700
Subject: [PATCH 06/10] better name for functor

---
 cpp/src/io/parquet/page_enc.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index fc1795b3007..6cf4fb01b59 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -230,7 +230,7 @@ Encoding __device__ determine_encoding(PageType page_type,
 }
 
 // operator to use with warp_reduce. stolen from cub::Sum
-struct Or {
+struct BitwiseOr {
   /// Binary OR operator, returns a | b
   template <typename T>
   __host__ __device__ __forceinline__ T operator()(T const& a, T const& b) const
@@ -1474,7 +1474,7 @@ __global__ void __launch_bounds__(decide_compression_block_size)
   uncompressed_data_size = warp_reduce(temp_storage[warp_id][0]).Sum(uncompressed_data_size);
   compressed_data_size   = warp_reduce(temp_storage[warp_id][1]).Sum(compressed_data_size);
   __syncwarp();
-  encodings = warp_reduce(temp_storage[warp_id][0]).Reduce(encodings, Or{});
+  encodings = warp_reduce(temp_storage[warp_id][0]).Reduce(encodings, BitwiseOr{});
   __syncwarp();
 
   if (lane_id == 0) {
From d911d6d0cb5dba6a73095770b09360747b852b3a Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Thu, 17 Aug 2023 20:06:42 -0700
Subject: [PATCH 07/10] simplify update_chunk_encodings

---
 cpp/src/io/parquet/parquet_common.hpp |  1 +
 cpp/src/io/parquet/writer_impl.cu     | 20 +++-----------------
 2 files changed, 4 insertions(+), 17 deletions(-)

diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp
index ab6290c4ed6..5f8f1617cb9 100644
--- a/cpp/src/io/parquet/parquet_common.hpp
+++ b/cpp/src/io/parquet/parquet_common.hpp
@@ -92,6 +92,7 @@ enum class Encoding : uint8_t {
   DELTA_BYTE_ARRAY  = 7,
   RLE_DICTIONARY    = 8,
   BYTE_STREAM_SPLIT = 9,
+  NUM_ENCODINGS     = 10,
 };
 
 /**
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index f23826d35a6..18ac2c9eae7 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -201,23 +201,9 @@ parquet::Compression to_parquet_compression(compression_type compression)
  */
 void update_chunk_encodings(std::vector<Encoding>& encodings, uint32_t enc_mask)
 {
-  // only convert the encodings we support
-  if ((enc_mask & gpu::EncodingMask::PLAIN) != 0) { encodings.push_back(Encoding::PLAIN); }
-  if ((enc_mask & gpu::EncodingMask::PLAIN_DICTIONARY) != 0) {
-    encodings.push_back(Encoding::PLAIN_DICTIONARY);
-  }
-  if ((enc_mask & gpu::EncodingMask::RLE) != 0) { encodings.push_back(Encoding::RLE); }
-  if ((enc_mask & gpu::EncodingMask::DELTA_BINARY_PACKED) != 0) {
-    encodings.push_back(Encoding::DELTA_BINARY_PACKED);
-  }
-  if ((enc_mask & gpu::EncodingMask::DELTA_LENGTH_BYTE_ARRAY) != 0) {
-    encodings.push_back(Encoding::DELTA_LENGTH_BYTE_ARRAY);
-  }
-  if ((enc_mask & gpu::EncodingMask::DELTA_BYTE_ARRAY) != 0) {
-    encodings.push_back(Encoding::DELTA_BYTE_ARRAY);
-  }
-  if ((enc_mask & gpu::EncodingMask::RLE_DICTIONARY) != 0) {
-    encodings.push_back(Encoding::RLE_DICTIONARY);
+  for (uint8_t enc = 0; enc < Encoding::NUM_ENCODINGS; enc++) {
+    auto const enc_enum = static_cast<Encoding>(enc);
+    if ((enc_mask & encoding_to_mask(enc_enum) != 0) { encodings.push_back(enc_enum); }
   }
 }
 
From d00543531f310ae1cc238e679b66ce8b089b6b3d Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Thu, 17 Aug 2023 20:15:09 -0700
Subject: [PATCH 08/10] fix comment

---
 cpp/tests/io/parquet_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 94d7deb4fb4..b210452f619 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -6648,7 +6648,7 @@ TEST_P(ParquetV2Test, CheckEncodings)
     EXPECT_TRUE(chunk1_enc.size() == 2);
     EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE));
     EXPECT_TRUE(contains(chunk1_enc, Encoding::PLAIN));
-    // col2 should have RLE for rep/def, PLAIN for dict, and PLAIN_DICTIONARY for data
+    // col2 should have RLE for rep/def, PLAIN for dict, and RLE_DICTIONARY for data
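A subtlety in the simplified loop from PATCH 07: `!=` binds tighter than `&` in C++, so `enc_mask & encoding_to_mask(enc_enum) != 0` groups as `enc_mask & (encoding_to_mask(enc_enum) != 0)` and tests only bit 0; as written, the condition is also short one closing parenthesis. The next patch supplies the parentheses that force the intended grouping. A tiny compile-time demonstration of the precedence, in plain arithmetic independent of the cudf types:

  #include <cstdint>

  constexpr uint32_t mask     = 0x8;  // chunk mask with the RLE bit set
  constexpr uint32_t enc_mask = 0x8;  // candidate encoding's bit

  // without parentheses: mask & (enc_mask != 0) == 0x8 & 1 == 0 -> bit test lost
  static_assert((mask & enc_mask != 0) == 0);
  // with parentheses: (mask & enc_mask) != 0 -> true, as intended
  static_assert(((mask & enc_mask) != 0) == true);

  int main() { return 0; }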
     EXPECT_TRUE(chunk2_enc.size() == 3);
     EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE));
     EXPECT_TRUE(contains(chunk2_enc, Encoding::PLAIN));
 
From a6fc01bf3105964c18cfce8fc3d16be408e4de0d Mon Sep 17 00:00:00 2001
From: seidl
Date: Thu, 17 Aug 2023 21:02:03 -0700
Subject: [PATCH 09/10] some cleanup

---
 cpp/src/io/parquet/writer_impl.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 18ac2c9eae7..40cb107da48 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -201,9 +201,9 @@ parquet::Compression to_parquet_compression(compression_type compression)
  */
 void update_chunk_encodings(std::vector<Encoding>& encodings, uint32_t enc_mask)
 {
-  for (uint8_t enc = 0; enc < Encoding::NUM_ENCODINGS; enc++) {
+  for (uint8_t enc = 0; enc < static_cast<uint8_t>(Encoding::NUM_ENCODINGS); enc++) {
     auto const enc_enum = static_cast<Encoding>(enc);
-    if ((enc_mask & encoding_to_mask(enc_enum) != 0) { encodings.push_back(enc_enum); }
+    if ((enc_mask & gpu::encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); }
   }
 }
 
From a5ffee63f77e79bbc48537b2aab9b5df1c20a73f Mon Sep 17 00:00:00 2001
From: seidl
Date: Thu, 17 Aug 2023 21:21:00 -0700
Subject: [PATCH 10/10] EncodingMask enum is not needed

---
 cpp/src/io/parquet/page_enc.cu     |  2 +-
 cpp/src/io/parquet/parquet_gpu.hpp | 16 ----------------
 2 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 6cf4fb01b59..c369f83d6b0 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -1488,7 +1488,7 @@ __global__ void __launch_bounds__(decide_compression_block_size)
     // if there is repetition or definition level data add RLE encoding
     auto const rle_bits =
       ck_g[warp_id].col_desc->num_def_level_bits() + ck_g[warp_id].col_desc->num_rep_level_bits();
-    if (rle_bits > 0) { encodings |= EncodingMask::RLE; }
+    if (rle_bits > 0) { encodings |= encoding_to_mask(Encoding::RLE); }
     chunks[chunk_id].encodings = encodings;
   }
 }
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 66019d143ae..3e72bc50f40 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -371,22 +371,6 @@ constexpr uint32_t encoding_to_mask(Encoding encoding)
   return 1 << static_cast<uint32_t>(encoding);
 }
 
-/**
- * @brief Encoding values as mask bits
- */
-enum EncodingMask {
-  PLAIN                   = encoding_to_mask(Encoding::PLAIN),
-  GROUP_VAR_INT           = encoding_to_mask(Encoding::GROUP_VAR_INT),
-  PLAIN_DICTIONARY        = encoding_to_mask(Encoding::PLAIN_DICTIONARY),
-  RLE                     = encoding_to_mask(Encoding::RLE),
-  BIT_PACKED              = encoding_to_mask(Encoding::BIT_PACKED),
-  DELTA_BINARY_PACKED     = encoding_to_mask(Encoding::DELTA_BINARY_PACKED),
-  DELTA_LENGTH_BYTE_ARRAY = encoding_to_mask(Encoding::DELTA_LENGTH_BYTE_ARRAY),
-  DELTA_BYTE_ARRAY        = encoding_to_mask(Encoding::DELTA_BYTE_ARRAY),
-  RLE_DICTIONARY          = encoding_to_mask(Encoding::RLE_DICTIONARY),
-  BYTE_STREAM_SPLIT       = encoding_to_mask(Encoding::BYTE_STREAM_SPLIT)
-};
-
 /**
  * @brief Struct describing an encoder column chunk
  */
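With the `EncodingMask` enum removed, the invariant that every `Encoding` owns a distinct mask bit now lives entirely in `encoding_to_mask`. A couple of compile-time checks sketching that invariant, again against a mocked subset of the enum rather than the cudf headers:

  #include <cstdint>

  enum class Encoding : uint8_t { PLAIN = 0, RLE = 3, RLE_DICTIONARY = 8 };  // mocked subset

  constexpr uint32_t encoding_to_mask(Encoding e) { return 1u << static_cast<uint32_t>(e); }

  static_assert(encoding_to_mask(Encoding::PLAIN) == 0x001);
  static_assert(encoding_to_mask(Encoding::RLE) == 0x008);
  static_assert(encoding_to_mask(Encoding::RLE_DICTIONARY) == 0x100);

  int main() { return 0; }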