Support reading bloom filters from Parquet files and filter row groups using them (#17289)

This PR adds support for reading bloom filters from Parquet files and using them to filter row groups with `col == literal`-style predicates, when a filter is provided.

Related to #17164
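
A minimal sketch of the user-facing behavior this enables, assuming a file whose writer emitted bloom filters for its column chunks (the file and column names here are hypothetical; the `filters` syntax matches the Python tests added below):

import cudf

# With an equality predicate, row groups whose bloom filter rules out the
# literal can now be skipped entirely, even when column chunk statistics
# alone cannot prune them (e.g. high-cardinality string columns).
df = cudf.read_parquet(
    "data_with_bloom_filters.parquet",
    filters=[("str_col", "==", "FINDME")],
)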

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Karthikeyan (https://github.com/karthikeyann)
  - Bradley Dice (https://github.com/bdice)

URL: #17289
mhaseeb123 authored Jan 14, 2025

1 parent fe75cb8 commit 41215e2
Showing 16 changed files with 1,098 additions and 67 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -516,6 +516,7 @@ add_library(
src/datetime/timezone.cpp
src/io/orc/writer_impl.cu
src/io/parquet/arrow_schema_writer.cpp
src/io/parquet/bloom_filter_reader.cu
src/io/parquet/compact_protocol_reader.cpp
src/io/parquet/compact_protocol_writer.cpp
src/io/parquet/decode_preprocess.cu
683 changes: 683 additions & 0 deletions cpp/src/io/parquet/bloom_filter_reader.cu

Large diffs are not rendered by default.
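
The new reader probes Parquet's split-block bloom filters (SPLIT_BLOCK blocks of eight 32-bit words, keys hashed with xxHash_64); on device this goes through cuco::bloom_filter with the Arrow filter policy, as the new test below shows. For intuition, a host-side sketch of the membership check as the Parquet specification describes it (not cudf's GPU code):

# Salt constants from the Parquet spec's split-block bloom filter (SBBF).
SALT = [
    0x47B6137B, 0x44974D91, 0x8824AD5B, 0xA2B7289D,
    0x705495C7, 0x2DF1424B, 0x9EFC4947, 0x5C6BFB31,
]

def sbbf_might_contain(bitset_words, num_blocks, hash64):
    """Probe an SBBF stored as a flat list of uint32 words (8 per block),
    given the 64-bit xxHash_64 of the key."""
    # Fast range mapping of the hash's high 32 bits onto a block index.
    block = ((hash64 >> 32) * num_blocks) >> 32
    # The low 32 bits select one bit in each of the block's eight words.
    x = hash64 & 0xFFFFFFFF
    for i in range(8):
        bit = ((x * SALT[i]) & 0xFFFFFFFF) >> 27
        if not (bitset_words[block * 8 + i] >> bit) & 1:
            return False  # definitely absent: prune the row group
    return True  # possibly present: the row group must still be read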

35 changes: 33 additions & 2 deletions cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2024, NVIDIA CORPORATION.
* Copyright (c) 2018-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -658,14 +658,43 @@ void CompactProtocolReader::read(ColumnChunk* c)
function_builder(this, op);
}

void CompactProtocolReader::read(BloomFilterAlgorithm* alg)
{
auto op = std::make_tuple(parquet_field_union_enumerator(1, alg->algorithm));
function_builder(this, op);
}

void CompactProtocolReader::read(BloomFilterHash* hash)
{
auto op = std::make_tuple(parquet_field_union_enumerator(1, hash->hash));
function_builder(this, op);
}

void CompactProtocolReader::read(BloomFilterCompression* comp)
{
auto op = std::make_tuple(parquet_field_union_enumerator(1, comp->compression));
function_builder(this, op);
}

void CompactProtocolReader::read(BloomFilterHeader* bf)
{
auto op = std::make_tuple(parquet_field_int32(1, bf->num_bytes),
parquet_field_struct(2, bf->algorithm),
parquet_field_struct(3, bf->hash),
parquet_field_struct(4, bf->compression));
function_builder(this, op);
}

void CompactProtocolReader::read(ColumnChunkMetaData* c)
{
using optional_size_statistics =
parquet_field_optional<SizeStatistics, parquet_field_struct<SizeStatistics>>;
using optional_list_enc_stats =
parquet_field_optional<std::vector<PageEncodingStats>,
parquet_field_struct_list<PageEncodingStats>>;
auto op = std::make_tuple(parquet_field_enum<Type>(1, c->type),
using optional_i64 = parquet_field_optional<int64_t, parquet_field_int64>;
using optional_i32 = parquet_field_optional<int32_t, parquet_field_int32>;
auto op = std::make_tuple(parquet_field_enum<Type>(1, c->type),
parquet_field_enum_list(2, c->encodings),
parquet_field_string_list(3, c->path_in_schema),
parquet_field_enum<Compression>(4, c->codec),
@@ -677,6 +706,8 @@ void CompactProtocolReader::read(ColumnChunkMetaData* c)
parquet_field_int64(11, c->dictionary_page_offset),
parquet_field_struct(12, c->statistics),
optional_list_enc_stats(13, c->encoding_stats),
optional_i64(14, c->bloom_filter_offset),
optional_i32(15, c->bloom_filter_length),
optional_size_statistics(16, c->size_statistics));
function_builder(this, op);
}
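
These overloads feed cudf's generic function_builder, which walks Thrift compact-protocol fields and dispatches on field id. As a rough illustration of the wire format, a sketch that decodes just field 1 (the i32 num_bytes) of a serialized BloomFilterHeader, assuming the writer emitted field 1 first (typical, but not guaranteed by Thrift):

def parse_bloom_filter_num_bytes(buf):
    """Decode field 1 (i32 num_bytes) of a compact-protocol BloomFilterHeader."""
    # Compact-protocol short-form field header: (field-id delta << 4) | type,
    # where type 5 means I32. For the first field the delta is the field id.
    assert buf[0] == (1 << 4) | 5
    # The i32 payload is a zigzag-encoded LEB128 varint.
    raw, shift, pos = 0, 0, 1
    while True:
        byte = buf[pos]
        raw |= (byte & 0x7F) << shift
        pos += 1
        if not byte & 0x80:
            break
        shift += 7
    return (raw >> 1) ^ -(raw & 1)  # zigzag decode
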
6 changes: 5 additions & 1 deletion cpp/src/io/parquet/compact_protocol_reader.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2024, NVIDIA CORPORATION.
* Copyright (c) 2018-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -108,6 +108,10 @@ class CompactProtocolReader {
void read(IntType* t);
void read(RowGroup* r);
void read(ColumnChunk* c);
void read(BloomFilterAlgorithm* bf);
void read(BloomFilterHash* bf);
void read(BloomFilterCompression* bf);
void read(BloomFilterHeader* bf);
void read(ColumnChunkMetaData* c);
void read(PageHeader* p);
void read(DataPageHeader* d);
52 changes: 51 additions & 1 deletion cpp/src/io/parquet/parquet.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2024, NVIDIA CORPORATION.
* Copyright (c) 2018-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -382,12 +382,62 @@ struct ColumnChunkMetaData {
// Set of all encodings used for pages in this column chunk. This information can be used to
// determine if all data pages are dictionary encoded for example.
std::optional<std::vector<PageEncodingStats>> encoding_stats;
// Byte offset from beginning of file to Bloom filter data.
std::optional<int64_t> bloom_filter_offset;
// Size of Bloom filter data including the serialized header, in bytes. Added in 2.10 so readers
// may not read this field from old files and it can be obtained after the BloomFilterHeader has
// been deserialized. Writers should write this field so readers can read the bloom filter in a
// single I/O.
std::optional<int32_t> bloom_filter_length;
// Optional statistics to help estimate total memory when converted to in-memory representations.
// The histograms contained in these statistics can also be useful in some cases for more
// fine-grained nullability/list length filter pushdown.
std::optional<SizeStatistics> size_statistics;
};

/**
* @brief The algorithm used in bloom filter
*/
struct BloomFilterAlgorithm {
// Block-based Bloom filter.
enum class Algorithm { UNDEFINED, SPLIT_BLOCK };
Algorithm algorithm{Algorithm::SPLIT_BLOCK};
};

/**
* @brief The hash function used in Bloom filter
*/
struct BloomFilterHash {
// xxHash_64
enum class Hash { UNDEFINED, XXHASH };
Hash hash{Hash::XXHASH};
};

/**
* @brief The compression used in the bloom filter
*/
struct BloomFilterCompression {
enum class Compression { UNDEFINED, UNCOMPRESSED };
Compression compression{Compression::UNCOMPRESSED};
};

/**
* @brief Bloom filter header struct
*
* The bloom filter data of a column chunk stores this header at the beginning
* followed by the filter bitset.
*/
struct BloomFilterHeader {
// The size of bitset in bytes
int32_t num_bytes;
// The algorithm for setting bits
BloomFilterAlgorithm algorithm;
// The hash function used for bloom filter
BloomFilterHash hash;
// The compression used in the bloom filter
BloomFilterCompression compression;
};

/**
* @brief Thrift-derived struct describing a chunk of data for a particular
* column
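
As the comment on bloom_filter_length above notes, that field (added in Parquet 2.10) lets a reader fetch the serialized header and the bitset in a single I/O, while older files force a header parse first. A hedged sketch of the access pattern (parse_bloom_filter_header is a hypothetical helper returning the decoded header and its serialized size; 256 bytes is an assumed upper bound on the tiny Thrift header):

def read_bloom_filter_region(f, bloom_filter_offset, bloom_filter_length):
    """Fetch one column chunk's bloom filter data with at most two reads."""
    f.seek(bloom_filter_offset)
    if bloom_filter_length is not None:
        # Newer files: one read covers the Thrift header plus the bitset.
        return f.read(bloom_filter_length)
    # Older files: over-read, decode the header to learn num_bytes, then
    # fetch whatever part of the bitset is still missing.
    buf = f.read(256)
    header, header_size = parse_bloom_filter_header(buf)  # hypothetical helper
    total = header_size + header.num_bytes
    if total > len(buf):
        buf += f.read(total - len(buf))
    return buf[:total]
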
136 changes: 88 additions & 48 deletions cpp/src/io/parquet/predicate_pushdown.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -32,6 +32,7 @@
#include <thrust/iterator/counting_iterator.h>

#include <algorithm>
#include <limits>
#include <numeric>
#include <optional>
#include <unordered_set>
@@ -388,6 +389,7 @@ class stats_expression_converter : public ast::detail::expression_transformer {
} // namespace

std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::filter_row_groups(
host_span<std::unique_ptr<datasource> const> sources,
host_span<std::vector<size_type> const> row_group_indices,
host_span<data_type const> output_dtypes,
host_span<int const> output_column_schemas,
@@ -396,7 +398,6 @@ std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::fi
{
auto mr = cudf::get_current_device_resource_ref();
// Create row group indices.
std::vector<std::vector<size_type>> filtered_row_group_indices;
std::vector<std::vector<size_type>> all_row_group_indices;
host_span<std::vector<size_type> const> input_row_group_indices;
if (row_group_indices.empty()) {
@@ -412,18 +413,22 @@ std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::fi
} else {
input_row_group_indices = row_group_indices;
}
auto const total_row_groups = std::accumulate(input_row_group_indices.begin(),
input_row_group_indices.end(),
0,
[](size_type sum, auto const& per_file_row_groups) {
return sum + per_file_row_groups.size();
});
auto const total_row_groups = std::accumulate(
input_row_group_indices.begin(),
input_row_group_indices.end(),
size_t{0},
[](size_t sum, auto const& per_file_row_groups) { return sum + per_file_row_groups.size(); });

// Check if we have less than 2B total row groups.
CUDF_EXPECTS(total_row_groups <= std::numeric_limits<cudf::size_type>::max(),
"Total number of row groups exceed the size_type's limit");

// Converts Column chunk statistics to a table
// where min(col[i]) = columns[i*2], max(col[i])=columns[i*2+1]
// For each column, it contains #sources * #column_chunks_per_src rows.
std::vector<std::unique_ptr<column>> columns;
stats_caster const stats_col{total_row_groups, per_file_metadata, input_row_group_indices};
stats_caster const stats_col{
static_cast<size_type>(total_row_groups), per_file_metadata, input_row_group_indices};
for (size_t col_idx = 0; col_idx < output_dtypes.size(); col_idx++) {
auto const schema_idx = output_column_schemas[col_idx];
auto const& dtype = output_dtypes[col_idx];
@@ -452,44 +457,23 @@ std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::fi
CUDF_EXPECTS(predicate.type().id() == cudf::type_id::BOOL8,
"Filter expression must return a boolean column");

auto const host_bitmask = [&] {
auto const num_bitmasks = num_bitmask_words(predicate.size());
if (predicate.nullable()) {
return cudf::detail::make_host_vector_sync(
device_span<bitmask_type const>(predicate.null_mask(), num_bitmasks), stream);
} else {
auto bitmask = cudf::detail::make_host_vector<bitmask_type>(num_bitmasks, stream);
std::fill(bitmask.begin(), bitmask.end(), ~bitmask_type{0});
return bitmask;
}
}();
// Filter stats table with StatsAST expression and collect filtered row group indices
auto const filtered_row_group_indices = collect_filtered_row_group_indices(
stats_table, stats_expr.get_stats_expr(), input_row_group_indices, stream);

auto validity_it = cudf::detail::make_counting_transform_iterator(
0, [bitmask = host_bitmask.data()](auto bit_index) { return bit_is_set(bitmask, bit_index); });
// Span of row groups to apply bloom filtering on.
auto const bloom_filter_input_row_groups =
filtered_row_group_indices.has_value()
? host_span<std::vector<size_type> const>(filtered_row_group_indices.value())
: input_row_group_indices;

auto const is_row_group_required = cudf::detail::make_host_vector_sync(
device_span<uint8_t const>(predicate.data<uint8_t>(), predicate.size()), stream);
// Apply bloom filtering on the bloom filter input row groups
auto const bloom_filtered_row_groups = apply_bloom_filters(
sources, bloom_filter_input_row_groups, output_dtypes, output_column_schemas, filter, stream);

// Return only filtered row groups based on predicate
// if all are required or all are nulls, return.
if (std::all_of(is_row_group_required.cbegin(),
is_row_group_required.cend(),
[](auto i) { return bool(i); }) or
predicate.null_count() == predicate.size()) {
return std::nullopt;
}
size_type is_required_idx = 0;
for (auto const& input_row_group_index : input_row_group_indices) {
std::vector<size_type> filtered_row_groups;
for (auto const rg_idx : input_row_group_index) {
if ((!validity_it[is_required_idx]) || is_row_group_required[is_required_idx]) {
filtered_row_groups.push_back(rg_idx);
}
++is_required_idx;
}
filtered_row_group_indices.push_back(std::move(filtered_row_groups));
}
return {std::move(filtered_row_group_indices)};
// Return bloom filtered row group indices iff collected
return bloom_filtered_row_groups.has_value() ? bloom_filtered_row_groups
: filtered_row_group_indices;
}

// convert column named expression to column index reference expression
@@ -510,14 +494,14 @@ named_to_reference_converter::named_to_reference_converter(
std::reference_wrapper<ast::expression const> named_to_reference_converter::visit(
ast::literal const& expr)
{
_stats_expr = std::reference_wrapper<ast::expression const>(expr);
_converted_expr = std::reference_wrapper<ast::expression const>(expr);
return expr;
}

std::reference_wrapper<ast::expression const> named_to_reference_converter::visit(
ast::column_reference const& expr)
{
_stats_expr = std::reference_wrapper<ast::expression const>(expr);
_converted_expr = std::reference_wrapper<ast::expression const>(expr);
return expr;
}

@@ -531,7 +515,7 @@ std::reference_wrapper<ast::expression const> named_to_reference_converter::visi
}
auto col_index = col_index_it->second;
_col_ref.emplace_back(col_index);
_stats_expr = std::reference_wrapper<ast::expression const>(_col_ref.back());
_converted_expr = std::reference_wrapper<ast::expression const>(_col_ref.back());
return std::reference_wrapper<ast::expression const>(_col_ref.back());
}

@@ -546,7 +530,7 @@ std::reference_wrapper<ast::expression const> named_to_reference_converter::visi
} else if (cudf::ast::detail::ast_operator_arity(op) == 1) {
_operators.emplace_back(op, new_operands.front());
}
_stats_expr = std::reference_wrapper<ast::expression const>(_operators.back());
_converted_expr = std::reference_wrapper<ast::expression const>(_operators.back());
return std::reference_wrapper<ast::expression const>(_operators.back());
}

@@ -640,4 +624,60 @@ class names_from_expression : public ast::detail::expression_transformer {
return names_from_expression(expr, skip_names).to_vector();
}

std::optional<std::vector<std::vector<size_type>>> collect_filtered_row_group_indices(
cudf::table_view table,
std::reference_wrapper<ast::expression const> ast_expr,
host_span<std::vector<size_type> const> input_row_group_indices,
rmm::cuda_stream_view stream)
{
// Filter the input table using AST expression
auto predicate_col = cudf::detail::compute_column(
table, ast_expr.get(), stream, cudf::get_current_device_resource_ref());
auto predicate = predicate_col->view();
CUDF_EXPECTS(predicate.type().id() == cudf::type_id::BOOL8,
"Filter expression must return a boolean column");

auto const host_bitmask = [&] {
auto const num_bitmasks = num_bitmask_words(predicate.size());
if (predicate.nullable()) {
return cudf::detail::make_host_vector_sync(
device_span<bitmask_type const>(predicate.null_mask(), num_bitmasks), stream);
} else {
auto bitmask = cudf::detail::make_host_vector<bitmask_type>(num_bitmasks, stream);
std::fill(bitmask.begin(), bitmask.end(), ~bitmask_type{0});
return bitmask;
}
}();

auto validity_it = cudf::detail::make_counting_transform_iterator(
0, [bitmask = host_bitmask.data()](auto bit_index) { return bit_is_set(bitmask, bit_index); });

// Return only filtered row groups based on predicate
auto const is_row_group_required = cudf::detail::make_host_vector_sync(
device_span<uint8_t const>(predicate.data<uint8_t>(), predicate.size()), stream);

// Return if all are required, or all are nulls.
if (predicate.null_count() == predicate.size() or std::all_of(is_row_group_required.cbegin(),
is_row_group_required.cend(),
[](auto i) { return bool(i); })) {
return std::nullopt;
}

// Collect indices of the filtered row groups
size_type is_required_idx = 0;
std::vector<std::vector<size_type>> filtered_row_group_indices;
for (auto const& input_row_group_index : input_row_group_indices) {
std::vector<size_type> filtered_row_groups;
for (auto const rg_idx : input_row_group_index) {
if ((!validity_it[is_required_idx]) || is_row_group_required[is_required_idx]) {
filtered_row_groups.push_back(rg_idx);
}
++is_required_idx;
}
filtered_row_group_indices.push_back(std::move(filtered_row_groups));
}

return {filtered_row_group_indices};
}

} // namespace cudf::io::parquet::detail
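
Stepping back, filter_row_groups now composes two pruning passes, with "nothing was filtered" signaled by std::nullopt. A self-contained Python sketch of just that control flow (each pass is supplied as a callable standing in for the statistics and bloom filter stages; None plays the role of std::nullopt):

def filter_row_groups(stats_pass, bloom_pass, input_row_groups):
    """Compose two pruning passes; None from a pass means it kept everything."""
    stats_filtered = stats_pass(input_row_groups)
    # Bloom filters only probe the row groups that survived the stats pass.
    bloom_input = stats_filtered if stats_filtered is not None else input_row_groups
    bloom_filtered = bloom_pass(bloom_input)
    # Prefer the bloom result when it pruned anything further.
    return bloom_filtered if bloom_filtered is not None else stats_filtered
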
5 changes: 3 additions & 2 deletions cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
* Copyright (c) 2022-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -1030,6 +1030,7 @@ std::vector<std::string> aggregate_reader_metadata::get_pandas_index_names() con

std::tuple<int64_t, size_type, std::vector<row_group_info>, std::vector<size_t>>
aggregate_reader_metadata::select_row_groups(
host_span<std::unique_ptr<datasource> const> sources,
host_span<std::vector<size_type> const> row_group_indices,
int64_t skip_rows_opt,
std::optional<size_type> const& num_rows_opt,
@@ -1042,7 +1043,7 @@ aggregate_reader_metadata::select_row_groups(
// if filter is not empty, then gather row groups to read after predicate pushdown
if (filter.has_value()) {
filtered_row_group_indices = filter_row_groups(
row_group_indices, output_dtypes, output_column_schemas, filter.value(), stream);
sources, row_group_indices, output_dtypes, output_column_schemas, filter.value(), stream);
if (filtered_row_group_indices.has_value()) {
row_group_indices =
host_span<std::vector<size_type> const>(filtered_row_group_indices.value());
90 changes: 82 additions & 8 deletions cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
* Copyright (c) 2022-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -195,6 +195,38 @@ class aggregate_reader_metadata {
*/
void column_info_for_row_group(row_group_info& rg_info, size_type chunk_start_row) const;

/**
* @brief Reads bloom filter bitsets for the specified columns from the given lists of row
* groups.
*
* @param sources Dataset sources
* @param row_group_indices Lists of row groups to read bloom filters from, one per source
* @param column_schemas Schema indices of columns whose bloom filters will be read
* @param num_row_groups Total number of row groups across the sources
* @param stream CUDA stream used for device memory operations and kernel launches
*
* @return A flattened list of bloom filter bitset device buffers for each predicate column across
* row groups
*/
[[nodiscard]] std::vector<rmm::device_buffer> read_bloom_filters(
host_span<std::unique_ptr<datasource> const> sources,
host_span<std::vector<size_type> const> row_group_indices,
host_span<int const> column_schemas,
size_type num_row_groups,
rmm::cuda_stream_view stream) const;

/**
* @brief Collects Parquet types for the columns with the specified schema indices
*
* @param row_group_indices Lists of row groups, one per source
* @param column_schemas Schema indices of columns whose types will be collected
*
* @return A list of parquet types for the columns matching the provided schema indices
*/
[[nodiscard]] std::vector<Type> get_parquet_types(
host_span<std::vector<size_type> const> row_group_indices,
host_span<int const> column_schemas) const;

public:
aggregate_reader_metadata(host_span<std::unique_ptr<datasource> const> sources,
bool use_arrow_schema,
@@ -323,26 +355,49 @@ class aggregate_reader_metadata {
/**
* @brief Filters the row groups based on predicate filter
*
* @param sources Lists of input datasources
* @param row_group_indices Lists of row groups to read, one per source
* @param output_dtypes Datatypes of of output columns
* @param output_dtypes Datatypes of output columns
* @param output_column_schemas schema indices of output columns
* @param filter AST expression to filter row groups based on Column chunk statistics
* @param stream CUDA stream used for device memory operations and kernel launches
* @return Filtered row group indices, if any is filtered.
* @return Filtered row group indices, if any is filtered
*/
[[nodiscard]] std::optional<std::vector<std::vector<size_type>>> filter_row_groups(
host_span<std::unique_ptr<datasource> const> sources,
host_span<std::vector<size_type> const> row_group_indices,
host_span<data_type const> output_dtypes,
host_span<int const> output_column_schemas,
std::reference_wrapper<ast::expression const> filter,
rmm::cuda_stream_view stream) const;

/**
* @brief Filters the row groups using bloom filters
*
* @param sources Dataset sources
* @param row_group_indices Lists of input row groups to read, one per source
* @param output_dtypes Datatypes of output columns
* @param output_column_schemas schema indices of output columns
* @param filter AST expression to filter row groups based on bloom filter membership
* @param stream CUDA stream used for device memory operations and kernel launches
*
* @return Filtered row group indices, if any is filtered
*/
[[nodiscard]] std::optional<std::vector<std::vector<size_type>>> apply_bloom_filters(
host_span<std::unique_ptr<datasource> const> sources,
host_span<std::vector<size_type> const> input_row_group_indices,
host_span<data_type const> output_dtypes,
host_span<int const> output_column_schemas,
std::reference_wrapper<ast::expression const> filter,
rmm::cuda_stream_view stream) const;

/**
* @brief Filters and reduces down to a selection of row groups
*
* The input `row_start` and `row_count` parameters will be recomputed and output as the valid
* values based on the input row group list.
*
* @param sources Lists of input datasources
* @param row_group_indices Lists of row groups to read, one per source
* @param row_start Starting row of the selection
* @param row_count Total number of rows selected
@@ -351,10 +406,11 @@ class aggregate_reader_metadata {
* @param filter Optional AST expression to filter row groups based on Column chunk statistics
* @param stream CUDA stream used for device memory operations and kernel launches
* @return A tuple of corrected row_start, row_count, list of row group indexes and its
* starting row, and list of number of rows per source.
* starting row, and list of number of rows per source
*/
[[nodiscard]] std::tuple<int64_t, size_type, std::vector<row_group_info>, std::vector<size_t>>
select_row_groups(host_span<std::vector<size_type> const> row_group_indices,
select_row_groups(host_span<std::unique_ptr<datasource> const> sources,
host_span<std::vector<size_type> const> row_group_indices,
int64_t row_start,
std::optional<size_type> const& row_count,
host_span<data_type const> output_dtypes,
@@ -413,22 +469,22 @@ class named_to_reference_converter : public ast::detail::expression_transformer
std::reference_wrapper<ast::expression const> visit(ast::operation const& expr) override;

/**
* @brief Returns the AST to apply on Column chunk statistics.
* @brief Returns the converted AST expression
*
* @return AST operation expression
*/
[[nodiscard]] std::optional<std::reference_wrapper<ast::expression const>> get_converted_expr()
const
{
return _stats_expr;
return _converted_expr;
}

private:
std::vector<std::reference_wrapper<ast::expression const>> visit_operands(
cudf::host_span<std::reference_wrapper<ast::expression const> const> operands);

std::unordered_map<std::string, size_type> column_name_to_index;
std::optional<std::reference_wrapper<ast::expression const>> _stats_expr;
std::optional<std::reference_wrapper<ast::expression const>> _converted_expr;
// Using std::list or std::deque to avoid reference invalidation
std::list<ast::column_reference> _col_ref;
std::list<ast::operation> _operators;
@@ -445,4 +501,22 @@ class named_to_reference_converter : public ast::detail::expression_transformer
std::optional<std::reference_wrapper<ast::expression const>> expr,
std::vector<std::string> const& skip_names);

/**
* @brief Filter table using the provided (StatsAST or BloomfilterAST) expression and
* collect filtered row group indices
*
* @param table Table of stats or bloom filter membership columns
* @param ast_expr StatsAST or BloomfilterAST expression to filter with
* @param input_row_group_indices Lists of input row groups to read, one per source
* @param stream CUDA stream used for device memory operations and kernel launches
*
* @return Collected filtered row group indices, one vector per source, if any. A std::nullopt if
* all row groups are required or if the computed predicate is all nulls
*/
[[nodiscard]] std::optional<std::vector<std::vector<size_type>>> collect_filtered_row_group_indices(
cudf::table_view ast_table,
std::reference_wrapper<ast::expression const> ast_expr,
host_span<std::vector<size_type> const> input_row_group_indices,
rmm::cuda_stream_view stream);

} // namespace cudf::io::parquet::detail
5 changes: 3 additions & 2 deletions cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
* Copyright (c) 2022-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -1286,7 +1286,8 @@ void reader::impl::preprocess_file(read_mode mode)
_file_itm_data.global_num_rows,
_file_itm_data.row_groups,
_file_itm_data.num_rows_per_source) =
_metadata->select_row_groups(_options.row_group_indices,
_metadata->select_row_groups(_sources,
_options.row_group_indices,
_options.skip_rows,
_options.num_rows,
output_dtypes,
5 changes: 3 additions & 2 deletions cpp/tests/CMakeLists.txt
@@ -318,14 +318,15 @@ ConfigureTest(
)
ConfigureTest(
PARQUET_TEST
io/parquet_test.cpp
io/parquet_bloom_filter_test.cu
io/parquet_chunked_reader_test.cu
io/parquet_chunked_writer_test.cpp
io/parquet_common.cpp
io/parquet_misc_test.cpp
io/parquet_reader_test.cpp
io/parquet_writer_test.cpp
io/parquet_test.cpp
io/parquet_v2_test.cpp
io/parquet_writer_test.cpp
GPUS 1
PERCENT 30
)
90 changes: 90 additions & 0 deletions cpp/tests/io/parquet_bloom_filter_test.cu
@@ -0,0 +1,90 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/iterator_utilities.hpp>

#include <cudf/detail/cuco_helpers.hpp>
#include <cudf/hashing.hpp>
#include <cudf/hashing/detail/xxhash_64.cuh>
#include <cudf/utilities/default_stream.hpp>

#include <cuco/bloom_filter.cuh>
#include <cuco/bloom_filter_policies.cuh>

using StringType = cudf::string_view;

class ParquetBloomFilterTest : public cudf::test::BaseFixture {};

TEST_F(ParquetBloomFilterTest, TestStrings)
{
using key_type = StringType;
using policy_type = cuco::arrow_filter_policy<key_type, cudf::hashing::detail::XXHash_64>;
using word_type = policy_type::word_type;

std::size_t constexpr num_filter_blocks = 4;
auto stream = cudf::get_default_stream();

// strings keys to insert
auto keys = cudf::test::strings_column_wrapper(
{"seventh", "fifteenth", "second", "tenth", "fifth", "first",
"seventh", "tenth", "ninth", "ninth", "seventeenth", "eighteenth",
"thirteenth", "fifth", "fourth", "twelfth", "second", "second",
"fourth", "seventh", "seventh", "tenth", "thirteenth", "seventeenth",
"fifth", "seventeenth", "eighth", "fourth", "second", "eighteenth",
"fifteenth", "second", "seventeenth", "thirteenth", "eighteenth", "fifth",
"seventh", "tenth", "fourteenth", "first", "fifth", "fifth",
"tenth", "thirteenth", "fourteenth", "third", "third", "sixth",
"first", "third"});

auto d_keys = cudf::column_device_view::create(keys);

// Spawn a bloom filter
cuco::bloom_filter<key_type,
cuco::extent<size_t>,
cuda::thread_scope_device,
policy_type,
cudf::detail::cuco_allocator<char>>
filter{num_filter_blocks,
cuco::thread_scope_device,
{{cudf::DEFAULT_HASH_SEED}},
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
stream};

// Add strings to the bloom filter
filter.add(d_keys->begin<key_type>(), d_keys->end<key_type>(), stream);

// Number of words in the filter
cudf::size_type const num_words = filter.block_extent() * filter.words_per_block;

// Filter bitset as a column
auto const bitset = cudf::column_view{
cudf::data_type{cudf::type_id::UINT32}, num_words, filter.data(), nullptr, 0, 0, {}};

// Expected filter bitset words computed using Arrow's implementation here:
// https://godbolt.org/z/oKfqcPWbY
auto expected = cudf::test::fixed_width_column_wrapper<word_type>(
{4194306U, 4194305U, 2359296U, 1073774592U, 524544U, 1024U, 268443648U,
8519680U, 2147500040U, 8421380U, 269500416U, 4202624U, 8396802U, 100665344U,
2147747840U, 5243136U, 131146U, 655364U, 285345792U, 134222340U, 545390596U,
2281717768U, 51201U, 41943553U, 1619656708U, 67441680U, 8462730U, 361220U,
2216738864U, 587333888U, 4219272U, 873463873U});

// Check the bitset for equality
CUDF_TEST_EXPECT_COLUMNS_EQUAL(bitset, expected);
}
4 binary files not shown (Parquet bloom filter test data).
57 changes: 56 additions & 1 deletion python/cudf/cudf/tests/test_parquet.py
@@ -1,6 +1,7 @@
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
# Copyright (c) 2019-2025, NVIDIA CORPORATION.

import datetime
import decimal
import glob
import hashlib
import math
@@ -4370,3 +4371,57 @@ def test_parquet_reader_mismatched_nullability_structs(tmpdir):
cudf.read_parquet([buf2, buf1]),
cudf.concat([df2, df1]).reset_index(drop=True),
)


@pytest.mark.parametrize(
"stats_fname,bloom_filter_fname",
[
(
"mixed_card_ndv_100_chunk_stats.snappy.parquet",
"mixed_card_ndv_100_bf_fpp0.1_nostats.snappy.parquet",
),
(
"mixed_card_ndv_500_chunk_stats.snappy.parquet",
"mixed_card_ndv_500_bf_fpp0.1_nostats.snappy.parquet",
),
],
)
@pytest.mark.parametrize(
"predicate,expected_len",
[
([[("str", "==", "FINDME")], [("fp64", "==", float(500))]], 2),
([("fixed_pt", "==", decimal.Decimal(float(500)))], 2),
([[("ui32", "==", np.uint32(500)), ("str", "==", "FINDME")]], 2),
([[("str", "==", "FINDME")], [("ui32", ">=", np.uint32(0))]], 1000),
(
[
("str", "!=", "FINDME"),
("fixed_pt", "==", decimal.Decimal(float(500))),
],
0,
),
],
)
def test_parquet_bloom_filters(
datadir, stats_fname, bloom_filter_fname, predicate, expected_len
):
fname_stats = datadir / stats_fname
fname_bf = datadir / bloom_filter_fname
df_stats = cudf.read_parquet(fname_stats, filters=predicate).reset_index(
drop=True
)
df_bf = cudf.read_parquet(fname_bf, filters=predicate).reset_index(
drop=True
)

# Check if tables equal
assert_eq(
df_stats,
df_bf,
)

# Check for table length
assert_eq(
len(df_stats),
expected_len,
)
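
For readers unfamiliar with the predicate syntax above: cudf follows the pyarrow convention where filters are given in disjunctive normal form, and only the equality conjuncts are candidates for bloom filter pruning. For example:

# [[("str", "==", "FINDME")], [("fp64", "==", 500.0)]]
#     means (str == "FINDME") OR (fp64 == 500.0)
# [[("ui32", "==", 500), ("str", "==", "FINDME")]]
#     means (ui32 == 500) AND (str == "FINDME")
# A flat list such as [("str", "!=", "FINDME"), ("fixed_pt", "==", ...)]
# is a single AND group.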
