Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix parquet predicate filtering with column projection #15113

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
be089f3
fix stats filter conversion dtypes and names
karthikeyann Feb 21, 2024
f458410
filter columns limitation fixed.
karthikeyann Mar 1, 2024
b01b2d8
address review comments, added docstring
karthikeyann Mar 1, 2024
b348db4
Merge branch 'branch-24.04' into fix-pq_filter_col_projection
karthikeyann Mar 1, 2024
4a07e3d
add docstring for filter
karthikeyann Mar 1, 2024
6ee2bcf
Merge branch 'branch-24.04' into fix-pq_filter_col_projection
karthikeyann Mar 6, 2024
acb0723
update docs with example
karthikeyann Mar 6, 2024
bff38f5
Merge branch 'fix-pq_filter_col_projection' of github.com:karthikeyan…
karthikeyann Mar 6, 2024
d643ce1
Merge branch 'branch-24.04' into fix-pq_filter_col_projection
karthikeyann Mar 6, 2024
e79552c
Merge branch 'branch-24.06' into fix-pq_filter_col_projection
karthikeyann Apr 9, 2024
e40cffc
address review comments, include cleanup, reorg code
karthikeyann Apr 24, 2024
926a75a
Merge branch 'branch-24.06' into fix-pq_filter_col_projection
karthikeyann Apr 24, 2024
a220d7d
fix col index ref on projection
karthikeyann May 10, 2024
c0e734c
Merge branch 'branch-24.06' into fix-pq_filter_col_projection
karthikeyann May 10, 2024
96ea0e8
Merge branch 'branch-24.06' into fix-pq_filter_col_projection
vuule May 14, 2024
47c5413
Merge branch 'branch-24.06' into fix-pq_filter_col_projection
mhaseeb123 May 15, 2024
9e4008e
remove caching output dtypes
karthikeyann May 16, 2024
cc3bd26
Merge branch 'branch-24.06' into fix-pq_filter_col_projection
karthikeyann May 16, 2024
f64294e
wMerge branch 'fix-pq_filter_col_projection' of github.com:karthikeya…
karthikeyann May 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions cpp/include/cudf/io/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,10 @@ class parquet_reader_options {
/**
* @brief Sets AST based filter for predicate pushdown.
*
* The filter can utilize cudf::ast::column_name_reference to reference a column by its name,
* even if it's not necessarily present in the requested projected columns.
* To refer to output column indices, you can use cudf::ast::column_reference.
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
*
* @param filter AST expression to use as filter
*/
void set_filter(ast::expression const& filter) { _filter = filter; }
Expand Down Expand Up @@ -292,9 +296,13 @@ class parquet_reader_options_builder {
}

/**
* @brief Sets vector of individual row groups to read.
* @brief Sets AST based filter for predicate pushdown.
*
* @param filter Vector of row groups to read
* The filter can utilize cudf::ast::column_name_reference to reference a column by its name,
* even if it's not necessarily present in the requested projected columns.
* To refer to output column indices, you can use cudf::ast::column_reference.
*
* @param filter AST expression to use as filter
* @return this for chaining
*/
parquet_reader_options_builder& filter(ast::expression const& filter)
Expand Down
83 changes: 79 additions & 4 deletions cpp/src/io/parquet/predicate_pushdown.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -374,9 +374,44 @@ class stats_expression_converter : public ast::detail::expression_transformer {
};
} // namespace

/**
 * @brief Caches the data types and names of the root-level columns of the parquet schema.
 *
 * Populates `_root_level_types` and `_root_level_names` from the schema so that a filter
 * expression can be resolved against the full parquet schema order, independent of any
 * column projection. No-op if the cache is already populated.
 *
 * @param strings_to_categorical Whether string columns are read as categorical/dictionary32
 * @param timestamp_type_id Timestamp type to which timestamp columns are cast
 */
void aggregate_reader_metadata::cache_root_dtypes_names(bool strings_to_categorical,
                                                        type_id timestamp_type_id)
{
  // TODO: restrict to names present in the filter (and track their col_idx) instead of all roots.
  if (!_root_level_types.empty()) return;
  // Resolves the cudf data type of a root-level schema node, recursing through stub
  // nodes (which wrap exactly one child).
  std::function<cudf::data_type(int)> get_dtype = [strings_to_categorical,
                                                   timestamp_type_id,
                                                   &get_dtype,
                                                   this](int schema_idx) -> cudf::data_type {
    auto const& schema_elem = get_schema(schema_idx);
    if (schema_elem.is_stub()) {
      CUDF_EXPECTS(schema_elem.num_children == 1, "Unexpected number of children for stub");
      return get_dtype(schema_elem.children_idx[0]);
    }

    auto const one_level_list = schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx));
    // A one-level list at the root maps directly to a LIST column; otherwise derive the
    // type from the schema element itself.
    auto const col_type = one_level_list
                            ? type_id::LIST
                            : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id);
    auto const dtype = to_data_type(col_type, schema_elem);
    // NOTE(review): path_is_valid is not checked for nested columns here — TODO add test
    // cases for schemas with no leaf node.
    return dtype;
  };

  auto const& root = get_schema(0);
  for (auto const& schema_idx : root.children_idx) {
    if (schema_idx < 0) { continue; }
    _root_level_types.push_back(get_dtype(schema_idx));
    _root_level_names.push_back(get_schema(schema_idx).name);
  }
}

std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::filter_row_groups(
host_span<std::vector<size_type> const> row_group_indices,
host_span<data_type const> output_dtypes,
std::reference_wrapper<ast::expression const> filter,
rmm::cuda_stream_view stream) const
{
Expand Down Expand Up @@ -410,8 +445,8 @@ std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::fi
// For each column, it contains #sources * #column_chunks_per_src rows.
std::vector<std::unique_ptr<column>> columns;
stats_caster stats_col{total_row_groups, per_file_metadata, input_row_group_indices};
for (size_t col_idx = 0; col_idx < output_dtypes.size(); col_idx++) {
auto const& dtype = output_dtypes[col_idx];
for (size_t col_idx = 0; col_idx < _root_level_types.size(); col_idx++) {
auto const& dtype = _root_level_types[col_idx];
// Only comparable types except fixed point are supported.
if (cudf::is_compound(dtype) && dtype.id() != cudf::type_id::STRING) {
// placeholder only for unsupported types.
Expand All @@ -427,9 +462,13 @@ std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::fi
columns.push_back(std::move(max_col));
}
auto stats_table = cudf::table(std::move(columns));
// named filter to reference filter w.r.t parquet schema order.
auto expr_conv = named_to_reference_converter(filter, _root_level_names);
auto reference_filter = expr_conv.get_converted_expr();

// Converts AST to StatsAST with reference to min, max columns in above `stats_table`.
stats_expression_converter stats_expr{filter, static_cast<size_type>(output_dtypes.size())};
stats_expression_converter stats_expr{reference_filter.value().get(),
static_cast<size_type>(_root_level_types.size())};
auto stats_ast = stats_expr.get_stats_expr();
auto predicate_col = cudf::detail::compute_column(stats_table, stats_ast.get(), stream, mr);
auto predicate = predicate_col->view();
Expand Down Expand Up @@ -529,4 +568,40 @@ named_to_reference_converter::visit_operands(
return transformed_operands;
}

// Visitor that collects column names referenced by an AST expression.
// A literal carries no column reference, so it is passed through untouched.
std::reference_wrapper<ast::expression const> names_from_expression::visit(ast::literal const& lit)
{
  return lit;
}

// An index-based column reference carries no name, so nothing is collected.
std::reference_wrapper<ast::expression const> names_from_expression::visit(
  ast::column_reference const& col_ref)
{
  return col_ref;
}

// Record the referenced column name, unless the caller asked for it to be skipped.
std::reference_wrapper<ast::expression const> names_from_expression::visit(
  ast::column_name_reference const& name_ref)
{
  auto const name = name_ref.get_column_name();
  if (_skip_names.find(name) == _skip_names.end()) { _column_names.insert(name); }
  return name_ref;
}

// Recurse into an operation's operands so nested column references are collected.
std::reference_wrapper<ast::expression const> names_from_expression::visit(
  ast::operation const& op)
{
  visit_operands(op.get_operands());
  return op;
}

// Dispatch this visitor over each operand expression in turn.
void names_from_expression::visit_operands(
  std::vector<std::reference_wrapper<ast::expression const>> operands)
{
  for (auto it = operands.cbegin(); it != operands.cend(); ++it) {
    it->get().accept(*this);
  }
}

} // namespace cudf::io::parquet::detail
25 changes: 22 additions & 3 deletions cpp/src/io/parquet/reader_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
#include <cudf/detail/utilities/stream_pool.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>

#include <thrust/iterator/counting_iterator.h>

#include <bitset>
#include <numeric>

Expand Down Expand Up @@ -357,13 +359,25 @@ reader::impl::impl(std::size_t chunk_read_limit,
// Binary columns can be read as binary or strings
_reader_column_schema = options.get_column_schema();

// Select only columns required by the options
// Select only columns required by the options and filter
std::optional<std::vector<std::string>> filter_columns_names;
if (options.get_filter().has_value() and options.get_columns().has_value()) {
// list, struct, dictionary are not supported by AST filter yet.
// extract columns not present in get_columns() & keep count to remove at end.
auto extractor = names_from_expression(options.get_filter(), *(options.get_columns()));
filter_columns_names = extractor.get_column_names();
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
_num_filter_columns = filter_columns_names->size();
}
std::tie(_input_columns, _output_buffers, _output_column_schemas) =
_metadata->select_columns(options.get_columns(),
filter_columns_names,
options.is_enabled_use_pandas_metadata(),
_strings_to_categorical,
_timestamp_type.id());

// Find the name, and dtypes of parquet root level schema. (save it in _metadata.)
_metadata->cache_root_dtypes_names(_strings_to_categorical, _timestamp_type.id());

// Save the states of the output buffers for reuse in `chunk_read()`.
for (auto const& buff : _output_buffers) {
_output_buffers_template.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff));
Expand Down Expand Up @@ -489,7 +503,12 @@ table_with_metadata reader::impl::finalize_output(
*read_table, filter.value().get(), _stream, rmm::mr::get_current_device_resource());
CUDF_EXPECTS(predicate->view().type().id() == type_id::BOOL8,
"Predicate filter should return a boolean");
auto output_table = cudf::detail::apply_boolean_mask(*read_table, *predicate, _stream, _mr);
// Exclude columns present in filter only in output
auto counting_it = thrust::make_counting_iterator<std::size_t>(0);
auto const output_count = read_table->num_columns() - _num_filter_columns;
auto only_output = read_table->select(counting_it, counting_it + output_count);
auto output_table = cudf::detail::apply_boolean_mask(only_output, *predicate, _stream, _mr);
if (_num_filter_columns > 0) { out_metadata.schema_info.resize(output_count); }
return {std::move(output_table), std::move(out_metadata)};
}
return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
Expand All @@ -509,7 +528,7 @@ table_with_metadata reader::impl::read(
auto expr_conv = named_to_reference_converter(filter, metadata);
auto output_filter = expr_conv.get_converted_expr();

prepare_data(skip_rows, num_rows, uses_custom_row_bounds, row_group_indices, output_filter);
prepare_data(skip_rows, num_rows, uses_custom_row_bounds, row_group_indices, filter);
return read_chunk_internal(uses_custom_row_bounds, output_filter);
}

Expand Down
3 changes: 3 additions & 0 deletions cpp/src/io/parquet/reader_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,9 @@ class reader::impl {
// _output_buffers associated metadata
std::unique_ptr<table_metadata> _output_metadata;

// number of extra filter columns
std::size_t _num_filter_columns{0};
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved

bool _strings_to_categorical = false;
std::optional<std::vector<reader_column_schema>> _reader_column_schema;
data_type _timestamp_type{type_id::EMPTY};
Expand Down
35 changes: 21 additions & 14 deletions cpp/src/io/parquet/reader_impl_helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include "io/utilities/row_selection.hpp"

#include <functional>
#include <numeric>
#include <regex>

Expand Down Expand Up @@ -444,14 +445,13 @@ aggregate_reader_metadata::select_row_groups(
host_span<std::vector<size_type> const> row_group_indices,
int64_t skip_rows_opt,
std::optional<size_type> const& num_rows_opt,
host_span<data_type const> output_dtypes,
std::optional<std::reference_wrapper<ast::expression const>> filter,
rmm::cuda_stream_view stream) const
{
std::optional<std::vector<std::vector<size_type>>> filtered_row_group_indices;
// if filter is not empty, then gather row groups to read after predicate pushdown
if (filter.has_value()) {
filtered_row_group_indices =
filter_row_groups(row_group_indices, output_dtypes, filter.value(), stream);
filtered_row_group_indices = filter_row_groups(row_group_indices, filter.value(), stream);
if (filtered_row_group_indices.has_value()) {
row_group_indices =
host_span<std::vector<size_type> const>(filtered_row_group_indices.value());
Expand Down Expand Up @@ -499,10 +499,12 @@ aggregate_reader_metadata::select_row_groups(
std::tuple<std::vector<input_column_info>,
std::vector<cudf::io::detail::inline_column_buffer>,
std::vector<size_type>>
aggregate_reader_metadata::select_columns(std::optional<std::vector<std::string>> const& use_names,
bool include_index,
bool strings_to_categorical,
type_id timestamp_type_id) const
aggregate_reader_metadata::select_columns(
std::optional<std::vector<std::string>> const& use_names,
std::optional<std::vector<std::string>> const& filter_columns_names,
bool include_index,
bool strings_to_categorical,
type_id timestamp_type_id) const
{
auto find_schema_child = [&](SchemaElement const& schema_elem, std::string const& name) {
auto const& col_schema_idx =
Expand Down Expand Up @@ -667,13 +669,18 @@ aggregate_reader_metadata::select_columns(std::optional<std::vector<std::string>

// Find which of the selected paths are valid and get their schema index
std::vector<path_info> valid_selected_paths;
for (auto const& selected_path : *use_names) {
auto found_path =
std::find_if(all_paths.begin(), all_paths.end(), [&](path_info& valid_path) {
return valid_path.full_path == selected_path;
});
if (found_path != all_paths.end()) {
valid_selected_paths.push_back({selected_path, found_path->schema_idx});
// vector reference pushback (*use_names). If filter names passed.
std::vector<std::reference_wrapper<std::vector<std::string> const>> column_names{
*use_names, *filter_columns_names};
for (auto const& used_column_names : column_names) {
for (auto const& selected_path : used_column_names.get()) {
auto found_path =
std::find_if(all_paths.begin(), all_paths.end(), [&](path_info& valid_path) {
return valid_path.full_path == selected_path;
});
if (found_path != all_paths.end()) {
valid_selected_paths.push_back({selected_path, found_path->schema_idx});
}
}
}

Expand Down
Loading
Loading