rapidsai · rapids-bot · Apr 15, 2022 · Mar 23, 2022 · Mar 23, 2022 · Mar 23, 2022
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -244,6 +244,7 @@ class column_in_metadata {
   bool _use_int96_timestamp = false;
   // bool _output_as_binary = false;
   thrust::optional<uint8_t> _decimal_precision;
+  thrust::optional<int32_t> _parquet_field_id;
   std::vector<column_in_metadata> children;
 
  public:
@@ -324,6 +325,18 @@ class column_in_metadata {
     return *this;
   }
 
+  /**
+   * @brief Set the parquet field id of this column.
+   * Once set, `is_parquet_field_id_set()` returns true for this column.
+   * @param field_id The parquet field id to set
+   * @return this for chaining
+   */
+  column_in_metadata& set_parquet_field_id(int32_t field_id)
+  {
+    _parquet_field_id = field_id;
+    return *this;
+  }
+
   /**
    * @brief Get reference to a child of this column
    *
@@ -379,6 +392,18 @@ class column_in_metadata {
    */
   [[nodiscard]] uint8_t get_decimal_precision() const { return _decimal_precision.value(); }
 
+  /**
+   * @brief Check whether a parquet field id has been set for this column.
+   */
+  [[nodiscard]] bool is_parquet_field_id_set() const { return _parquet_field_id.has_value(); }
+
+  /**
+   * @brief Get the parquet field id that was set for this column.
+   * @throws thrust::bad_optional_access (presumably; raised by `optional::value()`, confirm)
+   *         if no field id was set. Check using `is_parquet_field_id_set()` first.
+   */
+  [[nodiscard]] int32_t get_parquet_field_id() const { return _parquet_field_id.value(); }
+
   /**
    * @brief Get the number of children of this column
    */

@@ -144,6 +144,7 @@ size_t CompactProtocolWriter::write(const SchemaElement& s)
       c.field_int(8, s.decimal_precision);
     }
   }
+  if (s.has_field_id) { c.field_int(9, s.field_id); }
   auto const isset = s.logical_type.isset;
   // TODO: add handling for all logical types
   // if (isset.STRING or isset.MAP or isset.LIST or isset.ENUM or isset.DECIMAL or isset.DATE or

@@ -145,6 +145,8 @@ struct SchemaElement {
   int32_t num_children                = 0;
   int32_t decimal_scale               = 0;
   int32_t decimal_precision           = 0;
+  bool has_field_id                   = false;
+  int32_t field_id                    = 0;
 
   // The following fields are filled in later during schema initialization
   int max_definition_level = 0;
@@ -157,7 +159,8 @@ struct SchemaElement {
     return type == other.type && converted_type == other.converted_type &&
            type_length == other.type_length && repetition_type == other.repetition_type &&
            name == other.name && num_children == other.num_children &&
-           decimal_scale == other.decimal_scale && decimal_precision == other.decimal_precision;
+           decimal_scale == other.decimal_scale && decimal_precision == other.decimal_precision &&
+           has_field_id == other.has_field_id && field_id == other.field_id;
   }
 
   // the parquet format is a little squishy when it comes to interpreting

@@ -534,6 +534,14 @@ std::vector<schema_tree_node> construct_schema_tree(LinkedColVector const& linke
     [&](LinkedColPtr const& col, column_in_metadata& col_meta, size_t parent_idx) {
       bool col_nullable = is_col_nullable(col, col_meta, single_write_mode);
 
+      // Tags `node` with the column's parquet field id, except directly under a "list" node.
+      auto set_field_id = [&schema, parent_idx](schema_tree_node& node,
+                                                column_in_metadata const& meta) {
+        if (schema[parent_idx].name == "list" or not meta.is_parquet_field_id_set()) { return; }
+        node.has_field_id = true;
+        node.field_id     = meta.get_parquet_field_id();
+      };
+
       if (col->type().id() == type_id::STRUCT) {
         // if struct, add current and recursively call for all children
         schema_tree_node struct_schema{};
@@ -543,6 +551,7 @@ std::vector<schema_tree_node> construct_schema_tree(LinkedColVector const& linke
         struct_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name();
         struct_schema.num_children = col->num_children();
         struct_schema.parent_idx   = parent_idx;
+        set_field_id(struct_schema, col_meta);
         schema.push_back(std::move(struct_schema));
 
         auto struct_node_index = schema.size() - 1;
@@ -567,6 +576,7 @@ std::vector<schema_tree_node> construct_schema_tree(LinkedColVector const& linke
         list_schema_1.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name();
         list_schema_1.num_children = 1;
         list_schema_1.parent_idx   = parent_idx;
+        set_field_id(list_schema_1, col_meta);
         schema.push_back(std::move(list_schema_1));
 
         schema_tree_node list_schema_2{};
@@ -598,7 +608,10 @@ std::vector<schema_tree_node> construct_schema_tree(LinkedColVector const& linke
         map_schema.converted_type = ConvertedType::MAP;
         map_schema.repetition_type =
           col_nullable ? FieldRepetitionType::OPTIONAL : FieldRepetitionType::REQUIRED;
-        map_schema.name         = col_meta.get_name();
+        map_schema.name = col_meta.get_name();
+        // The writer emits the field id only when has_field_id is true, so set the flag too.
+        map_schema.has_field_id = col_meta.is_parquet_field_id_set();
+        if (map_schema.has_field_id) { map_schema.field_id = col_meta.get_parquet_field_id(); }
         map_schema.num_children = 1;
         map_schema.parent_idx   = parent_idx;
         schema.push_back(std::move(map_schema));
@@ -655,6 +668,7 @@ std::vector<schema_tree_node> construct_schema_tree(LinkedColVector const& linke
         col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name();
         col_schema.parent_idx  = parent_idx;
         col_schema.leaf_column = col;
+        set_field_id(col_schema, col_meta);
         schema.push_back(col_schema);
       }
     };

@@ -217,15 +217,21 @@ struct ParquetWriterTimestampTypeTest : public ParquetWriterTest {
   auto type() { return cudf::data_type{cudf::type_to_id<T>()}; }
 };
 
+// Typed test fixture for schema/metadata tests, instantiated over all types
+template <typename T>
+struct ParquetWriterSchemaTest : public ParquetWriterTest {
+  auto type() { return cudf::data_type{cudf::type_to_id<T>()}; }
+};
+
 // Declare typed test cases
 // TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5352
 using SupportedTypes = cudf::test::Types<int8_t, int16_t, int32_t, int64_t, bool, float, double>;
 TYPED_TEST_SUITE(ParquetWriterNumericTypeTest, SupportedTypes);
-using SupportedChronoTypes = cudf::test::Concat<cudf::test::ChronoTypes, cudf::test::DurationTypes>;
-TYPED_TEST_SUITE(ParquetWriterChronoTypeTest, SupportedChronoTypes);
+TYPED_TEST_SUITE(ParquetWriterChronoTypeTest, cudf::test::ChronoTypes);
 using SupportedTimestampTypes =
   cudf::test::Types<cudf::timestamp_ms, cudf::timestamp_us, cudf::timestamp_ns>;
 TYPED_TEST_SUITE(ParquetWriterTimestampTypeTest, SupportedTimestampTypes);
+TYPED_TEST_SUITE(ParquetWriterSchemaTest, cudf::test::AllTypes);
 
 // Base test fixture for chunked writer tests
 struct ParquetChunkedWriterTest : public cudf::test::BaseFixture {
@@ -3198,4 +3204,31 @@ TEST_F(ParquetWriterTest, RowGroupSizeInvalid)
                cudf::logic_error);
 }
 
+TYPED_TEST(ParquetWriterSchemaTest, FieldID)
+{
+  // Checks that a parquet field id set on column metadata survives into the writer options.
+  auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; });
+  auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto) { return true; });
+  constexpr auto num_rows = 800;
+  column_wrapper<TypeParam, typename decltype(sequence)::value_type> col(
+    sequence, sequence + num_rows, validity);
+
+  std::vector<std::unique_ptr<column>> cols;
+  cols.push_back(col.release());
+  auto expected = std::make_unique<table>(std::move(cols));
+  // Attach a known field id to the only column of the table.
+  cudf_io::table_input_metadata expected_metadata(*expected);
+  auto constexpr gold = 825;
+  expected_metadata.column_metadata[0].set_parquet_field_id(gold);
+  // Build writer options only; no file is written or read back in this test.
+  auto filepath = temp_env->get_temp_filepath("FieldID.parquet");
+  cudf_io::parquet_writer_options out_opts =
+    cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view())
+      .metadata(&expected_metadata);
+  // ASSERT (not EXPECT) on the flag: the getter below throws when no field id is set.
+  auto got_metadata = out_opts.get_metadata();
+  ASSERT_TRUE(got_metadata->column_metadata[0].is_parquet_field_id_set());
+  EXPECT_EQ(gold, got_metadata->column_metadata[0].get_parquet_field_id());
+}
+
 CUDF_TEST_PROGRAM_MAIN()