diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 7e4ab5b8d9d..23ed0153f3f 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -244,6 +244,7 @@ class column_in_metadata {
   bool _use_int96_timestamp = false;
   // bool _output_as_binary = false;
   thrust::optional<uint8_t> _decimal_precision;
+  thrust::optional<int32_t> _parquet_field_id;
   std::vector<column_in_metadata> children;
 
  public:
@@ -324,6 +325,18 @@ class column_in_metadata {
     return *this;
   }
 
+  /**
+   * @brief Set the parquet field id of this column.
+   *
+   * @param field_id The parquet field id to set
+   * @return this for chaining
+   */
+  column_in_metadata& set_parquet_field_id(int32_t field_id)
+  {
+    _parquet_field_id = field_id;
+    return *this;
+  }
+
   /**
    * @brief Get reference to a child of this column
    *
@@ -379,6 +392,18 @@ class column_in_metadata {
    */
   [[nodiscard]] uint8_t get_decimal_precision() const { return _decimal_precision.value(); }
 
+  /**
+   * @brief Get whether parquet field id has been set for this column.
+   */
+  [[nodiscard]] bool is_parquet_field_id_set() const { return _parquet_field_id.has_value(); }
+
+  /**
+   * @brief Get the parquet field id that was set for this column.
+   * @throws If parquet field id was not set for this column.
+   *         Check using `is_parquet_field_id_set()` first.
+   */
+  [[nodiscard]] int32_t get_parquet_field_id() const { return _parquet_field_id.value(); }
+
   /**
    * @brief Get the number of children of this column
    */
diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp
index 7feaa8e61b4..a1fc2edb0bb 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.cpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -156,6 +156,7 @@ bool CompactProtocolReader::read(SchemaElement* s)
                     ParquetFieldEnum<ConvertedType>(6, s->converted_type),
                     ParquetFieldInt32(7, s->decimal_scale),
                     ParquetFieldInt32(8, s->decimal_precision),
+                    ParquetFieldOptionalInt32(9, s->field_id),
                     ParquetFieldStruct<LogicalType>(10, s->logical_type));
   return function_builder(this, op);
 }
diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp
index ba48f7b127f..ddca6c37e08 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.hpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.hpp
@@ -18,6 +18,8 @@
 
 #include "parquet.hpp"
 
+#include <thrust/optional.h>
+
 #include <algorithm>
 #include <cstddef>
 #include <string>
@@ -137,6 +139,7 @@ class CompactProtocolReader {
   friend class ParquetFieldBool;
   friend class ParquetFieldInt8;
   friend class ParquetFieldInt32;
+  friend class ParquetFieldOptionalInt32;
   friend class ParquetFieldInt64;
   template <typename T>
   friend class ParquetFieldStructListFunctor;
@@ -216,6 +219,27 @@ class ParquetFieldInt32 {
   int field() { return field_val; }
 };
 
+/**
+ * @brief Functor to set value to optional 32 bit integer read from CompactProtocolReader
+ *
+ * @return True if field type is not int32
+ */
+class ParquetFieldOptionalInt32 {
+  int field_val;
+  thrust::optional<int32_t>& val;
+
+ public:
+  ParquetFieldOptionalInt32(int f, thrust::optional<int32_t>& v) : field_val(f), val(v) {}
+
+  inline bool operator()(CompactProtocolReader* cpr, int field_type)
+  {
+    val = cpr->get_i32();
+    return (field_type != ST_FLD_I32);
+  }
+
+  int field() { return field_val; }
+};
+
 /**
  * @brief Functor to set value to 64 bit integer read from CompactProtocolReader
  *
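The `types.hpp` additions above give `column_in_metadata` a chainable setter plus a guarded getter pair for Parquet field IDs, and `ParquetFieldOptionalInt32` decodes field 9 of the Thrift `SchemaElement` (the optional `field_id` in parquet.thrift) into a `thrust::optional<int32_t>`, so "absent" stays distinguishable from zero. A minimal sketch of the intended C++ call pattern; the `table_input_metadata` construction and the `parquet_writer_options` builder wiring are assumed from the existing cudf writer API, and the table/sink names are hypothetical:

#include <cudf/io/parquet.hpp>
#include <cudf/io/types.hpp>
#include <cudf/table/table_view.hpp>

#include <string>

// Sketch: tag top-level columns with Parquet field IDs before writing.
// Assumes `tbl` is an existing two-column cudf::table_view.
void write_with_field_ids(cudf::table_view const& tbl, std::string const& path)
{
  cudf::io::table_input_metadata metadata(tbl);
  metadata.column_metadata[0].set_name("id").set_parquet_field_id(1);
  metadata.column_metadata[1].set_name("score").set_parquet_field_id(2);

  // The getter throws when no ID was set, so gate it on the predicate:
  if (metadata.column_metadata[0].is_parquet_field_id_set()) {
    int32_t id = metadata.column_metadata[0].get_parquet_field_id();  // == 1
    (void)id;
  }

  auto opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{path}, tbl)
      .metadata(&metadata)
      .build();
  cudf::io::write_parquet(opts);
}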
diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp
index 927844cb1c2..176ecb6a572 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.cpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.cpp
@@ -144,6 +144,7 @@ size_t CompactProtocolWriter::write(const SchemaElement& s)
       c.field_int(8, s.decimal_precision);
     }
   }
+  if (s.field_id) { c.field_int(9, s.field_id.value()); }
   auto const isset = s.logical_type.isset;
   // TODO: add handling for all logical types
   // if (isset.STRING or isset.MAP or isset.LIST or isset.ENUM or isset.DECIMAL or isset.DATE or
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index b1800640c91..ccaf3485bdf 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -18,6 +18,8 @@
 
 #include "parquet_common.hpp"
 
+#include <thrust/optional.h>
+
 #include <cstdint>
 #include <string>
 #include <vector>
@@ -145,6 +147,7 @@ struct SchemaElement {
   int32_t num_children = 0;
   int32_t decimal_scale = 0;
   int32_t decimal_precision = 0;
+  thrust::optional<int32_t> field_id = thrust::nullopt;
 
   // The following fields are filled in later during schema initialization
   int max_definition_level = 0;
@@ -157,7 +160,8 @@ struct SchemaElement {
     return type == other.type && converted_type == other.converted_type &&
            type_length == other.type_length && repetition_type == other.repetition_type &&
            name == other.name && num_children == other.num_children &&
-           decimal_scale == other.decimal_scale && decimal_precision == other.decimal_precision;
+           decimal_scale == other.decimal_scale && decimal_precision == other.decimal_precision &&
+           field_id == other.field_id;
   }
 
   // the parquet format is a little squishy when it comes to interpreting
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 70a594423c9..c6e563c7bc8 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -538,6 +538,13 @@ std::vector<schema_tree_node> construct_schema_tree(LinkedColVector const& linke
     [&](LinkedColPtr const& col, column_in_metadata& col_meta, size_t parent_idx) {
       bool col_nullable = is_col_nullable(col, col_meta, single_write_mode);
 
+      auto set_field_id = [&schema, parent_idx](schema_tree_node& s,
+                                                column_in_metadata const& col_meta) {
+        if (schema[parent_idx].name != "list" and col_meta.is_parquet_field_id_set()) {
+          s.field_id = col_meta.get_parquet_field_id();
+        }
+      };
+
       if (col->type().id() == type_id::STRUCT) {
         // if struct, add current and recursively call for all children
         schema_tree_node struct_schema{};
@@ -547,6 +554,7 @@ std::vector<schema_tree_node> construct_schema_tree(LinkedColVector const& linke
         struct_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name();
         struct_schema.num_children = col->num_children();
         struct_schema.parent_idx = parent_idx;
+        set_field_id(struct_schema, col_meta);
         schema.push_back(std::move(struct_schema));
 
         auto struct_node_index = schema.size() - 1;
@@ -571,6 +579,7 @@ std::vector<schema_tree_node> construct_schema_tree(LinkedColVector const& linke
         list_schema_1.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name();
         list_schema_1.num_children = 1;
         list_schema_1.parent_idx = parent_idx;
+        set_field_id(list_schema_1, col_meta);
         schema.push_back(std::move(list_schema_1));
 
         schema_tree_node list_schema_2{};
@@ -602,7 +611,10 @@ std::vector<schema_tree_node> construct_schema_tree(LinkedColVector const& linke
         map_schema.converted_type = ConvertedType::MAP;
         map_schema.repetition_type =
           col_nullable ? FieldRepetitionType::OPTIONAL : FieldRepetitionType::REQUIRED;
-        map_schema.name = col_meta.get_name();
+        map_schema.name = col_meta.get_name();
+        if (col_meta.is_parquet_field_id_set()) {
+          map_schema.field_id = col_meta.get_parquet_field_id();
+        }
         map_schema.num_children = 1;
         map_schema.parent_idx = parent_idx;
         schema.push_back(std::move(map_schema));
@@ -659,6 +671,7 @@ std::vector<schema_tree_node> construct_schema_tree(LinkedColVector const& linke
         col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name();
         col_schema.parent_idx = parent_idx;
         col_schema.leaf_column = col;
+        set_field_id(col_schema, col_meta);
         schema.push_back(col_schema);
       }
     };
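One subtlety in `construct_schema_tree` above: the `set_field_id` helper skips any schema node whose parent is the synthesized `list` wrapper, so a field ID always attaches to the user-visible column node and never to the repeated inner group; map columns copy the ID onto the MAP node directly rather than through the helper, since their key/value children sit under a synthesized repeated group. A sketch of how nested metadata picks IDs up under that rule (column layout and ID values are hypothetical; `child()` is the accessor already present on `column_in_metadata`):

#include <cudf/io/types.hpp>

// Sketch: field IDs on a STRUCT column and its children. Assumes `metadata`
// was built from a table whose column 0 is STRUCT<a: INT32, b: STRING>.
void tag_struct_column(cudf::io::table_input_metadata& metadata)
{
  auto& s = metadata.column_metadata[0];
  s.set_name("s").set_parquet_field_id(3);            // struct node gets ID 3
  s.child(0).set_name("a").set_parquet_field_id(31);  // child a gets ID 31
  s.child(1).set_name("b").set_parquet_field_id(32);  // child b gets ID 32
  // For a LIST column, an ID set on the column's own metadata lands on the
  // outer list node; the inner "list"/"element" wrapper nodes get no ID,
  // per the parent-name check in set_field_id.
}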
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index cd0aab3caeb..3905df2b274 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -219,15 +219,21 @@ struct ParquetWriterTimestampTypeTest : public ParquetWriterTest {
   auto type() { return cudf::data_type{cudf::type_to_id<T>()}; }
 };
 
+// Typed test fixture for all types
+template <typename T>
+struct ParquetWriterSchemaTest : public ParquetWriterTest {
+  auto type() { return cudf::data_type{cudf::type_to_id<T>()}; }
+};
+
 // Declare typed test cases
 // TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5352
 using SupportedTypes = cudf::test::Types<int8_t, int16_t, int32_t, int64_t, bool, float, double>;
 TYPED_TEST_SUITE(ParquetWriterNumericTypeTest, SupportedTypes);
-using SupportedChronoTypes = cudf::test::Concat<cudf::test::TimestampTypes, cudf::test::DurationTypes>;
-TYPED_TEST_SUITE(ParquetWriterChronoTypeTest, SupportedChronoTypes);
+TYPED_TEST_SUITE(ParquetWriterChronoTypeTest, cudf::test::ChronoTypes);
 using SupportedTimestampTypes =
   cudf::test::Types<cudf::timestamp_ms, cudf::timestamp_us, cudf::timestamp_ns>;
 TYPED_TEST_SUITE(ParquetWriterTimestampTypeTest, SupportedTimestampTypes);
+TYPED_TEST_SUITE(ParquetWriterSchemaTest, cudf::test::AllTypes);
 
 // Base test fixture for chunked writer tests
 struct ParquetChunkedWriterTest : public cudf::test::BaseFixture {
diff --git a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java
index 78b3d5d52ec..f3fb7de6abe 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java
@@ -33,9 +33,15 @@ public class ColumnWriterOptions {
   private boolean isNullable;
   private boolean isMap = false;
   private String columnName;
+  // only for Parquet
+  private boolean hasParquetFieldId;
+  private int parquetFieldId;
+
   private ColumnWriterOptions(AbstractStructBuilder builder) {
     this.columnName = builder.name;
     this.isNullable = builder.isNullable;
+    this.hasParquetFieldId = builder.hasParquetFieldId;
+    this.parquetFieldId = builder.parquetFieldId;
     this.childColumnOptions =
         (ColumnWriterOptions[]) builder.children.toArray(new ColumnWriterOptions[0]);
   }
@@ -67,6 +73,10 @@ public AbstractStructBuilder(String name, boolean isNullable) {
       super(name, isNullable);
     }
 
+    public AbstractStructBuilder(String name, boolean isNullable, int parquetFieldId) {
+      super(name, isNullable, parquetFieldId);
+    }
+
    protected AbstractStructBuilder() {
      super();
    }
@@ -84,6 +94,9 @@ public static abstract class NestedBuilder<T extends NestedBuilder, V extends ColumnWriterOptions> {
     protected List<ColumnWriterOptions> children = new ArrayList<>();
     protected boolean isNullable = true;
     protected String name = "";
+    // Parquet structure needs
+    protected boolean hasParquetFieldId;
+    protected int parquetFieldId;
 
     /**
      * Builder specific to build a Struct meta
      */
@@ -93,22 +106,43 @@ protected NestedBuilder(String name, boolean isNullable) {
      this.isNullable = isNullable;
    }
 
+    protected NestedBuilder(String name, boolean isNullable, int parquetFieldId) {
+      this.name = name;
+      this.isNullable = isNullable;
+      this.hasParquetFieldId = true;
+      this.parquetFieldId = parquetFieldId;
+    }
+
     protected NestedBuilder() {}
 
-    protected ColumnWriterOptions withColumns(String name, boolean isNullable) {
+    protected ColumnWriterOptions withColumn(String name, boolean isNullable) {
       return new ColumnWriterOptions(name, isNullable);
     }
 
+    protected ColumnWriterOptions withColumn(String name, boolean isNullable, int parquetFieldId) {
+      return new ColumnWriterOptions(name, isNullable, parquetFieldId);
+    }
+
     protected ColumnWriterOptions withDecimal(String name, int precision, boolean isNullable) {
       return new ColumnWriterOptions(name, false, precision, isNullable);
     }
 
+    protected ColumnWriterOptions withDecimal(String name, int precision,
+                                              boolean isNullable, int parquetFieldId) {
+      return new ColumnWriterOptions(name, false, precision, isNullable, parquetFieldId);
+    }
+
     protected ColumnWriterOptions withTimestamp(String name, boolean isInt96, boolean isNullable) {
       return new ColumnWriterOptions(name, isInt96, UNKNOWN_PRECISION, isNullable);
     }
 
+    protected ColumnWriterOptions withTimestamp(String name, boolean isInt96,
+                                                boolean isNullable, int parquetFieldId) {
+      return new ColumnWriterOptions(name, isInt96, UNKNOWN_PRECISION, isNullable, parquetFieldId);
+    }
+
     /**
      * Set the list column meta.
      * Lists should have only one child in ColumnVector, but the metadata expects a
@@ -155,16 +189,16 @@ public T withStructColumn(StructColumnWriterOptions child) {
     /**
      * Set column name
      */
-    public T withNonNullableColumns(String... name) {
-      withColumns(false, name);
+    public T withNonNullableColumns(String... names) {
+      withColumns(false, names);
       return (T) this;
     }
 
     /**
      * Set nullable column meta data
      */
-    public T withNullableColumns(String... name) {
-      withColumns(true, name);
+    public T withNullableColumns(String... names) {
+      withColumns(true, names);
       return (T) this;
     }
@@ -172,13 +206,22 @@ public T withNullableColumns(String... name) {
      * Set a simple child meta data
      * @return this for chaining.
      */
-    public T withColumns(boolean nullable, String... name) {
-      for (String n : name) {
-        children.add(withColumns(n, nullable));
+    public T withColumns(boolean nullable, String... names) {
+      for (String n : names) {
+        children.add(withColumn(n, nullable));
       }
       return (T) this;
     }
 
+    /**
+     * Set a simple child meta data
+     * @return this for chaining.
+     */
+    public T withColumn(boolean nullable, String name, int parquetFieldId) {
+      children.add(withColumn(name, nullable, parquetFieldId));
+      return (T) this;
+    }
+
     /**
      * Set a Decimal child meta data
      * @return this for chaining.
@@ -188,6 +231,15 @@ public T withDecimalColumn(String name, int precision, boolean nullable) {
       return (T) this;
     }
 
+    /**
+     * Set a Decimal child meta data
+     * @return this for chaining.
+     */
+    public T withDecimalColumn(String name, int precision, boolean nullable, int parquetFieldId) {
+      children.add(withDecimal(name, precision, nullable, parquetFieldId));
+      return (T) this;
+    }
+
     /**
      * Set a Decimal child meta data
      * @return this for chaining.
@@ -206,6 +258,15 @@ public T withDecimalColumn(String name, int precision) {
       return (T) this;
     }
 
+    /**
+     * Set a timestamp child meta data
+     * @return this for chaining.
+     */
+    public T withTimestampColumn(String name, boolean isInt96, boolean nullable, int parquetFieldId) {
+      children.add(withTimestamp(name, isInt96, nullable, parquetFieldId));
+      return (T) this;
+    }
+
     /**
      * Set a timestamp child meta data
      * @return this for chaining.
@@ -244,6 +305,13 @@ public ColumnWriterOptions(String columnName, boolean isTimestampTypeInt96,
     this.columnName = columnName;
   }
 
+  public ColumnWriterOptions(String columnName, boolean isTimestampTypeInt96,
+                             int precision, boolean isNullable, int parquetFieldId) {
+    this(columnName, isTimestampTypeInt96, precision, isNullable);
+    this.hasParquetFieldId = true;
+    this.parquetFieldId = parquetFieldId;
+  }
+
   public ColumnWriterOptions(String columnName, boolean isNullable) {
     this.isTimestampTypeInt96 = false;
     this.precision = UNKNOWN_PRECISION;
@@ -251,6 +319,12 @@ public ColumnWriterOptions(String columnName, boolean isNullable) {
     this.columnName = columnName;
   }
 
+  public ColumnWriterOptions(String columnName, boolean isNullable, int parquetFieldId) {
+    this(columnName, isNullable);
+    this.hasParquetFieldId = true;
+    this.parquetFieldId = parquetFieldId;
+  }
+
   public ColumnWriterOptions(String columnName) {
     this(columnName, true);
   }
@@ -302,6 +376,24 @@ int[] getFlatPrecision() {
     }
   }
 
+  boolean[] getFlatHasParquetFieldId() {
+    boolean[] ret = {hasParquetFieldId};
+    if (childColumnOptions.length > 0) {
+      return getFlatBooleans(ret, (opt) -> opt.getFlatHasParquetFieldId());
+    } else {
+      return ret;
+    }
+  }
+
+  int[] getFlatParquetFieldId() {
+    int[] ret = {parquetFieldId};
+    if (childColumnOptions.length > 0) {
+      return getFlatInts(ret, (opt) -> opt.getFlatParquetFieldId());
+    } else {
+      return ret;
+    }
+  }
+
   boolean[] getFlatIsNullable() {
     boolean[] ret = {isNullable};
     if (childColumnOptions.length > 0) {
@@ -418,6 +510,13 @@ public static StructBuilder structBuilder(String name, boolean isNullable) {
     return new StructBuilder(name, isNullable);
   }
 
+  /**
+   * Creates a StructBuilder for column called 'name'
+   */
+  public static StructBuilder structBuilder(String name, boolean isNullable, int parquetFieldId) {
+    return new StructBuilder(name, isNullable, parquetFieldId);
+  }
+
   /**
    * Creates a StructBuilder for column called 'name'
    */
@@ -477,6 +576,10 @@ public StructBuilder(String name, boolean isNullable) {
       super(name, isNullable);
     }
 
+    public StructBuilder(String name, boolean isNullable, int parquetFieldId) {
+      super(name, isNullable, parquetFieldId);
+    }
+
     public StructColumnWriterOptions build() {
       return new StructColumnWriterOptions(this);
     }
diff --git a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java
index 9292975d0ce..3a3b7d721b7 100644
--- a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java
@@ -41,6 +41,16 @@ int[] getFlatPrecision() {
     return super.getFlatInts(new int[]{}, (opt) -> opt.getFlatPrecision());
   }
 
+  @Override
+  boolean[] getFlatHasParquetFieldId() {
+    return super.getFlatBooleans(new boolean[]{}, (opt) -> opt.getFlatHasParquetFieldId());
+  }
+
+  @Override
+  int[] getFlatParquetFieldId() {
+    return super.getFlatInts(new int[]{}, (opt) -> opt.getFlatParquetFieldId());
+  }
+
   @Override
   int[] getFlatNumChildren() {
     return super.getFlatInts(new int[]{}, (opt) -> opt.getFlatNumChildren());
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index ff966643866..24f7d44ed28 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -289,7 +289,10 @@ private static native long writeParquetFileBegin(String[] columnNames,
                                                    int statsFreq,
                                                    boolean[] isInt96,
                                                    int[] precisions,
-                                                   boolean[] isMapValues, String filename) throws CudfException;
+                                                   boolean[] isMapValues,
+                                                   boolean[] hasParquetFieldIds,
+                                                   int[] parquetFieldIds,
+                                                   String filename) throws CudfException;
 
   /**
    * Setup everything to write parquet formatted data to a buffer.
@@ -319,6 +322,8 @@ private static native long writeParquetBufferBegin(String[] columnNames,
                                                      boolean[] isInt96,
                                                      int[] precisions,
                                                      boolean[] isMapValues,
+                                                     boolean[] hasParquetFieldIds,
+                                                     int[] parquetFieldIds,
                                                      HostBufferConsumer consumer) throws CudfException;
 
   /**
@@ -1201,6 +1206,8 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) {
     boolean[] timeInt96Values = options.getFlatIsTimeTypeInt96();
     boolean[] isMapValues = options.getFlatIsMap();
     int[] precisions = options.getFlatPrecision();
+    boolean[] hasParquetFieldIds = options.getFlatHasParquetFieldId();
+    int[] parquetFieldIds = options.getFlatParquetFieldId();
     int[] flatNumChildren = options.getFlatNumChildren();
 
     this.consumer = null;
@@ -1215,6 +1222,8 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) {
         timeInt96Values,
         precisions,
         isMapValues,
+        hasParquetFieldIds,
+        parquetFieldIds,
         outputFile.getAbsolutePath());
   }
 
@@ -1224,6 +1233,8 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons
     boolean[] timeInt96Values = options.getFlatIsTimeTypeInt96();
     boolean[] isMapValues = options.getFlatIsMap();
     int[] precisions = options.getFlatPrecision();
+    boolean[] hasParquetFieldIds = options.getFlatHasParquetFieldId();
+    int[] parquetFieldIds = options.getFlatParquetFieldId();
     int[] flatNumChildren = options.getFlatNumChildren();
 
     this.consumer = consumer;
@@ -1238,6 +1249,8 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons
         timeInt96Values,
         precisions,
         isMapValues,
+        hasParquetFieldIds,
+        parquetFieldIds,
         consumer);
   }
 
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index cebe476dd87..919958d4db2 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -673,6 +673,8 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata,
                         cudf::jni::native_jbooleanArray &is_int96,
                         cudf::jni::native_jintArray &precisions,
                         cudf::jni::native_jbooleanArray &is_map,
+                        cudf::jni::native_jbooleanArray &hasParquetFieldIds,
+                        cudf::jni::native_jintArray &parquetFieldIds,
                         cudf::jni::native_jintArray &children, int num_children, int read_index) {
   int write_index = 0;
   for (int i = 0; i < num_children; i++, write_index++) {
@@ -687,12 +689,15 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata,
     if (is_map[read_index]) {
       child.set_list_column_as_map();
     }
+    if (!parquetFieldIds.is_null() && hasParquetFieldIds[read_index]) {
+      child.set_parquet_field_id(parquetFieldIds[read_index]);
+    }
     column_metadata.add_child(child);
     int childs_children = children[read_index++];
     if (childs_children > 0) {
-      read_index =
-          set_column_metadata(column_metadata.child(write_index), col_names, nullability, is_int96,
-                              precisions, is_map, children, childs_children, read_index);
+      read_index = set_column_metadata(column_metadata.child(write_index), col_names, nullability,
+                                       is_int96, precisions, is_map, hasParquetFieldIds,
+                                       parquetFieldIds, children, childs_children, read_index);
     }
   }
   return read_index;
}
@@ -701,12 +706,15 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_names,
                          jintArray &j_children, jbooleanArray &j_col_nullability,
                          jbooleanArray &j_is_int96, jintArray &j_precisions,
-                         jbooleanArray &j_is_map, cudf::io::table_input_metadata &metadata) {
+                         jbooleanArray &j_is_map, cudf::io::table_input_metadata &metadata,
+                         jbooleanArray &j_hasParquetFieldIds, jintArray &j_parquetFieldIds) {
   cudf::jni::auto_set_device(env);
   cudf::jni::native_jstringArray col_names(env, j_col_names);
   cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability);
   cudf::jni::native_jbooleanArray is_int96(env, j_is_int96);
   cudf::jni::native_jintArray precisions(env, j_precisions);
+  cudf::jni::native_jbooleanArray hasParquetFieldIds(env, j_hasParquetFieldIds);
+  cudf::jni::native_jintArray parquetFieldIds(env, j_parquetFieldIds);
   cudf::jni::native_jintArray children(env, j_children);
   cudf::jni::native_jbooleanArray is_map(env, j_is_map);
 
@@ -729,11 +737,14 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam
     if (is_map[read_index]) {
       metadata.column_metadata[write_index].set_list_column_as_map();
     }
+    if (!parquetFieldIds.is_null() && hasParquetFieldIds[read_index]) {
+      metadata.column_metadata[write_index].set_parquet_field_id(parquetFieldIds[read_index]);
+    }
     int childs_children = children[read_index++];
     if (childs_children > 0) {
-      read_index =
-          set_column_metadata(metadata.column_metadata[write_index], cpp_names, col_nullability,
-                              is_int96, precisions, is_map, children, childs_children, read_index);
+      read_index = set_column_metadata(
+          metadata.column_metadata[write_index], cpp_names, col_nullability, is_int96, precisions,
+          is_map, hasParquetFieldIds, parquetFieldIds, children, childs_children, read_index);
     }
   }
 }
@@ -1539,7 +1550,8 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin(
     JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children,
     jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values,
     jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions,
-    jbooleanArray j_is_map, jobject consumer) {
+    jbooleanArray j_is_map, jbooleanArray j_hasParquetFieldIds, jintArray j_parquetFieldIds,
+    jobject consumer) {
   JNI_NULL_CHECK(env, j_col_names, "null columns", 0);
   JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0);
   JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0);
@@ -1554,7 +1566,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin(
     sink_info sink{data_sink.get()};
     table_input_metadata metadata;
     createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96,
-                        j_precisions, j_is_map, metadata);
+                        j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds);
 
     auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector();
     auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector();
@@ -1583,7 +1595,8 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin(
     JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children,
     jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values,
    jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions,
-    jbooleanArray j_is_map, jstring j_output_path) {
+    jbooleanArray j_is_map, jbooleanArray j_hasParquetFieldIds, jintArray j_parquetFieldIds,
+    jstring j_output_path) {
   JNI_NULL_CHECK(env, j_col_names, "null columns", 0);
   JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0);
   JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0);
@@ -1596,7 +1609,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin(
     using namespace cudf::jni;
     table_input_metadata metadata;
     createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96,
-                        j_precisions, j_is_map, metadata);
+                        j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds);
 
     auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector();
     auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector();
@@ -1721,8 +1734,12 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin(
     table_input_metadata metadata;
     // ORC has no `j_is_int96`, but `createTableMetaData` needs an lvalue.
     jbooleanArray j_is_int96 = NULL;
+    // ORC has no `j_parquetFieldIds`, but `createTableMetaData` needs an lvalue.
+    jbooleanArray j_hasParquetFieldIds = NULL;
+    jintArray j_parquetFieldIds = NULL;
+
     createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96,
-                        j_precisions, j_is_map, metadata);
+                        j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds);
 
     auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector();
     auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector();
@@ -1766,8 +1783,11 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin(
     table_input_metadata metadata;
     // ORC has no `j_is_int96`, but `createTableMetaData` needs an lvalue.
     jbooleanArray j_is_int96 = NULL;
+    // ORC has no `j_parquetFieldIds`, but `createTableMetaData` needs an lvalue.
+    jbooleanArray j_hasParquetFieldIds = NULL;
+    jintArray j_parquetFieldIds = NULL;
     createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96,
-                        j_precisions, j_is_map, metadata);
+                        j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds);
 
     auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector();
     auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector();
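The JNI boundary cannot cheaply pass a per-column `Optional<Integer>`, so the Java side flattens the column tree in pre-order into two parallel arrays: `hasParquetFieldIds[i]` says whether `parquetFieldIds[i]` is meaningful for flattened column `i`, and a `NULL` array (the ORC paths above) means no column has an ID. A standalone sketch of decoding that convention, with hypothetical names and `std::optional` standing in for the JNI array wrappers:

#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

// Sketch: rebuild per-column optional field IDs from the parallel arrays
// passed through writeParquetFileBegin/writeParquetBufferBegin.
std::vector<std::optional<int32_t>> unflatten_field_ids(
    std::vector<bool> const& has_field_id, std::vector<int32_t> const& field_id)
{
  std::vector<std::optional<int32_t>> out;
  out.reserve(has_field_id.size());
  for (std::size_t i = 0; i < has_field_id.size(); ++i) {
    out.push_back(has_field_id[i] ? std::optional<int32_t>{field_id[i]} : std::nullopt);
  }
  return out;
}

This mirrors what `set_column_metadata`/`createTableMetaData` do inline: they only call `set_parquet_field_id` when the array is non-null and the flag at `read_index` is set.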
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 7be1ca2118b..af28cfb6d6c 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -36,6 +36,7 @@
 import org.apache.hadoop.fs.Path;
 import org.apache.parquet.hadoop.ParquetFileReader;
 import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.apache.parquet.schema.GroupType;
 import org.apache.parquet.schema.MessageType;
 import org.apache.parquet.schema.OriginalType;
 import org.junit.jupiter.api.Test;
@@ -7899,6 +7900,126 @@ void testParquetWriteToFileUncompressedNoStats() throws IOException {
     }
   }
 
+  @Test
+  void testParquetWriteWithFieldId() throws IOException {
+    // field IDs are:
+    // c1: -1, c2: 2, c3: 3, c31: 31, c32: 32, c4: -4, c5: not specified
+    ColumnWriterOptions.StructBuilder sBuilder =
+        structBuilder("c3", true, 3)
+            .withColumn(true, "c31", 31)
+            .withColumn(true, "c32", 32);
+    ParquetWriterOptions options = ParquetWriterOptions.builder()
+        .withColumn(true, "c1", -1)
+        .withDecimalColumn("c2", 9, true, 2)
+        .withStructColumn(sBuilder.build())
+        .withTimestampColumn("c4", true, true, -4)
+        .withColumns(true, "c5")
+        .build();
+
+    File tempFile = File.createTempFile("test-field-id", ".parquet");
+    try {
+      HostColumnVector.StructType structType = new HostColumnVector.StructType(
+          true,
+          new HostColumnVector.BasicType(true, DType.STRING),
+          new HostColumnVector.BasicType(true, DType.STRING));
+
+      try (Table table0 = new Table.TestBuilder()
+          .column(true, false) // c1
+          .decimal32Column(0, 298, 2473) // c2
+          .column(structType, // c3
+              new HostColumnVector.StructData("a", "b"), new HostColumnVector.StructData("a", "b"))
+          .timestampMicrosecondsColumn(1000L, 2000L) // c4
+          .column("a", "b") // c5
+          .build()) {
+        try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) {
+          writer.write(table0);
+        }
+      }
+
+      try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(
+          new Path(tempFile.getAbsolutePath()),
+          new Configuration()))) {
+        MessageType schema = reader.getFooter().getFileMetaData().getSchema();
+        assert (schema.getFields().get(0).getId().intValue() == -1);
+        assert (schema.getFields().get(1).getId().intValue() == 2);
+        assert (schema.getFields().get(2).getId().intValue() == 3);
+        assert (((GroupType) schema.getFields().get(2)).getFields().get(0).getId().intValue() == 31);
+        assert (((GroupType) schema.getFields().get(2)).getFields().get(1).getId().intValue() == 32);
+        assert (schema.getFields().get(3).getId().intValue() == -4);
+        assert (schema.getFields().get(4).getId() == null);
+      }
+    } finally {
+      tempFile.delete();
+    }
+  }
+
+  @Test
+  void testParquetWriteWithFieldIdNestNotSpecified() throws IOException {
+    // field IDs are:
+    // c0: no field ID
+    // c1: 1
+    // c2: no field ID
+    //   c21: 21
+    //   c22: no field ID
+    // c3: 3
+    //   c31: 31
+    //   c32: no field ID
+    // c4: 0
+    ColumnWriterOptions.StructBuilder c2Builder =
+        structBuilder("c2", true)
+            .withColumn(true, "c21", 21)
+            .withColumns(true, "c22");
+    ColumnWriterOptions.StructBuilder c3Builder =
+        structBuilder("c3", true, 3)
+            .withColumn(true, "c31", 31)
+            .withColumns(true, "c32");
+    ParquetWriterOptions options = ParquetWriterOptions.builder()
+        .withColumns(true, "c0")
+        .withDecimalColumn("c1", 9, true, 1)
+        .withStructColumn(c2Builder.build())
+        .withStructColumn(c3Builder.build())
+        .withColumn(true, "c4", 0)
+        .build();
+
+    File tempFile = File.createTempFile("test-field-id", ".parquet");
+    try {
+      HostColumnVector.StructType structType = new HostColumnVector.StructType(
+          true,
+          new HostColumnVector.BasicType(true, DType.STRING),
+          new HostColumnVector.BasicType(true, DType.STRING));
+
+      try (Table table0 = new Table.TestBuilder()
+          .column(true, false) // c0
+          .decimal32Column(0, 298, 2473) // c1
+          .column(structType, // c2
+              new HostColumnVector.StructData("a", "b"), new HostColumnVector.StructData("a", "b"))
+          .column(structType, // c3
+              new HostColumnVector.StructData("a", "b"), new HostColumnVector.StructData("a", "b"))
+          .column("a", "b") // c4
+          .build()) {
+        try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) {
+          writer.write(table0);
+        }
+      }
+
+      try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(
+          new Path(tempFile.getAbsolutePath()),
+          new Configuration()))) {
+        MessageType schema = reader.getFooter().getFileMetaData().getSchema();
+        assert (schema.getFields().get(0).getId() == null);
+        assert (schema.getFields().get(1).getId().intValue() == 1);
+        assert (schema.getFields().get(2).getId() == null);
+        assert (((GroupType) schema.getFields().get(2)).getFields().get(0).getId().intValue() == 21);
+        assert (((GroupType) schema.getFields().get(2)).getFields().get(1).getId() == null);
+        assert (((GroupType) schema.getFields().get(3)).getFields().get(0).getId().intValue() == 31);
+        assert (((GroupType) schema.getFields().get(3)).getFields().get(1).getId() == null);
+        assert (schema.getFields().get(4).getId().intValue() == 0);
+      }
+    } finally {
+      tempFile.delete();
+    }
+  }
+
   /** Return a column where DECIMAL64 has been up-casted to DECIMAL128 */
   private ColumnVector castDecimal64To128(ColumnView c) {
     DType dtype = c.getType();