From de3d17ac4fb17f8b8bedd0f8aa6252319d2f40ef Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 14 Apr 2022 10:42:31 +0800 Subject: [PATCH] Add a boolean sentinel for field ID --- .../ai/rapids/cudf/ColumnWriterOptions.java | 25 +++++++++++++++---- .../CompressionMetadataWriterOptions.java | 5 ++++ java/src/main/java/ai/rapids/cudf/Table.java | 6 +++++ java/src/main/native/src/TableJni.cpp | 22 ++++++++++------ .../test/java/ai/rapids/cudf/TableTest.java | 17 ++++++++----- 5 files changed, 56 insertions(+), 19 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java index b68c6631b5d..f3fb7de6abe 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java @@ -34,11 +34,13 @@ public class ColumnWriterOptions { private boolean isMap = false; private String columnName; // only for Parquet + private boolean hasParquetFieldId; private int parquetFieldId; private ColumnWriterOptions(AbstractStructBuilder builder) { this.columnName = builder.name; this.isNullable = builder.isNullable; + this.hasParquetFieldId = builder.hasParquetFieldId; this.parquetFieldId = builder.parquetFieldId; this.childColumnOptions = (ColumnWriterOptions[]) builder.children.toArray(new ColumnWriterOptions[0]); @@ -93,6 +95,7 @@ public static abstract class NestedBuilder 0) { + return getFlatBooleans(ret, (opt) -> opt.getFlatHasParquetFieldId()); + } else { + return ret; + } + } + int[] getFlatParquetFieldId() { int[] ret = {parquetFieldId}; if (childColumnOptions.length > 0) { diff --git a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java index c1bf02e74b0..3a3b7d721b7 100644 --- a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java @@ -41,6 +41,11 @@ int[] getFlatPrecision() { return super.getFlatInts(new int[]{}, (opt) -> opt.getFlatPrecision()); } + @Override + boolean[] getFlatHasParquetFieldId() { + return super.getFlatBooleans(new boolean[]{}, (opt) -> opt.getFlatHasParquetFieldId()); + } + @Override int[] getFlatParquetFieldId() { return super.getFlatInts(new int[]{}, (opt) -> opt.getFlatParquetFieldId()); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 8cc1251d4d6..24f7d44ed28 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -290,6 +290,7 @@ private static native long writeParquetFileBegin(String[] columnNames, boolean[] isInt96, int[] precisions, boolean[] isMapValues, + boolean[] hasParquetFieldIds, int[] parquetFieldIds, String filename) throws CudfException; @@ -321,6 +322,7 @@ private static native long writeParquetBufferBegin(String[] columnNames, boolean[] isInt96, int[] precisions, boolean[] isMapValues, + boolean[] hasParquetFieldIds, int[] parquetFieldIds, HostBufferConsumer consumer) throws CudfException; @@ -1204,6 +1206,7 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { boolean[] timeInt96Values = options.getFlatIsTimeTypeInt96(); boolean[] isMapValues = options.getFlatIsMap(); int[] precisions = options.getFlatPrecision(); + boolean[] hasParquetFieldIds = options.getFlatHasParquetFieldId(); int[] parquetFieldIds = options.getFlatParquetFieldId(); int[] flatNumChildren = options.getFlatNumChildren(); @@ -1219,6 +1222,7 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { timeInt96Values, precisions, isMapValues, + hasParquetFieldIds, parquetFieldIds, outputFile.getAbsolutePath()); } @@ -1229,6 +1233,7 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons boolean[] timeInt96Values = options.getFlatIsTimeTypeInt96(); boolean[] isMapValues = options.getFlatIsMap(); int[] precisions = options.getFlatPrecision(); + boolean[] hasParquetFieldIds = options.getFlatHasParquetFieldId(); int[] parquetFieldIds = options.getFlatParquetFieldId(); int[] flatNumChildren = options.getFlatNumChildren(); @@ -1244,6 +1249,7 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons timeInt96Values, precisions, isMapValues, + hasParquetFieldIds, parquetFieldIds, consumer); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index f715f8308e0..d5cd07c9826 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -706,12 +706,13 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam jintArray &j_children, jbooleanArray &j_col_nullability, jbooleanArray &j_is_int96, jintArray &j_precisions, jbooleanArray &j_is_map, cudf::io::table_input_metadata &metadata, - jintArray &j_parquetFieldIds) { + jbooleanArray &j_hasParquetFieldIds, jintArray &j_parquetFieldIds) { cudf::jni::auto_set_device(env); cudf::jni::native_jstringArray col_names(env, j_col_names); cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); cudf::jni::native_jbooleanArray is_int96(env, j_is_int96); cudf::jni::native_jintArray precisions(env, j_precisions); + cudf::jni::native_jbooleanArray hasParquetFieldIds(env, j_hasParquetFieldIds); cudf::jni::native_jintArray parquetFieldIds(env, j_parquetFieldIds); cudf::jni::native_jintArray children(env, j_children); cudf::jni::native_jbooleanArray is_map(env, j_is_map); @@ -735,7 +736,7 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam if (is_map[read_index]) { metadata.column_metadata[write_index].set_list_column_as_map(); } - if (!parquetFieldIds.is_null()) { + if (!parquetFieldIds.is_null() && hasParquetFieldIds[read_index]) { metadata.column_metadata[write_index].set_parquet_field_id(parquetFieldIds[read_index]); } int childs_children = children[read_index++]; @@ -1548,7 +1549,8 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children, jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, - jbooleanArray j_is_map, jintArray j_parquetFieldIds, jobject consumer) { + jbooleanArray j_is_map, jbooleanArray j_hasParquetFieldIds, jintArray j_parquetFieldIds, + jobject consumer) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0); JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0); @@ -1563,7 +1565,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( sink_info sink{data_sink.get()}; table_input_metadata metadata; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96, - j_precisions, j_is_map, metadata, j_parquetFieldIds); + j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds); auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); @@ -1592,7 +1594,8 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children, jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, - jbooleanArray j_is_map, jintArray j_parquetFieldIds, jstring j_output_path) { + jbooleanArray j_is_map, jbooleanArray j_hasParquetFieldIds, jintArray j_parquetFieldIds, + jstring j_output_path) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0); JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0); @@ -1605,7 +1608,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( using namespace cudf::jni; table_input_metadata metadata; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96, - j_precisions, j_is_map, metadata, j_parquetFieldIds); + j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds); auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); @@ -1731,9 +1734,11 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( // ORC has no `j_is_int96`, but `createTableMetaData` needs a lvalue. jbooleanArray j_is_int96 = NULL; // ORC has no `j_parquetFieldIds`, but `createTableMetaData` needs a lvalue. + jbooleanArray j_hasParquetFieldIds = NULL; jintArray j_parquetFieldIds = NULL; + createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96, - j_precisions, j_is_map, metadata, j_parquetFieldIds); + j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds); auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); @@ -1778,9 +1783,10 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( // ORC has no `j_is_int96`, but `createTableMetaData` needs a lvalue. jbooleanArray j_is_int96 = NULL; // ORC has no `j_parquetFieldIds`, but `createTableMetaData` needs a lvalue. + jbooleanArray j_hasParquetFieldIds = NULL; jintArray j_parquetFieldIds = NULL; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96, - j_precisions, j_is_map, metadata, j_parquetFieldIds); + j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds); auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector(); auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector(); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index eb11e241bd0..120e30fbe63 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -7902,15 +7902,18 @@ void testParquetWriteToFileUncompressedNoStats() throws IOException { @Test void testParquetWriteWithFieldId() throws IOException { + // field IDs are: + // c1: -1, c2: 2, c3: 3, c31: 31, c32: 32, c4: -4, c5: not specified ColumnWriterOptions.StructBuilder sBuilder = structBuilder("c3", true, 3) - .withColumns(true, "c31", 31) - .withColumns(true, "c32", 32); + .withColumn(true, "c31", 31) + .withColumn(true, "c32", 32); ParquetWriterOptions options = ParquetWriterOptions.builder() - .withColumns(true, "c1", 1) + .withColumn(true, "c1", -1) .withDecimalColumn("c2", 9, true, 2) .withStructColumn(sBuilder.build()) - .withTimestampColumn("c4", true, true, 4) + .withTimestampColumn("c4", true, true, -4) + .withColumns( true, "c5") .build(); File tempFile = File.createTempFile("test-field-id", ".parquet"); @@ -7926,6 +7929,7 @@ void testParquetWriteWithFieldId() throws IOException { .column(structType, // c3 new HostColumnVector.StructData("a", "b"), new HostColumnVector.StructData("a", "b")) .timestampMicrosecondsColumn(1000L, 2000L) // c4 + .column("a", "b") // c5 .build()) { try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) { writer.write(table0); @@ -7936,12 +7940,13 @@ void testParquetWriteWithFieldId() throws IOException { new Path(tempFile.getAbsolutePath()), new Configuration()))) { MessageType schema = reader.getFooter().getFileMetaData().getSchema(); - assert (schema.getFields().get(0).getId().intValue() == 1); + assert (schema.getFields().get(0).getId().intValue() == -1); assert (schema.getFields().get(1).getId().intValue() == 2); assert (schema.getFields().get(2).getId().intValue() == 3); assert (((GroupType) schema.getFields().get(2)).getFields().get(0).getId().intValue() == 31); assert (((GroupType) schema.getFields().get(2)).getFields().get(1).getId().intValue() == 32); - assert (schema.getFields().get(3).getId().intValue() == 4); + assert (schema.getFields().get(3).getId().intValue() == -4); + assert (schema.getFields().get(4).getId() == null); } } finally { tempFile.delete();