From 1c8d67603dff141d8c6604098b57eda4f677c1a1 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Thu, 11 Mar 2021 12:12:36 +0800 Subject: [PATCH 01/16] Pass names of children struct columns to native Pass the names of children struct columns to the naitve for arrow IPC writer, which is required to build column_metadata. Also add the related unit tests. Signed-off-by: Firestarman --- .../ai/rapids/cudf/ArrowIPCWriterOptions.java | 31 +++++++- .../java/ai/rapids/cudf/ColumnMetadata.java | 47 ++++++++++++ java/src/main/java/ai/rapids/cudf/Table.java | 12 ++-- java/src/main/native/src/TableJni.cpp | 72 +++++++++++++------ .../test/java/ai/rapids/cudf/TableTest.java | 55 +++++++++++--- 5 files changed, 179 insertions(+), 38 deletions(-) create mode 100644 java/src/main/java/ai/rapids/cudf/ColumnMetadata.java diff --git a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java index 298e99b059d..bfa7e28c4e3 100644 --- a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,10 @@ package ai.rapids.cudf; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + /** * Settings for writing Arrow IPC data. */ @@ -34,11 +38,13 @@ public interface DoneOnGpu { private final long size; private final DoneOnGpu callback; + private final ColumnMetadata[] columnMeta; private ArrowIPCWriterOptions(Builder builder) { super(builder); this.size = builder.size; this.callback = builder.callback; + this.columnMeta = builder.columnMeta.toArray(new ColumnMetadata[builder.columnMeta.size()]); } public long getMaxChunkSize() { @@ -49,9 +55,23 @@ public DoneOnGpu getCallback() { return callback; } + public ColumnMetadata[] getColumnMetadata() { + if (columnMeta == null || columnMeta.length == 0) { + // For compatibility. Try building from column names when column meta is empty. + // Should remove this once all the callers update to use only column metadata. + return Arrays + .stream(getColumnNames()) + .map(ColumnMetadata::new) + .toArray(ColumnMetadata[]::new); + } else { + return columnMeta; + } + } + public static class Builder extends WriterBuilder { private long size = -1; private DoneOnGpu callback = (ignored) -> {}; + private List columnMeta = new ArrayList<>(); public Builder withMaxChunkSize(long size) { this.size = size; @@ -67,6 +87,15 @@ public Builder withCallback(DoneOnGpu callback) { return this; } + /** + * This should be used instead of `withColumnNames` when there are children + * columns of struct type. + */ + public Builder withColumnMetadata(ColumnMetadata... columnMeta) { + this.columnMeta.addAll(Arrays.asList(columnMeta)); + return this; + } + public ArrowIPCWriterOptions build() { return new ArrowIPCWriterOptions(this); } diff --git a/java/src/main/java/ai/rapids/cudf/ColumnMetadata.java b/java/src/main/java/ai/rapids/cudf/ColumnMetadata.java new file mode 100644 index 00000000000..ed1d95ff37b --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/ColumnMetadata.java @@ -0,0 +1,47 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Detailed meta data information for arrow array. + * + * (This is analogous to the native `column_metadata`.) + */ +public class ColumnMetadata { + // No getXXX for name, since it is accessed from native. + private String name; + private List children = new ArrayList<>(); + + public ColumnMetadata(final String colName) { + this.name = colName; + } + + public ColumnMetadata addChildren(ColumnMetadata... childrenMeta) { + children.addAll(Arrays.asList(childrenMeta)); + return this; + } + + public ColumnMetadata[] getChildren() { + return children.toArray(new ColumnMetadata[children.size()]); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index fcc23777d69..3d036afe8f6 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -365,19 +365,19 @@ private static native long writeORCBufferBegin(String[] columnNames, /** * Setup everything to write Arrow IPC formatted data to a file. - * @param columnNames names that correspond to the table columns + * @param columnMeta column metadata that correspond to the table columns * @param filename local output path * @return a handle that is used in later calls to writeArrowIPCChunk and writeArrowIPCEnd. */ - private static native long writeArrowIPCFileBegin(String[] columnNames, String filename); + private static native long writeArrowIPCFileBegin(ColumnMetadata[] columnMeta, String filename); /** * Setup everything to write Arrow IPC formatted data to a buffer. - * @param columnNames names that correspond to the table columns + * @param columnMeta column metadata that correspond to the table columns * @param consumer consumer of host buffers produced. * @return a handle that is used in later calls to writeArrowIPCChunk and writeArrowIPCEnd. */ - private static native long writeArrowIPCBufferBegin(String[] columnNames, + private static native long writeArrowIPCBufferBegin(ColumnMetadata[] columnMeta, HostBufferConsumer consumer); /** @@ -988,7 +988,7 @@ private ArrowIPCTableWriter(ArrowIPCWriterOptions options, this.consumer = null; this.maxChunkSize = options.getMaxChunkSize(); this.handle = writeArrowIPCFileBegin( - options.getColumnNames(), + options.getColumnMetadata(), outputFile.getAbsolutePath()); } @@ -998,7 +998,7 @@ private ArrowIPCTableWriter(ArrowIPCWriterOptions options, this.consumer = consumer; this.maxChunkSize = options.getMaxChunkSize(); this.handle = writeArrowIPCBufferBegin( - options.getColumnNames(), + options.getColumnMetadata(), consumer); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index e051f68be4e..e87803fc5ab 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -203,16 +203,16 @@ typedef jni_table_writer_handle native_orc_writer_ class native_arrow_ipc_writer_handle final { public: - explicit native_arrow_ipc_writer_handle(const std::vector &col_names, + explicit native_arrow_ipc_writer_handle(const std::vector &col_meta, const std::string &file_name) - : initialized(false), column_names(col_names), file_name(file_name) {} + : initialized(false), column_metadata(col_meta), file_name(file_name) {} - explicit native_arrow_ipc_writer_handle(const std::vector &col_names, + explicit native_arrow_ipc_writer_handle(const std::vector &col_meta, const std::shared_ptr &sink) - : initialized(false), column_names(col_names), file_name(""), sink(sink) {} + : initialized(false), column_metadata(col_meta), file_name(""), sink(sink) {} bool initialized; - std::vector column_names; + std::vector column_metadata; std::string file_name; std::shared_ptr sink; std::shared_ptr writer; @@ -563,6 +563,42 @@ bool valid_window_parameters(native_jintArray const &values, values.size() == preceding.size() && values.size() == following.size(); } +static void build_one_column_metadata(JNIEnv *env, jobject meta_obj, + jfieldID name_id, + jmethodID children_mid, + cudf::column_metadata& out) { + // get column name + cudf::jni::native_jstring col_name(env, (jstring)env->GetObjectField(meta_obj, name_id)); + out.name = std::string(col_name.get()); + // children + jobjectArray j_children_meta = (jobjectArray)env->CallObjectMethod(meta_obj, children_mid); + cudf::jni::native_jobjectArray children_meta(env, j_children_meta); + for (int i = 0; i < children_meta.size(); i ++) { + cudf::column_metadata cudf_col_meta; + build_one_column_metadata(env, children_meta.get(i), name_id, children_mid, cudf_col_meta); + out.children_meta.push_back(std::move(cudf_col_meta)); + } +} + +static void build_column_metadata_from_handle(JNIEnv *env, jobjectArray meta_handle, + std::vector& out_meta) { + native_jobjectArray col_meta(env, meta_handle); + out_meta.reserve(col_meta.size()); + // Init the column meatadata class + jclass col_meta_cls = env->FindClass("ai/rapids/cudf/ColumnMetadata"); + JNI_NULL_CHECK(env, col_meta_cls, "Can not find class: ai/rapids/cudf/ColumnMetadata", ); + jfieldID name_id = env->GetFieldID(col_meta_cls, "name", "Ljava/lang/String;"); + jmethodID children_mid = + env->GetMethodID(col_meta_cls, "getChildren", "()[Lai/rapids/cudf/ColumnMetadata;"); + + for(int x = 0; x < col_meta.size(); x ++) { + cudf::column_metadata cudf_col_meta; + build_one_column_metadata(env, col_meta.get(x), name_id, children_mid, cudf_col_meta); + out_meta.push_back(std::move(cudf_col_meta)); + } + cudf::jni::check_java_exception(env); +} + } // namespace } // namespace jni @@ -1196,36 +1232,37 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCEnd(JNIEnv *env, jclass } JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCBufferBegin(JNIEnv *env, jclass, - jobjectArray j_col_names, + jobjectArray j_col_meta, jobject consumer) { - JNI_NULL_CHECK(env, j_col_names, "null columns", 0); + JNI_NULL_CHECK(env, j_col_meta, "null columns", 0); JNI_NULL_CHECK(env, consumer, "null consumer", 0); try { cudf::jni::auto_set_device(env); - cudf::jni::native_jstringArray col_names(env, j_col_names); - std::shared_ptr data_sink( new cudf::jni::jni_arrow_output_stream(env, consumer)); + std::vector col_meta; + cudf::jni::build_column_metadata_from_handle(env, j_col_meta, col_meta); cudf::jni::native_arrow_ipc_writer_handle *ret = - new cudf::jni::native_arrow_ipc_writer_handle(col_names.as_cpp_vector(), data_sink); + new cudf::jni::native_arrow_ipc_writer_handle(col_meta, data_sink); return reinterpret_cast(ret); } CATCH_STD(env, 0) } JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCFileBegin(JNIEnv *env, jclass, - jobjectArray j_col_names, + jobjectArray j_col_meta, jstring j_output_path) { - JNI_NULL_CHECK(env, j_col_names, "null columns", 0); + JNI_NULL_CHECK(env, j_col_meta, "null columns", 0); JNI_NULL_CHECK(env, j_output_path, "null output path", 0); try { cudf::jni::auto_set_device(env); - cudf::jni::native_jstringArray col_names(env, j_col_names); cudf::jni::native_jstring output_path(env, j_output_path); + std::vector col_meta; + cudf::jni::build_column_metadata_from_handle(env, j_col_meta, col_meta); cudf::jni::native_arrow_ipc_writer_handle *ret = - new cudf::jni::native_arrow_ipc_writer_handle(col_names.as_cpp_vector(), output_path.get()); + new cudf::jni::native_arrow_ipc_writer_handle(col_meta, output_path.get()); return reinterpret_cast(ret); } CATCH_STD(env, 0) @@ -1245,12 +1282,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_convertCudfToArrowTable(JNIEnv cudf::jni::auto_set_device(env); std::unique_ptr> result( new std::shared_ptr(nullptr)); - auto column_metadata = std::vector{}; - column_metadata.reserve(state->column_names.size()); - std::transform(std::begin(state->column_names), std::end(state->column_names), - std::back_inserter(column_metadata), - [](auto const &column_name) { return cudf::column_metadata{column_name}; }); - *result = cudf::to_arrow(*tview, column_metadata); + *result = cudf::to_arrow(*tview, state->column_metadata); if (!result->get()) { return 0; } diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 88196a4112a..a7f5bc2b447 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -4056,15 +4056,38 @@ void testTableBasedFilter() { } private Table getExpectedFileTable() { - return new TestBuilder() - .column(true, false, false, true, false) - .column(5, 1, 0, 2, 7) - .column(new Byte[]{2, 3, 4, 5, 9}) - .column(3l, 9l, 4l, 2l, 20l) - .column("this", "is", "a", "test", "string") - .column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f) - .column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d) - .build(); + return getExpectedFileTable(false); + } + + private Table getExpectedFileTable(boolean withNestedColumns) { + TestBuilder tb = new TestBuilder() + .column(true, false, false, true, false) + .column(5, 1, 0, 2, 7) + .column(new Byte[]{2, 3, 4, 5, 9}) + .column(3l, 9l, 4l, 2l, 20l) + .column("this", "is", "a", "test", "string") + .column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f) + .column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d); + if (withNestedColumns) { + StructType nestedType = new StructType(true, + new BasicType(false, DType.INT32), new BasicType(false, DType.STRING)); + tb.column(nestedType, + struct(1, "k1"), struct(2, "k2"), struct(3, "k3"), + struct(4, "k4"), new HostColumnVector.StructData((List) null)) + .column(new ListType(false, new BasicType(false, DType.INT32)), + Arrays.asList(1, 2), + Arrays.asList(3, 4), + Arrays.asList(5), + Arrays.asList(6, 7), + Arrays.asList(8, 9, 10)) + .column(new ListType(false, nestedType), + Arrays.asList(struct(1, "k1"), struct(2, "k2"), struct(3, "k3")), + Arrays.asList(struct(4, "k4"), struct(5, "k5")), + Arrays.asList(struct(6, "k6")), + Arrays.asList(new HostColumnVector.StructData((List) null)), + Arrays.asList()); + } + return tb.build(); } private Table getExpectedFileTableWithDecimals() { @@ -4272,10 +4295,20 @@ void testArrowIPCWriteToFileWithNamesAndMetadata() throws IOException { @Test void testArrowIPCWriteToBufferChunked() { - try (Table table0 = getExpectedFileTable(); + try (Table table0 = getExpectedFileTable(true); MyBufferConsumer consumer = new MyBufferConsumer()) { + ColumnMetadata[] colMeta = Arrays.asList("first", "second", "third", "fourth", "fifth", + "sixth", "seventh").stream().map(ColumnMetadata::new) + .toArray(ColumnMetadata[]::new); + ColumnMetadata structCM = new ColumnMetadata("eight"); + structCM.addChildren(new ColumnMetadata("id"), new ColumnMetadata("name")); + ColumnMetadata arrayPriCM = new ColumnMetadata("nine"); + arrayPriCM.addChildren(new ColumnMetadata("aid")); + ColumnMetadata arrayStructCM = new ColumnMetadata("ten"); + arrayStructCM.addChildren(structCM); ArrowIPCWriterOptions options = ArrowIPCWriterOptions.builder() - .withColumnNames("first", "second", "third", "fourth", "fifth", "sixth", "seventh") + .withColumnMetadata(colMeta) + .withColumnMetadata(structCM, arrayPriCM, arrayStructCM) .build(); try (TableWriter writer = Table.writeArrowIPCChunked(options, consumer)) { writer.write(table0); From 6adafda79907e3005baa0857869749252ec91bfd Mon Sep 17 00:00:00 2001 From: Firestarman Date: Tue, 16 Mar 2021 09:53:21 +0800 Subject: [PATCH 02/16] Add a metadata for offset column of array type. This is required by native. Signed-off-by: Firestarman --- .../java/ai/rapids/cudf/ArrowIPCWriterOptions.java | 5 ++++- java/src/main/native/src/TableJni.cpp | 2 +- java/src/test/java/ai/rapids/cudf/TableTest.java | 11 ++++++----- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java index bfa7e28c4e3..51bdffae125 100644 --- a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java @@ -57,7 +57,7 @@ public DoneOnGpu getCallback() { public ColumnMetadata[] getColumnMetadata() { if (columnMeta == null || columnMeta.length == 0) { - // For compatibility. Try building from column names when column meta is empty. + // This is for compatibility. Try building from column names when column meta is empty. // Should remove this once all the callers update to use only column metadata. return Arrays .stream(getColumnNames()) @@ -90,6 +90,9 @@ public Builder withCallback(DoneOnGpu callback) { /** * This should be used instead of `withColumnNames` when there are children * columns of struct type. + * + * (`columnNullability` is not used by Arrow IPC Writer, so it's fine to be ignored here. + * It can be placed into `ColumnMetadata` if needing it in the future.) */ public Builder withColumnMetadata(ColumnMetadata... columnMeta) { this.columnMeta.addAll(Arrays.asList(columnMeta)); diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index e87803fc5ab..10e2968d76b 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -569,7 +569,7 @@ static void build_one_column_metadata(JNIEnv *env, jobject meta_obj, cudf::column_metadata& out) { // get column name cudf::jni::native_jstring col_name(env, (jstring)env->GetObjectField(meta_obj, name_id)); - out.name = std::string(col_name.get()); + out.name = std::string(col_name.get() == NULL ? "" : col_name.get()); // children jobjectArray j_children_meta = (jobjectArray)env->CallObjectMethod(meta_obj, children_mid); cudf::jni::native_jobjectArray children_meta(env, j_children_meta); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index a7f5bc2b447..0bb77b6538f 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -4300,12 +4300,13 @@ void testArrowIPCWriteToBufferChunked() { ColumnMetadata[] colMeta = Arrays.asList("first", "second", "third", "fourth", "fifth", "sixth", "seventh").stream().map(ColumnMetadata::new) .toArray(ColumnMetadata[]::new); - ColumnMetadata structCM = new ColumnMetadata("eight"); + ColumnMetadata structCM = new ColumnMetadata("eighth"); structCM.addChildren(new ColumnMetadata("id"), new ColumnMetadata("name")); - ColumnMetadata arrayPriCM = new ColumnMetadata("nine"); - arrayPriCM.addChildren(new ColumnMetadata("aid")); - ColumnMetadata arrayStructCM = new ColumnMetadata("ten"); - arrayStructCM.addChildren(structCM); + ColumnMetadata arrayPriCM = new ColumnMetadata("ninth"); + // Array type needs a stub metadata for the offset column + arrayPriCM.addChildren(new ColumnMetadata(null), new ColumnMetadata("aid")); + ColumnMetadata arrayStructCM = new ColumnMetadata("tenth"); + arrayStructCM.addChildren(new ColumnMetadata(null), structCM); ArrowIPCWriterOptions options = ArrowIPCWriterOptions.builder() .withColumnMetadata(colMeta) .withColumnMetadata(structCM, arrayPriCM, arrayStructCM) From 44fa20434b8e800f7f305cb167ad07d3937ebfde Mon Sep 17 00:00:00 2001 From: Firestarman Date: Wed, 17 Mar 2021 15:16:50 +0800 Subject: [PATCH 03/16] Use the regular JNI call to create the c++ column meta Do this to avoid callback callback into the JVM. Signed-off-by: Firestarman --- .../ai/rapids/cudf/ArrowIPCWriterOptions.java | 11 ++-- .../java/ai/rapids/cudf/ColumnMetadata.java | 52 +++++++++++++---- java/src/main/java/ai/rapids/cudf/Table.java | 26 +++++---- java/src/main/native/CMakeLists.txt | 1 + .../src/main/native/src/ColumnMetadataJni.cpp | 58 +++++++++++++++++++ java/src/main/native/src/TableJni.cpp | 44 +++----------- 6 files changed, 131 insertions(+), 61 deletions(-) create mode 100644 java/src/main/native/src/ColumnMetadataJni.cpp diff --git a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java index 51bdffae125..41eb0eb7aed 100644 --- a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.stream.Collectors; /** * Settings for writing Arrow IPC data. @@ -38,13 +39,13 @@ public interface DoneOnGpu { private final long size; private final DoneOnGpu callback; - private final ColumnMetadata[] columnMeta; + private final List columnMeta; private ArrowIPCWriterOptions(Builder builder) { super(builder); this.size = builder.size; this.callback = builder.callback; - this.columnMeta = builder.columnMeta.toArray(new ColumnMetadata[builder.columnMeta.size()]); + this.columnMeta = builder.columnMeta; } public long getMaxChunkSize() { @@ -55,14 +56,14 @@ public DoneOnGpu getCallback() { return callback; } - public ColumnMetadata[] getColumnMetadata() { - if (columnMeta == null || columnMeta.length == 0) { + public List getColumnMetadata() { + if (columnMeta == null || columnMeta.size() == 0) { // This is for compatibility. Try building from column names when column meta is empty. // Should remove this once all the callers update to use only column metadata. return Arrays .stream(getColumnNames()) .map(ColumnMetadata::new) - .toArray(ColumnMetadata[]::new); + .collect(Collectors.toList()); } else { return columnMeta; } diff --git a/java/src/main/java/ai/rapids/cudf/ColumnMetadata.java b/java/src/main/java/ai/rapids/cudf/ColumnMetadata.java index ed1d95ff37b..31c86018502 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnMetadata.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnMetadata.java @@ -28,20 +28,50 @@ * (This is analogous to the native `column_metadata`.) */ public class ColumnMetadata { - // No getXXX for name, since it is accessed from native. - private String name; - private List children = new ArrayList<>(); + private String name; + private List children = new ArrayList<>(); - public ColumnMetadata(final String colName) { - this.name = colName; - } + public ColumnMetadata(final String colName) { + this.name = colName; + } + + public ColumnMetadata addChildren(ColumnMetadata... childrenMeta) { + children.addAll(Arrays.asList(childrenMeta)); + return this; + } - public ColumnMetadata addChildren(ColumnMetadata... childrenMeta) { - children.addAll(Arrays.asList(childrenMeta)); - return this; + /** + * returns a cudf::column_metadata * cast to a long. We don't want to force + * users to close a ColumnMetadata. Because of the ColumnMetadata objects are created in + * pure java, but when it is time to use them this method is called to return a pointer to + * the c++ column_metadata instance. All values returned by this can be used multiple times, + * and should be closed by calling the static close method. Yes, this creates a lot more JNI + * calls, but it keeps the user API clean. + */ + long createNativeInstance() throws CudfException { + long[] childrenHandles = createNativeInstances(children); + try { + return create(name, childrenHandles); + } finally { + close(childrenHandles); } + } - public ColumnMetadata[] getChildren() { - return children.toArray(new ColumnMetadata[children.size()]); + static void close(long[] metaHandles) throws CudfException { + if (metaHandles == null) { + return; + } + for (long h : metaHandles) { + close(h); } + } + + static long[] createNativeInstances(List metadataList) { + return metadataList.stream() + .mapToLong(ColumnMetadata::createNativeInstance) + .toArray(); + } + + private static native void close(long metaHandle) throws CudfException; + private static native long create(final String name, long[] children) throws CudfException; } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 3d036afe8f6..2ca1b6600b1 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -365,19 +365,19 @@ private static native long writeORCBufferBegin(String[] columnNames, /** * Setup everything to write Arrow IPC formatted data to a file. - * @param columnMeta column metadata that correspond to the table columns + * @param columnsMeta column metadata that correspond to the table columns * @param filename local output path * @return a handle that is used in later calls to writeArrowIPCChunk and writeArrowIPCEnd. */ - private static native long writeArrowIPCFileBegin(ColumnMetadata[] columnMeta, String filename); + private static native long writeArrowIPCFileBegin(long[] columnsMeta, String filename); /** * Setup everything to write Arrow IPC formatted data to a buffer. - * @param columnMeta column metadata that correspond to the table columns + * @param columnsMeta column metadata that correspond to the table columns * @param consumer consumer of host buffers produced. * @return a handle that is used in later calls to writeArrowIPCChunk and writeArrowIPCEnd. */ - private static native long writeArrowIPCBufferBegin(ColumnMetadata[] columnMeta, + private static native long writeArrowIPCBufferBegin(long[] columnsMeta, HostBufferConsumer consumer); /** @@ -987,9 +987,12 @@ private ArrowIPCTableWriter(ArrowIPCWriterOptions options, this.callback = options.getCallback(); this.consumer = null; this.maxChunkSize = options.getMaxChunkSize(); - this.handle = writeArrowIPCFileBegin( - options.getColumnMetadata(), - outputFile.getAbsolutePath()); + long[] metaHandles = ColumnMetadata.createNativeInstances(options.getColumnMetadata()); + try { + this.handle = writeArrowIPCFileBegin(metaHandles, outputFile.getAbsolutePath()); + } finally { + ColumnMetadata.close(metaHandles); + } } private ArrowIPCTableWriter(ArrowIPCWriterOptions options, @@ -997,9 +1000,12 @@ private ArrowIPCTableWriter(ArrowIPCWriterOptions options, this.callback = options.getCallback(); this.consumer = consumer; this.maxChunkSize = options.getMaxChunkSize(); - this.handle = writeArrowIPCBufferBegin( - options.getColumnMetadata(), - consumer); + long[] metaHandles = ColumnMetadata.createNativeInstances(options.getColumnMetadata()); + try { + this.handle = writeArrowIPCBufferBegin(metaHandles, consumer); + } finally { + ColumnMetadata.close(metaHandles); + } } @Override diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index c1239fe69ea..a70e6bea72f 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -241,6 +241,7 @@ set(SOURCE_FILES "src/AggregationJni.cpp" "src/CudfJni.cpp" "src/CudaJni.cpp" + "src/ColumnMetadataJni.cpp" "src/ColumnVectorJni.cpp" "src/ColumnViewJni.cpp" "src/ContiguousTableJni.cpp" diff --git a/java/src/main/native/src/ColumnMetadataJni.cpp b/java/src/main/native/src/ColumnMetadataJni.cpp new file mode 100644 index 00000000000..87551eefaf7 --- /dev/null +++ b/java/src/main/native/src/ColumnMetadataJni.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "jni_utils.hpp" + +extern "C" { + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_ColumnMetadata_close(JNIEnv *env, + jclass, + jlong j_handle) { + JNI_NULL_CHECK(env, j_handle, "column metadata handle is null", ); + try { + auto to_del = reinterpret_cast(j_handle); + delete to_del; + } + CATCH_STD(env, ); +} + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnMetadata_create(JNIEnv *env, + jclass, + jstring j_name, + jlongArray j_children) { + try { + // No need to set device since no GPU ops here. + cudf::jni::native_jstring col_name(env, j_name); + cudf::jni::native_jlongArray meta_children(env, j_children); + // Create a meta with empty name if `col_name` is NULL. + auto name = std::string(col_name.is_null() ? "" : col_name.get()); + cudf::column_metadata *cm = new cudf::column_metadata(name); + if (!meta_children.is_null()) { + // add the children + for (int i = 0; i < meta_children.size(); i++) { + cudf::column_metadata *child = reinterpret_cast(meta_children[i]); + // copy to `this`. + cm->children_meta.push_back(*child); + } + } + return reinterpret_cast(cm); + } + CATCH_STD(env, 0); +} + +} // extern "C" diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 10e2968d76b..1d23a27d77b 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -563,40 +563,14 @@ bool valid_window_parameters(native_jintArray const &values, values.size() == preceding.size() && values.size() == following.size(); } -static void build_one_column_metadata(JNIEnv *env, jobject meta_obj, - jfieldID name_id, - jmethodID children_mid, - cudf::column_metadata& out) { - // get column name - cudf::jni::native_jstring col_name(env, (jstring)env->GetObjectField(meta_obj, name_id)); - out.name = std::string(col_name.get() == NULL ? "" : col_name.get()); - // children - jobjectArray j_children_meta = (jobjectArray)env->CallObjectMethod(meta_obj, children_mid); - cudf::jni::native_jobjectArray children_meta(env, j_children_meta); - for (int i = 0; i < children_meta.size(); i ++) { - cudf::column_metadata cudf_col_meta; - build_one_column_metadata(env, children_meta.get(i), name_id, children_mid, cudf_col_meta); - out.children_meta.push_back(std::move(cudf_col_meta)); - } -} - -static void build_column_metadata_from_handle(JNIEnv *env, jobjectArray meta_handle, +static void build_column_metadata_from_handle(JNIEnv *env, jlongArray j_meta_handles, std::vector& out_meta) { - native_jobjectArray col_meta(env, meta_handle); - out_meta.reserve(col_meta.size()); - // Init the column meatadata class - jclass col_meta_cls = env->FindClass("ai/rapids/cudf/ColumnMetadata"); - JNI_NULL_CHECK(env, col_meta_cls, "Can not find class: ai/rapids/cudf/ColumnMetadata", ); - jfieldID name_id = env->GetFieldID(col_meta_cls, "name", "Ljava/lang/String;"); - jmethodID children_mid = - env->GetMethodID(col_meta_cls, "getChildren", "()[Lai/rapids/cudf/ColumnMetadata;"); - - for(int x = 0; x < col_meta.size(); x ++) { - cudf::column_metadata cudf_col_meta; - build_one_column_metadata(env, col_meta.get(x), name_id, children_mid, cudf_col_meta); - out_meta.push_back(std::move(cudf_col_meta)); - } - cudf::jni::check_java_exception(env); + cudf::jni::native_jlongArray meta_handles(env, j_meta_handles); + for (int i = 0; i < meta_handles.size(); i++) { + cudf::column_metadata *child = reinterpret_cast(meta_handles[i]); + // copy the child into `out_meta`. + out_meta.push_back(*child); + } } } // namespace @@ -1232,7 +1206,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCEnd(JNIEnv *env, jclass } JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCBufferBegin(JNIEnv *env, jclass, - jobjectArray j_col_meta, + jlongArray j_col_meta, jobject consumer) { JNI_NULL_CHECK(env, j_col_meta, "null columns", 0); JNI_NULL_CHECK(env, consumer, "null consumer", 0); @@ -1251,7 +1225,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCBufferBegin(JNIEnv } JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCFileBegin(JNIEnv *env, jclass, - jobjectArray j_col_meta, + jlongArray j_col_meta, jstring j_output_path) { JNI_NULL_CHECK(env, j_col_meta, "null columns", 0); JNI_NULL_CHECK(env, j_output_path, "null output path", 0); From a767e7008d15fcefcc60aedec2643f62bd5994bf Mon Sep 17 00:00:00 2001 From: Firestarman Date: Wed, 17 Mar 2021 16:16:39 +0800 Subject: [PATCH 04/16] Correct the year Signed-off-by: Firestarman --- java/src/main/native/src/ColumnMetadataJni.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/native/src/ColumnMetadataJni.cpp b/java/src/main/native/src/ColumnMetadataJni.cpp index 87551eefaf7..f90d09c69ff 100644 --- a/java/src/main/native/src/ColumnMetadataJni.cpp +++ b/java/src/main/native/src/ColumnMetadataJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From d83d0e9a24d1ad953e49848a80843540c53cd017 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Wed, 17 Mar 2021 16:27:05 +0800 Subject: [PATCH 05/16] comment update Signed-off-by: Firestarman --- java/src/main/native/src/TableJni.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 1d23a27d77b..c1ac5a7c8d1 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -567,9 +567,9 @@ static void build_column_metadata_from_handle(JNIEnv *env, jlongArray j_meta_han std::vector& out_meta) { cudf::jni::native_jlongArray meta_handles(env, j_meta_handles); for (int i = 0; i < meta_handles.size(); i++) { - cudf::column_metadata *child = reinterpret_cast(meta_handles[i]); - // copy the child into `out_meta`. - out_meta.push_back(*child); + cudf::column_metadata *cm = reinterpret_cast(meta_handles[i]); + // copy to `out_meta`. + out_meta.push_back(*cm); } } From 4c2eeef858318fbd3ec01722c14eb63c3c062bad Mon Sep 17 00:00:00 2001 From: Firestarman Date: Thu, 18 Mar 2021 11:02:18 +0800 Subject: [PATCH 06/16] Align with the flattened name approach. Signed-off-by: Firestarman --- .../ai/rapids/cudf/ArrowIPCWriterOptions.java | 35 +------- .../java/ai/rapids/cudf/ColumnMetadata.java | 77 ---------------- java/src/main/java/ai/rapids/cudf/Table.java | 26 +++--- java/src/main/native/CMakeLists.txt | 1 - .../src/main/native/src/ColumnMetadataJni.cpp | 58 ------------ java/src/main/native/src/TableJni.cpp | 88 +++++++++++++------ .../test/java/ai/rapids/cudf/TableTest.java | 16 +--- 7 files changed, 77 insertions(+), 224 deletions(-) delete mode 100644 java/src/main/java/ai/rapids/cudf/ColumnMetadata.java delete mode 100644 java/src/main/native/src/ColumnMetadataJni.cpp diff --git a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java index 41eb0eb7aed..298e99b059d 100644 --- a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,11 +18,6 @@ package ai.rapids.cudf; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; - /** * Settings for writing Arrow IPC data. */ @@ -39,13 +34,11 @@ public interface DoneOnGpu { private final long size; private final DoneOnGpu callback; - private final List columnMeta; private ArrowIPCWriterOptions(Builder builder) { super(builder); this.size = builder.size; this.callback = builder.callback; - this.columnMeta = builder.columnMeta; } public long getMaxChunkSize() { @@ -56,23 +49,9 @@ public DoneOnGpu getCallback() { return callback; } - public List getColumnMetadata() { - if (columnMeta == null || columnMeta.size() == 0) { - // This is for compatibility. Try building from column names when column meta is empty. - // Should remove this once all the callers update to use only column metadata. - return Arrays - .stream(getColumnNames()) - .map(ColumnMetadata::new) - .collect(Collectors.toList()); - } else { - return columnMeta; - } - } - public static class Builder extends WriterBuilder { private long size = -1; private DoneOnGpu callback = (ignored) -> {}; - private List columnMeta = new ArrayList<>(); public Builder withMaxChunkSize(long size) { this.size = size; @@ -88,18 +67,6 @@ public Builder withCallback(DoneOnGpu callback) { return this; } - /** - * This should be used instead of `withColumnNames` when there are children - * columns of struct type. - * - * (`columnNullability` is not used by Arrow IPC Writer, so it's fine to be ignored here. - * It can be placed into `ColumnMetadata` if needing it in the future.) - */ - public Builder withColumnMetadata(ColumnMetadata... columnMeta) { - this.columnMeta.addAll(Arrays.asList(columnMeta)); - return this; - } - public ArrowIPCWriterOptions build() { return new ArrowIPCWriterOptions(this); } diff --git a/java/src/main/java/ai/rapids/cudf/ColumnMetadata.java b/java/src/main/java/ai/rapids/cudf/ColumnMetadata.java deleted file mode 100644 index 31c86018502..00000000000 --- a/java/src/main/java/ai/rapids/cudf/ColumnMetadata.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package ai.rapids.cudf; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** - * Detailed meta data information for arrow array. - * - * (This is analogous to the native `column_metadata`.) - */ -public class ColumnMetadata { - private String name; - private List children = new ArrayList<>(); - - public ColumnMetadata(final String colName) { - this.name = colName; - } - - public ColumnMetadata addChildren(ColumnMetadata... childrenMeta) { - children.addAll(Arrays.asList(childrenMeta)); - return this; - } - - /** - * returns a cudf::column_metadata * cast to a long. We don't want to force - * users to close a ColumnMetadata. Because of the ColumnMetadata objects are created in - * pure java, but when it is time to use them this method is called to return a pointer to - * the c++ column_metadata instance. All values returned by this can be used multiple times, - * and should be closed by calling the static close method. Yes, this creates a lot more JNI - * calls, but it keeps the user API clean. - */ - long createNativeInstance() throws CudfException { - long[] childrenHandles = createNativeInstances(children); - try { - return create(name, childrenHandles); - } finally { - close(childrenHandles); - } - } - - static void close(long[] metaHandles) throws CudfException { - if (metaHandles == null) { - return; - } - for (long h : metaHandles) { - close(h); - } - } - - static long[] createNativeInstances(List metadataList) { - return metadataList.stream() - .mapToLong(ColumnMetadata::createNativeInstance) - .toArray(); - } - - private static native void close(long metaHandle) throws CudfException; - private static native long create(final String name, long[] children) throws CudfException; -} diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 2ca1b6600b1..fcc23777d69 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -365,19 +365,19 @@ private static native long writeORCBufferBegin(String[] columnNames, /** * Setup everything to write Arrow IPC formatted data to a file. - * @param columnsMeta column metadata that correspond to the table columns + * @param columnNames names that correspond to the table columns * @param filename local output path * @return a handle that is used in later calls to writeArrowIPCChunk and writeArrowIPCEnd. */ - private static native long writeArrowIPCFileBegin(long[] columnsMeta, String filename); + private static native long writeArrowIPCFileBegin(String[] columnNames, String filename); /** * Setup everything to write Arrow IPC formatted data to a buffer. - * @param columnsMeta column metadata that correspond to the table columns + * @param columnNames names that correspond to the table columns * @param consumer consumer of host buffers produced. * @return a handle that is used in later calls to writeArrowIPCChunk and writeArrowIPCEnd. */ - private static native long writeArrowIPCBufferBegin(long[] columnsMeta, + private static native long writeArrowIPCBufferBegin(String[] columnNames, HostBufferConsumer consumer); /** @@ -987,12 +987,9 @@ private ArrowIPCTableWriter(ArrowIPCWriterOptions options, this.callback = options.getCallback(); this.consumer = null; this.maxChunkSize = options.getMaxChunkSize(); - long[] metaHandles = ColumnMetadata.createNativeInstances(options.getColumnMetadata()); - try { - this.handle = writeArrowIPCFileBegin(metaHandles, outputFile.getAbsolutePath()); - } finally { - ColumnMetadata.close(metaHandles); - } + this.handle = writeArrowIPCFileBegin( + options.getColumnNames(), + outputFile.getAbsolutePath()); } private ArrowIPCTableWriter(ArrowIPCWriterOptions options, @@ -1000,12 +997,9 @@ private ArrowIPCTableWriter(ArrowIPCWriterOptions options, this.callback = options.getCallback(); this.consumer = consumer; this.maxChunkSize = options.getMaxChunkSize(); - long[] metaHandles = ColumnMetadata.createNativeInstances(options.getColumnMetadata()); - try { - this.handle = writeArrowIPCBufferBegin(metaHandles, consumer); - } finally { - ColumnMetadata.close(metaHandles); - } + this.handle = writeArrowIPCBufferBegin( + options.getColumnNames(), + consumer); } @Override diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index a70e6bea72f..c1239fe69ea 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -241,7 +241,6 @@ set(SOURCE_FILES "src/AggregationJni.cpp" "src/CudfJni.cpp" "src/CudaJni.cpp" - "src/ColumnMetadataJni.cpp" "src/ColumnVectorJni.cpp" "src/ColumnViewJni.cpp" "src/ContiguousTableJni.cpp" diff --git a/java/src/main/native/src/ColumnMetadataJni.cpp b/java/src/main/native/src/ColumnMetadataJni.cpp deleted file mode 100644 index f90d09c69ff..00000000000 --- a/java/src/main/native/src/ColumnMetadataJni.cpp +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include "jni_utils.hpp" - -extern "C" { - -JNIEXPORT void JNICALL Java_ai_rapids_cudf_ColumnMetadata_close(JNIEnv *env, - jclass, - jlong j_handle) { - JNI_NULL_CHECK(env, j_handle, "column metadata handle is null", ); - try { - auto to_del = reinterpret_cast(j_handle); - delete to_del; - } - CATCH_STD(env, ); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnMetadata_create(JNIEnv *env, - jclass, - jstring j_name, - jlongArray j_children) { - try { - // No need to set device since no GPU ops here. - cudf::jni::native_jstring col_name(env, j_name); - cudf::jni::native_jlongArray meta_children(env, j_children); - // Create a meta with empty name if `col_name` is NULL. - auto name = std::string(col_name.is_null() ? "" : col_name.get()); - cudf::column_metadata *cm = new cudf::column_metadata(name); - if (!meta_children.is_null()) { - // add the children - for (int i = 0; i < meta_children.size(); i++) { - cudf::column_metadata *child = reinterpret_cast(meta_children[i]); - // copy to `this`. - cm->children_meta.push_back(*child); - } - } - return reinterpret_cast(cm); - } - CATCH_STD(env, 0); -} - -} // extern "C" diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index c1ac5a7c8d1..9b700257604 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -203,20 +203,23 @@ typedef jni_table_writer_handle native_orc_writer_ class native_arrow_ipc_writer_handle final { public: - explicit native_arrow_ipc_writer_handle(const std::vector &col_meta, + explicit native_arrow_ipc_writer_handle(const std::vector &col_names, const std::string &file_name) - : initialized(false), column_metadata(col_meta), file_name(file_name) {} + : initialized(false), column_names(col_names), file_name(file_name) {} - explicit native_arrow_ipc_writer_handle(const std::vector &col_meta, + explicit native_arrow_ipc_writer_handle(const std::vector &col_names, const std::shared_ptr &sink) - : initialized(false), column_metadata(col_meta), file_name(""), sink(sink) {} + : initialized(false), column_names(col_names), file_name(""), sink(sink) {} +private: bool initialized; - std::vector column_metadata; + std::vector column_names; + std::vector columns_meta; std::string file_name; std::shared_ptr sink; std::shared_ptr writer; +public: void write(std::shared_ptr &arrow_tab, int64_t max_chunk) { if (!initialized) { if (!sink) { @@ -245,6 +248,50 @@ class native_arrow_ipc_writer_handle final { } initialized = false; } + + std::vector get_column_metadata(const table_view& tview) { + if (!column_names.empty() && columns_meta.empty()) { + // Rebuild the structure of column meta according to table schema. + // All the tables written by this writer should share the same schema, + // so build column metadata only once. + columns_meta.reserve(tview.num_columns()); + size_t idx = 0; + for (auto itr = tview.begin(); itr < tview.end(); ++itr) { + columns_meta.push_back(std::move(build_one_column_meta(*itr, idx))); + idx ++; + } + } + return columns_meta; + } + +private: + // `column_metadata` has no move constructor, + // still return a local var instead of passing as an out argument ? + cudf::column_metadata build_one_column_meta(const column_view& cview, size_t& idx) { + auto col_meta = cudf::column_metadata{get_column_name(idx)}; + if (cview.type().id() == cudf::type_id::LIST) { + // list type requires a stub metadata for offset column. + // It is ok the child metadata uses the same name, so keep `idx` unchanged. + col_meta.children_meta = {{}, build_one_column_meta(cview.child(0), idx)}; + } else if (cview.type().id() == cudf::type_id::STRUCT) { + // struct type + col_meta.children_meta.reserve(cview.num_children()); + for (auto itr = cview.child_begin(); itr < cview.child_end(); ++itr) { + col_meta.children_meta.push_back(std::move(build_one_column_meta(*itr, ++idx))); + } + } else if (cview.type().id() == cudf::type_id::DICTIONARY32) { + // not supported yet in JNI, nested type? + } + return col_meta; + } + + std::string get_column_name(size_t idx) { + if (idx < 0 || idx >= column_names.size()) { + throw cudf::jni::jni_exception( + "Missing column names for struct columns or nested struct columns"); + } + return column_names[idx]; + } }; class jni_arrow_output_stream final : public arrow::io::OutputStream { @@ -563,16 +610,6 @@ bool valid_window_parameters(native_jintArray const &values, values.size() == preceding.size() && values.size() == following.size(); } -static void build_column_metadata_from_handle(JNIEnv *env, jlongArray j_meta_handles, - std::vector& out_meta) { - cudf::jni::native_jlongArray meta_handles(env, j_meta_handles); - for (int i = 0; i < meta_handles.size(); i++) { - cudf::column_metadata *cm = reinterpret_cast(meta_handles[i]); - // copy to `out_meta`. - out_meta.push_back(*cm); - } -} - } // namespace } // namespace jni @@ -1206,37 +1243,36 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCEnd(JNIEnv *env, jclass } JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCBufferBegin(JNIEnv *env, jclass, - jlongArray j_col_meta, + jobjectArray j_col_names, jobject consumer) { - JNI_NULL_CHECK(env, j_col_meta, "null columns", 0); + JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, consumer, "null consumer", 0); try { cudf::jni::auto_set_device(env); + cudf::jni::native_jstringArray col_names(env, j_col_names); + std::shared_ptr data_sink( new cudf::jni::jni_arrow_output_stream(env, consumer)); - std::vector col_meta; - cudf::jni::build_column_metadata_from_handle(env, j_col_meta, col_meta); cudf::jni::native_arrow_ipc_writer_handle *ret = - new cudf::jni::native_arrow_ipc_writer_handle(col_meta, data_sink); + new cudf::jni::native_arrow_ipc_writer_handle(col_names.as_cpp_vector(), data_sink); return reinterpret_cast(ret); } CATCH_STD(env, 0) } JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCFileBegin(JNIEnv *env, jclass, - jlongArray j_col_meta, + jobjectArray j_col_names, jstring j_output_path) { - JNI_NULL_CHECK(env, j_col_meta, "null columns", 0); + JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_output_path, "null output path", 0); try { cudf::jni::auto_set_device(env); + cudf::jni::native_jstringArray col_names(env, j_col_names); cudf::jni::native_jstring output_path(env, j_output_path); - std::vector col_meta; - cudf::jni::build_column_metadata_from_handle(env, j_col_meta, col_meta); cudf::jni::native_arrow_ipc_writer_handle *ret = - new cudf::jni::native_arrow_ipc_writer_handle(col_meta, output_path.get()); + new cudf::jni::native_arrow_ipc_writer_handle(col_names.as_cpp_vector(), output_path.get()); return reinterpret_cast(ret); } CATCH_STD(env, 0) @@ -1256,7 +1292,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_convertCudfToArrowTable(JNIEnv cudf::jni::auto_set_device(env); std::unique_ptr> result( new std::shared_ptr(nullptr)); - *result = cudf::to_arrow(*tview, state->column_metadata); + *result = cudf::to_arrow(*tview, state->get_column_metadata(*tview)); if (!result->get()) { return 0; } diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 0bb77b6538f..fe15f5c7e42 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -4297,19 +4297,11 @@ void testArrowIPCWriteToFileWithNamesAndMetadata() throws IOException { void testArrowIPCWriteToBufferChunked() { try (Table table0 = getExpectedFileTable(true); MyBufferConsumer consumer = new MyBufferConsumer()) { - ColumnMetadata[] colMeta = Arrays.asList("first", "second", "third", "fourth", "fifth", - "sixth", "seventh").stream().map(ColumnMetadata::new) - .toArray(ColumnMetadata[]::new); - ColumnMetadata structCM = new ColumnMetadata("eighth"); - structCM.addChildren(new ColumnMetadata("id"), new ColumnMetadata("name")); - ColumnMetadata arrayPriCM = new ColumnMetadata("ninth"); - // Array type needs a stub metadata for the offset column - arrayPriCM.addChildren(new ColumnMetadata(null), new ColumnMetadata("aid")); - ColumnMetadata arrayStructCM = new ColumnMetadata("tenth"); - arrayStructCM.addChildren(new ColumnMetadata(null), structCM); ArrowIPCWriterOptions options = ArrowIPCWriterOptions.builder() - .withColumnMetadata(colMeta) - .withColumnMetadata(structCM, arrayPriCM, arrayStructCM) + .withColumnNames("first", "second", "third", "fourth", "fifth", "sixth", "seventh") + .withColumnNames("eighth", "eighth_id", "eighth_name") + .withColumnNames("ninth") + .withColumnNames("tenth", "tenth_id", "tenth_name") .build(); try (TableWriter writer = Table.writeArrowIPCChunked(options, consumer)) { writer.write(table0); From fbbbe5b2e3f8b5f97d79c58c6b2de0c7b2f912c1 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Thu, 18 Mar 2021 15:32:46 +0800 Subject: [PATCH 07/16] Return a reference from `get_column_name` Signed-off-by: Firestarman --- java/src/main/native/src/TableJni.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 9b700257604..29270e86057 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -285,7 +285,7 @@ class native_arrow_ipc_writer_handle final { return col_meta; } - std::string get_column_name(size_t idx) { + std::string& get_column_name(size_t idx) { if (idx < 0 || idx >= column_names.size()) { throw cudf::jni::jni_exception( "Missing column names for struct columns or nested struct columns"); From bc33abc67c3ebe6872c7b376e53b0f4fa355bcb0 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Thu, 18 Mar 2021 15:55:39 +0800 Subject: [PATCH 08/16] Update the function signaure Signed-off-by: Firestarman --- java/src/main/native/src/TableJni.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 29270e86057..4f6e1628809 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -285,7 +285,7 @@ class native_arrow_ipc_writer_handle final { return col_meta; } - std::string& get_column_name(size_t idx) { + std::string& get_column_name(const size_t idx) { if (idx < 0 || idx >= column_names.size()) { throw cudf::jni::jni_exception( "Missing column names for struct columns or nested struct columns"); From eb39f16245c738740e44a58eeb013482061ef1c9 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Thu, 18 Mar 2021 16:15:34 +0800 Subject: [PATCH 09/16] correct the child index of array type column Signed-off-by: Firestarman --- java/src/main/native/src/TableJni.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 4f6e1628809..1bff260110d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -270,9 +270,9 @@ class native_arrow_ipc_writer_handle final { cudf::column_metadata build_one_column_meta(const column_view& cview, size_t& idx) { auto col_meta = cudf::column_metadata{get_column_name(idx)}; if (cview.type().id() == cudf::type_id::LIST) { - // list type requires a stub metadata for offset column. + // list type requires a stub metadata for offset column, index is 0. // It is ok the child metadata uses the same name, so keep `idx` unchanged. - col_meta.children_meta = {{}, build_one_column_meta(cview.child(0), idx)}; + col_meta.children_meta = {{}, build_one_column_meta(cview.child(1), idx)}; } else if (cview.type().id() == cudf::type_id::STRUCT) { // struct type col_meta.children_meta.reserve(cview.num_children()); From 55a1e17835baf8d067060dffba119036a1cca108 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Thu, 18 Mar 2021 17:03:14 +0800 Subject: [PATCH 10/16] Correct the index for list type Signed-off-by: Firestarman --- java/src/main/native/src/TableJni.cpp | 3 +-- java/src/test/java/ai/rapids/cudf/TableTest.java | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 1bff260110d..c6ed399726e 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -271,8 +271,7 @@ class native_arrow_ipc_writer_handle final { auto col_meta = cudf::column_metadata{get_column_name(idx)}; if (cview.type().id() == cudf::type_id::LIST) { // list type requires a stub metadata for offset column, index is 0. - // It is ok the child metadata uses the same name, so keep `idx` unchanged. - col_meta.children_meta = {{}, build_one_column_meta(cview.child(1), idx)}; + col_meta.children_meta = {{}, build_one_column_meta(cview.child(1), ++idx)}; } else if (cview.type().id() == cudf::type_id::STRUCT) { // struct type col_meta.children_meta.reserve(cview.num_children()); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index fe15f5c7e42..23b87b4e124 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -4300,8 +4300,8 @@ void testArrowIPCWriteToBufferChunked() { ArrowIPCWriterOptions options = ArrowIPCWriterOptions.builder() .withColumnNames("first", "second", "third", "fourth", "fifth", "sixth", "seventh") .withColumnNames("eighth", "eighth_id", "eighth_name") - .withColumnNames("ninth") - .withColumnNames("tenth", "tenth_id", "tenth_name") + .withColumnNames("ninth", "ninth_child") + .withColumnNames("tenth", "tenth_child", "child_id", "child_name") .build(); try (TableWriter writer = Table.writeArrowIPCChunked(options, consumer)) { writer.write(table0); From 07d09e5dce9275e17006f4735ade7daeb705a419 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Thu, 18 Mar 2021 19:58:06 +0800 Subject: [PATCH 11/16] Comment update Signed-off-by: Firestarman --- java/rmm_log.txt | 6 ++++++ java/src/main/native/src/TableJni.cpp | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 java/rmm_log.txt diff --git a/java/rmm_log.txt b/java/rmm_log.txt new file mode 100644 index 00000000000..832414790e5 --- /dev/null +++ b/java/rmm_log.txt @@ -0,0 +1,6 @@ +[ 21473][17:06:35:478567][info ] ----- RMM LOG BEGIN [PTDS DISABLED] ----- +[ 21473][17:06:35:478583][error ] [A][Stream 0x1][Upstream 3458764513820540928B][FAILURE maximum pool size exceeded] +[ 21473][17:06:35:478604][error ] [A][Stream 0x1][Upstream 3458764513820540928B][FAILURE maximum pool size exceeded] +[ 21473][17:06:35:478616][error ] [A][Stream 0x1][Upstream 3458764513820540928B][FAILURE maximum pool size exceeded] +[ 21473][17:06:36:049064][error ] [A][Stream 0x1][Upstream 1024B][FAILURE maximum pool size exceeded] +[ 21473][17:06:36:991597][error ] [A][Stream 0x1][Upstream 3458764513820540928B][FAILURE maximum pool size exceeded] diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index c6ed399726e..0c5f91ef8a9 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -265,8 +265,8 @@ class native_arrow_ipc_writer_handle final { } private: - // `column_metadata` has no move constructor, - // still return a local var instead of passing as an out argument ? + // Still return an oject instead of being passed as an out argument, even + // `column_metadata` has no move constructor and would be copied. cudf::column_metadata build_one_column_meta(const column_view& cview, size_t& idx) { auto col_meta = cudf::column_metadata{get_column_name(idx)}; if (cview.type().id() == cudf::type_id::LIST) { From 87d403abba3facd4eab89b70b0b15621447ff8ef Mon Sep 17 00:00:00 2001 From: Firestarman Date: Thu, 18 Mar 2021 19:59:57 +0800 Subject: [PATCH 12/16] Remove unexpected rmm log file Signed-off-by: Firestarman --- java/rmm_log.txt | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 java/rmm_log.txt diff --git a/java/rmm_log.txt b/java/rmm_log.txt deleted file mode 100644 index 832414790e5..00000000000 --- a/java/rmm_log.txt +++ /dev/null @@ -1,6 +0,0 @@ -[ 21473][17:06:35:478567][info ] ----- RMM LOG BEGIN [PTDS DISABLED] ----- -[ 21473][17:06:35:478583][error ] [A][Stream 0x1][Upstream 3458764513820540928B][FAILURE maximum pool size exceeded] -[ 21473][17:06:35:478604][error ] [A][Stream 0x1][Upstream 3458764513820540928B][FAILURE maximum pool size exceeded] -[ 21473][17:06:35:478616][error ] [A][Stream 0x1][Upstream 3458764513820540928B][FAILURE maximum pool size exceeded] -[ 21473][17:06:36:049064][error ] [A][Stream 0x1][Upstream 1024B][FAILURE maximum pool size exceeded] -[ 21473][17:06:36:991597][error ] [A][Stream 0x1][Upstream 3458764513820540928B][FAILURE maximum pool size exceeded] From 1f21c3a40149e3bd5468a2ccb0b8410f2a7e4763 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Fri, 19 Mar 2021 09:39:08 +0800 Subject: [PATCH 13/16] Address some comments Signed-off-by: Firestarman --- .../ai/rapids/cudf/ArrowIPCWriterOptions.java | 59 +++++++++++++++++++ java/src/main/native/src/TableJni.cpp | 11 ++-- 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java index 298e99b059d..04abdb71910 100644 --- a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java @@ -67,6 +67,65 @@ public Builder withCallback(DoneOnGpu callback) { return this; } + /** + * Add the name(s) for nullable column(s). + * + * Column names should be flattened for the columns of nested type. For examples, + *
+     *   a struct column:
+     *                   ("struct_col": {"field_1", "field_2"}),
+     *   output:
+     *                   ["struct_col", "field_1", "field_2"]
+     *
+     *   a list column:
+     *                   ("list_col": [])
+     *   output:
+     *                   ("list_col", "list_child")
+     *
+     *   a list of struct column:
+     *                   ("list_struct_col": [{"field_1", "field_2"}])
+     *   output:
+     *                   ["list_struct_col", "list_struct_child", "field_1", "field_2"]
+     * 
+ * Even the child of list column has no name by default, it should be assigned a name. + * + * @param columnNames The column names corresponding to the written table(s). + */ + @Override + public Builder withColumnNames(String... columnNames) { + return super.withColumnNames(columnNames); + } + + /** + * Add the name(s) for non-nullable column(s). + * + * Column names should be flattened in sequence for the columns of nested type to make + * sure each column (including children columns) has a name. For examples, + *
+     *   a struct column:
+     *                   ("struct_col": {"field_1", "field_2"})
+     *   output:
+     *                   ["struct_col", "field_1", "field_2"]
+     *
+     *   a list column:
+     *                   ("list_col": [])
+     *   output:
+     *                   ("list_col", "list_child")
+     *
+     *   a list of struct column:
+     *                   ("list_struct_col": [{"field_1", "field_2"}])
+     *   output:
+     *                   ["list_struct_col", "list_struct_child", "field_1", "field_2"]
+     * 
+ * Even the child of list column has no name by default, it should be assigned a name as well. + * + * @param columnNames The column names corresponding to the written table(s). + */ + @Override + public Builder withNotNullableColumnNames(String... columnNames) { + return super.withNotNullableColumnNames(columnNames); + } + public ArrowIPCWriterOptions build() { return new ArrowIPCWriterOptions(this); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 0c5f91ef8a9..8db3fc8ea90 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -257,16 +257,18 @@ class native_arrow_ipc_writer_handle final { columns_meta.reserve(tview.num_columns()); size_t idx = 0; for (auto itr = tview.begin(); itr < tview.end(); ++itr) { - columns_meta.push_back(std::move(build_one_column_meta(*itr, idx))); + columns_meta.push_back(build_one_column_meta(*itr, idx)); idx ++; } + if (idx < column_names.size()) { + throw cudf::jni::jni_exception( + "The number of column names is bigger than the columns number in the table."); + } } return columns_meta; } private: - // Still return an oject instead of being passed as an out argument, even - // `column_metadata` has no move constructor and would be copied. cudf::column_metadata build_one_column_meta(const column_view& cview, size_t& idx) { auto col_meta = cudf::column_metadata{get_column_name(idx)}; if (cview.type().id() == cudf::type_id::LIST) { @@ -276,10 +278,11 @@ class native_arrow_ipc_writer_handle final { // struct type col_meta.children_meta.reserve(cview.num_children()); for (auto itr = cview.child_begin(); itr < cview.child_end(); ++itr) { - col_meta.children_meta.push_back(std::move(build_one_column_meta(*itr, ++idx))); + col_meta.children_meta.push_back(build_one_column_meta(*itr, ++idx)); } } else if (cview.type().id() == cudf::type_id::DICTIONARY32) { // not supported yet in JNI, nested type? + throw cudf::jni::jni_exception("Unsupported type 'DICTIONARY32'"); } return col_meta; } From 420f524a75cb45de8030e0cbe8f1ca328a8d1a8f Mon Sep 17 00:00:00 2001 From: Firestarman Date: Fri, 19 Mar 2021 10:28:29 +0800 Subject: [PATCH 14/16] error message update Signed-off-by: Firestarman --- java/src/main/native/src/TableJni.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 8db3fc8ea90..744bcfe47c6 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -289,8 +289,7 @@ class native_arrow_ipc_writer_handle final { std::string& get_column_name(const size_t idx) { if (idx < 0 || idx >= column_names.size()) { - throw cudf::jni::jni_exception( - "Missing column names for struct columns or nested struct columns"); + throw cudf::jni::jni_exception("Missing names for columns or nested columns"); } return column_names[idx]; } From b8966a0528978085987542bde4dd04331ec8e4c3 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Fri, 19 Mar 2021 13:18:05 +0800 Subject: [PATCH 15/16] Only root columns and nested struct columns consume names Signed-off-by: Firestarman --- .../ai/rapids/cudf/ArrowIPCWriterOptions.java | 45 +++++++++---------- java/src/main/native/src/TableJni.cpp | 27 +++++++---- .../test/java/ai/rapids/cudf/TableTest.java | 4 +- 3 files changed, 42 insertions(+), 34 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java index 04abdb71910..ee5ae094b29 100644 --- a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java @@ -70,24 +70,24 @@ public Builder withCallback(DoneOnGpu callback) { /** * Add the name(s) for nullable column(s). * - * Column names should be flattened for the columns of nested type. For examples, + * Please note the column names of the nested struct columns should be flattened in sequence. + * For examples, *
-     *   a struct column:
-     *                   ("struct_col": {"field_1", "field_2"}),
+     *   A table with an int column and a struct column:
+     *                   ["int_col", "struct_col":{"field_1", "field_2"}]
      *   output:
-     *                   ["struct_col", "field_1", "field_2"]
+     *                   ["int_col", "struct_col", "field_1", "field_2"]
      *
-     *   a list column:
-     *                   ("list_col": [])
+     *   A table with an int column and a list of non-nested type column:
+     *                   ["int_col", "list_col":[]]
      *   output:
-     *                   ("list_col", "list_child")
+     *                   ["int_col", "list_col"]
      *
-     *   a list of struct column:
-     *                   ("list_struct_col": [{"field_1", "field_2"}])
+     *   A table with an int column and a list of struct column:
+     *                   ["int_col", "list_struct_col":[{"field_1", "field_2"}]]
      *   output:
-     *                   ["list_struct_col", "list_struct_child", "field_1", "field_2"]
+     *                   ["int_col", "list_struct_col", "field_1", "field_2"]
      * 
- * Even the child of list column has no name by default, it should be assigned a name. * * @param columnNames The column names corresponding to the written table(s). */ @@ -99,25 +99,24 @@ public Builder withColumnNames(String... columnNames) { /** * Add the name(s) for non-nullable column(s). * - * Column names should be flattened in sequence for the columns of nested type to make - * sure each column (including children columns) has a name. For examples, + * Please note the column names of the nested struct columns should be flattened in sequence. + * For examples, *
-     *   a struct column:
-     *                   ("struct_col": {"field_1", "field_2"})
+     *   A table with an int column and a struct column:
+     *                   ["int_col", "struct_col":{"field_1", "field_2"}]
      *   output:
-     *                   ["struct_col", "field_1", "field_2"]
+     *                   ["int_col", "struct_col", "field_1", "field_2"]
      *
-     *   a list column:
-     *                   ("list_col": [])
+     *   A table with an int column and a list of non-nested type column:
+     *                   ["int_col", "list_col":[]]
      *   output:
-     *                   ("list_col", "list_child")
+     *                   ["int_col", "list_col"]
      *
-     *   a list of struct column:
-     *                   ("list_struct_col": [{"field_1", "field_2"}])
+     *   A table with an int column and a list of struct column:
+     *                   ["int_col", "list_struct_col":[{"field_1", "field_2"}]]
      *   output:
-     *                   ["list_struct_col", "list_struct_child", "field_1", "field_2"]
+     *                   ["int_col", "list_struct_col", "field_1", "field_2"]
      * 
- * Even the child of list column has no name by default, it should be assigned a name as well. * * @param columnNames The column names corresponding to the written table(s). */ diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 744bcfe47c6..85fbe3cf4bc 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -249,7 +249,7 @@ class native_arrow_ipc_writer_handle final { initialized = false; } - std::vector get_column_metadata(const table_view& tview) { + std::vector get_column_metadata(const cudf::table_view& tview) { if (!column_names.empty() && columns_meta.empty()) { // Rebuild the structure of column meta according to table schema. // All the tables written by this writer should share the same schema, @@ -257,8 +257,10 @@ class native_arrow_ipc_writer_handle final { columns_meta.reserve(tview.num_columns()); size_t idx = 0; for (auto itr = tview.begin(); itr < tview.end(); ++itr) { + // It should consume the column names only when a column is + // - type of struct, or + // - not a child. columns_meta.push_back(build_one_column_meta(*itr, idx)); - idx ++; } if (idx < column_names.size()) { throw cudf::jni::jni_exception( @@ -269,16 +271,23 @@ class native_arrow_ipc_writer_handle final { } private: - cudf::column_metadata build_one_column_meta(const column_view& cview, size_t& idx) { - auto col_meta = cudf::column_metadata{get_column_name(idx)}; + cudf::column_metadata build_one_column_meta(const cudf::column_view& cview, size_t& idx, + const bool consume_name = true) { + auto col_meta = cudf::column_metadata{}; + if (consume_name) { + col_meta.name = get_column_name(idx++); + } + // Process children if (cview.type().id() == cudf::type_id::LIST) { - // list type requires a stub metadata for offset column, index is 0. - col_meta.children_meta = {{}, build_one_column_meta(cview.child(1), ++idx)}; + // list type: + // - requires a stub metadata for offset column(index: 0). + // - does not require a name for the child column(index 1). + col_meta.children_meta = {{}, build_one_column_meta(cview.child(1), idx, false)}; } else if (cview.type().id() == cudf::type_id::STRUCT) { - // struct type + // struct type always consumes the column names. col_meta.children_meta.reserve(cview.num_children()); for (auto itr = cview.child_begin(); itr < cview.child_end(); ++itr) { - col_meta.children_meta.push_back(build_one_column_meta(*itr, ++idx)); + col_meta.children_meta.push_back(build_one_column_meta(*itr, idx)); } } else if (cview.type().id() == cudf::type_id::DICTIONARY32) { // not supported yet in JNI, nested type? @@ -289,7 +298,7 @@ class native_arrow_ipc_writer_handle final { std::string& get_column_name(const size_t idx) { if (idx < 0 || idx >= column_names.size()) { - throw cudf::jni::jni_exception("Missing names for columns or nested columns"); + throw cudf::jni::jni_exception("Missing names for columns or nested struct columns"); } return column_names[idx]; } diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 23b87b4e124..625260b255f 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -4300,8 +4300,8 @@ void testArrowIPCWriteToBufferChunked() { ArrowIPCWriterOptions options = ArrowIPCWriterOptions.builder() .withColumnNames("first", "second", "third", "fourth", "fifth", "sixth", "seventh") .withColumnNames("eighth", "eighth_id", "eighth_name") - .withColumnNames("ninth", "ninth_child") - .withColumnNames("tenth", "tenth_child", "child_id", "child_name") + .withColumnNames("ninth") + .withColumnNames("tenth", "child_id", "child_name") .build(); try (TableWriter writer = Table.writeArrowIPCChunked(options, consumer)) { writer.write(table0); From 54395f0e971968279da93f2155e61f66f23b01bb Mon Sep 17 00:00:00 2001 From: Firestarman Date: Fri, 19 Mar 2021 13:27:27 +0800 Subject: [PATCH 16/16] error message update Signed-off-by: Firestarman --- java/src/main/native/src/TableJni.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 85fbe3cf4bc..43616ea413d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -263,8 +263,7 @@ class native_arrow_ipc_writer_handle final { columns_meta.push_back(build_one_column_meta(*itr, idx)); } if (idx < column_names.size()) { - throw cudf::jni::jni_exception( - "The number of column names is bigger than the columns number in the table."); + throw cudf::jni::jni_exception("Too many column names are provided."); } } return columns_meta;