From b8966a0528978085987542bde4dd04331ec8e4c3 Mon Sep 17 00:00:00 2001 From: Firestarman Date: Fri, 19 Mar 2021 13:18:05 +0800 Subject: [PATCH] Only root columns and nested struct columns consume names Signed-off-by: Firestarman --- .../ai/rapids/cudf/ArrowIPCWriterOptions.java | 45 +++++++++---------- java/src/main/native/src/TableJni.cpp | 27 +++++++---- .../test/java/ai/rapids/cudf/TableTest.java | 4 +- 3 files changed, 42 insertions(+), 34 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java index 04abdb71910..ee5ae094b29 100644 --- a/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ArrowIPCWriterOptions.java @@ -70,24 +70,24 @@ public Builder withCallback(DoneOnGpu callback) { /** * Add the name(s) for nullable column(s). * - * Column names should be flattened for the columns of nested type. For examples, + * Please note the column names of the nested struct columns should be flattened in sequence. + * For examples, *
-     *   a struct column:
-     *                   ("struct_col": {"field_1", "field_2"}),
+     *   A table with an int column and a struct column:
+     *                   ["int_col", "struct_col":{"field_1", "field_2"}]
      *   output:
-     *                   ["struct_col", "field_1", "field_2"]
+     *                   ["int_col", "struct_col", "field_1", "field_2"]
      *
-     *   a list column:
-     *                   ("list_col": [])
+     *   A table with an int column and a list of non-nested type column:
+     *                   ["int_col", "list_col":[]]
      *   output:
-     *                   ("list_col", "list_child")
+     *                   ["int_col", "list_col"]
      *
-     *   a list of struct column:
-     *                   ("list_struct_col": [{"field_1", "field_2"}])
+     *   A table with an int column and a list of struct column:
+     *                   ["int_col", "list_struct_col":[{"field_1", "field_2"}]]
      *   output:
-     *                   ["list_struct_col", "list_struct_child", "field_1", "field_2"]
+     *                   ["int_col", "list_struct_col", "field_1", "field_2"]
      * 
- * Even the child of list column has no name by default, it should be assigned a name. * * @param columnNames The column names corresponding to the written table(s). */ @@ -99,25 +99,24 @@ public Builder withColumnNames(String... columnNames) { /** * Add the name(s) for non-nullable column(s). * - * Column names should be flattened in sequence for the columns of nested type to make - * sure each column (including children columns) has a name. For examples, + * Please note the column names of the nested struct columns should be flattened in sequence. + * For examples, *
-     *   a struct column:
-     *                   ("struct_col": {"field_1", "field_2"})
+     *   A table with an int column and a struct column:
+     *                   ["int_col", "struct_col":{"field_1", "field_2"}]
      *   output:
-     *                   ["struct_col", "field_1", "field_2"]
+     *                   ["int_col", "struct_col", "field_1", "field_2"]
      *
-     *   a list column:
-     *                   ("list_col": [])
+     *   A table with an int column and a list of non-nested type column:
+     *                   ["int_col", "list_col":[]]
      *   output:
-     *                   ("list_col", "list_child")
+     *                   ["int_col", "list_col"]
      *
-     *   a list of struct column:
-     *                   ("list_struct_col": [{"field_1", "field_2"}])
+     *   A table with an int column and a list of struct column:
+     *                   ["int_col", "list_struct_col":[{"field_1", "field_2"}]]
      *   output:
-     *                   ["list_struct_col", "list_struct_child", "field_1", "field_2"]
+     *                   ["int_col", "list_struct_col", "field_1", "field_2"]
      * 
- * Even the child of list column has no name by default, it should be assigned a name as well. * * @param columnNames The column names corresponding to the written table(s). */ diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 744bcfe47c6..85fbe3cf4bc 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -249,7 +249,7 @@ class native_arrow_ipc_writer_handle final { initialized = false; } - std::vector get_column_metadata(const table_view& tview) { + std::vector get_column_metadata(const cudf::table_view& tview) { if (!column_names.empty() && columns_meta.empty()) { // Rebuild the structure of column meta according to table schema. // All the tables written by this writer should share the same schema, @@ -257,8 +257,10 @@ class native_arrow_ipc_writer_handle final { columns_meta.reserve(tview.num_columns()); size_t idx = 0; for (auto itr = tview.begin(); itr < tview.end(); ++itr) { + // It should consume the column names only when a column is + // - type of struct, or + // - not a child. columns_meta.push_back(build_one_column_meta(*itr, idx)); - idx ++; } if (idx < column_names.size()) { throw cudf::jni::jni_exception( @@ -269,16 +271,23 @@ class native_arrow_ipc_writer_handle final { } private: - cudf::column_metadata build_one_column_meta(const column_view& cview, size_t& idx) { - auto col_meta = cudf::column_metadata{get_column_name(idx)}; + cudf::column_metadata build_one_column_meta(const cudf::column_view& cview, size_t& idx, + const bool consume_name = true) { + auto col_meta = cudf::column_metadata{}; + if (consume_name) { + col_meta.name = get_column_name(idx++); + } + // Process children if (cview.type().id() == cudf::type_id::LIST) { - // list type requires a stub metadata for offset column, index is 0. - col_meta.children_meta = {{}, build_one_column_meta(cview.child(1), ++idx)}; + // list type: + // - requires a stub metadata for offset column(index: 0). + // - does not require a name for the child column(index 1). + col_meta.children_meta = {{}, build_one_column_meta(cview.child(1), idx, false)}; } else if (cview.type().id() == cudf::type_id::STRUCT) { - // struct type + // struct type always consumes the column names. col_meta.children_meta.reserve(cview.num_children()); for (auto itr = cview.child_begin(); itr < cview.child_end(); ++itr) { - col_meta.children_meta.push_back(build_one_column_meta(*itr, ++idx)); + col_meta.children_meta.push_back(build_one_column_meta(*itr, idx)); } } else if (cview.type().id() == cudf::type_id::DICTIONARY32) { // not supported yet in JNI, nested type? @@ -289,7 +298,7 @@ class native_arrow_ipc_writer_handle final { std::string& get_column_name(const size_t idx) { if (idx < 0 || idx >= column_names.size()) { - throw cudf::jni::jni_exception("Missing names for columns or nested columns"); + throw cudf::jni::jni_exception("Missing names for columns or nested struct columns"); } return column_names[idx]; } diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 23b87b4e124..625260b255f 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -4300,8 +4300,8 @@ void testArrowIPCWriteToBufferChunked() { ArrowIPCWriterOptions options = ArrowIPCWriterOptions.builder() .withColumnNames("first", "second", "third", "fourth", "fifth", "sixth", "seventh") .withColumnNames("eighth", "eighth_id", "eighth_name") - .withColumnNames("ninth", "ninth_child") - .withColumnNames("tenth", "tenth_child", "child_id", "child_name") + .withColumnNames("ninth") + .withColumnNames("tenth", "child_id", "child_name") .build(); try (TableWriter writer = Table.writeArrowIPCChunked(options, consumer)) { writer.write(table0);