From 867d6eeccaf30a61e02572f5b57ec4c0e252d150 Mon Sep 17 00:00:00 2001
From: Raza Jafri
Date: Fri, 16 Apr 2021 18:33:09 -0700
Subject: [PATCH] Revert "Java API change for supporting structs (#7730)" (#7987)

This reverts commit 3327f7be8eae964ea3c9fc9a025c4c67eacbe3d3.

We have to revert this because it has broken the dependent project and left
the build in a broken state.

Authors:
  - Raza Jafri (https://github.com/razajafri)

Approvers:
  - Rong Ou (https://github.com/rongou)
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)
  - Robert (Bobby) Evans (https://github.com/revans2)

URL: https://github.com/rapidsai/cudf/pull/7987
---
 cpp/include/cudf/io/parquet.hpp               |   3 +-
 .../cudf/ParquetColumnWriterOptions.java      | 423 ------------------
 .../ai/rapids/cudf/ParquetWriterOptions.java  | 120 +++--
 java/src/main/java/ai/rapids/cudf/Table.java  |  46 +-
 java/src/main/native/src/TableJni.cpp         | 155 +++----
 .../test/java/ai/rapids/cudf/TableTest.java   | 122 ++---
 6 files changed, 174 insertions(+), 695 deletions(-)
 delete mode 100644 java/src/main/java/ai/rapids/cudf/ParquetColumnWriterOptions.java

diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index f2c57f3a9fa..7cb3db1eb30 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -389,10 +389,9 @@ class column_in_metadata {
   bool _use_int96_timestamp = false;
   // bool _output_as_binary = false;
   thrust::optional _decimal_precision;
-
- public:
   std::vector children;
 
+ public:
   /**
    * @brief Set the name of this column
    *
diff --git a/java/src/main/java/ai/rapids/cudf/ParquetColumnWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetColumnWriterOptions.java
deleted file mode 100644
index 2787d65cfdc..00000000000
--- a/java/src/main/java/ai/rapids/cudf/ParquetColumnWriterOptions.java
+++ /dev/null
@@ -1,423 +0,0 @@
-/*
- *
- * Copyright (c) 2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
-
-package ai.rapids.cudf;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * Per column settings for writing Parquet files.
- */
-class ParquetColumnWriterOptions {
-  private boolean isTimestampTypeInt96;
-  private int precision;
-  private boolean isNullable;
-  private String columName;
-  private ParquetColumnWriterOptions(AbstractStructBuilder builder) {
-    this.columName = builder.name;
-    this.isNullable = builder.isNullable;
-    this.childColumnOptions =
-        (ParquetColumnWriterOptions[]) builder.children.toArray(new ParquetColumnWriterOptions[0]);
-  }
-
-  /**
-   * Constructor used for list
-   */
-  private ParquetColumnWriterOptions(ListBuilder builder) {
-    assert(builder.children.size() == 1) : "Lists can only have one child";
-    this.columName = builder.name;
-    this.isNullable = builder.isNullable;
-    // we are adding the child twice even though lists have one child only because the way the cudf
-    // has implemented this it requires two children to be set for the list, but it drops the
-    // first one.
This is something that is a lower priority and might be fixed in future - this.childColumnOptions = - new ParquetColumnWriterOptions[]{DUMMY_CHILD, builder.children.get(0)}; - } - - protected ParquetColumnWriterOptions[] childColumnOptions = {}; - protected abstract static class AbstractStructBuilder - extends NestedBuilder { - /** - * Builder specific to build a Struct meta - * @param name - */ - public AbstractStructBuilder(String name, boolean isNullable) { - super(name, isNullable); - } - - protected AbstractStructBuilder() { - super(); - } - } - - // This child is needed as the first child of a List column meta due to how cudf has been - // implemented. Cudf drops the first child from the meta if a column is a LIST. This is done - // this way due to some complications in the parquet reader. There was change to fix this here: - // https://github.com/rapidsai/cudf/pull/7461/commits/5ce33b40abb87cc7b76b5efeb0a3a0215f9ef6fb - // but it was reverted later on here: - // https://github.com/rapidsai/cudf/pull/7461/commits/f248eb7265de995a95f998d46d897fb0ae47f53e - static ParquetColumnWriterOptions DUMMY_CHILD = new ParquetColumnWriterOptions("DUMMY"); - - static abstract class NestedBuilder { - protected List children = new ArrayList<>(); - protected boolean isNullable = true; - protected String name = ""; - - /** - * Builder specific to build a Struct meta - * @param name - */ - protected NestedBuilder(String name, boolean isNullable) { - this.name = name; - this.isNullable = isNullable; - } - - protected NestedBuilder() {} - - protected ParquetColumnWriterOptions withColumn(String name, boolean isNullable) { - return new ParquetColumnWriterOptions(name, isNullable); - } - - protected ParquetColumnWriterOptions withDecimal(String name, int precision, - boolean isNullable) { - return new ParquetColumnWriterOptions(name, false, precision, isNullable); - } - - protected ParquetColumnWriterOptions withTimestamp(String name, boolean isInt96, - boolean isNullable) { - return new ParquetColumnWriterOptions(name, isInt96, 0, isNullable); - } - - /** - * Set the list column meta. - * Lists should have only one child in ColumnVector, but the metadata expects a - * LIST column to have two children and the first child to be the - * {@link ParquetColumnWriterOptions#DUMMY_CHILD}. - * This is the current behavior in cudf and will change in future - * @param child - * @return this for chaining. - */ - public T withListColumn(ParquetListColumnWriterOptions child) { - assert (child.getChildColumnOptions().length == 2) : "Lists can only have two children"; - if (child.getChildColumnOptions()[0] != DUMMY_CHILD) { - throw new IllegalArgumentException("First child in the list has to be DUMMY_CHILD"); - } - if (child.getChildColumnOptions()[1].getColumName().isEmpty()) { - throw new IllegalArgumentException("Column name can't be empty"); - } - children.add(child); - return (T) this; - } - - /** - * Set a child struct meta data - * @param child - * @return this for chaining. - */ - public T withStructColumn(ParquetStructColumnWriterOptions child) { - for (ParquetColumnWriterOptions opt: child.getChildColumnOptions()) { - if (opt.getColumName().isEmpty()) { - throw new IllegalArgumentException("Column name can't be empty"); - } - } - children.add(child); - return (T) this; - } - - /** - * Set column name - * @param name - */ - public T withNonNullableColumn(String... 
name) { - withColumn(false, name); - return (T) this; - } - - /** - * Set nullable column meta data - * @param name - */ - public T withNullableColumn(String... name) { - withColumn(true, name); - return (T) this; - } - - /** - * Set a simple child meta data - * @param name - * @return this for chaining. - */ - public T withColumn(boolean nullable, String... name) { - for (String n : name) { - children.add(withColumn(n, nullable)); - } - return (T) this; - } - - /** - * Set a Decimal child meta data - * @param name - * @param precision - * @return this for chaining. - */ - public T withDecimalColumn(String name, int precision, boolean nullable) { - children.add(withDecimal(name, precision, nullable)); - return (T) this; - } - - /** - * Set a Decimal child meta data - * @param name - * @param precision - * @return this for chaining. - */ - public T withNullableDecimalColumn(String name, int precision) { - withDecimalColumn(name, precision, true); - return (T) this; - } - - /** - * Set a Decimal child meta data - * @param name - * @param precision - * @return this for chaining. - */ - public T withDecimalColumn(String name, int precision) { - withDecimalColumn(name, precision, false); - return (T) this; - } - - /** - * Set a timestamp child meta data - * @param name - * @param isInt96 - * @return this for chaining. - */ - public T withTimestampColumn(String name, boolean isInt96, boolean nullable) { - children.add(withTimestamp(name, isInt96, nullable)); - return (T) this; - } - - /** - * Set a timestamp child meta data - * @param name - * @param isInt96 - * @return this for chaining. - */ - public T withTimestampColumn(String name, boolean isInt96) { - withTimestampColumn(name, isInt96, false); - return (T) this; - } - - /** - * Set a timestamp child meta data - * @param name - * @param isInt96 - * @return this for chaining. 
- */ - public T withNullableTimestampColumn(String name, boolean isInt96) { - withTimestampColumn(name, isInt96, true); - return (T) this; - } - } - - ParquetColumnWriterOptions(String columnName, boolean isTimestampTypeInt96, - int precision, boolean isNullable) { - this.isTimestampTypeInt96 = isTimestampTypeInt96; - this.precision = precision; - this.isNullable = isNullable; - this.columName = columnName; - } - - ParquetColumnWriterOptions(String columnName, boolean isNullable) { - this.isTimestampTypeInt96 = false; - this.precision = 0; - this.isNullable = isNullable; - this.columName = columnName; - } - - ParquetColumnWriterOptions(String columnName) { - this(columnName, true); - } - - boolean[] getFlatIsTimeTypeInt96() { - boolean[] ret = {isTimestampTypeInt96}; - - for (ParquetColumnWriterOptions opt: childColumnOptions) { - boolean[] b = opt.getFlatIsTimeTypeInt96(); - boolean[] tmp = new boolean[ret.length + b.length]; - System.arraycopy(ret, 0, tmp, 0, ret.length); - System.arraycopy(b, 0, tmp, ret.length, b.length); - ret = tmp; - } - return ret; - } - - int[] getFlatPrecision() { - int[] ret = {precision}; - - for (ParquetColumnWriterOptions opt: childColumnOptions) { - int[] b = opt.getFlatPrecision(); - int[] tmp = new int[ret.length + b.length]; - System.arraycopy(ret, 0, tmp, 0, ret.length); - System.arraycopy(b, 0, tmp, ret.length, b.length); - ret = tmp; - } - return ret; - } - - boolean[] getFlatIsNullable() { - boolean[] ret = {isNullable}; - - for (ParquetColumnWriterOptions opt: childColumnOptions) { - boolean[] b = opt.getFlatIsNullable(); - boolean[] tmp = new boolean[ret.length + b.length]; - System.arraycopy(ret, 0, tmp, 0, ret.length); - System.arraycopy(b, 0, tmp, ret.length, b.length); - ret = tmp; - } - return ret; - } - - int[] getFlatNumChildren() { - int[] ret = {childColumnOptions.length}; - - for (ParquetColumnWriterOptions opt: childColumnOptions) { - int[] b = opt.getFlatNumChildren(); - int[] tmp = new int[ret.length + b.length]; - System.arraycopy(ret, 0, tmp, 0, ret.length); - System.arraycopy(b, 0, tmp, ret.length, b.length); - ret = tmp; - } - return ret; - } - - String[] getFlatColumnNames() { - String[] ret = {columName}; - for (ParquetColumnWriterOptions opt: childColumnOptions) { - String[] b = opt.getFlatColumnNames(); - String[] tmp = new String[ret.length + b.length]; - System.arraycopy(ret, 0, tmp, 0, ret.length); - System.arraycopy(b, 0, tmp, ret.length, b.length); - ret = tmp; - } - return ret; - } - - /** - * Creates a ListBuilder for column called 'name' - * @param name - */ - public static ListBuilder listBuilder(String name) { - return new ListBuilder(name, true); - } - - /** - * Creates a ListBuilder for column called 'name' - * @param name - * @param isNullable - */ - public static ListBuilder listBuilder(String name, boolean isNullable) { - return new ListBuilder(name, isNullable); - } - - /** - * Creates a StructBuilder for column called 'name' - * @param name - * @param isNullable - */ - public static StructBuilder structBuilder(String name, boolean isNullable) { - return new StructBuilder(name, isNullable); - } - - /** - * Creates a StructBuilder for column called 'name' - * @param name - */ - public static StructBuilder structBuilder(String name) { - return new StructBuilder(name, true); - } - - /** - * Return if the column can have null values - */ - public String getColumName() { - return columName; - } - - /** - * Return if the column can have null values - */ - public boolean isNullable() { - return isNullable; - } - - /** - 
* Return the precision for this column - */ - public int getPrecision() { - return precision; - } - - /** - * Returns true if the writer is expected to write timestamps in INT96 - */ - public boolean isTimestampTypeInt96() { - return isTimestampTypeInt96; - } - - /** - * Return the child columnOptions for this column - */ - public ParquetColumnWriterOptions[] getChildColumnOptions() { - return childColumnOptions; - } - - public static class ParquetStructColumnWriterOptions extends ParquetColumnWriterOptions { - protected ParquetStructColumnWriterOptions(AbstractStructBuilder builder) { - super(builder); - } - } - - public static class ParquetListColumnWriterOptions extends ParquetColumnWriterOptions { - protected ParquetListColumnWriterOptions(ListBuilder builder) { - super(builder); - } - } - - public static class StructBuilder extends AbstractStructBuilder { - public StructBuilder(String name, boolean isNullable) { - super(name, isNullable); - } - - public ParquetStructColumnWriterOptions build() { - return new ParquetStructColumnWriterOptions(this); - } - } - - public static class ListBuilder extends NestedBuilder { - public ListBuilder(String name, boolean isNullable) { - super(name, isNullable); - } - - public ParquetListColumnWriterOptions build() { - return new ParquetListColumnWriterOptions(this); - } - } -} diff --git a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java index c0882f54251..2e793494b7b 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java @@ -18,33 +18,10 @@ package ai.rapids.cudf; -import java.util.LinkedHashMap; -import java.util.Map; - /** - * This class represents settings for writing Parquet files. It includes meta data information - * that will be used by the Parquet writer to write the file + * Settings for writing Parquet files. 
*/ -public final class ParquetWriterOptions extends ParquetColumnWriterOptions.ParquetStructColumnWriterOptions { - private final CompressionType compressionType; - private final Map metadata; - private final StatisticsFrequency statsGranularity; - - private ParquetWriterOptions(Builder builder) { - super(builder); - this.statsGranularity = builder.statsGranularity; - this.compressionType = builder.compressionType; - this.metadata = builder.metadata; - } - - String[] getMetadataKeys() { - return metadata.keySet().toArray(new String[metadata.size()]); - } - - String[] getMetadataValues() { - return metadata.values().toArray(new String[metadata.size()]); - } - +public class ParquetWriterOptions extends CompressedMetadataWriterOptions { public enum StatisticsFrequency { /** Do not generate statistics */ NONE(0), @@ -62,61 +39,32 @@ public enum StatisticsFrequency { } } - public static Builder builder() { - return new Builder(); - } - - public StatisticsFrequency getStatisticsFrequency() { - return statsGranularity; - } - - public CompressionType getCompressionType() { - return compressionType; - } - - public Map getMetadata() { - return metadata; - } - - public static class Builder extends ParquetColumnWriterOptions.AbstractStructBuilder { + public static class Builder extends CMWriterBuilder { private StatisticsFrequency statsGranularity = StatisticsFrequency.ROWGROUP; - final Map metadata = new LinkedHashMap<>(); - CompressionType compressionType = CompressionType.AUTO; + private boolean isTimestampTypeInt96 = false; + private int[] precisionValues = null; - public Builder() { - super(); - } - - /** - * Add a metadata key and a value - * @param key - * @param value - */ - public Builder withMetadata(String key, String value) { - this.metadata.put(key, value); + public Builder withStatisticsFrequency(StatisticsFrequency statsGranularity) { + this.statsGranularity = statsGranularity; return this; } /** - * Add a map of metadata keys and values - * @param metadata + * Set whether the timestamps should be written in INT96 */ - public Builder withMetadata(Map metadata) { - this.metadata.putAll(metadata); + public Builder withTimestampInt96(boolean int96) { + this.isTimestampTypeInt96 = int96; return this; } /** - * Set the compression type to use for writing - * @param compression + * This is a temporary hack to make things work. This API will go away once we can update the + * parquet APIs properly. + * @param precisionValues a value for each column, non-decimal columns are ignored. + * @return this for chaining. */ - public Builder withCompressionType(CompressionType compression) { - this.compressionType = compression; - return this; - } - - public Builder withStatisticsFrequency(StatisticsFrequency statsGranularity) { - this.statsGranularity = statsGranularity; + public Builder withDecimalPrecisions(int ... 
precisionValues) { + this.precisionValues = precisionValues; return this; } @@ -124,4 +72,40 @@ public ParquetWriterOptions build() { return new ParquetWriterOptions(this); } } + + public static Builder builder() { + return new Builder(); + } + + private final StatisticsFrequency statsGranularity; + + private ParquetWriterOptions(Builder builder) { + super(builder); + this.statsGranularity = builder.statsGranularity; + this.isTimestampTypeInt96 = builder.isTimestampTypeInt96; + this.precisions = builder.precisionValues; + } + + public StatisticsFrequency getStatisticsFrequency() { + return statsGranularity; + } + + /** + * Return the flattened list of precisions if set otherwise empty array will be returned. + * For a definition of what `flattened` means please look at {@link Builder#withDecimalPrecisions} + */ + public int[] getPrecisions() { + return precisions; + } + + /** + * Returns true if the writer is expected to write timestamps in INT96 + */ + public boolean isTimestampTypeInt96() { + return isTimestampTypeInt96; + } + + private boolean isTimestampTypeInt96; + + private int[] precisions; } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 5b17621cb42..8f256987dd2 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -244,7 +244,6 @@ private static native long[] readParquet(String[] filterColumnNames, String file /** * Setup everything to write parquet formatted data to a file. * @param columnNames names that correspond to the table columns - * @param flatNumChildren flattened list of children per column * @param nullable true if the column can have nulls else false * @param metadataKeys Metadata key names to place in the Parquet file * @param metadataValues Metadata values corresponding to metadataKeys @@ -257,20 +256,18 @@ private static native long[] readParquet(String[] filterColumnNames, String file * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. */ private static native long writeParquetFileBegin(String[] columnNames, - int[] flatNumChildren, boolean[] nullable, String[] metadataKeys, String[] metadataValues, int compression, int statsFreq, - boolean[] isInt96, + boolean isInt96, int[] precisions, String filename) throws CudfException; /** * Setup everything to write parquet formatted data to a buffer. * @param columnNames names that correspond to the table columns - * @param flatNumChildren flattened list of children per column * @param nullable true if the column can have nulls else false * @param metadataKeys Metadata key names to place in the Parquet file * @param metadataValues Metadata values corresponding to metadataKeys @@ -283,13 +280,12 @@ private static native long writeParquetFileBegin(String[] columnNames, * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. 
*/ private static native long writeParquetBufferBegin(String[] columnNames, - int[] flatNumChildren, boolean[] nullable, String[] metadataKeys, String[] metadataValues, int compression, int statsFreq, - boolean[] isInt96, + boolean isInt96, int[] precisions, HostBufferConsumer consumer) throws CudfException; @@ -823,43 +819,35 @@ private static class ParquetTableWriter implements TableWriter { HostBufferConsumer consumer; private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { - String[] columnNames = options.getFlatColumnNames(); - boolean[] columnNullabilities = options.getFlatIsNullable(); - boolean[] timeInt96Values = options.getFlatIsTimeTypeInt96(); - int[] precisions = options.getFlatPrecision(); - int[] flatNumChildren = options.getFlatNumChildren(); - + int numColumns = options.getColumnNames().length; + assert (numColumns == options.getColumnNullability().length); + int[] precisions = options.getPrecisions(); + if (precisions != null) { + assert (numColumns >= options.getPrecisions().length); + } this.consumer = null; - this.handle = writeParquetFileBegin(columnNames, - flatNumChildren, - columnNullabilities, + this.handle = writeParquetFileBegin(options.getColumnNames(), + options.getColumnNullability(), options.getMetadataKeys(), options.getMetadataValues(), options.getCompressionType().nativeId, options.getStatisticsFrequency().nativeId, - timeInt96Values, - precisions, + options.isTimestampTypeInt96(), + options.getPrecisions(), outputFile.getAbsolutePath()); } private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer consumer) { - String[] columnNames = options.getFlatColumnNames(); - boolean[] columnNullabilities = options.getFlatIsNullable(); - boolean[] timeInt96Values = options.getFlatIsTimeTypeInt96(); - int[] precisions = options.getFlatPrecision(); - int[] flatNumChildren = options.getFlatNumChildren(); - - this.consumer = consumer; - this.handle = writeParquetBufferBegin(columnNames, - flatNumChildren, - columnNullabilities, + this.handle = writeParquetBufferBegin(options.getColumnNames(), + options.getColumnNullability(), options.getMetadataKeys(), options.getMetadataValues(), options.getCompressionType().nativeId, options.getStatisticsFrequency().nativeId, - timeInt96Values, - precisions, + options.isTimestampTypeInt96(), + options.getPrecisions(), consumer); + this.consumer = consumer; } @Override diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 7d981229906..346ae8435cc 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -37,11 +37,9 @@ #include #include #include -#include "cudf/types.hpp" #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" -#include "jni_utils.hpp" #include "row_conversion.hpp" #include @@ -1070,102 +1068,56 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( CATCH_STD(env, NULL); } -int set_column_metadata(cudf::io::column_in_metadata &column_metadata, - std::vector &col_names, - cudf::jni::native_jbooleanArray &nullability, - cudf::jni::native_jbooleanArray &isInt96, - cudf::jni::native_jintArray &precisions, - cudf::jni::native_jintArray &children, int read_index) { - int write_index = 0; - int num_children = children[read_index++]; - column_metadata.children.resize(num_children); - for (int i = 0 ; i < num_children; i++, write_index++) { - column_metadata.child(write_index) - .set_name(col_names[read_index]) - .set_decimal_precision(precisions[read_index]) - 
.set_int96_timestamps(isInt96[read_index]) - .set_nullability(nullability[read_index]); - if (children[read_index] > 0) { - read_index = set_column_metadata(column_metadata.child(write_index), col_names, nullability, - isInt96, precisions, children, read_index); - } - else { - read_index++; - } - } - return read_index; -} - -void createTableMetaData(JNIEnv *env, jobjectArray &j_col_names, jintArray &j_children, - jbooleanArray &j_col_nullability, jobjectArray &j_metadata_keys, - jobjectArray &j_metadata_values, jint j_compression, jint j_stats_freq, - jbooleanArray &j_isInt96, jintArray &j_precisions, - cudf::io::table_input_metadata& metadata) { - cudf::jni::auto_set_device(env); - cudf::jni::native_jstringArray col_names(env, j_col_names); - cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); - cudf::jni::native_jbooleanArray isInt96(env, j_isInt96); - cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); - cudf::jni::native_jstringArray meta_values(env, j_metadata_values); - cudf::jni::native_jintArray precisions(env, j_precisions); - cudf::jni::native_jintArray children(env, j_children); - - auto cpp_names = col_names.as_cpp_vector(); - - int top_level_children = - children[0]; // this should never be 0, because a table must have columns - - // first value are dummy when reading - // but we write at index 0 - metadata.column_metadata.resize(top_level_children); - int read_index = 1; // the read_index, which will be used to read the arrays - for (int i = read_index, write_index = 0; i <= top_level_children; i++, write_index++) { - metadata.column_metadata[write_index] - .set_name(cpp_names[read_index]) - .set_nullability(col_nullability[read_index]) - .set_int96_timestamps(isInt96[read_index]) - .set_decimal_precision(precisions[read_index]); - - if (children[read_index] > 0) { - read_index = set_column_metadata(metadata.column_metadata[write_index], cpp_names, - col_nullability, isInt96, precisions, children, read_index); - } else { - read_index++; - } - } - for (auto i = 0; i < meta_keys.size(); ++i) { - metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); - } - -} - JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( - JNIEnv *env, jclass, jobjectArray j_col_names, jintArray j_children, - jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, - jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, - jobject consumer) { + JNIEnv *env, jclass, jobjectArray j_col_names, jbooleanArray j_col_nullability, + jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, + jint j_stats_freq, jboolean j_isInt96, jintArray j_precisions, jobject consumer) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0); JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0); JNI_NULL_CHECK(env, j_metadata_values, "null metadata values", 0); JNI_NULL_CHECK(env, consumer, "null consumer", 0); try { - std::unique_ptr data_sink( - new cudf::jni::jni_writer_data_sink(env, consumer)); - + cudf::jni::auto_set_device(env); using namespace cudf::io; - sink_info sink{data_sink.get()}; + cudf::jni::native_jstringArray col_names(env, j_col_names); + cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); + cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); + cudf::jni::native_jstringArray meta_values(env, j_metadata_values); + cudf::jni::native_jintArray 
precisions(env, j_precisions); + + auto cpp_names = col_names.as_cpp_vector(); table_input_metadata metadata; - createTableMetaData(env, j_col_names, j_children, j_col_nullability, j_metadata_keys, - j_metadata_values, j_compression, j_stats_freq, j_isInt96, j_precisions, - metadata); + metadata.column_metadata.resize(col_nullability.size()); + for (int i = 0; i < col_nullability.size(); i++) { + metadata.column_metadata[i] + .set_name(cpp_names[i]) + .set_nullability(col_nullability[i]) + .set_int96_timestamps(j_isInt96); + } + + // Precisions is not always set + for (int i = 0; i < precisions.size(); i++) { + metadata.column_metadata[i] + .set_decimal_precision(precisions[i]); + } + + for (auto i = 0; i < meta_keys.size(); ++i) { + metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); + } + std::unique_ptr data_sink( + new cudf::jni::jni_writer_data_sink(env, consumer)); + sink_info sink{data_sink.get()}; + std::vector const v_precisions( + precisions.data(), precisions.data() + precisions.size()); chunked_parquet_writer_options opts = chunked_parquet_writer_options::builder(sink) .metadata(&metadata) .compression(static_cast(j_compression)) .stats_level(static_cast(j_stats_freq)) .build(); + auto writer_ptr = std::make_unique(opts); cudf::jni::native_parquet_writer_handle *ret = new cudf::jni::native_parquet_writer_handle(std::move(writer_ptr), std::move(data_sink)); @@ -1175,22 +1127,44 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( } JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( - JNIEnv *env, jclass, jobjectArray j_col_names, jintArray j_children, - jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, - jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, - jstring j_output_path) { + JNIEnv *env, jclass, jobjectArray j_col_names, jbooleanArray j_col_nullability, + jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, + jint j_stats_freq, jboolean j_isInt96, jintArray j_precisions, jstring j_output_path) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0); JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0); JNI_NULL_CHECK(env, j_metadata_values, "null metadata values", 0); JNI_NULL_CHECK(env, j_output_path, "null output path", 0); try { + cudf::jni::auto_set_device(env); + using namespace cudf::io; + cudf::jni::native_jstringArray col_names(env, j_col_names); + cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); + cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); + cudf::jni::native_jstringArray meta_values(env, j_metadata_values); cudf::jni::native_jstring output_path(env, j_output_path); + cudf::jni::native_jintArray precisions(env, j_precisions); - using namespace cudf::io; + auto cpp_names = col_names.as_cpp_vector(); table_input_metadata metadata; - createTableMetaData(env, j_col_names, j_children, j_col_nullability, j_metadata_keys, - j_metadata_values, j_compression, j_stats_freq, j_isInt96, j_precisions, metadata); + metadata.column_metadata.resize(col_nullability.size()); + for (int i = 0; i < col_nullability.size(); i++) { + metadata.column_metadata[i] + .set_name(cpp_names[i]) + .set_nullability(col_nullability[i]) + .set_int96_timestamps(j_isInt96); + } + + // Precisions is not always set + for (int i = 0; i < precisions.size(); i++) { + metadata.column_metadata[i] + 
.set_decimal_precision(precisions[i]); + } + + for (auto i = 0; i < meta_keys.size(); ++i) { + metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); + } + sink_info sink{output_path.get()}; chunked_parquet_writer_options opts = chunked_parquet_writer_options::builder(sink) @@ -1859,7 +1833,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_crossJoin(JNIEnv *env, jc JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_interleaveColumns(JNIEnv *env, jclass, jlongArray j_cudf_table_view) { - JNI_NULL_CHECK(env, j_cudf_table_view, "table is null", 0); try { + JNI_NULL_CHECK(env, j_cudf_table_view, "table is null", 0); + try { cudf::jni::auto_set_device(env); cudf::table_view *table_view = reinterpret_cast(j_cudf_table_view); std::unique_ptr result = cudf::interleave_columns(*table_view); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index a047158d977..8b7ece5d60b 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -50,9 +50,6 @@ import static ai.rapids.cudf.Aggregate.max; import static ai.rapids.cudf.Aggregate.first; import static ai.rapids.cudf.Aggregate.last; -import static ai.rapids.cudf.ParquetColumnWriterOptions.*; -import static ai.rapids.cudf.ParquetWriterOptions.listBuilder; -import static ai.rapids.cudf.ParquetWriterOptions.structBuilder; import static ai.rapids.cudf.Table.TestBuilder; import static ai.rapids.cudf.Table.count; import static ai.rapids.cudf.Table.mean; @@ -4548,42 +4545,36 @@ void testTableBasedFilter() { } private Table getExpectedFileTable() { - return getExpectedFileTable(false, false); + return getExpectedFileTable(false); } private Table getExpectedFileTable(boolean withNestedColumns) { - return getExpectedFileTable(true, true); - } - - private Table getExpectedFileTable(boolean withStructColumns, boolean withListColumn) { TestBuilder tb = new TestBuilder() - .column(true, false, false, true, false) - .column(5, 1, 0, 2, 7) - .column(new Byte[]{2, 3, 4, 5, 9}) - .column(3l, 9l, 4l, 2l, 20l) - .column("this", "is", "a", "test", "string") - .column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f) - .column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d); - StructType nestedType = new StructType(true, - new BasicType(false, DType.INT32), new BasicType(false, DType.STRING)); - if (withStructColumns) { + .column(true, false, false, true, false) + .column(5, 1, 0, 2, 7) + .column(new Byte[]{2, 3, 4, 5, 9}) + .column(3l, 9l, 4l, 2l, 20l) + .column("this", "is", "a", "test", "string") + .column(1.0f, 3.5f, 5.9f, 7.1f, 9.8f) + .column(5.0d, 9.5d, 0.9d, 7.23d, 2.8d); + if (withNestedColumns) { + StructType nestedType = new StructType(true, + new BasicType(false, DType.INT32), new BasicType(false, DType.STRING)); tb.column(nestedType, - struct(1, "k1"), struct(2, "k2"), struct(3, "k3"), - struct(4, "k4"), new HostColumnVector.StructData((List) null)); - } - if (withListColumn) { - tb.column(new ListType(false, new BasicType(false, DType.INT32)), - Arrays.asList(1, 2), - Arrays.asList(3, 4), - Arrays.asList(5), - Arrays.asList(6, 7), - Arrays.asList(8, 9, 10)) - .column(new ListType(false, nestedType), - Arrays.asList(struct(1, "k1"), struct(2, "k2"), struct(3, "k3")), - Arrays.asList(struct(4, "k4"), struct(5, "k5")), - Arrays.asList(struct(6, "k6")), - Arrays.asList(new HostColumnVector.StructData((List) null)), - Arrays.asList()); + struct(1, "k1"), struct(2, "k2"), struct(3, "k3"), + struct(4, "k4"), new HostColumnVector.StructData((List) null)) + .column(new 
ListType(false, new BasicType(false, DType.INT32)), + Arrays.asList(1, 2), + Arrays.asList(3, 4), + Arrays.asList(5), + Arrays.asList(6, 7), + Arrays.asList(8, 9, 10)) + .column(new ListType(false, nestedType), + Arrays.asList(struct(1, "k1"), struct(2, "k2"), struct(3, "k3")), + Arrays.asList(struct(4, "k4"), struct(5, "k5")), + Arrays.asList(struct(6, "k6")), + Arrays.asList(new HostColumnVector.StructData((List) null)), + Arrays.asList()); } return tb.build(); } @@ -4651,9 +4642,9 @@ void testParquetWriteToBufferChunkedInt96() { try (Table table0 = getExpectedFileTableWithDecimals(); MyBufferConsumer consumer = new MyBufferConsumer()) { ParquetWriterOptions options = ParquetWriterOptions.builder() - .withNonNullableColumn("_c0", "_c1", "_c2", "_c3", "_c4", "_c5", "_c6") - .withDecimalColumn("_c7", 5) - .withDecimalColumn("_c8", 5) + .withColumnNames("_c1", "_c2", "_c3", "_c4", "_c5", "_c6", "_c7", "_c8", "_c9") + .withTimestampInt96(true) + .withDecimalPrecisions(0, 0, 0, 0, 0, 0, 0, 5, 5) .build(); try (TableWriter writer = Table.writeParquetChunked(options, consumer)) { @@ -4668,47 +4659,13 @@ void testParquetWriteToBufferChunkedInt96() { } } - @Test - void testParquetWriteToBufferChunkedWithNested() { - ParquetWriterOptions options = ParquetWriterOptions.builder() - .withNullableColumn("_c0", "_c1", "_c2", "_c3", "_c4", "_c5", "_c6") - .withStructColumn(structBuilder("_c7") - .withNullableColumn("_c7-1") - .withNullableColumn("_c7-2") - .build()) - .withListColumn(listBuilder("_c8") - .withNullableColumn("c8-1").build()) - .withListColumn(listBuilder("c9") - .withStructColumn(structBuilder("c9-1") - .withNullableColumn("c9-1-1") - .withNullableColumn("c9-1-2").build()) - .build()) - .build(); - try (Table table0 = getExpectedFileTable(true); - MyBufferConsumer consumer = new MyBufferConsumer()) { - try (TableWriter writer = Table.writeParquetChunked(options, consumer)) { - writer.write(table0); - writer.write(table0); - writer.write(table0); - } - try (Table table1 = Table.readParquet(ParquetOptions.DEFAULT, consumer.buffer, 0, - consumer.offset); - Table concat = Table.concatenate(table0, table0, table0)) { - assertTablesAreEqual(concat, table1); - } - } - } - @Test void testParquetWriteToBufferChunked() { ParquetWriterOptions options = ParquetWriterOptions.builder() - .withNullableColumn("_c0", "_c1", "_c2", "_c3", "_c4", "_c5", "_c6") - .withStructColumn(structBuilder("_c7") - .withNullableColumn("_c7-1") - .withNullableColumn("_c7-2") - .build()) + .withColumnNames("_c1", "_c2", "_c3", "_c4", "_c5", "_c6", "_c7") + .withTimestampInt96(true) .build(); - try (Table table0 = getExpectedFileTable(true, false); + try (Table table0 = getExpectedFileTable(); MyBufferConsumer consumer = new MyBufferConsumer()) { try (TableWriter writer = Table.writeParquetChunked(options, consumer)) { writer.write(table0); @@ -4727,11 +4684,11 @@ void testParquetWriteToFileWithNames() throws IOException { File tempFile = File.createTempFile("test-names", ".parquet"); try (Table table0 = getExpectedFileTableWithDecimals()) { ParquetWriterOptions options = ParquetWriterOptions.builder() - .withNonNullableColumn("first", "second", "third", "fourth", "fifth", "sixth", "seventh") - .withDecimalColumn("eighth", 5) - .withDecimalColumn("nineth", 6) + .withColumnNames("first", "second", "third", "fourth", "fifth", "sixth", "seventh", + "eighth", "nineth") .withCompressionType(CompressionType.NONE) .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE) + .withDecimalPrecisions(0, 0, 0, 0, 0, 0, 
0, 5, 6)
          .build();
     try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) {
       writer.write(table0);
@@ -4749,12 +4706,12 @@ void testParquetWriteToFileWithNamesAndMetadata() throws IOException {
     File tempFile = File.createTempFile("test-names-metadata", ".parquet");
     try (Table table0 = getExpectedFileTableWithDecimals()) {
       ParquetWriterOptions options = ParquetWriterOptions.builder()
-          .withNonNullableColumn("first", "second", "third", "fourth", "fifth", "sixth", "seventh")
-          .withDecimalColumn("eighth", 6)
-          .withDecimalColumn("nineth", 8)
+          .withColumnNames("first", "second", "third", "fourth", "fifth", "sixth", "seventh",
+              "eighth", "nineth")
           .withMetadata("somekey", "somevalue")
           .withCompressionType(CompressionType.NONE)
           .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE)
+          .withDecimalPrecisions(0, 0, 0, 0, 0, 0, 0, 6, 8)
           .build();
       try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) {
         writer.write(table0);
@@ -4772,11 +4729,10 @@ void testParquetWriteToFileUncompressedNoStats() throws IOException {
     File tempFile = File.createTempFile("test-uncompressed", ".parquet");
     try (Table table0 = getExpectedFileTableWithDecimals()) {
       ParquetWriterOptions options = ParquetWriterOptions.builder()
-          .withNonNullableColumn("_c0", "_c1", "_c2", "_c3", "_c4", "_c5", "_c6")
-          .withDecimalColumn("_c7", 4)
-          .withDecimalColumn("_c8", 6)
+          .withColumnNames("_c1", "_c2", "_c3", "_c4", "_c5", "_c6", "_c7", "_c8", "_c9")
           .withCompressionType(CompressionType.NONE)
           .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE)
+          .withDecimalPrecisions(0, 0, 0, 0, 0, 0, 0, 4, 6)
           .build();
       try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) {
         writer.write(table0);
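
Taken together, the Java-facing effect of this revert is that ParquetWriterOptions
goes back to describing columns positionally instead of as a tree: withColumnNames
lists every column, withTimestampInt96 is a single table-wide flag, and
withDecimalPrecisions carries one precision per column, with non-decimal positions
ignored. Below is a minimal sketch of the reverted-to usage. It assumes the
withColumnNames, withCompressionType, and withMetadata builder methods inherited
from CompressedMetadataWriterOptions (not shown in this diff) chain exactly as the
tests above exercise them, and the two-column layout is hypothetical.

import ai.rapids.cudf.CompressionType;
import ai.rapids.cudf.ParquetWriterOptions;

public class FlatParquetOptionsSketch {
  public static void main(String[] args) {
    // Two flat columns: a plain "id" column and a decimal "price" column with
    // precision 5. Zeros mark non-decimal positions, as in the tests above.
    ParquetWriterOptions options = ParquetWriterOptions.builder()
        .withColumnNames("id", "price")
        .withTimestampInt96(true)  // applies to every timestamp column at once
        .withCompressionType(CompressionType.NONE)
        .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE)
        .withDecimalPrecisions(0, 5)
        .build();
    // ParquetTableWriter forwards getPrecisions() and isTimestampTypeInt96()
    // straight to writeParquetFileBegin / writeParquetBufferBegin.
    System.out.println(options.getStatisticsFrequency());
  }
}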
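
One subtlety the restored glue code relies on: the precision array is optional and
may be shorter than the column list. ParquetTableWriter only asserts
numColumns >= precisions.length, and the loops in TableJni.cpp call
set_decimal_precision on just the first precisions.size() columns ("Precisions is
not always set"). A stand-alone sketch of that contract follows; the helper name
is hypothetical and only mirrors the shape of the JNI loop.

import java.util.Arrays;

public class PrecisionContractSketch {
  // Mirrors the native loop: only the first precisions.length columns receive
  // a decimal precision; a null array means no column receives one.
  static int[] appliedPrecisions(int numColumns, int[] precisions) {
    int[] applied = new int[numColumns];  // 0 stands for "not a decimal column"
    if (precisions != null) {
      assert numColumns >= precisions.length : "more precisions than columns";
      System.arraycopy(precisions, 0, applied, 0, precisions.length);
    }
    return applied;
  }

  public static void main(String[] args) {
    // Nine columns with the last two decimal, as in testParquetWriteToFileWithNames.
    System.out.println(Arrays.toString(
        appliedPrecisions(9, new int[]{0, 0, 0, 0, 0, 0, 0, 5, 6})));
    // Supplying no precisions at all is also legal.
    System.out.println(Arrays.toString(appliedPrecisions(3, null)));
  }
}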
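
For contrast, the encoding this PR removes described a column tree rather than a
list: the deleted ParquetColumnWriterOptions flattened names, nullability,
precisions, and child counts into parallel pre-order arrays (getFlatColumnNames,
getFlatNumChildren, and friends), inserted DUMMY_CHILD as the first child of every
LIST column, and set_column_metadata in TableJni.cpp rebuilt the tree with its
recursive read_index/write_index walk, skipping index 0 ("first value are dummy
when reading"). Here is a self-contained sketch of that pre-order encoding; the
Node type and column names are hypothetical stand-ins for the deleted options
classes.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class FlattenSketch {
  static final class Node {
    final String name;
    final List<Node> children;
    Node(String name, Node... children) {
      this.name = name;
      this.children = Arrays.asList(children);
    }
  }

  // Stand-in for the first child of a LIST column, which the native side drops.
  static final Node DUMMY_CHILD = new Node("DUMMY");

  // Pre-order walk: emit a node, then each of its children in order, the same
  // shape as the getFlat* methods of the deleted class.
  static void flatten(Node n, List<String> names, List<Integer> numChildren) {
    names.add(n.name);
    numChildren.add(n.children.size());
    for (Node c : n.children) {
      flatten(c, names, numChildren);
    }
  }

  public static void main(String[] args) {
    // A plain column "c1" plus a LIST column "c2" whose element column is "c2-1".
    Node root = new Node("",  // unnamed top-level struct, written at index 0
        new Node("c1"),
        new Node("c2", DUMMY_CHILD, new Node("c2-1")));
    List<String> names = new ArrayList<>();
    List<Integer> numChildren = new ArrayList<>();
    flatten(root, names, numChildren);
    // names:       [, c1, c2, DUMMY, c2-1]
    // numChildren: [2, 0, 2, 0, 0]
    // createTableMetaData starts reading at index 1 and recurses wherever
    // numChildren is positive, mirroring set_column_metadata above.
    System.out.println(names);
    System.out.println(numChildren);
  }
}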