From 7d695600c589ff064650c68c606c4c537054699e Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Thu, 17 Jun 2021 13:23:32 -0700 Subject: [PATCH 1/8] Optimizations * Split returns a ColumnView in cases where we don't need to own the underlying buffers * Added a method to write ColumnView to parquet directly, circumventing the formation of Table with ColumnVectors --- .../java/ai/rapids/cudf/ColumnVector.java | 53 +++++++++++++++++++ .../main/java/ai/rapids/cudf/ColumnView.java | 28 +++++----- java/src/main/java/ai/rapids/cudf/Table.java | 48 +++++++++++++++++ java/src/main/native/CMakeLists.txt | 15 +++++- java/src/main/native/src/ColumnViewJni.cpp | 14 ++--- 5 files changed, 137 insertions(+), 21 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index e543d0c7b21..6d398100a2a 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -1639,4 +1639,57 @@ public static ColumnVector empty(HostColumnVector.DataType colType) { throw new IllegalArgumentException("Unsupported data type: " + colType); } } + + /** + * Splits a column (including null values) into a set of columns + * according to a set of indices. The caller owns the ColumnVectors and is responsible + * closing them. + * + * The "split" function divides the input column into multiple intervals + * of rows using the splits indices values and it stores the intervals into the + * output columns. Regarding the interval of indices, a pair of values are taken + * from the indices array in a consecutive manner. The pair of indices are + * left-closed and right-open. + * + * The indices array ('splits') is require to be a monotonic non-decreasing set. + * The indices in the array are required to comply with the following conditions: + * a, b belongs to Range[0, input column size] + * a <= b, where the position of a is less or equal to the position of b. + * + * The split function will take a pair of indices from the indices array + * ('splits') in a consecutive manner. For the first pair, the function will + * take the value 0 and the first element of the indices array. For the last pair, + * the function will take the last element of the indices array and the size of + * the input column. + * + * Exceptional cases for the indices array are: + * When the values in the pair are equal, the function return an empty column. + * When the values in the pair are 'strictly decreasing', the outcome is + * undefined. + * When any of the values in the pair don't belong to the range[0, input column + * size), the outcome is undefined. + * When the indices array is empty, an empty vector of columns is returned. + * + * The input columns may have different sizes. The number of + * columns must be equal to the number of indices in the array plus one. + * + * Example: + * input: {10, 12, 14, 16, 18, 20, 22, 24, 26, 28} + * splits: {2, 5, 9} + * output: {{10, 12}, {14, 16, 18}, {20, 22, 24, 26}, {28}} + * + * Note that this is very similar to the output from a PartitionedTable. + * + * @param indices the indexes to split with + * @return A new ColumnVector array with slices from the original ColumnVector + */ + @Override + public final ColumnVector[] split(int... indices) { + ColumnView[] views = super.split(indices); + ColumnVector[] columnVectors = new ColumnVector[views.length]; + for (int i = 0; i < views.length; i++) { + columnVectors[i] = views[i].copyToColumnVector(); + } + return columnVectors; + } } diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index db42a8c9ca2..2eeb97a2723 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -551,9 +551,8 @@ public final ColumnVector subVector(int start, int end) { } /** - * Splits a column (including null values) into a set of columns - * according to a set of indices. The caller owns the ColumnVectors and is responsible - * closing them. + * Splits a ColumnView (including null values) into a set of ColumnViews + * according to a set of indices. No data is moved or copied * * The "split" function divides the input column into multiple intervals * of rows using the splits indices values and it stores the intervals into the @@ -561,10 +560,10 @@ public final ColumnVector subVector(int start, int end) { * from the indices array in a consecutive manner. The pair of indices are * left-closed and right-open. * - * The indices array ('splits') is require to be a monotonic non-decreasing set. + * The indices array ('splits') is required to be a monotonic non-decreasing set. * The indices in the array are required to comply with the following conditions: * a, b belongs to Range[0, input column size] - * a <= b, where the position of a is less or equal to the position of b. + * a <= b, where the position of 'a' is less or equal to the position of 'b'. * * The split function will take a pair of indices from the indices array * ('splits') in a consecutive manner. For the first pair, the function will @@ -578,9 +577,9 @@ public final ColumnVector subVector(int start, int end) { * undefined. * When any of the values in the pair don't belong to the range[0, input column * size), the outcome is undefined. - * When the indices array is empty, an empty vector of columns is returned. + * When the indices array is empty, an empty array of ColumnViews is returned. * - * The input columns may have different sizes. The number of + * The output columns may have different sizes. The number of * columns must be equal to the number of indices in the array plus one. * * Example: @@ -590,16 +589,19 @@ public final ColumnVector subVector(int start, int end) { * * Note that this is very similar to the output from a PartitionedTable. * - * @param indices the indexes to split with - * @return A new ColumnVector array with slices from the original ColumnVector + * NOTE: Nothing is copied out from the vector and the slices will only be relevant for the + * lifecycle of the underlying ColumnVector + * + * @param indices the indices to split with + * @return A new ColumnView array with slices from the original ColumnView */ - public final ColumnVector[] split(int... indices) { + public ColumnView[] split(int... indices) { long[] nativeHandles = split(this.getNativeView(), indices); - ColumnVector[] columnVectors = new ColumnVector[nativeHandles.length]; + ColumnView[] columnViews = new ColumnView[nativeHandles.length]; for (int i = 0; i < nativeHandles.length; i++) { - columnVectors[i] = new ColumnVector(nativeHandles[i]); + columnViews[i] = new ColumnView(nativeHandles[i]); } - return columnVectors; + return columnViews; } /** diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 3cb836c7feb..f99cb805ab6 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -79,6 +79,32 @@ public Table(ColumnVector... columns) { nativeHandle = createCudfTableView(viewPointers); } + /** + * Creates a Table that makes a copy of the array of {@link ColumnView}s passed to it. + * NOTE: The refcounts on the {@link ColumnVector} pointed by {@link ColumnView} will not be + * increased so once the {@link ColumnVector} is deleted, this {@link Table} will be useless + * use the {@link Table(ColumnVector)} if you want the table to point to ColumnVectors instead + * @param columnsViews - Array of ColumnViews + */ + private Table(ColumnView... columnsViews) { + assert columnsViews != null && columnsViews.length > 0 : "ColumnViews can't be null or empty"; + rows = columnsViews[0].getRowCount(); + + for (ColumnView columnView : columnsViews) { + assert (null != columnView) : "ColumnViews can't be null"; + assert (rows == columnView.getRowCount()) : "All columns should have the same number of " + + "rows " + columnView.getType(); + } + + // Since Arrays are mutable objects make a copy + long[] viewPointers = new long[columnsViews.length]; + for (int i = 0; i < columnsViews.length; i++) { + viewPointers[i] = columnsViews[i].getNativeView(); + } + + nativeHandle = createCudfTableView(viewPointers); + } + /** * Create a Table from an array of existing on device cudf::column pointers. Ownership of the * columns is transferred to the ColumnVectors held by the new Table. In the case of an exception @@ -920,6 +946,28 @@ public static TableWriter writeParquetChunked(ParquetWriterOptions options, return new ParquetTableWriter(options, consumer); } + /** + * This is an evolving API and most likely be removed in future releases. Please use with the + * caveat that this will not exist in the near future + * @param options the parquet writer options. + * @param consumer a class that will be called when host buffers are ready with parquet + * formatted data in them. + * @param columnViews ColumnViews to write to Parquet + */ + public static void writeColumnViewsToParquet(ParquetWriterOptions options, + HostBufferConsumer consumer, + ColumnView... columnViews) { + + try (ParquetTableWriter writer = new ParquetTableWriter(options, consumer); + Table notARealTable = new Table(columnViews)) { + long total = 0; + for (ColumnView cv: columnViews) { + total += cv.getDeviceMemorySize(); + } + writeParquetChunk(writer.handle, notARealTable.nativeHandle, total); + } + } + /** * Writes this table to a Parquet file on the host * diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index b75988309c3..85c769f2513 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -52,7 +52,11 @@ message(VERBOSE "CUDF_JNI: Build with GPUDirect Storage support: ${USE_GDS}") message(VERBOSE "CUDF_JNI: Build with static Arrow library: ${CUDF_JNI_ARROW_STATIC}") set(CUDF_SOURCE_DIR "${PROJECT_SOURCE_DIR}/../../../../cpp") -set(CUDF_CPP_BUILD_DIR "${CUDF_SOURCE_DIR}/build") +if (DEFINED ENV{CUDF_CPP_BUILD_DIR}) + set(CUDF_CPP_BUILD_DIR "$ENV{CUDF_CPP_BUILD_DIR}") +else() + set(CUDF_CPP_BUILD_DIR "${CUDF_SOURCE_DIR}/build") +endif() set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/" @@ -111,27 +115,34 @@ endif(CUDA_STATIC_RUNTIME) ################################################################################################### # - Thrust/CUB/libcudacxx ------------------------------------------------------------------------------------ - +message("cpp build dir: " ${CUDF_CPP_BUILD_DIR}) find_path(THRUST_INCLUDE "thrust" HINTS "$ENV{CUDF_ROOT}/_deps/thrust-src" "${CUDF_CPP_BUILD_DIR}/_deps/thrust-src" "$ENV{CONDA_PREFIX}/include") +message(STATUS "THRUST: THRUST_INCLUDE set to ${THRUST_INCLUDE}") + find_path(CUB_INCLUDE "cub" HINTS "$ENV{CUDF_ROOT}/_deps/thrust-src" "${CUDF_CPP_BUILD_DIR}/_deps/thrust-src" "$ENV{CONDA_PREFIX}/include") +message(STATUS "CUB: CUB_INCLUDE set to ${CUB_INCLUDE}") + find_path(LIBCUDACXX_INCLUDE "cuda" HINTS "$ENV{CUDF_ROOT}/_deps/libcudacxx-src/include" "${CUDF_CPP_BUILD_DIR}/_deps/libcudacxx-src/include") +message(STATUS "LIBCUDACXX: LIBCUDACXX_INCLUDE set to ${LIBCUDACXX_INCLUDE}") + find_path(SPDLOG_INCLUDE "spdlog" HINTS "${CUDF_CPP_BUILD_DIR}/_deps/spdlog-src/include" "$ENV{RMM_ROOT}/_deps/spdlog-src/include" "$ENV{RMM_ROOT}/include" "$ENV{CONDA_PREFIX}/include") +message(STATUS "SPDLOG: SPDLOG_INCLUDE set to ${SPDLOG_INCLUDE}") ################################################################################################### # - CUDF ------------------------------------------------------------------------------------------ diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index d41ed97b4cb..72bca0623cb 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -470,14 +470,16 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_split(JNIEnv *env, j } std::vector result = cudf::split(*n_column, indices); + + // std::transform(result.begin(), result.end(), result.begin(), + // [](cudf::column_view const &c) -> jlong { return reinterpret_cast(&c); }); + + // cudf::jni::native_jlongArray n_result(env, (jlong*)result.data(), result.size()); + cudf::jni::native_jlongArray n_result(env, result.size()); - std::vector> column_result(result.size()); for (size_t i = 0; i < result.size(); i++) { - column_result[i].reset(new cudf::column(result[i])); - n_result[i] = reinterpret_cast(column_result[i].get()); - } - for (size_t i = 0; i < result.size(); i++) { - column_result[i].release(); + cudf::column_view const * c = new cudf::column_view(result[i]); + n_result[i] = reinterpret_cast(c); } return n_result.get_jArray(); } From 17e6366eff0b01971372c40ebbd88416045f9d9e Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Thu, 17 Jun 2021 13:53:40 -0700 Subject: [PATCH 2/8] code cleanup --- java/src/main/native/CMakeLists.txt | 1 - java/src/main/native/src/ColumnViewJni.cpp | 5 ----- 2 files changed, 6 deletions(-) diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 85c769f2513..6c891511a1a 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -115,7 +115,6 @@ endif(CUDA_STATIC_RUNTIME) ################################################################################################### # - Thrust/CUB/libcudacxx ------------------------------------------------------------------------------------ -message("cpp build dir: " ${CUDF_CPP_BUILD_DIR}) find_path(THRUST_INCLUDE "thrust" HINTS "$ENV{CUDF_ROOT}/_deps/thrust-src" "${CUDF_CPP_BUILD_DIR}/_deps/thrust-src" diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 72bca0623cb..9356e698d31 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -471,11 +471,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_split(JNIEnv *env, j std::vector result = cudf::split(*n_column, indices); - // std::transform(result.begin(), result.end(), result.begin(), - // [](cudf::column_view const &c) -> jlong { return reinterpret_cast(&c); }); - - // cudf::jni::native_jlongArray n_result(env, (jlong*)result.data(), result.size()); - cudf::jni::native_jlongArray n_result(env, result.size()); for (size_t i = 0; i < result.size(); i++) { cudf::column_view const * c = new cudf::column_view(result[i]); From e615ba5b11907f1b2ff03412f24cb678383e04bc Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Thu, 17 Jun 2021 15:24:12 -0700 Subject: [PATCH 3/8] Addressed review comments --- .../java/ai/rapids/cudf/ColumnVector.java | 53 +----------------- .../main/java/ai/rapids/cudf/ColumnView.java | 54 +++++++++++++++++- java/src/main/java/ai/rapids/cudf/Table.java | 56 ++++++++----------- 3 files changed, 78 insertions(+), 85 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 6d398100a2a..89c913b1e8b 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -1640,56 +1640,5 @@ public static ColumnVector empty(HostColumnVector.DataType colType) { } } - /** - * Splits a column (including null values) into a set of columns - * according to a set of indices. The caller owns the ColumnVectors and is responsible - * closing them. - * - * The "split" function divides the input column into multiple intervals - * of rows using the splits indices values and it stores the intervals into the - * output columns. Regarding the interval of indices, a pair of values are taken - * from the indices array in a consecutive manner. The pair of indices are - * left-closed and right-open. - * - * The indices array ('splits') is require to be a monotonic non-decreasing set. - * The indices in the array are required to comply with the following conditions: - * a, b belongs to Range[0, input column size] - * a <= b, where the position of a is less or equal to the position of b. - * - * The split function will take a pair of indices from the indices array - * ('splits') in a consecutive manner. For the first pair, the function will - * take the value 0 and the first element of the indices array. For the last pair, - * the function will take the last element of the indices array and the size of - * the input column. - * - * Exceptional cases for the indices array are: - * When the values in the pair are equal, the function return an empty column. - * When the values in the pair are 'strictly decreasing', the outcome is - * undefined. - * When any of the values in the pair don't belong to the range[0, input column - * size), the outcome is undefined. - * When the indices array is empty, an empty vector of columns is returned. - * - * The input columns may have different sizes. The number of - * columns must be equal to the number of indices in the array plus one. - * - * Example: - * input: {10, 12, 14, 16, 18, 20, 22, 24, 26, 28} - * splits: {2, 5, 9} - * output: {{10, 12}, {14, 16, 18}, {20, 22, 24, 26}, {28}} - * - * Note that this is very similar to the output from a PartitionedTable. - * - * @param indices the indexes to split with - * @return A new ColumnVector array with slices from the original ColumnVector - */ - @Override - public final ColumnVector[] split(int... indices) { - ColumnView[] views = super.split(indices); - ColumnVector[] columnVectors = new ColumnVector[views.length]; - for (int i = 0; i < views.length; i++) { - columnVectors[i] = views[i].copyToColumnVector(); - } - return columnVectors; - } + } diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 2eeb97a2723..1b9b497e5de 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -550,6 +550,58 @@ public final ColumnVector subVector(int start, int end) { return tmp[0]; } + /** + * Splits a column (including null values) into a set of columns + * according to a set of indices. The caller owns the ColumnVectors and is responsible + * closing them. + * + * The "split" function divides the input column into multiple intervals + * of rows using the splits indices values and it stores the intervals into the + * output columns. Regarding the interval of indices, a pair of values are taken + * from the indices array in a consecutive manner. The pair of indices are + * left-closed and right-open. + * + * The indices array ('splits') is require to be a monotonic non-decreasing set. + * The indices in the array are required to comply with the following conditions: + * a, b belongs to Range[0, input column size] + * a <= b, where the position of a is less or equal to the position of b. + * + * The split function will take a pair of indices from the indices array + * ('splits') in a consecutive manner. For the first pair, the function will + * take the value 0 and the first element of the indices array. For the last pair, + * the function will take the last element of the indices array and the size of + * the input column. + * + * Exceptional cases for the indices array are: + * When the values in the pair are equal, the function return an empty column. + * When the values in the pair are 'strictly decreasing', the outcome is + * undefined. + * When any of the values in the pair don't belong to the range[0, input column + * size), the outcome is undefined. + * When the indices array is empty, an empty vector of columns is returned. + * + * The input columns may have different sizes. The number of + * columns must be equal to the number of indices in the array plus one. + * + * Example: + * input: {10, 12, 14, 16, 18, 20, 22, 24, 26, 28} + * splits: {2, 5, 9} + * output: {{10, 12}, {14, 16, 18}, {20, 22, 24, 26}, {28}} + * + * Note that this is very similar to the output from a PartitionedTable. + * + * @param indices the indexes to split with + * @return A new ColumnVector array with slices from the original ColumnVector + */ + public final ColumnVector[] split(int... indices) { + ColumnView[] views = splitAsViews(indices); + ColumnVector[] columnVectors = new ColumnVector[views.length]; + for (int i = 0; i < views.length; i++) { + columnVectors[i] = views[i].copyToColumnVector(); + } + return columnVectors; + } + /** * Splits a ColumnView (including null values) into a set of ColumnViews * according to a set of indices. No data is moved or copied @@ -595,7 +647,7 @@ public final ColumnVector subVector(int start, int end) { * @param indices the indices to split with * @return A new ColumnView array with slices from the original ColumnView */ - public ColumnView[] split(int... indices) { + public ColumnView[] splitAsViews(int... indices) { long[] nativeHandles = split(this.getNativeView(), indices); ColumnView[] columnViews = new ColumnView[nativeHandles.length]; for (int i = 0; i < nativeHandles.length; i++) { diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index f99cb805ab6..f918cf3618f 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -79,32 +79,6 @@ public Table(ColumnVector... columns) { nativeHandle = createCudfTableView(viewPointers); } - /** - * Creates a Table that makes a copy of the array of {@link ColumnView}s passed to it. - * NOTE: The refcounts on the {@link ColumnVector} pointed by {@link ColumnView} will not be - * increased so once the {@link ColumnVector} is deleted, this {@link Table} will be useless - * use the {@link Table(ColumnVector)} if you want the table to point to ColumnVectors instead - * @param columnsViews - Array of ColumnViews - */ - private Table(ColumnView... columnsViews) { - assert columnsViews != null && columnsViews.length > 0 : "ColumnViews can't be null or empty"; - rows = columnsViews[0].getRowCount(); - - for (ColumnView columnView : columnsViews) { - assert (null != columnView) : "ColumnViews can't be null"; - assert (rows == columnView.getRowCount()) : "All columns should have the same number of " + - "rows " + columnView.getType(); - } - - // Since Arrays are mutable objects make a copy - long[] viewPointers = new long[columnsViews.length]; - for (int i = 0; i < columnsViews.length; i++) { - viewPointers[i] = columnsViews[i].getNativeView(); - } - - nativeHandle = createCudfTableView(viewPointers); - } - /** * Create a Table from an array of existing on device cudf::column pointers. Ownership of the * columns is transferred to the ColumnVectors held by the new Table. In the case of an exception @@ -957,14 +931,32 @@ public static TableWriter writeParquetChunked(ParquetWriterOptions options, public static void writeColumnViewsToParquet(ParquetWriterOptions options, HostBufferConsumer consumer, ColumnView... columnViews) { + assert columnViews != null && columnViews.length > 0 : "ColumnViews can't be null or empty"; + long rows = columnViews[0].getRowCount(); + + for (ColumnView columnView : columnViews) { + assert (null != columnView) : "ColumnViews can't be null"; + assert (rows == columnView.getRowCount()) : "All columns should have the same number of " + + "rows " + columnView.getType(); + } - try (ParquetTableWriter writer = new ParquetTableWriter(options, consumer); - Table notARealTable = new Table(columnViews)) { - long total = 0; - for (ColumnView cv: columnViews) { - total += cv.getDeviceMemorySize(); + // Since Arrays are mutable objects make a copy + long[] viewPointers = new long[columnViews.length]; + for (int i = 0; i < columnViews.length; i++) { + viewPointers[i] = columnViews[i].getNativeView(); + } + + long nativeHandle = createCudfTableView(viewPointers); + try { + try (ParquetTableWriter writer = new ParquetTableWriter(options, consumer)) { + long total = 0; + for (ColumnView cv : columnViews) { + total += cv.getDeviceMemorySize(); + } + writeParquetChunk(writer.handle, nativeHandle, total); } - writeParquetChunk(writer.handle, notARealTable.nativeHandle, total); + } finally { + deleteCudfTable(nativeHandle); } } From de7d9e1405a017de1115658729eaee45272c4591 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Thu, 17 Jun 2021 16:00:32 -0700 Subject: [PATCH 4/8] removed empty lines --- java/src/main/java/ai/rapids/cudf/ColumnVector.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 89c913b1e8b..e543d0c7b21 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -1639,6 +1639,4 @@ public static ColumnVector empty(HostColumnVector.DataType colType) { throw new IllegalArgumentException("Unsupported data type: " + colType); } } - - } From 802a3801af83b00c5ec91c81affa7bb7f4dc26ee Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Fri, 18 Jun 2021 10:09:55 -0700 Subject: [PATCH 5/8] addressed review comments --- .../main/java/ai/rapids/cudf/ColumnView.java | 23 +++++++++++++------ java/src/main/java/ai/rapids/cudf/Table.java | 6 ++--- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 1b9b497e5de..eeafd2c1527 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -595,16 +595,26 @@ public final ColumnVector subVector(int start, int end) { */ public final ColumnVector[] split(int... indices) { ColumnView[] views = splitAsViews(indices); - ColumnVector[] columnVectors = new ColumnVector[views.length]; - for (int i = 0; i < views.length; i++) { - columnVectors[i] = views[i].copyToColumnVector(); + try { + ColumnVector[] columnVectors = new ColumnVector[views.length]; + for (int i = 0; i < views.length; i++) { + columnVectors[i] = views[i].copyToColumnVector(); + } + return columnVectors; + } finally { + for (ColumnView view: views) { + view.close(); + } } - return columnVectors; } /** * Splits a ColumnView (including null values) into a set of ColumnViews - * according to a set of indices. No data is moved or copied + * according to a set of indices. No data is moved or copied. + * + * IMPORTANT NOTE: Nothing is copied out from the vector and the slices will only be relevant for + * the lifecycle of the underlying ColumnVector + * * * The "split" function divides the input column into multiple intervals * of rows using the splits indices values and it stores the intervals into the @@ -641,8 +651,7 @@ public final ColumnVector[] split(int... indices) { * * Note that this is very similar to the output from a PartitionedTable. * - * NOTE: Nothing is copied out from the vector and the slices will only be relevant for the - * lifecycle of the underlying ColumnVector + * * @param indices the indices to split with * @return A new ColumnView array with slices from the original ColumnView diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index f918cf3618f..c0515521cc5 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -922,9 +922,9 @@ public static TableWriter writeParquetChunked(ParquetWriterOptions options, /** * This is an evolving API and most likely be removed in future releases. Please use with the - * caveat that this will not exist in the near future - * @param options the parquet writer options. - * @param consumer a class that will be called when host buffers are ready with parquet + * caveat that this will not exist in the near future. + * @param options the Parquet writer options. + * @param consumer a class that will be called when host buffers are ready with Parquet * formatted data in them. * @param columnViews ColumnViews to write to Parquet */ From 2dda55a2ce6398ae8febce037c9144cc29caa6a1 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Fri, 18 Jun 2021 10:13:12 -0700 Subject: [PATCH 6/8] removed empty lines --- java/rmm_log.txt | 6 ++++++ java/src/main/java/ai/rapids/cudf/ColumnView.java | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 java/rmm_log.txt diff --git a/java/rmm_log.txt b/java/rmm_log.txt new file mode 100644 index 00000000000..685c4d76d22 --- /dev/null +++ b/java/rmm_log.txt @@ -0,0 +1,6 @@ +[ 26071][10:10:58:581576][info ] ----- RMM LOG BEGIN [PTDS DISABLED] ----- +[ 26071][10:10:58:581601][error ] [A][Stream 0x1][Upstream 3458764513820540928B][FAILURE maximum pool size exceeded] +[ 26071][10:10:58:581633][error ] [A][Stream 0x1][Upstream 3458764513820540928B][FAILURE maximum pool size exceeded] +[ 26071][10:10:58:581650][error ] [A][Stream 0x1][Upstream 3458764513820540928B][FAILURE maximum pool size exceeded] +[ 26071][10:10:59:089502][error ] [A][Stream 0x1][Upstream 1024B][FAILURE maximum pool size exceeded] +[ 26071][10:11:01:377575][error ] [A][Stream 0x1][Upstream 3458764513820540928B][FAILURE maximum pool size exceeded] diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index eeafd2c1527..ecd5f7f1ffe 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -615,7 +615,6 @@ public final ColumnVector[] split(int... indices) { * IMPORTANT NOTE: Nothing is copied out from the vector and the slices will only be relevant for * the lifecycle of the underlying ColumnVector * - * * The "split" function divides the input column into multiple intervals * of rows using the splits indices values and it stores the intervals into the * output columns. Regarding the interval of indices, a pair of values are taken @@ -651,7 +650,6 @@ public final ColumnVector[] split(int... indices) { * * Note that this is very similar to the output from a PartitionedTable. * - * * @param indices the indices to split with * @return A new ColumnView array with slices from the original ColumnView From 3f7d70bc8a7b0b817614b94864af50712d9bd2fe Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Fri, 18 Jun 2021 10:14:34 -0700 Subject: [PATCH 7/8] removed the rmm_log.txt --- java/rmm_log.txt | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 java/rmm_log.txt diff --git a/java/rmm_log.txt b/java/rmm_log.txt deleted file mode 100644 index 685c4d76d22..00000000000 --- a/java/rmm_log.txt +++ /dev/null @@ -1,6 +0,0 @@ -[ 26071][10:10:58:581576][info ] ----- RMM LOG BEGIN [PTDS DISABLED] ----- -[ 26071][10:10:58:581601][error ] [A][Stream 0x1][Upstream 3458764513820540928B][FAILURE maximum pool size exceeded] -[ 26071][10:10:58:581633][error ] [A][Stream 0x1][Upstream 3458764513820540928B][FAILURE maximum pool size exceeded] -[ 26071][10:10:58:581650][error ] [A][Stream 0x1][Upstream 3458764513820540928B][FAILURE maximum pool size exceeded] -[ 26071][10:10:59:089502][error ] [A][Stream 0x1][Upstream 1024B][FAILURE maximum pool size exceeded] -[ 26071][10:11:01:377575][error ] [A][Stream 0x1][Upstream 3458764513820540928B][FAILURE maximum pool size exceeded] From f9858e09480a24d17705dc6518e1a940e94998ce Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Fri, 18 Jun 2021 10:30:41 -0700 Subject: [PATCH 8/8] close vectors when there is an exception --- java/src/main/java/ai/rapids/cudf/ColumnView.java | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index ecd5f7f1ffe..100536af9e7 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -595,14 +595,21 @@ public final ColumnVector subVector(int start, int end) { */ public final ColumnVector[] split(int... indices) { ColumnView[] views = splitAsViews(indices); + ColumnVector[] columnVectors = new ColumnVector[views.length]; try { - ColumnVector[] columnVectors = new ColumnVector[views.length]; for (int i = 0; i < views.length; i++) { columnVectors[i] = views[i].copyToColumnVector(); } return columnVectors; + } catch (Throwable t) { + for (ColumnVector cv : columnVectors) { + if (cv != null) { + cv.close(); + } + } + throw t; } finally { - for (ColumnView view: views) { + for (ColumnView view : views) { view.close(); } } @@ -613,7 +620,7 @@ public final ColumnVector[] split(int... indices) { * according to a set of indices. No data is moved or copied. * * IMPORTANT NOTE: Nothing is copied out from the vector and the slices will only be relevant for - * the lifecycle of the underlying ColumnVector + * the lifecycle of the underlying ColumnVector. * * The "split" function divides the input column into multiple intervals * of rows using the splits indices values and it stores the intervals into the