rapidsai · rapids-bot · Jun 18, 2021 · Jun 17, 2021 · Jun 17, 2021 · Jun 17, 2021
diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java
@@ -1639,4 +1639,72 @@ public static ColumnVector empty(HostColumnVector.DataType colType) {
       throw new IllegalArgumentException("Unsupported data type: " + colType);
     }
   }
+
+  /**
+   * Splits a column (including null values) into a set of columns
+   * according to a set of indices. The caller owns the ColumnVectors and is responsible
+   * for closing them.
+   *
+   * The "split" function divides the input column into multiple intervals
+   * of rows using the splits indices values and it stores the intervals into the
+   * output columns. Regarding the interval of indices, a pair of values are taken
+   * from the indices array in a consecutive manner. The pair of indices are
+   * left-closed and right-open.
+   *
+   * The indices array ('splits') is required to be a monotonic non-decreasing set.
+   * The indices in the array are required to comply with the following conditions:
+   * a, b belongs to Range[0, input column size]
+   * a <= b, where the position of 'a' is less or equal to the position of 'b'.
+   *
+   * The split function will take a pair of indices from the indices array
+   * ('splits') in a consecutive manner. For the first pair, the function will
+   * take the value 0 and the first element of the indices array. For the last pair,
+   * the function will take the last element of the indices array and the size of
+   * the input column.
+   *
+   * Exceptional cases for the indices array are:
+   * When the values in the pair are equal, the function returns an empty column.
+   * When the values in the pair are 'strictly decreasing', the outcome is
+   * undefined.
+   * When any of the values in the pair don't belong to the range[0, input column
+   * size), the outcome is undefined.
+   * When the indices array is empty, an empty vector of columns is returned.
+   *
+   * The output columns may have different sizes. The number of
+   * columns must be equal to the number of indices in the array plus one.
+   *
+   * Example:
+   * input:   {10, 12, 14, 16, 18, 20, 22, 24, 26, 28}
+   * splits: {2, 5, 9}
+   * output:  {{10, 12}, {14, 16, 18}, {20, 22, 24, 26}, {28}}
+   *
+   * Note that this is very similar to the output from a PartitionedTable.
+   *
+   * @param indices the indices to split with
+   * @return A new ColumnVector array with slices from the original ColumnVector
+   */
+  @Override
+  public final ColumnVector[] split(int... indices) {
+    ColumnView[] views = super.split(indices);
+    ColumnVector[] columnVectors = new ColumnVector[views.length];
+    try {
+      for (int i = 0; i < views.length; i++) {
+        columnVectors[i] = views[i].copyToColumnVector();
+      }
+      return columnVectors;
+    } catch (Throwable t) {
+      // Do not leak the ColumnVectors that were created before the failure.
+      for (ColumnVector cv : columnVectors) {
+        if (cv != null) {
+          cv.close();
+        }
+      }
+      throw t;
+    } finally {
+      // The intermediate views were only the source of the copies; release them in every case.
+      for (ColumnView view : views) {
+        view.close();
+      }
+    }
+  }
 }
@@ -551,20 +551,19 @@ public final ColumnVector subVector(int start, int end) {
   }
 
   /**
-   * Splits a column (including null values) into a set of columns
-   * according to a set of indices. The caller owns the ColumnVectors and is responsible
-   * closing them.
+   * Splits a ColumnView (including null values) into a set of ColumnViews
+   * according to a set of indices. No data is moved or copied.
    *
    * The "split" function divides the input column into multiple intervals
    * of rows using the splits indices values and it stores the intervals into the
    * output columns. Regarding the interval of indices, a pair of values are taken
    * from the indices array in a consecutive manner. The pair of indices are
    * left-closed and right-open.
    *
-   * The indices array ('splits') is require to be a monotonic non-decreasing set.
+   * The indices array ('splits') is required to be a monotonic non-decreasing set.
    * The indices in the array are required to comply with the following conditions:
    * a, b belongs to Range[0, input column size]
-   * a <= b, where the position of a is less or equal to the position of b.
+   * a <= b, where the position of 'a' is less or equal to the position of 'b'.
    *
    * The split function will take a pair of indices from the indices array
    * ('splits') in a consecutive manner. For the first pair, the function will
@@ -578,9 +577,9 @@ public final ColumnVector subVector(int start, int end) {
    * undefined.
    * When any of the values in the pair don't belong to the range[0, input column
    * size), the outcome is undefined.
-   * When the indices array is empty, an empty vector of columns is returned.
+   * When the indices array is empty, an empty array of ColumnViews is returned.
    *
-   * The input columns may have different sizes. The number of
+   * The output columns may have different sizes. The number of
    * columns must be equal to the number of indices in the array plus one.
    *
    * Example:
@@ -590,16 +589,19 @@ public final ColumnVector subVector(int start, int end) {
    *
    * Note that this is very similar to the output from a PartitionedTable.
    *
-   * @param indices the indexes to split with
-   * @return A new ColumnVector array with slices from the original ColumnVector
+   * NOTE: Nothing is copied out of the vector, so the slices are only valid for the
+   * lifetime of the underlying ColumnVector.
+   *
+   * @param indices the indices to split with
+   * @return A new ColumnView array with slices from the original ColumnView
    */
-  public final ColumnVector[] split(int... indices) {
+  public ColumnView[] split(int... indices) { // not final: ColumnVector overrides this method
     long[] nativeHandles = split(this.getNativeView(), indices);
-    ColumnVector[] columnVectors = new ColumnVector[nativeHandles.length];
+    ColumnView[] columnViews = new ColumnView[nativeHandles.length];
     for (int i = 0; i < nativeHandles.length; i++) {
-      columnVectors[i] = new ColumnVector(nativeHandles[i]);
+      columnViews[i] = new ColumnView(nativeHandles[i]); // one view object per native handle
     }
-    return columnVectors;
+    return columnViews;
   }
 
   /**

@@ -79,6 +79,32 @@ public Table(ColumnVector... columns) {
     nativeHandle = createCudfTableView(viewPointers);
   }
 
+  /**
+   * Creates a Table that makes a copy of the array of {@link ColumnView}s passed to it.
+   * NOTE: The refcounts on the {@link ColumnVector} pointed to by a {@link ColumnView} will not
+   * be increased, so once the {@link ColumnVector} is deleted this {@link Table} will be useless.
+   * Use {@link #Table(ColumnVector...)} if you want the table to point to ColumnVectors instead.
+   * @param columnsViews - Array of ColumnViews
+   */
+  private Table(ColumnView... columnsViews) {
+    assert columnsViews != null && columnsViews.length > 0 : "ColumnViews can't be null or empty";
+    // The first column fixes the row count that every other column has to match.
+    rows = columnsViews[0].getRowCount();
+    for (int idx = 0; idx < columnsViews.length; idx++) {
+      final ColumnView current = columnsViews[idx];
+      assert (null != current) : "ColumnViews can't be null";
+      assert (rows == current.getRowCount()) : "All columns should have the same number of " +
+          "rows " + current.getType();
+    }
+    // Collect the native view handle of every column for createCudfTableView.
+    final long[] handles = new long[columnsViews.length];
+    int slot = 0;
+    for (ColumnView view : columnsViews) {
+      handles[slot++] = view.getNativeView();
+    }
+    nativeHandle = createCudfTableView(handles);
+  }
+
   /**
    * Create a Table from an array of existing on device cudf::column pointers. Ownership of the
    * columns is transferred to the ColumnVectors held by the new Table. In the case of an exception
@@ -920,6 +946,28 @@ public static TableWriter writeParquetChunked(ParquetWriterOptions options,
     return new ParquetTableWriter(options, consumer);
   }
 
+  /**
+   * This is an evolving API and will most likely be removed in future releases. Please use it
+   * with the caveat that it will not exist in the near future.
+   * @param options the parquet writer options.
+   * @param consumer a class that will be called when host buffers are ready with parquet
+   *                 formatted data in them.
+   * @param columnViews ColumnViews to write to Parquet
+   */
+  public static void writeColumnViewsToParquet(ParquetWriterOptions options,
+                                               HostBufferConsumer consumer,
+                                               ColumnView... columnViews) {
+    // try-with-resources closes the temporary table first and the writer last.
+    try (ParquetTableWriter parquetWriter = new ParquetTableWriter(options, consumer);
+         Table viewTable = new Table(columnViews)) {
+      long deviceBytes = 0;
+      for (int i = 0; i < columnViews.length; i++) {
+        deviceBytes += columnViews[i].getDeviceMemorySize();
+      }
+      writeParquetChunk(parquetWriter.handle, viewTable.nativeHandle, deviceBytes);
+    }
+  }
+
   /**
    * Writes this table to a Parquet file on the host
    *

@@ -52,7 +52,11 @@ message(VERBOSE "CUDF_JNI: Build with GPUDirect Storage support: ${USE_GDS}")
 message(VERBOSE "CUDF_JNI: Build with static Arrow library: ${CUDF_JNI_ARROW_STATIC}")
 
 set(CUDF_SOURCE_DIR "${PROJECT_SOURCE_DIR}/../../../../cpp")
-set(CUDF_CPP_BUILD_DIR "${CUDF_SOURCE_DIR}/build")
+if(NOT DEFINED ENV{CUDF_CPP_BUILD_DIR})
+  set(CUDF_CPP_BUILD_DIR "${CUDF_SOURCE_DIR}/build")
+else()  # an exported CUDF_CPP_BUILD_DIR takes precedence over the default build directory
+  set(CUDF_CPP_BUILD_DIR "$ENV{CUDF_CPP_BUILD_DIR}")
+endif()
 
 set(CMAKE_MODULE_PATH
     "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/"
@@ -111,27 +115,33 @@ endif(CUDA_STATIC_RUNTIME)
 
 ###################################################################################################
 # - Thrust/CUB/libcudacxx ------------------------------------------------------------------------------------
-
 find_path(THRUST_INCLUDE "thrust"
     HINTS "$ENV{CUDF_ROOT}/_deps/thrust-src"
           "${CUDF_CPP_BUILD_DIR}/_deps/thrust-src"
           "$ENV{CONDA_PREFIX}/include")
 
+message(STATUS "THRUST: THRUST_INCLUDE set to ${THRUST_INCLUDE}")  # find_path stores <VAR>-NOTFOUND on failure, so these messages also expose a failed lookup
+
 find_path(CUB_INCLUDE "cub"
     HINTS "$ENV{CUDF_ROOT}/_deps/thrust-src"
           "${CUDF_CPP_BUILD_DIR}/_deps/thrust-src"
           "$ENV{CONDA_PREFIX}/include")
 
+message(STATUS "CUB: CUB_INCLUDE set to ${CUB_INCLUDE}")
+
 find_path(LIBCUDACXX_INCLUDE "cuda"
     HINTS "$ENV{CUDF_ROOT}/_deps/libcudacxx-src/include"
           "${CUDF_CPP_BUILD_DIR}/_deps/libcudacxx-src/include")
 
+message(STATUS "LIBCUDACXX: LIBCUDACXX_INCLUDE set to ${LIBCUDACXX_INCLUDE}")
+
 find_path(SPDLOG_INCLUDE "spdlog"
     HINTS "${CUDF_CPP_BUILD_DIR}/_deps/spdlog-src/include"
           "$ENV{RMM_ROOT}/_deps/spdlog-src/include"
           "$ENV{RMM_ROOT}/include"
           "$ENV{CONDA_PREFIX}/include")
 
+message(STATUS "SPDLOG: SPDLOG_INCLUDE set to ${SPDLOG_INCLUDE}")
 ###################################################################################################
 # - CUDF ------------------------------------------------------------------------------------------
 

@@ -470,14 +470,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_split(JNIEnv *env, j
     }
 
     std::vector<cudf::column_view> result = cudf::split(*n_column, indices);
+
     cudf::jni::native_jlongArray n_result(env, result.size());
-    std::vector<std::unique_ptr<cudf::column>> column_result(result.size());
     for (size_t i = 0; i < result.size(); i++) {
-      column_result[i].reset(new cudf::column(result[i]));
-      n_result[i] = reinterpret_cast<jlong>(column_result[i].get());
-    }
-    for (size_t i = 0; i < result.size(); i++) {
-      column_result[i].release();
+      cudf::column_view const * c = new cudf::column_view(result[i]);
+      n_result[i] = reinterpret_cast<jlong>(c);
     }
     return n_result.get_jArray();
   }