Add JNI for strings::split_re and strings::split_record_re (#10139)

This PR adds Java binding for the new strings API `strings::split_re` and `strings::split_record_re`, which allows splitting strings by regular expression delimiters. In addition, the Java string split overloads with default split pattern (an empty string) are removed in this PR. That is because with default empty pattern the Java's split API produces different results than cudf. Finally, some cleanup has been perform automatically thanks to IntelliJ IDE. Depends on #10128. This is breaking change which is fixed by NVIDIA/spark-rapids#4714. Thus, it should be merged at the same time with NVIDIA/spark-rapids#4714. Authors: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Andy Grove (https://github.com/andygrove) URL: #10139
rapidsai · Feb 14, 2022 · 7025c40 · 7025c40
1 parent 463266f
commit 7025c40
Show file tree

Hide file tree

Showing 3 changed files with 283 additions and 142 deletions.
diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -826,18 +826,18 @@ public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... co
   /**
    * Creates a deep copy of a column while replacing the validity mask. The validity mask is the
    * device_vector equivalent of the boolean column given as argument.
-   * 
+   *
    * The boolColumn must have the same number of rows as the current column.
-   * The result column will have the same number of rows as the current column. 
+   * The result column will have the same number of rows as the current column.
    * For all indices `i` where the boolColumn is `true`, the result column will have a valid value at index i.
    * For all other values (i.e. `false` or `null`), the result column will have nulls.
-   * 
+   *
    * If the current column has a null at a given index `i`, and the new validity mask is `true` at index `i`,
    * then the row value is undefined.
-   * 
+   *
    * @param boolColumn bool column whose value is to be used as the validity mask.
    * @return Deep copy of the column with replaced validity mask.
-   */    
+   */
   public final ColumnVector copyWithBooleanColumnAsValidity(ColumnView boolColumn) {
     return new ColumnVector(copyWithBooleanColumnAsValidity(getNativeView(), boolColumn.getNativeView()));
   }
@@ -2345,88 +2345,128 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) {
   }
 
   /**
-   * Returns a list of columns by splitting each string using the specified delimiter.
-   * The number of rows in the output columns will be the same as the input column.
-   * Null entries are added for a row where split results have been exhausted.
-   * Null string entries return corresponding null output columns.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
-   *                  An empty string indicates split on whitespace.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
-   * @return New table of strings columns.
+   * Returns a list of columns by splitting each string using the specified pattern. The number of
+   * rows in the output columns will be the same as the input column. Null entries are added for a
+   * row where split results have been exhausted. Null input entries result in all nulls in the
+   * corresponding rows of the output columns.
+   *
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   * @return list of strings columns as a table.
    */
-  public final Table stringSplit(Scalar delimiter, int maxSplit) {
+  public final Table stringSplit(String pattern, int limit, boolean splitByRegex) {
     assert type.equals(DType.STRING) : "column type must be a String";
-    assert delimiter != null : "delimiter may not be null";
-    assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar";
-    return new Table(stringSplit(this.getNativeView(), delimiter.getScalarHandle(), maxSplit));
+    assert pattern != null : "pattern is null";
+    assert pattern.length() > 0 : "empty pattern is not supported";
+    assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported";
+    return new Table(stringSplit(this.getNativeView(), pattern, limit, splitByRegex));
   }
-  
+
   /**
-   * Returns a list of columns by splitting each string using the specified delimiter.
-   * The number of rows in the output columns will be the same as the input column.
-   * Null entries are added for a row where split results have been exhausted.
-   * Null string entries return corresponding null output columns.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
-   *                  An empty string indicates split on whitespace.
-   * @return New table of strings columns.
+   * Returns a list of columns by splitting each string using the specified pattern. The number of
+   * rows in the output columns will be the same as the input column. Null entries are added for a
+   * row where split results have been exhausted. Null input entries result in all nulls in the
+   * corresponding rows of the output columns.
+   *
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   * @return list of strings columns as a table.
    */
-  public final Table stringSplit(Scalar delimiter) {
-    return stringSplit(delimiter, -1);
+  public final Table stringSplit(String pattern, boolean splitByRegex) {
+    return stringSplit(pattern, -1, splitByRegex);
   }
 
   /**
-   * Returns a list of columns by splitting each string using whitespace as the delimiter.
-   * The number of rows in the output columns will be the same as the input column.
-   * Null entries are added for a row where split results have been exhausted.
-   * Null string entries return corresponding null output columns.
-   * @return New table of strings columns.
+   * Returns a list of columns by splitting each string using the specified string literal
+   * delimiter. The number of rows in the output columns will be the same as the input column.
+   * Null entries are added for a row where split results have been exhausted. Null input entries
+   * result in all nulls in the corresponding rows of the output columns.
+   *
+   * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @return list of strings columns as a table.
    */
-  public final Table stringSplit() {
-    try (Scalar emptyString = Scalar.fromString("")) {
-      return stringSplit(emptyString, -1);
-    }
+  public final Table stringSplit(String delimiter, int limit) {
+    return stringSplit(delimiter, limit, false);
   }
 
   /**
-   * Returns a column of lists of strings by splitting each string using whitespace as the delimiter.
+   * Returns a list of columns by splitting each string using the specified string literal
+   * delimiter. The number of rows in the output columns will be the same as the input column.
+   * Null entries are added for a row where split results have been exhausted. Null input entries
+   * result in all nulls in the corresponding rows of the output columns.
+   *
+   * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string.
+   * @return list of strings columns as a table.
    */
-  public final ColumnVector stringSplitRecord() {
-    return stringSplitRecord(-1);
+  public final Table stringSplit(String delimiter) {
+    return stringSplit(delimiter, -1, false);
   }
 
   /**
-   * Returns a column of lists of strings by splitting each string using whitespace as the delimiter.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * Returns a column that are lists of strings in which each list is made by splitting the
+   * corresponding input string using the specified pattern.
+   *
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   * @return a LIST column of string elements.
    */
-  public final ColumnVector stringSplitRecord(int maxSplit) {
-    try (Scalar emptyString = Scalar.fromString("")) {
-      return stringSplitRecord(emptyString, maxSplit);
-    }
+  public final ColumnVector stringSplitRecord(String pattern, int limit, boolean splitByRegex) {
+    assert type.equals(DType.STRING) : "column type must be String";
+    assert pattern != null : "pattern is null";
+    assert pattern.length() > 0 : "empty pattern is not supported";
+    assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported";
+    return new ColumnVector(
+        stringSplitRecord(this.getNativeView(), pattern, limit, splitByRegex));
+  }
+
+  /**
+   * Returns a column that are lists of strings in which each list is made by splitting the
+   * corresponding input string using the specified pattern.
+   *
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   * @return a LIST column of string elements.
+   */
+  public final ColumnVector stringSplitRecord(String pattern, boolean splitByRegex) {
+    return stringSplitRecord(pattern, -1, splitByRegex);
   }
 
   /**
-   * Returns a column of lists of strings by splitting each string using the specified delimiter.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
-   *                  An empty string indicates split on whitespace.
+   * Returns a column that are lists of strings in which each list is made by splitting the
+   * corresponding input string using the specified string literal delimiter.
+   *
+   * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @return a LIST column of string elements.
    */
-  public final ColumnVector stringSplitRecord(Scalar delimiter) {
-    return stringSplitRecord(delimiter, -1);
+  public final ColumnVector stringSplitRecord(String delimiter, int limit) {
+    return stringSplitRecord(delimiter, limit, false);
   }
 
   /**
-   * Returns a column that is a list of strings. Each string list is made by splitting each input
-   * string using the specified delimiter.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
-   *                  An empty string indicates split on whitespace.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
-   * @return New table of strings columns.
+   * Returns a column that are lists of strings in which each list is made by splitting the
+   * corresponding input string using the specified string literal delimiter.
+   *
+   * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string.
+   * @return a LIST column of string elements.
    */
-  public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit) {
-    assert type.equals(DType.STRING) : "column type must be a String";
-    assert delimiter != null : "delimiter may not be null";
-    assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar";
-    return new ColumnVector(
-        stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit));
+  public final ColumnVector stringSplitRecord(String delimiter) {
+    return stringSplitRecord(delimiter, -1, false);
   }
 
   /**
@@ -3248,7 +3288,7 @@ public enum FindOptions {FIND_FIRST, FIND_LAST};
    * Create a column of int32 indices, indicating the position of the scalar search key
    * in each list row.
    * All indices are 0-based. If a search key is not found, the index is set to -1.
-   * The index is set to null if one of the following is true: 
+   * The index is set to null if one of the following is true:
    * 1. The search key is null.
    * 2. The list row is null.
    * @param key The scalar search key
@@ -3265,7 +3305,7 @@ public final ColumnVector listIndexOf(Scalar key, FindOptions findOption) {
    * Create a column of int32 indices, indicating the position of each row in the
    * search key column in the corresponding row of the lists column.
    * All indices are 0-based. If a search key is not found, the index is set to -1.
-   * The index is set to null if one of the following is true: 
+   * The index is set to null if one of the following is true:
    * 1. The search key row is null.
    * 2. The list row is null.
    * @param keys ColumnView of search keys.
@@ -3531,15 +3571,36 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle
   private static native long substringLocate(long columnView, long substringScalar, int start, int end);
 
   /**
-   * Native method which returns array of columns by splitting each string using the specified
-   * delimiter.
-   * @param columnView native handle of the cudf::column_view being operated on.
-   * @param delimiter  UTF-8 encoded string identifying the split points in each string.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * Returns a list of columns by splitting each string using the specified pattern. The number of
+   * rows in the output columns will be the same as the input column. Null entries are added for a
+   * row where split results have been exhausted. Null input entries result in all nulls in the
+   * corresponding rows of the output columns.
+   *
+   * @param nativeHandle native handle of the input strings column that being operated on.
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
    */
-  private static native long[] stringSplit(long columnView, long delimiter, int maxSplit);
+  private static native long[] stringSplit(long nativeHandle, String pattern, int limit,
+                                           boolean splitByRegex);
 
-  private static native long stringSplitRecord(long nativeView, long scalarHandle, int maxSplit);
+  /**
+   * Returns a column that are lists of strings in which each list is made by splitting the
+   * corresponding input string using the specified string literal delimiter.
+   *
+   * @param nativeHandle native handle of the input strings column that being operated on.
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   */
+  private static native long stringSplitRecord(long nativeHandle, String pattern, int limit,
+                                               boolean splitByRegex);
 
   /**
    * Native method to calculate substring from a given string column. 0 indexing.
@@ -3714,7 +3775,7 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat
   /**
    * Native method to search list rows for null elements.
    * @param nativeView the column view handle of the list
-   * @return column handle of the resultant boolean column 
+   * @return column handle of the resultant boolean column
    */
   private static native long listContainsNulls(long nativeView);
 
@@ -3896,20 +3957,20 @@ private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] vi
   /**
    * Native method to deep copy a column while replacing the null mask. The null mask is the
    * device_vector equivalent of the boolean column given as argument.
-   * 
+   *
    * The boolColumn must have the same number of rows as the exemplar column.
    * The result column will have the same number of rows as the exemplar.
    * For all indices `i` where the boolean column is `true`, the result column will have a valid value at index i.
    * For all other values (i.e. `false` or `null`), the result column will have nulls.
-   * 
+   *
    * If the exemplar column has a null at a given index `i`, and the new validity mask is `true` at index `i`,
    * then the resultant row value is undefined.
-   * 
+   *
    * @param exemplarViewHandle column view of the column that is deep copied.
    * @param boolColumnViewHandle bool column whose value is to be used as the null mask.
    * @return Deep copy of the column with replaced null mask.
-   */                                                      
-  private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, 
+   */
+  private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle,
                                                              long boolColumnViewHandle) throws CudfException;
 
   ////////