rapidsai · rapids-bot · Feb 14, 2022 · Jan 26, 2022 · Jan 26, 2022 · Jan 26, 2022
@@ -826,18 +826,18 @@ public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... co
   /**
    * Creates a deep copy of a column while replacing the validity mask. The validity mask is the
    * device_vector equivalent of the boolean column given as argument.
-   * 
+   *
    * The boolColumn must have the same number of rows as the current column.
-   * The result column will have the same number of rows as the current column. 
+   * The result column will have the same number of rows as the current column.
    * For all indices `i` where the boolColumn is `true`, the result column will have a valid value at index i.
    * For all other values (i.e. `false` or `null`), the result column will have nulls.
-   * 
+   *
    * If the current column has a null at a given index `i`, and the new validity mask is `true` at index `i`,
    * then the row value is undefined.
-   * 
+   *
    * @param boolColumn bool column whose value is to be used as the validity mask.
    * @return Deep copy of the column with replaced validity mask.
-   */    
+   */
   public final ColumnVector copyWithBooleanColumnAsValidity(ColumnView boolColumn) {
     return new ColumnVector(copyWithBooleanColumnAsValidity(getNativeView(), boolColumn.getNativeView()));
   }
@@ -2345,88 +2345,128 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) {
   }
 
   /**
-   * Returns a list of columns by splitting each string using the specified delimiter.
-   * The number of rows in the output columns will be the same as the input column.
-   * Null entries are added for a row where split results have been exhausted.
-   * Null string entries return corresponding null output columns.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
-   *                  An empty string indicates split on whitespace.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
-   * @return New table of strings columns.
+   * Returns a list of columns by splitting each string using the specified pattern. The number of
+   * rows in the output columns will be the same as the input column. Null entries are added for a
+   * row where split results have been exhausted. Null input entries result in all nulls in the
+   * corresponding rows of the output columns.
+   *
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   * @return list of strings columns as a table.
    */
-  public final Table stringSplit(Scalar delimiter, int maxSplit) {
+  public final Table stringSplit(String pattern, int limit, boolean splitByRegex) {
     assert type.equals(DType.STRING) : "column type must be a String";
-    assert delimiter != null : "delimiter may not be null";
-    assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar";
-    return new Table(stringSplit(this.getNativeView(), delimiter.getScalarHandle(), maxSplit));
+    assert pattern != null : "pattern is null";
+    assert pattern.length() > 0 : "empty pattern is not supported";
+    assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported";
+    return new Table(stringSplit(this.getNativeView(), pattern, limit, splitByRegex));
   }
-  
+
   /**
-   * Returns a list of columns by splitting each string using the specified delimiter.
-   * The number of rows in the output columns will be the same as the input column.
-   * Null entries are added for a row where split results have been exhausted.
-   * Null string entries return corresponding null output columns.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
-   *                  An empty string indicates split on whitespace.
-   * @return New table of strings columns.
+   * Returns a list of columns by splitting each string using the specified pattern. The number of
+   * rows in the output columns will be the same as the input column. Null entries are added for a
+   * row where split results have been exhausted. Null input entries result in all nulls in the
+   * corresponding rows of the output columns.
+   *
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   * @return list of strings columns as a table.
    */
-  public final Table stringSplit(Scalar delimiter) {
-    return stringSplit(delimiter, -1);
+  public final Table stringSplit(String pattern, boolean splitByRegex) {
+    return stringSplit(pattern, -1, splitByRegex);
   }
 
   /**
-   * Returns a list of columns by splitting each string using whitespace as the delimiter.
-   * The number of rows in the output columns will be the same as the input column.
-   * Null entries are added for a row where split results have been exhausted.
-   * Null string entries return corresponding null output columns.
-   * @return New table of strings columns.
+   * Returns a list of columns by splitting each string using the specified string literal
+   * delimiter. The number of rows in the output columns will be the same as the input column.
+   * Null entries are added for a row where split results have been exhausted. Null input entries
+   * result in all nulls in the corresponding rows of the output columns.
+   *
+   * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @return list of strings columns as a table.
    */
-  public final Table stringSplit() {
-    try (Scalar emptyString = Scalar.fromString("")) {
-      return stringSplit(emptyString, -1);
-    }
+  public final Table stringSplit(String delimiter, int limit) {
+    return stringSplit(delimiter, limit, false);
   }
 
   /**
-   * Returns a column of lists of strings by splitting each string using whitespace as the delimiter.
+   * Returns a list of columns by splitting each string using the specified string literal
+   * delimiter. The number of rows in the output columns will be the same as the input column.
+   * Null entries are added for a row where split results have been exhausted. Null input entries
+   * result in all nulls in the corresponding rows of the output columns.
+   *
+   * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string.
+   * @return list of strings columns as a table.
    */
-  public final ColumnVector stringSplitRecord() {
-    return stringSplitRecord(-1);
+  public final Table stringSplit(String delimiter) {
+    return stringSplit(delimiter, -1, false);
   }
 
   /**
-   * Returns a column of lists of strings by splitting each string using whitespace as the delimiter.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * Returns a column of lists of strings, in which each list is made by splitting the
+   * corresponding input string using the specified pattern.
+   *
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   * @return a LIST column of string elements.
    */
-  public final ColumnVector stringSplitRecord(int maxSplit) {
-    try (Scalar emptyString = Scalar.fromString("")) {
-      return stringSplitRecord(emptyString, maxSplit);
-    }
+  public final ColumnVector stringSplitRecord(String pattern, int limit, boolean splitByRegex) {
+    assert type.equals(DType.STRING) : "column type must be String";
+    assert pattern != null : "pattern is null";
+    assert pattern.length() > 0 : "empty pattern is not supported";
+    assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported";
+    return new ColumnVector(
+        stringSplitRecord(this.getNativeView(), pattern, limit, splitByRegex));
+  }
+
+  /**
+   * Returns a column of lists of strings, in which each list is made by splitting the
+   * corresponding input string using the specified pattern.
+   *
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   * @return a LIST column of string elements.
+   */
+  public final ColumnVector stringSplitRecord(String pattern, boolean splitByRegex) {
+    return stringSplitRecord(pattern, -1, splitByRegex);
   }
 
   /**
-   * Returns a column of lists of strings by splitting each string using the specified delimiter.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
-   *                  An empty string indicates split on whitespace.
+   * Returns a column of lists of strings, in which each list is made by splitting the
+   * corresponding input string using the specified string literal delimiter.
+   *
+   * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @return a LIST column of string elements.
    */
-  public final ColumnVector stringSplitRecord(Scalar delimiter) {
-    return stringSplitRecord(delimiter, -1);
+  public final ColumnVector stringSplitRecord(String delimiter, int limit) {
+    return stringSplitRecord(delimiter, limit, false);
   }
 
   /**
-   * Returns a column that is a list of strings. Each string list is made by splitting each input
-   * string using the specified delimiter.
-   * @param delimiter UTF-8 encoded string identifying the split points in each string.
-   *                  An empty string indicates split on whitespace.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
-   * @return New table of strings columns.
+   * Returns a column of lists of strings, in which each list is made by splitting the
+   * corresponding input string using the specified string literal delimiter.
+   *
+   * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string.
+   * @return a LIST column of string elements.
    */
-  public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit) {
-    assert type.equals(DType.STRING) : "column type must be a String";
-    assert delimiter != null : "delimiter may not be null";
-    assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar";
-    return new ColumnVector(
-        stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit));
+  public final ColumnVector stringSplitRecord(String delimiter) {
+    return stringSplitRecord(delimiter, -1, false);
   }
 
   /**
@@ -3248,7 +3288,7 @@ public enum FindOptions {FIND_FIRST, FIND_LAST};
    * Create a column of int32 indices, indicating the position of the scalar search key
    * in each list row.
    * All indices are 0-based. If a search key is not found, the index is set to -1.
-   * The index is set to null if one of the following is true: 
+   * The index is set to null if one of the following is true:
    * 1. The search key is null.
    * 2. The list row is null.
    * @param key The scalar search key
@@ -3265,7 +3305,7 @@ public final ColumnVector listIndexOf(Scalar key, FindOptions findOption) {
    * Create a column of int32 indices, indicating the position of each row in the
    * search key column in the corresponding row of the lists column.
    * All indices are 0-based. If a search key is not found, the index is set to -1.
-   * The index is set to null if one of the following is true: 
+   * The index is set to null if one of the following is true:
    * 1. The search key row is null.
    * 2. The list row is null.
    * @param keys ColumnView of search keys.
@@ -3531,15 +3571,36 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle
   private static native long substringLocate(long columnView, long substringScalar, int start, int end);
 
   /**
-   * Native method which returns array of columns by splitting each string using the specified
-   * delimiter.
-   * @param columnView native handle of the cudf::column_view being operated on.
-   * @param delimiter  UTF-8 encoded string identifying the split points in each string.
-   * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits.
+   * Returns a list of columns by splitting each string using the specified pattern. The number of
+   * rows in the output columns will be the same as the input column. Null entries are added for a
+   * row where split results have been exhausted. Null input entries result in all nulls in the
+   * corresponding rows of the output columns.
+   *
+   * @param nativeHandle native handle of the input strings column that is being operated on.
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
    */
-  private static native long[] stringSplit(long columnView, long delimiter, int maxSplit);
+  private static native long[] stringSplit(long nativeHandle, String pattern, int limit,
+                                           boolean splitByRegex);
 
-  private static native long stringSplitRecord(long nativeView, long scalarHandle, int maxSplit);
+  /**
+   * Returns a column of lists of strings, in which each list is made by splitting the
+   * corresponding input string using the specified pattern.
+   *
+   * @param nativeHandle native handle of the input strings column that is being operated on.
+   * @param pattern UTF-8 encoded string identifying the split pattern for each input string.
+   * @param limit the maximum size of the list resulting from splitting each input string,
+   *              or -1 for all possible splits. Note that limit = 0 (all possible splits without
+   *              trailing empty strings) and limit = 1 (no split at all) are not supported.
+   * @param splitByRegex a boolean flag indicating whether the input strings will be split by a
+   *                     regular expression pattern or just by a string literal delimiter.
+   */
+  private static native long stringSplitRecord(long nativeHandle, String pattern, int limit,
+                                               boolean splitByRegex);
 
   /**
    * Native method to calculate substring from a given string column. 0 indexing.
@@ -3714,7 +3775,7 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat
   /**
    * Native method to search list rows for null elements.
    * @param nativeView the column view handle of the list
-   * @return column handle of the resultant boolean column 
+   * @return column handle of the resultant boolean column
    */
   private static native long listContainsNulls(long nativeView);
 
@@ -3896,20 +3957,20 @@ private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] vi
   /**
    * Native method to deep copy a column while replacing the null mask. The null mask is the
    * device_vector equivalent of the boolean column given as argument.
-   * 
+   *
    * The boolColumn must have the same number of rows as the exemplar column.
    * The result column will have the same number of rows as the exemplar.
    * For all indices `i` where the boolean column is `true`, the result column will have a valid value at index i.
    * For all other values (i.e. `false` or `null`), the result column will have nulls.
-   * 
+   *
    * If the exemplar column has a null at a given index `i`, and the new validity mask is `true` at index `i`,
    * then the resultant row value is undefined.
-   * 
+   *
    * @param exemplarViewHandle column view of the column that is deep copied.
    * @param boolColumnViewHandle bool column whose value is to be used as the null mask.
    * @return Deep copy of the column with replaced null mask.
-   */                                                      
-  private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, 
+   */
+  private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle,
                                                              long boolColumnViewHandle) throws CudfException;
 
   ////////