From 7025c40c16bfea9b37ac9110f62b7b23a2c51725 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 14 Feb 2022 09:01:45 -0700 Subject: [PATCH] Add JNI for `strings::split_re` and `strings::split_record_re` (#10139) This PR adds Java bindings for the new strings APIs `strings::split_re` and `strings::split_record_re`, which allow splitting strings by regular expression delimiters. In addition, the Java string split overloads with a default split pattern (an empty string) are removed in this PR. That is because, with an empty pattern, Java's split API produces different results than cudf. Finally, some cleanup has been performed automatically thanks to the IntelliJ IDE. Depends on https://github.com/rapidsai/cudf/pull/10128. This is a breaking change which is fixed by https://github.com/NVIDIA/spark-rapids/pull/4714. Thus, it should be merged at the same time as https://github.com/NVIDIA/spark-rapids/pull/4714. Authors: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Andy Grove (https://github.com/andygrove) URL: https://github.com/rapidsai/cudf/pull/10139 --- .../main/java/ai/rapids/cudf/ColumnView.java | 215 +++++++++++------- java/src/main/native/src/ColumnViewJni.cpp | 81 +++++-- .../java/ai/rapids/cudf/ColumnVectorTest.java | 129 +++++++---- 3 files changed, 283 insertions(+), 142 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 3ff2a370e4f..f91ee5535b1 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -826,18 +826,18 @@ public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... co /** * Creates a deep copy of a column while replacing the validity mask. The validity mask is the * device_vector equivalent of the boolean column given as argument. 
- * + * * The boolColumn must have the same number of rows as the current column. - * The result column will have the same number of rows as the current column. + * The result column will have the same number of rows as the current column. * For all indices `i` where the boolColumn is `true`, the result column will have a valid value at index i. * For all other values (i.e. `false` or `null`), the result column will have nulls. - * + * * If the current column has a null at a given index `i`, and the new validity mask is `true` at index `i`, * then the row value is undefined. - * + * * @param boolColumn bool column whose value is to be used as the validity mask. * @return Deep copy of the column with replaced validity mask. - */ + */ public final ColumnVector copyWithBooleanColumnAsValidity(ColumnView boolColumn) { return new ColumnVector(copyWithBooleanColumnAsValidity(getNativeView(), boolColumn.getNativeView())); } @@ -2345,88 +2345,128 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) { } /** - * Returns a list of columns by splitting each string using the specified delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified pattern. The number of + * rows in the output columns will be the same as the input column. Null entries are added for a + * row where split results have been exhausted. Null input entries result in all nulls in the + * corresponding rows of the output columns. 
+ * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return list of strings columns as a table. */ - public final Table stringSplit(Scalar delimiter, int maxSplit) { + public final Table stringSplit(String pattern, int limit, boolean splitByRegex) { assert type.equals(DType.STRING) : "column type must be a String"; - assert delimiter != null : "delimiter may not be null"; - assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; - return new Table(stringSplit(this.getNativeView(), delimiter.getScalarHandle(), maxSplit)); + assert pattern != null : "pattern is null"; + assert pattern.length() > 0 : "empty pattern is not supported"; + assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported"; + return new Table(stringSplit(this.getNativeView(), pattern, limit, splitByRegex)); } - + /** - * Returns a list of columns by splitting each string using the specified delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified pattern. The number of + * rows in the output columns will be the same as the input column. 
Null entries are added for a + * row where split results have been exhausted. Null input entries result in all nulls in the + * corresponding rows of the output columns. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return list of strings columns as a table. */ - public final Table stringSplit(Scalar delimiter) { - return stringSplit(delimiter, -1); + public final Table stringSplit(String pattern, boolean splitByRegex) { + return stringSplit(pattern, -1, splitByRegex); } /** - * Returns a list of columns by splitting each string using whitespace as the delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified string literal + * delimiter. The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. Null input entries + * result in all nulls in the corresponding rows of the output columns. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @return list of strings columns as a table. 
*/ - public final Table stringSplit() { - try (Scalar emptyString = Scalar.fromString("")) { - return stringSplit(emptyString, -1); - } + public final Table stringSplit(String delimiter, int limit) { + return stringSplit(delimiter, limit, false); } /** - * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. + * Returns a list of columns by splitting each string using the specified string literal + * delimiter. The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. Null input entries + * result in all nulls in the corresponding rows of the output columns. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @return list of strings columns as a table. */ - public final ColumnVector stringSplitRecord() { - return stringSplitRecord(-1); + public final Table stringSplit(String delimiter) { + return stringSplit(delimiter, -1, false); } /** - * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + * Returns a column that are lists of strings in which each list is made by splitting the + * corresponding input string using the specified pattern. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return a LIST column of string elements. 
*/ - public final ColumnVector stringSplitRecord(int maxSplit) { - try (Scalar emptyString = Scalar.fromString("")) { - return stringSplitRecord(emptyString, maxSplit); - } + public final ColumnVector stringSplitRecord(String pattern, int limit, boolean splitByRegex) { + assert type.equals(DType.STRING) : "column type must be String"; + assert pattern != null : "pattern is null"; + assert pattern.length() > 0 : "empty pattern is not supported"; + assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported"; + return new ColumnVector( + stringSplitRecord(this.getNativeView(), pattern, limit, splitByRegex)); + } + + /** + * Returns a column that are lists of strings in which each list is made by splitting the + * corresponding input string using the specified pattern. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return a LIST column of string elements. + */ + public final ColumnVector stringSplitRecord(String pattern, boolean splitByRegex) { + return stringSplitRecord(pattern, -1, splitByRegex); } /** - * Returns a column of lists of strings by splitting each string using the specified delimiter. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace. + * Returns a column that are lists of strings in which each list is made by splitting the + * corresponding input string using the specified string literal delimiter. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. 
Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @return a LIST column of string elements. */ - public final ColumnVector stringSplitRecord(Scalar delimiter) { - return stringSplitRecord(delimiter, -1); + public final ColumnVector stringSplitRecord(String delimiter, int limit) { + return stringSplitRecord(delimiter, limit, false); } /** - * Returns a column that is a list of strings. Each string list is made by splitting each input - * string using the specified delimiter. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. - * @return New table of strings columns. + * Returns a column that are lists of strings in which each list is made by splitting the + * corresponding input string using the specified string literal delimiter. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @return a LIST column of string elements. */ - public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit) { - assert type.equals(DType.STRING) : "column type must be a String"; - assert delimiter != null : "delimiter may not be null"; - assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; - return new ColumnVector( - stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit)); + public final ColumnVector stringSplitRecord(String delimiter) { + return stringSplitRecord(delimiter, -1, false); } /** @@ -3248,7 +3288,7 @@ public enum FindOptions {FIND_FIRST, FIND_LAST}; * Create a column of int32 indices, indicating the position of the scalar search key * in each list row. * All indices are 0-based. If a search key is not found, the index is set to -1. 
- * The index is set to null if one of the following is true: + * The index is set to null if one of the following is true: * 1. The search key is null. * 2. The list row is null. * @param key The scalar search key @@ -3265,7 +3305,7 @@ public final ColumnVector listIndexOf(Scalar key, FindOptions findOption) { * Create a column of int32 indices, indicating the position of each row in the * search key column in the corresponding row of the lists column. * All indices are 0-based. If a search key is not found, the index is set to -1. - * The index is set to null if one of the following is true: + * The index is set to null if one of the following is true: * 1. The search key row is null. * 2. The list row is null. * @param keys ColumnView of search keys. @@ -3531,15 +3571,36 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle private static native long substringLocate(long columnView, long substringScalar, int start, int end); /** - * Native method which returns array of columns by splitting each string using the specified - * delimiter. - * @param columnView native handle of the cudf::column_view being operated on. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + * Returns a list of columns by splitting each string using the specified pattern. The number of + * rows in the output columns will be the same as the input column. Null entries are added for a + * row where split results have been exhausted. Null input entries result in all nulls in the + * corresponding rows of the output columns. + * + * @param nativeHandle native handle of the input strings column that being operated on. + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. 
Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. */ - private static native long[] stringSplit(long columnView, long delimiter, int maxSplit); + private static native long[] stringSplit(long nativeHandle, String pattern, int limit, + boolean splitByRegex); - private static native long stringSplitRecord(long nativeView, long scalarHandle, int maxSplit); + /** + * Returns a column of lists of strings in which each list is made by splitting the + * corresponding input string using the specified pattern. + * + * @param nativeHandle native handle of the input strings column that is being operated on. + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + */ + private static native long stringSplitRecord(long nativeHandle, String pattern, int limit, + boolean splitByRegex); /** * Native method to calculate substring from a given string column. 0 indexing. @@ -3714,7 +3775,7 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat /** * Native method to search list rows for null elements. 
* @param nativeView the column view handle of the list - * @return column handle of the resultant boolean column + * @return column handle of the resultant boolean column */ private static native long listContainsNulls(long nativeView); @@ -3896,20 +3957,20 @@ private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] vi /** * Native method to deep copy a column while replacing the null mask. The null mask is the * device_vector equivalent of the boolean column given as argument. - * + * * The boolColumn must have the same number of rows as the exemplar column. * The result column will have the same number of rows as the exemplar. * For all indices `i` where the boolean column is `true`, the result column will have a valid value at index i. * For all other values (i.e. `false` or `null`), the result column will have nulls. - * + * * If the exemplar column has a null at a given index `i`, and the new validity mask is `true` at index `i`, * then the resultant row value is undefined. - * + * * @param exemplarViewHandle column view of the column that is deep copied. * @param boolColumnViewHandle bool column whose value is to be used as the null mask. * @return Deep copy of the column with replaced null mask. 
- */ - private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, + */ + private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, long boolColumnViewHandle) throws CudfException; //////// diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index eec4a78a457..548844aa0d3 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -561,34 +562,78 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, - jlong column_view, - jlong delimiter_ptr, - jint max_split) { - JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, delimiter_ptr, "string scalar delimiter is null", 0); + jlong input_handle, + jstring pattern_obj, + jint limit, + jboolean split_by_regex) { + JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); + + if (limit == 0 || limit == 1) { + // Cannot achieve the results of splitting with limit == 0 or limit == 1. + // This is because cudf operates on a different parameter (`max_split`) which is converted from + // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an + // unlimited split. 
+ JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "limit == 0 and limit == 1 are not supported", 0); + } + try { cudf::jni::auto_set_device(env); - cudf::strings_column_view const scv{*reinterpret_cast(column_view)}; - auto delimiter = reinterpret_cast(delimiter_ptr); + auto const input = reinterpret_cast(input_handle); + auto const strs_input = cudf::strings_column_view{*input}; - return cudf::jni::convert_table_for_return(env, - cudf::strings::split(scv, *delimiter, max_split)); + auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj); + if (pattern_jstr.is_empty()) { + // Java's split API produces different behaviors than cudf when splitting with empty + // pattern. + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0); + } + + auto const pattern = std::string(pattern_jstr.get(), pattern_jstr.size_bytes()); + auto const max_split = limit > 1 ? limit - 1 : limit; + auto result = split_by_regex ? + cudf::strings::split_re(strs_input, pattern, max_split) : + cudf::strings::split(strs_input, cudf::string_scalar{pattern}, max_split); + return cudf::jni::convert_table_for_return(env, std::move(result)); } CATCH_STD(env, 0); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass, - jlong column_view, - jlong delimiter, - jint max_split) { - JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0); + jlong input_handle, + jstring pattern_obj, + jint limit, + jboolean split_by_regex) { + JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); + + if (limit == 0 || limit == 1) { + // Cannot achieve the results of splitting with limit == 0 or limit == 1. + // This is because cudf operates on a different parameter (`max_split`) which is converted from + // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an + // unlimited split. 
+ JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "limit == 0 and limit == 1 are not supported", 0); + } + try { cudf::jni::auto_set_device(env); - cudf::column_view *cv = reinterpret_cast(column_view); - cudf::strings_column_view scv(*cv); - cudf::string_scalar *ss_scalar = reinterpret_cast(delimiter); - return release_as_jlong(cudf::strings::split_record(scv, *ss_scalar, max_split)); + auto const input = reinterpret_cast(input_handle); + auto const strs_input = cudf::strings_column_view{*input}; + + auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj); + if (pattern_jstr.is_empty()) { + // Java's split API produces different behaviors than cudf when splitting with empty + // pattern. + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0); + } + + auto const pattern = std::string(pattern_jstr.get(), pattern_jstr.size_bytes()); + auto const max_split = limit > 1 ? limit - 1 : limit; + auto result = + split_by_regex ? + cudf::strings::split_record_re(strs_input, pattern, max_split) : + cudf::strings::split_record(strs_input, cudf::string_scalar{pattern}, max_split); + return release_as_jlong(result); } CATCH_STD(env, 0); } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 9c00cdbc084..b759c746735 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4364,10 +4364,10 @@ void testExtractListElements() { ColumnVector expected = ColumnVector.fromStrings("Héllo", "thésé", null, - null, + "", "ARé", "test"); - ColumnVector tmp = v.stringSplitRecord(); + ColumnVector tmp = v.stringSplitRecord(" "); ColumnVector result = tmp.extractListElement(0)) { assertColumnsAreEqual(expected, result); } @@ -4761,28 +4761,12 @@ void testListSortRowsWithStringChild() { } } - @Test - void testStringSplitRecord() { - try (ColumnVector v = 
ColumnVector.fromStrings("Héllo there", "thésé", "null", "", "ARé some", "test strings"); - ColumnVector expected = ColumnVector.fromLists( - new HostColumnVector.ListType(true, - new HostColumnVector.BasicType(true, DType.STRING)), - Arrays.asList("Héllo", "there"), - Arrays.asList("thésé"), - Arrays.asList("null"), - Arrays.asList(""), - Arrays.asList("ARé", "some"), - Arrays.asList("test", "strings")); - Scalar pattern = Scalar.fromString(" "); - ColumnVector result = v.stringSplitRecord(pattern, -1)) { - assertColumnsAreEqual(expected, result); - } - } - @Test void testStringSplit() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", "ARé some things", "test strings here"); - Table expectedSplitOnce = new Table.TestBuilder() + String pattern = " "; + try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", + "ARé some things", "test strings here"); + Table expectedSplitLimit2 = new Table.TestBuilder() .column("Héllo", "thésé", null, "", "ARé", "test") .column("there all", null, null, null, "some things", "strings here") .build(); @@ -4791,41 +4775,92 @@ void testStringSplit() { .column("there", null, null, null, "some", "strings") .column("all", null, null, null, "things", "here") .build(); - Scalar pattern = Scalar.fromString(" "); - Table resultSplitOnce = v.stringSplit(pattern, 1); + Table resultSplitLimit2 = v.stringSplit(pattern, 2); Table resultSplitAll = v.stringSplit(pattern)) { - assertTablesAreEqual(expectedSplitOnce, resultSplitOnce); + assertTablesAreEqual(expectedSplitLimit2, resultSplitLimit2); assertTablesAreEqual(expectedSplitAll, resultSplitAll); } } @Test - void teststringSplitWhiteSpace() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo thesé", null, "are\tsome", "tést\nString", " "); - Table expected = new Table.TestBuilder().column("Héllo", null, "are", "tést", null) - .column("thesé", null, "some", "String", null) - .build(); - Table result = v.stringSplit()) { - 
assertTablesAreEqual(expected, result); + void testStringSplitByRegularExpression() { + String pattern = "[_ ]"; + try (ColumnVector v = ColumnVector.fromStrings("Héllo_there all", "thésé", null, "", + "ARé some_things", "test_strings_here"); + Table expectedSplitLimit2 = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") + .column("there all", null, null, null, "some_things", "strings_here") + .build(); + Table expectedSplitAll = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") + .column("there", null, null, null, "some", "strings") + .column("all", null, null, null, "things", "here") + .build(); + Table resultSplitLimit2 = v.stringSplit(pattern, 2, true); + Table resultSplitAll = v.stringSplit(pattern, true)) { + assertTablesAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertTablesAreEqual(expectedSplitAll, resultSplitAll); } } @Test - void teststringSplitThrowsException() { - assertThrows(CudfException.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Scalar delimiter = Scalar.fromString(null); - Table result = cv.stringSplit(delimiter)) {} - }); - assertThrows(AssertionError.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Scalar delimiter = Scalar.fromInt(1); - Table result = cv.stringSplit(delimiter)) {} - }); - assertThrows(AssertionError.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Table result = cv.stringSplit(null)) {} - }); + void testStringSplitRecord() { + String pattern = " "; + try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", + "ARé some things", "test strings here"); + ColumnVector expectedSplitLimit2 = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there all"), + 
Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some things"), + Arrays.asList("test", "strings here")); + ColumnVector expectedSplitAll = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there", "all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some", "things"), + Arrays.asList("test", "strings", "here")); + ColumnVector resultSplitLimit2 = v.stringSplitRecord(pattern, 2); + ColumnVector resultSplitAll = v.stringSplitRecord(pattern)) { + assertColumnsAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertColumnsAreEqual(expectedSplitAll, resultSplitAll); + } + } + + @Test + void testStringSplitRecordByRegularExpression() { + String pattern = "[_ ]"; + try (ColumnVector v = ColumnVector.fromStrings("Héllo_there all", "thésé", null, "", + "ARé some_things", "test_strings_here"); + ColumnVector expectedSplitLimit2 = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some_things"), + Arrays.asList("test", "strings_here")); + ColumnVector expectedSplitAll = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there", "all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some", "things"), + Arrays.asList("test", "strings", "here")); + ColumnVector resultSplitLimit2 = v.stringSplitRecord(pattern, 2, true); + ColumnVector resultSplitAll = v.stringSplitRecord(pattern, true)) { + assertColumnsAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertColumnsAreEqual(expectedSplitAll, resultSplitAll); + } } @Test