From df9eac9d7713b95de6a49675c49b95dc42641382 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Thu, 13 May 2021 19:19:02 +0000 Subject: [PATCH 01/22] concat_ws initial version without array support, include tests Signed-off-by: Thomas Graves --- .../java/ai/rapids/cudf/ColumnVector.java | 19 ++++++ java/src/main/native/src/ColumnVectorJni.cpp | 30 +++++++++ .../java/ai/rapids/cudf/ColumnVectorTest.java | 63 ++++++++++++++++++- 3 files changed, 110 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index ea93a2daf36..28ff31471d6 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -525,6 +525,23 @@ public static ColumnVector stringConcatenate(Scalar separator, Scalar narep, Col return new ColumnVector(stringConcatenation(column_views, separator.getScalarHandle(), narep.getScalarHandle())); } + public static ColumnVector stringConcatenateWs(ColumnView[] columns, ColumnView sep_col, Scalar separator_narep, Scalar col_narep) { + // TODO fix asserts + assert columns.length >= 2 : ".stringConcatenate() operation requires at least 2 columns"; + // assert separator_narep != null : "separator scalar provided may not be null"; + // assert separator.getType().equals(DType.STRING) : "separator scalar must be a string scalar"; + // assert narep != null : "narep scalar provided may not be null"; + // assert narep.getType().equals(DType.STRING) : "narep scalar must be a string scalar"; + + long[] column_views = new long[columns.length]; + for(int i = 0; i < columns.length; i++) { + assert columns[i] != null : "Column vectors passed may not be null"; + column_views[i] = columns[i].getNativeView(); + } + + return new ColumnVector(stringConcatenationWs(column_views, sep_col.getNativeView(), separator_narep.getScalarHandle(), col_narep.getScalarHandle())); + } + /** * Concatenate columns of lists horizontally (row 
by row), combining a corresponding row * from each column into a single list row of a new column. @@ -726,6 +743,8 @@ private static native long makeList(long[] handles, long typeHandle, int scale, */ private static native long stringConcatenation(long[] columnViews, long separator, long narep); + private static native long stringConcatenationWs(long[] columnViews, long sep_column, long separator_narep, long col_narep); + /** * Native method to hash each row of the given table. Hashing function dispatched on the * native side using the hashId. diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index 85bbdd41b4a..9b7ffdfbf31 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -125,6 +125,36 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(JNIEnv *env, CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationWs(JNIEnv *env, jclass, + jlongArray column_handles, + jlong sep_handle, + jlong separator_narep, + jlong col_narep) { + JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0); + JNI_NULL_CHECK(env, sep_handle, "separator column handle is null", 0); + JNI_NULL_CHECK(env, separator_narep, "separator narep string scalar object is null", 0); + JNI_NULL_CHECK(env, col_narep, "column narep string scalar object is null", 0); + try { + cudf::jni::auto_set_device(env); + const auto& separator_narep_scalar = *reinterpret_cast(separator_narep); + const auto& col_narep_scalar = *reinterpret_cast(col_narep); + + cudf::jni::native_jpointerArray n_cudf_columns(env, column_handles); + std::vector column_views; + std::transform(n_cudf_columns.data(), + n_cudf_columns.data() + n_cudf_columns.size(), + std::back_inserter(column_views), + [](auto const &p_column) { return *p_column; }); + + cudf::column_view *column = reinterpret_cast(sep_handle); + cudf::strings_column_view 
strings_column(*column); + std::unique_ptr result = + cudf::strings::concatenate(cudf::table_view(column_views), strings_column, separator_narep_scalar, col_narep_scalar); + // cudf::strings::concatenate(cudf::table_view(column_views), strings_column); + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0); +} JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenation(JNIEnv *env, jclass, jlongArray column_handles, diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 09ddef633e3..2d22e00f626 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2109,6 +2109,66 @@ void testStringConcatSeparators() { } } + @Test + void testStringConcatWsSimple() { + try (ColumnVector sv1 = ColumnVector.fromStrings("a"); + ColumnVector sv2 = ColumnVector.fromStrings("B"); + ColumnVector sv3 = ColumnVector.fromStrings("cd"); + ColumnVector sv4 = ColumnVector.fromStrings("\u0480\u0481"); + ColumnVector sv5 = ColumnVector.fromStrings("E\tf"); + ColumnVector sv6 = ColumnVector.fromStrings("M"); + ColumnVector sv7 = ColumnVector.fromStrings("\\G\u0100"); + ColumnVector sep_col = ColumnVector.fromStrings("-*"); + ColumnVector e_concat = ColumnVector.fromStrings("a-*B-*cd-*\u0480\u0481-*E\tf-*M-*\\G\u0100"); + Scalar separatorString = Scalar.fromString(null); + Scalar nullString = Scalar.fromString(null); + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2, sv3, sv4, sv5, sv6, sv7}, sep_col, separatorString, nullString)) { + assertColumnsAreEqual(e_concat, concat); + } + } + + @Test + void testStringConcatWsNullSep() { + try (ColumnVector sv1 = ColumnVector.fromStrings("a", "c"); + ColumnVector sv2 = ColumnVector.fromStrings("b", "d"); + Scalar nullString = Scalar.fromString(null); + ColumnVector sep_col = ColumnVector.fromScalar(nullString, 2); + ColumnVector e_concat = 
ColumnVector.fromScalar(nullString, 2); + Scalar separatorString = Scalar.fromString(null); + Scalar nullEmptyString = Scalar.fromString(null); + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, sep_col, separatorString, nullEmptyString)) { + assertColumnsAreEqual(e_concat, concat); + } + } + + @Test + void testStringConcatWsNullValueInCol() { + try (ColumnVector sv1 = ColumnVector.fromStrings("a", "c", null); + ColumnVector sv2 = ColumnVector.fromStrings("b", "", "e"); + ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "-"); + ColumnVector e_concat = ColumnVector.fromStrings("a-b", "c-", "e"); + Scalar separatorString = Scalar.fromString(null); + Scalar nullEmptyString = Scalar.fromString(null); + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, sep_col, separatorString, nullEmptyString)) { + assertColumnsAreEqual(e_concat, concat); + } + } + + /* + @Test + void testStringConcatWsNullValueInArray() { + try (ColumnVector sv1 = ColumnVector.fromStrings(["a", "s"], "c", null); + ColumnVector sv2 = ColumnVector.fromStrings("b", "", "e"); + ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "-"); + ColumnVector e_concat = ColumnVector.fromStrings("a-b", "c-", "e"); + Scalar separatorString = Scalar.fromString(null); + Scalar nullEmptyString = Scalar.fromString(null); + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, sep_col, separatorString, nullEmptyString)) { + assertColumnsAreEqual(e_concat, concat); + } + } + */ + @Test void testListConcatByRow() { try (ColumnVector cv = ColumnVector.fromLists(new HostColumnVector.ListType(true, @@ -3820,8 +3880,7 @@ void testIsInteger() { @Test void testIsFloat() { String[] floatStrings = {"A", "nan", "Inf", "-Inf", "Infinity", "infinity", "-0.0", "0.0", - "3.4028235E38", "3.4028236E38", "-3.4028235E38", "-3.4028236E38", "1.2e-24", "NULL", "null", - null, "423"}; + "3.4028235E38", "3.4028236E38", "-3.4028235E38", 
"-3.4028236E38", "1.2e-24", "NULL", "null", null, "423"}; try (ColumnVector floatStringCV = ColumnVector.fromStrings(floatStrings); ColumnVector isFloat = floatStringCV.isFloat(); ColumnVector floats = floatStringCV.asFloats(); From 701c8018f124c63ffac2ea06eefb9d640f53d393 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Thu, 13 May 2021 19:54:49 +0000 Subject: [PATCH 02/22] Add documentation and asserts Signed-off-by: Thomas Graves --- .../java/ai/rapids/cudf/ColumnVector.java | 36 +++++++++++++++---- .../java/ai/rapids/cudf/ColumnVectorTest.java | 28 ++++++++++++++- 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 28ff31471d6..8703b3cd2b9 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -525,13 +525,23 @@ public static ColumnVector stringConcatenate(Scalar separator, Scalar narep, Col return new ColumnVector(stringConcatenation(column_views, separator.getScalarHandle(), narep.getScalarHandle())); } + /** + * Concatenate columns of strings together using a separator specified for each row + * and returns the result as a string column. + * @param columns array of columns containing strings, must be more than 1 columns + * @param sep_col strings column that provides the separator for a given row + * @param separator_narep String that should be used in place of a null separator for a given + * row. + * @param col_narep string String that should be used in place of any null strings + * found in any column. + * @return A new java column vector containing the concatenated strings with separator between. 
+ */ public static ColumnVector stringConcatenateWs(ColumnView[] columns, ColumnView sep_col, Scalar separator_narep, Scalar col_narep) { - // TODO fix asserts - assert columns.length >= 2 : ".stringConcatenate() operation requires at least 2 columns"; - // assert separator_narep != null : "separator scalar provided may not be null"; - // assert separator.getType().equals(DType.STRING) : "separator scalar must be a string scalar"; - // assert narep != null : "narep scalar provided may not be null"; - // assert narep.getType().equals(DType.STRING) : "narep scalar must be a string scalar"; + assert columns.length >= 1 : ".stringConcatenate() operation requires at least 1 column"; + assert separator_narep != null : "separator narep scalar provided may not be null"; + assert col_narep != null : "column narep scalar provided may not be null"; + assert separator_narep.getType().equals(DType.STRING) : "separator naprep scalar must be a string scalar"; + assert col_narep.getType().equals(DType.STRING) : "column narep scalar must be a string scalar"; long[] column_views = new long[columns.length]; for(int i = 0; i < columns.length; i++) { @@ -539,7 +549,8 @@ public static ColumnVector stringConcatenateWs(ColumnView[] columns, ColumnView column_views[i] = columns[i].getNativeView(); } - return new ColumnVector(stringConcatenationWs(column_views, sep_col.getNativeView(), separator_narep.getScalarHandle(), col_narep.getScalarHandle())); + return new ColumnVector(stringConcatenationWs(column_views, sep_col.getNativeView(), + separator_narep.getScalarHandle(), col_narep.getScalarHandle())); } /** @@ -743,6 +754,17 @@ private static native long makeList(long[] handles, long typeHandle, int scale, */ private static native long stringConcatenation(long[] columnViews, long separator, long narep); + /** + * Native method to concatenate columns of strings together using a separator specified for each row + * and returns the result as a string column. 
+ * @param columns array of longs holding the native handles of the column_views to combine. + * @param sep_column long holding the native handle of the strings_column_view used as separators. + * @param separator_narep String scalar that should be used in place of a null separator for a given + * row. + * @param col_narep string String scalar that should be used in place of any null strings + * found in any column. + * @return A new java column vector containing the concatenated strings with separator between. + */ private static native long stringConcatenationWs(long[] columnViews, long sep_column, long separator_narep, long col_narep); /** diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 2d22e00f626..24a50b06dbc 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2154,6 +2154,31 @@ void testStringConcatWsNullValueInCol() { } } + @Test + void testStringConcatWsSingleCol() { + try (ColumnVector sv1 = ColumnVector.fromStrings("a", "c", "e"); + ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "-"); + ColumnVector e_concat = ColumnVector.fromStrings("a", "c", "e"); + Scalar separatorString = Scalar.fromString(null); + Scalar nullEmptyString = Scalar.fromString(null); + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1}, sep_col, separatorString, nullEmptyString)) { + assertColumnsAreEqual(e_concat, concat); + } + } + + @Test + void testStringConcatWsNullSepNaRep() { + try (ColumnVector sv1 = ColumnVector.fromStrings("a", "c", "e"); + ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "-"); + ColumnVector e_concat = ColumnVector.fromStrings("a", "c", "e"); + Scalar separatorString = Scalar.fromString(null); + Scalar nullEmptyString = Scalar.fromString(null); + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1}, sep_col, separatorString, 
nullEmptyString)) { + assertColumnsAreEqual(e_concat, concat); + } + } + + /* @Test void testStringConcatWsNullValueInArray() { @@ -3880,7 +3905,8 @@ void testIsInteger() { @Test void testIsFloat() { String[] floatStrings = {"A", "nan", "Inf", "-Inf", "Infinity", "infinity", "-0.0", "0.0", - "3.4028235E38", "3.4028236E38", "-3.4028235E38", "-3.4028236E38", "1.2e-24", "NULL", "null", null, "423"}; + "3.4028235E38", "3.4028236E38", "-3.4028235E38", "-3.4028236E38", "1.2e-24", "NULL", "null", + null, "423"}; try (ColumnVector floatStringCV = ColumnVector.fromStrings(floatStrings); ColumnVector isFloat = floatStringCV.isFloat(); ColumnVector floats = floatStringCV.asFloats(); From ab9349cef688f0c9757eb8fba3411c495cb85b36 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Thu, 13 May 2021 20:59:20 +0000 Subject: [PATCH 03/22] documentation and more tests Signed-off-by: Thomas Graves --- .../java/ai/rapids/cudf/ColumnVector.java | 35 ++++++++++++++++++- .../java/ai/rapids/cudf/ColumnVectorTest.java | 33 ++++++++++++++--- 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 8703b3cd2b9..ababd6ce90f 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -527,7 +527,23 @@ public static ColumnVector stringConcatenate(Scalar separator, Scalar narep, Col /** * Concatenate columns of strings together using a separator specified for each row - * and returns the result as a string column. + * and returns the result as a string column. If the row separator for a given row is null, + * output column for that row is null. Null column values for a given row are skipped. 
+ * @param columns array of columns containing strings, must be more than 1 columns + * @param sep_col strings column that provides the separator for a given row + * @return A new java column vector containing the concatenated strings with separator between. + */ + public static ColumnVector stringConcatenateWs(ColumnView[] columns, ColumnView sep_col) { + try (Scalar nullString = Scalar.fromString(null)) { + return stringConcatenateWs(columns, sep_col, nullString, nullString); + } + } + + /** + * Concatenate columns of strings together using a separator specified for each row + * and returns the result as a string column. If the row separator for a given row is null, + * output column for that row is null unless separator_narep is provided. Null column values + * for a given row are skipped unless col_narep is provided. * @param columns array of columns containing strings, must be more than 1 columns * @param sep_col strings column that provides the separator for a given row * @param separator_narep String that should be used in place of a null separator for a given @@ -553,6 +569,23 @@ public static ColumnVector stringConcatenateWs(ColumnView[] columns, ColumnView separator_narep.getScalarHandle(), col_narep.getScalarHandle())); } + public static ColumnVector concatenateListElements(ColumnView[] columns, ColumnView sep_col, Scalar separator_narep, Scalar col_narep) { + assert columns.length >= 1 : ".stringConcatenate() operation requires at least 1 column"; + assert separator_narep != null : "separator narep scalar provided may not be null"; + assert col_narep != null : "column narep scalar provided may not be null"; + assert separator_narep.getType().equals(DType.STRING) : "separator naprep scalar must be a string scalar"; + assert col_narep.getType().equals(DType.STRING) : "column narep scalar must be a string scalar"; + + long[] column_views = new long[columns.length]; + for(int i = 0; i < columns.length; i++) { + assert columns[i] != null : "Column vectors 
passed may not be null"; + column_views[i] = columns[i].getNativeView(); + } + + return new ColumnVector(stringConcatenationWs(column_views, sep_col.getNativeView(), + separator_narep.getScalarHandle(), col_narep.getScalarHandle())); + } + /** * Concatenate columns of lists horizontally (row by row), combining a corresponding row * from each column into a single list row of a new column. diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 24a50b06dbc..c587b75ab95 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2109,6 +2109,26 @@ void testStringConcatSeparators() { } } + @Test + void testConcatWsTypeError() { + try (ColumnVector v0 = ColumnVector.fromInts(1, 2, 3, 4); + ColumnVector v1 = ColumnVector.fromFloats(5.0f, 6.0f); + ColumnVector sep_col = ColumnVector.fromStrings("-*"); + Scalar separatorString = Scalar.fromString(null); + Scalar nullString = Scalar.fromString(null)) { + assertThrows(CudfException.class, () -> ColumnVector.stringConcatenateWs(new ColumnView[]{v0, v1}, sep_col, separatorString, nullString)); + } + } + + @Test + void testConcatWsNoColumn() { + try (ColumnVector sep_col = ColumnVector.fromStrings("-*"); + Scalar separatorString = Scalar.fromString(null); + Scalar nullString = Scalar.fromString(null)) { + assertThrows(AssertionError.class, () -> ColumnVector.stringConcatenateWs(new ColumnView[]{}, sep_col, separatorString, nullString)); + } + } + @Test void testStringConcatWsSimple() { try (ColumnVector sv1 = ColumnVector.fromStrings("a"); @@ -2167,18 +2187,21 @@ void testStringConcatWsSingleCol() { } @Test - void testStringConcatWsNullSepNaRep() { - try (ColumnVector sv1 = ColumnVector.fromStrings("a", "c", "e"); + void testStringConcatWsNullAllCol() { + try (Scalar nullString = Scalar.fromString(null); + ColumnVector sv1 = ColumnVector.fromScalar(nullString, 3); + ColumnVector 
sv2 = ColumnVector.fromScalar(nullString, 3); ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "-"); - ColumnVector e_concat = ColumnVector.fromStrings("a", "c", "e"); + // TODO - Spark expects this to be empty String + // ColumnVector e_concat = ColumnVector.fromStrings("", "", ""); + ColumnVector e_concat = ColumnVector.fromScalar(nullString, 3); Scalar separatorString = Scalar.fromString(null); Scalar nullEmptyString = Scalar.fromString(null); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1}, sep_col, separatorString, nullEmptyString)) { + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, sep_col, separatorString, nullEmptyString)) { assertColumnsAreEqual(e_concat, concat); } } - /* @Test void testStringConcatWsNullValueInArray() { From 06d2cb36da3a5bf6dc4c3eaa4581e9c03c9ab0e2 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Thu, 13 May 2021 21:57:25 +0000 Subject: [PATCH 04/22] Initial jni for list column view concatenate with separator Signed-off-by: Thomas Graves --- .../java/ai/rapids/cudf/ColumnVector.java | 15 +++++------ java/src/main/native/src/ColumnVectorJni.cpp | 27 ++++++++++++++++++- .../java/ai/rapids/cudf/ColumnVectorTest.java | 15 +++++------ 3 files changed, 39 insertions(+), 18 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index ababd6ce90f..d060103b423 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -569,20 +569,15 @@ public static ColumnVector stringConcatenateWs(ColumnView[] columns, ColumnView separator_narep.getScalarHandle(), col_narep.getScalarHandle())); } - public static ColumnVector concatenateListElements(ColumnView[] columns, ColumnView sep_col, Scalar separator_narep, Scalar col_narep) { - assert columns.length >= 1 : ".stringConcatenate() operation requires at least 1 column"; + public static 
ColumnVector stringConcatenateListElementsWs(ColumnView list_column, ColumnView sep_col, Scalar separator_narep, Scalar col_narep) { assert separator_narep != null : "separator narep scalar provided may not be null"; assert col_narep != null : "column narep scalar provided may not be null"; assert separator_narep.getType().equals(DType.STRING) : "separator naprep scalar must be a string scalar"; assert col_narep.getType().equals(DType.STRING) : "column narep scalar must be a string scalar"; + // TODO + // assert type.equals(DType.LIST) : "A column of type LIST is required for .extractListElement()"; - long[] column_views = new long[columns.length]; - for(int i = 0; i < columns.length; i++) { - assert columns[i] != null : "Column vectors passed may not be null"; - column_views[i] = columns[i].getNativeView(); - } - - return new ColumnVector(stringConcatenationWs(column_views, sep_col.getNativeView(), + return new ColumnVector(stringConcatenationListElementsWs(list_column.getNativeView(), sep_col.getNativeView(), separator_narep.getScalarHandle(), col_narep.getScalarHandle())); } @@ -800,6 +795,8 @@ private static native long makeList(long[] handles, long typeHandle, int scale, */ private static native long stringConcatenationWs(long[] columnViews, long sep_column, long separator_narep, long col_narep); + private static native long stringConcatenationListElementsWs(long list_column, long sep_column, long separator_narep, long col_narep); + /** * Native method to hash each row of the given table. Hashing function dispatched on the * native side using the hashId. 
diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index 9b7ffdfbf31..332239985fc 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "cudf_jni_apis.hpp" @@ -150,7 +151,31 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationWs(J cudf::strings_column_view strings_column(*column); std::unique_ptr result = cudf::strings::concatenate(cudf::table_view(column_views), strings_column, separator_narep_scalar, col_narep_scalar); - // cudf::strings::concatenate(cudf::table_view(column_views), strings_column); + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationListElementsWs(JNIEnv *env, jclass, + jlong column_handle, + jlong sep_handle, + jlong separator_narep, + jlong col_narep) { + JNI_NULL_CHECK(env, column_handle, "column handle is null", 0); + JNI_NULL_CHECK(env, sep_handle, "separator column handle is null", 0); + JNI_NULL_CHECK(env, separator_narep, "separator narep string scalar object is null", 0); + JNI_NULL_CHECK(env, col_narep, "column narep string scalar object is null", 0); + try { + cudf::jni::auto_set_device(env); + const auto& separator_narep_scalar = *reinterpret_cast(separator_narep); + const auto& col_narep_scalar = *reinterpret_cast(col_narep); + + cudf::column_view *column = reinterpret_cast(sep_handle); + cudf::strings_column_view strings_column(*column); + cudf::column_view *cv = reinterpret_cast(column_handle); + cudf::lists_column_view lcv(*cv); + std::unique_ptr result = + cudf::strings::concatenate_list_elements(lcv, strings_column, separator_narep_scalar, col_narep_scalar); return reinterpret_cast(result.release()); } CATCH_STD(env, 0); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java 
b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index c587b75ab95..060764e3ef6 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2202,20 +2202,19 @@ void testStringConcatWsNullAllCol() { } } - /* @Test - void testStringConcatWsNullValueInArray() { - try (ColumnVector sv1 = ColumnVector.fromStrings(["a", "s"], "c", null); - ColumnVector sv2 = ColumnVector.fromStrings("b", "", "e"); - ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "-"); - ColumnVector e_concat = ColumnVector.fromStrings("a-b", "c-", "e"); + void testStringConcatWsSingleListCol() { + try (ColumnVector cv1 = ColumnVector.fromLists(new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("aaa"), Arrays.asList("b", "c", "d")); + ColumnVector sep_col = ColumnVector.fromStrings("-", "-"); + ColumnVector e_concat = ColumnVector.fromStrings("aaa", "b-c-d"); Scalar separatorString = Scalar.fromString(null); Scalar nullEmptyString = Scalar.fromString(null); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, sep_col, separatorString, nullEmptyString)) { + ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, separatorString, nullEmptyString)) { assertColumnsAreEqual(e_concat, concat); } } - */ @Test void testListConcatByRow() { From c0fd1a3175b92b06e68cdf7cd0568503eed55546 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Fri, 14 May 2021 13:40:07 +0000 Subject: [PATCH 05/22] Add more tests for arrays Signed-off-by: Thomas Graves --- .../java/ai/rapids/cudf/ColumnVectorTest.java | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 060764e3ef6..e3a94d022a0 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ 
b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2206,9 +2206,28 @@ void testStringConcatWsNullAllCol() { void testStringConcatWsSingleListCol() { try (ColumnVector cv1 = ColumnVector.fromLists(new HostColumnVector.ListType(true, new HostColumnVector.BasicType(true, DType.STRING)), - Arrays.asList("aaa"), Arrays.asList("b", "c", "d")); + Arrays.asList("aaa"), Arrays.asList("b", "c", "d"), Arrays.asList("\u0480\u0481", null, "asdfbe", null)); + ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "*"); + // ColumnVector e_concat = ColumnVector.fromStrings("aaa", "b-c-d", "\u0480\u0481*asdfbe"); + // TODO - this is different then spark, should be above line for null + ColumnVector e_concat = ColumnVector.fromStrings("aaa", "b-c-d", null); + // ColumnVector e_concat = ColumnVector.fromStrings("aaa", "b-c-d", "\u0480\u0481**asdfbe*"); + Scalar separatorString = Scalar.fromString(null); + Scalar nullEmptyString = Scalar.fromString(null); + ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, separatorString, nullEmptyString)) { + assertColumnsAreEqual(e_concat, concat); + } + } + + @Test + void testStringConcatWsSingleListColAllNulls() { + try (ColumnVector cv1 = ColumnVector.fromLists(new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("aaa"), Arrays.asList(null, null, null)); ColumnVector sep_col = ColumnVector.fromStrings("-", "-"); - ColumnVector e_concat = ColumnVector.fromStrings("aaa", "b-c-d"); + // TODO - SPARK expects empty string when all nulls, not null + // ColumnVector e_concat = ColumnVector.fromStrings("aaa", ""); + ColumnVector e_concat = ColumnVector.fromStrings("aaa", null); Scalar separatorString = Scalar.fromString(null); Scalar nullEmptyString = Scalar.fromString(null); ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, separatorString, nullEmptyString)) { From 3d613e3b5037acfcad6757fe9ae45b9333a56dbe Mon Sep 17 
00:00:00 2001 From: Thomas Graves Date: Wed, 19 May 2021 13:06:00 +0000 Subject: [PATCH 06/22] another test --- .../test/java/ai/rapids/cudf/ColumnVectorTest.java | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index e3a94d022a0..c35230de680 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2235,6 +2235,20 @@ void testStringConcatWsSingleListColAllNulls() { } } + @Test + void testStringConcatWsSingleListColEmptyArray() { + try (ColumnVector cv1 = ColumnVector.fromLists(new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("aaa", "bbbb"), Arrays.asList()); + ColumnVector sep_col = ColumnVector.fromStrings("-", "-"); + ColumnVector e_concat = ColumnVector.fromStrings("aaa-bbbb", null); + Scalar separatorString = Scalar.fromString(null); + Scalar nullEmptyString = Scalar.fromString(null); + ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, separatorString, nullEmptyString)) { + assertColumnsAreEqual(e_concat, concat); + } + } + @Test void testListConcatByRow() { try (ColumnVector cv = ColumnVector.fromLists(new HostColumnVector.ListType(true, From 44438cc7a4e6b409251d2fed03d4b2a4e5bf7882 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Wed, 19 May 2021 10:39:03 -0500 Subject: [PATCH 07/22] Update to changes to concatenate API Signed-off-by: Thomas Graves --- .../java/ai/rapids/cudf/ColumnVector.java | 8 ++-- java/src/main/native/src/ColumnVectorJni.cpp | 4 +- .../java/ai/rapids/cudf/ColumnVectorTest.java | 46 +++++++++---------- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index d060103b423..29b963a3164 100644 --- 
a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -534,8 +534,9 @@ public static ColumnVector stringConcatenate(Scalar separator, Scalar narep, Col * @return A new java column vector containing the concatenated strings with separator between. */ public static ColumnVector stringConcatenateWs(ColumnView[] columns, ColumnView sep_col) { - try (Scalar nullString = Scalar.fromString(null)) { - return stringConcatenateWs(columns, sep_col, nullString, nullString); + try (Scalar nullString = Scalar.fromString(null); + Scalar emptyString = Scalar.fromString("")) { + return stringConcatenateWs(columns, sep_col, nullString, emptyString); } } @@ -552,7 +553,8 @@ public static ColumnVector stringConcatenateWs(ColumnView[] columns, ColumnView * found in any column. * @return A new java column vector containing the concatenated strings with separator between. */ - public static ColumnVector stringConcatenateWs(ColumnView[] columns, ColumnView sep_col, Scalar separator_narep, Scalar col_narep) { + public static ColumnVector stringConcatenateWs(ColumnView[] columns, + ColumnView sep_col, Scalar separator_narep, Scalar col_narep) { assert columns.length >= 1 : ".stringConcatenate() operation requires at least 1 column"; assert separator_narep != null : "separator narep scalar provided may not be null"; assert col_narep != null : "column narep scalar provided may not be null"; diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index 332239985fc..a5cd8a08e6a 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -150,7 +150,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationWs(J cudf::column_view *column = reinterpret_cast(sep_handle); cudf::strings_column_view strings_column(*column); std::unique_ptr result = - cudf::strings::concatenate(cudf::table_view(column_views), 
strings_column, separator_narep_scalar, col_narep_scalar); + cudf::strings::concatenate(cudf::table_view(column_views), strings_column, separator_narep_scalar, col_narep_scalar, cudf::strings::separator_on_nulls::NO); return reinterpret_cast(result.release()); } CATCH_STD(env, 0); @@ -175,7 +175,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationList cudf::column_view *cv = reinterpret_cast(column_handle); cudf::lists_column_view lcv(*cv); std::unique_ptr result = - cudf::strings::concatenate_list_elements(lcv, strings_column, separator_narep_scalar, col_narep_scalar); + cudf::strings::join_list_elements(lcv, strings_column, separator_narep_scalar, col_narep_scalar, cudf::strings::separator_on_nulls::NO); return reinterpret_cast(result.release()); } CATCH_STD(env, 0); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index c35230de680..792326ec7c3 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2141,8 +2141,8 @@ void testStringConcatWsSimple() { ColumnVector sep_col = ColumnVector.fromStrings("-*"); ColumnVector e_concat = ColumnVector.fromStrings("a-*B-*cd-*\u0480\u0481-*E\tf-*M-*\\G\u0100"); Scalar separatorString = Scalar.fromString(null); - Scalar nullString = Scalar.fromString(null); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2, sv3, sv4, sv5, sv6, sv7}, sep_col, separatorString, nullString)) { + Scalar col_narep = Scalar.fromString(""); + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2, sv3, sv4, sv5, sv6, sv7}, sep_col, separatorString, col_narep)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2155,8 +2155,8 @@ void testStringConcatWsNullSep() { ColumnVector sep_col = ColumnVector.fromScalar(nullString, 2); ColumnVector e_concat = ColumnVector.fromScalar(nullString, 2); Scalar separatorString 
= Scalar.fromString(null); - Scalar nullEmptyString = Scalar.fromString(null); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, sep_col, separatorString, nullEmptyString)) { + Scalar col_narep = Scalar.fromString(""); + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, sep_col, separatorString, col_narep)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2168,8 +2168,8 @@ void testStringConcatWsNullValueInCol() { ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "-"); ColumnVector e_concat = ColumnVector.fromStrings("a-b", "c-", "e"); Scalar separatorString = Scalar.fromString(null); - Scalar nullEmptyString = Scalar.fromString(null); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, sep_col, separatorString, nullEmptyString)) { + Scalar col_narep = Scalar.fromString(""); + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, sep_col, separatorString, col_narep)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2180,8 +2180,8 @@ void testStringConcatWsSingleCol() { ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "-"); ColumnVector e_concat = ColumnVector.fromStrings("a", "c", "e"); Scalar separatorString = Scalar.fromString(null); - Scalar nullEmptyString = Scalar.fromString(null); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1}, sep_col, separatorString, nullEmptyString)) { + Scalar col_narep = Scalar.fromString(""); + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1}, sep_col, separatorString, col_narep)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2192,12 +2192,10 @@ void testStringConcatWsNullAllCol() { ColumnVector sv1 = ColumnVector.fromScalar(nullString, 3); ColumnVector sv2 = ColumnVector.fromScalar(nullString, 3); ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "-"); - // TODO - Spark expects this to be empty String - // 
ColumnVector e_concat = ColumnVector.fromStrings("", "", ""); - ColumnVector e_concat = ColumnVector.fromScalar(nullString, 3); + ColumnVector e_concat = ColumnVector.fromStrings("", "", ""); Scalar separatorString = Scalar.fromString(null); - Scalar nullEmptyString = Scalar.fromString(null); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, sep_col, separatorString, nullEmptyString)) { + Scalar col_narep = Scalar.fromString(""); + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, sep_col, separatorString, col_narep)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2208,13 +2206,11 @@ void testStringConcatWsSingleListCol() { new HostColumnVector.BasicType(true, DType.STRING)), Arrays.asList("aaa"), Arrays.asList("b", "c", "d"), Arrays.asList("\u0480\u0481", null, "asdfbe", null)); ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "*"); - // ColumnVector e_concat = ColumnVector.fromStrings("aaa", "b-c-d", "\u0480\u0481*asdfbe"); - // TODO - this is different then spark, should be above line for null - ColumnVector e_concat = ColumnVector.fromStrings("aaa", "b-c-d", null); - // ColumnVector e_concat = ColumnVector.fromStrings("aaa", "b-c-d", "\u0480\u0481**asdfbe*"); + // TODO - nulls in middle broken + ColumnVector e_concat = ColumnVector.fromStrings("aaa", "b-c-d", "\u0480\u0481*asdfbe"); Scalar separatorString = Scalar.fromString(null); - Scalar nullEmptyString = Scalar.fromString(null); - ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, separatorString, nullEmptyString)) { + Scalar col_narep = Scalar.fromString(""); + ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, separatorString, col_narep)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2226,11 +2222,11 @@ void testStringConcatWsSingleListColAllNulls() { Arrays.asList("aaa"), Arrays.asList(null, null, null)); ColumnVector sep_col = ColumnVector.fromStrings("-", 
"-"); // TODO - SPARK expects empty string when all nulls, not null - // ColumnVector e_concat = ColumnVector.fromStrings("aaa", ""); - ColumnVector e_concat = ColumnVector.fromStrings("aaa", null); + ColumnVector e_concat = ColumnVector.fromStrings("aaa", ""); + // ColumnVector e_concat = ColumnVector.fromStrings("aaa", null); Scalar separatorString = Scalar.fromString(null); - Scalar nullEmptyString = Scalar.fromString(null); - ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, separatorString, nullEmptyString)) { + Scalar col_narep = Scalar.fromString(""); + ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, separatorString, col_narep)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2243,8 +2239,8 @@ void testStringConcatWsSingleListColEmptyArray() { ColumnVector sep_col = ColumnVector.fromStrings("-", "-"); ColumnVector e_concat = ColumnVector.fromStrings("aaa-bbbb", null); Scalar separatorString = Scalar.fromString(null); - Scalar nullEmptyString = Scalar.fromString(null); - ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, separatorString, nullEmptyString)) { + Scalar col_narep = Scalar.fromString(""); + ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, separatorString, col_narep)) { assertColumnsAreEqual(e_concat, concat); } } From c43b0e0d802a77ba89ad35e8d1bef23e1ba230e2 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Wed, 19 May 2021 11:39:38 -0500 Subject: [PATCH 08/22] Update api for new separator parameter Signed-off-by: Thomas Graves --- .../java/ai/rapids/cudf/ColumnVector.java | 28 ++-- java/src/main/native/src/ColumnVectorJni.cpp | 16 ++- .../java/ai/rapids/cudf/ColumnVectorTest.java | 127 ++++++++++++++++-- 3 files changed, 147 insertions(+), 24 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 29b963a3164..c81ac4d8827 100644 --- 
a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -536,7 +536,7 @@ public static ColumnVector stringConcatenate(Scalar separator, Scalar narep, Col public static ColumnVector stringConcatenateWs(ColumnView[] columns, ColumnView sep_col) { try (Scalar nullString = Scalar.fromString(null); Scalar emptyString = Scalar.fromString("")) { - return stringConcatenateWs(columns, sep_col, nullString, emptyString); + return stringConcatenateWs(columns, sep_col, nullString, emptyString, false); } } @@ -550,11 +550,13 @@ public static ColumnVector stringConcatenateWs(ColumnView[] columns, ColumnView * @param separator_narep String that should be used in place of a null separator for a given * row. * @param col_narep string String that should be used in place of any null strings - * found in any column. + * found in any column. + * @param separate_nulls boolean if true, then the separator is included for null rows if + * `col_narep` is valid. * @return A new java column vector containing the concatenated strings with separator between. 
*/ public static ColumnVector stringConcatenateWs(ColumnView[] columns, - ColumnView sep_col, Scalar separator_narep, Scalar col_narep) { + ColumnView sep_col, Scalar separator_narep, Scalar col_narep, boolean separate_nulls) { assert columns.length >= 1 : ".stringConcatenate() operation requires at least 1 column"; assert separator_narep != null : "separator narep scalar provided may not be null"; assert col_narep != null : "column narep scalar provided may not be null"; @@ -568,19 +570,18 @@ public static ColumnVector stringConcatenateWs(ColumnView[] columns, } return new ColumnVector(stringConcatenationWs(column_views, sep_col.getNativeView(), - separator_narep.getScalarHandle(), col_narep.getScalarHandle())); + separator_narep.getScalarHandle(), col_narep.getScalarHandle(), separate_nulls)); } - public static ColumnVector stringConcatenateListElementsWs(ColumnView list_column, ColumnView sep_col, Scalar separator_narep, Scalar col_narep) { + public static ColumnVector stringConcatenateListElementsWs(ColumnView list_column, + ColumnView sep_col, Scalar separator_narep, Scalar col_narep, boolean separate_nulls) { assert separator_narep != null : "separator narep scalar provided may not be null"; assert col_narep != null : "column narep scalar provided may not be null"; assert separator_narep.getType().equals(DType.STRING) : "separator naprep scalar must be a string scalar"; assert col_narep.getType().equals(DType.STRING) : "column narep scalar must be a string scalar"; - // TODO - // assert type.equals(DType.LIST) : "A column of type LIST is required for .extractListElement()"; return new ColumnVector(stringConcatenationListElementsWs(list_column.getNativeView(), sep_col.getNativeView(), - separator_narep.getScalarHandle(), col_narep.getScalarHandle())); + separator_narep.getScalarHandle(), col_narep.getScalarHandle(), separate_nulls)); } /** @@ -793,11 +794,18 @@ private static native long makeList(long[] handles, long typeHandle, int scale, * row. 
* @param col_narep string String scalar that should be used in place of any null strings * found in any column. + * @param separate_nulls boolean if true, then the separator is included for null rows if + * `col_narep` is valid. * @return A new java column vector containing the concatenated strings with separator between. */ - private static native long stringConcatenationWs(long[] columnViews, long sep_column, long separator_narep, long col_narep); + private static native long stringConcatenationWs(long[] columnViews, long sep_column, + long separator_narep, long col_narep, + boolean separate_nulls); - private static native long stringConcatenationListElementsWs(long list_column, long sep_column, long separator_narep, long col_narep); + private static native long stringConcatenationListElementsWs(long list_column, long sep_column, + long separator_narep, + long col_narep, + boolean separate_nulls); /** * Native method to hash each row of the given table. Hashing function dispatched on the diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index a5cd8a08e6a..fbe80e27cca 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -130,7 +130,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationWs(J jlongArray column_handles, jlong sep_handle, jlong separator_narep, - jlong col_narep) { + jlong col_narep, + jboolean separate_nulls) { JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0); JNI_NULL_CHECK(env, sep_handle, "separator column handle is null", 0); JNI_NULL_CHECK(env, separator_narep, "separator narep string scalar object is null", 0); @@ -139,6 +140,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationWs(J cudf::jni::auto_set_device(env); const auto& separator_narep_scalar = *reinterpret_cast(separator_narep); const auto& col_narep_scalar = *reinterpret_cast(col_narep); + auto 
null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES + : cudf::strings::separator_on_nulls::NO; cudf::jni::native_jpointerArray n_cudf_columns(env, column_handles); std::vector column_views; @@ -150,7 +153,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationWs(J cudf::column_view *column = reinterpret_cast(sep_handle); cudf::strings_column_view strings_column(*column); std::unique_ptr result = - cudf::strings::concatenate(cudf::table_view(column_views), strings_column, separator_narep_scalar, col_narep_scalar, cudf::strings::separator_on_nulls::NO); + cudf::strings::concatenate(cudf::table_view(column_views), strings_column, + separator_narep_scalar, col_narep_scalar, null_policy); return reinterpret_cast(result.release()); } CATCH_STD(env, 0); @@ -160,7 +164,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationList jlong column_handle, jlong sep_handle, jlong separator_narep, - jlong col_narep) { + jlong col_narep, + jboolean separate_nulls) { JNI_NULL_CHECK(env, column_handle, "column handle is null", 0); JNI_NULL_CHECK(env, sep_handle, "separator column handle is null", 0); JNI_NULL_CHECK(env, separator_narep, "separator narep string scalar object is null", 0); @@ -169,13 +174,16 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationList cudf::jni::auto_set_device(env); const auto& separator_narep_scalar = *reinterpret_cast(separator_narep); const auto& col_narep_scalar = *reinterpret_cast(col_narep); + auto null_policy = separate_nulls ? 
cudf::strings::separator_on_nulls::YES + : cudf::strings::separator_on_nulls::NO; cudf::column_view *column = reinterpret_cast(sep_handle); cudf::strings_column_view strings_column(*column); cudf::column_view *cv = reinterpret_cast(column_handle); cudf::lists_column_view lcv(*cv); std::unique_ptr result = - cudf::strings::join_list_elements(lcv, strings_column, separator_narep_scalar, col_narep_scalar, cudf::strings::separator_on_nulls::NO); + cudf::strings::join_list_elements(lcv, strings_column, separator_narep_scalar, + col_narep_scalar, null_policy); return reinterpret_cast(result.release()); } CATCH_STD(env, 0); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 792326ec7c3..52079b92bff 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2116,7 +2116,8 @@ void testConcatWsTypeError() { ColumnVector sep_col = ColumnVector.fromStrings("-*"); Scalar separatorString = Scalar.fromString(null); Scalar nullString = Scalar.fromString(null)) { - assertThrows(CudfException.class, () -> ColumnVector.stringConcatenateWs(new ColumnView[]{v0, v1}, sep_col, separatorString, nullString)); + assertThrows(CudfException.class, () -> ColumnVector.stringConcatenateWs( + new ColumnView[]{v0, v1}, sep_col, separatorString, nullString, false)); } } @@ -2125,7 +2126,8 @@ void testConcatWsNoColumn() { try (ColumnVector sep_col = ColumnVector.fromStrings("-*"); Scalar separatorString = Scalar.fromString(null); Scalar nullString = Scalar.fromString(null)) { - assertThrows(AssertionError.class, () -> ColumnVector.stringConcatenateWs(new ColumnView[]{}, sep_col, separatorString, nullString)); + assertThrows(AssertionError.class, () -> ColumnVector.stringConcatenateWs( + new ColumnView[]{}, sep_col, separatorString, nullString, false)); } } @@ -2142,7 +2144,28 @@ void testStringConcatWsSimple() { ColumnVector e_concat = 
ColumnVector.fromStrings("a-*B-*cd-*\u0480\u0481-*E\tf-*M-*\\G\u0100"); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2, sv3, sv4, sv5, sv6, sv7}, sep_col, separatorString, col_narep)) { + ColumnVector concat = ColumnVector.stringConcatenateWs( + new ColumnView[]{sv1, sv2, sv3, sv4, sv5, sv6, sv7}, sep_col, separatorString, + col_narep, false)) { + assertColumnsAreEqual(e_concat, concat); + } + } + + @Test + void testStringConcatWsSimpleOtherApi() { + try (ColumnVector sv1 = ColumnVector.fromStrings("a"); + ColumnVector sv2 = ColumnVector.fromStrings("B"); + ColumnVector sv3 = ColumnVector.fromStrings("cd"); + ColumnVector sv4 = ColumnVector.fromStrings("\u0480\u0481"); + ColumnVector sv5 = ColumnVector.fromStrings("E\tf"); + ColumnVector sv6 = ColumnVector.fromStrings("M"); + ColumnVector sv7 = ColumnVector.fromStrings("\\G\u0100"); + ColumnVector sep_col = ColumnVector.fromStrings("-*"); + ColumnVector e_concat = ColumnVector.fromStrings("a-*B-*cd-*\u0480\u0481-*E\tf-*M-*\\G\u0100"); + Scalar separatorString = Scalar.fromString(null); + Scalar col_narep = Scalar.fromString(""); + ColumnVector concat = ColumnVector.stringConcatenateWs( + new ColumnView[]{sv1, sv2, sv3, sv4, sv5, sv6, sv7}, sep_col)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2156,7 +2179,8 @@ void testStringConcatWsNullSep() { ColumnVector e_concat = ColumnVector.fromScalar(nullString, 2); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, sep_col, separatorString, col_narep)) { + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, + sep_col, separatorString, col_narep, false)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2169,7 +2193,37 @@ void testStringConcatWsNullValueInCol() { ColumnVector e_concat = 
ColumnVector.fromStrings("a-b", "c-", "e"); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, sep_col, separatorString, col_narep)) { + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, + sep_col, separatorString, col_narep, false)) { + assertColumnsAreEqual(e_concat, concat); + } + } + + @Test + void testStringConcatWsNullValueInColKeepNull() { + try (ColumnVector sv1 = ColumnVector.fromStrings("a", "c", null); + ColumnVector sv2 = ColumnVector.fromStrings("b", "", "e"); + ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "-"); + ColumnVector e_concat = ColumnVector.fromStrings("a-b", "c-", null); + Scalar separatorString = Scalar.fromString(null); + Scalar col_narep = Scalar.fromString(null); + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, + sep_col, separatorString, col_narep, true)) { + assertColumnsAreEqual(e_concat, concat); + } + } + + @Test + void testStringConcatWsNullValueInColSepTrue() { + try (ColumnVector sv1 = ColumnVector.fromStrings("a", "c", null); + ColumnVector sv2 = ColumnVector.fromStrings("b", "", "e"); + ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "-"); + // this is failing? 
+ ColumnVector e_concat = ColumnVector.fromStrings("a-b-", "c-", "e"); + Scalar separatorString = Scalar.fromString(null); + Scalar col_narep = Scalar.fromString(""); + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, + sep_col, separatorString, col_narep, true)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2181,7 +2235,8 @@ void testStringConcatWsSingleCol() { ColumnVector e_concat = ColumnVector.fromStrings("a", "c", "e"); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1}, sep_col, separatorString, col_narep)) { + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1}, + sep_col, separatorString, col_narep, false)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2195,7 +2250,23 @@ void testStringConcatWsNullAllCol() { ColumnVector e_concat = ColumnVector.fromStrings("", "", ""); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, sep_col, separatorString, col_narep)) { + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, + sep_col, separatorString, col_narep, false)) { + assertColumnsAreEqual(e_concat, concat); + } + } + + @Test + void testStringConcatWsNullAllColSepTrue() { + try (Scalar nullString = Scalar.fromString(null); + ColumnVector sv1 = ColumnVector.fromScalar(nullString, 3); + ColumnVector sv2 = ColumnVector.fromScalar(nullString, 3); + ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "-"); + ColumnVector e_concat = ColumnVector.fromStrings("-", "-", "-"); + Scalar separatorString = Scalar.fromString(null); + Scalar col_narep = Scalar.fromString(""); + ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, + sep_col, separatorString, col_narep, true)) { 
assertColumnsAreEqual(e_concat, concat); } } @@ -2210,7 +2281,8 @@ void testStringConcatWsSingleListCol() { ColumnVector e_concat = ColumnVector.fromStrings("aaa", "b-c-d", "\u0480\u0481*asdfbe"); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, separatorString, col_narep)) { + ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, + separatorString, col_narep, false)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2226,7 +2298,41 @@ void testStringConcatWsSingleListColAllNulls() { // ColumnVector e_concat = ColumnVector.fromStrings("aaa", null); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, separatorString, col_narep)) { + ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, + separatorString, col_narep, false)) { + assertColumnsAreEqual(e_concat, concat); + } + } + + @Test + void testStringConcatWsSingleListColAllNullsSepTrue() { + try (ColumnVector cv1 = ColumnVector.fromLists(new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("aaa"), Arrays.asList(null, null, null)); + ColumnVector sep_col = ColumnVector.fromStrings("-", "-"); + // TODO - SPARK expects empty string when all nulls, not null + // currently broken + ColumnVector e_concat = ColumnVector.fromStrings("aaa", "---"); + // ColumnVector e_concat = ColumnVector.fromStrings("aaa", null); + Scalar separatorString = Scalar.fromString(null); + Scalar col_narep = Scalar.fromString(""); + ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, + separatorString, col_narep, true)) { + assertColumnsAreEqual(e_concat, concat); + } + } + + @Test + void testStringConcatWsSingleListColAllNullsKeepNulls() { + try (ColumnVector cv1 
= ColumnVector.fromLists(new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("aaa"), Arrays.asList(null, null, null)); + ColumnVector sep_col = ColumnVector.fromStrings("-", "-"); + ColumnVector e_concat = ColumnVector.fromStrings("aaa", null); + Scalar separatorString = Scalar.fromString(null); + Scalar col_narep = Scalar.fromString(null); + ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, + separatorString, col_narep, true)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2240,7 +2346,8 @@ void testStringConcatWsSingleListColEmptyArray() { ColumnVector e_concat = ColumnVector.fromStrings("aaa-bbbb", null); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, separatorString, col_narep)) { + ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, + separatorString, col_narep, false)) { assertColumnsAreEqual(e_concat, concat); } } From 61f56c89822d3e78eabedb4af6e9630e0ed92f64 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Wed, 19 May 2021 13:49:32 -0500 Subject: [PATCH 09/22] Update to new parameter is concatenate list api --- .../java/ai/rapids/cudf/ColumnVector.java | 8 ++-- java/src/main/native/src/ColumnVectorJni.cpp | 8 +++- .../java/ai/rapids/cudf/ColumnVectorTest.java | 37 ++++++++++++------- 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index c81ac4d8827..bbf2c2803de 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -574,14 +574,15 @@ public static ColumnVector stringConcatenateWs(ColumnView[] columns, } public static ColumnVector stringConcatenateListElementsWs(ColumnView list_column, - ColumnView sep_col, Scalar 
separator_narep, Scalar col_narep, boolean separate_nulls) { + ColumnView sep_col, Scalar separator_narep, Scalar col_narep, boolean separate_nulls, + boolean empty_string_output_if_empty_list) { assert separator_narep != null : "separator narep scalar provided may not be null"; assert col_narep != null : "column narep scalar provided may not be null"; assert separator_narep.getType().equals(DType.STRING) : "separator naprep scalar must be a string scalar"; assert col_narep.getType().equals(DType.STRING) : "column narep scalar must be a string scalar"; return new ColumnVector(stringConcatenationListElementsWs(list_column.getNativeView(), sep_col.getNativeView(), - separator_narep.getScalarHandle(), col_narep.getScalarHandle(), separate_nulls)); + separator_narep.getScalarHandle(), col_narep.getScalarHandle(), separate_nulls, empty_string_output_if_empty_list)); } /** @@ -805,7 +806,8 @@ private static native long stringConcatenationWs(long[] columnViews, long sep_co private static native long stringConcatenationListElementsWs(long list_column, long sep_column, long separator_narep, long col_narep, - boolean separate_nulls); + boolean separate_nulls, + boolean empty_string_output_if_empty_list); /** * Native method to hash each row of the given table. 
Hashing function dispatched on the diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index fbe80e27cca..cafa8d0a123 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -165,7 +165,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationList jlong sep_handle, jlong separator_narep, jlong col_narep, - jboolean separate_nulls) { + jboolean separate_nulls, + jboolean empty_string_output_if_empty_list) { JNI_NULL_CHECK(env, column_handle, "column handle is null", 0); JNI_NULL_CHECK(env, sep_handle, "separator column handle is null", 0); JNI_NULL_CHECK(env, separator_narep, "separator narep string scalar object is null", 0); @@ -176,6 +177,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationList const auto& col_narep_scalar = *reinterpret_cast(col_narep); auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES : cudf::strings::separator_on_nulls::NO; + auto empty_list_output = + empty_string_output_if_empty_list ? 
cudf::strings::output_if_empty_list::EMPTY_STRING + : cudf::strings::output_if_empty_list::NULL_ELEMENT; cudf::column_view *column = reinterpret_cast(sep_handle); cudf::strings_column_view strings_column(*column); @@ -183,7 +187,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationList cudf::lists_column_view lcv(*cv); std::unique_ptr result = cudf::strings::join_list_elements(lcv, strings_column, separator_narep_scalar, - col_narep_scalar, null_policy); + col_narep_scalar, null_policy, empty_list_output); return reinterpret_cast(result.release()); } CATCH_STD(env, 0); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 52079b92bff..98cf54b5846 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2219,7 +2219,7 @@ void testStringConcatWsNullValueInColSepTrue() { ColumnVector sv2 = ColumnVector.fromStrings("b", "", "e"); ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "-"); // this is failing? 
- ColumnVector e_concat = ColumnVector.fromStrings("a-b-", "c-", "e"); + ColumnVector e_concat = ColumnVector.fromStrings("a-b", "c-", "-e"); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, @@ -2277,12 +2277,11 @@ void testStringConcatWsSingleListCol() { new HostColumnVector.BasicType(true, DType.STRING)), Arrays.asList("aaa"), Arrays.asList("b", "c", "d"), Arrays.asList("\u0480\u0481", null, "asdfbe", null)); ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "*"); - // TODO - nulls in middle broken ColumnVector e_concat = ColumnVector.fromStrings("aaa", "b-c-d", "\u0480\u0481*asdfbe"); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, - separatorString, col_narep, false)) { + separatorString, col_narep, false, false)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2293,13 +2292,11 @@ void testStringConcatWsSingleListColAllNulls() { new HostColumnVector.BasicType(true, DType.STRING)), Arrays.asList("aaa"), Arrays.asList(null, null, null)); ColumnVector sep_col = ColumnVector.fromStrings("-", "-"); - // TODO - SPARK expects empty string when all nulls, not null ColumnVector e_concat = ColumnVector.fromStrings("aaa", ""); - // ColumnVector e_concat = ColumnVector.fromStrings("aaa", null); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, - separatorString, col_narep, false)) { + separatorString, col_narep, false, false)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2310,14 +2307,11 @@ void testStringConcatWsSingleListColAllNullsSepTrue() { new HostColumnVector.BasicType(true, DType.STRING)), Arrays.asList("aaa"), Arrays.asList(null, null, null)); ColumnVector sep_col = 
ColumnVector.fromStrings("-", "-"); - // TODO - SPARK expects empty string when all nulls, not null - // currently broken - ColumnVector e_concat = ColumnVector.fromStrings("aaa", "---"); - // ColumnVector e_concat = ColumnVector.fromStrings("aaa", null); + ColumnVector e_concat = ColumnVector.fromStrings("aaa", "--"); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, - separatorString, col_narep, true)) { + separatorString, col_narep, true, false)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2332,7 +2326,7 @@ void testStringConcatWsSingleListColAllNullsKeepNulls() { Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(null); ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, - separatorString, col_narep, true)) { + separatorString, col_narep, true, false)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2346,8 +2340,25 @@ void testStringConcatWsSingleListColEmptyArray() { ColumnVector e_concat = ColumnVector.fromStrings("aaa-bbbb", null); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); + // set the parameter to return null on empty array ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, - separatorString, col_narep, false)) { + separatorString, col_narep, false, false)) { + assertColumnsAreEqual(e_concat, concat); + } + } + + @Test + void testStringConcatWsSingleListColEmptyArrayReturnEmpty() { + try (ColumnVector cv1 = ColumnVector.fromLists(new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("aaa", "bbbb"), Arrays.asList()); + ColumnVector sep_col = ColumnVector.fromStrings("-", "-"); + ColumnVector e_concat = ColumnVector.fromStrings("aaa-bbbb", ""); + Scalar separatorString = Scalar.fromString(null); + Scalar col_narep = 
Scalar.fromString(""); + // set the parameter to return empty string on empty array + ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, + separatorString, col_narep, false, true)) { assertColumnsAreEqual(e_concat, concat); } } From 38ae371d666907e2c93b7e58c38e5f00bd7b9918 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Wed, 19 May 2021 16:12:30 -0500 Subject: [PATCH 10/22] Add in the extra parameters for concatenate with scalar separator and list concatenate with scalar separator --- .../java/ai/rapids/cudf/ColumnVector.java | 188 +++++++++++++++--- java/src/main/native/src/ColumnVectorJni.cpp | 77 +++++-- .../java/ai/rapids/cudf/ColumnVectorTest.java | 34 ++-- 3 files changed, 235 insertions(+), 64 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index bbf2c2803de..aba2831131b 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -500,7 +500,8 @@ public static ColumnVector stringConcatenate(ColumnView[] columns) { /** * Concatenate columns of strings together, combining a corresponding row from each column into - * a single string row of a new column. + * a single string row of a new column. This version includes the separator for null rows + * if 'narep' is valid. * @param separator string scalar inserted between each string being merged. * @param narep string scalar indicating null behavior. If set to null and any string in the row * is null the resulting string will be null. If not null, null values in any column @@ -509,6 +510,23 @@ public static ColumnVector stringConcatenate(ColumnView[] columns) { * @return A new java column vector containing the concatenated strings.
*/ public static ColumnVector stringConcatenate(Scalar separator, Scalar narep, ColumnView[] columns) { + return stringConcatenate(separator, narep, columns, true); + } + + /** + * Concatenate columns of strings together, combining a corresponding row from each column into + * a single string row of a new column. + * @param separator string scalar inserted between each string being merged. + * @param narep string scalar indicating null behavior. If set to null and any string in the row + * is null the resulting string will be null. If not null, null values in any column + * will be replaced by the specified string. + * @param columns array of columns containing strings, must be non-empty + * @param separate_nulls if true, then the separator is included for null rows if + * `narep` is valid. + * @return A new java column vector containing the concatenated strings. + */ + public static ColumnVector stringConcatenate(Scalar separator, Scalar narep, ColumnView[] columns, + boolean separate_nulls) { assert columns != null : "input columns should not be null"; assert columns.length > 0 : "input columns should not be empty"; assert separator != null : "separator scalar provided may not be null"; @@ -522,7 +540,8 @@ public static ColumnVector stringConcatenate(Scalar separator, Scalar narep, Col column_views[i] = columns[i].getNativeView(); } - return new ColumnVector(stringConcatenation(column_views, separator.getScalarHandle(), narep.getScalarHandle())); + return new ColumnVector(stringConcatenation(column_views, separator.getScalarHandle(), + narep.getScalarHandle(), separate_nulls)); } /** @@ -533,29 +552,30 @@ public static ColumnVector stringConcatenate(Scalar separator, Scalar narep, Col * @param sep_col strings column that provides the separator for a given row * @return A new java column vector containing the concatenated strings with separator between.
*/ - public static ColumnVector stringConcatenateWs(ColumnView[] columns, ColumnView sep_col) { + public static ColumnVector stringConcatenate(ColumnView[] columns, ColumnView sep_col) { try (Scalar nullString = Scalar.fromString(null); Scalar emptyString = Scalar.fromString("")) { - return stringConcatenateWs(columns, sep_col, nullString, emptyString, false); + return stringConcatenate(columns, sep_col, nullString, emptyString, false); } } /** * Concatenate columns of strings together using a separator specified for each row * and returns the result as a string column. If the row separator for a given row is null, - * output column for that row is null unless separator_narep is provided. Null column values - * for a given row are skipped unless col_narep is provided. + * output column for that row is null unless separator_narep is provided. + * The separator is applied between two output row values if the separate_nulls + * is `YES` or only between valid rows if separate_nulls is `NO`. * @param columns array of columns containing strings, must be more than 1 columns * @param sep_col strings column that provides the separator for a given row * @param separator_narep String that should be used in place of a null separator for a given * row. - * @param col_narep string String that should be used in place of any null strings + * @param col_narep string that should be used in place of any null strings * found in any column. - * @param separate_nulls boolean if true, then the separator is included for null rows if + * @param separate_nulls if true, then the separator is included for null rows if * `col_narep` is valid. * @return A new java column vector containing the concatenated strings with separator between. 
*/ - public static ColumnVector stringConcatenateWs(ColumnView[] columns, + public static ColumnVector stringConcatenate(ColumnView[] columns, ColumnView sep_col, Scalar separator_narep, Scalar col_narep, boolean separate_nulls) { assert columns.length >= 1 : ".stringConcatenate() operation requires at least 1 column"; assert separator_narep != null : "separator narep scalar provided may not be null"; @@ -569,11 +589,49 @@ public static ColumnVector stringConcatenateWs(ColumnView[] columns, column_views[i] = columns[i].getNativeView(); } - return new ColumnVector(stringConcatenationWs(column_views, sep_col.getNativeView(), + return new ColumnVector(stringConcatenationSepCol(column_views, sep_col.getNativeView(), separator_narep.getScalarHandle(), col_narep.getScalarHandle(), separate_nulls)); } - public static ColumnVector stringConcatenateListElementsWs(ColumnView list_column, + /** + * Given a lists column of strings (each row is a list of strings), concatenates the strings + * within each row and returns a single strings column result. Each new string is created by + * concatenating the strings from the same row (same list element) delimited by the separator + * provided. This version of the function replaces nulls with an empty string and returns null + * for an empty list. + * @param list_column column containing lists of strings to concatenate. + * @param sep_col strings column that provides separators for concatenation. + * @return A new java column vector containing the concatenated strings with separator between.
+ */ + public static ColumnVector stringConcatenateListElements(ColumnView list_column, + ColumnView sep_col) { + try (Scalar nullString = Scalar.fromString(null); + Scalar emptyString = Scalar.fromString("")) { + return stringConcatenateListElements(list_column, sep_col, nullString, emptyString, + false, false); + } + } + + /** + * Given a lists column of strings (each row is a list of strings), concatenates the strings + * within each row and returns a single strings column result. + * Each new string is created by concatenating the strings from the same row (same list element) + * delimited by the row separator provided in the sep_col strings column. + * @param list_column column containing lists of strings to concatenate. + * @param sep_col strings column that provides separators for concatenation. + * @param separator_narep string that should be used to replace null separator, default is an + * invalid-scalar denoting that rows containing null separator will + * result in null string in the corresponding output rows. + * @param col_narep string that should be used to replace null strings in any non-null list + * row, default is an invalid-scalar denoting that list rows containing null + * strings will result in null string in the corresponding output rows. + * @param separate_nulls if true, then the separator is included for null rows if + * `col_narep` is valid. + * @param empty_string_output_if_empty_list if set to true, any input row that is an empty list + * will result in an empty string. Otherwise, it will result in a null. + * @return A new java column vector containing the concatenated strings with separator between.
+ */ + public static ColumnVector stringConcatenateListElements(ColumnView list_column, ColumnView sep_col, Scalar separator_narep, Scalar col_narep, boolean separate_nulls, boolean empty_string_output_if_empty_list) { assert separator_narep != null : "separator narep scalar provided may not be null"; @@ -581,8 +639,38 @@ public static ColumnVector stringConcatenateListElementsWs(ColumnView list_colum assert separator_narep.getType().equals(DType.STRING) : "separator naprep scalar must be a string scalar"; assert col_narep.getType().equals(DType.STRING) : "column narep scalar must be a string scalar"; - return new ColumnVector(stringConcatenationListElementsWs(list_column.getNativeView(), sep_col.getNativeView(), - separator_narep.getScalarHandle(), col_narep.getScalarHandle(), separate_nulls, empty_string_output_if_empty_list)); + return new ColumnVector(stringConcatenationListElements(list_column.getNativeView(), + sep_col.getNativeView(), separator_narep.getScalarHandle(), col_narep.getScalarHandle(), + separate_nulls, empty_string_output_if_empty_list)); + } + + /** + * Given a lists column of strings (each row is a list of strings), concatenates the strings + * within each row and returns a single strings column result. Each new string is created by + * concatenating the strings from the same row (same list element) delimited by the + * separator provided. + * @param list_column column containing lists of strings to concatenate. + * @param separator string scalar inserted between each string being merged. + * @param narep string scalar indicating null behavior. If set to null and any string in the row + * is null the resulting string will be null. If not null, null values in any + * column will be replaced by the specified string. The underlying value in the + * string scalar may be null, but the object passed in may not. + * @param separate_nulls if true, then the separator is included for null rows if + * `col_narep` is valid. 
+ * @param empty_string_output_if_empty_list if set to true, any input row that is an empty list + * will result in an empty string. Otherwise, it will result in a null. + * @return A new java column vector containing the concatenated strings with separator between. + */ + public static ColumnVector stringConcatenateListElements(ColumnView list_column, + Scalar separator, Scalar narep, boolean separate_nulls, + boolean empty_string_output_if_empty_list) { + assert separator != null : "separator scalar provided may not be null"; + assert narep != null : "column narep scalar provided may not be null"; + assert narep.getType().equals(DType.STRING) : "narep scalar must be a string scalar"; + + return new ColumnVector(stringConcatenationListElementsScalarSep(list_column.getNativeView(), + separator.getScalarHandle(), narep.getScalarHandle(), separate_nulls, + empty_string_output_if_empty_list)); } /** @@ -781,10 +869,13 @@ private static native long makeList(long[] handles, long typeHandle, int scale, * the resulting string will be null. If not null, null values in any column will be * replaced by the specified string. The underlying value in the string scalar may be null, * but the object passed in may not. + * @param separate_nulls boolean if true, then the separator is included for null rows if + * `col_narep` is valid. * @return native handle of the resulting cudf column, used to construct the Java column * by the stringConcatenate method. */ - private static native long stringConcatenation(long[] columnViews, long separator, long narep); + private static native long stringConcatenation(long[] columnViews, long separator, long narep, + boolean separate_nulls); /** * Native method to concatenate columns of strings together using a separator specified for each row @@ -797,17 +888,64 @@ private static native long makeList(long[] handles, long typeHandle, int scale, * found in any column. 
* @param separate_nulls boolean if true, then the separator is included for null rows if * `col_narep` is valid. - * @return A new java column vector containing the concatenated strings with separator between. - */ - private static native long stringConcatenationWs(long[] columnViews, long sep_column, - long separator_narep, long col_narep, - boolean separate_nulls); - - private static native long stringConcatenationListElementsWs(long list_column, long sep_column, - long separator_narep, - long col_narep, - boolean separate_nulls, - boolean empty_string_output_if_empty_list); + * @return native handle of the resulting cudf column, used to construct the Java column. + */ + private static native long stringConcatenationSepCol(long[] columnViews, + long sep_column, + long separator_narep, + long col_narep, + boolean separate_nulls); + + /** + * Native method to concatenate a list column of strings (each row is a list of strings), + * concatenates the strings within each row and returns a single strings column result. + * Each new string is created by concatenating the strings from the same row (same list element) + * delimited by the row separator provided in the `separators` strings column. + * @param list_column long holding the native handle of the column containing lists of strings + * to concatenate. + * @param sep_col long holding the native handle of the strings column that provides separators + * for concatenation. + * @param separator_narep String scalar that should be used in place of a null separator for a given + * row. + * @param col_narep string String scalar that should be used in place of any null strings + * found in any column. + * @param separate_nulls boolean if true, then the separator is included for null rows if + * `col_narep` is valid. + * @param empty_string_output_if_empty_list boolean if true, any input row that is an empty list + * will result in an empty string. Otherwise, it will result in a null. 
+ * @return native handle of the resulting cudf column, used to construct the Java column. + */ + private static native long stringConcatenationListElements(long list_column, + long sep_column, + long separator_narep, + long col_narep, + boolean separate_nulls, + boolean empty_string_output_if_empty_list); + + /** + * Native method to concatenate a list column of strings (each row is a list of strings), + * concatenates the strings within each row and returns a single strings column result. + * Each new string is created by concatenating the strings from the same row (same list element) + * delimited by the separator provided. + * @param list_column long holding the native handle of the column containing lists of strings + * to concatenate. + * @param separator string scalar inserted between each string being merged, may not be null. + * @param narep string scalar indicating null behavior. If set to null and any string in the row + * is null the resulting string will be null. If not null, null values in any + * column will be replaced by the specified string. The underlying value in the + * string scalar may be null, but the object passed in may not. + * @param separate_nulls boolean if true, then the separator is included for null rows if + * `col_narep` is valid. + * @param empty_string_output_if_empty_list boolean if true, any input row that is an empty list + * will result in an empty string. Otherwise, it will + * result in a null. + * @return native handle of the resulting cudf column, used to construct the Java column. + */ + private static native long stringConcatenationListElementsScalarSep(long list_column, + long separator, + long narep, + boolean separate_nulls, + boolean empty_string_output_if_empty_list); /** * Native method to hash each row of the given table. 
Hashing function dispatched on the diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index cafa8d0a123..426fad16988 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -126,7 +126,37 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(JNIEnv *env, CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationWs(JNIEnv *env, jclass, +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenation(JNIEnv *env, jclass, + jlongArray column_handles, + jlong separator, + jlong narep, + jboolean separate_nulls) { + JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0); + JNI_NULL_CHECK(env, separator, "separator string scalar object is null", 0); + JNI_NULL_CHECK(env, narep, "narep string scalar object is null", 0); + try { + cudf::jni::auto_set_device(env); + const auto& separator_scalar = *reinterpret_cast(separator); + const auto& narep_scalar = *reinterpret_cast(narep); + auto null_policy = separate_nulls ? 
cudf::strings::separator_on_nulls::YES + : cudf::strings::separator_on_nulls::NO; + + cudf::jni::native_jpointerArray n_cudf_columns(env, column_handles); + std::vector column_views; + std::transform(n_cudf_columns.data(), + n_cudf_columns.data() + n_cudf_columns.size(), + std::back_inserter(column_views), + [](auto const &p_column) { return *p_column; }); + + std::unique_ptr result = + cudf::strings::concatenate(cudf::table_view(column_views), separator_scalar, + narep_scalar, null_policy); + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationSepCol(JNIEnv *env, jclass, jlongArray column_handles, jlong sep_handle, jlong separator_narep, @@ -160,13 +190,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationWs(J CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationListElementsWs(JNIEnv *env, jclass, - jlong column_handle, - jlong sep_handle, - jlong separator_narep, - jlong col_narep, - jboolean separate_nulls, - jboolean empty_string_output_if_empty_list) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationListElements(JNIEnv *env, jclass, + jlong column_handle, + jlong sep_handle, + jlong separator_narep, + jlong col_narep, + jboolean separate_nulls, + jboolean empty_string_output_if_empty_list) { JNI_NULL_CHECK(env, column_handle, "column handle is null", 0); JNI_NULL_CHECK(env, sep_handle, "separator column handle is null", 0); JNI_NULL_CHECK(env, separator_narep, "separator narep string scalar object is null", 0); @@ -193,27 +223,30 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationList CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenation(JNIEnv *env, jclass, - jlongArray column_handles, - jlong separator, - jlong narep) { - JNI_NULL_CHECK(env, column_handles, "array of column handles is 
null", 0); +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationListElementsScalarSep(JNIEnv *env, jclass, + jlong column_handle, + jlong separator, + jlong narep, + jboolean separate_nulls, + jboolean empty_string_output_if_empty_list) { + JNI_NULL_CHECK(env, column_handle, "column handle is null", 0); JNI_NULL_CHECK(env, separator, "separator string scalar object is null", 0); - JNI_NULL_CHECK(env, narep, "narep string scalar object is null", 0); + JNI_NULL_CHECK(env, narep, "separator narep string scalar object is null", 0); try { cudf::jni::auto_set_device(env); const auto& separator_scalar = *reinterpret_cast(separator); const auto& narep_scalar = *reinterpret_cast(narep); + auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES + : cudf::strings::separator_on_nulls::NO; + auto empty_list_output = + empty_string_output_if_empty_list ? cudf::strings::output_if_empty_list::EMPTY_STRING + : cudf::strings::output_if_empty_list::NULL_ELEMENT; - cudf::jni::native_jpointerArray n_cudf_columns(env, column_handles); - std::vector column_views; - std::transform(n_cudf_columns.data(), - n_cudf_columns.data() + n_cudf_columns.size(), - std::back_inserter(column_views), - [](auto const &p_column) { return *p_column; }); - + cudf::column_view *cv = reinterpret_cast(column_handle); + cudf::lists_column_view lcv(*cv); std::unique_ptr result = - cudf::strings::concatenate(cudf::table_view(column_views), separator_scalar, narep_scalar); + cudf::strings::join_list_elements(lcv, separator_scalar, narep_scalar, + null_policy, empty_list_output); return reinterpret_cast(result.release()); } CATCH_STD(env, 0); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 98cf54b5846..aea7c5a7f81 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2116,7 +2116,7 @@ void 
testConcatWsTypeError() { ColumnVector sep_col = ColumnVector.fromStrings("-*"); Scalar separatorString = Scalar.fromString(null); Scalar nullString = Scalar.fromString(null)) { - assertThrows(CudfException.class, () -> ColumnVector.stringConcatenateWs( + assertThrows(CudfException.class, () -> ColumnVector.stringConcatenate( new ColumnView[]{v0, v1}, sep_col, separatorString, nullString, false)); } } @@ -2126,7 +2126,7 @@ void testConcatWsNoColumn() { try (ColumnVector sep_col = ColumnVector.fromStrings("-*"); Scalar separatorString = Scalar.fromString(null); Scalar nullString = Scalar.fromString(null)) { - assertThrows(AssertionError.class, () -> ColumnVector.stringConcatenateWs( + assertThrows(AssertionError.class, () -> ColumnVector.stringConcatenate( new ColumnView[]{}, sep_col, separatorString, nullString, false)); } } @@ -2144,7 +2144,7 @@ void testStringConcatWsSimple() { ColumnVector e_concat = ColumnVector.fromStrings("a-*B-*cd-*\u0480\u0481-*E\tf-*M-*\\G\u0100"); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateWs( + ColumnVector concat = ColumnVector.stringConcatenate( new ColumnView[]{sv1, sv2, sv3, sv4, sv5, sv6, sv7}, sep_col, separatorString, col_narep, false)) { assertColumnsAreEqual(e_concat, concat); @@ -2164,7 +2164,7 @@ void testStringConcatWsSimpleOtherApi() { ColumnVector e_concat = ColumnVector.fromStrings("a-*B-*cd-*\u0480\u0481-*E\tf-*M-*\\G\u0100"); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateWs( + ColumnVector concat = ColumnVector.stringConcatenate( new ColumnView[]{sv1, sv2, sv3, sv4, sv5, sv6, sv7}, sep_col)) { assertColumnsAreEqual(e_concat, concat); } @@ -2179,7 +2179,7 @@ void testStringConcatWsNullSep() { ColumnVector e_concat = ColumnVector.fromScalar(nullString, 2); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = 
Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, + ColumnVector concat = ColumnVector.stringConcatenate(new ColumnView[]{sv1, sv2}, sep_col, separatorString, col_narep, false)) { assertColumnsAreEqual(e_concat, concat); } @@ -2193,7 +2193,7 @@ void testStringConcatWsNullValueInCol() { ColumnVector e_concat = ColumnVector.fromStrings("a-b", "c-", "e"); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, + ColumnVector concat = ColumnVector.stringConcatenate(new ColumnView[]{sv1, sv2}, sep_col, separatorString, col_narep, false)) { assertColumnsAreEqual(e_concat, concat); } @@ -2207,7 +2207,7 @@ void testStringConcatWsNullValueInColKeepNull() { ColumnVector e_concat = ColumnVector.fromStrings("a-b", "c-", null); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(null); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, + ColumnVector concat = ColumnVector.stringConcatenate(new ColumnView[]{sv1, sv2}, sep_col, separatorString, col_narep, true)) { assertColumnsAreEqual(e_concat, concat); } @@ -2222,7 +2222,7 @@ void testStringConcatWsNullValueInColSepTrue() { ColumnVector e_concat = ColumnVector.fromStrings("a-b", "c-", "-e"); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, + ColumnVector concat = ColumnVector.stringConcatenate(new ColumnView[]{sv1, sv2}, sep_col, separatorString, col_narep, true)) { assertColumnsAreEqual(e_concat, concat); } @@ -2235,7 +2235,7 @@ void testStringConcatWsSingleCol() { ColumnVector e_concat = ColumnVector.fromStrings("a", "c", "e"); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = 
ColumnVector.stringConcatenateWs(new ColumnView[]{sv1}, + ColumnVector concat = ColumnVector.stringConcatenate(new ColumnView[]{sv1}, sep_col, separatorString, col_narep, false)) { assertColumnsAreEqual(e_concat, concat); } @@ -2250,7 +2250,7 @@ void testStringConcatWsNullAllCol() { ColumnVector e_concat = ColumnVector.fromStrings("", "", ""); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, + ColumnVector concat = ColumnVector.stringConcatenate(new ColumnView[]{sv1, sv2}, sep_col, separatorString, col_narep, false)) { assertColumnsAreEqual(e_concat, concat); } @@ -2265,7 +2265,7 @@ void testStringConcatWsNullAllColSepTrue() { ColumnVector e_concat = ColumnVector.fromStrings("-", "-", "-"); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateWs(new ColumnView[]{sv1, sv2}, + ColumnVector concat = ColumnVector.stringConcatenate(new ColumnView[]{sv1, sv2}, sep_col, separatorString, col_narep, true)) { assertColumnsAreEqual(e_concat, concat); } @@ -2280,7 +2280,7 @@ void testStringConcatWsSingleListCol() { ColumnVector e_concat = ColumnVector.fromStrings("aaa", "b-c-d", "\u0480\u0481*asdfbe"); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, + ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, sep_col, separatorString, col_narep, false, false)) { assertColumnsAreEqual(e_concat, concat); } @@ -2295,7 +2295,7 @@ void testStringConcatWsSingleListColAllNulls() { ColumnVector e_concat = ColumnVector.fromStrings("aaa", ""); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, + ColumnVector 
concat = ColumnVector.stringConcatenateListElements(cv1, sep_col, separatorString, col_narep, false, false)) { assertColumnsAreEqual(e_concat, concat); } @@ -2310,7 +2310,7 @@ void testStringConcatWsSingleListColAllNullsSepTrue() { ColumnVector e_concat = ColumnVector.fromStrings("aaa", "--"); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, + ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, sep_col, separatorString, col_narep, true, false)) { assertColumnsAreEqual(e_concat, concat); } @@ -2325,7 +2325,7 @@ void testStringConcatWsSingleListColAllNullsKeepNulls() { ColumnVector e_concat = ColumnVector.fromStrings("aaa", null); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(null); - ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, + ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, sep_col, separatorString, col_narep, true, false)) { assertColumnsAreEqual(e_concat, concat); } @@ -2341,7 +2341,7 @@ void testStringConcatWsSingleListColEmptyArray() { Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); // set the parameter to return null on empty array - ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, + ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, sep_col, separatorString, col_narep, false, false)) { assertColumnsAreEqual(e_concat, concat); } @@ -2357,7 +2357,7 @@ void testStringConcatWsSingleListColEmptyArrayReturnEmpty() { Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); // set the parameter to return empty string on empty array - ColumnVector concat = ColumnVector.stringConcatenateListElementsWs(cv1, sep_col, + ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, sep_col, 
separatorString, col_narep, false, true)) { assertColumnsAreEqual(e_concat, concat); } From 963cb392214752c14e81a19d90bc7bda5321bcbd Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Wed, 19 May 2021 16:30:06 -0500 Subject: [PATCH 11/22] Add more tests --- .../java/ai/rapids/cudf/ColumnVector.java | 18 ++++---- java/src/main/native/src/ColumnVectorJni.cpp | 26 ++++++------ .../java/ai/rapids/cudf/ColumnVectorTest.java | 41 +++++++++++++++++++ 3 files changed, 63 insertions(+), 22 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index aba2831131b..8520d6a983e 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -639,7 +639,7 @@ public static ColumnVector stringConcatenateListElements(ColumnView list_column, assert separator_narep.getType().equals(DType.STRING) : "separator naprep scalar must be a string scalar"; assert col_narep.getType().equals(DType.STRING) : "column narep scalar must be a string scalar"; - return new ColumnVector(stringConcatenationListElements(list_column.getNativeView(), + return new ColumnVector(stringConcatenationListElementsSepCol(list_column.getNativeView(), sep_col.getNativeView(), separator_narep.getScalarHandle(), col_narep.getScalarHandle(), separate_nulls, empty_string_output_if_empty_list)); } @@ -668,7 +668,7 @@ public static ColumnVector stringConcatenateListElements(ColumnView list_column, assert narep != null : "column narep scalar provided may not be null"; assert narep.getType().equals(DType.STRING) : "narep scalar must be a string scalar"; - return new ColumnVector(stringConcatenationListElementsScalarSep(list_column.getNativeView(), + return new ColumnVector(stringConcatenationListElements(list_column.getNativeView(), separator.getScalarHandle(), narep.getScalarHandle(), separate_nulls, empty_string_output_if_empty_list)); } @@ -915,12 +915,12 @@ private static native long 
stringConcatenationSepCol(long[] columnViews, * will result in an empty string. Otherwise, it will result in a null. * @return native handle of the resulting cudf column, used to construct the Java column. */ - private static native long stringConcatenationListElements(long list_column, - long sep_column, - long separator_narep, - long col_narep, - boolean separate_nulls, - boolean empty_string_output_if_empty_list); + private static native long stringConcatenationListElementsSepCol(long list_column, + long sep_column, + long separator_narep, + long col_narep, + boolean separate_nulls, + boolean empty_string_output_if_empty_list); /** * Native method to concatenate a list column of strings (each row is a list of strings), @@ -941,7 +941,7 @@ private static native long stringConcatenationListElements(long list_column, * result in a null. * @return native handle of the resulting cudf column, used to construct the Java column. */ - private static native long stringConcatenationListElementsScalarSep(long list_column, + private static native long stringConcatenationListElements(long list_column, long separator, long narep, boolean separate_nulls, diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index 426fad16988..051ce5e9277 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -190,13 +190,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationSepC CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationListElements(JNIEnv *env, jclass, - jlong column_handle, - jlong sep_handle, - jlong separator_narep, - jlong col_narep, - jboolean separate_nulls, - jboolean empty_string_output_if_empty_list) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationListElementsSepCol(JNIEnv *env, jclass, + jlong column_handle, + jlong sep_handle, + jlong separator_narep, + jlong 
col_narep, + jboolean separate_nulls, + jboolean empty_string_output_if_empty_list) { JNI_NULL_CHECK(env, column_handle, "column handle is null", 0); JNI_NULL_CHECK(env, sep_handle, "separator column handle is null", 0); JNI_NULL_CHECK(env, separator_narep, "separator narep string scalar object is null", 0); @@ -223,12 +223,12 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationList CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationListElementsScalarSep(JNIEnv *env, jclass, - jlong column_handle, - jlong separator, - jlong narep, - jboolean separate_nulls, - jboolean empty_string_output_if_empty_list) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationListElements(JNIEnv *env, jclass, + jlong column_handle, + jlong separator, + jlong narep, + jboolean separate_nulls, + jboolean empty_string_output_if_empty_list) { JNI_NULL_CHECK(env, column_handle, "column handle is null", 0); JNI_NULL_CHECK(env, separator, "separator string scalar object is null", 0); JNI_NULL_CHECK(env, narep, "separator narep string scalar object is null", 0); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index aea7c5a7f81..2615de3f31c 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2109,6 +2109,19 @@ void testStringConcatSeparators() { } } + @Test + void testStringConcatWsSeparators() { + try (ColumnVector sv1 = ColumnVector.fromStrings("a", "B", "cd", "\u0480\u0481", "E\tf", null, null, "\\G\u0100"); + ColumnVector sv2 = ColumnVector.fromStrings("b", "C", "\u0500\u0501", "x\nYz", null, null, "", null); + ColumnVector e_concat = ColumnVector.fromStrings("aA1\t\ud721b", "BA1\t\ud721C", "cdA1\t\ud721\u0500\u0501", + "\u0480\u0481A1\t\ud721x\nYz", "E\tf", "", "", "\\G\u0100"); + Scalar separatorString = 
Scalar.fromString("A1\t\ud721"); + Scalar narep = Scalar.fromString(""); + ColumnVector concat = ColumnVector.stringConcatenate(separatorString, narep, new ColumnView[]{sv1, sv2}, false)) { + assertColumnsAreEqual(concat, e_concat); + } + } + @Test void testConcatWsTypeError() { try (ColumnVector v0 = ColumnVector.fromInts(1, 2, 3, 4); @@ -2286,6 +2299,20 @@ void testStringConcatWsSingleListCol() { } } + @Test + void testStringConcatWsSingleListColScalarSep() { + try (ColumnVector cv1 = ColumnVector.fromLists(new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("aaa"), Arrays.asList("b", "c", "d"), Arrays.asList("\u0480\u0481", null, "asdfbe", null)); + Scalar separatorString = Scalar.fromString("-"); + ColumnVector e_concat = ColumnVector.fromStrings("aaa", "b-c-d", "\u0480\u0481-asdfbe"); + Scalar narep = Scalar.fromString(""); + ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, separatorString, + narep, false, false)) { + assertColumnsAreEqual(e_concat, concat); + } + } + @Test void testStringConcatWsSingleListColAllNulls() { try (ColumnVector cv1 = ColumnVector.fromLists(new HostColumnVector.ListType(true, @@ -2301,6 +2328,20 @@ void testStringConcatWsSingleListColAllNulls() { } } + @Test + void testStringConcatWsSingleListColAllNullsScalarSep() { + try (ColumnVector cv1 = ColumnVector.fromLists(new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("aaa"), Arrays.asList(null, null, null)); + ColumnVector e_concat = ColumnVector.fromStrings("aaa", ""); + Scalar separatorString = Scalar.fromString("-"); + Scalar narep = Scalar.fromString(""); + ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, + separatorString, narep, false, false)) { + assertColumnsAreEqual(e_concat, concat); + } + } + @Test void testStringConcatWsSingleListColAllNullsSepTrue() { try (ColumnVector cv1 = ColumnVector.fromLists(new 
HostColumnVector.ListType(true, From 93fba2afda2d76bfe4bfe27678bd9b57c468ed17 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Wed, 19 May 2021 16:56:04 -0500 Subject: [PATCH 12/22] Update spacing Signed-off-by: Thomas Graves --- cpp/CMakeLists.txt | 2 +- cpp/include/cudf/strings/combine.hpp | 155 ++++-- cpp/include/cudf/strings/detail/combine.hpp | 4 +- cpp/src/io/csv/writer_impl.cu | 9 +- cpp/src/strings/combine/concatenate.cu | 173 +++--- .../combine/concatenate_list_elements.cu | 264 --------- cpp/tests/CMakeLists.txt | 2 +- .../concatenate_list_elements_tests.cpp | 511 ------------------ .../strings/combine/concatenate_tests.cpp | 118 +++- java/src/main/native/src/ColumnVectorJni.cpp | 14 +- .../java/ai/rapids/cudf/ColumnVectorTest.java | 2 +- python/cudf/cudf/_lib/cpp/strings/combine.pxd | 4 +- python/cudf/cudf/_lib/strings/combine.pyx | 6 +- 13 files changed, 311 insertions(+), 953 deletions(-) delete mode 100644 cpp/src/strings/combine/concatenate_list_elements.cu delete mode 100644 cpp/tests/strings/combine/concatenate_list_elements_tests.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index abad4d7bbca..2bc1aa5b4e3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -332,8 +332,8 @@ add_library(cudf src/strings/char_types/char_cases.cu src/strings/char_types/char_types.cu src/strings/combine/concatenate.cu - src/strings/combine/concatenate_list_elements.cu src/strings/combine/join.cu + src/strings/combine/join_list_elements.cu src/strings/contains.cu src/strings/convert/convert_booleans.cu src/strings/convert/convert_datetime.cu diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp index 6887ef0e670..e0d90a99c65 100644 --- a/cpp/include/cudf/strings/combine.hpp +++ b/cpp/include/cudf/strings/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,12 +30,30 @@ namespace strings { * @brief Strings APIs for concatenate and join */ +/** + * @brief Setting for specifying how separators are added with + * null strings elements. + */ +enum class separator_on_nulls { + YES, ///< Always add separators between elements + NO ///< Do not add separators if an element is null +}; + +/** + * @brief Setting for specifying what will be output from `join_list_elements` when an input list + * is empty. + */ +enum class output_if_empty_list { + EMPTY_STRING, ///< Empty list will result in empty string + NULL_ELEMENT ///< Empty list will result in a null +}; + /** * @brief Concatenates all strings in the column into one new string delimited * by an optional separator string. * * This returns a column with one string. Any null entries are ignored unless - * the narep parameter specifies a replacement string. + * the @p narep parameter specifies a replacement string. * * @code{.pseudo} * Example: @@ -70,11 +88,9 @@ std::unique_ptr join_strings( * * - If row separator for a given row is null, output column for that row is null, unless * there is a valid @p separator_narep - * - If all column values for a given row is null, output column for that row is null, unless - * there is a valid @p col_narep - * - null column values for a given row are skipped, if the column replacement isn't valid - * - The separator is only applied between two valid column values - * - If valid @p separator_narep and @p col_narep are provided, the output column is always + * - The separator is applied between two output row values if the @p separate_nulls + * is `YES` or only between valid rows if @p separate_nulls is `NO`. 
+ * - If @p separator_narep and @p col_narep are both valid, the output column is always * non nullable * * @code{.pseudo} @@ -83,16 +99,23 @@ std::unique_ptr join_strings( * c1 = [null, 'cc', 'dd', null, null, 'gg'] * c2 = ['bb', '', null, null, null, 'hh'] * sep = ['::', '%%', '^^', '!', '*', null] - * out0 = concatenate([c0, c1, c2], sep) - * out0 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, null] + * out = concatenate({c0, c1, c2}, sep) + * // all rows have at least one null or sep[i]==null + * out is [null, null, null, null, null, null] * * sep_rep = '+' - * out1 = concatenate([c0, c1, c2], sep, sep_rep) - * out1 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, 'ff+gg+hh'] - * - * col_rep = '-' - * out2 = concatenate([c0, c1, c2], sep, invalid_sep_rep, col_rep) - * out2 is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null] + * out = concatenate({c0, c1, c2}, sep, sep_rep) + * // all rows with at least one null output as null + * out is [null, null, null, null, null, 'ff+gg+hh'] + * + * col_narep = '-' + * out = concatenate({c0, c1, c2}, sep, col_narep) + * out is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null] + * + * col_narep = '' + * out = concatenate({c0, c1, c2}, sep, col_narep, separator_on_nulls:NO) + * // parameter suppresses separator for null rows + * out is ['aa::bb', 'cc%%', '^^dd', 'ee', '', null] * @endcode * * @throw cudf::logic_error if no input columns are specified - table view is empty @@ -108,6 +131,8 @@ std::unique_ptr join_strings( * @param col_narep String that should be used in place of any null strings * found in any column. Default of invalid-scalar means no null column value replacements. * Default is an invalid string. + * @param separate_nulls If YES, then the separator is included for null rows + * if `col_narep` is valid. * @param mr Resource for allocating device memory. * @return New column with concatenated results. 
*/ @@ -116,15 +141,9 @@ std::unique_ptr concatenate( strings_column_view const& separators, string_scalar const& separator_narep = string_scalar("", false), string_scalar const& col_narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @addtogroup strings_combine - * @{ - * @file strings/combine.hpp - * @brief Strings APIs for concatenate and join - */ - /** * @brief Row-wise concatenates the given list of strings columns and * returns a single strings column result. @@ -136,16 +155,22 @@ std::unique_ptr concatenate( * row to be null entry unless a narep string is specified to be used * in its place. * - * The number of strings in the columns provided must be the same. + * If @p separate_nulls is set to `NO` and @p narep is valid then + * separators are not added to the output between null elements. + * Otherwise, separators are always added if @p narep is valid. * * @code{.pseudo} * Example: - * s1 = ['aa', null, '', 'aa'] - * s2 = ['', 'bb', 'bb', null] - * r1 = concatenate([s1,s2]) - * r1 is ['aa', null, 'bb', null] - * r2 = concatenate([s1,s2],':','_') - * r2 is ['aa:', '_:bb', ':bb', 'aa:_'] + * s1 = ['aa', null, '', 'dd'] + * s2 = ['', 'bb', 'cc', null] + * out = concatenate({s1, s2}) + * out is ['aa', null, 'cc', null] + * + * out = concatenate({s1, s2}, ':', '_') + * out is ['aa:', '_:bb', ':cc', 'dd:_'] + * + * out = concatenate({s1, s2}, ':', '', separator_on_nulls::NO) + * out is ['aa:', 'bb', ':cc', 'dd'] * @endcode * * @throw cudf::logic_error if input columns are not all strings columns. @@ -157,6 +182,7 @@ std::unique_ptr concatenate( * @param narep String that should be used in place of any null strings * found in any column. Default of invalid-scalar means any null entry in any column will * produces a null result for that row. 
+ * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. */ @@ -164,6 +190,7 @@ std::unique_ptr concatenate( table_view const& strings_columns, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -171,24 +198,30 @@ std::unique_ptr concatenate( * within each row and returns a single strings column result. * * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the row separator provided in the `separators` strings column. + * delimited by the row separator provided in the @p separators strings column. * * A null list row will always result in a null string in the output row. Any non-null list row * having a null element will result in the corresponding output row to be null unless a valid - * `string_narep` scalar is provided to be used in its place. Any null row in the `separators` - * column will also result in a null output row unless a valid `separator_narep` scalar is provided + * @p string_narep scalar is provided to be used in its place. Any null row in the @p separators + * column will also result in a null output row unless a valid @p separator_narep scalar is provided * to be used in place of the null separators. * + * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the + * output between null elements. Otherwise, separators are always added if @p narep is valid. 
+ * * @code{.pseudo} * Example: - * s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff', 'gg'} ] + * s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff', 'gg'] ] * sep = ['::', '%%', '!', '*', null] * - * r1 = strings::concatenate_list_elements(s, sep) - * r1 is ['aa::bb::cc', null, '!dd', null, null] + * out = join_list_elements(s, sep) + * out is ['aa::bb::cc', null, '!dd', null, null] + * + * out = join_list_elements(s, sep, ':', '_') + * out is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg'] * - * r2 = strings::concatenate_list_elements(s, sep, ':', '_') - * r2 is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg'] + * out = join_list_elements(s, sep, ':', '', separator_on_nulls::NO) + * out is ['aa::bb::cc', null, '!dd', 'ee', 'ff:gg'] * @endcode * * @throw cudf::logic_error if input column is not lists of strings column. @@ -203,36 +236,47 @@ std::unique_ptr concatenate( * @param string_narep String that should be used to replace null strings in any non-null list row, * default is an invalid-scalar denoting that list rows containing null strings will result * in null string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. + * @param empty_list_policy if set to EMPTY_STRING, any input row that is an empty list will + * result in an empty string. Otherwise, it will result in a null. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column with concatenated results. 
*/ -std::unique_ptr concatenate_list_elements( +std::unique_ptr join_list_elements( const lists_column_view& lists_strings_column, const strings_column_view& separators, - string_scalar const& separator_narep = string_scalar("", false), - string_scalar const& string_narep = string_scalar("", false), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + string_scalar const& separator_narep = string_scalar("", false), + string_scalar const& string_narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, + output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Given a lists column of strings (each row is a list of strings), concatenates the strings * within each row and returns a single strings column result. * * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the separator provided. + * delimited by the @p separator provided. * * A null list row will always result in a null string in the output row. Any non-null list row - * having a null elenent will result in the corresponding output row to be null unless a narep - * string is specified to be used in its place. + * having a null elenent will result in the corresponding output row to be null unless a + * @p narep string is specified to be used in its place. + * + * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the + * output between null elements. Otherwise, separators are always added if @p narep is valid. 
* * @code{.pseudo} * Example: - * s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff'} ] + * s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff'] ] + * + * out = join_list_elements(s) + * out is ['aabbcc', null, 'dd', null, 'ff'] * - * r1 = strings::concatenate_list_elements(s) - * r1 is ['aabbcc', null, 'dd', null, 'ff'] + * out = join_list_elements(s, ':', '_') + * out is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff'] * - * r2 = strings::concatenate_list_elements(s, ':', '_') - * r2 is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff'] + * out = join_list_elements(s, ':', '', separator_on_nulls::NO) + * out is ['aa:bb:cc', null, ':dd', 'ee', 'ff'] * @endcode * * @throw cudf::logic_error if input column is not lists of strings column. @@ -244,14 +288,19 @@ std::unique_ptr concatenate_list_elements( * @param narep String that should be used to replace null strings in any non-null list row, default * is an invalid-scalar denoting that list rows containing null strings will result in null * string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. + * @param empty_list_policy if set to EMPTY_STRING, any input row that is an empty list will result + * in an empty string. Otherwise, it will result in a null. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column with concatenated results. 
*/ -std::unique_ptr concatenate_list_elements( +std::unique_ptr join_list_elements( const lists_column_view& lists_strings_column, - string_scalar const& separator = string_scalar(""), - string_scalar const& narep = string_scalar("", false), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + string_scalar const& separator = string_scalar(""), + string_scalar const& narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, + output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index 6e25a4dfa38..d6bdf398886 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -36,6 +37,7 @@ std::unique_ptr concatenate( table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index d2b6be5eead..68e365bbe60 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -404,11 +404,14 @@ void writer::impl::write(table_view const& table, auto str_table_view = str_table_ptr->view(); // concatenate columns in each row into one big string column - //(using null representation and delimiter): + // (using null representation and delimiter): // std::string delimiter_str{options_.get_inter_column_delimiter()}; - auto str_concat_col = cudf::strings::detail::concatenate( - str_table_view, delimiter_str, options_.get_na_rep(), stream); + auto str_concat_col = cudf::strings::detail::concatenate(str_table_view, + delimiter_str, + options_.get_na_rep(), + strings::separator_on_nulls::YES, + stream); write_chunked(str_concat_col->view(), metadata, stream); } diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index 5d7b9152ff3..d9c57a79045 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -41,55 +41,83 @@ namespace strings { namespace detail { namespace { -/** - * @brief Concatenate strings functor - * - * This will concatenate the strings from each row of the given table - * and apply the separator. The null-replacement string `d_narep` is - * used in place of any string in a row that contains a null entry. 
- */ -struct concat_strings_fn { +struct concat_strings_base { table_device_view const d_table; - string_view const d_separator; string_scalar_device_view const d_narep; + separator_on_nulls separate_nulls; offset_type* d_offsets{}; char* d_chars{}; - __device__ void operator()(size_type idx) + /** + * @brief Concatenate each table row to a single output string. + * + * This will concatenate the strings from each row of the given table + * and apply the separator. The null-replacement string `d_narep` is + * used in place of any string in a row that contains a null entry. + * + * @param idx The current row to process + * @param d_separator String to place in between each column's row + */ + __device__ void process_row(size_type idx, string_view const d_separator) { - bool const null_element = - thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { - return col.is_null(idx); - }); - // handle a null row - if (null_element && !d_narep.is_valid()) { + if (!d_narep.is_valid() && + thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { + return col.is_null(idx); + })) { if (!d_chars) d_offsets[idx] = 0; return; } - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - size_type bytes = 0; + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + offset_type bytes = 0; + bool write_separator = false; + for (auto itr = d_table.begin(); itr < d_table.end(); ++itr) { - auto const d_column = *itr; - auto const d_str = - d_column.is_null(idx) ? 
d_narep.value() : d_column.element(idx); - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); - bytes += d_str.size_bytes(); - // separator goes only in between elements - if (itr + 1 < d_table.end()) { + auto const d_column = *itr; + bool const null_element = d_column.is_null(idx); + + if (write_separator && (separate_nulls == separator_on_nulls::YES || !null_element)) { if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_separator); bytes += d_separator.size_bytes(); + write_separator = false; } + + // write out column's row data (or narep if the row is null) + auto const d_str = null_element ? d_narep.value() : d_column.element(idx); + if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); + bytes += d_str.size_bytes(); + + write_separator = + write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element; } + if (!d_chars) d_offsets[idx] = bytes; } }; +/** + * @brief Single separator concatenate functor + */ +struct concat_strings_fn : concat_strings_base { + string_view const d_separator; + + concat_strings_fn(table_device_view const& d_table, + string_view const& d_separator, + string_scalar_device_view const& d_narep, + separator_on_nulls separate_nulls) + : concat_strings_base{d_table, d_narep, separate_nulls}, d_separator(d_separator) + { + } + + __device__ void operator()(size_type idx) { process_row(idx, d_separator); } +}; + } // namespace std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -112,7 +140,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. 
auto d_table = table_device_view::create(strings_columns, stream); - concat_strings_fn fn{*d_table, d_separator, d_narep}; + concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls}; auto children = make_strings_children(fn, strings_count, stream, mr); // create resulting null mask @@ -120,9 +148,9 @@ std::unique_ptr concatenate(table_view const& strings_columns, thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), [d_table = *d_table, d_narep] __device__(size_type idx) { - bool null_element = thrust::any_of( + if (d_narep.is_valid()) return true; + return !thrust::any_of( thrust::seq, d_table.begin(), d_table.end(), [idx](auto col) { return col.is_null(idx); }); - return (!null_element || d_narep.is_valid()); }, stream, mr); @@ -145,68 +173,42 @@ namespace { * when a separator row is null `d_separator_narep`. The `d_narep` is * used in place of a null entry in the strings columns. */ -struct multi_separator_concat_fn { - table_device_view const d_table; +struct multi_separator_concat_fn : concat_strings_base { column_device_view const d_separators; string_scalar_device_view const d_separator_narep; - string_scalar_device_view const d_narep; - offset_type* d_offsets{}; - char* d_chars{}; - __device__ void operator()(size_type idx) + multi_separator_concat_fn(table_device_view const& d_table, + column_device_view const& d_separators, + string_scalar_device_view const& d_separator_narep, + string_scalar_device_view const& d_narep, + separator_on_nulls separate_nulls) + : concat_strings_base{d_table, d_narep, separate_nulls}, + d_separators(d_separators), + d_separator_narep(d_separator_narep) { - bool const all_nulls = - thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { - return col.is_null(idx); - }); + } - if ((d_separators.is_null(idx) && !d_separator_narep.is_valid()) || - (all_nulls && !d_narep.is_valid())) { + __device__ void operator()(size_type idx) + { + if (d_separators.is_null(idx) 
&& !d_separator_narep.is_valid()) { if (!d_chars) d_offsets[idx] = 0; return; } - // point to output location - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - offset_type bytes = 0; - - // there is at least one non-null column value auto const d_separator = d_separators.is_valid(idx) ? d_separators.element(idx) : d_separator_narep.value(); - auto const d_null_rep = d_narep.is_valid() ? d_narep.value() : string_view{}; - - // write output entry for this row - bool colval_written = false; // state variable for writing separators - for (auto const d_column : d_table) { - // if the row is null and if there is no replacement, skip it - if (d_column.is_null(idx) && !d_narep.is_valid()) continue; - - // separator in this row is written only after the first output - if (colval_written) { - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_separator); - bytes += d_separator.size_bytes(); - } - - // write out column's row data (or narep if the row is null) - string_view const d_str = - d_column.is_null(idx) ? d_null_rep : d_column.element(idx); - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); - bytes += d_str.size_bytes(); - - // column's string or narep could by empty so we need this flag - // to know we got this far even if no actual bytes were copied - colval_written = true; // use the separator before the next column - } - - if (!d_chars) d_offsets[idx] = bytes; + // base class utility function handles the rest + process_row(idx, d_separator); } }; + } // namespace std::unique_ptr concatenate(table_view const& strings_columns, strings_column_view const& separators, string_scalar const& separator_narep, string_scalar const& col_narep, + separator_on_nulls separate_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -234,20 +236,19 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. 
auto d_table = table_device_view::create(strings_columns, stream); - multi_separator_concat_fn mscf{*d_table, separator_col_view, separator_rep, col_rep}; + multi_separator_concat_fn mscf{ + *d_table, separator_col_view, separator_rep, col_rep, separate_nulls}; auto children = make_strings_children(mscf, strings_count, stream, mr); // Create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), - [d_table = *d_table, separator_col_view, separator_rep, col_rep] __device__(size_type ridx) { - if (!separator_col_view.is_valid(ridx) && !separator_rep.is_valid()) return false; - bool all_nulls = - thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [ridx](auto const& col) { - return col.is_null(ridx); - }); - return all_nulls ? col_rep.is_valid() : true; + [d_table = *d_table, separator_col_view, separator_rep, col_rep] __device__(size_type idx) { + if (!separator_col_view.is_valid(idx) && !separator_rep.is_valid()) return false; + if (col_rep.is_valid()) return true; + return !thrust::any_of( + thrust::seq, d_table.begin(), d_table.end(), [idx](auto col) { return col.is_null(idx); }); }, stream, mr); @@ -268,21 +269,29 @@ std::unique_ptr concatenate(table_view const& strings_columns, std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(strings_columns, separator, narep, rmm::cuda_stream_default, mr); + return detail::concatenate( + strings_columns, separator, narep, separate_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr concatenate(table_view const& strings_columns, strings_column_view const& separators, string_scalar const& separator_narep, string_scalar const& col_narep, + separator_on_nulls separate_nulls, rmm::mr::device_memory_resource* mr) { 
CUDF_FUNC_RANGE(); - return detail::concatenate( - strings_columns, separators, separator_narep, col_narep, rmm::cuda_stream_default, mr); + return detail::concatenate(strings_columns, + separators, + separator_narep, + col_narep, + separate_nulls, + rmm::cuda_stream_default, + mr); } } // namespace strings diff --git a/cpp/src/strings/combine/concatenate_list_elements.cu b/cpp/src/strings/combine/concatenate_list_elements.cu deleted file mode 100644 index 1157b8f3fce..00000000000 --- a/cpp/src/strings/combine/concatenate_list_elements.cu +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace cudf { -namespace strings { -namespace detail { - -namespace { -/** - * @brief Compute string sizes, string validities, and concatenate strings functor. - * - * This functor is executed twice. In the first pass, the sizes and validities of the output strings - * will be computed. In the second pass, this will concatenate the strings within each list element - * of the given lists column and apply the separator. The null-replacement string scalar - * `string_narep_dv` (if valid) is used in place of any null string. 
- * - * @tparam Functor The functor which can check for validity of the input list at a given list index - * as well as access to the separator corresponding to the list index. - */ -template -struct compute_size_and_concatenate_fn { - Functor const func; - column_device_view const lists_dv; - offset_type const* const list_offsets; - column_device_view const strings_dv; - string_scalar_device_view const string_narep_dv; - - offset_type* d_offsets{nullptr}; - - // If d_chars == nullptr: only compute sizes and validities of the output strings. - // If d_chars != nullptr: only concatenate strings. - char* d_chars{nullptr}; - - // We need to set `1` or `0` for the validities of the output strings. - int8_t* d_validities{nullptr}; - - __device__ void operator()(size_type const idx) - { - // If this is the second pass, and the row `idx` is known to be a null string - if (d_chars and not d_validities[idx]) { return; } - - if (not d_chars and func.is_null_list(lists_dv, idx)) { - d_offsets[idx] = 0; - d_validities[idx] = false; - return; - } - - auto const separator = func.separator(idx); - auto const separator_size = separator.size_bytes(); - auto size_bytes = size_type{0}; - bool written = false; - char* output_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; - - for (size_type str_idx = list_offsets[idx], idx_end = list_offsets[idx + 1]; str_idx < idx_end; - ++str_idx) { - if (not d_chars and (strings_dv.is_null(str_idx) and not string_narep_dv.is_valid())) { - d_offsets[idx] = 0; - d_validities[idx] = false; - return; // early termination: the entire list of strings will result in a null string - } - auto const d_str = strings_dv.is_null(str_idx) ? 
string_narep_dv.value() - : strings_dv.element(str_idx); - size_bytes += separator_size + d_str.size_bytes(); - if (output_ptr) { - // Separator is inserted only in between strings - if (written) { output_ptr = detail::copy_string(output_ptr, separator); } - output_ptr = detail::copy_string(output_ptr, d_str); - written = true; - } - } - - // Separator is inserted only in between strings - if (not d_chars) { - d_offsets[idx] = static_cast(size_bytes - separator_size); - d_validities[idx] = true; - } - } -}; - -/** - * @brief Functor accompanying with `compute_size_and_concatenate_fn` for computing output string - * sizes, output string validities, and concatenating strings within list elements; used when the - * separator is a string scalar. - */ -struct scalar_separator_fn { - string_scalar_device_view const d_separator; - - __device__ bool is_null_list(column_device_view const& lists_dv, size_type const idx) const - noexcept - { - return lists_dv.is_null(idx); - } - - __device__ string_view separator(size_type const) const noexcept { return d_separator.value(); } -}; - -} // namespace - -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - string_scalar const& separator, - string_scalar const& narep, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, - "The input column must be a column of lists of strings"); - CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be a valid string_scalar"); - - auto const num_rows = lists_strings_column.size(); - if (num_rows == 0) { return detail::make_empty_strings_column(stream, mr); } - - // Accessing the child strings column of the lists column must be done by calling `child()` on the - // lists column, not `get_sliced_child()`. 
This is because calling to `offsets_begin()` on the - // lists column returns a pointer to the offsets of the original lists column, which may not start - // from `0`. - auto const strings_col = strings_column_view(lists_strings_column.child()); - auto const lists_dv_ptr = column_device_view::create(lists_strings_column.parent(), stream); - auto const strings_dv_ptr = column_device_view::create(strings_col.parent(), stream); - auto const sep_dv = get_scalar_device_view(const_cast(separator)); - auto const string_narep_dv = get_scalar_device_view(const_cast(narep)); - - auto const func = scalar_separator_fn{sep_dv}; - auto const comp_fn = compute_size_and_concatenate_fn{ - func, - *lists_dv_ptr, - lists_strings_column.offsets_begin(), - *strings_dv_ptr, - string_narep_dv, - }; - auto [offsets_column, chars_column, null_mask, null_count] = - make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); - - return make_strings_column(num_rows, - std::move(offsets_column), - std::move(chars_column), - null_count, - std::move(null_mask), - stream, - mr); -} - -namespace { -/** - * @brief Functor accompanying with `compute_size_and_concatenate_fn` for computing output string - * sizes, output string validities, and concatenating strings within list elements; used when the - * separators are given as a strings column. - */ -struct column_separators_fn { - column_device_view const separators_dv; - string_scalar_device_view const sep_narep_dv; - - __device__ bool is_null_list(column_device_view const& lists_dv, size_type const idx) const - noexcept - { - return lists_dv.is_null(idx) or (separators_dv.is_null(idx) and not sep_narep_dv.is_valid()); - } - - __device__ string_view separator(size_type const idx) const noexcept - { - return separators_dv.is_valid(idx) ? 
separators_dv.element(idx) - : sep_narep_dv.value(); - } -}; - -} // namespace - -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - strings_column_view const& separators, - string_scalar const& separator_narep, - string_scalar const& string_narep, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, - "The input column must be a column of lists of strings"); - CUDF_EXPECTS(lists_strings_column.size() == separators.size(), - "Separators column should be the same size as the lists columns"); - - auto const num_rows = lists_strings_column.size(); - if (num_rows == 0) { return detail::make_empty_strings_column(stream, mr); } - - // Accessing the child strings column of the lists column must be done by calling `child()` on the - // lists column, not `get_sliced_child()`. This is because calling to `offsets_begin()` on the - // lists column returns a pointer to the offsets of the original lists column, which may not start - // from `0`. 
- auto const strings_col = strings_column_view(lists_strings_column.child()); - auto const lists_dv_ptr = column_device_view::create(lists_strings_column.parent(), stream); - auto const strings_dv_ptr = column_device_view::create(strings_col.parent(), stream); - auto const string_narep_dv = get_scalar_device_view(const_cast(string_narep)); - auto const sep_dv_ptr = column_device_view::create(separators.parent(), stream); - auto const sep_narep_dv = get_scalar_device_view(const_cast(separator_narep)); - - auto const func = column_separators_fn{*sep_dv_ptr, sep_narep_dv}; - auto const comp_fn = compute_size_and_concatenate_fn{ - func, - *lists_dv_ptr, - lists_strings_column.offsets_begin(), - *strings_dv_ptr, - string_narep_dv, - }; - auto [offsets_column, chars_column, null_mask, null_count] = - make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); - - return make_strings_column(num_rows, - std::move(offsets_column), - std::move(chars_column), - null_count, - std::move(null_mask), - stream, - mr); -} - -} // namespace detail - -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - string_scalar const& separator, - string_scalar const& narep, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements( - lists_strings_column, separator, narep, rmm::cuda_stream_default, mr); -} - -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - strings_column_view const& separators, - string_scalar const& separator_narep, - string_scalar const& string_narep, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements( - lists_strings_column, separators, separator_narep, string_narep, rmm::cuda_stream_default, mr); -} - -} // namespace strings -} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d87b4b81bdc..ce80e4bf064 100644 --- 
a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -328,8 +328,8 @@ ConfigureTest(STRINGS_TEST strings/booleans_tests.cpp strings/case_tests.cpp strings/chars_types_tests.cpp - strings/combine/concatenate_list_elements_tests.cpp strings/combine/concatenate_tests.cpp + strings/combine/join_list_elements_tests.cpp strings/combine/join_strings_tests.cpp strings/concatenate_tests.cpp strings/contains_tests.cpp diff --git a/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp b/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp deleted file mode 100644 index b6afd588dfb..00000000000 --- a/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp +++ /dev/null @@ -1,511 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -#include -#include -#include - -#include - -struct StringsListsConcatenateTest : public cudf::test::BaseFixture { -}; - -namespace { -using STR_LISTS = cudf::test::lists_column_wrapper; -using STR_COL = cudf::test::strings_column_wrapper; -using INT_LISTS = cudf::test::lists_column_wrapper; - -constexpr bool print_all{false}; - -auto null_at(cudf::size_type idx) -{ - return cudf::detail::make_counting_transform_iterator(0, [idx](auto i) { return i != idx; }); -} - -auto all_nulls() -{ - return cudf::detail::make_counting_transform_iterator(0, [](auto) { return false; }); -} - -auto nulls_from_nullptr(std::vector const& strs) -{ - return thrust::make_transform_iterator(strs.begin(), [](auto ptr) { return ptr != nullptr; }); -} - -} // namespace - -TEST_F(StringsListsConcatenateTest, InvalidInput) -{ - // Invalid list type - { - auto const string_lists = INT_LISTS{{1, 2, 3}, {4, 5, 6}}.release(); - auto const string_lv = cudf::lists_column_view(string_lists->view()); - EXPECT_THROW(cudf::strings::concatenate_list_elements(string_lv), cudf::logic_error); - } - - // Invalid scalar separator - { - auto const string_lists = - STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); - auto const string_lv = cudf::lists_column_view(string_lists->view()); - EXPECT_THROW( - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("", false)), - cudf::logic_error); - } - - // Invalid column separators - { - auto const string_lists = - STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); - auto const string_lv = cudf::lists_column_view(string_lists->view()); - auto const separators = STR_COL{"+++"}.release(); // size doesn't match with lists column size - EXPECT_THROW(cudf::strings::concatenate_list_elements(string_lv, separators->view()), - cudf::logic_error); - } -} - -TEST_F(StringsListsConcatenateTest, EmptyInput) -{ - auto const string_lists = 
STR_LISTS{}.release(); - auto const string_lv = cudf::lists_column_view(string_lists->view()); - auto const expected = STR_COL{}; - auto results = cudf::strings::concatenate_list_elements(string_lv); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - - auto const separators = STR_COL{}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); -} - -TEST_F(StringsListsConcatenateTest, ZeroSizeStringsInput) -{ - auto const string_lists = - STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); - auto const string_lv = cudf::lists_column_view(string_lists->view()); - auto const expected = STR_COL{"", "", ""}; - - auto results = cudf::strings::concatenate_list_elements(string_lv); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - - auto const separators = STR_COL{"", "", ""}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); -} - -TEST_F(StringsListsConcatenateTest, AllNullsStringsInput) -{ - auto const string_lists = STR_LISTS{ - STR_LISTS{{""}, all_nulls()}, - STR_LISTS{{"", "", ""}, all_nulls()}, - STR_LISTS{{"", ""}, - all_nulls()}}.release(); - auto const string_lv = cudf::lists_column_view(string_lists->view()); - auto const expected = STR_COL{{"", "", ""}, all_nulls()}; - - auto results = cudf::strings::concatenate_list_elements(string_lv); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - - auto const separators = STR_COL{{"", "", ""}, all_nulls()}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); -} - -TEST_F(StringsListsConcatenateTest, ScalarSeparator) -{ - auto const string_lists = STR_LISTS{{STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, - STR_LISTS{}, /*NULL*/ 
- STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, - STR_LISTS{"zzz", "xxxxx"}}, - null_at(1)} - .release(); - auto const string_lv = cudf::lists_column_view(string_lists->view()); - - // No null replacement - { - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); - std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // With null replacement - { - auto const results = cudf::strings::concatenate_list_elements( - string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); - std::vector h_expected{ - "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } -} - -TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) -{ - auto const string_lists = STR_LISTS{ - {STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, - STR_LISTS{}, /*NULL*/ - STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, - STR_LISTS{"zzz", "xxxxx"}, - STR_LISTS{"11111", "11111", "11111", "11111", "11111"}, /*NULL*/ - STR_LISTS{{"abcdef", "012345", "" /*NULL*/, "xxx000"}, null_at(2)}, - STR_LISTS{{"xyz" /*NULL*/, "11111", "00000"}, null_at(0)}, - STR_LISTS{"0a0b0c", "5x5y5z"}, - STR_LISTS{"xxx"}, /*NULL*/ - STR_LISTS{"ééé", "12345abcdef"}, - STR_LISTS{"aaaééébbbéééccc", "12345"}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i != 1 && i != 4 && i != 8; - })}.release(); - - // Sliced the entire lists column, no null replacement - { - auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, 
cudf::string_scalar("+++")); - std::vector h_expected{nullptr, - nullptr, - nullptr, - "zzz+++xxxxx", - nullptr, - nullptr, - nullptr, - "0a0b0c+++5x5y5z", - nullptr, - "ééé+++12345abcdef", - "aaaééébbbéééccc+++12345"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // Sliced the entire lists column, with null replacement - { - auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( - string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); - std::vector h_expected{"a+++___+++ccc", - nullptr, - "___+++efgh+++ijk", - "zzz+++xxxxx", - nullptr, - "abcdef+++012345+++___+++xxx000", - "___+++11111+++00000", - "0a0b0c+++5x5y5z", - nullptr, - "ééé+++12345abcdef", - "aaaééébbbéééccc+++12345"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // Sliced the first half of the lists column, no null replacement - { - auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); - std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // Sliced the first half of the lists column, with null replacement - { - auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements( - string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); - std::vector h_expected{ - "a+++___+++ccc", nullptr, 
"___+++efgh+++ijk", "zzz+++xxxxx"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // Sliced the second half of the lists column, no null replacement - { - auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); - std::vector h_expected{ - nullptr, nullptr, "0a0b0c+++5x5y5z", nullptr, "ééé+++12345abcdef", "aaaééébbbéééccc+++12345"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // Sliced the second half of the lists column, with null replacement - { - auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( - string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); - std::vector h_expected{"abcdef+++012345+++___+++xxx000", - "___+++11111+++00000", - "0a0b0c+++5x5y5z", - nullptr, - "ééé+++12345abcdef", - "aaaééébbbéééccc+++12345"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // Sliced the middle part of the lists column, no null replacement - { - auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); - std::vector h_expected{ - "zzz+++xxxxx", nullptr, nullptr, nullptr, "0a0b0c+++5x5y5z"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // Sliced the 
middle part of the lists column, with null replacement - { - auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements( - string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); - std::vector h_expected{"zzz+++xxxxx", - nullptr, - "abcdef+++012345+++___+++xxx000", - "___+++11111+++00000", - "0a0b0c+++5x5y5z"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } -} - -TEST_F(StringsListsConcatenateTest, ColumnSeparators) -{ - auto const string_lists = STR_LISTS{{STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, - STR_LISTS{}, /*NULL*/ - STR_LISTS{"0a0b0c", "xyzééé"}, - STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, - STR_LISTS{{"ééé" /*NULL*/, "ááá", "ííí"}, null_at(0)}, - STR_LISTS{"zzz", "xxxxx"}}, - null_at(1)} - .release(); - auto const string_lv = cudf::lists_column_view(string_lists->view()); - auto const separators = STR_COL{ - {"+++", "***", "!!!" 
/*NULL*/, "$$$" /*NULL*/, "%%%", "^^^"}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i != 2 && i != 3; - })}.release(); - - // No null replacement - { - auto const results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); - std::vector h_expected{nullptr, nullptr, nullptr, nullptr, nullptr, "zzz^^^xxxxx"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // With null replacement for separators - { - auto const results = cudf::strings::concatenate_list_elements( - string_lv, separators->view(), cudf::string_scalar("|||")); - std::vector h_expected{ - nullptr, nullptr, "0a0b0c|||xyzééé", nullptr, nullptr, "zzz^^^xxxxx"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // With null replacement for strings - { - auto const results = cudf::strings::concatenate_list_elements( - string_lv, separators->view(), cudf::string_scalar("", false), cudf::string_scalar("XXXXX")); - std::vector h_expected{ - "a+++XXXXX+++ccc", nullptr, nullptr, nullptr, "XXXXX%%%ááá%%%ííí", "zzz^^^xxxxx"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // With null replacement for both separators and strings - { - auto const results = cudf::strings::concatenate_list_elements( - string_lv, separators->view(), cudf::string_scalar("|||"), cudf::string_scalar("XXXXX")); - std::vector h_expected{"a+++XXXXX+++ccc", - nullptr, - "0a0b0c|||xyzééé", - "XXXXX|||efgh|||ijk", - "XXXXX%%%ááá%%%ííí", - "zzz^^^xxxxx"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, 
print_all); - } -} - -TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) -{ - auto const string_lists = STR_LISTS{ - {STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, - STR_LISTS{}, /*NULL*/ - STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, - STR_LISTS{"zzz", "xxxxx"}, - STR_LISTS{"11111", "11111", "11111", "11111", "11111"}, /*NULL*/ - STR_LISTS{{"abcdef", "012345", "" /*NULL*/, "xxx000"}, null_at(2)}, - STR_LISTS{{"xyz" /*NULL*/, "11111", "00000"}, null_at(0)}, - STR_LISTS{"0a0b0c", "5x5y5z"}, - STR_LISTS{"xxx"}, /*NULL*/ - STR_LISTS{"ééé", "12345abcdef"}, - STR_LISTS{"aaaééébbbéééccc", "12345"}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i != 1 && i != 4 && i != 8; - })}.release(); - auto const separators = STR_COL{ - {"+++", "***", "!!!" /*NULL*/, "$$$" /*NULL*/, "%%%", "^^^", "~!~", "###", "&&&", "-+-", "=+="}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i != 2 && i != 3; - })}.release(); - - // Sliced the entire lists column, no null replacement - { - auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); - auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); - std::vector h_expected{nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - "0a0b0c###5x5y5z", - nullptr, - "ééé-+-12345abcdef", - "aaaééébbbéééccc=+=12345"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // Sliced the entire lists column, with null replacements - { - auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); - auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 11})[0]); - auto const results = 
cudf::strings::concatenate_list_elements( - string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); - std::vector h_expected{"a+++___+++ccc", - nullptr, - "___|||efgh|||ijk", - "zzz|||xxxxx", - nullptr, - "abcdef^^^012345^^^___^^^xxx000", - "___~!~11111~!~00000", - "0a0b0c###5x5y5z", - nullptr, - "ééé-+-12345abcdef", - "aaaééébbbéééccc=+=12345"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // Sliced the first half of the lists column, no null replacement - { - auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); - auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); - std::vector h_expected{nullptr, nullptr, nullptr, nullptr}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // Sliced the first half of the lists column, with null replacements - { - auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); - auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements( - string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); - std::vector h_expected{ - "a+++___+++ccc", nullptr, "___|||efgh|||ijk", "zzz|||xxxxx"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // Sliced the second half of the lists column, no null replacement - { - auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); - auto const sep_col = 
cudf::strings_column_view(cudf::slice(separators->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); - std::vector h_expected{ - nullptr, nullptr, "0a0b0c###5x5y5z", nullptr, "ééé-+-12345abcdef", "aaaééébbbéééccc=+=12345"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // Sliced the second half of the lists column, with null replacements - { - auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); - auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( - string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); - std::vector h_expected{"abcdef^^^012345^^^___^^^xxx000", - "___~!~11111~!~00000", - "0a0b0c###5x5y5z", - nullptr, - "ééé-+-12345abcdef", - "aaaééébbbéééccc=+=12345"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // Sliced the middle part of the lists column, no null replacement - { - auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); - auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); - std::vector h_expected{nullptr, nullptr, nullptr, nullptr, "0a0b0c###5x5y5z"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } - - // Sliced the middle part of the lists column, with null replacements - { - auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); - auto 
const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements( - string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); - std::vector h_expected{"zzz|||xxxxx", - nullptr, - "abcdef^^^012345^^^___^^^xxx000", - "___~!~11111~!~00000", - "0a0b0c###5x5y5z"}; - auto const expected = - STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - } -} diff --git a/cpp/tests/strings/combine/concatenate_tests.cpp b/cpp/tests/strings/combine/concatenate_tests.cpp index c1c390e8a82..3291f066cac 100644 --- a/cpp/tests/strings/combine/concatenate_tests.cpp +++ b/cpp/tests/strings/combine/concatenate_tests.cpp @@ -95,6 +95,58 @@ TEST_F(StringsCombineTest, Concatenate) } } +TEST_F(StringsCombineTest, ConcatenateSkipNulls) +{ + cudf::test::strings_column_wrapper strings1({"eee", "", "", "", "aa", "bbb", "ééé"}, + {1, 0, 0, 1, 1, 1, 1}); + cudf::test::strings_column_wrapper strings2({"xyz", "", "d", "éa", "", "", "f"}, + {1, 0, 1, 1, 1, 0, 1}); + cudf::test::strings_column_wrapper strings3({"q", "", "s", "t", "u", "", "w"}, + {1, 1, 1, 1, 1, 0, 1}); + + cudf::table_view table({strings1, strings2, strings3}); + + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "++", "+d+s", "+éa+t", "aa++u", "bbb++", "ééé+f+w"}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::YES); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "d+s", "+éa+t", "aa++u", "bbb", "ééé+f+w"}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + 
cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "", "+éa+t", "aa++u", "", "ééé+f+w"}, {1, 0, 0, 1, 1, 0, 1}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar("", false), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper sep_col({"+", "-", ".", "@", "*", "^^", "#"}); + auto results = cudf::strings::concatenate(table, + cudf::strings_column_view(sep_col), + cudf::string_scalar(""), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "d.s", "@éa@t", "aa**u", "bbb", "ééé#f#w"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } +} + TEST_F(StringsCombineTest, ConcatZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( @@ -295,12 +347,20 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnEmptyAndNullStringsNoR auto sep_col = cudf::test::strings_column_wrapper( {"", "", "", "", "", "", "", ""}, {true, false, true, false, true, false, true, false}); - auto exp_results = cudf::test::strings_column_wrapper( - {"", "", "", "", "", "", "", ""}, {false, false, true, false, true, false, true, false}); - + auto exp_results1 = cudf::test::strings_column_wrapper( + {"", "", "", "", "", "", "", ""}, {false, false, true, false, false, false, false, false}); auto results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"", "", "", "", "", "", "", ""}, {true, false, true, false, true, false, true, false}); + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + cudf::string_scalar("", false), + 
cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixNoReplacements) @@ -315,13 +375,23 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixNoReplacement {"", "~~~", "", "@", "", "", "", "^^^^", "", "--", "*****", "######"}, {true, true, false, true, false, true, false, true, true, true, true, true}); - auto exp_results = cudf::test::strings_column_wrapper( - {"eeexyzfoo", "~~~", "", "éééf", "", "", "", "valid", "doo", "", "", ""}, - {true, true, false, true, false, true, false, true, true, false, false, false}); + auto exp_results1 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "", "", "", "", "", "", "", "", "", ""}, + {true, true, false, false, false, false, false, false, false, false, false, false}); auto results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "", "éééf", "", "", "", "valid", "doo", "", "", ""}, + {true, true, false, true, false, true, false, true, true, true, true, true}); + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + cudf::string_scalar("", false), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixSeparatorReplacement) @@ -335,26 +405,26 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixSeparatorRepl auto sep_col = cudf::test::strings_column_wrapper( {"", "~~~", "", "@", "", "", "", "^^^^", "", "--", "*****", "######"}, {true, true, false, true, false, true, false, true, true, 
true, true, true}); - auto sep_rep = cudf::string_scalar("!!!!!!!!!!"); + auto sep_rep = cudf::string_scalar("!!!!!!!"); - auto exp_results = cudf::test::strings_column_wrapper( - {"eeexyzfoo", - "~~~", - "!!!!!!!!!!éaff", - "éééf", - "éa", - "", - "éaff", - "valid", - "doo", - "", - "", - ""}, - {true, true, true, true, true, true, true, true, true, false, false, false}); + auto exp_results1 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "!!!!!!!éaff", "éééf", "éa", "", "éaff", "valid", "doo", "", "", ""}, + {true, true, true, false, false, false, false, false, false, false, false, false}); auto results = cudf::strings::concatenate( cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col), sep_rep); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "!!!!!!!éaff", "éééf", "éa", "", "éaff", "valid", "doo", "", "", ""}, + {true, true, true, true, true, true, true, true, true, true, true, true}); + + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + sep_rep, + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixColumnReplacement) diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index 051ce5e9277..c39175bceac 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -157,11 +157,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenation(JNI } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationSepCol(JNIEnv *env, jclass, - jlongArray column_handles, - jlong sep_handle, - jlong separator_narep, - jlong col_narep, - jboolean 
separate_nulls) { + jlongArray column_handles, + jlong sep_handle, + jlong separator_narep, + jlong col_narep, + jboolean separate_nulls) { JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0); JNI_NULL_CHECK(env, sep_handle, "separator column handle is null", 0); JNI_NULL_CHECK(env, separator_narep, "separator narep string scalar object is null", 0); @@ -169,7 +169,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationSepC try { cudf::jni::auto_set_device(env); const auto& separator_narep_scalar = *reinterpret_cast(separator_narep); - const auto& col_narep_scalar = *reinterpret_cast(col_narep); + const auto& col_narep_scalar = *reinterpret_cast(col_narep); auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES : cudf::strings::separator_on_nulls::NO; @@ -204,7 +204,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationList try { cudf::jni::auto_set_device(env); const auto& separator_narep_scalar = *reinterpret_cast(separator_narep); - const auto& col_narep_scalar = *reinterpret_cast(col_narep); + const auto& col_narep_scalar = *reinterpret_cast(col_narep); auto null_policy = separate_nulls ? 
cudf::strings::separator_on_nulls::YES : cudf::strings::separator_on_nulls::NO; auto empty_list_output = diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 2615de3f31c..1371d43195c 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2110,7 +2110,7 @@ void testStringConcatSeparators() { } @Test - void testStringConcatWsSeparators() { + void testStringConcatSeparatorsEmptyStringForNull() { try (ColumnVector sv1 = ColumnVector.fromStrings("a", "B", "cd", "\u0480\u0481", "E\tf", null, null, "\\G\u0100"); ColumnVector sv2 = ColumnVector.fromStrings("b", "C", "\u0500\u0501", "x\nYz", null, null, "", null); ColumnVector e_concat = ColumnVector.fromStrings("aA1\t\ud721b", "BA1\t\ud721C", "cdA1\t\ud721\u0500\u0501", diff --git a/python/cudf/cudf/_lib/cpp/strings/combine.pxd b/python/cudf/cudf/_lib/cpp/strings/combine.pxd index 250c6441882..51c706b68d0 100644 --- a/python/cudf/cudf/_lib/cpp/strings/combine.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/combine.pxd @@ -18,13 +18,13 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: string_scalar separator, string_scalar narep) except + - cdef unique_ptr[column] concatenate_list_elements( + cdef unique_ptr[column] join_list_elements( column_view lists_strings_column, column_view separators, string_scalar separator_narep, string_scalar string_narep) except + - cdef unique_ptr[column] concatenate_list_elements( + cdef unique_ptr[column] join_list_elements( column_view lists_strings_column, string_scalar separator, string_scalar narep) except + diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 25619de3ed0..0d7dfb5c619 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -16,7 +16,7 @@ from cudf._lib.table cimport Table from 
cudf._lib.cpp.strings.combine cimport ( concatenate as cpp_concatenate, join_strings as cpp_join_strings, - concatenate_list_elements as cpp_concatenate_list_elements + join_list_elements as cpp_join_list_elements ) @@ -105,7 +105,7 @@ def join_lists_with_scalar( ) with nogil: - c_result = move(cpp_concatenate_list_elements( + c_result = move(cpp_join_list_elements( source_view, scalar_separator[0], scalar_narep[0] @@ -142,7 +142,7 @@ def join_lists_with_column( ) with nogil: - c_result = move(cpp_concatenate_list_elements( + c_result = move(cpp_join_list_elements( source_view, separator_view, scalar_separator_narep[0], From f3c18a8fe1555410033e02a7d6c424a420129a98 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Wed, 19 May 2021 16:57:59 -0500 Subject: [PATCH 13/22] Revert "Update spacing" This reverts commit 93fba2afda2d76bfe4bfe27678bd9b57c468ed17. --- cpp/CMakeLists.txt | 2 +- cpp/include/cudf/strings/combine.hpp | 155 ++---- cpp/include/cudf/strings/detail/combine.hpp | 4 +- cpp/src/io/csv/writer_impl.cu | 9 +- cpp/src/strings/combine/concatenate.cu | 173 +++--- .../combine/concatenate_list_elements.cu | 264 +++++++++ cpp/tests/CMakeLists.txt | 2 +- .../concatenate_list_elements_tests.cpp | 511 ++++++++++++++++++ .../strings/combine/concatenate_tests.cpp | 118 +--- java/src/main/native/src/ColumnVectorJni.cpp | 14 +- .../java/ai/rapids/cudf/ColumnVectorTest.java | 2 +- python/cudf/cudf/_lib/cpp/strings/combine.pxd | 4 +- python/cudf/cudf/_lib/strings/combine.pyx | 6 +- 13 files changed, 953 insertions(+), 311 deletions(-) create mode 100644 cpp/src/strings/combine/concatenate_list_elements.cu create mode 100644 cpp/tests/strings/combine/concatenate_list_elements_tests.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2bc1aa5b4e3..abad4d7bbca 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -332,8 +332,8 @@ add_library(cudf src/strings/char_types/char_cases.cu src/strings/char_types/char_types.cu src/strings/combine/concatenate.cu + 
src/strings/combine/concatenate_list_elements.cu src/strings/combine/join.cu - src/strings/combine/join_list_elements.cu src/strings/contains.cu src/strings/convert/convert_booleans.cu src/strings/convert/convert_datetime.cu diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp index e0d90a99c65..6887ef0e670 100644 --- a/cpp/include/cudf/strings/combine.hpp +++ b/cpp/include/cudf/strings/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,30 +30,12 @@ namespace strings { * @brief Strings APIs for concatenate and join */ -/** - * @brief Setting for specifying how separators are added with - * null strings elements. - */ -enum class separator_on_nulls { - YES, ///< Always add separators between elements - NO ///< Do not add separators if an element is null -}; - -/** - * @brief Setting for specifying what will be output from `join_list_elements` when an input list - * is empty. - */ -enum class output_if_empty_list { - EMPTY_STRING, ///< Empty list will result in empty string - NULL_ELEMENT ///< Empty list will result in a null -}; - /** * @brief Concatenates all strings in the column into one new string delimited * by an optional separator string. * * This returns a column with one string. Any null entries are ignored unless - * the @p narep parameter specifies a replacement string. + * the narep parameter specifies a replacement string. * * @code{.pseudo} * Example: @@ -88,9 +70,11 @@ std::unique_ptr join_strings( * * - If row separator for a given row is null, output column for that row is null, unless * there is a valid @p separator_narep - * - The separator is applied between two output row values if the @p separate_nulls - * is `YES` or only between valid rows if @p separate_nulls is `NO`. 
- * - If @p separator_narep and @p col_narep are both valid, the output column is always + * - If all column values for a given row is null, output column for that row is null, unless + * there is a valid @p col_narep + * - null column values for a given row are skipped, if the column replacement isn't valid + * - The separator is only applied between two valid column values + * - If valid @p separator_narep and @p col_narep are provided, the output column is always * non nullable * * @code{.pseudo} @@ -99,23 +83,16 @@ std::unique_ptr join_strings( * c1 = [null, 'cc', 'dd', null, null, 'gg'] * c2 = ['bb', '', null, null, null, 'hh'] * sep = ['::', '%%', '^^', '!', '*', null] - * out = concatenate({c0, c1, c2}, sep) - * // all rows have at least one null or sep[i]==null - * out is [null, null, null, null, null, null] + * out0 = concatenate([c0, c1, c2], sep) + * out0 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, null] * * sep_rep = '+' - * out = concatenate({c0, c1, c2}, sep, sep_rep) - * // all rows with at least one null output as null - * out is [null, null, null, null, null, 'ff+gg+hh'] - * - * col_narep = '-' - * out = concatenate({c0, c1, c2}, sep, col_narep) - * out is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null] - * - * col_narep = '' - * out = concatenate({c0, c1, c2}, sep, col_narep, separator_on_nulls:NO) - * // parameter suppresses separator for null rows - * out is ['aa::bb', 'cc%%', '^^dd', 'ee', '', null] + * out1 = concatenate([c0, c1, c2], sep, sep_rep) + * out1 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, 'ff+gg+hh'] + * + * col_rep = '-' + * out2 = concatenate([c0, c1, c2], sep, invalid_sep_rep, col_rep) + * out2 is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null] * @endcode * * @throw cudf::logic_error if no input columns are specified - table view is empty @@ -131,8 +108,6 @@ std::unique_ptr join_strings( * @param col_narep String that should be used in place of any null strings * found in any column. 
Default of invalid-scalar means no null column value replacements. * Default is an invalid string. - * @param separate_nulls If YES, then the separator is included for null rows - * if `col_narep` is valid. * @param mr Resource for allocating device memory. * @return New column with concatenated results. */ @@ -141,9 +116,15 @@ std::unique_ptr concatenate( strings_column_view const& separators, string_scalar const& separator_narep = string_scalar("", false), string_scalar const& col_narep = string_scalar("", false), - separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @addtogroup strings_combine + * @{ + * @file strings/combine.hpp + * @brief Strings APIs for concatenate and join + */ + /** * @brief Row-wise concatenates the given list of strings columns and * returns a single strings column result. @@ -155,22 +136,16 @@ std::unique_ptr concatenate( * row to be null entry unless a narep string is specified to be used * in its place. * - * If @p separate_nulls is set to `NO` and @p narep is valid then - * separators are not added to the output between null elements. - * Otherwise, separators are always added if @p narep is valid. + * The number of strings in the columns provided must be the same. * * @code{.pseudo} * Example: - * s1 = ['aa', null, '', 'dd'] - * s2 = ['', 'bb', 'cc', null] - * out = concatenate({s1, s2}) - * out is ['aa', null, 'cc', null] - * - * out = concatenate({s1, s2}, ':', '_') - * out is ['aa:', '_:bb', ':cc', 'dd:_'] - * - * out = concatenate({s1, s2}, ':', '', separator_on_nulls::NO) - * out is ['aa:', 'bb', ':cc', 'dd'] + * s1 = ['aa', null, '', 'aa'] + * s2 = ['', 'bb', 'bb', null] + * r1 = concatenate([s1,s2]) + * r1 is ['aa', null, 'bb', null] + * r2 = concatenate([s1,s2],':','_') + * r2 is ['aa:', '_:bb', ':bb', 'aa:_'] * @endcode * * @throw cudf::logic_error if input columns are not all strings columns. 
@@ -182,7 +157,6 @@ std::unique_ptr concatenate( * @param narep String that should be used in place of any null strings * found in any column. Default of invalid-scalar means any null entry in any column will * produces a null result for that row. - * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. */ @@ -190,7 +164,6 @@ std::unique_ptr concatenate( table_view const& strings_columns, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), - separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -198,30 +171,24 @@ std::unique_ptr concatenate( * within each row and returns a single strings column result. * * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the row separator provided in the @p separators strings column. + * delimited by the row separator provided in the `separators` strings column. * * A null list row will always result in a null string in the output row. Any non-null list row * having a null element will result in the corresponding output row to be null unless a valid - * @p string_narep scalar is provided to be used in its place. Any null row in the @p separators - * column will also result in a null output row unless a valid @p separator_narep scalar is provided + * `string_narep` scalar is provided to be used in its place. Any null row in the `separators` + * column will also result in a null output row unless a valid `separator_narep` scalar is provided * to be used in place of the null separators. * - * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the - * output between null elements. 
Otherwise, separators are always added if @p narep is valid. - * * @code{.pseudo} * Example: - * s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff', 'gg'] ] + * s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff', 'gg'} ] * sep = ['::', '%%', '!', '*', null] * - * out = join_list_elements(s, sep) - * out is ['aa::bb::cc', null, '!dd', null, null] - * - * out = join_list_elements(s, sep, ':', '_') - * out is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg'] + * r1 = strings::concatenate_list_elements(s, sep) + * r1 is ['aa::bb::cc', null, '!dd', null, null] * - * out = join_list_elements(s, sep, ':', '', separator_on_nulls::NO) - * out is ['aa::bb::cc', null, '!dd', 'ee', 'ff:gg'] + * r2 = strings::concatenate_list_elements(s, sep, ':', '_') + * r2 is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg'] * @endcode * * @throw cudf::logic_error if input column is not lists of strings column. @@ -236,47 +203,36 @@ std::unique_ptr concatenate( * @param string_narep String that should be used to replace null strings in any non-null list row, * default is an invalid-scalar denoting that list rows containing null strings will result * in null string in the corresponding output rows. - * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. - * @param empty_list_policy if set to EMPTY_STRING, any input row that is an empty list will - * result in an empty string. Otherwise, it will result in a null. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column with concatenated results. 
*/ -std::unique_ptr join_list_elements( +std::unique_ptr concatenate_list_elements( const lists_column_view& lists_strings_column, const strings_column_view& separators, - string_scalar const& separator_narep = string_scalar("", false), - string_scalar const& string_narep = string_scalar("", false), - separator_on_nulls separate_nulls = separator_on_nulls::YES, - output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + string_scalar const& separator_narep = string_scalar("", false), + string_scalar const& string_narep = string_scalar("", false), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Given a lists column of strings (each row is a list of strings), concatenates the strings * within each row and returns a single strings column result. * * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the @p separator provided. + * delimited by the separator provided. * * A null list row will always result in a null string in the output row. Any non-null list row - * having a null elenent will result in the corresponding output row to be null unless a - * @p narep string is specified to be used in its place. - * - * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the - * output between null elements. Otherwise, separators are always added if @p narep is valid. + * having a null elenent will result in the corresponding output row to be null unless a narep + * string is specified to be used in its place. 
* * @code{.pseudo} * Example: - * s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff'] ] - * - * out = join_list_elements(s) - * out is ['aabbcc', null, 'dd', null, 'ff'] + * s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff'} ] * - * out = join_list_elements(s, ':', '_') - * out is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff'] + * r1 = strings::concatenate_list_elements(s) + * r1 is ['aabbcc', null, 'dd', null, 'ff'] * - * out = join_list_elements(s, ':', '', separator_on_nulls::NO) - * out is ['aa:bb:cc', null, ':dd', 'ee', 'ff'] + * r2 = strings::concatenate_list_elements(s, ':', '_') + * r2 is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff'] * @endcode * * @throw cudf::logic_error if input column is not lists of strings column. @@ -288,19 +244,14 @@ std::unique_ptr join_list_elements( * @param narep String that should be used to replace null strings in any non-null list row, default * is an invalid-scalar denoting that list rows containing null strings will result in null * string in the corresponding output rows. - * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. - * @param empty_list_policy if set to EMPTY_STRING, any input row that is an empty list will result - * in an empty string. Otherwise, it will result in a null. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column with concatenated results. 
*/ -std::unique_ptr join_list_elements( +std::unique_ptr concatenate_list_elements( const lists_column_view& lists_strings_column, - string_scalar const& separator = string_scalar(""), - string_scalar const& narep = string_scalar("", false), - separator_on_nulls separate_nulls = separator_on_nulls::YES, - output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + string_scalar const& separator = string_scalar(""), + string_scalar const& narep = string_scalar("", false), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index d6bdf398886..6e25a4dfa38 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,7 +17,6 @@ #include #include -#include #include #include @@ -37,7 +36,6 @@ std::unique_ptr concatenate( table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, - separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 68e365bbe60..d2b6be5eead 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -404,14 +404,11 @@ void writer::impl::write(table_view const& table, auto str_table_view = str_table_ptr->view(); // concatenate columns in each row into one big string column - // (using null representation and delimiter): + //(using null representation and delimiter): // std::string delimiter_str{options_.get_inter_column_delimiter()}; - auto str_concat_col = cudf::strings::detail::concatenate(str_table_view, - delimiter_str, - options_.get_na_rep(), - strings::separator_on_nulls::YES, - stream); + auto str_concat_col = cudf::strings::detail::concatenate( + str_table_view, delimiter_str, options_.get_na_rep(), stream); write_chunked(str_concat_col->view(), metadata, stream); } diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index d9c57a79045..5d7b9152ff3 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -41,83 +41,55 @@ namespace strings { namespace detail { namespace { -struct concat_strings_base { +/** + * @brief Concatenate strings functor + * + * This will concatenate the strings from each row of the given table + * and apply the separator. The null-replacement string `d_narep` is + * used in place of any string in a row that contains a null entry. 
+ */ +struct concat_strings_fn { table_device_view const d_table; + string_view const d_separator; string_scalar_device_view const d_narep; - separator_on_nulls separate_nulls; offset_type* d_offsets{}; char* d_chars{}; - /** - * @brief Concatenate each table row to a single output string. - * - * This will concatenate the strings from each row of the given table - * and apply the separator. The null-replacement string `d_narep` is - * used in place of any string in a row that contains a null entry. - * - * @param idx The current row to process - * @param d_separator String to place in between each column's row - */ - __device__ void process_row(size_type idx, string_view const d_separator) + __device__ void operator()(size_type idx) { - if (!d_narep.is_valid() && - thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { - return col.is_null(idx); - })) { + bool const null_element = + thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { + return col.is_null(idx); + }); + // handle a null row + if (null_element && !d_narep.is_valid()) { if (!d_chars) d_offsets[idx] = 0; return; } - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - offset_type bytes = 0; - bool write_separator = false; - + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + size_type bytes = 0; for (auto itr = d_table.begin(); itr < d_table.end(); ++itr) { - auto const d_column = *itr; - bool const null_element = d_column.is_null(idx); - - if (write_separator && (separate_nulls == separator_on_nulls::YES || !null_element)) { + auto const d_column = *itr; + auto const d_str = + d_column.is_null(idx) ? 
d_narep.value() : d_column.element(idx); + if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); + bytes += d_str.size_bytes(); + // separator goes only in between elements + if (itr + 1 < d_table.end()) { if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_separator); bytes += d_separator.size_bytes(); - write_separator = false; } - - // write out column's row data (or narep if the row is null) - auto const d_str = null_element ? d_narep.value() : d_column.element(idx); - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); - bytes += d_str.size_bytes(); - - write_separator = - write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element; } - if (!d_chars) d_offsets[idx] = bytes; } }; -/** - * @brief Single separator concatenate functor - */ -struct concat_strings_fn : concat_strings_base { - string_view const d_separator; - - concat_strings_fn(table_device_view const& d_table, - string_view const& d_separator, - string_scalar_device_view const& d_narep, - separator_on_nulls separate_nulls) - : concat_strings_base{d_table, d_narep, separate_nulls}, d_separator(d_separator) - { - } - - __device__ void operator()(size_type idx) { process_row(idx, d_separator); } -}; - } // namespace std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, - separator_on_nulls separate_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -140,7 +112,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. 
auto d_table = table_device_view::create(strings_columns, stream); - concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls}; + concat_strings_fn fn{*d_table, d_separator, d_narep}; auto children = make_strings_children(fn, strings_count, stream, mr); // create resulting null mask @@ -148,9 +120,9 @@ std::unique_ptr concatenate(table_view const& strings_columns, thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), [d_table = *d_table, d_narep] __device__(size_type idx) { - if (d_narep.is_valid()) return true; - return !thrust::any_of( + bool null_element = thrust::any_of( thrust::seq, d_table.begin(), d_table.end(), [idx](auto col) { return col.is_null(idx); }); + return (!null_element || d_narep.is_valid()); }, stream, mr); @@ -173,42 +145,68 @@ namespace { * when a separator row is null `d_separator_narep`. The `d_narep` is * used in place of a null entry in the strings columns. */ -struct multi_separator_concat_fn : concat_strings_base { +struct multi_separator_concat_fn { + table_device_view const d_table; column_device_view const d_separators; string_scalar_device_view const d_separator_narep; - - multi_separator_concat_fn(table_device_view const& d_table, - column_device_view const& d_separators, - string_scalar_device_view const& d_separator_narep, - string_scalar_device_view const& d_narep, - separator_on_nulls separate_nulls) - : concat_strings_base{d_table, d_narep, separate_nulls}, - d_separators(d_separators), - d_separator_narep(d_separator_narep) - { - } + string_scalar_device_view const d_narep; + offset_type* d_offsets{}; + char* d_chars{}; __device__ void operator()(size_type idx) { - if (d_separators.is_null(idx) && !d_separator_narep.is_valid()) { + bool const all_nulls = + thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { + return col.is_null(idx); + }); + + if ((d_separators.is_null(idx) && !d_separator_narep.is_valid()) || + (all_nulls && !d_narep.is_valid())) { if 
(!d_chars) d_offsets[idx] = 0; return; } + // point to output location + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + offset_type bytes = 0; + + // there is at least one non-null column value auto const d_separator = d_separators.is_valid(idx) ? d_separators.element(idx) : d_separator_narep.value(); - // base class utility function handles the rest - process_row(idx, d_separator); + auto const d_null_rep = d_narep.is_valid() ? d_narep.value() : string_view{}; + + // write output entry for this row + bool colval_written = false; // state variable for writing separators + for (auto const d_column : d_table) { + // if the row is null and if there is no replacement, skip it + if (d_column.is_null(idx) && !d_narep.is_valid()) continue; + + // separator in this row is written only after the first output + if (colval_written) { + if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_separator); + bytes += d_separator.size_bytes(); + } + + // write out column's row data (or narep if the row is null) + string_view const d_str = + d_column.is_null(idx) ? d_null_rep : d_column.element(idx); + if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); + bytes += d_str.size_bytes(); + + // column's string or narep could by empty so we need this flag + // to know we got this far even if no actual bytes were copied + colval_written = true; // use the separator before the next column + } + + if (!d_chars) d_offsets[idx] = bytes; } }; - } // namespace std::unique_ptr concatenate(table_view const& strings_columns, strings_column_view const& separators, string_scalar const& separator_narep, string_scalar const& col_narep, - separator_on_nulls separate_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -236,19 +234,20 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. 
auto d_table = table_device_view::create(strings_columns, stream); - multi_separator_concat_fn mscf{ - *d_table, separator_col_view, separator_rep, col_rep, separate_nulls}; + multi_separator_concat_fn mscf{*d_table, separator_col_view, separator_rep, col_rep}; auto children = make_strings_children(mscf, strings_count, stream, mr); // Create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), - [d_table = *d_table, separator_col_view, separator_rep, col_rep] __device__(size_type idx) { - if (!separator_col_view.is_valid(idx) && !separator_rep.is_valid()) return false; - if (col_rep.is_valid()) return true; - return !thrust::any_of( - thrust::seq, d_table.begin(), d_table.end(), [idx](auto col) { return col.is_null(idx); }); + [d_table = *d_table, separator_col_view, separator_rep, col_rep] __device__(size_type ridx) { + if (!separator_col_view.is_valid(ridx) && !separator_rep.is_valid()) return false; + bool all_nulls = + thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [ridx](auto const& col) { + return col.is_null(ridx); + }); + return all_nulls ? 
col_rep.is_valid() : true; }, stream, mr); @@ -269,29 +268,21 @@ std::unique_ptr concatenate(table_view const& strings_columns, std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, - separator_on_nulls separate_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate( - strings_columns, separator, narep, separate_nulls, rmm::cuda_stream_default, mr); + return detail::concatenate(strings_columns, separator, narep, rmm::cuda_stream_default, mr); } std::unique_ptr concatenate(table_view const& strings_columns, strings_column_view const& separators, string_scalar const& separator_narep, string_scalar const& col_narep, - separator_on_nulls separate_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(strings_columns, - separators, - separator_narep, - col_narep, - separate_nulls, - rmm::cuda_stream_default, - mr); + return detail::concatenate( + strings_columns, separators, separator_narep, col_narep, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/combine/concatenate_list_elements.cu b/cpp/src/strings/combine/concatenate_list_elements.cu new file mode 100644 index 00000000000..1157b8f3fce --- /dev/null +++ b/cpp/src/strings/combine/concatenate_list_elements.cu @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf { +namespace strings { +namespace detail { + +namespace { +/** + * @brief Compute string sizes, string validities, and concatenate strings functor. + * + * This functor is executed twice. In the first pass, the sizes and validities of the output strings + * will be computed. In the second pass, this will concatenate the strings within each list element + * of the given lists column and apply the separator. The null-replacement string scalar + * `string_narep_dv` (if valid) is used in place of any null string. + * + * @tparam Functor The functor which can check for validity of the input list at a given list index + * as well as access to the separator corresponding to the list index. + */ +template +struct compute_size_and_concatenate_fn { + Functor const func; + column_device_view const lists_dv; + offset_type const* const list_offsets; + column_device_view const strings_dv; + string_scalar_device_view const string_narep_dv; + + offset_type* d_offsets{nullptr}; + + // If d_chars == nullptr: only compute sizes and validities of the output strings. + // If d_chars != nullptr: only concatenate strings. + char* d_chars{nullptr}; + + // We need to set `1` or `0` for the validities of the output strings. + int8_t* d_validities{nullptr}; + + __device__ void operator()(size_type const idx) + { + // If this is the second pass, and the row `idx` is known to be a null string + if (d_chars and not d_validities[idx]) { return; } + + if (not d_chars and func.is_null_list(lists_dv, idx)) { + d_offsets[idx] = 0; + d_validities[idx] = false; + return; + } + + auto const separator = func.separator(idx); + auto const separator_size = separator.size_bytes(); + auto size_bytes = size_type{0}; + bool written = false; + char* output_ptr = d_chars ? 
d_chars + d_offsets[idx] : nullptr; + + for (size_type str_idx = list_offsets[idx], idx_end = list_offsets[idx + 1]; str_idx < idx_end; + ++str_idx) { + if (not d_chars and (strings_dv.is_null(str_idx) and not string_narep_dv.is_valid())) { + d_offsets[idx] = 0; + d_validities[idx] = false; + return; // early termination: the entire list of strings will result in a null string + } + auto const d_str = strings_dv.is_null(str_idx) ? string_narep_dv.value() + : strings_dv.element(str_idx); + size_bytes += separator_size + d_str.size_bytes(); + if (output_ptr) { + // Separator is inserted only in between strings + if (written) { output_ptr = detail::copy_string(output_ptr, separator); } + output_ptr = detail::copy_string(output_ptr, d_str); + written = true; + } + } + + // Separator is inserted only in between strings + if (not d_chars) { + d_offsets[idx] = static_cast(size_bytes - separator_size); + d_validities[idx] = true; + } + } +}; + +/** + * @brief Functor accompanying with `compute_size_and_concatenate_fn` for computing output string + * sizes, output string validities, and concatenating strings within list elements; used when the + * separator is a string scalar. 
+ */ +struct scalar_separator_fn { + string_scalar_device_view const d_separator; + + __device__ bool is_null_list(column_device_view const& lists_dv, size_type const idx) const + noexcept + { + return lists_dv.is_null(idx); + } + + __device__ string_view separator(size_type const) const noexcept { return d_separator.value(); } +}; + +} // namespace + +std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, + string_scalar const& separator, + string_scalar const& narep, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, + "The input column must be a column of lists of strings"); + CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be a valid string_scalar"); + + auto const num_rows = lists_strings_column.size(); + if (num_rows == 0) { return detail::make_empty_strings_column(stream, mr); } + + // Accessing the child strings column of the lists column must be done by calling `child()` on the + // lists column, not `get_sliced_child()`. This is because calling to `offsets_begin()` on the + // lists column returns a pointer to the offsets of the original lists column, which may not start + // from `0`. 
+ auto const strings_col = strings_column_view(lists_strings_column.child()); + auto const lists_dv_ptr = column_device_view::create(lists_strings_column.parent(), stream); + auto const strings_dv_ptr = column_device_view::create(strings_col.parent(), stream); + auto const sep_dv = get_scalar_device_view(const_cast(separator)); + auto const string_narep_dv = get_scalar_device_view(const_cast(narep)); + + auto const func = scalar_separator_fn{sep_dv}; + auto const comp_fn = compute_size_and_concatenate_fn{ + func, + *lists_dv_ptr, + lists_strings_column.offsets_begin(), + *strings_dv_ptr, + string_narep_dv, + }; + auto [offsets_column, chars_column, null_mask, null_count] = + make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); + + return make_strings_column(num_rows, + std::move(offsets_column), + std::move(chars_column), + null_count, + std::move(null_mask), + stream, + mr); +} + +namespace { +/** + * @brief Functor accompanying with `compute_size_and_concatenate_fn` for computing output string + * sizes, output string validities, and concatenating strings within list elements; used when the + * separators are given as a strings column. + */ +struct column_separators_fn { + column_device_view const separators_dv; + string_scalar_device_view const sep_narep_dv; + + __device__ bool is_null_list(column_device_view const& lists_dv, size_type const idx) const + noexcept + { + return lists_dv.is_null(idx) or (separators_dv.is_null(idx) and not sep_narep_dv.is_valid()); + } + + __device__ string_view separator(size_type const idx) const noexcept + { + return separators_dv.is_valid(idx) ? 
separators_dv.element(idx) + : sep_narep_dv.value(); + } +}; + +} // namespace + +std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, + strings_column_view const& separators, + string_scalar const& separator_narep, + string_scalar const& string_narep, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, + "The input column must be a column of lists of strings"); + CUDF_EXPECTS(lists_strings_column.size() == separators.size(), + "Separators column should be the same size as the lists columns"); + + auto const num_rows = lists_strings_column.size(); + if (num_rows == 0) { return detail::make_empty_strings_column(stream, mr); } + + // Accessing the child strings column of the lists column must be done by calling `child()` on the + // lists column, not `get_sliced_child()`. This is because calling to `offsets_begin()` on the + // lists column returns a pointer to the offsets of the original lists column, which may not start + // from `0`. 
+ auto const strings_col = strings_column_view(lists_strings_column.child()); + auto const lists_dv_ptr = column_device_view::create(lists_strings_column.parent(), stream); + auto const strings_dv_ptr = column_device_view::create(strings_col.parent(), stream); + auto const string_narep_dv = get_scalar_device_view(const_cast(string_narep)); + auto const sep_dv_ptr = column_device_view::create(separators.parent(), stream); + auto const sep_narep_dv = get_scalar_device_view(const_cast(separator_narep)); + + auto const func = column_separators_fn{*sep_dv_ptr, sep_narep_dv}; + auto const comp_fn = compute_size_and_concatenate_fn{ + func, + *lists_dv_ptr, + lists_strings_column.offsets_begin(), + *strings_dv_ptr, + string_narep_dv, + }; + auto [offsets_column, chars_column, null_mask, null_count] = + make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); + + return make_strings_column(num_rows, + std::move(offsets_column), + std::move(chars_column), + null_count, + std::move(null_mask), + stream, + mr); +} + +} // namespace detail + +std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, + string_scalar const& separator, + string_scalar const& narep, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::concatenate_list_elements( + lists_strings_column, separator, narep, rmm::cuda_stream_default, mr); +} + +std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, + strings_column_view const& separators, + string_scalar const& separator_narep, + string_scalar const& string_narep, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::concatenate_list_elements( + lists_strings_column, separators, separator_narep, string_narep, rmm::cuda_stream_default, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ce80e4bf064..d87b4b81bdc 100644 --- 
a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -328,8 +328,8 @@ ConfigureTest(STRINGS_TEST strings/booleans_tests.cpp strings/case_tests.cpp strings/chars_types_tests.cpp + strings/combine/concatenate_list_elements_tests.cpp strings/combine/concatenate_tests.cpp - strings/combine/join_list_elements_tests.cpp strings/combine/join_strings_tests.cpp strings/concatenate_tests.cpp strings/contains_tests.cpp diff --git a/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp b/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp new file mode 100644 index 00000000000..b6afd588dfb --- /dev/null +++ b/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp @@ -0,0 +1,511 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +struct StringsListsConcatenateTest : public cudf::test::BaseFixture { +}; + +namespace { +using STR_LISTS = cudf::test::lists_column_wrapper; +using STR_COL = cudf::test::strings_column_wrapper; +using INT_LISTS = cudf::test::lists_column_wrapper; + +constexpr bool print_all{false}; + +auto null_at(cudf::size_type idx) +{ + return cudf::detail::make_counting_transform_iterator(0, [idx](auto i) { return i != idx; }); +} + +auto all_nulls() +{ + return cudf::detail::make_counting_transform_iterator(0, [](auto) { return false; }); +} + +auto nulls_from_nullptr(std::vector const& strs) +{ + return thrust::make_transform_iterator(strs.begin(), [](auto ptr) { return ptr != nullptr; }); +} + +} // namespace + +TEST_F(StringsListsConcatenateTest, InvalidInput) +{ + // Invalid list type + { + auto const string_lists = INT_LISTS{{1, 2, 3}, {4, 5, 6}}.release(); + auto const string_lv = cudf::lists_column_view(string_lists->view()); + EXPECT_THROW(cudf::strings::concatenate_list_elements(string_lv), cudf::logic_error); + } + + // Invalid scalar separator + { + auto const string_lists = + STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); + auto const string_lv = cudf::lists_column_view(string_lists->view()); + EXPECT_THROW( + cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("", false)), + cudf::logic_error); + } + + // Invalid column separators + { + auto const string_lists = + STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); + auto const string_lv = cudf::lists_column_view(string_lists->view()); + auto const separators = STR_COL{"+++"}.release(); // size doesn't match with lists column size + EXPECT_THROW(cudf::strings::concatenate_list_elements(string_lv, separators->view()), + cudf::logic_error); + } +} + +TEST_F(StringsListsConcatenateTest, EmptyInput) +{ + auto const string_lists = 
STR_LISTS{}.release(); + auto const string_lv = cudf::lists_column_view(string_lists->view()); + auto const expected = STR_COL{}; + auto results = cudf::strings::concatenate_list_elements(string_lv); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + + auto const separators = STR_COL{}.release(); + results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); +} + +TEST_F(StringsListsConcatenateTest, ZeroSizeStringsInput) +{ + auto const string_lists = + STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); + auto const string_lv = cudf::lists_column_view(string_lists->view()); + auto const expected = STR_COL{"", "", ""}; + + auto results = cudf::strings::concatenate_list_elements(string_lv); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + + auto const separators = STR_COL{"", "", ""}.release(); + results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); +} + +TEST_F(StringsListsConcatenateTest, AllNullsStringsInput) +{ + auto const string_lists = STR_LISTS{ + STR_LISTS{{""}, all_nulls()}, + STR_LISTS{{"", "", ""}, all_nulls()}, + STR_LISTS{{"", ""}, + all_nulls()}}.release(); + auto const string_lv = cudf::lists_column_view(string_lists->view()); + auto const expected = STR_COL{{"", "", ""}, all_nulls()}; + + auto results = cudf::strings::concatenate_list_elements(string_lv); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + + auto const separators = STR_COL{{"", "", ""}, all_nulls()}.release(); + results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); +} + +TEST_F(StringsListsConcatenateTest, ScalarSeparator) +{ + auto const string_lists = STR_LISTS{{STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, + STR_LISTS{}, /*NULL*/ 
+ STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, + STR_LISTS{"zzz", "xxxxx"}}, + null_at(1)} + .release(); + auto const string_lv = cudf::lists_column_view(string_lists->view()); + + // No null replacement + { + auto const results = + cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // With null replacement + { + auto const results = cudf::strings::concatenate_list_elements( + string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); + std::vector h_expected{ + "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } +} + +TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) +{ + auto const string_lists = STR_LISTS{ + {STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, + STR_LISTS{}, /*NULL*/ + STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, + STR_LISTS{"zzz", "xxxxx"}, + STR_LISTS{"11111", "11111", "11111", "11111", "11111"}, /*NULL*/ + STR_LISTS{{"abcdef", "012345", "" /*NULL*/, "xxx000"}, null_at(2)}, + STR_LISTS{{"xyz" /*NULL*/, "11111", "00000"}, null_at(0)}, + STR_LISTS{"0a0b0c", "5x5y5z"}, + STR_LISTS{"xxx"}, /*NULL*/ + STR_LISTS{"ééé", "12345abcdef"}, + STR_LISTS{"aaaééébbbéééccc", "12345"}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { + return i != 1 && i != 4 && i != 8; + })}.release(); + + // Sliced the entire lists column, no null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); + auto const results = + cudf::strings::concatenate_list_elements(string_lv, 
cudf::string_scalar("+++")); + std::vector h_expected{nullptr, + nullptr, + nullptr, + "zzz+++xxxxx", + nullptr, + nullptr, + nullptr, + "0a0b0c+++5x5y5z", + nullptr, + "ééé+++12345abcdef", + "aaaééébbbéééccc+++12345"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the entire lists column, with null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); + auto const results = cudf::strings::concatenate_list_elements( + string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); + std::vector h_expected{"a+++___+++ccc", + nullptr, + "___+++efgh+++ijk", + "zzz+++xxxxx", + nullptr, + "abcdef+++012345+++___+++xxx000", + "___+++11111+++00000", + "0a0b0c+++5x5y5z", + nullptr, + "ééé+++12345abcdef", + "aaaééébbbéééccc+++12345"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the first half of the lists column, no null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); + auto const results = + cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the first half of the lists column, with null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); + auto const results = cudf::strings::concatenate_list_elements( + string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); + std::vector h_expected{ + "a+++___+++ccc", nullptr, 
"___+++efgh+++ijk", "zzz+++xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the second half of the lists column, no null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); + auto const results = + cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + std::vector h_expected{ + nullptr, nullptr, "0a0b0c+++5x5y5z", nullptr, "ééé+++12345abcdef", "aaaééébbbéééccc+++12345"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the second half of the lists column, with null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); + auto const results = cudf::strings::concatenate_list_elements( + string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); + std::vector h_expected{"abcdef+++012345+++___+++xxx000", + "___+++11111+++00000", + "0a0b0c+++5x5y5z", + nullptr, + "ééé+++12345abcdef", + "aaaééébbbéééccc+++12345"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the middle part of the lists column, no null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); + auto const results = + cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + std::vector h_expected{ + "zzz+++xxxxx", nullptr, nullptr, nullptr, "0a0b0c+++5x5y5z"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the 
middle part of the lists column, with null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); + auto const results = cudf::strings::concatenate_list_elements( + string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); + std::vector h_expected{"zzz+++xxxxx", + nullptr, + "abcdef+++012345+++___+++xxx000", + "___+++11111+++00000", + "0a0b0c+++5x5y5z"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } +} + +TEST_F(StringsListsConcatenateTest, ColumnSeparators) +{ + auto const string_lists = STR_LISTS{{STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, + STR_LISTS{}, /*NULL*/ + STR_LISTS{"0a0b0c", "xyzééé"}, + STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, + STR_LISTS{{"ééé" /*NULL*/, "ááá", "ííí"}, null_at(0)}, + STR_LISTS{"zzz", "xxxxx"}}, + null_at(1)} + .release(); + auto const string_lv = cudf::lists_column_view(string_lists->view()); + auto const separators = STR_COL{ + {"+++", "***", "!!!" 
/*NULL*/, "$$$" /*NULL*/, "%%%", "^^^"}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { + return i != 2 && i != 3; + })}.release(); + + // No null replacement + { + auto const results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + std::vector h_expected{nullptr, nullptr, nullptr, nullptr, nullptr, "zzz^^^xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // With null replacement for separators + { + auto const results = cudf::strings::concatenate_list_elements( + string_lv, separators->view(), cudf::string_scalar("|||")); + std::vector h_expected{ + nullptr, nullptr, "0a0b0c|||xyzééé", nullptr, nullptr, "zzz^^^xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // With null replacement for strings + { + auto const results = cudf::strings::concatenate_list_elements( + string_lv, separators->view(), cudf::string_scalar("", false), cudf::string_scalar("XXXXX")); + std::vector h_expected{ + "a+++XXXXX+++ccc", nullptr, nullptr, nullptr, "XXXXX%%%ááá%%%ííí", "zzz^^^xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // With null replacement for both separators and strings + { + auto const results = cudf::strings::concatenate_list_elements( + string_lv, separators->view(), cudf::string_scalar("|||"), cudf::string_scalar("XXXXX")); + std::vector h_expected{"a+++XXXXX+++ccc", + nullptr, + "0a0b0c|||xyzééé", + "XXXXX|||efgh|||ijk", + "XXXXX%%%ááá%%%ííí", + "zzz^^^xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, 
print_all); + } +} + +TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) +{ + auto const string_lists = STR_LISTS{ + {STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, + STR_LISTS{}, /*NULL*/ + STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, + STR_LISTS{"zzz", "xxxxx"}, + STR_LISTS{"11111", "11111", "11111", "11111", "11111"}, /*NULL*/ + STR_LISTS{{"abcdef", "012345", "" /*NULL*/, "xxx000"}, null_at(2)}, + STR_LISTS{{"xyz" /*NULL*/, "11111", "00000"}, null_at(0)}, + STR_LISTS{"0a0b0c", "5x5y5z"}, + STR_LISTS{"xxx"}, /*NULL*/ + STR_LISTS{"ééé", "12345abcdef"}, + STR_LISTS{"aaaééébbbéééccc", "12345"}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { + return i != 1 && i != 4 && i != 8; + })}.release(); + auto const separators = STR_COL{ + {"+++", "***", "!!!" /*NULL*/, "$$$" /*NULL*/, "%%%", "^^^", "~!~", "###", "&&&", "-+-", "=+="}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { + return i != 2 && i != 3; + })}.release(); + + // Sliced the entire lists column, no null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); + auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 11})[0]); + auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + std::vector h_expected{nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + "0a0b0c###5x5y5z", + nullptr, + "ééé-+-12345abcdef", + "aaaééébbbéééccc=+=12345"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the entire lists column, with null replacements + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); + auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 11})[0]); + auto const results = 
cudf::strings::concatenate_list_elements( + string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); + std::vector h_expected{"a+++___+++ccc", + nullptr, + "___|||efgh|||ijk", + "zzz|||xxxxx", + nullptr, + "abcdef^^^012345^^^___^^^xxx000", + "___~!~11111~!~00000", + "0a0b0c###5x5y5z", + nullptr, + "ééé-+-12345abcdef", + "aaaééébbbéééccc=+=12345"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the first half of the lists column, no null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); + auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 4})[0]); + auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + std::vector h_expected{nullptr, nullptr, nullptr, nullptr}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the first half of the lists column, with null replacements + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); + auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 4})[0]); + auto const results = cudf::strings::concatenate_list_elements( + string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); + std::vector h_expected{ + "a+++___+++ccc", nullptr, "___|||efgh|||ijk", "zzz|||xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the second half of the lists column, no null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); + auto const sep_col = 
cudf::strings_column_view(cudf::slice(separators->view(), {5, 11})[0]); + auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + std::vector h_expected{ + nullptr, nullptr, "0a0b0c###5x5y5z", nullptr, "ééé-+-12345abcdef", "aaaééébbbéééccc=+=12345"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the second half of the lists column, with null replacements + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); + auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {5, 11})[0]); + auto const results = cudf::strings::concatenate_list_elements( + string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); + std::vector h_expected{"abcdef^^^012345^^^___^^^xxx000", + "___~!~11111~!~00000", + "0a0b0c###5x5y5z", + nullptr, + "ééé-+-12345abcdef", + "aaaééébbbéééccc=+=12345"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the middle part of the lists column, no null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); + auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {3, 8})[0]); + auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + std::vector h_expected{nullptr, nullptr, nullptr, nullptr, "0a0b0c###5x5y5z"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the middle part of the lists column, with null replacements + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); + auto 
const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {3, 8})[0]); + auto const results = cudf::strings::concatenate_list_elements( + string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); + std::vector h_expected{"zzz|||xxxxx", + nullptr, + "abcdef^^^012345^^^___^^^xxx000", + "___~!~11111~!~00000", + "0a0b0c###5x5y5z"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } +} diff --git a/cpp/tests/strings/combine/concatenate_tests.cpp b/cpp/tests/strings/combine/concatenate_tests.cpp index 3291f066cac..c1c390e8a82 100644 --- a/cpp/tests/strings/combine/concatenate_tests.cpp +++ b/cpp/tests/strings/combine/concatenate_tests.cpp @@ -95,58 +95,6 @@ TEST_F(StringsCombineTest, Concatenate) } } -TEST_F(StringsCombineTest, ConcatenateSkipNulls) -{ - cudf::test::strings_column_wrapper strings1({"eee", "", "", "", "aa", "bbb", "ééé"}, - {1, 0, 0, 1, 1, 1, 1}); - cudf::test::strings_column_wrapper strings2({"xyz", "", "d", "éa", "", "", "f"}, - {1, 0, 1, 1, 1, 0, 1}); - cudf::test::strings_column_wrapper strings3({"q", "", "s", "t", "u", "", "w"}, - {1, 1, 1, 1, 1, 0, 1}); - - cudf::table_view table({strings1, strings2, strings3}); - - { - cudf::test::strings_column_wrapper expected( - {"eee+xyz+q", "++", "+d+s", "+éa+t", "aa++u", "bbb++", "ééé+f+w"}); - auto results = cudf::strings::concatenate(table, - cudf::string_scalar("+"), - cudf::string_scalar(""), - cudf::strings::separator_on_nulls::YES); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - } - { - cudf::test::strings_column_wrapper expected( - {"eee+xyz+q", "", "d+s", "+éa+t", "aa++u", "bbb", "ééé+f+w"}); - auto results = cudf::strings::concatenate(table, - cudf::string_scalar("+"), - cudf::string_scalar(""), - cudf::strings::separator_on_nulls::NO); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - } - { - 
cudf::test::strings_column_wrapper expected( - {"eee+xyz+q", "", "", "+éa+t", "aa++u", "", "ééé+f+w"}, {1, 0, 0, 1, 1, 0, 1}); - auto results = cudf::strings::concatenate(table, - cudf::string_scalar("+"), - cudf::string_scalar("", false), - cudf::strings::separator_on_nulls::NO); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - } - { - cudf::test::strings_column_wrapper sep_col({"+", "-", ".", "@", "*", "^^", "#"}); - auto results = cudf::strings::concatenate(table, - cudf::strings_column_view(sep_col), - cudf::string_scalar(""), - cudf::string_scalar(""), - cudf::strings::separator_on_nulls::NO); - - cudf::test::strings_column_wrapper expected( - {"eee+xyz+q", "", "d.s", "@éa@t", "aa**u", "bbb", "ééé#f#w"}); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - } -} - TEST_F(StringsCombineTest, ConcatZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( @@ -347,20 +295,12 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnEmptyAndNullStringsNoR auto sep_col = cudf::test::strings_column_wrapper( {"", "", "", "", "", "", "", ""}, {true, false, true, false, true, false, true, false}); - auto exp_results1 = cudf::test::strings_column_wrapper( - {"", "", "", "", "", "", "", ""}, {false, false, true, false, false, false, false, false}); + auto exp_results = cudf::test::strings_column_wrapper( + {"", "", "", "", "", "", "", ""}, {false, false, true, false, true, false, true, false}); + auto results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); - - auto exp_results2 = cudf::test::strings_column_wrapper( - {"", "", "", "", "", "", "", ""}, {true, false, true, false, true, false, true, false}); - results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, - cudf::strings_column_view(sep_col), - cudf::string_scalar("", false), - cudf::string_scalar(""), - cudf::strings::separator_on_nulls::NO); - 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixNoReplacements) @@ -375,23 +315,13 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixNoReplacement {"", "~~~", "", "@", "", "", "", "^^^^", "", "--", "*****", "######"}, {true, true, false, true, false, true, false, true, true, true, true, true}); - auto exp_results1 = cudf::test::strings_column_wrapper( - {"eeexyzfoo", "~~~", "", "", "", "", "", "", "", "", "", ""}, - {true, true, false, false, false, false, false, false, false, false, false, false}); + auto exp_results = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "", "éééf", "", "", "", "valid", "doo", "", "", ""}, + {true, true, false, true, false, true, false, true, true, false, false, false}); auto results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); - - auto exp_results2 = cudf::test::strings_column_wrapper( - {"eeexyzfoo", "~~~", "", "éééf", "", "", "", "valid", "doo", "", "", ""}, - {true, true, false, true, false, true, false, true, true, true, true, true}); - results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, - cudf::strings_column_view(sep_col), - cudf::string_scalar("", false), - cudf::string_scalar(""), - cudf::strings::separator_on_nulls::NO); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixSeparatorReplacement) @@ -405,26 +335,26 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixSeparatorRepl auto sep_col = cudf::test::strings_column_wrapper( {"", "~~~", "", "@", "", "", "", "^^^^", "", "--", "*****", "######"}, {true, true, false, true, false, true, false, true, true, true, 
true, true}); - auto sep_rep = cudf::string_scalar("!!!!!!!"); + auto sep_rep = cudf::string_scalar("!!!!!!!!!!"); - auto exp_results1 = cudf::test::strings_column_wrapper( - {"eeexyzfoo", "~~~", "!!!!!!!éaff", "éééf", "éa", "", "éaff", "valid", "doo", "", "", ""}, - {true, true, true, false, false, false, false, false, false, false, false, false}); + auto exp_results = cudf::test::strings_column_wrapper( + {"eeexyzfoo", + "~~~", + "!!!!!!!!!!éaff", + "éééf", + "éa", + "", + "éaff", + "valid", + "doo", + "", + "", + ""}, + {true, true, true, true, true, true, true, true, true, false, false, false}); auto results = cudf::strings::concatenate( cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col), sep_rep); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); - - auto exp_results2 = cudf::test::strings_column_wrapper( - {"eeexyzfoo", "~~~", "!!!!!!!éaff", "éééf", "éa", "", "éaff", "valid", "doo", "", "", ""}, - {true, true, true, true, true, true, true, true, true, true, true, true}); - - results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, - cudf::strings_column_view(sep_col), - sep_rep, - cudf::string_scalar(""), - cudf::strings::separator_on_nulls::NO); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixColumnReplacement) diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index c39175bceac..051ce5e9277 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -157,11 +157,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenation(JNI } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationSepCol(JNIEnv *env, jclass, - jlongArray column_handles, - jlong sep_handle, - jlong separator_narep, - jlong col_narep, - jboolean 
separate_nulls) { + jlongArray column_handles, + jlong sep_handle, + jlong separator_narep, + jlong col_narep, + jboolean separate_nulls) { JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0); JNI_NULL_CHECK(env, sep_handle, "separator column handle is null", 0); JNI_NULL_CHECK(env, separator_narep, "separator narep string scalar object is null", 0); @@ -169,7 +169,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationSepC try { cudf::jni::auto_set_device(env); const auto& separator_narep_scalar = *reinterpret_cast(separator_narep); - const auto& col_narep_scalar = *reinterpret_cast(col_narep); + const auto& col_narep_scalar = *reinterpret_cast(col_narep); auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES : cudf::strings::separator_on_nulls::NO; @@ -204,7 +204,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationList try { cudf::jni::auto_set_device(env); const auto& separator_narep_scalar = *reinterpret_cast(separator_narep); - const auto& col_narep_scalar = *reinterpret_cast(col_narep); + const auto& col_narep_scalar = *reinterpret_cast(col_narep); auto null_policy = separate_nulls ? 
cudf::strings::separator_on_nulls::YES : cudf::strings::separator_on_nulls::NO; auto empty_list_output = diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 1371d43195c..2615de3f31c 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2110,7 +2110,7 @@ void testStringConcatSeparators() { } @Test - void testStringConcatSeparatorsEmptyStringForNull() { + void testStringConcatWsSeparators() { try (ColumnVector sv1 = ColumnVector.fromStrings("a", "B", "cd", "\u0480\u0481", "E\tf", null, null, "\\G\u0100"); ColumnVector sv2 = ColumnVector.fromStrings("b", "C", "\u0500\u0501", "x\nYz", null, null, "", null); ColumnVector e_concat = ColumnVector.fromStrings("aA1\t\ud721b", "BA1\t\ud721C", "cdA1\t\ud721\u0500\u0501", diff --git a/python/cudf/cudf/_lib/cpp/strings/combine.pxd b/python/cudf/cudf/_lib/cpp/strings/combine.pxd index 51c706b68d0..250c6441882 100644 --- a/python/cudf/cudf/_lib/cpp/strings/combine.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/combine.pxd @@ -18,13 +18,13 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: string_scalar separator, string_scalar narep) except + - cdef unique_ptr[column] join_list_elements( + cdef unique_ptr[column] concatenate_list_elements( column_view lists_strings_column, column_view separators, string_scalar separator_narep, string_scalar string_narep) except + - cdef unique_ptr[column] join_list_elements( + cdef unique_ptr[column] concatenate_list_elements( column_view lists_strings_column, string_scalar separator, string_scalar narep) except + diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 0d7dfb5c619..25619de3ed0 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -16,7 +16,7 @@ from cudf._lib.table cimport Table from 
cudf._lib.cpp.strings.combine cimport ( concatenate as cpp_concatenate, join_strings as cpp_join_strings, - join_list_elements as cpp_join_list_elements + concatenate_list_elements as cpp_concatenate_list_elements ) @@ -105,7 +105,7 @@ def join_lists_with_scalar( ) with nogil: - c_result = move(cpp_join_list_elements( + c_result = move(cpp_concatenate_list_elements( source_view, scalar_separator[0], scalar_narep[0] @@ -142,7 +142,7 @@ def join_lists_with_column( ) with nogil: - c_result = move(cpp_join_list_elements( + c_result = move(cpp_concatenate_list_elements( source_view, separator_view, scalar_separator_narep[0], From 27b92672e73b80651087278535925e77da16bb09 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Wed, 19 May 2021 17:00:03 -0500 Subject: [PATCH 14/22] Update spacing --- java/src/main/native/src/ColumnVectorJni.cpp | 14 +++++++------- .../test/java/ai/rapids/cudf/ColumnVectorTest.java | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index 051ce5e9277..c39175bceac 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -157,11 +157,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenation(JNI } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationSepCol(JNIEnv *env, jclass, - jlongArray column_handles, - jlong sep_handle, - jlong separator_narep, - jlong col_narep, - jboolean separate_nulls) { + jlongArray column_handles, + jlong sep_handle, + jlong separator_narep, + jlong col_narep, + jboolean separate_nulls) { JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0); JNI_NULL_CHECK(env, sep_handle, "separator column handle is null", 0); JNI_NULL_CHECK(env, separator_narep, "separator narep string scalar object is null", 0); @@ -169,7 +169,7 @@ JNIEXPORT jlong JNICALL 
Java_ai_rapids_cudf_ColumnVector_stringConcatenationSepC try { cudf::jni::auto_set_device(env); const auto& separator_narep_scalar = *reinterpret_cast(separator_narep); - const auto& col_narep_scalar = *reinterpret_cast(col_narep); + const auto& col_narep_scalar = *reinterpret_cast(col_narep); auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES : cudf::strings::separator_on_nulls::NO; @@ -204,7 +204,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationList try { cudf::jni::auto_set_device(env); const auto& separator_narep_scalar = *reinterpret_cast(separator_narep); - const auto& col_narep_scalar = *reinterpret_cast(col_narep); + const auto& col_narep_scalar = *reinterpret_cast(col_narep); auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES : cudf::strings::separator_on_nulls::NO; auto empty_list_output = diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 2615de3f31c..1371d43195c 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2110,7 +2110,7 @@ void testStringConcatSeparators() { } @Test - void testStringConcatWsSeparators() { + void testStringConcatSeparatorsEmptyStringForNull() { try (ColumnVector sv1 = ColumnVector.fromStrings("a", "B", "cd", "\u0480\u0481", "E\tf", null, null, "\\G\u0100"); ColumnVector sv2 = ColumnVector.fromStrings("b", "C", "\u0500\u0501", "x\nYz", null, null, "", null); ColumnVector e_concat = ColumnVector.fromStrings("aA1\t\ud721b", "BA1\t\ud721C", "cdA1\t\ud721\u0500\u0501", From 6c4c2ef59176c16bc9ef2f830c13018bc0464284 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Wed, 19 May 2021 19:57:31 -0500 Subject: [PATCH 15/22] Update to match null handling for array of all nulls --- java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff 
--git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 1371d43195c..c5da4f53880 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2319,7 +2319,7 @@ void testStringConcatWsSingleListColAllNulls() { new HostColumnVector.BasicType(true, DType.STRING)), Arrays.asList("aaa"), Arrays.asList(null, null, null)); ColumnVector sep_col = ColumnVector.fromStrings("-", "-"); - ColumnVector e_concat = ColumnVector.fromStrings("aaa", ""); + ColumnVector e_concat = ColumnVector.fromStrings("aaa", null); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, sep_col, @@ -2333,7 +2333,7 @@ void testStringConcatWsSingleListColAllNullsScalarSep() { try (ColumnVector cv1 = ColumnVector.fromLists(new HostColumnVector.ListType(true, new HostColumnVector.BasicType(true, DType.STRING)), Arrays.asList("aaa"), Arrays.asList(null, null, null)); - ColumnVector e_concat = ColumnVector.fromStrings("aaa", ""); + ColumnVector e_concat = ColumnVector.fromStrings("aaa", null); Scalar separatorString = Scalar.fromString("-"); Scalar narep = Scalar.fromString(""); ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, @@ -2348,7 +2348,7 @@ void testStringConcatWsSingleListColAllNullsSepTrue() { new HostColumnVector.BasicType(true, DType.STRING)), Arrays.asList("aaa"), Arrays.asList(null, null, null)); ColumnVector sep_col = ColumnVector.fromStrings("-", "-"); - ColumnVector e_concat = ColumnVector.fromStrings("aaa", "--"); + ColumnVector e_concat = ColumnVector.fromStrings("aaa", null); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, sep_col, From 10ffddf1c20fbcfd62e61369890c3bd29663ea66 Mon Sep 17 00:00:00 2001 
From: Thomas Graves Date: Wed, 19 May 2021 20:41:06 -0500 Subject: [PATCH 16/22] Fix descriptions of parameters and typos --- .../java/ai/rapids/cudf/ColumnVector.java | 70 +++++++++++-------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 8520d6a983e..7d60c6db8c6 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -500,7 +500,7 @@ public static ColumnVector stringConcatenate(ColumnView[] columns) { /** * Concatenate columns of strings together, combining a corresponding row from each column into - * a single string row of a new column. This version inludes the separator for null rows + * a single string row of a new column. This version includes the separator for null rows * if 'narep' is valid. * @param separator string scalar inserted between each string being merged. * @param narep string scalar indicating null behavior. If set to null and any string in the row @@ -522,7 +522,7 @@ public static ColumnVector stringConcatenate(Scalar separator, Scalar narep, Col * will be replaced by the specified string. * @param columns array of columns containing strings, must be non-empty * @param separate_nulls if true, then the separator is included for null rows if - * `col_narep` is valid. + * `narep` is valid. * @return A new java column vector containing the concatenated strings. */ public static ColumnVector stringConcatenate(Scalar separator, Scalar narep, ColumnView[] columns, @@ -548,7 +548,7 @@ public static ColumnVector stringConcatenate(Scalar separator, Scalar narep, Col * Concatenate columns of strings together using a separator specified for each row * and returns the result as a string column. If the row separator for a given row is null, * output column for that row is null. Null column values for a given row are skipped. 
- * @param columns array of columns containing strings, must be more than 1 columns + * @param columns array of columns containing strings * @param sep_col strings column that provides the separator for a given row * @return A new java column vector containing the concatenated strings with separator between. */ @@ -565,10 +565,12 @@ public static ColumnVector stringConcatenate(ColumnView[] columns, ColumnView se * output column for that row is null unless separator_narep is provided. * The separator is applied between two output row values if the separate_nulls * is `YES` or only between valid rows if separate_nulls is `NO`. - * @param columns array of columns containing strings, must be more than 1 columns + * @param columns array of columns containing strings * @param sep_col strings column that provides the separator for a given row - * @param separator_narep String that should be used in place of a null separator for a given - * row. + * @param separator_narep string scalar indicating null behavior when a separator is null. + * If set to null and the separator is null the resulting string will + * be null. If not null, this string will be used in place of a null + * separator. * @param col_narep string that should be used in place of any null strings * found in any column. * @param separate_nulls if true, then the separator is included for null rows if @@ -616,31 +618,32 @@ public static ColumnVector stringConcatenateListElements(ColumnView list_column, * Given a lists column of strings (each row is a list of strings), concatenates the strings * within each row and returns a single strings column result. * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the row separator provided in the sep_colstrings column. + * delimited by the row separator provided in the sep_col strings column. * @param list_column column containing lists of strings to concatenate. 
* @param sep_col strings column that provides separators for concatenation. - * @param separator_narep string that should be used to replace null separator, default is an - * invalid-scalar denoting that rows containing null separator will - * result in null string in the corresponding output rows. + * @param separator_narep string scalar indicating null behavior when a separator is null. + * If set to null and the separator is null the resulting string will + * be null. If not null, this string will be used in place of a null + * separator. * @param string_narep string that should be used to replace null strings in any non-null list - * row, default is an invalid-scalar denoting that list rows containing null - * strings will result in null string in the corresponding output rows. + * row. If set to null and the string is null the resulting string will + * be null. If not null, this string will be used in place of a null value. * @param separate_nulls if true, then the separator is included for null rows if - * `col_narep` is valid. + * `string_narep` is valid. * @param empty_string_output_if_empty_list if set to true, any input row that is an empty list * will result in an empty string. Otherwise, it will result in a null. * @return A new java column vector containing the concatenated strings with separator between. 
*/ public static ColumnVector stringConcatenateListElements(ColumnView list_column, - ColumnView sep_col, Scalar separator_narep, Scalar col_narep, boolean separate_nulls, + ColumnView sep_col, Scalar separator_narep, Scalar string_narep, boolean separate_nulls, boolean empty_string_output_if_empty_list) { assert separator_narep != null : "separator narep scalar provided may not be null"; - assert col_narep != null : "column narep scalar provided may not be null"; + assert string_narep != null : "string narep scalar provided may not be null"; assert separator_narep.getType().equals(DType.STRING) : "separator naprep scalar must be a string scalar"; - assert col_narep.getType().equals(DType.STRING) : "column narep scalar must be a string scalar"; + assert string_narep.getType().equals(DType.STRING) : "string narep scalar must be a string scalar"; return new ColumnVector(stringConcatenationListElementsSepCol(list_column.getNativeView(), - sep_col.getNativeView(), separator_narep.getScalarHandle(), col_narep.getScalarHandle(), + sep_col.getNativeView(), separator_narep.getScalarHandle(), string_narep.getScalarHandle(), separate_nulls, empty_string_output_if_empty_list)); } @@ -656,7 +659,7 @@ public static ColumnVector stringConcatenateListElements(ColumnView list_column, * column will be replaced by the specified string. The underlying value in the * string scalar may be null, but the object passed in may not. * @param separate_nulls if true, then the separator is included for null rows if - * `col_narep` is valid. + * `narep` is valid. * @param empty_string_output_if_empty_list if set to true, any input row that is an empty list * will result in an empty string. Otherwise, it will result in a null. * @return A new java column vector containing the concatenated strings with separator between. 
@@ -865,12 +868,13 @@ private static native long makeList(long[] handles, long typeHandle, int scale, * * @param columnViews array of longs holding the native handles of the column_views to combine. * @param separator string scalar inserted between each string being merged, may not be null. - * @param narep string scalar indicating null behavior. If set to null and any string in the row is null - * the resulting string will be null. If not null, null values in any column will be - * replaced by the specified string. The underlying value in the string scalar may be null, - * but the object passed in may not. + * @param narep string scalar indicating null behavior. If set to null and any string in + * the row is null the resulting string will be null. If not null, null + * values in any column will be replaced by the specified string. The + * underlying value in the string scalar may be null, but the object passed + * in may not. * @param separate_nulls boolean if true, then the separator is included for null rows if - * `col_narep` is valid. + * `narep` is valid. * @return native handle of the resulting cudf column, used to construct the Java column * by the stringConcatenate method. */ @@ -882,8 +886,10 @@ private static native long stringConcatenation(long[] columnViews, long separato * and returns the result as a string column. * @param columns array of longs holding the native handles of the column_views to combine. * @param sep_column long holding the native handle of the strings_column_view used as separators. - * @param separator_narep String scalar that should be used in place of a null separator for a given - * row. + * @param separator_narep string scalar indicating null behavior when a separator is null. + * If set to null and the separator is null the resulting string will + * be null. If not null, this string will be used in place of a null + * separator. 
* @param col_narep string String scalar that should be used in place of any null strings * found in any column. * @param separate_nulls boolean if true, then the separator is included for null rows if @@ -905,8 +911,10 @@ private static native long stringConcatenationSepCol(long[] columnViews, * to concatenate. * @param sep_col long holding the native handle of the strings column that provides separators * for concatenation. - * @param separator_narep String scalar that should be used in place of a null separator for a given - * row. + * @param separator_narep string scalar indicating null behavior when a separator is null. + * If set to null and the separator is null the resulting string will + * be null. If not null, this string will be used in place of a null + * separator. * @param col_narep string String scalar that should be used in place of any null strings * found in any column. * @param separate_nulls boolean if true, then the separator is included for null rows if @@ -942,10 +950,10 @@ private static native long stringConcatenationListElementsSepCol(long list_colum * @return native handle of the resulting cudf column, used to construct the Java column. */ private static native long stringConcatenationListElements(long list_column, - long separator, - long narep, - boolean separate_nulls, - boolean empty_string_output_if_empty_list); + long separator, + long narep, + boolean separate_nulls, + boolean empty_string_output_if_empty_list); /** * Native method to hash each row of the given table. 
Hashing function dispatched on the From cedd6b1f9f1da6c69980ea68b6a51e93ace7f63b Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Wed, 19 May 2021 20:41:28 -0500 Subject: [PATCH 17/22] remove extra include --- java/src/main/native/src/ColumnVectorJni.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index c39175bceac..db570fc1b6c 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -28,7 +28,6 @@ #include #include #include -#include #include #include "cudf_jni_apis.hpp" From e9d7b220cc413bea33bd0044ef62d3aa97cbc135 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Wed, 19 May 2021 20:43:51 -0500 Subject: [PATCH 18/22] Add test for one column --- .../test/java/ai/rapids/cudf/ColumnVectorTest.java | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index c5da4f53880..faf99308db7 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2183,6 +2183,20 @@ void testStringConcatWsSimpleOtherApi() { } } + @Test + void testStringConcatWsOneCol() { + try (ColumnVector sv1 = ColumnVector.fromStrings("a"); + ColumnVector sep_col = ColumnVector.fromStrings("-*"); + ColumnVector e_concat = ColumnVector.fromStrings("a"); + Scalar separatorString = Scalar.fromString(null); + Scalar col_narep = Scalar.fromString(""); + ColumnVector concat = ColumnVector.stringConcatenate( + new ColumnView[]{sv1}, sep_col, separatorString, + col_narep, false)) { + assertColumnsAreEqual(e_concat, concat); + } + } + @Test void testStringConcatWsNullSep() { try (ColumnVector sv1 = ColumnVector.fromStrings("a", "c"); From 91713ad049ca3f6808c7724e2c12712ebdd27950 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Wed, 19 May 2021 21:48:19 -0500 
Subject: [PATCH 19/22] Move string concatenate list element functions to ColumnView and update tests --- .../java/ai/rapids/cudf/ColumnVector.java | 134 ------------------ .../main/java/ai/rapids/cudf/ColumnView.java | 133 +++++++++++++++++ java/src/main/native/src/ColumnVectorJni.cpp | 62 -------- java/src/main/native/src/ColumnViewJni.cpp | 64 +++++++++ .../java/ai/rapids/cudf/ColumnVectorTest.java | 51 ++++--- 5 files changed, 230 insertions(+), 214 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 7d60c6db8c6..7a50c439ede 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -595,87 +595,6 @@ public static ColumnVector stringConcatenate(ColumnView[] columns, separator_narep.getScalarHandle(), col_narep.getScalarHandle(), separate_nulls)); } - /** - * Given a lists column of strings (each row is a list of strings), concatenates the strings - * within each row and returns a single strings column result. Each new string is created by - * concatenating the strings from the same row (same list element) delimited by the separator - * provided. This version of the function relaces nulls with empty string and returns null - * for empty list. - * @param list_column column containing lists of strings to concatenate. - * @param sep_col strings column that provides separators for concatenation. - * @return A new java column vector containing the concatenated strings with separator between. 
- */ - public static ColumnVector stringConcatenateListElements(ColumnView list_column, - ColumnView sep_col) { - try (Scalar nullString = Scalar.fromString(null); - Scalar emptyString = Scalar.fromString("")) { - return stringConcatenateListElements(list_column, sep_col, nullString, emptyString, - false, false); - } - } - - /** - * Given a lists column of strings (each row is a list of strings), concatenates the strings - * within each row and returns a single strings column result. - * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the row separator provided in the sep_col strings column. - * @param list_column column containing lists of strings to concatenate. - * @param sep_col strings column that provides separators for concatenation. - * @param separator_narep string scalar indicating null behavior when a separator is null. - * If set to null and the separator is null the resulting string will - * be null. If not null, this string will be used in place of a null - * separator. - * @param string_narep string that should be used to replace null strings in any non-null list - * row. If set to null and the string is null the resulting string will - * be null. If not null, this string will be used in place of a null value. - * @param separate_nulls if true, then the separator is included for null rows if - * `string_narep` is valid. - * @param empty_string_output_if_empty_list if set to true, any input row that is an empty list - * will result in an empty string. Otherwise, it will result in a null. - * @return A new java column vector containing the concatenated strings with separator between. 
- */ - public static ColumnVector stringConcatenateListElements(ColumnView list_column, - ColumnView sep_col, Scalar separator_narep, Scalar string_narep, boolean separate_nulls, - boolean empty_string_output_if_empty_list) { - assert separator_narep != null : "separator narep scalar provided may not be null"; - assert string_narep != null : "string narep scalar provided may not be null"; - assert separator_narep.getType().equals(DType.STRING) : "separator naprep scalar must be a string scalar"; - assert string_narep.getType().equals(DType.STRING) : "string narep scalar must be a string scalar"; - - return new ColumnVector(stringConcatenationListElementsSepCol(list_column.getNativeView(), - sep_col.getNativeView(), separator_narep.getScalarHandle(), string_narep.getScalarHandle(), - separate_nulls, empty_string_output_if_empty_list)); - } - - /** - * Given a lists column of strings (each row is a list of strings), concatenates the strings - * within each row and returns a single strings column result. Each new string is created by - * concatenating the strings from the same row (same list element) delimited by the - * separator provided. - * @param list_column column containing lists of strings to concatenate. - * @param separator string scalar inserted between each string being merged. - * @param narep string scalar indicating null behavior. If set to null and any string in the row - * is null the resulting string will be null. If not null, null values in any - * column will be replaced by the specified string. The underlying value in the - * string scalar may be null, but the object passed in may not. - * @param separate_nulls if true, then the separator is included for null rows if - * `narep` is valid. - * @param empty_string_output_if_empty_list if set to true, any input row that is an empty list - * will result in an empty string. Otherwise, it will result in a null. 
- * @return A new java column vector containing the concatenated strings with separator between. - */ - public static ColumnVector stringConcatenateListElements(ColumnView list_column, - Scalar separator, Scalar narep, boolean separate_nulls, - boolean empty_string_output_if_empty_list) { - assert separator != null : "separator scalar provided may not be null"; - assert narep != null : "column narep scalar provided may not be null"; - assert narep.getType().equals(DType.STRING) : "narep scalar must be a string scalar"; - - return new ColumnVector(stringConcatenationListElements(list_column.getNativeView(), - separator.getScalarHandle(), narep.getScalarHandle(), separate_nulls, - empty_string_output_if_empty_list)); - } - /** * Concatenate columns of lists horizontally (row by row), combining a corresponding row * from each column into a single list row of a new column. @@ -902,59 +821,6 @@ private static native long stringConcatenationSepCol(long[] columnViews, long col_narep, boolean separate_nulls); - /** - * Native method to concatenate a list column of strings (each row is a list of strings), - * concatenates the strings within each row and returns a single strings column result. - * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the row separator provided in the `separators` strings column. - * @param list_column long holding the native handle of the column containing lists of strings - * to concatenate. - * @param sep_col long holding the native handle of the strings column that provides separators - * for concatenation. - * @param separator_narep string scalar indicating null behavior when a separator is null. - * If set to null and the separator is null the resulting string will - * be null. If not null, this string will be used in place of a null - * separator. - * @param col_narep string String scalar that should be used in place of any null strings - * found in any column. 
- * @param separate_nulls boolean if true, then the separator is included for null rows if - * `col_narep` is valid. - * @param empty_string_output_if_empty_list boolean if true, any input row that is an empty list - * will result in an empty string. Otherwise, it will result in a null. - * @return native handle of the resulting cudf column, used to construct the Java column. - */ - private static native long stringConcatenationListElementsSepCol(long list_column, - long sep_column, - long separator_narep, - long col_narep, - boolean separate_nulls, - boolean empty_string_output_if_empty_list); - - /** - * Native method to concatenate a list column of strings (each row is a list of strings), - * concatenates the strings within each row and returns a single strings column result. - * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the separator provided. - * @param list_column long holding the native handle of the column containing lists of strings - * to concatenate. - * @param separator string scalar inserted between each string being merged, may not be null. - * @param narep string scalar indicating null behavior. If set to null and any string in the row - * is null the resulting string will be null. If not null, null values in any - * column will be replaced by the specified string. The underlying value in the - * string scalar may be null, but the object passed in may not. - * @param separate_nulls boolean if true, then the separator is included for null rows if - * `col_narep` is valid. - * @param empty_string_output_if_empty_list boolean if true, any input row that is an empty list - * will result in an empty string. Otherwise, it will - * result in a null. - * @return native handle of the resulting cudf column, used to construct the Java column. 
- */ - private static native long stringConcatenationListElements(long list_column, - long separator, - long narep, - boolean separate_nulls, - boolean empty_string_output_if_empty_list); - /** * Native method to hash each row of the given table. Hashing function dispatched on the * native side using the hashId. diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 8a551d3452a..ba9242e800d 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2116,6 +2116,85 @@ public final ColumnVector substring(ColumnView start, ColumnView end) { return new ColumnVector(substringColumn(getNativeView(), start.getNativeView(), end.getNativeView())); } + /** + * Given a lists column of strings (each row is a list of strings), concatenates the strings + * within each row and returns a single strings column result. Each new string is created by + * concatenating the strings from the same row (same list element) delimited by the separator + * provided. This version of the function replaces nulls with empty string and returns null + * for empty list. + * @param list_column column containing lists of strings to concatenate. + * @param sep_col strings column that provides separators for concatenation. + * @return A new java column vector containing the concatenated strings with separator between. + */ + public final ColumnVector stringConcatenateListElements(ColumnView sep_col) { + try (Scalar nullString = Scalar.fromString(null); + Scalar emptyString = Scalar.fromString("")) { + return stringConcatenateListElements(sep_col, nullString, emptyString, + false, false); + } + } + + /** + * Given a lists column of strings (each row is a list of strings), concatenates the strings + * within each row and returns a single strings column result. 
+ * Each new string is created by concatenating the strings from the same row (same list element) + * delimited by the row separator provided in the sep_col strings column. + * @param sep_col strings column that provides separators for concatenation. + * @param separator_narep string scalar indicating null behavior when a separator is null. + * If set to null and the separator is null the resulting string will + * be null. If not null, this string will be used in place of a null + * separator. + * @param string_narep string that should be used to replace null strings in any non-null list + * row. If set to null and the string is null the resulting string will + * be null. If not null, this string will be used in place of a null value. + * @param separate_nulls if true, then the separator is included for null rows if + * `string_narep` is valid. + * @param empty_string_output_if_empty_list if set to true, any input row that is an empty list + * will result in an empty string. Otherwise, it will result in a null. + * @return A new java column vector containing the concatenated strings with separator between. 
+ */ + public final ColumnVector stringConcatenateListElements(ColumnView sep_col, + Scalar separator_narep, Scalar string_narep, boolean separate_nulls, + boolean empty_string_output_if_empty_list) { + assert type.equals(DType.LIST) : "column type must be a list"; + assert separator_narep != null : "separator narep scalar provided may not be null"; + assert string_narep != null : "string narep scalar provided may not be null"; + assert separator_narep.getType().equals(DType.STRING) : "separator naprep scalar must be a string scalar"; + assert string_narep.getType().equals(DType.STRING) : "string narep scalar must be a string scalar"; + + return new ColumnVector(stringConcatenationListElementsSepCol(getNativeView(), + sep_col.getNativeView(), separator_narep.getScalarHandle(), string_narep.getScalarHandle(), + separate_nulls, empty_string_output_if_empty_list)); + } + + /** + * Given a lists column of strings (each row is a list of strings), concatenates the strings + * within each row and returns a single strings column result. Each new string is created by + * concatenating the strings from the same row (same list element) delimited by the + * separator provided. + * @param separator string scalar inserted between each string being merged. + * @param narep string scalar indicating null behavior. If set to null and any string in the row + * is null the resulting string will be null. If not null, null values in any + * column will be replaced by the specified string. The underlying value in the + * string scalar may be null, but the object passed in may not. + * @param separate_nulls if true, then the separator is included for null rows if + * `narep` is valid. + * @param empty_string_output_if_empty_list if set to true, any input row that is an empty list + * will result in an empty string. Otherwise, it will result in a null. + * @return A new java column vector containing the concatenated strings with separator between. 
+ */ + public final ColumnVector stringConcatenateListElements(Scalar separator, + Scalar narep, boolean separate_nulls, boolean empty_string_output_if_empty_list) { + assert type.equals(DType.LIST) : "column type must be a list"; + assert separator != null : "separator scalar provided may not be null"; + assert narep != null : "column narep scalar provided may not be null"; + assert narep.getType().equals(DType.STRING) : "narep scalar must be a string scalar"; + + return new ColumnVector(stringConcatenationListElements(getNativeView(), + separator.getScalarHandle(), narep.getScalarHandle(), separate_nulls, + empty_string_output_if_empty_list)); + } + /** * Apply a JSONPath string to all rows in an input strings column. * @@ -2712,6 +2791,60 @@ static DeviceMemoryBufferView getOffsetsBuffer(long viewHandle) { */ private static native long stringTimestampToTimestamp(long viewHandle, int unit, String format); + /** + * Native method to concatenate a list column of strings (each row is a list of strings), + * concatenates the strings within each row and returns a single strings column result. + * Each new string is created by concatenating the strings from the same row (same list element) + * delimited by the row separator provided in the `sepColumn` strings column. + * @param listColumnHandle long holding the native handle of the column containing lists of strings + * to concatenate. + * @param sepColumn long holding the native handle of the strings column that provides separators + * for concatenation. + * @param separatorNarep string scalar indicating null behavior when a separator is null. + * If set to null and the separator is null the resulting string will + * be null. If not null, this string will be used in place of a null + * separator. + * @param colNarep string scalar that should be used in place of any null strings + * found in any column. 
+ * @param separateNulls boolean if true, then the separator is included for null rows if + * `colNarep` is valid. + * @param emptyStringOutputIfEmptyList boolean if true, any input row that is an empty list + * will result in an empty string. Otherwise, it will + * result in a null. + * @return native handle of the resulting cudf column, used to construct the Java column. + */ + private static native long stringConcatenationListElementsSepCol(long listColumnHandle, + long sepColumn, + long separatorNarep, + long colNarep, + boolean separateNulls, + boolean emptyStringOutputIfEmptyList); + + /** + * Native method to concatenate a list column of strings (each row is a list of strings), + * concatenates the strings within each row and returns a single strings column result. + * Each new string is created by concatenating the strings from the same row (same list element) + * delimited by the separator provided. + * @param listColumnHandle long holding the native handle of the column containing lists of strings + * to concatenate. + * @param separator string scalar inserted between each string being merged, may not be null. + * @param narep string scalar indicating null behavior. If set to null and any string in the row + * is null the resulting string will be null. If not null, null values in any + * column will be replaced by the specified string. The underlying value in the + * string scalar may be null, but the object passed in may not. + * @param separateNulls boolean if true, then the separator is included for null rows if + * `narep` is valid. + * @param emptyStringOutputIfEmptyList boolean if true, any input row that is an empty list + * will result in an empty string. Otherwise, it will + * result in a null. + * @return native handle of the resulting cudf column, used to construct the Java column. 
+ */ + private static native long stringConcatenationListElements(long listColumnHandle, + long separator, + long narep, + boolean separateNulls, + boolean emptyStringOutputIfEmptyList); + private static native long getJSONObject(long viewHandle, long scalarHandle) throws CudfException; /** diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index db570fc1b6c..772beebc330 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -189,68 +189,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationSepC CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationListElementsSepCol(JNIEnv *env, jclass, - jlong column_handle, - jlong sep_handle, - jlong separator_narep, - jlong col_narep, - jboolean separate_nulls, - jboolean empty_string_output_if_empty_list) { - JNI_NULL_CHECK(env, column_handle, "column handle is null", 0); - JNI_NULL_CHECK(env, sep_handle, "separator column handle is null", 0); - JNI_NULL_CHECK(env, separator_narep, "separator narep string scalar object is null", 0); - JNI_NULL_CHECK(env, col_narep, "column narep string scalar object is null", 0); - try { - cudf::jni::auto_set_device(env); - const auto& separator_narep_scalar = *reinterpret_cast(separator_narep); - const auto& col_narep_scalar = *reinterpret_cast(col_narep); - auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES - : cudf::strings::separator_on_nulls::NO; - auto empty_list_output = - empty_string_output_if_empty_list ? 
cudf::strings::output_if_empty_list::EMPTY_STRING - : cudf::strings::output_if_empty_list::NULL_ELEMENT; - - cudf::column_view *column = reinterpret_cast(sep_handle); - cudf::strings_column_view strings_column(*column); - cudf::column_view *cv = reinterpret_cast(column_handle); - cudf::lists_column_view lcv(*cv); - std::unique_ptr result = - cudf::strings::join_list_elements(lcv, strings_column, separator_narep_scalar, - col_narep_scalar, null_policy, empty_list_output); - return reinterpret_cast(result.release()); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationListElements(JNIEnv *env, jclass, - jlong column_handle, - jlong separator, - jlong narep, - jboolean separate_nulls, - jboolean empty_string_output_if_empty_list) { - JNI_NULL_CHECK(env, column_handle, "column handle is null", 0); - JNI_NULL_CHECK(env, separator, "separator string scalar object is null", 0); - JNI_NULL_CHECK(env, narep, "separator narep string scalar object is null", 0); - try { - cudf::jni::auto_set_device(env); - const auto& separator_scalar = *reinterpret_cast(separator); - const auto& narep_scalar = *reinterpret_cast(narep); - auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES - : cudf::strings::separator_on_nulls::NO; - auto empty_list_output = - empty_string_output_if_empty_list ? 
cudf::strings::output_if_empty_list::EMPTY_STRING - : cudf::strings::output_if_empty_list::NULL_ELEMENT; - - cudf::column_view *cv = reinterpret_cast(column_handle); - cudf::lists_column_view lcv(*cv); - std::unique_ptr result = - cudf::strings::join_list_elements(lcv, separator_scalar, narep_scalar, - null_policy, empty_list_output); - return reinterpret_cast(result.release()); - } - CATCH_STD(env, 0); -} - JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_concatListByRow(JNIEnv *env, jclass, jlongArray column_handles, jboolean ignore_null) { diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index f09b9029ecb..b82584afc1c 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -1867,4 +1868,67 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env CATCH_STD(env, 0) } + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringConcatenationListElementsSepCol(JNIEnv *env, jclass, + jlong column_handle, + jlong sep_handle, + jlong separator_narep, + jlong col_narep, + jboolean separate_nulls, + jboolean empty_string_output_if_empty_list) { + JNI_NULL_CHECK(env, column_handle, "column handle is null", 0); + JNI_NULL_CHECK(env, sep_handle, "separator column handle is null", 0); + JNI_NULL_CHECK(env, separator_narep, "separator narep string scalar object is null", 0); + JNI_NULL_CHECK(env, col_narep, "column narep string scalar object is null", 0); + try { + cudf::jni::auto_set_device(env); + const auto& separator_narep_scalar = *reinterpret_cast(separator_narep); + const auto& col_narep_scalar = *reinterpret_cast(col_narep); + auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES + : cudf::strings::separator_on_nulls::NO; + auto empty_list_output = + empty_string_output_if_empty_list ? 
cudf::strings::output_if_empty_list::EMPTY_STRING + : cudf::strings::output_if_empty_list::NULL_ELEMENT; + + cudf::column_view *column = reinterpret_cast(sep_handle); + cudf::strings_column_view strings_column(*column); + cudf::column_view *cv = reinterpret_cast(column_handle); + cudf::lists_column_view lcv(*cv); + std::unique_ptr result = + cudf::strings::join_list_elements(lcv, strings_column, separator_narep_scalar, + col_narep_scalar, null_policy, empty_list_output); + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringConcatenationListElements(JNIEnv *env, jclass, + jlong column_handle, + jlong separator, + jlong narep, + jboolean separate_nulls, + jboolean empty_string_output_if_empty_list) { + JNI_NULL_CHECK(env, column_handle, "column handle is null", 0); + JNI_NULL_CHECK(env, separator, "separator string scalar object is null", 0); + JNI_NULL_CHECK(env, narep, "separator narep string scalar object is null", 0); + try { + cudf::jni::auto_set_device(env); + const auto& separator_scalar = *reinterpret_cast(separator); + const auto& narep_scalar = *reinterpret_cast(narep); + auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES + : cudf::strings::separator_on_nulls::NO; + auto empty_list_output = + empty_string_output_if_empty_list ? 
cudf::strings::output_if_empty_list::EMPTY_STRING + : cudf::strings::output_if_empty_list::NULL_ELEMENT; + + cudf::column_view *cv = reinterpret_cast(column_handle); + cudf::lists_column_view lcv(*cv); + std::unique_ptr result = + cudf::strings::join_list_elements(lcv, separator_scalar, narep_scalar, + null_policy, empty_list_output); + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0); +} + } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index faf99308db7..137abc74953 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2302,13 +2302,27 @@ void testStringConcatWsNullAllColSepTrue() { void testStringConcatWsSingleListCol() { try (ColumnVector cv1 = ColumnVector.fromLists(new HostColumnVector.ListType(true, new HostColumnVector.BasicType(true, DType.STRING)), - Arrays.asList("aaa"), Arrays.asList("b", "c", "d"), Arrays.asList("\u0480\u0481", null, "asdfbe", null)); + Arrays.asList("aaa"), Arrays.asList("b", "c", "d"), + Arrays.asList("\u0480\u0481", null, "asdfbe", null)); ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "*"); ColumnVector e_concat = ColumnVector.fromStrings("aaa", "b-c-d", "\u0480\u0481*asdfbe"); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, sep_col, - separatorString, col_narep, false, false)) { + ColumnVector concat = cv1.stringConcatenateListElements(sep_col, separatorString, + col_narep, false, false)) { + assertColumnsAreEqual(e_concat, concat); + } + } + + @Test + void testStringConcatWsSingleListColDefaultApi() { + try (ColumnVector cv1 = ColumnVector.fromLists(new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("aaa"), Arrays.asList("b", "c", "d"), + 
Arrays.asList("\u0480\u0481", null, "asdfbe", null)); + ColumnVector sep_col = ColumnVector.fromStrings("-", "-", "*"); + ColumnVector e_concat = ColumnVector.fromStrings("aaa", "b-c-d", "\u0480\u0481*asdfbe"); + ColumnVector concat = cv1.stringConcatenateListElements(sep_col)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2317,12 +2331,13 @@ void testStringConcatWsSingleListCol() { void testStringConcatWsSingleListColScalarSep() { try (ColumnVector cv1 = ColumnVector.fromLists(new HostColumnVector.ListType(true, new HostColumnVector.BasicType(true, DType.STRING)), - Arrays.asList("aaa"), Arrays.asList("b", "c", "d"), Arrays.asList("\u0480\u0481", null, "asdfbe", null)); + Arrays.asList("aaa"), Arrays.asList("b", "c", "d"), + Arrays.asList("\u0480\u0481", null, "asdfbe", null)); Scalar separatorString = Scalar.fromString("-"); ColumnVector e_concat = ColumnVector.fromStrings("aaa", "b-c-d", "\u0480\u0481-asdfbe"); Scalar narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, separatorString, - narep, false, false)) { + ColumnVector concat = cv1.stringConcatenateListElements(separatorString, narep, false, + false)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2336,8 +2351,8 @@ void testStringConcatWsSingleListColAllNulls() { ColumnVector e_concat = ColumnVector.fromStrings("aaa", null); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, sep_col, - separatorString, col_narep, false, false)) { + ColumnVector concat = cv1.stringConcatenateListElements(sep_col, separatorString, + col_narep, false, false)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2350,8 +2365,8 @@ void testStringConcatWsSingleListColAllNullsScalarSep() { ColumnVector e_concat = ColumnVector.fromStrings("aaa", null); Scalar separatorString = Scalar.fromString("-"); Scalar narep = Scalar.fromString(""); - ColumnVector concat = 
ColumnVector.stringConcatenateListElements(cv1, - separatorString, narep, false, false)) { + ColumnVector concat = cv1.stringConcatenateListElements(separatorString, narep, + false, false)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2365,8 +2380,8 @@ void testStringConcatWsSingleListColAllNullsSepTrue() { ColumnVector e_concat = ColumnVector.fromStrings("aaa", null); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); - ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, sep_col, - separatorString, col_narep, true, false)) { + ColumnVector concat = cv1.stringConcatenateListElements(sep_col, separatorString, + col_narep, true, false)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2380,8 +2395,8 @@ void testStringConcatWsSingleListColAllNullsKeepNulls() { ColumnVector e_concat = ColumnVector.fromStrings("aaa", null); Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(null); - ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, sep_col, - separatorString, col_narep, true, false)) { + ColumnVector concat = cv1.stringConcatenateListElements(sep_col, separatorString, + col_narep, true, false)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2396,8 +2411,8 @@ void testStringConcatWsSingleListColEmptyArray() { Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); // set the parameter to return null on empty array - ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, sep_col, - separatorString, col_narep, false, false)) { + ColumnVector concat = cv1.stringConcatenateListElements(sep_col, separatorString, + col_narep, false, false)) { assertColumnsAreEqual(e_concat, concat); } } @@ -2412,8 +2427,8 @@ void testStringConcatWsSingleListColEmptyArrayReturnEmpty() { Scalar separatorString = Scalar.fromString(null); Scalar col_narep = Scalar.fromString(""); // set the parameter to return empty string 
on empty array - ColumnVector concat = ColumnVector.stringConcatenateListElements(cv1, sep_col, - separatorString, col_narep, false, true)) { + ColumnVector concat = cv1.stringConcatenateListElements(sep_col, separatorString, + col_narep, false, true)) { assertColumnsAreEqual(e_concat, concat); } } From 6e1c2be425064013bb54a2ca94ab8702f9e86cf6 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Thu, 20 May 2021 13:06:37 -0500 Subject: [PATCH 20/22] Update to use camel case and fix doc --- .../java/ai/rapids/cudf/ColumnVector.java | 52 +++++++++---------- .../main/java/ai/rapids/cudf/ColumnView.java | 4 +- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 7a50c439ede..a4510a7033d 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -521,12 +521,12 @@ public static ColumnVector stringConcatenate(Scalar separator, Scalar narep, Col * is null the resulting string will be null. If not null, null values in any column * will be replaced by the specified string. * @param columns array of columns containing strings, must be non-empty - * @param separate_nulls if true, then the separator is included for null rows if + * @param separateNulls if true, then the separator is included for null rows if * `narep` is valid. * @return A new java column vector containing the concatenated strings. 
*/ public static ColumnVector stringConcatenate(Scalar separator, Scalar narep, ColumnView[] columns, - boolean separate_nulls) { + boolean separateNulls) { assert columns != null : "input columns should not be null"; assert columns.length > 0 : "input columns should not be empty"; assert separator != null : "separator scalar provided may not be null"; @@ -534,14 +534,14 @@ public static ColumnVector stringConcatenate(Scalar separator, Scalar narep, Col assert narep != null : "narep scalar provided may not be null"; assert narep.getType().equals(DType.STRING) : "narep scalar must be a string scalar"; - long[] column_views = new long[columns.length]; + long[] columnViews = new long[columns.length]; for(int i = 0; i < columns.length; i++) { assert columns[i] != null : "Column vectors passed may not be null"; - column_views[i] = columns[i].getNativeView(); + columnViews[i] = columns[i].getNativeView(); } - return new ColumnVector(stringConcatenation(column_views, separator.getScalarHandle(), - narep.getScalarHandle(), separate_nulls)); + return new ColumnVector(stringConcatenation(columnViews, separator.getScalarHandle(), + narep.getScalarHandle(), separateNulls)); } /** @@ -549,50 +549,50 @@ public static ColumnVector stringConcatenate(Scalar separator, Scalar narep, Col * and returns the result as a string column. If the row separator for a given row is null, * output column for that row is null. Null column values for a given row are skipped. * @param columns array of columns containing strings - * @param sep_col strings column that provides the separator for a given row + * @param sepCol strings column that provides the separator for a given row * @return A new java column vector containing the concatenated strings with separator between. 
*/ - public static ColumnVector stringConcatenate(ColumnView[] columns, ColumnView sep_col) { + public static ColumnVector stringConcatenate(ColumnView[] columns, ColumnView sepCol) { try (Scalar nullString = Scalar.fromString(null); Scalar emptyString = Scalar.fromString("")) { - return stringConcatenate(columns, sep_col, nullString, emptyString, false); + return stringConcatenate(columns, sepCol, nullString, emptyString, false); } } /** * Concatenate columns of strings together using a separator specified for each row * and returns the result as a string column. If the row separator for a given row is null, - * output column for that row is null unless separator_narep is provided. - * The separator is applied between two output row values if the separate_nulls - * is `YES` or only between valid rows if separate_nulls is `NO`. + * output column for that row is null unless separatorNarep is provided. + * The separator is applied between two output row values if the separateNulls + * is `YES` or only between valid rows if separateNulls is `NO`. * @param columns array of columns containing strings - * @param sep_col strings column that provides the separator for a given row - * @param separator_narep string scalar indicating null behavior when a separator is null. + * @param sepCol strings column that provides the separator for a given row + * @param separatorNarep string scalar indicating null behavior when a separator is null. * If set to null and the separator is null the resulting string will * be null. If not null, this string will be used in place of a null * separator. - * @param col_narep string that should be used in place of any null strings + * @param colNarep string that should be used in place of any null strings * found in any column. - * @param separate_nulls if true, then the separator is included for null rows if - * `col_narep` is valid. + * @param separateNulls if true, then the separator is included for null rows if + * `colNarep` is valid. 
* @return A new java column vector containing the concatenated strings with separator between. */ public static ColumnVector stringConcatenate(ColumnView[] columns, - ColumnView sep_col, Scalar separator_narep, Scalar col_narep, boolean separate_nulls) { + ColumnView sepCol, Scalar separatorNarep, Scalar colNarep, boolean separateNulls) { assert columns.length >= 1 : ".stringConcatenate() operation requires at least 1 column"; - assert separator_narep != null : "separator narep scalar provided may not be null"; - assert col_narep != null : "column narep scalar provided may not be null"; - assert separator_narep.getType().equals(DType.STRING) : "separator naprep scalar must be a string scalar"; - assert col_narep.getType().equals(DType.STRING) : "column narep scalar must be a string scalar"; + assert separatorNarep != null : "separator narep scalar provided may not be null"; + assert colNarep != null : "column narep scalar provided may not be null"; + assert separatorNarep.getType().equals(DType.STRING) : "separator naprep scalar must be a string scalar"; + assert colNarep.getType().equals(DType.STRING) : "column narep scalar must be a string scalar"; - long[] column_views = new long[columns.length]; + long[] columnViews = new long[columns.length]; for(int i = 0; i < columns.length; i++) { assert columns[i] != null : "Column vectors passed may not be null"; - column_views[i] = columns[i].getNativeView(); + columnViews[i] = columns[i].getNativeView(); } - return new ColumnVector(stringConcatenationSepCol(column_views, sep_col.getNativeView(), - separator_narep.getScalarHandle(), col_narep.getScalarHandle(), separate_nulls)); + return new ColumnVector(stringConcatenationSepCol(columnViews, sepCol.getNativeView(), + separatorNarep.getScalarHandle(), colNarep.getScalarHandle(), separateNulls)); } /** diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index ba9242e800d..c62097c2ec9 100644 --- 
a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2807,7 +2807,7 @@ static DeviceMemoryBufferView getOffsetsBuffer(long viewHandle) { * @param colNarep string String scalar that should be used in place of any null strings * found in any column. * @param separateNulls boolean if true, then the separator is included for null rows if - * `col_narep` is valid. + * `colNarep` is valid. * @param emptyStringOutputIfEmptyList boolean if true, any input row that is an empty list * will result in an empty string. Otherwise, it will * result in a null. @@ -2833,7 +2833,7 @@ private static native long stringConcatenationListElementsSepCol(long listColumn * column will be replaced by the specified string. The underlying value in the * string scalar may be null, but the object passed in may not. * @param separateNulls boolean if true, then the separator is included for null rows if - * `col_narep` is valid. + * `narep` is valid. * @param emptyStringOutputIfEmptyList boolean if true, any input row that is an empty list * will result in an empty string. Otherwise, it will * result in a null. From f8612d7abef7b671a3aa70aa7effa16909455914 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Thu, 20 May 2021 17:17:41 -0500 Subject: [PATCH 21/22] Fix camel case in column view --- .../main/java/ai/rapids/cudf/ColumnView.java | 49 +++++++++---------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index c62097c2ec9..d3b09c3b2bd 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2122,14 +2122,13 @@ public final ColumnVector substring(ColumnView start, ColumnView end) { * concatenating the strings from the same row (same list element) delimited by the separator * provided. 
This version of the function relaces nulls with empty string and returns null * for empty list. - * @param list_column column containing lists of strings to concatenate. - * @param sep_col strings column that provides separators for concatenation. + * @param sepCol strings column that provides separators for concatenation. * @return A new java column vector containing the concatenated strings with separator between. */ - public final ColumnVector stringConcatenateListElements(ColumnView sep_col) { + public final ColumnVector stringConcatenateListElements(ColumnView sepCol) { try (Scalar nullString = Scalar.fromString(null); Scalar emptyString = Scalar.fromString("")) { - return stringConcatenateListElements(sep_col, nullString, emptyString, + return stringConcatenateListElements(sepCol, nullString, emptyString, false, false); } } @@ -2138,33 +2137,33 @@ public final ColumnVector stringConcatenateListElements(ColumnView sep_col) { * Given a lists column of strings (each row is a list of strings), concatenates the strings * within each row and returns a single strings column result. * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the row separator provided in the sep_col strings column. - * @param sep_col strings column that provides separators for concatenation. - * @param separator_narep string scalar indicating null behavior when a separator is null. + * delimited by the row separator provided in the sepCol strings column. + * @param sepCol strings column that provides separators for concatenation. + * @param separatorNarep string scalar indicating null behavior when a separator is null. * If set to null and the separator is null the resulting string will * be null. If not null, this string will be used in place of a null * separator. 
- * @param string_narep string that should be used to replace null strings in any non-null list + * @param stringNarep string that should be used to replace null strings in any non-null list * row. If set to null and the string is null the resulting string will * be null. If not null, this string will be used in place of a null value. - * @param separate_nulls if true, then the separator is included for null rows if - * `string_narep` is valid. - * @param empty_string_output_if_empty_list if set to true, any input row that is an empty list + * @param separateNulls if true, then the separator is included for null rows if + * `stringNarep` is valid. + * @param emptyStringOutputIfEmptyList if set to true, any input row that is an empty list * will result in an empty string. Otherwise, it will result in a null. * @return A new java column vector containing the concatenated strings with separator between. */ - public final ColumnVector stringConcatenateListElements(ColumnView sep_col, - Scalar separator_narep, Scalar string_narep, boolean separate_nulls, - boolean empty_string_output_if_empty_list) { + public final ColumnVector stringConcatenateListElements(ColumnView sepCol, + Scalar separatorNarep, Scalar stringNarep, boolean separateNulls, + boolean emptyStringOutputIfEmptyList) { assert type.equals(DType.LIST) : "column type must be a list"; - assert separator_narep != null : "separator narep scalar provided may not be null"; - assert string_narep != null : "string narep scalar provided may not be null"; - assert separator_narep.getType().equals(DType.STRING) : "separator naprep scalar must be a string scalar"; - assert string_narep.getType().equals(DType.STRING) : "string narep scalar must be a string scalar"; + assert separatorNarep != null : "separator narep scalar provided may not be null"; + assert stringNarep != null : "string narep scalar provided may not be null"; + assert separatorNarep.getType().equals(DType.STRING) : "separator naprep scalar must be a 
string scalar"; + assert stringNarep.getType().equals(DType.STRING) : "string narep scalar must be a string scalar"; return new ColumnVector(stringConcatenationListElementsSepCol(getNativeView(), - sep_col.getNativeView(), separator_narep.getScalarHandle(), string_narep.getScalarHandle(), - separate_nulls, empty_string_output_if_empty_list)); + sepCol.getNativeView(), separatorNarep.getScalarHandle(), stringNarep.getScalarHandle(), + separateNulls, emptyStringOutputIfEmptyList)); } /** @@ -2177,22 +2176,22 @@ public final ColumnVector stringConcatenateListElements(ColumnView sep_col, * is null the resulting string will be null. If not null, null values in any * column will be replaced by the specified string. The underlying value in the * string scalar may be null, but the object passed in may not. - * @param separate_nulls if true, then the separator is included for null rows if + * @param separateNulls if true, then the separator is included for null rows if * `narep` is valid. - * @param empty_string_output_if_empty_list if set to true, any input row that is an empty list + * @param emptyStringOutputIfEmptyList if set to true, any input row that is an empty list * will result in an empty string. Otherwise, it will result in a null. * @return A new java column vector containing the concatenated strings with separator between. 
*/ public final ColumnVector stringConcatenateListElements(Scalar separator, - Scalar narep, boolean separate_nulls, boolean empty_string_output_if_empty_list) { + Scalar narep, boolean separateNulls, boolean emptyStringOutputIfEmptyList) { assert type.equals(DType.LIST) : "column type must be a list"; assert separator != null : "separator scalar provided may not be null"; assert narep != null : "column narep scalar provided may not be null"; assert narep.getType().equals(DType.STRING) : "narep scalar must be a string scalar"; return new ColumnVector(stringConcatenationListElements(getNativeView(), - separator.getScalarHandle(), narep.getScalarHandle(), separate_nulls, - empty_string_output_if_empty_list)); + separator.getScalarHandle(), narep.getScalarHandle(), separateNulls, + emptyStringOutputIfEmptyList)); } /** From 0bdf70c98ebb0440d7dd4111d7397e8727929f58 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Fri, 21 May 2021 10:11:10 -0500 Subject: [PATCH 22/22] PR 8282 changed behavior of passing single column to stringConcatenate, update test --- .../java/ai/rapids/cudf/ColumnVectorTest.java | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 137abc74953..fbd2543d9d0 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2085,15 +2085,16 @@ void testStringConcatWithNulls() { assertColumnsAreEqual(concat, e_concat); } - try (ColumnVector v = ColumnVector.fromStrings("a", "B", "cd", "\u0480\u0481", "E\tf", - "g\nH", "IJ\"\u0100\u0101\u0500\u0501", - "kl m", "Nop1", "\\qRs2", null, - "3tuV\'", "wX4Yz", "\ud720\ud721"); - Scalar emptyString = Scalar.fromString(""); - Scalar nullSubstitute = Scalar.fromString("NULL"); - ColumnVector concat = ColumnVector.stringConcatenate(emptyString, nullSubstitute, new ColumnView[]{v})) { - 
assertColumnsAreEqual(v, concat); - } + assertThrows(CudfException.class, () -> { + try (ColumnVector v = ColumnVector.fromStrings("a", "B", "cd", "\u0480\u0481", "E\tf", + "g\nH", "IJ\"\u0100\u0101\u0500\u0501", + "kl m", "Nop1", "\\qRs2", null, + "3tuV\'", "wX4Yz", "\ud720\ud721"); + Scalar emptyString = Scalar.fromString(""); + Scalar nullSubstitute = Scalar.fromString("NULL"); + ColumnVector concat = ColumnVector.stringConcatenate(emptyString, nullSubstitute, new ColumnView[]{v})) { + } + }); } @Test