diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 0cb9ed37d9f..84183819854 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -4639,6 +4639,10 @@ static native long makeCudfColumnView(int type, int scale, long data, long dataS
 
   static native long applyBooleanMask(long arrayColumnView, long booleanMaskHandle) throws CudfException;
 
+  static native boolean hasNonEmptyNulls(long handle) throws CudfException;
+
+  static native long purgeNonEmptyNulls(long handle) throws CudfException;
+
   /**
    * A utility class to create column vector like objects without refcounts and other APIs when
    * creating the device side vector from host side nested vectors. Eventually this can go away or
@@ -4997,4 +5001,37 @@ public HostColumnVector copyToHost() {
       }
     }
   }
+
+  /**
+   * Exact check if a column or its descendants have non-empty null rows
+   *
+   * @return Whether the column or its descendants have non-empty null rows
+   */
+  public boolean hasNonEmptyNulls() {
+    return hasNonEmptyNulls(viewHandle);
+  }
+
+  /**
+   * Copies this column into output while purging any non-empty null rows in the column or its
+   * descendants.
+   *
+   * If this column is not of compound type (LIST/STRING/STRUCT/DICTIONARY), the output will
+   * have the same contents as this column.
+   *
+   * The purge operation only applies directly to LIST and STRING columns, but it applies indirectly
+   * to STRUCT/DICTIONARY columns as well, since these columns may have child columns that
+   * are LIST or STRING.
+   *
+   * Examples:
+   * lists = [data: {{0,1}, {2,3}, {4,5}} validity: {true, false, true}]
+   * lists[1] is null, but the list's child column still stores `{2,3}`.
+   *
+   * After purging the contents of the list's null rows, the column's contents will be:
+   * lists = [data: {{0,1}, {4,5}} validity: {true, false, true}]
+   *
+   * @return A new column with equivalent contents to this column, but with null rows purged
+   */
+  public ColumnVector purgeNonEmptyNulls() {
+    return new ColumnVector(purgeNonEmptyNulls(viewHandle));
+  }
 }
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index c42cc430560..f2c361c5e8c 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -2457,4 +2457,26 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_applyBooleanMask(
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jboolean JNICALL
+Java_ai_rapids_cudf_ColumnView_hasNonEmptyNulls(JNIEnv *env, jclass, jlong column_view_handle) {
+  JNI_NULL_CHECK(env, column_view_handle, "column_view handle is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const *cv = reinterpret_cast<cudf::column_view const *>(column_view_handle);
+    return cudf::has_nonempty_nulls(*cv);
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_ColumnView_purgeNonEmptyNulls(JNIEnv *env, jclass, jlong column_view_handle) {
+  JNI_NULL_CHECK(env, column_view_handle, "column_view handle is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const *cv = reinterpret_cast<cudf::column_view const *>(column_view_handle);
+    return release_as_jlong(cudf::purge_nonempty_nulls(*cv));
+  }
+  CATCH_STD(env, 0);
+}
+
 } // extern "C"
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 937077c89c9..7848807dab8 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -32,6 +32,7 @@
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
+import java.util.Optional;
 import java.util.function.Supplier;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
@@ -6691,4 +6692,65 @@ void testApplyBooleanMaskFromListOfStructure() {
       assertColumnsAreEqual(expectedCv, actualCv);
     }
   }
+
+  /**
+   * The caller needs to make sure to close both of the returned ColumnViews
+   */
+  private ColumnView[] getColumnViewWithNonEmptyNulls() {
+    List<Integer> list0 = Arrays.asList(1, 2, 3);
+    List<Integer> list1 = Arrays.asList(4, 5, null);
+    List<Integer> list2 = Arrays.asList(7, 8, 9);
+    List<Integer> list3 = null;
+    ColumnVector input = makeListsColumn(DType.INT32, list0, list1, list2, list3);
+    // Mark row 1 as null in the validity buffer while leaving its offsets and child data in place
+    BaseDeviceMemoryBuffer dmb = input.getDeviceBufferFor(BufferType.VALIDITY);
+    try (HostMemoryBuffer newValidity = HostMemoryBuffer.allocate(64)) {
+      newValidity.copyFromDeviceBuffer(dmb);
+      BitVectorHelper.setNullAt(newValidity, 1);
+      dmb.copyFromHostBuffer(newValidity);
+    }
+    try (HostColumnVector hostColumnVector = input.copyToHost()) {
+      assertTrue(hostColumnVector.isNull(1));
+      assertTrue(hostColumnVector.isNull(3));
+    }
+    try (ColumnVector expectedOffsetsBeforePurge = ColumnVector.fromInts(0, 3, 6, 9, 9);
+         ColumnView offsetsCvBeforePurge = input.getListOffsetsView()) {
+      assertColumnsAreEqual(expectedOffsetsBeforePurge, offsetsCvBeforePurge);
+    }
+    ColumnView colWithNonEmptyNulls = new ColumnView(input.type, input.rows, Optional.of(2L), dmb,
+        input.getDeviceBufferFor(BufferType.OFFSET), input.getChildColumnViews());
+    assertEquals(2, colWithNonEmptyNulls.nullCount);
+    return new ColumnView[]{input, colWithNonEmptyNulls};
+  }
+
+  @Test
+  void testPurgeNonEmptyNullsList() {
+    ColumnView[] values = getColumnViewWithNonEmptyNulls();
+    try (ColumnView colWithNonEmptyNulls = values[1];
+         ColumnView input = values[0];
+         // purge non-empty nulls
+         ColumnView colWithEmptyNulls = colWithNonEmptyNulls.purgeNonEmptyNulls();
+         ColumnVector expectedOffsetsAfterPurge = ColumnVector.fromInts(0, 3, 3, 6, 6);
+         ColumnView offsetsCvAfterPurge = colWithEmptyNulls.getListOffsetsView()) {
+      assertTrue(colWithNonEmptyNulls.hasNonEmptyNulls());
+      assertColumnsAreEqual(expectedOffsetsAfterPurge, offsetsCvAfterPurge);
+      assertFalse(colWithEmptyNulls.hasNonEmptyNulls());
+    }
+  }
+
+  @Test
+  void testPurgeNonEmptyNullsStruct() {
+    ColumnView[] values = getColumnViewWithNonEmptyNulls();
+    try (ColumnView listCol = values[1];
+         ColumnView input = values[0];
+         ColumnView stringsCol = ColumnVector.fromStrings("A", "col", "of", "Strings");
+         ColumnView structView = ColumnView.makeStructView(stringsCol, listCol);
+         ColumnView structWithEmptyNulls = structView.purgeNonEmptyNulls();
+         ColumnView newListChild = structWithEmptyNulls.getChildColumnView(1);
+         ColumnVector expectedOffsetsAfterPurge = ColumnVector.fromInts(0, 3, 3, 6, 6);
+         ColumnView offsetsCvAfterPurge = newListChild.getListOffsetsView()) {
+      assertColumnsAreEqual(expectedOffsetsAfterPurge, offsetsCvAfterPurge);
+      assertFalse(newListChild.hasNonEmptyNulls());
+    }
+  }
 }