diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index b2c001c6737..cc1bc35f951 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -3473,6 +3473,16 @@ public final ColumnVector listSortRows(boolean isDescending, boolean isNullSmall return new ColumnVector(listSortRows(getNativeView(), isDescending, isNullSmallest)); } + /** + * Generate list offsets from the sizes of each list. + * NOTICE: This API only works for INT32 columns; the native code checks the type and raises an error for any other type. Null and negative values are not allowed and are not validated. + * + * @return a column of list offsets whose size is N + 1, where N is the number of rows in this column + */ + public final ColumnVector generateListOffsets() { + return new ColumnVector(generateListOffsets(getNativeView())); + } + /** * Get a single item from the column at the specified index as a Scalar. * @@ -4162,6 +4172,8 @@ static native long makeCudfColumnView(int type, int scale, long data, long dataS static native long copyColumnViewToCV(long viewHandle) throws CudfException; + static native long generateListOffsets(long handle) throws CudfException; + /** * A utility class to create column vector like objects without refcounts and other APIs when * creating the device side vector from host side nested vectors. 
Eventually this can go away or diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 6a294920d07..e074180c312 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -624,6 +624,19 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, CATCH_STD(env, 0); } +// Builds a list-offsets column from the list-size column behind `handle` and returns its native handle. +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_generateListOffsets(JNIEnv *env, jclass, + jlong handle) { + JNI_NULL_CHECK(env, handle, "handle is null", 0) + try { + cudf::jni::auto_set_device(env); + // `handle` is the address of a cudf::column_view passed from Java as a jlong. + auto const cv = reinterpret_cast<cudf::column_view const *>(handle); + return release_as_jlong(cudf::jni::generate_list_offsets(*cv)); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, jlong input_handle, jstring pattern_obj, diff --git a/java/src/main/native/src/ColumnViewJni.cu b/java/src/main/native/src/ColumnViewJni.cu index 47055ca1611..6b4db39eb34 100644 --- a/java/src/main/native/src/ColumnViewJni.cu +++ b/java/src/main/native/src/ColumnViewJni.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,8 +15,11 @@ */ #include +#include <cudf/column/column_factories.hpp> #include #include +#include <rmm/exec_policy.hpp> +#include <thrust/scan.h> #include "ColumnViewJni.hpp" @@ -51,4 +54,24 @@ new_column_with_boolean_column_as_validity(cudf::column_view const &exemplar, return deep_copy; } +std::unique_ptr<cudf::column> generate_list_offsets(cudf::column_view const &list_length, + rmm::cuda_stream_view stream) { + CUDF_EXPECTS(list_length.type().id() == cudf::type_id::INT32, + "Input column does not have type INT32."); + + auto const begin_iter = list_length.template begin<int32_t>(); + auto const end_iter = list_length.template end<int32_t>(); + + // One more offset than there are lists: offsets[0] is 0 and every later offset is the running total of the sizes before it. + auto offsets_column = make_numeric_column(data_type{type_id::INT32}, list_length.size() + 1, + mask_state::UNALLOCATED, stream); + auto offsets_view = offsets_column->mutable_view(); + auto d_offsets = offsets_view.template begin<int32_t>(); + + // The scan fills offsets[1..N]; offsets[0] is zeroed separately on the same stream. + thrust::inclusive_scan(rmm::exec_policy(stream), begin_iter, end_iter, d_offsets + 1); + CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(int32_t), stream)); + + return offsets_column; +} } // namespace cudf::jni diff --git a/java/src/main/native/src/ColumnViewJni.hpp b/java/src/main/native/src/ColumnViewJni.hpp index 37e58ecb63a..429f36bcb1d 100644 --- a/java/src/main/native/src/ColumnViewJni.hpp +++ b/java/src/main/native/src/ColumnViewJni.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include +#include <rmm/cuda_stream_view.hpp> namespace cudf::jni { @@ -35,4 +36,19 @@ std::unique_ptr new_column_with_boolean_column_as_validity(cudf::column_view const &exemplar, cudf::column_view const &bool_column); +/** + * @brief Generates list offsets with lengths of each list. 
+ * + * For example, + * Given a list column: [[1,2,3], [4,5], [6], [], [7,8]] + * The list lengths of it: [3, 2, 1, 0, 2] + * The list offsets of it: [0, 3, 5, 6, 6, 8] + * + * @param list_length The column that holds the length of each list. + * @return The column that holds the list offsets. + */ +std::unique_ptr<cudf::column> +generate_list_offsets(cudf::column_view const &list_length, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + } // namespace cudf::jni diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 9189cd27303..a42846aac05 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -6284,4 +6284,19 @@ void testSegmentedGather() { assertColumnsAreEqual(expected, actual); } } + + @Test + void testGenerateListOffsets() { + try (ColumnVector index = ColumnVector.fromInts(1, 3, 3, 0, 2, 0, 0, 5, 10, 25); + ColumnVector actual = index.generateListOffsets(); + ColumnVector expected = ColumnVector.fromInts(0, 1, 4, 7, 7, 9, 9, 9, 14, 24, 49)) { + assertColumnsAreEqual(expected, actual); + } + + try (ColumnVector index = ColumnVector.fromInts(0, 0, 1, 0, 0); + ColumnVector actual = index.generateListOffsets(); + ColumnVector expected = ColumnVector.fromInts(0, 0, 0, 1, 1, 1)) { + assertColumnsAreEqual(expected, actual); + } + } }