From eaba42e4d7631302c81c4caf2f3d29fb24f3c45d Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 25 Jan 2022 19:47:19 -0500 Subject: [PATCH 01/39] Add libcudf strings split API that accepts regex pattern --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 1 + cpp/include/cudf/strings/split/split_re.hpp | 82 ++++++++ cpp/src/strings/split/split_record_re.cu | 215 ++++++++++++++++++++ cpp/tests/strings/split_tests.cpp | 37 +++- 5 files changed, 335 insertions(+), 1 deletion(-) create mode 100644 cpp/include/cudf/strings/split/split_re.hpp create mode 100644 cpp/src/strings/split/split_record_re.cu diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 2cbe5173de0..01ad8d4e270 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -214,6 +214,7 @@ test: - test -f $PREFIX/include/cudf/strings/replace_re.hpp - test -f $PREFIX/include/cudf/strings/split/partition.hpp - test -f $PREFIX/include/cudf/strings/split/split.hpp + - test -f $PREFIX/include/cudf/strings/split/split_re.hpp - test -f $PREFIX/include/cudf/strings/string_view.hpp - test -f $PREFIX/include/cudf/strings/strings_column_view.hpp - test -f $PREFIX/include/cudf/strings/strip.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e4637408110..b25d6ff3703 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -444,6 +444,7 @@ add_library( src/strings/split/partition.cu src/strings/split/split.cu src/strings/split/split_record.cu + src/strings/split/split_record_re.cu src/strings/strings_column_factories.cu src/strings/strings_column_view.cpp src/strings/strings_scalar_factories.cpp diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp new file mode 100644 index 00000000000..b69bd1c5991 --- /dev/null +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cudf { +namespace strings { +/** + * @addtogroup strings_split + * @{ + * @file + */ + +/** + * @brief Splits individual strings elements into a list of strings + * using a regex pattern to delimit each string. + * + * Each element generates an array of strings that are stored in an output + * lists column. + * + * The number of elements in the output column will be the same as the number of + * elements in the input column. Each individual list item will contain the + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * The `pattern` is used to identify the separation points within a string + * and splitting stops when either `maxsplit` or the end of the string is reached. + * + * An empty input string will produce a corresponding empty list item output row. + * A null row will produce a corresponding null list item output row. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = split_record(s, "[_ ]") + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * s2 = split_record(s, "[ _]", 1) + * s2 is a lists column of strings: + * [ ["a", "bc_def_g"], + * ["a", "_bc"], + * ["", "ab_cd"], + * ["ab", "cd_"] ] + * @endcode + * + * @throw cudf:logic_error if `pattern` is empty. 
+ * + * @param strings A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return Lists column of strings + * Each vector of the lists column holds splits from a single row + * element of the input column. + */ +std::unique_ptr split_record_re( + strings_column_view const& strings, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/split/split_record_re.cu b/cpp/src/strings/split/split_record_re.cu new file mode 100644 index 00000000000..d197ee9c7e3 --- /dev/null +++ b/cpp/src/strings/split/split_record_re.cu @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace cudf { +namespace strings { +namespace detail { + +using string_index_pair = thrust::pair; + +namespace { + +/** + * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. + */ +template +struct token_counter_fn { + column_device_view const d_strings; // strings to split + reprog_device prog; + size_type const max_tokens; + + __device__ size_type operator()(size_type idx) + { + if (d_strings.is_null(idx)) { return 0; } + + auto const d_str = d_strings.element(idx); + size_type token_count = 0; + + int32_t begin = 0; + int32_t end = -1; + while (token_count < max_tokens - 1) { + if (prog.find(idx, d_str, begin, end) <= 0) { break; } + token_count++; + begin = end + (begin == end); + end = -1; + } + return token_count + 1; // always at least one token + } +}; + +/** + * @brief Identify the tokens from the `idx'th` string element of `d_strings`. 
+ */ +template +struct token_reader_fn { + column_device_view const d_strings; + reprog_device prog; + int32_t const* d_token_offsets; + string_index_pair* d_tokens; + + __device__ void operator()(size_type idx) + { + if (d_strings.is_null(idx)) { return; } + + auto const token_offset = d_token_offsets[idx]; + auto const token_count = d_token_offsets[idx + 1] - token_offset; + auto d_result = d_tokens + token_offset; + auto const d_str = d_strings.element(idx); + if (d_str.empty()) { + // return empty string output for empty string input + *d_result = string_index_pair{"", 0}; + return; + } + + size_type token_idx = 0; + size_type begin = 0; + size_type end = d_str.length(); + size_type last_pos = 0; + while (token_idx < token_count - 1) { + if (prog.find(idx, d_str, begin, end) <= 0) { break; } + + auto const start_pos = d_str.byte_offset(begin); + auto const end_pos = d_str.byte_offset(end); + d_result[token_idx] = string_index_pair{d_str.data() + last_pos, start_pos - last_pos}; + + begin = end + (begin == end); + end = d_str.length(); + token_idx++; + last_pos = end_pos; + } + + // set last token to remainder of the string + if (last_pos <= d_str.size_bytes()) { + d_result[token_idx] = + string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos}; + } + } +}; + +} // namespace + +// The output is one list item per string +std::unique_ptr split_record_re( + strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + + auto const max_tokens = maxsplit > 0 ? 
maxsplit + 1 : std::numeric_limits::max(); + auto const strings_count = input.size(); + + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_strings = column_device_view::create(input.parent(), stream); + + auto offsets = make_numeric_column( + data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + + auto const begin = thrust::make_counting_iterator(0); + auto const end = thrust::make_counting_iterator(strings_count); + + // create offsets column by counting the number of tokens per string + auto const regex_insts = d_prog->insts_counts(); + if (regex_insts <= RX_SMALL_INSTS) { + token_counter_fn counter{*d_strings, *d_prog, max_tokens}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + token_counter_fn counter{*d_strings, *d_prog, max_tokens}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter); + } else if (regex_insts <= RX_LARGE_INSTS) { + token_counter_fn counter{*d_strings, *d_prog, max_tokens}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter); + } else { + token_counter_fn counter{*d_strings, *d_prog, max_tokens}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter); + } + // convert counts into offsets + thrust::exclusive_scan( + rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + + // last entry is the total number of tokens to be generated + auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); + + printf("instruction = %d\ntotal_tokens = %d\nbegin,end = %d,%d\n", + regex_insts, + total_tokens, + *begin, + *end); + // split each string into an array of index-pair values + rmm::device_uvector tokens(total_tokens, stream); + if (regex_insts <= RX_SMALL_INSTS) { + token_reader_fn reader{*d_strings, *d_prog, 
d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + token_reader_fn reader{*d_strings, *d_prog, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } else if (regex_insts <= RX_LARGE_INSTS) { + token_reader_fn reader{*d_strings, *d_prog, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } else { + token_reader_fn reader{*d_strings, *d_prog, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } + + // convert the index-pairs into one big strings column + auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); + // create a lists column using the offsets and the strings columns + return make_lists_column(strings_count, + std::move(offsets), + std::move(strings_output), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} + +} // namespace detail + +// external APIs + +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::split_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index de4e48fd70a..7eddc947d40 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -339,6 +340,40 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } +TEST_F(StringsSplitTest, SplitRecordRegex) +{ + std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + auto result = cudf::strings::split_record_re(sv, "[eé]"); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{" H", "llo th", "s", ""}, LCW{}, LCW{"ar", " som", " "}, LCW{"t", "st String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); +} + +TEST_F(StringsSplitTest, SplitRecordRegexWithMaxSplit) +{ + std::vector h_strings{" Héllo\tthesé", nullptr, "are\nsome ", "tést\rString", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + auto result = cudf::strings::split_record_re(sv, "\\s", 1); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); +} + TEST_F(StringsSplitTest, RSplitRecord) { std::vector h_strings{ From a83243646975f0c398910bb0cb80affa40c214bf Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 26 Jan 2022 11:25:18 -0500 Subject: [PATCH 02/39] add error-checking gtests --- cpp/tests/strings/split_tests.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 
7eddc947d40..badb84536ba 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -654,6 +654,11 @@ TEST_F(StringsSplitTest, InvalidParameter) cudf::logic_error); EXPECT_THROW(cudf::strings::rsplit(strings_view, cudf::string_scalar("", false)), cudf::logic_error); + EXPECT_THROW(cudf::strings::split_record(strings_view, cudf::string_scalar("", false)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::rsplit_record(strings_view, cudf::string_scalar("", false)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::split_record_re(strings_view, ""), cudf::logic_error); EXPECT_THROW(cudf::strings::partition(strings_view, cudf::string_scalar("", false)), cudf::logic_error); EXPECT_THROW(cudf::strings::rpartition(strings_view, cudf::string_scalar("", false)), From baccf10d8b7c447b6ff9b77451df6ce9afc2fe65 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 26 Jan 2022 13:29:17 -0700 Subject: [PATCH 03/39] Add JNI --- java/src/main/native/src/ColumnViewJni.cpp | 33 ++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 38c6bb3740e..ee00d8707f8 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -576,6 +576,23 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv * CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRegex(JNIEnv *env, jclass, + jlong column_view, + jlong delimiter) { + JNI_NULL_CHECK(env, column_view, "column is null", 0); + JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::column_view *cv = reinterpret_cast(column_view); + cudf::strings_column_view scv(*cv); + cudf::string_scalar *ss_scalar = reinterpret_cast(delimiter); + + std::unique_ptr table_result = cudf::strings::split_re(scv, *ss_scalar); + return 
cudf::jni::convert_table_for_return(env, table_result); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass, jlong column_view, jlong delimiter, @@ -592,6 +609,22 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecordRegex(JNIEnv *env, jclass, + jlong column_view, + jlong delimiter, + jint max_split) { + JNI_NULL_CHECK(env, column_view, "column is null", 0); + JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::column_view *cv = reinterpret_cast(column_view); + cudf::strings_column_view scv(*cv); + cudf::string_scalar *ss_scalar = reinterpret_cast(delimiter); + return release_as_jlong(cudf::strings::split_record_re(scv, *ss_scalar, max_split)); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_split(JNIEnv *env, jclass clazz, jlong input_column, jintArray split_indices) { From d33f79bb9c89d014a4e6a374067e8a88c366aafa Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 27 Jan 2022 10:49:01 -0500 Subject: [PATCH 04/39] use count_matches utility --- cpp/src/strings/split/split_record_re.cu | 63 ++++++++---------------- 1 file changed, 21 insertions(+), 42 deletions(-) diff --git a/cpp/src/strings/split/split_record_re.cu b/cpp/src/strings/split/split_record_re.cu index d197ee9c7e3..eff0c511393 100644 --- a/cpp/src/strings/split/split_record_re.cu +++ b/cpp/src/strings/split/split_record_re.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -42,30 +43,23 @@ using string_index_pair = thrust::pair; namespace { /** - * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. + * @brief Convert match counts to token counts. 
+ * + * The matches are the delimiters and the tokens are what is left: + * `token1, delimiter, token2, delimiter, token3, etc` + * Usually `token_count = match_count + 1` even with empty strings. + * However, we need to account for the max_tokens and null rows. */ -template -struct token_counter_fn { - column_device_view const d_strings; // strings to split - reprog_device prog; +struct match_to_token_count_fn { + column_device_view const d_strings; + size_type const* d_counts; size_type const max_tokens; __device__ size_type operator()(size_type idx) { if (d_strings.is_null(idx)) { return 0; } - - auto const d_str = d_strings.element(idx); - size_type token_count = 0; - - int32_t begin = 0; - int32_t end = -1; - while (token_count < max_tokens - 1) { - if (prog.find(idx, d_str, begin, end) <= 0) { break; } - token_count++; - begin = end + (begin == end); - end = -1; - } - return token_count + 1; // always at least one token + auto const match_count = d_counts[idx]; + return std::min(match_count, max_tokens) + 1; } }; @@ -130,34 +124,23 @@ std::unique_ptr split_record_re( { CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); - auto const max_tokens = maxsplit > 0 ? maxsplit + 1 : std::numeric_limits::max(); + auto const max_tokens = maxsplit > 0 ? 
maxsplit : std::numeric_limits::max(); auto const strings_count = input.size(); auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); auto d_strings = column_device_view::create(input.parent(), stream); - auto offsets = make_numeric_column( - data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + auto offsets = count_matches(*d_strings, *d_prog, stream, mr); auto d_offsets = offsets->mutable_view().data(); auto const begin = thrust::make_counting_iterator(0); auto const end = thrust::make_counting_iterator(strings_count); - - // create offsets column by counting the number of tokens per string - auto const regex_insts = d_prog->insts_counts(); - if (regex_insts <= RX_SMALL_INSTS) { - token_counter_fn counter{*d_strings, *d_prog, max_tokens}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - token_counter_fn counter{*d_strings, *d_prog, max_tokens}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter); - } else if (regex_insts <= RX_LARGE_INSTS) { - token_counter_fn counter{*d_strings, *d_prog, max_tokens}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter); - } else { - token_counter_fn counter{*d_strings, *d_prog, max_tokens}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter); - } + // convert match counts to tokens + thrust::transform(rmm::exec_policy(stream), + begin, + end, + d_offsets, + match_to_token_count_fn{*d_strings, d_offsets, max_tokens}); // convert counts into offsets thrust::exclusive_scan( rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); @@ -165,13 +148,9 @@ std::unique_ptr split_record_re( // last entry is the total number of tokens to be generated auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); - printf("instruction = %d\ntotal_tokens = %d\nbegin,end 
= %d,%d\n", - regex_insts, - total_tokens, - *begin, - *end); // split each string into an array of index-pair values rmm::device_uvector tokens(total_tokens, stream); + auto const regex_insts = d_prog->insts_counts(); if (regex_insts <= RX_SMALL_INSTS) { token_reader_fn reader{*d_strings, *d_prog, d_offsets, tokens.data()}; thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); From 9c74fdffbc2c3ddc7e4a248a4c837996c8c25bf4 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 27 Jan 2022 10:49:16 -0500 Subject: [PATCH 05/39] add split_re declaration --- cpp/include/cudf/strings/split/split_re.hpp | 62 +++++++++++++++++++-- 1 file changed, 58 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index b69bd1c5991..54d590fcf71 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -26,6 +27,59 @@ namespace strings { * @file */ +/** + * @brief Splits individual strings elements into a table of strings columns + * using a regex pattern to delimit each string. + * + * Each element generates an array of strings that are stored in corresponding + * rows in the output table. + * + * The number of elements in the output table will be the same as the number of + * elements in the input column. The row for each column will contain the + * new strings produced from that input row. + * + * The resulting number of columns will be the maximum number of tokens found + * in any input row. 
+ * + * The `pattern` is used to identify the separation points within a string + * and splitting stops when either `maxsplit` or the end of the string is reached. + * + * An empty input string will produce a corresponding empty string in the + * corresponding row in the first column. + * + * A null row will produce a corresponding null rows in the output table. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = split_re(s, "[_ ]") + * s1 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc", "", "ab", "cd"], + * ["def", "bc", "cd", ""], + * ["g", null, null, null] ] + * s2 = split_re(s, "[ _]", 1) + * s2 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc def_g", "_bc", "ab_cd", "cd "] ] + * @endcode + * + * @throw cudf:logic_error if `pattern` is empty. + * + * @param strings A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return Lists column of strings + * Each vector of the lists column holds splits from a single row + * element of the input column. + */ +std::unique_ptr split_re( + strings_column_view const& strings, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Splits individual strings elements into a list of strings * using a regex pattern to delimit each string. @@ -54,10 +108,10 @@ namespace strings { * ["ab", "cd", ""] ] * s2 = split_record(s, "[ _]", 1) * s2 is a lists column of strings: - * [ ["a", "bc_def_g"], + * [ ["a", "bc def_g"], * ["a", "_bc"], - * ["", "ab_cd"], - * ["ab", "cd_"] ] + * ["", "ab cd"], + * ["ab", "cd "] ] * @endcode * * @throw cudf:logic_error if `pattern` is empty. 
From 1a89db5f53ed21952183d8ab9f2d4e6e800b1175 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 27 Jan 2022 18:01:11 -0500 Subject: [PATCH 06/39] split_re implementation and tests --- cpp/src/strings/split/split_record_re.cu | 193 ++++++++++++++++++----- cpp/tests/strings/split_tests.cpp | 110 +++++++++---- 2 files changed, 235 insertions(+), 68 deletions(-) diff --git a/cpp/src/strings/split/split_record_re.cu b/cpp/src/strings/split/split_record_re.cu index eff0c511393..3f916d0138f 100644 --- a/cpp/src/strings/split/split_record_re.cu +++ b/cpp/src/strings/split/split_record_re.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -70,7 +71,7 @@ template struct token_reader_fn { column_device_view const d_strings; reprog_device prog; - int32_t const* d_token_offsets; + offset_type const* d_token_offsets; string_index_pair* d_tokens; __device__ void operator()(size_type idx) @@ -81,11 +82,6 @@ struct token_reader_fn { auto const token_count = d_token_offsets[idx + 1] - token_offset; auto d_result = d_tokens + token_offset; auto const d_str = d_strings.element(idx); - if (d_str.empty()) { - // return empty string output for empty string input - *d_result = string_index_pair{"", 0}; - return; - } size_type token_idx = 0; size_type begin = 0; @@ -112,61 +108,105 @@ struct token_reader_fn { } }; -} // namespace - -// The output is one list item per string -std::unique_ptr split_record_re( - strings_column_view const& input, - std::string const& pattern, - size_type maxsplit, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); - - auto const max_tokens = maxsplit > 0 ? 
maxsplit : std::numeric_limits::max(); - auto const strings_count = input.size(); +struct tokens_transform_fn { + column_device_view const d_strings; + string_index_pair const* d_tokens; + offset_type const* d_token_offsets; + size_type const column_index; - auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); - auto d_strings = column_device_view::create(input.parent(), stream); + __device__ string_index_pair operator()(size_type idx) const + { + auto const offset = d_token_offsets[idx]; + auto const token_count = d_token_offsets[idx + 1] - offset; + if (d_strings.is_null(idx)) { return string_index_pair{nullptr, 0}; } + if (column_index > token_count - 1) { return string_index_pair{nullptr, 0}; } + return d_tokens[offset + column_index]; + } +}; - auto offsets = count_matches(*d_strings, *d_prog, stream, mr); - auto d_offsets = offsets->mutable_view().data(); +/** + * @brief Call regex to split each input string into tokens. + * + * This will also convert the `offsets` values from counts to offsets. + * + * @param d_strings Strings to split + * @param d_prog Regex to evaluate against each string + * @param max_tokens The maximum number of tokens for each split. + * @param offsets The number of matches on input. + * The offsets for each token in each string on output. + * @param stream CUDA stream used for kernel launches. 
+ */ +rmm::device_uvector split_utility(column_device_view const& d_strings, + reprog_device& d_prog, + size_type max_tokens, + mutable_column_view& offsets, + rmm::cuda_stream_view stream) +{ + auto d_offsets = offsets.data(); + auto const strings_count = d_strings.size(); auto const begin = thrust::make_counting_iterator(0); auto const end = thrust::make_counting_iterator(strings_count); + // convert match counts to tokens - thrust::transform(rmm::exec_policy(stream), - begin, - end, - d_offsets, - match_to_token_count_fn{*d_strings, d_offsets, max_tokens}); + match_to_token_count_fn match_fn{d_strings, d_offsets, max_tokens}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, match_fn); + // convert counts into offsets - thrust::exclusive_scan( - rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + thrust::exclusive_scan(rmm::exec_policy(stream), + offsets.begin(), + offsets.end(), + offsets.begin()); - // last entry is the total number of tokens to be generated - auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); + // the last entry is the total number of tokens to be generated + auto const total_tokens = cudf::detail::get_value(offsets, strings_count, stream); - // split each string into an array of index-pair values + // generate tokens for each string rmm::device_uvector tokens(total_tokens, stream); - auto const regex_insts = d_prog->insts_counts(); + auto const regex_insts = d_prog.insts_counts(); if (regex_insts <= RX_SMALL_INSTS) { - token_reader_fn reader{*d_strings, *d_prog, d_offsets, tokens.data()}; + token_reader_fn reader{d_strings, d_prog, d_offsets, tokens.data()}; thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); } else if (regex_insts <= RX_MEDIUM_INSTS) { - token_reader_fn reader{*d_strings, *d_prog, d_offsets, tokens.data()}; + token_reader_fn reader{d_strings, d_prog, d_offsets, tokens.data()}; 
thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); } else if (regex_insts <= RX_LARGE_INSTS) { - token_reader_fn reader{*d_strings, *d_prog, d_offsets, tokens.data()}; + token_reader_fn reader{d_strings, d_prog, d_offsets, tokens.data()}; thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); } else { - token_reader_fn reader{*d_strings, *d_prog, d_offsets, tokens.data()}; + token_reader_fn reader{d_strings, d_prog, d_offsets, tokens.data()}; thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); } - // convert the index-pairs into one big strings column + return tokens; +} + +} // namespace + +// The output is one list item per string +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + + auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); + auto const strings_count = input.size(); + + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_strings = column_device_view::create(input.parent(), stream); + + auto offsets = count_matches(*d_strings, *d_prog, stream, mr); + auto offsets_view = offsets->mutable_view(); + + // get split tokens from the input column + auto tokens = split_utility(*d_strings, *d_prog, max_tokens, offsets_view, stream); + + // convert the tokens into one big strings column auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); + // create a lists column using the offsets and the strings columns return make_lists_column(strings_count, std::move(offsets), @@ -177,10 +217,83 @@ std::unique_ptr split_record_re( mr); } +std::unique_ptr
split_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + + auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); + auto const strings_count = input.size(); + + std::vector> results; + if (strings_count == 0) { + results.push_back(make_empty_column(type_id::STRING)); + return std::make_unique
(std::move(results)); + } + + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_strings = column_device_view::create(input.parent(), stream); + + auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource()); + auto offsets_view = offsets->mutable_view(); + + // get split tokens from the input column + auto tokens = split_utility(*d_strings, *d_prog, max_tokens, offsets_view, stream); + + // the columns_count is the maximum number of tokens for any string in the input column + auto const begin = thrust::make_counting_iterator(0); + auto const end = thrust::make_counting_iterator(strings_count); + auto d_offsets = offsets_view.data(); + auto size_lambda = [d_offsets] __device__(auto const idx) -> size_type { + return d_offsets[idx + 1] - d_offsets[idx]; + }; + auto const columns_count = thrust::transform_reduce( + rmm::exec_policy(stream), begin, end, size_lambda, 0, thrust::maximum{}); + + // boundary case: if no columns, return one all-null column (custrings issue #119) + if (columns_count == 0) { + results.push_back(std::make_unique( + data_type{type_id::STRING}, + strings_count, + rmm::device_buffer{0, stream, mr}, // no data + cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), + strings_count)); + return std::make_unique
(std::move(results)); + } + + // convert the tokens into multiple strings columns + auto make_strings_lambda = [&](size_type column_index) { + // returns appropriate token for each row/column + auto indices_itr = cudf::detail::make_counting_transform_iterator( + 0, tokens_transform_fn{*d_strings, tokens.data(), d_offsets, column_index}); + return make_strings_column(indices_itr, indices_itr + strings_count, stream, mr); + }; + // create each column of tokens + results.resize(columns_count); + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(columns_count), + results.begin(), + make_strings_lambda); + + return std::make_unique
(std::move(results)); +} + } // namespace detail // external APIs +std::unique_ptr
split_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::split_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} + std::unique_ptr split_record_re(strings_column_view const& input, std::string const& pattern, size_type maxsplit, diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index badb84536ba..f541a6b0e81 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -247,33 +247,13 @@ TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); auto results = cudf::strings::split(zero_size_strings_column); EXPECT_TRUE(results->num_columns() == 1); - cudf::test::expect_strings_empty(results->get_column(0)); + EXPECT_TRUE(results->num_rows() == 0); results = cudf::strings::rsplit(zero_size_strings_column); EXPECT_TRUE(results->num_columns() == 1); - cudf::test::expect_strings_empty(results->get_column(0)); -} - -// This test specifically for https://github.com/rapidsai/custrings/issues/119 -TEST_F(StringsSplitTest, AllNullsCase) -{ - std::vector h_strings{nullptr, nullptr, nullptr}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - - auto results = cudf::strings::split(cudf::strings_column_view(strings)); - EXPECT_TRUE(results->num_columns() == 1); - auto column = results->get_column(0).view(); - EXPECT_TRUE(column.size() == 3); - EXPECT_TRUE(column.has_nulls()); - EXPECT_TRUE(column.null_count() == column.size()); - results = cudf::strings::split(cudf::strings_column_view(strings), cudf::string_scalar("-")); + EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::split_re(zero_size_strings_column, "\\s"); EXPECT_TRUE(results->num_columns() == 1); - column = 
results->get_column(0); - EXPECT_TRUE(column.size() == 3); - EXPECT_TRUE(column.has_nulls()); - EXPECT_TRUE(column.null_count() == column.size()); + EXPECT_TRUE(results->num_rows() == 0); } TEST_F(StringsSplitTest, SplitRecord) @@ -340,6 +320,54 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } +TEST_F(StringsSplitTest, SplitRegex) +{ + std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + { + auto result = cudf::strings::split_re(sv, "\\s+"); + + cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1}); + cudf::test::strings_column_wrapper col1({"Héllo", "", "some", "String", ""}, {1, 0, 1, 1, 0}); + cudf::test::strings_column_wrapper col2({"thesé", "", "", "", ""}, {1, 0, 1, 0, 0}); + auto expected = cudf::table_view({col0, col1, col2}); + CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected); + } + + { + auto result = cudf::strings::split_re(sv, "[eé]"); + + cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, {1, 0, 1, 1, 1}); + cudf::test::strings_column_wrapper col1({"llo th", "", " som", "st String", ""}, + {1, 0, 1, 1, 0}); + cudf::test::strings_column_wrapper col2({"s", "", " ", "", ""}, {1, 0, 1, 0, 0}); + cudf::test::strings_column_wrapper col3({"", "", "", "", ""}, {1, 0, 0, 0, 0}); + auto expected = cudf::table_view({col0, col1, col2, col3}); + CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected); + } +} + +TEST_F(StringsSplitTest, SplitRegexWithMaxSplit) +{ + std::vector h_strings{" Héllo\tthesé", nullptr, "are\nsome ", "tést\rString", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + 
cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + auto result = cudf::strings::split_re(sv, "\\s+", 1); + + cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1}); + cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some ", "String", ""}, + {1, 0, 1, 1, 0}); + auto expected = cudf::table_view({col0, col1}); + CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected); +} + TEST_F(StringsSplitTest, SplitRecordRegex) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; @@ -469,10 +497,35 @@ TEST_F(StringsSplitTest, SplitRecordZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto split_record_result = cudf::strings::split_record(zero_size_strings_column); - EXPECT_TRUE(split_record_result->size() == 0); - auto rsplit_record_result = cudf::strings::rsplit_record(zero_size_strings_column); - EXPECT_TRUE(rsplit_record_result->size() == 0); + auto result = cudf::strings::split_record(zero_size_strings_column); + EXPECT_TRUE(result->size() == 0); + result = cudf::strings::rsplit_record(zero_size_strings_column); + EXPECT_TRUE(result->size() == 0); + result = cudf::strings::split_record_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(result->size() == 0); +} + +// This test specifically for https://github.com/rapidsai/custrings/issues/119 +TEST_F(StringsSplitTest, AllNullsCase) +{ + cudf::test::strings_column_wrapper input({"", "", ""}, {0, 0, 0}); + auto sv = cudf::strings_column_view(input); + + auto results = cudf::strings::split(sv); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::split(sv, cudf::string_scalar("-")); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), 
input); + results = cudf::strings::rsplit(sv); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::rsplit(sv, cudf::string_scalar("-")); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::split_re(sv, "-"); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); } TEST_F(StringsSplitTest, Partition) @@ -658,6 +711,7 @@ TEST_F(StringsSplitTest, InvalidParameter) cudf::logic_error); EXPECT_THROW(cudf::strings::rsplit_record(strings_view, cudf::string_scalar("", false)), cudf::logic_error); + EXPECT_THROW(cudf::strings::split_re(strings_view, ""), cudf::logic_error); EXPECT_THROW(cudf::strings::split_record_re(strings_view, ""), cudf::logic_error); EXPECT_THROW(cudf::strings::partition(strings_view, cudf::string_scalar("", false)), cudf::logic_error); From 8599d0cba24ef963c28361455769969a3764a430 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 28 Jan 2022 08:16:31 -0500 Subject: [PATCH 07/39] rename split_record_re.cu to split_re.cu --- cpp/CMakeLists.txt | 2 +- cpp/src/strings/split/{split_record_re.cu => split_re.cu} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cpp/src/strings/split/{split_record_re.cu => split_re.cu} (100%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f96ef4945b9..407e1f9a858 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -445,8 +445,8 @@ add_library( src/strings/search/find_multiple.cu src/strings/split/partition.cu src/strings/split/split.cu + src/strings/split/split_re.cu src/strings/split/split_record.cu - src/strings/split/split_record_re.cu src/strings/strings_column_factories.cu src/strings/strings_column_view.cpp src/strings/strings_scalar_factories.cpp diff --git a/cpp/src/strings/split/split_record_re.cu b/cpp/src/strings/split/split_re.cu 
similarity index 100% rename from cpp/src/strings/split/split_record_re.cu rename to cpp/src/strings/split/split_re.cu From b6d7453b66c3548e4e47499de66e7eae0fa0b2fb Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 31 Jan 2022 14:05:56 -0500 Subject: [PATCH 08/39] refactored split_re/rsplit_re functions --- cpp/include/cudf/strings/split/split_re.hpp | 130 +++++++-- cpp/src/strings/split/split_re.cu | 281 ++++++++++++-------- cpp/tests/strings/split_tests.cpp | 162 +++++++---- 3 files changed, 394 insertions(+), 179 deletions(-) diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index 54d590fcf71..cf6d23ccd28 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -32,22 +32,19 @@ namespace strings { * using a regex pattern to delimit each string. * * Each element generates an array of strings that are stored in corresponding - * rows in the output table. + * rows in the output table -- `table[col,row] = token[col] of string[row]` + * where `token` is the substring between each delimiter. * * The number of elements in the output table will be the same as the number of - * elements in the input column. The row for each column will contain the - * new strings produced from that input row. - * - * The resulting number of columns will be the maximum number of tokens found - * in any input row. + * elements in the input column. The resulting number of columns will be the + * maximum number of tokens found in any input row. * * The `pattern` is used to identify the separation points within a string * and splitting stops when either `maxsplit` or the end of the string is reached. * * An empty input string will produce a corresponding empty string in the - * corresponding row in the first column. - * - * A null row will produce a corresponding null rows in the output table. + * corresponding row of the first column. 
+ * A null row will produce corresponding null rows in the output table. * * @code{.pseudo} * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] @@ -60,7 +57,7 @@ namespace strings { * s2 = split_re(s, "[ _]", 1) * s2 is a table of strings columns: * [ ["a", "a", "", "ab"], - * ["bc def_g", "_bc", "ab_cd", "cd "] ] + * ["bc def_g", "_bc", "ab cd", "cd "] ] * @endcode * * @throw cudf:logic_error if `pattern` is empty. @@ -70,9 +67,7 @@ namespace strings { * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. * @param mr Device memory resource used to allocate the returned result's device memory. - * @return Lists column of strings - * Each vector of the lists column holds splits from a single row - * element of the input column. + * @return A table of columns of strings. */ std::unique_ptr
split_re( strings_column_view const& strings, @@ -81,9 +76,59 @@ std::unique_ptr
split_re( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Splits individual strings elements into a list of strings + * @brief Splits individual strings elements into a table of strings columns * using a regex pattern to delimit each string. * + * Each element generates an array of strings that are stored in corresponding + * rows in the output table -- `table[col,row] = token[col] of string[row]` + * where `token` is the substring between each delimiter. + * + * The number of elements in the output table will be the same as the number of + * elements in the input column. The resulting number of columns will be the + * maximum number of tokens found in any input row. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the separation points within the string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty string in the + * corresponding row of the first column. + * A null row will produce corresponding null rows in the output table. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = rsplit_re(s, "[_ ]") + * s1 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc", "", "ab", "cd"], + * ["def", "bc", "cd", ""], + * ["g", null, null, null] ] + * s2 = rsplit_re(s, "[ _]", 1) + * s2 is a table of strings columns: + * [ ["a_bc def", "a_", "_ab", "ab_cd"], + * ["g", "bc", "cd", ""] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param strings A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. 
+ * @return A table of columns of strings. + */ +std::unique_ptr
rsplit_re( + strings_column_view const& strings, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Splits individual strings elements into a list of strings + * using the given regex pattern to delimit each string. + * * Each element generates an array of strings that are stored in an output * lists column. * @@ -96,7 +141,7 @@ std::unique_ptr
split_re( * and splitting stops when either `maxsplit` or the end of the string is reached. * * An empty input string will produce a corresponding empty list item output row. - * A null row will produce a corresponding null list item output row. + * A null row will produce a corresponding null output row. * * @code{.pseudo} * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] @@ -121,9 +166,7 @@ std::unique_ptr
split_re( * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. * @param mr Device memory resource used to allocate the returned result's device memory. - * @return Lists column of strings - * Each vector of the lists column holds splits from a single row - * element of the input column. + * @return Lists column of strings. */ std::unique_ptr split_record_re( strings_column_view const& strings, @@ -131,6 +174,57 @@ std::unique_ptr split_record_re( size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Splits individual strings elements into a list of strings + * using the given regex pattern to delimit each string. + * + * Each element generates an array of strings that are stored in an output + * lists column. + * + * The number of elements in the output column will be the same as the number of + * elements in the input column. Each individual list item will contain the + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the separation points within a string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty list item output row. + * A null row will produce a corresponding null output row. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = rsplit_record_re(s, "[_ ]") + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * s2 = rsplit_record_re(s, "[ _]", 1) + * s2 is a lists column of strings: + * [ ["a_bc def", "g"], + * ["a_", "bc"], + * ["_ab", "cd"], + * ["ab_cd", ""] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. 
+ * + * @param strings A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return Lists column of strings. + */ +std::unique_ptr rsplit_record_re( + strings_column_view const& strings, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 3f916d0138f..9427a900d8d 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -32,36 +32,18 @@ #include #include -#include -#include +#include namespace cudf { namespace strings { namespace detail { - -using string_index_pair = thrust::pair; - namespace { -/** - * @brief Convert match counts to token counts. - * - * The matches are the delimiters and the tokens are what is left: - * `token1, delimiter, token2, delimiter, token3, etc` - * Usually `token_count = match_count + 1` even with empty strings. - * However, we need to account for the max_tokens and null rows. 
- */ -struct match_to_token_count_fn { - column_device_view const d_strings; - size_type const* d_counts; - size_type const max_tokens; +using string_index_pair = thrust::pair; - __device__ size_type operator()(size_type idx) - { - if (d_strings.is_null(idx)) { return 0; } - auto const match_count = d_counts[idx]; - return std::min(match_count, max_tokens) + 1; - } +enum class split_direction { + FORWARD, ///< for split logic + BACKWARD ///< for rsplit logic }; /** @@ -71,56 +53,58 @@ template struct token_reader_fn { column_device_view const d_strings; reprog_device prog; + split_direction const direction; offset_type const* d_token_offsets; string_index_pair* d_tokens; __device__ void operator()(size_type idx) { if (d_strings.is_null(idx)) { return; } + auto const d_str = d_strings.element(idx); auto const token_offset = d_token_offsets[idx]; auto const token_count = d_token_offsets[idx + 1] - token_offset; - auto d_result = d_tokens + token_offset; - auto const d_str = d_strings.element(idx); + auto d_result = d_tokens + token_offset; // store tokens here size_type token_idx = 0; - size_type begin = 0; + size_type begin = 0; // characters size_type end = d_str.length(); - size_type last_pos = 0; - while (token_idx < token_count - 1) { - if (prog.find(idx, d_str, begin, end) <= 0) { break; } - - auto const start_pos = d_str.byte_offset(begin); - auto const end_pos = d_str.byte_offset(end); - d_result[token_idx] = string_index_pair{d_str.data() + last_pos, start_pos - last_pos}; - - begin = end + (begin == end); - end = d_str.length(); - token_idx++; - last_pos = end_pos; + size_type last_pos = 0; // bytes + while (prog.find(idx, d_str, begin, end) > 0) { + // get the token (characters just before this match) + auto token = string_index_pair{d_str.data() + last_pos, d_str.byte_offset(begin) - last_pos}; + // store it if we have space + if (token_idx < token_count - 1) { + d_result[token_idx++] = token; + } else { + if (direction == split_direction::FORWARD) { 
break; } // we are done + for (auto l = 0; l < token_idx - 1; ++l) { + d_result[l] = d_result[l + 1]; // shift left + } + d_result[token_idx - 1] = token; + } + // setup for next match + last_pos = d_str.byte_offset(end); + begin = end + (begin == end); + end = d_str.length(); } - // set last token to remainder of the string + // set the last token to the remainder of the string if (last_pos <= d_str.size_bytes()) { d_result[token_idx] = string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos}; } - } -}; - -struct tokens_transform_fn { - column_device_view const d_strings; - string_index_pair const* d_tokens; - offset_type const* d_token_offsets; - size_type const column_index; - __device__ string_index_pair operator()(size_type idx) const - { - auto const offset = d_token_offsets[idx]; - auto const token_count = d_token_offsets[idx + 1] - offset; - if (d_strings.is_null(idx)) { return string_index_pair{nullptr, 0}; } - if (column_index > token_count - 1) { return string_index_pair{nullptr, 0}; } - return d_tokens[offset + column_index]; + if (direction == split_direction::BACKWARD) { + // update first entry -- this happens when max-tokens is hit before the end + auto const first_offset = + d_result[0].first + ? 
static_cast(thrust::distance(d_str.data(), d_result[0].first)) + : 0; + if (first_offset) { + d_result[0] = string_index_pair{d_str.data(), first_offset + d_result[0].second}; + } + } } }; @@ -138,6 +122,7 @@ struct tokens_transform_fn { */ rmm::device_uvector split_utility(column_device_view const& d_strings, reprog_device& d_prog, + split_direction direction, size_type max_tokens, mutable_column_view& offsets, rmm::cuda_stream_view stream) @@ -148,15 +133,12 @@ rmm::device_uvector split_utility(column_device_view const& d auto const begin = thrust::make_counting_iterator(0); auto const end = thrust::make_counting_iterator(strings_count); - // convert match counts to tokens - match_to_token_count_fn match_fn{d_strings, d_offsets, max_tokens}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, match_fn); - - // convert counts into offsets - thrust::exclusive_scan(rmm::exec_policy(stream), - offsets.begin(), - offsets.end(), - offsets.begin()); + // convert match counts to token offsets + auto map_fn = [d_strings, d_offsets, max_tokens] __device__(auto idx) { + return d_strings.is_null(idx) ? 
0 : std::min(d_offsets[idx], max_tokens) + 1; + }; + thrust::transform_exclusive_scan( + rmm::exec_policy(stream), begin, end + 1, d_offsets, map_fn, 0, thrust::plus{}); // the last entry is the total number of tokens to be generated auto const total_tokens = cudf::detail::get_value(offsets, strings_count, stream); @@ -165,60 +147,48 @@ rmm::device_uvector split_utility(column_device_view const& d rmm::device_uvector tokens(total_tokens, stream); auto const regex_insts = d_prog.insts_counts(); if (regex_insts <= RX_SMALL_INSTS) { - token_reader_fn reader{d_strings, d_prog, d_offsets, tokens.data()}; + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); } else if (regex_insts <= RX_MEDIUM_INSTS) { - token_reader_fn reader{d_strings, d_prog, d_offsets, tokens.data()}; + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); } else if (regex_insts <= RX_LARGE_INSTS) { - token_reader_fn reader{d_strings, d_prog, d_offsets, tokens.data()}; + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); } else { - token_reader_fn reader{d_strings, d_prog, d_offsets, tokens.data()}; + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); } return tokens; } -} // namespace - -// The output is one list item per string -std::unique_ptr split_record_re(strings_column_view const& input, - std::string const& pattern, - size_type maxsplit, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); - - auto const max_tokens = maxsplit > 0 ? 
maxsplit : std::numeric_limits::max(); - auto const strings_count = input.size(); - - auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); - auto d_strings = column_device_view::create(input.parent(), stream); - - auto offsets = count_matches(*d_strings, *d_prog, stream, mr); - auto offsets_view = offsets->mutable_view(); - - // get split tokens from the input column - auto tokens = split_utility(*d_strings, *d_prog, max_tokens, offsets_view, stream); - - // convert the tokens into one big strings column - auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); +/** + * @brief Returns string pair for the specified column for each string in `d_strings` + * + * This is used to build the table result of a split. + * Null is returned if the row is null or if the `column_index` is not less + * than the token count for that string. + */ +struct tokens_transform_fn { + column_device_view const d_strings; + string_index_pair const* d_tokens; + offset_type const* d_token_offsets; + size_type const column_index; - // create a lists column using the offsets and the strings columns - return make_lists_column(strings_count, - std::move(offsets), - std::move(strings_output), - input.null_count(), - copy_bitmask(input.parent(), stream, mr), - stream, - mr); -} + __device__ string_index_pair operator()(size_type idx) const + { + auto const offset = d_token_offsets[idx]; + auto const token_count = d_token_offsets[idx + 1] - offset; + return (column_index > token_count - 1) || d_strings.is_null(idx) + ? string_index_pair{nullptr, 0} + : d_tokens[offset + column_index]; + } +}; std::unique_ptr
split_re(strings_column_view const& input, std::string const& pattern, + split_direction direction, size_type maxsplit, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -239,19 +209,21 @@ std::unique_ptr
split_re(strings_column_view const& input, auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource()); auto offsets_view = offsets->mutable_view(); + auto d_offsets = offsets_view.data(); // get split tokens from the input column - auto tokens = split_utility(*d_strings, *d_prog, max_tokens, offsets_view, stream); + auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); // the columns_count is the maximum number of tokens for any string in the input column - auto const begin = thrust::make_counting_iterator(0); - auto const end = thrust::make_counting_iterator(strings_count); - auto d_offsets = offsets_view.data(); - auto size_lambda = [d_offsets] __device__(auto const idx) -> size_type { - return d_offsets[idx + 1] - d_offsets[idx]; - }; auto const columns_count = thrust::transform_reduce( - rmm::exec_policy(stream), begin, end, size_lambda, 0, thrust::maximum{}); + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + [d_offsets] __device__(auto const idx) -> size_type { + return d_offsets[idx + 1] - d_offsets[idx]; + }, + 0, + thrust::maximum{}); // boundary case: if no columns, return one all-null column (custrings issue #119) if (columns_count == 0) { @@ -271,7 +243,7 @@ std::unique_ptr
split_re(strings_column_view const& input, 0, tokens_transform_fn{*d_strings, tokens.data(), d_offsets, column_index}); return make_strings_column(indices_itr, indices_itr + strings_count, stream, mr); }; - // create each column of tokens + // build a vector of columns results.resize(columns_count); std::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(columns_count), @@ -281,6 +253,78 @@ std::unique_ptr
split_re(strings_column_view const& input, return std::make_unique
(std::move(results)); } +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + split_direction direction, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + + auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); + auto const strings_count = input.size(); + + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_strings = column_device_view::create(input.parent(), stream); + + auto offsets = count_matches(*d_strings, *d_prog, stream, mr); + auto offsets_view = offsets->mutable_view(); + + // get split tokens from the input column + auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); + + // convert the tokens into one big strings column + auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); + + // create a lists column using the offsets and the strings columns + return make_lists_column(strings_count, + std::move(offsets), + std::move(strings_output), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} + +} // namespace + +std::unique_ptr
split_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr); +} + +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_record_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr); +} + +std::unique_ptr
rsplit_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr); +} + +std::unique_ptr rsplit_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_record_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr); +} + } // namespace detail // external APIs @@ -303,5 +347,22 @@ std::unique_ptr split_record_re(strings_column_view const& input, return detail::split_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); } +std::unique_ptr
rsplit_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::rsplit_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} + +std::unique_ptr rsplit_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::rsplit_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index f541a6b0e81..d0b695bbc93 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -25,8 +25,8 @@ #include #include #include +#include #include -#include #include @@ -241,21 +241,6 @@ TEST_F(StringsSplitTest, RSplitWhitespaceWithMax) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, *expected); } -TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) -{ - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto results = cudf::strings::split(zero_size_strings_column); - EXPECT_TRUE(results->num_columns() == 1); - EXPECT_TRUE(results->num_rows() == 0); - results = cudf::strings::rsplit(zero_size_strings_column); - EXPECT_TRUE(results->num_columns() == 1); - EXPECT_TRUE(results->num_rows() == 0); - results = cudf::strings::split_re(zero_size_strings_column, "\\s"); - EXPECT_TRUE(results->num_columns() == 1); - EXPECT_TRUE(results->num_rows() == 0); -} - TEST_F(StringsSplitTest, SplitRecord) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; @@ -331,41 +316,30 @@ TEST_F(StringsSplitTest, SplitRegex) { auto result = cudf::strings::split_re(sv, "\\s+"); - cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1}); + cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, 
validity); cudf::test::strings_column_wrapper col1({"Héllo", "", "some", "String", ""}, {1, 0, 1, 1, 0}); cudf::test::strings_column_wrapper col2({"thesé", "", "", "", ""}, {1, 0, 1, 0, 0}); auto expected = cudf::table_view({col0, col1, col2}); - CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + + result = cudf::strings::rsplit_re(sv, "\\s+"); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } { auto result = cudf::strings::split_re(sv, "[eé]"); - cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, {1, 0, 1, 1, 1}); + cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, validity); cudf::test::strings_column_wrapper col1({"llo th", "", " som", "st String", ""}, {1, 0, 1, 1, 0}); cudf::test::strings_column_wrapper col2({"s", "", " ", "", ""}, {1, 0, 1, 0, 0}); cudf::test::strings_column_wrapper col3({"", "", "", "", ""}, {1, 0, 0, 0, 0}); auto expected = cudf::table_view({col0, col1, col2, col3}); - CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected); - } -} - -TEST_F(StringsSplitTest, SplitRegexWithMaxSplit) -{ - std::vector h_strings{" Héllo\tthesé", nullptr, "are\nsome ", "tést\rString", ""}; - auto validity = - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); - cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); - auto sv = cudf::strings_column_view(input); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); - auto result = cudf::strings::split_re(sv, "\\s+", 1); - - cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1}); - cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some ", "String", ""}, - {1, 0, 1, 1, 0}); - auto expected = cudf::table_view({col0, col1}); - CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected); + result = cudf::strings::rsplit_re(sv, "[eé]"); + 
CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + } } TEST_F(StringsSplitTest, SplitRecordRegex) @@ -376,30 +350,60 @@ TEST_F(StringsSplitTest, SplitRecordRegex) cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); auto sv = cudf::strings_column_view(input); - auto result = cudf::strings::split_record_re(sv, "[eé]"); - using LCW = cudf::test::lists_column_wrapper; - LCW expected( - {LCW{" H", "llo th", "s", ""}, LCW{}, LCW{"ar", " som", " "}, LCW{"t", "st String"}, LCW{""}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + { + auto result = cudf::strings::split_record_re(sv, "\\s+"); + + LCW expected( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", ""}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + + result = cudf::strings::rsplit_record_re(sv, "\\s+"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + } + + { + auto result = cudf::strings::split_record_re(sv, "[eé]"); + + LCW expected({LCW{" H", "llo th", "s", ""}, + LCW{}, + LCW{"ar", " som", " "}, + LCW{"t", "st String"}, + LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + + result = cudf::strings::rsplit_record_re(sv, "[eé]"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + } } -TEST_F(StringsSplitTest, SplitRecordRegexWithMaxSplit) +TEST_F(StringsSplitTest, SplitRegexWithMaxSplit) { std::vector h_strings{" Héllo\tthesé", nullptr, "are\nsome ", "tést\rString", ""}; auto validity = thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); auto sv = cudf::strings_column_view(input); + { + auto result = cudf::strings::split_re(sv, "\\s+", 1); - auto result = cudf::strings::split_record_re(sv, "\\s", 1); + cudf::test::strings_column_wrapper col0({"", "", "are", "tést", 
""}, {1, 0, 1, 1, 1}); + cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some ", "String", ""}, + {1, 0, 1, 1, 0}); + auto expected = cudf::table_view({col0, col1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + } + { + auto result = cudf::strings::split_record_re(sv, "\\s", 1); - using LCW = cudf::test::lists_column_wrapper; - LCW expected( - {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + } } TEST_F(StringsSplitTest, RSplitRecord) @@ -493,16 +497,58 @@ TEST_F(StringsSplitTest, RSplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } -TEST_F(StringsSplitTest, SplitRecordZeroSizeStringsColumns) +TEST_F(StringsSplitTest, RSplitRegexWithMaxSplit) +{ + std::vector h_strings{" Héllo\tthesé", nullptr, "are some\n ", "tést\rString", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + { + auto result = cudf::strings::rsplit_re(sv, "\\s+", 1); + + cudf::test::strings_column_wrapper col0({" Héllo", "", "are some", "tést", ""}, validity); + cudf::test::strings_column_wrapper col1({"thesé", "", "", "String", ""}, {1, 0, 1, 1, 0}); + auto expected = cudf::table_view({col0, col1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + } + { + auto result = cudf::strings::rsplit_record_re(sv, "\\s+", 1); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{" Héllo", "thesé"}, LCW{}, LCW{"are some", ""}, LCW{"tést", "String"}, LCW{""}}, 
+ validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + } +} + +TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto results = cudf::strings::split(zero_size_strings_column); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::rsplit(zero_size_strings_column); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::split_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::rsplit_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + auto result = cudf::strings::split_record(zero_size_strings_column); EXPECT_TRUE(result->size() == 0); result = cudf::strings::rsplit_record(zero_size_strings_column); EXPECT_TRUE(result->size() == 0); result = cudf::strings::split_record_re(zero_size_strings_column, "\\s"); EXPECT_TRUE(result->size() == 0); + result = cudf::strings::rsplit_record_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(result->size() == 0); } // This test specifically for https://github.com/rapidsai/custrings/issues/119 @@ -526,6 +572,20 @@ TEST_F(StringsSplitTest, AllNullsCase) results = cudf::strings::split_re(sv, "-"); EXPECT_TRUE(results->num_columns() == 1); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::rsplit_re(sv, "-"); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + + auto result = cudf::strings::split_record(sv); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{}}, cudf::test::iterators::all_nulls()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + 
result = cudf::strings::rsplit_record(sv); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + result = cudf::strings::split_record_re(sv, "-"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + result = cudf::strings::rsplit_record_re(sv, "-"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } TEST_F(StringsSplitTest, Partition) From 7bc451b142c84c2505416ae5d8f2d9d979a1989f Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 31 Jan 2022 17:26:52 -0500 Subject: [PATCH 09/39] remove unneeded if-check --- cpp/src/strings/split/split_re.cu | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 9427a900d8d..9dcf7e6f17b 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -64,7 +64,7 @@ struct token_reader_fn { auto const token_offset = d_token_offsets[idx]; auto const token_count = d_token_offsets[idx + 1] - token_offset; - auto d_result = d_tokens + token_offset; // store tokens here + auto const d_result = d_tokens + token_offset; // store tokens here size_type token_idx = 0; size_type begin = 0; // characters @@ -72,7 +72,8 @@ struct token_reader_fn { size_type last_pos = 0; // bytes while (prog.find(idx, d_str, begin, end) > 0) { // get the token (characters just before this match) - auto token = string_index_pair{d_str.data() + last_pos, d_str.byte_offset(begin) - last_pos}; + auto const token = + string_index_pair{d_str.data() + last_pos, d_str.byte_offset(begin) - last_pos}; // store it if we have space if (token_idx < token_count - 1) { d_result[token_idx++] = token; @@ -90,13 +91,10 @@ struct token_reader_fn { } // set the last token to the remainder of the string - if (last_pos <= d_str.size_bytes()) { - d_result[token_idx] = - string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos}; - } + d_result[token_idx] = 
string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos}; if (direction == split_direction::BACKWARD) { - // update first entry -- this happens when max-tokens is hit before the end + // update first entry -- this happens when max-tokens is hit before the end of the string auto const first_offset = d_result[0].first ? static_cast(thrust::distance(d_str.data(), d_result[0].first)) @@ -127,11 +125,11 @@ rmm::device_uvector split_utility(column_device_view const& d mutable_column_view& offsets, rmm::cuda_stream_view stream) { - auto d_offsets = offsets.data(); auto const strings_count = d_strings.size(); - auto const begin = thrust::make_counting_iterator(0); - auto const end = thrust::make_counting_iterator(strings_count); + auto const begin = thrust::make_counting_iterator(0); + auto const end = thrust::make_counting_iterator(strings_count); + auto const d_offsets = offsets.data(); // convert match counts to token offsets auto map_fn = [d_strings, d_offsets, max_tokens] __device__(auto idx) { @@ -140,7 +138,7 @@ rmm::device_uvector split_utility(column_device_view const& d thrust::transform_exclusive_scan( rmm::exec_policy(stream), begin, end + 1, d_offsets, map_fn, 0, thrust::plus{}); - // the last entry is the total number of tokens to be generated + // the last offset entry is the total number of tokens to be generated auto const total_tokens = cudf::detail::get_value(offsets, strings_count, stream); // generate tokens for each string @@ -204,14 +202,16 @@ std::unique_ptr
split_re(strings_column_view const& input, return std::make_unique
(std::move(results)); } + // create the regex device prog from the given pattern auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); auto d_strings = column_device_view::create(input.parent(), stream); + // count the number of delimiters matched in each string auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource()); auto offsets_view = offsets->mutable_view(); auto d_offsets = offsets_view.data(); - // get split tokens from the input column + // get the split tokens from the input column auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); // the columns_count is the maximum number of tokens for any string in the input column @@ -265,13 +265,15 @@ std::unique_ptr split_record_re(strings_column_view const& input, auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); auto const strings_count = input.size(); + // create the regex device prog from the given pattern auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); auto d_strings = column_device_view::create(input.parent(), stream); + // count the number of delimiters matched in each string auto offsets = count_matches(*d_strings, *d_prog, stream, mr); auto offsets_view = offsets->mutable_view(); - // get split tokens from the input column + // get the split tokens from the input column auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); // convert the tokens into one big strings column From 93887b1877733bfc97c29606f8c9a221d8304efb Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 31 Jan 2022 17:27:29 -0500 Subject: [PATCH 10/39] add all empty and all null test cases --- cpp/tests/strings/split_tests.cpp | 36 ++++++++++++++++--------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp 
index d0b695bbc93..4650cbc3c44 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -541,14 +541,14 @@ TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) EXPECT_TRUE(results->num_columns() == 1); EXPECT_TRUE(results->num_rows() == 0); - auto result = cudf::strings::split_record(zero_size_strings_column); - EXPECT_TRUE(result->size() == 0); - result = cudf::strings::rsplit_record(zero_size_strings_column); - EXPECT_TRUE(result->size() == 0); - result = cudf::strings::split_record_re(zero_size_strings_column, "\\s"); - EXPECT_TRUE(result->size() == 0); - result = cudf::strings::rsplit_record_re(zero_size_strings_column, "\\s"); - EXPECT_TRUE(result->size() == 0); + auto list_result = cudf::strings::split_record(zero_size_strings_column); + EXPECT_TRUE(list_result->size() == 0); + list_result = cudf::strings::rsplit_record(zero_size_strings_column); + EXPECT_TRUE(list_result->size() == 0); + list_result = cudf::strings::split_record_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(list_result->size() == 0); + list_result = cudf::strings::rsplit_record_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(list_result->size() == 0); } // This test specifically for https://github.com/rapidsai/custrings/issues/119 @@ -576,16 +576,16 @@ TEST_F(StringsSplitTest, AllNullsCase) EXPECT_TRUE(results->num_columns() == 1); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); - auto result = cudf::strings::split_record(sv); - using LCW = cudf::test::lists_column_wrapper; + auto list_result = cudf::strings::split_record(sv); + using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{}, LCW{}, LCW{}}, cudf::test::iterators::all_nulls()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); - result = cudf::strings::rsplit_record(sv); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); - result = cudf::strings::split_record_re(sv, "-"); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), 
expected); - result = cudf::strings::rsplit_record_re(sv, "-"); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); + list_result = cudf::strings::rsplit_record(sv); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); + list_result = cudf::strings::split_record_re(sv, "-"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); + list_result = cudf::strings::rsplit_record_re(sv, "-"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); } TEST_F(StringsSplitTest, Partition) @@ -773,6 +773,8 @@ TEST_F(StringsSplitTest, InvalidParameter) cudf::logic_error); EXPECT_THROW(cudf::strings::split_re(strings_view, ""), cudf::logic_error); EXPECT_THROW(cudf::strings::split_record_re(strings_view, ""), cudf::logic_error); + EXPECT_THROW(cudf::strings::rsplit_re(strings_view, ""), cudf::logic_error); + EXPECT_THROW(cudf::strings::rsplit_record_re(strings_view, ""), cudf::logic_error); EXPECT_THROW(cudf::strings::partition(strings_view, cudf::string_scalar("", false)), cudf::logic_error); EXPECT_THROW(cudf::strings::rpartition(strings_view, cudf::string_scalar("", false)), From c88eeae8727b9c94f05d15c0e9e3e9714107bf39 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Feb 2022 12:34:45 -0500 Subject: [PATCH 11/39] add more maxsplit gtests --- cpp/include/cudf/strings/split/split_re.hpp | 50 +++++++++++---------- cpp/src/strings/split/split_re.cu | 28 +++++++----- cpp/tests/strings/split_tests.cpp | 34 +++++++++++--- 3 files changed, 71 insertions(+), 41 deletions(-) diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index cf6d23ccd28..c6dc1e5c697 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -28,18 +28,18 @@ namespace strings { */ /** - * @brief Splits individual strings elements into a table of strings columns + * @brief Splits 
strings elements into a table of strings columns * using a regex pattern to delimit each string. * - * Each element generates an array of strings that are stored in corresponding - * rows in the output table -- `table[col,row] = token[col] of string[row]` - * where `token` is the substring between each delimiter. + * Each element generates a vector of strings that are stored in corresponding + * rows in the output table -- `table[col,row] = token[col] of strings[row]` + * where `token` is a substring between delimiters. * - * The number of elements in the output table will be the same as the number of + * The number of rows in the output table will be the same as the number of * elements in the input column. The resulting number of columns will be the * maximum number of tokens found in any input row. * - * The `pattern` is used to identify the separation points within a string + * The `pattern` is used to identify the delimiters within a string * and splitting stops when either `maxsplit` or the end of the string is reached. * * An empty input string will produce a corresponding empty string in the @@ -62,7 +62,7 @@ namespace strings { * * @throw cudf:logic_error if `pattern` is empty. * - * @param strings A column of string elements to be split. + * @param input A column of string elements to be split. * @param pattern The regex pattern for delimiting characters within each string. * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. @@ -70,25 +70,25 @@ namespace strings { * @return A table of columns of strings. */ std::unique_ptr
split_re( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Splits individual strings elements into a table of strings columns + * @brief Splits strings elements into a table of strings columns * using a regex pattern to delimit each string. * - * Each element generates an array of strings that are stored in corresponding + * Each element generates a vector of strings that are stored in corresponding * rows in the output table -- `table[col,row] = token[col] of string[row]` * where `token` is the substring between each delimiter. * - * The number of elements in the output table will be the same as the number of + * The number of rows in the output table will be the same as the number of * elements in the input column. The resulting number of columns will be the * maximum number of tokens found in any input row. * * Splitting occurs by traversing starting from the end of the input string. - * The `pattern` is used to identify the separation points within the string + * The `pattern` is used to identify the delimiters within a string * and splitting stops when either `maxsplit` or the beginning of the string * is reached. * @@ -112,7 +112,7 @@ std::unique_ptr
split_re( * * @throw cudf:logic_error if `pattern` is empty. * - * @param strings A column of string elements to be split. + * @param input A column of string elements to be split. * @param pattern The regex pattern for delimiting characters within each string. * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. @@ -120,24 +120,25 @@ std::unique_ptr
split_re( * @return A table of columns of strings. */ std::unique_ptr
rsplit_re( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Splits individual strings elements into a list of strings + * @brief Splits strings elements into a list column of strings * using the given regex pattern to delimit each string. * * Each element generates an array of strings that are stored in an output - * lists column. + * lists column -- `list[row] = [token1, token2, ...] found in input[row]` + * where `token` is a substring between delimiters. * * The number of elements in the output column will be the same as the number of * elements in the input column. Each individual list item will contain the * new strings for that row. The resulting number of strings in each row can vary * from 0 to `maxsplit + 1`. * - * The `pattern` is used to identify the separation points within a string + * The `pattern` is used to identify the delimiters within a string * and splitting stops when either `maxsplit` or the end of the string is reached. * * An empty input string will produce a corresponding empty list item output row. @@ -161,7 +162,7 @@ std::unique_ptr
rsplit_re( * * @throw cudf:logic_error if `pattern` is empty. * - * @param strings A column of string elements to be split. + * @param input A column of string elements to be split. * @param pattern The regex pattern for delimiting characters within each string. * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. @@ -169,17 +170,18 @@ std::unique_ptr
rsplit_re( * @return Lists column of strings. */ std::unique_ptr split_record_re( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Splits individual strings elements into a list of strings + * @brief Splits strings elements into a list column of strings * using the given regex pattern to delimit each string. * - * Each element generates an array of strings that are stored in an output - * lists column. + * Each element generates a vector of strings that are stored in an output + * lists column -- `list[row] = [token1, token2, ...] found in input[row]` + * where `token` is a substring between delimiters. * * The number of elements in the output column will be the same as the number of * elements in the input column. Each individual list item will contain the @@ -212,7 +214,7 @@ std::unique_ptr split_record_re( * * @throw cudf:logic_error if `pattern` is empty. * - * @param strings A column of string elements to be split. + * @param input A column of string elements to be split. * @param pattern The regex pattern for delimiting characters within each string. * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. @@ -220,7 +222,7 @@ std::unique_ptr split_record_re( * @return Lists column of strings. 
*/ std::unique_ptr rsplit_record_re( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 9dcf7e6f17b..dd71533c773 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -48,6 +48,10 @@ enum class split_direction { /** * @brief Identify the tokens from the `idx'th` string element of `d_strings`. + * + * Each string's tokens are stored in the `d_tokens` vector. + * The `d_token_offsets` specifies the output position within `d_tokens` + * for each string. */ template struct token_reader_fn { @@ -118,12 +122,12 @@ struct token_reader_fn { * The offsets for each token in each string on output. * @param stream CUDA stream used for kernel launches. */ -rmm::device_uvector split_utility(column_device_view const& d_strings, - reprog_device& d_prog, - split_direction direction, - size_type max_tokens, - mutable_column_view& offsets, - rmm::cuda_stream_view stream) +rmm::device_uvector generate_tokens(column_device_view const& d_strings, + reprog_device& d_prog, + split_direction direction, + size_type max_tokens, + mutable_column_view& offsets, + rmm::cuda_stream_view stream) { auto const strings_count = d_strings.size(); @@ -165,7 +169,7 @@ rmm::device_uvector split_utility(column_device_view const& d * @brief Returns string pair for the specified column for each string in `d_strings` * * This is used to build the table result of a split. - * Null is returned if the row is null of if the `column_index` is larger + * Null is returned if the row is null or if the `column_index` is larger * than the token count for that string. */ struct tokens_transform_fn { @@ -211,10 +215,10 @@ std::unique_ptr
split_re(strings_column_view const& input, auto offsets_view = offsets->mutable_view(); auto d_offsets = offsets_view.data(); - // get the split tokens from the input column - auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); + // get the split tokens from the input column; this also converts the counts into offsets + auto tokens = generate_tokens(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); - // the columns_count is the maximum number of tokens for any string in the input column + // the output column count is the maximum number of tokens generated for any input string auto const columns_count = thrust::transform_reduce( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -273,8 +277,8 @@ std::unique_ptr split_record_re(strings_column_view const& input, auto offsets = count_matches(*d_strings, *d_prog, stream, mr); auto offsets_view = offsets->mutable_view(); - // get the split tokens from the input column - auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); + // get the split tokens from the input column; this also converts the counts into offsets + auto tokens = generate_tokens(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); // convert the tokens into one big strings column auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 4650cbc3c44..f0d7315929b 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -322,6 +322,7 @@ TEST_F(StringsSplitTest, SplitRegex) auto expected = cudf::table_view({col0, col1, col2}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + // rsplit == split when using default parameters result = cudf::strings::rsplit_re(sv, "\\s+"); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } @@ -337,6 +338,7 @@ 
TEST_F(StringsSplitTest, SplitRegex) auto expected = cudf::table_view({col0, col1, col2, col3}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + // rsplit == split when using default parameters result = cudf::strings::rsplit_re(sv, "[eé]"); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } @@ -359,6 +361,7 @@ TEST_F(StringsSplitTest, SplitRecordRegex) validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + // rsplit == split when using default parameters result = cudf::strings::rsplit_record_re(sv, "\\s+"); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } @@ -374,6 +377,7 @@ TEST_F(StringsSplitTest, SplitRecordRegex) validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + // rsplit == split when using default parameters result = cudf::strings::rsplit_record_re(sv, "[eé]"); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } @@ -394,15 +398,31 @@ TEST_F(StringsSplitTest, SplitRegexWithMaxSplit) {1, 0, 1, 1, 0}); auto expected = cudf::table_view({col0, col1}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + + // split everything is the same output as maxsplit==2 for the test input column here + result = cudf::strings::split_re(sv, "\\s+", 2); + auto expected2 = cudf::strings::split_re(sv, "\\s+"); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected2->view()); } { auto result = cudf::strings::split_record_re(sv, "\\s", 1); using LCW = cudf::test::lists_column_wrapper; - LCW expected( + LCW expected1( {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}}, validity); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected1); + + result = cudf::strings::split_record_re(sv, "\\s", 2); + LCW expected2( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", " "}, LCW{"tést", "String"}, LCW{""}}, + validity); + 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected2); + + // split everything is the same output as maxsplit==3 for the test input column here + result = cudf::strings::split_record_re(sv, "\\s", 3); + auto expected0 = cudf::strings::split_record_re(sv, "\\s"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); } } @@ -521,6 +541,11 @@ TEST_F(StringsSplitTest, RSplitRegexWithMaxSplit) {LCW{" Héllo", "thesé"}, LCW{}, LCW{"are some", ""}, LCW{"tést", "String"}, LCW{""}}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + + // split everything is the same output as any maxsplit > 2 for the test input column here + result = cudf::strings::rsplit_record_re(sv, "\\s+", 3); + auto expected0 = cudf::strings::rsplit_record_re(sv, "\\s+"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); } } @@ -760,9 +785,8 @@ TEST_F(StringsSplitTest, PartitionZeroSizeStringsColumns) TEST_F(StringsSplitTest, InvalidParameter) { - std::vector h_strings{"string left intentionally blank"}; - cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); - auto strings_view = cudf::strings_column_view(strings); + cudf::test::strings_column_wrapper input({"string left intentionally blank"}); + auto strings_view = cudf::strings_column_view(input); EXPECT_THROW(cudf::strings::split(strings_view, cudf::string_scalar("", false)), cudf::logic_error); EXPECT_THROW(cudf::strings::rsplit(strings_view, cudf::string_scalar("", false)), From 79887d85c9d38916e13069cff2ad76a02ed9a59d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 4 Feb 2022 13:41:00 -0700 Subject: [PATCH 12/39] Change JNI to add a new boolean flag for regex split --- java/src/main/native/src/ColumnViewJni.cpp | 82 ++++++++-------------- 1 file changed, 29 insertions(+), 53 deletions(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index aef6b57230e..3985de41b32 100644 --- 
a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -561,67 +562,42 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, - jlong column_view, - jlong delimiter_ptr, - jint max_split) { - JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, delimiter_ptr, "string scalar delimiter is null", 0); + jlong input_handle, + jlong delimiter_handle, + jint max_split, + jboolean split_by_regex) { + JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); + JNI_NULL_CHECK(env, delimiter_handle, "string scalar delimiter is null", 0); try { cudf::jni::auto_set_device(env); - cudf::strings_column_view const scv{*reinterpret_cast(column_view)}; - auto delimiter = reinterpret_cast(delimiter_ptr); + auto const input = reinterpret_cast(input_handle); + auto const strs_input = cudf::strings_column_view{*input}; + auto const delimiter = reinterpret_cast(delimiter_handle); - return cudf::jni::convert_table_for_return(env, - cudf::strings::split(scv, *delimiter, max_split)); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRegex(JNIEnv *env, jclass, - jlong column_view, - jlong delimiter) { - JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0); - try { - cudf::jni::auto_set_device(env); - cudf::column_view *cv = reinterpret_cast(column_view); - cudf::strings_column_view scv(*cv); - cudf::string_scalar *ss_scalar = reinterpret_cast(delimiter); - - std::unique_ptr table_result = cudf::strings::split_re(scv, *ss_scalar); - return cudf::jni::convert_table_for_return(env, table_result); + auto result = split_by_regex ? 
cudf::strings::split_re(strs_input, *delimiter, max_split) : + cudf::strings::split(strs_input, *delimiter, max_split); + return cudf::jni::convert_table_for_return(env, std::move(result)); } CATCH_STD(env, 0); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass, - jlong column_view, - jlong delimiter, - jint max_split) { - JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0); - try { - cudf::jni::auto_set_device(env); - cudf::column_view *cv = reinterpret_cast(column_view); - cudf::strings_column_view scv(*cv); - cudf::string_scalar *ss_scalar = reinterpret_cast(delimiter); - return release_as_jlong(cudf::strings::split_record(scv, *ss_scalar, max_split)); - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecordRegex(JNIEnv *env, jclass, - jlong column_view, - jlong delimiter, - jint max_split) { - JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0); - try { - cudf::jni::auto_set_device(env); - cudf::column_view *cv = reinterpret_cast(column_view); - cudf::strings_column_view scv(*cv); - cudf::string_scalar *ss_scalar = reinterpret_cast(delimiter); - return release_as_jlong(cudf::strings::split_record_re(scv, *ss_scalar, max_split)); + jlong input_handle, + jlong delimiter_handle, + jint max_split, + jboolean split_by_regex) { + JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); + JNI_NULL_CHECK(env, delimiter_handle, "delimiter_handle is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(input_handle); + auto const strs_input = cudf::strings_column_view{*input}; + auto const delimiter = reinterpret_cast(delimiter_handle); + + auto result = split_by_regex ? 
+ cudf::strings::split_record_re(strs_input, *delimiter, max_split) : + cudf::strings::split_record(strs_input, *delimiter, max_split); + return release_as_jlong(result); } CATCH_STD(env, 0); } From 28524b3c6bd7cd061dbea92f2c6710396dfc97b6 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 4 Feb 2022 14:19:33 -0700 Subject: [PATCH 13/39] Implement all possible overloads for stringSplit binding --- .../main/java/ai/rapids/cudf/ColumnView.java | 154 +++++++++++++----- 1 file changed, 116 insertions(+), 38 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 8155fe79080..1bf0302d1a1 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -826,18 +826,18 @@ public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... co /** * Creates a deep copy of a column while replacing the validity mask. The validity mask is the * device_vector equivalent of the boolean column given as argument. - * + * * The boolColumn must have the same number of rows as the current column. - * The result column will have the same number of rows as the current column. + * The result column will have the same number of rows as the current column. * For all indices `i` where the boolColumn is `true`, the result column will have a valid value at index i. * For all other values (i.e. `false` or `null`), the result column will have nulls. - * + * * If the current column has a null at a given index `i`, and the new validity mask is `true` at index `i`, * then the row value is undefined. - * + * * @param boolColumn bool column whose value is to be used as the validity mask. * @return Deep copy of the column with replaced validity mask. 
- */ + */ public final ColumnVector copyWithBooleanColumnAsValidity(ColumnView boolColumn) { return new ColumnVector(copyWithBooleanColumnAsValidity(getNativeView(), boolColumn.getNativeView())); } @@ -2352,81 +2352,157 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) { * @param delimiter UTF-8 encoded string identifying the split points in each string. * An empty string indicates split on whitespace. * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + * @param splitByRegex a boolean flag indicating whether the input string will be split by a + * regular expression pattern or just by a string literal delimiter. * @return New table of strings columns. */ - public final Table stringSplit(Scalar delimiter, int maxSplit) { + public final Table stringSplit(Scalar delimiter, int maxSplit, boolean splitByRegex) { assert type.equals(DType.STRING) : "column type must be a String"; assert delimiter != null : "delimiter may not be null"; assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; - return new Table(stringSplit(this.getNativeView(), delimiter.getScalarHandle(), maxSplit)); + return new Table(stringSplit(this.getNativeView(), + delimiter.getScalarHandle(), + maxSplit, + splitByRegex)); } - + /** - * Returns a list of columns by splitting each string using the specified delimiter. + * Returns a list of columns by splitting each string using the specified string literal delimiter. * The number of rows in the output columns will be the same as the input column. * Null entries are added for a row where split results have been exhausted. * Null string entries return corresponding null output columns. * @param delimiter UTF-8 encoded string identifying the split points in each string. * An empty string indicates split on whitespace. + * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. * @return New table of strings columns. 
*/ - public final Table stringSplit(Scalar delimiter) { - return stringSplit(delimiter, -1); + public final Table stringSplit(Scalar delimiter, int maxSplit) { + return stringSplit(delimiter, maxSplit, false); } /** - * Returns a list of columns by splitting each string using whitespace as the delimiter. + * Returns a list of columns by splitting each string using the specified delimiter. * The number of rows in the output columns will be the same as the input column. * Null entries are added for a row where split results have been exhausted. * Null string entries return corresponding null output columns. + * @param delimiter UTF-8 encoded string identifying the split points in each string. + * An empty string indicates split on whitespace. + * @param splitByRegex a boolean flag indicating whether the input string will be split by a + * regular expression pattern or just by a string literal delimiter. * @return New table of strings columns. */ - public final Table stringSplit() { - try (Scalar emptyString = Scalar.fromString("")) { - return stringSplit(emptyString, -1); - } + public final Table stringSplit(Scalar delimiter, boolean splitByRegex) { + return stringSplit(delimiter, -1, splitByRegex); } /** - * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. + * Returns a list of columns by splitting each string using the specified string literal delimiter. + * The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. + * Null string entries return corresponding null output columns. + * @param delimiter UTF-8 encoded string identifying the split points in each string. + * An empty string indicates split on whitespace. + * @return New table of strings columns. 
*/ - public final ColumnVector stringSplitRecord() { - return stringSplitRecord(-1); + public final Table stringSplit(Scalar delimiter) { + return stringSplit(delimiter, -1, false); } /** - * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. + * Returns a list of columns by splitting each string using whitespace as the delimiter. + * The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. + * Null string entries return corresponding null output columns. * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + * @return New table of strings columns. */ - public final ColumnVector stringSplitRecord(int maxSplit) { + public final Table stringSplit(int maxSplit) { try (Scalar emptyString = Scalar.fromString("")) { - return stringSplitRecord(emptyString, maxSplit); + return stringSplit(emptyString, maxSplit, false); } } /** - * Returns a column of lists of strings by splitting each string using the specified delimiter. + * Returns a list of columns by splitting each string using whitespace as the delimiter. + * The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. + * Null string entries return corresponding null output columns. + * @return New table of strings columns. + */ + public final Table stringSplit() { + return stringSplit(-1); + } + + /** + * Returns a column that is a list of strings. Each string list is made by splitting each input + * string using the specified delimiter. * @param delimiter UTF-8 encoded string identifying the split points in each string. * An empty string indicates split on whitespace. + * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. 
+ * @param splitByRegex a boolean flag indicating whether the input string will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return New table of strings columns. */ - public final ColumnVector stringSplitRecord(Scalar delimiter) { - return stringSplitRecord(delimiter, -1); + public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit, boolean splitByRegex) { + assert type.equals(DType.STRING) : "column type must be String"; + assert delimiter != null : "delimiter may not be null"; + assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; + return new ColumnVector( + stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit, splitByRegex)); } /** * Returns a column that is a list of strings. Each string list is made by splitting each input - * string using the specified delimiter. + * string using the specified string literal delimiter. * @param delimiter UTF-8 encoded string identifying the split points in each string. * An empty string indicates split on whitespace. * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. * @return New table of strings columns. */ public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit) { - assert type.equals(DType.STRING) : "column type must be a String"; + assert type.equals(DType.STRING) : "column type must be String"; assert delimiter != null : "delimiter may not be null"; assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; return new ColumnVector( - stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit)); + stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit, false)); + } + + /** + * Returns a column of lists of strings by splitting each string using the specified delimiter. + * @param delimiter UTF-8 encoded string identifying the split points in each string. 
+ * An empty string indicates split on whitespace. + * @param splitByRegex a boolean flag indicating whether the input string will be split by a + * regular expression pattern or just by a string literal delimiter. + */ + public final ColumnVector stringSplitRecord(Scalar delimiter, boolean splitByRegex) { + return stringSplitRecord(delimiter, -1, splitByRegex); + } + + /** + * Returns a column of lists of strings by splitting each string using the specified string + * literal delimiter. + * @param delimiter UTF-8 encoded string identifying the split points in each string. + * An empty string indicates split on whitespace. + */ + public final ColumnVector stringSplitRecord(Scalar delimiter) { + return stringSplitRecord(delimiter, -1, false); + } + + /** + * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. + * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + */ + public final ColumnVector stringSplitRecord(int maxSplit) { + try (Scalar emptyString = Scalar.fromString("")) { + return stringSplitRecord(emptyString, maxSplit, false); + } + } + + /** + * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. + */ + public final ColumnVector stringSplitRecord() { + return stringSplitRecord(-1); } /** @@ -3248,7 +3324,7 @@ public enum FindOptions {FIND_FIRST, FIND_LAST}; * Create a column of int32 indices, indicating the position of the scalar search key * in each list row. * All indices are 0-based. If a search key is not found, the index is set to -1. - * The index is set to null if one of the following is true: + * The index is set to null if one of the following is true: * 1. The search key is null. * 2. The list row is null. 
* @param key The scalar search key @@ -3265,7 +3341,7 @@ public final ColumnVector listIndexOf(Scalar key, FindOptions findOption) { * Create a column of int32 indices, indicating the position of each row in the * search key column in the corresponding row of the lists column. * All indices are 0-based. If a search key is not found, the index is set to -1. - * The index is set to null if one of the following is true: + * The index is set to null if one of the following is true: * 1. The search key row is null. * 2. The list row is null. * @param keys ColumnView of search keys. @@ -3537,9 +3613,11 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle * @param delimiter UTF-8 encoded string identifying the split points in each string. * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. */ - private static native long[] stringSplit(long columnView, long delimiter, int maxSplit); + private static native long[] stringSplit(long columnView, long delimiter, int maxSplit, + boolean splitByRegex); - private static native long stringSplitRecord(long nativeView, long scalarHandle, int maxSplit); + private static native long stringSplitRecord(long nativeView, long scalarHandle, int maxSplit, + boolean splitByRegex); /** * Native method to calculate substring from a given string column. 0 indexing. @@ -3714,7 +3792,7 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat /** * Native method to search list rows for null elements. * @param nativeView the column view handle of the list - * @return column handle of the resultant boolean column + * @return column handle of the resultant boolean column */ private static native long listContainsNulls(long nativeView); @@ -3896,20 +3974,20 @@ private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] vi /** * Native method to deep copy a column while replacing the null mask. 
The null mask is the * device_vector equivalent of the boolean column given as argument. - * + * * The boolColumn must have the same number of rows as the exemplar column. * The result column will have the same number of rows as the exemplar. * For all indices `i` where the boolean column is `true`, the result column will have a valid value at index i. * For all other values (i.e. `false` or `null`), the result column will have nulls. - * + * * If the exemplar column has a null at a given index `i`, and the new validity mask is `true` at index `i`, * then the resultant row value is undefined. - * + * * @param exemplarViewHandle column view of the column that is deep copied. * @param boolColumnViewHandle bool column whose value is to be used as the null mask. * @return Deep copy of the column with replaced null mask. - */ - private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, + */ + private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, long boolColumnViewHandle) throws CudfException; //////// From 75ffaf839b034517d7af58e8d91fdc5629ce6e12 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 4 Feb 2022 15:59:02 -0700 Subject: [PATCH 14/39] Change JNI for stringSplit and stringSplitRecord --- .../main/java/ai/rapids/cudf/ColumnView.java | 81 +++++++++---------- java/src/main/native/src/ColumnViewJni.cpp | 29 ++++--- .../java/ai/rapids/cudf/ColumnVectorTest.java | 40 ++++----- 3 files changed, 72 insertions(+), 78 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 1bf0302d1a1..6d176e5934a 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2349,21 +2349,16 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) { * The number of rows in the output columns will be the same as the input column. 
* Null entries are added for a row where split results have been exhausted. * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 encoded string identifying the split points in each string. + * @param delimiter UTF-8 string identifying the split points or split pattern in each string. * An empty string indicates split on whitespace. * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. * @param splitByRegex a boolean flag indicating whether the input string will be split by a * regular expression pattern or just by a string literal delimiter. * @return New table of strings columns. */ - public final Table stringSplit(Scalar delimiter, int maxSplit, boolean splitByRegex) { + public final Table stringSplit(String delimiter, int maxSplit, boolean splitByRegex) { assert type.equals(DType.STRING) : "column type must be a String"; - assert delimiter != null : "delimiter may not be null"; - assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; - return new Table(stringSplit(this.getNativeView(), - delimiter.getScalarHandle(), - maxSplit, - splitByRegex)); + return new Table(stringSplit(this.getNativeView(), delimiter, maxSplit, splitByRegex)); } /** @@ -2371,12 +2366,12 @@ public final Table stringSplit(Scalar delimiter, int maxSplit, boolean splitByRe * The number of rows in the output columns will be the same as the input column. * Null entries are added for a row where split results have been exhausted. * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 encoded string identifying the split points in each string. + * @param delimiter UTF-8 string identifying the split points or split pattern in each string. * An empty string indicates split on whitespace. * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. * @return New table of strings columns. 
*/ - public final Table stringSplit(Scalar delimiter, int maxSplit) { + public final Table stringSplit(String delimiter, int maxSplit) { return stringSplit(delimiter, maxSplit, false); } @@ -2385,13 +2380,13 @@ public final Table stringSplit(Scalar delimiter, int maxSplit) { * The number of rows in the output columns will be the same as the input column. * Null entries are added for a row where split results have been exhausted. * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 encoded string identifying the split points in each string. + * @param delimiter UTF-8 string identifying the split points or split pattern in each string. * An empty string indicates split on whitespace. * @param splitByRegex a boolean flag indicating whether the input string will be split by a * regular expression pattern or just by a string literal delimiter. * @return New table of strings columns. */ - public final Table stringSplit(Scalar delimiter, boolean splitByRegex) { + public final Table stringSplit(String delimiter, boolean splitByRegex) { return stringSplit(delimiter, -1, splitByRegex); } @@ -2400,11 +2395,11 @@ public final Table stringSplit(Scalar delimiter, boolean splitByRegex) { * The number of rows in the output columns will be the same as the input column. * Null entries are added for a row where split results have been exhausted. * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 encoded string identifying the split points in each string. + * @param delimiter UTF-8 string identifying the split points or split pattern in each string. * An empty string indicates split on whitespace. * @return New table of strings columns. */ - public final Table stringSplit(Scalar delimiter) { + public final Table stringSplit(String delimiter) { return stringSplit(delimiter, -1, false); } @@ -2417,9 +2412,8 @@ public final Table stringSplit(Scalar delimiter) { * @return New table of strings columns. 
*/ public final Table stringSplit(int maxSplit) { - try (Scalar emptyString = Scalar.fromString("")) { - return stringSplit(emptyString, maxSplit, false); - } + String emptyString = ""; + return stringSplit(emptyString, maxSplit, false); } /** @@ -2436,55 +2430,48 @@ public final Table stringSplit() { /** * Returns a column that is a list of strings. Each string list is made by splitting each input * string using the specified delimiter. - * @param delimiter UTF-8 encoded string identifying the split points in each string. + * @param delimiter UTF-8 string identifying the split points or split pattern in each string. * An empty string indicates split on whitespace. * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. * @param splitByRegex a boolean flag indicating whether the input string will be split by a * regular expression pattern or just by a string literal delimiter. * @return New table of strings columns. */ - public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit, boolean splitByRegex) { + public final ColumnVector stringSplitRecord(String delimiter, int maxSplit, boolean splitByRegex) { assert type.equals(DType.STRING) : "column type must be String"; - assert delimiter != null : "delimiter may not be null"; - assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; return new ColumnVector( - stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit, splitByRegex)); + stringSplitRecord(this.getNativeView(), delimiter, maxSplit, splitByRegex)); } /** * Returns a column that is a list of strings. Each string list is made by splitting each input * string using the specified string literal delimiter. - * @param delimiter UTF-8 encoded string identifying the split points in each string. + * @param delimiter UTF-8 string identifying the split points or split pattern in each string. * An empty string indicates split on whitespace. 
* @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. * @return New table of strings columns. */ - public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit) { - assert type.equals(DType.STRING) : "column type must be String"; - assert delimiter != null : "delimiter may not be null"; - assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; - return new ColumnVector( - stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit, false)); + public final ColumnVector stringSplitRecord(String delimiter, int maxSplit) { + return stringSplitRecord(delimiter, maxSplit, false); } - /** * Returns a column of lists of strings by splitting each string using the specified delimiter. - * @param delimiter UTF-8 encoded string identifying the split points in each string. + * @param delimiter UTF-8 string identifying the split points or split pattern in each string. * An empty string indicates split on whitespace. * @param splitByRegex a boolean flag indicating whether the input string will be split by a * regular expression pattern or just by a string literal delimiter. */ - public final ColumnVector stringSplitRecord(Scalar delimiter, boolean splitByRegex) { + public final ColumnVector stringSplitRecord(String delimiter, boolean splitByRegex) { return stringSplitRecord(delimiter, -1, splitByRegex); } /** * Returns a column of lists of strings by splitting each string using the specified string * literal delimiter. - * @param delimiter UTF-8 encoded string identifying the split points in each string. + * @param delimiter UTF-8 string identifying the split points or split pattern in each string. * An empty string indicates split on whitespace. 
*/ - public final ColumnVector stringSplitRecord(Scalar delimiter) { + public final ColumnVector stringSplitRecord(String delimiter) { return stringSplitRecord(delimiter, -1, false); } @@ -2493,9 +2480,8 @@ public final ColumnVector stringSplitRecord(Scalar delimiter) { * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. */ public final ColumnVector stringSplitRecord(int maxSplit) { - try (Scalar emptyString = Scalar.fromString("")) { - return stringSplitRecord(emptyString, maxSplit, false); - } + String emptyString = ""; + return stringSplitRecord(emptyString, maxSplit, false); } /** @@ -3607,16 +3593,27 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle private static native long substringLocate(long columnView, long substringScalar, int start, int end); /** - * Native method which returns array of columns by splitting each string using the specified + * Native method which returns an array of columns by splitting each string using the specified * delimiter. - * @param columnView native handle of the cudf::column_view being operated on. - * @param delimiter UTF-8 encoded string identifying the split points in each string. + * @param nativeHandle native handle of the cudf::column_view being operated on. + * @param delimiter UTF-8 string identifying the split points or split pattern in each string. * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + * @param splitByRegex a boolean flag indicating whether the input string will be split by a + * regular expression pattern or just by a string literal delimiter. 
*/ - private static native long[] stringSplit(long columnView, long delimiter, int maxSplit, + private static native long[] stringSplit(long nativeHandle, String delimiter, int maxSplit, boolean splitByRegex); - private static native long stringSplitRecord(long nativeView, long scalarHandle, int maxSplit, + /** + * Native method which returns a LIST column by splitting each string into a list of strings + * using the specified delimiter. + * @param nativeHandle native handle of the cudf::column_view being operated on. + * @param delimiter UTF-8 string identifying the split points or split pattern in each string. + * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + * @param splitByRegex a boolean flag indicating whether the input string will be split by a + * regular expression pattern or just by a string literal delimiter. + */ + private static native long stringSplitRecord(long nativeHandle, String delimiter, int maxSplit, boolean splitByRegex); /** diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 3985de41b32..c6964f98a53 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -563,19 +563,23 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, jlong input_handle, - jlong delimiter_handle, + jstring delimiter, jint max_split, jboolean split_by_regex) { JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); - JNI_NULL_CHECK(env, delimiter_handle, "string scalar delimiter is null", 0); try { cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(input_handle); auto const strs_input = cudf::strings_column_view{*input}; - auto const delimiter = reinterpret_cast(delimiter_handle); - auto result = split_by_regex ? 
cudf::strings::split_re(strs_input, *delimiter, max_split) : - cudf::strings::split(strs_input, *delimiter, max_split); + auto const delimiter_content = env->GetStringUTFChars(delimiter, nullptr); + auto const str_delimiter = std::string{delimiter_content}; + + auto result = + split_by_regex ? + cudf::strings::split_re(strs_input, str_delimiter, max_split) : + cudf::strings::split(strs_input, cudf::string_scalar{str_delimiter}, max_split); return cudf::jni::convert_table_for_return(env, std::move(result)); } CATCH_STD(env, 0); @@ -583,20 +587,23 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv * JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass, jlong input_handle, - jlong delimiter_handle, + jstring delimiter, jint max_split, jboolean split_by_regex) { JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); - JNI_NULL_CHECK(env, delimiter_handle, "delimiter_handle is null", 0); try { cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(input_handle); auto const strs_input = cudf::strings_column_view{*input}; - auto const delimiter = reinterpret_cast(delimiter_handle); - auto result = split_by_regex ? - cudf::strings::split_record_re(strs_input, *delimiter, max_split) : - cudf::strings::split_record(strs_input, *delimiter, max_split); + auto const delimiter_content = env->GetStringUTFChars(delimiter, nullptr); + auto const str_delimiter = std::string{delimiter_content}; + + auto result = + split_by_regex ? 
+ cudf::strings::split_record_re(strs_input, str_delimiter, max_split) : + cudf::strings::split_record(strs_input, cudf::string_scalar{str_delimiter}, max_split); return release_as_jlong(result); } CATCH_STD(env, 0); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index f9c8029ed84..05c7f2e18ce 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4741,24 +4741,25 @@ void testListSortRowsWithStringChild() { @Test void testStringSplitRecord() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo there", "thésé", "null", "", "ARé some", "test strings"); - ColumnVector expected = ColumnVector.fromLists( - new HostColumnVector.ListType(true, - new HostColumnVector.BasicType(true, DType.STRING)), - Arrays.asList("Héllo", "there"), - Arrays.asList("thésé"), - Arrays.asList("null"), - Arrays.asList(""), - Arrays.asList("ARé", "some"), - Arrays.asList("test", "strings")); - Scalar pattern = Scalar.fromString(" "); - ColumnVector result = v.stringSplitRecord(pattern, -1)) { - assertColumnsAreEqual(expected, result); - } + String pattern = " "; + try (ColumnVector v = ColumnVector.fromStrings("Héllo there", "thésé", "null", "", "ARé some", "test strings"); + ColumnVector expected = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there"), + Arrays.asList("thésé"), + Arrays.asList("null"), + Arrays.asList(""), + Arrays.asList("ARé", "some"), + Arrays.asList("test", "strings")); + ColumnVector result = v.stringSplitRecord(pattern, -1)) { + assertColumnsAreEqual(expected, result); + } } @Test void testStringSplit() { + String pattern = " "; try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", "ARé some things", "test strings here"); Table expectedSplitOnce = new Table.TestBuilder() 
.column("Héllo", "thésé", null, "", "ARé", "test") @@ -4769,7 +4770,6 @@ void testStringSplit() { .column("there", null, null, null, "some", "strings") .column("all", null, null, null, "things", "here") .build(); - Scalar pattern = Scalar.fromString(" "); Table resultSplitOnce = v.stringSplit(pattern, 1); Table resultSplitAll = v.stringSplit(pattern)) { assertTablesAreEqual(expectedSplitOnce, resultSplitOnce); @@ -4790,16 +4790,6 @@ void teststringSplitWhiteSpace() { @Test void teststringSplitThrowsException() { - assertThrows(CudfException.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Scalar delimiter = Scalar.fromString(null); - Table result = cv.stringSplit(delimiter)) {} - }); - assertThrows(AssertionError.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Scalar delimiter = Scalar.fromInt(1); - Table result = cv.stringSplit(delimiter)) {} - }); assertThrows(AssertionError.class, () -> { try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); Table result = cv.stringSplit(null)) {} From b3604c9057cbe896824c62972fbc5970bcacf2e7 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 4 Feb 2022 16:12:48 -0700 Subject: [PATCH 15/39] Rename tests --- java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 05c7f2e18ce..024640fefb1 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4778,7 +4778,7 @@ void testStringSplit() { } @Test - void teststringSplitWhiteSpace() { + void testStringSplitWhiteSpace() { try (ColumnVector v = ColumnVector.fromStrings("Héllo thesé", null, "are\tsome", "tést\nString", " "); Table expected = new 
Table.TestBuilder().column("Héllo", null, "are", "tést", null) .column("thesé", null, "some", "String", null) @@ -4789,7 +4789,7 @@ void teststringSplitWhiteSpace() { } @Test - void teststringSplitThrowsException() { + void testStringSplitThrowsException() { assertThrows(AssertionError.class, () -> { try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); Table result = cv.stringSplit(null)) {} From 61605ef727e361d9c83b311e67afb7284022903a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 4 Feb 2022 16:15:41 -0700 Subject: [PATCH 16/39] Remove test --- java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 8 -------- 1 file changed, 8 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 024640fefb1..e9d7bcbfcbd 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4788,14 +4788,6 @@ void testStringSplitWhiteSpace() { } } - @Test - void testStringSplitThrowsException() { - assertThrows(AssertionError.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Table result = cv.stringSplit(null)) {} - }); - } - @Test void testsubstringColumn() { try (ColumnVector v = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); From 8307bfe091b3e1e67cc623ea007cc9400a24ab44 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 4 Feb 2022 16:15:46 -0700 Subject: [PATCH 17/39] Add assert --- java/src/main/java/ai/rapids/cudf/ColumnView.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 6d176e5934a..740163f7482 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2357,6 +2357,7 @@ public final ColumnVector 
stringLocate(Scalar substring, int start, int end) { * @return New table of strings columns. */ public final Table stringSplit(String delimiter, int maxSplit, boolean splitByRegex) { + assert delimiter != null : "delimiter is null"; assert type.equals(DType.STRING) : "column type must be a String"; return new Table(stringSplit(this.getNativeView(), delimiter, maxSplit, splitByRegex)); } @@ -2438,6 +2439,7 @@ public final Table stringSplit() { * @return New table of strings columns. */ public final ColumnVector stringSplitRecord(String delimiter, int maxSplit, boolean splitByRegex) { + assert delimiter != null : "delimiter is null"; assert type.equals(DType.STRING) : "column type must be String"; return new ColumnVector( stringSplitRecord(this.getNativeView(), delimiter, maxSplit, splitByRegex)); From 75dc621f804bcfe0ff48bc5a7d7b2f4e5ba9a217 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 4 Feb 2022 16:25:23 -0700 Subject: [PATCH 18/39] Add assert --- java/src/main/java/ai/rapids/cudf/ColumnView.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 740163f7482..b6503e01cf3 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2358,6 +2358,7 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) { */ public final Table stringSplit(String delimiter, int maxSplit, boolean splitByRegex) { assert delimiter != null : "delimiter is null"; + assert (delimiter.length() == 0) ^ splitByRegex : "cannot split by empty regex"; assert type.equals(DType.STRING) : "column type must be a String"; return new Table(stringSplit(this.getNativeView(), delimiter, maxSplit, splitByRegex)); } @@ -2440,6 +2441,7 @@ public final Table stringSplit() { */ public final ColumnVector stringSplitRecord(String delimiter, int maxSplit, boolean splitByRegex) { assert delimiter != null : 
"delimiter is null"; + assert (delimiter.length() == 0) ^ splitByRegex : "cannot split by empty regex"; assert type.equals(DType.STRING) : "column type must be String"; return new ColumnVector( stringSplitRecord(this.getNativeView(), delimiter, maxSplit, splitByRegex)); From 4a2066272f0a9dd9fe349fbc02abb3e128fd2565 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 4 Feb 2022 16:28:11 -0700 Subject: [PATCH 19/39] Fix assert --- java/src/main/java/ai/rapids/cudf/ColumnView.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index b6503e01cf3..912f9f7649b 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2358,7 +2358,7 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) { */ public final Table stringSplit(String delimiter, int maxSplit, boolean splitByRegex) { assert delimiter != null : "delimiter is null"; - assert (delimiter.length() == 0) ^ splitByRegex : "cannot split by empty regex"; + assert delimiter.length() > 0 || !splitByRegex : "cannot split by empty regex"; assert type.equals(DType.STRING) : "column type must be a String"; return new Table(stringSplit(this.getNativeView(), delimiter, maxSplit, splitByRegex)); } @@ -2441,7 +2441,7 @@ public final Table stringSplit() { */ public final ColumnVector stringSplitRecord(String delimiter, int maxSplit, boolean splitByRegex) { assert delimiter != null : "delimiter is null"; - assert (delimiter.length() == 0) ^ splitByRegex : "cannot split by empty regex"; + assert delimiter.length() > 0 || !splitByRegex : "cannot split by empty regex"; assert type.equals(DType.STRING) : "column type must be String"; return new ColumnVector( stringSplitRecord(this.getNativeView(), delimiter, maxSplit, splitByRegex)); From f915f7e92e2adf091c16d182d8b152de27b66c92 Mon Sep 17 00:00:00 2001 From: Nghia Truong 
Date: Fri, 4 Feb 2022 17:38:53 -0700 Subject: [PATCH 20/39] Fix string construction from jstring --- java/src/main/native/src/ColumnViewJni.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index c6964f98a53..dcc6085c509 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -573,8 +573,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv * auto const input = reinterpret_cast(input_handle); auto const strs_input = cudf::strings_column_view{*input}; - auto const delimiter_content = env->GetStringUTFChars(delimiter, nullptr); - auto const str_delimiter = std::string{delimiter_content}; + auto const delimiter_chars = env->GetStringUTFChars(delimiter, nullptr); + auto const delimiter_size = env->GetStringUTFLength(delimiter); + auto const str_delimiter = std::string(delimiter_chars, delimiter_size); + env->ReleaseStringUTFChars(delimiter, delimiter_chars); auto result = split_by_regex ? @@ -597,8 +599,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv auto const input = reinterpret_cast(input_handle); auto const strs_input = cudf::strings_column_view{*input}; - auto const delimiter_content = env->GetStringUTFChars(delimiter, nullptr); - auto const str_delimiter = std::string{delimiter_content}; + auto const delimiter_chars = env->GetStringUTFChars(delimiter, nullptr); + auto const delimiter_size = env->GetStringUTFLength(delimiter); + auto const str_delimiter = std::string(delimiter_chars, delimiter_size); + env->ReleaseStringUTFChars(delimiter, delimiter_chars); auto result = split_by_regex ? 
From 4176563e6dca76b1618c4661eaac4b31574d99fb Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 7 Feb 2022 14:44:37 -0700 Subject: [PATCH 21/39] Rename variable and rewrite javadoc --- .../main/java/ai/rapids/cudf/ColumnView.java | 62 ++++++++++++------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 912f9f7649b..e7af237dd9a 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2351,16 +2351,19 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) { * Null string entries return corresponding null output columns. * @param delimiter UTF-8 string identifying the split points or split pattern in each string. * An empty string indicates split on whitespace. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + * @param limit the maximum size of the array resulting from splitting the input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. * @param splitByRegex a boolean flag indicating whether the input string will be split by a * regular expression pattern or just by a string literal delimiter. * @return New table of strings columns. 
*/ - public final Table stringSplit(String delimiter, int maxSplit, boolean splitByRegex) { + public final Table stringSplit(String delimiter, int limit, boolean splitByRegex) { assert delimiter != null : "delimiter is null"; assert delimiter.length() > 0 || !splitByRegex : "cannot split by empty regex"; assert type.equals(DType.STRING) : "column type must be a String"; - return new Table(stringSplit(this.getNativeView(), delimiter, maxSplit, splitByRegex)); + assert limit != 0 && limit != 1; + return new Table(stringSplit(this.getNativeView(), delimiter, limit, splitByRegex)); } /** @@ -2370,11 +2373,13 @@ public final Table stringSplit(String delimiter, int maxSplit, boolean splitByRe * Null string entries return corresponding null output columns. * @param delimiter UTF-8 string identifying the split points or split pattern in each string. * An empty string indicates split on whitespace. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + * @param limit the maximum size of the array resulting from splitting the input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. * @return New table of strings columns. */ - public final Table stringSplit(String delimiter, int maxSplit) { - return stringSplit(delimiter, maxSplit, false); + public final Table stringSplit(String delimiter, int limit) { + return stringSplit(delimiter, limit, false); } /** @@ -2410,12 +2415,14 @@ public final Table stringSplit(String delimiter) { * The number of rows in the output columns will be the same as the input column. * Null entries are added for a row where split results have been exhausted. * Null string entries return corresponding null output columns. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. 
+ * @param limit the maximum size of the array resulting from splitting the input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. * @return New table of strings columns. */ - public final Table stringSplit(int maxSplit) { + public final Table stringSplit(int limit) { String emptyString = ""; - return stringSplit(emptyString, maxSplit, false); + return stringSplit(emptyString, limit, false); } /** @@ -2434,17 +2441,20 @@ public final Table stringSplit() { * string using the specified delimiter. * @param delimiter UTF-8 string identifying the split points or split pattern in each string. * An empty string indicates split on whitespace. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + * @param limit the maximum size of the array resulting from splitting the input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. * @param splitByRegex a boolean flag indicating whether the input string will be split by a * regular expression pattern or just by a string literal delimiter. * @return New table of strings columns. 
*/ - public final ColumnVector stringSplitRecord(String delimiter, int maxSplit, boolean splitByRegex) { + public final ColumnVector stringSplitRecord(String delimiter, int limit, boolean splitByRegex) { assert delimiter != null : "delimiter is null"; assert delimiter.length() > 0 || !splitByRegex : "cannot split by empty regex"; assert type.equals(DType.STRING) : "column type must be String"; + assert limit != 0 && limit != 1; return new ColumnVector( - stringSplitRecord(this.getNativeView(), delimiter, maxSplit, splitByRegex)); + stringSplitRecord(this.getNativeView(), delimiter, limit, splitByRegex)); } /** @@ -2452,11 +2462,13 @@ public final ColumnVector stringSplitRecord(String delimiter, int maxSplit, bool * string using the specified string literal delimiter. * @param delimiter UTF-8 string identifying the split points or split pattern in each string. * An empty string indicates split on whitespace. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + * @param limit the maximum size of the array resulting from splitting the input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. * @return New table of strings columns. */ - public final ColumnVector stringSplitRecord(String delimiter, int maxSplit) { - return stringSplitRecord(delimiter, maxSplit, false); + public final ColumnVector stringSplitRecord(String delimiter, int limit) { + return stringSplitRecord(delimiter, limit, false); } /** * Returns a column of lists of strings by splitting each string using the specified delimiter. @@ -2481,11 +2493,13 @@ public final ColumnVector stringSplitRecord(String delimiter) { /** * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. 
+ * @param limit the maximum size of the array resulting from splitting the input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. */ - public final ColumnVector stringSplitRecord(int maxSplit) { + public final ColumnVector stringSplitRecord(int limit) { String emptyString = ""; - return stringSplitRecord(emptyString, maxSplit, false); + return stringSplitRecord(emptyString, limit, false); } /** @@ -3601,11 +3615,13 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle * delimiter. * @param nativeHandle native handle of the cudf::column_view being operated on. * @param delimiter UTF-8 string identifying the split points or split pattern in each string. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + * @param limit the maximum size of the array resulting from splitting the input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. * @param splitByRegex a boolean flag indicating whether the input string will be split by a * regular expression pattern or just by a string literal delimiter. */ - private static native long[] stringSplit(long nativeHandle, String delimiter, int maxSplit, + private static native long[] stringSplit(long nativeHandle, String delimiter, int limit, boolean splitByRegex); /** @@ -3613,11 +3629,13 @@ private static native long[] stringSplit(long nativeHandle, String delimiter, in * using the specified delimiter. * @param nativeHandle native handle of the cudf::column_view being operated on. * @param delimiter UTF-8 string identifying the split points or split pattern in each string. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. 
+ * @param limit the maximum size of the array resulting from splitting the input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. * @param splitByRegex a boolean flag indicating whether the input string will be split by a * regular expression pattern or just by a string literal delimiter. */ - private static native long stringSplitRecord(long nativeHandle, String delimiter, int maxSplit, + private static native long stringSplitRecord(long nativeHandle, String delimiter, int limit, boolean splitByRegex); /** From 6d8bcc94c86214148fc310d48e7608d8344fa647 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 7 Feb 2022 14:48:33 -0700 Subject: [PATCH 22/39] Convert java limit to cudf max_split --- java/src/main/native/src/ColumnViewJni.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index dcc6085c509..167f53493f4 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -564,7 +564,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, jlong input_handle, jstring delimiter, - jint max_split, + jint limit, jboolean split_by_regex) { JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); try { @@ -578,6 +578,12 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv * auto const str_delimiter = std::string(delimiter_chars, delimiter_size); env->ReleaseStringUTFChars(delimiter, delimiter_chars); + if (limit == 0 || limit == 1) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "limit == 0 or limit == 1 are not supported", 0); + } + auto const max_split = limit > 1 ? 
limit - 1 : limit; + auto result = split_by_regex ? cudf::strings::split_re(strs_input, str_delimiter, max_split) : @@ -590,7 +596,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv * JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass, jlong input_handle, jstring delimiter, - jint max_split, + jint limit, jboolean split_by_regex) { JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); try { @@ -604,6 +610,12 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv auto const str_delimiter = std::string(delimiter_chars, delimiter_size); env->ReleaseStringUTFChars(delimiter, delimiter_chars); + if (limit == 0 || limit == 1) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "limit == 0 or limit == 1 are not supported", 0); + } + auto const max_split = limit > 1 ? limit - 1 : limit; + auto result = split_by_regex ? cudf::strings::split_record_re(strs_input, str_delimiter, max_split) : From 2e6450fd610c960564c5feab35cb949a592838a7 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 7 Feb 2022 15:00:43 -0700 Subject: [PATCH 23/39] Fix Java test --- java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index e9d7bcbfcbd..bda1a6bda55 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4761,7 +4761,7 @@ void testStringSplitRecord() { void testStringSplit() { String pattern = " "; try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", "ARé some things", "test strings here"); - Table expectedSplitOnce = new Table.TestBuilder() + Table expectedSplitLimit2 = new Table.TestBuilder() .column("Héllo", "thésé", null, "", "ARé", "test") .column("there all", null, 
null, null, "some things", "strings here") .build(); @@ -4770,9 +4770,9 @@ void testStringSplit() { .column("there", null, null, null, "some", "strings") .column("all", null, null, null, "things", "here") .build(); - Table resultSplitOnce = v.stringSplit(pattern, 1); + Table resultSplitLimit2 = v.stringSplit(pattern, 2); Table resultSplitAll = v.stringSplit(pattern)) { - assertTablesAreEqual(expectedSplitOnce, resultSplitOnce); + assertTablesAreEqual(expectedSplitLimit2, resultSplitLimit2); assertTablesAreEqual(expectedSplitAll, resultSplitAll); } } From eb8c326cd1b2ea1e9f673b6a16e07533ce637f14 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 7 Feb 2022 19:51:00 -0500 Subject: [PATCH 24/39] fix doxygen typo in @throw line --- cpp/include/cudf/strings/split/split_re.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index c6dc1e5c697..d61b802efe9 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -60,7 +60,7 @@ namespace strings { * ["bc def_g", "_bc", "ab cd", "cd "] ] * @endcode * - * @throw cudf:logic_error if `pattern` is empty. + * @throw cudf::logic_error if `pattern` is empty. * * @param input A column of string elements to be split. * @param pattern The regex pattern for delimiting characters within each string. @@ -110,7 +110,7 @@ std::unique_ptr
split_re( * ["g", "bc", "cd", "cd "] ] * @endcode * - * @throw cudf:logic_error if `pattern` is empty. + * @throw cudf::logic_error if `pattern` is empty. * * @param input A column of string elements to be split. * @param pattern The regex pattern for delimiting characters within each string. @@ -160,7 +160,7 @@ std::unique_ptr
rsplit_re( * ["ab", "cd "] ] * @endcode * - * @throw cudf:logic_error if `pattern` is empty. + * @throw cudf::logic_error if `pattern` is empty. * * @param input A column of string elements to be split. * @param pattern The regex pattern for delimiting characters within each string. @@ -212,7 +212,7 @@ std::unique_ptr split_record_re( * ["ab_cd", ""] ] * @endcode * - * @throw cudf:logic_error if `pattern` is empty. + * @throw cudf::logic_error if `pattern` is empty. * * @param input A column of string elements to be split. * @param pattern The regex pattern for delimiting characters within each string. From d6ee8837ff3f523816d96f444e1b001d14debdf7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 7 Feb 2022 19:51:37 -0500 Subject: [PATCH 25/39] refactor max-tokens calculation into helper function --- cpp/src/strings/split/split_re.cu | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index dd71533c773..d80148f2fe6 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -98,7 +98,7 @@ struct token_reader_fn { d_result[token_idx] = string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos}; if (direction == split_direction::BACKWARD) { - // update first entry -- this happens when max-tokens is hit before the end of the string + // update first entry -- this happens when max_tokens is hit before the end of the string auto const first_offset = d_result[0].first ? static_cast(thrust::distance(d_str.data(), d_result[0].first)) @@ -117,6 +117,7 @@ struct token_reader_fn { * * @param d_strings Strings to split * @param d_prog Regex to evaluate against each string + * @param direction Whether tokens are generated forwards or backwards. * @param max_tokens The maximum number of tokens for each split. * @param offsets The number of matches on input. * The offsets for each token in each string on output. 
@@ -125,12 +126,14 @@ struct token_reader_fn { rmm::device_uvector generate_tokens(column_device_view const& d_strings, reprog_device& d_prog, split_direction direction, - size_type max_tokens, + size_type maxsplit, mutable_column_view& offsets, rmm::cuda_stream_view stream) { auto const strings_count = d_strings.size(); + auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); + auto const begin = thrust::make_counting_iterator(0); auto const end = thrust::make_counting_iterator(strings_count); auto const d_offsets = offsets.data(); @@ -182,7 +185,7 @@ struct tokens_transform_fn { { auto const offset = d_token_offsets[idx]; auto const token_count = d_token_offsets[idx + 1] - offset; - return (column_index > token_count - 1) || d_strings.is_null(idx) + return (column_index >= token_count) || d_strings.is_null(idx) ? string_index_pair{nullptr, 0} : d_tokens[offset + column_index]; } @@ -197,7 +200,6 @@ std::unique_ptr
split_re(strings_column_view const& input, { CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); - auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); auto const strings_count = input.size(); std::vector> results; @@ -216,7 +218,7 @@ std::unique_ptr
split_re(strings_column_view const& input, auto d_offsets = offsets_view.data(); // get the split tokens from the input column; this also converts the counts into offsets - auto tokens = generate_tokens(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); + auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream); // the output column count is the maximum number of tokens generated for any input string auto const columns_count = thrust::transform_reduce( @@ -266,7 +268,6 @@ std::unique_ptr split_record_re(strings_column_view const& input, { CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); - auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); auto const strings_count = input.size(); // create the regex device prog from the given pattern @@ -278,7 +279,7 @@ std::unique_ptr split_record_re(strings_column_view const& input, auto offsets_view = offsets->mutable_view(); // get the split tokens from the input column; this also converts the counts into offsets - auto tokens = generate_tokens(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); + auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream); // convert the tokens into one big strings column auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); From f528107d2daeb430b0ae89d92becb1477c6e0de1 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 8 Feb 2022 09:27:51 -0700 Subject: [PATCH 26/39] Fix typo --- java/src/main/native/src/ColumnViewJni.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 167f53493f4..6b0172acd99 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -580,7 +580,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv * 
if (limit == 0 || limit == 1) { JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", - "limit == 0 or limit == 1 are not supported", 0); + "limit == 0 and limit == 1 are not supported", 0); } auto const max_split = limit > 1 ? limit - 1 : limit; @@ -612,7 +612,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv if (limit == 0 || limit == 1) { JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", - "limit == 0 or limit == 1 are not supported", 0); + "limit == 0 and limit == 1 are not supported", 0); } auto const max_split = limit > 1 ? limit - 1 : limit; From 70a4e342b26bed44df064b46293ae537cb7b7184 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 10 Feb 2022 14:35:18 -0700 Subject: [PATCH 27/39] Remove support for empty delimiter --- .../main/java/ai/rapids/cudf/ColumnView.java | 56 ++----------------- java/src/main/native/src/ColumnVectorJni.cpp | 1 + java/src/main/native/src/ColumnViewJni.cpp | 39 +++++++++---- 3 files changed, 36 insertions(+), 60 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index d28211d808f..c364643373d 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2359,10 +2359,10 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) { * @return New table of strings columns. 
*/ public final Table stringSplit(String delimiter, int limit, boolean splitByRegex) { - assert delimiter != null : "delimiter is null"; - assert delimiter.length() > 0 || !splitByRegex : "cannot split by empty regex"; assert type.equals(DType.STRING) : "column type must be a String"; - assert limit != 0 && limit != 1; + assert delimiter != null : "delimiter is null"; + assert delimiter.length() > 0 : "empty delimiter is not supported"; + assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported"; return new Table(stringSplit(this.getNativeView(), delimiter, limit, splitByRegex)); } @@ -2410,32 +2410,6 @@ public final Table stringSplit(String delimiter) { return stringSplit(delimiter, -1, false); } - /** - * Returns a list of columns by splitting each string using whitespace as the delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @param limit the maximum size of the array resulting from splitting the input string, - * or -1 for all possible splits. Note that limit = 0 (all possible splits without - * trailing empty strings) and limit = 1 (no split at all) are not supported. - * @return New table of strings columns. - */ - public final Table stringSplit(int limit) { - String emptyString = ""; - return stringSplit(emptyString, limit, false); - } - - /** - * Returns a list of columns by splitting each string using whitespace as the delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @return New table of strings columns. - */ - public final Table stringSplit() { - return stringSplit(-1); - } - /** * Returns a column that is a list of strings. 
Each string list is made by splitting each input * string using the specified delimiter. @@ -2449,10 +2423,10 @@ public final Table stringSplit() { * @return New table of strings columns. */ public final ColumnVector stringSplitRecord(String delimiter, int limit, boolean splitByRegex) { - assert delimiter != null : "delimiter is null"; - assert delimiter.length() > 0 || !splitByRegex : "cannot split by empty regex"; assert type.equals(DType.STRING) : "column type must be String"; - assert limit != 0 && limit != 1; + assert delimiter != null : "delimiter is null"; + assert delimiter.length() > 0 : "empty delimiter is not supported"; + assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported"; return new ColumnVector( stringSplitRecord(this.getNativeView(), delimiter, limit, splitByRegex)); } @@ -2491,24 +2465,6 @@ public final ColumnVector stringSplitRecord(String delimiter) { return stringSplitRecord(delimiter, -1, false); } - /** - * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. - * @param limit the maximum size of the array resulting from splitting the input string, - * or -1 for all possible splits. Note that limit = 0 (all possible splits without - * trailing empty strings) and limit = 1 (no split at all) are not supported. - */ - public final ColumnVector stringSplitRecord(int limit) { - String emptyString = ""; - return stringSplitRecord(emptyString, limit, false); - } - - /** - * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. - */ - public final ColumnVector stringSplitRecord() { - return stringSplitRecord(-1); - } - /** * Returns a new strings column that contains substrings of the strings in the provided column. * Overloading subString to support if end index is not provided. 
Appending -1 to indicate to diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index f01d832eb19..83202213d3e 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 6b0172acd99..2209dc347d2 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -70,6 +70,7 @@ #include #include #include +#include #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" @@ -567,6 +568,13 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv * jint limit, jboolean split_by_regex) { JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); + + // Java's split API produces different behaviors than cudf when limit == 0 and limit == 1. + if (limit == 0 || limit == 1) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "limit == 0 and limit == 1 are not supported", 0); + } + try { cudf::jni::auto_set_device(env); @@ -575,14 +583,16 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv * auto const delimiter_chars = env->GetStringUTFChars(delimiter, nullptr); auto const delimiter_size = env->GetStringUTFLength(delimiter); - auto const str_delimiter = std::string(delimiter_chars, delimiter_size); - env->ReleaseStringUTFChars(delimiter, delimiter_chars); - if (limit == 0 || limit == 1) { - JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", - "limit == 0 and limit == 1 are not supported", 0); + // Java's split API produces different behaviors than cudf when splitting with empty delimiter. 
+ if (delimiter_size == 0) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty delimiter is not supported", + 0); } + auto const max_split = limit > 1 ? limit - 1 : limit; + auto const str_delimiter = std::string(delimiter_chars, delimiter_size); + env->ReleaseStringUTFChars(delimiter, delimiter_chars); auto result = split_by_regex ? @@ -599,6 +609,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv jint limit, jboolean split_by_regex) { JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); + + // Java's split API produces different behaviors than cudf when limit == 0 and limit == 1. + if (limit == 0 || limit == 1) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "limit == 0 and limit == 1 are not supported", 0); + } + try { cudf::jni::auto_set_device(env); @@ -607,14 +624,16 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv auto const delimiter_chars = env->GetStringUTFChars(delimiter, nullptr); auto const delimiter_size = env->GetStringUTFLength(delimiter); - auto const str_delimiter = std::string(delimiter_chars, delimiter_size); - env->ReleaseStringUTFChars(delimiter, delimiter_chars); - if (limit == 0 || limit == 1) { - JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", - "limit == 0 and limit == 1 are not supported", 0); + // Java's split API produces different behaviors than cudf when splitting with empty delimiter. + if (delimiter_size == 0) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty delimiter is not supported", + 0); } + auto const max_split = limit > 1 ? limit - 1 : limit; + auto const str_delimiter = std::string(delimiter_chars, delimiter_size); + env->ReleaseStringUTFChars(delimiter, delimiter_chars); auto result = split_by_regex ? 
From af16edd1874a472459971081a3929d0eac600d8f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 10 Feb 2022 15:26:56 -0700 Subject: [PATCH 28/39] Update Java tests --- .../java/ai/rapids/cudf/ColumnVectorTest.java | 105 +++++++++++++----- 1 file changed, 79 insertions(+), 26 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index bda1a6bda55..ebbf73a8033 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4739,28 +4739,11 @@ void testListSortRowsWithStringChild() { } } - @Test - void testStringSplitRecord() { - String pattern = " "; - try (ColumnVector v = ColumnVector.fromStrings("Héllo there", "thésé", "null", "", "ARé some", "test strings"); - ColumnVector expected = ColumnVector.fromLists( - new HostColumnVector.ListType(true, - new HostColumnVector.BasicType(true, DType.STRING)), - Arrays.asList("Héllo", "there"), - Arrays.asList("thésé"), - Arrays.asList("null"), - Arrays.asList(""), - Arrays.asList("ARé", "some"), - Arrays.asList("test", "strings")); - ColumnVector result = v.stringSplitRecord(pattern, -1)) { - assertColumnsAreEqual(expected, result); - } - } - @Test void testStringSplit() { String pattern = " "; - try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", "ARé some things", "test strings here"); + try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", + "ARé some things", "test strings here"); Table expectedSplitLimit2 = new Table.TestBuilder() .column("Héllo", "thésé", null, "", "ARé", "test") .column("there all", null, null, null, "some things", "strings here") @@ -4778,13 +4761,83 @@ void testStringSplit() { } @Test - void testStringSplitWhiteSpace() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo thesé", null, "are\tsome", "tést\nString", " "); - Table expected = new 
Table.TestBuilder().column("Héllo", null, "are", "tést", null) - .column("thesé", null, "some", "String", null) - .build(); - Table result = v.stringSplit()) { - assertTablesAreEqual(expected, result); + void testStringSplitByRegularExpression() { + String pattern = "[_ ]"; + try (ColumnVector v = ColumnVector.fromStrings("Héllo_there all", "thésé", null, "", + "ARé some_things", "test_strings_here"); + Table expectedSplitLimit2 = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") + .column("there all", null, null, null, "some_things", "strings_here") + .build(); + Table expectedSplitAll = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") + .column("there", null, null, null, "some", "strings") + .column("all", null, null, null, "things", "here") + .build(); + Table resultSplitLimit2 = v.stringSplit(pattern, 2, true); + Table resultSplitAll = v.stringSplit(pattern, true)) { + assertTablesAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertTablesAreEqual(expectedSplitAll, resultSplitAll); + } + } + + @Test + void testStringSplitRecord() { + String pattern = " "; + try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", + "ARé some things", "test strings here"); + ColumnVector expectedSplitLimit2 = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there all"), + Arrays.asList("thésé"), + Arrays.asList((Object) null), + Arrays.asList(""), + Arrays.asList("ARé", "some things"), + Arrays.asList("test", "strings here")); + ColumnVector expectedSplitAll = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there", "all"), + Arrays.asList("thésé"), + Arrays.asList((Object) null), + Arrays.asList(""), + Arrays.asList("ARé", "some", "things"), + Arrays.asList("test", "strings", "here")); + ColumnVector 
resultSplitLimit2 = v.stringSplitRecord(pattern, 2); + ColumnVector resultSplitAll = v.stringSplitRecord(pattern)) { + assertColumnsAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertColumnsAreEqual(expectedSplitAll, resultSplitAll); + } + } + + @Test + void testStringSplitRecordByRegularExpression() { + String pattern = "[_ ]"; + try (ColumnVector v = ColumnVector.fromStrings("Héllo_there all", "thésé", null, "", + "ARé some_things", "test_strings_here"); + ColumnVector expectedSplitLimit2 = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there all"), + Arrays.asList("thésé"), + Arrays.asList((Object) null), + Arrays.asList(""), + Arrays.asList("ARé", "some_things"), + Arrays.asList("test", "strings_here")); + ColumnVector expectedSplitAll = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there", "all"), + Arrays.asList("thésé"), + Arrays.asList((Object) null), + Arrays.asList(""), + Arrays.asList("ARé", "some", "things"), + Arrays.asList("test", "strings", "here")); + ColumnVector resultSplitLimit2 = v.stringSplitRecord(pattern, 2, true); + ColumnVector resultSplitAll = v.stringSplitRecord(pattern, true)) { + assertColumnsAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertColumnsAreEqual(expectedSplitAll, resultSplitAll); } } From cac2637805bb17151fad6508d71401dfd0ae9af6 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 10 Feb 2022 16:10:20 -0700 Subject: [PATCH 29/39] Fix Java tests --- .../test/java/ai/rapids/cudf/ColumnVectorTest.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 999e9a36267..b759c746735 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ 
b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4364,10 +4364,10 @@ void testExtractListElements() { ColumnVector expected = ColumnVector.fromStrings("Héllo", "thésé", null, - null, + "", "ARé", "test"); - ColumnVector tmp = v.stringSplitRecord(); + ColumnVector tmp = v.stringSplitRecord(" "); ColumnVector result = tmp.extractListElement(0)) { assertColumnsAreEqual(expected, result); } @@ -4813,7 +4813,7 @@ void testStringSplitRecord() { new HostColumnVector.BasicType(true, DType.STRING)), Arrays.asList("Héllo", "there all"), Arrays.asList("thésé"), - Arrays.asList((Object) null), + null, Arrays.asList(""), Arrays.asList("ARé", "some things"), Arrays.asList("test", "strings here")); @@ -4822,7 +4822,7 @@ void testStringSplitRecord() { new HostColumnVector.BasicType(true, DType.STRING)), Arrays.asList("Héllo", "there", "all"), Arrays.asList("thésé"), - Arrays.asList((Object) null), + null, Arrays.asList(""), Arrays.asList("ARé", "some", "things"), Arrays.asList("test", "strings", "here")); @@ -4843,7 +4843,7 @@ void testStringSplitRecordByRegularExpression() { new HostColumnVector.BasicType(true, DType.STRING)), Arrays.asList("Héllo", "there all"), Arrays.asList("thésé"), - Arrays.asList((Object) null), + null, Arrays.asList(""), Arrays.asList("ARé", "some_things"), Arrays.asList("test", "strings_here")); @@ -4852,7 +4852,7 @@ void testStringSplitRecordByRegularExpression() { new HostColumnVector.BasicType(true, DType.STRING)), Arrays.asList("Héllo", "there", "all"), Arrays.asList("thésé"), - Arrays.asList((Object) null), + null, Arrays.asList(""), Arrays.asList("ARé", "some", "things"), Arrays.asList("test", "strings", "here")); From 2fade8da4952da9cf4d1c5c9e1a739aee51a4e11 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 11 Feb 2022 06:25:37 -0700 Subject: [PATCH 30/39] Reverse change --- java/src/main/native/src/ColumnVectorJni.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/java/src/main/native/src/ColumnVectorJni.cpp 
b/java/src/main/native/src/ColumnVectorJni.cpp index 83202213d3e..f01d832eb19 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -32,7 +32,6 @@ #include #include #include -#include #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" From a65d358cfda73457c35bc6cb318942e8714bb14f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 11 Feb 2022 11:17:07 -0700 Subject: [PATCH 31/39] Rewrite Javadoc --- .../main/java/ai/rapids/cudf/ColumnView.java | 152 +++++++++--------- 1 file changed, 78 insertions(+), 74 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index c364643373d..770368791f7 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2345,121 +2345,125 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) { } /** - * Returns a list of columns by splitting each string using the specified delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 string identifying the split points or split pattern in each string. - * An empty string indicates split on whitespace. - * @param limit the maximum size of the array resulting from splitting the input string, + * Returns a list of columns by splitting each string using the specified pattern. The number of + * rows in the output columns will be the same as the input column. Null entries are added for a + * row where split results have been exhausted. Null input entries result in all nulls in the + * corresponding rows of the output columns. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. 
+ * @param limit the maximum size of the list resulting from splitting each input string, * or -1 for all possible splits. Note that limit = 0 (all possible splits without * trailing empty strings) and limit = 1 (no split at all) are not supported. - * @param splitByRegex a boolean flag indicating whether the input string will be split by a + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a * regular expression pattern or just by a string literal delimiter. - * @return New table of strings columns. + * @return list of strings columns as a table. */ - public final Table stringSplit(String delimiter, int limit, boolean splitByRegex) { + public final Table stringSplit(String pattern, int limit, boolean splitByRegex) { assert type.equals(DType.STRING) : "column type must be a String"; - assert delimiter != null : "delimiter is null"; - assert delimiter.length() > 0 : "empty delimiter is not supported"; + assert pattern != null : "pattern is null"; + assert pattern.length() > 0 : "empty pattern is not supported"; assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported"; - return new Table(stringSplit(this.getNativeView(), delimiter, limit, splitByRegex)); + return new Table(stringSplit(this.getNativeView(), pattern, limit, splitByRegex)); } /** - * Returns a list of columns by splitting each string using the specified string literal delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 string identifying the split points or split pattern in each string. - * An empty string indicates split on whitespace. - * @param limit the maximum size of the array resulting from splitting the input string, - * or -1 for all possible splits. 
Note that limit = 0 (all possible splits without - * trailing empty strings) and limit = 1 (no split at all) are not supported. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified pattern. The number of + * rows in the output columns will be the same as the input column. Null entries are added for a + * row where split results have been exhausted. Null input entries result in all nulls in the + * corresponding rows of the output columns. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return list of strings columns as a table. */ - public final Table stringSplit(String delimiter, int limit) { - return stringSplit(delimiter, limit, false); + public final Table stringSplit(String pattern, boolean splitByRegex) { + return stringSplit(pattern, -1, splitByRegex); } /** - * Returns a list of columns by splitting each string using the specified delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 string identifying the split points or split pattern in each string. - * An empty string indicates split on whitespace. - * @param splitByRegex a boolean flag indicating whether the input string will be split by a - * regular expression pattern or just by a string literal delimiter. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified string literal + * delimiter. The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. 
Null input entries + * result in all nulls in the corresponding rows of the output columns. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @return list of strings columns as a table. */ - public final Table stringSplit(String delimiter, boolean splitByRegex) { - return stringSplit(delimiter, -1, splitByRegex); + public final Table stringSplit(String delimiter, int limit) { + return stringSplit(delimiter, limit, false); } /** - * Returns a list of columns by splitting each string using the specified string literal delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 string identifying the split points or split pattern in each string. - * An empty string indicates split on whitespace. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified string literal + * delimiter. The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. Null input entries + * result in all nulls in the corresponding rows of the output columns. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @return list of strings columns as a table. */ public final Table stringSplit(String delimiter) { return stringSplit(delimiter, -1, false); } /** - * Returns a column that is a list of strings. 
Each string list is made by splitting each input - * string using the specified delimiter. - * @param delimiter UTF-8 string identifying the split points or split pattern in each string. - * An empty string indicates split on whitespace. - * @param limit the maximum size of the array resulting from splitting the input string, + * Returns a column that are lists of strings in which each list is made by splitting the + * corresponding input string using the specified pattern. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, * or -1 for all possible splits. Note that limit = 0 (all possible splits without * trailing empty strings) and limit = 1 (no split at all) are not supported. - * @param splitByRegex a boolean flag indicating whether the input string will be split by a + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a * regular expression pattern or just by a string literal delimiter. - * @return New table of strings columns. + * @return a LIST column of string elements. */ - public final ColumnVector stringSplitRecord(String delimiter, int limit, boolean splitByRegex) { + public final ColumnVector stringSplitRecord(String pattern, int limit, boolean splitByRegex) { assert type.equals(DType.STRING) : "column type must be String"; - assert delimiter != null : "delimiter is null"; - assert delimiter.length() > 0 : "empty delimiter is not supported"; + assert pattern != null : "pattern is null"; + assert pattern.length() > 0 : "empty pattern is not supported"; assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported"; return new ColumnVector( - stringSplitRecord(this.getNativeView(), delimiter, limit, splitByRegex)); + stringSplitRecord(this.getNativeView(), pattern, limit, splitByRegex)); } /** - * Returns a column that is a list of strings. 
Each string list is made by splitting each input - * string using the specified string literal delimiter. - * @param delimiter UTF-8 string identifying the split points or split pattern in each string. - * An empty string indicates split on whitespace. - * @param limit the maximum size of the array resulting from splitting the input string, + * Returns a column that are lists of strings in which each list is made by splitting the + * corresponding input string using the specified pattern. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return a LIST column of string elements. + */ + public final ColumnVector stringSplitRecord(String pattern, boolean splitByRegex) { + return stringSplitRecord(pattern, -1, splitByRegex); + } + + /** + * Returns a column that are lists of strings in which each list is made by splitting the + * corresponding input string using the specified string literal delimiter. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, * or -1 for all possible splits. Note that limit = 0 (all possible splits without * trailing empty strings) and limit = 1 (no split at all) are not supported. - * @return New table of strings columns. + * @return a LIST column of string elements. */ public final ColumnVector stringSplitRecord(String delimiter, int limit) { return stringSplitRecord(delimiter, limit, false); } - /** - * Returns a column of lists of strings by splitting each string using the specified delimiter. - * @param delimiter UTF-8 string identifying the split points or split pattern in each string. - * An empty string indicates split on whitespace. 
- * @param splitByRegex a boolean flag indicating whether the input string will be split by a - * regular expression pattern or just by a string literal delimiter. - */ - public final ColumnVector stringSplitRecord(String delimiter, boolean splitByRegex) { - return stringSplitRecord(delimiter, -1, splitByRegex); - } /** - * Returns a column of lists of strings by splitting each string using the specified string - * literal delimiter. - * @param delimiter UTF-8 string identifying the split points or split pattern in each string. - * An empty string indicates split on whitespace. + * Returns a column that are lists of strings in which each list is made by splitting the + * corresponding input string using the specified string literal delimiter. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @return a LIST column of string elements. */ public final ColumnVector stringSplitRecord(String delimiter) { return stringSplitRecord(delimiter, -1, false); From 7f0fee7edb252991228533462227b392a82b9f33 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 11 Feb 2022 11:21:22 -0700 Subject: [PATCH 32/39] Fix Javadoc for the native methods --- .../main/java/ai/rapids/cudf/ColumnView.java | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 770368791f7..f91ee5535b1 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -3571,31 +3571,35 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle private static native long substringLocate(long columnView, long substringScalar, int start, int end); /** - * Native method which returns an array of columns by splitting each string using the specified - * delimiter. - * @param nativeHandle native handle of the cudf::column_view being operated on. 
- * @param delimiter UTF-8 string identifying the split points or split pattern in each string. - * @param limit the maximum size of the array resulting from splitting the input string, + * Returns a list of columns by splitting each string using the specified pattern. The number of + * rows in the output columns will be the same as the input column. Null entries are added for a + * row where split results have been exhausted. Null input entries result in all nulls in the + * corresponding rows of the output columns. + * + * @param nativeHandle native handle of the input strings column that being operated on. + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, * or -1 for all possible splits. Note that limit = 0 (all possible splits without * trailing empty strings) and limit = 1 (no split at all) are not supported. - * @param splitByRegex a boolean flag indicating whether the input string will be split by a + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a * regular expression pattern or just by a string literal delimiter. */ - private static native long[] stringSplit(long nativeHandle, String delimiter, int limit, + private static native long[] stringSplit(long nativeHandle, String pattern, int limit, boolean splitByRegex); /** - * Native method which returns a LIST column by splitting each string into a list of strings - * using the specified delimiter. - * @param nativeHandle native handle of the cudf::column_view being operated on. - * @param delimiter UTF-8 string identifying the split points or split pattern in each string. - * @param limit the maximum size of the array resulting from splitting the input string, + * Returns a column that are lists of strings in which each list is made by splitting the + * corresponding input string using the specified string literal delimiter. 
+ * + * @param nativeHandle native handle of the input strings column that being operated on. + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, * or -1 for all possible splits. Note that limit = 0 (all possible splits without * trailing empty strings) and limit = 1 (no split at all) are not supported. - * @param splitByRegex a boolean flag indicating whether the input string will be split by a + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a * regular expression pattern or just by a string literal delimiter. */ - private static native long stringSplitRecord(long nativeHandle, String delimiter, int limit, + private static native long stringSplitRecord(long nativeHandle, String pattern, int limit, boolean splitByRegex); /** From 1b4cd51b9f7ebb3ac784227e85d175f9cf7ebcbc Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 11 Feb 2022 12:33:14 -0700 Subject: [PATCH 33/39] Rewrite JNI --- java/src/main/native/src/ColumnViewJni.cpp | 61 +++++++++------------- 1 file changed, 26 insertions(+), 35 deletions(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 2209dc347d2..89896df515b 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -564,81 +564,72 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, jlong input_handle, - jstring delimiter, - jint limit, + jstring pattern, jint limit, jboolean split_by_regex) { JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); - // Java's split API produces different behaviors than cudf when limit == 0 and limit == 1. 
if (limit == 0 || limit == 1) { + // Java's split API produces different behaviors than cudf when limit == 0 and limit == 1. JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "limit == 0 and limit == 1 are not supported", 0); } try { cudf::jni::auto_set_device(env); - auto const input = reinterpret_cast(input_handle); auto const strs_input = cudf::strings_column_view{*input}; - auto const delimiter_chars = env->GetStringUTFChars(delimiter, nullptr); - auto const delimiter_size = env->GetStringUTFLength(delimiter); - - // Java's split API produces different behaviors than cudf when splitting with empty delimiter. - if (delimiter_size == 0) { - JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty delimiter is not supported", - 0); + auto const pattern_size = env->GetStringUTFLength(pattern); + if (pattern_size == 0) { + // Java's split API produces different behaviors than cudf when splitting with empty + // pattern. + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0); } auto const max_split = limit > 1 ? limit - 1 : limit; - auto const str_delimiter = std::string(delimiter_chars, delimiter_size); - env->ReleaseStringUTFChars(delimiter, delimiter_chars); + auto const pattern_chars = env->GetStringUTFChars(pattern, nullptr); + auto const pattern_str = std::string(pattern_chars, pattern_size); + env->ReleaseStringUTFChars(pattern, pattern_chars); - auto result = - split_by_regex ? - cudf::strings::split_re(strs_input, str_delimiter, max_split) : - cudf::strings::split(strs_input, cudf::string_scalar{str_delimiter}, max_split); + auto result = split_by_regex ? 
+ cudf::strings::split_re(strs_input, pattern_str, max_split) : + cudf::strings::split(strs_input, cudf::string_scalar{pattern_str}, max_split); return cudf::jni::convert_table_for_return(env, std::move(result)); } CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass, - jlong input_handle, - jstring delimiter, - jint limit, - jboolean split_by_regex) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord( + JNIEnv *env, jclass, jlong input_handle, jstring pattern, jint limit, jboolean split_by_regex) { JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); - // Java's split API produces different behaviors than cudf when limit == 0 and limit == 1. if (limit == 0 || limit == 1) { + // Java's split API produces different behaviors than cudf when limit == 0 and limit == 1. JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "limit == 0 and limit == 1 are not supported", 0); } try { cudf::jni::auto_set_device(env); - auto const input = reinterpret_cast(input_handle); auto const strs_input = cudf::strings_column_view{*input}; - auto const delimiter_chars = env->GetStringUTFChars(delimiter, nullptr); - auto const delimiter_size = env->GetStringUTFLength(delimiter); - - // Java's split API produces different behaviors than cudf when splitting with empty delimiter. - if (delimiter_size == 0) { - JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty delimiter is not supported", - 0); + auto const pattern_size = env->GetStringUTFLength(pattern); + if (pattern_size == 0) { + // Java's split API produces different behaviors than cudf when splitting with empty + // pattern. + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0); } auto const max_split = limit > 1 ? 
limit - 1 : limit; - auto const str_delimiter = std::string(delimiter_chars, delimiter_size); - env->ReleaseStringUTFChars(delimiter, delimiter_chars); + auto const pattern_chars = env->GetStringUTFChars(pattern, nullptr); + auto const pattern_str = std::string(pattern_chars, pattern_size); + env->ReleaseStringUTFChars(pattern, pattern_chars); auto result = split_by_regex ? - cudf::strings::split_record_re(strs_input, str_delimiter, max_split) : - cudf::strings::split_record(strs_input, cudf::string_scalar{str_delimiter}, max_split); + cudf::strings::split_record_re(strs_input, pattern_str, max_split) : + cudf::strings::split_record(strs_input, cudf::string_scalar{pattern_str}, max_split); return release_as_jlong(result); } CATCH_STD(env, 0); From 00d7d8ba4f06455f33f1917d7fc6aaed765a0af9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 11 Feb 2022 13:45:39 -0700 Subject: [PATCH 34/39] Add a function to construct std::string from native_jstring --- java/src/main/native/include/jni_utils.hpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index a45716a89b3..e8346e5ef1e 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -524,7 +525,7 @@ class native_jstring { void init_cstr() const { if (orig != NULL && cstr == NULL) { cstr_length = env->GetStringUTFLength(orig); - cstr = env->GetStringUTFChars(orig, 0); + cstr = env->GetStringUTFChars(orig, 0); // not guarantee to have null terminated. check_java_exception(env); } } @@ -555,6 +556,7 @@ class native_jstring { bool is_null() const noexcept { return orig == NULL; } + // Note that the char* return by this function is not guaranteed to be null-terminated. 
const char *get() const { init_cstr(); return cstr; @@ -565,6 +567,12 @@ class native_jstring { return cstr_length; } + // Note that the char* return by `get()` is not guaranteed to be null-terminated. + // Thus, constructing an std::string should be performed with a string size supplied. + std::string get_cpp_str() const { return std::string(get(), size_bytes()); } + + jstring get_jstring() const { return orig; } + bool is_empty() const { if (cstr != NULL) { return cstr_length <= 0; @@ -576,8 +584,6 @@ class native_jstring { return true; } - const jstring get_jstring() const { return orig; } - ~native_jstring() { if (orig != NULL && cstr != NULL) { env->ReleaseStringUTFChars(orig, cstr); From 69bb7a0ca3b3357d8f35c8a8017313676dabb7c0 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 11 Feb 2022 13:45:45 -0700 Subject: [PATCH 35/39] Update JNI --- java/src/main/native/src/ColumnViewJni.cpp | 34 ++++++++++------------ 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 89896df515b..b8d62940cae 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -564,7 +564,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, jlong input_handle, - jstring pattern, jint limit, + jstring pattern_obj, + jint limit, jboolean split_by_regex) { JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); @@ -579,28 +580,27 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv * auto const input = reinterpret_cast(input_handle); auto const strs_input = cudf::strings_column_view{*input}; - auto const pattern_size = env->GetStringUTFLength(pattern); - if (pattern_size == 0) { + auto const pattern = cudf::jni::native_jstring(env, pattern_obj).get_cpp_str(); + if (pattern.empty()) 
{ // Java's split API produces different behaviors than cudf when splitting with empty // pattern. JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0); } auto const max_split = limit > 1 ? limit - 1 : limit; - auto const pattern_chars = env->GetStringUTFChars(pattern, nullptr); - auto const pattern_str = std::string(pattern_chars, pattern_size); - env->ReleaseStringUTFChars(pattern, pattern_chars); - auto result = split_by_regex ? - cudf::strings::split_re(strs_input, pattern_str, max_split) : - cudf::strings::split(strs_input, cudf::string_scalar{pattern_str}, max_split); + cudf::strings::split_re(strs_input, pattern, max_split) : + cudf::strings::split(strs_input, cudf::string_scalar{pattern}, max_split); return cudf::jni::convert_table_for_return(env, std::move(result)); } CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord( - JNIEnv *env, jclass, jlong input_handle, jstring pattern, jint limit, jboolean split_by_regex) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass, + jlong input_handle, + jstring pattern_obj, + jint limit, + jboolean split_by_regex) { JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); if (limit == 0 || limit == 1) { @@ -614,22 +614,18 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord( auto const input = reinterpret_cast(input_handle); auto const strs_input = cudf::strings_column_view{*input}; - auto const pattern_size = env->GetStringUTFLength(pattern); - if (pattern_size == 0) { + auto const pattern = cudf::jni::native_jstring(env, pattern_obj).get_cpp_str(); + if (pattern.empty()) { // Java's split API produces different behaviors than cudf when splitting with empty // pattern. JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0); } auto const max_split = limit > 1 ? 
limit - 1 : limit; - auto const pattern_chars = env->GetStringUTFChars(pattern, nullptr); - auto const pattern_str = std::string(pattern_chars, pattern_size); - env->ReleaseStringUTFChars(pattern, pattern_chars); - auto result = split_by_regex ? - cudf::strings::split_record_re(strs_input, pattern_str, max_split) : - cudf::strings::split_record(strs_input, cudf::string_scalar{pattern_str}, max_split); + cudf::strings::split_record_re(strs_input, pattern, max_split) : + cudf::strings::split_record(strs_input, cudf::string_scalar{pattern}, max_split); return release_as_jlong(result); } CATCH_STD(env, 0); From a1b0e37f7beecbacd67457b1f3df0115a66e0e7e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 11 Feb 2022 13:50:24 -0700 Subject: [PATCH 36/39] Revert "Add a function to construct std::string from native_jstring" This reverts commit 00d7d8ba4f06455f33f1917d7fc6aaed765a0af9. --- java/src/main/native/include/jni_utils.hpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index e8346e5ef1e..a45716a89b3 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -17,7 +17,6 @@ #include #include -#include #include #include @@ -525,7 +524,7 @@ class native_jstring { void init_cstr() const { if (orig != NULL && cstr == NULL) { cstr_length = env->GetStringUTFLength(orig); - cstr = env->GetStringUTFChars(orig, 0); // not guarantee to have null terminated. + cstr = env->GetStringUTFChars(orig, 0); check_java_exception(env); } } @@ -556,7 +555,6 @@ class native_jstring { bool is_null() const noexcept { return orig == NULL; } - // Note that the char* return by this function is not guaranteed to be null-terminated. 
const char *get() const { init_cstr(); return cstr; @@ -567,12 +565,6 @@ class native_jstring { return cstr_length; } - // Note that the char* return by `get()` is not guaranteed to be null-terminated. - // Thus, constructing an std::string should be performed with a string size supplied. - std::string get_cpp_str() const { return std::string(get(), size_bytes()); } - - jstring get_jstring() const { return orig; } - bool is_empty() const { if (cstr != NULL) { return cstr_length <= 0; @@ -584,6 +576,8 @@ class native_jstring { return true; } + const jstring get_jstring() const { return orig; } + ~native_jstring() { if (orig != NULL && cstr != NULL) { env->ReleaseStringUTFChars(orig, cstr); From 3d55e34751d5f048ac00e709160ce6232830ee6e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 11 Feb 2022 13:52:21 -0700 Subject: [PATCH 37/39] Update JNI --- java/src/main/native/src/ColumnViewJni.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index b8d62940cae..997a4feaada 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -580,13 +580,14 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv * auto const input = reinterpret_cast(input_handle); auto const strs_input = cudf::strings_column_view{*input}; - auto const pattern = cudf::jni::native_jstring(env, pattern_obj).get_cpp_str(); - if (pattern.empty()) { + auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj); + if (pattern_jstr.is_empty()) { // Java's split API produces different behaviors than cudf when splitting with empty // pattern. JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0); } + auto const pattern = std::string(pattern_jstr.get(), pattern_jstr.size_bytes()); auto const max_split = limit > 1 ? limit - 1 : limit; auto result = split_by_regex ? 
cudf::strings::split_re(strs_input, pattern, max_split) : @@ -614,13 +615,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv auto const input = reinterpret_cast(input_handle); auto const strs_input = cudf::strings_column_view{*input}; - auto const pattern = cudf::jni::native_jstring(env, pattern_obj).get_cpp_str(); - if (pattern.empty()) { + auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj); + if (pattern_jstr.is_empty()) { // Java's split API produces different behaviors than cudf when splitting with empty // pattern. JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0); } + auto const pattern = std::string(pattern_jstr.get(), pattern_jstr.size_bytes()); auto const max_split = limit > 1 ? limit - 1 : limit; auto result = split_by_regex ? From 2ba4039916c91a23159ee813dcb32e4d447c1f3c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 11 Feb 2022 14:13:55 -0700 Subject: [PATCH 38/39] Update comments to clarify why we don't support limit==0 and limit==1 --- java/src/main/native/src/ColumnViewJni.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 997a4feaada..5784be25008 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -570,7 +570,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv * JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); if (limit == 0 || limit == 1) { - // Java's split API produces different behaviors than cudf when limit == 0 and limit == 1. + // Cannot achieve the results of splitting with limit == 0 or limit == 1. + // This is because cudf operates on a different parameter (`max_split`) which is converted from + // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an + // unlimited split. 
JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "limit == 0 and limit == 1 are not supported", 0); } @@ -605,7 +608,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); if (limit == 0 || limit == 1) { - // Java's split API produces different behaviors than cudf when limit == 0 and limit == 1. + // Cannot achieve the results of splitting with limit == 0 or limit == 1. + // This is because cudf operates on a different parameter (`max_split`) which is converted from + // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an + // unlimited split. JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "limit == 0 and limit == 1 are not supported", 0); } From bad6e6771f6139a053d7cdfea45c938f6425c6cc Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 11 Feb 2022 14:24:33 -0700 Subject: [PATCH 39/39] Remove unused header that was added by accident --- java/src/main/native/src/ColumnViewJni.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 5784be25008..548844aa0d3 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -70,7 +70,6 @@ #include #include #include -#include #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp"