-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add `regex_program` strings splitting Java APIs and tests
#12713
Changes from 3 commits
ffc15c3
4718540
70fa951
0376ce0
5dd8f0f
3643269
70de2b5
465ea42
9a7c154
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -679,11 +679,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_reverseStringsOrLists(JNI | |||||||||
CATCH_STD(env, 0); | ||||||||||
} | ||||||||||
|
||||||||||
JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, | ||||||||||
jlong input_handle, | ||||||||||
jstring pattern_obj, | ||||||||||
jint limit, | ||||||||||
jboolean split_by_regex) { | ||||||||||
JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit( | ||||||||||
JNIEnv *env, jclass, jlong input_handle, jstring pattern_obj, jint regex_flags, | ||||||||||
jint capture_groups, jint limit, jboolean split_by_regex) { | ||||||||||
JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); | ||||||||||
|
||||||||||
if (limit == 0 || limit == 1) { | ||||||||||
|
@@ -697,31 +695,28 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv * | |||||||||
|
||||||||||
try { | ||||||||||
cudf::jni::auto_set_device(env); | ||||||||||
auto const input = reinterpret_cast<cudf::column_view *>(input_handle); | ||||||||||
auto const strs_input = cudf::strings_column_view{*input}; | ||||||||||
|
||||||||||
auto const column_view = reinterpret_cast<cudf::column_view const *>(input_handle); | ||||||||||
auto const strings_column = cudf::strings_column_view{*column_view}; | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Suggest to avoid
Suggested change
|
||||||||||
auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj); | ||||||||||
if (pattern_jstr.is_empty()) { | ||||||||||
// Java's split API produces different behaviors than cudf when splitting with empty | ||||||||||
// pattern. | ||||||||||
JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0); | ||||||||||
} | ||||||||||
|
||||||||||
auto const pattern = std::string(pattern_jstr.get(), pattern_jstr.size_bytes()); | ||||||||||
auto const max_split = limit > 1 ? limit - 1 : limit; | ||||||||||
auto result = split_by_regex ? | ||||||||||
cudf::strings::split_re(strs_input, pattern, max_split) : | ||||||||||
cudf::strings::split(strs_input, cudf::string_scalar{pattern}, max_split); | ||||||||||
return cudf::jni::convert_table_for_return(env, std::move(result)); | ||||||||||
if (split_by_regex) { | ||||||||||
auto const flags = static_cast<cudf::strings::regex_flags>(regex_flags); | ||||||||||
auto const groups = static_cast<cudf::strings::capture_groups>(capture_groups); | ||||||||||
auto const regex_prog = cudf::strings::regex_program::create(pattern, flags, groups); | ||||||||||
auto result = cudf::strings::split_re(strings_column, *regex_prog, max_split); | ||||||||||
return cudf::jni::convert_table_for_return(env, std::move(result)); | ||||||||||
} else { | ||||||||||
auto result = cudf::strings::split(strings_column, cudf::string_scalar{pattern}, max_split); | ||||||||||
return cudf::jni::convert_table_for_return(env, std::move(result)); | ||||||||||
} | ||||||||||
} | ||||||||||
CATCH_STD(env, 0); | ||||||||||
} | ||||||||||
|
||||||||||
JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass, | ||||||||||
jlong input_handle, | ||||||||||
jstring pattern_obj, | ||||||||||
jint limit, | ||||||||||
jboolean split_by_regex) { | ||||||||||
JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord( | ||||||||||
JNIEnv *env, jclass, jlong input_handle, jstring pattern_obj, jint regex_flags, | ||||||||||
jint capture_groups, jint limit, jboolean split_by_regex) { | ||||||||||
JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); | ||||||||||
|
||||||||||
if (limit == 0 || limit == 1) { | ||||||||||
|
@@ -735,23 +730,22 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv | |||||||||
|
||||||||||
try { | ||||||||||
cudf::jni::auto_set_device(env); | ||||||||||
auto const input = reinterpret_cast<cudf::column_view *>(input_handle); | ||||||||||
auto const strs_input = cudf::strings_column_view{*input}; | ||||||||||
|
||||||||||
auto const column_view = reinterpret_cast<cudf::column_view const *>(input_handle); | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Similar to above. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thank you! All changes updated. |
||||||||||
auto const strings_column = cudf::strings_column_view{*column_view}; | ||||||||||
auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj); | ||||||||||
if (pattern_jstr.is_empty()) { | ||||||||||
// Java's split API produces different behaviors than cudf when splitting with empty | ||||||||||
// pattern. | ||||||||||
JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0); | ||||||||||
} | ||||||||||
|
||||||||||
auto const pattern = std::string(pattern_jstr.get(), pattern_jstr.size_bytes()); | ||||||||||
auto const max_split = limit > 1 ? limit - 1 : limit; | ||||||||||
auto result = | ||||||||||
split_by_regex ? | ||||||||||
cudf::strings::split_record_re(strs_input, pattern, max_split) : | ||||||||||
cudf::strings::split_record(strs_input, cudf::string_scalar{pattern}, max_split); | ||||||||||
return release_as_jlong(result); | ||||||||||
if (split_by_regex) { | ||||||||||
auto const flags = static_cast<cudf::strings::regex_flags>(regex_flags); | ||||||||||
auto const groups = static_cast<cudf::strings::capture_groups>(capture_groups); | ||||||||||
auto const regex_prog = cudf::strings::regex_program::create(pattern, flags, groups); | ||||||||||
auto result = cudf::strings::split_record_re(strings_column, *regex_prog, max_split); | ||||||||||
return release_as_jlong(result); | ||||||||||
} else { | ||||||||||
auto result = | ||||||||||
cudf::strings::split_record(strings_column, cudf::string_scalar{pattern}, max_split); | ||||||||||
return release_as_jlong(result); | ||||||||||
} | ||||||||||
} | ||||||||||
CATCH_STD(env, 0); | ||||||||||
} | ||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.