From 66b5a0c062e7a40223259b64cae327edd1eb8564 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 7 Sep 2022 10:39:02 -0400 Subject: [PATCH] Refactor strings strip functor to details header (#11635) Moves the internal string strip function to `strip.cuh` header in the `include/cudf/strings/detail` folder. Allows this function to be shared with strings-udf code for strip. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/11635 --- cpp/include/cudf/strings/detail/strip.cuh | 69 +++++++++++++++++++++++ cpp/src/strings/strip.cu | 41 +++----------- 2 files changed, 76 insertions(+), 34 deletions(-) create mode 100644 cpp/include/cudf/strings/detail/strip.cuh diff --git a/cpp/include/cudf/strings/detail/strip.cuh b/cpp/include/cudf/strings/detail/strip.cuh new file mode 100644 index 000000000000..533e76121b5b --- /dev/null +++ b/cpp/include/cudf/strings/detail/strip.cuh @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace cudf { +namespace strings { +namespace detail { + +/** + * @brief Strips the specified characters from either or both ends of a string + * + * @param d_str Input string to strip + * @param d_to_strip Characters to strip, each one matched individually; + * if empty, characters <= ' ' are stripped instead + * @param side Which ends of the input string to strip from (defaults to side_type::BOTH) + * @return View into d_str excluding the stripped ends; empty if every character is stripped + */ +__device__ cudf::string_view strip(cudf::string_view const d_str, + cudf::string_view const d_to_strip, + side_type side = side_type::BOTH) +{ + auto is_strip_character = [d_to_strip](char_utf8 chr) -> bool { + if (d_to_strip.empty()) return chr <= ' '; // whitespace check + for (auto c : d_to_strip) { + if (c == chr) return true; + } + return false; + }; + + auto const left_offset = [&] { + if (side != side_type::LEFT && side != side_type::BOTH) return 0; + for (auto itr = d_str.begin(); itr < d_str.end(); ++itr) { + if (!is_strip_character(*itr)) return itr.byte_offset(); + } + return d_str.size_bytes(); + }(); + + auto const right_offset = [&] { + if (side != side_type::RIGHT && side != side_type::BOTH) return d_str.size_bytes(); + for (auto itr = d_str.end(); itr > d_str.begin(); --itr) { + if (!is_strip_character(*(itr - 1))) return itr.byte_offset(); + } + return 0; + }(); + + auto const bytes = (right_offset > left_offset) ? 
right_offset - left_offset : 0; + return cudf::string_view{d_str.data() + left_offset, bytes}; +} + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu index 8f9794f6679a..5d51a5a7bed0 100644 --- a/cpp/src/strings/strip.cu +++ b/cpp/src/strings/strip.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -28,11 +29,6 @@ #include #include -#include -#include -#include -#include - namespace cudf { namespace strings { namespace detail { @@ -59,38 +55,15 @@ struct strip_fn { if (!d_chars) d_offsets[idx] = 0; return; } + auto const d_str = d_strings.element(idx); - auto is_strip_character = [d_to_strip = d_to_strip] __device__(char_utf8 chr) -> bool { - return d_to_strip.empty() ? (chr <= ' ') : // whitespace check - thrust::any_of( - thrust::seq, d_to_strip.begin(), d_to_strip.end(), [chr] __device__(char_utf8 c) { - return c == chr; - }); - }; - - size_type const left_offset = [&] { - if (side != side_type::LEFT && side != side_type::BOTH) return 0; - auto const itr = - thrust::find_if_not(thrust::seq, d_str.begin(), d_str.end(), is_strip_character); - return itr != d_str.end() ? itr.byte_offset() : d_str.size_bytes(); - }(); - - size_type right_offset = d_str.size_bytes(); - if (side == side_type::RIGHT || side == side_type::BOTH) { - auto const length = d_str.length(); - auto itr = d_str.end(); - for (size_type n = 0; n < length; ++n) { - if (!is_strip_character(*(--itr))) break; - right_offset = itr.byte_offset(); - } + auto const d_stripped = strip(d_str, d_to_strip, side); + if (d_chars) { + copy_string(d_chars + d_offsets[idx], d_stripped); + } else { + d_offsets[idx] = d_stripped.size_bytes(); } - - auto const bytes = (right_offset > left_offset) ? right_offset - left_offset : 0; - if (d_chars) - memcpy(d_chars + d_offsets[idx], d_str.data() + left_offset, bytes); - else - d_offsets[idx] = bytes; } };