Skip to content

Commit

Permalink
Refactor strings strip functor to details header (NVIDIA#11635)
Browse files Browse the repository at this point in the history
Moves the internal string strip function to `strip.cuh` header in the `include/cudf/strings/detail` folder.
Allows this function to be shared with strings-udf code for strip.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - https://github.com/nvdbaranec

URL: rapidsai/cudf#11635
  • Loading branch information
davidwendt authored Sep 7, 2022
1 parent c439647 commit 66b5a0c
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 34 deletions.
69 changes: 69 additions & 0 deletions cpp/include/cudf/strings/detail/strip.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/strings/side_type.hpp>
#include <cudf/strings/string_view.cuh>

namespace cudf {
namespace strings {
namespace detail {

/**
 * @brief Strips characters from either or both ends of a string
 *
 * Every character in `d_to_strip` is a candidate for removal: leading and/or
 * trailing characters of `d_str` are dropped for as long as they match any
 * character in `d_to_strip`. If `d_to_strip` is empty, any character whose
 * value is less than or equal to `' '` (space) is stripped instead.
 *
 * No character data is copied; the returned view references the memory of `d_str`.
 *
 * @param d_str Input string to strip
 * @param d_to_strip String containing the set of characters to strip;
 *                   an empty string selects the `<= ' '` whitespace check
 * @param side Which ends of the input string to strip from
 * @return New string excluding the stripped ends
 */
__device__ inline cudf::string_view strip(cudf::string_view const d_str,
                                          cudf::string_view const d_to_strip,
                                          side_type side = side_type::BOTH)
{
  // Returns true if `chr` is one of the characters that should be removed.
  auto is_strip_character = [d_to_strip](char_utf8 chr) -> bool {
    if (d_to_strip.empty()) return chr <= ' ';  // whitespace check
    for (auto c : d_to_strip) {
      if (c == chr) return true;
    }
    return false;
  };

  // Byte offset of the first character that is kept.
  auto const left_offset = [&]() -> size_type {
    if (side != side_type::LEFT && side != side_type::BOTH) return 0;
    auto const last = d_str.end();
    for (auto itr = d_str.begin(); itr < last; ++itr) {
      if (!is_strip_character(*itr)) return itr.byte_offset();
    }
    return d_str.size_bytes();  // every character matched
  }();

  // Byte offset one past the last character that is kept.
  auto const right_offset = [&]() -> size_type {
    if (side != side_type::RIGHT && side != side_type::BOTH) return d_str.size_bytes();
    auto const first = d_str.begin();
    auto itr         = d_str.end();
    while (itr > first) {
      // Record where the character about to be tested ends, then step back
      // onto it exactly once (avoids walking each character twice).
      auto const end_of_char = itr.byte_offset();
      --itr;
      if (!is_strip_character(*itr)) return end_of_char;
    }
    return 0;  // every character matched
  }();

  // When the whole string is stripped from both sides the offsets cross;
  // clamp to an empty result in that case.
  auto const bytes = (right_offset > left_offset) ? right_offset - left_offset : 0;
  return cudf::string_view{d_str.data() + left_offset, bytes};
}

} // namespace detail
} // namespace strings
} // namespace cudf
41 changes: 7 additions & 34 deletions cpp/src/strings/strip.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/detail/strip.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand All @@ -28,11 +29,6 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/execution_policy.h>
#include <thrust/find.h>
#include <thrust/logical.h>
#include <thrust/transform.h>

namespace cudf {
namespace strings {
namespace detail {
Expand All @@ -59,38 +55,15 @@ struct strip_fn {
if (!d_chars) d_offsets[idx] = 0;
return;
}

auto const d_str = d_strings.element<string_view>(idx);

auto is_strip_character = [d_to_strip = d_to_strip] __device__(char_utf8 chr) -> bool {
return d_to_strip.empty() ? (chr <= ' ') : // whitespace check
thrust::any_of(
thrust::seq, d_to_strip.begin(), d_to_strip.end(), [chr] __device__(char_utf8 c) {
return c == chr;
});
};

size_type const left_offset = [&] {
if (side != side_type::LEFT && side != side_type::BOTH) return 0;
auto const itr =
thrust::find_if_not(thrust::seq, d_str.begin(), d_str.end(), is_strip_character);
return itr != d_str.end() ? itr.byte_offset() : d_str.size_bytes();
}();

size_type right_offset = d_str.size_bytes();
if (side == side_type::RIGHT || side == side_type::BOTH) {
auto const length = d_str.length();
auto itr = d_str.end();
for (size_type n = 0; n < length; ++n) {
if (!is_strip_character(*(--itr))) break;
right_offset = itr.byte_offset();
}
auto const d_stripped = strip(d_str, d_to_strip, side);
if (d_chars) {
copy_string(d_chars + d_offsets[idx], d_stripped);
} else {
d_offsets[idx] = d_stripped.size_bytes();
}

auto const bytes = (right_offset > left_offset) ? right_offset - left_offset : 0;
if (d_chars)
memcpy(d_chars + d_offsets[idx], d_str.data() + left_offset, bytes);
else
d_offsets[idx] = bytes;
}
};

Expand Down

0 comments on commit 66b5a0c

Please sign in to comment.