diff --git a/cpp/include/cudf/strings/detail/pad_impl.cuh b/cpp/include/cudf/strings/detail/pad_impl.cuh
new file mode 100644
index 00000000000..648c240bfbc
--- /dev/null
+++ b/cpp/include/cudf/strings/detail/pad_impl.cuh
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+namespace cudf {
+namespace strings {
+namespace detail {
+
+/**
+ * @brief Return the size in bytes of padding d_str to width characters using a fill character
+ * with byte length of fill_char_size
+ *
+ * Pad does not perform truncation. That is, if `d_str.length() > width` then `d_str.size_bytes()`
+ * is returned.
+ *
+ * @param d_str String to pad
+ * @param width Number of characters for the padded string result
+ * @param fill_char_size Size of the fill character in bytes
+ * @return The number of bytes required for the pad
+ */
+__device__ inline size_type compute_padded_size(string_view d_str,
+                                                size_type width,
+                                                size_type fill_char_size)
+{
+  auto const length = d_str.length();
+  auto bytes        = d_str.size_bytes();
+  if (width > length)                            // no truncating
+    bytes += fill_char_size * (width - length);  // add padding
+  return bytes;
+}
+
+/**
+ * @brief Pad d_str with fill_char into output up to width characters
+ *
+ * Pad does not perform truncation. That is, if `d_str.length() > width` then
+ * d_str is copied into output.
+ *
+ * @tparam side Specifies where fill_char is added to d_str
+ * @param d_str String to pad
+ * @param width Number of characters for the padded string result
+ * @param fill_char Character to fill with; it is UTF-8 encoded into output once per padded position
+ * @param output Device memory to copy the padded string into
+ */
+template <side_type side>
+__device__ void pad_impl(cudf::string_view d_str,
+                         cudf::size_type width,
+                         cudf::char_utf8 fill_char,
+                         char* output)
+{
+  auto length = d_str.length();
+  if constexpr (side == side_type::LEFT) {
+    while (length++ < width) {
+      output += from_char_utf8(fill_char, output);
+    }
+    copy_string(output, d_str);
+  }
+  if constexpr (side == side_type::RIGHT) {
+    output = copy_string(output, d_str);
+    while (length++ < width) {
+      output += from_char_utf8(fill_char, output);
+    }
+  }
+  if constexpr (side == side_type::BOTH) {
+    auto const pad_size = width - length;
+    // an odd width will right-justify
+    auto right_pad = (width % 2) ? pad_size / 2 : (pad_size - pad_size / 2);
+    auto left_pad  = pad_size - right_pad;  // e.g. width=7: "++foxx+"; width=6: "+fox++"
+    while (left_pad-- > 0) {
+      output += from_char_utf8(fill_char, output);
+    }
+    output = copy_string(output, d_str);
+    while (right_pad-- > 0) {
+      output += from_char_utf8(fill_char, output);
+    }
+  }
+}
+
+/**
+ * @brief Prepend d_str with '0' into output up to width characters
+ *
+ * Pad does not perform truncation. That is, if `d_str.length() > width` then
+ * d_str is copied into output.
+ *
+ * If d_str starts with a sign character ('-' or '+') then '0' padding
+ * starts after the sign.
+ *
+ * @param d_str String to pad
+ * @param width Number of characters for the padded string result
+ * @param output Device memory to copy the padded string into
+ */
+__device__ inline void zfill_impl(cudf::string_view d_str, cudf::size_type width, char* output)
+{
+  auto length = d_str.length();
+  auto in_ptr = d_str.data();
+  // if the string starts with a sign, output the sign first
+  if (!d_str.empty() && (*in_ptr == '-' || *in_ptr == '+')) {
+    *output++ = *in_ptr++;
+    d_str     = cudf::string_view{in_ptr, d_str.size_bytes() - 1};
+  }
+  while (length++ < width)
+    *output++ = '0';  // prepend zero char
+  copy_string(output, d_str);
+}
+
+}  // namespace detail
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index e601eeb6b6e..e4002525af9 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -20,8 +20,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -38,6 +37,7 @@ namespace cudf { namespace strings { namespace detail { namespace { + struct compute_pad_output_length_fn { column_device_view d_strings; size_type width; @@ -47,11 +47,7 @@ struct compute_pad_output_length_fn { { if (d_strings.is_null(idx)) return 0; string_view d_str = d_strings.element(idx); - size_type bytes = d_str.size_bytes(); - size_type length = d_str.length(); - if (width > length) // no truncating - bytes += fill_char_size * (width - length); // add padding - return bytes; + return compute_padded_size(d_str, width, fill_char_size); } }; @@ -96,13 +92,10 @@ std::unique_ptr pad( thrust::make_counting_iterator(0), strings_count, [d_strings, width, d_fill_char, d_offsets, d_chars] __device__(size_type idx) { - if (d_strings.is_null(idx)) return; - string_view d_str = d_strings.element(idx); - auto length = d_str.length(); - char* ptr = d_chars + d_offsets[idx]; - while (length++ < width) - ptr += from_char_utf8(d_fill_char, ptr); - copy_string(ptr, d_str); + if 
(d_strings.is_valid(idx)) { + pad_impl( + d_strings.element(idx), width, d_fill_char, d_chars + d_offsets[idx]); + } }); } else if (side == side_type::RIGHT) { thrust::for_each_n( @@ -110,13 +103,10 @@ std::unique_ptr pad( thrust::make_counting_iterator(0), strings_count, [d_strings, width, d_fill_char, d_offsets, d_chars] __device__(size_type idx) { - if (d_strings.is_null(idx)) return; - string_view d_str = d_strings.element(idx); - auto length = d_str.length(); - char* ptr = d_chars + d_offsets[idx]; - ptr = copy_string(ptr, d_str); - while (length++ < width) - ptr += from_char_utf8(d_fill_char, ptr); + if (d_strings.is_valid(idx)) { + pad_impl( + d_strings.element(idx), width, d_fill_char, d_chars + d_offsets[idx]); + } }); } else if (side == side_type::BOTH) { thrust::for_each_n( @@ -124,18 +114,10 @@ std::unique_ptr pad( thrust::make_counting_iterator(0), strings_count, [d_strings, width, d_fill_char, d_offsets, d_chars] __device__(size_type idx) { - if (d_strings.is_null(idx)) return; - string_view d_str = d_strings.element(idx); - char* ptr = d_chars + d_offsets[idx]; - auto pad = static_cast(width - d_str.length()); - auto right_pad = (width & 1) ? pad / 2 : (pad - pad / 2); // odd width = right-justify - auto left_pad = - pad - right_pad; // e.g. 
width=7 gives "++foxx+" while width=6 gives "+fox++" - while (left_pad-- > 0) - ptr += from_char_utf8(d_fill_char, ptr); - ptr = copy_string(ptr, d_str); - while (right_pad-- > 0) - ptr += from_char_utf8(d_fill_char, ptr); + if (d_strings.is_valid(idx)) { + pad_impl( + d_strings.element(idx), width, d_fill_char, d_chars + d_offsets[idx]); + } }); } @@ -174,19 +156,10 @@ std::unique_ptr zfill( thrust::make_counting_iterator(0), input.size(), [d_strings, width, d_offsets, d_chars] __device__(size_type idx) { - if (d_strings.is_null(idx)) return; - auto d_str = d_strings.element(idx); - auto length = d_str.length(); - auto in_ptr = d_str.data(); - auto out_ptr = d_chars + d_offsets[idx]; - // if the string starts with a sign, output the sign first - if (!d_str.empty() && (*in_ptr == '-' || *in_ptr == '+')) { - *out_ptr++ = *in_ptr++; - d_str = string_view{in_ptr, d_str.size_bytes() - 1}; + if (d_strings.is_valid(idx)) { + zfill_impl( + d_strings.element(idx), width, d_chars + d_offsets[idx]); } - while (length++ < width) - *out_ptr++ = '0'; // prepend zero char - copy_string(out_ptr, d_str); }); return make_strings_column(input.size(),