Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add strings udf C++ classes and functions for phase II #11912

Merged
merged 38 commits into from
Nov 2, 2022
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
40eb1e2
Add strings udf C++ classes and function for phase II
davidwendt Oct 12, 2022
5317db8
fix style error
davidwendt Oct 12, 2022
8e531a5
Merge branch 'branch-22.12' into udf-string-class
davidwendt Oct 13, 2022
b8d7868
Merge branch 'branch-22.12' into udf-string-class
davidwendt Oct 13, 2022
ae1bbdc
Merge branch 'branch-22.12' into udf-string-class
davidwendt Oct 17, 2022
edcaaf2
Merge branch 'branch-22.12' into udf-string-class
davidwendt Oct 18, 2022
a5661bc
change void* to udf_string*
davidwendt Oct 18, 2022
9661c4e
update doxygens
davidwendt Oct 18, 2022
ece495f
Merge branch 'branch-22.12' into udf-string-class
davidwendt Oct 18, 2022
5554ed9
Merge branch 'branch-22.12' into udf-string-class
davidwendt Oct 19, 2022
ebaf088
add pad utility functions
davidwendt Oct 19, 2022
c3e17ac
fix doxygen for udf_apis.hpp
davidwendt Oct 19, 2022
2dae45d
fix to_string to use count_digits
davidwendt Oct 20, 2022
3467f34
add ALL_FLAGS
davidwendt Oct 20, 2022
4f63c54
Merge branch 'branch-22.12' into udf-string-class
davidwendt Oct 20, 2022
7639039
Merge branch 'branch-22.12' into udf-string-class
davidwendt Oct 21, 2022
84721d4
Merge branch 'branch-22.12' into udf-string-class
davidwendt Oct 24, 2022
4c72149
Merge branch 'branch-22.12' into udf-string-class
davidwendt Oct 25, 2022
cf72fc8
add noexcept decl to appropriate member functions
davidwendt Oct 25, 2022
28e917b
fix return types for split
davidwendt Oct 25, 2022
f82c454
fix doxygen for various functions
davidwendt Oct 25, 2022
3b513a3
Merge branch 'branch-22.12' into udf-string-class
davidwendt Oct 26, 2022
7b9718c
create free_udf_strings_array function
davidwendt Oct 31, 2022
68e54e8
fix compare returns, null assignment, reuse ctors
davidwendt Oct 31, 2022
6eef0a4
fix some doxygen wording
davidwendt Oct 31, 2022
02aa5b4
Merge branch 'branch-22.12' into udf-string-class
davidwendt Oct 31, 2022
69e0d7c
remove string_view const parameter decl
davidwendt Oct 31, 2022
a95c030
fix default-stream
davidwendt Oct 31, 2022
e0526e6
remove lstrip and rstrip
davidwendt Oct 31, 2022
bc903d6
reword split doxygen text for result=nullptr
davidwendt Oct 31, 2022
229c1f2
Merge branch 'branch-22.12' into udf-string-class
davidwendt Nov 1, 2022
eb6532e
add cuda_runtime.h to resolve device refs
davidwendt Nov 1, 2022
a8fca12
fix doxygen wording for pad()
davidwendt Nov 1, 2022
a249d13
refactor split; add count_tokens function
davidwendt Nov 1, 2022
96b06f6
refactor append, replace for better reuse
davidwendt Nov 1, 2022
7849307
expand spos/epos var names
davidwendt Nov 1, 2022
cadcf79
add more doc to replace() for count parm
davidwendt Nov 1, 2022
b3a43b8
Merge branch 'branch-22.12' into udf-string-class
davidwendt Nov 1, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/include/cudf/strings/detail/char_tables.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ constexpr uint8_t IS_LOWER(uint8_t x) { return ((x) & (1 << 6)); }
// Non-zero when bit 7 (0x80) of the flag byte is set.
constexpr uint8_t IS_SPECIAL(uint8_t x) { return x & 0x80; }
// Non-zero when any of the low four bits (0x0F) of the flag byte is set.
constexpr uint8_t IS_ALPHANUM(uint8_t x) { return x & 0x0F; }
// Non-zero when bit 5 (0x20) or bit 6 (0x40) of the flag byte is set.
constexpr uint8_t IS_UPPER_OR_LOWER(uint8_t x) { return x & (0x20 | 0x40); }
// Flag byte with every bit set; passing it to an IS_xxx helper yields that helper's bit mask.
constexpr uint8_t ALL_FLAGS{0xFF};

// Type for the character cases table.
using character_cases_table_type = uint16_t;
Expand Down
211 changes: 211 additions & 0 deletions python/strings_udf/cpp/include/cudf/strings/udf/case.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include "udf_string.cuh"

#include <cudf/strings/detail/char_tables.hpp>
#include <cudf/strings/detail/utf8.hpp>
#include <cudf/strings/string_view.cuh>

namespace cudf {
namespace strings {
namespace udf {

/**
* @brief Pointers to the lookup tables used for character-type flags and case conversion
*
* The case functions in this file receive an instance by value and dereference
* these pointers from device code.
*/
struct chars_tables {
// Character-type flags indexed by code point; this file only reads it for
// code points <= 0xFFFF.
cudf::strings::detail::character_flags_table_type* flags_table;
// Case-converted code point, indexed by the source code point.
cudf::strings::detail::character_cases_table_type* cases_table;
// Multi-character conversions, indexed by get_special_case_hash_index(code_point).
struct cudf::strings::detail::special_case_mapping* special_case_mapping_table;
};

namespace detail {

/**
 * @brief Appends the case-converted form of a single character to a string
 *
 * A character flagged as special may expand to more than one output character;
 * every other character maps to exactly one entry of the cases table.
 *
 * @param tables The char tables required for conversion
 * @param result String that receives the converted character(s)
 * @param code_point The code-point of the character to convert
 * @param flag The char-type flag of the character to convert
 */
__device__ inline void convert_char(chars_tables const tables,
                                    udf_string& result,
                                    uint32_t code_point,
                                    uint8_t flag)
{
  if (cudf::strings::detail::IS_SPECIAL(flag)) {
    // special case: look up the mapping entry for this code point
    auto const hash_index = cudf::strings::detail::get_special_case_hash_index(code_point);
    auto const mapping    = tables.special_case_mapping_table[hash_index];

    // a lower case input takes the upper case expansion and vice versa
    bool const from_lower = cudf::strings::detail::IS_LOWER(flag);
    auto const count      = from_lower ? mapping.num_upper_chars : mapping.num_lower_chars;
    auto const* chars     = from_lower ? mapping.upper : mapping.lower;

    for (uint16_t pos = 0; pos < count; ++pos) {
      result.append(cudf::strings::detail::codepoint_to_utf8(chars[pos]));
    }
    return;
  }

  // common case: one-to-one conversion through the cases table
  result.append(cudf::strings::detail::codepoint_to_utf8(tables.cases_table[code_point]));
}

/**
 * @brief Converts the given string to either upper or lower case
 *
 * A character is converted when its flag matches `case_flag`, or when it is
 * flagged as special but as neither upper nor lower case. All other characters
 * are copied to the output unchanged.
 *
 * @param tables The char tables required for conversion
 * @param d_str Input string to convert
 * @param case_flag Identifies upper/lower case conversion
 * @return New string containing the converted characters
 */
__device__ inline udf_string convert_case(
  chars_tables const tables,
  string_view d_str,
  cudf::strings::detail::character_flags_table_type case_flag)
{
  udf_string result;
  for (auto const chr : d_str) {
    auto const code_point = cudf::strings::detail::utf8_to_codepoint(chr);
    // the flags table is only consulted for code points up to 0xFFFF;
    // anything above is treated as having no flags set
    auto const flag = code_point <= 0x00FFFF ? tables.flags_table[code_point] : 0;

    bool const matches_case = (flag & case_flag) != 0;
    bool const special_without_case =
      cudf::strings::detail::IS_SPECIAL(flag) && !cudf::strings::detail::IS_UPPER_OR_LOWER(flag);

    if (matches_case || special_without_case) {
      convert_char(tables, result, code_point, flag);
    } else {
      result.append(chr);
    }
  }

  return result;
}

/**
 * @brief Shared implementation for the capitalize and title functions
 *
 * The first character is always a capitalization candidate. After each
 * character, `next_fn` is called with that character's flag to decide whether
 * the following character is a candidate. A candidate is converted only if it
 * is lower case; a non-candidate is converted only if it is upper case.
 *
 * @tparam CapitalizeNextFn Callable that takes a character flag and returns true
 *         if the next candidate character should be capitalized
 * @param tables The char tables required for conversion
 * @param d_str Input string to convert
 * @param next_fn Decides whether the next character is capitalized
 * @return New string containing the converted characters
 */
template <typename CapitalizeNextFn>
__device__ inline udf_string capitalize(chars_tables const tables,
                                        string_view d_str,
                                        CapitalizeNextFn next_fn)
{
  udf_string output;
  bool upper_next = true;
  for (auto const chr : d_str) {
    auto const code_point = cudf::strings::detail::utf8_to_codepoint(chr);
    auto const flag       = code_point <= 0x00FFFF ? tables.flags_table[code_point] : 0;

    // the character is already in the wanted case when it is not of the opposite case
    bool const keep_as_is = upper_next ? !cudf::strings::detail::IS_LOWER(flag)
                                       : !cudf::strings::detail::IS_UPPER(flag);
    if (keep_as_is) {
      output.append(chr);
    } else {
      detail::convert_char(tables, output, code_point, flag);
    }
    upper_next = next_fn(flag);
  }
  return output;
}
} // namespace detail

/**
 * @brief Returns a lower case copy of the given string
 *
 * Only characters flagged as upper case are converted.
 *
 * @param tables The char tables required for conversion
 * @param d_str Input string to convert
 * @return New string containing the converted characters
 */
__device__ inline udf_string to_lower(chars_tables const tables, string_view d_str)
{
  // IS_UPPER applied to ALL_FLAGS yields the mask selecting upper case characters
  return detail::convert_case(
    tables, d_str, cudf::strings::detail::IS_UPPER(cudf::strings::detail::ALL_FLAGS));
}

/**
 * @brief Returns an upper case copy of the given string
 *
 * Only characters flagged as lower case are converted.
 *
 * @param tables The char tables required for conversion
 * @param d_str Input string to convert
 * @return New string containing the converted characters
 */
__device__ inline udf_string to_upper(chars_tables const tables, string_view d_str)
{
  // IS_LOWER applied to ALL_FLAGS yields the mask selecting lower case characters
  return detail::convert_case(
    tables, d_str, cudf::strings::detail::IS_LOWER(cudf::strings::detail::ALL_FLAGS));
}

/**
 * @brief Converts the given string to lower/upper case
 *
 * All lower case characters are converted to upper case and
 * all upper case characters are converted to lower case.
 *
 * @param tables The char tables required for conversion
 * @param d_str Input string to convert
 * @return New string containing the converted characters
 */
__device__ inline udf_string swap_case(chars_tables const tables, string_view d_str)
{
  // select both lower case and upper case characters for conversion
  cudf::strings::detail::character_flags_table_type case_flag =
    cudf::strings::detail::IS_LOWER(cudf::strings::detail::ALL_FLAGS) |
    cudf::strings::detail::IS_UPPER(cudf::strings::detail::ALL_FLAGS);
  return detail::convert_case(tables, d_str, case_flag);
}

/**
 * @brief Capitalize the first character of the given string
 *
 * If the first character is lower case it is converted to upper case.
 * Any upper case characters after the first one are converted to lower case.
 *
 * @param tables The char tables required for conversion
 * @param d_str Input string to convert
 * @return New string containing the converted characters
 */
__device__ inline udf_string capitalize(chars_tables const tables, string_view d_str)
{
  // never capitalize after the first character
  auto next_fn = [](cudf::strings::detail::character_flags_table_type) -> bool { return false; };
  return detail::capitalize(tables, d_str, next_fn);
}

/**
 * @brief Returns a title case copy of the given string
 *
 * The first character of the string and the first character following any
 * non-alphabetic character are converted to upper case.
 * All other characters are converted to lower case.
 *
 * @param tables The char tables required for conversion
 * @param d_str Input string to convert
 * @return New string containing the converted characters
 */
__device__ inline udf_string title(chars_tables const tables, string_view d_str)
{
  // capitalize the character that follows anything not flagged as alphabetic
  auto follows_non_alpha =
    [](cudf::strings::detail::character_flags_table_type previous_flag) -> bool {
    return cudf::strings::detail::IS_ALPHA(previous_flag) == 0;
  };
  return detail::capitalize(tables, d_str, follows_non_alpha);
}

} // namespace udf
} // namespace strings
} // namespace cudf
72 changes: 72 additions & 0 deletions python/strings_udf/cpp/include/cudf/strings/udf/numeric.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@

/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include "udf_string.cuh"

#include <cudf/strings/detail/convert/int_to_string.cuh>
#include <cudf/strings/detail/convert/string_to_float.cuh>
#include <cudf/strings/detail/convert/string_to_int.cuh>

namespace cudf {
namespace strings {
namespace udf {

/**
 * @brief Parses a string as a base-10 integer
 *
 * A leading '+' or '-' sign is accepted, but only at the beginning of the string.
 * Parsing stops at the first character that is not a digit [0-9].
 * Overflow of the int64 type is not detected.
 *
 * @param d_str String to parse
 * @return The parsed integer value
 */
__device__ inline int64_t stoi(string_view const& d_str)
{
  int64_t const parsed = cudf::strings::detail::string_to_integer(d_str);
  return parsed;
}

/**
 * @brief Builds the string form of an integer
 *
 * @param value Integer value to convert
 * @return New string holding the characters produced for `value`
 */
__device__ inline udf_string to_string(int64_t value)
{
  udf_string output;
  if (value != 0) {
    // size the string first so the digits can be written directly into its buffer
    auto const digit_count = cudf::strings::detail::count_digits(value);
    output.resize(digit_count);
    cudf::strings::detail::integer_to_string(value, output.data());
  } else {
    output.append("0");
  }
  return output;
}

/**
 * @brief Parses a string as a double
 *
 * Scientific notation is supported.
 * Overflow goes to inf or -inf and underflow may go to 0.
 *
 * @param d_str String to parse
 * @return The parsed floating-point value
 */
__device__ inline double stod(string_view const& d_str)
{
  double const parsed = cudf::strings::detail::stod(d_str);
  return parsed;
}

} // namespace udf
} // namespace strings
} // namespace cudf
72 changes: 72 additions & 0 deletions python/strings_udf/cpp/include/cudf/strings/udf/pad.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@

/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include "udf_string.cuh"

#include <cudf/strings/detail/pad_impl.cuh>

namespace cudf {
namespace strings {
namespace udf {

/**
 * @brief Pad beginning and/or end of a string with the given fill character
 *
 * The side_type::BOTH will attempt to center the text using the `fill_char`.
 * If the `width <= d_str.length()` no change occurs.
 * If `fill_char` is empty, a copy of `d_str` is returned.
 *
 * @tparam side Specify where the padding should occur
 * @param d_str String to pad
 * @param width Minimum length in characters of the output string
 * @param fill_char Character used for padding; only the first character of
 *        this string is used
 * @return New string containing `d_str` padded out to `width` characters
 */
template <side_type side = side_type::RIGHT>
__device__ udf_string pad(cudf::string_view const d_str,
                          cudf::size_type width,
                          cudf::string_view fill_char = cudf::string_view{" ", 1})
{
  if (fill_char.empty()) { return udf_string{d_str}; }

  // Only the first character of fill_char is handed to pad_impl, so the output
  // is sized from the byte width of that one character and not from the byte
  // size of the whole fill_char string.
  auto const fill_char_bytes = fill_char.byte_offset(1);

  udf_string result;
  result.resize(cudf::strings::detail::compute_padded_size(d_str, width, fill_char_bytes));
  cudf::strings::detail::pad_impl<side>(d_str, width, *fill_char.begin(), result.data());
  return result;
}

/**
 * @brief Pad beginning of a string with zero '0'
 *
 * If the `width` is smaller than the length of `d_str` no change occurs.
 *
 * If `d_str` starts with a sign character ('-' or '+') then '0' padding
 * starts after the sign.
 *
 * @param d_str String to fill
 * @param width Minimum length in characters of the output string (including the sign character)
 * @return New string containing `d_str` padded with '0' out to `width` characters
 */
// `inline` is required: this is a non-template function defined in a header, so
// without it every translation unit including the header emits its own definition.
__device__ inline udf_string zfill(cudf::string_view const d_str, cudf::size_type width)
{
  udf_string result;
  // the '0' fill character occupies a single byte
  result.resize(cudf::strings::detail::compute_padded_size(d_str, width, 1));
  cudf::strings::detail::zfill_impl(d_str, width, result.data());
  return result;
}

} // namespace udf
} // namespace strings
} // namespace cudf
Loading