Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cudf::strings::integer_to_hex convert API #8450

Merged
merged 4 commits into from
Jun 15, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion cpp/include/cudf/strings/convert/convert_integers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ std::unique_ptr<column> hex_to_integers(
* @code{.pseudo}
* Example:
* s = ['123', '-456', '', 'AGE', '+17EA', '0x9EF' '123ABC']
* b = s.is_hex(s)
* b = is_hex(s)
* b is [true, false, false, false, false, true, true]
* @endcode
*
Expand All @@ -185,6 +185,37 @@ std::unique_ptr<column> is_hex(
strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a new strings column converting integer columns to hexadecimal
* characters.
*
* Any null entries will result in corresponding null entries in the output column.
*
* The output character set is '0'-'9' and 'A'-'F'. The output string width will
* be a multiple of 2 depending on the size of the integer type. A single leading
* zero is applied to the first non-zero output byte if it is less than 0x10.
*
* @code{.pseudo}
* Example:
* input = [1234, -1, 0, 27, 342718233] // int32 type input column
* s = integers_to_hex(input)
* s is [ '04D2', 'FFFFFFFF', '00', '1B', '146D7719']
* @endcode
*
* The example above shows an `INT32` type column where each integer is 4 bytes.
* Leading zeros are suppressed unless filling out a complete byte as in
* `1234 -> '04D2'` instead of `000004D2` or `4D2`. A value of zero is output
* as the single byte `'00'`.
*
* @throw cudf::logic_error if the input column is not integral type.
*
* @param input Integer column to convert to hex.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column with hexadecimal characters.
*/
std::unique_ptr<column> integers_to_hex(
column_view const& input,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
} // namespace strings
} // namespace cudf
95 changes: 95 additions & 0 deletions cpp/src/strings/convert/convert_hex.cu
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,86 @@ void dispatch_hex_to_integers_fn::operator()<bool>(column_device_view const&,
CUDF_FAIL("Output for hex_to_integers must not be a boolean type.");
}

/**
 * @brief Functor to convert integers to hexadecimal strings.
 *
 * The functor works in two modes selected by `d_chars`:
 * - `d_chars == nullptr`: only the output size in bytes for row `idx` is
 *   stored in `d_offsets[idx]` (0 for null rows).
 * - `d_chars != nullptr`: the hex characters for row `idx` are written
 *   starting at `d_chars + d_offsets[idx]`.
 *
 * NOTE(review): `d_offsets` and `d_chars` are presumably filled in by
 * `make_strings_children` between its sizing and writing passes -- confirm
 * against that utility.
 *
 * @tparam IntegerType The specific integer type to convert from.
 */
template <typename IntegerType>
struct integer_to_hex_fn {
  column_device_view const d_column;
  offset_type* d_offsets{};
  char* d_chars{};

  /**
   * @brief Writes the two upper-case hex digits of `byte` to `hex[0]` and `hex[1]`.
   *
   * A byte below 0x10 is written with a leading '0'.
   *
   * @param byte Value to convert.
   * @param hex Output location; must have room for 2 characters.
   */
  __device__ void byte_to_hex(uint8_t byte, char* hex)
  {
    uint8_t const hi_nibble = byte / 16;
    uint8_t const lo_nibble = byte % 16;

    hex[0] = hi_nibble < 10 ? '0' + hi_nibble : 'A' + (hi_nibble - 10);
    hex[1] = lo_nibble < 10 ? '0' + lo_nibble : 'A' + (lo_nibble - 10);
  }

  /**
   * @brief Computes the output size of, or writes the hex string for, row `idx`.
   *
   * @param idx Row index within `d_column`.
   */
  __device__ void operator()(size_type idx)
  {
    if (d_column.is_null(idx)) {
      // null rows produce no characters; only the sizing pass records anything
      if (!d_chars) { d_offsets[idx] = 0; }
      return;
    }

    auto const value       = d_column.element<IntegerType>(idx);            // ex. 123456
    auto const value_bytes = reinterpret_cast<uint8_t const*>(&value);      // 0x40E20100

    // Number of bytes to output: zero bytes are dropped from the highest index
    // down (the most-significant end in the example above), but byte 0 is
    // always kept so that a value of 0 still yields "00".
    int bytes = sizeof(IntegerType);
    while (bytes > 1 && value_bytes[bytes - 1] == 0) { --bytes; }

    if (d_chars) {
      // emit from the highest kept byte down to byte 0, 2 characters per byte
      auto d_buffer = d_chars + d_offsets[idx];
      for (int byte_index = bytes - 1; byte_index >= 0; --byte_index) {
        byte_to_hex(value_bytes[byte_index], d_buffer);
        d_buffer += 2;
      }
    } else {
      d_offsets[idx] = static_cast<offset_type>(bytes) * 2;  // 2 hex characters per byte
    }
  }
};

/**
 * @brief Type-dispatched entry point that builds the hex strings column.
 *
 * Integral element types are converted with `integer_to_hex_fn`; every other
 * element type fails with `CUDF_FAIL`.
 */
struct dispatch_integers_to_hex_fn {
  // non-integral types throw an exception
  template <typename T, typename... Args>
  std::enable_if_t<not std::is_integral_v<T>, std::unique_ptr<column>> operator()(Args...) const
  {
    CUDF_FAIL("integers_to_hex only supports integral type columns");
  }

  /**
   * @brief Converts an integral column to a strings column of hex characters.
   *
   * @param input Column whose elements are read as `IntegerType`.
   * @param stream Stream passed to every device operation issued here.
   * @param mr Memory resource passed to every allocation issued here.
   * @return Strings column with the same row count, null count and a copy of
   *         the null mask of `input`.
   */
  template <typename IntegerType, std::enable_if_t<std::is_integral_v<IntegerType>>* = nullptr>
  std::unique_ptr<column> operator()(column_view const& input,
                                     rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr) const
  {
    auto const num_rows = input.size();
    auto const d_input  = column_device_view::create(input, stream);

    // build the offsets and chars child columns from the per-row functor
    auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children(
      integer_to_hex_fn<IntegerType>{*d_input}, num_rows, stream, mr);

    // output nulls mirror the input nulls
    auto null_mask = cudf::detail::copy_bitmask(input, stream, mr);

    return make_strings_column(num_rows,
                               std::move(offsets_column),
                               std::move(chars_column),
                               input.null_count(),
                               std::move(null_mask),
                               stream,
                               mr);
  }
};

} // namespace

// This will convert a strings column into any integer column type.
Expand Down Expand Up @@ -183,6 +263,14 @@ std::unique_ptr<column> is_hex(strings_column_view const& strings,
return results;
}

// Converts an integer column to a strings column of hexadecimal characters.
// An empty input yields an empty STRING column without dispatching on its type.
std::unique_ptr<column> integers_to_hex(column_view const& input,
                                        rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
{
  if (!input.is_empty()) {
    return type_dispatcher(input.type(), dispatch_integers_to_hex_fn{}, input, stream, mr);
  }
  return cudf::make_empty_column(data_type{type_id::STRING});
}

} // namespace detail

// external API
Expand All @@ -201,5 +289,12 @@ std::unique_ptr<column> is_hex(strings_column_view const& strings,
return detail::is_hex(strings, rmm::cuda_stream_default, mr);
}

// Public API: forwards to detail::integers_to_hex using rmm::cuda_stream_default.
std::unique_ptr<column> integers_to_hex(column_view const& input,
                                        rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto const stream = rmm::cuda_stream_default;
  return detail::integers_to_hex(input, stream, mr);
}

} // namespace strings
} // namespace cudf
47 changes: 47 additions & 0 deletions cpp/tests/strings/integers_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -410,3 +410,50 @@ TEST_F(StringsConvertTest, IsHex)
auto results = cudf::strings::is_hex(cudf::strings_column_view(strings));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

// Converts 255 values per integer type and compares against host-formatted hex.
TYPED_TEST(StringsIntegerConvertTest, IntegerToHex)
{
  // Inputs are 0..254 placed in the highest byte of TypeParam so every non-zero
  // value produces a full-width string. The shift is done in the unsigned
  // counterpart so no signed value is shifted into the sign bit, and the
  // sequence is rebuilt from the loop index on every run of the test.
  using UnsignedType            = std::make_unsigned_t<TypeParam>;
  constexpr auto top_byte_shift = (sizeof(TypeParam) - 1) * 8;

  std::vector<TypeParam> h_integers(255);
  for (std::size_t i = 0; i < h_integers.size(); ++i) {
    h_integers[i] = static_cast<TypeParam>(static_cast<UnsignedType>(i) << top_byte_shift);
  }

  cudf::test::fixed_width_column_wrapper<TypeParam> integers(h_integers.begin(), h_integers.end());

  std::vector<std::string> h_expected(255);
  std::transform(h_integers.begin(), h_integers.end(), h_expected.begin(), [](auto v) {
    // zero collapses to a single output byte
    if (v == 0) { return std::string("00"); }
    // special handling for single-byte types: streams would print them as characters
    if constexpr (std::is_same_v<TypeParam, int8_t> || std::is_same_v<TypeParam, uint8_t>) {
      char const hex_digits[16] = {
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
      std::string str;
      str += hex_digits[(v & 0xF0) >> 4];
      str += hex_digits[(v & 0x0F)];
      return str;
    }
    // all other types work with this: zero-padded, upper-case, full type width
    std::stringstream str;
    str << std::setfill('0') << std::setw(sizeof(TypeParam) * 2) << std::hex << std::uppercase << v;
    return str.str();
  });

  cudf::test::strings_column_wrapper expected(h_expected.begin(), h_expected.end());

  auto results = cudf::strings::integers_to_hex(integers);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

// Checks INT32 conversion, including a null row (index 3) that must stay null
// in the output.
TEST_F(StringsConvertTest, IntegerToHexWithNull)
{
  cudf::test::fixed_width_column_wrapper<int32_t> input_column(
    {123456, -1, 0, 0, 12, 12345, 123456789, -123456789}, {1, 1, 1, 0, 1, 1, 1, 1});

  // leading zero bytes are dropped; negative values keep all 4 bytes
  cudf::test::strings_column_wrapper expected_hex(
    {"01E240", "FFFFFFFF", "00", "", "0C", "3039", "075BCD15", "F8A432EB"},
    {1, 1, 1, 0, 1, 1, 1, 1});

  auto const hex_column = cudf::strings::integers_to_hex(input_column);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hex_column, expected_hex);
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,6 @@ cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \
cdef unique_ptr[column] is_hex(
column_view source_strings
) except +

cdef unique_ptr[column] integers_to_hex(
column_view input_col) except +
26 changes: 25 additions & 1 deletion python/cudf/cudf/_lib/string_casting.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ from cudf._lib.cpp.strings.convert.convert_integers cimport (
to_integers as cpp_to_integers,
from_integers as cpp_from_integers,
hex_to_integers as cpp_hex_to_integers,
is_hex as cpp_is_hex
is_hex as cpp_is_hex,
integers_to_hex as cpp_integers_to_hex
)
from cudf._lib.cpp.strings.convert.convert_ipv4 cimport (
ipv4_to_integers as cpp_ipv4_to_integers,
Expand Down Expand Up @@ -771,3 +772,26 @@ def is_hex(Column source_strings):
))

return Column.from_unique_ptr(move(c_result))


def itoh(Column input_col):
    """
    Convert an integer column into a column of hexadecimal strings.

    Parameters
    ----------
    input_col : Column
        Input column of integer type.

    Returns
    -------
    Column
        A column of strings with hexadecimal characters.
    """
    # The view is taken while the GIL is still held.
    cdef column_view c_input = input_col.view()
    cdef unique_ptr[column] c_hex

    # The libcudf conversion itself runs with the GIL released.
    with nogil:
        c_hex = move(cpp_integers_to_hex(c_input))

    return Column.from_unique_ptr(move(c_hex))