Skip to content

Commit

Permalink
Add cudf::strings::integers_to_hex convert API (#8450)
Browse files Browse the repository at this point in the history
Closes #3105 

This PR adds the `cudf::strings::integers_to_hex` API to libcudf. Also, adds gtests for testing with all integer types. The PR also includes Cython updates to make the new API available to Python cudf.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Marlene  (https://github.com/marlenezw)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #8450
  • Loading branch information
davidwendt authored Jun 15, 2021
1 parent 884f98f commit 6728c75
Show file tree
Hide file tree
Showing 5 changed files with 204 additions and 4 deletions.
33 changes: 32 additions & 1 deletion cpp/include/cudf/strings/convert/convert_integers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ std::unique_ptr<column> hex_to_integers(
* @code{.pseudo}
* Example:
* s = ['123', '-456', '', 'AGE', '+17EA', '0x9EF', '123ABC']
* b = s.is_hex(s)
* b = is_hex(s)
* b is [true, false, false, false, false, true, true]
* @endcode
*
Expand All @@ -185,6 +185,37 @@ std::unique_ptr<column> is_hex(
strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a new strings column converting integer columns to hexadecimal
* characters.
*
* Any null entries will result in corresponding null entries in the output column.
*
* The output character set is '0'-'9' and 'A'-'F'. The output string width will
* be a multiple of 2 depending on the size of the integer type. A single leading
* zero is applied to the first non-zero output byte if it is less than 0x10.
*
* @code{.pseudo}
* Example:
* input = [1234, -1, 0, 27, 342718233] // int32 type input column
* s = integers_to_hex(input)
* s is [ '04D2', 'FFFFFFFF', '00', '1B', '146D7719']
* @endcode
*
* The example above shows an `INT32` type column where each integer is 4 bytes.
* Leading zeros are suppressed unless filling out a complete byte as in
* `1234 -> '04D2'` instead of `000004D2` or `4D2`.
*
* @throw cudf::logic_error if the input column is not integral type.
*
* @param input Integer column to convert to hex.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column with hexadecimal characters.
*/
std::unique_ptr<column> integers_to_hex(
column_view const& input,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
} // namespace strings
} // namespace cudf
99 changes: 97 additions & 2 deletions cpp/src/strings/convert/convert_hex.cu
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,8 @@ struct dispatch_hex_to_integers_fn {
hex_to_integer_fn<IntegerType>{strings_column});
}
// non-integral types throw an exception
template <typename T, std::enable_if_t<not std::is_integral<T>::value>* = nullptr>
void operator()(column_device_view const&, mutable_column_view&, rmm::cuda_stream_view) const
template <typename T, typename... Args>
std::enable_if_t<not std::is_integral<T>::value, void> operator()(Args&&...) const
{
CUDF_FAIL("Output for hex_to_integers must be an integral type.");
}
Expand All @@ -118,6 +118,86 @@ void dispatch_hex_to_integers_fn::operator()<bool>(column_device_view const&,
CUDF_FAIL("Output for hex_to_integers must not be a boolean type.");
}

/**
 * @brief Functor that renders each integer of a column as upper-case
 * hexadecimal characters, two characters per emitted byte.
 *
 * The functor operates in one of two modes, selected by `d_chars`:
 *  - `d_chars == nullptr`: only the output size (in characters) of row `idx`
 *    is stored in `d_offsets[idx]`.
 *  - `d_chars != nullptr`: the characters for row `idx` are written starting
 *    at `d_chars + d_offsets[idx]`.
 *
 * @tparam IntegerType The specific integer type to convert from.
 */
template <typename IntegerType>
struct integer_to_hex_fn {
  column_device_view const d_column;
  offset_type* d_offsets{};
  char* d_chars{};

  /**
   * @brief Writes the two hex digits of `byte` to `hex[0]` (high nibble) and
   * `hex[1]` (low nibble).
   */
  __device__ void byte_to_hex(uint8_t byte, char* hex)
  {
    uint8_t const high = static_cast<uint8_t>(byte / 16);
    uint8_t const low  = static_cast<uint8_t>(byte - (high * 16));
    // a byte below 0x10 has a zero high nibble, which yields the leading '0'
    hex[0] = static_cast<char>(high < 10 ? '0' + high : 'A' + (high - 10));
    hex[1] = static_cast<char>(low < 10 ? '0' + low : 'A' + (low - 10));
  }

  __device__ void operator()(size_type idx)
  {
    // null rows produce no characters; only the size needs recording
    if (d_column.is_null(idx)) {
      if (d_chars == nullptr) { d_offsets[idx] = 0; }
      return;
    }

    auto const value       = d_column.element<IntegerType>(idx);          // ex. 123456
    auto const value_bytes = reinterpret_cast<uint8_t const*>(&value);    // 0x40E20100

    // Locate the highest-indexed non-zero byte; index 0 is always kept so that
    // a zero value still produces one byte ("00").
    int top_byte = static_cast<int>(sizeof(IntegerType)) - 1;
    while (top_byte > 0 && value_bytes[top_byte] == 0) { --top_byte; }
    int const bytes = top_byte + 1;

    if (d_chars == nullptr) {
      d_offsets[idx] = static_cast<offset_type>(bytes) * 2;  // 2 hex characters per byte
      return;
    }

    // emit from the highest-indexed byte down to byte 0
    auto d_buffer = d_chars + d_offsets[idx];
    for (int i = top_byte; i >= 0; --i, d_buffer += 2) {
      byte_to_hex(value_bytes[i], d_buffer);
    }
  }
};

/**
 * @brief Type-dispatched functor that builds the hexadecimal strings column
 * for an integer input column.
 *
 * The integral overload performs the conversion; the non-integral overload
 * rejects the call with an exception.
 */
struct dispatch_integers_to_hex_fn {
  /**
   * @brief Converts an integral column to a strings column of hex characters.
   *
   * @tparam IntegerType Element type of `input`; must be integral.
   * @param input Column of integers to convert.
   * @param stream CUDA stream passed to the device-view, children and bitmask helpers.
   * @param mr Device memory resource passed to the allocating helpers.
   * @return New strings column with the same size, null count and (copied) null mask as `input`.
   */
  template <typename IntegerType, std::enable_if_t<std::is_integral_v<IntegerType>>* = nullptr>
  std::unique_ptr<column> operator()(column_view const& input,
                                     rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr) const
  {
    auto const d_column = column_device_view::create(input, stream);

    auto children = cudf::strings::detail::make_strings_children(
      integer_to_hex_fn<IntegerType>{*d_column}, input.size(), stream, mr);

    return make_strings_column(input.size(),
                               std::move(children.first),
                               std::move(children.second),
                               input.null_count(),
                               cudf::detail::copy_bitmask(input, stream, mr),
                               stream,
                               mr);
  }

  // Non-integral types throw an exception.
  // Arguments are taken as forwarding references so that nothing (notably the
  // column_view) is copied just to be discarded; this matches the non-integral
  // overload of dispatch_hex_to_integers_fn.
  template <typename T, typename... Args>
  std::enable_if_t<not std::is_integral_v<T>, std::unique_ptr<column>> operator()(Args&&...) const
  {
    CUDF_FAIL("integers_to_hex only supports integral type columns");
  }
};

} // namespace

// This will convert a strings column into any integer column type.
Expand Down Expand Up @@ -183,6 +263,14 @@ std::unique_ptr<column> is_hex(strings_column_view const& strings,
return results;
}

/**
 * @brief Stream-aware implementation of integers_to_hex.
 *
 * An empty input yields an empty STRING column without dispatching on the
 * input type; any other input is dispatched on its element type to
 * dispatch_integers_to_hex_fn.
 */
std::unique_ptr<column> integers_to_hex(column_view const& input,
                                        rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
{
  if (!input.is_empty()) {
    return type_dispatcher(input.type(), dispatch_integers_to_hex_fn{}, input, stream, mr);
  }
  return cudf::make_empty_column(data_type{type_id::STRING});
}

} // namespace detail

// external API
Expand All @@ -201,5 +289,12 @@ std::unique_ptr<column> is_hex(strings_column_view const& strings,
return detail::is_hex(strings, rmm::cuda_stream_default, mr);
}

// Public entry point: forwards to the detail implementation on the default stream.
std::unique_ptr<column> integers_to_hex(column_view const& input,
                                        rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto const stream = rmm::cuda_stream_default;
  return detail::integers_to_hex(input, stream, mr);
}

} // namespace strings
} // namespace cudf
47 changes: 47 additions & 0 deletions cpp/tests/strings/integers_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -410,3 +410,50 @@ TEST_F(StringsConvertTest, IsHex)
auto results = cudf::strings::is_hex(cudf::strings_column_view(strings));
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

// Verifies integers_to_hex for every integer TypeParam using 255 inputs whose
// only non-zero byte is the most-significant one (counter values 0..254).
TYPED_TEST(StringsIntegerConvertTest, IntegerToHex)
{
  std::vector<TypeParam> h_integers(255);
  // The counter lives in the closure (not a function-local static) so each
  // invocation of the test starts from 0. The shift is performed on an
  // unsigned 64-bit value and then narrowed, because left-shifting a signed
  // value into or past its sign bit is undefined behaviour in C++17.
  std::generate(h_integers.begin(), h_integers.end(), [counter = uint64_t{0}]() mutable {
    return static_cast<TypeParam>(counter++ << ((sizeof(TypeParam) - 1) * 8));
  });

  cudf::test::fixed_width_column_wrapper<TypeParam> integers(h_integers.begin(), h_integers.end());

  std::vector<std::string> h_expected(255);
  std::transform(h_integers.begin(), h_integers.end(), h_expected.begin(), [](auto v) {
    // zero is always rendered as a single byte
    if (v == 0) { return std::string("00"); }
    // special handling for single-byte types: streaming them would print a
    // character rather than a number
    if constexpr (std::is_same_v<TypeParam, int8_t> || std::is_same_v<TypeParam, uint8_t>) {
      char const hex_digits[16] = {
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
      std::string str;
      str += hex_digits[(v & 0xF0) >> 4];
      str += hex_digits[(v & 0x0F)];
      return str;
    }
    // all other types work with this: the top byte is non-zero, so the full
    // zero-padded width is expected
    std::stringstream str;
    str << std::setfill('0') << std::setw(sizeof(TypeParam) * 2) << std::hex << std::uppercase << v;
    return str.str();
  });

  cudf::test::strings_column_wrapper expected(h_expected.begin(), h_expected.end());

  auto results = cudf::strings::integers_to_hex(integers);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

// Checks INT32 conversion including negative values, zero, and a null row
// (the fourth entry), which must stay null in the output.
TEST_F(StringsConvertTest, IntegerToHexWithNull)
{
  cudf::test::strings_column_wrapper expected(
    {"01E240", "FFFFFFFF", "00", "", "0C", "3039", "075BCD15", "F8A432EB"},
    {1, 1, 1, 0, 1, 1, 1, 1});

  cudf::test::fixed_width_column_wrapper<int32_t> input(
    {123456, -1, 0, 0, 12, 12345, 123456789, -123456789}, {1, 1, 1, 0, 1, 1, 1, 1});

  auto const actual = cudf::strings::integers_to_hex(input);
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*actual, expected);
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,6 @@ cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \
cdef unique_ptr[column] is_hex(
column_view source_strings
) except +

# Binding for the C++ integers_to_hex function; `except +` makes Cython
# translate a thrown C++ exception into a Python exception.
cdef unique_ptr[column] integers_to_hex(
column_view input_col) except +
26 changes: 25 additions & 1 deletion python/cudf/cudf/_lib/string_casting.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ from cudf._lib.cpp.strings.convert.convert_integers cimport (
to_integers as cpp_to_integers,
from_integers as cpp_from_integers,
hex_to_integers as cpp_hex_to_integers,
is_hex as cpp_is_hex
is_hex as cpp_is_hex,
integers_to_hex as cpp_integers_to_hex
)
from cudf._lib.cpp.strings.convert.convert_ipv4 cimport (
ipv4_to_integers as cpp_ipv4_to_integers,
Expand Down Expand Up @@ -771,3 +772,26 @@ def is_hex(Column source_strings):
))

return Column.from_unique_ptr(move(c_result))


def itoh(Column input_col):
    """
    Convert an integer column into a column of strings
    holding hexadecimal character digits.

    Parameters
    ----------
    input_col : input column of type integer

    Returns
    -------
    A Column of strings with hexadecimal characters.
    """
    cdef unique_ptr[column] c_hex
    cdef column_view c_view = input_col.view()

    # the conversion is called with the GIL released
    with nogil:
        c_hex = move(cpp_integers_to_hex(c_view))

    return Column.from_unique_ptr(move(c_hex))

0 comments on commit 6728c75

Please sign in to comment.