Skip to content

Commit

Permalink
Add is_integer API that can check for the validity of a string-to-int…
Browse files Browse the repository at this point in the history
…eger conversion (#7642)

This PR addresses #5110 and #7080, and reworks #7094. It adds the function `cudf::strings::is_integer`, which checks whether strings can be correctly converted into integer values of a given type. Underflow and overflow are also taken into account.

Note that this `cudf::strings::is_integer` is different from the existing `cudf::strings::string::is_integer`, which only checks the character pattern and does not check for underflow or overflow.

Examples:
```
s = { "eee", "-200", "-100", "127", "128", "1.5", NULL}

is_integer(s, INT8) = { 0, 0, 1, 1, 0, 0, NULL}
is_integer(s, INT32) = { 0, 1, 1, 1, 1, 0, NULL}
```

Authors:
  - Nghia Truong (@ttnghia)

Approvers:
  - David (@davidwendt)
  - Jake Hemstad (@jrhemstad)
  - Mark Harris (@harrism)

URL: #7642
  • Loading branch information
ttnghia authored Mar 24, 2021
1 parent 1e9f8f8 commit de55832
Show file tree
Hide file tree
Showing 3 changed files with 383 additions and 69 deletions.
43 changes: 39 additions & 4 deletions cpp/include/cudf/strings/convert/convert_integers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,10 @@ std::unique_ptr<column> from_integers(
* characters are valid for conversion to integers.
*
* The output row entry will be set to `true` if the corresponding string element
* has at least one character in [-+0-9].
* has all characters in [-+0-9]. The optional sign character must only be in the first
* position. Notice that the integer value is not checked to be within its storage limits.
* For a strict integer type check, use the other `is_integer()` API, which accepts a `data_type`
* argument.
*
* @code{.pseudo}
* Example:
Expand All @@ -89,12 +92,44 @@ std::unique_ptr<column> from_integers(
*
* Any null row results in a null entry for that row in the output column.
*
* @param strings Strings instance for this operation.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of boolean results for each string.
* @param strings Strings instance for this operation.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of boolean results for each string.
*/
std::unique_ptr<column> is_integer(
strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Returns a boolean column identifying strings in which all
* characters are valid for conversion to integers of the given type.
*
* The output row entry will be set to `true` if the corresponding string element
* has all characters in [-+0-9]. The optional sign character must only be in the first
* position. Also, the integer component must fit within the size limits of the underlying
* storage type, which is provided by the int_type parameter.
*
* An empty string, or a string that holds only a sign character, yields `false`.
* A string with a leading '-' yields `false` when `int_type` is an unsigned type.
*
* @code{.pseudo}
* Example:
* s = ['123456', '-456', '', 'A', '+7']
*
* output1 = s.is_integer(s, data_type{type_id::INT32})
* output1 is [true, true, false, false, true]
*
* output2 = s.is_integer(s, data_type{type_id::INT8})
* output2 is [false, false, false, false, true]
* @endcode
*
* Any null row results in a null entry for that row in the output column.
*
* If `strings` is not empty and `int_type` is not an integral type, the call fails with
* the message "is_integer is expecting an integer type".
*
* @param strings Strings instance for this operation.
* @param int_type Integer type used for checking underflow and overflow.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of boolean results for each string.
*/
std::unique_ptr<column> is_integer(
strings_column_view const& strings,
data_type int_type,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down
212 changes: 165 additions & 47 deletions cpp/src/strings/convert/convert_integers.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
#include <cudf/strings/string.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>
#include <strings/convert/utilities.cuh>
#include <strings/utilities.cuh>
Expand All @@ -38,6 +37,160 @@

namespace cudf {
namespace strings {

namespace detail {
namespace {

/**
 * @brief Checks whether a string is a valid integer that fits within the bounds of
 * `IntegerType`.
 *
 * The functor receives a (string, validity) pair and returns `true` only if:
 *  - the validity flag is set and the string is not empty,
 *  - the string is an optional single leading sign ('+' or '-') followed by at least one
 *    character, all of which are ASCII digits,
 *  - a leading '-' is not used with an unsigned `IntegerType`, and
 *  - the parsed value lies in
 *    [`std::numeric_limits<IntegerType>::min()`, `std::numeric_limits<IntegerType>::max()`].
 *
 * The value is accumulated in the direction of its sign (negative inputs are built up as
 * negative numbers) and the bound is tested *before* every multiply/add step. This keeps the
 * running value representable at all times, including for the minimum value whose magnitude
 * is one larger than the maximum, so inputs such as "-1280" for an 8-bit type are rejected
 * instead of slipping through after a wrapped intermediate value.
 *
 * @tparam IntegerType Integral type whose storage limits are checked.
 */
template <typename IntegerType>
struct string_to_integer_check_fn {
  /**
   * @param p Pair of the string to check and a flag telling whether the row is valid.
   * @return `true` if the string can be converted to `IntegerType` without underflow or
   *         overflow, `false` otherwise (including for invalid rows and empty strings).
   */
  __device__ bool operator()(thrust::pair<string_view, bool> const& p) const
  {
    if (!p.second || p.first.empty()) { return false; }

    auto const d_str       = p.first.data();
    bool const is_negative = d_str[0] == '-';
    if (is_negative && std::is_unsigned<IntegerType>::value) { return false; }

    // Skip the (single) optional sign character.
    auto iter           = d_str + static_cast<int>(is_negative || d_str[0] == '+');
    auto const iter_end = d_str + p.first.size_bytes();
    if (iter == iter_end) { return false; }  // a sign with no digits is not an integer

    auto const ten     = static_cast<IntegerType>(10);
    auto const max_val = std::numeric_limits<IntegerType>::max();
    auto const min_val = std::numeric_limits<IntegerType>::min();

    IntegerType value = 0;  // signed running value; never leaves [min_val, max_val]
    while (iter != iter_end) {
      auto const chr = *iter++;
      // Check for valid character
      if (chr < '0' || chr > '9') { return false; }

      auto const digit = static_cast<IntegerType>(chr - '0');
      if (is_negative) {
        // Underflow check: value * 10 - digit >= min  <=>  value >= (min + digit) / 10,
        // because integer division truncates toward zero (rounds up for negatives).
        if (value < (min_val + digit) / ten) { return false; }
        value = static_cast<IntegerType>(value * ten - digit);
      } else {
        // Overflow check: value * 10 + digit <= max  <=>  value <= (max - digit) / 10.
        if (value > (max_val - digit) / ten) { return false; }
        value = static_cast<IntegerType>(value * ten + digit);
      }
    }

    return true;
  }
};

/**
* @brief The dispatch functions for checking if strings are valid integers.
*/
struct dispatch_is_integer_fn {
template <typename T, std::enable_if_t<std::is_integral<T>::value>* = nullptr>
std::unique_ptr<column> operator()(strings_column_view const& strings,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr) const
{
auto const d_column = column_device_view::create(strings.parent(), stream);
auto results = make_numeric_column(data_type{type_id::BOOL8},
strings.size(),
cudf::detail::copy_bitmask(strings.parent(), stream, mr),
strings.null_count(),
stream,
mr);

auto d_results = results->mutable_view().data<bool>();
if (strings.has_nulls()) {
thrust::transform(rmm::exec_policy(stream),
d_column->pair_begin<string_view, true>(),
d_column->pair_end<string_view, true>(),
d_results,
string_to_integer_check_fn<T>{});
} else {
thrust::transform(rmm::exec_policy(stream),
d_column->pair_begin<string_view, false>(),
d_column->pair_end<string_view, false>(),
d_results,
string_to_integer_check_fn<T>{});
}

// Calling mutable_view() on a column invalidates it's null count so we need to set it back
results->set_null_count(strings.null_count());

return results;
}

template <typename T, std::enable_if_t<not std::is_integral<T>::value>* = nullptr>
std::unique_ptr<column> operator()(strings_column_view const&,
rmm::cuda_stream_view,
rmm::mr::device_memory_resource*) const
{
CUDF_FAIL("is_integer is expecting an integer type");
}
};

} // namespace

/**
 * @brief Pattern-only integer check: marks each row by the result of `string::is_integer`.
 *
 * Rows whose validity flag is unset produce `false`; the input's null mask is copied to the
 * output column. No storage-limit (underflow/overflow) check is performed here.
 *
 * @param strings Strings column to check.
 * @param stream  CUDA stream used for the device work.
 * @param mr      Device memory resource used to allocate the returned column.
 * @return BOOL8 column with one result per input row.
 */
std::unique_ptr<column> is_integer(
  strings_column_view const& strings,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
  auto const input_dv = column_device_view::create(strings.parent(), stream);
  auto output         = make_numeric_column(data_type{type_id::BOOL8},
                                    strings.size(),
                                    cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                                    strings.null_count(),
                                    stream,
                                    mr);

  auto out_begin = output->mutable_view().data<bool>();

  // Shared per-row predicate: an invalid row is never an integer.
  auto const matches_pattern = [] __device__(auto const& p) {
    return p.second && string::is_integer(p.first);
  };

  // The pair iterator is instantiated differently depending on whether the input has nulls.
  if (!strings.has_nulls()) {
    thrust::transform(rmm::exec_policy(stream),
                      input_dv->pair_begin<string_view, false>(),
                      input_dv->pair_end<string_view, false>(),
                      out_begin,
                      matches_pattern);
  } else {
    thrust::transform(rmm::exec_policy(stream),
                      input_dv->pair_begin<string_view, true>(),
                      input_dv->pair_end<string_view, true>(),
                      out_begin,
                      matches_pattern);
  }

  // mutable_view() on a column invalidates its null count, so restore it here
  output->set_null_count(strings.null_count());

  return output;
}

/**
 * @brief Bounded integer check: dispatches on `int_type` to validate each string against
 * the limits of that type.
 *
 * An empty input short-circuits to an empty BOOL8 column without dispatching.
 *
 * @param strings  Strings column to check.
 * @param int_type Integer type used for checking underflow and overflow.
 * @param stream   CUDA stream used for the device work.
 * @param mr       Device memory resource used to allocate the returned column.
 * @return BOOL8 column with one result per input row.
 */
std::unique_ptr<column> is_integer(
  strings_column_view const& strings,
  data_type int_type,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
  if (!strings.is_empty()) {
    return type_dispatcher(int_type, dispatch_is_integer_fn{}, strings, stream, mr);
  }
  return cudf::make_empty_column(data_type{type_id::BOOL8});
}

} // namespace detail

// external APIs
// Public pattern-only overload: invokes the CUDF_FUNC_RANGE() macro, then forwards to
// detail::is_integer(strings, stream, mr) using rmm::cuda_stream_default as the stream.
std::unique_ptr<column> is_integer(strings_column_view const& strings,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::is_integer(strings, rmm::cuda_stream_default, mr);
}

// Public bounded overload: invokes the CUDF_FUNC_RANGE() macro, then forwards to
// detail::is_integer(strings, int_type, stream, mr) using rmm::cuda_stream_default as the
// stream. `int_type` selects the integer type whose limits the strings are checked against.
std::unique_ptr<column> is_integer(strings_column_view const& strings,
data_type int_type,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::is_integer(strings, int_type, rmm::cuda_stream_default, mr);
}

namespace detail {
namespace {
/**
Expand Down Expand Up @@ -69,11 +222,10 @@ struct dispatch_to_integers_fn {
mutable_column_view& output_column,
rmm::cuda_stream_view stream) const
{
auto d_results = output_column.data<IntegerType>();
thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(strings_column.size()),
d_results,
output_column.data<IntegerType>(),
string_to_integer_fn<IntegerType>{strings_column});
}
// non-integral types throw an exception
Expand Down Expand Up @@ -102,19 +254,22 @@ std::unique_ptr<column> to_integers(strings_column_view const& strings,
{
size_type strings_count = strings.size();
if (strings_count == 0) return make_numeric_column(output_type, 0);
auto strings_column = column_device_view::create(strings.parent(), stream);
auto d_strings = *strings_column;
// create integer output column copying the strings null-mask
auto results = make_numeric_column(output_type,

// Create integer output column copying the strings null-mask
auto results = make_numeric_column(output_type,
strings_count,
cudf::detail::copy_bitmask(strings.parent(), stream, mr),
strings.null_count(),
stream,
mr);
auto results_view = results->mutable_view();
// fill output column with integers
type_dispatcher(output_type, dispatch_to_integers_fn{}, d_strings, results_view, stream);
// Fill output column with integers
auto const strings_dev_view = column_device_view::create(strings.parent(), stream);
auto results_view = results->mutable_view();
type_dispatcher(output_type, dispatch_to_integers_fn{}, *strings_dev_view, results_view, stream);

// Calling mutable_view() on a column invalidates its null count, so we need to set it back
results->set_null_count(strings.null_count());

return results;
}

Expand Down Expand Up @@ -253,42 +408,5 @@ std::unique_ptr<column> from_integers(column_view const& integers,
return detail::from_integers(integers, rmm::cuda_stream_default, mr);
}

namespace detail {
/**
 * @brief Pattern-only integer check driven by row index.
 *
 * Each output row is `false` for a null input row, otherwise the result of
 * `string::is_integer` on that row's string. The input's null mask is copied to the output.
 *
 * @param strings Strings column to check.
 * @param stream  CUDA stream used for the device work.
 * @param mr      Device memory resource used to allocate the returned column.
 * @return BOOL8 column with one result per input row.
 */
std::unique_ptr<column> is_integer(
  strings_column_view const& strings,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
  auto const row_count = strings.size();
  auto const owner     = column_device_view::create(strings.parent(), stream);
  auto const d_strings = *owner;  // device view copied into the lambda below

  // Output column shares the input's null mask and null count
  auto output = make_numeric_column(data_type{type_id::BOOL8},
                                    row_count,
                                    cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                                    strings.null_count(),
                                    stream,
                                    mr);
  auto out_begin = output->mutable_view().data<bool>();

  thrust::transform(rmm::exec_policy(stream),
                    thrust::make_counting_iterator<size_type>(0),
                    thrust::make_counting_iterator<size_type>(row_count),
                    out_begin,
                    [d_strings] __device__(size_type row) -> bool {
                      return !d_strings.is_null(row) &&
                             string::is_integer(d_strings.element<string_view>(row));
                    });

  // mutable_view() was called above, so set the null count again
  output->set_null_count(strings.null_count());
  return output;
}
} // namespace detail

// external API
// Public pattern-only overload: invokes the CUDF_FUNC_RANGE() macro, then forwards to
// detail::is_integer(strings, stream, mr) using rmm::cuda_stream_default as the stream.
std::unique_ptr<column> is_integer(strings_column_view const& strings,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::is_integer(strings, rmm::cuda_stream_default, mr);
}

} // namespace strings
} // namespace cudf
Loading

0 comments on commit de55832

Please sign in to comment.