Skip to content

Commit

Permalink
Decimal support csv reader (#8511)
Browse files Browse the repository at this point in the history
This PR adds support for decimal (fixed-point) types to the libcudf CSV reader.

Specifically, the PR:
1. expands the C++ API to allow passing `std::vector<data_type>` (partially addresses #8240)
2. moves the logic for parsing decimal types to a common location, so that it can be used by both the string-to-decimal conversion and the CSV reader.
3. adds support to the CSV reader to also accept decimal as a target type (partially addresses #7110)

Authors:
  - Elias Stehle (https://github.com/elstehle)

Approvers:
  - Devavret Makkar (https://github.com/devavret)
  - Vukasin Milovanovic (https://github.com/vuule)
  - David Wendt (https://github.com/davidwendt)

URL: #8511
  • Loading branch information
elstehle authored Jun 23, 2021
1 parent 20c807d commit 788bddd
Show file tree
Hide file tree
Showing 7 changed files with 498 additions and 313 deletions.
42 changes: 37 additions & 5 deletions cpp/include/cudf/io/csv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <memory>
#include <string>
#include <unordered_map>
#include <variant>
#include <vector>

namespace cudf {
Expand Down Expand Up @@ -110,7 +111,7 @@ class csv_reader_options {
// Conversion settings

// Per-column types; disables type inference on those columns
std::vector<std::string> _dtypes;
std::variant<std::vector<std::string>, std::vector<data_type>> _dtypes;
// Additional values to recognize as boolean true values
std::vector<std::string> _true_values{"True", "TRUE", "true"};
// Additional values to recognize as boolean false values
Expand Down Expand Up @@ -289,7 +290,10 @@ class csv_reader_options {
/**
* @brief Returns per-column types.
*/
std::vector<std::string> const& get_dtypes() const { return _dtypes; }
std::variant<std::vector<std::string>, std::vector<data_type>> const& get_dtypes() const
{
return _dtypes;
}

/**
* @brief Returns additional values to recognize as boolean true values.
Expand Down Expand Up @@ -559,11 +563,24 @@ class csv_reader_options {
}

/**
* @brief Sets per-column types.
* @brief Sets per-column types
*
* @param types Vector specifying the columns' target data types.
*/
void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }

/**
* @brief Sets per-column types, specified by the type's respective string representation.
*
* @param types Vector of dtypes in which the column needs to be read.
*/
void set_dtypes(std::vector<std::string> types) { _dtypes = std::move(types); }
// Deprecated string-based overload: stores the type names in `_dtypes`.
// The adjacent string literals below are concatenated by the compiler, so the
// first one must end with a space for the message to read correctly.
[[deprecated(
  "The string-based interface will be deprecated. "
  "Use set_dtypes(std::vector<data_type>) instead.")]] void
set_dtypes(std::vector<std::string> types)
{
  _dtypes = std::move(types);
}

/**
* @brief Sets additional values to recognize as boolean true values.
Expand Down Expand Up @@ -965,10 +982,25 @@ class csv_reader_options_builder {
/**
* @brief Sets per-column types.
*
* The vector is taken by value and moved into the `_dtypes` member of the
* options object being built.
*
* @param types Vector of data types in which the column needs to be read.
* @return this for chaining.
*/
csv_reader_options_builder& dtypes(std::vector<data_type> types)
{
options._dtypes = std::move(types);
return *this;
}

/**
* @brief Sets per-column types, specified by the type's respective string representation.
*
* @param types Vector of dtypes in which the column needs to be read.
* @return this for chaining.
*/
csv_reader_options_builder& dtypes(std::vector<std::string> types)
[
[deprecated("The string-based interface will be deprecated."
"Use dtypes(std::vector<data_type>) instead.")]] csv_reader_options_builder&
dtypes(std::vector<std::string> types)
{
options._dtypes = std::move(types);
return *this;
Expand Down
155 changes: 155 additions & 0 deletions cpp/include/cudf/strings/detail/convert/fixed_point.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <thrust/optional.h>
#include <thrust/pair.h>

#include <cstdint>
#include <limits>

namespace cudf {
namespace strings {
namespace detail {

/**
 * @brief Return the integer component of a decimal string.
 *
 * Consumes digits and at most one decimal point, stopping at the first other
 * character (for example the exponent marker 'e') or at `iter_end`.
 * The digits before and after the decimal point are accumulated into a single
 * unsigned integer; the returned exponent offset records the power of ten by
 * which that integer must be scaled to recover the parsed number.
 *
 * Once the accumulator cannot take another digit without possibly overflowing,
 * further digits are dropped: dropped digits before the decimal point increase
 * the exponent offset, dropped fractional digits are simply discarded.
 *
 * @param[in,out] iter Start of characters to parse; on return it points at the
 *                     first character that was not consumed
 * @param[in] iter_end End of characters to parse
 * @param[in] decimal_pt_char Character to recognize as the decimal point
 * @return Integer component and exponent offset.
 */
__device__ inline thrust::pair<uint64_t, int32_t> parse_integer(char const*& iter,
                                                                char const* iter_end,
                                                                const char decimal_pt_char = '.')
{
  // highest value where another decimal digit cannot be appended without an overflow;
  // this preserves the most digits when scaling the final result
  constexpr uint64_t decimal_max = (std::numeric_limits<uint64_t>::max() - 9L) / 10L;

  uint64_t digits      = 0;
  int32_t offset       = 0;
  bool seen_decimal_pt = false;

  for (; iter < iter_end; ++iter) {
    char const c = *iter;

    // only the first decimal point is consumed; a second one ends the number
    if (c == decimal_pt_char && !seen_decimal_pt) {
      seen_decimal_pt = true;
      continue;
    }

    // leave `iter` on the first character that is not part of the number
    if (c < '0' || c > '9') { break; }

    if (digits <= decimal_max) {
      digits = (digits * 10) + static_cast<uint64_t>(c - '0');
      // every accepted fractional digit shifts the value one decimal place
      if (seen_decimal_pt) { --offset; }
    } else if (!seen_decimal_pt) {
      // accumulator is full: a dropped integer digit is worth one power of ten
      ++offset;
    }
  }

  return {digits, offset};
}

/**
 * @brief Return the exponent of a decimal string.
 *
 * This should only be called after the exponent 'e' notation was detected.
 * An optional leading '+' or '-' is accepted, followed by decimal digits.
 * An empty range, or a sign with no digits, yields an exponent of 0.
 *
 * If `check_only == true`, the result is empty when a non-digit character is
 * found or when the exponent magnitude does not fit in an int32.
 * If `check_only == false`, parsing stops at the first non-digit character and
 * an exponent magnitude that does not fit in an int32 is saturated to the
 * int32 maximum, so a value is always returned.
 *
 * @tparam check_only Set to true to verify the characters are valid and the
 *         exponent value in the decimal string does not overflow int32
 * @param[in] iter Start of characters to parse
 *            (points to the character after the 'E' or 'e')
 * @param[in] iter_end End of characters to parse
 * @return Integer value of the exponent
 */
template <bool check_only = false>
__device__ thrust::optional<int32_t> parse_exponent(char const* iter, char const* iter_end)
{
  constexpr uint32_t exponent_max = static_cast<uint32_t>(std::numeric_limits<int32_t>::max());

  // get optional exponent sign; never read past the end of the range
  int32_t exp_sign = 1;
  if ((iter < iter_end) && (*iter == '-' || *iter == '+')) {
    exp_sign = (*iter == '-') ? -1 : 1;
    ++iter;
  }

  // parse exponent integer; the magnitude is kept unsigned and is always <= exponent_max
  uint32_t exp_ten = 0;
  while (iter < iter_end) {
    auto const ch = *iter++;
    if (ch < '0' || ch > '9') {
      if (check_only) { return thrust::nullopt; }
      break;
    }

    // 64-bit arithmetic: exp_ten <= INT32_MAX, so this can neither overflow nor wrap,
    // which makes the range check below reliable
    uint64_t const exp_check =
      (static_cast<uint64_t>(exp_ten) * 10) + static_cast<uint64_t>(ch - '0');
    if (exp_check > exponent_max) {
      if (check_only) { return thrust::nullopt; }  // report overflow
      exp_ten = exponent_max;                      // saturate instead of wrapping
      continue;
    }
    exp_ten = static_cast<uint32_t>(exp_check);
  }

  return static_cast<int32_t>(exp_ten) * exp_sign;
}

/**
 * @brief Converts the string in the range [iter, iter_end) into a decimal.
 *
 * The string may start with an optional '+' or '-' sign, followed by digits
 * with an optional '.' decimal point, followed by an optional exponent
 * introduced by 'e' or 'E'. Parsing stops at the first character that does not
 * fit this pattern.
 *
 * The parsed number is shifted by powers of ten so that the returned integer
 * represents it at the requested scale. Digits that fall below the scale are
 * truncated. The function has no error channel: if the shifted value does not
 * fit, the result wraps around (unsigned 64-bit arithmetic, then conversion to
 * `DecimalType`).
 *
 * @tparam DecimalType The decimal type to be returned
 * @param iter The beginning of the string. Unless iter >= iter_end, iter is dereferenced
 * @param iter_end The end of the characters to parse
 * @param scale The scale to be applied
 * @return The parsed value at the given scale; 0 if no non-zero digits were parsed
 */
template <typename DecimalType>
__device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int32_t scale)
{
  auto const sign = [&] {
    if (iter_end <= iter) { return 0; }
    if (*iter == '-') { return -1; }
    if (*iter == '+') { return 1; }
    return 0;
  }();

  // if string begins with a sign, continue with next character
  if (sign != 0) ++iter;

  auto [value, exp_offset] = parse_integer(iter, iter_end);
  if (value == 0) { return DecimalType{0}; }

  // check for exponent
  int32_t exp_ten = 0;
  if ((iter < iter_end) && (*iter == 'e' || *iter == 'E')) {
    ++iter;
    if (iter < iter_end) { exp_ten = parse_exponent<false>(iter, iter_end).value(); }
  }

  // number of decimal places to shift by; computed in 64 bits so that combining
  // the exponent, the parse offset and the scale cannot overflow
  int64_t const shift     = static_cast<int64_t>(exp_ten) + exp_offset - scale;
  int64_t const magnitude = shift < 0 ? -shift : shift;

  // exact integer power of ten (wraps modulo 2^64 once n exceeds 19);
  // avoids the rounding and out-of-range conversion issues of a floating-point exp10
  auto const pow10 = [](int64_t n) {
    uint64_t result = 1;
    for (int64_t i = 0; i < n; ++i) { result *= 10; }
    return result;
  };

  // largest n for which 10^n is representable in a uint64_t
  constexpr int64_t max_pow10_exp = std::numeric_limits<uint64_t>::digits10;

  if (shift < 0) {
    // value < 2^64 < 10^(max_pow10_exp + 1), so a larger down-shift always truncates to zero
    if (magnitude > max_pow10_exp) { return DecimalType{0}; }
    value /= pow10(magnitude);
  } else {
    // 10^64 is a multiple of 2^64, so any up-shift of 64 or more places wraps to the
    // same result (zero); capping keeps the loop bounded for huge exponents
    constexpr int64_t wrap_to_zero_exp = 64;
    value *= pow10(magnitude < wrap_to_zero_exp ? magnitude : wrap_to_zero_exp);
  }

  return static_cast<DecimalType>(value) * (sign == 0 ? 1 : sign);
}
} // namespace detail
} // namespace strings
} // namespace cudf
61 changes: 35 additions & 26 deletions cpp/src/io/csv/csv_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/lists/list_view.cuh>
#include <cudf/null_mask.hpp>
#include <cudf/strings/detail/convert/fixed_point.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/structs/struct_view.hpp>
#include <cudf/utilities/bit.hpp>
Expand Down Expand Up @@ -410,26 +411,6 @@ __inline__ __device__ cudf::list_view decode_value(char const *begin,
return cudf::list_view{};
}

// The purpose of this is merely to allow compilation ONLY
// TODO : make this work for csv
template <>
__inline__ __device__ numeric::decimal32 decode_value(char const *begin,
char const *end,
parse_options_view const &opts)
{
return numeric::decimal32{};
}

// The purpose of this is merely to allow compilation ONLY
// TODO : make this work for csv
template <>
__inline__ __device__ numeric::decimal64 decode_value(char const *begin,
char const *end,
parse_options_view const &opts)
{
return numeric::decimal64{};
}

// The purpose of this is merely to allow compilation ONLY
// TODO : make this work for csv
template <>
Expand All @@ -452,10 +433,11 @@ struct decode_op {
* @return bool Whether the parsed value is valid.
*/
template <typename T,
typename std::enable_if_t<std::is_integral<T>::value and !std::is_same<T, bool>::value>
* = nullptr>
typename std::enable_if_t<std::is_integral_v<T> and !std::is_same_v<T, bool> and
!cudf::is_fixed_point<T>()> * = nullptr>
__host__ __device__ __forceinline__ bool operator()(void *out_buffer,
size_t row,
const data_type,
char const *begin,
char const *end,
parse_options_view const &opts,
Expand All @@ -473,12 +455,36 @@ struct decode_op {
return true;
}

/**
 * @brief Dispatch for fixed point types.
 *
 * Parses the field [begin, end) as a decimal at the scale of `output_type` and
 * stores the result, in the type's device storage representation, at index
 * `row` of `out_buffer`.
 *
 * NOTE(review): `opts` and `flags` are not consulted here, and `parse_decimal`
 * is called without a decimal-point argument - confirm whether the parse
 * options should influence decimal parsing.
 *
 * @return bool Whether the parsed value is valid (always true for this overload).
 */
template <typename T, typename std::enable_if_t<cudf::is_fixed_point<T>()> * = nullptr>
__host__ __device__ __forceinline__ bool operator()(void *out_buffer,
                                                    size_t row,
                                                    const data_type output_type,
                                                    char const *begin,
                                                    char const *end,
                                                    parse_options_view const &opts,
                                                    column_parse::flags flags)
{
  using storage_type = device_storage_type_t<T>;

  auto *const typed_out = static_cast<storage_type *>(out_buffer);
  typed_out[row] = strings::detail::parse_decimal<storage_type>(begin, end, output_type.scale());

  return true;
}

/**
* @brief Dispatch for boolean type types.
*/
template <typename T, typename std::enable_if_t<std::is_same<T, bool>::value> * = nullptr>
template <typename T, typename std::enable_if_t<std::is_same_v<T, bool>> * = nullptr>
__host__ __device__ __forceinline__ bool operator()(void *out_buffer,
size_t row,
const data_type,
char const *begin,
char const *end,
parse_options_view const &opts,
Expand All @@ -499,9 +505,10 @@ struct decode_op {
* @brief Dispatch for floating points, which are set to NaN if the input
* is not valid. In such case, the validity mask is set to zero too.
*/
template <typename T, typename std::enable_if_t<std::is_floating_point<T>::value> * = nullptr>
template <typename T, typename std::enable_if_t<std::is_floating_point_v<T>> * = nullptr>
__host__ __device__ __forceinline__ bool operator()(void *out_buffer,
size_t row,
const data_type,
char const *begin,
char const *end,
parse_options_view const &opts,
Expand All @@ -517,10 +524,11 @@ struct decode_op {
* @brief Dispatch for all other types.
*/
template <typename T,
typename std::enable_if_t<!std::is_integral<T>::value and
!std::is_floating_point<T>::value> * = nullptr>
typename std::enable_if_t<!std::is_integral_v<T> and !std::is_floating_point_v<T> and
!cudf::is_fixed_point<T>()> * = nullptr>
__host__ __device__ __forceinline__ bool operator()(void *out_buffer,
size_t row,
const data_type,
char const *begin,
char const *end,
parse_options_view const &opts,
Expand Down Expand Up @@ -605,6 +613,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
decode_op{},
columns[actual_col],
rec_id,
dtypes[actual_col],
field_start,
field_end,
options,
Expand Down
Loading

0 comments on commit 788bddd

Please sign in to comment.