Decimal support csv reader (#8511)

This PR adds support for decimal types to the CSV reader in libcudf. Specifically, the PR: 1. expands the C++ API to allow passing `std::vector<data_type>` (partially addresses #8240); 2. moves the logic for parsing decimal types to common ground, so that it can be used for both the string_to_decimal conversion and the CSV reader; 3. adds support to the CSV reader for accepting decimal as a target type (partially addresses #7110). Authors: - Elias Stehle (https://github.com/elstehle) Approvers: - Devavret Makkar (https://github.com/devavret) - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) URL: #8511
rapidsai · Jun 23, 2021 · 788bddd · 788bddd
1 parent 20c807d
commit 788bddd
Show file tree

Hide file tree

Showing 7 changed files with 498 additions and 313 deletions.
diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
@@ -24,6 +24,7 @@
 #include <memory>
 #include <string>
 #include <unordered_map>
+#include <variant>
 #include <vector>
 
 namespace cudf {
@@ -110,7 +111,7 @@ class csv_reader_options {
   // Conversion settings
 
   // Per-column types; disables type inference on those columns
-  std::vector<std::string> _dtypes;
+  std::variant<std::vector<std::string>, std::vector<data_type>> _dtypes;
   // Additional values to recognize as boolean true values
   std::vector<std::string> _true_values{"True", "TRUE", "true"};
   // Additional values to recognize as boolean false values
@@ -289,7 +290,10 @@ class csv_reader_options {
   /**
    * @brief Returns per-column types.
    */
-  std::vector<std::string> const& get_dtypes() const { return _dtypes; }
+  std::variant<std::vector<std::string>, std::vector<data_type>> const& get_dtypes() const
+  {
+    // The active alternative depends on which `set_dtypes` overload stored the value:
+    // type names as strings, or `data_type` objects.
+    return _dtypes;
+  }
 
   /**
    * @brief Returns additional values to recognize as boolean true values.
@@ -559,11 +563,24 @@ class csv_reader_options {
   }
 
   /**
-   * @brief Sets per-column types.
+   * @brief Sets per-column types
+   *
+   * @param types Vector specifying the columns' target data types.
+   */
+  void set_dtypes(std::vector<data_type> types)
+  {
+    // `types` is taken by value, so moving it avoids a second copy of the vector
+    _dtypes = std::move(types);
+  }
+
+  /**
+   * @brief Sets per-column types, specified by the type's respective string representation.
    *
    * @param types Vector of dtypes in which the column needs to be read.
    */
-  void set_dtypes(std::vector<std::string> types) { _dtypes = std::move(types); }
+  [[deprecated(
+    "The string-based interface will be deprecated. "
+    "Use set_dtypes(std::vector<data_type>) instead.")]] void
+  set_dtypes(std::vector<std::string> types)
+  {
+    // Kept for backward compatibility: stores the type names as strings in `_dtypes`.
+    _dtypes = std::move(types);
+  }
 
   /**
    * @brief Sets additional values to recognize as boolean true values.
@@ -965,10 +982,25 @@ class csv_reader_options_builder {
   /**
    * @brief Sets per-column types.
    *
+   * @param types Vector of data types in which the column needs to be read.
+   * @return this for chaining.
+   */
+  csv_reader_options_builder& dtypes(std::vector<data_type> types)
+  {
+    // `types` is taken by value; move it into the options object being built
+    options._dtypes = std::move(types);
+    // return the builder itself so that further setters can be chained
+    return *this;
+  }
+
+  /**
+   * @brief Sets per-column types, specified by the type's respective string representation.
+   *
    * @param types Vector of dtypes in which the column needs to be read.
    * @return this for chaining.
    */
-  csv_reader_options_builder& dtypes(std::vector<std::string> types)
+  [
+    [deprecated("The string-based interface will be deprecated."
+                "Use dtypes(std::vector<data_type>) instead.")]] csv_reader_options_builder&
+  dtypes(std::vector<std::string> types)
   {
     options._dtypes = std::move(types);
     return *this;

diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/optional.h>
+#include <thrust/pair.h>
+
+#include <cstdint>
+#include <limits>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+
+/**
+ * @brief Parse the digits of a decimal string up to (but not including) any exponent part.
+ *
+ * Consumes digits and at most one decimal-point character, accumulating the digits
+ * into an unsigned integer. Parsing stops at the first character that is neither a
+ * digit nor the first decimal point; `iter` is left pointing at that character, or
+ * at `iter_end` if the whole range was consumed.
+ *
+ * Once the accumulator can no longer be guaranteed to take another digit, further
+ * digits are dropped: each dropped integer digit increments the returned exponent
+ * offset, dropped fractional digits are ignored.
+ *
+ * @param[in,out] iter Start of characters to parse; advanced past the consumed characters
+ * @param[in] iter_end End of characters to parse
+ * @param[in] decimal_pt_char Character that is recognized as the decimal point
+ * @return Accumulated digits and the base-10 exponent offset that applies to them
+ */
+__device__ inline thrust::pair<uint64_t, int32_t> parse_integer(char const*& iter,
+                                                                char const* iter_end,
+                                                                const char decimal_pt_char = '.')
+{
+  // largest accumulator value to which one more decimal digit can always be appended;
+  // keeping as many digits as possible preserves precision when the result is rescaled
+  constexpr uint64_t decimal_max = (std::numeric_limits<uint64_t>::max() - 9L) / 10L;
+
+  uint64_t digits      = 0;
+  int32_t exp_offset   = 0;
+  bool seen_decimal_pt = false;
+
+  for (; iter < iter_end; ++iter) {
+    char const ch = *iter;
+
+    // only the first decimal point is consumed; a second one ends the number below
+    if (!seen_decimal_pt && ch == decimal_pt_char) {
+      seen_decimal_pt = true;
+      continue;
+    }
+
+    // stop at the first non-digit and leave `iter` pointing at it
+    if (ch < '0' || ch > '9') { break; }
+
+    if (digits <= decimal_max) {
+      digits = (digits * 10) + static_cast<uint64_t>(ch - '0');
+      if (seen_decimal_pt) { --exp_offset; }
+    } else if (!seen_decimal_pt) {
+      // digit dropped from the integer part: account for it in the exponent
+      ++exp_offset;
+    }
+  }
+  return {digits, exp_offset};
+}
+
+/**
+ * @brief Return the exponent of a decimal string.
+ *
+ * This should only be called after the exponent 'e' notation was detected.
+ * An optional leading '+' or '-' is accepted, followed by decimal digits.
+ *
+ * If `check_only == true`, an empty optional is returned when a non-digit
+ * character is found or when the exponent magnitude exceeds the int32 maximum.
+ * If `check_only == false`, parsing stops at the first non-digit character and
+ * an exponent magnitude that exceeds the int32 maximum is saturated to it, so
+ * a value is always returned.
+ *
+ * @tparam check_only Set to true to verify the characters are valid and the
+ *         exponent value in the decimal string does not overflow int32
+ * @param[in] iter Start of characters to parse
+ *                 (points to the character after the 'E' or 'e')
+ * @param[in] iter_end End of characters to parse
+ * @return Integer value of the exponent (0 if the range contains no digits)
+ */
+template <bool check_only = false>
+__device__ thrust::optional<int32_t> parse_exponent(char const* iter, char const* iter_end)
+{
+  constexpr uint64_t exponent_max = static_cast<uint64_t>(std::numeric_limits<int32_t>::max());
+
+  // get optional exponent sign; guard the dereference so an empty range is never read
+  int32_t exp_sign = 1;
+  if ((iter < iter_end) && (*iter == '-' || *iter == '+')) {
+    if (*iter == '-') { exp_sign = -1; }
+    ++iter;
+  }
+
+  // parse exponent integer
+  int32_t exp_ten = 0;
+  while (iter < iter_end) {
+    auto const ch = *iter++;
+    if (ch < '0' || ch > '9') {
+      if (check_only) { return thrust::nullopt; }
+      break;
+    }
+
+    // accumulate in 64 bits: exp_ten never exceeds the int32 maximum, so the
+    // multiplication and addition below can neither overflow nor wrap around
+    uint64_t const exp_check =
+      static_cast<uint64_t>(exp_ten) * 10 + static_cast<uint64_t>(ch - '0');
+    if (exp_check > exponent_max) {
+      if (check_only) { return thrust::nullopt; }  // report overflow
+      // not validating: saturate instead of wrapping and keep consuming digits
+      exp_ten = std::numeric_limits<int32_t>::max();
+      continue;
+    }
+    exp_ten = static_cast<int32_t>(exp_check);
+  }
+
+  return exp_ten * exp_sign;
+}
+
+/**
+ * @brief Converts the string in the range [iter, iter_end) into a decimal.
+ *
+ * The string may start with a '+' or '-' sign, followed by digits with an optional
+ * decimal point and an optional exponent introduced by 'e' or 'E'. The parsed digits
+ * are shifted by powers of ten so that the returned integer is expressed in units of
+ * 10^scale; digits below that unit are dropped by integer division.
+ *
+ * @tparam DecimalType The decimal type to be returned
+ * @param iter The beginning of the string. Unless iter >= iter_end, iter is dereferenced
+ * @param iter_end The end of the characters to parse
+ * @param scale The scale to be applied
+ * @return The parsed value for the given scale; zero if no non-zero digits were parsed
+ */
+template <typename DecimalType>
+__device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int32_t scale)
+{
+  auto const sign = [&] {
+    if (iter_end <= iter) { return 0; }
+    if (*iter == '-') { return -1; }
+    if (*iter == '+') { return 1; }
+    return 0;
+  }();
+
+  // if string begins with a sign, continue with next character
+  if (sign != 0) ++iter;
+
+  auto [value, exp_offset] = parse_integer(iter, iter_end);
+  if (value == 0) { return DecimalType{0}; }
+
+  // check for exponent
+  int32_t exp_ten = 0;
+  if ((iter < iter_end) && (*iter == 'e' || *iter == 'E')) {
+    ++iter;
+    if (iter < iter_end) { exp_ten = parse_exponent<false>(iter, iter_end).value(); }
+  }
+
+  // net power of ten to apply to `value`; computed in 64 bits so that extreme
+  // exponents or scales cannot overflow the int32 arithmetic
+  int64_t const shift = static_cast<int64_t>(exp_ten) + exp_offset - scale;
+
+  // 10^19 is the largest power of ten that is representable in uint64_t
+  constexpr int64_t max_pow10_exponent = 19;
+
+  // shift the output value based on the exp_ten and the scale values
+  if (shift < 0) {
+    // dividing by 10^20 or more always yields zero for a 64-bit value; handle it
+    // explicitly instead of converting an out-of-range double to an integer
+    value = (-shift > max_pow10_exponent)
+              ? 0
+              : value / static_cast<uint64_t>(exp10(static_cast<double>(-shift)));
+  } else {
+    // NOTE(review): a result that does not fit the storage type is not detected here
+    value = value * static_cast<uint64_t>(exp10(static_cast<double>(shift)));
+  }
+
+  return static_cast<DecimalType>(value) * (sign == 0 ? 1 : sign);
+}
+}  // namespace detail
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu
@@ -25,6 +25,7 @@
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/lists/list_view.cuh>
 #include <cudf/null_mask.hpp>
+#include <cudf/strings/detail/convert/fixed_point.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/structs/struct_view.hpp>
 #include <cudf/utilities/bit.hpp>
@@ -410,26 +411,6 @@ __inline__ __device__ cudf::list_view decode_value(char const *begin,
   return cudf::list_view{};
 }
 
-// The purpose of this is merely to allow compilation ONLY
-// TODO : make this work for csv
-template <>
-__inline__ __device__ numeric::decimal32 decode_value(char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts)
-{
-  return numeric::decimal32{};
-}
-
-// The purpose of this is merely to allow compilation ONLY
-// TODO : make this work for csv
-template <>
-__inline__ __device__ numeric::decimal64 decode_value(char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts)
-{
-  return numeric::decimal64{};
-}
-
 // The purpose of this is merely to allow compilation ONLY
 // TODO : make this work for csv
 template <>
@@ -452,10 +433,11 @@ struct decode_op {
    * @return bool Whether the parsed value is valid.
    */
   template <typename T,
-            typename std::enable_if_t<std::is_integral<T>::value and !std::is_same<T, bool>::value>
-              * = nullptr>
+            typename std::enable_if_t<std::is_integral_v<T> and !std::is_same_v<T, bool> and
+                                      !cudf::is_fixed_point<T>()> * = nullptr>
   __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
                                                       size_t row,
+                                                      const data_type,
                                                       char const *begin,
                                                       char const *end,
                                                       parse_options_view const &opts,
@@ -473,12 +455,36 @@ struct decode_op {
     return true;
   }
 
+  /**
+   * @brief Dispatch for fixed point types.
+   *
+   * Parses the characters in `[begin, end)` as a decimal using the scale of
+   * `output_type` and stores the result at index `row` of `out_buffer`, which is
+   * accessed as an array of the fixed point type's device storage type.
+   *
+   * @return bool Whether the parsed value is valid; this overload always returns true.
+   */
+  template <typename T, typename std::enable_if_t<cudf::is_fixed_point<T>()> * = nullptr>
+  __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
+                                                      size_t row,
+                                                      const data_type output_type,
+                                                      char const *begin,
+                                                      char const *end,
+                                                      parse_options_view const &opts,
+                                                      column_parse::flags flags)
+  {
+    using storage_type = device_storage_type_t<T>;
+
+    // `opts` and `flags` are not consulted when decoding fixed point values
+    static_cast<storage_type *>(out_buffer)[row] =
+      strings::detail::parse_decimal<storage_type>(begin, end, output_type.scale());
+
+    return true;
+  }
+
   /**
    * @brief Dispatch for boolean type types.
    */
-  template <typename T, typename std::enable_if_t<std::is_same<T, bool>::value> * = nullptr>
+  template <typename T, typename std::enable_if_t<std::is_same_v<T, bool>> * = nullptr>
   __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
                                                       size_t row,
+                                                      const data_type,
                                                       char const *begin,
                                                       char const *end,
                                                       parse_options_view const &opts,
@@ -499,9 +505,10 @@ struct decode_op {
    * @brief Dispatch for floating points, which are set to NaN if the input
    * is not valid. In such case, the validity mask is set to zero too.
    */
-  template <typename T, typename std::enable_if_t<std::is_floating_point<T>::value> * = nullptr>
+  template <typename T, typename std::enable_if_t<std::is_floating_point_v<T>> * = nullptr>
   __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
                                                       size_t row,
+                                                      const data_type,
                                                       char const *begin,
                                                       char const *end,
                                                       parse_options_view const &opts,
@@ -517,10 +524,11 @@ struct decode_op {
    * @brief Dispatch for all other types.
    */
   template <typename T,
-            typename std::enable_if_t<!std::is_integral<T>::value and
-                                      !std::is_floating_point<T>::value> * = nullptr>
+            typename std::enable_if_t<!std::is_integral_v<T> and !std::is_floating_point_v<T> and
+                                      !cudf::is_fixed_point<T>()> * = nullptr>
   __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
                                                       size_t row,
+                                                      const data_type,
                                                       char const *begin,
                                                       char const *end,
                                                       parse_options_view const &opts,
@@ -605,6 +613,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
                                     decode_op{},
                                     columns[actual_col],
                                     rec_id,
+                                    dtypes[actual_col],
                                     field_start,
                                     field_end,
                                     options,