Merge remote-tracking branch 'origin/branch-21.10' into unflatten-nested-columns
rapidsai · Aug 20, 2021 · 94bb184 · 94bb184
2 parents 7582913 + 5869264
commit 94bb184
Show file tree

Hide file tree

Showing 20 changed files with 1,097 additions and 522 deletions.
diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
@@ -115,8 +115,7 @@ class csv_reader_options {
   // Conversion settings
 
   // Per-column types; disables type inference on those columns
-  std::variant<std::vector<std::string>, std::vector<data_type>, std::map<std::string, data_type>>
-    _dtypes;
+  std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
   // Additional values to recognize as boolean true values
   std::vector<std::string> _true_values{"True", "TRUE", "true"};
   // Additional values to recognize as boolean false values
@@ -305,10 +304,7 @@ class csv_reader_options {
   /**
    * @brief Returns per-column types.
    */
-  std::variant<std::vector<std::string>,
-               std::vector<data_type>,
-               std::map<std::string, data_type>> const&
-  get_dtypes() const
+  std::variant<std::vector<data_type>, std::map<std::string, data_type>> const& get_dtypes() const
   {
     return _dtypes;
   }
@@ -608,20 +604,6 @@ class csv_reader_options {
    */
   void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
 
-  /**
-   * @brief Sets per-column types, specified by the type's respective string representation.
-   *
-   * @param types Vector of dtypes in which the column needs to be read.
-   */
-  [[deprecated(
-    "The string-based interface will be deprecated."
-    "Use dtypes(std::vector<data_type>) or "
-    "dtypes(std::map<std::string, data_type>) instead.")]] void
-  set_dtypes(std::vector<std::string> types)
-  {
-    _dtypes = std::move(types);
-  }
-
   /**
    * @brief Sets additional values to recognize as boolean true values.
    *
@@ -1067,22 +1049,6 @@ class csv_reader_options_builder {
     return *this;
   }
 
-  /**
-   * @brief Sets per-column types, specified by the type's respective string representation.
-   *
-   * @param types Vector of dtypes in which the column needs to be read.
-   * @return this for chaining.
-   */
-  [[deprecated(
-    "The string-based interface will be deprecated."
-    "Use dtypes(std::vector<data_type>) or "
-    "dtypes(std::map<std::string, data_type>) instead.")]] csv_reader_options_builder&
-  dtypes(std::vector<std::string> types)
-  {
-    options._dtypes = std::move(types);
-    return *this;
-  }
-
   /**
    * @brief Sets additional values to recognize as boolean true values.
    *

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
@@ -68,8 +68,7 @@ class json_reader_options {
   source_info _source;
 
   // Data types of the column; empty to infer dtypes
-  std::variant<std::vector<std::string>, std::vector<data_type>, std::map<std::string, data_type>>
-    _dtypes;
+  std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
   // Specify the compression format of the source or infer from file extension
   compression_type _compression = compression_type::AUTO;
 
@@ -117,10 +116,7 @@ class json_reader_options {
   /**
    * @brief Returns data types of the columns.
    */
-  std::variant<std::vector<std::string>,
-               std::vector<data_type>,
-               std::map<std::string, data_type>> const&
-  get_dtypes() const
+  std::variant<std::vector<data_type>, std::map<std::string, data_type>> const& get_dtypes() const
   {
     return _dtypes;
   }
@@ -150,20 +146,6 @@ class json_reader_options {
    */
   bool is_enabled_dayfirst() const { return _dayfirst; }
 
-  /**
-   * @brief Set data types for columns to be read.
-   *
-   * @param types Vector of dtypes in string format.
-   */
-  [[deprecated(
-    "The string-based interface will be deprecated."
-    "Use dtypes(std::vector<data_type>) or "
-    "dtypes(std::map<std::string, data_type>) instead.")]] void
-  set_dtypes(std::vector<std::string> types)
-  {
-    _dtypes = std::move(types);
-  }
-
   /**
    * @brief Set data types for columns to be read.
    *
@@ -232,22 +214,6 @@ class json_reader_options_builder {
    */
   explicit json_reader_options_builder(source_info const& src) : options(src) {}
 
-  /**
-   * @brief Set data types for columns to be read.
-   *
-   * @param types Vector of dtypes in string format
-   * @return this for chaining
-   */
-  [[deprecated(
-    "The string-based interface will be deprecated."
-    "Use dtypes(std::vector<data_type>) or "
-    "dtypes(std::map<std::string, data_type>) instead.")]] json_reader_options_builder&
-  dtypes(std::vector<std::string> types)
-  {
-    options._dtypes = std::move(types);
-    return *this;
-  }
-
   /**
    * @brief Set data types for columns to be read.
    *

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
@@ -27,7 +27,6 @@
 
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/detail/utilities/visitor_overload.hpp>
 #include <cudf/io/types.hpp>
 #include <cudf/strings/replace.hpp>
 #include <cudf/table/table.hpp>
@@ -420,14 +419,8 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
   if (has_to_infer_column_types) {
     column_types = infer_column_types(data, row_offsets, stream);
   } else {
-    column_types = std::visit(
-      cudf::detail::visitor_overload{
-        [&](const std::vector<data_type>& data_types) { return select_data_types(data_types); },
-        [&](const std::map<std::string, data_type>& data_types) {
-          return select_data_types(data_types);
-        },
-        [&](const std::vector<string>& dtypes) { return parse_column_types(dtypes); }},
-      opts_.get_dtypes());
+    column_types = std::visit([&](auto const& data_types) { return select_data_types(data_types); },
+                              opts_.get_dtypes());
   }
 
   out_columns.reserve(column_types.size());
@@ -707,81 +700,6 @@ std::vector<data_type> reader::impl::infer_column_types(device_span<char const>
   return dtypes;
 }
 
-std::vector<data_type> reader::impl::parse_column_types(
-  const std::vector<std::string>& types_as_strings)
-{
-  std::vector<data_type> dtypes;
-
-  const bool is_dict = std::all_of(types_as_strings.begin(),
-                                   types_as_strings.end(),
-                                   [](const auto& s) { return s.find(':') != std::string::npos; });
-
-  if (!is_dict) {
-    if (types_as_strings.size() == 1) {
-      // If it's a single dtype, assign that dtype to all active columns
-      data_type dtype_;
-      column_parse::flags col_flags_;
-      std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]);
-      dtypes.resize(num_active_cols_, dtype_);
-      for (int col = 0; col < num_actual_cols_; col++) {
-        column_flags_[col] |= col_flags_;
-      }
-      CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
-    } else {
-      // If it's a list, assign dtypes to active columns in the given order
-      CUDF_EXPECTS(static_cast<int>(types_as_strings.size()) >= num_actual_cols_,
-                   "Must specify data types for all columns");
-
-      auto dtype_ = std::back_inserter(dtypes);
-
-      for (int col = 0; col < num_actual_cols_; col++) {
-        if (column_flags_[col] & column_parse::enabled) {
-          column_parse::flags col_flags_;
-          std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]);
-          column_flags_[col] |= col_flags_;
-          CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
-        }
-      }
-    }
-  } else {
-    // Translate vector of `name : dtype` strings to map
-    // NOTE: Incoming pairs can be out-of-order from column names in dataset
-    std::unordered_map<std::string, std::string> col_type_map;
-    for (const auto& pair : types_as_strings) {
-      const auto pos     = pair.find_last_of(':');
-      const auto name    = pair.substr(0, pos);
-      const auto dtype   = pair.substr(pos + 1, pair.size());
-      col_type_map[name] = dtype;
-    }
-
-    auto dtype_ = std::back_inserter(dtypes);
-
-    for (int col = 0; col < num_actual_cols_; col++) {
-      if (column_flags_[col] & column_parse::enabled) {
-        CUDF_EXPECTS(col_type_map.find(col_names_[col]) != col_type_map.end(),
-                     "Must specify data types for all active columns");
-        column_parse::flags col_flags_;
-        std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[col_names_[col]]);
-        column_flags_[col] |= col_flags_;
-        CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
-      }
-    }
-  }
-
-  if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) {
-    for (auto& type : dtypes) {
-      if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); }
-    }
-  }
-
-  for (size_t i = 0; i < dtypes.size(); i++) {
-    // Replace EMPTY dtype with STRING
-    if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; }
-  }
-
-  return dtypes;
-}
-
 std::vector<column_buffer> reader::impl::decode_data(device_span<char const> data,
                                                      device_span<uint64_t const> row_offsets,
                                                      host_span<data_type const> column_types,

diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
@@ -197,15 +197,6 @@ class reader::impl {
    */
   std::vector<data_type> select_data_types(std::vector<data_type> const& dtypes);
 
-  /**
-   * @brief Parses the columns' data types from the vector of dtypes that are provided as strings.
-   *
-   * @param types_as_strings The vector of strings from which to parse the columns' target data
-   * types
-   * @return List of columns' data types
-   */
-  std::vector<data_type> parse_column_types(std::vector<std::string> const& types_as_strings);
-
   /**
    * @brief Converts the row-column data and outputs to column bufferrs.
    *

diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
@@ -466,71 +466,32 @@ void reader::impl::set_column_names(device_span<uint64_t const> rec_starts,
   }
 }
 
-std::vector<data_type> reader::impl::parse_data_types(
-  std::vector<std::string> const& types_as_strings)
-{
-  CUDF_EXPECTS(types_as_strings.size() == metadata_.column_names.size(),
-               "Need to specify the type of each column.\n");
-  std::vector<data_type> dtypes;
-  // Assume that the dtype is in dictionary format only if all elements contain a colon
-  const bool is_dict = std::all_of(
-    std::cbegin(types_as_strings), std::cend(types_as_strings), [](const std::string& s) {
-      return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s);
-    });
-
-  auto split_on_colon = [](std::string_view s) {
-    auto const i = s.find(":");
-    return std::pair{s.substr(0, i), s.substr(i + 1)};
-  };
-
-  if (is_dict) {
-    std::map<std::string, data_type> col_type_map;
-    std::transform(
-      std::cbegin(types_as_strings),
-      std::cend(types_as_strings),
-      std::inserter(col_type_map, col_type_map.end()),
-      [&](auto const& ts) {
-        auto const [col_name, type_str] = split_on_colon(ts);
-        return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})};
-      });
-
-    // Using the map here allows O(n log n) complexity
-    std::transform(std::cbegin(metadata_.column_names),
-                   std::cend(metadata_.column_names),
-                   std::back_inserter(dtypes),
-                   [&](auto const& column_name) { return col_type_map[column_name]; });
-  } else {
-    std::transform(std::cbegin(types_as_strings),
-                   std::cend(types_as_strings),
-                   std::back_inserter(dtypes),
-                   [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); });
-  }
-  return dtypes;
-}
-
 void reader::impl::set_data_types(device_span<uint64_t const> rec_starts,
                                   rmm::cuda_stream_view stream)
 {
   bool has_to_infer_column_types =
     std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes());
   if (!has_to_infer_column_types) {
-    dtypes_ = std::visit(
-      cudf::detail::visitor_overload{
-        [&](const std::vector<data_type>& dtypes) { return dtypes; },
-        [&](const std::map<std::string, data_type>& dtypes) {
-          std::vector<data_type> sorted_dtypes;
-          std::transform(std::cbegin(metadata_.column_names),
-                         std::cend(metadata_.column_names),
-                         std::back_inserter(sorted_dtypes),
-                         [&](auto const& column_name) {
-                           auto const it = dtypes.find(column_name);
-                           CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns");
-                           return it->second;
-                         });
-          return sorted_dtypes;
-        },
-        [&](std::vector<std::string> const& dtypes) { return parse_data_types(dtypes); }},
-      options_.get_dtypes());
+    dtypes_ = std::visit(cudf::detail::visitor_overload{
+                           [&](const std::vector<data_type>& dtypes) {
+                             CUDF_EXPECTS(dtypes.size() == metadata_.column_names.size(),
+                                          "Must specify types for all columns");
+                             return dtypes;
+                           },
+                           [&](const std::map<std::string, data_type>& dtypes) {
+                             std::vector<data_type> sorted_dtypes;
+                             std::transform(std::cbegin(metadata_.column_names),
+                                            std::cend(metadata_.column_names),
+                                            std::back_inserter(sorted_dtypes),
+                                            [&](auto const& column_name) {
+                                              auto const it = dtypes.find(column_name);
+                                              CUDF_EXPECTS(it != dtypes.end(),
+                                                           "Must specify types for all columns");
+                                              return it->second;
+                                            });
+                             return sorted_dtypes;
+                           }},
+                         options_.get_dtypes());
   } else {
     CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n");
     auto const num_columns       = metadata_.column_names.size();

diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp
@@ -158,8 +158,6 @@ class reader::impl {
    */
   void set_column_names(device_span<uint64_t const> rec_starts, rmm::cuda_stream_view stream);
 
-  std::vector<data_type> parse_data_types(std::vector<std::string> const& types_as_strings);
-
   /**
    * @brief Set the data type array data member
    *

diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
@@ -1858,7 +1858,11 @@ TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names(names)
-      .dtypes(std::vector<std::string>{"int32", "str", "int32", "int32", "int32"});
+      .dtypes({dtype<int32_t>(),
+               dtype<cudf::string_view>(),
+               dtype<int32_t>(),
+               dtype<int32_t>(),
+               dtype<int32_t>()});
   auto result = cudf_io::read_csv(in_opts);
 
   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view());

diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
@@ -888,4 +888,27 @@ TEST_F(JsonReaderTest, JsonLinesMultipleFileInputs)
                                  float64_wrapper{{1.1, 2.2, 3.3, 4.4}, validity});
 }
 
+TEST_F(JsonReaderTest, BadDtypeParams)
+{
+  std::string buffer = "[1,2,3,4]";
+
+  cudf_io::json_reader_options options_vec =
+    cudf_io::json_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()})
+      .lines(true)
+      .dtypes({dtype<int8_t>()});
+
+  // should throw because there are four columns and only one dtype
+  EXPECT_THROW(cudf_io::read_json(options_vec), cudf::logic_error);
+
+  cudf_io::json_reader_options options_map =
+    cudf_io::json_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()})
+      .lines(true)
+      .dtypes(std::map<std::string, cudf::data_type>{{"0", dtype<int8_t>()},
+                                                     {"1", dtype<int8_t>()},
+                                                     {"2", dtype<int8_t>()},
+                                                     {"wrong_name", dtype<int8_t>()}});
+  // should throw because one of the columns is not in the dtype map
+  EXPECT_THROW(cudf_io::read_json(options_map), cudf::logic_error);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp
@@ -158,8 +158,6 @@ class reader::impl {
    */
   void set_column_names(device_span<uint64_t const> rec_starts, rmm::cuda_stream_view stream);
 
-  std::vector<data_type> parse_data_types(std::vector<std::string> const& types_as_strings);
-
   /**
    * @brief Set the data type array data member
    *