From e6bd37f04f79341f811a5a7a12d7c212071fd7d1 Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Thu, 22 Jul 2021 01:07:57 -0700
Subject: [PATCH 01/23] add API that takes a map of data_types

---
 cpp/include/cudf/io/csv.hpp    | 27 +++++++++++++++++++++++++--
 cpp/src/io/csv/reader_impl.cu  | 21 +++++++++++++++++++++
 cpp/src/io/csv/reader_impl.hpp |  8 ++++++++
 cpp/tests/io/csv_test.cpp      | 33 ++++++++++++++++++++++++++++++++-
 4 files changed, 86 insertions(+), 3 deletions(-)
diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index 1dff99735ec..e70353dff9f 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -111,7 +111,8 @@ class csv_reader_options {
   // Conversion settings
 
   // Per-column types; disables type inference on those columns
-  std::variant<std::vector<std::string>, std::vector<data_type>> _dtypes;
+  std::variant<std::vector<std::string>, std::vector<data_type>, std::map<std::string, data_type>>
+    _dtypes;
   // Additional values to recognize as boolean true values
   std::vector<std::string> _true_values{"True", "TRUE", "true"};
   // Additional values to recognize as boolean false values
@@ -290,7 +291,10 @@ class csv_reader_options {
   /**
    * @brief Returns per-column types.
    */
-  std::variant<std::vector<std::string>, std::vector<data_type>> const& get_dtypes() const
+  std::variant<std::vector<std::string>,
+               std::vector<data_type>,
+               std::map<std::string, data_type>> const&
+  get_dtypes() const
   {
     return _dtypes;
   }
@@ -562,6 +566,13 @@ class csv_reader_options {
     _infer_date_indexes = std::move(col_ind);
   }
 
+  /**
+   * @brief Sets per-column types
+   *
+   * @param types Column name -> data type map specifying the columns' target data types
+   */
+  void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }
+
   /**
    * @brief Sets per-column types
    *
@@ -979,6 +990,18 @@ class csv_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Sets per-column types.
+   *
+   * @param types Column name -> data type map specifying the columns' target data types
+   * @return this for chaining.
+   */
+  csv_reader_options_builder& dtypes(std::map<std::string, data_type> types)
+  {
+    options._dtypes = std::move(types);
+    return *this;
+  }
+
   /**
    * @brief Sets per-column types.
    *
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 70ce0fce1cc..2436e930046 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -280,6 +280,24 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
   return {rmm::device_uvector<char>{0, stream}, selected_rows_offsets{stream}};
 }
 
+std::vector<data_type> reader::impl::sort_data_types(
+  std::map<std::string, data_type> const& col_type_map)
+{
+  std::vector<data_type> dtypes;
+  dtypes.reserve(col_type_map.size());
+
+  for (int col = 0; col < num_actual_cols_; col++) {
+    if (column_flags_[col] & column_parse::enabled) {
+      auto const col_type_it = col_type_map.find(col_names_[col]);
+      CUDF_EXPECTS(col_type_it != col_type_map.end(),
+                   "Must specify data types for all active columns");
+      CUDF_EXPECTS(col_type_it->second.id() != cudf::type_id::EMPTY, "Unsupported data type");
+      dtypes.emplace_back(col_type_it->second);
+    }
+  }
+  return dtypes;
+}
+
 table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
 {
   auto const data_row_offsets = select_data_and_row_offsets(stream);
@@ -385,6 +403,9 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
     column_types =
       std::visit(VisitorOverload{
                    [&](const std::vector<data_type>& data_types) { return data_types; },
+                   [&](const std::map<std::string, data_type>& data_types) {
+                     return sort_data_types(data_types);
+                   },
                    [&](const std::vector<string>& dtypes) { return parse_column_types(dtypes); }},
                  opts_.get_dtypes());
   }
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index 29c6b48bc8a..ef1b2a52f71 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -181,6 +181,14 @@ class reader::impl {
                                             device_span<uint64_t const> row_offsets,
                                             rmm::cuda_stream_view stream);
 
+  /**
+   * @brief Sorts the columns' data types from the map of dtypes.
+   *
+   * @param col_type_map Column name -> data type map specifying the columns' target data types
+   * @return Sorted ist of columns' data types
+   */
+  std::vector<data_type> sort_data_types(std::map<std::string, data_type> const& col_type_map);
+
   /**
    * @brief Parses the columns' data types from the vector of dtypes that are provided as strings.
    *
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index 4e1ad57080a..0a29832270f 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -80,7 +80,6 @@ struct CsvReaderTest : public cudf::test::BaseFixture {
 // Typed test fixture for timestamp type tests
 template <typename T>
 struct CsvReaderNumericTypeTest : public CsvReaderTest {
-  auto type() { return cudf::data_type{cudf::type_to_id<T>()}; }
 };
 
 // Declare typed test cases
@@ -2141,4 +2140,36 @@ TEST_F(CsvReaderTest, DefaultWriteChunkSize)
   }
 }
 
+TEST_F(CsvReaderTest, DtypesMap)
+{
+  std::string csv_in{"12,9\n34,8\n56,7"};
+
+  cudf_io::csv_reader_options in_opts =
+    cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()})
+      .names({"A", "B"})
+      .dtypes(std::map<std::string, cudf::data_type>{{"B", cudf::data_type{cudf::type_id::INT16}},
+                                                     {"A", cudf::data_type{cudf::type_id::INT32}}})
+      .header(-1);
+  auto result = cudf_io::read_csv(in_opts);
+
+  const auto result_table = result.tbl->view();
+  assert(result_table->num_columns() == 2);
+  assert(result_table.column(0).type() == cudf::data_type{cudf::type_id::INT32});
+  assert(result_table.column(1).type() == cudf::data_type{cudf::type_id::INT16});
+  expect_column_data_equal(std::vector<int32_t>{12, 34, 56}, result_table.column(0));
+  expect_column_data_equal(std::vector<int16_t>{9, 8, 7}, result_table.column(1));
+}
+
+TEST_F(CsvReaderTest, DtypesMapInvalid)
+{
+  std::string csv_in{""};
+
+  cudf_io::csv_reader_options in_opts =
+    cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()})
+      .names({"A", "B"})
+      .dtypes(std::map<std::string, cudf::data_type>{{"C", cudf::data_type{cudf::type_id::INT16}}});
+
+  EXPECT_THROW(cudf_io::read_csv(in_opts), cudf::logic_error);
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From fbf05ccadff6e0ea44f5d0c505ec9d40f2f623b8 Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Thu, 22 Jul 2021 01:30:50 -0700
Subject: [PATCH 02/23] using

---
 cpp/tests/io/csv_test.cpp | 103 ++++++++++++++++++++------------------
 1 file changed, 53 insertions(+), 50 deletions(-)

diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index 0a29832270f..1f783754cfb 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -50,6 +50,10 @@
 
 namespace cudf_io = cudf::io;
 
+using cudf::data_type;
+using cudf::type_id;
+using cudf::type_to_id;
+
 template <typename T, typename SourceElementT = T>
 using column_wrapper =
   typename std::conditional<std::is_same<T, cudf::string_view>::value,
@@ -93,7 +97,7 @@ struct CsvFixedPointReaderTest : public CsvReaderTest {
   {
     cudf::test::strings_column_wrapper strings(reference_strings.begin(), reference_strings.end());
     auto input_column = cudf::strings::to_fixed_point(
-      cudf::strings_column_view(strings), cudf::data_type{cudf::type_to_id<DecimalType>(), scale});
+      cudf::strings_column_view(strings), data_type{type_to_id<DecimalType>(), scale});
 
     std::string buffer = std::accumulate(reference_strings.begin(),
                                          reference_strings.end(),
@@ -104,7 +108,7 @@ struct CsvFixedPointReaderTest : public CsvReaderTest {
 
     cudf_io::csv_reader_options in_opts =
       cudf_io::csv_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()})
-        .dtypes({cudf::data_type{cudf::type_to_id<DecimalType>(), scale}})
+        .dtypes({data_type{type_to_id<DecimalType>(), scale}})
         .header(-1);
 
     const auto result      = cudf_io::read_csv(in_opts);
@@ -390,7 +394,7 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnNegativeScale)
   using DecimalType = TypeParam;
   auto input_column = cudf::strings::to_fixed_point(
     cudf::strings_column_view(strings),
-    cudf::data_type{cudf::type_to_id<DecimalType>(), numeric::scale_type{-2}});
+    data_type{type_to_id<DecimalType>(), numeric::scale_type{-2}});
 
   auto input_table = cudf::table_view{std::vector<cudf::column_view>{*input_column}};
 
@@ -436,7 +440,7 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnPositiveScale)
   using DecimalType = TypeParam;
   auto input_column = cudf::strings::to_fixed_point(
     cudf::strings_column_view(strings),
-    cudf::data_type{cudf::type_to_id<DecimalType>(), numeric::scale_type{3}});
+    data_type{type_to_id<DecimalType>(), numeric::scale_type{3}});
 
   auto input_table = cudf::table_view{std::vector<cudf::column_view>{*input_column}};
 
@@ -583,10 +587,10 @@ TEST_F(CsvReaderTest, Booleans)
   // Booleans are the same (integer) data type, but valued at 0 or 1
   const auto view = result.tbl->view();
   EXPECT_EQ(4, view.num_columns());
-  ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id());
-  ASSERT_EQ(cudf::type_id::INT32, view.column(1).type().id());
-  ASSERT_EQ(cudf::type_id::INT16, view.column(2).type().id());
-  ASSERT_EQ(cudf::type_id::BOOL8, view.column(3).type().id());
+  ASSERT_EQ(type_id::INT32, view.column(0).type().id());
+  ASSERT_EQ(type_id::INT32, view.column(1).type().id());
+  ASSERT_EQ(type_id::INT16, view.column(2).type().id());
+  ASSERT_EQ(type_id::BOOL8, view.column(3).type().id());
 
   expect_column_data_equal(std::vector<int32_t>{1, 0, 0, 0, 1}, view.column(0));
   expect_column_data_equal(std::vector<int16_t>{0, 1, 1, 0, 1}, view.column(2));
@@ -613,7 +617,7 @@ TEST_F(CsvReaderTest, Dates)
 
   const auto view = result.tbl->view();
   EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id());
+  ASSERT_EQ(type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id());
 
   using namespace cuda::std::chrono_literals;
   expect_column_data_equal(std::vector<cudf::timestamp_ms>{cudf::timestamp_ms{983750400000ms},
@@ -645,12 +649,12 @@ TEST_F(CsvReaderTest, DatesCastToTimestampSeconds)
       .dtypes(std::vector<std::string>{"date"})
       .dayfirst(true)
       .header(-1)
-      .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS});
+      .timestamp_type(data_type{type_id::TIMESTAMP_SECONDS});
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
   EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::TIMESTAMP_SECONDS, view.column(0).type().id());
+  ASSERT_EQ(type_id::TIMESTAMP_SECONDS, view.column(0).type().id());
 
   using namespace cuda::std::chrono_literals;
   expect_column_data_equal(std::vector<cudf::timestamp_s>{cudf::timestamp_s{983750400s},
@@ -682,12 +686,12 @@ TEST_F(CsvReaderTest, DatesCastToTimestampMilliSeconds)
       .dtypes(std::vector<std::string>{"date"})
       .dayfirst(true)
       .header(-1)
-      .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS});
+      .timestamp_type(data_type{type_id::TIMESTAMP_MILLISECONDS});
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
   EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id());
+  ASSERT_EQ(type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id());
 
   using namespace cuda::std::chrono_literals;
   expect_column_data_equal(std::vector<cudf::timestamp_ms>{cudf::timestamp_ms{983750400000ms},
@@ -719,12 +723,12 @@ TEST_F(CsvReaderTest, DatesCastToTimestampMicroSeconds)
       .dtypes(std::vector<std::string>{"date"})
       .dayfirst(true)
       .header(-1)
-      .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS});
+      .timestamp_type(data_type{type_id::TIMESTAMP_MICROSECONDS});
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
   EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id());
+  ASSERT_EQ(type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id());
 
   using namespace cuda::std::chrono_literals;
   expect_column_data_equal(std::vector<cudf::timestamp_us>{cudf::timestamp_us{983750400000000us},
@@ -756,12 +760,12 @@ TEST_F(CsvReaderTest, DatesCastToTimestampNanoSeconds)
       .dtypes(std::vector<std::string>{"date"})
       .dayfirst(true)
       .header(-1)
-      .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS});
+      .timestamp_type(data_type{type_id::TIMESTAMP_NANOSECONDS});
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
   EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id());
+  ASSERT_EQ(type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id());
 
   using namespace cuda::std::chrono_literals;
   expect_column_data_equal(
@@ -796,12 +800,12 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampSeconds)
       .names({"A"})
       .dtypes(std::vector<std::string>{"datetime64[s]"})
       .header(-1)
-      .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS});
+      .timestamp_type(data_type{type_id::TIMESTAMP_SECONDS});
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
   EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::TIMESTAMP_SECONDS, view.column(0).type().id());
+  ASSERT_EQ(type_id::TIMESTAMP_SECONDS, view.column(0).type().id());
 
   using namespace cuda::std::chrono_literals;
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0));
@@ -825,12 +829,12 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampMilliSeconds)
       .names({"A"})
       .dtypes(std::vector<std::string>{"datetime64[ms]"})
       .header(-1)
-      .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS});
+      .timestamp_type(data_type{type_id::TIMESTAMP_MILLISECONDS});
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
   EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id());
+  ASSERT_EQ(type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id());
 
   using namespace cuda::std::chrono_literals;
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0));
@@ -854,12 +858,12 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampMicroSeconds)
       .names({"A"})
       .dtypes(std::vector<std::string>{"datetime64[us]"})
       .header(-1)
-      .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS});
+      .timestamp_type(data_type{type_id::TIMESTAMP_MICROSECONDS});
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
   EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id());
+  ASSERT_EQ(type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id());
 
   using namespace cuda::std::chrono_literals;
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0));
@@ -883,12 +887,12 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampNanoSeconds)
       .names({"A"})
       .dtypes(std::vector<std::string>{"datetime64[ns]"})
       .header(-1)
-      .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS});
+      .timestamp_type(data_type{type_id::TIMESTAMP_NANOSECONDS});
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
   EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id());
+  ASSERT_EQ(type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id());
 
   using namespace cuda::std::chrono_literals;
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0));
@@ -913,7 +917,7 @@ TEST_F(CsvReaderTest, FloatingPoint)
 
   const auto view = result.tbl->view();
   EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::FLOAT32, view.column(0).type().id());
+  ASSERT_EQ(type_id::FLOAT32, view.column(0).type().id());
 
   const auto ref_vals =
     std::vector<float>{5.6, 56.79, 12000000000, 0.7, 3.000, 12.34, 0.31, -73.98007199999998};
@@ -945,8 +949,8 @@ TEST_F(CsvReaderTest, Strings)
 
   const auto view = result.tbl->view();
   EXPECT_EQ(2, view.num_columns());
-  ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id());
-  ASSERT_EQ(cudf::type_id::STRING, view.column(1).type().id());
+  ASSERT_EQ(type_id::INT32, view.column(0).type().id());
+  ASSERT_EQ(type_id::STRING, view.column(1).type().id());
 
   expect_column_data_equal(
     std::vector<std::string>{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"},
@@ -975,8 +979,8 @@ TEST_F(CsvReaderTest, StringsQuotes)
 
   const auto view = result.tbl->view();
   EXPECT_EQ(2, view.num_columns());
-  ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id());
-  ASSERT_EQ(cudf::type_id::STRING, view.column(1).type().id());
+  ASSERT_EQ(type_id::INT32, view.column(0).type().id());
+  ASSERT_EQ(type_id::STRING, view.column(1).type().id());
 
   expect_column_data_equal(
     std::vector<std::string>{"abc,\ndef, ghi", "jkl, `mno`, pqr", "stu `vwx` yz"}, view.column(1));
@@ -1005,8 +1009,8 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored)
 
   const auto view = result.tbl->view();
   EXPECT_EQ(2, view.num_columns());
-  ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id());
-  ASSERT_EQ(cudf::type_id::STRING, view.column(1).type().id());
+  ASSERT_EQ(type_id::INT32, view.column(0).type().id());
+  ASSERT_EQ(type_id::STRING, view.column(1).type().id());
 
   expect_column_data_equal(
     std::vector<std::string>{"\"abcdef ghi\"", "\"jkl \"\"mno\"\" pqr\"", "stu \"vwx\" yz"},
@@ -1032,7 +1036,7 @@ TEST_F(CsvReaderTest, SkiprowsNrows)
 
   const auto view = result.tbl->view();
   EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id());
+  ASSERT_EQ(type_id::INT32, view.column(0).type().id());
 
   expect_column_data_equal(std::vector<int32_t>{5, 6}, view.column(0));
 }
@@ -1056,7 +1060,7 @@ TEST_F(CsvReaderTest, ByteRange)
 
   const auto view = result.tbl->view();
   EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id());
+  ASSERT_EQ(type_id::INT32, view.column(0).type().id());
 
   expect_column_data_equal(std::vector<int32_t>{4000, 5000, 6000}, view.column(0));
 }
@@ -1074,7 +1078,7 @@ TEST_F(CsvReaderTest, ByteRangeStrings)
 
   const auto view = result.tbl->view();
   EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::STRING, view.column(0).type().id());
+  ASSERT_EQ(type_id::STRING, view.column(0).type().id());
 
   expect_column_data_equal(std::vector<std::string>{"c"}, view.column(0));
 }
@@ -1097,7 +1101,7 @@ TEST_F(CsvReaderTest, BlanksAndComments)
 
   const auto view = result.tbl->view();
   EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id());
+  ASSERT_EQ(type_id::INT32, view.column(0).type().id());
 
   expect_column_data_equal(std::vector<int32_t>{1, 3, 4, 5, 8, 9}, view.column(0));
 }
@@ -1170,7 +1174,7 @@ TEST_F(CsvReaderTest, ArrowFileSource)
 
   const auto view = result.tbl->view();
   EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::INT8, view.column(0).type().id());
+  ASSERT_EQ(type_id::INT8, view.column(0).type().id());
 
   expect_column_data_equal(std::vector<int8_t>{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0));
 }
@@ -1192,7 +1196,7 @@ TEST_F(CsvReaderTest, InvalidFloatingPoint)
 
   const auto view = result.tbl->view();
   EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(cudf::type_id::FLOAT32, view.column(0).type().id());
+  ASSERT_EQ(type_id::FLOAT32, view.column(0).type().id());
 
   const auto col_data = cudf::test::to_host<float>(view.column(0));
   // col_data.first contains the column data
@@ -1211,7 +1215,7 @@ TEST_F(CsvReaderTest, StringInference)
   const auto result = cudf_io::read_csv(in_opts);
 
   EXPECT_EQ(result.tbl->num_columns(), 1);
-  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING);
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), type_id::STRING);
 }
 
 TEST_F(CsvReaderTest, TypeInferenceThousands)
@@ -1225,9 +1229,9 @@ TEST_F(CsvReaderTest, TypeInferenceThousands)
   const auto result_view = result.tbl->view();
 
   EXPECT_EQ(result_view.num_columns(), 3);
-  EXPECT_EQ(result_view.column(0).type().id(), cudf::type_id::INT64);
-  EXPECT_EQ(result_view.column(1).type().id(), cudf::type_id::INT64);
-  EXPECT_EQ(result_view.column(2).type().id(), cudf::type_id::FLOAT64);
+  EXPECT_EQ(result_view.column(0).type().id(), type_id::INT64);
+  EXPECT_EQ(result_view.column(1).type().id(), type_id::INT64);
+  EXPECT_EQ(result_view.column(2).type().id(), type_id::FLOAT64);
 
   auto tsnd_sep_col = std::vector<int64_t>{1400L, 123456L};
   auto int_col      = std::vector<int64_t>{123L, 123456L};
@@ -1253,9 +1257,9 @@ TEST_F(CsvReaderTest, TypeInferenceWithDecimal)
   const auto result_view = result.tbl->view();
 
   EXPECT_EQ(result_view.num_columns(), 3);
-  EXPECT_EQ(result_view.column(0).type().id(), cudf::type_id::INT64);
-  EXPECT_EQ(result_view.column(1).type().id(), cudf::type_id::STRING);
-  EXPECT_EQ(result_view.column(2).type().id(), cudf::type_id::FLOAT64);
+  EXPECT_EQ(result_view.column(0).type().id(), type_id::INT64);
+  EXPECT_EQ(result_view.column(1).type().id(), type_id::STRING);
+  EXPECT_EQ(result_view.column(2).type().id(), type_id::FLOAT64);
 
   auto int_col = std::vector<int64_t>{1400L, 123456L};
   auto str_col = std::vector<std::string>{"1.23", "123.456"};
@@ -2147,15 +2151,14 @@ TEST_F(CsvReaderTest, DtypesMap)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()})
       .names({"A", "B"})
-      .dtypes(std::map<std::string, cudf::data_type>{{"B", cudf::data_type{cudf::type_id::INT16}},
-                                                     {"A", cudf::data_type{cudf::type_id::INT32}}})
+      .dtypes({{"B", data_type{type_to_id<int16_t>()}}, {"A", data_type{type_to_id<int32_t>()}}})
       .header(-1);
   auto result = cudf_io::read_csv(in_opts);
 
   const auto result_table = result.tbl->view();
   assert(result_table->num_columns() == 2);
-  assert(result_table.column(0).type() == cudf::data_type{cudf::type_id::INT32});
-  assert(result_table.column(1).type() == cudf::data_type{cudf::type_id::INT16});
+  assert(result_table.column(0).type() == data_type{type_id::INT32});
+  assert(result_table.column(1).type() == data_type{type_id::INT16});
   expect_column_data_equal(std::vector<int32_t>{12, 34, 56}, result_table.column(0));
   expect_column_data_equal(std::vector<int16_t>{9, 8, 7}, result_table.column(1));
 }
@@ -2167,7 +2170,7 @@ TEST_F(CsvReaderTest, DtypesMapInvalid)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()})
       .names({"A", "B"})
-      .dtypes(std::map<std::string, cudf::data_type>{{"C", cudf::data_type{cudf::type_id::INT16}}});
+      .dtypes({{"A", data_type{type_to_id<int16_t>()}}});
 
   EXPECT_THROW(cudf_io::read_csv(in_opts), cudf::logic_error);
 }

From 523e3ad7fbefaf5651aab900285203e7cfd3cfa5 Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Thu, 22 Jul 2021 13:34:11 -0700
Subject: [PATCH 03/23] dtypes + selected columns fix

---
 cpp/src/io/csv/reader_impl.cu  | 45 +++++++++++++------
 cpp/src/io/csv/reader_impl.hpp | 12 ++++-
 cpp/tests/io/csv_test.cpp      | 81 ++++++++++++++++------------------
 3 files changed, 79 insertions(+), 59 deletions(-)

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 2436e930046..02196d17fc1 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -280,22 +280,39 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
   return {rmm::device_uvector<char>{0, stream}, selected_rows_offsets{stream}};
 }
 
-std::vector<data_type> reader::impl::sort_data_types(
+std::vector<data_type> reader::impl::select_data_types(
   std::map<std::string, data_type> const& col_type_map)
 {
-  std::vector<data_type> dtypes;
-  dtypes.reserve(col_type_map.size());
+  std::vector<data_type> selected_dtypes;
 
   for (int col = 0; col < num_actual_cols_; col++) {
     if (column_flags_[col] & column_parse::enabled) {
       auto const col_type_it = col_type_map.find(col_names_[col]);
       CUDF_EXPECTS(col_type_it != col_type_map.end(),
                    "Must specify data types for all active columns");
-      CUDF_EXPECTS(col_type_it->second.id() != cudf::type_id::EMPTY, "Unsupported data type");
-      dtypes.emplace_back(col_type_it->second);
+      selected_dtypes.emplace_back(col_type_it->second);
     }
   }
-  return dtypes;
+  return selected_dtypes;
+}
+
+std::vector<data_type> reader::impl::select_data_types(std::vector<data_type> const& dtypes)
+{
+  std::vector<data_type> selected_dtypes;
+
+  if (dtypes.size() == 1) {
+    // If it's a single dtype, assign that dtype to all active columns
+    selected_dtypes.resize(num_active_cols_, dtypes.front());
+  } else {
+    // If it's a list, assign dtypes to active columns in the given order
+    CUDF_EXPECTS(static_cast<int>(dtypes.size()) >= num_actual_cols_,
+                 "Must specify data types for all columns");
+
+    for (int col = 0; col < num_actual_cols_; col++) {
+      if (column_flags_[col] & column_parse::enabled) { selected_dtypes.emplace_back(dtypes[col]); }
+    }
+  }
+  return selected_dtypes;
 }
 
 table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
@@ -400,14 +417,14 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
   if (has_to_infer_column_types) {
     column_types = infer_column_types(data, row_offsets, stream);
   } else {
-    column_types =
-      std::visit(VisitorOverload{
-                   [&](const std::vector<data_type>& data_types) { return data_types; },
-                   [&](const std::map<std::string, data_type>& data_types) {
-                     return sort_data_types(data_types);
-                   },
-                   [&](const std::vector<string>& dtypes) { return parse_column_types(dtypes); }},
-                 opts_.get_dtypes());
+    column_types = std::visit(
+      VisitorOverload{
+        [&](const std::vector<data_type>& data_types) { return select_data_types(data_types); },
+        [&](const std::map<std::string, data_type>& data_types) {
+          return select_data_types(data_types);
+        },
+        [&](const std::vector<string>& dtypes) { return parse_column_types(dtypes); }},
+      opts_.get_dtypes());
   }
 
   out_columns.reserve(column_types.size());
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index ef1b2a52f71..222bb2c5cb3 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -185,9 +185,17 @@ class reader::impl {
    * @brief Sorts the columns' data types from the map of dtypes.
    *
    * @param col_type_map Column name -> data type map specifying the columns' target data types
-   * @return Sorted ist of columns' data types
+   * @return Sorted list of selected columns' data types
    */
-  std::vector<data_type> sort_data_types(std::map<std::string, data_type> const& col_type_map);
+  std::vector<data_type> select_data_types(std::map<std::string, data_type> const& col_type_map);
+
+  /**
+   * @brief Selects the data types of the active columns from the vector of dtypes.
+   *
+   * @param dtypes Vector of data types specifying the columns' target data types
+   * @return List of selected columns' data types
+   */
+  std::vector<data_type> select_data_types(std::vector<data_type> const& dtypes);
 
   /**
    * @brief Parses the columns' data types from the vector of dtypes that are provided as strings.
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index 1f783754cfb..27ba1336774 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -54,6 +54,12 @@ using cudf::data_type;
 using cudf::type_id;
 using cudf::type_to_id;
 
+template <typename T>
+auto dtype()
+{
+  return data_type{type_to_id<T>()};
+}
+
 template <typename T, typename SourceElementT = T>
 using column_wrapper =
   typename std::conditional<std::is_same<T, cudf::string_view>::value,
@@ -96,8 +102,8 @@ struct CsvFixedPointReaderTest : public CsvReaderTest {
   void run_tests(const std::vector<std::string>& reference_strings, numeric::scale_type scale)
   {
     cudf::test::strings_column_wrapper strings(reference_strings.begin(), reference_strings.end());
-    auto input_column = cudf::strings::to_fixed_point(
-      cudf::strings_column_view(strings), data_type{type_to_id<DecimalType>(), scale});
+    auto input_column = cudf::strings::to_fixed_point(cudf::strings_column_view(strings),
+                                                      data_type{type_to_id<DecimalType>(), scale});
 
     std::string buffer = std::accumulate(reference_strings.begin(),
                                          reference_strings.end(),
@@ -392,9 +398,9 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnNegativeScale)
   reference_strings = valid_reference_strings;
 
   using DecimalType = TypeParam;
-  auto input_column = cudf::strings::to_fixed_point(
-    cudf::strings_column_view(strings),
-    data_type{type_to_id<DecimalType>(), numeric::scale_type{-2}});
+  auto input_column =
+    cudf::strings::to_fixed_point(cudf::strings_column_view(strings),
+                                  data_type{type_to_id<DecimalType>(), numeric::scale_type{-2}});
 
   auto input_table = cudf::table_view{std::vector<cudf::column_view>{*input_column}};
 
@@ -438,9 +444,9 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnPositiveScale)
   reference_strings = valid_reference_strings;
 
   using DecimalType = TypeParam;
-  auto input_column = cudf::strings::to_fixed_point(
-    cudf::strings_column_view(strings),
-    data_type{type_to_id<DecimalType>(), numeric::scale_type{3}});
+  auto input_column =
+    cudf::strings::to_fixed_point(cudf::strings_column_view(strings),
+                                  data_type{type_to_id<DecimalType>(), numeric::scale_type{3}});
 
   auto input_table = cudf::table_view{std::vector<cudf::column_view>{*input_column}};
 
@@ -482,11 +488,10 @@ TEST_F(CsvReaderTest, MultiColumn)
   {
     std::ostringstream line;
     for (int i = 0; i < num_rows; ++i) {
-      line << std::to_string(int8_values[i]) << "," << int16_values[i] << "," << int16_values[i]
-           << "," << int32_values[i] << "," << int32_values[i] << "," << int64_values[i] << ","
-           << int64_values[i] << "," << std::to_string(uint8_values[i]) << "," << uint16_values[i]
-           << "," << uint32_values[i] << "," << uint64_values[i] << "," << float32_values[i] << ","
-           << float32_values[i] << "," << float64_values[i] << "," << float64_values[i] << "\n";
+      line << std::to_string(int8_values[i]) << "," << int16_values[i] << "," << int32_values[i]
+           << "," << int64_values[i] << "," << std::to_string(uint8_values[i]) << ","
+           << uint16_values[i] << "," << uint32_values[i] << "," << uint64_values[i] << ","
+           << float32_values[i] << "," << float64_values[i] << "\n";
     }
     std::ofstream outfile(filepath, std::ofstream::out);
     outfile << line.str();
@@ -495,39 +500,29 @@ TEST_F(CsvReaderTest, MultiColumn)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .header(-1)
-      .dtypes(std::vector<std::string>{"int8",
-                                       "short",
-                                       "int16",
-                                       "int",
-                                       "int32",
-                                       "long",
-                                       "int64",
-                                       "uint8",
-                                       "uint16",
-                                       "uint32",
-                                       "uint64",
-                                       "float",
-                                       "float32",
-                                       "double",
-                                       "float64"});
+      .dtypes({dtype<int8_t>(),
+               dtype<int16_t>(),
+               dtype<int32_t>(),
+               dtype<int64_t>(),
+               dtype<uint8_t>(),
+               dtype<uint16_t>(),
+               dtype<uint32_t>(),
+               dtype<uint64_t>(),
+               dtype<float>(),
+               dtype<double>()});
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
   expect_column_data_equal(int8_values, view.column(0));
   expect_column_data_equal(int16_values, view.column(1));
-  expect_column_data_equal(int16_values, view.column(2));
-  expect_column_data_equal(int32_values, view.column(3));
-  expect_column_data_equal(int32_values, view.column(4));
-  expect_column_data_equal(int64_values, view.column(5));
-  expect_column_data_equal(int64_values, view.column(6));
-  expect_column_data_equal(uint8_values, view.column(7));
-  expect_column_data_equal(uint16_values, view.column(8));
-  expect_column_data_equal(uint32_values, view.column(9));
-  expect_column_data_equal(uint64_values, view.column(10));
-  expect_column_data_equal(float32_values, view.column(11));
-  expect_column_data_equal(float32_values, view.column(12));
-  expect_column_data_equal(float64_values, view.column(13));
-  expect_column_data_equal(float64_values, view.column(14));
+  expect_column_data_equal(int32_values, view.column(2));
+  expect_column_data_equal(int64_values, view.column(3));
+  expect_column_data_equal(uint8_values, view.column(4));
+  expect_column_data_equal(uint16_values, view.column(5));
+  expect_column_data_equal(uint32_values, view.column(6));
+  expect_column_data_equal(uint64_values, view.column(7));
+  expect_column_data_equal(float32_values, view.column(8));
+  expect_column_data_equal(float64_values, view.column(9));
 }
 
 TEST_F(CsvReaderTest, RepeatColumn)
@@ -552,7 +547,7 @@ TEST_F(CsvReaderTest, RepeatColumn)
   // repeats column in indexes and names, misses 1 column.
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
-      .dtypes(std::vector<std::string>{"int16", "int64", "uint64", "float"})
+      .dtypes({dtype<int16_t>(), dtype<int64_t>(), dtype<uint64_t>(), dtype<float>()})
       .names({"A", "B", "C", "D"})
       .use_cols_indexes({1, 0, 0})
       .use_cols_names({"D", "B", "B"})
@@ -578,7 +573,7 @@ TEST_F(CsvReaderTest, Booleans)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A", "B", "C", "D"})
-      .dtypes(std::vector<std::string>{"int32", "int32", "short", "bool"})
+      .dtypes({dtype<int32_t>(), dtype<int32_t>(), dtype<int16_t>(), dtype<bool>()})
       .true_values({"yes", "Yes", "YES", "foo", "FOO"})
       .false_values({"no", "No", "NO", "Bar", "bar"})
       .header(-1);

From 1d5cf0090bef3f98aa776665703bec3b14770b9c Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Thu, 22 Jul 2021 14:37:29 -0700
Subject: [PATCH 04/23] convert tests to new API

---
 cpp/tests/io/csv_test.cpp | 128 ++++++++++++++++----------------------
 1 file changed, 55 insertions(+), 73 deletions(-)

diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index 27ba1336774..3af1e5d5ddb 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -605,7 +605,7 @@ TEST_F(CsvReaderTest, Dates)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"date"})
+      .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}})
       .dayfirst(true)
       .header(-1);
   auto result = cudf_io::read_csv(in_opts);
@@ -641,10 +641,9 @@ TEST_F(CsvReaderTest, DatesCastToTimestampSeconds)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"date"})
+      .dtypes({data_type{type_id::TIMESTAMP_SECONDS}})
       .dayfirst(true)
-      .header(-1)
-      .timestamp_type(data_type{type_id::TIMESTAMP_SECONDS});
+      .header(-1);
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
@@ -678,10 +677,9 @@ TEST_F(CsvReaderTest, DatesCastToTimestampMilliSeconds)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"date"})
+      .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}})
       .dayfirst(true)
-      .header(-1)
-      .timestamp_type(data_type{type_id::TIMESTAMP_MILLISECONDS});
+      .header(-1);
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
@@ -715,10 +713,9 @@ TEST_F(CsvReaderTest, DatesCastToTimestampMicroSeconds)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"date"})
+      .dtypes({data_type{type_id::TIMESTAMP_MICROSECONDS}})
       .dayfirst(true)
-      .header(-1)
-      .timestamp_type(data_type{type_id::TIMESTAMP_MICROSECONDS});
+      .header(-1);
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
@@ -752,10 +749,9 @@ TEST_F(CsvReaderTest, DatesCastToTimestampNanoSeconds)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"date"})
+      .dtypes({data_type{type_id::TIMESTAMP_NANOSECONDS}})
       .dayfirst(true)
-      .header(-1)
-      .timestamp_type(data_type{type_id::TIMESTAMP_NANOSECONDS});
+      .header(-1);
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
@@ -793,9 +789,8 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampSeconds)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"datetime64[s]"})
-      .header(-1)
-      .timestamp_type(data_type{type_id::TIMESTAMP_SECONDS});
+      .dtypes({data_type{type_id::TIMESTAMP_SECONDS}})
+      .header(-1);
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
@@ -822,9 +817,8 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampMilliSeconds)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"datetime64[ms]"})
-      .header(-1)
-      .timestamp_type(data_type{type_id::TIMESTAMP_MILLISECONDS});
+      .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}})
+      .header(-1);
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
@@ -851,9 +845,8 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampMicroSeconds)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"datetime64[us]"})
-      .header(-1)
-      .timestamp_type(data_type{type_id::TIMESTAMP_MICROSECONDS});
+      .dtypes({data_type{type_id::TIMESTAMP_MICROSECONDS}})
+      .header(-1);
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
@@ -880,9 +873,8 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampNanoSeconds)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"datetime64[ns]"})
-      .header(-1)
-      .timestamp_type(data_type{type_id::TIMESTAMP_NANOSECONDS});
+      .dtypes({data_type{type_id::TIMESTAMP_NANOSECONDS}})
+      .header(-1);
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
@@ -905,7 +897,7 @@ TEST_F(CsvReaderTest, FloatingPoint)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"float32"})
+      .dtypes({dtype<float>()})
       .lineterminator(';')
       .header(-1);
   auto result = cudf_io::read_csv(in_opts);
@@ -938,7 +930,7 @@ TEST_F(CsvReaderTest, Strings)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names(names)
-      .dtypes(std::vector<std::string>{"int32", "str"})
+      .dtypes(std::vector<data_type>{dtype<int32_t>(), dtype<cudf::string_view>()})
       .quoting(cudf_io::quote_style::NONE);
   auto result = cudf_io::read_csv(in_opts);
 
@@ -968,7 +960,7 @@ TEST_F(CsvReaderTest, StringsQuotes)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names(names)
-      .dtypes(std::vector<std::string>{"int32", "str"})
+      .dtypes(std::vector<data_type>{dtype<int32_t>(), dtype<cudf::string_view>()})
       .quotechar('`');
   auto result = cudf_io::read_csv(in_opts);
 
@@ -997,7 +989,7 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names(names)
-      .dtypes(std::vector<std::string>{"int32", "str"})
+      .dtypes(std::vector<data_type>{dtype<int32_t>(), dtype<cudf::string_view>()})
       .quoting(cudf_io::quote_style::NONE)
       .doublequote(false);
   auto result = cudf_io::read_csv(in_opts);
@@ -1023,7 +1015,7 @@ TEST_F(CsvReaderTest, SkiprowsNrows)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"int32"})
+      .dtypes({dtype<int32_t>()})
       .header(1)
       .skiprows(2)
       .nrows(2);
@@ -1047,7 +1039,7 @@ TEST_F(CsvReaderTest, ByteRange)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"int32"})
+      .dtypes({dtype<int32_t>()})
       .header(-1)
       .byte_range_offset(11)
       .byte_range_size(15);
@@ -1066,7 +1058,7 @@ TEST_F(CsvReaderTest, ByteRangeStrings)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{input.c_str(), input.size()})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"str"})
+      .dtypes({dtype<cudf::string_view>()})
       .header(-1)
       .byte_range_offset(4);
   auto result = cudf_io::read_csv(in_opts);
@@ -1089,7 +1081,7 @@ TEST_F(CsvReaderTest, BlanksAndComments)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"int32"})
+      .dtypes({dtype<int32_t>()})
       .header(-1)
       .comment('#');
   auto result = cudf_io::read_csv(in_opts);
@@ -1164,7 +1156,7 @@ TEST_F(CsvReaderTest, ArrowFileSource)
   auto arrow_source = cudf_io::arrow_io_source{infile};
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{&arrow_source})
-      .dtypes(std::vector<std::string>{"int8"});
+      .dtypes({dtype<int8_t>()});
   auto result = cudf_io::read_csv(in_opts);
 
   const auto view = result.tbl->view();
@@ -1185,7 +1177,7 @@ TEST_F(CsvReaderTest, InvalidFloatingPoint)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"float32"})
+      .dtypes({dtype<float>()})
       .header(-1);
   const auto result = cudf_io::read_csv(in_opts);
 
@@ -1294,7 +1286,7 @@ TEST_F(CsvReaderTest, nullHandling)
     cudf_io::csv_reader_options in_opts =
       cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
         .na_filter(false)
-        .dtypes(std::vector<std::string>{"str"})
+        .dtypes({dtype<cudf::string_view>()})
         .header(-1)
         .skip_blank_lines(false);
     const auto result = cudf_io::read_csv(in_opts);
@@ -1308,7 +1300,7 @@ TEST_F(CsvReaderTest, nullHandling)
   {
     cudf_io::csv_reader_options in_opts =
       cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
-        .dtypes(std::vector<std::string>{"str"})
+        .dtypes({dtype<cudf::string_view>()})
         .header(-1)
         .skip_blank_lines(false);
     const auto result = cudf_io::read_csv(in_opts);
@@ -1325,7 +1317,7 @@ TEST_F(CsvReaderTest, nullHandling)
     cudf_io::csv_reader_options in_opts =
       cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
         .na_values({"Null"})
-        .dtypes(std::vector<std::string>{"str"})
+        .dtypes({dtype<cudf::string_view>()})
         .header(-1)
         .skip_blank_lines(false);
     const auto result = cudf_io::read_csv(in_opts);
@@ -1343,7 +1335,7 @@ TEST_F(CsvReaderTest, nullHandling)
       cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
         .keep_default_na(false)
         .na_values({"Null"})
-        .dtypes(std::vector<std::string>{"str"})
+        .dtypes({dtype<cudf::string_view>()})
         .header(-1)
         .skip_blank_lines(false);
     const auto result = cudf_io::read_csv(in_opts);
@@ -1553,18 +1545,13 @@ TEST_F(CsvReaderTest, MultiColumnWithWriter)
 
   std::vector<cudf::column_view> input_columns{int8_column,
                                                int16_column,
-                                               int16_column,
-                                               int32_column,
                                                int32_column,
                                                int64_column,
-                                               int64_column,
                                                uint8_column,
                                                uint16_column,
                                                uint32_column,
                                                uint64_column,
                                                float32_column,
-                                               float32_column,
-                                               float64_column,
                                                float64_column};
   cudf::table_view input_table{input_columns};
 
@@ -1575,26 +1562,21 @@ TEST_F(CsvReaderTest, MultiColumnWithWriter)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .header(-1)
-      .dtypes(std::vector<std::string>{"int8",
-                                       "short",
-                                       "int16",
-                                       "int",
-                                       "int32",
-                                       "long",
-                                       "int64",
-                                       "uint8",
-                                       "uint16",
-                                       "uint32",
-                                       "uint64",
-                                       "float",
-                                       "float32",
-                                       "double",
-                                       "float64"});
+      .dtypes({dtype<int8_t>(),
+               dtype<int16_t>(),
+               dtype<int32_t>(),
+               dtype<int64_t>(),
+               dtype<uint8_t>(),
+               dtype<uint16_t>(),
+               dtype<uint32_t>(),
+               dtype<uint64_t>(),
+               dtype<float>(),
+               dtype<double>()});
   auto result = cudf_io::read_csv(in_opts);
 
   const auto result_table = result.tbl->view();
 
-  std::vector<cudf::size_type> non_float64s{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  std::vector<cudf::size_type> non_float64s{0, 1, 2, 3, 4, 5, 6, 7, 8};
   const auto input_sliced_view  = input_table.select(non_float64s);
   const auto result_sliced_view = result_table.select(non_float64s);
   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_sliced_view, result_sliced_view);
@@ -1604,9 +1586,6 @@ TEST_F(CsvReaderTest, MultiColumnWithWriter)
   auto float64_col_idx = non_float64s.size();
   check_float_column(
     input_table.column(float64_col_idx), result_table.column(float64_col_idx), tol, validity);
-  ++float64_col_idx;
-  check_float_column(
-    input_table.column(float64_col_idx), result_table.column(float64_col_idx), tol, validity);
 }
 
 TEST_F(CsvReaderTest, DatesWithWriter)
@@ -1631,7 +1610,7 @@ TEST_F(CsvReaderTest, DatesWithWriter)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"date"})
+      .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}})
       .dayfirst(true)
       .header(-1);
   auto result = cudf_io::read_csv(in_opts);
@@ -1762,7 +1741,7 @@ TEST_F(CsvReaderTest, FloatingPointWithWriter)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names({"A"})
-      .dtypes(std::vector<std::string>{"float64"})
+      .dtypes({dtype<double>()})
       .header(-1);
   // in_opts.lineterminator = ';';
   auto result = cudf_io::read_csv(in_opts);
@@ -1788,7 +1767,7 @@ TEST_F(CsvReaderTest, StringsWithWriter)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names(names)
-      .dtypes(std::vector<std::string>{"int32", "str"})
+      .dtypes(std::vector<data_type>{dtype<int32_t>(), dtype<cudf::string_view>()})
       .quoting(cudf_io::quote_style::NONE);
   auto result = cudf_io::read_csv(in_opts);
 
@@ -1813,7 +1792,7 @@ TEST_F(CsvReaderTest, StringsWithWriterSimple)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names(names)
-      .dtypes(std::vector<std::string>{"int32", "str"})
+      .dtypes(std::vector<data_type>{dtype<int32_t>(), dtype<cudf::string_view>()})
       .quoting(cudf_io::quote_style::NONE);
   auto result = cudf_io::read_csv(in_opts);
 
@@ -1837,7 +1816,7 @@ TEST_F(CsvReaderTest, StringsEmbeddedDelimiter)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names(names)
-      .dtypes(std::vector<std::string>{"int32", "str"});
+      .dtypes(std::vector<data_type>{dtype<int32_t>(), dtype<cudf::string_view>()});
   auto result = cudf_io::read_csv(in_opts);
 
   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view());
@@ -1892,7 +1871,7 @@ TEST_F(CsvReaderTest, UserImplementedSource)
   TestSource source{csv_data.str()};
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{&source})
-      .dtypes(std::vector<std::string>{"int8", "int16", "int32"})
+      .dtypes({dtype<int8_t>(), dtype<int16_t>(), dtype<int32_t>()})
       .header(-1);
   auto result = cudf_io::read_csv(in_opts);
 
@@ -1937,8 +1916,11 @@ TEST_F(CsvReaderTest, DurationsWithWriter)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
       .names(names)
-      .dtypes(std::vector<std::string>{
-        "timedelta[D]", "timedelta64[s]", "timedelta64[ms]", "timedelta64[us]", "timedelta64[ns]"});
+      .dtypes({data_type{type_id::DURATION_DAYS},
+               data_type{type_id::DURATION_SECONDS},
+               data_type{type_id::DURATION_MILLISECONDS},
+               data_type{type_id::DURATION_MICROSECONDS},
+               data_type{type_id::DURATION_NANOSECONDS}});
   auto result = cudf_io::read_csv(in_opts);
 
   const auto result_table = result.tbl->view();
@@ -2146,7 +2128,7 @@ TEST_F(CsvReaderTest, DtypesMap)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()})
       .names({"A", "B"})
-      .dtypes({{"B", data_type{type_to_id<int16_t>()}}, {"A", data_type{type_to_id<int32_t>()}}})
+      .dtypes({{"B", dtype<int16_t>()}, {"A", dtype<int32_t>()}})
       .header(-1);
   auto result = cudf_io::read_csv(in_opts);
 
@@ -2165,7 +2147,7 @@ TEST_F(CsvReaderTest, DtypesMapInvalid)
   cudf_io::csv_reader_options in_opts =
     cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()})
       .names({"A", "B"})
-      .dtypes({{"A", data_type{type_to_id<int16_t>()}}});
+      .dtypes({{"A", dtype<int16_t>()}});
 
   EXPECT_THROW(cudf_io::read_csv(in_opts), cudf::logic_error);
 }

From 64a2a07b6418902d237c658247ceca009d92daf6 Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Thu, 22 Jul 2021 16:19:07 -0700
Subject: [PATCH 05/23]  infer_date -> parse_date for consistency; add
 parse_hex to libcudf

---
 cpp/include/cudf/io/csv.hpp          | 80 +++++++++++++++++++++++-----
 cpp/src/io/csv/reader_impl.cu        | 22 ++++++--
 python/cudf/cudf/_lib/cpp/io/csv.pxd | 12 +++--
 python/cudf/cudf/_lib/csv.pyx        | 12 ++---
 4 files changed, 98 insertions(+), 28 deletions(-)

diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index e70353dff9f..8ebb3a72a85 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -104,9 +104,13 @@ class csv_reader_options {
   // Whether a quote inside a value is double-quoted
   bool _doublequote = true;
   // Names of columns to read as datetime
-  std::vector<std::string> _infer_date_names;
+  std::vector<std::string> _parse_dates_names;
   // Indexes of columns to read as datetime
-  std::vector<int> _infer_date_indexes;
+  std::vector<int> _parse_dates_indexes;
+  // Names of columns to parse as hexadecimal
+  std::vector<std::string> _parse_hex_names;
+  // Indexes of columns to parse as hexadecimal
+  std::vector<int> _parse_hex_indexes;
 
   // Conversion settings
 
@@ -281,12 +285,22 @@ class csv_reader_options {
   /**
    * @brief Returns names of columns to read as datetime.
    */
-  std::vector<std::string> const& get_infer_date_names() const { return _infer_date_names; }
+  std::vector<std::string> const& get_parse_dates_names() const { return _parse_dates_names; }
 
   /**
    * @brief Returns indexes of columns to read as datetime.
    */
-  std::vector<int> const& get_infer_date_indexes() const { return _infer_date_indexes; }
+  std::vector<int> const& get_parse_dates_indexes() const { return _parse_dates_indexes; }
+
+  /**
+   * @brief Returns names of columns to parse as hexadecimal.
+   */
+  std::vector<std::string> const& get_parse_hex_names() const { return _parse_hex_names; }
+
+  /**
+   * @brief Returns indexes of columns to parse as hexadecimal.
+   */
+  std::vector<int> const& get_parse_hex_indexes() const { return _parse_hex_indexes; }
 
   /**
    * @brief Returns per-column types.
@@ -551,9 +565,9 @@ class csv_reader_options {
    *
    * @param col_names Vector of column names to infer as datetime.
    */
-  void set_infer_date_names(std::vector<std::string> col_names)
+  void set_parse_dates(std::vector<std::string> col_names)
   {
-    _infer_date_names = std::move(col_names);
+    _parse_dates_names = std::move(col_names);
   }
 
   /**
@@ -561,11 +575,25 @@ class csv_reader_options {
    *
    * @param col_names Vector of column indices to infer as datetime.
    */
-  void set_infer_date_indexes(std::vector<int> col_ind)
+  void set_parse_dates(std::vector<int> col_ind) { _parse_dates_indexes = std::move(col_ind); }
+
+  /**
+   * @brief Sets names of columns to parse as hexadecimal
+   *
+   * @param col_names Vector of column names to parse as hexadecimal
+   */
+  void set_parse_hex(std::vector<std::string> col_names)
   {
-    _infer_date_indexes = std::move(col_ind);
+    _parse_hex_names = std::move(col_names);
   }
 
+  /**
+   * @brief Sets indexes of columns to parse as hexadecimal
+   *
+   * @param col_ind Vector of column indices to parse as hexadecimal
+   */
+  void set_parse_hex(std::vector<int> col_ind) { _parse_hex_indexes = std::move(col_ind); }
+
   /**
    * @brief Sets per-column types
    *
@@ -969,24 +997,48 @@ class csv_reader_options_builder {
   /**
    * @brief Sets names of columns to read as datetime.
    *
-   * @param col_names Vector of column names to infer as datetime.
+   * @param col_names Vector of column names to read as datetime.
    * @return this for chaining.
    */
-  csv_reader_options_builder& infer_date_names(std::vector<std::string> col_names)
+  csv_reader_options_builder& parse_dates(std::vector<std::string> col_names)
   {
-    options._infer_date_names = std::move(col_names);
+    options._parse_dates_names = std::move(col_names);
     return *this;
   }
 
   /**
    * @brief Sets indexes of columns to read as datetime.
    *
-   * @param col_names Vector of column indices to infer as datetime.
+   * @param col_ind Vector of column indices to read as datetime.
+   * @return this for chaining.
+   */
+  csv_reader_options_builder& parse_dates(std::vector<int> col_ind)
+  {
+    options._parse_dates_indexes = std::move(col_ind);
+    return *this;
+  }
+
+  /**
+   * @brief Sets names of columns to parse as hexadecimal.
+   *
+   * @param col_names Vector of column names to parse as hexadecimal
+   * @return this for chaining.
+   */
+  csv_reader_options_builder& parse_hex(std::vector<std::string> col_names)
+  {
+    options._parse_hex_names = std::move(col_names);
+    return *this;
+  }
+
+  /**
+   * @brief Sets indexes of columns to parse as hexadecimal.
+   *
+   * @param col_ind Vector of column indices to parse as hexadecimal
    * @return this for chaining.
    */
-  csv_reader_options_builder& infer_date_indexes(std::vector<int> col_ind)
+  csv_reader_options_builder& parse_hex(std::vector<int> col_ind)
   {
-    options._infer_date_indexes = std::move(col_ind);
+    options._parse_hex_indexes = std::move(col_ind);
     return *this;
   }
 
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 02196d17fc1..656dde935df 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -390,13 +390,13 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
     }
   }
 
-  // User can specify which columns should be inferred as datetime
-  if (!opts_.get_infer_date_indexes().empty() || !opts_.get_infer_date_names().empty()) {
-    for (const auto index : opts_.get_infer_date_indexes()) {
+  // User can specify which columns should be read as datetime
+  if (!opts_.get_parse_dates_indexes().empty() || !opts_.get_parse_dates_names().empty()) {
+    for (const auto index : opts_.get_parse_dates_indexes()) {
       column_flags_[index] |= column_parse::as_datetime;
     }
 
-    for (const auto& name : opts_.get_infer_date_names()) {
+    for (const auto& name : opts_.get_parse_dates_names()) {
       auto it = std::find(col_names_.begin(), col_names_.end(), name);
       if (it != col_names_.end()) {
         column_flags_[it - col_names_.begin()] |= column_parse::as_datetime;
@@ -404,6 +404,20 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
     }
   }
 
+  // User can specify which columns should be parsed as hexadecimal
+  if (!opts_.get_parse_hex_indexes().empty() || !opts_.get_parse_hex_names().empty()) {
+    for (const auto index : opts_.get_parse_hex_indexes()) {
+      column_flags_[index] |= column_parse::as_hexadecimal;
+    }
+
+    for (const auto& name : opts_.get_parse_hex_names()) {
+      auto it = std::find(col_names_.begin(), col_names_.end(), name);
+      if (it != col_names_.end()) {
+        column_flags_[it - col_names_.begin()] |= column_parse::as_hexadecimal;
+      }
+    }
+  }
+
   // Return empty table rather than exception if nothing to load
   if (num_active_cols_ == 0) { return {std::make_unique<table>(), {}}; }
 
diff --git a/python/cudf/cudf/_lib/cpp/io/csv.pxd b/python/cudf/cudf/_lib/cpp/io/csv.pxd
index c5e235b5697..2d6bdf28f7f 100644
--- a/python/cudf/cudf/_lib/cpp/io/csv.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/csv.pxd
@@ -49,8 +49,10 @@ cdef extern from "cudf/io/csv.hpp" \
         cudf_io_types.quote_style get_quoting() except+
         char get_quotechar() except+
         bool is_enabled_doublequote() except+
-        vector[string] get_infer_date_names() except+
-        vector[int] get_infer_date_indexes() except+
+        vector[string] get_parse_dates_names() except+
+        vector[int] get_parse_dates_indexes() except+
+        vector[string] get_parse_hex_names() except+
+        vector[int] get_parse_hex_indexes() except+
 
         # Conversion settings
         vector[string] get_dtype() except+
@@ -92,8 +94,10 @@ cdef extern from "cudf/io/csv.hpp" \
         void set_quoting(cudf_io_types.quote_style style) except+
         void set_quotechar(char val) except+
         void set_doublequote(bool val) except+
-        void set_infer_date_names(vector[string]) except+
-        void set_infer_date_indexes(vector[int]) except+
+        void set_parse_dates(vector[string]) except+
+        void set_parse_dates(vector[int]) except+
+        void set_parse_hex(vector[string]) except+
+        void set_parse_hex(vector[int]) except+
 
         # Conversion settings
         void set_dtypes(vector[string] types) except+
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index 773e81a0a7b..55033c0a0ba 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -116,8 +116,8 @@ cdef csv_reader_options make_csv_reader_options(
     cdef vector[string] c_use_cols_names
     cdef size_type c_nrows = nrows if nrows is not None else -1
     cdef quote_style c_quoting
-    cdef vector[string] c_infer_date_names
-    cdef vector[int] c_infer_date_indexes
+    cdef vector[string] c_parse_dates_names
+    cdef vector[int] c_parse_dates_indexes
     cdef vector[string] c_dtypes
     cdef vector[string] c_true_values
     cdef vector[string] c_false_values
@@ -220,14 +220,14 @@ cdef csv_reader_options make_csv_reader_options(
                 "`parse_dates`: non-lists are unsupported")
         for col in parse_dates:
             if isinstance(col, str):
-                c_infer_date_names.push_back(str(col).encode())
+                c_parse_dates_names.push_back(str(col).encode())
             elif isinstance(col, int):
-                c_infer_date_indexes.push_back(col)
+                c_parse_dates_indexes.push_back(col)
             else:
                 raise NotImplementedError(
                     "`parse_dates`: Nesting is unsupported")
-        csv_reader_options_c.set_infer_date_names(c_infer_date_names)
-        csv_reader_options_c.set_infer_date_indexes(c_infer_date_indexes)
+        csv_reader_options_c.set_parse_dates(c_parse_dates_names)
+        csv_reader_options_c.set_parse_dates(c_parse_dates_indexes)
 
     if dtype is not None:
         if isinstance(dtype, abc.Mapping):

From ee585c1dc295a87749163b95b531121ab46c2af8 Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Fri, 23 Jul 2021 11:39:02 -0700
Subject: [PATCH 06/23] use new hex API in tests

---
 cpp/tests/io/csv_test.cpp | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index 3af1e5d5ddb..35a5c531b3a 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -1467,16 +1467,35 @@ TEST_F(CsvReaderTest, HexTest)
     std::ofstream outfile(filepath, std::ofstream::out);
     outfile << "0x0\n-0x1000\n0xfedcba\n0xABCDEF\n0xaBcDeF\n9512c20b\n";
   }
+  // specify hex columns by name
+  {
+    cudf_io::csv_reader_options in_opts =
+      cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
+        .names({"A"})
+        .dtypes({dtype<int64_t>()})
+        .header(-1)
+        .parse_hex({"A"});
+    auto result = cudf_io::read_csv(in_opts);
 
-  cudf_io::csv_reader_options in_opts =
-    cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
-      .names({"A"})
-      .dtypes(std::vector<std::string>{"hex"})
-      .header(-1);
-  auto result = cudf_io::read_csv(in_opts);
+    expect_column_data_equal(
+      std::vector<int64_t>{0, -4096, 16702650, 11259375, 11259375, 2501034507},
+      result.tbl->view().column(0));
+  }
 
-  expect_column_data_equal(std::vector<int64_t>{0, -4096, 16702650, 11259375, 11259375, 2501034507},
-                           result.tbl->view().column(0));
+  // specify hex columns by index
+  {
+    cudf_io::csv_reader_options in_opts =
+      cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
+        .names({"A"})
+        .dtypes({dtype<int64_t>()})
+        .header(-1)
+        .parse_hex(std::vector<int>{0});
+    auto result = cudf_io::read_csv(in_opts);
+
+    expect_column_data_equal(
+      std::vector<int64_t>{0, -4096, 16702650, 11259375, 11259375, 2501034507},
+      result.tbl->view().column(0));
+  }
 }
 
 TYPED_TEST(CsvReaderNumericTypeTest, SingleColumnWithWriter)

From 0e24ae8fbd2d3d5f44a39a0dcc1dc8a209cf71f7 Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Fri, 23 Jul 2021 11:44:16 -0700
Subject: [PATCH 07/23] re-enable json tests that were accidentally disabled

---
 cpp/tests/io/json_test.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 426a39ce9d3..282e484327e 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -145,7 +145,6 @@ void check_float_column(cudf::column_view const& col,
 struct JsonReaderTest : public cudf::test::BaseFixture {
 };
 
-/*
 TEST_F(JsonReaderTest, BasicJsonLines)
 {
   std::string data = "[1, 1.1]\n[2, 2.2]\n[3, 3.3]\n";
@@ -615,10 +614,10 @@ TEST_F(JsonReaderTest, JsonLinesObjectsOutOfOrder)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2),
                                  cudf::test::strings_column_wrapper({"aaa", "bbb"}));
 }
-*/
-/*
+
 // currently, the json reader is strict about having non-empty input.
-TEST_F(JsonReaderTest, EmptyFile) {
+TEST_F(JsonReaderTest, EmptyFile)
+{
   auto filepath = temp_env->get_temp_dir() + "EmptyFile.csv";
   {
     std::ofstream outfile{filepath, std::ofstream::out};
@@ -634,7 +633,8 @@ TEST_F(JsonReaderTest, EmptyFile) {
 }
 
 // currently, the json reader is strict about having non-empty input.
-TEST_F(JsonReaderTest, NoDataFile) {
+TEST_F(JsonReaderTest, NoDataFile)
+{
   auto filepath = temp_env->get_temp_dir() + "NoDataFile.csv";
   {
     std::ofstream outfile{filepath, std::ofstream::out};
@@ -648,8 +648,7 @@ TEST_F(JsonReaderTest, NoDataFile) {
   const auto view = result.tbl->view();
   EXPECT_EQ(0, view.num_columns());
 }
-*/
-/*
+
 TEST_F(JsonReaderTest, ArrowFileSource)
 {
   const std::string fname = temp_env->get_temp_dir() + "ArrowFileSource.csv";
@@ -698,7 +697,8 @@ TEST_F(JsonReaderTest, InvalidFloatingPoint)
 
   const auto col_data = cudf::test::to_host<float>(result.tbl->view().column(0));
   // col_data.first contains the column data
-  for (const auto& elem : col_data.first) ASSERT_TRUE(std::isnan(elem));
+  for (const auto& elem : col_data.first)
+    ASSERT_TRUE(std::isnan(elem));
   // col_data.second contains the bitmasks
   ASSERT_EQ(0u, col_data.second[0]);
 }
@@ -861,7 +861,7 @@ TEST_F(JsonReaderTest, ParseOutOfRangeIntegers)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input_less_int64_min_append, view.column(8));
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input_mixed_range_append, view.column(9));
 }
-*/
+
 TEST_F(JsonReaderTest, JsonLinesMultipleFileInputs)
 {
   const std::string file1 = temp_env->get_temp_dir() + "JsonLinesFileTest1.json";

From 584f1805479007a1d2ed4065dbe55d5de1cc8568 Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Fri, 23 Jul 2021 22:15:24 -0700
Subject: [PATCH 08/23] small refactor to prepare for JSON API expansion

---
 cpp/src/io/json/reader_impl.cu  | 27 ++++++++++++++++-----------
 cpp/src/io/json/reader_impl.hpp |  3 +++
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
index b4395d6c965..2712e53a43e 100644
--- a/cpp/src/io/json/reader_impl.cu
+++ b/cpp/src/io/json/reader_impl.cu
@@ -464,17 +464,14 @@ void reader::impl::set_column_names(device_span<uint64_t const> rec_starts,
   }
 }
 
-void reader::impl::set_data_types(device_span<uint64_t const> rec_starts,
-                                  rmm::cuda_stream_view stream)
-{
-  auto const dtype = options_.get_dtypes();
-  if (!dtype.empty()) {
-    CUDF_EXPECTS(dtype.size() == metadata_.column_names.size(),
+void reader::impl::parse_data_types(std::vector<std::string> const& types_as_strings){
+
+  CUDF_EXPECTS(types_as_strings.size() == metadata_.column_names.size(),
                  "Need to specify the type of each column.\n");
 
     // Assume that the dtype is in dictionary format only if all elements contain a colon
     const bool is_dict =
-      std::all_of(std::cbegin(dtype), std::cend(dtype), [](const std::string& s) {
+      std::all_of(std::cbegin(types_as_strings), std::cend(types_as_strings), [](const std::string& s) {
         return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s);
       });
 
@@ -486,8 +483,8 @@ void reader::impl::set_data_types(device_span<uint64_t const> rec_starts,
     if (is_dict) {
       std::map<std::string, data_type> col_type_map;
       std::transform(
-        std::cbegin(dtype),
-        std::cend(dtype),
+        std::cbegin(types_as_strings),
+        std::cend(types_as_strings),
         std::inserter(col_type_map, col_type_map.end()),
         [&](auto const& ts) {
           auto const [col_name, type_str] = split_on_colon(ts);
@@ -500,11 +497,19 @@ void reader::impl::set_data_types(device_span<uint64_t const> rec_starts,
                      std::back_inserter(dtypes_),
                      [&](auto const& column_name) { return col_type_map[column_name]; });
     } else {
-      std::transform(std::cbegin(dtype),
-                     std::cend(dtype),
+      std::transform(std::cbegin(types_as_strings),
+                     std::cend(types_as_strings),
                      std::back_inserter(dtypes_),
                      [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); });
     }
+}
+
+void reader::impl::set_data_types(device_span<uint64_t const> rec_starts,
+                                  rmm::cuda_stream_view stream)
+{
+  auto const& dtype = options_.get_dtypes();
+  if (!dtype.empty()) {
+    parse_data_types(dtype);
   } else {
     CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n");
     auto const num_columns       = metadata_.column_names.size();
diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp
index bbda7e9ba74..0cf014f379b 100644
--- a/cpp/src/io/json/reader_impl.hpp
+++ b/cpp/src/io/json/reader_impl.hpp
@@ -158,6 +158,9 @@ class reader::impl {
    */
   void set_column_names(device_span<uint64_t const> rec_starts, rmm::cuda_stream_view stream);
 
+
+void parse_data_types(std::vector<std::string> const& types_as_strings);
+
   /**
    * @brief Set the data type array data member
    *

From 6b3778674811e3e8e8f7b0fc62839c34aca2c8e8 Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Fri, 23 Jul 2021 22:21:58 -0700
Subject: [PATCH 09/23] disable the xfail tests

---
 cpp/tests/io/json_test.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 282e484327e..b96cc9a041e 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -615,6 +615,7 @@ TEST_F(JsonReaderTest, JsonLinesObjectsOutOfOrder)
                                  cudf::test::strings_column_wrapper({"aaa", "bbb"}));
 }
 
+/*
 // currently, the json reader is strict about having non-empty input.
 TEST_F(JsonReaderTest, EmptyFile)
 {
@@ -648,6 +649,7 @@ TEST_F(JsonReaderTest, NoDataFile)
   const auto view = result.tbl->view();
   EXPECT_EQ(0, view.num_columns());
 }
+*/
 
 TEST_F(JsonReaderTest, ArrowFileSource)
 {

From 5120d7b0e77f8ae833d9de6333655ef5b465373c Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Fri, 23 Jul 2021 23:43:42 -0700
Subject: [PATCH 10/23] extend JSON API (no tests)

---
 cpp/include/cudf/io/json.hpp    |  59 +++++++++++++++--
 cpp/src/io/csv/reader_impl.hpp  |   2 +-
 cpp/src/io/json/reader_impl.cu  | 108 ++++++++++++++++++++------------
 cpp/src/io/json/reader_impl.hpp |   3 +-
 cpp/tests/io/json_test.cpp      |   5 +-
 5 files changed, 126 insertions(+), 51 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 2f4d0936d8b..f5d80f6f6c6 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -23,7 +23,9 @@
 
 #include <rmm/mr/device/per_device_resource.hpp>
 
+#include <map>
 #include <string>
+#include <variant>
 #include <vector>
 
 namespace cudf {
@@ -66,7 +68,8 @@ class json_reader_options {
   source_info _source;
 
   // Data types of the column; empty to infer dtypes
-  std::vector<std::string> _dtypes;
+  std::variant<std::vector<std::string>, std::vector<data_type>, std::map<std::string, data_type>>
+    _dtypes;
   // Specify the compression format of the source or infer from file extension
   compression_type _compression = compression_type::AUTO;
 
@@ -114,7 +117,13 @@ class json_reader_options {
   /**
    * @brief Returns data types of the columns.
    */
-  std::vector<std::string> const& get_dtypes() const { return _dtypes; }
+  std::variant<std::vector<std::string>,
+               std::vector<data_type>,
+               std::map<std::string, data_type>> const&
+  get_dtypes() const
+  {
+    return _dtypes;
+  }
 
   /**
    * @brief Returns compression format of the source.
@@ -146,14 +155,28 @@ class json_reader_options {
    *
    * @param types Vector dtypes in string format.
    */
-  void dtypes(std::vector<std::string> types) { _dtypes = std::move(types); }
+  void set_dtypes(std::vector<std::string> types) { _dtypes = std::move(types); }
+
+  /**
+   * @brief Set data types for columns to be read.
+   *
+   * @param types Vector of dtypes.
+   */
+
+  void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
+  /**
+   * @brief Set data types for columns to be read.
+   *
+   * @param types Column name -> dtype map.
+   */
+  void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }
 
   /**
    * @brief Set the compression type.
    *
    * @param comp_type The compression type used.
    */
-  void compression(compression_type comp_type) { _compression = comp_type; }
+  void set_compression(compression_type comp_type) { _compression = comp_type; }
 
   /**
    * @brief Set number of bytes to skip from source start.
@@ -205,8 +228,8 @@ class json_reader_options_builder {
   /**
    * @brief Set data types for columns to be read.
    *
-   * @param types Vector dtypes in string format.
-   * @return this for chaining.
+   * @param types Vector of dtypes in string format
+   * @return this for chaining
    */
   json_reader_options_builder& dtypes(std::vector<std::string> types)
   {
@@ -214,6 +237,30 @@ class json_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Set data types for columns to be read.
+   *
+   * @param types Vector of dtypes
+   * @return this for chaining
+   */
+  json_reader_options_builder& dtypes(std::vector<data_type> types)
+  {
+    options._dtypes = std::move(types);
+    return *this;
+  }
+
+  /**
+   * @brief Set data types for columns to be read.
+   *
+   * @param types Column name -> dtype map.
+   * @return this for chaining
+   */
+  json_reader_options_builder& dtypes(std::map<std::string, data_type> types)
+  {
+    options._dtypes = std::move(types);
+    return *this;
+  }
+
   /**
    * @brief Set the compression type.
    *
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index 222bb2c5cb3..6bd7b66874b 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -189,7 +189,7 @@ class reader::impl {
    */
   std::vector<data_type> select_data_types(std::map<std::string, data_type> const& col_type_map);
 
-    /**
+  /**
    * @brief Sorts the columns' data types from the map of dtypes.
    *
    * @param col_type_map Vector of deta types specifying the columns' target data types
diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
index 2712e53a43e..53318cf43ac 100644
--- a/cpp/src/io/json/reader_impl.cu
+++ b/cpp/src/io/json/reader_impl.cu
@@ -50,6 +50,15 @@ namespace json {
 using namespace cudf::io;
 
 namespace {
+/**
+ * @brief Helper class to support inline-overloading for all of a variant's alternative types
+ */
+template <class... Ts>
+struct VisitorOverload : Ts... {
+  using Ts::operator()...;
+};
+template <class... Ts>
+VisitorOverload(Ts...) -> VisitorOverload<Ts...>;
 
 /**
  * @brief Estimates the maximum expected length or a row, based on the number
@@ -236,7 +245,9 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size)
 {
   size_t map_range_size = 0;
   if (range_size != 0) {
-    map_range_size = range_size + calculate_max_row_size(options_.get_dtypes().size());
+    auto const dtype_option_size =
+      std::visit([](const auto& dtypes) { return dtypes.size(); }, options_.get_dtypes());
+    map_range_size = range_size + calculate_max_row_size(dtype_option_size);
   }
 
   // Support delayed opening of the file if using memory mapping datasource
@@ -464,52 +475,71 @@ void reader::impl::set_column_names(device_span<uint64_t const> rec_starts,
   }
 }
 
-void reader::impl::parse_data_types(std::vector<std::string> const& types_as_strings){
-
+std::vector<data_type> reader::impl::parse_data_types(
+  std::vector<std::string> const& types_as_strings)
+{
   CUDF_EXPECTS(types_as_strings.size() == metadata_.column_names.size(),
-                 "Need to specify the type of each column.\n");
-
-    // Assume that the dtype is in dictionary format only if all elements contain a colon
-    const bool is_dict =
-      std::all_of(std::cbegin(types_as_strings), std::cend(types_as_strings), [](const std::string& s) {
-        return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s);
+               "Need to specify the type of each column.\n");
+  std::vector<data_type> dtypes;
+  // Assume that the dtype is in dictionary format only if all elements contain a colon
+  const bool is_dict = std::all_of(
+    std::cbegin(types_as_strings), std::cend(types_as_strings), [](const std::string& s) {
+      return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s);
+    });
+
+  auto split_on_colon = [](std::string_view s) {
+    auto const i = s.find(":");
+    return std::pair{s.substr(0, i), s.substr(i + 1)};
+  };
+
+  if (is_dict) {
+    std::map<std::string, data_type> col_type_map;
+    std::transform(
+      std::cbegin(types_as_strings),
+      std::cend(types_as_strings),
+      std::inserter(col_type_map, col_type_map.end()),
+      [&](auto const& ts) {
+        auto const [col_name, type_str] = split_on_colon(ts);
+        return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})};
       });
 
-    auto split_on_colon = [](std::string_view s) {
-      auto const i = s.find(":");
-      return std::pair{s.substr(0, i), s.substr(i + 1)};
-    };
-
-    if (is_dict) {
-      std::map<std::string, data_type> col_type_map;
-      std::transform(
-        std::cbegin(types_as_strings),
-        std::cend(types_as_strings),
-        std::inserter(col_type_map, col_type_map.end()),
-        [&](auto const& ts) {
-          auto const [col_name, type_str] = split_on_colon(ts);
-          return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})};
-        });
-
-      // Using the map here allows O(n log n) complexity
-      std::transform(std::cbegin(metadata_.column_names),
-                     std::cend(metadata_.column_names),
-                     std::back_inserter(dtypes_),
-                     [&](auto const& column_name) { return col_type_map[column_name]; });
-    } else {
-      std::transform(std::cbegin(types_as_strings),
-                     std::cend(types_as_strings),
-                     std::back_inserter(dtypes_),
-                     [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); });
-    }
+    // Using the map here allows O(n log n) complexity
+    std::transform(std::cbegin(metadata_.column_names),
+                   std::cend(metadata_.column_names),
+                   std::back_inserter(dtypes),
+                   [&](auto const& column_name) { return col_type_map[column_name]; });
+  } else {
+    std::transform(std::cbegin(types_as_strings),
+                   std::cend(types_as_strings),
+                   std::back_inserter(dtypes),
+                   [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); });
+  }
+  return dtypes;
 }
 
 void reader::impl::set_data_types(device_span<uint64_t const> rec_starts,
                                   rmm::cuda_stream_view stream)
 {
-  auto const& dtype = options_.get_dtypes();
-  if (!dtype.empty()) {
-    parse_data_types(dtype);
+  bool has_to_infer_column_types =
+    std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes());
+  if (!has_to_infer_column_types) {
+    dtypes_ = std::visit(
+      VisitorOverload{
+        [&](const std::vector<data_type>& dtypes) { return dtypes; },
+        [&](const std::map<std::string, data_type>& dtypes) {
+          std::vector<data_type> sorted_dtypes;
+          std::transform(std::cbegin(metadata_.column_names),
+                         std::cend(metadata_.column_names),
+                         std::back_inserter(sorted_dtypes),
+                         [&](auto const& column_name) {
+                           auto const it = dtypes.find(column_name);
+                           CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns");
+                           return it->second;
+                         });
+          return sorted_dtypes;
+        },
+        [&](std::vector<std::string> const& dtypes) { return parse_data_types(dtypes); }},
+      options_.get_dtypes());
   } else {
     CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n");
     auto const num_columns       = metadata_.column_names.size();
diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp
index 0cf014f379b..5cf51369cdf 100644
--- a/cpp/src/io/json/reader_impl.hpp
+++ b/cpp/src/io/json/reader_impl.hpp
@@ -158,8 +158,7 @@ class reader::impl {
    */
   void set_column_names(device_span<uint64_t const> rec_starts, rmm::cuda_stream_view stream);
 
-
-void parse_data_types(std::vector<std::string> const& types_as_strings);
+  std::vector<data_type> parse_data_types(std::vector<std::string> const& types_as_strings);
 
   /**
    * @brief Set the data type array data member
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index b96cc9a041e..f8862d9e89c 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -151,7 +151,7 @@ TEST_F(JsonReaderTest, BasicJsonLines)
 
   cudf_io::json_reader_options in_options =
     cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()})
-      .dtypes({"int", "float64"})
+      .dtypes(std::vector<std::string>{"int", "float64"})
       .lines(true);
   cudf_io::table_with_metadata result = cudf_io::read_json(in_options);
 
@@ -670,8 +670,7 @@ TEST_F(JsonReaderTest, ArrowFileSource)
   ;
   cudf_io::table_with_metadata result = cudf_io::read_json(in_options);
 
-  EXPECT_EQ(result.tbl->num_columns(),
-            static_cast<cudf::size_type>(in_options.get_dtypes().size()));
+  EXPECT_EQ(result.tbl->num_columns(), 1);
   EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT8);
 
   auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });

From 6e4a888df3846bef6693e0b2e2f35c5a9de75c36 Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Sat, 24 Jul 2021 00:04:45 -0700
Subject: [PATCH 11/23] switch tests to new API

---
 cpp/tests/io/json_test.cpp | 75 +++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 42 deletions(-)

diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index f8862d9e89c..80d950cc7d8 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -42,6 +42,16 @@ using int64_wrapper        = wrapper<int64_t>;
 using timestamp_ms_wrapper = wrapper<cudf::timestamp_ms, cudf::timestamp_ms::rep>;
 using bool_wrapper         = wrapper<bool>;
 
+using cudf::data_type;
+using cudf::type_id;
+using cudf::type_to_id;
+
+template <typename T>
+auto dtype()
+{
+  return data_type{type_to_id<T>()};
+}
+
 template <typename T, typename SourceElementT = T>
 using column_wrapper =
   typename std::conditional<std::is_same<T, cudf::string_view>::value,
@@ -151,7 +161,7 @@ TEST_F(JsonReaderTest, BasicJsonLines)
 
   cudf_io::json_reader_options in_options =
     cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()})
-      .dtypes(std::vector<std::string>{"int", "float64"})
+      .dtypes(std::vector<data_type>{dtype<int32_t>(), dtype<double>()})
       .lines(true);
   cudf_io::table_with_metadata result = cudf_io::read_json(in_options);
 
@@ -182,7 +192,7 @@ TEST_F(JsonReaderTest, FloatingPoint)
 
   cudf_io::json_reader_options in_options =
     cudf_io::json_reader_options::builder(cudf_io::source_info{filepath})
-      .dtypes({"float32"})
+      .dtypes({dtype<float>()})
       .lines(true);
   cudf_io::table_with_metadata result = cudf_io::read_json(in_options);
 
@@ -206,7 +216,7 @@ TEST_F(JsonReaderTest, JsonLinesStrings)
 
   cudf_io::json_reader_options in_options =
     cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()})
-      .dtypes({"2:str", "0:int", "1:float64"})
+      .dtypes({{"2", dtype<cudf::string_view>()}, {"0", dtype<int32_t>()}, {"1", dtype<double>()}})
       .lines(true);
 
   cudf_io::table_with_metadata result = cudf_io::read_json(in_options);
@@ -245,9 +255,8 @@ TEST_F(JsonReaderTest, MultiColumn)
     std::ostringstream line;
     for (int i = 0; i < num_rows; ++i) {
       line << "[" << std::to_string(int8_values[i]) << "," << int16_values[i] << ","
-           << int16_values[i] << "," << int32_values[i] << "," << int32_values[i] << ","
-           << int64_values[i] << "," << int64_values[i] << "," << float32_values[i] << ","
-           << float32_values[i] << "," << float64_values[i] << "," << float64_values[i] << "]\n";
+           << int32_values[i] << "," << int64_values[i] << "," << float32_values[i] << ","
+           << float64_values[i] << "]\n";
     }
     std::ofstream outfile(filepath, std::ofstream::out);
     outfile << line.str();
@@ -255,17 +264,12 @@ TEST_F(JsonReaderTest, MultiColumn)
 
   cudf_io::json_reader_options in_options =
     cudf_io::json_reader_options::builder(cudf_io::source_info{filepath})
-      .dtypes({"int8",
-               "short",
-               "int16",
-               "int",
-               "int32",
-               "long",
-               "int64",
-               "float",
-               "float32",
-               "double",
-               "float64"})
+      .dtypes({dtype<int8_t>(),
+               dtype<int16_t>(),
+               dtype<int32_t>(),
+               dtype<int64_t>(),
+               dtype<float>(),
+               dtype<double>()})
       .lines(true);
   cudf_io::table_with_metadata result = cudf_io::read_json(in_options);
 
@@ -275,34 +279,21 @@ TEST_F(JsonReaderTest, MultiColumn)
 
   EXPECT_EQ(view.column(0).type().id(), cudf::type_id::INT8);
   EXPECT_EQ(view.column(1).type().id(), cudf::type_id::INT16);
-  EXPECT_EQ(view.column(2).type().id(), cudf::type_id::INT16);
-  EXPECT_EQ(view.column(3).type().id(), cudf::type_id::INT32);
-  EXPECT_EQ(view.column(4).type().id(), cudf::type_id::INT32);
-  EXPECT_EQ(view.column(5).type().id(), cudf::type_id::INT64);
-  EXPECT_EQ(view.column(6).type().id(), cudf::type_id::INT64);
-  EXPECT_EQ(view.column(7).type().id(), cudf::type_id::FLOAT32);
-  EXPECT_EQ(view.column(8).type().id(), cudf::type_id::FLOAT32);
-  EXPECT_EQ(view.column(9).type().id(), cudf::type_id::FLOAT64);
-  EXPECT_EQ(view.column(10).type().id(), cudf::type_id::FLOAT64);
+  EXPECT_EQ(view.column(2).type().id(), cudf::type_id::INT32);
+  EXPECT_EQ(view.column(3).type().id(), cudf::type_id::INT64);
+  EXPECT_EQ(view.column(4).type().id(), cudf::type_id::FLOAT32);
+  EXPECT_EQ(view.column(5).type().id(), cudf::type_id::FLOAT64);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(0),
                                  int8_wrapper{int8_values.begin(), int8_values.end(), validity});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(1),
                                  int16_wrapper{int16_values.begin(), int16_values.end(), validity});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(2),
-                                 int16_wrapper{int16_values.begin(), int16_values.end(), validity});
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(3),
-                                 int_wrapper{int32_values.begin(), int32_values.end(), validity});
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(4),
                                  int_wrapper{int32_values.begin(), int32_values.end(), validity});
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(5),
-                                 int64_wrapper{int64_values.begin(), int64_values.end(), validity});
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(6),
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(3),
                                  int64_wrapper{int64_values.begin(), int64_values.end(), validity});
-  check_float_column(view.column(7), float32_values, validity);
-  check_float_column(view.column(8), float32_values, validity);
-  check_float_column(view.column(9), float64_values, validity);
-  check_float_column(view.column(10), float64_values, validity);
+  check_float_column(view.column(4), float32_values, validity);
+  check_float_column(view.column(5), float64_values, validity);
 }
 
 TEST_F(JsonReaderTest, Booleans)
@@ -315,7 +306,7 @@ TEST_F(JsonReaderTest, Booleans)
 
   cudf_io::json_reader_options in_options =
     cudf_io::json_reader_options::builder(cudf_io::source_info{filepath})
-      .dtypes({"bool"})
+      .dtypes({dtype<bool>()})
       .lines(true);
   cudf_io::table_with_metadata result = cudf_io::read_json(in_options);
 
@@ -342,7 +333,7 @@ TEST_F(JsonReaderTest, Dates)
 
   cudf_io::json_reader_options in_options =
     cudf_io::json_reader_options::builder(cudf_io::source_info{filepath})
-      .dtypes({"date"})
+      .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}})
       .lines(true)
       .dayfirst(true);
   cudf_io::table_with_metadata result = cudf_io::read_json(in_options);
@@ -379,7 +370,7 @@ TEST_F(JsonReaderTest, Durations)
 
   cudf_io::json_reader_options in_options =
     cudf_io::json_reader_options::builder(cudf_io::source_info{filepath})
-      .dtypes({"timedelta64[ns]"})
+      .dtypes({data_type{type_id::DURATION_NANOSECONDS}})
       .lines(true);
   cudf_io::table_with_metadata result = cudf_io::read_json(in_options);
 
@@ -665,7 +656,7 @@ TEST_F(JsonReaderTest, ArrowFileSource)
   auto arrow_source = cudf_io::arrow_io_source{infile};
   cudf_io::json_reader_options in_options =
     cudf_io::json_reader_options::builder(cudf_io::source_info{&arrow_source})
-      .dtypes({"int8"})
+      .dtypes({dtype<int8_t>()})
       .lines(true);
   ;
   cudf_io::table_with_metadata result = cudf_io::read_json(in_options);
@@ -689,7 +680,7 @@ TEST_F(JsonReaderTest, InvalidFloatingPoint)
 
   cudf_io::json_reader_options in_options =
     cudf_io::json_reader_options::builder(cudf_io::source_info{filepath})
-      .dtypes({"float32"})
+      .dtypes({dtype<float>()})
       .lines(true);
   cudf_io::table_with_metadata result = cudf_io::read_json(in_options);
 

From ef031253376ef74df7a7e29cd79736acfd63700c Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Mon, 26 Jul 2021 12:08:38 -0700
Subject: [PATCH 12/23] add new APIs to cython defs

---
 python/cudf/cudf/_lib/cpp/io/csv.pxd  | 5 +++++
 python/cudf/cudf/_lib/cpp/io/json.pxd | 7 +++++++
 2 files changed, 12 insertions(+)

diff --git a/python/cudf/cudf/_lib/cpp/io/csv.pxd b/python/cudf/cudf/_lib/cpp/io/csv.pxd
index 2d6bdf28f7f..faedc9ec052 100644
--- a/python/cudf/cudf/_lib/cpp/io/csv.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/csv.pxd
@@ -5,6 +5,7 @@ from libcpp cimport bool
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
+from libcpp.map cimport map
 
 cimport cudf._lib.cpp.io.types as cudf_io_types
 cimport cudf._lib.cpp.table.table_view as cudf_table_view
@@ -166,6 +167,10 @@ cdef extern from "cudf/io/csv.hpp" \
 
         # Conversion settings
         csv_reader_options_builder& dtypes(vector[string] types) except+
+        csv_reader_options_builder& dtypes(vector[data_type] types) except+
+        csv_reader_options_builder& dtypes(
+            map[string, data_type] types
+        ) except+
         csv_reader_options_builder& true_values(vector[string] vals) except+
         csv_reader_options_builder& false_values(vector[string] vals) except+
         csv_reader_options_builder& na_values(vector[string] vals) except+
diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd
index 6f20195e87f..d49ea1eeddf 100644
--- a/python/cudf/cudf/_lib/cpp/io/json.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/json.pxd
@@ -5,6 +5,7 @@ from libcpp cimport bool
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
+from libcpp.map cimport map
 
 cimport cudf._lib.cpp.io.types as cudf_io_types
 cimport cudf._lib.cpp.table.table_view as cudf_table_view
@@ -47,6 +48,12 @@ cdef extern from "cudf/io/json.hpp" \
         json_reader_options_builder& dtypes(
             vector[string] types
         ) except+
+        json_reader_options_builder& dtypes(
+            vector[data_type] types
+        ) except+
+        json_reader_options_builder& dtypes(
+            map[string, data_type] types
+        ) except+
         json_reader_options_builder& compression(
             cudf_io_types.compression_type compression
         ) except+

From 8ae3e1a3a7bc25812b6b5801b6e6a0699bc00fa6 Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Mon, 26 Jul 2021 13:26:28 -0700
Subject: [PATCH 13/23] add set_dtypes overloads to cython reader options defs

---
 python/cudf/cudf/_lib/cpp/io/csv.pxd  | 2 ++
 python/cudf/cudf/_lib/cpp/io/json.pxd | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/python/cudf/cudf/_lib/cpp/io/csv.pxd b/python/cudf/cudf/_lib/cpp/io/csv.pxd
index faedc9ec052..85074cc4369 100644
--- a/python/cudf/cudf/_lib/cpp/io/csv.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/csv.pxd
@@ -102,6 +102,8 @@ cdef extern from "cudf/io/csv.hpp" \
 
         # Conversion settings
         void set_dtypes(vector[string] types) except+
+        void set_dtypes(vector[data_type] types) except+
+        void set_dtypes(map[string, data_type] types) except+
         void set_true_values(vector[string] vals) except+
         void set_false_values(vector[string] vals) except+
         void set_na_values(vector[string] vals) except+
diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd
index d49ea1eeddf..158994c81a5 100644
--- a/python/cudf/cudf/_lib/cpp/io/json.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/json.pxd
@@ -27,6 +27,8 @@ cdef extern from "cudf/io/json.hpp" \
 
         # setter
         void set_dtypes(vector[string] types) except+
+        void set_dtypes(vector[data_type] types) except+
+        void set_dtypes(map[string, data_type] types) except+
         void set_compression(
             cudf_io_types.compression_type compression
         ) except+

From 53c4d15bc93d815a831298cd094c869c4aa5b138 Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Mon, 26 Jul 2021 13:45:46 -0700
Subject: [PATCH 14/23] style fix; missed rename

---
 python/cudf/cudf/_lib/cpp/io/csv.pxd  | 6 +++---
 python/cudf/cudf/_lib/cpp/io/json.pxd | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/_lib/cpp/io/csv.pxd b/python/cudf/cudf/_lib/cpp/io/csv.pxd
index 85074cc4369..725757121d9 100644
--- a/python/cudf/cudf/_lib/cpp/io/csv.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/csv.pxd
@@ -2,10 +2,10 @@
 
 from libc.stdint cimport uint8_t
 from libcpp cimport bool
+from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
-from libcpp.map cimport map
 
 cimport cudf._lib.cpp.io.types as cudf_io_types
 cimport cudf._lib.cpp.table.table_view as cudf_table_view
@@ -164,8 +164,8 @@ cdef extern from "cudf/io/csv.hpp" \
         ) except+
         csv_reader_options_builder& quotechar(char val) except+
         csv_reader_options_builder& doublequote(bool val) except+
-        csv_reader_options_builder& infer_date_names(vector[string]) except+
-        csv_reader_options_builder& infer_date_indexes(vector[int]) except+
+        csv_reader_options_builder& parse_dates(vector[string]) except+
+        csv_reader_options_builder& parse_dates(vector[int]) except+
 
         # Conversion settings
         csv_reader_options_builder& dtypes(vector[string] types) except+
diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd
index 158994c81a5..4a3792f5023 100644
--- a/python/cudf/cudf/_lib/cpp/io/json.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/json.pxd
@@ -2,10 +2,10 @@
 
 from libc.stdint cimport uint8_t
 from libcpp cimport bool
+from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
-from libcpp.map cimport map
 
 cimport cudf._lib.cpp.io.types as cudf_io_types
 cimport cudf._lib.cpp.table.table_view as cudf_table_view

From 1b543758d0e815a89d40e0a6e3b6ddfce05ec22c Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Mon, 26 Jul 2021 14:16:45 -0700
Subject: [PATCH 15/23] docs fixes

---
 cpp/src/io/csv/reader_impl.cu  | 2 +-
 cpp/src/io/csv/reader_impl.hpp | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 656dde935df..32326b9603b 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -404,7 +404,7 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
     }
   }
 
-  // User can specify which columns should be inferred as datetime
+  // User can specify which columns should be parsed as datetime
   if (!opts_.get_parse_hex_indexes().empty() || !opts_.get_parse_hex_names().empty()) {
     for (const auto index : opts_.get_parse_hex_indexes()) {
       column_flags_[index] |= column_parse::as_hexadecimal;
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index 6bd7b66874b..e0ee367ad3c 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -182,7 +182,7 @@ class reader::impl {
                                             rmm::cuda_stream_view stream);
 
   /**
-   * @brief Sorts the columns' data types from the map of dtypes.
+   * @brief Selects the columns' data types from the map of dtypes.
    *
    * @param col_type_map Column name -> data type map specifying the columns' target data types
    * @return Sorted ist of selected columns' data types
@@ -190,9 +190,9 @@ class reader::impl {
   std::vector<data_type> select_data_types(std::map<std::string, data_type> const& col_type_map);
 
   /**
-   * @brief Sorts the columns' data types from the map of dtypes.
+   * @brief Selects the columns' data types from the list of dtypes.
    *
-   * @param col_type_map Vector of deta types specifying the columns' target data types
+   * @param col_type_map Vector of data types specifying the columns' target data types
    * @return Sorted ist of selected columns' data types
    */
   std::vector<data_type> select_data_types(std::vector<data_type> const& dtypes);

From 6638a94ab0e66dad9ebd351830556e2012f5216e Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Mon, 26 Jul 2021 14:27:22 -0700
Subject: [PATCH 16/23] deprecate APIs

---
 cpp/include/cudf/io/csv.hpp  |  6 ++++--
 cpp/include/cudf/io/json.hpp | 15 +++++++++++++--
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index 8ebb3a72a85..8b8027b2c64 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -615,7 +615,8 @@ class csv_reader_options {
    */
   [[deprecated(
     "The string-based interface will be deprecated."
-    "Use dtypes(std::vector<data_type>) instead.")]] void
+    "Use dtypes(std::vector<data_type>) or "
+    "dtypes(std::map<std::string, data_type>) instead.")]] void
   set_dtypes(std::vector<std::string> types)
   {
     _dtypes = std::move(types);
@@ -1074,7 +1075,8 @@ class csv_reader_options_builder {
    */
   [[deprecated(
     "The string-based interface will be deprecated."
-    "Use dtypes(std::vector<data_type>) instead.")]] csv_reader_options_builder&
+    "Use dtypes(std::vector<data_type>) or "
+    "dtypes(std::map<std::string, data_type>) instead.")]] csv_reader_options_builder&
   dtypes(std::vector<std::string> types)
   {
     options._dtypes = std::move(types);
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index f5d80f6f6c6..d456a5b8682 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -155,7 +155,14 @@ class json_reader_options {
    *
    * @param types Vector dtypes in string format.
    */
-  void set_dtypes(std::vector<std::string> types) { _dtypes = std::move(types); }
+  [[deprecated(
+    "The string-based interface will be deprecated."
+    "Use dtypes(std::vector<data_type>) or "
+    "dtypes(std::map<std::string, data_type>) instead.")]] void
+  set_dtypes(std::vector<std::string> types)
+  {
+    _dtypes = std::move(types);
+  }
 
   /**
    * @brief Set data types for columns to be read.
@@ -231,7 +238,11 @@ class json_reader_options_builder {
    * @param types Vector of dtypes in string format
    * @return this for chaining
    */
-  json_reader_options_builder& dtypes(std::vector<std::string> types)
+  [[deprecated(
+    "The string-based interface will be deprecated."
+    "Use dtypes(std::vector<data_type>) or "
+    "dtypes(std::map<std::string, data_type>) instead.")]] json_reader_options_builder&
+  dtypes(std::vector<std::string> types)
   {
     options._dtypes = std::move(types);
     return *this;

From c654e106af186a87750925cbce5405bfa063f5fc Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Mon, 2 Aug 2021 11:53:21 -0700
Subject: [PATCH 17/23] doc fix

---
 cpp/include/cudf/io/csv.hpp  | 8 ++++----
 cpp/include/cudf/io/json.hpp | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index 8b8027b2c64..d4a21b2e98c 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -293,12 +293,12 @@ class csv_reader_options {
   std::vector<int> const& get_parse_dates_indexes() const { return _parse_dates_indexes; }
 
   /**
-   * @brief Returns names of columns to read as datetime.
+   * @brief Returns names of columns to read as hexadecimal.
    */
   std::vector<std::string> const& get_parse_hex_names() const { return _parse_hex_names; }
 
   /**
-   * @brief Returns indexes of columns to read as datetime.
+   * @brief Returns indexes of columns to read as hexadecimal.
    */
   std::vector<int> const& get_parse_hex_indexes() const { return _parse_hex_indexes; }
 
@@ -1010,7 +1010,7 @@ class csv_reader_options_builder {
   /**
    * @brief Sets indexes of columns to read as datetime.
    *
-   * @param col_names Vector of column indices to read as datetime.
+   * @param col_ind Vector of column indices to read as datetime
    * @return this for chaining.
    */
   csv_reader_options_builder& parse_dates(std::vector<int> col_ind)
@@ -1034,7 +1034,7 @@ class csv_reader_options_builder {
   /**
    * @brief Sets indexes of columns to parse as hexadecimal.
    *
-   * @param col_names Vector of column indices to parse as hexadecimal
+   * @param col_ind Vector of column indices to parse as hexadecimal
    * @return this for chaining.
    */
   csv_reader_options_builder& parse_hex(std::vector<int> col_ind)
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index d456a5b8682..7286d641f60 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -167,7 +167,7 @@ class json_reader_options {
   /**
    * @brief Set data types for columns to be read.
    *
-   * @param types Vector dtypes in string format.
+   * @param types Vector of dtypes
    */
 
   void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }

From 044b6985a3ae61c627688af0ccd199daded62def Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Mon, 2 Aug 2021 11:55:21 -0700
Subject: [PATCH 18/23] Apply suggestions from code review

Co-authored-by: Ram (Ramakrishna Prabhu) <42624703+rgsl888prabhu@users.noreply.github.com>
---
 cpp/src/io/csv/reader_impl.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index 6bd7b66874b..a89017a74bf 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -185,7 +185,7 @@ class reader::impl {
    * @brief Sorts the columns' data types from the map of dtypes.
    *
    * @param col_type_map Column name -> data type map specifying the columns' target data types
-   * @return Sorted ist of selected columns' data types
+   * @return Sorted list of selected columns' data types
    */
   std::vector<data_type> select_data_types(std::map<std::string, data_type> const& col_type_map);
 
@@ -193,7 +193,7 @@ class reader::impl {
    * @brief Sorts the columns' data types from the map of dtypes.
    *
    * @param col_type_map Vector of deta types specifying the columns' target data types
-   * @return Sorted ist of selected columns' data types
+   * @return Sorted list of selected columns' data types
    */
   std::vector<data_type> select_data_types(std::vector<data_type> const& dtypes);
 

From af697987b35ce11f6247a13f41b73a4af9c12287 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Tue, 3 Aug 2021 11:53:48 -0700
Subject: [PATCH 19/23] Apply suggestions from code review

Co-authored-by: Elias Stehle <elias.stehle@gmail.com>
---
 cpp/include/cudf/io/json.hpp   | 3 +--
 cpp/src/io/csv/reader_impl.cu  | 2 +-
 cpp/src/io/csv/reader_impl.hpp | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 7286d641f60..a0ea6b6ed17 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -153,7 +153,7 @@ class json_reader_options {
   /**
    * @brief Set data types for columns to be read.
    *
-   * @param types Vector dtypes in string format.
+   * @param types Vector of dtypes in string format.
    */
   [[deprecated(
     "The string-based interface will be deprecated."
@@ -169,7 +169,6 @@ class json_reader_options {
    *
    * @param types Vector of dtypes
    */
-
   void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
   /**
    * @brief Set data types for columns to be read.
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 32326b9603b..611ad1c81d3 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -404,7 +404,7 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
     }
   }
 
-  // User can specify which columns should be parsed as datetime
+  // User can specify which columns should be parsed as hexadecimal
   if (!opts_.get_parse_hex_indexes().empty() || !opts_.get_parse_hex_names().empty()) {
     for (const auto index : opts_.get_parse_hex_indexes()) {
       column_flags_[index] |= column_parse::as_hexadecimal;
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index 0d2367b1646..36c2bf4f9e7 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -192,7 +192,7 @@ class reader::impl {
   /**
    * @brief Selects the columns' data types from the list of dtypes.
    *
-   * @param col_type_map Vector of data types specifying the columns' target data types
+   * @param dtypes Vector of data types specifying the columns' target data types
    * @return Sorted list of selected columns' data types
    */
   std::vector<data_type> select_data_types(std::vector<data_type> const& dtypes);

From 8fa26947ec0c8a75ceb2b71186eb0038e871d85a Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Tue, 3 Aug 2021 11:54:51 -0700
Subject: [PATCH 20/23] add missing empty line

---
 cpp/include/cudf/io/json.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index a0ea6b6ed17..8954f7dcab1 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -170,6 +170,7 @@ class json_reader_options {
    * @param types Vector of dtypes
    */
   void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
+
   /**
    * @brief Set data types for columns to be read.
    *

From f3d94e91375c9c34da6dbca9081f59c11e08e754 Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Tue, 3 Aug 2021 16:19:02 -0700
Subject: [PATCH 21/23] move visitor_overload to utilities

---
 .../detail/utilities/visitor_overload.hpp     | 30 +++++++++++++++++++
 cpp/src/io/csv/reader_impl.cu                 | 15 ++--------
 cpp/src/io/json/reader_impl.cu                | 13 ++------
 3 files changed, 34 insertions(+), 24 deletions(-)
 create mode 100644 cpp/include/cudf/detail/utilities/visitor_overload.hpp

diff --git a/cpp/include/cudf/detail/utilities/visitor_overload.hpp b/cpp/include/cudf/detail/utilities/visitor_overload.hpp
new file mode 100644
index 00000000000..c77947c2015
--- /dev/null
+++ b/cpp/include/cudf/detail/utilities/visitor_overload.hpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+namespace cudf::detail {
+
+/**
+ * @brief Merges callables into one overload set, e.g. to `std::visit` a variant's alternatives
+ */
+template <class... Ts>
+struct visitor_overload : Ts... {
+  using Ts::operator()...;
+};
+template <class... Ts>
+visitor_overload(Ts...) -> visitor_overload<Ts...>;
+
+}
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 611ad1c81d3..549b0474fe1 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -27,6 +27,7 @@
 
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/detail/utilities/visitor_overload.hpp>
 #include <cudf/io/types.hpp>
 #include <cudf/strings/replace.hpp>
 #include <cudf/table/table.hpp>
@@ -49,18 +50,6 @@ using cudf::device_span;
 using cudf::host_span;
 using cudf::detail::make_device_uvector_async;
 
-namespace {
-/**
- * @brief Helper class to support inline-overloading for all of a variant's alternative types
- */
-template <class... Ts>
-struct VisitorOverload : Ts... {
-  using Ts::operator()...;
-};
-template <class... Ts>
-VisitorOverload(Ts...) -> VisitorOverload<Ts...>;
-}  // namespace
-
 namespace cudf {
 namespace io {
 namespace detail {
@@ -432,7 +421,7 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
     column_types = infer_column_types(data, row_offsets, stream);
   } else {
     column_types = std::visit(
-      VisitorOverload{
+      cudf::detail::visitor_overload{
         [&](const std::vector<data_type>& data_types) { return select_data_types(data_types); },
         [&](const std::map<std::string, data_type>& data_types) {
           return select_data_types(data_types);
diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
index 53318cf43ac..a8f117c22bf 100644
--- a/cpp/src/io/json/reader_impl.cu
+++ b/cpp/src/io/json/reader_impl.cu
@@ -27,6 +27,7 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/detail/utilities/visitor_overload.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/strings/detail/replace.hpp>
@@ -50,16 +51,6 @@ namespace json {
 using namespace cudf::io;
 
 namespace {
-/**
- * @brief Helper class to support inline-overloading for all of a variant's alternative types
- */
-template <class... Ts>
-struct VisitorOverload : Ts... {
-  using Ts::operator()...;
-};
-template <class... Ts>
-VisitorOverload(Ts...) -> VisitorOverload<Ts...>;
-
 /**
  * @brief Estimates the maximum expected length or a row, based on the number
  * of columns
@@ -524,7 +515,7 @@ void reader::impl::set_data_types(device_span<uint64_t const> rec_starts,
     std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes());
   if (!has_to_infer_column_types) {
     dtypes_ = std::visit(
-      VisitorOverload{
+      cudf::detail::visitor_overload{
         [&](const std::vector<data_type>& dtypes) { return dtypes; },
         [&](const std::map<std::string, data_type>& dtypes) {
           std::vector<data_type> sorted_dtypes;

From 829928ab4531ff9842f032693a131dd975146628 Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Tue, 3 Aug 2021 16:56:50 -0700
Subject: [PATCH 22/23] add visitor_overload.hpp to conda recipe tests; add namespace comment

---
 conda/recipes/libcudf/meta.yaml                        | 1 +
 cpp/include/cudf/detail/utilities/visitor_overload.hpp | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 75bfe6c34bc..35d444d026c 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -102,6 +102,7 @@ test:
     - test -f $PREFIX/include/cudf/detail/utilities/integer_utils.hpp
     - test -f $PREFIX/include/cudf/detail/utilities/int_fastdiv.h
     - test -f $PREFIX/include/cudf/detail/utilities/vector_factories.hpp
+    - test -f $PREFIX/include/cudf/detail/utilities/visitor_overload.hpp
     - test -f $PREFIX/include/cudf/dictionary/detail/concatenate.hpp
     - test -f $PREFIX/include/cudf/dictionary/detail/encode.hpp
     - test -f $PREFIX/include/cudf/dictionary/detail/merge.hpp
diff --git a/cpp/include/cudf/detail/utilities/visitor_overload.hpp b/cpp/include/cudf/detail/utilities/visitor_overload.hpp
index c77947c2015..fb9998df060 100644
--- a/cpp/include/cudf/detail/utilities/visitor_overload.hpp
+++ b/cpp/include/cudf/detail/utilities/visitor_overload.hpp
@@ -27,4 +27,4 @@ struct visitor_overload : Ts... {
 template <class... Ts>
 visitor_overload(Ts...) -> visitor_overload<Ts...>;
 
-}
+}  // namespace cudf::detail

From 3ebe478695f6d4322aa27809eaa5adb8a61c60a2 Mon Sep 17 00:00:00 2001
From: vuule <vmilovanovic@nvidia.com>
Date: Wed, 4 Aug 2021 12:03:01 -0700
Subject: [PATCH 23/23] fix copyright year

---
 cpp/include/cudf/detail/utilities/visitor_overload.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/detail/utilities/visitor_overload.hpp b/cpp/include/cudf/detail/utilities/visitor_overload.hpp
index fb9998df060..a55ca323c50 100644
--- a/cpp/include/cudf/detail/utilities/visitor_overload.hpp
+++ b/cpp/include/cudf/detail/utilities/visitor_overload.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.