From 9c72e56837ddfb3fb9b3d1111cdd08e1f53595c4 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 14 Aug 2021 05:08:16 -0500
Subject: [PATCH 01/26] simplify io/functions.cpp data source/sink factories

---
 cpp/src/io/functions.cpp | 126 +++++++++++++++++++++------------------
 1 file changed, 67 insertions(+), 59 deletions(-)
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index bf51012211c..e080ea3a2ca 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -106,67 +106,56 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder(
 }
 
 namespace {
-template <typename reader, typename reader_options>
-std::unique_ptr<reader> make_reader(source_info const& src_info,
-                                    reader_options const& options,
-                                    rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
-{
-  if (src_info.type == io_type::FILEPATH) {
-    return std::make_unique<reader>(src_info.filepaths, options, stream, mr);
-  }
 
-  std::vector<std::unique_ptr<datasource>> datasources;
-  if (src_info.type == io_type::HOST_BUFFER) {
-    datasources = cudf::io::datasource::create(src_info.buffers);
-  } else if (src_info.type == io_type::USER_IMPLEMENTED) {
-    datasources = cudf::io::datasource::create(src_info.user_sources);
-  } else {
-    CUDF_FAIL("Unsupported source type");
+std::vector<std::unique_ptr<cudf::io::datasource>> make_datasources(source_info const& info)
+{
+  switch (info.type) {
+    case io_type::FILEPATH: return cudf::io::datasource::create(info.filepaths);
+    case io_type::HOST_BUFFER: return cudf::io::datasource::create(info.buffers);
+    case io_type::USER_IMPLEMENTED: return cudf::io::datasource::create(info.user_sources);
+    default: CUDF_FAIL("Unsupported source type");
   }
-
-  return std::make_unique<reader>(std::move(datasources), options, stream, mr);
 }
 
-template <typename writer, typename... Ts>
-std::unique_ptr<writer> make_writer(sink_info const& sink, Ts&&... args)
+std::unique_ptr<data_sink> make_datasink(sink_info const& info)
 {
-  if (sink.type == io_type::FILEPATH) {
-    return std::make_unique<writer>(cudf::io::data_sink::create(sink.filepath),
-                                    std::forward<Ts>(args)...);
-  }
-  if (sink.type == io_type::HOST_BUFFER) {
-    return std::make_unique<writer>(cudf::io::data_sink::create(sink.buffer),
-                                    std::forward<Ts>(args)...);
+  switch (info.type) {
+    case io_type::FILEPATH: return cudf::io::data_sink::create(info.filepath);
+    case io_type::HOST_BUFFER: return cudf::io::data_sink::create(info.buffer);
+    case io_type::VOID: return cudf::io::data_sink::create();
+    case io_type::USER_IMPLEMENTED: return cudf::io::data_sink::create(info.user_sink);
+    default: CUDF_FAIL("Unsupported sink type");
   }
-  if (sink.type == io_type::VOID) {
-    return std::make_unique<writer>(cudf::io::data_sink::create(), std::forward<Ts>(args)...);
-  }
-  if (sink.type == io_type::USER_IMPLEMENTED) {
-    return std::make_unique<writer>(cudf::io::data_sink::create(sink.user_sink),
-                                    std::forward<Ts>(args)...);
-  }
-  CUDF_FAIL("Unsupported sink type");
 }
 
 }  // namespace
 
-table_with_metadata read_avro(avro_reader_options const& opts, rmm::mr::device_memory_resource* mr)
+table_with_metadata read_avro(avro_reader_options const& options,
+                              rmm::mr::device_memory_resource* mr)
 {
   namespace avro = cudf::io::detail::avro;
 
   CUDF_FUNC_RANGE();
-  auto reader = make_reader<avro::reader>(opts.get_source(), opts, rmm::cuda_stream_default, mr);
-  return reader->read(opts);
+
+  auto datasources = make_datasources(options.get_source());
+  auto reader =
+    std::make_unique<avro::reader>(std::move(datasources), options, rmm::cuda_stream_default, mr);
+
+  return reader->read(options);
 }
 
-table_with_metadata read_json(json_reader_options const& opts, rmm::mr::device_memory_resource* mr)
+table_with_metadata read_json(json_reader_options const& options,
+                              rmm::mr::device_memory_resource* mr)
 {
   namespace json = cudf::io::detail::json;
 
   CUDF_FUNC_RANGE();
-  auto reader = make_reader<json::reader>(opts.get_source(), opts, rmm::cuda_stream_default, mr);
-  return reader->read(opts);
+
+  auto datasources = make_datasources(options.get_source());
+  auto reader =
+    std::make_unique<json::reader>(std::move(datasources), options, rmm::cuda_stream_default, mr);
+
+  return reader->read(options);
 }
 
 table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_memory_resource* mr)
@@ -174,8 +163,10 @@ table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_
   namespace csv = cudf::io::detail::csv;
 
   CUDF_FUNC_RANGE();
+
+  auto datasources = make_datasources(options.get_source());
   auto reader =
-    make_reader<csv::reader>(options.get_source(), options, rmm::cuda_stream_default, mr);
+    std::make_unique<csv::reader>(std::move(datasources), options, rmm::cuda_stream_default, mr);
 
   return reader->read();
 }
@@ -185,7 +176,9 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc
 {
   using namespace cudf::io::detail;
 
-  auto writer = make_writer<csv::writer>(options.get_sink(), options, rmm::cuda_stream_default, mr);
+  auto sink = make_datasink(options.get_sink());
+  auto writer =
+    std::make_unique<csv::writer>(std::move(sink), options, rmm::cuda_stream_default, mr);
 
   writer->write(options.get_table(), options.get_metadata());
 }
@@ -294,8 +287,10 @@ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info)
 table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  auto reader =
-    make_reader<detail_orc::reader>(options.get_source(), options, rmm::cuda_stream_default, mr);
+
+  auto datasources = make_datasources(options.get_source());
+  auto reader      = std::make_unique<detail_orc::reader>(
+    std::move(datasources), options, rmm::cuda_stream_default, mr);
 
   return reader->read(options);
 }
@@ -305,11 +300,13 @@ table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_
  */
 void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resource* mr)
 {
+  namespace io_detail = cudf::io::detail;
+
   CUDF_FUNC_RANGE();
 
-  namespace io_detail = cudf::io::detail;
-  auto writer         = make_writer<detail_orc::writer>(
-    options.get_sink(), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr);
+  auto sink   = make_datasink(options.get_sink());
+  auto writer = std::make_unique<detail_orc::writer>(
+    std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr);
 
   writer->write(options.get_table());
 }
@@ -317,12 +314,15 @@ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resourc
 /**
  * @copydoc cudf::io::orc_chunked_writer::orc_chunked_writer
  */
-orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& op,
+orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options,
                                        rmm::mr::device_memory_resource* mr)
 {
   namespace io_detail = cudf::io::detail;
-  writer              = make_writer<detail_orc::writer>(
-    op.get_sink(), op, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr);
+
+  auto sink = make_datasink(options.get_sink());
+
+  writer = std::make_unique<detail_orc::writer>(
+    std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr);
 }
 
 /**
@@ -354,8 +354,10 @@ table_with_metadata read_parquet(parquet_reader_options const& options,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  auto reader = make_reader<detail_parquet::reader>(
-    options.get_source(), options, rmm::cuda_stream_default, mr);
+
+  auto datasources = make_datasources(options.get_source());
+  auto reader      = std::make_unique<detail_parquet::reader>(
+    std::move(datasources), options, rmm::cuda_stream_default, mr);
 
   return reader->read(options);
 }
@@ -392,25 +394,31 @@ table_input_metadata::table_input_metadata(table_view const& table,
 std::unique_ptr<std::vector<uint8_t>> write_parquet(parquet_writer_options const& options,
                                                     rmm::mr::device_memory_resource* mr)
 {
-  CUDF_FUNC_RANGE();
   namespace io_detail = cudf::io::detail;
 
-  auto writer = make_writer<detail_parquet::writer>(
-    options.get_sink(), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr);
+  CUDF_FUNC_RANGE();
+
+  auto sink   = make_datasink(options.get_sink());
+  auto writer = std::make_unique<detail_parquet::writer>(
+    std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr);
 
   writer->write(options.get_table());
+
   return writer->close(options.get_column_chunks_file_path());
 }
 
 /**
  * @copydoc cudf::io::parquet_chunked_writer::parquet_chunked_writer
  */
-parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& op,
+parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& options,
                                                rmm::mr::device_memory_resource* mr)
 {
   namespace io_detail = cudf::io::detail;
-  writer              = make_writer<detail_parquet::writer>(
-    op.get_sink(), op, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr);
+
+  auto sink = make_datasink(options.get_sink());
+
+  writer = std::make_unique<detail_parquet::writer>(
+    std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr);
 }
 
 /**

From 9e92ca2bf346bf1d6ce5ba351fd4b7a2991edfea Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 14 Aug 2021 06:58:00 -0500
Subject: [PATCH 02/26] begin replacing csv_reader with pure functions

---
 cpp/include/cudf/io/detail/csv.hpp |  60 +++--------
 cpp/src/io/csv/reader_impl.cu      | 159 +++++++++++++----------------
 cpp/src/io/csv/reader_impl.hpp     |  44 ++++----
 cpp/src/io/functions.cpp           |  12 ++-
 4 files changed, 114 insertions(+), 161 deletions(-)

diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp
index 89e589d306a..aac44bed50e 100644
--- a/cpp/include/cudf/io/detail/csv.hpp
+++ b/cpp/include/cudf/io/detail/csv.hpp
@@ -24,55 +24,21 @@ namespace cudf {
 namespace io {
 namespace detail {
 namespace csv {
+
 /**
- * @brief Class to read CSV dataset data into columns.
+ * @brief Reads the entire dataset.
+ *
+ * @param source Input `datasource` object to read the dataset from
+ * @param options Settings for controlling reading behavior
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource to use for device memory allocation
+ *
+ * @return The set of columns along with table metadata
  */
-class reader {
- private:
-  class impl;
-  std::unique_ptr<impl> _impl;
-
- public:
-  /**
-   * @brief Constructor from an array of file paths
-   *
-   * @param filepaths Paths to the files containing the input dataset
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  explicit reader(std::vector<std::string> const& filepaths,
-                  csv_reader_options const& options,
-                  rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr);
-
-  /**
-   * @brief Constructor from an array of datasources
-   *
-   * @param sources Input `datasource` objects to read the dataset from
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  explicit reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
-                  csv_reader_options const& options,
-                  rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr);
-
-  /**
-   * @brief Destructor explicitly-declared to avoid inlined in header
-   */
-  ~reader();
-
-  /**
-   * @brief Reads the entire dataset.
-   *
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   *
-   * @return The set of columns along with table metadata
-   */
-  table_with_metadata read(rmm::cuda_stream_view stream = rmm::cuda_stream_default);
-};
+table_with_metadata read_csv(std::unique_ptr<cudf::io::datasource>&& source,
+                             csv_reader_options const& options,
+                             rmm::cuda_stream_view stream,
+                             rmm::mr::device_memory_resource* mr);
 
 class writer {
  public:
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 549b0474fe1..81c7fc65d3f 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -196,8 +196,10 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream)
   container.resize(1, stream);
 }
 
-std::pair<rmm::device_uvector<char>, reader::impl::selected_rows_offsets>
-reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
+std::pair<rmm::device_uvector<char>, reader_impl::selected_rows_offsets>
+reader_impl::select_data_and_row_offsets(cudf::io::datasource* source,
+                                         csv_reader_options const& opts_,
+                                         rmm::cuda_stream_view stream)
 {
   auto range_offset  = opts_.get_byte_range_offset();
   auto range_size    = opts_.get_byte_range_size();
@@ -205,8 +207,11 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
   auto skip_end_rows = opts_.get_skipfooter();
   auto num_rows      = opts_.get_nrows();
 
+  // No file path is passed, so "infer" here only maps the option to its compression-type name.
+  auto compression_type = infer_compression_type(opts_.get_compression(), "", {});
+
   if (range_offset > 0 || range_size > 0) {
-    CUDF_EXPECTS(compression_type_ == "none",
+    CUDF_EXPECTS(compression_type == "none",
                  "Reading compressed data using `byte range` is unsupported");
   }
   size_t map_range_size = 0;
@@ -217,17 +222,10 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
     map_range_size         = range_size + calculateMaxRowSize(num_columns);
   }
 
-  // Support delayed opening of the file if using memory mapping datasource
-  // This allows only mapping of a subset of the file if using byte range
-  if (source_ == nullptr) {
-    assert(!filepath_.empty());
-    source_ = datasource::create(filepath_, range_offset, map_range_size);
-  }
-
   // Transfer source data to GPU
-  if (!source_->is_empty()) {
-    auto data_size = (map_range_size != 0) ? map_range_size : source_->size();
-    auto buffer    = source_->host_read(range_offset, data_size);
+  if (!source->is_empty()) {
+    auto data_size = (map_range_size != 0) ? map_range_size : source->size();
+    auto buffer    = source->host_read(range_offset, data_size);
 
     auto h_data = host_span<char const>(  //
       reinterpret_cast<const char*>(buffer->data()),
@@ -235,8 +233,8 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
 
     std::vector<char> h_uncomp_data_owner;
 
-    if (compression_type_ != "none") {
-      h_uncomp_data_owner = get_uncompressed_data(h_data, compression_type_);
+    if (compression_type != "none") {
+      h_uncomp_data_owner = get_uncompressed_data(h_data, compression_type);
       h_data              = h_uncomp_data_owner;
     }
     // None of the parameters for row selection is used, we are parsing the entire file
@@ -252,7 +250,8 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
 
     // Gather row offsets
     auto data_row_offsets =
-      load_data_and_gather_row_offsets(h_data,
+      load_data_and_gather_row_offsets(opts_,
+                                       h_data,
                                        data_start_offset,
                                        (range_size) ? range_size : h_data.size(),
                                        (skip_rows > 0) ? skip_rows : 0,
@@ -269,7 +268,7 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
   return {rmm::device_uvector<char>{0, stream}, selected_rows_offsets{stream}};
 }
 
-std::vector<data_type> reader::impl::select_data_types(
+std::vector<data_type> reader_impl::select_data_types(
   std::map<std::string, data_type> const& col_type_map)
 {
   std::vector<data_type> selected_dtypes;
@@ -285,7 +284,7 @@ std::vector<data_type> reader::impl::select_data_types(
   return selected_dtypes;
 }
 
-std::vector<data_type> reader::impl::select_data_types(std::vector<data_type> const& dtypes)
+std::vector<data_type> reader_impl::select_data_types(std::vector<data_type> const& dtypes)
 {
   std::vector<data_type> selected_dtypes;
 
@@ -304,9 +303,12 @@ std::vector<data_type> reader::impl::select_data_types(std::vector<data_type> co
   return selected_dtypes;
 }
 
-table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
+table_with_metadata reader_impl::read(cudf::io::datasource* source,
+                                      csv_reader_options const& opts_,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::mr::device_memory_resource* mr)
 {
-  auto const data_row_offsets = select_data_and_row_offsets(stream);
+  auto const data_row_offsets = select_data_and_row_offsets(source, opts_, stream);
   auto const& data            = data_row_offsets.first;
   auto const& row_offsets     = data_row_offsets.second;
 
@@ -418,7 +420,7 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
 
   std::vector<data_type> column_types;
   if (has_to_infer_column_types) {
-    column_types = infer_column_types(data, row_offsets, stream);
+    column_types = infer_column_types(data, row_offsets, opts_.get_timestamp_type(), stream);
   } else {
     column_types = std::visit(
       cudf::detail::visitor_overload{
@@ -426,14 +428,16 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
         [&](const std::map<std::string, data_type>& data_types) {
           return select_data_types(data_types);
         },
-        [&](const std::vector<string>& dtypes) { return parse_column_types(dtypes); }},
+        [&](const std::vector<string>& dtypes) {
+          return parse_column_types(dtypes, opts_.get_timestamp_type());
+        }},
       opts_.get_dtypes());
   }
 
   out_columns.reserve(column_types.size());
 
   if (num_records_ != 0) {
-    auto out_buffers = decode_data(data, row_offsets, column_types, stream);
+    auto out_buffers = decode_data(data, row_offsets, column_types, stream, mr);
     for (size_t i = 0; i < column_types.size(); ++i) {
       metadata.column_names.emplace_back(out_buffers[i].name);
       if (column_types[i].id() == type_id::STRING && opts.quotechar != '\0' &&
@@ -446,9 +450,9 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
         const std::string dblquotechar(2, opts.quotechar);
         std::unique_ptr<column> col = cudf::make_strings_column(*out_buffers[i]._strings, stream);
         out_columns.emplace_back(
-          cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr_));
+          cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr));
       } else {
-        out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, mr_));
+        out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, mr));
       }
     }
   } else {
@@ -466,7 +470,7 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
   return {std::make_unique<table>(std::move(out_columns)), std::move(metadata)};
 }
 
-size_t reader::impl::find_first_row_start(host_span<char const> data)
+size_t reader_impl::find_first_row_start(host_span<char const> data)
 {
   // For now, look for the first terminator (assume the first terminator isn't within a quote)
   // TODO: Attempt to infer this from the data
@@ -477,14 +481,15 @@ size_t reader::impl::find_first_row_start(host_span<char const> data)
   return std::min(pos + 1, data.size());
 }
 
-std::pair<rmm::device_uvector<char>, reader::impl::selected_rows_offsets>
-reader::impl::load_data_and_gather_row_offsets(host_span<char const> data,
-                                               size_t range_begin,
-                                               size_t range_end,
-                                               size_t skip_rows,
-                                               int64_t num_rows,
-                                               bool load_whole_file,
-                                               rmm::cuda_stream_view stream)
+std::pair<rmm::device_uvector<char>, reader_impl::selected_rows_offsets>
+reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& opts_,
+                                              host_span<char const> data,
+                                              size_t range_begin,
+                                              size_t range_end,
+                                              size_t skip_rows,
+                                              int64_t num_rows,
+                                              bool load_whole_file,
+                                              rmm::cuda_stream_view stream)
 {
   constexpr size_t max_chunk_bytes = 64 * 1024 * 1024;  // 64MB
   size_t buffer_size               = std::min(max_chunk_bytes, data.size());
@@ -642,9 +647,10 @@ reader::impl::load_data_and_gather_row_offsets(host_span<char const> data,
   return {std::move(d_data), std::move(row_offsets)};
 }
 
-std::vector<data_type> reader::impl::infer_column_types(device_span<char const> data,
-                                                        device_span<uint64_t const> row_offsets,
-                                                        rmm::cuda_stream_view stream)
+std::vector<data_type> reader_impl::infer_column_types(device_span<char const> data,
+                                                       device_span<uint64_t const> row_offsets,
+                                                       data_type timestamp_type,
+                                                       rmm::cuda_stream_view stream)
 {
   std::vector<data_type> dtypes;
   if (num_records_ == 0) {
@@ -693,9 +699,9 @@ std::vector<data_type> reader::impl::infer_column_types(device_span<char const>
     }
   }
 
-  if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) {
+  if (timestamp_type.id() != cudf::type_id::EMPTY) {
     for (auto& type : dtypes) {
-      if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); }
+      if (cudf::is_timestamp(type)) { type = timestamp_type; }
     }
   }
 
@@ -707,8 +713,8 @@ std::vector<data_type> reader::impl::infer_column_types(device_span<char const>
   return dtypes;
 }
 
-std::vector<data_type> reader::impl::parse_column_types(
-  const std::vector<std::string>& types_as_strings)
+std::vector<data_type> reader_impl::parse_column_types(
+  const std::vector<std::string>& types_as_strings, data_type timestamp_type)
 {
   std::vector<data_type> dtypes;
 
@@ -768,9 +774,9 @@ std::vector<data_type> reader::impl::parse_column_types(
     }
   }
 
-  if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) {
+  if (timestamp_type.id() != cudf::type_id::EMPTY) {
     for (auto& type : dtypes) {
-      if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); }
+      if (cudf::is_timestamp(type)) { type = timestamp_type; }
     }
   }
 
@@ -782,10 +788,11 @@ std::vector<data_type> reader::impl::parse_column_types(
   return dtypes;
 }
 
-std::vector<column_buffer> reader::impl::decode_data(device_span<char const> data,
-                                                     device_span<uint64_t const> row_offsets,
-                                                     host_span<data_type const> column_types,
-                                                     rmm::cuda_stream_view stream)
+std::vector<column_buffer> reader_impl::decode_data(device_span<char const> data,
+                                                    device_span<uint64_t const> row_offsets,
+                                                    host_span<data_type const> column_types,
+                                                    rmm::cuda_stream_view stream,
+                                                    rmm::mr::device_memory_resource* mr)
 {
   // Alloc output; columns' data memory is still expected for empty dataframe
   std::vector<column_buffer> out_buffers;
@@ -799,7 +806,7 @@ std::vector<column_buffer> reader::impl::decode_data(device_span<char const> dat
                       num_records_,
                       true,
                       stream,
-                      is_final_allocation ? mr_ : rmm::mr::get_current_device_resource());
+                      is_final_allocation ? mr : rmm::mr::get_current_device_resource());
 
       out_buffer.name         = col_names_[col];
       out_buffer.null_count() = UNKNOWN_NULL_COUNT;
@@ -926,51 +933,31 @@ parse_options make_parse_options(csv_reader_options const& reader_opts,
   return parse_opts;
 }
 
-reader::impl::impl(std::unique_ptr<datasource> source,
-                   std::string filepath,
-                   csv_reader_options const& options,
-                   rmm::cuda_stream_view stream,
-                   rmm::mr::device_memory_resource* mr)
-  : mr_(mr), source_(std::move(source)), filepath_(filepath), opts_(options)
+reader_impl::reader_impl(parse_options&& parse_options, int32_t num_actual_columns)
+  : opts(std::move(parse_options)),
+    num_actual_cols_(num_actual_columns),
+    num_active_cols_(num_actual_columns)
 {
-  num_actual_cols_ = opts_.get_names().size();
-  num_active_cols_ = num_actual_cols_;
-
-  compression_type_ =
-    infer_compression_type(opts_.get_compression(),
-                           filepath,
-                           {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}});
-
-  opts = make_parse_options(options, stream);
 }
 
-// Forward to implementation
-reader::reader(std::vector<std::string> const& filepaths,
-               csv_reader_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
+table_with_metadata read_csv(std::unique_ptr<cudf::io::datasource>&& source,
+                             csv_reader_options const& options,
+                             rmm::cuda_stream_view stream,
+                             rmm::mr::device_memory_resource* mr)
 {
-  CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported.");
-  // Delay actual instantiation of data source until read to allow for
-  // partial memory mapping of file using byte ranges
-  _impl = std::make_unique<impl>(nullptr, filepaths[0], options, stream, mr);
-}
+  // get the string name of the compression type.
+  // auto compression_type = infer_compression_type(options.get_compression(), "");
 
-// Forward to implementation
-reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
-               csv_reader_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported.");
-  _impl = std::make_unique<impl>(std::move(sources[0]), "", options, stream, mr);
-}
+  auto parse_options = make_parse_options(options, stream);
 
-// Destructor within this translation unit
-reader::~reader() = default;
+  auto num_actual_columns = options.get_names().size();
 
-// Forward to implementation
-table_with_metadata reader::read(rmm::cuda_stream_view stream) { return _impl->read(stream); }
+  auto reader = std::make_unique<reader_impl>(  //
+    std::move(parse_options),
+    num_actual_columns);
+
+  return reader->read(source.get(), options, stream, mr);
+}
 
 }  // namespace csv
 }  // namespace detail
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index 36c2bf4f9e7..f25beb5e561 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -66,31 +66,29 @@ using namespace cudf::io;
  *
  * Stage 4: Convert every row from csv text form to cudf binary form.
  */
-class reader::impl {
+class reader_impl {
  public:
   /**
    * @brief Constructor from a dataset source with reader options.
    *
-   * @param source Dataset source
-   * @param filepath Filepath if reading dataset from a file
-   * @param options Settings for controlling reading behavior
    * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
    */
-  explicit impl(std::unique_ptr<datasource> source,
-                std::string filepath,
-                csv_reader_options const& options,
-                rmm::cuda_stream_view stream,
-                rmm::mr::device_memory_resource* mr);
+  explicit reader_impl(parse_options&& parse_options, int32_t num_actual_columns);
 
   /**
    * @brief Read an entire set or a subset of data and returns a set of columns.
    *
+   * @param source Dataset source
+   * @param opts_ Settings for controlling reading behavior
    * @param stream CUDA stream used for device memory operations and kernel launches.
+   * @param mr Device memory resource to use for device memory allocation
    *
    * @return The set of columns along with metadata
    */
-  table_with_metadata read(rmm::cuda_stream_view stream);
+  table_with_metadata read(cudf::io::datasource* source,
+                           csv_reader_options const& opts_,
+                           rmm::cuda_stream_view stream,
+                           rmm::mr::device_memory_resource* mr);
 
  private:
   /**
@@ -133,8 +131,10 @@ class reader::impl {
    *
    * @param stream CUDA stream used for device memory operations and kernel launches.
    */
-  std::pair<rmm::device_uvector<char>, reader::impl::selected_rows_offsets>
-  select_data_and_row_offsets(rmm::cuda_stream_view stream);
+  std::pair<rmm::device_uvector<char>, reader_impl::selected_rows_offsets>
+  select_data_and_row_offsets(cudf::io::datasource* source,
+                              csv_reader_options const& opts_,
+                              rmm::cuda_stream_view stream);
 
   /**
    * @brief Finds row positions in the specified input data, and loads the selected data onto GPU.
@@ -151,8 +151,9 @@ class reader::impl {
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @return Input data and row offsets in the device memory
    */
-  std::pair<rmm::device_uvector<char>, reader::impl::selected_rows_offsets>
-  load_data_and_gather_row_offsets(host_span<char const> data,
+  std::pair<rmm::device_uvector<char>, reader_impl::selected_rows_offsets>
+  load_data_and_gather_row_offsets(csv_reader_options const& opts_,
+                                   host_span<char const> data,
                                    size_t range_begin,
                                    size_t range_end,
                                    size_t skip_rows,
@@ -179,6 +180,7 @@ class reader::impl {
    */
   std::vector<data_type> infer_column_types(device_span<char const> data,
                                             device_span<uint64_t const> row_offsets,
+                                            data_type timestamp_type,
                                             rmm::cuda_stream_view stream);
 
   /**
@@ -204,7 +206,8 @@ class reader::impl {
    * types
    * @return List of columns' data types
    */
-  std::vector<data_type> parse_column_types(std::vector<std::string> const& types_as_strings);
+  std::vector<data_type> parse_column_types(std::vector<std::string> const& types_as_strings,
+                                            data_type timestamp_type);
 
   /**
    * @brief Converts the row-column data and outputs to column bufferrs.
@@ -217,15 +220,10 @@ class reader::impl {
   std::vector<column_buffer> decode_data(device_span<char const> data,
                                          device_span<uint64_t const> row_offsets,
                                          host_span<data_type const> column_types,
-                                         rmm::cuda_stream_view stream);
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr);
 
  private:
-  rmm::mr::device_memory_resource* mr_ = nullptr;
-  std::unique_ptr<datasource> source_;
-  std::string filepath_;
-  std::string compression_type_;
-  const csv_reader_options opts_;
-
   cudf::size_type num_records_ = 0;  // Number of rows with actual data
   int num_active_cols_         = 0;  // Number of columns to read
   int num_actual_cols_         = 0;  // Number of columns in the dataset
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index e080ea3a2ca..f6fa475fdd4 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -160,15 +160,17 @@ table_with_metadata read_json(json_reader_options const& options,
 
 table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_memory_resource* mr)
 {
-  namespace csv = cudf::io::detail::csv;
-
   CUDF_FUNC_RANGE();
 
   auto datasources = make_datasources(options.get_source());
-  auto reader =
-    std::make_unique<csv::reader>(std::move(datasources), options, rmm::cuda_stream_default, mr);
 
-  return reader->read();
+  CUDF_EXPECTS(datasources.size() == 1, "Only a single source is currently supported.");
+
+  return cudf::io::detail::csv::read_csv(  //
+    std::move(datasources[0]),
+    options,
+    rmm::cuda_stream_default,
+    mr);
 }
 
 // Freeform API wraps the detail writer class API

From 6492349504a9105ac28b01806b42fcf9ad0a1cb8 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 14 Aug 2021 07:20:01 -0500
Subject: [PATCH 03/26] pass parse_options explicitly in csv_reader

---
 cpp/src/io/csv/reader_impl.cu          | 153 +++++++++++++------------
 cpp/src/io/csv/reader_impl.hpp         |  24 ++--
 cpp/src/io/utilities/parsing_utils.cuh |   2 +-
 3 files changed, 96 insertions(+), 83 deletions(-)

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 81c7fc65d3f..f06334b7258 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -123,7 +123,7 @@ string removeQuotes(string str, char quotechar)
  * The first row can be either the header row, or the first data row
  */
 std::vector<std::string> setColumnNames(std::vector<char> const& header,
-                                        parse_options_view const& opts,
+                                        parse_options_view const& parse_opts,
                                         int header_row,
                                         std::string prefix)
 {
@@ -138,35 +138,36 @@ std::vector<std::string> setColumnNames(std::vector<char> const& header,
   bool quotation = false;
   for (size_t pos = 0, prev = 0; pos < first_row.size(); ++pos) {
     // Flip the quotation flag if current character is a quotechar
-    if (first_row[pos] == opts.quotechar) {
+    if (first_row[pos] == parse_opts.quotechar) {
       quotation = !quotation;
     }
     // Check if end of a column/row
-    else if (pos == first_row.size() - 1 || (!quotation && first_row[pos] == opts.terminator) ||
-             (!quotation && first_row[pos] == opts.delimiter)) {
+    else if (pos == first_row.size() - 1 ||
+             (!quotation && first_row[pos] == parse_opts.terminator) ||
+             (!quotation && first_row[pos] == parse_opts.delimiter)) {
       // This is the header, add the column name
       if (header_row >= 0) {
         // Include the current character, in case the line is not terminated
         int col_name_len = pos - prev + 1;
         // Exclude the delimiter/terminator is present
-        if (first_row[pos] == opts.delimiter || first_row[pos] == opts.terminator) {
+        if (first_row[pos] == parse_opts.delimiter || first_row[pos] == parse_opts.terminator) {
           --col_name_len;
         }
         // Also exclude '\r' character at the end of the column name if it's
         // part of the terminator
-        if (col_name_len > 0 && opts.terminator == '\n' && first_row[pos] == '\n' &&
+        if (col_name_len > 0 && parse_opts.terminator == '\n' && first_row[pos] == '\n' &&
             first_row[pos - 1] == '\r') {
           --col_name_len;
         }
 
         const string new_col_name(first_row.data() + prev, col_name_len);
-        col_names.push_back(removeQuotes(new_col_name, opts.quotechar));
+        col_names.push_back(removeQuotes(new_col_name, parse_opts.quotechar));
 
         // Stop parsing when we hit the line terminator; relevant when there is
         // a blank line following the header. In this case, first_row includes
         // multiple line terminators at the end, as the new recStart belongs to
         // a line that comes after the blank line(s)
-        if (!quotation && first_row[pos] == opts.terminator) { break; }
+        if (!quotation && first_row[pos] == parse_opts.terminator) { break; }
       } else {
         // This is the first data row, add the automatically generated name
         col_names.push_back(prefix + std::to_string(num_cols));
@@ -174,8 +175,8 @@ std::vector<std::string> setColumnNames(std::vector<char> const& header,
       num_cols++;
 
       // Skip adjacent delimiters if delim_whitespace is set
-      while (opts.multi_delimiter && pos < first_row.size() && first_row[pos] == opts.delimiter &&
-             first_row[pos + 1] == opts.delimiter) {
+      while (parse_opts.multi_delimiter && pos < first_row.size() &&
+             first_row[pos] == parse_opts.delimiter && first_row[pos + 1] == parse_opts.delimiter) {
         ++pos;
       }
       prev = pos + 1;
@@ -198,17 +199,18 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream)
 
 std::pair<rmm::device_uvector<char>, reader_impl::selected_rows_offsets>
 reader_impl::select_data_and_row_offsets(cudf::io::datasource* source,
-                                         csv_reader_options const& opts_,
+                                         csv_reader_options const& reader_opts,
+                                         parse_options const& parse_opts,
                                          rmm::cuda_stream_view stream)
 {
-  auto range_offset  = opts_.get_byte_range_offset();
-  auto range_size    = opts_.get_byte_range_size();
-  auto skip_rows     = opts_.get_skiprows();
-  auto skip_end_rows = opts_.get_skipfooter();
-  auto num_rows      = opts_.get_nrows();
+  auto range_offset  = reader_opts.get_byte_range_offset();
+  auto range_size    = reader_opts.get_byte_range_size();
+  auto skip_rows     = reader_opts.get_skiprows();
+  auto skip_end_rows = reader_opts.get_skipfooter();
+  auto num_rows      = reader_opts.get_nrows();
 
   // we use "infer", but really we are just getting the string name of the compression type.
-  auto compression_type = infer_compression_type(opts_.get_compression(), "", {});
+  auto compression_type = infer_compression_type(reader_opts.get_compression(), "", {});
 
   if (range_offset > 0 || range_size > 0) {
     CUDF_EXPECTS(compression_type == "none",
@@ -217,8 +219,8 @@ reader_impl::select_data_and_row_offsets(cudf::io::datasource* source,
   size_t map_range_size = 0;
   if (range_size != 0) {
     auto num_given_dtypes =
-      std::visit([](const auto& dtypes) { return dtypes.size(); }, opts_.get_dtypes());
-    const auto num_columns = std::max(opts_.get_names().size(), num_given_dtypes);
+      std::visit([](const auto& dtypes) { return dtypes.size(); }, reader_opts.get_dtypes());
+    const auto num_columns = std::max(reader_opts.get_names().size(), num_given_dtypes);
     map_range_size         = range_size + calculateMaxRowSize(num_columns);
   }
 
@@ -242,15 +244,17 @@ reader_impl::select_data_and_row_offsets(cudf::io::datasource* source,
                                  skip_end_rows <= 0 && num_rows == -1;
 
     // With byte range, find the start of the first data row
-    size_t const data_start_offset = (range_offset != 0) ? find_first_row_start(h_data) : 0;
+    size_t const data_start_offset =
+      (range_offset != 0) ? find_first_row_start(parse_opts.terminator, h_data) : 0;
 
     // TODO: Allow parsing the header outside the mapped range
-    CUDF_EXPECTS((range_offset == 0 || opts_.get_header() < 0),
+    CUDF_EXPECTS((range_offset == 0 || reader_opts.get_header() < 0),
                  "byte_range offset with header not supported");
 
     // Gather row offsets
     auto data_row_offsets =
-      load_data_and_gather_row_offsets(opts_,
+      load_data_and_gather_row_offsets(reader_opts,
+                                       parse_opts,
                                        h_data,
                                        data_start_offset,
                                        (range_size) ? range_size : h_data.size(),
@@ -304,23 +308,26 @@ std::vector<data_type> reader_impl::select_data_types(std::vector<data_type> con
 }
 
 table_with_metadata reader_impl::read(cudf::io::datasource* source,
-                                      csv_reader_options const& opts_,
+                                      csv_reader_options const& reader_opts,
+                                      parse_options const& parse_opts,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
 {
-  auto const data_row_offsets = select_data_and_row_offsets(source, opts_, stream);
-  auto const& data            = data_row_offsets.first;
-  auto const& row_offsets     = data_row_offsets.second;
+  auto const data_row_offsets =
+    select_data_and_row_offsets(source, reader_opts, parse_opts, stream);
+  auto const& data        = data_row_offsets.first;
+  auto const& row_offsets = data_row_offsets.second;
 
   // Exclude the end-of-data row from number of rows with actual data
   num_records_ = std::max(row_offsets.size(), 1ul) - 1;
 
   // Check if the user gave us a list of column names
-  if (not opts_.get_names().empty()) {
-    column_flags_.resize(opts_.get_names().size(), column_parse::enabled);
-    col_names_ = opts_.get_names();
+  if (not reader_opts.get_names().empty()) {
+    column_flags_.resize(reader_opts.get_names().size(), column_parse::enabled);
+    col_names_ = reader_opts.get_names();
   } else {
-    col_names_ = setColumnNames(header_, opts.view(), opts_.get_header(), opts_.get_prefix());
+    col_names_ = setColumnNames(
+      header_, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix());
 
     num_actual_cols_ = num_active_cols_ = col_names_.size();
 
@@ -339,7 +346,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
       // Operator [] inserts a default-initialized value if the given key is not
       // present
       if (++col_names_histogram[col_name] > 1) {
-        if (opts_.is_enabled_mangle_dupe_cols()) {
+        if (reader_opts.is_enabled_mangle_dupe_cols()) {
           // Rename duplicates of column X as X.1, X.2, ...; First appearance
           // stays as X
           do {
@@ -355,21 +362,23 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
 
     // Update the number of columns to be processed, if some might have been
     // removed
-    if (!opts_.is_enabled_mangle_dupe_cols()) { num_active_cols_ = col_names_histogram.size(); }
+    if (!reader_opts.is_enabled_mangle_dupe_cols()) {
+      num_active_cols_ = col_names_histogram.size();
+    }
   }
 
   // User can specify which columns should be parsed
-  if (!opts_.get_use_cols_indexes().empty() || !opts_.get_use_cols_names().empty()) {
+  if (!reader_opts.get_use_cols_indexes().empty() || !reader_opts.get_use_cols_names().empty()) {
     std::fill(column_flags_.begin(), column_flags_.end(), column_parse::disabled);
 
-    for (const auto index : opts_.get_use_cols_indexes()) {
+    for (const auto index : reader_opts.get_use_cols_indexes()) {
       column_flags_[index] = column_parse::enabled;
     }
-    num_active_cols_ = std::unordered_set<int>(opts_.get_use_cols_indexes().begin(),
-                                               opts_.get_use_cols_indexes().end())
+    num_active_cols_ = std::unordered_set<int>(reader_opts.get_use_cols_indexes().begin(),
+                                               reader_opts.get_use_cols_indexes().end())
                          .size();
 
-    for (const auto& name : opts_.get_use_cols_names()) {
+    for (const auto& name : reader_opts.get_use_cols_names()) {
       const auto it = std::find(col_names_.begin(), col_names_.end(), name);
       if (it != col_names_.end()) {
         auto curr_it = it - col_names_.begin();
@@ -382,12 +391,13 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
   }
 
   // User can specify which columns should be read as datetime
-  if (!opts_.get_parse_dates_indexes().empty() || !opts_.get_parse_dates_names().empty()) {
-    for (const auto index : opts_.get_parse_dates_indexes()) {
+  if (!reader_opts.get_parse_dates_indexes().empty() ||
+      !reader_opts.get_parse_dates_names().empty()) {
+    for (const auto index : reader_opts.get_parse_dates_indexes()) {
       column_flags_[index] |= column_parse::as_datetime;
     }
 
-    for (const auto& name : opts_.get_parse_dates_names()) {
+    for (const auto& name : reader_opts.get_parse_dates_names()) {
       auto it = std::find(col_names_.begin(), col_names_.end(), name);
       if (it != col_names_.end()) {
         column_flags_[it - col_names_.begin()] |= column_parse::as_datetime;
@@ -396,12 +406,12 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
   }
 
   // User can specify which columns should be parsed as hexadecimal
-  if (!opts_.get_parse_hex_indexes().empty() || !opts_.get_parse_hex_names().empty()) {
-    for (const auto index : opts_.get_parse_hex_indexes()) {
+  if (!reader_opts.get_parse_hex_indexes().empty() || !reader_opts.get_parse_hex_names().empty()) {
+    for (const auto index : reader_opts.get_parse_hex_indexes()) {
       column_flags_[index] |= column_parse::as_hexadecimal;
     }
 
-    for (const auto& name : opts_.get_parse_hex_names()) {
+    for (const auto& name : reader_opts.get_parse_hex_names()) {
       auto it = std::find(col_names_.begin(), col_names_.end(), name);
       if (it != col_names_.end()) {
         column_flags_[it - col_names_.begin()] |= column_parse::as_hexadecimal;
@@ -416,11 +426,12 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
   auto out_columns = std::vector<std::unique_ptr<cudf::column>>();
 
   bool has_to_infer_column_types =
-    std::visit([](const auto& dtypes) { return dtypes.empty(); }, opts_.get_dtypes());
+    std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes());
 
   std::vector<data_type> column_types;
   if (has_to_infer_column_types) {
-    column_types = infer_column_types(data, row_offsets, opts_.get_timestamp_type(), stream);
+    column_types =
+      infer_column_types(parse_opts, data, row_offsets, reader_opts.get_timestamp_type(), stream);
   } else {
     column_types = std::visit(
       cudf::detail::visitor_overload{
@@ -429,25 +440,25 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
           return select_data_types(data_types);
         },
         [&](const std::vector<string>& dtypes) {
-          return parse_column_types(dtypes, opts_.get_timestamp_type());
+          return parse_column_types(dtypes, reader_opts.get_timestamp_type());
         }},
-      opts_.get_dtypes());
+      reader_opts.get_dtypes());
   }
 
   out_columns.reserve(column_types.size());
 
   if (num_records_ != 0) {
-    auto out_buffers = decode_data(data, row_offsets, column_types, stream, mr);
+    auto out_buffers = decode_data(parse_opts, data, row_offsets, column_types, stream, mr);
     for (size_t i = 0; i < column_types.size(); ++i) {
       metadata.column_names.emplace_back(out_buffers[i].name);
-      if (column_types[i].id() == type_id::STRING && opts.quotechar != '\0' &&
-          opts.doublequote == true) {
+      if (column_types[i].id() == type_id::STRING && parse_opts.quotechar != '\0' &&
+          parse_opts.doublequote == true) {
         // PANDAS' default behavior of enabling doublequote for two consecutive
         // quotechars in quoted fields results in reduction to a single quotechar
         // TODO: Would be much more efficient to perform this operation in-place
         // during the conversion stage
-        const std::string quotechar(1, opts.quotechar);
-        const std::string dblquotechar(2, opts.quotechar);
+        const std::string quotechar(1, parse_opts.quotechar);
+        const std::string dblquotechar(2, parse_opts.quotechar);
         std::unique_ptr<column> col = cudf::make_strings_column(*out_buffers[i]._strings, stream);
         out_columns.emplace_back(
           cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr));
@@ -470,19 +481,20 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
   return {std::make_unique<table>(std::move(out_columns)), std::move(metadata)};
 }
 
-size_t reader_impl::find_first_row_start(host_span<char const> data)
+size_t reader_impl::find_first_row_start(char row_terminator, host_span<char const> data)
 {
   // For now, look for the first terminator (assume the first terminator isn't within a quote)
   // TODO: Attempt to infer this from the data
   size_t pos = 0;
-  while (pos < data.size() && data[pos] != opts.terminator) {
+  while (pos < data.size() && data[pos] != row_terminator) {
     ++pos;
   }
   return std::min(pos + 1, data.size());
 }
 
 std::pair<rmm::device_uvector<char>, reader_impl::selected_rows_offsets>
-reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& opts_,
+reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& reader_opts,
+                                              parse_options const& parse_opts,
                                               host_span<char const> data,
                                               size_t range_begin,
                                               size_t range_end,
@@ -498,7 +510,7 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& opts_,
   hostdevice_vector<uint64_t> row_ctx(max_blocks);
   size_t buffer_pos  = std::min(range_begin - std::min(range_begin, sizeof(char)), data.size());
   size_t pos         = std::min(range_begin, data.size());
-  size_t header_rows = (opts_.get_header() >= 0) ? opts_.get_header() + 1 : 0;
+  size_t header_rows = (reader_opts.get_header() >= 0) ? reader_opts.get_header() + 1 : 0;
   uint64_t ctx       = 0;
 
   // For compatibility with the previous parser, a row is considered in-range if the
@@ -524,7 +536,7 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& opts_,
 
     // Pass 1: Count the potential number of rows in each character block for each
     // possible parser state at the beginning of the block.
-    uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(opts.view(),
+    uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(),
                                                                  row_ctx.device_ptr(),
                                                                  device_span<uint64_t>(),
                                                                  d_data,
@@ -563,7 +575,7 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& opts_,
                                stream.value()));
 
       // Pass 2: Output row offsets
-      cudf::io::csv::gpu::gather_row_offsets(opts.view(),
+      cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(),
                                              row_ctx.device_ptr(),
                                              all_row_offsets,
                                              d_data,
@@ -600,8 +612,8 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& opts_,
       // num_rows does not include blank rows
       if (num_rows >= 0) {
         if (all_row_offsets.size() > header_rows + static_cast<size_t>(num_rows)) {
-          size_t num_blanks =
-            cudf::io::csv::gpu::count_blank_rows(opts.view(), d_data, all_row_offsets, stream);
+          size_t num_blanks = cudf::io::csv::gpu::count_blank_rows(
+            parse_opts.view(), d_data, all_row_offsets, stream);
           if (all_row_offsets.size() - num_blanks > header_rows + static_cast<size_t>(num_rows)) {
             // Got the desired number of rows
             break;
@@ -620,7 +632,7 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& opts_,
   } while (pos < data.size());
 
   auto const non_blank_row_offsets =
-    io::csv::gpu::remove_blank_rows(opts.view(), d_data, all_row_offsets, stream);
+    io::csv::gpu::remove_blank_rows(parse_opts.view(), d_data, all_row_offsets, stream);
   auto row_offsets = selected_rows_offsets{std::move(all_row_offsets), non_blank_row_offsets};
 
   // Remove header rows and extract header
@@ -647,7 +659,8 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& opts_,
   return {std::move(d_data), std::move(row_offsets)};
 }
 
-std::vector<data_type> reader_impl::infer_column_types(device_span<char const> data,
+std::vector<data_type> reader_impl::infer_column_types(parse_options const& parse_opts,
+                                                       device_span<char const> data,
                                                        device_span<uint64_t const> row_offsets,
                                                        data_type timestamp_type,
                                                        rmm::cuda_stream_view stream)
@@ -657,7 +670,7 @@ std::vector<data_type> reader_impl::infer_column_types(device_span<char const> d
     dtypes.resize(num_active_cols_, data_type{type_id::EMPTY});
   } else {
     auto column_stats =
-      cudf::io::csv::gpu::detect_column_types(opts.view(),
+      cudf::io::csv::gpu::detect_column_types(parse_opts.view(),
                                               data,
                                               make_device_uvector_async(column_flags_, stream),
                                               row_offsets,
@@ -788,7 +801,8 @@ std::vector<data_type> reader_impl::parse_column_types(
   return dtypes;
 }
 
-std::vector<column_buffer> reader_impl::decode_data(device_span<char const> data,
+std::vector<column_buffer> reader_impl::decode_data(parse_options const& parse_opts,
+                                                    device_span<char const> data,
                                                     device_span<uint64_t const> row_offsets,
                                                     host_span<data_type const> column_types,
                                                     rmm::cuda_stream_view stream,
@@ -823,7 +837,7 @@ std::vector<column_buffer> reader_impl::decode_data(device_span<char const> data
     h_valid[i] = out_buffers[i].null_mask();
   }
 
-  cudf::io::csv::gpu::decode_row_column_data(opts.view(),
+  cudf::io::csv::gpu::decode_row_column_data(parse_opts.view(),
                                              data,
                                              make_device_uvector_async(column_flags_, stream),
                                              row_offsets,
@@ -933,10 +947,8 @@ parse_options make_parse_options(csv_reader_options const& reader_opts,
   return parse_opts;
 }
 
-reader_impl::reader_impl(parse_options&& parse_options, int32_t num_actual_columns)
-  : opts(std::move(parse_options)),
-    num_actual_cols_(num_actual_columns),
-    num_active_cols_(num_actual_columns)
+reader_impl::reader_impl(int32_t num_actual_columns)
+  : num_actual_cols_(num_actual_columns), num_active_cols_(num_actual_columns)
 {
 }
 
@@ -953,10 +965,9 @@ table_with_metadata read_csv(std::unique_ptr<cudf::io::datasource>&& source,
   auto num_actual_columns = options.get_names().size();
 
   auto reader = std::make_unique<reader_impl>(  //
-    std::move(parse_options),
     num_actual_columns);
 
-  return reader->read(source.get(), options, stream, mr);
+  return reader->read(source.get(), options, parse_options, stream, mr);
 }
 
 }  // namespace csv
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index f25beb5e561..1c0923b03b3 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -71,22 +71,22 @@ class reader_impl {
   /**
    * @brief Constructor from a dataset source with reader options.
    *
-   * @param stream CUDA stream used for device memory operations and kernel launches
    */
-  explicit reader_impl(parse_options&& parse_options, int32_t num_actual_columns);
+  explicit reader_impl(int32_t num_actual_columns);
 
   /**
    * @brief Read an entire set or a subset of data and returns a set of columns.
    *
    * @param source Dataset source
    * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches.
+   * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource to use for device memory allocation
    *
    * @return The set of columns along with metadata
    */
   table_with_metadata read(cudf::io::datasource* source,
-                           csv_reader_options const& opts_,
+                           csv_reader_options const& reader_opts,
+                           parse_options const& parse_opts,
                            rmm::cuda_stream_view stream,
                            rmm::mr::device_memory_resource* mr);
 
@@ -133,7 +133,8 @@ class reader_impl {
    */
   std::pair<rmm::device_uvector<char>, reader_impl::selected_rows_offsets>
   select_data_and_row_offsets(cudf::io::datasource* source,
-                              csv_reader_options const& opts_,
+                              csv_reader_options const& reader_opts,
+                              parse_options const& parse_opts,
                               rmm::cuda_stream_view stream);
 
   /**
@@ -152,7 +153,8 @@ class reader_impl {
    * @return Input data and row offsets in the device memory
    */
   std::pair<rmm::device_uvector<char>, reader_impl::selected_rows_offsets>
-  load_data_and_gather_row_offsets(csv_reader_options const& opts_,
+  load_data_and_gather_row_offsets(csv_reader_options const& reader_opts,
+                                   parse_options const& parse_opts,
                                    host_span<char const> data,
                                    size_t range_begin,
                                    size_t range_end,
@@ -168,7 +170,7 @@ class reader_impl {
    *
    * @return Byte position of the first row
    */
-  size_t find_first_row_start(host_span<char const> data);
+  size_t find_first_row_start(char row_terminator, host_span<char const> data);
 
   /**
    * @brief Automatically infers each column's data type based on the CSV's data within that column.
@@ -178,7 +180,8 @@ class reader_impl {
    * @param stream The stream to which the type inference-kernel will be dispatched
    * @return The columns' inferred data types
    */
-  std::vector<data_type> infer_column_types(device_span<char const> data,
+  std::vector<data_type> infer_column_types(parse_options const& parse_opts,
+                                            device_span<char const> data,
                                             device_span<uint64_t const> row_offsets,
                                             data_type timestamp_type,
                                             rmm::cuda_stream_view stream);
@@ -217,7 +220,8 @@ class reader_impl {
    *
    * @return list of column buffers of decoded data, or ptr/size in the case of strings.
    */
-  std::vector<column_buffer> decode_data(device_span<char const> data,
+  std::vector<column_buffer> decode_data(parse_options const& parse_opts,
+                                         device_span<char const> data,
                                          device_span<uint64_t const> row_offsets,
                                          host_span<data_type const> column_types,
                                          rmm::cuda_stream_view stream,
@@ -228,8 +232,6 @@ class reader_impl {
   int num_active_cols_         = 0;  // Number of columns to read
   int num_actual_cols_         = 0;  // Number of columns in the dataset
 
-  // Parsing options
-  parse_options opts{};
   std::vector<column_parse::flags> column_flags_;
 
   // Intermediate data
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 88297423b9b..39857478c1f 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -68,7 +68,7 @@ struct parse_options {
   cudf::detail::optional_trie trie_na;
   bool multi_delimiter;
 
-  parse_options_view view()
+  parse_options_view view() const
   {
     return {delimiter,
             terminator,

From 3e365b59417f82eb1416e41d033bdf9e16917cf0 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 14 Aug 2021 07:34:28 -0500
Subject: [PATCH 04/26] replace csv reader impl::select_data_types with pure
 function

---
 cpp/src/io/csv/reader_impl.cu  | 39 ++++++++++++++++++++++------------
 cpp/src/io/csv/reader_impl.hpp | 16 --------------
 2 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index f06334b7258..292062b0c59 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -272,36 +272,43 @@ reader_impl::select_data_and_row_offsets(cudf::io::datasource* source,
   return {rmm::device_uvector<char>{0, stream}, selected_rows_offsets{stream}};
 }
 
-std::vector<data_type> reader_impl::select_data_types(
-  std::map<std::string, data_type> const& col_type_map)
+std::vector<data_type> get_data_types_from_column_names(
+  std::map<std::string, data_type> const& column_type_map,
+  std::vector<column_parse::flags> const& column_flags,
+  std::vector<std::string> const& column_names,
+  int32_t num_actual_columns)
 {
   std::vector<data_type> selected_dtypes;
 
-  for (int col = 0; col < num_actual_cols_; col++) {
-    if (column_flags_[col] & column_parse::enabled) {
-      auto const col_type_it = col_type_map.find(col_names_[col]);
-      CUDF_EXPECTS(col_type_it != col_type_map.end(),
+  for (int32_t i = 0; i < num_actual_columns; i++) {
+    if (column_flags[i] & column_parse::enabled) {
+      auto const col_type_it = column_type_map.find(column_names[i]);
+      CUDF_EXPECTS(col_type_it != column_type_map.end(),
                    "Must specify data types for all active columns");
       selected_dtypes.emplace_back(col_type_it->second);
     }
   }
+
   return selected_dtypes;
 }
 
-std::vector<data_type> reader_impl::select_data_types(std::vector<data_type> const& dtypes)
+std::vector<data_type> select_data_types(std::vector<data_type> const& dtypes,
+                                         std::vector<column_parse::flags> const& column_flags,
+                                         int32_t num_actual_columns,
+                                         int32_t num_active_columns)
 {
   std::vector<data_type> selected_dtypes;
 
   if (dtypes.size() == 1) {
     // If it's a single dtype, assign that dtype to all active columns
-    selected_dtypes.resize(num_active_cols_, dtypes.front());
+    selected_dtypes.resize(num_active_columns, dtypes.front());
   } else {
     // If it's a list, assign dtypes to active columns in the given order
-    CUDF_EXPECTS(static_cast<int>(dtypes.size()) >= num_actual_cols_,
+    CUDF_EXPECTS(static_cast<int>(dtypes.size()) >= num_actual_columns,
                  "Must specify data types for all columns");
 
-    for (int col = 0; col < num_actual_cols_; col++) {
-      if (column_flags_[col] & column_parse::enabled) { selected_dtypes.emplace_back(dtypes[col]); }
+    for (int i = 0; i < num_actual_columns; i++) {
+      if (column_flags[i] & column_parse::enabled) { selected_dtypes.emplace_back(dtypes[i]); }
     }
   }
   return selected_dtypes;
@@ -435,9 +442,15 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
   } else {
     column_types = std::visit(
       cudf::detail::visitor_overload{
-        [&](const std::vector<data_type>& data_types) { return select_data_types(data_types); },
+        [&](const std::vector<data_type>& data_types) {
+          return select_data_types(data_types, column_flags_, num_actual_cols_, num_active_cols_);
+        },
         [&](const std::map<std::string, data_type>& data_types) {
-          return select_data_types(data_types);
+          return get_data_types_from_column_names(  //
+            data_types,
+            column_flags_,
+            col_names_,
+            num_actual_cols_);
         },
         [&](const std::vector<string>& dtypes) {
           return parse_column_types(dtypes, reader_opts.get_timestamp_type());
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index 1c0923b03b3..1dc74759694 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -186,22 +186,6 @@ class reader_impl {
                                             data_type timestamp_type,
                                             rmm::cuda_stream_view stream);
 
-  /**
-   * @brief Selects the columns' data types from the map of dtypes.
-   *
-   * @param col_type_map Column name -> data type map specifying the columns' target data types
-   * @return Sorted list of selected columns' data types
-   */
-  std::vector<data_type> select_data_types(std::map<std::string, data_type> const& col_type_map);
-
-  /**
-   * @brief Selects the columns' data types from the list of dtypes.
-   *
-   * @param dtypes Vector of data types specifying the columns' target data types
-   * @return Sorted list of selected columns' data types
-   */
-  std::vector<data_type> select_data_types(std::vector<data_type> const& dtypes);
-
   /**
    * @brief Parses the columns' data types from the vector of dtypes that are provided as strings.
    *

From a4497c0c2e9f72a2ca6195a8988489dbd3352494 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 14 Aug 2021 07:57:20 -0500
Subject: [PATCH 05/26] replace csv reader impl::column_flags_ member with
 local variable

---
 cpp/src/io/csv/reader_impl.cu  | 97 +++++++++++++++++++---------------
 cpp/src/io/csv/reader_impl.hpp |  7 +--
 2 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 292062b0c59..15795807d65 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -273,8 +273,8 @@ reader_impl::select_data_and_row_offsets(cudf::io::datasource* source,
 }
 
 std::vector<data_type> get_data_types_from_column_names(
-  std::map<std::string, data_type> const& column_type_map,
   std::vector<column_parse::flags> const& column_flags,
+  std::map<std::string, data_type> const& column_type_map,
   std::vector<std::string> const& column_names,
   int32_t num_actual_columns)
 {
@@ -292,8 +292,8 @@ std::vector<data_type> get_data_types_from_column_names(
   return selected_dtypes;
 }
 
-std::vector<data_type> select_data_types(std::vector<data_type> const& dtypes,
-                                         std::vector<column_parse::flags> const& column_flags,
+std::vector<data_type> select_data_types(std::vector<column_parse::flags> const& column_flags,
+                                         std::vector<data_type> const& dtypes,
                                          int32_t num_actual_columns,
                                          int32_t num_active_columns)
 {
@@ -328,9 +328,11 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
   // Exclude the end-of-data row from number of rows with actual data
   num_records_ = std::max(row_offsets.size(), 1ul) - 1;
 
+  auto column_flags = std::vector<column_parse::flags>();
+
   // Check if the user gave us a list of column names
   if (not reader_opts.get_names().empty()) {
-    column_flags_.resize(reader_opts.get_names().size(), column_parse::enabled);
+    column_flags.resize(reader_opts.get_names().size(), column_parse::enabled);
     col_names_ = reader_opts.get_names();
   } else {
     col_names_ = setColumnNames(
@@ -338,7 +340,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
 
     num_actual_cols_ = num_active_cols_ = col_names_.size();
 
-    column_flags_.resize(num_actual_cols_, column_parse::enabled);
+    column_flags.resize(num_actual_cols_, column_parse::enabled);
 
     // Rename empty column names to "Unnamed: col_index"
     for (size_t col_idx = 0; col_idx < col_names_.size(); ++col_idx) {
@@ -361,8 +363,8 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
           } while (col_names_histogram[col_name]++);
         } else {
           // All duplicate columns will be ignored; First appearance is parsed
-          const auto idx     = &col_name - col_names_.data();
-          column_flags_[idx] = column_parse::disabled;
+          const auto idx    = &col_name - col_names_.data();
+          column_flags[idx] = column_parse::disabled;
         }
       }
     }
@@ -376,10 +378,10 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
 
   // User can specify which columns should be parsed
   if (!reader_opts.get_use_cols_indexes().empty() || !reader_opts.get_use_cols_names().empty()) {
-    std::fill(column_flags_.begin(), column_flags_.end(), column_parse::disabled);
+    std::fill(column_flags.begin(), column_flags.end(), column_parse::disabled);
 
     for (const auto index : reader_opts.get_use_cols_indexes()) {
-      column_flags_[index] = column_parse::enabled;
+      column_flags[index] = column_parse::enabled;
     }
     num_active_cols_ = std::unordered_set<int>(reader_opts.get_use_cols_indexes().begin(),
                                                reader_opts.get_use_cols_indexes().end())
@@ -389,8 +391,8 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
       const auto it = std::find(col_names_.begin(), col_names_.end(), name);
       if (it != col_names_.end()) {
         auto curr_it = it - col_names_.begin();
-        if (column_flags_[curr_it] == column_parse::disabled) {
-          column_flags_[curr_it] = column_parse::enabled;
+        if (column_flags[curr_it] == column_parse::disabled) {
+          column_flags[curr_it] = column_parse::enabled;
           num_active_cols_++;
         }
       }
@@ -401,13 +403,13 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
   if (!reader_opts.get_parse_dates_indexes().empty() ||
       !reader_opts.get_parse_dates_names().empty()) {
     for (const auto index : reader_opts.get_parse_dates_indexes()) {
-      column_flags_[index] |= column_parse::as_datetime;
+      column_flags[index] |= column_parse::as_datetime;
     }
 
     for (const auto& name : reader_opts.get_parse_dates_names()) {
       auto it = std::find(col_names_.begin(), col_names_.end(), name);
       if (it != col_names_.end()) {
-        column_flags_[it - col_names_.begin()] |= column_parse::as_datetime;
+        column_flags[it - col_names_.begin()] |= column_parse::as_datetime;
       }
     }
   }
@@ -415,13 +417,13 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
   // User can specify which columns should be parsed as hexadecimal
   if (!reader_opts.get_parse_hex_indexes().empty() || !reader_opts.get_parse_hex_names().empty()) {
     for (const auto index : reader_opts.get_parse_hex_indexes()) {
-      column_flags_[index] |= column_parse::as_hexadecimal;
+      column_flags[index] |= column_parse::as_hexadecimal;
     }
 
     for (const auto& name : reader_opts.get_parse_hex_names()) {
       auto it = std::find(col_names_.begin(), col_names_.end(), name);
       if (it != col_names_.end()) {
-        column_flags_[it - col_names_.begin()] |= column_parse::as_hexadecimal;
+        column_flags[it - col_names_.begin()] |= column_parse::as_hexadecimal;
       }
     }
   }
@@ -437,23 +439,23 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
 
   std::vector<data_type> column_types;
   if (has_to_infer_column_types) {
-    column_types =
-      infer_column_types(parse_opts, data, row_offsets, reader_opts.get_timestamp_type(), stream);
+    column_types = infer_column_types(
+      parse_opts, column_flags, data, row_offsets, reader_opts.get_timestamp_type(), stream);
   } else {
     column_types = std::visit(
       cudf::detail::visitor_overload{
         [&](const std::vector<data_type>& data_types) {
-          return select_data_types(data_types, column_flags_, num_actual_cols_, num_active_cols_);
+          return select_data_types(column_flags, data_types, num_actual_cols_, num_active_cols_);
         },
         [&](const std::map<std::string, data_type>& data_types) {
           return get_data_types_from_column_names(  //
+            column_flags,
             data_types,
-            column_flags_,
             col_names_,
             num_actual_cols_);
         },
         [&](const std::vector<string>& dtypes) {
-          return parse_column_types(dtypes, reader_opts.get_timestamp_type());
+          return parse_column_types(column_flags, dtypes, reader_opts.get_timestamp_type());
         }},
       reader_opts.get_dtypes());
   }
@@ -461,7 +463,8 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
   out_columns.reserve(column_types.size());
 
   if (num_records_ != 0) {
-    auto out_buffers = decode_data(parse_opts, data, row_offsets, column_types, stream, mr);
+    auto out_buffers =
+      decode_data(parse_opts, column_flags, data, row_offsets, column_types, stream, mr);
     for (size_t i = 0; i < column_types.size(); ++i) {
       metadata.column_names.emplace_back(out_buffers[i].name);
       if (column_types[i].id() == type_id::STRING && parse_opts.quotechar != '\0' &&
@@ -486,7 +489,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
     }
     // Handle empty metadata
     for (int col = 0; col < num_actual_cols_; ++col) {
-      if (column_flags_[col] & column_parse::enabled) {
+      if (column_flags[col] & column_parse::enabled) {
         metadata.column_names.emplace_back(col_names_[col]);
       }
     }
@@ -672,11 +675,13 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& reader_o
   return {std::move(d_data), std::move(row_offsets)};
 }
 
-std::vector<data_type> reader_impl::infer_column_types(parse_options const& parse_opts,
-                                                       device_span<char const> data,
-                                                       device_span<uint64_t const> row_offsets,
-                                                       data_type timestamp_type,
-                                                       rmm::cuda_stream_view stream)
+std::vector<data_type> reader_impl::infer_column_types(
+  parse_options const& parse_opts,
+  std::vector<column_parse::flags> const& column_flags,
+  device_span<char const> data,
+  device_span<uint64_t const> row_offsets,
+  data_type timestamp_type,
+  rmm::cuda_stream_view stream)
 {
   std::vector<data_type> dtypes;
   if (num_records_ == 0) {
@@ -685,7 +690,7 @@ std::vector<data_type> reader_impl::infer_column_types(parse_options const& pars
     auto column_stats =
       cudf::io::csv::gpu::detect_column_types(parse_opts.view(),
                                               data,
-                                              make_device_uvector_async(column_flags_, stream),
+                                              make_device_uvector_async(column_flags, stream),
                                               row_offsets,
                                               num_active_cols_,
                                               stream);
@@ -740,13 +745,15 @@ std::vector<data_type> reader_impl::infer_column_types(parse_options const& pars
 }
 
 std::vector<data_type> reader_impl::parse_column_types(
-  const std::vector<std::string>& types_as_strings, data_type timestamp_type)
+  std::vector<column_parse::flags>& column_flags,
+  std::vector<std::string> const& types_as_strings,
+  data_type timestamp_type)
 {
   std::vector<data_type> dtypes;
 
-  const bool is_dict = std::all_of(types_as_strings.begin(),
+  bool const is_dict = std::all_of(types_as_strings.begin(),
                                    types_as_strings.end(),
-                                   [](const auto& s) { return s.find(':') != std::string::npos; });
+                                   [](auto const& s) { return s.find(':') != std::string::npos; });
 
   if (!is_dict) {
     if (types_as_strings.size() == 1) {
@@ -756,7 +763,7 @@ std::vector<data_type> reader_impl::parse_column_types(
       std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]);
       dtypes.resize(num_active_cols_, dtype_);
       for (int col = 0; col < num_actual_cols_; col++) {
-        column_flags_[col] |= col_flags_;
+        column_flags[col] |= col_flags_;
       }
       CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
     } else {
@@ -767,10 +774,10 @@ std::vector<data_type> reader_impl::parse_column_types(
       auto dtype_ = std::back_inserter(dtypes);
 
       for (int col = 0; col < num_actual_cols_; col++) {
-        if (column_flags_[col] & column_parse::enabled) {
+        if (column_flags[col] & column_parse::enabled) {
           column_parse::flags col_flags_;
           std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]);
-          column_flags_[col] |= col_flags_;
+          column_flags[col] |= col_flags_;
           CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
         }
       }
@@ -789,12 +796,12 @@ std::vector<data_type> reader_impl::parse_column_types(
     auto dtype_ = std::back_inserter(dtypes);
 
     for (int col = 0; col < num_actual_cols_; col++) {
-      if (column_flags_[col] & column_parse::enabled) {
+      if (column_flags[col] & column_parse::enabled) {
         CUDF_EXPECTS(col_type_map.find(col_names_[col]) != col_type_map.end(),
                      "Must specify data types for all active columns");
         column_parse::flags col_flags_;
         std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[col_names_[col]]);
-        column_flags_[col] |= col_flags_;
+        column_flags[col] |= col_flags_;
         CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
       }
     }
@@ -814,19 +821,21 @@ std::vector<data_type> reader_impl::parse_column_types(
   return dtypes;
 }
 
-std::vector<column_buffer> reader_impl::decode_data(parse_options const& parse_opts,
-                                                    device_span<char const> data,
-                                                    device_span<uint64_t const> row_offsets,
-                                                    host_span<data_type const> column_types,
-                                                    rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr)
+std::vector<column_buffer> reader_impl::decode_data(
+  parse_options const& parse_opts,
+  std::vector<column_parse::flags> const& column_flags,
+  device_span<char const> data,
+  device_span<uint64_t const> row_offsets,
+  host_span<data_type const> column_types,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
 {
   // Alloc output; columns' data memory is still expected for empty dataframe
   std::vector<column_buffer> out_buffers;
   out_buffers.reserve(column_types.size());
 
   for (int col = 0, active_col = 0; col < num_actual_cols_; ++col) {
-    if (column_flags_[col] & column_parse::enabled) {
+    if (column_flags[col] & column_parse::enabled) {
       const bool is_final_allocation = column_types[active_col].id() != type_id::STRING;
       auto out_buffer =
         column_buffer(column_types[active_col],
@@ -852,7 +861,7 @@ std::vector<column_buffer> reader_impl::decode_data(parse_options const& parse_o
 
   cudf::io::csv::gpu::decode_row_column_data(parse_opts.view(),
                                              data,
-                                             make_device_uvector_async(column_flags_, stream),
+                                             make_device_uvector_async(column_flags, stream),
                                              row_offsets,
                                              make_device_uvector_async(column_types, stream),
                                              make_device_uvector_async(h_data, stream),
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index 1dc74759694..f2681cd2cf7 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -181,6 +181,7 @@ class reader_impl {
    * @return The columns' inferred data types
    */
   std::vector<data_type> infer_column_types(parse_options const& parse_opts,
+                                            std::vector<column_parse::flags> const& column_flags,
                                             device_span<char const> data,
                                             device_span<uint64_t const> row_offsets,
                                             data_type timestamp_type,
@@ -193,7 +194,8 @@ class reader_impl {
    * types
    * @return List of columns' data types
    */
-  std::vector<data_type> parse_column_types(std::vector<std::string> const& types_as_strings,
+  std::vector<data_type> parse_column_types(std::vector<column_parse::flags>& column_flags,
+                                            std::vector<std::string> const& types_as_strings,
                                             data_type timestamp_type);
 
   /**
@@ -205,6 +207,7 @@ class reader_impl {
    * @return list of column buffers of decoded data, or ptr/size in the case of strings.
    */
   std::vector<column_buffer> decode_data(parse_options const& parse_opts,
+                                         std::vector<column_parse::flags> const& column_flags,
                                          device_span<char const> data,
                                          device_span<uint64_t const> row_offsets,
                                          host_span<data_type const> column_types,
@@ -216,8 +219,6 @@ class reader_impl {
   int num_active_cols_         = 0;  // Number of columns to read
   int num_actual_cols_         = 0;  // Number of columns in the dataset
 
-  std::vector<column_parse::flags> column_flags_;
-
   // Intermediate data
   std::vector<std::string> col_names_;
   std::vector<char> header_;

From 6d708b75895d3e8da956cbdbb39ae3ca21b392f8 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 14 Aug 2021 08:00:50 -0500
Subject: [PATCH 06/26] make csv reader impl::find_first_row_start a standalone
 function

---
 cpp/src/io/csv/reader_impl.cu  | 22 +++++++++++-----------
 cpp/src/io/csv/reader_impl.hpp |  9 ---------
 2 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 15795807d65..d3ebfb6fcbb 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -197,6 +197,17 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream)
   container.resize(1, stream);
 }
 
+size_t find_first_row_start(char row_terminator, host_span<char const> data)
+{
+  // For now, look for the first terminator (assume the first terminator isn't within a quote)
+  // TODO: Attempt to infer this from the data
+  size_t pos = 0;
+  while (pos < data.size() && data[pos] != row_terminator) {
+    ++pos;
+  }
+  return std::min(pos + 1, data.size());
+}
+
 std::pair<rmm::device_uvector<char>, reader_impl::selected_rows_offsets>
 reader_impl::select_data_and_row_offsets(cudf::io::datasource* source,
                                          csv_reader_options const& reader_opts,
@@ -497,17 +508,6 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
   return {std::make_unique<table>(std::move(out_columns)), std::move(metadata)};
 }
 
-size_t reader_impl::find_first_row_start(char row_terminator, host_span<char const> data)
-{
-  // For now, look for the first terminator (assume the first terminator isn't within a quote)
-  // TODO: Attempt to infer this from the data
-  size_t pos = 0;
-  while (pos < data.size() && data[pos] != row_terminator) {
-    ++pos;
-  }
-  return std::min(pos + 1, data.size());
-}
-
 std::pair<rmm::device_uvector<char>, reader_impl::selected_rows_offsets>
 reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& reader_opts,
                                               parse_options const& parse_opts,
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index f2681cd2cf7..db37d7daea8 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -163,15 +163,6 @@ class reader_impl {
                                    bool load_whole_file,
                                    rmm::cuda_stream_view stream);
 
-  /**
-   * @brief Find the start position of the first data row
-   *
-   * @param h_data Uncompressed input data in host memory
-   *
-   * @return Byte position of the first row
-   */
-  size_t find_first_row_start(char row_terminator, host_span<char const> data);
-
   /**
    * @brief Automatically infers each column's data type based on the CSV's data within that column.
    *

From 26e37e2ea198881f4494bff2bb7602137e75b601 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 14 Aug 2021 08:08:30 -0500
Subject: [PATCH 07/26] make csv reader impl::col_names_ a local variable

---
 cpp/src/io/csv/reader_impl.cu  | 54 ++++++++++++++++++----------------
 cpp/src/io/csv/reader_impl.hpp |  4 ++-
 2 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index d3ebfb6fcbb..80f91f985ea 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -340,29 +340,30 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
   num_records_ = std::max(row_offsets.size(), 1ul) - 1;
 
   auto column_flags = std::vector<column_parse::flags>();
+  auto column_names = std::vector<std::string>();
 
   // Check if the user gave us a list of column names
   if (not reader_opts.get_names().empty()) {
     column_flags.resize(reader_opts.get_names().size(), column_parse::enabled);
-    col_names_ = reader_opts.get_names();
+    column_names = reader_opts.get_names();
   } else {
-    col_names_ = setColumnNames(
+    column_names = setColumnNames(
       header_, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix());
 
-    num_actual_cols_ = num_active_cols_ = col_names_.size();
+    num_actual_cols_ = num_active_cols_ = column_names.size();
 
     column_flags.resize(num_actual_cols_, column_parse::enabled);
 
     // Rename empty column names to "Unnamed: col_index"
-    for (size_t col_idx = 0; col_idx < col_names_.size(); ++col_idx) {
-      if (col_names_[col_idx].empty()) {
-        col_names_[col_idx] = string("Unnamed: ") + std::to_string(col_idx);
+    for (size_t col_idx = 0; col_idx < column_names.size(); ++col_idx) {
+      if (column_names[col_idx].empty()) {
+        column_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx);
       }
     }
 
     // Looking for duplicates
     std::unordered_map<string, int> col_names_histogram;
-    for (auto& col_name : col_names_) {
+    for (auto& col_name : column_names) {
       // Operator [] inserts a default-initialized value if the given key is not
       // present
       if (++col_names_histogram[col_name] > 1) {
@@ -374,7 +375,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
           } while (col_names_histogram[col_name]++);
         } else {
           // All duplicate columns will be ignored; First appearance is parsed
-          const auto idx    = &col_name - col_names_.data();
+          const auto idx    = &col_name - column_names.data();
           column_flags[idx] = column_parse::disabled;
         }
       }
@@ -399,9 +400,9 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
                          .size();
 
     for (const auto& name : reader_opts.get_use_cols_names()) {
-      const auto it = std::find(col_names_.begin(), col_names_.end(), name);
-      if (it != col_names_.end()) {
-        auto curr_it = it - col_names_.begin();
+      const auto it = std::find(column_names.begin(), column_names.end(), name);
+      if (it != column_names.end()) {
+        auto curr_it = it - column_names.begin();
         if (column_flags[curr_it] == column_parse::disabled) {
           column_flags[curr_it] = column_parse::enabled;
           num_active_cols_++;
@@ -418,9 +419,9 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
     }
 
     for (const auto& name : reader_opts.get_parse_dates_names()) {
-      auto it = std::find(col_names_.begin(), col_names_.end(), name);
-      if (it != col_names_.end()) {
-        column_flags[it - col_names_.begin()] |= column_parse::as_datetime;
+      auto it = std::find(column_names.begin(), column_names.end(), name);
+      if (it != column_names.end()) {
+        column_flags[it - column_names.begin()] |= column_parse::as_datetime;
       }
     }
   }
@@ -432,9 +433,9 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
     }
 
     for (const auto& name : reader_opts.get_parse_hex_names()) {
-      auto it = std::find(col_names_.begin(), col_names_.end(), name);
-      if (it != col_names_.end()) {
-        column_flags[it - col_names_.begin()] |= column_parse::as_hexadecimal;
+      auto it = std::find(column_names.begin(), column_names.end(), name);
+      if (it != column_names.end()) {
+        column_flags[it - column_names.begin()] |= column_parse::as_hexadecimal;
       }
     }
   }
@@ -462,11 +463,12 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
           return get_data_types_from_column_names(  //
             column_flags,
             data_types,
-            col_names_,
+            column_names,
             num_actual_cols_);
         },
         [&](const std::vector<string>& dtypes) {
-          return parse_column_types(column_flags, dtypes, reader_opts.get_timestamp_type());
+          return parse_column_types(
+            column_flags, column_names, dtypes, reader_opts.get_timestamp_type());
         }},
       reader_opts.get_dtypes());
   }
@@ -474,8 +476,8 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
   out_columns.reserve(column_types.size());
 
   if (num_records_ != 0) {
-    auto out_buffers =
-      decode_data(parse_opts, column_flags, data, row_offsets, column_types, stream, mr);
+    auto out_buffers = decode_data(
+      parse_opts, column_flags, column_names, data, row_offsets, column_types, stream, mr);
     for (size_t i = 0; i < column_types.size(); ++i) {
       metadata.column_names.emplace_back(out_buffers[i].name);
       if (column_types[i].id() == type_id::STRING && parse_opts.quotechar != '\0' &&
@@ -501,7 +503,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
     // Handle empty metadata
     for (int col = 0; col < num_actual_cols_; ++col) {
       if (column_flags[col] & column_parse::enabled) {
-        metadata.column_names.emplace_back(col_names_[col]);
+        metadata.column_names.emplace_back(column_names[col]);
       }
     }
   }
@@ -746,6 +748,7 @@ std::vector<data_type> reader_impl::infer_column_types(
 
 std::vector<data_type> reader_impl::parse_column_types(
   std::vector<column_parse::flags>& column_flags,
+  std::vector<std::string> const& column_names,
   std::vector<std::string> const& types_as_strings,
   data_type timestamp_type)
 {
@@ -797,10 +800,10 @@ std::vector<data_type> reader_impl::parse_column_types(
 
     for (int col = 0; col < num_actual_cols_; col++) {
       if (column_flags[col] & column_parse::enabled) {
-        CUDF_EXPECTS(col_type_map.find(col_names_[col]) != col_type_map.end(),
+        CUDF_EXPECTS(col_type_map.find(column_names[col]) != col_type_map.end(),
                      "Must specify data types for all active columns");
         column_parse::flags col_flags_;
-        std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[col_names_[col]]);
+        std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[column_names[col]]);
         column_flags[col] |= col_flags_;
         CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
       }
@@ -824,6 +827,7 @@ std::vector<data_type> reader_impl::parse_column_types(
 std::vector<column_buffer> reader_impl::decode_data(
   parse_options const& parse_opts,
   std::vector<column_parse::flags> const& column_flags,
+  std::vector<std::string> const& column_names,
   device_span<char const> data,
   device_span<uint64_t const> row_offsets,
   host_span<data_type const> column_types,
@@ -844,7 +848,7 @@ std::vector<column_buffer> reader_impl::decode_data(
                       stream,
                       is_final_allocation ? mr : rmm::mr::get_current_device_resource());
 
-      out_buffer.name         = col_names_[col];
+      out_buffer.name         = column_names[col];
       out_buffer.null_count() = UNKNOWN_NULL_COUNT;
       out_buffers.emplace_back(std::move(out_buffer));
       active_col++;
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index db37d7daea8..9ce65dcc28f 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -186,6 +186,7 @@ class reader_impl {
    * @return List of columns' data types
    */
   std::vector<data_type> parse_column_types(std::vector<column_parse::flags>& column_flags,
+                                            std::vector<std::string> const& column_names,
                                             std::vector<std::string> const& types_as_strings,
                                             data_type timestamp_type);
 
@@ -199,6 +200,7 @@ class reader_impl {
    */
   std::vector<column_buffer> decode_data(parse_options const& parse_opts,
                                          std::vector<column_parse::flags> const& column_flags,
+                                         std::vector<std::string> const& column_names,
                                          device_span<char const> data,
                                          device_span<uint64_t const> row_offsets,
                                          host_span<data_type const> column_types,
@@ -211,7 +213,7 @@ class reader_impl {
   int num_actual_cols_         = 0;  // Number of columns in the dataset
 
   // Intermediate data
-  std::vector<std::string> col_names_;
+  // std::vector<std::string> col_names_;
   std::vector<char> header_;
 };
 

From 9d84753a449eca35de3a88a76f17798550df82bc Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 14 Aug 2021 12:05:58 -0500
Subject: [PATCH 08/26] replace csv reader impl::num_records with local
 variable.

---
 cpp/src/io/csv/reader_impl.cu  | 45 ++++++++++++++++++++++------------
 cpp/src/io/csv/reader_impl.hpp |  8 +++---
 2 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 80f91f985ea..373ade3b2e1 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -122,10 +122,10 @@ string removeQuotes(string str, char quotechar)
  * @brief Parse the first row to set the column names in the raw_csv parameter.
  * The first row can be either the header row, or the first data row
  */
-std::vector<std::string> setColumnNames(std::vector<char> const& header,
-                                        parse_options_view const& parse_opts,
-                                        int header_row,
-                                        std::string prefix)
+std::vector<std::string> get_column_names(std::vector<char> const& header,
+                                          parse_options_view const& parse_opts,
+                                          int header_row,
+                                          std::string prefix)
 {
   std::vector<std::string> col_names;
 
@@ -337,8 +337,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
   auto const& row_offsets = data_row_offsets.second;
 
   // Exclude the end-of-data row from number of rows with actual data
-  num_records_ = std::max(row_offsets.size(), 1ul) - 1;
-
+  auto num_records  = std::max(row_offsets.size(), 1ul) - 1;
   auto column_flags = std::vector<column_parse::flags>();
   auto column_names = std::vector<std::string>();
 
@@ -347,7 +346,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
     column_flags.resize(reader_opts.get_names().size(), column_parse::enabled);
     column_names = reader_opts.get_names();
   } else {
-    column_names = setColumnNames(
+    column_names = get_column_names(
       header_, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix());
 
     num_actual_cols_ = num_active_cols_ = column_names.size();
@@ -451,8 +450,14 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
 
   std::vector<data_type> column_types;
   if (has_to_infer_column_types) {
-    column_types = infer_column_types(
-      parse_opts, column_flags, data, row_offsets, reader_opts.get_timestamp_type(), stream);
+    column_types = infer_column_types(  //
+      parse_opts,
+      column_flags,
+      data,
+      row_offsets,
+      num_records,
+      reader_opts.get_timestamp_type(),
+      stream);
   } else {
     column_types = std::visit(
       cudf::detail::visitor_overload{
@@ -475,9 +480,17 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
 
   out_columns.reserve(column_types.size());
 
-  if (num_records_ != 0) {
-    auto out_buffers = decode_data(
-      parse_opts, column_flags, column_names, data, row_offsets, column_types, stream, mr);
+  if (num_records != 0) {
+    auto out_buffers = decode_data(  //
+      parse_opts,
+      column_flags,
+      column_names,
+      data,
+      row_offsets,
+      column_types,
+      num_records,
+      stream,
+      mr);
     for (size_t i = 0; i < column_types.size(); ++i) {
       metadata.column_names.emplace_back(out_buffers[i].name);
       if (column_types[i].id() == type_id::STRING && parse_opts.quotechar != '\0' &&
@@ -682,11 +695,12 @@ std::vector<data_type> reader_impl::infer_column_types(
   std::vector<column_parse::flags> const& column_flags,
   device_span<char const> data,
   device_span<uint64_t const> row_offsets,
+  int32_t num_records,
   data_type timestamp_type,
   rmm::cuda_stream_view stream)
 {
   std::vector<data_type> dtypes;
-  if (num_records_ == 0) {
+  if (num_records == 0) {
     dtypes.resize(num_active_cols_, data_type{type_id::EMPTY});
   } else {
     auto column_stats =
@@ -704,7 +718,7 @@ std::vector<data_type> reader_impl::infer_column_types(
                                            column_stats[col].negative_small_int_count +
                                            column_stats[col].positive_small_int_count;
 
-      if (column_stats[col].null_count == num_records_) {
+      if (column_stats[col].null_count == num_records) {
         // Entire column is NULL; allocate the smallest amount of memory
         dtypes.emplace_back(cudf::type_id::INT8);
       } else if (column_stats[col].string_count > 0L) {
@@ -831,6 +845,7 @@ std::vector<column_buffer> reader_impl::decode_data(
   device_span<char const> data,
   device_span<uint64_t const> row_offsets,
   host_span<data_type const> column_types,
+  int32_t num_records,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
@@ -843,7 +858,7 @@ std::vector<column_buffer> reader_impl::decode_data(
       const bool is_final_allocation = column_types[active_col].id() != type_id::STRING;
       auto out_buffer =
         column_buffer(column_types[active_col],
-                      num_records_,
+                      num_records,
                       true,
                       stream,
                       is_final_allocation ? mr : rmm::mr::get_current_device_resource());
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index 9ce65dcc28f..006c9ce9800 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -175,6 +175,7 @@ class reader_impl {
                                             std::vector<column_parse::flags> const& column_flags,
                                             device_span<char const> data,
                                             device_span<uint64_t const> row_offsets,
+                                            int32_t num_records,
                                             data_type timestamp_type,
                                             rmm::cuda_stream_view stream);
 
@@ -204,16 +205,15 @@ class reader_impl {
                                          device_span<char const> data,
                                          device_span<uint64_t const> row_offsets,
                                          host_span<data_type const> column_types,
+                                         int32_t num_records,
                                          rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr);
 
  private:
-  cudf::size_type num_records_ = 0;  // Number of rows with actual data
-  int num_active_cols_         = 0;  // Number of columns to read
-  int num_actual_cols_         = 0;  // Number of columns in the dataset
+  int num_active_cols_ = 0;  // Number of columns to read
+  int num_actual_cols_ = 0;  // Number of columns in the dataset
 
   // Intermediate data
-  // std::vector<std::string> col_names_;
   std::vector<char> header_;
 };
 

From 7ce862ee16cc83c5bbbd7f3d6667ea518527e437 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 14 Aug 2021 12:18:13 -0500
Subject: [PATCH 09/26] convert csv reader impl ::num_actual_columns and
 ::num_active_columns to local variables

---
 cpp/src/io/csv/reader_impl.cu  | 109 ++++++++++++++++++---------------
 cpp/src/io/csv/reader_impl.hpp |  11 ++--
 2 files changed, 64 insertions(+), 56 deletions(-)

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 373ade3b2e1..84a4399f465 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -337,9 +337,11 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
   auto const& row_offsets = data_row_offsets.second;
 
   // Exclude the end-of-data row from number of rows with actual data
-  auto num_records  = std::max(row_offsets.size(), 1ul) - 1;
-  auto column_flags = std::vector<column_parse::flags>();
-  auto column_names = std::vector<std::string>();
+  auto num_records        = std::max(row_offsets.size(), 1ul) - 1;
+  auto column_flags       = std::vector<column_parse::flags>();
+  auto column_names       = std::vector<std::string>();
+  auto num_actual_columns = static_cast<int32_t>(reader_opts.get_names().size());
+  auto num_active_columns = num_actual_columns;
 
   // Check if the user gave us a list of column names
   if (not reader_opts.get_names().empty()) {
@@ -349,9 +351,9 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
     column_names = get_column_names(
       header_, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix());
 
-    num_actual_cols_ = num_active_cols_ = column_names.size();
+    num_actual_columns = num_active_columns = column_names.size();
 
-    column_flags.resize(num_actual_cols_, column_parse::enabled);
+    column_flags.resize(num_actual_columns, column_parse::enabled);
 
     // Rename empty column names to "Unnamed: col_index"
     for (size_t col_idx = 0; col_idx < column_names.size(); ++col_idx) {
@@ -383,7 +385,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
     // Update the number of columns to be processed, if some might have been
     // removed
     if (!reader_opts.is_enabled_mangle_dupe_cols()) {
-      num_active_cols_ = col_names_histogram.size();
+      num_active_columns = col_names_histogram.size();
     }
   }
 
@@ -394,9 +396,9 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
     for (const auto index : reader_opts.get_use_cols_indexes()) {
       column_flags[index] = column_parse::enabled;
     }
-    num_active_cols_ = std::unordered_set<int>(reader_opts.get_use_cols_indexes().begin(),
-                                               reader_opts.get_use_cols_indexes().end())
-                         .size();
+    num_active_columns = std::unordered_set<int>(reader_opts.get_use_cols_indexes().begin(),
+                                                 reader_opts.get_use_cols_indexes().end())
+                           .size();
 
     for (const auto& name : reader_opts.get_use_cols_names()) {
       const auto it = std::find(column_names.begin(), column_names.end(), name);
@@ -404,7 +406,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
         auto curr_it = it - column_names.begin();
         if (column_flags[curr_it] == column_parse::disabled) {
           column_flags[curr_it] = column_parse::enabled;
-          num_active_cols_++;
+          num_active_columns++;
         }
       }
     }
@@ -440,7 +442,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
   }
 
   // Return empty table rather than exception if nothing to load
-  if (num_active_cols_ == 0) { return {std::make_unique<table>(), {}}; }
+  if (num_active_columns == 0) { return {std::make_unique<table>(), {}}; }
 
   auto metadata    = table_metadata{};
   auto out_columns = std::vector<std::unique_ptr<cudf::column>>();
@@ -456,26 +458,32 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
       data,
       row_offsets,
       num_records,
+      num_active_columns,
       reader_opts.get_timestamp_type(),
       stream);
   } else {
-    column_types = std::visit(
-      cudf::detail::visitor_overload{
-        [&](const std::vector<data_type>& data_types) {
-          return select_data_types(column_flags, data_types, num_actual_cols_, num_active_cols_);
-        },
-        [&](const std::map<std::string, data_type>& data_types) {
-          return get_data_types_from_column_names(  //
-            column_flags,
-            data_types,
-            column_names,
-            num_actual_cols_);
-        },
-        [&](const std::vector<string>& dtypes) {
-          return parse_column_types(
-            column_flags, column_names, dtypes, reader_opts.get_timestamp_type());
-        }},
-      reader_opts.get_dtypes());
+    column_types =
+      std::visit(cudf::detail::visitor_overload{
+                   [&](const std::vector<data_type>& data_types) {
+                     return select_data_types(
+                       column_flags, data_types, num_actual_columns, num_active_columns);
+                   },
+                   [&](const std::map<std::string, data_type>& data_types) {
+                     return get_data_types_from_column_names(  //
+                       column_flags,
+                       data_types,
+                       column_names,
+                       num_actual_columns);
+                   },
+                   [&](const std::vector<string>& dtypes) {
+                     return parse_column_types(column_flags,
+                                               column_names,
+                                               dtypes,
+                                               num_actual_columns,
+                                               num_active_columns,
+                                               reader_opts.get_timestamp_type());
+                   }},
+                 reader_opts.get_dtypes());
   }
 
   out_columns.reserve(column_types.size());
@@ -489,6 +497,8 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
       row_offsets,
       column_types,
       num_records,
+      num_actual_columns,
+      num_active_columns,
       stream,
       mr);
     for (size_t i = 0; i < column_types.size(); ++i) {
@@ -514,7 +524,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
       out_columns.emplace_back(make_empty_column(column_types[i]));
     }
     // Handle empty metadata
-    for (int col = 0; col < num_actual_cols_; ++col) {
+    for (int col = 0; col < num_actual_columns; ++col) {
       if (column_flags[col] & column_parse::enabled) {
         metadata.column_names.emplace_back(column_names[col]);
       }
@@ -696,24 +706,25 @@ std::vector<data_type> reader_impl::infer_column_types(
   device_span<char const> data,
   device_span<uint64_t const> row_offsets,
   int32_t num_records,
+  int32_t num_active_columns,
   data_type timestamp_type,
   rmm::cuda_stream_view stream)
 {
   std::vector<data_type> dtypes;
   if (num_records == 0) {
-    dtypes.resize(num_active_cols_, data_type{type_id::EMPTY});
+    dtypes.resize(num_active_columns, data_type{type_id::EMPTY});
   } else {
     auto column_stats =
       cudf::io::csv::gpu::detect_column_types(parse_opts.view(),
                                               data,
                                               make_device_uvector_async(column_flags, stream),
                                               row_offsets,
-                                              num_active_cols_,
+                                              num_active_columns,
                                               stream);
 
     stream.synchronize();
 
-    for (int col = 0; col < num_active_cols_; col++) {
+    for (int col = 0; col < num_active_columns; col++) {
       unsigned long long int_count_total = column_stats[col].big_int_count +
                                            column_stats[col].negative_small_int_count +
                                            column_stats[col].positive_small_int_count;
@@ -764,6 +775,8 @@ std::vector<data_type> reader_impl::parse_column_types(
   std::vector<column_parse::flags>& column_flags,
   std::vector<std::string> const& column_names,
   std::vector<std::string> const& types_as_strings,
+  int32_t num_actual_columns,
+  int32_t num_active_columns,
   data_type timestamp_type)
 {
   std::vector<data_type> dtypes;
@@ -778,19 +791,19 @@ std::vector<data_type> reader_impl::parse_column_types(
       data_type dtype_;
       column_parse::flags col_flags_;
       std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]);
-      dtypes.resize(num_active_cols_, dtype_);
-      for (int col = 0; col < num_actual_cols_; col++) {
+      dtypes.resize(num_active_columns, dtype_);
+      for (int col = 0; col < num_actual_columns; col++) {
         column_flags[col] |= col_flags_;
       }
       CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
     } else {
       // If it's a list, assign dtypes to active columns in the given order
-      CUDF_EXPECTS(static_cast<int>(types_as_strings.size()) >= num_actual_cols_,
+      CUDF_EXPECTS(static_cast<int>(types_as_strings.size()) >= num_actual_columns,
                    "Must specify data types for all columns");
 
       auto dtype_ = std::back_inserter(dtypes);
 
-      for (int col = 0; col < num_actual_cols_; col++) {
+      for (int col = 0; col < num_actual_columns; col++) {
         if (column_flags[col] & column_parse::enabled) {
           column_parse::flags col_flags_;
           std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]);
@@ -812,7 +825,7 @@ std::vector<data_type> reader_impl::parse_column_types(
 
     auto dtype_ = std::back_inserter(dtypes);
 
-    for (int col = 0; col < num_actual_cols_; col++) {
+    for (int col = 0; col < num_actual_columns; col++) {
       if (column_flags[col] & column_parse::enabled) {
         CUDF_EXPECTS(col_type_map.find(column_names[col]) != col_type_map.end(),
                      "Must specify data types for all active columns");
@@ -846,6 +859,8 @@ std::vector<column_buffer> reader_impl::decode_data(
   device_span<uint64_t const> row_offsets,
   host_span<data_type const> column_types,
   int32_t num_records,
+  int32_t num_actual_columns,
+  int32_t num_active_columns,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
@@ -853,7 +868,7 @@ std::vector<column_buffer> reader_impl::decode_data(
   std::vector<column_buffer> out_buffers;
   out_buffers.reserve(column_types.size());
 
-  for (int col = 0, active_col = 0; col < num_actual_cols_; ++col) {
+  for (int col = 0, active_col = 0; col < num_actual_columns; ++col) {
     if (column_flags[col] & column_parse::enabled) {
       const bool is_final_allocation = column_types[active_col].id() != type_id::STRING;
       auto out_buffer =
@@ -870,10 +885,10 @@ std::vector<column_buffer> reader_impl::decode_data(
     }
   }
 
-  thrust::host_vector<void*> h_data(num_active_cols_);
-  thrust::host_vector<bitmask_type*> h_valid(num_active_cols_);
+  thrust::host_vector<void*> h_data(num_active_columns);
+  thrust::host_vector<bitmask_type*> h_valid(num_active_columns);
 
-  for (int i = 0; i < num_active_cols_; ++i) {
+  for (int i = 0; i < num_active_columns; ++i) {
     h_data[i]  = out_buffers[i].data();
     h_valid[i] = out_buffers[i].null_mask();
   }
@@ -988,10 +1003,7 @@ parse_options make_parse_options(csv_reader_options const& reader_opts,
   return parse_opts;
 }
 
-reader_impl::reader_impl(int32_t num_actual_columns)
-  : num_actual_cols_(num_actual_columns), num_active_cols_(num_actual_columns)
-{
-}
+reader_impl::reader_impl() {}
 
 table_with_metadata read_csv(std::unique_ptr<cudf::io::datasource>&& source,
                              csv_reader_options const& options,
@@ -1003,12 +1015,7 @@ table_with_metadata read_csv(std::unique_ptr<cudf::io::datasource>&& source,
 
   auto parse_options = make_parse_options(options, stream);
 
-  auto num_actual_columns = options.get_names().size();
-
-  auto reader = std::make_unique<reader_impl>(  //
-    num_actual_columns);
-
-  return reader->read(source.get(), options, parse_options, stream, mr);
+  return std::make_unique<reader_impl>()->read(source.get(), options, parse_options, stream, mr);
 }
 
 }  // namespace csv
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index 006c9ce9800..238e83eadc3 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -72,7 +72,7 @@ class reader_impl {
    * @brief Constructor from a dataset source with reader options.
    *
    */
-  explicit reader_impl(int32_t num_actual_columns);
+  explicit reader_impl();
 
   /**
    * @brief Read an entire set or a subset of data and returns a set of columns.
@@ -176,6 +176,7 @@ class reader_impl {
                                             device_span<char const> data,
                                             device_span<uint64_t const> row_offsets,
                                             int32_t num_records,
+                                            int32_t num_active_columns,
                                             data_type timestamp_type,
                                             rmm::cuda_stream_view stream);
 
@@ -189,6 +190,8 @@ class reader_impl {
   std::vector<data_type> parse_column_types(std::vector<column_parse::flags>& column_flags,
                                             std::vector<std::string> const& column_names,
                                             std::vector<std::string> const& types_as_strings,
+                                            int32_t num_actual_columns,
+                                            int32_t num_active_columns,
                                             data_type timestamp_type);
 
   /**
@@ -206,14 +209,12 @@ class reader_impl {
                                          device_span<uint64_t const> row_offsets,
                                          host_span<data_type const> column_types,
                                          int32_t num_records,
+                                         int32_t num_actual_columns,
+                                         int32_t num_active_columns,
                                          rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr);
 
  private:
-  int num_active_cols_ = 0;  // Number of columns to read
-  int num_actual_cols_ = 0;  // Number of columns in the dataset
-
-  // Intermediate data
   std::vector<char> header_;
 };
 

From 9010fe195d4a13e90d94114fd40567929940ae3e Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 14 Aug 2021 16:08:56 -0500
Subject: [PATCH 10/26] remove csv reader class and impl class in favor of
 functions

---
 cpp/src/io/csv/reader_impl.cu  | 214 +++++++++++++++++++++++--------
 cpp/src/io/csv/reader_impl.hpp | 224 ---------------------------------
 2 files changed, 159 insertions(+), 279 deletions(-)
 delete mode 100644 cpp/src/io/csv/reader_impl.hpp

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 84a4399f465..1ee321b30fd 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -19,15 +19,22 @@
  * @brief cuDF-IO CSV reader class implementation
  */
 
-#include "reader_impl.hpp"
+#include "csv_common.h"
+#include "csv_gpu.h"
 
 #include <io/comp/io_uncomp.h>
+#include <io/utilities/column_buffer.hpp>
+#include <io/utilities/hostdevice_vector.hpp>
 #include <io/utilities/parsing_utils.cuh>
+#include <io/utilities/trie.cuh>
 #include <io/utilities/type_conversion.cuh>
 
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/detail/utilities/visitor_overload.hpp>
+#include <cudf/io/csv.hpp>
+#include <cudf/io/datasource.hpp>
+#include <cudf/io/detail/csv.hpp>
 #include <cudf/io/types.hpp>
 #include <cudf/strings/replace.hpp>
 #include <cudf/table/table.hpp>
@@ -38,10 +45,14 @@
 
 #include <algorithm>
 #include <iostream>
+#include <memory>
 #include <numeric>
+#include <string>
 #include <tuple>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
+#include <vector>
 
 using std::string;
 using std::vector;
@@ -57,6 +68,83 @@ namespace csv {
 using namespace cudf::io::csv;
 using namespace cudf::io;
 
+namespace {
+
+/**
+ * @brief Offsets of CSV rows in device memory, accessed through a shrinkable span.
+ *
+ * Row offsets are stored this way to avoid reallocation/copies when discarding front or back
+ * elements.
+ */
+class selected_rows_offsets {
+  rmm::device_uvector<uint64_t> all;
+  device_span<uint64_t const> selected;
+
+ public:
+  selected_rows_offsets(rmm::device_uvector<uint64_t>&& data,
+                        device_span<uint64_t const> selected_span)
+    : all{std::move(data)}, selected{selected_span}
+  {
+  }
+  selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {}
+
+  operator device_span<uint64_t const>() const { return selected; }
+  void shrink(size_t size)
+  {
+    CUDF_EXPECTS(size <= selected.size(), "New size must be smaller");
+    selected = selected.subspan(0, size);
+  }
+  void erase_first_n(size_t n)
+  {
+    CUDF_EXPECTS(n <= selected.size(), "Too many elements to remove");
+    selected = selected.subspan(n, selected.size() - n);
+  }
+  auto size() const { return selected.size(); }
+  auto data() const { return selected.data(); }
+};
+
+}  // namespace
+
+std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather_row_offsets(
+  csv_reader_options const& reader_opts,
+  parse_options const& parse_opts,
+  std::vector<char>& header,
+  host_span<char const> data,
+  size_t range_begin,
+  size_t range_end,
+  size_t skip_rows,
+  int64_t num_rows,
+  bool load_whole_file,
+  rmm::cuda_stream_view stream);
+
+std::vector<data_type> parse_column_types(std::vector<column_parse::flags>& column_flags,
+                                          std::vector<std::string> const& column_names,
+                                          std::vector<std::string> const& types_as_strings,
+                                          int32_t num_actual_columns,
+                                          int32_t num_active_columns,
+                                          data_type timestamp_type);
+
+std::vector<data_type> infer_column_types(parse_options const& parse_opts,
+                                          std::vector<column_parse::flags> const& column_flags,
+                                          device_span<char const> data,
+                                          device_span<uint64_t const> row_offsets,
+                                          int32_t num_records,
+                                          int32_t num_active_columns,
+                                          data_type timestamp_type,
+                                          rmm::cuda_stream_view stream);
+
+std::vector<column_buffer> decode_data(parse_options const& parse_opts,
+                                       std::vector<column_parse::flags> const& column_flags,
+                                       std::vector<std::string> const& column_names,
+                                       device_span<char const> data,
+                                       device_span<uint64_t const> row_offsets,
+                                       host_span<data_type const> column_types,
+                                       int32_t num_records,
+                                       int32_t num_actual_columns,
+                                       int32_t num_active_columns,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::mr::device_memory_resource* mr);
+
 /**
  * @brief Estimates the maximum expected length or a row, based on the number
  * of columns
@@ -208,11 +296,12 @@ size_t find_first_row_start(char row_terminator, host_span<char const> data)
   return std::min(pos + 1, data.size());
 }
 
-std::pair<rmm::device_uvector<char>, reader_impl::selected_rows_offsets>
-reader_impl::select_data_and_row_offsets(cudf::io::datasource* source,
-                                         csv_reader_options const& reader_opts,
-                                         parse_options const& parse_opts,
-                                         rmm::cuda_stream_view stream)
+std::pair<rmm::device_uvector<char>, selected_rows_offsets> select_data_and_row_offsets(
+  cudf::io::datasource* source,
+  csv_reader_options const& reader_opts,
+  std::vector<char>& header,
+  parse_options const& parse_opts,
+  rmm::cuda_stream_view stream)
 {
   auto range_offset  = reader_opts.get_byte_range_offset();
   auto range_size    = reader_opts.get_byte_range_size();
@@ -266,6 +355,7 @@ reader_impl::select_data_and_row_offsets(cudf::io::datasource* source,
     auto data_row_offsets =
       load_data_and_gather_row_offsets(reader_opts,
                                        parse_opts,
+                                       header,
                                        h_data,
                                        data_start_offset,
                                        (range_size) ? range_size : h_data.size(),
@@ -325,14 +415,17 @@ std::vector<data_type> select_data_types(std::vector<column_parse::flags> const&
   return selected_dtypes;
 }
 
-table_with_metadata reader_impl::read(cudf::io::datasource* source,
-                                      csv_reader_options const& reader_opts,
-                                      parse_options const& parse_opts,
-                                      rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+table_with_metadata read_csv(cudf::io::datasource* source,
+                             csv_reader_options const& reader_opts,
+                             parse_options const& parse_opts,
+                             rmm::cuda_stream_view stream,
+                             rmm::mr::device_memory_resource* mr)
 {
+  std::vector<char> header;
+
   auto const data_row_offsets =
-    select_data_and_row_offsets(source, reader_opts, parse_opts, stream);
+    select_data_and_row_offsets(source, reader_opts, header, parse_opts, stream);
+
   auto const& data        = data_row_offsets.first;
   auto const& row_offsets = data_row_offsets.second;
 
@@ -349,7 +442,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
     column_names = reader_opts.get_names();
   } else {
     column_names = get_column_names(
-      header_, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix());
+      header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix());
 
     num_actual_columns = num_active_columns = column_names.size();
 
@@ -533,16 +626,32 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source,
   return {std::make_unique<table>(std::move(out_columns)), std::move(metadata)};
 }
 
-std::pair<rmm::device_uvector<char>, reader_impl::selected_rows_offsets>
-reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& reader_opts,
-                                              parse_options const& parse_opts,
-                                              host_span<char const> data,
-                                              size_t range_begin,
-                                              size_t range_end,
-                                              size_t skip_rows,
-                                              int64_t num_rows,
-                                              bool load_whole_file,
-                                              rmm::cuda_stream_view stream)
+/**
+ * @brief Finds row positions in the specified input data, and loads the selected data onto GPU.
+ *
+ * This function scans the input data to record the row offsets (relative to the start of the
+ * input data). A row is actually the data/offset between two termination symbols.
+ *
+ * @param data Uncompressed input data in host memory
+ * @param range_begin Only include rows starting after this position
+ * @param range_end Only include rows starting before this position
+ * @param skip_rows Number of rows to skip from the start
+ * @param num_rows Number of rows to read; -1: all remaining data
+ * @param load_whole_file Hint that the entire data will be needed on gpu
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return Input data and row offsets in the device memory
+ */
+std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather_row_offsets(
+  csv_reader_options const& reader_opts,
+  parse_options const& parse_opts,
+  std::vector<char>& header,
+  host_span<char const> data,
+  size_t range_begin,
+  size_t range_end,
+  size_t skip_rows,
+  int64_t num_rows,
+  bool load_whole_file,
+  rmm::cuda_stream_view stream)
 {
   constexpr size_t max_chunk_bytes = 64 * 1024 * 1024;  // 64MB
   size_t buffer_size               = std::min(max_chunk_bytes, data.size());
@@ -690,7 +799,7 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& reader_o
     const auto header_end   = buffer_pos + row_ctx[1];
     CUDF_EXPECTS(header_start <= header_end && header_end <= data.size(),
                  "Invalid csv header location");
-    header_.assign(data.begin() + header_start, data.begin() + header_end);
+    header.assign(data.begin() + header_start, data.begin() + header_end);
     if (header_rows > 0) { row_offsets.erase_first_n(header_rows); }
   }
   // Apply num_rows limit
@@ -700,15 +809,14 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& reader_o
   return {std::move(d_data), std::move(row_offsets)};
 }
 
-std::vector<data_type> reader_impl::infer_column_types(
-  parse_options const& parse_opts,
-  std::vector<column_parse::flags> const& column_flags,
-  device_span<char const> data,
-  device_span<uint64_t const> row_offsets,
-  int32_t num_records,
-  int32_t num_active_columns,
-  data_type timestamp_type,
-  rmm::cuda_stream_view stream)
+std::vector<data_type> infer_column_types(parse_options const& parse_opts,
+                                          std::vector<column_parse::flags> const& column_flags,
+                                          device_span<char const> data,
+                                          device_span<uint64_t const> row_offsets,
+                                          int32_t num_records,
+                                          int32_t num_active_columns,
+                                          data_type timestamp_type,
+                                          rmm::cuda_stream_view stream)
 {
   std::vector<data_type> dtypes;
   if (num_records == 0) {
@@ -771,13 +879,12 @@ std::vector<data_type> reader_impl::infer_column_types(
   return dtypes;
 }
 
-std::vector<data_type> reader_impl::parse_column_types(
-  std::vector<column_parse::flags>& column_flags,
-  std::vector<std::string> const& column_names,
-  std::vector<std::string> const& types_as_strings,
-  int32_t num_actual_columns,
-  int32_t num_active_columns,
-  data_type timestamp_type)
+std::vector<data_type> parse_column_types(std::vector<column_parse::flags>& column_flags,
+                                          std::vector<std::string> const& column_names,
+                                          std::vector<std::string> const& types_as_strings,
+                                          int32_t num_actual_columns,
+                                          int32_t num_active_columns,
+                                          data_type timestamp_type)
 {
   std::vector<data_type> dtypes;
 
@@ -851,18 +958,17 @@ std::vector<data_type> reader_impl::parse_column_types(
   return dtypes;
 }
 
-std::vector<column_buffer> reader_impl::decode_data(
-  parse_options const& parse_opts,
-  std::vector<column_parse::flags> const& column_flags,
-  std::vector<std::string> const& column_names,
-  device_span<char const> data,
-  device_span<uint64_t const> row_offsets,
-  host_span<data_type const> column_types,
-  int32_t num_records,
-  int32_t num_actual_columns,
-  int32_t num_active_columns,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+std::vector<column_buffer> decode_data(parse_options const& parse_opts,
+                                       std::vector<column_parse::flags> const& column_flags,
+                                       std::vector<std::string> const& column_names,
+                                       device_span<char const> data,
+                                       device_span<uint64_t const> row_offsets,
+                                       host_span<data_type const> column_types,
+                                       int32_t num_records,
+                                       int32_t num_actual_columns,
+                                       int32_t num_active_columns,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::mr::device_memory_resource* mr)
 {
   // Alloc output; columns' data memory is still expected for empty dataframe
   std::vector<column_buffer> out_buffers;
@@ -1003,8 +1109,6 @@ parse_options make_parse_options(csv_reader_options const& reader_opts,
   return parse_opts;
 }
 
-reader_impl::reader_impl() {}
-
 table_with_metadata read_csv(std::unique_ptr<cudf::io::datasource>&& source,
                              csv_reader_options const& options,
                              rmm::cuda_stream_view stream,
@@ -1015,7 +1119,7 @@ table_with_metadata read_csv(std::unique_ptr<cudf::io::datasource>&& source,
 
   auto parse_options = make_parse_options(options, stream);
 
-  return std::make_unique<reader_impl>()->read(source.get(), options, parse_options, stream, mr);
+  return read_csv(source.get(), options, parse_options, stream, mr);
 }
 
 }  // namespace csv
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
deleted file mode 100644
index 238e83eadc3..00000000000
--- a/cpp/src/io/csv/reader_impl.hpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "csv_common.h"
-#include "csv_gpu.h"
-
-#include <io/utilities/column_buffer.hpp>
-#include <io/utilities/hostdevice_vector.hpp>
-#include <io/utilities/trie.cuh>
-
-#include <cudf/io/csv.hpp>
-#include <cudf/io/datasource.hpp>
-#include <cudf/io/detail/csv.hpp>
-#include <cudf/utilities/span.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-using cudf::host_span;
-
-namespace cudf {
-namespace io {
-namespace detail {
-namespace csv {
-using namespace cudf::io::csv;
-using namespace cudf::io;
-
-/**
- * @brief Implementation for CSV reader
- *
- * The CSV reader is implemented in 4 stages:
- * Stage 1: read and optionally decompress the input data in host memory
- * (may be a memory-mapped view of the data on disk)
- *
- * Stage 2: gather the offset of each data row within the csv data.
- * Since the number of rows in a given character block may depend on the
- * initial parser state (like whether the block starts in a middle of a
- * quote or not), a separate row count and output parser state is computed
- * for every possible input parser state per 16KB character block.
- * The result is then used to infer the parser state and starting row at
- * the beginning of every character block.
- * A second pass can then output the location of every row (which is needed
- * for the subsequent parallel conversion of every row from csv text
- * to cudf binary form)
- *
- * Stage 3: Optional stage to infer the data type of each CSV column.
- *
- * Stage 4: Convert every row from csv text form to cudf binary form.
- */
-class reader_impl {
- public:
-  /**
-   * @brief Constructor from a dataset source with reader options.
-   *
-   */
-  explicit reader_impl();
-
-  /**
-   * @brief Read an entire set or a subset of data and returns a set of columns.
-   *
-   * @param source Dataset source
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   *
-   * @return The set of columns along with metadata
-   */
-  table_with_metadata read(cudf::io::datasource* source,
-                           csv_reader_options const& reader_opts,
-                           parse_options const& parse_opts,
-                           rmm::cuda_stream_view stream,
-                           rmm::mr::device_memory_resource* mr);
-
- private:
-  /**
-   * @brief Offsets of CSV rows in device memory, accessed through a shrinkable span.
-   *
-   * Row offsets are stored this way to avoid reallocation/copies when discarding front or back
-   * elements.
-   */
-  class selected_rows_offsets {
-    rmm::device_uvector<uint64_t> all;
-    device_span<uint64_t const> selected;
-
-   public:
-    selected_rows_offsets(rmm::device_uvector<uint64_t>&& data,
-                          device_span<uint64_t const> selected_span)
-      : all{std::move(data)}, selected{selected_span}
-    {
-    }
-    selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {}
-
-    operator device_span<uint64_t const>() const { return selected; }
-    void shrink(size_t size)
-    {
-      CUDF_EXPECTS(size <= selected.size(), "New size must be smaller");
-      selected = selected.subspan(0, size);
-    }
-    void erase_first_n(size_t n)
-    {
-      CUDF_EXPECTS(n <= selected.size(), "Too many elements to remove");
-      selected = selected.subspan(n, selected.size() - n);
-    }
-    auto size() const { return selected.size(); }
-    auto data() const { return selected.data(); }
-  };
-
-  /**
-   * @brief Selectively loads data on the GPU and gathers offsets of rows to read.
-   *
-   * Selection is based on read options.
-   *
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   */
-  std::pair<rmm::device_uvector<char>, reader_impl::selected_rows_offsets>
-  select_data_and_row_offsets(cudf::io::datasource* source,
-                              csv_reader_options const& reader_opts,
-                              parse_options const& parse_opts,
-                              rmm::cuda_stream_view stream);
-
-  /**
-   * @brief Finds row positions in the specified input data, and loads the selected data onto GPU.
-   *
-   * This function scans the input data to record the row offsets (relative to the start of the
-   * input data). A row is actually the data/offset between two termination symbols.
-   *
-   * @param data Uncompressed input data in host memory
-   * @param range_begin Only include rows starting after this position
-   * @param range_end Only include rows starting before this position
-   * @param skip_rows Number of rows to skip from the start
-   * @param num_rows Number of rows to read; -1: all remaining data
-   * @param load_whole_file Hint that the entire data will be needed on gpu
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @return Input data and row offsets in the device memory
-   */
-  std::pair<rmm::device_uvector<char>, reader_impl::selected_rows_offsets>
-  load_data_and_gather_row_offsets(csv_reader_options const& reader_opts,
-                                   parse_options const& parse_opts,
-                                   host_span<char const> data,
-                                   size_t range_begin,
-                                   size_t range_end,
-                                   size_t skip_rows,
-                                   int64_t num_rows,
-                                   bool load_whole_file,
-                                   rmm::cuda_stream_view stream);
-
-  /**
-   * @brief Automatically infers each column's data type based on the CSV's data within that column.
-   *
-   * @param data The CSV data from which to infer the columns' data types
-   * @param row_offsets The row offsets into the CSV's data
-   * @param stream The stream to which the type inference-kernel will be dispatched
-   * @return The columns' inferred data types
-   */
-  std::vector<data_type> infer_column_types(parse_options const& parse_opts,
-                                            std::vector<column_parse::flags> const& column_flags,
-                                            device_span<char const> data,
-                                            device_span<uint64_t const> row_offsets,
-                                            int32_t num_records,
-                                            int32_t num_active_columns,
-                                            data_type timestamp_type,
-                                            rmm::cuda_stream_view stream);
-
-  /**
-   * @brief Parses the columns' data types from the vector of dtypes that are provided as strings.
-   *
-   * @param types_as_strings The vector of strings from which to parse the columns' target data
-   * types
-   * @return List of columns' data types
-   */
-  std::vector<data_type> parse_column_types(std::vector<column_parse::flags>& column_flags,
-                                            std::vector<std::string> const& column_names,
-                                            std::vector<std::string> const& types_as_strings,
-                                            int32_t num_actual_columns,
-                                            int32_t num_active_columns,
-                                            data_type timestamp_type);
-
-  /**
-   * @brief Converts the row-column data and outputs to column bufferrs.
-   *
-   * @param column_types Column types
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   *
-   * @return list of column buffers of decoded data, or ptr/size in the case of strings.
-   */
-  std::vector<column_buffer> decode_data(parse_options const& parse_opts,
-                                         std::vector<column_parse::flags> const& column_flags,
-                                         std::vector<std::string> const& column_names,
-                                         device_span<char const> data,
-                                         device_span<uint64_t const> row_offsets,
-                                         host_span<data_type const> column_types,
-                                         int32_t num_records,
-                                         int32_t num_actual_columns,
-                                         int32_t num_active_columns,
-                                         rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
-
- private:
-  std::vector<char> header_;
-};
-
-}  // namespace csv
-}  // namespace detail
-}  // namespace io
-}  // namespace cudf

From 7cda106ecd8e5ce1b4b1bd6b5cf3124a408220ae Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 14 Aug 2021 16:23:58 -0500
Subject: [PATCH 11/26] rearrange some functions to delete some unnecessary
 declarations.

---
 cpp/src/io/csv/reader_impl.cu | 882 ++++++++++++++++------------------
 1 file changed, 421 insertions(+), 461 deletions(-)

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 1ee321b30fd..df16959eda5 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -103,48 +103,6 @@ class selected_rows_offsets {
   auto data() const { return selected.data(); }
 };
 
-}  // namespace
-
-std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather_row_offsets(
-  csv_reader_options const& reader_opts,
-  parse_options const& parse_opts,
-  std::vector<char>& header,
-  host_span<char const> data,
-  size_t range_begin,
-  size_t range_end,
-  size_t skip_rows,
-  int64_t num_rows,
-  bool load_whole_file,
-  rmm::cuda_stream_view stream);
-
-std::vector<data_type> parse_column_types(std::vector<column_parse::flags>& column_flags,
-                                          std::vector<std::string> const& column_names,
-                                          std::vector<std::string> const& types_as_strings,
-                                          int32_t num_actual_columns,
-                                          int32_t num_active_columns,
-                                          data_type timestamp_type);
-
-std::vector<data_type> infer_column_types(parse_options const& parse_opts,
-                                          std::vector<column_parse::flags> const& column_flags,
-                                          device_span<char const> data,
-                                          device_span<uint64_t const> row_offsets,
-                                          int32_t num_records,
-                                          int32_t num_active_columns,
-                                          data_type timestamp_type,
-                                          rmm::cuda_stream_view stream);
-
-std::vector<column_buffer> decode_data(parse_options const& parse_opts,
-                                       std::vector<column_parse::flags> const& column_flags,
-                                       std::vector<std::string> const& column_names,
-                                       device_span<char const> data,
-                                       device_span<uint64_t const> row_offsets,
-                                       host_span<data_type const> column_types,
-                                       int32_t num_records,
-                                       int32_t num_actual_columns,
-                                       int32_t num_active_columns,
-                                       rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
-
 /**
  * @brief Estimates the maximum expected length or a row, based on the number
  * of columns
@@ -296,6 +254,189 @@ size_t find_first_row_start(char row_terminator, host_span<char const> data)
   return std::min(pos + 1, data.size());
 }
 
+/**
+ * @brief Finds row positions in the specified input data, and loads the selected data onto GPU.
+ *
+ * This function scans the input data to record the row offsets (relative to the start of the
+ * input data). A row is actually the data/offset between two termination symbols.
+ *
+ * @param data Uncompressed input data in host memory
+ * @param range_begin Only include rows starting after this position
+ * @param range_end Only include rows starting before this position
+ * @param skip_rows Number of rows to skip from the start
+ * @param num_rows Number of rows to read; -1: all remaining data
+ * @param load_whole_file Hint that the entire data will be needed on gpu
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return Input data and row offsets in the device memory
+ */
+std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather_row_offsets(
+  csv_reader_options const& reader_opts,
+  parse_options const& parse_opts,
+  std::vector<char>& header,
+  host_span<char const> data,
+  size_t range_begin,
+  size_t range_end,
+  size_t skip_rows,
+  int64_t num_rows,
+  bool load_whole_file,
+  rmm::cuda_stream_view stream)
+{
+  constexpr size_t max_chunk_bytes = 64 * 1024 * 1024;  // 64MB
+  size_t buffer_size               = std::min(max_chunk_bytes, data.size());
+  size_t max_blocks =
+    std::max<size_t>((buffer_size / cudf::io::csv::gpu::rowofs_block_bytes) + 1, 2);
+  hostdevice_vector<uint64_t> row_ctx(max_blocks);
+  size_t buffer_pos  = std::min(range_begin - std::min(range_begin, sizeof(char)), data.size());
+  size_t pos         = std::min(range_begin, data.size());
+  size_t header_rows = (reader_opts.get_header() >= 0) ? reader_opts.get_header() + 1 : 0;
+  uint64_t ctx       = 0;
+
+  // For compatibility with the previous parser, a row is considered in-range if the
+  // previous row terminator is within the given range
+  range_end += (range_end < data.size());
+
+  // Reserve memory by allocating and then resetting the size
+  rmm::device_uvector<char> d_data{
+    (load_whole_file) ? data.size() : std::min(buffer_size * 2, data.size()), stream};
+  d_data.resize(0, stream);
+  rmm::device_uvector<uint64_t> all_row_offsets{0, stream};
+  do {
+    size_t target_pos = std::min(pos + max_chunk_bytes, data.size());
+    size_t chunk_size = target_pos - pos;
+
+    auto const previous_data_size = d_data.size();
+    d_data.resize(target_pos - buffer_pos, stream);
+    CUDA_TRY(cudaMemcpyAsync(d_data.begin() + previous_data_size,
+                             data.begin() + buffer_pos + previous_data_size,
+                             target_pos - buffer_pos - previous_data_size,
+                             cudaMemcpyDefault,
+                             stream.value()));
+
+    // Pass 1: Count the potential number of rows in each character block for each
+    // possible parser state at the beginning of the block.
+    uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(),
+                                                                 row_ctx.device_ptr(),
+                                                                 device_span<uint64_t>(),
+                                                                 d_data,
+                                                                 chunk_size,
+                                                                 pos,
+                                                                 buffer_pos,
+                                                                 data.size(),
+                                                                 range_begin,
+                                                                 range_end,
+                                                                 skip_rows,
+                                                                 stream);
+    CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
+                             row_ctx.device_ptr(),
+                             num_blocks * sizeof(uint64_t),
+                             cudaMemcpyDeviceToHost,
+                             stream.value()));
+    stream.synchronize();
+
+    // Sum up the rows in each character block, selecting the row count that
+    // corresponds to the current input context. Also stores the now known input
+    // context per character block that will be needed by the second pass.
+    for (uint32_t i = 0; i < num_blocks; i++) {
+      uint64_t ctx_next = cudf::io::csv::gpu::select_row_context(ctx, row_ctx[i]);
+      row_ctx[i]        = ctx;
+      ctx               = ctx_next;
+    }
+    size_t total_rows = ctx >> 2;
+    if (total_rows > skip_rows) {
+      // At least one row in range in this batch
+      all_row_offsets.resize(total_rows - skip_rows, stream);
+
+      CUDA_TRY(cudaMemcpyAsync(row_ctx.device_ptr(),
+                               row_ctx.host_ptr(),
+                               num_blocks * sizeof(uint64_t),
+                               cudaMemcpyHostToDevice,
+                               stream.value()));
+
+      // Pass 2: Output row offsets
+      cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(),
+                                             row_ctx.device_ptr(),
+                                             all_row_offsets,
+                                             d_data,
+                                             chunk_size,
+                                             pos,
+                                             buffer_pos,
+                                             data.size(),
+                                             range_begin,
+                                             range_end,
+                                             skip_rows,
+                                             stream);
+      // With byte range, we want to keep only one row out of the specified range
+      if (range_end < data.size()) {
+        CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
+                                 row_ctx.device_ptr(),
+                                 num_blocks * sizeof(uint64_t),
+                                 cudaMemcpyDeviceToHost,
+                                 stream.value()));
+        stream.synchronize();
+
+        size_t rows_out_of_range = 0;
+        for (uint32_t i = 0; i < num_blocks; i++) {
+          rows_out_of_range += row_ctx[i];
+        }
+        if (rows_out_of_range != 0) {
+          // Keep one row out of range (used to infer length of previous row)
+          auto new_row_offsets_size =
+            all_row_offsets.size() - std::min(rows_out_of_range - 1, all_row_offsets.size());
+          all_row_offsets.resize(new_row_offsets_size, stream);
+          // Implies we reached the end of the range
+          break;
+        }
+      }
+      // num_rows does not include blank rows
+      if (num_rows >= 0) {
+        if (all_row_offsets.size() > header_rows + static_cast<size_t>(num_rows)) {
+          size_t num_blanks = cudf::io::csv::gpu::count_blank_rows(
+            parse_opts.view(), d_data, all_row_offsets, stream);
+          if (all_row_offsets.size() - num_blanks > header_rows + static_cast<size_t>(num_rows)) {
+            // Got the desired number of rows
+            break;
+          }
+        }
+      }
+    } else {
+      // Discard data (all rows below skip_rows), keeping one character for history
+      size_t discard_bytes = std::max(d_data.size(), sizeof(char)) - sizeof(char);
+      if (discard_bytes != 0) {
+        erase_except_last(d_data, stream);
+        buffer_pos += discard_bytes;
+      }
+    }
+    pos = target_pos;
+  } while (pos < data.size());
+
+  auto const non_blank_row_offsets =
+    io::csv::gpu::remove_blank_rows(parse_opts.view(), d_data, all_row_offsets, stream);
+  auto row_offsets = selected_rows_offsets{std::move(all_row_offsets), non_blank_row_offsets};
+
+  // Remove header rows and extract header
+  const size_t header_row_index = std::max<size_t>(header_rows, 1) - 1;
+  if (header_row_index + 1 < row_offsets.size()) {
+    CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
+                             row_offsets.data() + header_row_index,
+                             2 * sizeof(uint64_t),
+                             cudaMemcpyDeviceToHost,
+                             stream.value()));
+    stream.synchronize();
+
+    const auto header_start = buffer_pos + row_ctx[0];
+    const auto header_end   = buffer_pos + row_ctx[1];
+    CUDF_EXPECTS(header_start <= header_end && header_end <= data.size(),
+                 "Invalid csv header location");
+    header.assign(data.begin() + header_start, data.begin() + header_end);
+    if (header_rows > 0) { row_offsets.erase_first_n(header_rows); }
+  }
+  // Apply num_rows limit
+  if (num_rows >= 0 && static_cast<size_t>(num_rows) < row_offsets.size() - 1) {
+    row_offsets.shrink(num_rows + 1);
+  }
+  return {std::move(d_data), std::move(row_offsets)};
+}
+
 std::pair<rmm::device_uvector<char>, selected_rows_offsets> select_data_and_row_offsets(
   cudf::io::datasource* source,
   csv_reader_options const& reader_opts,
@@ -415,19 +556,221 @@ std::vector<data_type> select_data_types(std::vector<column_parse::flags> const&
   return selected_dtypes;
 }
 
-table_with_metadata read_csv(cudf::io::datasource* source,
-                             csv_reader_options const& reader_opts,
-                             parse_options const& parse_opts,
-                             rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+std::vector<data_type> parse_column_types(std::vector<column_parse::flags>& column_flags,
+                                          std::vector<std::string> const& column_names,
+                                          std::vector<std::string> const& types_as_strings,
+                                          int32_t num_actual_columns,
+                                          int32_t num_active_columns,
+                                          data_type timestamp_type)
 {
-  std::vector<char> header;
+  std::vector<data_type> dtypes;
 
-  auto const data_row_offsets =
-    select_data_and_row_offsets(source, reader_opts, header, parse_opts, stream);
+  bool const is_dict = std::all_of(types_as_strings.begin(),
+                                   types_as_strings.end(),
+                                   [](auto const& s) { return s.find(':') != std::string::npos; });
 
-  auto const& data        = data_row_offsets.first;
-  auto const& row_offsets = data_row_offsets.second;
+  if (!is_dict) {
+    if (types_as_strings.size() == 1) {
+      // If it's a single dtype, assign that dtype to all active columns
+      data_type dtype_;
+      column_parse::flags col_flags_;
+      std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]);
+      dtypes.resize(num_active_columns, dtype_);
+      for (int col = 0; col < num_actual_columns; col++) {
+        column_flags[col] |= col_flags_;
+      }
+      CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
+    } else {
+      // If it's a list, assign dtypes to active columns in the given order
+      CUDF_EXPECTS(static_cast<int>(types_as_strings.size()) >= num_actual_columns,
+                   "Must specify data types for all columns");
+
+      auto dtype_ = std::back_inserter(dtypes);
+
+      for (int col = 0; col < num_actual_columns; col++) {
+        if (column_flags[col] & column_parse::enabled) {
+          column_parse::flags col_flags_;
+          std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]);
+          column_flags[col] |= col_flags_;
+          CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
+        }
+      }
+    }
+  } else {
+    // Translate vector of `name : dtype` strings to map
+    // NOTE: Incoming pairs can be out-of-order from column names in dataset
+    std::unordered_map<std::string, std::string> col_type_map;
+    for (const auto& pair : types_as_strings) {
+      const auto pos     = pair.find_last_of(':');
+      const auto name    = pair.substr(0, pos);
+      const auto dtype   = pair.substr(pos + 1, pair.size());
+      col_type_map[name] = dtype;
+    }
+
+    auto dtype_ = std::back_inserter(dtypes);
+
+    for (int col = 0; col < num_actual_columns; col++) {
+      if (column_flags[col] & column_parse::enabled) {
+        CUDF_EXPECTS(col_type_map.find(column_names[col]) != col_type_map.end(),
+                     "Must specify data types for all active columns");
+        column_parse::flags col_flags_;
+        std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[column_names[col]]);
+        column_flags[col] |= col_flags_;
+        CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
+      }
+    }
+  }
+
+  if (timestamp_type.id() != cudf::type_id::EMPTY) {
+    for (auto& type : dtypes) {
+      if (cudf::is_timestamp(type)) { type = timestamp_type; }
+    }
+  }
+
+  for (size_t i = 0; i < dtypes.size(); i++) {
+    // Replace EMPTY dtype with STRING
+    if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; }
+  }
+
+  return dtypes;
+}
+
+std::vector<data_type> infer_column_types(parse_options const& parse_opts,
+                                          std::vector<column_parse::flags> const& column_flags,
+                                          device_span<char const> data,
+                                          device_span<uint64_t const> row_offsets,
+                                          int32_t num_records,
+                                          int32_t num_active_columns,
+                                          data_type timestamp_type,
+                                          rmm::cuda_stream_view stream)
+{
+  std::vector<data_type> dtypes;
+  if (num_records == 0) {
+    dtypes.resize(num_active_columns, data_type{type_id::EMPTY});
+  } else {
+    auto column_stats =
+      cudf::io::csv::gpu::detect_column_types(parse_opts.view(),
+                                              data,
+                                              make_device_uvector_async(column_flags, stream),
+                                              row_offsets,
+                                              num_active_columns,
+                                              stream);
+
+    stream.synchronize();
+
+    for (int col = 0; col < num_active_columns; col++) {
+      unsigned long long int_count_total = column_stats[col].big_int_count +
+                                           column_stats[col].negative_small_int_count +
+                                           column_stats[col].positive_small_int_count;
+
+      if (column_stats[col].null_count == num_records) {
+        // Entire column is NULL; allocate the smallest amount of memory
+        dtypes.emplace_back(cudf::type_id::INT8);
+      } else if (column_stats[col].string_count > 0L) {
+        dtypes.emplace_back(cudf::type_id::STRING);
+      } else if (column_stats[col].datetime_count > 0L) {
+        dtypes.emplace_back(cudf::type_id::TIMESTAMP_NANOSECONDS);
+      } else if (column_stats[col].bool_count > 0L) {
+        dtypes.emplace_back(cudf::type_id::BOOL8);
+      } else if (column_stats[col].float_count > 0L ||
+                 (column_stats[col].float_count == 0L && int_count_total > 0L &&
+                  column_stats[col].null_count > 0L)) {
+        // The second condition has been added to conform to
+        // PANDAS which states that a column of integers with
+        // a single NULL record need to be treated as floats.
+        dtypes.emplace_back(cudf::type_id::FLOAT64);
+      } else if (column_stats[col].big_int_count == 0) {
+        dtypes.emplace_back(cudf::type_id::INT64);
+      } else if (column_stats[col].big_int_count != 0 &&
+                 column_stats[col].negative_small_int_count != 0) {
+        dtypes.emplace_back(cudf::type_id::STRING);
+      } else {
+        // Integers are stored as 64-bit to conform to PANDAS
+        dtypes.emplace_back(cudf::type_id::UINT64);
+      }
+    }
+  }
+
+  if (timestamp_type.id() != cudf::type_id::EMPTY) {
+    for (auto& type : dtypes) {
+      if (cudf::is_timestamp(type)) { type = timestamp_type; }
+    }
+  }
+
+  for (size_t i = 0; i < dtypes.size(); i++) {
+    // Replace EMPTY dtype with STRING
+    if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; }
+  }
+
+  return dtypes;
+}
+
+std::vector<column_buffer> decode_data(parse_options const& parse_opts,
+                                       std::vector<column_parse::flags> const& column_flags,
+                                       std::vector<std::string> const& column_names,
+                                       device_span<char const> data,
+                                       device_span<uint64_t const> row_offsets,
+                                       host_span<data_type const> column_types,
+                                       int32_t num_records,
+                                       int32_t num_actual_columns,
+                                       int32_t num_active_columns,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::mr::device_memory_resource* mr)
+{
+  // Alloc output; columns' data memory is still expected for empty dataframe
+  std::vector<column_buffer> out_buffers;
+  out_buffers.reserve(column_types.size());
+
+  for (int col = 0, active_col = 0; col < num_actual_columns; ++col) {
+    if (column_flags[col] & column_parse::enabled) {
+      const bool is_final_allocation = column_types[active_col].id() != type_id::STRING;
+      auto out_buffer =
+        column_buffer(column_types[active_col],
+                      num_records,
+                      true,
+                      stream,
+                      is_final_allocation ? mr : rmm::mr::get_current_device_resource());
+
+      out_buffer.name         = column_names[col];
+      out_buffer.null_count() = UNKNOWN_NULL_COUNT;
+      out_buffers.emplace_back(std::move(out_buffer));
+      active_col++;
+    }
+  }
+
+  thrust::host_vector<void*> h_data(num_active_columns);
+  thrust::host_vector<bitmask_type*> h_valid(num_active_columns);
+
+  for (int i = 0; i < num_active_columns; ++i) {
+    h_data[i]  = out_buffers[i].data();
+    h_valid[i] = out_buffers[i].null_mask();
+  }
+
+  cudf::io::csv::gpu::decode_row_column_data(parse_opts.view(),
+                                             data,
+                                             make_device_uvector_async(column_flags, stream),
+                                             row_offsets,
+                                             make_device_uvector_async(column_types, stream),
+                                             make_device_uvector_async(h_data, stream),
+                                             make_device_uvector_async(h_valid, stream),
+                                             stream);
+
+  return out_buffers;
+}
+
+table_with_metadata read_csv(cudf::io::datasource* source,
+                             csv_reader_options const& reader_opts,
+                             parse_options const& parse_opts,
+                             rmm::cuda_stream_view stream,
+                             rmm::mr::device_memory_resource* mr)
+{
+  std::vector<char> header;
+
+  auto const data_row_offsets =
+    select_data_and_row_offsets(source, reader_opts, header, parse_opts, stream);
+
+  auto const& data        = data_row_offsets.first;
+  auto const& row_offsets = data_row_offsets.second;
 
   // Exclude the end-of-data row from number of rows with actual data
   auto num_records        = std::max(row_offsets.size(), 1ul) - 1;
@@ -627,418 +970,33 @@ table_with_metadata read_csv(cudf::io::datasource* source,
 }
 
 /**
- * @brief Finds row positions in the specified input data, and loads the selected data onto GPU.
- *
- * This function scans the input data to record the row offsets (relative to the start of the
- * input data). A row is actually the data/offset between two termination symbols.
- *
- * @param data Uncompressed input data in host memory
- * @param range_begin Only include rows starting after this position
- * @param range_end Only include rows starting before this position
- * @param skip_rows Number of rows to skip from the start
- * @param num_rows Number of rows to read; -1: all remaining data
- * @param load_whole_file Hint that the entire data will be needed on gpu
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @return Input data and row offsets in the device memory
+ * @brief Create a serialized trie for N/A value matching, based on the options.
  */
-std::pair<rmm::device_uvector<char>, selected_rows_offsets> load_data_and_gather_row_offsets(
-  csv_reader_options const& reader_opts,
-  parse_options const& parse_opts,
-  std::vector<char>& header,
-  host_span<char const> data,
-  size_t range_begin,
-  size_t range_end,
-  size_t skip_rows,
-  int64_t num_rows,
-  bool load_whole_file,
-  rmm::cuda_stream_view stream)
+cudf::detail::trie create_na_trie(char quotechar,
+                                  csv_reader_options const& reader_opts,
+                                  rmm::cuda_stream_view stream)
 {
-  constexpr size_t max_chunk_bytes = 64 * 1024 * 1024;  // 64MB
-  size_t buffer_size               = std::min(max_chunk_bytes, data.size());
-  size_t max_blocks =
-    std::max<size_t>((buffer_size / cudf::io::csv::gpu::rowofs_block_bytes) + 1, 2);
-  hostdevice_vector<uint64_t> row_ctx(max_blocks);
-  size_t buffer_pos  = std::min(range_begin - std::min(range_begin, sizeof(char)), data.size());
-  size_t pos         = std::min(range_begin, data.size());
-  size_t header_rows = (reader_opts.get_header() >= 0) ? reader_opts.get_header() + 1 : 0;
-  uint64_t ctx       = 0;
-
-  // For compatibility with the previous parser, a row is considered in-range if the
-  // previous row terminator is within the given range
-  range_end += (range_end < data.size());
+  // Default values to recognize as null values
+  static std::vector<std::string> const default_na_values{"",
+                                                          "#N/A",
+                                                          "#N/A N/A",
+                                                          "#NA",
+                                                          "-1.#IND",
+                                                          "-1.#QNAN",
+                                                          "-NaN",
+                                                          "-nan",
+                                                          "1.#IND",
+                                                          "1.#QNAN",
+                                                          "<NA>",
+                                                          "N/A",
+                                                          "NA",
+                                                          "NULL",
+                                                          "NaN",
+                                                          "n/a",
+                                                          "nan",
+                                                          "null"};
 
-  // Reserve memory by allocating and then resetting the size
-  rmm::device_uvector<char> d_data{
-    (load_whole_file) ? data.size() : std::min(buffer_size * 2, data.size()), stream};
-  d_data.resize(0, stream);
-  rmm::device_uvector<uint64_t> all_row_offsets{0, stream};
-  do {
-    size_t target_pos = std::min(pos + max_chunk_bytes, data.size());
-    size_t chunk_size = target_pos - pos;
-
-    auto const previous_data_size = d_data.size();
-    d_data.resize(target_pos - buffer_pos, stream);
-    CUDA_TRY(cudaMemcpyAsync(d_data.begin() + previous_data_size,
-                             data.begin() + buffer_pos + previous_data_size,
-                             target_pos - buffer_pos - previous_data_size,
-                             cudaMemcpyDefault,
-                             stream.value()));
-
-    // Pass 1: Count the potential number of rows in each character block for each
-    // possible parser state at the beginning of the block.
-    uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(),
-                                                                 row_ctx.device_ptr(),
-                                                                 device_span<uint64_t>(),
-                                                                 d_data,
-                                                                 chunk_size,
-                                                                 pos,
-                                                                 buffer_pos,
-                                                                 data.size(),
-                                                                 range_begin,
-                                                                 range_end,
-                                                                 skip_rows,
-                                                                 stream);
-    CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
-                             row_ctx.device_ptr(),
-                             num_blocks * sizeof(uint64_t),
-                             cudaMemcpyDeviceToHost,
-                             stream.value()));
-    stream.synchronize();
-
-    // Sum up the rows in each character block, selecting the row count that
-    // corresponds to the current input context. Also stores the now known input
-    // context per character block that will be needed by the second pass.
-    for (uint32_t i = 0; i < num_blocks; i++) {
-      uint64_t ctx_next = cudf::io::csv::gpu::select_row_context(ctx, row_ctx[i]);
-      row_ctx[i]        = ctx;
-      ctx               = ctx_next;
-    }
-    size_t total_rows = ctx >> 2;
-    if (total_rows > skip_rows) {
-      // At least one row in range in this batch
-      all_row_offsets.resize(total_rows - skip_rows, stream);
-
-      CUDA_TRY(cudaMemcpyAsync(row_ctx.device_ptr(),
-                               row_ctx.host_ptr(),
-                               num_blocks * sizeof(uint64_t),
-                               cudaMemcpyHostToDevice,
-                               stream.value()));
-
-      // Pass 2: Output row offsets
-      cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(),
-                                             row_ctx.device_ptr(),
-                                             all_row_offsets,
-                                             d_data,
-                                             chunk_size,
-                                             pos,
-                                             buffer_pos,
-                                             data.size(),
-                                             range_begin,
-                                             range_end,
-                                             skip_rows,
-                                             stream);
-      // With byte range, we want to keep only one row out of the specified range
-      if (range_end < data.size()) {
-        CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
-                                 row_ctx.device_ptr(),
-                                 num_blocks * sizeof(uint64_t),
-                                 cudaMemcpyDeviceToHost,
-                                 stream.value()));
-        stream.synchronize();
-
-        size_t rows_out_of_range = 0;
-        for (uint32_t i = 0; i < num_blocks; i++) {
-          rows_out_of_range += row_ctx[i];
-        }
-        if (rows_out_of_range != 0) {
-          // Keep one row out of range (used to infer length of previous row)
-          auto new_row_offsets_size =
-            all_row_offsets.size() - std::min(rows_out_of_range - 1, all_row_offsets.size());
-          all_row_offsets.resize(new_row_offsets_size, stream);
-          // Implies we reached the end of the range
-          break;
-        }
-      }
-      // num_rows does not include blank rows
-      if (num_rows >= 0) {
-        if (all_row_offsets.size() > header_rows + static_cast<size_t>(num_rows)) {
-          size_t num_blanks = cudf::io::csv::gpu::count_blank_rows(
-            parse_opts.view(), d_data, all_row_offsets, stream);
-          if (all_row_offsets.size() - num_blanks > header_rows + static_cast<size_t>(num_rows)) {
-            // Got the desired number of rows
-            break;
-          }
-        }
-      }
-    } else {
-      // Discard data (all rows below skip_rows), keeping one character for history
-      size_t discard_bytes = std::max(d_data.size(), sizeof(char)) - sizeof(char);
-      if (discard_bytes != 0) {
-        erase_except_last(d_data, stream);
-        buffer_pos += discard_bytes;
-      }
-    }
-    pos = target_pos;
-  } while (pos < data.size());
-
-  auto const non_blank_row_offsets =
-    io::csv::gpu::remove_blank_rows(parse_opts.view(), d_data, all_row_offsets, stream);
-  auto row_offsets = selected_rows_offsets{std::move(all_row_offsets), non_blank_row_offsets};
-
-  // Remove header rows and extract header
-  const size_t header_row_index = std::max<size_t>(header_rows, 1) - 1;
-  if (header_row_index + 1 < row_offsets.size()) {
-    CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(),
-                             row_offsets.data() + header_row_index,
-                             2 * sizeof(uint64_t),
-                             cudaMemcpyDeviceToHost,
-                             stream.value()));
-    stream.synchronize();
-
-    const auto header_start = buffer_pos + row_ctx[0];
-    const auto header_end   = buffer_pos + row_ctx[1];
-    CUDF_EXPECTS(header_start <= header_end && header_end <= data.size(),
-                 "Invalid csv header location");
-    header.assign(data.begin() + header_start, data.begin() + header_end);
-    if (header_rows > 0) { row_offsets.erase_first_n(header_rows); }
-  }
-  // Apply num_rows limit
-  if (num_rows >= 0 && static_cast<size_t>(num_rows) < row_offsets.size() - 1) {
-    row_offsets.shrink(num_rows + 1);
-  }
-  return {std::move(d_data), std::move(row_offsets)};
-}
-
-std::vector<data_type> infer_column_types(parse_options const& parse_opts,
-                                          std::vector<column_parse::flags> const& column_flags,
-                                          device_span<char const> data,
-                                          device_span<uint64_t const> row_offsets,
-                                          int32_t num_records,
-                                          int32_t num_active_columns,
-                                          data_type timestamp_type,
-                                          rmm::cuda_stream_view stream)
-{
-  std::vector<data_type> dtypes;
-  if (num_records == 0) {
-    dtypes.resize(num_active_columns, data_type{type_id::EMPTY});
-  } else {
-    auto column_stats =
-      cudf::io::csv::gpu::detect_column_types(parse_opts.view(),
-                                              data,
-                                              make_device_uvector_async(column_flags, stream),
-                                              row_offsets,
-                                              num_active_columns,
-                                              stream);
-
-    stream.synchronize();
-
-    for (int col = 0; col < num_active_columns; col++) {
-      unsigned long long int_count_total = column_stats[col].big_int_count +
-                                           column_stats[col].negative_small_int_count +
-                                           column_stats[col].positive_small_int_count;
-
-      if (column_stats[col].null_count == num_records) {
-        // Entire column is NULL; allocate the smallest amount of memory
-        dtypes.emplace_back(cudf::type_id::INT8);
-      } else if (column_stats[col].string_count > 0L) {
-        dtypes.emplace_back(cudf::type_id::STRING);
-      } else if (column_stats[col].datetime_count > 0L) {
-        dtypes.emplace_back(cudf::type_id::TIMESTAMP_NANOSECONDS);
-      } else if (column_stats[col].bool_count > 0L) {
-        dtypes.emplace_back(cudf::type_id::BOOL8);
-      } else if (column_stats[col].float_count > 0L ||
-                 (column_stats[col].float_count == 0L && int_count_total > 0L &&
-                  column_stats[col].null_count > 0L)) {
-        // The second condition has been added to conform to
-        // PANDAS which states that a column of integers with
-        // a single NULL record need to be treated as floats.
-        dtypes.emplace_back(cudf::type_id::FLOAT64);
-      } else if (column_stats[col].big_int_count == 0) {
-        dtypes.emplace_back(cudf::type_id::INT64);
-      } else if (column_stats[col].big_int_count != 0 &&
-                 column_stats[col].negative_small_int_count != 0) {
-        dtypes.emplace_back(cudf::type_id::STRING);
-      } else {
-        // Integers are stored as 64-bit to conform to PANDAS
-        dtypes.emplace_back(cudf::type_id::UINT64);
-      }
-    }
-  }
-
-  if (timestamp_type.id() != cudf::type_id::EMPTY) {
-    for (auto& type : dtypes) {
-      if (cudf::is_timestamp(type)) { type = timestamp_type; }
-    }
-  }
-
-  for (size_t i = 0; i < dtypes.size(); i++) {
-    // Replace EMPTY dtype with STRING
-    if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; }
-  }
-
-  return dtypes;
-}
-
-std::vector<data_type> parse_column_types(std::vector<column_parse::flags>& column_flags,
-                                          std::vector<std::string> const& column_names,
-                                          std::vector<std::string> const& types_as_strings,
-                                          int32_t num_actual_columns,
-                                          int32_t num_active_columns,
-                                          data_type timestamp_type)
-{
-  std::vector<data_type> dtypes;
-
-  bool const is_dict = std::all_of(types_as_strings.begin(),
-                                   types_as_strings.end(),
-                                   [](auto const& s) { return s.find(':') != std::string::npos; });
-
-  if (!is_dict) {
-    if (types_as_strings.size() == 1) {
-      // If it's a single dtype, assign that dtype to all active columns
-      data_type dtype_;
-      column_parse::flags col_flags_;
-      std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]);
-      dtypes.resize(num_active_columns, dtype_);
-      for (int col = 0; col < num_actual_columns; col++) {
-        column_flags[col] |= col_flags_;
-      }
-      CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
-    } else {
-      // If it's a list, assign dtypes to active columns in the given order
-      CUDF_EXPECTS(static_cast<int>(types_as_strings.size()) >= num_actual_columns,
-                   "Must specify data types for all columns");
-
-      auto dtype_ = std::back_inserter(dtypes);
-
-      for (int col = 0; col < num_actual_columns; col++) {
-        if (column_flags[col] & column_parse::enabled) {
-          column_parse::flags col_flags_;
-          std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]);
-          column_flags[col] |= col_flags_;
-          CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
-        }
-      }
-    }
-  } else {
-    // Translate vector of `name : dtype` strings to map
-    // NOTE: Incoming pairs can be out-of-order from column names in dataset
-    std::unordered_map<std::string, std::string> col_type_map;
-    for (const auto& pair : types_as_strings) {
-      const auto pos     = pair.find_last_of(':');
-      const auto name    = pair.substr(0, pos);
-      const auto dtype   = pair.substr(pos + 1, pair.size());
-      col_type_map[name] = dtype;
-    }
-
-    auto dtype_ = std::back_inserter(dtypes);
-
-    for (int col = 0; col < num_actual_columns; col++) {
-      if (column_flags[col] & column_parse::enabled) {
-        CUDF_EXPECTS(col_type_map.find(column_names[col]) != col_type_map.end(),
-                     "Must specify data types for all active columns");
-        column_parse::flags col_flags_;
-        std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[column_names[col]]);
-        column_flags[col] |= col_flags_;
-        CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
-      }
-    }
-  }
-
-  if (timestamp_type.id() != cudf::type_id::EMPTY) {
-    for (auto& type : dtypes) {
-      if (cudf::is_timestamp(type)) { type = timestamp_type; }
-    }
-  }
-
-  for (size_t i = 0; i < dtypes.size(); i++) {
-    // Replace EMPTY dtype with STRING
-    if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; }
-  }
-
-  return dtypes;
-}
-
-std::vector<column_buffer> decode_data(parse_options const& parse_opts,
-                                       std::vector<column_parse::flags> const& column_flags,
-                                       std::vector<std::string> const& column_names,
-                                       device_span<char const> data,
-                                       device_span<uint64_t const> row_offsets,
-                                       host_span<data_type const> column_types,
-                                       int32_t num_records,
-                                       int32_t num_actual_columns,
-                                       int32_t num_active_columns,
-                                       rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
-{
-  // Alloc output; columns' data memory is still expected for empty dataframe
-  std::vector<column_buffer> out_buffers;
-  out_buffers.reserve(column_types.size());
-
-  for (int col = 0, active_col = 0; col < num_actual_columns; ++col) {
-    if (column_flags[col] & column_parse::enabled) {
-      const bool is_final_allocation = column_types[active_col].id() != type_id::STRING;
-      auto out_buffer =
-        column_buffer(column_types[active_col],
-                      num_records,
-                      true,
-                      stream,
-                      is_final_allocation ? mr : rmm::mr::get_current_device_resource());
-
-      out_buffer.name         = column_names[col];
-      out_buffer.null_count() = UNKNOWN_NULL_COUNT;
-      out_buffers.emplace_back(std::move(out_buffer));
-      active_col++;
-    }
-  }
-
-  thrust::host_vector<void*> h_data(num_active_columns);
-  thrust::host_vector<bitmask_type*> h_valid(num_active_columns);
-
-  for (int i = 0; i < num_active_columns; ++i) {
-    h_data[i]  = out_buffers[i].data();
-    h_valid[i] = out_buffers[i].null_mask();
-  }
-
-  cudf::io::csv::gpu::decode_row_column_data(parse_opts.view(),
-                                             data,
-                                             make_device_uvector_async(column_flags, stream),
-                                             row_offsets,
-                                             make_device_uvector_async(column_types, stream),
-                                             make_device_uvector_async(h_data, stream),
-                                             make_device_uvector_async(h_valid, stream),
-                                             stream);
-
-  return out_buffers;
-}
-
-/**
- * @brief Create a serialized trie for N/A value matching, based on the options.
- */
-cudf::detail::trie create_na_trie(char quotechar,
-                                  csv_reader_options const& reader_opts,
-                                  rmm::cuda_stream_view stream)
-{
-  // Default values to recognize as null values
-  static std::vector<std::string> const default_na_values{"",
-                                                          "#N/A",
-                                                          "#N/A N/A",
-                                                          "#NA",
-                                                          "-1.#IND",
-                                                          "-1.#QNAN",
-                                                          "-NaN",
-                                                          "-nan",
-                                                          "1.#IND",
-                                                          "1.#QNAN",
-                                                          "<NA>",
-                                                          "N/A",
-                                                          "NA",
-                                                          "NULL",
-                                                          "NaN",
-                                                          "n/a",
-                                                          "nan",
-                                                          "null"};
-
-  if (!reader_opts.is_enabled_na_filter()) { return cudf::detail::trie(0, stream); }
+  if (!reader_opts.is_enabled_na_filter()) { return cudf::detail::trie(0, stream); }
 
   std::vector<std::string> na_values = reader_opts.get_na_values();
   if (reader_opts.is_enabled_keep_default_na()) {
@@ -1109,6 +1067,8 @@ parse_options make_parse_options(csv_reader_options const& reader_opts,
   return parse_opts;
 }
 
+}  // namespace
+
 table_with_metadata read_csv(std::unique_ptr<cudf::io::datasource>&& source,
                              csv_reader_options const& options,
                              rmm::cuda_stream_view stream,

From 88e23990151c737dcb4a22a5d6454ef8893285c4 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Tue, 17 Aug 2021 00:53:48 -0500
Subject: [PATCH 12/26] remove filepath-related logic from csv and json readers

---
 cpp/include/cudf/io/csv.hpp            |  2 +-
 cpp/include/cudf/io/json.hpp           |  2 +-
 cpp/src/io/comp/io_uncomp.h            |  7 +++--
 cpp/src/io/comp/uncomp.cpp             | 19 ++++++------
 cpp/src/io/csv/reader_impl.cu          | 38 ++++++------------------
 cpp/src/io/csv/reader_impl.hpp         |  4 ---
 cpp/src/io/functions.cpp               | 40 ++++++++++++++++++++++++--
 cpp/src/io/json/reader_impl.cu         | 31 ++------------------
 cpp/src/io/json/reader_impl.hpp        |  1 -
 cpp/src/io/utilities/parsing_utils.cu  | 34 ----------------------
 cpp/src/io/utilities/parsing_utils.cuh | 18 ------------
 python/cudf/cudf/_lib/csv.pyx          |  2 +-
 python/cudf/cudf/tests/test_csv.py     | 14 ---------
 13 files changed, 66 insertions(+), 146 deletions(-)

diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index d4a21b2e98c..c807f189aac 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -1199,7 +1199,7 @@ class csv_reader_options_builder {
  * @return The set of columns along with metadata.
  */
 table_with_metadata read_csv(
-  csv_reader_options const& options,
+  csv_reader_options options,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 8954f7dcab1..bca60f76260 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -364,7 +364,7 @@ class json_reader_options_builder {
  * @return The set of columns along with metadata.
  */
 table_with_metadata read_json(
-  json_reader_options const& options,
+  json_reader_options options,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/src/io/comp/io_uncomp.h b/cpp/src/io/comp/io_uncomp.h
index 8daf73ecd0c..7b1feb84813 100644
--- a/cpp/src/io/comp/io_uncomp.h
+++ b/cpp/src/io/comp/io_uncomp.h
@@ -16,12 +16,13 @@
 
 #pragma once
 
+#include <cudf/io/types.hpp>
+#include <cudf/utilities/span.hpp>
+
 #include <memory>
 #include <string>
 #include <vector>
 
-#include <cudf/utilities/span.hpp>
-
 using cudf::host_span;
 
 namespace cudf {
@@ -42,7 +43,7 @@ enum {
 
 std::vector<char> io_uncompress_single_h2d(void const* src, size_t src_size, int stream_type);
 
-std::vector<char> get_uncompressed_data(host_span<char const> data, std::string const& compression);
+std::vector<char> get_uncompressed_data(host_span<char const> data, compression_type compression);
 
 class HostDecompressor {
  public:
diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp
index 2cb99d897fe..ee451d04dbb 100644
--- a/cpp/src/io/comp/uncomp.cpp
+++ b/cpp/src/io/comp/uncomp.cpp
@@ -369,6 +369,7 @@ std::vector<char> io_uncompress_single_h2d(const void* src, size_t src_size, int
       // Unsupported format
       break;
   }
+
   CUDF_EXPECTS(comp_data != nullptr, "Unsupported compressed stream type");
   CUDF_EXPECTS(comp_len > 0, "Unsupported compressed stream type");
 
@@ -422,17 +423,17 @@ std::vector<char> io_uncompress_single_h2d(const void* src, size_t src_size, int
  * @return Vector containing the output uncompressed data
  */
 std::vector<char> get_uncompressed_data(host_span<char const> const data,
-                                        std::string const& compression)
+                                        compression_type compression)
 {
   int comp_type = IO_UNCOMP_STREAM_TYPE_INFER;
-  if (compression == "gzip")
-    comp_type = IO_UNCOMP_STREAM_TYPE_GZIP;
-  else if (compression == "zip")
-    comp_type = IO_UNCOMP_STREAM_TYPE_ZIP;
-  else if (compression == "bz2")
-    comp_type = IO_UNCOMP_STREAM_TYPE_BZIP2;
-  else if (compression == "xz")
-    comp_type = IO_UNCOMP_STREAM_TYPE_XZ;
+
+  switch (compression) {
+    case compression_type::GZIP: comp_type = IO_UNCOMP_STREAM_TYPE_GZIP; break;
+    case compression_type::ZIP: comp_type = IO_UNCOMP_STREAM_TYPE_ZIP; break;
+    case compression_type::BZIP2: comp_type = IO_UNCOMP_STREAM_TYPE_BZIP2; break;
+    case compression_type::XZ: comp_type = IO_UNCOMP_STREAM_TYPE_XZ; break;
+    default: break;
+  }
 
   return io_uncompress_single_h2d(data.data(), data.size(), comp_type);
 }
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 549b0474fe1..a85a610962e 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -206,10 +206,12 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
   auto num_rows      = opts_.get_nrows();
 
   if (range_offset > 0 || range_size > 0) {
-    CUDF_EXPECTS(compression_type_ == "none",
+    CUDF_EXPECTS(opts_.get_compression() == compression_type::NONE,
                  "Reading compressed data using `byte range` is unsupported");
   }
+
   size_t map_range_size = 0;
+
   if (range_size != 0) {
     auto num_given_dtypes =
       std::visit([](const auto& dtypes) { return dtypes.size(); }, opts_.get_dtypes());
@@ -217,12 +219,7 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
     map_range_size         = range_size + calculateMaxRowSize(num_columns);
   }
 
-  // Support delayed opening of the file if using memory mapping datasource
-  // This allows only mapping of a subset of the file if using byte range
-  if (source_ == nullptr) {
-    assert(!filepath_.empty());
-    source_ = datasource::create(filepath_, range_offset, map_range_size);
-  }
+  // TODO: provide hint to datasource that we should memory map any underlying file.
 
   // Transfer source data to GPU
   if (!source_->is_empty()) {
@@ -235,10 +232,11 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
 
     std::vector<char> h_uncomp_data_owner;
 
-    if (compression_type_ != "none") {
-      h_uncomp_data_owner = get_uncompressed_data(h_data, compression_type_);
+    if (opts_.get_compression() != compression_type::NONE) {
+      h_uncomp_data_owner = get_uncompressed_data(h_data, opts_.get_compression());
       h_data              = h_uncomp_data_owner;
     }
+
     // None of the parameters for row selection is used, we are parsing the entire file
     const bool load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 &&
                                  skip_end_rows <= 0 && num_rows == -1;
@@ -927,35 +925,17 @@ parse_options make_parse_options(csv_reader_options const& reader_opts,
 }
 
 reader::impl::impl(std::unique_ptr<datasource> source,
-                   std::string filepath,
                    csv_reader_options const& options,
                    rmm::cuda_stream_view stream,
                    rmm::mr::device_memory_resource* mr)
-  : mr_(mr), source_(std::move(source)), filepath_(filepath), opts_(options)
+  : mr_(mr), source_(std::move(source)), opts_(options)
 {
   num_actual_cols_ = opts_.get_names().size();
   num_active_cols_ = num_actual_cols_;
 
-  compression_type_ =
-    infer_compression_type(opts_.get_compression(),
-                           filepath,
-                           {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}});
-
   opts = make_parse_options(options, stream);
 }
 
-// Forward to implementation
-reader::reader(std::vector<std::string> const& filepaths,
-               csv_reader_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported.");
-  // Delay actual instantiation of data source until read to allow for
-  // partial memory mapping of file using byte ranges
-  _impl = std::make_unique<impl>(nullptr, filepaths[0], options, stream, mr);
-}
-
 // Forward to implementation
 reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                csv_reader_options const& options,
@@ -963,7 +943,7 @@ reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported.");
-  _impl = std::make_unique<impl>(std::move(sources[0]), "", options, stream, mr);
+  _impl = std::make_unique<impl>(std::move(sources[0]), options, stream, mr);
 }
 
 // Destructor within this translation unit
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index 36c2bf4f9e7..beaa9b816cb 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -72,13 +72,11 @@ class reader::impl {
    * @brief Constructor from a dataset source with reader options.
    *
    * @param source Dataset source
-   * @param filepath Filepath if reading dataset from a file
    * @param options Settings for controlling reading behavior
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource to use for device memory allocation
    */
   explicit impl(std::unique_ptr<datasource> source,
-                std::string filepath,
                 csv_reader_options const& options,
                 rmm::cuda_stream_view stream,
                 rmm::mr::device_memory_resource* mr);
@@ -222,8 +220,6 @@ class reader::impl {
  private:
   rmm::mr::device_memory_resource* mr_ = nullptr;
   std::unique_ptr<datasource> source_;
-  std::string filepath_;
-  std::string compression_type_;
   const csv_reader_options opts_;
 
   cudf::size_type num_records_ = 0;  // Number of rows with actual data
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index e080ea3a2ca..ccc2eef56c7 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -144,27 +144,61 @@ table_with_metadata read_avro(avro_reader_options const& options,
   return reader->read(options);
 }
 
-table_with_metadata read_json(json_reader_options const& options,
-                              rmm::mr::device_memory_resource* mr)
+compression_type infer_compression_type(compression_type compression, source_info const& info)
+{
+  if (compression != compression_type::AUTO) { return compression; }
+
+  if (info.type != io_type::FILEPATH) { return compression_type::NONE; }
+
+  auto filepath = info.filepaths[0];
+
+  // Attempt to infer from the file extension
+  const auto pos = filepath.find_last_of('.');
+
+  if (pos == std::string::npos) { return {}; }
+
+  auto str_tolower = [](const auto& begin, const auto& end) {
+    std::string out;
+    std::transform(begin, end, std::back_inserter(out), ::tolower);
+    return out;
+  };
+
+  const auto ext = str_tolower(filepath.begin() + pos + 1, filepath.end());
+
+  if (ext == "gz") { return compression_type::GZIP; }
+  if (ext == "zip") { return compression_type::ZIP; }
+  if (ext == "bz2") { return compression_type::BZIP2; }
+  if (ext == "xz") { return compression_type::XZ; }
+
+  return compression_type::NONE;
+}
+
+table_with_metadata read_json(json_reader_options options, rmm::mr::device_memory_resource* mr)
 {
   namespace json = cudf::io::detail::json;
 
   CUDF_FUNC_RANGE();
 
   auto datasources = make_datasources(options.get_source());
+
+  options.set_compression(infer_compression_type(options.get_compression(), options.get_source()));
+
   auto reader =
     std::make_unique<json::reader>(std::move(datasources), options, rmm::cuda_stream_default, mr);
 
   return reader->read(options);
 }
 
-table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_memory_resource* mr)
+table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr)
 {
   namespace csv = cudf::io::detail::csv;
 
   CUDF_FUNC_RANGE();
 
   auto datasources = make_datasources(options.get_source());
+
+  options.set_compression(infer_compression_type(options.get_compression(), options.get_source()));
+
   auto reader =
     std::make_unique<csv::reader>(std::move(datasources), options, rmm::cuda_stream_default, mr);
 
diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
index a8f117c22bf..bae7471e307 100644
--- a/cpp/src/io/json/reader_impl.cu
+++ b/cpp/src/io/json/reader_impl.cu
@@ -241,15 +241,6 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size)
     map_range_size = range_size + calculate_max_row_size(dtype_option_size);
   }
 
-  // Support delayed opening of the file if using memory mapping datasource
-  // This allows only mapping of a subset of the file if using byte range
-  if (sources_.empty()) {
-    assert(!filepaths_.empty());
-    for (const auto& path : filepaths_) {
-      sources_.emplace_back(datasource::create(path, range_offset, map_range_size));
-    }
-  }
-
   // Iterate through the user defined sources and read the contents into the local buffer
   CUDF_EXPECTS(!sources_.empty(), "No sources were defined");
   size_t total_source_size = 0;
@@ -280,11 +271,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size)
  */
 void reader::impl::decompress_input(rmm::cuda_stream_view stream)
 {
-  const auto compression_type =
-    infer_compression_type(options_.get_compression(),
-                           filepaths_.size() > 0 ? filepaths_[0] : "",
-                           {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}});
-  if (compression_type == "none") {
+  if (options_.get_compression() == compression_type::NONE) {
     // Do not use the owner vector here to avoid extra copy
     uncomp_data_ = reinterpret_cast<const char*>(buffer_.data());
     uncomp_size_ = buffer_.size();
@@ -293,7 +280,7 @@ void reader::impl::decompress_input(rmm::cuda_stream_view stream)
       host_span<char const>(                     //
         reinterpret_cast<const char*>(buffer_.data()),
         buffer_.size()),
-      compression_type);
+      options_.get_compression());
 
     uncomp_data_ = uncomp_data_owner_.data();
     uncomp_size_ = uncomp_data_owner_.size();
@@ -665,7 +652,7 @@ reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
                    json_reader_options const& options,
                    rmm::cuda_stream_view stream,
                    rmm::mr::device_memory_resource* mr)
-  : options_(options), mr_(mr), sources_(std::move(sources)), filepaths_(filepaths)
+  : options_(options), mr_(mr), sources_(std::move(sources))
 {
   CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n");
 
@@ -713,18 +700,6 @@ table_with_metadata reader::impl::read(json_reader_options const& options,
   return convert_data_to_table(rec_starts, stream);
 }
 
-// Forward to implementation
-reader::reader(std::vector<std::string> const& filepaths,
-               json_reader_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
-{
-  // Delay actual instantiation of data source until read to allow for
-  // partial memory mapping of file using byte ranges
-  std::vector<std::unique_ptr<datasource>> src = {};  // Empty datasources
-  _impl = std::make_unique<impl>(std::move(src), filepaths, options, stream, mr);
-}
-
 // Forward to implementation
 reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                json_reader_options const& options,
diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp
index 5cf51369cdf..f7af55b2b90 100644
--- a/cpp/src/io/json/reader_impl.hpp
+++ b/cpp/src/io/json/reader_impl.hpp
@@ -57,7 +57,6 @@ class reader::impl {
   rmm::mr::device_memory_resource* mr_ = nullptr;
 
   std::vector<std::unique_ptr<datasource>> sources_;
-  std::vector<std::string> filepaths_;
   std::vector<uint8_t> buffer_;
 
   const char* uncomp_data_ = nullptr;
diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu
index 6c8f01111e5..ba62238c5d3 100644
--- a/cpp/src/io/utilities/parsing_utils.cu
+++ b/cpp/src/io/utilities/parsing_utils.cu
@@ -209,39 +209,5 @@ cudf::size_type count_all_from_set(const char* h_data,
   return find_all_from_set<void>(h_data, h_size, keys, 0, nullptr, stream);
 }
 
-std::string infer_compression_type(
-  const compression_type& compression_arg,
-  const std::string& filename,
-  const std::vector<std::pair<std::string, std::string>>& ext_to_comp_map)
-{
-  auto str_tolower = [](const auto& begin, const auto& end) {
-    std::string out;
-    std::transform(begin, end, std::back_inserter(out), ::tolower);
-    return out;
-  };
-
-  // Attempt to infer from user-supplied argument
-  if (compression_arg != compression_type::AUTO) {
-    switch (compression_arg) {
-      case compression_type::GZIP: return "gzip";
-      case compression_type::BZIP2: return "bz2";
-      case compression_type::ZIP: return "zip";
-      case compression_type::XZ: return "xz";
-      default: break;
-    }
-  }
-
-  // Attempt to infer from the file extension
-  const auto pos = filename.find_last_of('.');
-  if (pos != std::string::npos) {
-    const auto ext = str_tolower(filename.begin() + pos + 1, filename.end());
-    for (const auto& mapping : ext_to_comp_map) {
-      if (mapping.first == ext) { return mapping.second; }
-    }
-  }
-
-  return "none";
-}
-
 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 88297423b9b..daf23de7eb2 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -454,24 +454,6 @@ cudf::size_type count_all_from_set(const char* h_data,
                                    const std::vector<char>& keys,
                                    rmm::cuda_stream_view stream);
 
-/**
- * @brief Infer file compression type based on user supplied arguments.
- *
- * If the user specifies a valid compression_type for compression arg,
- * compression type will be computed based on that.  Otherwise the filename
- * and ext_to_comp_map will be used.
- *
- * @param[in] compression_arg User specified compression type (if any)
- * @param[in] filename Filename to base compression type (by extension) on
- * @param[in] ext_to_comp_map User supplied mapping of file extension to compression type
- *
- * @return string representing compression type ("gzip, "bz2", etc)
- */
-std::string infer_compression_type(
-  const compression_type& compression_arg,
-  const std::string& filename,
-  const std::vector<std::pair<std::string, std::string>>& ext_to_comp_map);
-
 /**
  * @brief Checks whether the given character is a whitespace character.
  *
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index a15a180d466..7a54ccac197 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -101,7 +101,7 @@ cdef csv_reader_options make_csv_reader_options(
     bool na_filter,
     object prefix,
     object index_col,
-) except +:
+) except *:
     cdef source_info c_source_info = make_source_info([datasource])
     cdef compression_type c_compression
     cdef size_type c_header
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 5511a65d0a4..8fb5d7cc9eb 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1069,20 +1069,6 @@ def test_csv_reader_byte_range(tmpdir, segment_bytes):
     assert list(df["int2"]) == list(ref_df["int2"])
 
 
-def test_csv_reader_byte_range_type_corner_case(tmpdir):
-    fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file17.csv")
-
-    cudf.datasets.timeseries(
-        start="2000-01-01",
-        end="2000-01-02",
-        dtypes={"name": str, "id": int, "x": float, "y": float},
-    ).to_csv(fname, chunksize=100000)
-
-    byte_range = (2_147_483_648, 0)
-    with pytest.raises(RuntimeError, match="Offset is past end of file"):
-        cudf.read_csv(fname, byte_range=byte_range, header=None)
-
-
 @pytest.mark.parametrize("segment_bytes", [10, 19, 31, 36])
 def test_csv_reader_byte_range_strings(segment_bytes):
     names = ["strings"]

From 62b95202d9b1db14f765ef45644d9cf91f782ea7 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Tue, 17 Aug 2021 02:19:27 -0500
Subject: [PATCH 13/26] remove filepath logic from avro, parquet, orc readers

---
 cpp/include/cudf/io/detail/avro.hpp    | 13 -------------
 cpp/include/cudf/io/detail/orc.hpp     | 13 -------------
 cpp/include/cudf/io/detail/parquet.hpp | 13 -------------
 cpp/src/io/avro/reader_impl.cu         | 10 ----------
 cpp/src/io/orc/reader_impl.cu          |  9 ---------
 cpp/src/io/parquet/reader_impl.cu      |  9 ---------
 6 files changed, 67 deletions(-)

diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp
index 98483d1c03e..306c15dcb72 100644
--- a/cpp/include/cudf/io/detail/avro.hpp
+++ b/cpp/include/cudf/io/detail/avro.hpp
@@ -38,19 +38,6 @@ class reader {
   std::unique_ptr<impl> _impl;
 
  public:
-  /**
-   * @brief Constructor from an array of file paths
-   *
-   * @param filepaths Paths to the files containing the input dataset
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  explicit reader(std::vector<std::string> const& filepaths,
-                  avro_reader_options const& options,
-                  rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr);
-
   /**
    * @brief Constructor from an array of datasources
    *
diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp
index ab26c01db74..2174b688da2 100644
--- a/cpp/include/cudf/io/detail/orc.hpp
+++ b/cpp/include/cudf/io/detail/orc.hpp
@@ -47,19 +47,6 @@ class reader {
   std::unique_ptr<impl> _impl;
 
  public:
-  /**
-   * @brief Constructor from an array of file paths
-   *
-   * @param filepaths Paths to the files containing the input dataset
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  explicit reader(std::vector<std::string> const& filepaths,
-                  orc_reader_options const& options,
-                  rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr);
-
   /**
    * @brief Constructor from an array of datasources
    *
diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp
index d95af7a11da..14f27ef8eef 100644
--- a/cpp/include/cudf/io/detail/parquet.hpp
+++ b/cpp/include/cudf/io/detail/parquet.hpp
@@ -49,19 +49,6 @@ class reader {
   std::unique_ptr<impl> _impl;
 
  public:
-  /**
-   * @brief Constructor from an array of file paths
-   *
-   * @param filepaths Paths to the files containing the input dataset
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  explicit reader(std::vector<std::string> const& filepaths,
-                  parquet_reader_options const& options,
-                  rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr);
-
   /**
    * @brief Constructor from an array of datasources
    *
diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu
index f6ffdd99d35..08ea96139a1 100644
--- a/cpp/src/io/avro/reader_impl.cu
+++ b/cpp/src/io/avro/reader_impl.cu
@@ -474,16 +474,6 @@ table_with_metadata reader::impl::read(avro_reader_options const& options,
   return {std::make_unique<table>(std::move(out_columns)), std::move(metadata_out)};
 }
 
-// Forward to implementation
-reader::reader(std::vector<std::string> const& filepaths,
-               avro_reader_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported.");
-  _impl = std::make_unique<impl>(datasource::create(filepaths[0]), options, mr);
-}
-
 // Forward to implementation
 reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                avro_reader_options const& options,
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 033a2d9aff5..5d62c45df83 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -1383,15 +1383,6 @@ table_with_metadata reader::impl::read(size_type skip_rows,
   return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
 }
 
-// Forward to implementation
-reader::reader(std::vector<std::string> const& filepaths,
-               orc_reader_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
-{
-  _impl = std::make_unique<impl>(datasource::create(filepaths), options, mr);
-}
-
 // Forward to implementation
 reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                orc_reader_options const& options,
diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu
index 9f9bdfd4755..31ae763d9ff 100644
--- a/cpp/src/io/parquet/reader_impl.cu
+++ b/cpp/src/io/parquet/reader_impl.cu
@@ -1608,15 +1608,6 @@ table_with_metadata reader::impl::read(size_type skip_rows,
   return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
 }
 
-// Forward to implementation
-reader::reader(std::vector<std::string> const& filepaths,
-               parquet_reader_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
-  : _impl(std::make_unique<impl>(datasource::create(filepaths), options, mr))
-{
-}
-
 // Forward to implementation
 reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                parquet_reader_options const& options,

From fb0129433bdd2dd264105ba172d96f2a310d8d8d Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Wed, 18 Aug 2021 15:19:11 -0500
Subject: [PATCH 14/26] move range size padding calculation out of json/csv
 reader and into json/csv options

---
 cpp/include/cudf/io/csv.hpp        | 34 +++++++++++++++++++++
 cpp/include/cudf/io/json.hpp       | 32 +++++++++++++++++++
 cpp/src/io/csv/reader_impl.cu      | 49 +++++-------------------------
 cpp/src/io/functions.cpp           | 24 +++++++++++----
 cpp/src/io/json/reader_impl.cu     | 46 ++++++----------------------
 cpp/src/io/json/reader_impl.hpp    |  3 +-
 python/cudf/cudf/tests/test_csv.py | 14 +++++++++
 7 files changed, 116 insertions(+), 86 deletions(-)

diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index c807f189aac..1aa6e3bea29 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -177,6 +177,40 @@ class csv_reader_options {
    */
   std::size_t get_byte_range_size() const { return _byte_range_size; }
 
+  /**
+   * @brief Returns number of bytes to read with padding.
+   */
+  std::size_t get_byte_range_size_with_padding() const
+  {
+    if (_byte_range_size == 0) {
+      return 0;
+    } else {
+      return _byte_range_size + get_byte_range_padding();
+    }
+  }
+
+  /**
+   * @brief Returns number of bytes to pad when reading.
+   */
+  std::size_t get_byte_range_padding() const
+  {
+    auto const num_names   = _names.size();
+    auto const num_dtypes  = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes);
+    auto const num_columns = std::max(num_dtypes, num_names);
+
+    auto const max_row_bytes = 16 * 1024;  // 16KB
+    auto const column_bytes  = 64;
+    auto const base_padding  = 1024;  // 1KB
+
+    if (num_columns == 0) {
+      // Use flat size if the number of columns is not known
+      return max_row_bytes;
+    }
+
+    // Expand the size based on the number of columns, if available
+    return base_padding + num_columns * column_bytes;
+  }
+
   /**
    * @brief Returns names of the columns.
    */
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index bca60f76260..5d2a4f6fcd1 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -140,6 +140,38 @@ class json_reader_options {
    */
   size_t get_byte_range_size() const { return _byte_range_size; }
 
+  /**
+   * @brief Returns number of bytes to read with padding.
+   */
+  size_t get_byte_range_size_with_padding() const
+  {
+    if (_byte_range_size == 0) {
+      return 0;
+    } else {
+      return _byte_range_size + get_byte_range_padding();
+    }
+  }
+
+  /**
+   * @brief Returns number of bytes to pad when reading.
+   */
+  size_t get_byte_range_padding() const
+  {
+    auto const num_columns = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes);
+
+    auto const max_row_bytes = 16 * 1024;  // 16KB
+    auto const column_bytes  = 64;
+    auto const base_padding  = 1024;  // 1KB
+
+    if (num_columns == 0) {
+      // Use flat size if the number of columns is not known
+      return max_row_bytes;
+    }
+
+    // Expand the size based on the number of columns, if available
+    return base_padding + num_columns * column_bytes;
+  }
+
   /**
    * @brief Whether to read the file as a json object per line.
    */
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index a85a610962e..c61cc26800e 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -57,31 +57,6 @@ namespace csv {
 using namespace cudf::io::csv;
 using namespace cudf::io;
 
-/**
- * @brief Estimates the maximum expected length or a row, based on the number
- * of columns
- *
- * If the number of columns is not available, it will return a value large
- * enough for most use cases
- *
- * @param[in] num_columns Number of columns in the CSV file (optional)
- *
- * @return Estimated maximum size of a row, in bytes
- */
-constexpr size_t calculateMaxRowSize(int num_columns = 0) noexcept
-{
-  constexpr size_t max_row_bytes = 16 * 1024;  // 16KB
-  constexpr size_t column_bytes  = 64;
-  constexpr size_t base_padding  = 1024;  // 1KB
-  if (num_columns == 0) {
-    // Use flat size if the number of columns is not known
-    return max_row_bytes;
-  } else {
-    // Expand the size based on the number of columns, if available
-    return base_padding + num_columns * column_bytes;
-  }
-}
-
 /**
  * @brief Translates a dtype string and returns its dtype enumeration and any
  * extended dtype flags that are supported by cuIO. Often, this is a column
@@ -199,31 +174,21 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream)
 std::pair<rmm::device_uvector<char>, reader::impl::selected_rows_offsets>
 reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
 {
-  auto range_offset  = opts_.get_byte_range_offset();
-  auto range_size    = opts_.get_byte_range_size();
-  auto skip_rows     = opts_.get_skiprows();
-  auto skip_end_rows = opts_.get_skipfooter();
-  auto num_rows      = opts_.get_nrows();
+  auto range_offset      = opts_.get_byte_range_offset();
+  auto range_size        = opts_.get_byte_range_size();
+  auto range_size_padded = opts_.get_byte_range_size_with_padding();
+  auto skip_rows         = opts_.get_skiprows();
+  auto skip_end_rows     = opts_.get_skipfooter();
+  auto num_rows          = opts_.get_nrows();
 
   if (range_offset > 0 || range_size > 0) {
     CUDF_EXPECTS(opts_.get_compression() == compression_type::NONE,
                  "Reading compressed data using `byte range` is unsupported");
   }
 
-  size_t map_range_size = 0;
-
-  if (range_size != 0) {
-    auto num_given_dtypes =
-      std::visit([](const auto& dtypes) { return dtypes.size(); }, opts_.get_dtypes());
-    const auto num_columns = std::max(opts_.get_names().size(), num_given_dtypes);
-    map_range_size         = range_size + calculateMaxRowSize(num_columns);
-  }
-
-  // TODO: provide hint to datasource that we should memory map any underlying file.
-
   // Transfer source data to GPU
   if (!source_->is_empty()) {
-    auto data_size = (map_range_size != 0) ? map_range_size : source_->size();
+    auto data_size = (range_size_padded != 0) ? range_size_padded : source_->size();
     auto buffer    = source_->host_read(range_offset, data_size);
 
     auto h_data = host_span<char const>(  //
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index ccc2eef56c7..438cb1762c6 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -107,10 +107,18 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder(
 
 namespace {
 
-std::vector<std::unique_ptr<cudf::io::datasource>> make_datasources(source_info const& info)
+std::vector<std::unique_ptr<cudf::io::datasource>> make_datasources(source_info const& info,
+                                                                    size_t range_offset = 0,
+                                                                    size_t range_size   = 0)
 {
   switch (info.type) {
-    case io_type::FILEPATH: return cudf::io::datasource::create(info.filepaths);
+    case io_type::FILEPATH: {
+      auto sources = std::vector<std::unique_ptr<cudf::io::datasource>>();
+      for (auto const& filepath : info.filepaths) {
+        sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size));
+      }
+      return sources;
+    }
     case io_type::HOST_BUFFER: return cudf::io::datasource::create(info.buffers);
     case io_type::USER_IMPLEMENTED: return cudf::io::datasource::create(info.user_sources);
     default: CUDF_FAIL("Unsupported source type");
@@ -179,10 +187,12 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor
 
   CUDF_FUNC_RANGE();
 
-  auto datasources = make_datasources(options.get_source());
-
   options.set_compression(infer_compression_type(options.get_compression(), options.get_source()));
 
+  auto datasources = make_datasources(options.get_source(),
+                                      options.get_byte_range_offset(),
+                                      options.get_byte_range_size_with_padding());
+
   auto reader =
     std::make_unique<json::reader>(std::move(datasources), options, rmm::cuda_stream_default, mr);
 
@@ -195,10 +205,12 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_
 
   CUDF_FUNC_RANGE();
 
-  auto datasources = make_datasources(options.get_source());
-
   options.set_compression(infer_compression_type(options.get_compression(), options.get_source()));
 
+  auto datasources = make_datasources(options.get_source(),
+                                      options.get_byte_range_offset(),
+                                      options.get_byte_range_size_with_padding());
+
   auto reader =
     std::make_unique<csv::reader>(std::move(datasources), options, rmm::cuda_stream_default, mr);
 
diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
index bae7471e307..0618f02e98f 100644
--- a/cpp/src/io/json/reader_impl.cu
+++ b/cpp/src/io/json/reader_impl.cu
@@ -50,31 +50,6 @@ namespace detail {
 namespace json {
 using namespace cudf::io;
 
-namespace {
-/**
- * @brief Estimates the maximum expected length or a row, based on the number
- * of columns
- *
- * If the number of columns is not available, it will return a value large
- * enough for most use cases
- *
- * @param[in] num_columns Number of columns in the JSON file (optional)
- *
- * @return Estimated maximum size of a row, in bytes
- */
-constexpr size_t calculate_max_row_size(int num_columns = 0) noexcept
-{
-  constexpr size_t max_row_bytes = 16 * 1024;  // 16KB
-  constexpr size_t column_bytes  = 64;
-  constexpr size_t base_padding  = 1024;  // 1KB
-  return num_columns == 0
-           ? max_row_bytes  // Use flat size if the # of columns is not known
-           : base_padding +
-               num_columns * column_bytes;  // Expand size based on the # of columns, if available
-}
-
-}  // anonymous namespace
-
 /**
  * @brief Aggregate the table containing keys info by their hash values.
  *
@@ -231,16 +206,12 @@ std::pair<std::vector<std::string>, col_map_ptr_type> reader::impl::get_json_obj
  *
  * @param[in] range_offset Number of bytes offset from the start
  * @param[in] range_size Bytes to read; use `0` for all remaining data
+ * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data
  */
-void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size)
+void reader::impl::ingest_raw_input(size_t range_offset,
+                                    size_t range_size,
+                                    size_t range_size_padded)
 {
-  size_t map_range_size = 0;
-  if (range_size != 0) {
-    auto const dtype_option_size =
-      std::visit([](const auto& dtypes) { return dtypes.size(); }, options_.get_dtypes());
-    map_range_size = range_size + calculate_max_row_size(dtype_option_size);
-  }
-
   // Iterate through the user defined sources and read the contents into the local buffer
   CUDF_EXPECTS(!sources_.empty(), "No sources were defined");
   size_t total_source_size = 0;
@@ -253,7 +224,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size)
   size_t bytes_read = 0;
   for (const auto& source : sources_) {
     if (!source->is_empty()) {
-      auto data_size = (map_range_size != 0) ? map_range_size : source->size();
+      auto data_size = (range_size_padded != 0) ? range_size_padded : source->size();
       bytes_read += source->host_read(range_offset, data_size, &buffer_[bytes_read]);
     }
   }
@@ -675,10 +646,11 @@ reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
 table_with_metadata reader::impl::read(json_reader_options const& options,
                                        rmm::cuda_stream_view stream)
 {
-  auto range_offset = options.get_byte_range_offset();
-  auto range_size   = options.get_byte_range_size();
+  auto range_offset      = options.get_byte_range_offset();
+  auto range_size        = options.get_byte_range_size();
+  auto range_size_padded = options.get_byte_range_size_with_padding();
 
-  ingest_raw_input(range_offset, range_size);
+  ingest_raw_input(range_offset, range_size, range_size_padded);
   CUDF_EXPECTS(buffer_.size() != 0, "Ingest failed: input data is null.\n");
 
   decompress_input(stream);
diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp
index f7af55b2b90..d01f2e8677e 100644
--- a/cpp/src/io/json/reader_impl.hpp
+++ b/cpp/src/io/json/reader_impl.hpp
@@ -109,8 +109,9 @@ class reader::impl {
    *
    * @param[in] range_offset Number of bytes offset from the start
    * @param[in] range_size Bytes to read; use `0` for all remaining data
+   * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data
    */
-  void ingest_raw_input(size_t range_offset, size_t range_size);
+  void ingest_raw_input(size_t range_offset, size_t range_size, size_t range_size_padded);
 
   /**
    * @brief Extract the JSON objects keys from the input file with object rows.
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 8fb5d7cc9eb..5511a65d0a4 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1069,6 +1069,20 @@ def test_csv_reader_byte_range(tmpdir, segment_bytes):
     assert list(df["int2"]) == list(ref_df["int2"])
 
 
+def test_csv_reader_byte_range_type_corner_case(tmpdir):
+    fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file17.csv")
+
+    cudf.datasets.timeseries(
+        start="2000-01-01",
+        end="2000-01-02",
+        dtypes={"name": str, "id": int, "x": float, "y": float},
+    ).to_csv(fname, chunksize=100000)
+
+    byte_range = (2_147_483_648, 0)
+    with pytest.raises(RuntimeError, match="Offset is past end of file"):
+        cudf.read_csv(fname, byte_range=byte_range, header=None)
+
+
 @pytest.mark.parametrize("segment_bytes", [10, 19, 31, 36])
 def test_csv_reader_byte_range_strings(segment_bytes):
     names = ["strings"]

From d422aebbe62d7e9915af93f474563e6e1c571e97 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Wed, 18 Aug 2021 15:30:38 -0500
Subject: [PATCH 15/26] remove filepaths from json reader

---
 cpp/src/io/json/reader_impl.cu  | 12 +++++-------
 cpp/src/io/json/reader_impl.hpp |  3 +--
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu
index 0618f02e98f..2964a12568f 100644
--- a/cpp/src/io/json/reader_impl.cu
+++ b/cpp/src/io/json/reader_impl.cu
@@ -231,7 +231,7 @@ void reader::impl::ingest_raw_input(size_t range_offset,
 
   byte_range_offset_ = range_offset;
   byte_range_size_   = range_size;
-  load_whole_file_   = byte_range_offset_ == 0 && byte_range_size_ == 0;
+  load_whole_source_ = byte_range_offset_ == 0 && byte_range_size_ == 0;
 }
 
 /**
@@ -256,7 +256,7 @@ void reader::impl::decompress_input(rmm::cuda_stream_view stream)
     uncomp_data_ = uncomp_data_owner_.data();
     uncomp_size_ = uncomp_data_owner_.size();
   }
-  if (load_whole_file_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream);
+  if (load_whole_source_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream);
 }
 
 rmm::device_uvector<uint64_t> reader::impl::find_record_starts(rmm::cuda_stream_view stream)
@@ -268,7 +268,7 @@ rmm::device_uvector<uint64_t> reader::impl::find_record_starts(rmm::cuda_stream_
   if (allow_newlines_in_strings_) { chars_to_count.push_back('\"'); }
   // If not starting at an offset, add an extra row to account for the first row in the file
   cudf::size_type prefilter_count = ((byte_range_offset_ == 0) ? 1 : 0);
-  if (load_whole_file_) {
+  if (load_whole_source_) {
     prefilter_count += count_all_from_set(data_, chars_to_count, stream);
   } else {
     prefilter_count += count_all_from_set(uncomp_data_, uncomp_size_, chars_to_count, stream);
@@ -286,7 +286,7 @@ rmm::device_uvector<uint64_t> reader::impl::find_record_starts(rmm::cuda_stream_
   std::vector<char> chars_to_find{'\n'};
   if (allow_newlines_in_strings_) { chars_to_find.push_back('\"'); }
   // Passing offset = 1 to return positions AFTER the found character
-  if (load_whole_file_) {
+  if (load_whole_source_) {
     find_all_from_set(data_, chars_to_find, 1, find_result_ptr, stream);
   } else {
     find_all_from_set(uncomp_data_, uncomp_size_, chars_to_find, 1, find_result_ptr, stream);
@@ -619,7 +619,6 @@ table_with_metadata reader::impl::convert_data_to_table(device_span<uint64_t con
 }
 
 reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
-                   std::vector<std::string> const& filepaths,
                    json_reader_options const& options,
                    rmm::cuda_stream_view stream,
                    rmm::mr::device_memory_resource* mr)
@@ -678,8 +677,7 @@ reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                rmm::cuda_stream_view stream,
                rmm::mr::device_memory_resource* mr)
 {
-  std::vector<std::string> file_paths = {};  // Empty filepaths
-  _impl = std::make_unique<impl>(std::move(sources), file_paths, options, stream, mr);
+  _impl = std::make_unique<impl>(std::move(sources), options, stream, mr);
 }
 
 // Destructor within this translation unit
diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp
index d01f2e8677e..d910cce2d72 100644
--- a/cpp/src/io/json/reader_impl.hpp
+++ b/cpp/src/io/json/reader_impl.hpp
@@ -68,7 +68,7 @@ class reader::impl {
 
   size_t byte_range_offset_ = 0;
   size_t byte_range_size_   = 0;
-  bool load_whole_file_     = true;
+  bool load_whole_source_   = true;
 
   table_metadata metadata_;
   std::vector<data_type> dtypes_;
@@ -186,7 +186,6 @@ class reader::impl {
    * @brief Constructor from a dataset source with reader options.
    */
   explicit impl(std::vector<std::unique_ptr<datasource>>&& sources,
-                std::vector<std::string> const& filepaths,
                 json_reader_options const& options,
                 rmm::cuda_stream_view stream,
                 rmm::mr::device_memory_resource* mr);

From 4808a435d5da93409fc3bfe963ecf2a81d90d6b5 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 21 Aug 2021 00:45:13 -0500
Subject: [PATCH 16/26] remove unnecessary csv writer class

---
 cpp/include/cudf/io/detail/csv.hpp | 51 ++++++++----------------------
 cpp/src/io/csv/writer_impl.cu      | 46 +++++++++++----------------
 cpp/src/io/csv/writer_impl.hpp     |  8 ++---
 cpp/src/io/functions.cpp           |  4 +--
 4 files changed, 36 insertions(+), 73 deletions(-)

diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp
index 89e589d306a..81c16f0a870 100644
--- a/cpp/include/cudf/io/detail/csv.hpp
+++ b/cpp/include/cudf/io/detail/csv.hpp
@@ -74,45 +74,20 @@ class reader {
   table_with_metadata read(rmm::cuda_stream_view stream = rmm::cuda_stream_default);
 };
 
-class writer {
- public:
-  class impl;
-
- private:
-  std::unique_ptr<impl> _impl;
-
- public:
-  /**
-   * @brief Constructor for output to a file.
-   *
-   * @param sinkp The data sink to write the data to
-   * @param options Settings for controlling writing behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  writer(std::unique_ptr<cudf::io::data_sink> sinkp,
-         csv_writer_options const& options,
-         rmm::cuda_stream_view stream,
-         rmm::mr::device_memory_resource* mr);  // cannot provide definition here (because
-                                                // _impl is incomplete hence unique_ptr has
-                                                // not enough sizeof() info)
-
-  /**
-   * @brief Destructor explicitly-declared to avoid inlined in header
-   */
-  ~writer();
+/**
+ * @brief Writes the entire dataset.
+ *
+ * @param table Set of columns to output
+ * @param metadata Table metadata and column names
+ * @param sinkp The data sink to write the data to
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource to use for device memory allocation
+ */
+void write_csv(std::unique_ptr<cudf::io::data_sink> sinkp,
+               csv_writer_options const& options,
+               rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+               rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
-  /**
-   * @brief Writes the entire dataset.
-   *
-   * @param table Set of columns to output
-   * @param metadata Table metadata and column names
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   */
-  void write(table_view const& table,
-             const table_metadata* metadata = nullptr,
-             rmm::cuda_stream_view stream   = rmm::cuda_stream_default);
-};
 }  // namespace csv
 }  // namespace detail
 }  // namespace io
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 335634b7fa8..80f98f6a17f 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -257,30 +257,18 @@ struct column_to_strings_fn {
 };
 }  // unnamed namespace
 
-// Forward to implementation
-writer::writer(std::unique_ptr<data_sink> sink,
-               csv_writer_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
-  : _impl(std::make_unique<impl>(std::move(sink), options, mr))
-{
-}
-
-// Destructor within this translation unit
-writer::~writer() = default;
-
-writer::impl::impl(std::unique_ptr<data_sink> sink,
-                   csv_writer_options const& options,
-                   rmm::mr::device_memory_resource* mr)
+writer_impl::writer_impl(std::unique_ptr<data_sink> sink,
+                         csv_writer_options const& options,
+                         rmm::mr::device_memory_resource* mr)
   : out_sink_(std::move(sink)), mr_(mr), options_(options)
 {
 }
 
 // write the header: column names:
 //
-void writer::impl::write_chunked_begin(table_view const& table,
-                                       const table_metadata* metadata,
-                                       rmm::cuda_stream_view stream)
+void writer_impl::write_chunked_begin(table_view const& table,
+                                      const table_metadata* metadata,
+                                      rmm::cuda_stream_view stream)
 {
   if ((metadata != nullptr) && (options_.is_enabled_include_header())) {
     auto const& column_names = metadata->column_names;
@@ -334,9 +322,9 @@ void writer::impl::write_chunked_begin(table_view const& table,
   }
 }
 
-void writer::impl::write_chunked(strings_column_view const& str_column_view,
-                                 const table_metadata* metadata,
-                                 rmm::cuda_stream_view stream)
+void writer_impl::write_chunked(strings_column_view const& str_column_view,
+                                const table_metadata* metadata,
+                                rmm::cuda_stream_view stream)
 {
   // algorithm outline:
   //
@@ -384,9 +372,9 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view,
   }
 }
 
-void writer::impl::write(table_view const& table,
-                         const table_metadata* metadata,
-                         rmm::cuda_stream_view stream)
+void writer_impl::write(table_view const& table,
+                        const table_metadata* metadata,
+                        rmm::cuda_stream_view stream)
 {
   // write header: column names separated by delimiter:
   // (even for tables with no rows)
@@ -469,11 +457,13 @@ void writer::impl::write(table_view const& table,
   write_chunked_end(table, metadata, stream);
 }
 
-void writer::write(table_view const& table,
-                   const table_metadata* metadata,
-                   rmm::cuda_stream_view stream)
+void write_csv(std::unique_ptr<cudf::io::data_sink>&& sink,
+               csv_writer_options const& options,
+               rmm::cuda_stream_view stream,
+               rmm::mr::device_memory_resource* mr)
 {
-  _impl->write(table, metadata, stream);
+  return writer_impl(std::move(sink), options, mr)
+    .write(options.get_table(), options.get_metadata(), stream);
 }
 
 }  // namespace csv
diff --git a/cpp/src/io/csv/writer_impl.hpp b/cpp/src/io/csv/writer_impl.hpp
index 965c036dc75..b5a27e33167 100644
--- a/cpp/src/io/csv/writer_impl.hpp
+++ b/cpp/src/io/csv/writer_impl.hpp
@@ -45,7 +45,7 @@ using namespace cudf::io;
 /**
  * @brief Implementation for CSV writer
  */
-class writer::impl {
+class writer_impl {
  public:
   /**
    * @brief Constructor with writer options.
@@ -54,9 +54,9 @@ class writer::impl {
    * @param options Settings for controlling behavior
    * @param mr Device memory resource to use for device memory allocation
    */
-  impl(std::unique_ptr<data_sink> sink,
-       csv_writer_options const& options,
-       rmm::mr::device_memory_resource* mr);
+  writer_impl(std::unique_ptr<data_sink> sink,
+              csv_writer_options const& options,
+              rmm::mr::device_memory_resource* mr);
 
   /**
    * @brief Write an entire dataset to CSV format.
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 438cb1762c6..ac427a5e805 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -223,10 +223,8 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc
   using namespace cudf::io::detail;
 
   auto sink = make_datasink(options.get_sink());
-  auto writer =
-    std::make_unique<csv::writer>(std::move(sink), options, rmm::cuda_stream_default, mr);
 
-  writer->write(options.get_table(), options.get_metadata());
+  csv::write_csv(std::move(sink), options, rmm::cuda_stream_default, mr);
 }
 
 namespace detail_orc = cudf::io::detail::orc;

From 9688d77c10399cdd442cdedaaf95e60c532be851 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 21 Aug 2021 01:21:04 -0500
Subject: [PATCH 17/26] replace csv::writer_impl out_sink_ member with local
 variable

---
 cpp/include/cudf/io/detail/csv.hpp |  2 +-
 cpp/src/io/csv/writer_impl.cu      | 45 ++++++++++++++----------------
 cpp/src/io/csv/writer_impl.hpp     | 32 +++++++--------------
 3 files changed, 32 insertions(+), 47 deletions(-)

diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp
index 81c16f0a870..74a987572d6 100644
--- a/cpp/include/cudf/io/detail/csv.hpp
+++ b/cpp/include/cudf/io/detail/csv.hpp
@@ -83,7 +83,7 @@ class reader {
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource to use for device memory allocation
  */
-void write_csv(std::unique_ptr<cudf::io::data_sink> sinkp,
+void write_csv(std::unique_ptr<cudf::io::data_sink>&& sinkp,
                csv_writer_options const& options,
                rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 80f98f6a17f..091487ead87 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -257,16 +257,15 @@ struct column_to_strings_fn {
 };
 }  // unnamed namespace
 
-writer_impl::writer_impl(std::unique_ptr<data_sink> sink,
-                         csv_writer_options const& options,
-                         rmm::mr::device_memory_resource* mr)
-  : out_sink_(std::move(sink)), mr_(mr), options_(options)
+writer_impl::writer_impl(csv_writer_options const& options, rmm::mr::device_memory_resource* mr)
+  : mr_(mr), options_(options)
 {
 }
 
 // write the header: column names:
 //
-void writer_impl::write_chunked_begin(table_view const& table,
+void writer_impl::write_chunked_begin(data_sink* out_sink,
+                                      table_view const& table,
                                       const table_metadata* metadata,
                                       rmm::cuda_stream_view stream)
 {
@@ -318,18 +317,19 @@ void writer_impl::write_chunked_begin(table_view const& table,
     }
     header.append(terminator);
 
-    out_sink_->host_write(header.data(), header.size());
+    out_sink->host_write(header.data(), header.size());
   }
 }
 
-void writer_impl::write_chunked(strings_column_view const& str_column_view,
+void writer_impl::write_chunked(data_sink* out_sink,
+                                strings_column_view const& str_column_view,
                                 const table_metadata* metadata,
                                 rmm::cuda_stream_view stream)
 {
   // algorithm outline:
   //
   //  for_each(strings_column.begin(), strings_column.end(),
-  //           [sink = out_sink_](auto str_row) mutable {
+  //           [sink = out_sink](auto str_row) mutable {
   //               auto host_buffer = str_row.host_buffer();
   //               sink->host_write(host_buffer_.data(), host_buffer_.size());
   //           });//or...sink->device_write(device_buffer,...);
@@ -347,9 +347,9 @@ void writer_impl::write_chunked(strings_column_view const& str_column_view,
   auto total_num_bytes      = strings_column.chars_size();
   char const* ptr_all_bytes = strings_column.chars().data<char>();
 
-  if (out_sink_->is_device_write_preferred(total_num_bytes)) {
+  if (out_sink->is_device_write_preferred(total_num_bytes)) {
     // Direct write from device memory
-    out_sink_->device_write(ptr_all_bytes, total_num_bytes, stream);
+    out_sink->device_write(ptr_all_bytes, total_num_bytes, stream);
   } else {
     // copy the bytes to host to write them out
     thrust::host_vector<char> h_bytes(total_num_bytes);
@@ -360,26 +360,27 @@ void writer_impl::write_chunked(strings_column_view const& str_column_view,
                              stream.value()));
     stream.synchronize();
 
-    out_sink_->host_write(h_bytes.data(), total_num_bytes);
+    out_sink->host_write(h_bytes.data(), total_num_bytes);
   }
 
   // Needs newline at the end, to separate from next chunk
-  if (out_sink_->is_device_write_preferred(newline.size())) {
-    out_sink_->device_write(newline.data(), newline.size(), stream);
+  if (out_sink->is_device_write_preferred(newline.size())) {
+    out_sink->device_write(newline.data(), newline.size(), stream);
   } else {
-    out_sink_->host_write(options_.get_line_terminator().data(),
-                          options_.get_line_terminator().size());
+    out_sink->host_write(options_.get_line_terminator().data(),
+                         options_.get_line_terminator().size());
   }
 }
 
-void writer_impl::write(table_view const& table,
+void writer_impl::write(data_sink* out_sink,
+                        table_view const& table,
                         const table_metadata* metadata,
                         rmm::cuda_stream_view stream)
 {
   // write header: column names separated by delimiter:
   // (even for tables with no rows)
   //
-  write_chunked_begin(table, metadata, stream);
+  write_chunked_begin(out_sink, table, metadata, stream);
 
   if (table.num_rows() > 0) {
     // no need to check same-size columns constraint; auto-enforced by table_view
@@ -448,13 +449,9 @@ void writer_impl::write(table_view const& table,
         return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream);
       }();
 
-      write_chunked(str_concat_col->view(), metadata, stream);
+      write_chunked(out_sink, str_concat_col->view(), metadata, stream);
     }
   }
-
-  // finalize (no-op, for now, but offers a hook for future extensions):
-  //
-  write_chunked_end(table, metadata, stream);
 }
 
 void write_csv(std::unique_ptr<cudf::io::data_sink>&& sink,
@@ -462,8 +459,8 @@ void write_csv(std::unique_ptr<cudf::io::data_sink>&& sink,
                rmm::cuda_stream_view stream,
                rmm::mr::device_memory_resource* mr)
 {
-  return writer_impl(std::move(sink), options, mr)
-    .write(options.get_table(), options.get_metadata(), stream);
+  return writer_impl(options, mr)
+    .write(sink.get(), options.get_table(), options.get_metadata(), stream);
 }
 
 }  // namespace csv
diff --git a/cpp/src/io/csv/writer_impl.hpp b/cpp/src/io/csv/writer_impl.hpp
index b5a27e33167..79ee240f585 100644
--- a/cpp/src/io/csv/writer_impl.hpp
+++ b/cpp/src/io/csv/writer_impl.hpp
@@ -50,63 +50,51 @@ class writer_impl {
   /**
    * @brief Constructor with writer options.
    *
-   * @param sink Output sink
    * @param options Settings for controlling behavior
    * @param mr Device memory resource to use for device memory allocation
    */
-  writer_impl(std::unique_ptr<data_sink> sink,
-              csv_writer_options const& options,
-              rmm::mr::device_memory_resource* mr);
+  writer_impl(csv_writer_options const& options, rmm::mr::device_memory_resource* mr);
 
   /**
    * @brief Write an entire dataset to CSV format.
    *
+   * @param sink Output sink
    * @param table The set of columns
    * @param metadata The metadata associated with the table
    * @param stream CUDA stream used for device memory operations and kernel launches.
    */
-  void write(table_view const& table,
+  void write(data_sink* sink,
+             table_view const& table,
              const table_metadata* metadata = nullptr,
              rmm::cuda_stream_view stream   = rmm::cuda_stream_default);
 
   /**
    * @brief Write the header of a CSV format.
    *
+   * @param sink Output sink
    * @param table The set of columns
    * @param metadata The metadata associated with the table
    * @param stream CUDA stream used for device memory operations and kernel launches.
    */
-  void write_chunked_begin(table_view const& table,
+  void write_chunked_begin(data_sink* sink,
+                           table_view const& table,
                            const table_metadata* metadata = nullptr,
                            rmm::cuda_stream_view stream   = rmm::cuda_stream_default);
 
   /**
    * @brief Write dataset to CSV format without header.
    *
+   * @param sink Output sink
    * @param strings_column Subset of columns converted to string to be written.
    * @param metadata The metadata associated with the table
    * @param stream CUDA stream used for device memory operations and kernel launches.
    */
-  void write_chunked(strings_column_view const& strings_column,
+  void write_chunked(data_sink* sink,
+                     strings_column_view const& strings_column,
                      const table_metadata* metadata = nullptr,
                      rmm::cuda_stream_view stream   = rmm::cuda_stream_default);
 
-  /**
-   * @brief Write footer of CSV format (typically, empty).
-   *
-   * @param table The set of columns
-   * @param metadata The metadata associated with the table
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   */
-  void write_chunked_end(table_view const& table,
-                         const table_metadata* metadata = nullptr,
-                         rmm::cuda_stream_view stream   = rmm::cuda_stream_default)
-  {
-    // purposely no-op (for now);
-  }
-
  private:
-  std::unique_ptr<data_sink> out_sink_;
   rmm::mr::device_memory_resource* mr_ = nullptr;
   csv_writer_options const options_;
 };

From 7840dc54111c89ecabf99d004f5d2ab33dbc6376 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 21 Aug 2021 01:42:42 -0500
Subject: [PATCH 18/26] replace csv::writer_impl members with local variables

---
 cpp/src/io/csv/writer_impl.cu  | 31 +++++++++++++++--------------
 cpp/src/io/csv/writer_impl.hpp | 36 +++++++++++++++++-----------------
 2 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 091487ead87..819869c9b50 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -257,17 +257,14 @@ struct column_to_strings_fn {
 };
 }  // unnamed namespace
 
-writer_impl::writer_impl(csv_writer_options const& options, rmm::mr::device_memory_resource* mr)
-  : mr_(mr), options_(options)
-{
-}
-
 // write the header: column names:
 //
 void writer_impl::write_chunked_begin(data_sink* out_sink,
                                       table_view const& table,
-                                      const table_metadata* metadata,
-                                      rmm::cuda_stream_view stream)
+                                      table_metadata const* metadata,
+                                      csv_writer_options const& options_,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::mr::device_memory_resource* mr_)
 {
   if ((metadata != nullptr) && (options_.is_enabled_include_header())) {
     auto const& column_names = metadata->column_names;
@@ -323,8 +320,10 @@ void writer_impl::write_chunked_begin(data_sink* out_sink,
 
 void writer_impl::write_chunked(data_sink* out_sink,
                                 strings_column_view const& str_column_view,
-                                const table_metadata* metadata,
-                                rmm::cuda_stream_view stream)
+                                table_metadata const* metadata,
+                                csv_writer_options const& options_,
+                                rmm::cuda_stream_view stream,
+                                rmm::mr::device_memory_resource* mr)
 {
   // algorithm outline:
   //
@@ -374,13 +373,15 @@ void writer_impl::write_chunked(data_sink* out_sink,
 
 void writer_impl::write(data_sink* out_sink,
                         table_view const& table,
-                        const table_metadata* metadata,
-                        rmm::cuda_stream_view stream)
+                        table_metadata const* metadata,
+                        csv_writer_options const& options_,
+                        rmm::cuda_stream_view stream,
+                        rmm::mr::device_memory_resource* mr_)
 {
   // write header: column names separated by delimiter:
   // (even for tables with no rows)
   //
-  write_chunked_begin(out_sink, table, metadata, stream);
+  write_chunked_begin(out_sink, table, metadata, options_, stream, mr_);
 
   if (table.num_rows() > 0) {
     // no need to check same-size columns constraint; auto-enforced by table_view
@@ -449,7 +450,7 @@ void writer_impl::write(data_sink* out_sink,
         return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream);
       }();
 
-      write_chunked(out_sink, str_concat_col->view(), metadata, stream);
+      write_chunked(out_sink, str_concat_col->view(), metadata, options_, stream, mr_);
     }
   }
 }
@@ -459,8 +460,8 @@ void write_csv(std::unique_ptr<cudf::io::data_sink>&& sink,
                rmm::cuda_stream_view stream,
                rmm::mr::device_memory_resource* mr)
 {
-  return writer_impl(options, mr)
-    .write(sink.get(), options.get_table(), options.get_metadata(), stream);
+  return writer_impl().write(
+    sink.get(), options.get_table(), options.get_metadata(), options, stream, mr);
 }
 
 }  // namespace csv
diff --git a/cpp/src/io/csv/writer_impl.hpp b/cpp/src/io/csv/writer_impl.hpp
index 79ee240f585..9c992282544 100644
--- a/cpp/src/io/csv/writer_impl.hpp
+++ b/cpp/src/io/csv/writer_impl.hpp
@@ -47,26 +47,22 @@ using namespace cudf::io;
  */
 class writer_impl {
  public:
-  /**
-   * @brief Constructor with writer options.
-   *
-   * @param options Settings for controlling behavior
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  writer_impl(csv_writer_options const& options, rmm::mr::device_memory_resource* mr);
-
   /**
    * @brief Write an entire dataset to CSV format.
    *
    * @param sink Output sink
    * @param table The set of columns
    * @param metadata The metadata associated with the table
+   * @param options Settings for controlling behavior
    * @param stream CUDA stream used for device memory operations and kernel launches.
+   * @param mr Device memory resource to use for device memory allocation
    */
   void write(data_sink* sink,
              table_view const& table,
-             const table_metadata* metadata = nullptr,
-             rmm::cuda_stream_view stream   = rmm::cuda_stream_default);
+             const table_metadata* metadata,
+             csv_writer_options const& options,
+             rmm::cuda_stream_view stream,
+             rmm::mr::device_memory_resource* mr);
 
   /**
    * @brief Write the header of a CSV format.
@@ -74,12 +70,16 @@ class writer_impl {
    * @param sink Output sink
    * @param table The set of columns
    * @param metadata The metadata associated with the table
+   * @param options Settings for controlling behavior
    * @param stream CUDA stream used for device memory operations and kernel launches.
+   * @param mr Device memory resource to use for device memory allocation
    */
   void write_chunked_begin(data_sink* sink,
                            table_view const& table,
-                           const table_metadata* metadata = nullptr,
-                           rmm::cuda_stream_view stream   = rmm::cuda_stream_default);
+                           table_metadata const* metadata,
+                           csv_writer_options const& options,
+                           rmm::cuda_stream_view stream,
+                           rmm::mr::device_memory_resource* mr);
 
   /**
    * @brief Write dataset to CSV format without header.
@@ -87,16 +87,16 @@ class writer_impl {
    * @param sink Output sink
    * @param strings_column Subset of columns converted to string to be written.
    * @param metadata The metadata associated with the table
+   * @param options Settings for controlling behavior
    * @param stream CUDA stream used for device memory operations and kernel launches.
+   * @param mr Device memory resource to use for device memory allocation
    */
   void write_chunked(data_sink* sink,
                      strings_column_view const& strings_column,
-                     const table_metadata* metadata = nullptr,
-                     rmm::cuda_stream_view stream   = rmm::cuda_stream_default);
-
- private:
-  rmm::mr::device_memory_resource* mr_ = nullptr;
-  csv_writer_options const options_;
+                     const table_metadata* metadata,
+                     csv_writer_options const& options,
+                     rmm::cuda_stream_view stream,
+                     rmm::mr::device_memory_resource* mr);
 };
 
 std::unique_ptr<column> pandas_format_durations(

From 16e4eee539206db014957ae45003b1d09772d2a5 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 21 Aug 2021 01:52:18 -0500
Subject: [PATCH 19/26] remove unnecessary csv writer impl class

---
 cpp/include/cudf/io/detail/csv.hpp | 13 ++++---
 cpp/src/io/csv/writer_impl.cu      | 45 ++++++++++-------------
 cpp/src/io/csv/writer_impl.hpp     | 57 ------------------------------
 cpp/src/io/functions.cpp           |  8 ++++-
 4 files changed, 33 insertions(+), 90 deletions(-)

diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp
index 74a987572d6..7d4adabc8af 100644
--- a/cpp/include/cudf/io/detail/csv.hpp
+++ b/cpp/include/cudf/io/detail/csv.hpp
@@ -75,15 +75,18 @@ class reader {
 };
 
 /**
- * @brief Writes the entire dataset.
+ * @brief Write an entire dataset to CSV format.
  *
- * @param table Set of columns to output
- * @param metadata Table metadata and column names
- * @param sinkp The data sink to write the data to
+ * @param sink Output sink
+ * @param table The set of columns
+ * @param metadata The metadata associated with the table
+ * @param options Settings for controlling behavior
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource to use for device memory allocation
  */
-void write_csv(std::unique_ptr<cudf::io::data_sink>&& sinkp,
+void write_csv(data_sink* sink,
+               table_view const& table,
+               const table_metadata* metadata,
                csv_writer_options const& options,
                rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 819869c9b50..f5d5dd99e21 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -259,12 +259,12 @@ struct column_to_strings_fn {
 
 // write the header: column names:
 //
-void writer_impl::write_chunked_begin(data_sink* out_sink,
-                                      table_view const& table,
-                                      table_metadata const* metadata,
-                                      csv_writer_options const& options_,
-                                      rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr_)
+void write_chunked_begin(data_sink* out_sink,
+                         table_view const& table,
+                         table_metadata const* metadata,
+                         csv_writer_options const& options_,
+                         rmm::cuda_stream_view stream,
+                         rmm::mr::device_memory_resource* mr_)
 {
   if ((metadata != nullptr) && (options_.is_enabled_include_header())) {
     auto const& column_names = metadata->column_names;
@@ -318,12 +318,12 @@ void writer_impl::write_chunked_begin(data_sink* out_sink,
   }
 }
 
-void writer_impl::write_chunked(data_sink* out_sink,
-                                strings_column_view const& str_column_view,
-                                table_metadata const* metadata,
-                                csv_writer_options const& options_,
-                                rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+void write_chunked(data_sink* out_sink,
+                   strings_column_view const& str_column_view,
+                   table_metadata const* metadata,
+                   csv_writer_options const& options_,
+                   rmm::cuda_stream_view stream,
+                   rmm::mr::device_memory_resource* mr)
 {
   // algorithm outline:
   //
@@ -371,12 +371,12 @@ void writer_impl::write_chunked(data_sink* out_sink,
   }
 }
 
-void writer_impl::write(data_sink* out_sink,
-                        table_view const& table,
-                        table_metadata const* metadata,
-                        csv_writer_options const& options_,
-                        rmm::cuda_stream_view stream,
-                        rmm::mr::device_memory_resource* mr_)
+void write_csv(data_sink* out_sink,
+               table_view const& table,
+               table_metadata const* metadata,
+               csv_writer_options const& options_,
+               rmm::cuda_stream_view stream,
+               rmm::mr::device_memory_resource* mr_)
 {
   // write header: column names separated by delimiter:
   // (even for tables with no rows)
@@ -455,15 +455,6 @@ void writer_impl::write(data_sink* out_sink,
   }
 }
 
-void write_csv(std::unique_ptr<cudf::io::data_sink>&& sink,
-               csv_writer_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
-{
-  return writer_impl().write(
-    sink.get(), options.get_table(), options.get_metadata(), options, stream, mr);
-}
-
 }  // namespace csv
 }  // namespace detail
 }  // namespace io
diff --git a/cpp/src/io/csv/writer_impl.hpp b/cpp/src/io/csv/writer_impl.hpp
index 9c992282544..fbbf1c4d8fc 100644
--- a/cpp/src/io/csv/writer_impl.hpp
+++ b/cpp/src/io/csv/writer_impl.hpp
@@ -42,63 +42,6 @@ namespace csv {
 using namespace cudf::io::csv;
 using namespace cudf::io;
 
-/**
- * @brief Implementation for CSV writer
- */
-class writer_impl {
- public:
-  /**
-   * @brief Write an entire dataset to CSV format.
-   *
-   * @param sink Output sink
-   * @param table The set of columns
-   * @param metadata The metadata associated with the table
-   * @param options Settings for controlling behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  void write(data_sink* sink,
-             table_view const& table,
-             const table_metadata* metadata,
-             csv_writer_options const& options,
-             rmm::cuda_stream_view stream,
-             rmm::mr::device_memory_resource* mr);
-
-  /**
-   * @brief Write the header of a CSV format.
-   *
-   * @param sink Output sink
-   * @param table The set of columns
-   * @param metadata The metadata associated with the table
-   * @param options Settings for controlling behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  void write_chunked_begin(data_sink* sink,
-                           table_view const& table,
-                           table_metadata const* metadata,
-                           csv_writer_options const& options,
-                           rmm::cuda_stream_view stream,
-                           rmm::mr::device_memory_resource* mr);
-
-  /**
-   * @brief Write dataset to CSV format without header.
-   *
-   * @param sink Output sink
-   * @param strings_column Subset of columns converted to string to be written.
-   * @param metadata The metadata associated with the table
-   * @param options Settings for controlling behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  void write_chunked(data_sink* sink,
-                     strings_column_view const& strings_column,
-                     const table_metadata* metadata,
-                     csv_writer_options const& options,
-                     rmm::cuda_stream_view stream,
-                     rmm::mr::device_memory_resource* mr);
-};
-
 std::unique_ptr<column> pandas_format_durations(
   column_view const& durations,
   rmm::cuda_stream_view stream,
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index ac427a5e805..710ca2b1b22 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -224,7 +224,13 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc
 
   auto sink = make_datasink(options.get_sink());
 
-  csv::write_csv(std::move(sink), options, rmm::cuda_stream_default, mr);
+  return csv::write_csv(  //
+    sink.get(),
+    options.get_table(),
+    options.get_metadata(),
+    options,
+    rmm::cuda_stream_default,
+    mr);
 }
 
 namespace detail_orc = cudf::io::detail::orc;

From 9994ea34870e7e4e570a2cd84ac12979551115ef Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 21 Aug 2021 01:59:48 -0500
Subject: [PATCH 20/26] replace writer_impl.hpp with more appropriately named
 durations.hpp

---
 .../io/csv/{writer_impl.hpp => durations.hpp}  | 18 ++----------------
 cpp/src/io/csv/writer_impl.cu                  | 16 +++++++++++++++-
 2 files changed, 17 insertions(+), 17 deletions(-)
 rename cpp/src/io/csv/{writer_impl.hpp => durations.hpp} (70%)

diff --git a/cpp/src/io/csv/writer_impl.hpp b/cpp/src/io/csv/durations.hpp
similarity index 70%
rename from cpp/src/io/csv/writer_impl.hpp
rename to cpp/src/io/csv/durations.hpp
index fbbf1c4d8fc..126c9542fb4 100644
--- a/cpp/src/io/csv/writer_impl.hpp
+++ b/cpp/src/io/csv/durations.hpp
@@ -16,32 +16,18 @@
 
 #pragma once
 
-#include "csv_common.h"
-#include "csv_gpu.h"
-
-#include <cudf/strings/strings_column_view.hpp>
-#include <io/utilities/hostdevice_vector.hpp>
-
-#include <cudf/detail/utilities/integer_utils.hpp>
-#include <cudf/io/data_sink.hpp>
-#include <cudf/io/detail/csv.hpp>
-#include <cudf/table/table.hpp>
-#include <cudf/utilities/error.hpp>
+#include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
 
 #include <memory>
-#include <string>
-#include <vector>
 
 namespace cudf {
 namespace io {
 namespace detail {
 namespace csv {
 
-using namespace cudf::io::csv;
-using namespace cudf::io;
-
 std::unique_ptr<column> pandas_format_durations(
   column_view const& durations,
   rmm::cuda_stream_view stream,
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index f5d5dd99e21..c9b70f0eee1 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -19,17 +19,25 @@
  * @brief cuDF-IO CSV writer class implementation
  */
 
-#include "writer_impl.hpp"
+#include "durations.hpp"
+
+#include "csv_common.h"
+#include "csv_gpu.h"
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/copying.hpp>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/io/data_sink.hpp>
+#include <cudf/io/detail/csv.hpp>
 #include <cudf/null_mask.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/detail/combine.hpp>
 #include <cudf/strings/detail/converters.hpp>
 #include <cudf/strings/detail/replace.hpp>
 #include <cudf/strings/detail/utilities.cuh>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -40,13 +48,19 @@
 #include <thrust/scan.h>
 
 #include <algorithm>
+#include <memory>
 #include <sstream>
+#include <string>
+#include <vector>
 
 namespace cudf {
 namespace io {
 namespace detail {
 namespace csv {
 
+using namespace cudf::io::csv;
+using namespace cudf::io;
+
 namespace {
 
 /**

From 640375b4000ecb87872537784c8e572a6172ad8f Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Sat, 21 Aug 2021 15:43:54 -0500
Subject: [PATCH 21/26] re-delete csv reader_impl header

---
 cpp/src/io/csv/reader_impl.hpp | 241 ---------------------------------
 1 file changed, 241 deletions(-)
 delete mode 100644 cpp/src/io/csv/reader_impl.hpp

diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
deleted file mode 100644
index beaa9b816cb..00000000000
--- a/cpp/src/io/csv/reader_impl.hpp
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "csv_common.h"
-#include "csv_gpu.h"
-
-#include <io/utilities/column_buffer.hpp>
-#include <io/utilities/hostdevice_vector.hpp>
-#include <io/utilities/trie.cuh>
-
-#include <cudf/io/csv.hpp>
-#include <cudf/io/datasource.hpp>
-#include <cudf/io/detail/csv.hpp>
-#include <cudf/utilities/span.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-using cudf::host_span;
-
-namespace cudf {
-namespace io {
-namespace detail {
-namespace csv {
-using namespace cudf::io::csv;
-using namespace cudf::io;
-
-/**
- * @brief Implementation for CSV reader
- *
- * The CSV reader is implemented in 4 stages:
- * Stage 1: read and optionally decompress the input data in host memory
- * (may be a memory-mapped view of the data on disk)
- *
- * Stage 2: gather the offset of each data row within the csv data.
- * Since the number of rows in a given character block may depend on the
- * initial parser state (like whether the block starts in a middle of a
- * quote or not), a separate row count and output parser state is computed
- * for every possible input parser state per 16KB character block.
- * The result is then used to infer the parser state and starting row at
- * the beginning of every character block.
- * A second pass can then output the location of every row (which is needed
- * for the subsequent parallel conversion of every row from csv text
- * to cudf binary form)
- *
- * Stage 3: Optional stage to infer the data type of each CSV column.
- *
- * Stage 4: Convert every row from csv text form to cudf binary form.
- */
-class reader::impl {
- public:
-  /**
-   * @brief Constructor from a dataset source with reader options.
-   *
-   * @param source Dataset source
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  explicit impl(std::unique_ptr<datasource> source,
-                csv_reader_options const& options,
-                rmm::cuda_stream_view stream,
-                rmm::mr::device_memory_resource* mr);
-
-  /**
-   * @brief Read an entire set or a subset of data and returns a set of columns.
-   *
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   *
-   * @return The set of columns along with metadata
-   */
-  table_with_metadata read(rmm::cuda_stream_view stream);
-
- private:
-  /**
-   * @brief Offsets of CSV rows in device memory, accessed through a shrinkable span.
-   *
-   * Row offsets are stored this way to avoid reallocation/copies when discarding front or back
-   * elements.
-   */
-  class selected_rows_offsets {
-    rmm::device_uvector<uint64_t> all;
-    device_span<uint64_t const> selected;
-
-   public:
-    selected_rows_offsets(rmm::device_uvector<uint64_t>&& data,
-                          device_span<uint64_t const> selected_span)
-      : all{std::move(data)}, selected{selected_span}
-    {
-    }
-    selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {}
-
-    operator device_span<uint64_t const>() const { return selected; }
-    void shrink(size_t size)
-    {
-      CUDF_EXPECTS(size <= selected.size(), "New size must be smaller");
-      selected = selected.subspan(0, size);
-    }
-    void erase_first_n(size_t n)
-    {
-      CUDF_EXPECTS(n <= selected.size(), "Too many elements to remove");
-      selected = selected.subspan(n, selected.size() - n);
-    }
-    auto size() const { return selected.size(); }
-    auto data() const { return selected.data(); }
-  };
-
-  /**
-   * @brief Selectively loads data on the GPU and gathers offsets of rows to read.
-   *
-   * Selection is based on read options.
-   *
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   */
-  std::pair<rmm::device_uvector<char>, reader::impl::selected_rows_offsets>
-  select_data_and_row_offsets(rmm::cuda_stream_view stream);
-
-  /**
-   * @brief Finds row positions in the specified input data, and loads the selected data onto GPU.
-   *
-   * This function scans the input data to record the row offsets (relative to the start of the
-   * input data). A row is actually the data/offset between two termination symbols.
-   *
-   * @param data Uncompressed input data in host memory
-   * @param range_begin Only include rows starting after this position
-   * @param range_end Only include rows starting before this position
-   * @param skip_rows Number of rows to skip from the start
-   * @param num_rows Number of rows to read; -1: all remaining data
-   * @param load_whole_file Hint that the entire data will be needed on gpu
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @return Input data and row offsets in the device memory
-   */
-  std::pair<rmm::device_uvector<char>, reader::impl::selected_rows_offsets>
-  load_data_and_gather_row_offsets(host_span<char const> data,
-                                   size_t range_begin,
-                                   size_t range_end,
-                                   size_t skip_rows,
-                                   int64_t num_rows,
-                                   bool load_whole_file,
-                                   rmm::cuda_stream_view stream);
-
-  /**
-   * @brief Find the start position of the first data row
-   *
-   * @param h_data Uncompressed input data in host memory
-   *
-   * @return Byte position of the first row
-   */
-  size_t find_first_row_start(host_span<char const> data);
-
-  /**
-   * @brief Automatically infers each column's data type based on the CSV's data within that column.
-   *
-   * @param data The CSV data from which to infer the columns' data types
-   * @param row_offsets The row offsets into the CSV's data
-   * @param stream The stream to which the type inference-kernel will be dispatched
-   * @return The columns' inferred data types
-   */
-  std::vector<data_type> infer_column_types(device_span<char const> data,
-                                            device_span<uint64_t const> row_offsets,
-                                            rmm::cuda_stream_view stream);
-
-  /**
-   * @brief Selects the columns' data types from the map of dtypes.
-   *
-   * @param col_type_map Column name -> data type map specifying the columns' target data types
-   * @return Sorted list of selected columns' data types
-   */
-  std::vector<data_type> select_data_types(std::map<std::string, data_type> const& col_type_map);
-
-  /**
-   * @brief Selects the columns' data types from the list of dtypes.
-   *
-   * @param dtypes Vector of data types specifying the columns' target data types
-   * @return Sorted list of selected columns' data types
-   */
-  std::vector<data_type> select_data_types(std::vector<data_type> const& dtypes);
-
-  /**
-   * @brief Parses the columns' data types from the vector of dtypes that are provided as strings.
-   *
-   * @param types_as_strings The vector of strings from which to parse the columns' target data
-   * types
-   * @return List of columns' data types
-   */
-  std::vector<data_type> parse_column_types(std::vector<std::string> const& types_as_strings);
-
-  /**
-   * @brief Converts the row-column data and outputs to column bufferrs.
-   *
-   * @param column_types Column types
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   *
-   * @return list of column buffers of decoded data, or ptr/size in the case of strings.
-   */
-  std::vector<column_buffer> decode_data(device_span<char const> data,
-                                         device_span<uint64_t const> row_offsets,
-                                         host_span<data_type const> column_types,
-                                         rmm::cuda_stream_view stream);
-
- private:
-  rmm::mr::device_memory_resource* mr_ = nullptr;
-  std::unique_ptr<datasource> source_;
-  const csv_reader_options opts_;
-
-  cudf::size_type num_records_ = 0;  // Number of rows with actual data
-  int num_active_cols_         = 0;  // Number of columns to read
-  int num_actual_cols_         = 0;  // Number of columns in the dataset
-
-  // Parsing options
-  parse_options opts{};
-  std::vector<column_parse::flags> column_flags_;
-
-  // Intermediate data
-  std::vector<std::string> col_names_;
-  std::vector<char> header_;
-};
-
-}  // namespace csv
-}  // namespace detail
-}  // namespace io
-}  // namespace cudf

From 07b05e89a733de2b1974787837d4e087d4cb1914 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Tue, 24 Aug 2021 14:27:53 -0500
Subject: [PATCH 22/26] re-remove csv/reader_impl.hpp

---
 cpp/src/io/csv/reader_impl.hpp | 232 ---------------------------------
 1 file changed, 232 deletions(-)
 delete mode 100644 cpp/src/io/csv/reader_impl.hpp

diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
deleted file mode 100644
index de363a46ffe..00000000000
--- a/cpp/src/io/csv/reader_impl.hpp
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "csv_common.h"
-#include "csv_gpu.h"
-
-#include <io/utilities/column_buffer.hpp>
-#include <io/utilities/hostdevice_vector.hpp>
-#include <io/utilities/trie.cuh>
-
-#include <cudf/io/csv.hpp>
-#include <cudf/io/datasource.hpp>
-#include <cudf/io/detail/csv.hpp>
-#include <cudf/utilities/span.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-using cudf::host_span;
-
-namespace cudf {
-namespace io {
-namespace detail {
-namespace csv {
-using namespace cudf::io::csv;
-using namespace cudf::io;
-
-/**
- * @brief Implementation for CSV reader
- *
- * The CSV reader is implemented in 4 stages:
- * Stage 1: read and optionally decompress the input data in host memory
- * (may be a memory-mapped view of the data on disk)
- *
- * Stage 2: gather the offset of each data row within the csv data.
- * Since the number of rows in a given character block may depend on the
- * initial parser state (like whether the block starts in a middle of a
- * quote or not), a separate row count and output parser state is computed
- * for every possible input parser state per 16KB character block.
- * The result is then used to infer the parser state and starting row at
- * the beginning of every character block.
- * A second pass can then output the location of every row (which is needed
- * for the subsequent parallel conversion of every row from csv text
- * to cudf binary form)
- *
- * Stage 3: Optional stage to infer the data type of each CSV column.
- *
- * Stage 4: Convert every row from csv text form to cudf binary form.
- */
-class reader::impl {
- public:
-  /**
-   * @brief Constructor from a dataset source with reader options.
-   *
-   * @param source Dataset source
-   * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @param mr Device memory resource to use for device memory allocation
-   */
-  explicit impl(std::unique_ptr<datasource> source,
-                csv_reader_options const& options,
-                rmm::cuda_stream_view stream,
-                rmm::mr::device_memory_resource* mr);
-
-  /**
-   * @brief Read an entire set or a subset of data and returns a set of columns.
-   *
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   *
-   * @return The set of columns along with metadata
-   */
-  table_with_metadata read(rmm::cuda_stream_view stream);
-
- private:
-  /**
-   * @brief Offsets of CSV rows in device memory, accessed through a shrinkable span.
-   *
-   * Row offsets are stored this way to avoid reallocation/copies when discarding front or back
-   * elements.
-   */
-  class selected_rows_offsets {
-    rmm::device_uvector<uint64_t> all;
-    device_span<uint64_t const> selected;
-
-   public:
-    selected_rows_offsets(rmm::device_uvector<uint64_t>&& data,
-                          device_span<uint64_t const> selected_span)
-      : all{std::move(data)}, selected{selected_span}
-    {
-    }
-    selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {}
-
-    operator device_span<uint64_t const>() const { return selected; }
-    void shrink(size_t size)
-    {
-      CUDF_EXPECTS(size <= selected.size(), "New size must be smaller");
-      selected = selected.subspan(0, size);
-    }
-    void erase_first_n(size_t n)
-    {
-      CUDF_EXPECTS(n <= selected.size(), "Too many elements to remove");
-      selected = selected.subspan(n, selected.size() - n);
-    }
-    auto size() const { return selected.size(); }
-    auto data() const { return selected.data(); }
-  };
-
-  /**
-   * @brief Selectively loads data on the GPU and gathers offsets of rows to read.
-   *
-   * Selection is based on read options.
-   *
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   */
-  std::pair<rmm::device_uvector<char>, reader::impl::selected_rows_offsets>
-  select_data_and_row_offsets(rmm::cuda_stream_view stream);
-
-  /**
-   * @brief Finds row positions in the specified input data, and loads the selected data onto GPU.
-   *
-   * This function scans the input data to record the row offsets (relative to the start of the
-   * input data). A row is actually the data/offset between two termination symbols.
-   *
-   * @param data Uncompressed input data in host memory
-   * @param range_begin Only include rows starting after this position
-   * @param range_end Only include rows starting before this position
-   * @param skip_rows Number of rows to skip from the start
-   * @param num_rows Number of rows to read; -1: all remaining data
-   * @param load_whole_file Hint that the entire data will be needed on gpu
-   * @param stream CUDA stream used for device memory operations and kernel launches
-   * @return Input data and row offsets in the device memory
-   */
-  std::pair<rmm::device_uvector<char>, reader::impl::selected_rows_offsets>
-  load_data_and_gather_row_offsets(host_span<char const> data,
-                                   size_t range_begin,
-                                   size_t range_end,
-                                   size_t skip_rows,
-                                   int64_t num_rows,
-                                   bool load_whole_file,
-                                   rmm::cuda_stream_view stream);
-
-  /**
-   * @brief Find the start position of the first data row
-   *
-   * @param h_data Uncompressed input data in host memory
-   *
-   * @return Byte position of the first row
-   */
-  size_t find_first_row_start(host_span<char const> data);
-
-  /**
-   * @brief Automatically infers each column's data type based on the CSV's data within that column.
-   *
-   * @param data The CSV data from which to infer the columns' data types
-   * @param row_offsets The row offsets into the CSV's data
-   * @param stream The stream to which the type inference-kernel will be dispatched
-   * @return The columns' inferred data types
-   */
-  std::vector<data_type> infer_column_types(device_span<char const> data,
-                                            device_span<uint64_t const> row_offsets,
-                                            rmm::cuda_stream_view stream);
-
-  /**
-   * @brief Selects the columns' data types from the map of dtypes.
-   *
-   * @param col_type_map Column name -> data type map specifying the columns' target data types
-   * @return Sorted list of selected columns' data types
-   */
-  std::vector<data_type> select_data_types(std::map<std::string, data_type> const& col_type_map);
-
-  /**
-   * @brief Selects the columns' data types from the list of dtypes.
-   *
-   * @param dtypes Vector of data types specifying the columns' target data types
-   * @return Sorted list of selected columns' data types
-   */
-  std::vector<data_type> select_data_types(std::vector<data_type> const& dtypes);
-
-  /**
-   * @brief Converts the row-column data and outputs to column bufferrs.
-   *
-   * @param column_types Column types
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   *
-   * @return list of column buffers of decoded data, or ptr/size in the case of strings.
-   */
-  std::vector<column_buffer> decode_data(device_span<char const> data,
-                                         device_span<uint64_t const> row_offsets,
-                                         host_span<data_type const> column_types,
-                                         rmm::cuda_stream_view stream);
-
- private:
-  rmm::mr::device_memory_resource* mr_ = nullptr;
-  std::unique_ptr<datasource> source_;
-  const csv_reader_options opts_;
-
-  cudf::size_type num_records_ = 0;  // Number of rows with actual data
-  int num_active_cols_         = 0;  // Number of columns to read
-  int num_actual_cols_         = 0;  // Number of columns in the dataset
-
-  // Parsing options
-  parse_options opts{};
-  std::vector<column_parse::flags> column_flags_;
-
-  // Intermediate data
-  std::vector<std::string> col_names_;
-  std::vector<char> header_;
-};
-
-}  // namespace csv
-}  // namespace detail
-}  // namespace io
-}  // namespace cudf

From 92033c3dc24fa1fdc99d5fa67e1fa191a7bc7a1f Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Wed, 25 Aug 2021 11:27:16 -0500
Subject: [PATCH 23/26] fix bad merge where changes from #9079 were deleted

---
 cpp/src/io/csv/reader_impl.cu | 124 +---------------------------------
 1 file changed, 2 insertions(+), 122 deletions(-)

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 75dc57750ee..5a78c0454b1 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -31,7 +31,6 @@
 
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/detail/utilities/visitor_overload.hpp>
 #include <cudf/io/csv.hpp>
 #include <cudf/io/datasource.hpp>
 #include <cudf/io/detail/csv.hpp>
@@ -480,26 +479,6 @@ std::pair<rmm::device_uvector<char>, selected_rows_offsets> select_data_and_row_
   return {rmm::device_uvector<char>{0, stream}, selected_rows_offsets{stream}};
 }
 
-std::vector<data_type> get_data_types_from_column_names(
-  std::vector<column_parse::flags> const& column_flags,
-  std::map<std::string, data_type> const& column_type_map,
-  std::vector<std::string> const& column_names,
-  int32_t num_actual_columns)
-{
-  std::vector<data_type> selected_dtypes;
-
-  for (int32_t i = 0; i < num_actual_columns; i++) {
-    if (column_flags[i] & column_parse::enabled) {
-      auto const col_type_it = column_type_map.find(column_names[i]);
-      CUDF_EXPECTS(col_type_it != column_type_map.end(),
-                   "Must specify data types for all active columns");
-      selected_dtypes.emplace_back(col_type_it->second);
-    }
-  }
-
-  return selected_dtypes;
-}
-
 std::vector<data_type> select_data_types(std::vector<column_parse::flags> const& column_flags,
                                          std::vector<data_type> const& dtypes,
                                          int32_t num_actual_columns,
@@ -522,85 +501,6 @@ std::vector<data_type> select_data_types(std::vector<column_parse::flags> const&
   return selected_dtypes;
 }
 
-std::vector<data_type> parse_column_types(std::vector<column_parse::flags>& column_flags,
-                                          std::vector<std::string> const& column_names,
-                                          std::vector<std::string> const& types_as_strings,
-                                          int32_t num_actual_columns,
-                                          int32_t num_active_columns,
-                                          data_type timestamp_type)
-{
-  std::vector<data_type> dtypes;
-
-  bool const is_dict = std::all_of(types_as_strings.begin(),
-                                   types_as_strings.end(),
-                                   [](auto const& s) { return s.find(':') != std::string::npos; });
-
-  if (!is_dict) {
-    if (types_as_strings.size() == 1) {
-      // If it's a single dtype, assign that dtype to all active columns
-      data_type dtype_;
-      column_parse::flags col_flags_;
-      std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]);
-      dtypes.resize(num_active_columns, dtype_);
-      for (int col = 0; col < num_actual_columns; col++) {
-        column_flags[col] |= col_flags_;
-      }
-      CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
-    } else {
-      // If it's a list, assign dtypes to active columns in the given order
-      CUDF_EXPECTS(static_cast<int>(types_as_strings.size()) >= num_actual_columns,
-                   "Must specify data types for all columns");
-
-      auto dtype_ = std::back_inserter(dtypes);
-
-      for (int col = 0; col < num_actual_columns; col++) {
-        if (column_flags[col] & column_parse::enabled) {
-          column_parse::flags col_flags_;
-          std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]);
-          column_flags[col] |= col_flags_;
-          CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
-        }
-      }
-    }
-  } else {
-    // Translate vector of `name : dtype` strings to map
-    // NOTE: Incoming pairs can be out-of-order from column names in dataset
-    std::unordered_map<std::string, std::string> col_type_map;
-    for (const auto& pair : types_as_strings) {
-      const auto pos     = pair.find_last_of(':');
-      const auto name    = pair.substr(0, pos);
-      const auto dtype   = pair.substr(pos + 1, pair.size());
-      col_type_map[name] = dtype;
-    }
-
-    auto dtype_ = std::back_inserter(dtypes);
-
-    for (int col = 0; col < num_actual_columns; col++) {
-      if (column_flags[col] & column_parse::enabled) {
-        CUDF_EXPECTS(col_type_map.find(column_names[col]) != col_type_map.end(),
-                     "Must specify data types for all active columns");
-        column_parse::flags col_flags_;
-        std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[column_names[col]]);
-        column_flags[col] |= col_flags_;
-        CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
-      }
-    }
-  }
-
-  if (timestamp_type.id() != cudf::type_id::EMPTY) {
-    for (auto& type : dtypes) {
-      if (cudf::is_timestamp(type)) { type = timestamp_type; }
-    }
-  }
-
-  for (size_t i = 0; i < dtypes.size(); i++) {
-    // Replace EMPTY dtype with STRING
-    if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; }
-  }
-
-  return dtypes;
-}
-
 std::vector<data_type> infer_column_types(parse_options const& parse_opts,
                                           std::vector<column_parse::flags> const& column_flags,
                                           device_span<char const> data,
@@ -864,28 +764,8 @@ table_with_metadata read_csv(cudf::io::datasource* source,
       reader_opts.get_timestamp_type(),
       stream);
   } else {
-    column_types =
-      std::visit(cudf::detail::visitor_overload{
-                   [&](const std::vector<data_type>& data_types) {
-                     return select_data_types(
-                       column_flags, data_types, num_actual_columns, num_active_columns);
-                   },
-                   [&](const std::map<std::string, data_type>& data_types) {
-                     return get_data_types_from_column_names(  //
-                       column_flags,
-                       data_types,
-                       column_names,
-                       num_actual_columns);
-                   },
-                   [&](const std::vector<string>& dtypes) {
-                     return parse_column_types(column_flags,
-                                               column_names,
-                                               dtypes,
-                                               num_actual_columns,
-                                               num_active_columns,
-                                               reader_opts.get_timestamp_type());
-                   }},
-                 reader_opts.get_dtypes());
+    column_types = std::visit([&](auto const& data_types) { return select_data_types(data_types); },
+                              opts_.get_dtypes());
   }
 
   out_columns.reserve(column_types.size());

From 24b394993d3757682506d9bf15c59672e2ffb3c7 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Wed, 25 Aug 2021 21:39:35 -0500
Subject: [PATCH 24/26] add back read_csv impl function 
 get_data_types_from_column_names

---
 cpp/src/io/csv/reader_impl.cu | 59 +++++++++++++++++++++--------------
 1 file changed, 35 insertions(+), 24 deletions(-)

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 5a78c0454b1..a6e53029043 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -31,6 +31,7 @@
 
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/detail/utilities/visitor_overload.hpp>
 #include <cudf/io/csv.hpp>
 #include <cudf/io/datasource.hpp>
 #include <cudf/io/detail/csv.hpp>
@@ -102,28 +103,6 @@ class selected_rows_offsets {
   auto data() const { return selected.data(); }
 };
 
-/**
- * @brief Translates a dtype string and returns its dtype enumeration and any
- * extended dtype flags that are supported by cuIO. Often, this is a column
- * with the same underlying dtype the basic types, but with different parsing
- * interpretations.
- *
- * @param[in] dtype String containing the basic or extended dtype
- *
- * @return Tuple of data_type and flags
- */
-std::tuple<data_type, column_parse::flags> get_dtype_info(const std::string& dtype)
-{
-  if (dtype == "hex" || dtype == "hex64") {
-    return std::make_tuple(data_type{cudf::type_id::INT64}, column_parse::as_hexadecimal);
-  }
-  if (dtype == "hex32") {
-    return std::make_tuple(data_type{cudf::type_id::INT32}, column_parse::as_hexadecimal);
-  }
-
-  return std::make_tuple(convert_string_to_dtype(dtype), column_parse::as_default);
-}
-
 /**
  * @brief Removes the first and Last quote in the string
  */
@@ -501,6 +480,26 @@ std::vector<data_type> select_data_types(std::vector<column_parse::flags> const&
   return selected_dtypes;
 }
 
+std::vector<data_type> get_data_types_from_column_names(
+  std::vector<column_parse::flags> const& column_flags,
+  std::map<std::string, data_type> const& column_type_map,
+  std::vector<std::string> const& column_names,
+  int32_t num_actual_columns)
+{
+  std::vector<data_type> selected_dtypes;
+
+  for (int32_t i = 0; i < num_actual_columns; i++) {
+    if (column_flags[i] & column_parse::enabled) {
+      auto const col_type_it = column_type_map.find(column_names[i]);
+      CUDF_EXPECTS(col_type_it != column_type_map.end(),
+                   "Must specify data types for all active columns");
+      selected_dtypes.emplace_back(col_type_it->second);
+    }
+  }
+
+  return selected_dtypes;
+}
+
 std::vector<data_type> infer_column_types(parse_options const& parse_opts,
                                           std::vector<column_parse::flags> const& column_flags,
                                           device_span<char const> data,
@@ -764,8 +763,20 @@ table_with_metadata read_csv(cudf::io::datasource* source,
       reader_opts.get_timestamp_type(),
       stream);
   } else {
-    column_types = std::visit([&](auto const& data_types) { return select_data_types(data_types); },
-                              opts_.get_dtypes());
+    column_types =
+      std::visit(cudf::detail::visitor_overload{
+                   [&](const std::vector<data_type>& data_types) {
+                     return select_data_types(
+                       column_flags, data_types, num_actual_columns, num_active_columns);
+                   },
+                   [&](const std::map<std::string, data_type>& data_types) {
+                     return get_data_types_from_column_names(  //
+                       column_flags,
+                       data_types,
+                       column_names,
+                       num_actual_columns);
+                   }},
+                 reader_opts.get_dtypes());
   }
 
   out_columns.reserve(column_types.size());

From 1068510dd2c04e4be70bc505024987d755951d72 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Mon, 15 Nov 2021 14:14:22 -0600
Subject: [PATCH 25/26] adjust copyright year

---
 cpp/include/cudf/io/detail/csv.hpp | 2 +-
 cpp/src/io/csv/durations.hpp       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp
index c52284b96cf..c190340f6c1 100644
--- a/cpp/include/cudf/io/detail/csv.hpp
+++ b/cpp/include/cudf/io/detail/csv.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/io/csv/durations.hpp b/cpp/src/io/csv/durations.hpp
index 126c9542fb4..d42ddf3817c 100644
--- a/cpp/src/io/csv/durations.hpp
+++ b/cpp/src/io/csv/durations.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 4fa0f441569ecb0c8c23e38a13c450981c4bbc10 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Wed, 17 Nov 2021 11:54:07 -0600
Subject: [PATCH 26/26] remove trailing _ in csv writer impl function args

---
 cpp/src/io/csv/writer_impl.cu | 36 +++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 142eae06772..b9b6fc6cf94 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -279,11 +279,11 @@ struct column_to_strings_fn {
 void write_chunked_begin(data_sink* out_sink,
                          table_view const& table,
                          table_metadata const* metadata,
-                         csv_writer_options const& options_,
+                         csv_writer_options const& options,
                          rmm::cuda_stream_view stream,
-                         rmm::mr::device_memory_resource* mr_)
+                         rmm::mr::device_memory_resource* mr)
 {
-  if (options_.is_enabled_include_header()) {
+  if (options.is_enabled_include_header()) {
     // need to generate column names if metadata is not provided
     std::vector<std::string> generated_col_names;
     if (metadata == nullptr) {
@@ -296,8 +296,8 @@ void write_chunked_begin(data_sink* out_sink,
     CUDF_EXPECTS(column_names.size() == static_cast<size_t>(table.num_columns()),
                  "Mismatch between number of column headers and table columns.");
 
-    auto const delimiter  = options_.get_inter_column_delimiter();
-    auto const terminator = options_.get_line_terminator();
+    auto const delimiter  = options.get_inter_column_delimiter();
+    auto const terminator = options.get_line_terminator();
 
     // process header names:
     // - if the header name includes the delimiter or terminator character,
@@ -346,7 +346,7 @@ void write_chunked_begin(data_sink* out_sink,
 void write_chunked(data_sink* out_sink,
                    strings_column_view const& str_column_view,
                    table_metadata const* metadata,
-                   csv_writer_options const& options_,
+                   csv_writer_options const& options,
                    rmm::cuda_stream_view stream,
                    rmm::mr::device_memory_resource* mr)
 {
@@ -363,7 +363,7 @@ void write_chunked(data_sink* out_sink,
 
   CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column.");
 
-  cudf::string_scalar newline{options_.get_line_terminator()};
+  cudf::string_scalar newline{options.get_line_terminator()};
   auto p_str_col_w_nl =
     cudf::strings::detail::join_strings(str_column_view, newline, string_scalar("", false), stream);
   strings_column_view strings_column{p_str_col_w_nl->view()};
@@ -391,26 +391,26 @@ void write_chunked(data_sink* out_sink,
   if (out_sink->is_device_write_preferred(newline.size())) {
     out_sink->device_write(newline.data(), newline.size(), stream);
   } else {
-    out_sink->host_write(options_.get_line_terminator().data(),
-                         options_.get_line_terminator().size());
+    out_sink->host_write(options.get_line_terminator().data(),
+                         options.get_line_terminator().size());
   }
 }
 
 void write_csv(data_sink* out_sink,
                table_view const& table,
                table_metadata const* metadata,
-               csv_writer_options const& options_,
+               csv_writer_options const& options,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr_)
+               rmm::mr::device_memory_resource* mr)
 {
   // write header: column names separated by delimiter:
   // (even for tables with no rows)
   //
-  write_chunked_begin(out_sink, table, metadata, options_, stream, mr_);
+  write_chunked_begin(out_sink, table, metadata, options, stream, mr);
 
   if (table.num_rows() > 0) {
     // no need to check same-size columns constraint; auto-enforced by table_view
-    auto n_rows_per_chunk = options_.get_rows_per_chunk();
+    auto n_rows_per_chunk = options.get_rows_per_chunk();
     //
     // This outputs the CSV in row chunks to save memory.
     // Maybe we can use the total_rows*count calculation and a memory threshold
@@ -440,7 +440,7 @@ void write_csv(data_sink* out_sink,
 
     // convert each chunk to CSV:
     //
-    column_to_strings_fn converter{options_, stream, rmm::mr::get_current_device_resource()};
+    column_to_strings_fn converter{options, stream, rmm::mr::get_current_device_resource()};
     for (auto&& sub_view : vector_views) {
       // Skip if the table has no rows
       if (sub_view.num_rows() == 0) continue;
@@ -463,19 +463,19 @@ void write_csv(data_sink* out_sink,
       // concatenate columns in each row into one big string column
       // (using null representation and delimiter):
       //
-      std::string delimiter_str{options_.get_inter_column_delimiter()};
+      std::string delimiter_str{options.get_inter_column_delimiter()};
       auto str_concat_col = [&] {
         if (str_table_view.num_columns() > 1)
           return cudf::strings::detail::concatenate(str_table_view,
                                                     delimiter_str,
-                                                    options_.get_na_rep(),
+                                                    options.get_na_rep(),
                                                     strings::separator_on_nulls::YES,
                                                     stream);
-        cudf::string_scalar narep{options_.get_na_rep()};
+        cudf::string_scalar narep{options.get_na_rep()};
         return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream);
       }();
 
-      write_chunked(out_sink, str_concat_col->view(), metadata, options_, stream, mr_);
+      write_chunked(out_sink, str_concat_col->view(), metadata, options, stream, mr);
     }
   }
 }