From 9c72e56837ddfb3fb9b3d1111cdd08e1f53595c4 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Aug 2021 05:08:16 -0500 Subject: [PATCH 01/19] simplify io/functions.cpp data source/sink factories --- cpp/src/io/functions.cpp | 126 +++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 59 deletions(-) diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index bf51012211c..e080ea3a2ca 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -106,67 +106,56 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder( } namespace { -template -std::unique_ptr make_reader(source_info const& src_info, - reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (src_info.type == io_type::FILEPATH) { - return std::make_unique(src_info.filepaths, options, stream, mr); - } - std::vector> datasources; - if (src_info.type == io_type::HOST_BUFFER) { - datasources = cudf::io::datasource::create(src_info.buffers); - } else if (src_info.type == io_type::USER_IMPLEMENTED) { - datasources = cudf::io::datasource::create(src_info.user_sources); - } else { - CUDF_FAIL("Unsupported source type"); +std::vector> make_datasources(source_info const& info) +{ + switch (info.type) { + case io_type::FILEPATH: return cudf::io::datasource::create(info.filepaths); + case io_type::HOST_BUFFER: return cudf::io::datasource::create(info.buffers); + case io_type::USER_IMPLEMENTED: return cudf::io::datasource::create(info.user_sources); + default: CUDF_FAIL("Unsupported source type"); } - - return std::make_unique(std::move(datasources), options, stream, mr); } -template -std::unique_ptr make_writer(sink_info const& sink, Ts&&... 
args) +std::unique_ptr make_datasink(sink_info const& info) { - if (sink.type == io_type::FILEPATH) { - return std::make_unique(cudf::io::data_sink::create(sink.filepath), - std::forward(args)...); - } - if (sink.type == io_type::HOST_BUFFER) { - return std::make_unique(cudf::io::data_sink::create(sink.buffer), - std::forward(args)...); + switch (info.type) { + case io_type::FILEPATH: return cudf::io::data_sink::create(info.filepath); + case io_type::HOST_BUFFER: return cudf::io::data_sink::create(info.buffer); + case io_type::VOID: return cudf::io::data_sink::create(); + case io_type::USER_IMPLEMENTED: return cudf::io::data_sink::create(info.user_sink); + default: CUDF_FAIL("Unsupported sink type"); } - if (sink.type == io_type::VOID) { - return std::make_unique(cudf::io::data_sink::create(), std::forward(args)...); - } - if (sink.type == io_type::USER_IMPLEMENTED) { - return std::make_unique(cudf::io::data_sink::create(sink.user_sink), - std::forward(args)...); - } - CUDF_FAIL("Unsupported sink type"); } } // namespace -table_with_metadata read_avro(avro_reader_options const& opts, rmm::mr::device_memory_resource* mr) +table_with_metadata read_avro(avro_reader_options const& options, + rmm::mr::device_memory_resource* mr) { namespace avro = cudf::io::detail::avro; CUDF_FUNC_RANGE(); - auto reader = make_reader(opts.get_source(), opts, rmm::cuda_stream_default, mr); - return reader->read(opts); + + auto datasources = make_datasources(options.get_source()); + auto reader = + std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); + + return reader->read(options); } -table_with_metadata read_json(json_reader_options const& opts, rmm::mr::device_memory_resource* mr) +table_with_metadata read_json(json_reader_options const& options, + rmm::mr::device_memory_resource* mr) { namespace json = cudf::io::detail::json; CUDF_FUNC_RANGE(); - auto reader = make_reader(opts.get_source(), opts, rmm::cuda_stream_default, mr); - return reader->read(opts); 
+ + auto datasources = make_datasources(options.get_source()); + auto reader = + std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); + + return reader->read(options); } table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_memory_resource* mr) @@ -174,8 +163,10 @@ table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_ namespace csv = cudf::io::detail::csv; CUDF_FUNC_RANGE(); + + auto datasources = make_datasources(options.get_source()); auto reader = - make_reader(options.get_source(), options, rmm::cuda_stream_default, mr); + std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(); } @@ -185,7 +176,9 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc { using namespace cudf::io::detail; - auto writer = make_writer(options.get_sink(), options, rmm::cuda_stream_default, mr); + auto sink = make_datasink(options.get_sink()); + auto writer = + std::make_unique(std::move(sink), options, rmm::cuda_stream_default, mr); writer->write(options.get_table(), options.get_metadata()); } @@ -294,8 +287,10 @@ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info) table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto reader = - make_reader(options.get_source(), options, rmm::cuda_stream_default, mr); + + auto datasources = make_datasources(options.get_source()); + auto reader = std::make_unique( + std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(options); } @@ -305,11 +300,13 @@ table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_ */ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resource* mr) { + namespace io_detail = cudf::io::detail; + CUDF_FUNC_RANGE(); - namespace io_detail = cudf::io::detail; - auto writer = make_writer( - 
options.get_sink(), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); + auto sink = make_datasink(options.get_sink()); + auto writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); writer->write(options.get_table()); } @@ -317,12 +314,15 @@ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resourc /** * @copydoc cudf::io::orc_chunked_writer::orc_chunked_writer */ -orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& op, +orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options, rmm::mr::device_memory_resource* mr) { namespace io_detail = cudf::io::detail; - writer = make_writer( - op.get_sink(), op, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); + + auto sink = make_datasink(options.get_sink()); + + writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); } /** @@ -354,8 +354,10 @@ table_with_metadata read_parquet(parquet_reader_options const& options, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto reader = make_reader( - options.get_source(), options, rmm::cuda_stream_default, mr); + + auto datasources = make_datasources(options.get_source()); + auto reader = std::make_unique( + std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(options); } @@ -392,25 +394,31 @@ table_input_metadata::table_input_metadata(table_view const& table, std::unique_ptr> write_parquet(parquet_writer_options const& options, rmm::mr::device_memory_resource* mr) { - CUDF_FUNC_RANGE(); namespace io_detail = cudf::io::detail; - auto writer = make_writer( - options.get_sink(), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); + CUDF_FUNC_RANGE(); + + auto sink = make_datasink(options.get_sink()); + auto writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::YES, 
rmm::cuda_stream_default, mr); writer->write(options.get_table()); + return writer->close(options.get_column_chunks_file_path()); } /** * @copydoc cudf::io::parquet_chunked_writer::parquet_chunked_writer */ -parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& op, +parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& options, rmm::mr::device_memory_resource* mr) { namespace io_detail = cudf::io::detail; - writer = make_writer( - op.get_sink(), op, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); + + auto sink = make_datasink(options.get_sink()); + + writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); } /** From 9e92ca2bf346bf1d6ce5ba351fd4b7a2991edfea Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Aug 2021 06:58:00 -0500 Subject: [PATCH 02/19] begin replacing csv_reader with pure functions --- cpp/include/cudf/io/detail/csv.hpp | 60 +++-------- cpp/src/io/csv/reader_impl.cu | 159 +++++++++++++---------------- cpp/src/io/csv/reader_impl.hpp | 44 ++++---- cpp/src/io/functions.cpp | 12 ++- 4 files changed, 114 insertions(+), 161 deletions(-) diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 89e589d306a..aac44bed50e 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -24,55 +24,21 @@ namespace cudf { namespace io { namespace detail { namespace csv { + /** - * @brief Class to read CSV dataset data into columns. + * @brief Reads the entire dataset. 
+ * + * @param sources Input `datasource` object to read the dataset from + * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + * + * @return The set of columns along with table metadata */ -class reader { - private: - class impl; - std::unique_ptr _impl; - - public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Constructor from an array of datasources - * - * @param sources Input `datasource` objects to read the dataset from - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector>&& sources, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Destructor explicitly-declared to avoid inlined in header - */ - ~reader(); - - /** - * @brief Reads the entire dataset. - * - * @param stream CUDA stream used for device memory operations and kernel launches. 
- * - * @return The set of columns along with table metadata - */ - table_with_metadata read(rmm::cuda_stream_view stream = rmm::cuda_stream_default); -}; +table_with_metadata read_csv(std::unique_ptr&& source, + csv_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); class writer { public: diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 549b0474fe1..81c7fc65d3f 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -196,8 +196,10 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream) container.resize(1, stream); } -std::pair, reader::impl::selected_rows_offsets> -reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) +std::pair, reader_impl::selected_rows_offsets> +reader_impl::select_data_and_row_offsets(cudf::io::datasource* source, + csv_reader_options const& opts_, + rmm::cuda_stream_view stream) { auto range_offset = opts_.get_byte_range_offset(); auto range_size = opts_.get_byte_range_size(); @@ -205,8 +207,11 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) auto skip_end_rows = opts_.get_skipfooter(); auto num_rows = opts_.get_nrows(); + // we use "infer", but really we are just getting the string name of the compression type. 
+ auto compression_type = infer_compression_type(opts_.get_compression(), "", {}); + if (range_offset > 0 || range_size > 0) { - CUDF_EXPECTS(compression_type_ == "none", + CUDF_EXPECTS(compression_type == "none", "Reading compressed data using `byte range` is unsupported"); } size_t map_range_size = 0; @@ -217,17 +222,10 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) map_range_size = range_size + calculateMaxRowSize(num_columns); } - // Support delayed opening of the file if using memory mapping datasource - // This allows only mapping of a subset of the file if using byte range - if (source_ == nullptr) { - assert(!filepath_.empty()); - source_ = datasource::create(filepath_, range_offset, map_range_size); - } - // Transfer source data to GPU - if (!source_->is_empty()) { - auto data_size = (map_range_size != 0) ? map_range_size : source_->size(); - auto buffer = source_->host_read(range_offset, data_size); + if (!source->is_empty()) { + auto data_size = (map_range_size != 0) ? map_range_size : source->size(); + auto buffer = source->host_read(range_offset, data_size); auto h_data = host_span( // reinterpret_cast(buffer->data()), @@ -235,8 +233,8 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) std::vector h_uncomp_data_owner; - if (compression_type_ != "none") { - h_uncomp_data_owner = get_uncompressed_data(h_data, compression_type_); + if (compression_type != "none") { + h_uncomp_data_owner = get_uncompressed_data(h_data, compression_type); h_data = h_uncomp_data_owner; } // None of the parameters for row selection is used, we are parsing the entire file @@ -252,7 +250,8 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) // Gather row offsets auto data_row_offsets = - load_data_and_gather_row_offsets(h_data, + load_data_and_gather_row_offsets(opts_, + h_data, data_start_offset, (range_size) ? range_size : h_data.size(), (skip_rows > 0) ? 
skip_rows : 0, @@ -269,7 +268,7 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; } -std::vector reader::impl::select_data_types( +std::vector reader_impl::select_data_types( std::map const& col_type_map) { std::vector selected_dtypes; @@ -285,7 +284,7 @@ std::vector reader::impl::select_data_types( return selected_dtypes; } -std::vector reader::impl::select_data_types(std::vector const& dtypes) +std::vector reader_impl::select_data_types(std::vector const& dtypes) { std::vector selected_dtypes; @@ -304,9 +303,12 @@ std::vector reader::impl::select_data_types(std::vector co return selected_dtypes; } -table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) +table_with_metadata reader_impl::read(cudf::io::datasource* source, + csv_reader_options const& opts_, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - auto const data_row_offsets = select_data_and_row_offsets(stream); + auto const data_row_offsets = select_data_and_row_offsets(source, opts_, stream); auto const& data = data_row_offsets.first; auto const& row_offsets = data_row_offsets.second; @@ -418,7 +420,7 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) std::vector column_types; if (has_to_infer_column_types) { - column_types = infer_column_types(data, row_offsets, stream); + column_types = infer_column_types(data, row_offsets, opts_.get_timestamp_type(), stream); } else { column_types = std::visit( cudf::detail::visitor_overload{ @@ -426,14 +428,16 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) [&](const std::map& data_types) { return select_data_types(data_types); }, - [&](const std::vector& dtypes) { return parse_column_types(dtypes); }}, + [&](const std::vector& dtypes) { + return parse_column_types(dtypes, opts_.get_timestamp_type()); + }}, opts_.get_dtypes()); } out_columns.reserve(column_types.size()); if (num_records_ 
!= 0) { - auto out_buffers = decode_data(data, row_offsets, column_types, stream); + auto out_buffers = decode_data(data, row_offsets, column_types, stream, mr); for (size_t i = 0; i < column_types.size(); ++i) { metadata.column_names.emplace_back(out_buffers[i].name); if (column_types[i].id() == type_id::STRING && opts.quotechar != '\0' && @@ -446,9 +450,9 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) const std::string dblquotechar(2, opts.quotechar); std::unique_ptr col = cudf::make_strings_column(*out_buffers[i]._strings, stream); out_columns.emplace_back( - cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr_)); + cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr)); } else { - out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, mr_)); + out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, mr)); } } } else { @@ -466,7 +470,7 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) return {std::make_unique(std::move(out_columns)), std::move(metadata)}; } -size_t reader::impl::find_first_row_start(host_span data) +size_t reader_impl::find_first_row_start(host_span data) { // For now, look for the first terminator (assume the first terminator isn't within a quote) // TODO: Attempt to infer this from the data @@ -477,14 +481,15 @@ size_t reader::impl::find_first_row_start(host_span data) return std::min(pos + 1, data.size()); } -std::pair, reader::impl::selected_rows_offsets> -reader::impl::load_data_and_gather_row_offsets(host_span data, - size_t range_begin, - size_t range_end, - size_t skip_rows, - int64_t num_rows, - bool load_whole_file, - rmm::cuda_stream_view stream) +std::pair, reader_impl::selected_rows_offsets> +reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& opts_, + host_span data, + size_t range_begin, + size_t range_end, + size_t skip_rows, + int64_t num_rows, + bool load_whole_file, + rmm::cuda_stream_view stream) { 
constexpr size_t max_chunk_bytes = 64 * 1024 * 1024; // 64MB size_t buffer_size = std::min(max_chunk_bytes, data.size()); @@ -642,9 +647,10 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, return {std::move(d_data), std::move(row_offsets)}; } -std::vector reader::impl::infer_column_types(device_span data, - device_span row_offsets, - rmm::cuda_stream_view stream) +std::vector reader_impl::infer_column_types(device_span data, + device_span row_offsets, + data_type timestamp_type, + rmm::cuda_stream_view stream) { std::vector dtypes; if (num_records_ == 0) { @@ -693,9 +699,9 @@ std::vector reader::impl::infer_column_types(device_span } } - if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) { + if (timestamp_type.id() != cudf::type_id::EMPTY) { for (auto& type : dtypes) { - if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); } + if (cudf::is_timestamp(type)) { type = timestamp_type; } } } @@ -707,8 +713,8 @@ std::vector reader::impl::infer_column_types(device_span return dtypes; } -std::vector reader::impl::parse_column_types( - const std::vector& types_as_strings) +std::vector reader_impl::parse_column_types( + const std::vector& types_as_strings, data_type timestamp_type) { std::vector dtypes; @@ -768,9 +774,9 @@ std::vector reader::impl::parse_column_types( } } - if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) { + if (timestamp_type.id() != cudf::type_id::EMPTY) { for (auto& type : dtypes) { - if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); } + if (cudf::is_timestamp(type)) { type = timestamp_type; } } } @@ -782,10 +788,11 @@ std::vector reader::impl::parse_column_types( return dtypes; } -std::vector reader::impl::decode_data(device_span data, - device_span row_offsets, - host_span column_types, - rmm::cuda_stream_view stream) +std::vector reader_impl::decode_data(device_span data, + device_span row_offsets, + host_span column_types, + rmm::cuda_stream_view stream, + 
rmm::mr::device_memory_resource* mr) { // Alloc output; columns' data memory is still expected for empty dataframe std::vector out_buffers; @@ -799,7 +806,7 @@ std::vector reader::impl::decode_data(device_span dat num_records_, true, stream, - is_final_allocation ? mr_ : rmm::mr::get_current_device_resource()); + is_final_allocation ? mr : rmm::mr::get_current_device_resource()); out_buffer.name = col_names_[col]; out_buffer.null_count() = UNKNOWN_NULL_COUNT; @@ -926,51 +933,31 @@ parse_options make_parse_options(csv_reader_options const& reader_opts, return parse_opts; } -reader::impl::impl(std::unique_ptr source, - std::string filepath, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : mr_(mr), source_(std::move(source)), filepath_(filepath), opts_(options) +reader_impl::reader_impl(parse_options&& parse_options, int32_t num_actual_columns) + : opts(std::move(parse_options)), + num_actual_cols_(num_actual_columns), + num_active_cols_(num_actual_columns) { - num_actual_cols_ = opts_.get_names().size(); - num_active_cols_ = num_actual_cols_; - - compression_type_ = - infer_compression_type(opts_.get_compression(), - filepath, - {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}}); - - opts = make_parse_options(options, stream); } -// Forward to implementation -reader::reader(std::vector const& filepaths, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +table_with_metadata read_csv(std::unique_ptr&& source, + csv_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported."); - // Delay actual instantiation of data source until read to allow for - // partial memory mapping of file using byte ranges - _impl = std::make_unique(nullptr, filepaths[0], options, stream, mr); -} + // get the string name of the 
compression type. + // auto compression_type = infer_compression_type(options.get_compression(), ""); -// Forward to implementation -reader::reader(std::vector>&& sources, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); - _impl = std::make_unique(std::move(sources[0]), "", options, stream, mr); -} + auto parse_options = make_parse_options(options, stream); -// Destructor within this translation unit -reader::~reader() = default; + auto num_actual_columns = options.get_names().size(); -// Forward to implementation -table_with_metadata reader::read(rmm::cuda_stream_view stream) { return _impl->read(stream); } + auto reader = std::make_unique( // + std::move(parse_options), + num_actual_columns); + + return reader->read(source.get(), options, stream, mr); +} } // namespace csv } // namespace detail diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 36c2bf4f9e7..f25beb5e561 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -66,31 +66,29 @@ using namespace cudf::io; * * Stage 4: Convert every row from csv text form to cudf binary form. */ -class reader::impl { +class reader_impl { public: /** * @brief Constructor from a dataset source with reader options. 
* - * @param source Dataset source - * @param filepath Filepath if reading dataset from a file - * @param options Settings for controlling reading behavior * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation */ - explicit impl(std::unique_ptr source, - std::string filepath, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + explicit reader_impl(parse_options&& parse_options, int32_t num_actual_columns); /** * @brief Read an entire set or a subset of data and returns a set of columns. * + * @param source Dataset source + * @param options Settings for controlling reading behavior * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource to use for device memory allocation * * @return The set of columns along with metadata */ - table_with_metadata read(rmm::cuda_stream_view stream); + table_with_metadata read(cudf::io::datasource* source, + csv_reader_options const& opts_, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); private: /** @@ -133,8 +131,10 @@ class reader::impl { * * @param stream CUDA stream used for device memory operations and kernel launches. */ - std::pair, reader::impl::selected_rows_offsets> - select_data_and_row_offsets(rmm::cuda_stream_view stream); + std::pair, reader_impl::selected_rows_offsets> + select_data_and_row_offsets(cudf::io::datasource* source, + csv_reader_options const& opts_, + rmm::cuda_stream_view stream); /** * @brief Finds row positions in the specified input data, and loads the selected data onto GPU. 
@@ -151,8 +151,9 @@ class reader::impl { * @param stream CUDA stream used for device memory operations and kernel launches * @return Input data and row offsets in the device memory */ - std::pair, reader::impl::selected_rows_offsets> - load_data_and_gather_row_offsets(host_span data, + std::pair, reader_impl::selected_rows_offsets> + load_data_and_gather_row_offsets(csv_reader_options const& opts_, + host_span data, size_t range_begin, size_t range_end, size_t skip_rows, @@ -179,6 +180,7 @@ class reader::impl { */ std::vector infer_column_types(device_span data, device_span row_offsets, + data_type timestamp_type, rmm::cuda_stream_view stream); /** @@ -204,7 +206,8 @@ class reader::impl { * types * @return List of columns' data types */ - std::vector parse_column_types(std::vector const& types_as_strings); + std::vector parse_column_types(std::vector const& types_as_strings, + data_type timestamp_type); /** * @brief Converts the row-column data and outputs to column bufferrs. @@ -217,15 +220,10 @@ class reader::impl { std::vector decode_data(device_span data, device_span row_offsets, host_span column_types, - rmm::cuda_stream_view stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); private: - rmm::mr::device_memory_resource* mr_ = nullptr; - std::unique_ptr source_; - std::string filepath_; - std::string compression_type_; - const csv_reader_options opts_; - cudf::size_type num_records_ = 0; // Number of rows with actual data int num_active_cols_ = 0; // Number of columns to read int num_actual_cols_ = 0; // Number of columns in the dataset diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index e080ea3a2ca..f6fa475fdd4 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -160,15 +160,17 @@ table_with_metadata read_json(json_reader_options const& options, table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_memory_resource* mr) { - namespace csv = cudf::io::detail::csv; - 
CUDF_FUNC_RANGE(); auto datasources = make_datasources(options.get_source()); - auto reader = - std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); - return reader->read(); + CUDF_EXPECTS(datasources.size() == 1, "Only a single source is currently supported."); + + return cudf::io::detail::csv::read_csv( // + std::move(datasources[0]), + options, + rmm::cuda_stream_default, + mr); } // Freeform API wraps the detail writer class API From 6492349504a9105ac28b01806b42fcf9ad0a1cb8 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Aug 2021 07:20:01 -0500 Subject: [PATCH 03/19] pass parse_options explicitly in csv_reader --- cpp/src/io/csv/reader_impl.cu | 153 +++++++++++++------------ cpp/src/io/csv/reader_impl.hpp | 24 ++-- cpp/src/io/utilities/parsing_utils.cuh | 2 +- 3 files changed, 96 insertions(+), 83 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 81c7fc65d3f..f06334b7258 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -123,7 +123,7 @@ string removeQuotes(string str, char quotechar) * The first row can be either the header row, or the first data row */ std::vector setColumnNames(std::vector const& header, - parse_options_view const& opts, + parse_options_view const& parse_opts, int header_row, std::string prefix) { @@ -138,35 +138,36 @@ std::vector setColumnNames(std::vector const& header, bool quotation = false; for (size_t pos = 0, prev = 0; pos < first_row.size(); ++pos) { // Flip the quotation flag if current character is a quotechar - if (first_row[pos] == opts.quotechar) { + if (first_row[pos] == parse_opts.quotechar) { quotation = !quotation; } // Check if end of a column/row - else if (pos == first_row.size() - 1 || (!quotation && first_row[pos] == opts.terminator) || - (!quotation && first_row[pos] == opts.delimiter)) { + else if (pos == first_row.size() - 1 || + (!quotation && first_row[pos] == parse_opts.terminator) || + 
(!quotation && first_row[pos] == parse_opts.delimiter)) { // This is the header, add the column name if (header_row >= 0) { // Include the current character, in case the line is not terminated int col_name_len = pos - prev + 1; // Exclude the delimiter/terminator is present - if (first_row[pos] == opts.delimiter || first_row[pos] == opts.terminator) { + if (first_row[pos] == parse_opts.delimiter || first_row[pos] == parse_opts.terminator) { --col_name_len; } // Also exclude '\r' character at the end of the column name if it's // part of the terminator - if (col_name_len > 0 && opts.terminator == '\n' && first_row[pos] == '\n' && + if (col_name_len > 0 && parse_opts.terminator == '\n' && first_row[pos] == '\n' && first_row[pos - 1] == '\r') { --col_name_len; } const string new_col_name(first_row.data() + prev, col_name_len); - col_names.push_back(removeQuotes(new_col_name, opts.quotechar)); + col_names.push_back(removeQuotes(new_col_name, parse_opts.quotechar)); // Stop parsing when we hit the line terminator; relevant when there is // a blank line following the header. 
In this case, first_row includes // multiple line terminators at the end, as the new recStart belongs to // a line that comes after the blank line(s) - if (!quotation && first_row[pos] == opts.terminator) { break; } + if (!quotation && first_row[pos] == parse_opts.terminator) { break; } } else { // This is the first data row, add the automatically generated name col_names.push_back(prefix + std::to_string(num_cols)); @@ -174,8 +175,8 @@ std::vector setColumnNames(std::vector const& header, num_cols++; // Skip adjacent delimiters if delim_whitespace is set - while (opts.multi_delimiter && pos < first_row.size() && first_row[pos] == opts.delimiter && - first_row[pos + 1] == opts.delimiter) { + while (parse_opts.multi_delimiter && pos < first_row.size() && + first_row[pos] == parse_opts.delimiter && first_row[pos + 1] == parse_opts.delimiter) { ++pos; } prev = pos + 1; @@ -198,17 +199,18 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream) std::pair, reader_impl::selected_rows_offsets> reader_impl::select_data_and_row_offsets(cudf::io::datasource* source, - csv_reader_options const& opts_, + csv_reader_options const& reader_opts, + parse_options const& parse_opts, rmm::cuda_stream_view stream) { - auto range_offset = opts_.get_byte_range_offset(); - auto range_size = opts_.get_byte_range_size(); - auto skip_rows = opts_.get_skiprows(); - auto skip_end_rows = opts_.get_skipfooter(); - auto num_rows = opts_.get_nrows(); + auto range_offset = reader_opts.get_byte_range_offset(); + auto range_size = reader_opts.get_byte_range_size(); + auto skip_rows = reader_opts.get_skiprows(); + auto skip_end_rows = reader_opts.get_skipfooter(); + auto num_rows = reader_opts.get_nrows(); // we use "infer", but really we are just getting the string name of the compression type. 
- auto compression_type = infer_compression_type(opts_.get_compression(), "", {}); + auto compression_type = infer_compression_type(reader_opts.get_compression(), "", {}); if (range_offset > 0 || range_size > 0) { CUDF_EXPECTS(compression_type == "none", @@ -217,8 +219,8 @@ reader_impl::select_data_and_row_offsets(cudf::io::datasource* source, size_t map_range_size = 0; if (range_size != 0) { auto num_given_dtypes = - std::visit([](const auto& dtypes) { return dtypes.size(); }, opts_.get_dtypes()); - const auto num_columns = std::max(opts_.get_names().size(), num_given_dtypes); + std::visit([](const auto& dtypes) { return dtypes.size(); }, reader_opts.get_dtypes()); + const auto num_columns = std::max(reader_opts.get_names().size(), num_given_dtypes); map_range_size = range_size + calculateMaxRowSize(num_columns); } @@ -242,15 +244,17 @@ reader_impl::select_data_and_row_offsets(cudf::io::datasource* source, skip_end_rows <= 0 && num_rows == -1; // With byte range, find the start of the first data row - size_t const data_start_offset = (range_offset != 0) ? find_first_row_start(h_data) : 0; + size_t const data_start_offset = + (range_offset != 0) ? find_first_row_start(parse_opts.terminator, h_data) : 0; // TODO: Allow parsing the header outside the mapped range - CUDF_EXPECTS((range_offset == 0 || opts_.get_header() < 0), + CUDF_EXPECTS((range_offset == 0 || reader_opts.get_header() < 0), "byte_range offset with header not supported"); // Gather row offsets auto data_row_offsets = - load_data_and_gather_row_offsets(opts_, + load_data_and_gather_row_offsets(reader_opts, + parse_opts, h_data, data_start_offset, (range_size) ? 
range_size : h_data.size(), @@ -304,23 +308,26 @@ std::vector reader_impl::select_data_types(std::vector con } table_with_metadata reader_impl::read(cudf::io::datasource* source, - csv_reader_options const& opts_, + csv_reader_options const& reader_opts, + parse_options const& parse_opts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const data_row_offsets = select_data_and_row_offsets(source, opts_, stream); - auto const& data = data_row_offsets.first; - auto const& row_offsets = data_row_offsets.second; + auto const data_row_offsets = + select_data_and_row_offsets(source, reader_opts, parse_opts, stream); + auto const& data = data_row_offsets.first; + auto const& row_offsets = data_row_offsets.second; // Exclude the end-of-data row from number of rows with actual data num_records_ = std::max(row_offsets.size(), 1ul) - 1; // Check if the user gave us a list of column names - if (not opts_.get_names().empty()) { - column_flags_.resize(opts_.get_names().size(), column_parse::enabled); - col_names_ = opts_.get_names(); + if (not reader_opts.get_names().empty()) { + column_flags_.resize(reader_opts.get_names().size(), column_parse::enabled); + col_names_ = reader_opts.get_names(); } else { - col_names_ = setColumnNames(header_, opts.view(), opts_.get_header(), opts_.get_prefix()); + col_names_ = setColumnNames( + header_, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix()); num_actual_cols_ = num_active_cols_ = col_names_.size(); @@ -339,7 +346,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, // Operator [] inserts a default-initialized value if the given key is not // present if (++col_names_histogram[col_name] > 1) { - if (opts_.is_enabled_mangle_dupe_cols()) { + if (reader_opts.is_enabled_mangle_dupe_cols()) { // Rename duplicates of column X as X.1, X.2, ...; First appearance // stays as X do { @@ -355,21 +362,23 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, // Update 
the number of columns to be processed, if some might have been // removed - if (!opts_.is_enabled_mangle_dupe_cols()) { num_active_cols_ = col_names_histogram.size(); } + if (!reader_opts.is_enabled_mangle_dupe_cols()) { + num_active_cols_ = col_names_histogram.size(); + } } // User can specify which columns should be parsed - if (!opts_.get_use_cols_indexes().empty() || !opts_.get_use_cols_names().empty()) { + if (!reader_opts.get_use_cols_indexes().empty() || !reader_opts.get_use_cols_names().empty()) { std::fill(column_flags_.begin(), column_flags_.end(), column_parse::disabled); - for (const auto index : opts_.get_use_cols_indexes()) { + for (const auto index : reader_opts.get_use_cols_indexes()) { column_flags_[index] = column_parse::enabled; } - num_active_cols_ = std::unordered_set(opts_.get_use_cols_indexes().begin(), - opts_.get_use_cols_indexes().end()) + num_active_cols_ = std::unordered_set(reader_opts.get_use_cols_indexes().begin(), + reader_opts.get_use_cols_indexes().end()) .size(); - for (const auto& name : opts_.get_use_cols_names()) { + for (const auto& name : reader_opts.get_use_cols_names()) { const auto it = std::find(col_names_.begin(), col_names_.end(), name); if (it != col_names_.end()) { auto curr_it = it - col_names_.begin(); @@ -382,12 +391,13 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, } // User can specify which columns should be read as datetime - if (!opts_.get_parse_dates_indexes().empty() || !opts_.get_parse_dates_names().empty()) { - for (const auto index : opts_.get_parse_dates_indexes()) { + if (!reader_opts.get_parse_dates_indexes().empty() || + !reader_opts.get_parse_dates_names().empty()) { + for (const auto index : reader_opts.get_parse_dates_indexes()) { column_flags_[index] |= column_parse::as_datetime; } - for (const auto& name : opts_.get_parse_dates_names()) { + for (const auto& name : reader_opts.get_parse_dates_names()) { auto it = std::find(col_names_.begin(), col_names_.end(), name); if (it 
!= col_names_.end()) { column_flags_[it - col_names_.begin()] |= column_parse::as_datetime; @@ -396,12 +406,12 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, } // User can specify which columns should be parsed as hexadecimal - if (!opts_.get_parse_hex_indexes().empty() || !opts_.get_parse_hex_names().empty()) { - for (const auto index : opts_.get_parse_hex_indexes()) { + if (!reader_opts.get_parse_hex_indexes().empty() || !reader_opts.get_parse_hex_names().empty()) { + for (const auto index : reader_opts.get_parse_hex_indexes()) { column_flags_[index] |= column_parse::as_hexadecimal; } - for (const auto& name : opts_.get_parse_hex_names()) { + for (const auto& name : reader_opts.get_parse_hex_names()) { auto it = std::find(col_names_.begin(), col_names_.end(), name); if (it != col_names_.end()) { column_flags_[it - col_names_.begin()] |= column_parse::as_hexadecimal; @@ -416,11 +426,12 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, auto out_columns = std::vector>(); bool has_to_infer_column_types = - std::visit([](const auto& dtypes) { return dtypes.empty(); }, opts_.get_dtypes()); + std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); std::vector column_types; if (has_to_infer_column_types) { - column_types = infer_column_types(data, row_offsets, opts_.get_timestamp_type(), stream); + column_types = + infer_column_types(parse_opts, data, row_offsets, reader_opts.get_timestamp_type(), stream); } else { column_types = std::visit( cudf::detail::visitor_overload{ @@ -429,25 +440,25 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, return select_data_types(data_types); }, [&](const std::vector& dtypes) { - return parse_column_types(dtypes, opts_.get_timestamp_type()); + return parse_column_types(dtypes, reader_opts.get_timestamp_type()); }}, - opts_.get_dtypes()); + reader_opts.get_dtypes()); } out_columns.reserve(column_types.size()); if (num_records_ != 0) { - 
auto out_buffers = decode_data(data, row_offsets, column_types, stream, mr); + auto out_buffers = decode_data(parse_opts, data, row_offsets, column_types, stream, mr); for (size_t i = 0; i < column_types.size(); ++i) { metadata.column_names.emplace_back(out_buffers[i].name); - if (column_types[i].id() == type_id::STRING && opts.quotechar != '\0' && - opts.doublequote == true) { + if (column_types[i].id() == type_id::STRING && parse_opts.quotechar != '\0' && + parse_opts.doublequote == true) { // PANDAS' default behavior of enabling doublequote for two consecutive // quotechars in quoted fields results in reduction to a single quotechar // TODO: Would be much more efficient to perform this operation in-place // during the conversion stage - const std::string quotechar(1, opts.quotechar); - const std::string dblquotechar(2, opts.quotechar); + const std::string quotechar(1, parse_opts.quotechar); + const std::string dblquotechar(2, parse_opts.quotechar); std::unique_ptr col = cudf::make_strings_column(*out_buffers[i]._strings, stream); out_columns.emplace_back( cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr)); @@ -470,19 +481,20 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, return {std::make_unique
(std::move(out_columns)), std::move(metadata)}; } -size_t reader_impl::find_first_row_start(host_span data) +size_t reader_impl::find_first_row_start(char row_terminator, host_span data) { // For now, look for the first terminator (assume the first terminator isn't within a quote) // TODO: Attempt to infer this from the data size_t pos = 0; - while (pos < data.size() && data[pos] != opts.terminator) { + while (pos < data.size() && data[pos] != row_terminator) { ++pos; } return std::min(pos + 1, data.size()); } std::pair, reader_impl::selected_rows_offsets> -reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& opts_, +reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& reader_opts, + parse_options const& parse_opts, host_span data, size_t range_begin, size_t range_end, @@ -498,7 +510,7 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& opts_, hostdevice_vector row_ctx(max_blocks); size_t buffer_pos = std::min(range_begin - std::min(range_begin, sizeof(char)), data.size()); size_t pos = std::min(range_begin, data.size()); - size_t header_rows = (opts_.get_header() >= 0) ? opts_.get_header() + 1 : 0; + size_t header_rows = (reader_opts.get_header() >= 0) ? reader_opts.get_header() + 1 : 0; uint64_t ctx = 0; // For compatibility with the previous parser, a row is considered in-range if the @@ -524,7 +536,7 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& opts_, // Pass 1: Count the potential number of rows in each character block for each // possible parser state at the beginning of the block. 
- uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(opts.view(), + uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), row_ctx.device_ptr(), device_span(), d_data, @@ -563,7 +575,7 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& opts_, stream.value())); // Pass 2: Output row offsets - cudf::io::csv::gpu::gather_row_offsets(opts.view(), + cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), row_ctx.device_ptr(), all_row_offsets, d_data, @@ -600,8 +612,8 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& opts_, // num_rows does not include blank rows if (num_rows >= 0) { if (all_row_offsets.size() > header_rows + static_cast(num_rows)) { - size_t num_blanks = - cudf::io::csv::gpu::count_blank_rows(opts.view(), d_data, all_row_offsets, stream); + size_t num_blanks = cudf::io::csv::gpu::count_blank_rows( + parse_opts.view(), d_data, all_row_offsets, stream); if (all_row_offsets.size() - num_blanks > header_rows + static_cast(num_rows)) { // Got the desired number of rows break; @@ -620,7 +632,7 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& opts_, } while (pos < data.size()); auto const non_blank_row_offsets = - io::csv::gpu::remove_blank_rows(opts.view(), d_data, all_row_offsets, stream); + io::csv::gpu::remove_blank_rows(parse_opts.view(), d_data, all_row_offsets, stream); auto row_offsets = selected_rows_offsets{std::move(all_row_offsets), non_blank_row_offsets}; // Remove header rows and extract header @@ -647,7 +659,8 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& opts_, return {std::move(d_data), std::move(row_offsets)}; } -std::vector reader_impl::infer_column_types(device_span data, +std::vector reader_impl::infer_column_types(parse_options const& parse_opts, + device_span data, device_span row_offsets, data_type timestamp_type, rmm::cuda_stream_view stream) @@ -657,7 +670,7 @@ std::vector 
reader_impl::infer_column_types(device_span d dtypes.resize(num_active_cols_, data_type{type_id::EMPTY}); } else { auto column_stats = - cudf::io::csv::gpu::detect_column_types(opts.view(), + cudf::io::csv::gpu::detect_column_types(parse_opts.view(), data, make_device_uvector_async(column_flags_, stream), row_offsets, @@ -788,7 +801,8 @@ std::vector reader_impl::parse_column_types( return dtypes; } -std::vector reader_impl::decode_data(device_span data, +std::vector reader_impl::decode_data(parse_options const& parse_opts, + device_span data, device_span row_offsets, host_span column_types, rmm::cuda_stream_view stream, @@ -823,7 +837,7 @@ std::vector reader_impl::decode_data(device_span data h_valid[i] = out_buffers[i].null_mask(); } - cudf::io::csv::gpu::decode_row_column_data(opts.view(), + cudf::io::csv::gpu::decode_row_column_data(parse_opts.view(), data, make_device_uvector_async(column_flags_, stream), row_offsets, @@ -933,10 +947,8 @@ parse_options make_parse_options(csv_reader_options const& reader_opts, return parse_opts; } -reader_impl::reader_impl(parse_options&& parse_options, int32_t num_actual_columns) - : opts(std::move(parse_options)), - num_actual_cols_(num_actual_columns), - num_active_cols_(num_actual_columns) +reader_impl::reader_impl(int32_t num_actual_columns) + : num_actual_cols_(num_actual_columns), num_active_cols_(num_actual_columns) { } @@ -953,10 +965,9 @@ table_with_metadata read_csv(std::unique_ptr&& source, auto num_actual_columns = options.get_names().size(); auto reader = std::make_unique( // - std::move(parse_options), num_actual_columns); - return reader->read(source.get(), options, stream, mr); + return reader->read(source.get(), options, parse_options, stream, mr); } } // namespace csv diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index f25beb5e561..1c0923b03b3 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -71,22 +71,22 @@ class reader_impl { /** * @brief 
Constructor from a dataset source with reader options. * - * @param stream CUDA stream used for device memory operations and kernel launches */ - explicit reader_impl(parse_options&& parse_options, int32_t num_actual_columns); + explicit reader_impl(int32_t num_actual_columns); /** * @brief Read an entire set or a subset of data and returns a set of columns. * * @param source Dataset source * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation * * @return The set of columns along with metadata */ table_with_metadata read(cudf::io::datasource* source, - csv_reader_options const& opts_, + csv_reader_options const& reader_opts, + parse_options const& parse_opts, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); @@ -133,7 +133,8 @@ class reader_impl { */ std::pair, reader_impl::selected_rows_offsets> select_data_and_row_offsets(cudf::io::datasource* source, - csv_reader_options const& opts_, + csv_reader_options const& reader_opts, + parse_options const& parse_opts, rmm::cuda_stream_view stream); /** @@ -152,7 +153,8 @@ class reader_impl { * @return Input data and row offsets in the device memory */ std::pair, reader_impl::selected_rows_offsets> - load_data_and_gather_row_offsets(csv_reader_options const& opts_, + load_data_and_gather_row_offsets(csv_reader_options const& reader_opts, + parse_options const& parse_opts, host_span data, size_t range_begin, size_t range_end, @@ -168,7 +170,7 @@ class reader_impl { * * @return Byte position of the first row */ - size_t find_first_row_start(host_span data); + size_t find_first_row_start(char row_terminator, host_span data); /** * @brief Automatically infers each column's data type based on the CSV's data within that column. 
@@ -178,7 +180,8 @@ class reader_impl { * @param stream The stream to which the type inference-kernel will be dispatched * @return The columns' inferred data types */ - std::vector infer_column_types(device_span data, + std::vector infer_column_types(parse_options const& parse_opts, + device_span data, device_span row_offsets, data_type timestamp_type, rmm::cuda_stream_view stream); @@ -217,7 +220,8 @@ class reader_impl { * * @return list of column buffers of decoded data, or ptr/size in the case of strings. */ - std::vector decode_data(device_span data, + std::vector decode_data(parse_options const& parse_opts, + device_span data, device_span row_offsets, host_span column_types, rmm::cuda_stream_view stream, @@ -228,8 +232,6 @@ class reader_impl { int num_active_cols_ = 0; // Number of columns to read int num_actual_cols_ = 0; // Number of columns in the dataset - // Parsing options - parse_options opts{}; std::vector column_flags_; // Intermediate data diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 88297423b9b..39857478c1f 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -68,7 +68,7 @@ struct parse_options { cudf::detail::optional_trie trie_na; bool multi_delimiter; - parse_options_view view() + parse_options_view view() const { return {delimiter, terminator, From 3e365b59417f82eb1416e41d033bdf9e16917cf0 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Aug 2021 07:34:28 -0500 Subject: [PATCH 04/19] replace csv reader impl::select_data_types with pure function --- cpp/src/io/csv/reader_impl.cu | 39 ++++++++++++++++++++++------------ cpp/src/io/csv/reader_impl.hpp | 16 -------------- 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index f06334b7258..292062b0c59 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -272,36 +272,43 @@ 
reader_impl::select_data_and_row_offsets(cudf::io::datasource* source, return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; } -std::vector reader_impl::select_data_types( - std::map const& col_type_map) +std::vector get_data_types_from_column_names( + std::map const& column_type_map, + std::vector const& column_flags, + std::vector const& column_names, + int32_t num_actual_columns) { std::vector selected_dtypes; - for (int col = 0; col < num_actual_cols_; col++) { - if (column_flags_[col] & column_parse::enabled) { - auto const col_type_it = col_type_map.find(col_names_[col]); - CUDF_EXPECTS(col_type_it != col_type_map.end(), + for (int32_t i = 0; i < num_actual_columns; i++) { + if (column_flags[i] & column_parse::enabled) { + auto const col_type_it = column_type_map.find(column_names[i]); + CUDF_EXPECTS(col_type_it != column_type_map.end(), "Must specify data types for all active columns"); selected_dtypes.emplace_back(col_type_it->second); } } + return selected_dtypes; } -std::vector reader_impl::select_data_types(std::vector const& dtypes) +std::vector select_data_types(std::vector const& dtypes, + std::vector const& column_flags, + int32_t num_actual_columns, + int32_t num_active_columns) { std::vector selected_dtypes; if (dtypes.size() == 1) { // If it's a single dtype, assign that dtype to all active columns - selected_dtypes.resize(num_active_cols_, dtypes.front()); + selected_dtypes.resize(num_active_columns, dtypes.front()); } else { // If it's a list, assign dtypes to active columns in the given order - CUDF_EXPECTS(static_cast(dtypes.size()) >= num_actual_cols_, + CUDF_EXPECTS(static_cast(dtypes.size()) >= num_actual_columns, "Must specify data types for all columns"); - for (int col = 0; col < num_actual_cols_; col++) { - if (column_flags_[col] & column_parse::enabled) { selected_dtypes.emplace_back(dtypes[col]); } + for (int i = 0; i < num_actual_columns; i++) { + if (column_flags[i] & column_parse::enabled) { 
selected_dtypes.emplace_back(dtypes[i]); } } } return selected_dtypes; @@ -435,9 +442,15 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, } else { column_types = std::visit( cudf::detail::visitor_overload{ - [&](const std::vector& data_types) { return select_data_types(data_types); }, + [&](const std::vector& data_types) { + return select_data_types(data_types, column_flags_, num_actual_cols_, num_active_cols_); + }, [&](const std::map& data_types) { - return select_data_types(data_types); + return get_data_types_from_column_names( // + data_types, + column_flags_, + col_names_, + num_actual_cols_); }, [&](const std::vector& dtypes) { return parse_column_types(dtypes, reader_opts.get_timestamp_type()); diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 1c0923b03b3..1dc74759694 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -186,22 +186,6 @@ class reader_impl { data_type timestamp_type, rmm::cuda_stream_view stream); - /** - * @brief Selects the columns' data types from the map of dtypes. - * - * @param col_type_map Column name -> data type map specifying the columns' target data types - * @return Sorted list of selected columns' data types - */ - std::vector select_data_types(std::map const& col_type_map); - - /** - * @brief Selects the columns' data types from the list of dtypes. - * - * @param dtypes Vector of data types specifying the columns' target data types - * @return Sorted list of selected columns' data types - */ - std::vector select_data_types(std::vector const& dtypes); - /** * @brief Parses the columns' data types from the vector of dtypes that are provided as strings. 
* From a4497c0c2e9f72a2ca6195a8988489dbd3352494 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Aug 2021 07:57:20 -0500 Subject: [PATCH 05/19] replace csv reader impl::column_flags_ member with local variable --- cpp/src/io/csv/reader_impl.cu | 97 +++++++++++++++++++--------------- cpp/src/io/csv/reader_impl.hpp | 7 +-- 2 files changed, 57 insertions(+), 47 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 292062b0c59..15795807d65 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -273,8 +273,8 @@ reader_impl::select_data_and_row_offsets(cudf::io::datasource* source, } std::vector get_data_types_from_column_names( - std::map const& column_type_map, std::vector const& column_flags, + std::map const& column_type_map, std::vector const& column_names, int32_t num_actual_columns) { @@ -292,8 +292,8 @@ std::vector get_data_types_from_column_names( return selected_dtypes; } -std::vector select_data_types(std::vector const& dtypes, - std::vector const& column_flags, +std::vector select_data_types(std::vector const& column_flags, + std::vector const& dtypes, int32_t num_actual_columns, int32_t num_active_columns) { @@ -328,9 +328,11 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, // Exclude the end-of-data row from number of rows with actual data num_records_ = std::max(row_offsets.size(), 1ul) - 1; + auto column_flags = std::vector(); + // Check if the user gave us a list of column names if (not reader_opts.get_names().empty()) { - column_flags_.resize(reader_opts.get_names().size(), column_parse::enabled); + column_flags.resize(reader_opts.get_names().size(), column_parse::enabled); col_names_ = reader_opts.get_names(); } else { col_names_ = setColumnNames( @@ -338,7 +340,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, num_actual_cols_ = num_active_cols_ = col_names_.size(); - column_flags_.resize(num_actual_cols_, 
column_parse::enabled); + column_flags.resize(num_actual_cols_, column_parse::enabled); // Rename empty column names to "Unnamed: col_index" for (size_t col_idx = 0; col_idx < col_names_.size(); ++col_idx) { @@ -361,8 +363,8 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, } while (col_names_histogram[col_name]++); } else { // All duplicate columns will be ignored; First appearance is parsed - const auto idx = &col_name - col_names_.data(); - column_flags_[idx] = column_parse::disabled; + const auto idx = &col_name - col_names_.data(); + column_flags[idx] = column_parse::disabled; } } } @@ -376,10 +378,10 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, // User can specify which columns should be parsed if (!reader_opts.get_use_cols_indexes().empty() || !reader_opts.get_use_cols_names().empty()) { - std::fill(column_flags_.begin(), column_flags_.end(), column_parse::disabled); + std::fill(column_flags.begin(), column_flags.end(), column_parse::disabled); for (const auto index : reader_opts.get_use_cols_indexes()) { - column_flags_[index] = column_parse::enabled; + column_flags[index] = column_parse::enabled; } num_active_cols_ = std::unordered_set(reader_opts.get_use_cols_indexes().begin(), reader_opts.get_use_cols_indexes().end()) @@ -389,8 +391,8 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, const auto it = std::find(col_names_.begin(), col_names_.end(), name); if (it != col_names_.end()) { auto curr_it = it - col_names_.begin(); - if (column_flags_[curr_it] == column_parse::disabled) { - column_flags_[curr_it] = column_parse::enabled; + if (column_flags[curr_it] == column_parse::disabled) { + column_flags[curr_it] = column_parse::enabled; num_active_cols_++; } } @@ -401,13 +403,13 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, if (!reader_opts.get_parse_dates_indexes().empty() || !reader_opts.get_parse_dates_names().empty()) { for (const auto index : 
reader_opts.get_parse_dates_indexes()) { - column_flags_[index] |= column_parse::as_datetime; + column_flags[index] |= column_parse::as_datetime; } for (const auto& name : reader_opts.get_parse_dates_names()) { auto it = std::find(col_names_.begin(), col_names_.end(), name); if (it != col_names_.end()) { - column_flags_[it - col_names_.begin()] |= column_parse::as_datetime; + column_flags[it - col_names_.begin()] |= column_parse::as_datetime; } } } @@ -415,13 +417,13 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, // User can specify which columns should be parsed as hexadecimal if (!reader_opts.get_parse_hex_indexes().empty() || !reader_opts.get_parse_hex_names().empty()) { for (const auto index : reader_opts.get_parse_hex_indexes()) { - column_flags_[index] |= column_parse::as_hexadecimal; + column_flags[index] |= column_parse::as_hexadecimal; } for (const auto& name : reader_opts.get_parse_hex_names()) { auto it = std::find(col_names_.begin(), col_names_.end(), name); if (it != col_names_.end()) { - column_flags_[it - col_names_.begin()] |= column_parse::as_hexadecimal; + column_flags[it - col_names_.begin()] |= column_parse::as_hexadecimal; } } } @@ -437,23 +439,23 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, std::vector column_types; if (has_to_infer_column_types) { - column_types = - infer_column_types(parse_opts, data, row_offsets, reader_opts.get_timestamp_type(), stream); + column_types = infer_column_types( + parse_opts, column_flags, data, row_offsets, reader_opts.get_timestamp_type(), stream); } else { column_types = std::visit( cudf::detail::visitor_overload{ [&](const std::vector& data_types) { - return select_data_types(data_types, column_flags_, num_actual_cols_, num_active_cols_); + return select_data_types(column_flags, data_types, num_actual_cols_, num_active_cols_); }, [&](const std::map& data_types) { return get_data_types_from_column_names( // + column_flags, data_types, - column_flags_, 
col_names_, num_actual_cols_); }, [&](const std::vector& dtypes) { - return parse_column_types(dtypes, reader_opts.get_timestamp_type()); + return parse_column_types(column_flags, dtypes, reader_opts.get_timestamp_type()); }}, reader_opts.get_dtypes()); } @@ -461,7 +463,8 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, out_columns.reserve(column_types.size()); if (num_records_ != 0) { - auto out_buffers = decode_data(parse_opts, data, row_offsets, column_types, stream, mr); + auto out_buffers = + decode_data(parse_opts, column_flags, data, row_offsets, column_types, stream, mr); for (size_t i = 0; i < column_types.size(); ++i) { metadata.column_names.emplace_back(out_buffers[i].name); if (column_types[i].id() == type_id::STRING && parse_opts.quotechar != '\0' && @@ -486,7 +489,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, } // Handle empty metadata for (int col = 0; col < num_actual_cols_; ++col) { - if (column_flags_[col] & column_parse::enabled) { + if (column_flags[col] & column_parse::enabled) { metadata.column_names.emplace_back(col_names_[col]); } } @@ -672,11 +675,13 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& reader_o return {std::move(d_data), std::move(row_offsets)}; } -std::vector reader_impl::infer_column_types(parse_options const& parse_opts, - device_span data, - device_span row_offsets, - data_type timestamp_type, - rmm::cuda_stream_view stream) +std::vector reader_impl::infer_column_types( + parse_options const& parse_opts, + std::vector const& column_flags, + device_span data, + device_span row_offsets, + data_type timestamp_type, + rmm::cuda_stream_view stream) { std::vector dtypes; if (num_records_ == 0) { @@ -685,7 +690,7 @@ std::vector reader_impl::infer_column_types(parse_options const& pars auto column_stats = cudf::io::csv::gpu::detect_column_types(parse_opts.view(), data, - make_device_uvector_async(column_flags_, stream), + 
make_device_uvector_async(column_flags, stream), row_offsets, num_active_cols_, stream); @@ -740,13 +745,15 @@ std::vector reader_impl::infer_column_types(parse_options const& pars } std::vector reader_impl::parse_column_types( - const std::vector& types_as_strings, data_type timestamp_type) + std::vector& column_flags, + std::vector const& types_as_strings, + data_type timestamp_type) { std::vector dtypes; - const bool is_dict = std::all_of(types_as_strings.begin(), + bool const is_dict = std::all_of(types_as_strings.begin(), types_as_strings.end(), - [](const auto& s) { return s.find(':') != std::string::npos; }); + [](auto const& s) { return s.find(':') != std::string::npos; }); if (!is_dict) { if (types_as_strings.size() == 1) { @@ -756,7 +763,7 @@ std::vector reader_impl::parse_column_types( std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]); dtypes.resize(num_active_cols_, dtype_); for (int col = 0; col < num_actual_cols_; col++) { - column_flags_[col] |= col_flags_; + column_flags[col] |= col_flags_; } CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); } else { @@ -767,10 +774,10 @@ std::vector reader_impl::parse_column_types( auto dtype_ = std::back_inserter(dtypes); for (int col = 0; col < num_actual_cols_; col++) { - if (column_flags_[col] & column_parse::enabled) { + if (column_flags[col] & column_parse::enabled) { column_parse::flags col_flags_; std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]); - column_flags_[col] |= col_flags_; + column_flags[col] |= col_flags_; CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); } } @@ -789,12 +796,12 @@ std::vector reader_impl::parse_column_types( auto dtype_ = std::back_inserter(dtypes); for (int col = 0; col < num_actual_cols_; col++) { - if (column_flags_[col] & column_parse::enabled) { + if (column_flags[col] & column_parse::enabled) { CUDF_EXPECTS(col_type_map.find(col_names_[col]) != col_type_map.end(), "Must 
specify data types for all active columns"); column_parse::flags col_flags_; std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[col_names_[col]]); - column_flags_[col] |= col_flags_; + column_flags[col] |= col_flags_; CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); } } @@ -814,19 +821,21 @@ std::vector reader_impl::parse_column_types( return dtypes; } -std::vector reader_impl::decode_data(parse_options const& parse_opts, - device_span data, - device_span row_offsets, - host_span column_types, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::vector reader_impl::decode_data( + parse_options const& parse_opts, + std::vector const& column_flags, + device_span data, + device_span row_offsets, + host_span column_types, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Alloc output; columns' data memory is still expected for empty dataframe std::vector out_buffers; out_buffers.reserve(column_types.size()); for (int col = 0, active_col = 0; col < num_actual_cols_; ++col) { - if (column_flags_[col] & column_parse::enabled) { + if (column_flags[col] & column_parse::enabled) { const bool is_final_allocation = column_types[active_col].id() != type_id::STRING; auto out_buffer = column_buffer(column_types[active_col], @@ -852,7 +861,7 @@ std::vector reader_impl::decode_data(parse_options const& parse_o cudf::io::csv::gpu::decode_row_column_data(parse_opts.view(), data, - make_device_uvector_async(column_flags_, stream), + make_device_uvector_async(column_flags, stream), row_offsets, make_device_uvector_async(column_types, stream), make_device_uvector_async(h_data, stream), diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 1dc74759694..f2681cd2cf7 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -181,6 +181,7 @@ class reader_impl { * @return The columns' inferred data types */ std::vector 
infer_column_types(parse_options const& parse_opts, + std::vector const& column_flags, device_span data, device_span row_offsets, data_type timestamp_type, @@ -193,7 +194,8 @@ class reader_impl { * types * @return List of columns' data types */ - std::vector parse_column_types(std::vector const& types_as_strings, + std::vector parse_column_types(std::vector& column_flags, + std::vector const& types_as_strings, data_type timestamp_type); /** @@ -205,6 +207,7 @@ class reader_impl { * @return list of column buffers of decoded data, or ptr/size in the case of strings. */ std::vector decode_data(parse_options const& parse_opts, + std::vector const& column_flags, device_span data, device_span row_offsets, host_span column_types, @@ -216,8 +219,6 @@ class reader_impl { int num_active_cols_ = 0; // Number of columns to read int num_actual_cols_ = 0; // Number of columns in the dataset - std::vector column_flags_; - // Intermediate data std::vector col_names_; std::vector header_; From 6d708b75895d3e8da956cbdbb39ae3ca21b392f8 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Aug 2021 08:00:50 -0500 Subject: [PATCH 06/19] make csv reader impl::find_first_row_start a standalone function --- cpp/src/io/csv/reader_impl.cu | 22 +++++++++++----------- cpp/src/io/csv/reader_impl.hpp | 9 --------- 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 15795807d65..d3ebfb6fcbb 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -197,6 +197,17 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream) container.resize(1, stream); } +size_t find_first_row_start(char row_terminator, host_span data) +{ + // For now, look for the first terminator (assume the first terminator isn't within a quote) + // TODO: Attempt to infer this from the data + size_t pos = 0; + while (pos < data.size() && data[pos] != row_terminator) { + ++pos; + } + return std::min(pos + 
1, data.size()); +} + std::pair, reader_impl::selected_rows_offsets> reader_impl::select_data_and_row_offsets(cudf::io::datasource* source, csv_reader_options const& reader_opts, @@ -497,17 +508,6 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, return {std::make_unique
(std::move(out_columns)), std::move(metadata)}; } -size_t reader_impl::find_first_row_start(char row_terminator, host_span data) -{ - // For now, look for the first terminator (assume the first terminator isn't within a quote) - // TODO: Attempt to infer this from the data - size_t pos = 0; - while (pos < data.size() && data[pos] != row_terminator) { - ++pos; - } - return std::min(pos + 1, data.size()); -} - std::pair, reader_impl::selected_rows_offsets> reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& reader_opts, parse_options const& parse_opts, diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index f2681cd2cf7..db37d7daea8 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -163,15 +163,6 @@ class reader_impl { bool load_whole_file, rmm::cuda_stream_view stream); - /** - * @brief Find the start position of the first data row - * - * @param h_data Uncompressed input data in host memory - * - * @return Byte position of the first row - */ - size_t find_first_row_start(char row_terminator, host_span data); - /** * @brief Automatically infers each column's data type based on the CSV's data within that column. 
* From 26e37e2ea198881f4494bff2bb7602137e75b601 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Aug 2021 08:08:30 -0500 Subject: [PATCH 07/19] make csv reader impl:col_names_ a local variable --- cpp/src/io/csv/reader_impl.cu | 54 ++++++++++++++++++---------------- cpp/src/io/csv/reader_impl.hpp | 4 ++- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index d3ebfb6fcbb..80f91f985ea 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -340,29 +340,30 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, num_records_ = std::max(row_offsets.size(), 1ul) - 1; auto column_flags = std::vector(); + auto column_names = std::vector(); // Check if the user gave us a list of column names if (not reader_opts.get_names().empty()) { column_flags.resize(reader_opts.get_names().size(), column_parse::enabled); - col_names_ = reader_opts.get_names(); + column_names = reader_opts.get_names(); } else { - col_names_ = setColumnNames( + column_names = setColumnNames( header_, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix()); - num_actual_cols_ = num_active_cols_ = col_names_.size(); + num_actual_cols_ = num_active_cols_ = column_names.size(); column_flags.resize(num_actual_cols_, column_parse::enabled); // Rename empty column names to "Unnamed: col_index" - for (size_t col_idx = 0; col_idx < col_names_.size(); ++col_idx) { - if (col_names_[col_idx].empty()) { - col_names_[col_idx] = string("Unnamed: ") + std::to_string(col_idx); + for (size_t col_idx = 0; col_idx < column_names.size(); ++col_idx) { + if (column_names[col_idx].empty()) { + column_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx); } } // Looking for duplicates std::unordered_map col_names_histogram; - for (auto& col_name : col_names_) { + for (auto& col_name : column_names) { // Operator [] inserts a default-initialized value if the given key is not 
// present if (++col_names_histogram[col_name] > 1) { @@ -374,7 +375,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, } while (col_names_histogram[col_name]++); } else { // All duplicate columns will be ignored; First appearance is parsed - const auto idx = &col_name - col_names_.data(); + const auto idx = &col_name - column_names.data(); column_flags[idx] = column_parse::disabled; } } @@ -399,9 +400,9 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, .size(); for (const auto& name : reader_opts.get_use_cols_names()) { - const auto it = std::find(col_names_.begin(), col_names_.end(), name); - if (it != col_names_.end()) { - auto curr_it = it - col_names_.begin(); + const auto it = std::find(column_names.begin(), column_names.end(), name); + if (it != column_names.end()) { + auto curr_it = it - column_names.begin(); if (column_flags[curr_it] == column_parse::disabled) { column_flags[curr_it] = column_parse::enabled; num_active_cols_++; @@ -418,9 +419,9 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, } for (const auto& name : reader_opts.get_parse_dates_names()) { - auto it = std::find(col_names_.begin(), col_names_.end(), name); - if (it != col_names_.end()) { - column_flags[it - col_names_.begin()] |= column_parse::as_datetime; + auto it = std::find(column_names.begin(), column_names.end(), name); + if (it != column_names.end()) { + column_flags[it - column_names.begin()] |= column_parse::as_datetime; } } } @@ -432,9 +433,9 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, } for (const auto& name : reader_opts.get_parse_hex_names()) { - auto it = std::find(col_names_.begin(), col_names_.end(), name); - if (it != col_names_.end()) { - column_flags[it - col_names_.begin()] |= column_parse::as_hexadecimal; + auto it = std::find(column_names.begin(), column_names.end(), name); + if (it != column_names.end()) { + column_flags[it - column_names.begin()] |= 
column_parse::as_hexadecimal; } } } @@ -462,11 +463,12 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, return get_data_types_from_column_names( // column_flags, data_types, - col_names_, + column_names, num_actual_cols_); }, [&](const std::vector& dtypes) { - return parse_column_types(column_flags, dtypes, reader_opts.get_timestamp_type()); + return parse_column_types( + column_flags, column_names, dtypes, reader_opts.get_timestamp_type()); }}, reader_opts.get_dtypes()); } @@ -474,8 +476,8 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, out_columns.reserve(column_types.size()); if (num_records_ != 0) { - auto out_buffers = - decode_data(parse_opts, column_flags, data, row_offsets, column_types, stream, mr); + auto out_buffers = decode_data( + parse_opts, column_flags, column_names, data, row_offsets, column_types, stream, mr); for (size_t i = 0; i < column_types.size(); ++i) { metadata.column_names.emplace_back(out_buffers[i].name); if (column_types[i].id() == type_id::STRING && parse_opts.quotechar != '\0' && @@ -501,7 +503,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, // Handle empty metadata for (int col = 0; col < num_actual_cols_; ++col) { if (column_flags[col] & column_parse::enabled) { - metadata.column_names.emplace_back(col_names_[col]); + metadata.column_names.emplace_back(column_names[col]); } } } @@ -746,6 +748,7 @@ std::vector reader_impl::infer_column_types( std::vector reader_impl::parse_column_types( std::vector& column_flags, + std::vector const& column_names, std::vector const& types_as_strings, data_type timestamp_type) { @@ -797,10 +800,10 @@ std::vector reader_impl::parse_column_types( for (int col = 0; col < num_actual_cols_; col++) { if (column_flags[col] & column_parse::enabled) { - CUDF_EXPECTS(col_type_map.find(col_names_[col]) != col_type_map.end(), + CUDF_EXPECTS(col_type_map.find(column_names[col]) != col_type_map.end(), "Must specify data types for all active 
columns"); column_parse::flags col_flags_; - std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[col_names_[col]]); + std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[column_names[col]]); column_flags[col] |= col_flags_; CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); } @@ -824,6 +827,7 @@ std::vector reader_impl::parse_column_types( std::vector reader_impl::decode_data( parse_options const& parse_opts, std::vector const& column_flags, + std::vector const& column_names, device_span data, device_span row_offsets, host_span column_types, @@ -844,7 +848,7 @@ std::vector reader_impl::decode_data( stream, is_final_allocation ? mr : rmm::mr::get_current_device_resource()); - out_buffer.name = col_names_[col]; + out_buffer.name = column_names[col]; out_buffer.null_count() = UNKNOWN_NULL_COUNT; out_buffers.emplace_back(std::move(out_buffer)); active_col++; diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index db37d7daea8..9ce65dcc28f 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -186,6 +186,7 @@ class reader_impl { * @return List of columns' data types */ std::vector parse_column_types(std::vector& column_flags, + std::vector const& column_names, std::vector const& types_as_strings, data_type timestamp_type); @@ -199,6 +200,7 @@ class reader_impl { */ std::vector decode_data(parse_options const& parse_opts, std::vector const& column_flags, + std::vector const& column_names, device_span data, device_span row_offsets, host_span column_types, @@ -211,7 +213,7 @@ class reader_impl { int num_actual_cols_ = 0; // Number of columns in the dataset // Intermediate data - std::vector col_names_; + // std::vector col_names_; std::vector header_; }; From 9d84753a449eca35de3a88a76f17798550df82bc Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Aug 2021 12:05:58 -0500 Subject: [PATCH 08/19] replace csv reader impl::num_records with local variable. 
--- cpp/src/io/csv/reader_impl.cu | 45 ++++++++++++++++++++++------------ cpp/src/io/csv/reader_impl.hpp | 8 +++--- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 80f91f985ea..373ade3b2e1 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -122,10 +122,10 @@ string removeQuotes(string str, char quotechar) * @brief Parse the first row to set the column names in the raw_csv parameter. * The first row can be either the header row, or the first data row */ -std::vector setColumnNames(std::vector const& header, - parse_options_view const& parse_opts, - int header_row, - std::string prefix) +std::vector get_column_names(std::vector const& header, + parse_options_view const& parse_opts, + int header_row, + std::string prefix) { std::vector col_names; @@ -337,8 +337,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, auto const& row_offsets = data_row_offsets.second; // Exclude the end-of-data row from number of rows with actual data - num_records_ = std::max(row_offsets.size(), 1ul) - 1; - + auto num_records = std::max(row_offsets.size(), 1ul) - 1; auto column_flags = std::vector(); auto column_names = std::vector(); @@ -347,7 +346,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, column_flags.resize(reader_opts.get_names().size(), column_parse::enabled); column_names = reader_opts.get_names(); } else { - column_names = setColumnNames( + column_names = get_column_names( header_, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix()); num_actual_cols_ = num_active_cols_ = column_names.size(); @@ -451,8 +450,14 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, std::vector column_types; if (has_to_infer_column_types) { - column_types = infer_column_types( - parse_opts, column_flags, data, row_offsets, reader_opts.get_timestamp_type(), stream); + column_types = infer_column_types( // 
+ parse_opts, + column_flags, + data, + row_offsets, + num_records, + reader_opts.get_timestamp_type(), + stream); } else { column_types = std::visit( cudf::detail::visitor_overload{ @@ -475,9 +480,17 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, out_columns.reserve(column_types.size()); - if (num_records_ != 0) { - auto out_buffers = decode_data( - parse_opts, column_flags, column_names, data, row_offsets, column_types, stream, mr); + if (num_records != 0) { + auto out_buffers = decode_data( // + parse_opts, + column_flags, + column_names, + data, + row_offsets, + column_types, + num_records, + stream, + mr); for (size_t i = 0; i < column_types.size(); ++i) { metadata.column_names.emplace_back(out_buffers[i].name); if (column_types[i].id() == type_id::STRING && parse_opts.quotechar != '\0' && @@ -682,11 +695,12 @@ std::vector reader_impl::infer_column_types( std::vector const& column_flags, device_span data, device_span row_offsets, + int32_t num_records, data_type timestamp_type, rmm::cuda_stream_view stream) { std::vector dtypes; - if (num_records_ == 0) { + if (num_records == 0) { dtypes.resize(num_active_cols_, data_type{type_id::EMPTY}); } else { auto column_stats = @@ -704,7 +718,7 @@ std::vector reader_impl::infer_column_types( column_stats[col].negative_small_int_count + column_stats[col].positive_small_int_count; - if (column_stats[col].null_count == num_records_) { + if (column_stats[col].null_count == num_records) { // Entire column is NULL; allocate the smallest amount of memory dtypes.emplace_back(cudf::type_id::INT8); } else if (column_stats[col].string_count > 0L) { @@ -831,6 +845,7 @@ std::vector reader_impl::decode_data( device_span data, device_span row_offsets, host_span column_types, + int32_t num_records, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -843,7 +858,7 @@ std::vector reader_impl::decode_data( const bool is_final_allocation = column_types[active_col].id() != type_id::STRING; auto 
out_buffer = column_buffer(column_types[active_col], - num_records_, + num_records, true, stream, is_final_allocation ? mr : rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 9ce65dcc28f..006c9ce9800 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -175,6 +175,7 @@ class reader_impl { std::vector const& column_flags, device_span data, device_span row_offsets, + int32_t num_records, data_type timestamp_type, rmm::cuda_stream_view stream); @@ -204,16 +205,15 @@ class reader_impl { device_span data, device_span row_offsets, host_span column_types, + int32_t num_records, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); private: - cudf::size_type num_records_ = 0; // Number of rows with actual data - int num_active_cols_ = 0; // Number of columns to read - int num_actual_cols_ = 0; // Number of columns in the dataset + int num_active_cols_ = 0; // Number of columns to read + int num_actual_cols_ = 0; // Number of columns in the dataset // Intermediate data - // std::vector col_names_; std::vector header_; }; From 7ce862ee16cc83c5bbbd7f3d6667ea518527e437 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Aug 2021 12:18:13 -0500 Subject: [PATCH 09/19] convert csv reader impl ::num_actual_columns and ::num_active_columns to local variables --- cpp/src/io/csv/reader_impl.cu | 109 ++++++++++++++++++--------------- cpp/src/io/csv/reader_impl.hpp | 11 ++-- 2 files changed, 64 insertions(+), 56 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 373ade3b2e1..84a4399f465 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -337,9 +337,11 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, auto const& row_offsets = data_row_offsets.second; // Exclude the end-of-data row from number of rows with actual data - auto num_records = 
std::max(row_offsets.size(), 1ul) - 1; - auto column_flags = std::vector(); - auto column_names = std::vector(); + auto num_records = std::max(row_offsets.size(), 1ul) - 1; + auto column_flags = std::vector(); + auto column_names = std::vector(); + auto num_actual_columns = static_cast(reader_opts.get_names().size()); + auto num_active_columns = num_actual_columns; // Check if the user gave us a list of column names if (not reader_opts.get_names().empty()) { @@ -349,9 +351,9 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, column_names = get_column_names( header_, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix()); - num_actual_cols_ = num_active_cols_ = column_names.size(); + num_actual_columns = num_active_columns = column_names.size(); - column_flags.resize(num_actual_cols_, column_parse::enabled); + column_flags.resize(num_actual_columns, column_parse::enabled); // Rename empty column names to "Unnamed: col_index" for (size_t col_idx = 0; col_idx < column_names.size(); ++col_idx) { @@ -383,7 +385,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, // Update the number of columns to be processed, if some might have been // removed if (!reader_opts.is_enabled_mangle_dupe_cols()) { - num_active_cols_ = col_names_histogram.size(); + num_active_columns = col_names_histogram.size(); } } @@ -394,9 +396,9 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, for (const auto index : reader_opts.get_use_cols_indexes()) { column_flags[index] = column_parse::enabled; } - num_active_cols_ = std::unordered_set(reader_opts.get_use_cols_indexes().begin(), - reader_opts.get_use_cols_indexes().end()) - .size(); + num_active_columns = std::unordered_set(reader_opts.get_use_cols_indexes().begin(), + reader_opts.get_use_cols_indexes().end()) + .size(); for (const auto& name : reader_opts.get_use_cols_names()) { const auto it = std::find(column_names.begin(), column_names.end(), name); @@ -404,7 +406,7 
@@ table_with_metadata reader_impl::read(cudf::io::datasource* source, auto curr_it = it - column_names.begin(); if (column_flags[curr_it] == column_parse::disabled) { column_flags[curr_it] = column_parse::enabled; - num_active_cols_++; + num_active_columns++; } } } @@ -440,7 +442,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, } // Return empty table rather than exception if nothing to load - if (num_active_cols_ == 0) { return {std::make_unique
(), {}}; } + if (num_active_columns == 0) { return {std::make_unique
(), {}}; } auto metadata = table_metadata{}; auto out_columns = std::vector>(); @@ -456,26 +458,32 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, data, row_offsets, num_records, + num_active_columns, reader_opts.get_timestamp_type(), stream); } else { - column_types = std::visit( - cudf::detail::visitor_overload{ - [&](const std::vector& data_types) { - return select_data_types(column_flags, data_types, num_actual_cols_, num_active_cols_); - }, - [&](const std::map& data_types) { - return get_data_types_from_column_names( // - column_flags, - data_types, - column_names, - num_actual_cols_); - }, - [&](const std::vector& dtypes) { - return parse_column_types( - column_flags, column_names, dtypes, reader_opts.get_timestamp_type()); - }}, - reader_opts.get_dtypes()); + column_types = + std::visit(cudf::detail::visitor_overload{ + [&](const std::vector& data_types) { + return select_data_types( + column_flags, data_types, num_actual_columns, num_active_columns); + }, + [&](const std::map& data_types) { + return get_data_types_from_column_names( // + column_flags, + data_types, + column_names, + num_actual_columns); + }, + [&](const std::vector& dtypes) { + return parse_column_types(column_flags, + column_names, + dtypes, + num_actual_columns, + num_active_columns, + reader_opts.get_timestamp_type()); + }}, + reader_opts.get_dtypes()); } out_columns.reserve(column_types.size()); @@ -489,6 +497,8 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, row_offsets, column_types, num_records, + num_actual_columns, + num_active_columns, stream, mr); for (size_t i = 0; i < column_types.size(); ++i) { @@ -514,7 +524,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, out_columns.emplace_back(make_empty_column(column_types[i])); } // Handle empty metadata - for (int col = 0; col < num_actual_cols_; ++col) { + for (int col = 0; col < num_actual_columns; ++col) { if (column_flags[col] & column_parse::enabled) { 
metadata.column_names.emplace_back(column_names[col]); } @@ -696,24 +706,25 @@ std::vector reader_impl::infer_column_types( device_span data, device_span row_offsets, int32_t num_records, + int32_t num_active_columns, data_type timestamp_type, rmm::cuda_stream_view stream) { std::vector dtypes; if (num_records == 0) { - dtypes.resize(num_active_cols_, data_type{type_id::EMPTY}); + dtypes.resize(num_active_columns, data_type{type_id::EMPTY}); } else { auto column_stats = cudf::io::csv::gpu::detect_column_types(parse_opts.view(), data, make_device_uvector_async(column_flags, stream), row_offsets, - num_active_cols_, + num_active_columns, stream); stream.synchronize(); - for (int col = 0; col < num_active_cols_; col++) { + for (int col = 0; col < num_active_columns; col++) { unsigned long long int_count_total = column_stats[col].big_int_count + column_stats[col].negative_small_int_count + column_stats[col].positive_small_int_count; @@ -764,6 +775,8 @@ std::vector reader_impl::parse_column_types( std::vector& column_flags, std::vector const& column_names, std::vector const& types_as_strings, + int32_t num_actual_columns, + int32_t num_active_columns, data_type timestamp_type) { std::vector dtypes; @@ -778,19 +791,19 @@ std::vector reader_impl::parse_column_types( data_type dtype_; column_parse::flags col_flags_; std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]); - dtypes.resize(num_active_cols_, dtype_); - for (int col = 0; col < num_actual_cols_; col++) { + dtypes.resize(num_active_columns, dtype_); + for (int col = 0; col < num_actual_columns; col++) { column_flags[col] |= col_flags_; } CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); } else { // If it's a list, assign dtypes to active columns in the given order - CUDF_EXPECTS(static_cast(types_as_strings.size()) >= num_actual_cols_, + CUDF_EXPECTS(static_cast(types_as_strings.size()) >= num_actual_columns, "Must specify data types for all columns"); auto dtype_ = 
std::back_inserter(dtypes); - for (int col = 0; col < num_actual_cols_; col++) { + for (int col = 0; col < num_actual_columns; col++) { if (column_flags[col] & column_parse::enabled) { column_parse::flags col_flags_; std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]); @@ -812,7 +825,7 @@ std::vector reader_impl::parse_column_types( auto dtype_ = std::back_inserter(dtypes); - for (int col = 0; col < num_actual_cols_; col++) { + for (int col = 0; col < num_actual_columns; col++) { if (column_flags[col] & column_parse::enabled) { CUDF_EXPECTS(col_type_map.find(column_names[col]) != col_type_map.end(), "Must specify data types for all active columns"); @@ -846,6 +859,8 @@ std::vector reader_impl::decode_data( device_span row_offsets, host_span column_types, int32_t num_records, + int32_t num_actual_columns, + int32_t num_active_columns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -853,7 +868,7 @@ std::vector reader_impl::decode_data( std::vector out_buffers; out_buffers.reserve(column_types.size()); - for (int col = 0, active_col = 0; col < num_actual_cols_; ++col) { + for (int col = 0, active_col = 0; col < num_actual_columns; ++col) { if (column_flags[col] & column_parse::enabled) { const bool is_final_allocation = column_types[active_col].id() != type_id::STRING; auto out_buffer = @@ -870,10 +885,10 @@ std::vector reader_impl::decode_data( } } - thrust::host_vector h_data(num_active_cols_); - thrust::host_vector h_valid(num_active_cols_); + thrust::host_vector h_data(num_active_columns); + thrust::host_vector h_valid(num_active_columns); - for (int i = 0; i < num_active_cols_; ++i) { + for (int i = 0; i < num_active_columns; ++i) { h_data[i] = out_buffers[i].data(); h_valid[i] = out_buffers[i].null_mask(); } @@ -988,10 +1003,7 @@ parse_options make_parse_options(csv_reader_options const& reader_opts, return parse_opts; } -reader_impl::reader_impl(int32_t num_actual_columns) - : num_actual_cols_(num_actual_columns), 
num_active_cols_(num_actual_columns) -{ -} +reader_impl::reader_impl() {} table_with_metadata read_csv(std::unique_ptr&& source, csv_reader_options const& options, @@ -1003,12 +1015,7 @@ table_with_metadata read_csv(std::unique_ptr&& source, auto parse_options = make_parse_options(options, stream); - auto num_actual_columns = options.get_names().size(); - - auto reader = std::make_unique( // - num_actual_columns); - - return reader->read(source.get(), options, parse_options, stream, mr); + return std::make_unique()->read(source.get(), options, parse_options, stream, mr); } } // namespace csv diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 006c9ce9800..238e83eadc3 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -72,7 +72,7 @@ class reader_impl { * @brief Constructor from a dataset source with reader options. * */ - explicit reader_impl(int32_t num_actual_columns); + explicit reader_impl(); /** * @brief Read an entire set or a subset of data and returns a set of columns. 
@@ -176,6 +176,7 @@ class reader_impl { device_span data, device_span row_offsets, int32_t num_records, + int32_t num_active_columns, data_type timestamp_type, rmm::cuda_stream_view stream); @@ -189,6 +190,8 @@ class reader_impl { std::vector parse_column_types(std::vector& column_flags, std::vector const& column_names, std::vector const& types_as_strings, + int32_t num_actual_columns, + int32_t num_active_columns, data_type timestamp_type); /** @@ -206,14 +209,12 @@ class reader_impl { device_span row_offsets, host_span column_types, int32_t num_records, + int32_t num_actual_columns, + int32_t num_active_columns, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); private: - int num_active_cols_ = 0; // Number of columns to read - int num_actual_cols_ = 0; // Number of columns in the dataset - - // Intermediate data std::vector header_; }; From 9010fe195d4a13e90d94114fd40567929940ae3e Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Aug 2021 16:08:56 -0500 Subject: [PATCH 10/19] remove csv reader class and impl class in favor of functions --- cpp/src/io/csv/reader_impl.cu | 214 +++++++++++++++++++++++-------- cpp/src/io/csv/reader_impl.hpp | 224 --------------------------------- 2 files changed, 159 insertions(+), 279 deletions(-) delete mode 100644 cpp/src/io/csv/reader_impl.hpp diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 84a4399f465..1ee321b30fd 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -19,15 +19,22 @@ * @brief cuDF-IO CSV reader class implementation */ -#include "reader_impl.hpp" +#include "csv_common.h" +#include "csv_gpu.h" #include +#include +#include #include +#include #include #include #include #include +#include +#include +#include #include #include #include @@ -38,10 +45,14 @@ #include #include +#include #include +#include #include #include #include +#include +#include using std::string; using std::vector; @@ -57,6 +68,83 @@ namespace csv { using
namespace cudf::io::csv; using namespace cudf::io; +namespace { + +/** + * @brief Offsets of CSV rows in device memory, accessed through a shrinkable span. + * + * Row offsets are stored this way to avoid reallocation/copies when discarding front or back + * elements. + */ +class selected_rows_offsets { + rmm::device_uvector all; + device_span selected; + + public: + selected_rows_offsets(rmm::device_uvector&& data, + device_span selected_span) + : all{std::move(data)}, selected{selected_span} + { + } + selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {} + + operator device_span() const { return selected; } + void shrink(size_t size) + { + CUDF_EXPECTS(size <= selected.size(), "New size must be smaller"); + selected = selected.subspan(0, size); + } + void erase_first_n(size_t n) + { + CUDF_EXPECTS(n <= selected.size(), "Too many elements to remove"); + selected = selected.subspan(n, selected.size() - n); + } + auto size() const { return selected.size(); } + auto data() const { return selected.data(); } +}; + +} // namespace + +std::pair, selected_rows_offsets> load_data_and_gather_row_offsets( + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + std::vector& header, + host_span data, + size_t range_begin, + size_t range_end, + size_t skip_rows, + int64_t num_rows, + bool load_whole_file, + rmm::cuda_stream_view stream); + +std::vector parse_column_types(std::vector& column_flags, + std::vector const& column_names, + std::vector const& types_as_strings, + int32_t num_actual_columns, + int32_t num_active_columns, + data_type timestamp_type); + +std::vector infer_column_types(parse_options const& parse_opts, + std::vector const& column_flags, + device_span data, + device_span row_offsets, + int32_t num_records, + int32_t num_active_columns, + data_type timestamp_type, + rmm::cuda_stream_view stream); + +std::vector decode_data(parse_options const& parse_opts, + std::vector const& column_flags, + std::vector 
const& column_names, + device_span data, + device_span row_offsets, + host_span column_types, + int32_t num_records, + int32_t num_actual_columns, + int32_t num_active_columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** * @brief Estimates the maximum expected length or a row, based on the number * of columns @@ -208,11 +296,12 @@ size_t find_first_row_start(char row_terminator, host_span data) return std::min(pos + 1, data.size()); } -std::pair, reader_impl::selected_rows_offsets> -reader_impl::select_data_and_row_offsets(cudf::io::datasource* source, - csv_reader_options const& reader_opts, - parse_options const& parse_opts, - rmm::cuda_stream_view stream) +std::pair, selected_rows_offsets> select_data_and_row_offsets( + cudf::io::datasource* source, + csv_reader_options const& reader_opts, + std::vector& header, + parse_options const& parse_opts, + rmm::cuda_stream_view stream) { auto range_offset = reader_opts.get_byte_range_offset(); auto range_size = reader_opts.get_byte_range_size(); @@ -266,6 +355,7 @@ reader_impl::select_data_and_row_offsets(cudf::io::datasource* source, auto data_row_offsets = load_data_and_gather_row_offsets(reader_opts, parse_opts, + header, h_data, data_start_offset, (range_size) ? 
range_size : h_data.size(), @@ -325,14 +415,17 @@ std::vector select_data_types(std::vector const& return selected_dtypes; } -table_with_metadata reader_impl::read(cudf::io::datasource* source, - csv_reader_options const& reader_opts, - parse_options const& parse_opts, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +table_with_metadata read_csv(cudf::io::datasource* source, + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { + std::vector header; + auto const data_row_offsets = - select_data_and_row_offsets(source, reader_opts, parse_opts, stream); + select_data_and_row_offsets(source, reader_opts, header, parse_opts, stream); + auto const& data = data_row_offsets.first; auto const& row_offsets = data_row_offsets.second; @@ -349,7 +442,7 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, column_names = reader_opts.get_names(); } else { column_names = get_column_names( - header_, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix()); + header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix()); num_actual_columns = num_active_columns = column_names.size(); @@ -533,16 +626,32 @@ table_with_metadata reader_impl::read(cudf::io::datasource* source, return {std::make_unique
(std::move(out_columns)), std::move(metadata)}; } -std::pair, reader_impl::selected_rows_offsets> -reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& reader_opts, - parse_options const& parse_opts, - host_span data, - size_t range_begin, - size_t range_end, - size_t skip_rows, - int64_t num_rows, - bool load_whole_file, - rmm::cuda_stream_view stream) +/** + * @brief Finds row positions in the specified input data, and loads the selected data onto GPU. + * + * This function scans the input data to record the row offsets (relative to the start of the + * input data). A row is actually the data/offset between two termination symbols. + * + * @param data Uncompressed input data in host memory + * @param range_begin Only include rows starting after this position + * @param range_end Only include rows starting before this position + * @param skip_rows Number of rows to skip from the start + * @param num_rows Number of rows to read; -1: all remaining data + * @param load_whole_file Hint that the entire data will be needed on gpu + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Input data and row offsets in the device memory + */ +std::pair, selected_rows_offsets> load_data_and_gather_row_offsets( + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + std::vector& header, + host_span data, + size_t range_begin, + size_t range_end, + size_t skip_rows, + int64_t num_rows, + bool load_whole_file, + rmm::cuda_stream_view stream) { constexpr size_t max_chunk_bytes = 64 * 1024 * 1024; // 64MB size_t buffer_size = std::min(max_chunk_bytes, data.size()); @@ -690,7 +799,7 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& reader_o const auto header_end = buffer_pos + row_ctx[1]; CUDF_EXPECTS(header_start <= header_end && header_end <= data.size(), "Invalid csv header location"); - header_.assign(data.begin() + header_start, data.begin() + header_end); + 
header.assign(data.begin() + header_start, data.begin() + header_end); if (header_rows > 0) { row_offsets.erase_first_n(header_rows); } } // Apply num_rows limit @@ -700,15 +809,14 @@ reader_impl::load_data_and_gather_row_offsets(csv_reader_options const& reader_o return {std::move(d_data), std::move(row_offsets)}; } -std::vector reader_impl::infer_column_types( - parse_options const& parse_opts, - std::vector const& column_flags, - device_span data, - device_span row_offsets, - int32_t num_records, - int32_t num_active_columns, - data_type timestamp_type, - rmm::cuda_stream_view stream) +std::vector infer_column_types(parse_options const& parse_opts, + std::vector const& column_flags, + device_span data, + device_span row_offsets, + int32_t num_records, + int32_t num_active_columns, + data_type timestamp_type, + rmm::cuda_stream_view stream) { std::vector dtypes; if (num_records == 0) { @@ -771,13 +879,12 @@ std::vector reader_impl::infer_column_types( return dtypes; } -std::vector reader_impl::parse_column_types( - std::vector& column_flags, - std::vector const& column_names, - std::vector const& types_as_strings, - int32_t num_actual_columns, - int32_t num_active_columns, - data_type timestamp_type) +std::vector parse_column_types(std::vector& column_flags, + std::vector const& column_names, + std::vector const& types_as_strings, + int32_t num_actual_columns, + int32_t num_active_columns, + data_type timestamp_type) { std::vector dtypes; @@ -851,18 +958,17 @@ std::vector reader_impl::parse_column_types( return dtypes; } -std::vector reader_impl::decode_data( - parse_options const& parse_opts, - std::vector const& column_flags, - std::vector const& column_names, - device_span data, - device_span row_offsets, - host_span column_types, - int32_t num_records, - int32_t num_actual_columns, - int32_t num_active_columns, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::vector decode_data(parse_options const& parse_opts, + std::vector const& 
column_flags, + std::vector const& column_names, + device_span data, + device_span row_offsets, + host_span column_types, + int32_t num_records, + int32_t num_actual_columns, + int32_t num_active_columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Alloc output; columns' data memory is still expected for empty dataframe std::vector out_buffers; @@ -1003,8 +1109,6 @@ parse_options make_parse_options(csv_reader_options const& reader_opts, return parse_opts; } -reader_impl::reader_impl() {} - table_with_metadata read_csv(std::unique_ptr&& source, csv_reader_options const& options, rmm::cuda_stream_view stream, @@ -1015,7 +1119,7 @@ table_with_metadata read_csv(std::unique_ptr&& source, auto parse_options = make_parse_options(options, stream); - return std::make_unique()->read(source.get(), options, parse_options, stream, mr); + return read_csv(source.get(), options, parse_options, stream, mr); } } // namespace csv diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp deleted file mode 100644 index 238e83eadc3..00000000000 --- a/cpp/src/io/csv/reader_impl.hpp +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "csv_common.h" -#include "csv_gpu.h" - -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include - -using cudf::host_span; - -namespace cudf { -namespace io { -namespace detail { -namespace csv { -using namespace cudf::io::csv; -using namespace cudf::io; - -/** - * @brief Implementation for CSV reader - * - * The CSV reader is implemented in 4 stages: - * Stage 1: read and optionally decompress the input data in host memory - * (may be a memory-mapped view of the data on disk) - * - * Stage 2: gather the offset of each data row within the csv data. - * Since the number of rows in a given character block may depend on the - * initial parser state (like whether the block starts in a middle of a - * quote or not), a separate row count and output parser state is computed - * for every possible input parser state per 16KB character block. - * The result is then used to infer the parser state and starting row at - * the beginning of every character block. - * A second pass can then output the location of every row (which is needed - * for the subsequent parallel conversion of every row from csv text - * to cudf binary form) - * - * Stage 3: Optional stage to infer the data type of each CSV column. - * - * Stage 4: Convert every row from csv text form to cudf binary form. - */ -class reader_impl { - public: - /** - * @brief Constructor from a dataset source with reader options. - * - */ - explicit reader_impl(); - - /** - * @brief Read an entire set or a subset of data and returns a set of columns. 
- * - * @param source Dataset source - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - * - * @return The set of columns along with metadata - */ - table_with_metadata read(cudf::io::datasource* source, - csv_reader_options const& reader_opts, - parse_options const& parse_opts, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - private: - /** - * @brief Offsets of CSV rows in device memory, accessed through a shrinkable span. - * - * Row offsets are stored this way to avoid reallocation/copies when discarding front or back - * elements. - */ - class selected_rows_offsets { - rmm::device_uvector all; - device_span selected; - - public: - selected_rows_offsets(rmm::device_uvector&& data, - device_span selected_span) - : all{std::move(data)}, selected{selected_span} - { - } - selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {} - - operator device_span() const { return selected; } - void shrink(size_t size) - { - CUDF_EXPECTS(size <= selected.size(), "New size must be smaller"); - selected = selected.subspan(0, size); - } - void erase_first_n(size_t n) - { - CUDF_EXPECTS(n <= selected.size(), "Too many elements to remove"); - selected = selected.subspan(n, selected.size() - n); - } - auto size() const { return selected.size(); } - auto data() const { return selected.data(); } - }; - - /** - * @brief Selectively loads data on the GPU and gathers offsets of rows to read. - * - * Selection is based on read options. - * - * @param stream CUDA stream used for device memory operations and kernel launches. 
- */ - std::pair, reader_impl::selected_rows_offsets> - select_data_and_row_offsets(cudf::io::datasource* source, - csv_reader_options const& reader_opts, - parse_options const& parse_opts, - rmm::cuda_stream_view stream); - - /** - * @brief Finds row positions in the specified input data, and loads the selected data onto GPU. - * - * This function scans the input data to record the row offsets (relative to the start of the - * input data). A row is actually the data/offset between two termination symbols. - * - * @param data Uncompressed input data in host memory - * @param range_begin Only include rows starting after this position - * @param range_end Only include rows starting before this position - * @param skip_rows Number of rows to skip from the start - * @param num_rows Number of rows to read; -1: all remaining data - * @param load_whole_file Hint that the entire data will be needed on gpu - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Input data and row offsets in the device memory - */ - std::pair, reader_impl::selected_rows_offsets> - load_data_and_gather_row_offsets(csv_reader_options const& reader_opts, - parse_options const& parse_opts, - host_span data, - size_t range_begin, - size_t range_end, - size_t skip_rows, - int64_t num_rows, - bool load_whole_file, - rmm::cuda_stream_view stream); - - /** - * @brief Automatically infers each column's data type based on the CSV's data within that column. 
- * - * @param data The CSV data from which to infer the columns' data types - * @param row_offsets The row offsets into the CSV's data - * @param stream The stream to which the type inference-kernel will be dispatched - * @return The columns' inferred data types - */ - std::vector infer_column_types(parse_options const& parse_opts, - std::vector const& column_flags, - device_span data, - device_span row_offsets, - int32_t num_records, - int32_t num_active_columns, - data_type timestamp_type, - rmm::cuda_stream_view stream); - - /** - * @brief Parses the columns' data types from the vector of dtypes that are provided as strings. - * - * @param types_as_strings The vector of strings from which to parse the columns' target data - * types - * @return List of columns' data types - */ - std::vector parse_column_types(std::vector& column_flags, - std::vector const& column_names, - std::vector const& types_as_strings, - int32_t num_actual_columns, - int32_t num_active_columns, - data_type timestamp_type); - - /** - * @brief Converts the row-column data and outputs to column bufferrs. - * - * @param column_types Column types - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return list of column buffers of decoded data, or ptr/size in the case of strings. 
- */ - std::vector decode_data(parse_options const& parse_opts, - std::vector const& column_flags, - std::vector const& column_names, - device_span data, - device_span row_offsets, - host_span column_types, - int32_t num_records, - int32_t num_actual_columns, - int32_t num_active_columns, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - private: - std::vector header_; -}; - -} // namespace csv -} // namespace detail -} // namespace io -} // namespace cudf From 7cda106ecd8e5ce1b4b1bd6b5cf3124a408220ae Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Aug 2021 16:23:58 -0500 Subject: [PATCH 11/19] rearrange some functions to delete some unneccessary declarations. --- cpp/src/io/csv/reader_impl.cu | 882 ++++++++++++++++------------------ 1 file changed, 421 insertions(+), 461 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 1ee321b30fd..df16959eda5 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -103,48 +103,6 @@ class selected_rows_offsets { auto data() const { return selected.data(); } }; -} // namespace - -std::pair, selected_rows_offsets> load_data_and_gather_row_offsets( - csv_reader_options const& reader_opts, - parse_options const& parse_opts, - std::vector& header, - host_span data, - size_t range_begin, - size_t range_end, - size_t skip_rows, - int64_t num_rows, - bool load_whole_file, - rmm::cuda_stream_view stream); - -std::vector parse_column_types(std::vector& column_flags, - std::vector const& column_names, - std::vector const& types_as_strings, - int32_t num_actual_columns, - int32_t num_active_columns, - data_type timestamp_type); - -std::vector infer_column_types(parse_options const& parse_opts, - std::vector const& column_flags, - device_span data, - device_span row_offsets, - int32_t num_records, - int32_t num_active_columns, - data_type timestamp_type, - rmm::cuda_stream_view stream); - -std::vector decode_data(parse_options 
const& parse_opts, - std::vector const& column_flags, - std::vector const& column_names, - device_span data, - device_span row_offsets, - host_span column_types, - int32_t num_records, - int32_t num_actual_columns, - int32_t num_active_columns, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Estimates the maximum expected length or a row, based on the number * of columns @@ -296,6 +254,189 @@ size_t find_first_row_start(char row_terminator, host_span data) return std::min(pos + 1, data.size()); } +/** + * @brief Finds row positions in the specified input data, and loads the selected data onto GPU. + * + * This function scans the input data to record the row offsets (relative to the start of the + * input data). A row is actually the data/offset between two termination symbols. + * + * @param data Uncompressed input data in host memory + * @param range_begin Only include rows starting after this position + * @param range_end Only include rows starting before this position + * @param skip_rows Number of rows to skip from the start + * @param num_rows Number of rows to read; -1: all remaining data + * @param load_whole_file Hint that the entire data will be needed on gpu + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Input data and row offsets in the device memory + */ +std::pair, selected_rows_offsets> load_data_and_gather_row_offsets( + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + std::vector& header, + host_span data, + size_t range_begin, + size_t range_end, + size_t skip_rows, + int64_t num_rows, + bool load_whole_file, + rmm::cuda_stream_view stream) +{ + constexpr size_t max_chunk_bytes = 64 * 1024 * 1024; // 64MB + size_t buffer_size = std::min(max_chunk_bytes, data.size()); + size_t max_blocks = + std::max((buffer_size / cudf::io::csv::gpu::rowofs_block_bytes) + 1, 2); + hostdevice_vector row_ctx(max_blocks); + size_t buffer_pos = 
std::min(range_begin - std::min(range_begin, sizeof(char)), data.size()); + size_t pos = std::min(range_begin, data.size()); + size_t header_rows = (reader_opts.get_header() >= 0) ? reader_opts.get_header() + 1 : 0; + uint64_t ctx = 0; + + // For compatibility with the previous parser, a row is considered in-range if the + // previous row terminator is within the given range + range_end += (range_end < data.size()); + + // Reserve memory by allocating and then resetting the size + rmm::device_uvector d_data{ + (load_whole_file) ? data.size() : std::min(buffer_size * 2, data.size()), stream}; + d_data.resize(0, stream); + rmm::device_uvector all_row_offsets{0, stream}; + do { + size_t target_pos = std::min(pos + max_chunk_bytes, data.size()); + size_t chunk_size = target_pos - pos; + + auto const previous_data_size = d_data.size(); + d_data.resize(target_pos - buffer_pos, stream); + CUDA_TRY(cudaMemcpyAsync(d_data.begin() + previous_data_size, + data.begin() + buffer_pos + previous_data_size, + target_pos - buffer_pos - previous_data_size, + cudaMemcpyDefault, + stream.value())); + + // Pass 1: Count the potential number of rows in each character block for each + // possible parser state at the beginning of the block. + uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), + row_ctx.device_ptr(), + device_span(), + d_data, + chunk_size, + pos, + buffer_pos, + data.size(), + range_begin, + range_end, + skip_rows, + stream); + CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), + row_ctx.device_ptr(), + num_blocks * sizeof(uint64_t), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); + + // Sum up the rows in each character block, selecting the row count that + // corresponds to the current input context. Also stores the now known input + // context per character block that will be needed by the second pass. 
+ for (uint32_t i = 0; i < num_blocks; i++) { + uint64_t ctx_next = cudf::io::csv::gpu::select_row_context(ctx, row_ctx[i]); + row_ctx[i] = ctx; + ctx = ctx_next; + } + size_t total_rows = ctx >> 2; + if (total_rows > skip_rows) { + // At least one row in range in this batch + all_row_offsets.resize(total_rows - skip_rows, stream); + + CUDA_TRY(cudaMemcpyAsync(row_ctx.device_ptr(), + row_ctx.host_ptr(), + num_blocks * sizeof(uint64_t), + cudaMemcpyHostToDevice, + stream.value())); + + // Pass 2: Output row offsets + cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), + row_ctx.device_ptr(), + all_row_offsets, + d_data, + chunk_size, + pos, + buffer_pos, + data.size(), + range_begin, + range_end, + skip_rows, + stream); + // With byte range, we want to keep only one row out of the specified range + if (range_end < data.size()) { + CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), + row_ctx.device_ptr(), + num_blocks * sizeof(uint64_t), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); + + size_t rows_out_of_range = 0; + for (uint32_t i = 0; i < num_blocks; i++) { + rows_out_of_range += row_ctx[i]; + } + if (rows_out_of_range != 0) { + // Keep one row out of range (used to infer length of previous row) + auto new_row_offsets_size = + all_row_offsets.size() - std::min(rows_out_of_range - 1, all_row_offsets.size()); + all_row_offsets.resize(new_row_offsets_size, stream); + // Implies we reached the end of the range + break; + } + } + // num_rows does not include blank rows + if (num_rows >= 0) { + if (all_row_offsets.size() > header_rows + static_cast(num_rows)) { + size_t num_blanks = cudf::io::csv::gpu::count_blank_rows( + parse_opts.view(), d_data, all_row_offsets, stream); + if (all_row_offsets.size() - num_blanks > header_rows + static_cast(num_rows)) { + // Got the desired number of rows + break; + } + } + } + } else { + // Discard data (all rows below skip_rows), keeping one character for history + size_t discard_bytes = 
std::max(d_data.size(), sizeof(char)) - sizeof(char); + if (discard_bytes != 0) { + erase_except_last(d_data, stream); + buffer_pos += discard_bytes; + } + } + pos = target_pos; + } while (pos < data.size()); + + auto const non_blank_row_offsets = + io::csv::gpu::remove_blank_rows(parse_opts.view(), d_data, all_row_offsets, stream); + auto row_offsets = selected_rows_offsets{std::move(all_row_offsets), non_blank_row_offsets}; + + // Remove header rows and extract header + const size_t header_row_index = std::max(header_rows, 1) - 1; + if (header_row_index + 1 < row_offsets.size()) { + CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), + row_offsets.data() + header_row_index, + 2 * sizeof(uint64_t), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); + + const auto header_start = buffer_pos + row_ctx[0]; + const auto header_end = buffer_pos + row_ctx[1]; + CUDF_EXPECTS(header_start <= header_end && header_end <= data.size(), + "Invalid csv header location"); + header.assign(data.begin() + header_start, data.begin() + header_end); + if (header_rows > 0) { row_offsets.erase_first_n(header_rows); } + } + // Apply num_rows limit + if (num_rows >= 0 && static_cast(num_rows) < row_offsets.size() - 1) { + row_offsets.shrink(num_rows + 1); + } + return {std::move(d_data), std::move(row_offsets)}; +} + std::pair, selected_rows_offsets> select_data_and_row_offsets( cudf::io::datasource* source, csv_reader_options const& reader_opts, @@ -415,19 +556,221 @@ std::vector select_data_types(std::vector const& return selected_dtypes; } -table_with_metadata read_csv(cudf::io::datasource* source, - csv_reader_options const& reader_opts, - parse_options const& parse_opts, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::vector parse_column_types(std::vector& column_flags, + std::vector const& column_names, + std::vector const& types_as_strings, + int32_t num_actual_columns, + int32_t num_active_columns, + data_type timestamp_type) { - std::vector 
header; + std::vector dtypes; - auto const data_row_offsets = - select_data_and_row_offsets(source, reader_opts, header, parse_opts, stream); + bool const is_dict = std::all_of(types_as_strings.begin(), + types_as_strings.end(), + [](auto const& s) { return s.find(':') != std::string::npos; }); - auto const& data = data_row_offsets.first; - auto const& row_offsets = data_row_offsets.second; + if (!is_dict) { + if (types_as_strings.size() == 1) { + // If it's a single dtype, assign that dtype to all active columns + data_type dtype_; + column_parse::flags col_flags_; + std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]); + dtypes.resize(num_active_columns, dtype_); + for (int col = 0; col < num_actual_columns; col++) { + column_flags[col] |= col_flags_; + } + CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); + } else { + // If it's a list, assign dtypes to active columns in the given order + CUDF_EXPECTS(static_cast(types_as_strings.size()) >= num_actual_columns, + "Must specify data types for all columns"); + + auto dtype_ = std::back_inserter(dtypes); + + for (int col = 0; col < num_actual_columns; col++) { + if (column_flags[col] & column_parse::enabled) { + column_parse::flags col_flags_; + std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]); + column_flags[col] |= col_flags_; + CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); + } + } + } + } else { + // Translate vector of `name : dtype` strings to map + // NOTE: Incoming pairs can be out-of-order from column names in dataset + std::unordered_map col_type_map; + for (const auto& pair : types_as_strings) { + const auto pos = pair.find_last_of(':'); + const auto name = pair.substr(0, pos); + const auto dtype = pair.substr(pos + 1, pair.size()); + col_type_map[name] = dtype; + } + + auto dtype_ = std::back_inserter(dtypes); + + for (int col = 0; col < num_actual_columns; col++) { + if (column_flags[col] & 
column_parse::enabled) { + CUDF_EXPECTS(col_type_map.find(column_names[col]) != col_type_map.end(), + "Must specify data types for all active columns"); + column_parse::flags col_flags_; + std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[column_names[col]]); + column_flags[col] |= col_flags_; + CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); + } + } + } + + if (timestamp_type.id() != cudf::type_id::EMPTY) { + for (auto& type : dtypes) { + if (cudf::is_timestamp(type)) { type = timestamp_type; } + } + } + + for (size_t i = 0; i < dtypes.size(); i++) { + // Replace EMPTY dtype with STRING + if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; } + } + + return dtypes; +} + +std::vector infer_column_types(parse_options const& parse_opts, + std::vector const& column_flags, + device_span data, + device_span row_offsets, + int32_t num_records, + int32_t num_active_columns, + data_type timestamp_type, + rmm::cuda_stream_view stream) +{ + std::vector dtypes; + if (num_records == 0) { + dtypes.resize(num_active_columns, data_type{type_id::EMPTY}); + } else { + auto column_stats = + cudf::io::csv::gpu::detect_column_types(parse_opts.view(), + data, + make_device_uvector_async(column_flags, stream), + row_offsets, + num_active_columns, + stream); + + stream.synchronize(); + + for (int col = 0; col < num_active_columns; col++) { + unsigned long long int_count_total = column_stats[col].big_int_count + + column_stats[col].negative_small_int_count + + column_stats[col].positive_small_int_count; + + if (column_stats[col].null_count == num_records) { + // Entire column is NULL; allocate the smallest amount of memory + dtypes.emplace_back(cudf::type_id::INT8); + } else if (column_stats[col].string_count > 0L) { + dtypes.emplace_back(cudf::type_id::STRING); + } else if (column_stats[col].datetime_count > 0L) { + dtypes.emplace_back(cudf::type_id::TIMESTAMP_NANOSECONDS); + } else if (column_stats[col].bool_count > 
0L) { + dtypes.emplace_back(cudf::type_id::BOOL8); + } else if (column_stats[col].float_count > 0L || + (column_stats[col].float_count == 0L && int_count_total > 0L && + column_stats[col].null_count > 0L)) { + // The second condition has been added to conform to + // PANDAS which states that a column of integers with + // a single NULL record need to be treated as floats. + dtypes.emplace_back(cudf::type_id::FLOAT64); + } else if (column_stats[col].big_int_count == 0) { + dtypes.emplace_back(cudf::type_id::INT64); + } else if (column_stats[col].big_int_count != 0 && + column_stats[col].negative_small_int_count != 0) { + dtypes.emplace_back(cudf::type_id::STRING); + } else { + // Integers are stored as 64-bit to conform to PANDAS + dtypes.emplace_back(cudf::type_id::UINT64); + } + } + } + + if (timestamp_type.id() != cudf::type_id::EMPTY) { + for (auto& type : dtypes) { + if (cudf::is_timestamp(type)) { type = timestamp_type; } + } + } + + for (size_t i = 0; i < dtypes.size(); i++) { + // Replace EMPTY dtype with STRING + if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; } + } + + return dtypes; +} + +std::vector decode_data(parse_options const& parse_opts, + std::vector const& column_flags, + std::vector const& column_names, + device_span data, + device_span row_offsets, + host_span column_types, + int32_t num_records, + int32_t num_actual_columns, + int32_t num_active_columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Alloc output; columns' data memory is still expected for empty dataframe + std::vector out_buffers; + out_buffers.reserve(column_types.size()); + + for (int col = 0, active_col = 0; col < num_actual_columns; ++col) { + if (column_flags[col] & column_parse::enabled) { + const bool is_final_allocation = column_types[active_col].id() != type_id::STRING; + auto out_buffer = + column_buffer(column_types[active_col], + num_records, + true, + stream, + is_final_allocation ? 
mr : rmm::mr::get_current_device_resource()); + + out_buffer.name = column_names[col]; + out_buffer.null_count() = UNKNOWN_NULL_COUNT; + out_buffers.emplace_back(std::move(out_buffer)); + active_col++; + } + } + + thrust::host_vector h_data(num_active_columns); + thrust::host_vector h_valid(num_active_columns); + + for (int i = 0; i < num_active_columns; ++i) { + h_data[i] = out_buffers[i].data(); + h_valid[i] = out_buffers[i].null_mask(); + } + + cudf::io::csv::gpu::decode_row_column_data(parse_opts.view(), + data, + make_device_uvector_async(column_flags, stream), + row_offsets, + make_device_uvector_async(column_types, stream), + make_device_uvector_async(h_data, stream), + make_device_uvector_async(h_valid, stream), + stream); + + return out_buffers; +} + +table_with_metadata read_csv(cudf::io::datasource* source, + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + std::vector header; + + auto const data_row_offsets = + select_data_and_row_offsets(source, reader_opts, header, parse_opts, stream); + + auto const& data = data_row_offsets.first; + auto const& row_offsets = data_row_offsets.second; // Exclude the end-of-data row from number of rows with actual data auto num_records = std::max(row_offsets.size(), 1ul) - 1; @@ -627,418 +970,33 @@ table_with_metadata read_csv(cudf::io::datasource* source, } /** - * @brief Finds row positions in the specified input data, and loads the selected data onto GPU. - * - * This function scans the input data to record the row offsets (relative to the start of the - * input data). A row is actually the data/offset between two termination symbols. 
- * - * @param data Uncompressed input data in host memory - * @param range_begin Only include rows starting after this position - * @param range_end Only include rows starting before this position - * @param skip_rows Number of rows to skip from the start - * @param num_rows Number of rows to read; -1: all remaining data - * @param load_whole_file Hint that the entire data will be needed on gpu - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Input data and row offsets in the device memory + * @brief Create a serialized trie for N/A value matching, based on the options. */ -std::pair, selected_rows_offsets> load_data_and_gather_row_offsets( - csv_reader_options const& reader_opts, - parse_options const& parse_opts, - std::vector& header, - host_span data, - size_t range_begin, - size_t range_end, - size_t skip_rows, - int64_t num_rows, - bool load_whole_file, - rmm::cuda_stream_view stream) +cudf::detail::trie create_na_trie(char quotechar, + csv_reader_options const& reader_opts, + rmm::cuda_stream_view stream) { - constexpr size_t max_chunk_bytes = 64 * 1024 * 1024; // 64MB - size_t buffer_size = std::min(max_chunk_bytes, data.size()); - size_t max_blocks = - std::max((buffer_size / cudf::io::csv::gpu::rowofs_block_bytes) + 1, 2); - hostdevice_vector row_ctx(max_blocks); - size_t buffer_pos = std::min(range_begin - std::min(range_begin, sizeof(char)), data.size()); - size_t pos = std::min(range_begin, data.size()); - size_t header_rows = (reader_opts.get_header() >= 0) ? 
reader_opts.get_header() + 1 : 0; - uint64_t ctx = 0; - - // For compatibility with the previous parser, a row is considered in-range if the - // previous row terminator is within the given range - range_end += (range_end < data.size()); + // Default values to recognize as null values + static std::vector const default_na_values{"", + "#N/A", + "#N/A N/A", + "#NA", + "-1.#IND", + "-1.#QNAN", + "-NaN", + "-nan", + "1.#IND", + "1.#QNAN", + "", + "N/A", + "NA", + "NULL", + "NaN", + "n/a", + "nan", + "null"}; - // Reserve memory by allocating and then resetting the size - rmm::device_uvector d_data{ - (load_whole_file) ? data.size() : std::min(buffer_size * 2, data.size()), stream}; - d_data.resize(0, stream); - rmm::device_uvector all_row_offsets{0, stream}; - do { - size_t target_pos = std::min(pos + max_chunk_bytes, data.size()); - size_t chunk_size = target_pos - pos; - - auto const previous_data_size = d_data.size(); - d_data.resize(target_pos - buffer_pos, stream); - CUDA_TRY(cudaMemcpyAsync(d_data.begin() + previous_data_size, - data.begin() + buffer_pos + previous_data_size, - target_pos - buffer_pos - previous_data_size, - cudaMemcpyDefault, - stream.value())); - - // Pass 1: Count the potential number of rows in each character block for each - // possible parser state at the beginning of the block. - uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), - row_ctx.device_ptr(), - device_span(), - d_data, - chunk_size, - pos, - buffer_pos, - data.size(), - range_begin, - range_end, - skip_rows, - stream); - CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_ctx.device_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDeviceToHost, - stream.value())); - stream.synchronize(); - - // Sum up the rows in each character block, selecting the row count that - // corresponds to the current input context. Also stores the now known input - // context per character block that will be needed by the second pass. 
- for (uint32_t i = 0; i < num_blocks; i++) { - uint64_t ctx_next = cudf::io::csv::gpu::select_row_context(ctx, row_ctx[i]); - row_ctx[i] = ctx; - ctx = ctx_next; - } - size_t total_rows = ctx >> 2; - if (total_rows > skip_rows) { - // At least one row in range in this batch - all_row_offsets.resize(total_rows - skip_rows, stream); - - CUDA_TRY(cudaMemcpyAsync(row_ctx.device_ptr(), - row_ctx.host_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyHostToDevice, - stream.value())); - - // Pass 2: Output row offsets - cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), - row_ctx.device_ptr(), - all_row_offsets, - d_data, - chunk_size, - pos, - buffer_pos, - data.size(), - range_begin, - range_end, - skip_rows, - stream); - // With byte range, we want to keep only one row out of the specified range - if (range_end < data.size()) { - CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_ctx.device_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDeviceToHost, - stream.value())); - stream.synchronize(); - - size_t rows_out_of_range = 0; - for (uint32_t i = 0; i < num_blocks; i++) { - rows_out_of_range += row_ctx[i]; - } - if (rows_out_of_range != 0) { - // Keep one row out of range (used to infer length of previous row) - auto new_row_offsets_size = - all_row_offsets.size() - std::min(rows_out_of_range - 1, all_row_offsets.size()); - all_row_offsets.resize(new_row_offsets_size, stream); - // Implies we reached the end of the range - break; - } - } - // num_rows does not include blank rows - if (num_rows >= 0) { - if (all_row_offsets.size() > header_rows + static_cast(num_rows)) { - size_t num_blanks = cudf::io::csv::gpu::count_blank_rows( - parse_opts.view(), d_data, all_row_offsets, stream); - if (all_row_offsets.size() - num_blanks > header_rows + static_cast(num_rows)) { - // Got the desired number of rows - break; - } - } - } - } else { - // Discard data (all rows below skip_rows), keeping one character for history - size_t discard_bytes = 
std::max(d_data.size(), sizeof(char)) - sizeof(char); - if (discard_bytes != 0) { - erase_except_last(d_data, stream); - buffer_pos += discard_bytes; - } - } - pos = target_pos; - } while (pos < data.size()); - - auto const non_blank_row_offsets = - io::csv::gpu::remove_blank_rows(parse_opts.view(), d_data, all_row_offsets, stream); - auto row_offsets = selected_rows_offsets{std::move(all_row_offsets), non_blank_row_offsets}; - - // Remove header rows and extract header - const size_t header_row_index = std::max(header_rows, 1) - 1; - if (header_row_index + 1 < row_offsets.size()) { - CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_offsets.data() + header_row_index, - 2 * sizeof(uint64_t), - cudaMemcpyDeviceToHost, - stream.value())); - stream.synchronize(); - - const auto header_start = buffer_pos + row_ctx[0]; - const auto header_end = buffer_pos + row_ctx[1]; - CUDF_EXPECTS(header_start <= header_end && header_end <= data.size(), - "Invalid csv header location"); - header.assign(data.begin() + header_start, data.begin() + header_end); - if (header_rows > 0) { row_offsets.erase_first_n(header_rows); } - } - // Apply num_rows limit - if (num_rows >= 0 && static_cast(num_rows) < row_offsets.size() - 1) { - row_offsets.shrink(num_rows + 1); - } - return {std::move(d_data), std::move(row_offsets)}; -} - -std::vector infer_column_types(parse_options const& parse_opts, - std::vector const& column_flags, - device_span data, - device_span row_offsets, - int32_t num_records, - int32_t num_active_columns, - data_type timestamp_type, - rmm::cuda_stream_view stream) -{ - std::vector dtypes; - if (num_records == 0) { - dtypes.resize(num_active_columns, data_type{type_id::EMPTY}); - } else { - auto column_stats = - cudf::io::csv::gpu::detect_column_types(parse_opts.view(), - data, - make_device_uvector_async(column_flags, stream), - row_offsets, - num_active_columns, - stream); - - stream.synchronize(); - - for (int col = 0; col < num_active_columns; col++) { - unsigned 
long long int_count_total = column_stats[col].big_int_count + - column_stats[col].negative_small_int_count + - column_stats[col].positive_small_int_count; - - if (column_stats[col].null_count == num_records) { - // Entire column is NULL; allocate the smallest amount of memory - dtypes.emplace_back(cudf::type_id::INT8); - } else if (column_stats[col].string_count > 0L) { - dtypes.emplace_back(cudf::type_id::STRING); - } else if (column_stats[col].datetime_count > 0L) { - dtypes.emplace_back(cudf::type_id::TIMESTAMP_NANOSECONDS); - } else if (column_stats[col].bool_count > 0L) { - dtypes.emplace_back(cudf::type_id::BOOL8); - } else if (column_stats[col].float_count > 0L || - (column_stats[col].float_count == 0L && int_count_total > 0L && - column_stats[col].null_count > 0L)) { - // The second condition has been added to conform to - // PANDAS which states that a column of integers with - // a single NULL record need to be treated as floats. - dtypes.emplace_back(cudf::type_id::FLOAT64); - } else if (column_stats[col].big_int_count == 0) { - dtypes.emplace_back(cudf::type_id::INT64); - } else if (column_stats[col].big_int_count != 0 && - column_stats[col].negative_small_int_count != 0) { - dtypes.emplace_back(cudf::type_id::STRING); - } else { - // Integers are stored as 64-bit to conform to PANDAS - dtypes.emplace_back(cudf::type_id::UINT64); - } - } - } - - if (timestamp_type.id() != cudf::type_id::EMPTY) { - for (auto& type : dtypes) { - if (cudf::is_timestamp(type)) { type = timestamp_type; } - } - } - - for (size_t i = 0; i < dtypes.size(); i++) { - // Replace EMPTY dtype with STRING - if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; } - } - - return dtypes; -} - -std::vector parse_column_types(std::vector& column_flags, - std::vector const& column_names, - std::vector const& types_as_strings, - int32_t num_actual_columns, - int32_t num_active_columns, - data_type timestamp_type) -{ - std::vector dtypes; - - bool const is_dict = 
std::all_of(types_as_strings.begin(), - types_as_strings.end(), - [](auto const& s) { return s.find(':') != std::string::npos; }); - - if (!is_dict) { - if (types_as_strings.size() == 1) { - // If it's a single dtype, assign that dtype to all active columns - data_type dtype_; - column_parse::flags col_flags_; - std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]); - dtypes.resize(num_active_columns, dtype_); - for (int col = 0; col < num_actual_columns; col++) { - column_flags[col] |= col_flags_; - } - CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); - } else { - // If it's a list, assign dtypes to active columns in the given order - CUDF_EXPECTS(static_cast(types_as_strings.size()) >= num_actual_columns, - "Must specify data types for all columns"); - - auto dtype_ = std::back_inserter(dtypes); - - for (int col = 0; col < num_actual_columns; col++) { - if (column_flags[col] & column_parse::enabled) { - column_parse::flags col_flags_; - std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]); - column_flags[col] |= col_flags_; - CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); - } - } - } - } else { - // Translate vector of `name : dtype` strings to map - // NOTE: Incoming pairs can be out-of-order from column names in dataset - std::unordered_map col_type_map; - for (const auto& pair : types_as_strings) { - const auto pos = pair.find_last_of(':'); - const auto name = pair.substr(0, pos); - const auto dtype = pair.substr(pos + 1, pair.size()); - col_type_map[name] = dtype; - } - - auto dtype_ = std::back_inserter(dtypes); - - for (int col = 0; col < num_actual_columns; col++) { - if (column_flags[col] & column_parse::enabled) { - CUDF_EXPECTS(col_type_map.find(column_names[col]) != col_type_map.end(), - "Must specify data types for all active columns"); - column_parse::flags col_flags_; - std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[column_names[col]]); - 
column_flags[col] |= col_flags_; - CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); - } - } - } - - if (timestamp_type.id() != cudf::type_id::EMPTY) { - for (auto& type : dtypes) { - if (cudf::is_timestamp(type)) { type = timestamp_type; } - } - } - - for (size_t i = 0; i < dtypes.size(); i++) { - // Replace EMPTY dtype with STRING - if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; } - } - - return dtypes; -} - -std::vector decode_data(parse_options const& parse_opts, - std::vector const& column_flags, - std::vector const& column_names, - device_span data, - device_span row_offsets, - host_span column_types, - int32_t num_records, - int32_t num_actual_columns, - int32_t num_active_columns, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Alloc output; columns' data memory is still expected for empty dataframe - std::vector out_buffers; - out_buffers.reserve(column_types.size()); - - for (int col = 0, active_col = 0; col < num_actual_columns; ++col) { - if (column_flags[col] & column_parse::enabled) { - const bool is_final_allocation = column_types[active_col].id() != type_id::STRING; - auto out_buffer = - column_buffer(column_types[active_col], - num_records, - true, - stream, - is_final_allocation ? 
mr : rmm::mr::get_current_device_resource()); - - out_buffer.name = column_names[col]; - out_buffer.null_count() = UNKNOWN_NULL_COUNT; - out_buffers.emplace_back(std::move(out_buffer)); - active_col++; - } - } - - thrust::host_vector h_data(num_active_columns); - thrust::host_vector h_valid(num_active_columns); - - for (int i = 0; i < num_active_columns; ++i) { - h_data[i] = out_buffers[i].data(); - h_valid[i] = out_buffers[i].null_mask(); - } - - cudf::io::csv::gpu::decode_row_column_data(parse_opts.view(), - data, - make_device_uvector_async(column_flags, stream), - row_offsets, - make_device_uvector_async(column_types, stream), - make_device_uvector_async(h_data, stream), - make_device_uvector_async(h_valid, stream), - stream); - - return out_buffers; -} - -/** - * @brief Create a serialized trie for N/A value matching, based on the options. - */ -cudf::detail::trie create_na_trie(char quotechar, - csv_reader_options const& reader_opts, - rmm::cuda_stream_view stream) -{ - // Default values to recognize as null values - static std::vector const default_na_values{"", - "#N/A", - "#N/A N/A", - "#NA", - "-1.#IND", - "-1.#QNAN", - "-NaN", - "-nan", - "1.#IND", - "1.#QNAN", - "", - "N/A", - "NA", - "NULL", - "NaN", - "n/a", - "nan", - "null"}; - - if (!reader_opts.is_enabled_na_filter()) { return cudf::detail::trie(0, stream); } + if (!reader_opts.is_enabled_na_filter()) { return cudf::detail::trie(0, stream); } std::vector na_values = reader_opts.get_na_values(); if (reader_opts.is_enabled_keep_default_na()) { @@ -1109,6 +1067,8 @@ parse_options make_parse_options(csv_reader_options const& reader_opts, return parse_opts; } +} // namespace + table_with_metadata read_csv(std::unique_ptr&& source, csv_reader_options const& options, rmm::cuda_stream_view stream, From 88e23990151c737dcb4a22a5d6454ef8893285c4 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 17 Aug 2021 00:53:48 -0500 Subject: [PATCH 12/19] remove filepath-related logic from csv and json 
readers --- cpp/include/cudf/io/csv.hpp | 2 +- cpp/include/cudf/io/json.hpp | 2 +- cpp/src/io/comp/io_uncomp.h | 7 +++-- cpp/src/io/comp/uncomp.cpp | 19 ++++++------ cpp/src/io/csv/reader_impl.cu | 38 ++++++------------------ cpp/src/io/csv/reader_impl.hpp | 4 --- cpp/src/io/functions.cpp | 40 ++++++++++++++++++++++++-- cpp/src/io/json/reader_impl.cu | 31 ++------------------ cpp/src/io/json/reader_impl.hpp | 1 - cpp/src/io/utilities/parsing_utils.cu | 34 ---------------------- cpp/src/io/utilities/parsing_utils.cuh | 18 ------------ python/cudf/cudf/_lib/csv.pyx | 2 +- python/cudf/cudf/tests/test_csv.py | 14 --------- 13 files changed, 66 insertions(+), 146 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index d4a21b2e98c..c807f189aac 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1199,7 +1199,7 @@ class csv_reader_options_builder { * @return The set of columns along with metadata. */ table_with_metadata read_csv( - csv_reader_options const& options, + csv_reader_options options, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 8954f7dcab1..bca60f76260 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -364,7 +364,7 @@ class json_reader_options_builder { * @return The set of columns along with metadata. 
*/ table_with_metadata read_json( - json_reader_options const& options, + json_reader_options options, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/io/comp/io_uncomp.h b/cpp/src/io/comp/io_uncomp.h index 8daf73ecd0c..7b1feb84813 100644 --- a/cpp/src/io/comp/io_uncomp.h +++ b/cpp/src/io/comp/io_uncomp.h @@ -16,12 +16,13 @@ #pragma once +#include +#include + #include #include #include -#include - using cudf::host_span; namespace cudf { @@ -42,7 +43,7 @@ enum { std::vector io_uncompress_single_h2d(void const* src, size_t src_size, int stream_type); -std::vector get_uncompressed_data(host_span data, std::string const& compression); +std::vector get_uncompressed_data(host_span data, compression_type compression); class HostDecompressor { public: diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 2cb99d897fe..ee451d04dbb 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -369,6 +369,7 @@ std::vector io_uncompress_single_h2d(const void* src, size_t src_size, int // Unsupported format break; } + CUDF_EXPECTS(comp_data != nullptr, "Unsupported compressed stream type"); CUDF_EXPECTS(comp_len > 0, "Unsupported compressed stream type"); @@ -422,17 +423,17 @@ std::vector io_uncompress_single_h2d(const void* src, size_t src_size, int * @return Vector containing the output uncompressed data */ std::vector get_uncompressed_data(host_span const data, - std::string const& compression) + compression_type compression) { int comp_type = IO_UNCOMP_STREAM_TYPE_INFER; - if (compression == "gzip") - comp_type = IO_UNCOMP_STREAM_TYPE_GZIP; - else if (compression == "zip") - comp_type = IO_UNCOMP_STREAM_TYPE_ZIP; - else if (compression == "bz2") - comp_type = IO_UNCOMP_STREAM_TYPE_BZIP2; - else if (compression == "xz") - comp_type = IO_UNCOMP_STREAM_TYPE_XZ; + + switch (compression) { + case compression_type::GZIP: comp_type = IO_UNCOMP_STREAM_TYPE_GZIP; break; + 
case compression_type::ZIP: comp_type = IO_UNCOMP_STREAM_TYPE_ZIP; break; + case compression_type::BZIP2: comp_type = IO_UNCOMP_STREAM_TYPE_BZIP2; break; + case compression_type::XZ: comp_type = IO_UNCOMP_STREAM_TYPE_XZ; break; + default: break; + } return io_uncompress_single_h2d(data.data(), data.size(), comp_type); } diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 549b0474fe1..a85a610962e 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -206,10 +206,12 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) auto num_rows = opts_.get_nrows(); if (range_offset > 0 || range_size > 0) { - CUDF_EXPECTS(compression_type_ == "none", + CUDF_EXPECTS(opts_.get_compression() == compression_type::NONE, "Reading compressed data using `byte range` is unsupported"); } + size_t map_range_size = 0; + if (range_size != 0) { auto num_given_dtypes = std::visit([](const auto& dtypes) { return dtypes.size(); }, opts_.get_dtypes()); @@ -217,12 +219,7 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) map_range_size = range_size + calculateMaxRowSize(num_columns); } - // Support delayed opening of the file if using memory mapping datasource - // This allows only mapping of a subset of the file if using byte range - if (source_ == nullptr) { - assert(!filepath_.empty()); - source_ = datasource::create(filepath_, range_offset, map_range_size); - } + // TODO: provide hint to datasource that we should memory map any underlying file. 
// Transfer source data to GPU if (!source_->is_empty()) { @@ -235,10 +232,11 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) std::vector h_uncomp_data_owner; - if (compression_type_ != "none") { - h_uncomp_data_owner = get_uncompressed_data(h_data, compression_type_); + if (opts_.get_compression() != compression_type::NONE) { + h_uncomp_data_owner = get_uncompressed_data(h_data, opts_.get_compression()); h_data = h_uncomp_data_owner; } + // None of the parameters for row selection is used, we are parsing the entire file const bool load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 && skip_end_rows <= 0 && num_rows == -1; @@ -927,35 +925,17 @@ parse_options make_parse_options(csv_reader_options const& reader_opts, } reader::impl::impl(std::unique_ptr source, - std::string filepath, csv_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : mr_(mr), source_(std::move(source)), filepath_(filepath), opts_(options) + : mr_(mr), source_(std::move(source)), opts_(options) { num_actual_cols_ = opts_.get_names().size(); num_active_cols_ = num_actual_cols_; - compression_type_ = - infer_compression_type(opts_.get_compression(), - filepath, - {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}}); - opts = make_parse_options(options, stream); } -// Forward to implementation -reader::reader(std::vector const& filepaths, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported."); - // Delay actual instantiation of data source until read to allow for - // partial memory mapping of file using byte ranges - _impl = std::make_unique(nullptr, filepaths[0], options, stream, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, csv_reader_options const& options, @@ -963,7 +943,7 @@ reader::reader(std::vector>&& sources, 
rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); - _impl = std::make_unique(std::move(sources[0]), "", options, stream, mr); + _impl = std::make_unique(std::move(sources[0]), options, stream, mr); } // Destructor within this translation unit diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 36c2bf4f9e7..beaa9b816cb 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -72,13 +72,11 @@ class reader::impl { * @brief Constructor from a dataset source with reader options. * * @param source Dataset source - * @param filepath Filepath if reading dataset from a file * @param options Settings for controlling reading behavior * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit impl(std::unique_ptr source, - std::string filepath, csv_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); @@ -222,8 +220,6 @@ class reader::impl { private: rmm::mr::device_memory_resource* mr_ = nullptr; std::unique_ptr source_; - std::string filepath_; - std::string compression_type_; const csv_reader_options opts_; cudf::size_type num_records_ = 0; // Number of rows with actual data diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index e080ea3a2ca..ccc2eef56c7 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -144,27 +144,61 @@ table_with_metadata read_avro(avro_reader_options const& options, return reader->read(options); } -table_with_metadata read_json(json_reader_options const& options, - rmm::mr::device_memory_resource* mr) +compression_type infer_compression_type(compression_type compression, source_info const& info) +{ + if (compression != compression_type::AUTO) { return compression; } + + if (info.type != io_type::FILEPATH) { return compression_type::NONE; } + 
+ auto filepath = info.filepaths[0]; + + // Attempt to infer from the file extension + const auto pos = filepath.find_last_of('.'); + + if (pos == std::string::npos) { return {}; } + + auto str_tolower = [](const auto& begin, const auto& end) { + std::string out; + std::transform(begin, end, std::back_inserter(out), ::tolower); + return out; + }; + + const auto ext = str_tolower(filepath.begin() + pos + 1, filepath.end()); + + if (ext == "gz") { return compression_type::GZIP; } + if (ext == "zip") { return compression_type::ZIP; } + if (ext == "bz2") { return compression_type::BZIP2; } + if (ext == "xz") { return compression_type::XZ; } + + return compression_type::NONE; +} + +table_with_metadata read_json(json_reader_options options, rmm::mr::device_memory_resource* mr) { namespace json = cudf::io::detail::json; CUDF_FUNC_RANGE(); auto datasources = make_datasources(options.get_source()); + + options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + auto reader = std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(options); } -table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_memory_resource* mr) +table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) { namespace csv = cudf::io::detail::csv; CUDF_FUNC_RANGE(); auto datasources = make_datasources(options.get_source()); + + options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + auto reader = std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index a8f117c22bf..bae7471e307 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -241,15 +241,6 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) map_range_size = range_size + 
calculate_max_row_size(dtype_option_size); } - // Support delayed opening of the file if using memory mapping datasource - // This allows only mapping of a subset of the file if using byte range - if (sources_.empty()) { - assert(!filepaths_.empty()); - for (const auto& path : filepaths_) { - sources_.emplace_back(datasource::create(path, range_offset, map_range_size)); - } - } - // Iterate through the user defined sources and read the contents into the local buffer CUDF_EXPECTS(!sources_.empty(), "No sources were defined"); size_t total_source_size = 0; @@ -280,11 +271,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) */ void reader::impl::decompress_input(rmm::cuda_stream_view stream) { - const auto compression_type = - infer_compression_type(options_.get_compression(), - filepaths_.size() > 0 ? filepaths_[0] : "", - {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}}); - if (compression_type == "none") { + if (options_.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy uncomp_data_ = reinterpret_cast(buffer_.data()); uncomp_size_ = buffer_.size(); @@ -293,7 +280,7 @@ void reader::impl::decompress_input(rmm::cuda_stream_view stream) host_span( // reinterpret_cast(buffer_.data()), buffer_.size()), - compression_type); + options_.get_compression()); uncomp_data_ = uncomp_data_owner_.data(); uncomp_size_ = uncomp_data_owner_.size(); @@ -665,7 +652,7 @@ reader::impl::impl(std::vector>&& sources, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : options_(options), mr_(mr), sources_(std::move(sources)), filepaths_(filepaths) + : options_(options), mr_(mr), sources_(std::move(sources)) { CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); @@ -713,18 +700,6 @@ table_with_metadata reader::impl::read(json_reader_options const& options, return convert_data_to_table(rec_starts, 
stream); } -// Forward to implementation -reader::reader(std::vector const& filepaths, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Delay actual instantiation of data source until read to allow for - // partial memory mapping of file using byte ranges - std::vector> src = {}; // Empty datasources - _impl = std::make_unique(std::move(src), filepaths, options, stream, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, json_reader_options const& options, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 5cf51369cdf..f7af55b2b90 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -57,7 +57,6 @@ class reader::impl { rmm::mr::device_memory_resource* mr_ = nullptr; std::vector> sources_; - std::vector filepaths_; std::vector buffer_; const char* uncomp_data_ = nullptr; diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index 6c8f01111e5..ba62238c5d3 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -209,39 +209,5 @@ cudf::size_type count_all_from_set(const char* h_data, return find_all_from_set(h_data, h_size, keys, 0, nullptr, stream); } -std::string infer_compression_type( - const compression_type& compression_arg, - const std::string& filename, - const std::vector>& ext_to_comp_map) -{ - auto str_tolower = [](const auto& begin, const auto& end) { - std::string out; - std::transform(begin, end, std::back_inserter(out), ::tolower); - return out; - }; - - // Attempt to infer from user-supplied argument - if (compression_arg != compression_type::AUTO) { - switch (compression_arg) { - case compression_type::GZIP: return "gzip"; - case compression_type::BZIP2: return "bz2"; - case compression_type::ZIP: return "zip"; - case compression_type::XZ: return "xz"; - default: break; - } - } - - // Attempt to infer from the file 
extension - const auto pos = filename.find_last_of('.'); - if (pos != std::string::npos) { - const auto ext = str_tolower(filename.begin() + pos + 1, filename.end()); - for (const auto& mapping : ext_to_comp_map) { - if (mapping.first == ext) { return mapping.second; } - } - } - - return "none"; -} - } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 88297423b9b..daf23de7eb2 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -454,24 +454,6 @@ cudf::size_type count_all_from_set(const char* h_data, const std::vector& keys, rmm::cuda_stream_view stream); -/** - * @brief Infer file compression type based on user supplied arguments. - * - * If the user specifies a valid compression_type for compression arg, - * compression type will be computed based on that. Otherwise the filename - * and ext_to_comp_map will be used. - * - * @param[in] compression_arg User specified compression type (if any) - * @param[in] filename Filename to base compression type (by extension) on - * @param[in] ext_to_comp_map User supplied mapping of file extension to compression type - * - * @return string representing compression type ("gzip, "bz2", etc) - */ -std::string infer_compression_type( - const compression_type& compression_arg, - const std::string& filename, - const std::vector>& ext_to_comp_map); - /** * @brief Checks whether the given character is a whitespace character. 
* diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index a15a180d466..7a54ccac197 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -101,7 +101,7 @@ cdef csv_reader_options make_csv_reader_options( bool na_filter, object prefix, object index_col, -) except +: +) except *: cdef source_info c_source_info = make_source_info([datasource]) cdef compression_type c_compression cdef size_type c_header diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 5511a65d0a4..8fb5d7cc9eb 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1069,20 +1069,6 @@ def test_csv_reader_byte_range(tmpdir, segment_bytes): assert list(df["int2"]) == list(ref_df["int2"]) -def test_csv_reader_byte_range_type_corner_case(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file17.csv") - - cudf.datasets.timeseries( - start="2000-01-01", - end="2000-01-02", - dtypes={"name": str, "id": int, "x": float, "y": float}, - ).to_csv(fname, chunksize=100000) - - byte_range = (2_147_483_648, 0) - with pytest.raises(RuntimeError, match="Offset is past end of file"): - cudf.read_csv(fname, byte_range=byte_range, header=None) - - @pytest.mark.parametrize("segment_bytes", [10, 19, 31, 36]) def test_csv_reader_byte_range_strings(segment_bytes): names = ["strings"] From 62b95202d9b1db14f765ef45644d9cf91f782ea7 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 17 Aug 2021 02:19:27 -0500 Subject: [PATCH 13/19] remove filepath logic from avro, parquet, orc readers --- cpp/include/cudf/io/detail/avro.hpp | 13 ------------- cpp/include/cudf/io/detail/orc.hpp | 13 ------------- cpp/include/cudf/io/detail/parquet.hpp | 13 ------------- cpp/src/io/avro/reader_impl.cu | 10 ---------- cpp/src/io/orc/reader_impl.cu | 9 --------- cpp/src/io/parquet/reader_impl.cu | 9 --------- 6 files changed, 67 deletions(-) diff --git a/cpp/include/cudf/io/detail/avro.hpp 
b/cpp/include/cudf/io/detail/avro.hpp index 98483d1c03e..306c15dcb72 100644 --- a/cpp/include/cudf/io/detail/avro.hpp +++ b/cpp/include/cudf/io/detail/avro.hpp @@ -38,19 +38,6 @@ class reader { std::unique_ptr _impl; public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - avro_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Constructor from an array of datasources * diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index ab26c01db74..2174b688da2 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -47,19 +47,6 @@ class reader { std::unique_ptr _impl; public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Constructor from an array of datasources * diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index d95af7a11da..14f27ef8eef 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -49,19 +49,6 @@ class reader { std::unique_ptr _impl; public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths 
Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Constructor from an array of datasources * diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index f6ffdd99d35..08ea96139a1 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -474,16 +474,6 @@ table_with_metadata reader::impl::read(avro_reader_options const& options, return {std::make_unique
(std::move(out_columns)), std::move(metadata_out)}; } -// Forward to implementation -reader::reader(std::vector const& filepaths, - avro_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported."); - _impl = std::make_unique(datasource::create(filepaths[0]), options, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, avro_reader_options const& options, diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 033a2d9aff5..5d62c45df83 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -1383,15 +1383,6 @@ table_with_metadata reader::impl::read(size_type skip_rows, return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; } -// Forward to implementation -reader::reader(std::vector const& filepaths, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - _impl = std::make_unique(datasource::create(filepaths), options, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, orc_reader_options const& options, diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 9f9bdfd4755..31ae763d9ff 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -1608,15 +1608,6 @@ table_with_metadata reader::impl::read(size_type skip_rows, return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; } -// Forward to implementation -reader::reader(std::vector const& filepaths, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(datasource::create(filepaths), options, mr)) -{ -} - // Forward to implementation reader::reader(std::vector>&& sources, parquet_reader_options const& options, From fb0129433bdd2dd264105ba172d96f2a310d8d8d Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 18 Aug 2021 15:19:11 -0500 Subject: [PATCH 14/19] move range size padding calculation out of json/csv reader and in to json/csv options --- cpp/include/cudf/io/csv.hpp | 34 +++++++++++++++++++++ cpp/include/cudf/io/json.hpp | 32 +++++++++++++++++++ cpp/src/io/csv/reader_impl.cu | 49 +++++------------------------- cpp/src/io/functions.cpp | 24 +++++++++++---- cpp/src/io/json/reader_impl.cu | 46 ++++++---------------------- cpp/src/io/json/reader_impl.hpp | 3 +- python/cudf/cudf/tests/test_csv.py | 14 +++++++++ 7 files changed, 116 insertions(+), 86 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index c807f189aac..1aa6e3bea29 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -177,6 +177,40 @@ class csv_reader_options { */ std::size_t get_byte_range_size() const { return _byte_range_size; } + /** + * @brief Returns number of bytes to read with padding. + */ + std::size_t get_byte_range_size_with_padding() const + { + if (_byte_range_size == 0) { + return 0; + } else { + return _byte_range_size + get_byte_range_padding(); + } + } + + /** + * @brief Returns number of bytes to pad when reading. 
+ */ + std::size_t get_byte_range_padding() const + { + auto const num_names = _names.size(); + auto const num_dtypes = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes); + auto const num_columns = std::max(num_dtypes, num_names); + + auto const max_row_bytes = 16 * 1024; // 16KB + auto const column_bytes = 64; + auto const base_padding = 1024; // 1KB + + if (num_columns == 0) { + // Use flat size if the number of columns is not known + return max_row_bytes; + } + + // Expand the size based on the number of columns, if available + return base_padding + num_columns * column_bytes; + } + /** * @brief Returns names of the columns. */ diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index bca60f76260..5d2a4f6fcd1 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -140,6 +140,38 @@ class json_reader_options { */ size_t get_byte_range_size() const { return _byte_range_size; } + /** + * @brief Returns number of bytes to read with padding. + */ + size_t get_byte_range_size_with_padding() const + { + if (_byte_range_size == 0) { + return 0; + } else { + return _byte_range_size + get_byte_range_padding(); + } + } + + /** + * @brief Returns number of bytes to pad when reading. + */ + size_t get_byte_range_padding() const + { + auto const num_columns = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes); + + auto const max_row_bytes = 16 * 1024; // 16KB + auto const column_bytes = 64; + auto const base_padding = 1024; // 1KB + + if (num_columns == 0) { + // Use flat size if the number of columns is not known + return max_row_bytes; + } + + // Expand the size based on the number of columns, if available + return base_padding + num_columns * column_bytes; + } + /** * @brief Whether to read the file as a json object per line. 
*/ diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index a85a610962e..c61cc26800e 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -57,31 +57,6 @@ namespace csv { using namespace cudf::io::csv; using namespace cudf::io; -/** - * @brief Estimates the maximum expected length or a row, based on the number - * of columns - * - * If the number of columns is not available, it will return a value large - * enough for most use cases - * - * @param[in] num_columns Number of columns in the CSV file (optional) - * - * @return Estimated maximum size of a row, in bytes - */ -constexpr size_t calculateMaxRowSize(int num_columns = 0) noexcept -{ - constexpr size_t max_row_bytes = 16 * 1024; // 16KB - constexpr size_t column_bytes = 64; - constexpr size_t base_padding = 1024; // 1KB - if (num_columns == 0) { - // Use flat size if the number of columns is not known - return max_row_bytes; - } else { - // Expand the size based on the number of columns, if available - return base_padding + num_columns * column_bytes; - } -} - /** * @brief Translates a dtype string and returns its dtype enumeration and any * extended dtype flags that are supported by cuIO. 
Often, this is a column @@ -199,31 +174,21 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream) std::pair, reader::impl::selected_rows_offsets> reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) { - auto range_offset = opts_.get_byte_range_offset(); - auto range_size = opts_.get_byte_range_size(); - auto skip_rows = opts_.get_skiprows(); - auto skip_end_rows = opts_.get_skipfooter(); - auto num_rows = opts_.get_nrows(); + auto range_offset = opts_.get_byte_range_offset(); + auto range_size = opts_.get_byte_range_size(); + auto range_size_padded = opts_.get_byte_range_size_with_padding(); + auto skip_rows = opts_.get_skiprows(); + auto skip_end_rows = opts_.get_skipfooter(); + auto num_rows = opts_.get_nrows(); if (range_offset > 0 || range_size > 0) { CUDF_EXPECTS(opts_.get_compression() == compression_type::NONE, "Reading compressed data using `byte range` is unsupported"); } - size_t map_range_size = 0; - - if (range_size != 0) { - auto num_given_dtypes = - std::visit([](const auto& dtypes) { return dtypes.size(); }, opts_.get_dtypes()); - const auto num_columns = std::max(opts_.get_names().size(), num_given_dtypes); - map_range_size = range_size + calculateMaxRowSize(num_columns); - } - - // TODO: provide hint to datasource that we should memory map any underlying file. - // Transfer source data to GPU if (!source_->is_empty()) { - auto data_size = (map_range_size != 0) ? map_range_size : source_->size(); + auto data_size = (range_size_padded != 0) ? 
range_size_padded : source_->size(); auto buffer = source_->host_read(range_offset, data_size); auto h_data = host_span( // diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index ccc2eef56c7..438cb1762c6 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -107,10 +107,18 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder( namespace { -std::vector> make_datasources(source_info const& info) +std::vector> make_datasources(source_info const& info, + size_t range_offset = 0, + size_t range_size = 0) { switch (info.type) { - case io_type::FILEPATH: return cudf::io::datasource::create(info.filepaths); + case io_type::FILEPATH: { + auto sources = std::vector>(); + for (auto const& filepath : info.filepaths) { + sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size)); + } + return sources; + } case io_type::HOST_BUFFER: return cudf::io::datasource::create(info.buffers); case io_type::USER_IMPLEMENTED: return cudf::io::datasource::create(info.user_sources); default: CUDF_FAIL("Unsupported source type"); @@ -179,10 +187,12 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor CUDF_FUNC_RANGE(); - auto datasources = make_datasources(options.get_source()); - options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + auto datasources = make_datasources(options.get_source(), + options.get_byte_range_offset(), + options.get_byte_range_size_with_padding()); + auto reader = std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); @@ -195,10 +205,12 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_ CUDF_FUNC_RANGE(); - auto datasources = make_datasources(options.get_source()); - options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + auto datasources = make_datasources(options.get_source(), + 
options.get_byte_range_offset(), + options.get_byte_range_size_with_padding()); + auto reader = std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index bae7471e307..0618f02e98f 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -50,31 +50,6 @@ namespace detail { namespace json { using namespace cudf::io; -namespace { -/** - * @brief Estimates the maximum expected length or a row, based on the number - * of columns - * - * If the number of columns is not available, it will return a value large - * enough for most use cases - * - * @param[in] num_columns Number of columns in the JSON file (optional) - * - * @return Estimated maximum size of a row, in bytes - */ -constexpr size_t calculate_max_row_size(int num_columns = 0) noexcept -{ - constexpr size_t max_row_bytes = 16 * 1024; // 16KB - constexpr size_t column_bytes = 64; - constexpr size_t base_padding = 1024; // 1KB - return num_columns == 0 - ? max_row_bytes // Use flat size if the # of columns is not known - : base_padding + - num_columns * column_bytes; // Expand size based on the # of columns, if available -} - -} // anonymous namespace - /** * @brief Aggregate the table containing keys info by their hash values. 
* @@ -231,16 +206,12 @@ std::pair, col_map_ptr_type> reader::impl::get_json_obj * * @param[in] range_offset Number of bytes offset from the start * @param[in] range_size Bytes to read; use `0` for all remaining data + * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ -void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) +void reader::impl::ingest_raw_input(size_t range_offset, + size_t range_size, + size_t range_size_padded) { - size_t map_range_size = 0; - if (range_size != 0) { - auto const dtype_option_size = - std::visit([](const auto& dtypes) { return dtypes.size(); }, options_.get_dtypes()); - map_range_size = range_size + calculate_max_row_size(dtype_option_size); - } - // Iterate through the user defined sources and read the contents into the local buffer CUDF_EXPECTS(!sources_.empty(), "No sources were defined"); size_t total_source_size = 0; @@ -253,7 +224,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) size_t bytes_read = 0; for (const auto& source : sources_) { if (!source->is_empty()) { - auto data_size = (map_range_size != 0) ? map_range_size : source->size(); + auto data_size = (range_size_padded != 0) ? 
range_size_padded : source->size(); bytes_read += source->host_read(range_offset, data_size, &buffer_[bytes_read]); } } @@ -675,10 +646,11 @@ reader::impl::impl(std::vector>&& sources, table_with_metadata reader::impl::read(json_reader_options const& options, rmm::cuda_stream_view stream) { - auto range_offset = options.get_byte_range_offset(); - auto range_size = options.get_byte_range_size(); + auto range_offset = options.get_byte_range_offset(); + auto range_size = options.get_byte_range_size(); + auto range_size_padded = options.get_byte_range_size_with_padding(); - ingest_raw_input(range_offset, range_size); + ingest_raw_input(range_offset, range_size, range_size_padded); CUDF_EXPECTS(buffer_.size() != 0, "Ingest failed: input data is null.\n"); decompress_input(stream); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index f7af55b2b90..d01f2e8677e 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -109,8 +109,9 @@ class reader::impl { * * @param[in] range_offset Number of bytes offset from the start * @param[in] range_size Bytes to read; use `0` for all remaining data + * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ - void ingest_raw_input(size_t range_offset, size_t range_size); + void ingest_raw_input(size_t range_offset, size_t range_size, size_t range_size_padded); /** * @brief Extract the JSON objects keys from the input file with object rows. 
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 8fb5d7cc9eb..5511a65d0a4 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1069,6 +1069,20 @@ def test_csv_reader_byte_range(tmpdir, segment_bytes): assert list(df["int2"]) == list(ref_df["int2"]) +def test_csv_reader_byte_range_type_corner_case(tmpdir): + fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file17.csv") + + cudf.datasets.timeseries( + start="2000-01-01", + end="2000-01-02", + dtypes={"name": str, "id": int, "x": float, "y": float}, + ).to_csv(fname, chunksize=100000) + + byte_range = (2_147_483_648, 0) + with pytest.raises(RuntimeError, match="Offset is past end of file"): + cudf.read_csv(fname, byte_range=byte_range, header=None) + + @pytest.mark.parametrize("segment_bytes", [10, 19, 31, 36]) def test_csv_reader_byte_range_strings(segment_bytes): names = ["strings"] From d422aebbe62d7e9915af93f474563e6e1c571e97 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 18 Aug 2021 15:30:38 -0500 Subject: [PATCH 15/19] remove filepaths from json reader --- cpp/src/io/json/reader_impl.cu | 12 +++++------- cpp/src/io/json/reader_impl.hpp | 3 +-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 0618f02e98f..2964a12568f 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -231,7 +231,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, byte_range_offset_ = range_offset; byte_range_size_ = range_size; - load_whole_file_ = byte_range_offset_ == 0 && byte_range_size_ == 0; + load_whole_source_ = byte_range_offset_ == 0 && byte_range_size_ == 0; } /** @@ -256,7 +256,7 @@ void reader::impl::decompress_input(rmm::cuda_stream_view stream) uncomp_data_ = uncomp_data_owner_.data(); uncomp_size_ = uncomp_data_owner_.size(); } - if (load_whole_file_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, 
stream); + if (load_whole_source_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); } rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_view stream) @@ -268,7 +268,7 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ if (allow_newlines_in_strings_) { chars_to_count.push_back('\"'); } // If not starting at an offset, add an extra row to account for the first row in the file cudf::size_type prefilter_count = ((byte_range_offset_ == 0) ? 1 : 0); - if (load_whole_file_) { + if (load_whole_source_) { prefilter_count += count_all_from_set(data_, chars_to_count, stream); } else { prefilter_count += count_all_from_set(uncomp_data_, uncomp_size_, chars_to_count, stream); @@ -286,7 +286,7 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ std::vector chars_to_find{'\n'}; if (allow_newlines_in_strings_) { chars_to_find.push_back('\"'); } // Passing offset = 1 to return positions AFTER the found character - if (load_whole_file_) { + if (load_whole_source_) { find_all_from_set(data_, chars_to_find, 1, find_result_ptr, stream); } else { find_all_from_set(uncomp_data_, uncomp_size_, chars_to_find, 1, find_result_ptr, stream); @@ -619,7 +619,6 @@ table_with_metadata reader::impl::convert_data_to_table(device_span>&& sources, - std::vector const& filepaths, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -678,8 +677,7 @@ reader::reader(std::vector>&& sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - std::vector file_paths = {}; // Empty filepaths - _impl = std::make_unique(std::move(sources), file_paths, options, stream, mr); + _impl = std::make_unique(std::move(sources), options, stream, mr); } // Destructor within this translation unit diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index d01f2e8677e..d910cce2d72 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ 
b/cpp/src/io/json/reader_impl.hpp @@ -68,7 +68,7 @@ class reader::impl { size_t byte_range_offset_ = 0; size_t byte_range_size_ = 0; - bool load_whole_file_ = true; + bool load_whole_source_ = true; table_metadata metadata_; std::vector dtypes_; @@ -186,7 +186,6 @@ class reader::impl { * @brief Constructor from a dataset source with reader options. */ explicit impl(std::vector>&& sources, - std::vector const& filepaths, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From 640375b4000ecb87872537784c8e572a6172ad8f Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 21 Aug 2021 15:43:54 -0500 Subject: [PATCH 16/19] re-delete csv reader_impl header --- cpp/src/io/csv/reader_impl.hpp | 241 --------------------------------- 1 file changed, 241 deletions(-) delete mode 100644 cpp/src/io/csv/reader_impl.hpp diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp deleted file mode 100644 index beaa9b816cb..00000000000 --- a/cpp/src/io/csv/reader_impl.hpp +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "csv_common.h" -#include "csv_gpu.h" - -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include - -using cudf::host_span; - -namespace cudf { -namespace io { -namespace detail { -namespace csv { -using namespace cudf::io::csv; -using namespace cudf::io; - -/** - * @brief Implementation for CSV reader - * - * The CSV reader is implemented in 4 stages: - * Stage 1: read and optionally decompress the input data in host memory - * (may be a memory-mapped view of the data on disk) - * - * Stage 2: gather the offset of each data row within the csv data. - * Since the number of rows in a given character block may depend on the - * initial parser state (like whether the block starts in a middle of a - * quote or not), a separate row count and output parser state is computed - * for every possible input parser state per 16KB character block. - * The result is then used to infer the parser state and starting row at - * the beginning of every character block. - * A second pass can then output the location of every row (which is needed - * for the subsequent parallel conversion of every row from csv text - * to cudf binary form) - * - * Stage 3: Optional stage to infer the data type of each CSV column. - * - * Stage 4: Convert every row from csv text form to cudf binary form. - */ -class reader::impl { - public: - /** - * @brief Constructor from a dataset source with reader options. - * - * @param source Dataset source - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit impl(std::unique_ptr source, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Read an entire set or a subset of data and returns a set of columns. 
- * - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return The set of columns along with metadata - */ - table_with_metadata read(rmm::cuda_stream_view stream); - - private: - /** - * @brief Offsets of CSV rows in device memory, accessed through a shrinkable span. - * - * Row offsets are stored this way to avoid reallocation/copies when discarding front or back - * elements. - */ - class selected_rows_offsets { - rmm::device_uvector all; - device_span selected; - - public: - selected_rows_offsets(rmm::device_uvector&& data, - device_span selected_span) - : all{std::move(data)}, selected{selected_span} - { - } - selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {} - - operator device_span() const { return selected; } - void shrink(size_t size) - { - CUDF_EXPECTS(size <= selected.size(), "New size must be smaller"); - selected = selected.subspan(0, size); - } - void erase_first_n(size_t n) - { - CUDF_EXPECTS(n <= selected.size(), "Too many elements to remove"); - selected = selected.subspan(n, selected.size() - n); - } - auto size() const { return selected.size(); } - auto data() const { return selected.data(); } - }; - - /** - * @brief Selectively loads data on the GPU and gathers offsets of rows to read. - * - * Selection is based on read options. - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - std::pair, reader::impl::selected_rows_offsets> - select_data_and_row_offsets(rmm::cuda_stream_view stream); - - /** - * @brief Finds row positions in the specified input data, and loads the selected data onto GPU. - * - * This function scans the input data to record the row offsets (relative to the start of the - * input data). A row is actually the data/offset between two termination symbols. 
- * - * @param data Uncompressed input data in host memory - * @param range_begin Only include rows starting after this position - * @param range_end Only include rows starting before this position - * @param skip_rows Number of rows to skip from the start - * @param num_rows Number of rows to read; -1: all remaining data - * @param load_whole_file Hint that the entire data will be needed on gpu - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Input data and row offsets in the device memory - */ - std::pair, reader::impl::selected_rows_offsets> - load_data_and_gather_row_offsets(host_span data, - size_t range_begin, - size_t range_end, - size_t skip_rows, - int64_t num_rows, - bool load_whole_file, - rmm::cuda_stream_view stream); - - /** - * @brief Find the start position of the first data row - * - * @param h_data Uncompressed input data in host memory - * - * @return Byte position of the first row - */ - size_t find_first_row_start(host_span data); - - /** - * @brief Automatically infers each column's data type based on the CSV's data within that column. - * - * @param data The CSV data from which to infer the columns' data types - * @param row_offsets The row offsets into the CSV's data - * @param stream The stream to which the type inference-kernel will be dispatched - * @return The columns' inferred data types - */ - std::vector infer_column_types(device_span data, - device_span row_offsets, - rmm::cuda_stream_view stream); - - /** - * @brief Selects the columns' data types from the map of dtypes. - * - * @param col_type_map Column name -> data type map specifying the columns' target data types - * @return Sorted list of selected columns' data types - */ - std::vector select_data_types(std::map const& col_type_map); - - /** - * @brief Selects the columns' data types from the list of dtypes. 
- * - * @param dtypes Vector of data types specifying the columns' target data types - * @return Sorted list of selected columns' data types - */ - std::vector select_data_types(std::vector const& dtypes); - - /** - * @brief Parses the columns' data types from the vector of dtypes that are provided as strings. - * - * @param types_as_strings The vector of strings from which to parse the columns' target data - * types - * @return List of columns' data types - */ - std::vector parse_column_types(std::vector const& types_as_strings); - - /** - * @brief Converts the row-column data and outputs to column bufferrs. - * - * @param column_types Column types - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return list of column buffers of decoded data, or ptr/size in the case of strings. - */ - std::vector decode_data(device_span data, - device_span row_offsets, - host_span column_types, - rmm::cuda_stream_view stream); - - private: - rmm::mr::device_memory_resource* mr_ = nullptr; - std::unique_ptr source_; - const csv_reader_options opts_; - - cudf::size_type num_records_ = 0; // Number of rows with actual data - int num_active_cols_ = 0; // Number of columns to read - int num_actual_cols_ = 0; // Number of columns in the dataset - - // Parsing options - parse_options opts{}; - std::vector column_flags_; - - // Intermediate data - std::vector col_names_; - std::vector header_; -}; - -} // namespace csv -} // namespace detail -} // namespace io -} // namespace cudf From 07b05e89a733de2b1974787837d4e087d4cb1914 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 24 Aug 2021 14:27:53 -0500 Subject: [PATCH 17/19] re-remove csv/reader_impl.hpp --- cpp/src/io/csv/reader_impl.hpp | 232 --------------------------------- 1 file changed, 232 deletions(-) delete mode 100644 cpp/src/io/csv/reader_impl.hpp diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp deleted file mode 100644 index 
de363a46ffe..00000000000 --- a/cpp/src/io/csv/reader_impl.hpp +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "csv_common.h" -#include "csv_gpu.h" - -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include - -using cudf::host_span; - -namespace cudf { -namespace io { -namespace detail { -namespace csv { -using namespace cudf::io::csv; -using namespace cudf::io; - -/** - * @brief Implementation for CSV reader - * - * The CSV reader is implemented in 4 stages: - * Stage 1: read and optionally decompress the input data in host memory - * (may be a memory-mapped view of the data on disk) - * - * Stage 2: gather the offset of each data row within the csv data. - * Since the number of rows in a given character block may depend on the - * initial parser state (like whether the block starts in a middle of a - * quote or not), a separate row count and output parser state is computed - * for every possible input parser state per 16KB character block. - * The result is then used to infer the parser state and starting row at - * the beginning of every character block. 
- * A second pass can then output the location of every row (which is needed - * for the subsequent parallel conversion of every row from csv text - * to cudf binary form) - * - * Stage 3: Optional stage to infer the data type of each CSV column. - * - * Stage 4: Convert every row from csv text form to cudf binary form. - */ -class reader::impl { - public: - /** - * @brief Constructor from a dataset source with reader options. - * - * @param source Dataset source - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit impl(std::unique_ptr source, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Read an entire set or a subset of data and returns a set of columns. - * - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return The set of columns along with metadata - */ - table_with_metadata read(rmm::cuda_stream_view stream); - - private: - /** - * @brief Offsets of CSV rows in device memory, accessed through a shrinkable span. - * - * Row offsets are stored this way to avoid reallocation/copies when discarding front or back - * elements. 
- */ - class selected_rows_offsets { - rmm::device_uvector all; - device_span selected; - - public: - selected_rows_offsets(rmm::device_uvector&& data, - device_span selected_span) - : all{std::move(data)}, selected{selected_span} - { - } - selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {} - - operator device_span() const { return selected; } - void shrink(size_t size) - { - CUDF_EXPECTS(size <= selected.size(), "New size must be smaller"); - selected = selected.subspan(0, size); - } - void erase_first_n(size_t n) - { - CUDF_EXPECTS(n <= selected.size(), "Too many elements to remove"); - selected = selected.subspan(n, selected.size() - n); - } - auto size() const { return selected.size(); } - auto data() const { return selected.data(); } - }; - - /** - * @brief Selectively loads data on the GPU and gathers offsets of rows to read. - * - * Selection is based on read options. - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - std::pair, reader::impl::selected_rows_offsets> - select_data_and_row_offsets(rmm::cuda_stream_view stream); - - /** - * @brief Finds row positions in the specified input data, and loads the selected data onto GPU. - * - * This function scans the input data to record the row offsets (relative to the start of the - * input data). A row is actually the data/offset between two termination symbols. 
- * - * @param data Uncompressed input data in host memory - * @param range_begin Only include rows starting after this position - * @param range_end Only include rows starting before this position - * @param skip_rows Number of rows to skip from the start - * @param num_rows Number of rows to read; -1: all remaining data - * @param load_whole_file Hint that the entire data will be needed on gpu - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Input data and row offsets in the device memory - */ - std::pair, reader::impl::selected_rows_offsets> - load_data_and_gather_row_offsets(host_span data, - size_t range_begin, - size_t range_end, - size_t skip_rows, - int64_t num_rows, - bool load_whole_file, - rmm::cuda_stream_view stream); - - /** - * @brief Find the start position of the first data row - * - * @param h_data Uncompressed input data in host memory - * - * @return Byte position of the first row - */ - size_t find_first_row_start(host_span data); - - /** - * @brief Automatically infers each column's data type based on the CSV's data within that column. - * - * @param data The CSV data from which to infer the columns' data types - * @param row_offsets The row offsets into the CSV's data - * @param stream The stream to which the type inference-kernel will be dispatched - * @return The columns' inferred data types - */ - std::vector infer_column_types(device_span data, - device_span row_offsets, - rmm::cuda_stream_view stream); - - /** - * @brief Selects the columns' data types from the map of dtypes. - * - * @param col_type_map Column name -> data type map specifying the columns' target data types - * @return Sorted list of selected columns' data types - */ - std::vector select_data_types(std::map const& col_type_map); - - /** - * @brief Selects the columns' data types from the list of dtypes. 
- * - * @param dtypes Vector of data types specifying the columns' target data types - * @return Sorted list of selected columns' data types - */ - std::vector select_data_types(std::vector const& dtypes); - - /** - * @brief Converts the row-column data and outputs to column bufferrs. - * - * @param column_types Column types - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return list of column buffers of decoded data, or ptr/size in the case of strings. - */ - std::vector decode_data(device_span data, - device_span row_offsets, - host_span column_types, - rmm::cuda_stream_view stream); - - private: - rmm::mr::device_memory_resource* mr_ = nullptr; - std::unique_ptr source_; - const csv_reader_options opts_; - - cudf::size_type num_records_ = 0; // Number of rows with actual data - int num_active_cols_ = 0; // Number of columns to read - int num_actual_cols_ = 0; // Number of columns in the dataset - - // Parsing options - parse_options opts{}; - std::vector column_flags_; - - // Intermediate data - std::vector col_names_; - std::vector header_; -}; - -} // namespace csv -} // namespace detail -} // namespace io -} // namespace cudf From 92033c3dc24fa1fdc99d5fa67e1fa191a7bc7a1f Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 25 Aug 2021 11:27:16 -0500 Subject: [PATCH 18/19] fix bad merge where changes in 9079 were deleted. 
--- cpp/src/io/csv/reader_impl.cu | 124 +--------------------------------- 1 file changed, 2 insertions(+), 122 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 75dc57750ee..5a78c0454b1 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -31,7 +31,6 @@ #include #include -#include #include #include #include @@ -480,26 +479,6 @@ std::pair, selected_rows_offsets> select_data_and_row_ return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; } -std::vector get_data_types_from_column_names( - std::vector const& column_flags, - std::map const& column_type_map, - std::vector const& column_names, - int32_t num_actual_columns) -{ - std::vector selected_dtypes; - - for (int32_t i = 0; i < num_actual_columns; i++) { - if (column_flags[i] & column_parse::enabled) { - auto const col_type_it = column_type_map.find(column_names[i]); - CUDF_EXPECTS(col_type_it != column_type_map.end(), - "Must specify data types for all active columns"); - selected_dtypes.emplace_back(col_type_it->second); - } - } - - return selected_dtypes; -} - std::vector select_data_types(std::vector const& column_flags, std::vector const& dtypes, int32_t num_actual_columns, @@ -522,85 +501,6 @@ std::vector select_data_types(std::vector const& return selected_dtypes; } -std::vector parse_column_types(std::vector& column_flags, - std::vector const& column_names, - std::vector const& types_as_strings, - int32_t num_actual_columns, - int32_t num_active_columns, - data_type timestamp_type) -{ - std::vector dtypes; - - bool const is_dict = std::all_of(types_as_strings.begin(), - types_as_strings.end(), - [](auto const& s) { return s.find(':') != std::string::npos; }); - - if (!is_dict) { - if (types_as_strings.size() == 1) { - // If it's a single dtype, assign that dtype to all active columns - data_type dtype_; - column_parse::flags col_flags_; - std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]); - 
dtypes.resize(num_active_columns, dtype_); - for (int col = 0; col < num_actual_columns; col++) { - column_flags[col] |= col_flags_; - } - CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); - } else { - // If it's a list, assign dtypes to active columns in the given order - CUDF_EXPECTS(static_cast(types_as_strings.size()) >= num_actual_columns, - "Must specify data types for all columns"); - - auto dtype_ = std::back_inserter(dtypes); - - for (int col = 0; col < num_actual_columns; col++) { - if (column_flags[col] & column_parse::enabled) { - column_parse::flags col_flags_; - std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]); - column_flags[col] |= col_flags_; - CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); - } - } - } - } else { - // Translate vector of `name : dtype` strings to map - // NOTE: Incoming pairs can be out-of-order from column names in dataset - std::unordered_map col_type_map; - for (const auto& pair : types_as_strings) { - const auto pos = pair.find_last_of(':'); - const auto name = pair.substr(0, pos); - const auto dtype = pair.substr(pos + 1, pair.size()); - col_type_map[name] = dtype; - } - - auto dtype_ = std::back_inserter(dtypes); - - for (int col = 0; col < num_actual_columns; col++) { - if (column_flags[col] & column_parse::enabled) { - CUDF_EXPECTS(col_type_map.find(column_names[col]) != col_type_map.end(), - "Must specify data types for all active columns"); - column_parse::flags col_flags_; - std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[column_names[col]]); - column_flags[col] |= col_flags_; - CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); - } - } - } - - if (timestamp_type.id() != cudf::type_id::EMPTY) { - for (auto& type : dtypes) { - if (cudf::is_timestamp(type)) { type = timestamp_type; } - } - } - - for (size_t i = 0; i < dtypes.size(); i++) { - // Replace EMPTY dtype with STRING - if 
(dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; } - } - - return dtypes; -} - std::vector infer_column_types(parse_options const& parse_opts, std::vector const& column_flags, device_span data, @@ -864,28 +764,8 @@ table_with_metadata read_csv(cudf::io::datasource* source, reader_opts.get_timestamp_type(), stream); } else { - column_types = - std::visit(cudf::detail::visitor_overload{ - [&](const std::vector& data_types) { - return select_data_types( - column_flags, data_types, num_actual_columns, num_active_columns); - }, - [&](const std::map& data_types) { - return get_data_types_from_column_names( // - column_flags, - data_types, - column_names, - num_actual_columns); - }, - [&](const std::vector& dtypes) { - return parse_column_types(column_flags, - column_names, - dtypes, - num_actual_columns, - num_active_columns, - reader_opts.get_timestamp_type()); - }}, - reader_opts.get_dtypes()); + column_types = std::visit([&](auto const& data_types) { return select_data_types(data_types); }, + opts_.get_dtypes()); } out_columns.reserve(column_types.size()); From 24b394993d3757682506d9bf15c59672e2ffb3c7 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 25 Aug 2021 21:39:35 -0500 Subject: [PATCH 19/19] add back read_csv impl function get_data_types_from_column_names --- cpp/src/io/csv/reader_impl.cu | 59 +++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 5a78c0454b1..a6e53029043 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -102,28 +103,6 @@ class selected_rows_offsets { auto data() const { return selected.data(); } }; -/** - * @brief Translates a dtype string and returns its dtype enumeration and any - * extended dtype flags that are supported by cuIO. 
Often, this is a column - * with the same underlying dtype the basic types, but with different parsing - * interpretations. - * - * @param[in] dtype String containing the basic or extended dtype - * - * @return Tuple of data_type and flags - */ -std::tuple get_dtype_info(const std::string& dtype) -{ - if (dtype == "hex" || dtype == "hex64") { - return std::make_tuple(data_type{cudf::type_id::INT64}, column_parse::as_hexadecimal); - } - if (dtype == "hex32") { - return std::make_tuple(data_type{cudf::type_id::INT32}, column_parse::as_hexadecimal); - } - - return std::make_tuple(convert_string_to_dtype(dtype), column_parse::as_default); -} - /** * @brief Removes the first and Last quote in the string */ @@ -501,6 +480,26 @@ std::vector select_data_types(std::vector const& return selected_dtypes; } +std::vector get_data_types_from_column_names( + std::vector const& column_flags, + std::map const& column_type_map, + std::vector const& column_names, + int32_t num_actual_columns) +{ + std::vector selected_dtypes; + + for (int32_t i = 0; i < num_actual_columns; i++) { + if (column_flags[i] & column_parse::enabled) { + auto const col_type_it = column_type_map.find(column_names[i]); + CUDF_EXPECTS(col_type_it != column_type_map.end(), + "Must specify data types for all active columns"); + selected_dtypes.emplace_back(col_type_it->second); + } + } + + return selected_dtypes; +} + std::vector infer_column_types(parse_options const& parse_opts, std::vector const& column_flags, device_span data, @@ -764,8 +763,20 @@ table_with_metadata read_csv(cudf::io::datasource* source, reader_opts.get_timestamp_type(), stream); } else { - column_types = std::visit([&](auto const& data_types) { return select_data_types(data_types); }, - opts_.get_dtypes()); + column_types = + std::visit(cudf::detail::visitor_overload{ + [&](const std::vector& data_types) { + return select_data_types( + column_flags, data_types, num_actual_columns, num_active_columns); + }, + [&](const std::map& data_types) 
{ + return get_data_types_from_column_names( // + column_flags, + data_types, + column_names, + num_actual_columns); + }}, + reader_opts.get_dtypes()); } out_columns.reserve(column_types.size());