From 9c72e56837ddfb3fb9b3d1111cdd08e1f53595c4 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 14 Aug 2021 05:08:16 -0500 Subject: [PATCH 1/6] simplify io/functions.cpp data source/sink factories --- cpp/src/io/functions.cpp | 126 +++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 59 deletions(-) diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index bf51012211c..e080ea3a2ca 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -106,67 +106,56 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder( } namespace { -template -std::unique_ptr make_reader(source_info const& src_info, - reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (src_info.type == io_type::FILEPATH) { - return std::make_unique(src_info.filepaths, options, stream, mr); - } - std::vector> datasources; - if (src_info.type == io_type::HOST_BUFFER) { - datasources = cudf::io::datasource::create(src_info.buffers); - } else if (src_info.type == io_type::USER_IMPLEMENTED) { - datasources = cudf::io::datasource::create(src_info.user_sources); - } else { - CUDF_FAIL("Unsupported source type"); +std::vector> make_datasources(source_info const& info) +{ + switch (info.type) { + case io_type::FILEPATH: return cudf::io::datasource::create(info.filepaths); + case io_type::HOST_BUFFER: return cudf::io::datasource::create(info.buffers); + case io_type::USER_IMPLEMENTED: return cudf::io::datasource::create(info.user_sources); + default: CUDF_FAIL("Unsupported source type"); } - - return std::make_unique(std::move(datasources), options, stream, mr); } -template -std::unique_ptr make_writer(sink_info const& sink, Ts&&... args) +std::unique_ptr make_datasink(sink_info const& info) { - if (sink.type == io_type::FILEPATH) { - return std::make_unique(cudf::io::data_sink::create(sink.filepath), - std::forward(args)...); - } - if (sink.type == io_type::HOST_BUFFER) { - return std::make_unique(cudf::io::data_sink::create(sink.buffer), - std::forward(args)...); + switch (info.type) { + case io_type::FILEPATH: return cudf::io::data_sink::create(info.filepath); + case io_type::HOST_BUFFER: return cudf::io::data_sink::create(info.buffer); + case io_type::VOID: return cudf::io::data_sink::create(); + case io_type::USER_IMPLEMENTED: return cudf::io::data_sink::create(info.user_sink); + default: CUDF_FAIL("Unsupported sink type"); } - if (sink.type == io_type::VOID) { - return std::make_unique(cudf::io::data_sink::create(), std::forward(args)...); - } - if (sink.type == io_type::USER_IMPLEMENTED) { - return std::make_unique(cudf::io::data_sink::create(sink.user_sink), - std::forward(args)...); - } - CUDF_FAIL("Unsupported sink type"); } } // namespace -table_with_metadata read_avro(avro_reader_options const& opts, rmm::mr::device_memory_resource* mr) +table_with_metadata read_avro(avro_reader_options const& options, + rmm::mr::device_memory_resource* mr) { namespace avro = cudf::io::detail::avro; CUDF_FUNC_RANGE(); - auto reader = make_reader(opts.get_source(), opts, rmm::cuda_stream_default, mr); - return reader->read(opts); + + auto datasources = make_datasources(options.get_source()); + auto reader = + std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); + + return reader->read(options); } -table_with_metadata read_json(json_reader_options const& opts, rmm::mr::device_memory_resource* mr) +table_with_metadata read_json(json_reader_options const& options, + rmm::mr::device_memory_resource* mr) { namespace json = cudf::io::detail::json; CUDF_FUNC_RANGE(); - auto reader = make_reader(opts.get_source(), opts, rmm::cuda_stream_default, mr); - return reader->read(opts); + + auto datasources = make_datasources(options.get_source()); + auto reader = + std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); + + return reader->read(options); } table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_memory_resource* mr) @@ -174,8 +163,10 @@ table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_ namespace csv = cudf::io::detail::csv; CUDF_FUNC_RANGE(); + + auto datasources = make_datasources(options.get_source()); auto reader = - make_reader(options.get_source(), options, rmm::cuda_stream_default, mr); + std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(); } @@ -185,7 +176,9 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc { using namespace cudf::io::detail; - auto writer = make_writer(options.get_sink(), options, rmm::cuda_stream_default, mr); + auto sink = make_datasink(options.get_sink()); + auto writer = + std::make_unique(std::move(sink), options, rmm::cuda_stream_default, mr); writer->write(options.get_table(), options.get_metadata()); } @@ -294,8 +287,10 @@ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info) table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto reader = - make_reader(options.get_source(), options, rmm::cuda_stream_default, mr); + + auto datasources = make_datasources(options.get_source()); + auto reader = std::make_unique( + std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(options); } @@ -305,11 +300,13 @@ table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_ */ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resource* mr) { + namespace io_detail = cudf::io::detail; + CUDF_FUNC_RANGE(); - namespace io_detail = cudf::io::detail; - auto writer = make_writer( - options.get_sink(), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); + auto sink = make_datasink(options.get_sink()); + auto writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); writer->write(options.get_table()); } @@ -317,12 +314,15 @@ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resourc /** * @copydoc cudf::io::orc_chunked_writer::orc_chunked_writer */ -orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& op, +orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options, rmm::mr::device_memory_resource* mr) { namespace io_detail = cudf::io::detail; - writer = make_writer( - op.get_sink(), op, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); + + auto sink = make_datasink(options.get_sink()); + + writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); } /** @@ -354,8 +354,10 @@ table_with_metadata read_parquet(parquet_reader_options const& options, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto reader = make_reader( - options.get_source(), options, rmm::cuda_stream_default, mr); + + auto datasources = make_datasources(options.get_source()); + auto reader = std::make_unique( + std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(options); } @@ -392,25 +394,31 @@ table_input_metadata::table_input_metadata(table_view const& table, std::unique_ptr> write_parquet(parquet_writer_options const& options, rmm::mr::device_memory_resource* mr) { - CUDF_FUNC_RANGE(); namespace io_detail = cudf::io::detail; - auto writer = make_writer( - options.get_sink(), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); + CUDF_FUNC_RANGE(); + + auto sink = make_datasink(options.get_sink()); + auto writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); writer->write(options.get_table()); + return writer->close(options.get_column_chunks_file_path()); } /** * @copydoc cudf::io::parquet_chunked_writer::parquet_chunked_writer */ -parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& op, +parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& options, rmm::mr::device_memory_resource* mr) { namespace io_detail = cudf::io::detail; - writer = make_writer( - op.get_sink(), op, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); + + auto sink = make_datasink(options.get_sink()); + + writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); } /** From 88e23990151c737dcb4a22a5d6454ef8893285c4 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 17 Aug 2021 00:53:48 -0500 Subject: [PATCH 2/6] remove filepath-related logic from csv and json readers --- cpp/include/cudf/io/csv.hpp | 2 +- cpp/include/cudf/io/json.hpp | 2 +- cpp/src/io/comp/io_uncomp.h | 7 +++-- cpp/src/io/comp/uncomp.cpp | 19 ++++++------ cpp/src/io/csv/reader_impl.cu | 38 ++++++------------------ cpp/src/io/csv/reader_impl.hpp | 4 --- cpp/src/io/functions.cpp | 40 ++++++++++++++++++++++++-- cpp/src/io/json/reader_impl.cu | 31 ++------------------ cpp/src/io/json/reader_impl.hpp | 1 - cpp/src/io/utilities/parsing_utils.cu | 34 ---------------------- cpp/src/io/utilities/parsing_utils.cuh | 18 ------------ python/cudf/cudf/_lib/csv.pyx | 2 +- python/cudf/cudf/tests/test_csv.py | 14 --------- 13 files changed, 66 insertions(+), 146 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index d4a21b2e98c..c807f189aac 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1199,7 +1199,7 @@ class csv_reader_options_builder { * @return The set of columns along with metadata. */ table_with_metadata read_csv( - csv_reader_options const& options, + csv_reader_options options, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 8954f7dcab1..bca60f76260 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -364,7 +364,7 @@ class json_reader_options_builder { * @return The set of columns along with metadata. */ table_with_metadata read_json( - json_reader_options const& options, + json_reader_options options, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/io/comp/io_uncomp.h b/cpp/src/io/comp/io_uncomp.h index 8daf73ecd0c..7b1feb84813 100644 --- a/cpp/src/io/comp/io_uncomp.h +++ b/cpp/src/io/comp/io_uncomp.h @@ -16,12 +16,13 @@ #pragma once +#include +#include + #include #include #include -#include - using cudf::host_span; namespace cudf { @@ -42,7 +43,7 @@ enum { std::vector io_uncompress_single_h2d(void const* src, size_t src_size, int stream_type); -std::vector get_uncompressed_data(host_span data, std::string const& compression); +std::vector get_uncompressed_data(host_span data, compression_type compression); class HostDecompressor { public: diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 2cb99d897fe..ee451d04dbb 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -369,6 +369,7 @@ std::vector io_uncompress_single_h2d(const void* src, size_t src_size, int // Unsupported format break; } + CUDF_EXPECTS(comp_data != nullptr, "Unsupported compressed stream type"); CUDF_EXPECTS(comp_len > 0, "Unsupported compressed stream type"); @@ -422,17 +423,17 @@ std::vector io_uncompress_single_h2d(const void* src, size_t src_size, int * @return Vector containing the output uncompressed data */ std::vector get_uncompressed_data(host_span const data, - std::string const& compression) + compression_type compression) { int comp_type = IO_UNCOMP_STREAM_TYPE_INFER; - if (compression == "gzip") - comp_type = IO_UNCOMP_STREAM_TYPE_GZIP; - else if (compression == "zip") - comp_type = IO_UNCOMP_STREAM_TYPE_ZIP; - else if (compression == "bz2") - comp_type = IO_UNCOMP_STREAM_TYPE_BZIP2; - else if (compression == "xz") - comp_type = IO_UNCOMP_STREAM_TYPE_XZ; + + switch (compression) { + case compression_type::GZIP: comp_type = IO_UNCOMP_STREAM_TYPE_GZIP; break; + case compression_type::ZIP: comp_type = IO_UNCOMP_STREAM_TYPE_ZIP; break; + case compression_type::BZIP2: comp_type = IO_UNCOMP_STREAM_TYPE_BZIP2; break; + case compression_type::XZ: comp_type = IO_UNCOMP_STREAM_TYPE_XZ; break; + default: break; + } return io_uncompress_single_h2d(data.data(), data.size(), comp_type); } diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 549b0474fe1..a85a610962e 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -206,10 +206,12 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) auto num_rows = opts_.get_nrows(); if (range_offset > 0 || range_size > 0) { - CUDF_EXPECTS(compression_type_ == "none", + CUDF_EXPECTS(opts_.get_compression() == compression_type::NONE, "Reading compressed data using `byte range` is unsupported"); } + size_t map_range_size = 0; + if (range_size != 0) { auto num_given_dtypes = std::visit([](const auto& dtypes) { return dtypes.size(); }, opts_.get_dtypes()); @@ -217,12 +219,7 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) map_range_size = range_size + calculateMaxRowSize(num_columns); } - // Support delayed opening of the file if using memory mapping datasource - // This allows only mapping of a subset of the file if using byte range - if (source_ == nullptr) { - assert(!filepath_.empty()); - source_ = datasource::create(filepath_, range_offset, map_range_size); - } + // TODO: provide hint to datasource that we should memory map any underlying file. // Transfer source data to GPU if (!source_->is_empty()) { @@ -235,10 +232,11 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) std::vector h_uncomp_data_owner; - if (compression_type_ != "none") { - h_uncomp_data_owner = get_uncompressed_data(h_data, compression_type_); + if (opts_.get_compression() != compression_type::NONE) { + h_uncomp_data_owner = get_uncompressed_data(h_data, opts_.get_compression()); h_data = h_uncomp_data_owner; } + // None of the parameters for row selection is used, we are parsing the entire file const bool load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 && skip_end_rows <= 0 && num_rows == -1; @@ -927,35 +925,17 @@ parse_options make_parse_options(csv_reader_options const& reader_opts, } reader::impl::impl(std::unique_ptr source, - std::string filepath, csv_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : mr_(mr), source_(std::move(source)), filepath_(filepath), opts_(options) + : mr_(mr), source_(std::move(source)), opts_(options) { num_actual_cols_ = opts_.get_names().size(); num_active_cols_ = num_actual_cols_; - compression_type_ = - infer_compression_type(opts_.get_compression(), - filepath, - {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}}); - opts = make_parse_options(options, stream); } -// Forward to implementation -reader::reader(std::vector const& filepaths, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported."); - // Delay actual instantiation of data source until read to allow for - // partial memory mapping of file using byte ranges - _impl = std::make_unique(nullptr, filepaths[0], options, stream, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, csv_reader_options const& options, @@ -963,7 +943,7 @@ reader::reader(std::vector>&& sources, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); - _impl = std::make_unique(std::move(sources[0]), "", options, stream, mr); + _impl = std::make_unique(std::move(sources[0]), options, stream, mr); } // Destructor within this translation unit diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 36c2bf4f9e7..beaa9b816cb 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -72,13 +72,11 @@ class reader::impl { * @brief Constructor from a dataset source with reader options. * * @param source Dataset source - * @param filepath Filepath if reading dataset from a file * @param options Settings for controlling reading behavior * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit impl(std::unique_ptr source, - std::string filepath, csv_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); @@ -222,8 +220,6 @@ class reader::impl { private: rmm::mr::device_memory_resource* mr_ = nullptr; std::unique_ptr source_; - std::string filepath_; - std::string compression_type_; const csv_reader_options opts_; cudf::size_type num_records_ = 0; // Number of rows with actual data diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index e080ea3a2ca..ccc2eef56c7 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -144,27 +144,61 @@ table_with_metadata read_avro(avro_reader_options const& options, return reader->read(options); } -table_with_metadata read_json(json_reader_options const& options, - rmm::mr::device_memory_resource* mr) +compression_type infer_compression_type(compression_type compression, source_info const& info) +{ + if (compression != compression_type::AUTO) { return compression; } + + if (info.type != io_type::FILEPATH) { return compression_type::NONE; } + + auto filepath = info.filepaths[0]; + + // Attempt to infer from the file extension + const auto pos = filepath.find_last_of('.'); + + if (pos == std::string::npos) { return {}; } + + auto str_tolower = [](const auto& begin, const auto& end) { + std::string out; + std::transform(begin, end, std::back_inserter(out), ::tolower); + return out; + }; + + const auto ext = str_tolower(filepath.begin() + pos + 1, filepath.end()); + + if (ext == "gz") { return compression_type::GZIP; } + if (ext == "zip") { return compression_type::ZIP; } + if (ext == "bz2") { return compression_type::BZIP2; } + if (ext == "xz") { return compression_type::XZ; } + + return compression_type::NONE; +} + +table_with_metadata read_json(json_reader_options options, rmm::mr::device_memory_resource* mr) { namespace json = cudf::io::detail::json; CUDF_FUNC_RANGE(); auto datasources = make_datasources(options.get_source()); + + options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + auto reader = std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(options); } -table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_memory_resource* mr) +table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) { namespace csv = cudf::io::detail::csv; CUDF_FUNC_RANGE(); auto datasources = make_datasources(options.get_source()); + + options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + auto reader = std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index a8f117c22bf..bae7471e307 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -241,15 +241,6 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) map_range_size = range_size + calculate_max_row_size(dtype_option_size); } - // Support delayed opening of the file if using memory mapping datasource - // This allows only mapping of a subset of the file if using byte range - if (sources_.empty()) { - assert(!filepaths_.empty()); - for (const auto& path : filepaths_) { - sources_.emplace_back(datasource::create(path, range_offset, map_range_size)); - } - } - // Iterate through the user defined sources and read the contents into the local buffer CUDF_EXPECTS(!sources_.empty(), "No sources were defined"); size_t total_source_size = 0; @@ -280,11 +271,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) */ void reader::impl::decompress_input(rmm::cuda_stream_view stream) { - const auto compression_type = - infer_compression_type(options_.get_compression(), - filepaths_.size() > 0 ? filepaths_[0] : "", - {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}}); - if (compression_type == "none") { + if (options_.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy uncomp_data_ = reinterpret_cast(buffer_.data()); uncomp_size_ = buffer_.size(); @@ -293,7 +280,7 @@ void reader::impl::decompress_input(rmm::cuda_stream_view stream) host_span( // reinterpret_cast(buffer_.data()), buffer_.size()), - compression_type); + options_.get_compression()); uncomp_data_ = uncomp_data_owner_.data(); uncomp_size_ = uncomp_data_owner_.size(); @@ -665,7 +652,7 @@ reader::impl::impl(std::vector>&& sources, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : options_(options), mr_(mr), sources_(std::move(sources)), filepaths_(filepaths) + : options_(options), mr_(mr), sources_(std::move(sources)) { CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); @@ -713,18 +700,6 @@ table_with_metadata reader::impl::read(json_reader_options const& options, return convert_data_to_table(rec_starts, stream); } -// Forward to implementation -reader::reader(std::vector const& filepaths, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Delay actual instantiation of data source until read to allow for - // partial memory mapping of file using byte ranges - std::vector> src = {}; // Empty datasources - _impl = std::make_unique(std::move(src), filepaths, options, stream, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, json_reader_options const& options, diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 5cf51369cdf..f7af55b2b90 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -57,7 +57,6 @@ class reader::impl { rmm::mr::device_memory_resource* mr_ = nullptr; std::vector> sources_; - std::vector filepaths_; std::vector buffer_; const char* uncomp_data_ = nullptr; diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index 6c8f01111e5..ba62238c5d3 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -209,39 +209,5 @@ cudf::size_type count_all_from_set(const char* h_data, return find_all_from_set(h_data, h_size, keys, 0, nullptr, stream); } -std::string infer_compression_type( - const compression_type& compression_arg, - const std::string& filename, - const std::vector>& ext_to_comp_map) -{ - auto str_tolower = [](const auto& begin, const auto& end) { - std::string out; - std::transform(begin, end, std::back_inserter(out), ::tolower); - return out; - }; - - // Attempt to infer from user-supplied argument - if (compression_arg != compression_type::AUTO) { - switch (compression_arg) { - case compression_type::GZIP: return "gzip"; - case compression_type::BZIP2: return "bz2"; - case compression_type::ZIP: return "zip"; - case compression_type::XZ: return "xz"; - default: break; - } - } - - // Attempt to infer from the file extension - const auto pos = filename.find_last_of('.'); - if (pos != std::string::npos) { - const auto ext = str_tolower(filename.begin() + pos + 1, filename.end()); - for (const auto& mapping : ext_to_comp_map) { - if (mapping.first == ext) { return mapping.second; } - } - } - - return "none"; -} - } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 88297423b9b..daf23de7eb2 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -454,24 +454,6 @@ cudf::size_type count_all_from_set(const char* h_data, const std::vector& keys, rmm::cuda_stream_view stream); -/** - * @brief Infer file compression type based on user supplied arguments. - * - * If the user specifies a valid compression_type for compression arg, - * compression type will be computed based on that. Otherwise the filename - * and ext_to_comp_map will be used. - * - * @param[in] compression_arg User specified compression type (if any) - * @param[in] filename Filename to base compression type (by extension) on - * @param[in] ext_to_comp_map User supplied mapping of file extension to compression type - * - * @return string representing compression type ("gzip, "bz2", etc) - */ -std::string infer_compression_type( - const compression_type& compression_arg, - const std::string& filename, - const std::vector>& ext_to_comp_map); - /** * @brief Checks whether the given character is a whitespace character. * diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index a15a180d466..7a54ccac197 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -101,7 +101,7 @@ cdef csv_reader_options make_csv_reader_options( bool na_filter, object prefix, object index_col, -) except +: +) except *: cdef source_info c_source_info = make_source_info([datasource]) cdef compression_type c_compression cdef size_type c_header diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 5511a65d0a4..8fb5d7cc9eb 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1069,20 +1069,6 @@ def test_csv_reader_byte_range(tmpdir, segment_bytes): assert list(df["int2"]) == list(ref_df["int2"]) -def test_csv_reader_byte_range_type_corner_case(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file17.csv") - - cudf.datasets.timeseries( - start="2000-01-01", - end="2000-01-02", - dtypes={"name": str, "id": int, "x": float, "y": float}, - ).to_csv(fname, chunksize=100000) - - byte_range = (2_147_483_648, 0) - with pytest.raises(RuntimeError, match="Offset is past end of file"): - cudf.read_csv(fname, byte_range=byte_range, header=None) - - @pytest.mark.parametrize("segment_bytes", [10, 19, 31, 36]) def test_csv_reader_byte_range_strings(segment_bytes): names = ["strings"] From 62b95202d9b1db14f765ef45644d9cf91f782ea7 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 17 Aug 2021 02:19:27 -0500 Subject: [PATCH 3/6] remove filepath logic from avro, parquet, orc readers --- cpp/include/cudf/io/detail/avro.hpp | 13 ------------- cpp/include/cudf/io/detail/orc.hpp | 13 ------------- cpp/include/cudf/io/detail/parquet.hpp | 13 ------------- cpp/src/io/avro/reader_impl.cu | 10 ---------- cpp/src/io/orc/reader_impl.cu | 9 --------- cpp/src/io/parquet/reader_impl.cu | 9 --------- 6 files changed, 67 deletions(-) diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp index 98483d1c03e..306c15dcb72 100644 --- a/cpp/include/cudf/io/detail/avro.hpp +++ b/cpp/include/cudf/io/detail/avro.hpp @@ -38,19 +38,6 @@ class reader { std::unique_ptr _impl; public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - avro_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Constructor from an array of datasources * diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index ab26c01db74..2174b688da2 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -47,19 +47,6 @@ class reader { std::unique_ptr _impl; public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Constructor from an array of datasources * diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index d95af7a11da..14f27ef8eef 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -49,19 +49,6 @@ class reader { std::unique_ptr _impl; public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Constructor from an array of datasources * diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index f6ffdd99d35..08ea96139a1 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -474,16 +474,6 @@ table_with_metadata reader::impl::read(avro_reader_options const& options, return {std::make_unique(std::move(out_columns)), std::move(metadata_out)}; } -// Forward to implementation -reader::reader(std::vector const& filepaths, - avro_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported."); - _impl = std::make_unique(datasource::create(filepaths[0]), options, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, avro_reader_options const& options, diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 033a2d9aff5..5d62c45df83 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -1383,15 +1383,6 @@ table_with_metadata reader::impl::read(size_type skip_rows, return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; } -// Forward to implementation -reader::reader(std::vector const& filepaths, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - _impl = std::make_unique(datasource::create(filepaths), options, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, orc_reader_options const& options, diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 9f9bdfd4755..31ae763d9ff 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -1608,15 +1608,6 @@ table_with_metadata reader::impl::read(size_type skip_rows, return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; } -// Forward to implementation -reader::reader(std::vector const& filepaths, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(datasource::create(filepaths), options, mr)) -{ -} - // Forward to implementation reader::reader(std::vector>&& sources, parquet_reader_options const& options, From fb0129433bdd2dd264105ba172d96f2a310d8d8d Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 18 Aug 2021 15:19:11 -0500 Subject: [PATCH 4/6] move range size padding calculation out of json/csv reader and in to json/csv options --- cpp/include/cudf/io/csv.hpp | 34 +++++++++++++++++++++ cpp/include/cudf/io/json.hpp | 32 +++++++++++++++++++ cpp/src/io/csv/reader_impl.cu | 49 +++++------------------------- cpp/src/io/functions.cpp | 24 +++++++++++---- cpp/src/io/json/reader_impl.cu | 46 ++++++---------------------- cpp/src/io/json/reader_impl.hpp | 3 +- python/cudf/cudf/tests/test_csv.py | 14 +++++++++ 7 files changed, 116 insertions(+), 86 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index c807f189aac..1aa6e3bea29 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -177,6 +177,40 @@ class csv_reader_options { */ std::size_t get_byte_range_size() const { return _byte_range_size; } + /** + * @brief Returns number of bytes to read with padding. + */ + std::size_t get_byte_range_size_with_padding() const + { + if (_byte_range_size == 0) { + return 0; + } else { + return _byte_range_size + get_byte_range_padding(); + } + } + + /** + * @brief Returns number of bytes to pad when reading. + */ + std::size_t get_byte_range_padding() const + { + auto const num_names = _names.size(); + auto const num_dtypes = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes); + auto const num_columns = std::max(num_dtypes, num_names); + + auto const max_row_bytes = 16 * 1024; // 16KB + auto const column_bytes = 64; + auto const base_padding = 1024; // 1KB + + if (num_columns == 0) { + // Use flat size if the number of columns is not known + return max_row_bytes; + } + + // Expand the size based on the number of columns, if available + return base_padding + num_columns * column_bytes; + } + /** * @brief Returns names of the columns. */ diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index bca60f76260..5d2a4f6fcd1 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -140,6 +140,38 @@ class json_reader_options { */ size_t get_byte_range_size() const { return _byte_range_size; } + /** + * @brief Returns number of bytes to read with padding. + */ + size_t get_byte_range_size_with_padding() const + { + if (_byte_range_size == 0) { + return 0; + } else { + return _byte_range_size + get_byte_range_padding(); + } + } + + /** + * @brief Returns number of bytes to pad when reading. + */ + size_t get_byte_range_padding() const + { + auto const num_columns = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes); + + auto const max_row_bytes = 16 * 1024; // 16KB + auto const column_bytes = 64; + auto const base_padding = 1024; // 1KB + + if (num_columns == 0) { + // Use flat size if the number of columns is not known + return max_row_bytes; + } + + // Expand the size based on the number of columns, if available + return base_padding + num_columns * column_bytes; + } + /** * @brief Whether to read the file as a json object per line. */ diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index a85a610962e..c61cc26800e 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -57,31 +57,6 @@ namespace csv { using namespace cudf::io::csv; using namespace cudf::io; -/** - * @brief Estimates the maximum expected length or a row, based on the number - * of columns - * - * If the number of columns is not available, it will return a value large - * enough for most use cases - * - * @param[in] num_columns Number of columns in the CSV file (optional) - * - * @return Estimated maximum size of a row, in bytes - */ -constexpr size_t calculateMaxRowSize(int num_columns = 0) noexcept -{ - constexpr size_t max_row_bytes = 16 * 1024; // 16KB - constexpr size_t column_bytes = 64; - constexpr size_t base_padding = 1024; // 1KB - if (num_columns == 0) { - // Use flat size if the number of columns is not known - return max_row_bytes; - } else { - // Expand the size based on the number of columns, if available - return base_padding + num_columns * column_bytes; - } -} - /** * @brief Translates a dtype string and returns its dtype enumeration and any * extended dtype flags that are supported by cuIO. Often, this is a column @@ -199,31 +174,21 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream) std::pair, reader::impl::selected_rows_offsets> reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) { - auto range_offset = opts_.get_byte_range_offset(); - auto range_size = opts_.get_byte_range_size(); - auto skip_rows = opts_.get_skiprows(); - auto skip_end_rows = opts_.get_skipfooter(); - auto num_rows = opts_.get_nrows(); + auto range_offset = opts_.get_byte_range_offset(); + auto range_size = opts_.get_byte_range_size(); + auto range_size_padded = opts_.get_byte_range_size_with_padding(); + auto skip_rows = opts_.get_skiprows(); + auto skip_end_rows = opts_.get_skipfooter(); + auto num_rows = opts_.get_nrows(); if (range_offset > 0 || range_size > 0) { CUDF_EXPECTS(opts_.get_compression() == compression_type::NONE, "Reading compressed data using `byte range` is unsupported"); } - size_t map_range_size = 0; - - if (range_size != 0) { - auto num_given_dtypes = - std::visit([](const auto& dtypes) { return dtypes.size(); }, opts_.get_dtypes()); - const auto num_columns = std::max(opts_.get_names().size(), num_given_dtypes); - map_range_size = range_size + calculateMaxRowSize(num_columns); - } - - // TODO: provide hint to datasource that we should memory map any underlying file. - // Transfer source data to GPU if (!source_->is_empty()) { - auto data_size = (map_range_size != 0) ? map_range_size : source_->size(); + auto data_size = (range_size_padded != 0) ? range_size_padded : source_->size(); auto buffer = source_->host_read(range_offset, data_size); auto h_data = host_span( // diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index ccc2eef56c7..438cb1762c6 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -107,10 +107,18 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder( namespace { -std::vector> make_datasources(source_info const& info) +std::vector> make_datasources(source_info const& info, + size_t range_offset = 0, + size_t range_size = 0) { switch (info.type) { - case io_type::FILEPATH: return cudf::io::datasource::create(info.filepaths); + case io_type::FILEPATH: { + auto sources = std::vector>(); + for (auto const& filepath : info.filepaths) { + sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size)); + } + return sources; + } case io_type::HOST_BUFFER: return cudf::io::datasource::create(info.buffers); case io_type::USER_IMPLEMENTED: return cudf::io::datasource::create(info.user_sources); default: CUDF_FAIL("Unsupported source type"); @@ -179,10 +187,12 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor CUDF_FUNC_RANGE(); - auto datasources = make_datasources(options.get_source()); - options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + auto datasources = make_datasources(options.get_source(), + options.get_byte_range_offset(), + options.get_byte_range_size_with_padding()); + auto reader = std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); @@ -195,10 +205,12 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_ CUDF_FUNC_RANGE(); - auto datasources = make_datasources(options.get_source()); - options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + auto datasources = make_datasources(options.get_source(), + options.get_byte_range_offset(), + options.get_byte_range_size_with_padding()); + auto reader = std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index bae7471e307..0618f02e98f 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -50,31 +50,6 @@ namespace detail { namespace json { using namespace cudf::io; -namespace { -/** - * @brief Estimates the maximum expected length or a row, based on the number - * of columns - * - * If the number of columns is not available, it will return a value large - * enough for most use cases - * - * @param[in] num_columns Number of columns in the JSON file (optional) - * - * @return Estimated maximum size of a row, in bytes - */ -constexpr size_t calculate_max_row_size(int num_columns = 0) noexcept -{ - constexpr size_t max_row_bytes = 16 * 1024; // 16KB - constexpr size_t column_bytes = 64; - constexpr size_t base_padding = 1024; // 1KB - return num_columns == 0 - ? max_row_bytes // Use flat size if the # of columns is not known - : base_padding + - num_columns * column_bytes; // Expand size based on the # of columns, if available -} - -} // anonymous namespace - /** * @brief Aggregate the table containing keys info by their hash values. * @@ -231,16 +206,12 @@ std::pair, col_map_ptr_type> reader::impl::get_json_obj * * @param[in] range_offset Number of bytes offset from the start * @param[in] range_size Bytes to read; use `0` for all remaining data + * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ -void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) +void reader::impl::ingest_raw_input(size_t range_offset, + size_t range_size, + size_t range_size_padded) { - size_t map_range_size = 0; - if (range_size != 0) { - auto const dtype_option_size = - std::visit([](const auto& dtypes) { return dtypes.size(); }, options_.get_dtypes()); - map_range_size = range_size + calculate_max_row_size(dtype_option_size); - } - // Iterate through the user defined sources and read the contents into the local buffer CUDF_EXPECTS(!sources_.empty(), "No sources were defined"); size_t total_source_size = 0; @@ -253,7 +224,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) size_t bytes_read = 0; for (const auto& source : sources_) { if (!source->is_empty()) { - auto data_size = (map_range_size != 0) ? map_range_size : source->size(); + auto data_size = (range_size_padded != 0) ? range_size_padded : source->size(); bytes_read += source->host_read(range_offset, data_size, &buffer_[bytes_read]); } } @@ -675,10 +646,11 @@ reader::impl::impl(std::vector>&& sources, table_with_metadata reader::impl::read(json_reader_options const& options, rmm::cuda_stream_view stream) { - auto range_offset = options.get_byte_range_offset(); - auto range_size = options.get_byte_range_size(); + auto range_offset = options.get_byte_range_offset(); + auto range_size = options.get_byte_range_size(); + auto range_size_padded = options.get_byte_range_size_with_padding(); - ingest_raw_input(range_offset, range_size); + ingest_raw_input(range_offset, range_size, range_size_padded); CUDF_EXPECTS(buffer_.size() != 0, "Ingest failed: input data is null.\n"); decompress_input(stream); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index f7af55b2b90..d01f2e8677e 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -109,8 +109,9 @@ class reader::impl { * * @param[in] range_offset Number of bytes offset from the start * @param[in] range_size Bytes to read; use `0` for all remaining data + * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ - void ingest_raw_input(size_t range_offset, size_t range_size); + void ingest_raw_input(size_t range_offset, size_t range_size, size_t range_size_padded); /** * @brief Extract the JSON objects keys from the input file with object rows. diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 8fb5d7cc9eb..5511a65d0a4 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1069,6 +1069,20 @@ def test_csv_reader_byte_range(tmpdir, segment_bytes): assert list(df["int2"]) == list(ref_df["int2"]) +def test_csv_reader_byte_range_type_corner_case(tmpdir): + fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file17.csv") + + cudf.datasets.timeseries( + start="2000-01-01", + end="2000-01-02", + dtypes={"name": str, "id": int, "x": float, "y": float}, + ).to_csv(fname, chunksize=100000) + + byte_range = (2_147_483_648, 0) + with pytest.raises(RuntimeError, match="Offset is past end of file"): + cudf.read_csv(fname, byte_range=byte_range, header=None) + + @pytest.mark.parametrize("segment_bytes", [10, 19, 31, 36]) def test_csv_reader_byte_range_strings(segment_bytes): names = ["strings"] From d422aebbe62d7e9915af93f474563e6e1c571e97 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 18 Aug 2021 15:30:38 -0500 Subject: [PATCH 5/6] remove filepaths from json reader --- cpp/src/io/json/reader_impl.cu | 12 +++++------- cpp/src/io/json/reader_impl.hpp | 3 +-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 0618f02e98f..2964a12568f 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -231,7 +231,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, byte_range_offset_ = range_offset; byte_range_size_ = range_size; - load_whole_file_ = byte_range_offset_ == 0 && byte_range_size_ == 0; + load_whole_source_ = byte_range_offset_ == 0 && byte_range_size_ == 0; } /** @@ -256,7 +256,7 @@ void reader::impl::decompress_input(rmm::cuda_stream_view stream) uncomp_data_ = uncomp_data_owner_.data(); uncomp_size_ = uncomp_data_owner_.size(); } - if (load_whole_file_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); + if (load_whole_source_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); } rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_view stream) @@ -268,7 +268,7 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ if (allow_newlines_in_strings_) { chars_to_count.push_back('\"'); } // If not starting at an offset, add an extra row to account for the first row in the file cudf::size_type prefilter_count = ((byte_range_offset_ == 0) ? 1 : 0); - if (load_whole_file_) { + if (load_whole_source_) { prefilter_count += count_all_from_set(data_, chars_to_count, stream); } else { prefilter_count += count_all_from_set(uncomp_data_, uncomp_size_, chars_to_count, stream); @@ -286,7 +286,7 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ std::vector chars_to_find{'\n'}; if (allow_newlines_in_strings_) { chars_to_find.push_back('\"'); } // Passing offset = 1 to return positions AFTER the found character - if (load_whole_file_) { + if (load_whole_source_) { find_all_from_set(data_, chars_to_find, 1, find_result_ptr, stream); } else { find_all_from_set(uncomp_data_, uncomp_size_, chars_to_find, 1, find_result_ptr, stream); @@ -619,7 +619,6 @@ table_with_metadata reader::impl::convert_data_to_table(device_span>&& sources, - std::vector const& filepaths, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -678,8 +677,7 @@ reader::reader(std::vector>&& sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - std::vector file_paths = {}; // Empty filepaths - _impl = std::make_unique(std::move(sources), file_paths, options, stream, mr); + _impl = std::make_unique(std::move(sources), options, stream, mr); } // Destructor within this translation unit diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index d01f2e8677e..d910cce2d72 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -68,7 +68,7 @@ class reader::impl { size_t byte_range_offset_ = 0; size_t byte_range_size_ = 0; - bool load_whole_file_ = true; + bool load_whole_source_ = true; table_metadata metadata_; std::vector dtypes_; @@ -186,7 +186,6 @@ class reader::impl { * @brief Constructor from a dataset source with reader options. */ explicit impl(std::vector>&& sources, - std::vector const& filepaths, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); From 2ac281f207f129d75b29e3316e9a644c0f4758ce Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 24 Aug 2021 09:03:58 -0500 Subject: [PATCH 6/6] minor style adjustments --- cpp/src/io/comp/uncomp.cpp | 18 +++++++++--------- cpp/src/io/csv/reader_impl.cu | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index ee451d04dbb..e08cf1f8e1b 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -425,15 +425,15 @@ std::vector io_uncompress_single_h2d(const void* src, size_t src_size, int std::vector get_uncompressed_data(host_span const data, compression_type compression) { - int comp_type = IO_UNCOMP_STREAM_TYPE_INFER; - - switch (compression) { - case compression_type::GZIP: comp_type = IO_UNCOMP_STREAM_TYPE_GZIP; break; - case compression_type::ZIP: comp_type = IO_UNCOMP_STREAM_TYPE_ZIP; break; - case compression_type::BZIP2: comp_type = IO_UNCOMP_STREAM_TYPE_BZIP2; break; - case compression_type::XZ: comp_type = IO_UNCOMP_STREAM_TYPE_XZ; break; - default: break; - } + auto const comp_type = [compression]() { + switch (compression) { + case compression_type::GZIP: return IO_UNCOMP_STREAM_TYPE_GZIP; + case compression_type::ZIP: return IO_UNCOMP_STREAM_TYPE_ZIP; + case compression_type::BZIP2: return IO_UNCOMP_STREAM_TYPE_BZIP2; + case compression_type::XZ: return IO_UNCOMP_STREAM_TYPE_XZ; + default: return IO_UNCOMP_STREAM_TYPE_INFER; + } + }(); return io_uncompress_single_h2d(data.data(), data.size(), comp_type); } diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index c61cc26800e..5877805f5c8 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -188,8 +188,8 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) // Transfer source data to GPU if (!source_->is_empty()) { - auto data_size = (range_size_padded != 0) ? range_size_padded : source_->size(); - auto buffer = source_->host_read(range_offset, data_size); + auto const data_size = (range_size_padded != 0) ? range_size_padded : source_->size(); + auto const buffer = source_->host_read(range_offset, data_size); auto h_data = host_span( // reinterpret_cast(buffer->data()),