From 345887e5450e187f2f1c3cfaaf91f494bcd20bce Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 25 Mar 2021 13:01:56 -0700 Subject: [PATCH] docs; pull up _file to reuse; replace lazy size query in file_wrapper; consistent size checks --- cpp/include/cudf/io/datasource.hpp | 2 +- cpp/src/io/utilities/datasource.cpp | 79 ++++++++++++---------- cpp/src/io/utilities/file_io_utilities.cpp | 21 +++--- cpp/src/io/utilities/file_io_utilities.hpp | 6 +- 4 files changed, 55 insertions(+), 53 deletions(-) diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 8fcc045e6d2..ab7a3a6fa9b 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -123,7 +123,7 @@ class datasource { * @param[in] offset Bytes from the start * @param[in] size Bytes to read * - * @return The data buffer + * @return The data buffer (can be smaller than size) */ virtual std::unique_ptr host_read(size_t offset, size_t size) = 0; diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 754807fb9c8..34427749165 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -27,15 +27,16 @@ namespace cudf { namespace io { /** - * @brief Implementation class for reading from a file or memory source using - * memory mapped access. - * - * Unlike Arrow's memory mapped IO class, this implementation allows memory - * mapping a subset of the file where the starting offset may not be zero. + * @brief Base class for file input. Only implements direct device reads. */ class file_source : public datasource { public: - explicit file_source(const char *filepath) : _cufile_in(detail::make_cufile_input(filepath)) {} + explicit file_source(const char *filepath) + : _file(filepath, O_RDONLY), + _file_size{_file.size()}, + _cufile_in(detail::make_cufile_input(filepath)) + { + } virtual ~file_source() = default; @@ -51,7 +52,9 @@ class file_source : public datasource { rmm::cuda_stream_view stream) override { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - return _cufile_in->read(offset, size, stream); + + auto const read_size = std::min(size, _file.size() - offset); + return _cufile_in->read(offset, read_size, stream); } size_t device_read(size_t offset, @@ -60,21 +63,32 @@ class file_source : public datasource { rmm::cuda_stream_view stream) override { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - return _cufile_in->read(offset, size, dst, stream); + + auto const read_size = std::min(size, _file.size() - offset); + return _cufile_in->read(offset, read_size, dst, stream); } + size_t size() const override { return _file.size(); } + + protected: + detail::file_wrapper _file; + private: std::unique_ptr _cufile_in; }; +/** + * @brief Implementation class for reading from a file using memory mapped access. + * + * Unlike Arrow's memory mapped IO class, this implementation allows memory mapping a subset of the + * file where the starting offset may not be zero. + */ class memory_mapped_source : public file_source { public: explicit memory_mapped_source(const char *filepath, size_t offset, size_t size) : file_source(filepath) { - auto const file = detail::file_wrapper(filepath, O_RDONLY); - _file_size = file.size(); - if (_file_size != 0) { map(file.desc(), offset, size); } + map(_file.desc(), offset, size); } virtual ~memory_mapped_source() @@ -105,22 +119,15 @@ class memory_mapped_source : public file_source { return read_size; } - size_t size() const override { return _file_size; } - private: void map(int fd, size_t offset, size_t size) { - CUDF_EXPECTS(offset < _file_size, "Offset is past end of file"); + CUDF_EXPECTS(offset + size < _file.size(), "Mapped region is past end of file"); // Offset for `mmap()` must be page aligned _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); - // Clamp length to available data in the file - if (size == 0) { - size = _file_size - offset; - } else { - if ((offset + size) > _file_size) { size = _file_size - offset; } - } + if (size == 0) { size = _file.size() - offset; } // Size for `mmap()` needs to include the page padding _map_size = size + (offset - _map_offset); @@ -133,25 +140,26 @@ class memory_mapped_source : public file_source { private: size_t _map_size = 0; size_t _map_offset = 0; - size_t _file_size = 0; void *_map_addr = nullptr; }; +/** + * @brief Implementation class for reading from a file using `read` calls + * + * Potentially faster than `memory_mapped_source` when only a small portion of the file is read + * through the host. + */ class direct_read_source : public file_source { public: - explicit direct_read_source(const char *filepath) - : file_source(filepath), _file(filepath, O_RDONLY) - { - struct stat st; - CUDF_EXPECTS(fstat(_file.desc(), &st) != -1, "Cannot query file size"); - _file_size = static_cast(st.st_size); - } + explicit direct_read_source(const char *filepath) : file_source(filepath) {} std::unique_ptr host_read(size_t offset, size_t size) override { lseek(_file.desc(), offset, SEEK_SET); + // Clamp length to available data - ssize_t const read_size = std::min(size, _file_size - offset); + ssize_t const read_size = std::min(size, _file.size() - offset); + std::vector v(read_size); CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed"); return buffer::create(std::move(v)); @@ -162,15 +170,12 @@ class direct_read_source : public file_source { lseek(_file.desc(), offset, SEEK_SET); // Clamp length to available data - auto const read_size = std::min(size, _file_size - offset); - return read(_file.desc(), dst, read_size); - } - - size_t size() const override { return _file_size; } + auto const read_size = std::min(size, _file.size() - offset); - private: - size_t _file_size = 0; - detail::file_wrapper _file; + CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast(read_size), + "read failed"); + return read_size; + } }; /** diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 6a19c903931..3544241f659 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -25,30 +25,27 @@ namespace cudf { namespace io { namespace detail { +size_t get_file_size(int fd) +{ + struct stat st; + CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size"); + return static_cast(st.st_size); +} + file_wrapper::file_wrapper(std::string const &filepath, int flags) - : fd(open(filepath.c_str(), flags)) + : fd(open(filepath.c_str(), flags)), _size{get_file_size(fd)} { CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); } file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode) - : fd(open(filepath.c_str(), flags, mode)) + : fd(open(filepath.c_str(), flags, mode)), _size{get_file_size(fd)} { CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); } file_wrapper::~file_wrapper() { close(fd); } -long file_wrapper::size() const -{ - if (_size < 0) { - struct stat st; - CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size"); - _size = static_cast(st.st_size); - } - return _size; -} - std::string getenv_or(std::string const &env_var_name, std::string const &default_val) { auto const env_val = std::getenv(env_var_name.c_str()); diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 331b80d1af5..0119484aee5 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -36,14 +36,14 @@ namespace detail { * @brief Class that provides RAII for file handling. */ class file_wrapper { - int const fd = -1; - long mutable _size = -1; + int fd = -1; + size_t _size; public: explicit file_wrapper(std::string const &filepath, int flags); explicit file_wrapper(std::string const &filepath, int flags, mode_t mode); ~file_wrapper(); - long size() const; + auto size() const { return _size; } auto desc() const { return fd; } };