From bfb424d2e036e74c333b0f28ae80d260c67dd6ad Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 24 Mar 2021 17:34:53 -0700 Subject: [PATCH 1/8] add new source type to avoid mmap when GDS compatibility mode is enabled --- cpp/src/io/utilities/datasource.cpp | 129 ++++++++++++++------- cpp/src/io/utilities/file_io_utilities.cpp | 88 +++++--------- cpp/src/io/utilities/file_io_utilities.hpp | 33 ++++++ 3 files changed, 151 insertions(+), 99 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 3f2884d5b7d..58b84923de1 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -33,20 +33,44 @@ namespace io { * Unlike Arrow's memory mapped IO class, this implementation allows memory * mapping a subset of the file where the starting offset may not be zero. */ -class memory_mapped_source : public datasource { - class memory_mapped_buffer : public buffer { - size_t _size = 0; - uint8_t *_data = nullptr; +class file_source : public datasource { + public: + explicit file_source(const char *filepath) : _cufile_in(detail::make_cufile_input(filepath)) {} - public: - memory_mapped_buffer(uint8_t *data, size_t size) : _size(size), _data(data) {} - size_t size() const override { return _size; } - const uint8_t *data() const override { return _data; } - }; + virtual ~file_source() = default; + + bool supports_device_read() const override { return _cufile_in != nullptr; } + + bool is_device_read_preferred(size_t size) const + { + return _cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size); + } + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override + { + CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); + return _cufile_in->read(offset, size, stream); + } + + size_t device_read(size_t offset, + size_t size, + uint8_t *dst, + rmm::cuda_stream_view stream) override + { + CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); + return _cufile_in->read(offset, size, dst, stream); + } + + private: + std::unique_ptr _cufile_in; +}; + +class memory_mapped_source : public file_source { public: explicit memory_mapped_source(const char *filepath, size_t offset, size_t size) - : _cufile_in(detail::make_cufile_input(filepath)) + : file_source(filepath) { auto const file = detail::file_wrapper(filepath, O_RDONLY); _file_size = file.size(); @@ -65,7 +89,7 @@ class memory_mapped_source : public datasource { // Clamp length to available data in the mapped region auto const read_size = std::min(size, _map_size - (offset - _map_offset)); - return std::make_unique( + return std::make_unique( static_cast(_map_addr) + (offset - _map_offset), read_size); } @@ -81,33 +105,6 @@ class memory_mapped_source : public datasource { return read_size; } - bool supports_device_read() const override { return _cufile_in != nullptr; } - - bool is_device_read_preferred(size_t size) const - { - return _cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size); - } - - std::unique_ptr device_read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) override - { - if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file."); - - auto const read_size = std::min(size, _map_size - (offset - _map_offset)); - return _cufile_in->read(offset, read_size, stream); - } - - size_t device_read(size_t offset, - size_t size, - uint8_t *dst, - rmm::cuda_stream_view stream) override - { - if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file."); - auto const read_size = std::min(size, _map_size - (offset - _map_offset)); - return _cufile_in->read(offset, read_size, dst, stream); - } - size_t size() const override { return _file_size; } private: @@ -134,11 +131,55 @@ class memory_mapped_source : public datasource { } private: - size_t _file_size = 0; - void *_map_addr = nullptr; size_t _map_size = 0; size_t _map_offset = 0; - std::unique_ptr _cufile_in; + size_t _file_size = 0; + void *_map_addr = nullptr; +}; + +class direct_read_source : public file_source { + class vector_buffer : public buffer { + std::vector _data; + + public: + explicit vector_buffer(std::vector &&data) : _data(std::move(data)) {} + size_t size() const override { return _data.size(); } + const uint8_t *data() const override { return _data.data(); } + }; + + public: + explicit direct_read_source(const char *filepath) + : file_source(filepath), _file(filepath, O_RDONLY) + { + struct stat st; + CUDF_EXPECTS(fstat(_file.desc(), &st) != -1, "Cannot query file size"); + _file_size = static_cast(st.st_size); + } + + std::unique_ptr host_read(size_t offset, size_t size) override + { + lseek(_file.desc(), offset, SEEK_SET); + // Clamp length to available data + ssize_t const read_size = std::min(size, _file_size - offset); + std::vector v(read_size); + CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed"); + return std::make_unique(std::move(v)); + } + + size_t host_read(size_t offset, size_t size, uint8_t *dst) override + { + lseek(_file.desc(), offset, SEEK_SET); + + // Clamp length to available data + auto const read_size = std::min(size, _file_size - offset); + return read(_file.desc(), dst, read_size); + } + + size_t size() const override { return _file_size; } + + private: + size_t _file_size = 0; + detail::file_wrapper _file; }; /** @@ -189,6 +230,12 @@ std::unique_ptr datasource::create(const std::string &filepath, size_t offset, size_t size) { +#ifdef CUFILE_FOUND + if (detail::cufile_config::instance()->is_required()) { + // avoid mmap as GDS is expected to be used for most reads + return std::make_unique(filepath.c_str()); + } +#endif // Use our own memory mapping implementation for direct file reads return std::make_unique(filepath.c_str(), offset, size); } diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 22ff057cbc1..e382ea54a04 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include #include #include @@ -52,67 +51,40 @@ long file_wrapper::size() const #ifdef CUFILE_FOUND -/** - * @brief Class that manages cuFile configuration. - */ -class cufile_config { - std::string const default_policy = "OFF"; - std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; - - std::string const policy = default_policy; - temp_directory tmp_config_dir{"cudf_cufile_config"}; - - std::string getenv_or(std::string const &env_var_name, std::string const &default_val) - { - auto const env_val = std::getenv(env_var_name.c_str()); - return (env_val == nullptr) ? default_val : std::string(env_val); - } +std::string cufile_config::getenv_or(std::string const &env_var_name, + std::string const &default_val) +{ + auto const env_val = std::getenv(env_var_name.c_str()); + return (env_val == nullptr) ? default_val : std::string(env_val); +} - cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", default_policy)} - { - if (is_enabled()) { - // Modify the config file based on the policy - auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); - std::ifstream user_config_file(config_file_path); - // Modified config file is stored in a temporary directory - auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json"; - std::ofstream cudf_config_file(cudf_config_path); - - std::string line; - while (std::getline(user_config_file, line)) { - std::string const tag = "\"allow_compat_mode\""; - if (line.find(tag) != std::string::npos) { - // TODO: only replace the true/false value - // Enable compatiblity mode when cuDF does not fall back to host path - cudf_config_file << tag << ": " << (is_required() ? "true" : "false") << ",\n"; - } else { - cudf_config_file << line << '\n'; - } - - // Point libcufile to the modified config file - CUDF_EXPECTS(setenv(json_path_env_var.c_str(), cudf_config_path.c_str(), 0) == 0, - "Failed to set the cuFile config file environment variable."); +cufile_config::cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", default_policy)} +{ + if (is_enabled()) { + // Modify the config file based on the policy + auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); + std::ifstream user_config_file(config_file_path); + // Modified config file is stored in a temporary directory + auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json"; + std::ofstream cudf_config_file(cudf_config_path); + + std::string line; + while (std::getline(user_config_file, line)) { + std::string const tag = "\"allow_compat_mode\""; + if (line.find(tag) != std::string::npos) { + // TODO: only replace the true/false value + // Enable compatiblity mode when cuDF does not fall back to host path + cudf_config_file << tag << ": " << (is_required() ? "true" : "false") << ",\n"; + } else { + cudf_config_file << line << '\n'; } - } - } - - public: - /** - * @brief Returns true when cuFile use is enabled. - */ - bool is_enabled() const { return policy == "ALWAYS" or policy == "GDS"; } - - /** - * @brief Returns true when cuDF should not fall back to host IO. - */ - bool is_required() const { return policy == "ALWAYS"; } - static cufile_config const *instance() - { - static cufile_config _instance; - return &_instance; + // Point libcufile to the modified config file + CUDF_EXPECTS(setenv(json_path_env_var.c_str(), cudf_config_path.c_str(), 0) == 0, + "Failed to set the cuFile config file environment variable."); + } } -}; +} /** * @brief Class that dynamically loads the cuFile library and manages the cuFile driver. diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 85399bdd44d..8c487f50f1d 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -24,6 +24,7 @@ #include #include +#include #include @@ -128,6 +129,38 @@ class cufile_output : public cufile_io_base { class cufile_shim; +/** + * @brief Class that manages cuFile configuration. + */ +class cufile_config { + std::string const default_policy = "OFF"; + std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; + + std::string const policy = default_policy; + temp_directory tmp_config_dir{"cudf_cufile_config"}; + + std::string getenv_or(std::string const &env_var_name, std::string const &default_val); + + cufile_config(); + + public: + /** + * @brief Returns true when cuFile use is enabled. + */ + bool is_enabled() const { return policy == "ALWAYS" or policy == "GDS"; } + + /** + * @brief Returns true when cuDF should not fall back to host IO. + */ + bool is_required() const { return policy == "ALWAYS"; } + + static cufile_config const *instance() + { + static cufile_config _instance; + return &_instance; + } +}; + /** * @brief Class that provides RAII for cuFile file registration. */ From 05ab2ddd1f17897f32f96f1469cba28612f7f539 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 24 Mar 2021 17:38:59 -0700 Subject: [PATCH 2/8] remove redundant buffer type --- cpp/src/io/utilities/datasource.cpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 58b84923de1..754807fb9c8 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -138,15 +138,6 @@ class memory_mapped_source : public file_source { }; class direct_read_source : public file_source { - class vector_buffer : public buffer { - std::vector _data; - - public: - explicit vector_buffer(std::vector &&data) : _data(std::move(data)) {} - size_t size() const override { return _data.size(); } - const uint8_t *data() const override { return _data.data(); } - }; - public: explicit direct_read_source(const char *filepath) : file_source(filepath), _file(filepath, O_RDONLY) @@ -163,7 +154,7 @@ class direct_read_source : public file_source { ssize_t const read_size = std::min(size, _file_size - offset); std::vector v(read_size); CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed"); - return std::make_unique(std::move(v)); + return buffer::create(std::move(v)); } size_t host_read(size_t offset, size_t size, uint8_t *dst) override From 9f4e25157294b968537bbf66e603ca5b951268ff Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 25 Mar 2021 11:08:49 -0700 Subject: [PATCH 3/8] move cufile_config::instance definiton to the cpp file --- cpp/src/io/utilities/file_io_utilities.cpp | 5 +++++ cpp/src/io/utilities/file_io_utilities.hpp | 6 +----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index e382ea54a04..578d27cbad0 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -85,6 +85,11 @@ cufile_config::cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", defau } } } +cufile_config const *cufile_config::instance() +{ + static cufile_config _instance; + return &_instance; +} /** * @brief Class that dynamically loads the cuFile library and manages the cuFile driver. diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 8c487f50f1d..1542a747428 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -154,11 +154,7 @@ class cufile_config { */ bool is_required() const { return policy == "ALWAYS"; } - static cufile_config const *instance() - { - static cufile_config _instance; - return &_instance; - } + static cufile_config const *instance(); }; /** From ef742a04daf8c10fd758ab7cbf507da9b40146ef Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 25 Mar 2021 11:24:44 -0700 Subject: [PATCH 4/8] make getnev_or a free function --- cpp/src/io/utilities/file_io_utilities.cpp | 7 +++---- cpp/src/io/utilities/file_io_utilities.hpp | 2 -- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 578d27cbad0..6a19c903931 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -49,15 +49,14 @@ long file_wrapper::size() const return _size; } -#ifdef CUFILE_FOUND - -std::string cufile_config::getenv_or(std::string const &env_var_name, - std::string const &default_val) +std::string getenv_or(std::string const &env_var_name, std::string const &default_val) { auto const env_val = std::getenv(env_var_name.c_str()); return (env_val == nullptr) ? default_val : std::string(env_val); } +#ifdef CUFILE_FOUND + cufile_config::cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", default_policy)} { if (is_enabled()) { diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 1542a747428..331b80d1af5 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -139,8 +139,6 @@ class cufile_config { std::string const policy = default_policy; temp_directory tmp_config_dir{"cudf_cufile_config"}; - std::string getenv_or(std::string const &env_var_name, std::string const &default_val); - cufile_config(); public: From 345887e5450e187f2f1c3cfaaf91f494bcd20bce Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 25 Mar 2021 13:01:56 -0700 Subject: [PATCH 5/8] docs; pull up _file to reuse; replace lazy size query in file_wrapper; consistent size checks --- cpp/include/cudf/io/datasource.hpp | 2 +- cpp/src/io/utilities/datasource.cpp | 79 ++++++++++++---------- cpp/src/io/utilities/file_io_utilities.cpp | 21 +++--- cpp/src/io/utilities/file_io_utilities.hpp | 6 +- 4 files changed, 55 insertions(+), 53 deletions(-) diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 8fcc045e6d2..ab7a3a6fa9b 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -123,7 +123,7 @@ class datasource { * @param[in] offset Bytes from the start * @param[in] size Bytes to read * - * @return The data buffer + * @return The data buffer (can be smaller than size) */ virtual std::unique_ptr host_read(size_t offset, size_t size) = 0; diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 754807fb9c8..34427749165 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -27,15 +27,16 @@ namespace cudf { namespace io { /** - * @brief Implementation class for reading from a file or memory source using - * memory mapped access. - * - * Unlike Arrow's memory mapped IO class, this implementation allows memory - * mapping a subset of the file where the starting offset may not be zero. + * @brief Base class for file input. Only implements direct device reads. */ class file_source : public datasource { public: - explicit file_source(const char *filepath) : _cufile_in(detail::make_cufile_input(filepath)) {} + explicit file_source(const char *filepath) + : _file(filepath, O_RDONLY), + _file_size{_file.size()}, + _cufile_in(detail::make_cufile_input(filepath)) + { + } virtual ~file_source() = default; @@ -51,7 +52,9 @@ class file_source : public datasource { rmm::cuda_stream_view stream) override { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - return _cufile_in->read(offset, size, stream); + + auto const read_size = std::min(size, _file.size() - offset); + return _cufile_in->read(offset, read_size, stream); } size_t device_read(size_t offset, @@ -60,21 +63,32 @@ class file_source : public datasource { rmm::cuda_stream_view stream) override { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - return _cufile_in->read(offset, size, dst, stream); + + auto const read_size = std::min(size, _file.size() - offset); + return _cufile_in->read(offset, read_size, dst, stream); } + size_t size() const override { return _file.size(); } + + protected: + detail::file_wrapper _file; + private: std::unique_ptr _cufile_in; }; +/** + * @brief Implementation class for reading from a file using memory mapped access. + * + * Unlike Arrow's memory mapped IO class, this implementation allows memory mapping a subset of the + * file where the starting offset may not be zero. + */ class memory_mapped_source : public file_source { public: explicit memory_mapped_source(const char *filepath, size_t offset, size_t size) : file_source(filepath) { - auto const file = detail::file_wrapper(filepath, O_RDONLY); - _file_size = file.size(); - if (_file_size != 0) { map(file.desc(), offset, size); } + map(_file.desc(), offset, size); } virtual ~memory_mapped_source() @@ -105,22 +119,15 @@ class memory_mapped_source : public file_source { return read_size; } - size_t size() const override { return _file_size; } - private: void map(int fd, size_t offset, size_t size) { - CUDF_EXPECTS(offset < _file_size, "Offset is past end of file"); + CUDF_EXPECTS(offset + size < _file.size(), "Mapped region is past end of file"); // Offset for `mmap()` must be page aligned _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); - // Clamp length to available data in the file - if (size == 0) { - size = _file_size - offset; - } else { - if ((offset + size) > _file_size) { size = _file_size - offset; } - } + if (size == 0) { size = _file.size() - offset; } // Size for `mmap()` needs to include the page padding _map_size = size + (offset - _map_offset); @@ -133,25 +140,26 @@ class memory_mapped_source : public file_source { private: size_t _map_size = 0; size_t _map_offset = 0; - size_t _file_size = 0; void *_map_addr = nullptr; }; +/** + * @brief Implementation class for reading from a file using `read` calls + * + * Potentially faster than `memory_mapped_source` when only a small portion of the file is read + * through the host. + */ class direct_read_source : public file_source { public: - explicit direct_read_source(const char *filepath) - : file_source(filepath), _file(filepath, O_RDONLY) - { - struct stat st; - CUDF_EXPECTS(fstat(_file.desc(), &st) != -1, "Cannot query file size"); - _file_size = static_cast(st.st_size); - } + explicit direct_read_source(const char *filepath) : file_source(filepath) {} std::unique_ptr host_read(size_t offset, size_t size) override { lseek(_file.desc(), offset, SEEK_SET); + // Clamp length to available data - ssize_t const read_size = std::min(size, _file_size - offset); + ssize_t const read_size = std::min(size, _file.size() - offset); + std::vector v(read_size); CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed"); return buffer::create(std::move(v)); @@ -162,15 +170,12 @@ class direct_read_source : public file_source { lseek(_file.desc(), offset, SEEK_SET); // Clamp length to available data - auto const read_size = std::min(size, _file_size - offset); - return read(_file.desc(), dst, read_size); - } - - size_t size() const override { return _file_size; } + auto const read_size = std::min(size, _file.size() - offset); - private: - size_t _file_size = 0; - detail::file_wrapper _file; + CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast(read_size), + "read failed"); + return read_size; + } }; /** diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 6a19c903931..3544241f659 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -25,30 +25,27 @@ namespace cudf { namespace io { namespace detail { +size_t get_file_size(int fd) +{ + struct stat st; + CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size"); + return static_cast(st.st_size); +} + file_wrapper::file_wrapper(std::string const &filepath, int flags) - : fd(open(filepath.c_str(), flags)) + : fd(open(filepath.c_str(), flags)), _size{get_file_size(fd)} { CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); } file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode) - : fd(open(filepath.c_str(), flags, mode)) + : fd(open(filepath.c_str(), flags, mode)), _size{get_file_size(fd)} { CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); } file_wrapper::~file_wrapper() { close(fd); } -long file_wrapper::size() const -{ - if (_size < 0) { - struct stat st; - CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size"); - _size = static_cast(st.st_size); - } - return _size; -} - std::string getenv_or(std::string const &env_var_name, std::string const &default_val) { auto const env_val = std::getenv(env_var_name.c_str()); diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 331b80d1af5..0119484aee5 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -36,14 +36,14 @@ namespace detail { * @brief Class that provides RAII for file handling. */ class file_wrapper { - int const fd = -1; - long mutable _size = -1; + int fd = -1; + size_t _size; public: explicit file_wrapper(std::string const &filepath, int flags); explicit file_wrapper(std::string const &filepath, int flags, mode_t mode); ~file_wrapper(); - long size() const; + auto size() const { return _size; } auto desc() const { return fd; } }; From fe209299311a71b4eeef922ab607d777d511b53d Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 25 Mar 2021 13:12:36 -0700 Subject: [PATCH 6/8] fix + name improvement --- cpp/src/io/utilities/datasource.cpp | 4 +--- cpp/src/io/utilities/file_io_utilities.cpp | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 34427749165..a448640efd4 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -32,9 +32,7 @@ namespace io { class file_source : public datasource { public: explicit file_source(const char *filepath) - : _file(filepath, O_RDONLY), - _file_size{_file.size()}, - _cufile_in(detail::make_cufile_input(filepath)) + : _file(filepath, O_RDONLY), _cufile_in(detail::make_cufile_input(filepath)) { } diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 3544241f659..322296715fc 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -25,10 +25,10 @@ namespace cudf { namespace io { namespace detail { -size_t get_file_size(int fd) +size_t get_file_size(int file_descriptor) { struct stat st; - CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size"); + CUDF_EXPECTS(fstat(file_descriptor, &st) != -1, "Cannot query file size"); return static_cast(st.st_size); } From 0cce85896f81c233c74bb253ea18d7503c188aee Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 25 Mar 2021 18:38:42 -0700 Subject: [PATCH 7/8] fix error with mapped source --- cpp/src/io/utilities/datasource.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index a448640efd4..abc7a5ae056 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -86,7 +86,7 @@ class memory_mapped_source : public file_source { explicit memory_mapped_source(const char *filepath, size_t offset, size_t size) : file_source(filepath) { - map(_file.desc(), offset, size); + if (_file.size() != 0) map(_file.desc(), offset, size); } virtual ~memory_mapped_source() @@ -120,12 +120,12 @@ class memory_mapped_source : public file_source { private: void map(int fd, size_t offset, size_t size) { - CUDF_EXPECTS(offset + size < _file.size(), "Mapped region is past end of file"); + CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file"); // Offset for `mmap()` must be page aligned _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); - if (size == 0) { size = _file.size() - offset; } + if (size == 0 || (offset + size) > _file.size()) { size = _file.size() - offset; } // Size for `mmap()` needs to include the page padding _map_size = size + (offset - _map_offset); From ca63a42007212baa18e3d1809558ea39382586db Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 29 Mar 2021 12:16:47 -0700 Subject: [PATCH 8/8] place concrete datasource type in anonymous namespace --- cpp/src/io/utilities/datasource.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index abc7a5ae056..8f2a5389b4d 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -25,6 +25,7 @@ namespace cudf { namespace io { +namespace { /** * @brief Base class for file input. Only implements direct device reads. @@ -220,6 +221,8 @@ class user_datasource_wrapper : public datasource { datasource *const source; ///< A non-owning pointer to the user-implemented datasource }; +} // namespace + std::unique_ptr datasource::create(const std::string &filepath, size_t offset, size_t size)