Skip to content

Commit

Permalink
docs; pull up _file to reuse; replace lazy size query in file_wrapper…
Browse files Browse the repository at this point in the history
…; consistent size checks
  • Loading branch information
vuule committed Mar 25, 2021
1 parent ef742a0 commit 345887e
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 53 deletions.
2 changes: 1 addition & 1 deletion cpp/include/cudf/io/datasource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ class datasource {
* @param[in] offset Bytes from the start
* @param[in] size Bytes to read
*
* @return The data buffer
* @return The data buffer (can be smaller than size)
*/
virtual std::unique_ptr<datasource::buffer> host_read(size_t offset, size_t size) = 0;

Expand Down
79 changes: 42 additions & 37 deletions cpp/src/io/utilities/datasource.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,16 @@ namespace cudf {
namespace io {

/**
* @brief Implementation class for reading from a file or memory source using
* memory mapped access.
*
* Unlike Arrow's memory mapped IO class, this implementation allows memory
* mapping a subset of the file where the starting offset may not be zero.
* @brief Base class for file input. Only implements direct device reads.
*/
class file_source : public datasource {
public:
explicit file_source(const char *filepath) : _cufile_in(detail::make_cufile_input(filepath)) {}
explicit file_source(const char *filepath)
: _file(filepath, O_RDONLY),
_file_size{_file.size()},
_cufile_in(detail::make_cufile_input(filepath))
{
}

virtual ~file_source() = default;

Expand All @@ -51,7 +52,9 @@ class file_source : public datasource {
rmm::cuda_stream_view stream) override
{
CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file.");
return _cufile_in->read(offset, size, stream);

auto const read_size = std::min(size, _file.size() - offset);
return _cufile_in->read(offset, read_size, stream);
}

size_t device_read(size_t offset,
Expand All @@ -60,21 +63,32 @@ class file_source : public datasource {
rmm::cuda_stream_view stream) override
{
CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file.");
return _cufile_in->read(offset, size, dst, stream);

auto const read_size = std::min(size, _file.size() - offset);
return _cufile_in->read(offset, read_size, dst, stream);
}

size_t size() const override { return _file.size(); }

protected:
detail::file_wrapper _file;

private:
std::unique_ptr<detail::cufile_input_impl> _cufile_in;
};

/**
* @brief Implementation class for reading from a file using memory mapped access.
*
* Unlike Arrow's memory mapped IO class, this implementation allows memory mapping a subset of the
* file where the starting offset may not be zero.
*/
class memory_mapped_source : public file_source {
public:
explicit memory_mapped_source(const char *filepath, size_t offset, size_t size)
: file_source(filepath)
{
auto const file = detail::file_wrapper(filepath, O_RDONLY);
_file_size = file.size();
if (_file_size != 0) { map(file.desc(), offset, size); }
map(_file.desc(), offset, size);
}

virtual ~memory_mapped_source()
Expand Down Expand Up @@ -105,22 +119,15 @@ class memory_mapped_source : public file_source {
return read_size;
}

size_t size() const override { return _file_size; }

private:
void map(int fd, size_t offset, size_t size)
{
CUDF_EXPECTS(offset < _file_size, "Offset is past end of file");
CUDF_EXPECTS(offset + size < _file.size(), "Mapped region is past end of file");

// Offset for `mmap()` must be page aligned
_map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1);

// Clamp length to available data in the file
if (size == 0) {
size = _file_size - offset;
} else {
if ((offset + size) > _file_size) { size = _file_size - offset; }
}
if (size == 0) { size = _file.size() - offset; }

// Size for `mmap()` needs to include the page padding
_map_size = size + (offset - _map_offset);
Expand All @@ -133,25 +140,26 @@ class memory_mapped_source : public file_source {
private:
size_t _map_size = 0;
size_t _map_offset = 0;
size_t _file_size = 0;
void *_map_addr = nullptr;
};

/**
* @brief Implementation class for reading from a file using `read` calls
*
* Potentially faster than `memory_mapped_source` when only a small portion of the file is read
* through the host.
*/
class direct_read_source : public file_source {
public:
explicit direct_read_source(const char *filepath)
: file_source(filepath), _file(filepath, O_RDONLY)
{
struct stat st;
CUDF_EXPECTS(fstat(_file.desc(), &st) != -1, "Cannot query file size");
_file_size = static_cast<size_t>(st.st_size);
}
explicit direct_read_source(const char *filepath) : file_source(filepath) {}

std::unique_ptr<buffer> host_read(size_t offset, size_t size) override
{
lseek(_file.desc(), offset, SEEK_SET);

// Clamp length to available data
ssize_t const read_size = std::min(size, _file_size - offset);
ssize_t const read_size = std::min(size, _file.size() - offset);

std::vector<uint8_t> v(read_size);
CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed");
return buffer::create(std::move(v));
Expand All @@ -162,15 +170,12 @@ class direct_read_source : public file_source {
lseek(_file.desc(), offset, SEEK_SET);

// Clamp length to available data
auto const read_size = std::min(size, _file_size - offset);
return read(_file.desc(), dst, read_size);
}

size_t size() const override { return _file_size; }
auto const read_size = std::min(size, _file.size() - offset);

private:
size_t _file_size = 0;
detail::file_wrapper _file;
CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast<ssize_t>(read_size),
"read failed");
return read_size;
}
};

/**
Expand Down
21 changes: 9 additions & 12 deletions cpp/src/io/utilities/file_io_utilities.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,30 +25,27 @@ namespace cudf {
namespace io {
namespace detail {

size_t get_file_size(int fd)
{
struct stat st;
CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size");
return static_cast<size_t>(st.st_size);
}

file_wrapper::file_wrapper(std::string const &filepath, int flags)
: fd(open(filepath.c_str(), flags))
: fd(open(filepath.c_str(), flags)), _size{get_file_size(fd)}
{
CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath);
}

file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode)
: fd(open(filepath.c_str(), flags, mode))
: fd(open(filepath.c_str(), flags, mode)), _size{get_file_size(fd)}
{
CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath);
}

file_wrapper::~file_wrapper() { close(fd); }

long file_wrapper::size() const
{
if (_size < 0) {
struct stat st;
CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size");
_size = static_cast<size_t>(st.st_size);
}
return _size;
}

std::string getenv_or(std::string const &env_var_name, std::string const &default_val)
{
auto const env_val = std::getenv(env_var_name.c_str());
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/io/utilities/file_io_utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,14 @@ namespace detail {
* @brief Class that provides RAII for file handling.
*/
class file_wrapper {
int const fd = -1;
long mutable _size = -1;
int fd = -1;
size_t _size;

public:
explicit file_wrapper(std::string const &filepath, int flags);
explicit file_wrapper(std::string const &filepath, int flags, mode_t mode);
~file_wrapper();
long size() const;
auto size() const { return _size; }
auto desc() const { return fd; }
};

Expand Down

0 comments on commit 345887e

Please sign in to comment.