Skip to content

Commit

Permalink
open(): add support for O_DIRECT flag
Browse files Browse the repository at this point in the history
When this flag is set, file I/O is performed directly on the
storage device, bypassing the page cache. Direct file I/O requires
the address and length of user buffers, as well as the file offset,
to be aligned to the filesystem block size (which is 512 bytes for
TFS); if this requirement is not met, I/O syscalls return -EINVAL.
  • Loading branch information
francescolavra committed Aug 14, 2024
1 parent 1d022f3 commit c0df77b
Show file tree
Hide file tree
Showing 5 changed files with 259 additions and 13 deletions.
66 changes: 63 additions & 3 deletions src/unix/filesystem.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,66 @@ void file_readahead(file f, u64 offset, u64 len)
irangel(offset + len, ra_size), 0, 0);
}

/* Appends to sg one buffer descriptor referencing len bytes starting at base.
 * The descriptor is set up with a zero offset and a zero refcount.
 * Returns 0 on success, or -ENOMEM if the SG list cannot supply a new buffer. */
static sysreturn file_io_sg_add(sg_list sg, void *base, u64 len)
{
    sg_buf sgb = sg_list_tail_add(sg, len);
    if (sgb == INVALID_ADDRESS)
        return -ENOMEM;
    sgb->buf = base;
    sgb->size = len;
    sgb->offset = 0;
    sgb->refcount = 0;
    return 0;
}

/* Populates sg with buffers describing the user memory referenced by iov[0..count-1].
 *
 * Without O_DIRECT, the I/O vector is converted as is via iov_to_sg().
 *
 * With O_DIRECT, the file offset as well as the base address and length of every non-empty
 * vector element must be multiples of the filesystem block size (the mask arithmetic assumes
 * the block size is a power of 2), and each element is split so that every SG buffer
 * references a physically contiguous memory range.
 *
 * Returns 0 on success, -EINVAL on misaligned offset/address/length, -EFAULT if a virtual
 * address has no physical translation, -ENOMEM if an SG buffer cannot be added. On error,
 * buffers already appended to sg are not removed here. */
static sysreturn file_io_init_internal(file f, u64 offset, struct iovec *iov, int count, sg_list sg)
{
    if (!(f->f.flags & O_DIRECT)) {
        iov_to_sg(sg, iov, count);
        return 0;
    }
    u64 block_mask = fs_blocksize(f->fs) - 1;
    if (offset & block_mask)
        return -EINVAL;
    for (int i = 0; i < count; i++) {
        u64 len = iov[i].iov_len;
        if (len == 0)
            continue;   /* empty elements are skipped before any alignment check */
        void *ptr = iov[i].iov_base;
        if ((u64_from_pointer(ptr) & block_mask) || (len & block_mask))
            return -EINVAL;
        /* NOTE(review): presumably faults the range in, so that the address translations
         * below can succeed - confirm against touch_memory() */
        touch_memory(ptr, len);
        u64 phys = physical_from_virtual(ptr);
        if (phys == INVALID_PHYSICAL)
            return -EFAULT;
        void *end = ptr + len;
        sysreturn rv;

        /* ensure each SG buffer references a physically contiguous memory range */
        void *contiguous_base = ptr;
        u64 contiguous_len;
        if ((u64_from_pointer(end - 1) & ~PAGEMASK) == (u64_from_pointer(ptr) & ~PAGEMASK)) {
            contiguous_len = end - ptr; /* range fits in a single page */
        } else {
            /* the current run initially covers the span from the start of the range up to
             * the first page boundary after it (pad() presumably rounds up to a multiple
             * of PAGESIZE) */
            ptr = pointer_from_u64(pad(u64_from_pointer(ptr + 1), PAGESIZE));
            contiguous_len = ptr - contiguous_base;
            phys += contiguous_len;

            /* walk the remaining pages; phys tracks the physical address the next page
             * must have in order to extend the current run */
            for (; ptr < end; ptr += PAGESIZE, phys += PAGESIZE) {
                u64 next_phys = physical_from_virtual(ptr);
                if (next_phys != phys) {
                    if (next_phys == INVALID_PHYSICAL)
                        return -EFAULT;
                    /* physical discontinuity: emit the current run and start a new one */
                    rv = file_io_sg_add(sg, contiguous_base, contiguous_len);
                    if (rv)
                        return rv;
                    contiguous_base = ptr;
                    contiguous_len = 0;
                    phys = next_phys;
                }
                contiguous_len += MIN(end - ptr, PAGESIZE);
            }
        }

        /* emit the last (or only) run of this vector element */
        rv = file_io_sg_add(sg, contiguous_base, contiguous_len);
        if (rv)
            return rv;
    }
    return 0;
}

sysreturn file_io_init_sg(file f, u64 offset, struct iovec *iov, int count, sg_list *sgp)
{
sg_list sg = sg_new(count);
Expand All @@ -85,9 +145,9 @@ sysreturn file_io_init_sg(file f, u64 offset, struct iovec *iov, int count, sg_l
rv = -EFAULT;
goto out;
}
iov_to_sg(sg, iov, count);
rv = 0;
*sgp = sg;
rv = file_io_init_internal(f, offset, iov, count, sg);
if (!rv)
*sgp = sg;
out:
if (rv != -EFAULT)
context_clear_err(ctx);
Expand Down
32 changes: 22 additions & 10 deletions src/unix/syscall.c
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,9 @@ static void file_io_complete(file f, range r, boolean is_file_offset, sg_list sg
if (is_ok(s)) {
u64 len = range_span(r);
len -= sg_total_len(sg);
u64 file_len = fsfile_get_length(f->fsf);
if (r.start + len > file_len) /* can happen with direct I/O */
len = file_len - r.start;
if (is_file_offset) /* vs specified offset (pread/pwrite) */
f->offset += len;
rv = len;
Expand Down Expand Up @@ -484,6 +487,15 @@ static void begin_file_read(file f, u64 length)
}
}

/* Issues a read of file range q into sg through the file's fs_read handler, with sh as
 * completion handler. Readahead is requested afterwards, except for files opened with
 * O_DIRECT. */
static void file_do_read(file f, sg_list sg, range q, status_handler sh)
{
    u64 span = range_span(q);

    begin_file_read(f, span);
    apply(f->fs_read, sg, q, sh);

    /* no readahead for direct I/O */
    if (f->f.flags & O_DIRECT)
        return;
    file_readahead(f, q.start, span);
}

closure_function(5, 1, void, file_read_complete,
sg_list, sg, range, r, file, f, boolean, is_file_offset, io_completion, completion,
status s)
Expand Down Expand Up @@ -521,9 +533,7 @@ closure_func_basic(file_io, sysreturn, file_read,
deallocate_sg_list(sg);
return io_complete(completion, -ENOMEM);
}
begin_file_read(f, length);
apply(f->fs_read, sg, irangel(offset, length), sh);
file_readahead(f, offset, length);
file_do_read(f, sg, r, sh);
/* possible direct return in top half */
return bh ? SYSRETURN_CONTINUE_BLOCKING : thread_maybe_sleep_uninterruptible(t);
}
Expand Down Expand Up @@ -551,10 +561,7 @@ closure_func_basic(file_iov, sysreturn, file_readv,
deallocate_sg_list(sg);
return io_complete(completion, -ENOMEM);
}
u64 length = range_span(r);
begin_file_read(f, length);
apply(f->fs_read, sg, r, sh);
file_readahead(f, offset, length);
file_do_read(f, sg, r, sh);

/* possible direct return in top half */
return bh ? SYSRETURN_CONTINUE_BLOCKING : thread_maybe_sleep_uninterruptible(t);
Expand Down Expand Up @@ -750,10 +757,15 @@ int unix_file_new(filesystem fs, tuple md, int type, int flags, fsfile fsf)
f->fsf = fsf;
u64 length;
if (fsf) {
pagecache_node pn = fsfile_get_cachenode(fsf);
f->fs_read = pagecache_node_get_reader(pn);
if (flags & O_DIRECT) {
f->fs_read = fsfile_get_reader(fsf);
f->fs_write = fsfile_get_writer(fsf);
} else {
pagecache_node pn = fsfile_get_cachenode(fsf);
f->fs_read = pagecache_node_get_reader(pn);
f->fs_write = pagecache_node_get_writer(pn);
}
assert(f->fs_read);
f->fs_write = pagecache_node_get_writer(pn);
assert(f->fs_write);
f->fadv = POSIX_FADV_NORMAL;
length = fsfile_get_length(fsf);
Expand Down
42 changes: 42 additions & 0 deletions test/runtime/readv.c
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
Expand All @@ -20,6 +21,45 @@
test_error("\"%ld\" != \"%ld\"", (long)expected, (long)actual); \
}

/* Exercises readv() on a file opened with O_DIRECT, checking the alignment requirements on
 * buffer addresses and lengths. The final check expects the "hello" file to be non-empty
 * and shorter than two alignment units. */
static void readv_test_direct(void)
{
    const int alignment = 512;
    unsigned char buf[3 * alignment];
    struct iovec iovs[2];
    unsigned char *ptr;
    int file_len;
    int fd;

    fd = open("hello", O_RDONLY | O_DIRECT);
    test_assert(fd >= 0);

    /* unaligned base pointers: readv() may or may not fail with EINVAL (it fails on Nanos and
     * succeeds on Linux with ext4 filesystem) */
    ptr = ((intptr_t)buf & (alignment - 1)) ? buf : buf + 1;
    iovs[0].iov_base = ptr;
    iovs[0].iov_len = alignment;
    iovs[1].iov_base = ptr + alignment;
    iovs[1].iov_len = alignment;
    if (readv(fd, iovs, 2) > 0) {
        /* the read went through: rewind, so that the next checks start from offset 0 */
        test_assert(lseek(fd, 0, SEEK_SET) == 0);
    } else {
        test_assert(errno == EINVAL);
    }

    /* unaligned buffer length: base pointers are rounded up to the alignment, but the first
     * element is 1 byte long */
    ptr = buf + ((alignment - ((intptr_t)buf & (alignment - 1))) & (alignment - 1));
    iovs[0].iov_base = ptr;
    iovs[1].iov_base = ptr + alignment;
    iovs[0].iov_len = 1;
    test_assert((readv(fd, iovs, 2) == -1) && (errno == EINVAL));

    /* aligned buffer address and length */
    iovs[0].iov_len = alignment;
    file_len = readv(fd, iovs, 2);
    test_assert((file_len > 0) && (file_len < 2 * alignment));

    close(fd);
}

int main()
{
struct iovec iovs[3];
Expand Down Expand Up @@ -129,6 +169,8 @@ int main()
}
close(fd);

readv_test_direct();

printf("readv test PASSED\n");

return EXIT_SUCCESS;
Expand Down
79 changes: 79 additions & 0 deletions test/runtime/write.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <time.h>
#include <sys/time.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

Expand Down Expand Up @@ -473,6 +474,83 @@ static void write_exec_test(const char *prog)
}
}

/* Exercises write()/pwrite()/pread() on a file opened with O_DIRECT: alignment requirements
 * on buffer address, buffer length and file offset, a faulting buffer address, and transfers
 * from memory-mapped buffers covering a single page, part of a page, a range straddling two
 * pages, and a multi-megabyte range. The test file is removed at the end. */
static void write_test_direct(void)
{
    const char *file_name = "test_direct";
    const int alignment = 512;
    const int page_size = 4096;
    unsigned char wbuf[2 * alignment];
    unsigned char rbuf[2 * alignment];
    unsigned char *wptr, *rptr;
    uint64_t *words;
    int fd;

    fd = open(file_name, O_CREAT | O_RDWR | O_DIRECT, S_IRUSR | S_IWUSR);
    test_assert(fd >= 0);

    /* unaligned buffer address: write() may or may not fail with EINVAL (it fails on Nanos and
     * succeeds on Linux with ext4 filesystem) */
    wptr = ((intptr_t)wbuf & (alignment - 1)) ? wbuf : wbuf + 1;
    if (write(fd, wptr, alignment) > 0) {
        /* the write went through: rewind, so that the next checks start from offset 0 */
        test_assert(lseek(fd, 0, SEEK_SET) == 0);
    } else {
        test_assert(errno == EINVAL);
    }

    /* unaligned buffer length (buffer address rounded up to the alignment) */
    wptr = wbuf + ((alignment - ((intptr_t)wbuf & (alignment - 1))) & (alignment - 1));
    test_assert((write(fd, wptr, 1) == -1) && (errno == EINVAL));

    /* aligned buffer address and length; each 64-bit word holds its own byte offset */
    words = (uint64_t *)wptr;
    for (size_t k = 0; k < alignment / sizeof(uint64_t); k++) {
        words[k] = k * sizeof(uint64_t);
    }
    test_assert(write(fd, wptr, alignment) == alignment);

    /* unaligned file offset */
    test_assert(lseek(fd, 1, SEEK_SET) == 1);
    test_assert((write(fd, wptr, alignment) == -1) && (errno == EINVAL));

    /* aligned buffer address and length: read back what has been written */
    rptr = rbuf + ((alignment - ((intptr_t)rbuf & (alignment - 1))) & (alignment - 1));
    test_assert(pread(fd, rptr, alignment, 0) == alignment);
    test_assert(!memcmp(rptr, wptr, alignment));

    /* faulting buffer address */
    test_assert((pwrite(fd, FAULT_ADDR, alignment, 0) == -1) && (errno == EFAULT));

    size_t map_size = 8 << 20;
    void *wmap = mmap(NULL, map_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
    void *rmap = mmap(NULL, map_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
    test_assert((wmap != MAP_FAILED) && (rmap != MAP_FAILED));

    /* fill the write mapping so that each 64-bit word holds its own byte offset */
    words = wmap;
    for (size_t k = 0; k < map_size / sizeof(uint64_t); k++) {
        words[k] = k * sizeof(uint64_t);
    }

    /* single page (write pointer rounded up to the page size) */
    wptr = (unsigned char *)wmap +
           ((page_size - ((intptr_t)wmap & (page_size - 1))) & (page_size - 1));
    test_assert(pwrite(fd, wptr, page_size, 0) == page_size);
    test_assert(pread(fd, rmap, page_size, 0) == page_size);
    test_assert(!memcmp(rmap, wptr, page_size));

    /* sub-page range fitting in a single page */
    wptr += alignment;
    test_assert(pwrite(fd, wptr, page_size - alignment, 0) == page_size - alignment);
    test_assert(pread(fd, rmap, page_size - alignment, 0) == page_size - alignment);
    test_assert(!memcmp(rmap, wptr, page_size - alignment));

    /* range straddling 2 pages */
    wptr += alignment;
    test_assert(pwrite(fd, wptr, page_size - alignment, 0) == page_size - alignment);
    test_assert(pread(fd, rmap, page_size - alignment, 0) == page_size - alignment);
    test_assert(!memcmp(rmap, wptr, page_size - alignment));

    /* whole mapping */
    test_assert(pwrite(fd, wmap, map_size, 0) == map_size);
    test_assert(pread(fd, rmap, map_size, 0) == map_size);
    test_assert(!memcmp(rmap, wmap, map_size));

    munmap(wmap, map_size);
    munmap(rmap, map_size);
    close(fd);
    unlink(file_name);
}

/* isn't this in a std include somewhere? */
static inline void timerspec_sub(struct timespec *a, struct timespec *b, struct timespec *r)
{
Expand Down Expand Up @@ -848,6 +926,7 @@ int main(int argc, char **argv)
sync_write_test();
truncate_test(argv[0]);
write_exec_test(argv[0]);
write_test_direct();
fs_stress_test();
}

Expand Down
53 changes: 53 additions & 0 deletions test/runtime/writev.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>
Expand All @@ -23,6 +25,55 @@
test_perror("lseek"); \
}

/* Exercises writev() on a file opened with O_DIRECT, checking the alignment requirements on
 * buffer addresses and lengths, then reads the data back and compares it with what has been
 * written. The test file is removed at the end. */
static void writev_test_direct(void)
{
    const char *file_name = "test_direct";
    const int alignment = 512;
    unsigned char wbuf[3 * alignment];
    unsigned char rbuf[3 * alignment];
    struct iovec iovs[2];
    unsigned char *ptr;
    uint64_t *words;
    int fd;

    fd = open(file_name, O_CREAT | O_RDWR | O_DIRECT, S_IRUSR | S_IWUSR);
    test_assert(fd >= 0);

    /* unaligned base pointers: writev() may or may not fail with EINVAL (it fails on Nanos and
     * succeeds on Linux with ext4 filesystem) */
    ptr = ((intptr_t)wbuf & (alignment - 1)) ? wbuf : wbuf + 1;
    iovs[0].iov_base = ptr;
    iovs[0].iov_len = alignment;
    iovs[1].iov_base = ptr + alignment;
    iovs[1].iov_len = alignment;
    if (writev(fd, iovs, 2) > 0) {
        /* the write went through: rewind, so that the next checks start from offset 0 */
        test_assert(lseek(fd, 0, SEEK_SET) == 0);
    } else {
        test_assert(errno == EINVAL);
    }

    /* unaligned buffer length: base pointers are rounded up to the alignment, but the first
     * element is 1 byte long */
    ptr = wbuf + ((alignment - ((intptr_t)wbuf & (alignment - 1))) & (alignment - 1));
    iovs[0].iov_base = ptr;
    iovs[1].iov_base = ptr + alignment;
    iovs[0].iov_len = 1;
    test_assert((writev(fd, iovs, 2) == -1) && (errno == EINVAL));

    /* aligned buffer address and length; each 64-bit word holds its own byte offset */
    words = (uint64_t *)ptr;
    for (size_t k = 0; k < 2 * alignment / sizeof(uint64_t); k++) {
        words[k] = k * sizeof(uint64_t);
    }
    iovs[0].iov_len = alignment;
    test_assert(writev(fd, iovs, 2) == 2 * alignment);

    /* aligned buffer address and length: read back from the start of the file */
    test_assert(lseek(fd, 0, SEEK_SET) == 0);
    ptr = rbuf + ((alignment - ((intptr_t)rbuf & (alignment - 1))) & (alignment - 1));
    test_assert(read(fd, ptr, 2 * alignment) == 2 * alignment);

    test_assert(!memcmp(ptr, iovs[0].iov_base, alignment));
    test_assert(!memcmp(ptr + alignment, iovs[1].iov_base, alignment));
    close(fd);
    unlink(file_name);
}

int main()
{
struct iovec iovs[3];
Expand Down Expand Up @@ -131,6 +182,8 @@ int main()
test_perror("close read-only");
}

writev_test_direct();

printf("write test passed\n");

return 0;
Expand Down

0 comments on commit c0df77b

Please sign in to comment.