Skip to content

Commit

Permalink
Fix buffered/direct/mmap I/O race
Browse files Browse the repository at this point in the history
When a page is faulted in for memory mapped I/O the page lock
may be dropped before it has been read and marked up to date.
If a buffered read encounters such a page in mappedread() it
must wait until the page has been updated. Failure to do so
will result in a panic on debug builds and incorrect data on
production builds.

The critical part of this change is in mappedread() where pages
which are not up to date are now handled. Additionally, it
includes the following simplifications.

- zfs_getpage() and zfs_fillpage() could be passed an array of
  pages. This could be more efficient if it was used but in
  practice only a single page was ever provided. These
  interfaces were simplified to acknowledge that.

- update_pages() was modified to correctly set the PG_error bit
  on a page when it cannot be read by dmu_read().

- Setting PG_error and PG_uptodate was moved to zfs_fillpage()
  from zpl_readpage_common(). This is consistent with the
  handling in update_pages() and mappedread().

- Minor additional refactoring to comments and variable
  declarations to improve readability.

- Add a test case to exercise concurrent buffered, direct,
  and mmap IO to the same file.

- Reduce the mmap_sync test case default run time.

Reviewed-by: Richard Yao <[email protected]>
Reviewed-by: Brian Atkinson <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes #13608 
Closes #14498
  • Loading branch information
behlendorf authored Feb 23, 2023
1 parent 7cb67d6 commit 89cd219
Show file tree
Hide file tree
Showing 7 changed files with 175 additions and 105 deletions.
2 changes: 1 addition & 1 deletion include/os/linux/zfs/sys/zfs_vnops_os.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ extern void zfs_inactive(struct inode *ip);
extern int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
offset_t offset, cred_t *cr);
extern int zfs_fid(struct inode *ip, fid_t *fidp);
extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages);
extern int zfs_getpage(struct inode *ip, struct page *pp);
extern int zfs_putpage(struct inode *ip, struct page *pp,
struct writeback_control *wbc, boolean_t for_sync);
extern int zfs_dirty_inode(struct inode *ip, int flags);
Expand Down
169 changes: 82 additions & 87 deletions module/os/linux/zfs/zfs_vnops_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -220,43 +220,46 @@ zfs_close(struct inode *ip, int flag, cred_t *cr)
}

#if defined(_KERNEL)

static int zfs_fillpage(struct inode *ip, struct page *pp);

/*
* When a file is memory mapped, we must keep the IO data synchronized
* between the DMU cache and the memory mapped pages. What this means:
*
* On Write: If we find a memory mapped page, we write to *both*
* the page and the dmu buffer.
* between the DMU cache and the memory mapped pages. Update all mapped
* pages with the contents of the coresponding dmu buffer.
*/
void
update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
{
struct inode *ip = ZTOI(zp);
struct address_space *mp = ip->i_mapping;
struct page *pp;
uint64_t nbytes;
int64_t off;
void *pb;
struct address_space *mp = ZTOI(zp)->i_mapping;
int64_t off = start & (PAGE_SIZE - 1);

off = start & (PAGE_SIZE-1);
for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
nbytes = MIN(PAGE_SIZE - off, len);
uint64_t nbytes = MIN(PAGE_SIZE - off, len);

pp = find_lock_page(mp, start >> PAGE_SHIFT);
struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
if (pp) {
if (mapping_writably_mapped(mp))
flush_dcache_page(pp);

pb = kmap(pp);
(void) dmu_read(os, zp->z_id, start + off, nbytes,
pb + off, DMU_READ_PREFETCH);
void *pb = kmap(pp);
int error = dmu_read(os, zp->z_id, start + off,
nbytes, pb + off, DMU_READ_PREFETCH);
kunmap(pp);

if (mapping_writably_mapped(mp))
flush_dcache_page(pp);
if (error) {
SetPageError(pp);
ClearPageUptodate(pp);
} else {
ClearPageError(pp);
SetPageUptodate(pp);

if (mapping_writably_mapped(mp))
flush_dcache_page(pp);

mark_page_accessed(pp);
}

mark_page_accessed(pp);
SetPageUptodate(pp);
ClearPageError(pp);
unlock_page(pp);
put_page(pp);
}
Expand All @@ -267,38 +270,44 @@ update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
}

/*
* When a file is memory mapped, we must keep the IO data synchronized
* between the DMU cache and the memory mapped pages. What this means:
*
* On Read: We "read" preferentially from memory mapped pages,
* else we default from the dmu buffer.
*
* NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
* the file is memory mapped.
* When a file is memory mapped, we must keep the I/O data synchronized
* between the DMU cache and the memory mapped pages. Preferentially read
* from memory mapped pages, otherwise fallback to reading through the dmu.
*/
int
mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
{
struct inode *ip = ZTOI(zp);
struct address_space *mp = ip->i_mapping;
struct page *pp;
int64_t start, off;
uint64_t bytes;
int64_t start = uio->uio_loffset;
int64_t off = start & (PAGE_SIZE - 1);
int len = nbytes;
int error = 0;
void *pb;

start = uio->uio_loffset;
off = start & (PAGE_SIZE-1);
for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
bytes = MIN(PAGE_SIZE - off, len);
uint64_t bytes = MIN(PAGE_SIZE - off, len);

pp = find_lock_page(mp, start >> PAGE_SHIFT);
struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
if (pp) {
ASSERT(PageUptodate(pp));
/*
* If filemap_fault() retries there exists a window
* where the page will be unlocked and not up to date.
* In this case we must try and fill the page.
*/
if (unlikely(!PageUptodate(pp))) {
error = zfs_fillpage(ip, pp);
if (error) {
unlock_page(pp);
put_page(pp);
return (error);
}
}

ASSERT(PageUptodate(pp) || PageDirty(pp));

unlock_page(pp);

pb = kmap(pp);
void *pb = kmap(pp);
error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
kunmap(pp);

Expand All @@ -314,9 +323,11 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)

len -= bytes;
off = 0;

if (error)
break;
}

return (error);
}
#endif /* _KERNEL */
Expand Down Expand Up @@ -3959,80 +3970,64 @@ zfs_inactive(struct inode *ip)
* Fill pages with data from the disk.
*/
static int
zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
zfs_fillpage(struct inode *ip, struct page *pp)
{
znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
objset_t *os;
struct page *cur_pp;
u_offset_t io_off, total;
size_t io_len;
loff_t i_size;
unsigned page_idx;
int err;

os = zfsvfs->z_os;
io_len = nr_pages << PAGE_SHIFT;
i_size = i_size_read(ip);
io_off = page_offset(pl[0]);
loff_t i_size = i_size_read(ip);
u_offset_t io_off = page_offset(pp);
size_t io_len = PAGE_SIZE;

if (io_off + io_len > i_size)
io_len = i_size - io_off;

/*
* Iterate over list of pages and read each page individually.
*/
page_idx = 0;
for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
caddr_t va;

cur_pp = pl[page_idx++];
va = kmap(cur_pp);
err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
DMU_READ_PREFETCH);
kunmap(cur_pp);
if (err) {
/* convert checksum errors into IO errors */
if (err == ECKSUM)
err = SET_ERROR(EIO);
return (err);
}
void *va = kmap(pp);
int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off,
PAGE_SIZE, va, DMU_READ_PREFETCH);
kunmap(pp);

if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)
error = SET_ERROR(EIO);

SetPageError(pp);
ClearPageUptodate(pp);
} else {
ClearPageError(pp);
SetPageUptodate(pp);
}

return (0);
return (error);
}

/*
* Uses zfs_fillpage to read data from the file and fill the pages.
* Uses zfs_fillpage to read data from the file and fill the page.
*
* IN: ip - inode of file to get data from.
* pl - list of pages to read
* nr_pages - number of pages to read
* pp - page to read
*
* RETURN: 0 on success, error code on failure.
*
* Timestamps:
* vp - atime updated
*/
int
zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
zfs_getpage(struct inode *ip, struct page *pp)
{
znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
int err;

if (pl == NULL)
return (0);

if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (err);
znode_t *zp = ITOZ(ip);
int error;

err = zfs_fillpage(ip, pl, nr_pages);
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (error);

dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE);
error = zfs_fillpage(ip, pp);
if (error == 0)
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);

zfs_exit(zfsvfs, FTAG);
return (err);

return (error);
}

/*
Expand Down
17 changes: 2 additions & 15 deletions module/os/linux/zfs/zpl_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -657,29 +657,16 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma)
static inline int
zpl_readpage_common(struct page *pp)
{
struct inode *ip;
struct page *pl[1];
int error = 0;
fstrans_cookie_t cookie;

ASSERT(PageLocked(pp));
ip = pp->mapping->host;
pl[0] = pp;

cookie = spl_fstrans_mark();
error = -zfs_getpage(ip, pl, 1);
int error = -zfs_getpage(pp->mapping->host, pp);
spl_fstrans_unmark(cookie);

if (error) {
SetPageError(pp);
ClearPageUptodate(pp);
} else {
ClearPageError(pp);
SetPageUptodate(pp);
flush_dcache_page(pp);
}

unlock_page(pp);

return (error);
}

Expand Down
3 changes: 2 additions & 1 deletion tests/runfiles/common.run
Original file line number Diff line number Diff line change
Expand Up @@ -687,7 +687,8 @@ tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos',
tags = ['functional', 'migration']

[tests/functional/mmap]
tests = ['mmap_write_001_pos', 'mmap_read_001_pos', 'mmap_seek_001_pos', 'mmap_sync_001_pos']
tests = ['mmap_mixed', 'mmap_read_001_pos', 'mmap_seek_001_pos',
'mmap_sync_001_pos', 'mmap_write_001_pos']
tags = ['functional', 'mmap']

[tests/functional/mount]
Expand Down
2 changes: 1 addition & 1 deletion tests/zfs-tests/cmd/mmap_sync.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ main(int argc, char *argv[])
return (1);
}

int run_time_mins = 5;
int run_time_mins = 1;
if (argc >= 2) {
run_time_mins = atoi(argv[1]);
}
Expand Down
1 change: 1 addition & 0 deletions tests/zfs-tests/tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -1491,6 +1491,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/migration/setup.ksh \
functional/mmap/cleanup.ksh \
functional/mmap/mmap_libaio_001_pos.ksh \
functional/mmap/mmap_mixed.ksh \
functional/mmap/mmap_read_001_pos.ksh \
functional/mmap/mmap_seek_001_pos.ksh \
functional/mmap/mmap_sync_001_pos.ksh \
Expand Down
Loading

0 comments on commit 89cd219

Please sign in to comment.