-
Notifications
You must be signed in to change notification settings - Fork 1.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix zfs_write() / mmap update_time() lock inversion #7942
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4608,13 +4608,11 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) | |
int | ||
zfs_dirty_inode(struct inode *ip, int flags) | ||
{ | ||
znode_t *zp = ITOZ(ip); | ||
zfsvfs_t *zfsvfs = ITOZSB(ip); | ||
dmu_tx_t *tx; | ||
uint64_t mode, atime[2], mtime[2], ctime[2]; | ||
sa_bulk_attr_t bulk[4]; | ||
int error = 0; | ||
int cnt = 0; | ||
znode_t *zp = ITOZ(ip); | ||
zfsvfs_t *zfsvfs = ITOZSB(ip); | ||
boolean_t waited = B_FALSE; | ||
dmu_tx_t *tx; | ||
int error; | ||
|
||
if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) | ||
return (0); | ||
|
@@ -4624,32 +4622,56 @@ zfs_dirty_inode(struct inode *ip, int flags) | |
|
||
#ifdef I_DIRTY_TIME | ||
/* | ||
* This is the lazytime semantic indroduced in Linux 4.0 | ||
* This flag will only be called from update_time when lazytime is set. | ||
* (Note, I_DIRTY_SYNC will also set if not lazytime) | ||
* Fortunately mtime and ctime are managed within ZFS itself, so we | ||
* only need to dirty atime. | ||
* This is the SB_LAZYTIME semantic introduced in Linux 4.0 kernel. | ||
* When the I_DIRTY_TIME flag is set and neither I_DIRTY_DATASYNC or | ||
* I_DIRTY_SYNC is set then the inode times may be written lazily. | ||
* Fortunately mtime and ctime are managed within ZFS itself, so | ||
* we only need to dirty atime. | ||
*/ | ||
if (flags == I_DIRTY_TIME) { | ||
if ((flags & I_DIRTY_TIME) && | ||
!(flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))) { | ||
zp->z_atime_dirty = 1; | ||
goto out; | ||
ZFS_EXIT(zfsvfs); | ||
return (0); | ||
} | ||
#endif | ||
|
||
top: | ||
tx = dmu_tx_create(zfsvfs->z_os); | ||
|
||
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); | ||
zfs_sa_upgrade_txholds(tx, zp); | ||
|
||
error = dmu_tx_assign(tx, TXG_WAIT); | ||
/* | ||
* When a page is faulted in by filemap_page_mkwrite() this function | ||
* may be called by update_time() with the file's mmap_sem held. | ||
* Therefore it's necessary to use TXG_NOWAIT since we cannot release | ||
* the mmap_sem, and even if we could, it would be undesirable to | ||
* delay the page fault. TXG_NOTHROTTLE will be set as needed to | ||
* bypass the write throttle. In the unlikely case the transaction | ||
* cannot be assigned set z_atime_dirty=1 so at least the times will | ||
* be updated when the file is closed. | ||
*/ | ||
error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); | ||
if (error) { | ||
if (error == ERESTART && waited == B_FALSE) { | ||
waited = B_TRUE; | ||
dmu_tx_abort(tx); | ||
goto top; | ||
} | ||
dmu_tx_abort(tx); | ||
goto out; | ||
zp->z_atime_dirty = 1; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The problem with this is that we only do lazy update atime, but not mtime and ctime. We need to add that otherwise we will lose those. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's true, unfortunate we really don't have a mechanism for that currently. Deferring the |
||
ZFS_EXIT(zfsvfs); | ||
return (0); | ||
} | ||
|
||
mutex_enter(&zp->z_lock); | ||
zp->z_atime_dirty = 0; | ||
|
||
int cnt = 0; | ||
sa_bulk_attr_t bulk[4]; | ||
uint64_t mode, atime[2], mtime[2], ctime[2]; | ||
|
||
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); | ||
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); | ||
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); | ||
|
@@ -4667,8 +4689,7 @@ zfs_dirty_inode(struct inode *ip, int flags) | |
mutex_exit(&zp->z_lock); | ||
|
||
dmu_tx_commit(tx); | ||
out: | ||
ZFS_EXIT(zfsvfs); | ||
|
||
return (error); | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In
dmu_tx_try_assign
:It doesn't seem that TXG_NOTHROTTLE will prevent dmu_tx_assign from failing? Perhaps we should bail out on
waited && error == ERESTART
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good idea. You're right, it won't prevent it from failing under all circumstances, only due to the write throttle. I assume your specific concern is that we could end up spinning here, which would be just as bad.