Skip to content

Commit

Permalink
Illumos 6267 - dn_bonus evicted too early
Browse files Browse the repository at this point in the history
The bonus buffer associated with a dnode is expected to remain resident
until: the dnode is evicted via dnode_buf_pageout(), the dnode is
freed in dnode_sync_free(), or the objset containing the dnode is
evicted via dmu_objset_evict(). However, since bonus buffers (and DMU
buffers in general) can have draining references when these events occur,
dbuf_rele_and_unlock() has logic to ensure that once these late references
are released affected buffers will be evicted.

dbuf_rele_and_unlock() currently checks for a dbuf for an evicting
objset via the os->os_evicting flag, and detects buffers for a freed
dnode by testing dn->dn_type and dn->dn_free_txg fields. Unfortunately,
the free'd dnode test can fire prematurely - anytime after the dnode is
scheduled to be freed via dnode_free() until the free is committed in
dnode_sync_free(). If all references to the bonus buffer are dropped
within this window, the bonus buffer will be evicted and code in
dnode_sync() that relies on the bonus buffer will fail.

Additionally, the "free'd dnode test" isn't applied to normal buffers
(buffers that are not the bonus buffer) and there is no mechanism to
guarantee eviction in the dnode_buf_pageout() case (the dnode is not
being freed nor is the objset being evicted).

Replace the two existing deferred eviction mechanisms with a per-dbuf
flag, db_pending_evict. This is set when explicit eviction is requested
via either dnode_evict_dbufs() or dnode_evict_bonus(). These actions
only occur after it is safe for dnode buffers to be evicted (e.g. the
bonus buffer will not be referenced again).

uts/common/fs/zfs/sys/dbuf.h:
	Add comments for boolean fields in dmu_buf_impl_t.

	Add the db_pending_evict field.

uts/common/fs/zfs/sys/dbuf.h:
uts/common/fs/zfs/dbuf.c:
	Rename db_immediate_evict to db_user_immediate_evict to avoid
	confusion between dbuf user state eviction and deferred eviction
	of a dbuf.

uts/common/fs/zfs/dbuf.c:
	Consistently use TRUE/FALSE for boolean fields in
	dmu_buf_impl_t.

	Simplify pending eviction logic to use the new db_pending_evict
	flag in all cases.

uts/common/fs/zfs/dmu_objset.c:
uts/common/fs/zfs/sys/dmu_objset.h:
	Remove objset_t's os_evicting field. This same functionality
	is now provided by db_pending_evict.

uts/common/fs/zfs/dnode_sync.c:
	In dnode_evict_dbufs() and dnode_evict_bonus(), mark dbufs
	with draining references (dbufs that can't be evicted inline)
	as pending_evict.

	In dnode_sync_free(), remove ASSERT() that a dnode being free'd
	has no active dbufs. This is usually the case, but is not
	guaranteed due to draining references. (e.g. The deadlist for a
	deleted dataset may still be open if another thread referenced
	the dataset just before it was freed and the dsl_dataset_t hasn't
	been released or is still being evicted).

openzfs#3865
openzfs#3443
Ported-by: Ned Bass <[email protected]>
  • Loading branch information
scsiguy authored and ryao committed Oct 6, 2015
1 parent 2669893 commit 20e1c63
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 43 deletions.
18 changes: 17 additions & 1 deletion include/sys/dbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,9 +230,25 @@ typedef struct dmu_buf_impl {
/* User callback information. */
dmu_buf_user_t *db_user;

uint8_t db_immediate_evict;
/*
* Evict user data as soon as the dirty and reference
* counts are equal.
*/
uint8_t db_user_immediate_evict;

/*
* This block was freed while a read or write was
* active.
*/
uint8_t db_freed_in_flight;

/*
* dnode_evict_dbufs() or dnode_evict_bonsu() tried to
* evict this dbuf, but couldn't due to outstanding
* references. Evict once the refcount drops to 0.
*/
uint8_t db_pending_evict;

uint8_t db_dirtycnt;
} dmu_buf_impl_t;

Expand Down
1 change: 0 additions & 1 deletion include/sys/dmu_objset.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ struct objset {
uint8_t os_copies;
enum zio_checksum os_dedup_checksum;
boolean_t os_dedup_verify;
boolean_t os_evicting;
zfs_logbias_op_t os_logbias;
zfs_cache_type_t os_primary_cache;
zfs_cache_type_t os_secondary_cache;
Expand Down
47 changes: 12 additions & 35 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
*/
ASSERT3U(holds, >=, db->db_dirtycnt);
} else {
if (db->db_immediate_evict == TRUE)
if (db->db_user_immediate_evict == TRUE)
ASSERT3U(holds, >=, db->db_dirtycnt);
else
ASSERT3U(holds, >, 0);
Expand Down Expand Up @@ -1880,8 +1880,9 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_blkptr = blkptr;

db->db_user = NULL;
db->db_immediate_evict = 0;
db->db_freed_in_flight = 0;
db->db_user_immediate_evict = FALSE;
db->db_freed_in_flight = FALSE;
db->db_pending_evict = FALSE;

if (blkid == DMU_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
Expand Down Expand Up @@ -2318,12 +2319,13 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
arc_buf_freeze(db->db_buf);

if (holds == db->db_dirtycnt &&
db->db_level == 0 && db->db_immediate_evict)
db->db_level == 0 && db->db_user_immediate_evict)
dbuf_evict_user(db);

if (holds == 0) {
if (db->db_blkid == DMU_BONUS_BLKID) {
dnode_t *dn;
boolean_t evict_dbuf = db->db_pending_evict;

/*
* If the dnode moves here, we cannot cross this
Expand All @@ -2338,7 +2340,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
* Decrementing the dbuf count means that the bonus
* buffer's dnode hold is no longer discounted in
* dnode_move(). The dnode cannot move until after
* the dnode_rele_and_unlock() below.
* the dnode_rele() below.
*/
DB_DNODE_EXIT(db);

Expand All @@ -2348,35 +2350,10 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
*/
mutex_exit(&db->db_mtx);

/*
* If the dnode has been freed, evict the bonus
* buffer immediately. The data in the bonus
* buffer is no longer relevant and this prevents
* a stale bonus buffer from being associated
* with this dnode_t should the dnode_t be reused
* prior to being destroyed.
*/
mutex_enter(&dn->dn_mtx);
if (dn->dn_type == DMU_OT_NONE ||
dn->dn_free_txg != 0) {
/*
* Drop dn_mtx. It is a leaf lock and
* cannot be held when dnode_evict_bonus()
* acquires other locks in order to
* perform the eviction.
*
* Freed dnodes cannot be reused until the
* last hold is released. Since this bonus
* buffer has a hold, the dnode will remain
* in the free state, even without dn_mtx
* held, until the dnode_rele_and_unlock()
* below.
*/
mutex_exit(&dn->dn_mtx);
if (evict_dbuf)
dnode_evict_bonus(dn);
mutex_enter(&dn->dn_mtx);
}
dnode_rele_and_unlock(dn, db);

dnode_rele(dn, db);
} else if (db->db_buf == NULL) {
/*
* This is a special case: we never associated this
Expand Down Expand Up @@ -2423,7 +2400,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
} else {
dbuf_clear(db);
}
} else if (db->db_objset->os_evicting ||
} else if (db->db_pending_evict ||
arc_buf_eviction_needed(db->db_buf)) {
dbuf_clear(db);
} else {
Expand Down Expand Up @@ -2471,7 +2448,7 @@ dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

db->db_immediate_evict = TRUE;
db->db_user_immediate_evict = TRUE;
return (dmu_buf_set_user(db_fake, user));
}

Expand Down
1 change: 0 additions & 1 deletion module/zfs/dmu_objset.c
Original file line number Diff line number Diff line change
Expand Up @@ -727,7 +727,6 @@ dmu_objset_evict(objset_t *os)
if (os->os_sa)
sa_tear_down(os);

os->os_evicting = B_TRUE;
dmu_objset_evict_dbufs(os);

mutex_enter(&os->os_lock);
Expand Down
14 changes: 9 additions & 5 deletions module/zfs/dnode_sync.c
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,7 @@ dnode_evict_dbufs(dnode_t *dn)
db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
avl_remove(&dn->dn_dbufs, db_marker);
} else {
db->db_pending_evict = TRUE;
mutex_exit(&db->db_mtx);
db_next = AVL_NEXT(&dn->dn_dbufs, db);
}
Expand All @@ -447,10 +448,14 @@ void
dnode_evict_bonus(dnode_t *dn)
{
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
mutex_enter(&dn->dn_bonus->db_mtx);
dbuf_evict(dn->dn_bonus);
dn->dn_bonus = NULL;
if (dn->dn_bonus != NULL) {
if (refcount_is_zero(&dn->dn_bonus->db_holds)) {
mutex_enter(&dn->dn_bonus->db_mtx);
dbuf_evict(dn->dn_bonus);
dn->dn_bonus = NULL;
} else {
dn->dn_bonus->db_pending_evict = TRUE;
}
}
rw_exit(&dn->dn_struct_rwlock);
}
Expand Down Expand Up @@ -502,7 +507,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)

dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
dnode_evict_dbufs(dn);
ASSERT(avl_is_empty(&dn->dn_dbufs));

/*
* XXX - It would be nice to assert this, but we may still
Expand Down

0 comments on commit 20e1c63

Please sign in to comment.