Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-15863 container: fix a race for container cache #15038

Merged
merged 4 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 23 additions & 31 deletions src/common/lru.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,10 @@ lru_hop_rec_decref(struct d_hash_table *htable, d_list_t *link)

D_ASSERT(llink->ll_ref > 0);
llink->ll_ref--;
if (llink->ll_ref == 1 && llink->ll_ops->lop_wakeup)

/* eviction waiter is the last one holds refcount */
if (llink->ll_wait_evict &&
llink->ll_ops->lop_wakeup && daos_lru_is_last_user(llink))
llink->ll_ops->lop_wakeup(llink);

/* Delete from hash only if no more references */
Expand Down Expand Up @@ -215,15 +218,6 @@ daos_lru_ref_hold(struct daos_lru_cache *lcache, void *key,
if (link != NULL) {
llink = link2llink(link);
D_ASSERT(llink->ll_evicted == 0);
if (llink->ll_evicting) {
wangshilong marked this conversation as resolved.
Show resolved Hide resolved
/**
* Avoid calling `lru_hop_rec_decref()` at this point
* to prevent `wakeup()` from being invoked twice.
*/
D_ASSERT(llink->ll_ref > 1);
llink->ll_ref--;
D_GOTO(out, rc = -DER_SHUTDOWN);
}
/* remove busy item from LRU */
if (!d_list_empty(&llink->ll_qlink))
d_list_del_init(&llink->ll_qlink);
Expand Down Expand Up @@ -257,24 +251,17 @@ daos_lru_ref_hold(struct daos_lru_cache *lcache, void *key,
return rc;
}

static void
lru_ref_release_internal(struct daos_lru_cache *lcache, struct daos_llink *llink, bool wait)
void
daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink)
{
D_ASSERT(lcache != NULL && llink != NULL && llink->ll_ref > 1);
D_ASSERT(d_list_empty(&llink->ll_qlink));

lru_hop_rec_decref(&lcache->dlc_htable, &llink->ll_link);

if (wait && llink->ll_ref > 1) {
D_ASSERT(llink->ll_evicting == 0);
llink->ll_evicting = 1;
lcache->dlc_ops->lop_wait(llink);
llink->ll_evicting = 0;
llink->ll_evicted = 1;
}

if (llink->ll_ref == 1) { /* the last refcount */
if (lcache->dlc_csize == 0)
/* zero-sized cache always evicts unused item */
if (lcache->dlc_csize == 0 && !llink->ll_evicted)
llink->ll_evicted = 1;

if (llink->ll_evicted) {
Expand All @@ -297,15 +284,20 @@ lru_ref_release_internal(struct daos_lru_cache *lcache, struct daos_llink *llink
}

void
daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink)
{
lru_ref_release_internal(lcache, llink, false);
}

void
daos_lru_ref_wait_evict(struct daos_lru_cache *lcache, struct daos_llink *llink)
daos_lru_ref_evict_wait(struct daos_lru_cache *lcache, struct daos_llink *llink)
{
D_ASSERT(lcache->dlc_ops->lop_wait);

lru_ref_release_internal(lcache, llink, true);
if (!llink->ll_evicted)
daos_lru_ref_evict(lcache, llink);

if (lcache->dlc_ops->lop_wait && !daos_lru_is_last_user(llink)) {
/* Wait until I'm the last one.
* XXX: the implementation can only support one waiter for now, if there
* is a secondary ULT calls this function on the same item, it will hit
* the assertion.
*/
D_ASSERT(!llink->ll_wait_evict);
llink->ll_wait_evict = 1;
lcache->dlc_ops->lop_wait(llink);
llink->ll_wait_evict = 0;
}
}
4 changes: 3 additions & 1 deletion src/container/srv_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -1261,7 +1261,9 @@ cont_child_destroy_one(void *vin)
D_GOTO(out_pool, rc = -DER_BUSY);
} /* else: resync should have completed, try again */

daos_lru_ref_wait_evict(tls->dt_cont_cache, &cont->sc_list);
/* nobody should see it again after eviction */
daos_lru_ref_evict_wait(tls->dt_cont_cache, &cont->sc_list);
daos_lru_ref_release(tls->dt_cont_cache, &cont->sc_list);
}

D_DEBUG(DB_MD, DF_CONT": destroying vos container\n",
Expand Down
38 changes: 9 additions & 29 deletions src/include/daos/lru.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ struct daos_llink {
d_list_t ll_link; /**< LRU hash link */
d_list_t ll_qlink; /**< Temp link for traverse */
uint32_t ll_ref; /**< refcount for this ref */
uint32_t ll_evicted:1, /**< has been evicted */
ll_evicting:1; /**< been evicting */
uint32_t ll_evicted:1; /**< has been evicted */
uint32_t ll_wait_evict:1; /**< wait for completion of eviction */
struct daos_llink_ops *ll_ops; /**< ops to maintain refs */
};

Expand Down Expand Up @@ -121,26 +121,7 @@ void
daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink);

/**
* Evicts the LRU link from the DAOS LRU cache after waiting
* for all references to be released.
*
* \param[in] lcache DAOS LRU cache
* \param[in] llink DAOS LRU link to be evicted
*
*/
void
daos_lru_ref_wait_evict(struct daos_lru_cache *lcache, struct daos_llink *llink);

/**
* Flush old items from LRU.
*
* \param[in] lcache DAOS LRU cache
*/
void
daos_lru_ref_flush(struct daos_lru_cache *lcache);

/**
* Evict the item from LRU after releasing the last refcount on it.
* Evict the item from LRU before releasing the refcount on it.
*
* \param[in] lcache DAOS LRU cache
* \param[in] llink DAOS LRU item to be evicted
Expand All @@ -153,15 +134,14 @@ daos_lru_ref_evict(struct daos_lru_cache *lcache, struct daos_llink *llink)
}

/**
* Check if a LRU element has been evicted or not
* Evict the item from LRU before releasing the refcount on it, wait until
* the caller is the last one holds refcount.
*
* \param[in] llink DAOS LRU item to check
* \param[in] lcache DAOS LRU cache
* \param[in] llink DAOS LRU item to be evicted
*/
static inline bool
daos_lru_ref_evicted(struct daos_llink *llink)
{
return llink->ll_evicted;
}
void
daos_lru_ref_evict_wait(struct daos_lru_cache *lcache, struct daos_llink *llink);

/**
* Increase a usage reference to the LRU element
Expand Down
Loading