-
Notifications
You must be signed in to change notification settings - Fork 302
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DAOS-13812 container: fix destroy vs lookup #12757
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -629,13 +629,18 @@ cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid, | |
rc = ABT_cond_create(&cont->sc_scrub_cond); | ||
if (rc != ABT_SUCCESS) { | ||
rc = dss_abterr2der(rc); | ||
goto out_mutex; | ||
goto out_resync_cond; | ||
} | ||
rc = ABT_cond_create(&cont->sc_rebuild_cond); | ||
if (rc != ABT_SUCCESS) { | ||
rc = dss_abterr2der(rc); | ||
goto out_scrub_cond; | ||
} | ||
|
||
cont->sc_pool = ds_pool_child_lookup(po_uuid); | ||
if (cont->sc_pool == NULL) { | ||
rc = -DER_NO_HDL; | ||
goto out_cond; | ||
goto out_rebuild_cond; | ||
} | ||
|
||
rc = vos_cont_open(cont->sc_pool->spc_hdl, co_uuid, &cont->sc_hdl); | ||
|
@@ -661,7 +666,11 @@ cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid, | |
|
||
out_pool: | ||
ds_pool_child_put(cont->sc_pool); | ||
out_cond: | ||
out_rebuild_cond: | ||
ABT_cond_free(&cont->sc_rebuild_cond); | ||
out_scrub_cond: | ||
ABT_cond_free(&cont->sc_scrub_cond); | ||
out_resync_cond: | ||
ABT_cond_free(&cont->sc_dtx_resync_cond); | ||
out_mutex: | ||
ABT_mutex_free(&cont->sc_mutex); | ||
|
@@ -688,6 +697,7 @@ cont_child_free_ref(struct daos_llink *llink) | |
D_FREE(cont->sc_snapshots); | ||
ABT_cond_free(&cont->sc_dtx_resync_cond); | ||
ABT_cond_free(&cont->sc_scrub_cond); | ||
ABT_cond_free(&cont->sc_rebuild_cond); | ||
ABT_mutex_free(&cont->sc_mutex); | ||
D_FREE(cont); | ||
} | ||
|
@@ -742,6 +752,13 @@ ds_cont_child_cache_destroy(struct daos_lru_cache *cache) | |
daos_lru_cache_destroy(cache); | ||
} | ||
|
||
static void | ||
cont_child_put(struct daos_lru_cache *cache, struct ds_cont_child *cont) | ||
{ | ||
daos_lru_ref_release(cache, &cont->sc_list); | ||
} | ||
|
||
|
||
/* | ||
* If create == false, then this is assumed to be a pure lookup. In this case, | ||
* -DER_NONEXIST is returned if the ds_cont_child object does not exist. | ||
|
@@ -754,6 +771,7 @@ cont_child_lookup(struct daos_lru_cache *cache, const uuid_t co_uuid, | |
struct daos_llink *llink; | ||
uuid_t key[2]; /* HT key is cuuid+puuid */ | ||
int rc; | ||
struct ds_cont_child *ds_cont; | ||
|
||
uuid_copy(key[0], co_uuid); | ||
uuid_copy(key[1], po_uuid); | ||
|
@@ -772,16 +790,15 @@ cont_child_lookup(struct daos_lru_cache *cache, const uuid_t co_uuid, | |
return rc; | ||
} | ||
|
||
*cont = cont_child_obj(llink); | ||
ds_cont = cont_child_obj(llink); | ||
if (ds_cont->sc_stopping) { | ||
cont_child_put(cache, ds_cont); | ||
return -DER_SHUTDOWN; | ||
} | ||
*cont = ds_cont; | ||
wangshilong marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return 0; | ||
} | ||
|
||
static void | ||
cont_child_put(struct daos_lru_cache *cache, struct ds_cont_child *cont) | ||
{ | ||
daos_lru_ref_release(cache, &cont->sc_list); | ||
} | ||
|
||
static inline bool | ||
cont_child_started(struct ds_cont_child *cont_child) | ||
{ | ||
|
@@ -807,13 +824,13 @@ cont_child_stop(struct ds_cont_child *cont_child) | |
/* Some ds_cont_child will only created by ds_cont_child_lookup(). | ||
* never be started at all | ||
*/ | ||
cont_child->sc_stopping = 1; | ||
if (cont_child_started(cont_child)) { | ||
D_DEBUG(DB_MD, DF_CONT"[%d]: Stopping container\n", | ||
DP_CONT(cont_child->sc_pool->spc_uuid, | ||
cont_child->sc_uuid), | ||
dss_get_module_info()->dmi_tgt_id); | ||
|
||
cont_child->sc_stopping = 1; | ||
d_list_del_init(&cont_child->sc_link); | ||
|
||
dtx_cont_deregister(cont_child); | ||
|
@@ -1178,7 +1195,7 @@ cont_child_destroy_one(void *vin) | |
ABT_mutex_unlock(cont->sc_mutex); | ||
|
||
/* Give chance to DTX reindex ULT for exit. */ | ||
if (unlikely(cont->sc_dtx_reindex)) | ||
while (unlikely(cont->sc_dtx_reindex)) | ||
ABT_thread_yield(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Nasf-Fan , reindex is supposed to be very quick? Is this busy yield loop ok? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Reindex may take long time. But when close the container, reindex ULT will exit soon when detects the "stopping" flag. |
||
|
||
/* Make sure checksum scrubbing has stopped */ | ||
|
@@ -1189,6 +1206,12 @@ cont_child_destroy_one(void *vin) | |
} | ||
ABT_mutex_unlock(cont->sc_mutex); | ||
|
||
/* Make sure rebuild has stopped */ | ||
ABT_mutex_lock(cont->sc_mutex); | ||
if (cont->sc_rebuilding) | ||
ABT_cond_wait(cont->sc_rebuild_cond, cont->sc_mutex); | ||
ABT_mutex_unlock(cont->sc_mutex); | ||
|
||
retry_cnt++; | ||
if (retry_cnt > 1) { | ||
D_ERROR("container is still in-use: open %u, resync %s, reindex %s\n", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -402,6 +402,7 @@ struct rebuild_scan_arg { | |
int snapshot_cnt; | ||
uint32_t yield_freq; | ||
int32_t obj_yield_cnt; | ||
struct ds_cont_child *cont_child; | ||
}; | ||
|
||
/** | ||
|
@@ -696,7 +697,7 @@ rebuild_obj_scan_cb(daos_handle_t ch, vos_iter_entry_t *ent, | |
int i; | ||
int rc = 0; | ||
|
||
if (rpt->rt_abort) { | ||
if (rpt->rt_abort || arg->cont_child->sc_stopping) { | ||
D_DEBUG(DB_REBUILD, "rebuild is aborted\n"); | ||
return 1; | ||
} | ||
|
@@ -839,35 +840,45 @@ rebuild_container_scan_cb(daos_handle_t ih, vos_iter_entry_t *entry, | |
} | ||
|
||
rc = vos_cont_open(iter_param->ip_hdl, entry->ie_couuid, &coh); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's not from this patch, but I don't see why we need to call vos_cont_open() here, won't following ds_cont_child_lookup() open vos container? @wangdi1 ? |
||
if (rc == -DER_NONEXIST) { | ||
D_DEBUG(DB_REBUILD, DF_UUID" already destroyed\n", DP_UUID(arg->co_uuid)); | ||
return 0; | ||
} | ||
|
||
if (rc != 0) { | ||
D_ERROR("Open container "DF_UUID" failed: "DF_RC"\n", | ||
DP_UUID(entry->ie_couuid), DP_RC(rc)); | ||
return rc; | ||
} | ||
|
||
rc = ds_cont_child_lookup(rpt->rt_pool_uuid, entry->ie_couuid, &cont_child); | ||
if (rc == -DER_NONEXIST || rc == -DER_SHUTDOWN) { | ||
D_DEBUG(DB_REBUILD, DF_UUID" already destroyed or destroying\n", | ||
DP_UUID(arg->co_uuid)); | ||
rc = 0; | ||
D_GOTO(close, rc); | ||
} | ||
|
||
if (rc != 0) { | ||
D_ERROR("Container "DF_UUID", ds_cont_child_lookup failed: "DF_RC"\n", | ||
DP_UUID(entry->ie_couuid), DP_RC(rc)); | ||
D_GOTO(close, rc); | ||
} | ||
cont_child->sc_rebuilding = 1; | ||
|
||
rc = ds_cont_fetch_snaps(rpt->rt_pool->sp_iv_ns, entry->ie_couuid, NULL, | ||
&snapshot_cnt); | ||
if (rc) { | ||
D_ERROR("Container "DF_UUID", ds_cont_fetch_snaps failed: "DF_RC"\n", | ||
DP_UUID(entry->ie_couuid), DP_RC(rc)); | ||
vos_cont_close(coh); | ||
return rc; | ||
D_GOTO(close, rc); | ||
} | ||
|
||
rc = ds_cont_get_props(&arg->co_props, rpt->rt_pool->sp_uuid, entry->ie_couuid); | ||
if (rc) { | ||
D_ERROR("Container "DF_UUID", ds_cont_get_props failed: "DF_RC"\n", | ||
DP_UUID(entry->ie_couuid), DP_RC(rc)); | ||
vos_cont_close(coh); | ||
return rc; | ||
} | ||
|
||
rc = ds_cont_child_lookup(rpt->rt_pool_uuid, entry->ie_couuid, &cont_child); | ||
if (rc != 0) { | ||
D_ERROR("Container "DF_UUID", ds_cont_child_lookup failed: "DF_RC"\n", | ||
DP_UUID(entry->ie_couuid), DP_RC(rc)); | ||
vos_cont_close(coh); | ||
return rc; | ||
D_GOTO(close, rc); | ||
} | ||
|
||
/* Wait for EC aggregation to finish. NB: migrate needs to wait for EC aggregation to finish */ | ||
|
@@ -899,6 +910,7 @@ rebuild_container_scan_cb(daos_handle_t ih, vos_iter_entry_t *entry, | |
param.ip_flags = VOS_IT_FOR_MIGRATION; | ||
uuid_copy(arg->co_uuid, entry->ie_couuid); | ||
arg->snapshot_cnt = snapshot_cnt; | ||
arg->cont_child = cont_child; | ||
|
||
/* If there is no snapshots, then rebuild does not need to migrate | ||
* punched objects at all. Ideally, it should ignore any objects | ||
|
@@ -914,8 +926,11 @@ rebuild_container_scan_cb(daos_handle_t ih, vos_iter_entry_t *entry, | |
close: | ||
vos_cont_close(coh); | ||
|
||
if (cont_child != NULL) | ||
if (cont_child != NULL) { | ||
cont_child->sc_rebuilding = 0; | ||
ABT_cond_broadcast(cont_child->sc_rebuild_cond); | ||
ds_cont_child_put(cont_child); | ||
} | ||
|
||
D_DEBUG(DB_REBUILD, DF_UUID"/"DF_UUID" iterate cont done: "DF_RC"\n", | ||
DP_UUID(rpt->rt_pool_uuid), DP_UUID(entry->ie_couuid), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
checking "sc_stopping" in this internal lookup function could unintentional errors (for example, in cont_child_start() or cont_child_destroy_one()) , should we only check "sc_stopping" in the ds_cont_child_lookup()?