From 7669d283de1304ff8a709676ea7548cf43d762c0 Mon Sep 17 00:00:00 2001 From: Ryan Grant Date: Wed, 19 Oct 2016 15:29:46 -0600 Subject: [PATCH] update to address issue where race conditions can happen when unlinking MEs that are actively receiving, closes #70 --- src/ib/ptl_me.c | 62 +++++++-- src/ib/ptl_ref.h | 3 + src/ib/ptl_tgt.c | 13 +- test/basic/Makefile.am | 5 +- test/basic/test_unlink_race.c | 247 ++++++++++++++++++++++++++++++++++ 5 files changed, 311 insertions(+), 19 deletions(-) create mode 100755 test/basic/test_unlink_race.c diff --git a/src/ib/ptl_me.c b/src/ib/ptl_me.c index 7fc93434..a3499248 100644 --- a/src/ib/ptl_me.c +++ b/src/ib/ptl_me.c @@ -401,7 +401,7 @@ int _PtlMEUnlink(PPEGBL ptl_handle_me_t me_handle) me_t *me; int ref_cnt; // int pt_index; -// pt_t *pt; + pt_t *pt; // ni_t *ni; #ifndef NO_ARG_VALIDATION @@ -413,12 +413,20 @@ int _PtlMEUnlink(PPEGBL ptl_handle_me_t me_handle) err = to_me(MYGBL_ me_handle, &me); if (err) goto err1; + + if (me == NULL){ + err = PTL_OK; + goto err1; + } + else{ + pt = me->pt; + } #else me = to_obj(MYGBL_ POOL_ANY, me_handle); #endif //pt_index = me->pt_index; //pt = &ni->pt[pt_index]; - //me_get(me); + //me_get(me); //If this was an overflow, it should just complete now //there's no other busy work being done @@ -428,13 +436,31 @@ int _PtlMEUnlink(PPEGBL ptl_handle_me_t me_handle) /* make sure the me isn't still involved in any final * cleanup before we unlink it */ - while (atomic_read(&me->busy) == 1) { - SPINLOCK_BODY(); + if (atomic_read(&me->busy) == 1){ + err = PTL_IN_USE; + me_put(me); + goto err1; } - PTL_FASTLOCK_LOCK(&me->pt->lock); + if (me != NULL && pt != NULL){ + while (pthread_spin_trylock(&pt->lock) != 0){ + usleep(500); + if(me == NULL ){ + err = PTL_IN_USE; + goto err1; + } + } + if(me == NULL){ + PTL_FASTLOCK_UNLOCK(&pt->lock); + err = PTL_IN_USE; + goto err1; + } + } + else { + err = PTL_OK; + goto err1; + } - ref_cnt = me_ref_cnt(me); /* There should only be 2 references on the object before we can @@ -442,21 +468,31 @@ int _PtlMEUnlink(PPEGBL ptl_handle_me_t me_handle) if (ref_cnt > 2) { me_put(me); err = PTL_IN_USE; - goto err1; - } else if (ref_cnt < 2) { + goto err2; + } else if (ref_cnt < 2 && ref_cnt > 0) { me_put(me); err = PTL_ARG_INVALID; - goto err1; + goto err2; + } + else if (ref_cnt <= 0){ + err = PTL_OK; + goto err2; } - PTL_FASTLOCK_UNLOCK(&me->pt->lock); - le_unlink((le_t *)me, 0); + //if (me->pt != NULL) + PTL_FASTLOCK_UNLOCK(&pt->lock); + //if (me != NULL) + le_unlink((le_t *)me, 0); err = PTL_OK; ref_cnt = me_ref_cnt(me); - if (ref_cnt > 0) - me_put(me); + if (ref_cnt > 0) + me_put(me); + goto err1; + err2: + //if (me->pt != NULL) + PTL_FASTLOCK_UNLOCK(&pt->lock); err1: #ifndef NO_ARG_VALIDATION gbl_put(); diff --git a/src/ib/ptl_ref.h b/src/ib/ptl_ref.h index bc858694..4a8ca043 100644 --- a/src/ib/ptl_ref.h +++ b/src/ib/ptl_ref.h @@ -58,7 +58,10 @@ static inline void ref_get(struct ref *ref) ref_cnt = atomic_inc(&ref->ref_cnt); +#ifdef PTL_DEBUG assert(ref_cnt >= 1); +#endif + } /** diff --git a/src/ib/ptl_tgt.c b/src/ib/ptl_tgt.c index fa6d92d3..f912ec4d 100644 --- a/src/ib/ptl_tgt.c +++ b/src/ib/ptl_tgt.c @@ -708,6 +708,8 @@ static int tgt_get_match(buf_t *buf) pt->state |= PT_AUTO_DISABLED; ptl_info("dropping due to lack of unexpected headers\n"); PTL_FASTLOCK_UNLOCK(&pt->lock); + le_put(buf->le); + buf->le = NULL; buf->ni_fail = PTL_NI_PT_DISABLED; WARN(); return STATE_TGT_DROP; @@ -1711,8 +1713,9 @@ static int tgt_send_ack(buf_t *buf) if (buf->le && buf->le->ptl_list == PTL_PRIORITY_LIST) { /* The LE must be released before we sent the ack. */ - le_put(buf->le); + //le_put(buf->le); atomic_set(&buf->me->busy, 0); + le_put(buf->le); buf->le = NULL; } @@ -1807,8 +1810,8 @@ static int tgt_send_reply(buf_t *buf) if (buf->le && buf->le->ptl_list == PTL_PRIORITY_LIST) { /* The LE must be released before we sent the ack. */ - le_put(buf->le); atomic_set(&buf->me->busy, 0); + le_put(buf->le); buf->le = NULL; } @@ -1863,7 +1866,7 @@ static int tgt_cleanup(buf_t *buf) /* On the overflow list, and was already matched by an * ME/LE. */ assert(buf->le->ptl_list == PTL_OVERFLOW_LIST); - atomic_set(&buf->le->busy, 0); + //atomic_set(&buf->le->busy, 0); state = STATE_TGT_OVERFLOW_EVENT; } else if (buf->le && buf->le->ptl_list == PTL_OVERFLOW_LIST) { //if the pt hasn't run out of resources and unexpected headers are enabled @@ -1924,8 +1927,8 @@ static void tgt_cleanup_2(buf_t *buf) { if (buf->le) { ptl_warn("me/le cleanup \n"); - le_put(buf->le); atomic_set(&buf->me->busy, 0); + le_put(buf->le); buf->le = NULL; } @@ -1937,8 +1940,8 @@ static void tgt_cleanup_2(buf_t *buf) //it was an overflow match, so reduce the //unexpected message count atomic_dec(&pt->unexpected_size); - le_put(buf->matching.le); atomic_set(&buf->matching.le->busy, 0); + le_put(buf->matching.le); buf->matching.le = NULL; } diff --git a/test/basic/Makefile.am b/test/basic/Makefile.am index f32375c9..76fd7146 100644 --- a/test/basic/Makefile.am +++ b/test/basic/Makefile.am @@ -15,7 +15,7 @@ TESTS = \ test_PA_NIInit \ test_LA_NIInit \ test_bootstrap \ - test_PA_LE_put_self \ + test_PA_LE_put_self \ test_PA_ME_put_self \ test_LA_LE_put_self \ test_LA_ME_put_self \ @@ -77,6 +77,7 @@ TESTS = \ test_ME_flowctl_nohdr \ test_LE_unlink \ test_ME_unlink \ + test_unlink_race \ test_PA_LE_persistent_search \ test_PA_ME_persistent_search \ test_ct_ack \ @@ -298,6 +299,8 @@ test_LE_unlink_CPPFLAGS = $(AM_CPPFLAGS) -DMATCHING=0 test_ME_unlink_SOURCES = test_unlink.c test_ME_unlink_CPPFLAGS = $(AM_CPPFLAGS) -DMATCHING=1 +test_unlink_race_SOURCES = test_unlink_race.c + test_PA_LE_persistent_search_SOURCES = test_persistent_search.c test_PA_LE_persistent_search_CPPFLAGS = $(AM_CPPFLAGS) -DMATCHING=0 diff --git a/test/basic/test_unlink_race.c b/test/basic/test_unlink_race.c new file mode 100755 index 00000000..2f5080b4 --- /dev/null +++ b/test/basic/test_unlink_race.c @@ -0,0 +1,247 @@ +#include +#include +#include +#include +#include +#include +#include "testing.h" + +int main(int argc, char *argv[]) +{ + int err; + ptl_interface_t iface; + unsigned int ni_opt; + ptl_process_t id; + ptl_ni_limits_t desired; + ptl_ni_limits_t actual; + ptl_handle_ni_t ni_handle; + unsigned int pt_opt; + ptl_handle_eq_t eq_handle; + ptl_pt_index_t pt_index_req; + ptl_pt_index_t pt_index; + ptl_me_t me; + ptl_list_t ptl_list; + void *user_ptr; + ptl_handle_me_t me_handles[20]; + ptl_md_t md; + ptl_handle_md_t md_handle; + ptl_size_t local_offset; + ptl_size_t length; + ptl_ack_req_t ack_req; + ptl_process_t target_id; + ptl_match_bits_t match_bits; + ptl_size_t remote_offset; + ptl_hdr_data_t hdr_data; + int i, j; + ptl_size_t count; + ptl_event_t event; + + /* + * init portals library + */ + err = PtlInit(); + if (err) { + printf("PtlInit failed, err = %d\n", err); + return 1; + } + + /* + * create an NI + */ + iface = PTL_IFACE_DEFAULT; + ni_opt = PTL_NI_MATCHING | PTL_NI_PHYSICAL; + id.phys.nid = PTL_NID_ANY; + id.phys.pid = PTL_PID_ANY; + desired.max_entries = 64; + desired.max_mds = 64; + desired.max_cts = 64; + desired.max_eqs = 64; + desired.max_pt_index = 64; + desired.max_iovecs = 64; + desired.max_list_size = 64; + desired.max_msg_size = 32768; + desired.max_atomic_size = 64; + + err = PtlNIInit(iface, ni_opt, PTL_PID_ANY, &desired, &actual, + &ni_handle); + if (err) { + printf("PtlNIInit failed, err = %d\n", err); + return 1; + } + + /* + * get process ID + */ + err = PtlGetId(ni_handle, &id); + if (err) { + printf("PtlGetId failed, err = %d\n", err); + return 1; + } + + /* + * create an EQ + */ + count = 1000; + + err = PtlEQAlloc(ni_handle, count, &eq_handle); + if (err) { + printf("PtlEQAlloc failed, err = %d\n", err); + return 1; + } + + /* + * create portals table entry + */ + pt_opt = 0; + pt_index_req = PTL_PT_ANY; + + err = PtlPTAlloc(ni_handle, pt_opt, PTL_EQ_NONE, pt_index_req, &pt_index); + if (err) { + printf("PtlPTAlloc failed, err = %d\n", err); + return 1; + } + + /* + * create ME + */ + me.length = 1024; + me.start = malloc(me.length); + me.ct_handle = PTL_CT_NONE; + me.uid = PTL_UID_ANY; + me.options = PTL_ME_OP_PUT | PTL_ME_USE_ONCE; + me.min_free = 0; + me.match_id = id; + me.match_bits = 0; + me.ignore_bits = 0; + ptl_list = PTL_PRIORITY_LIST; + user_ptr = NULL; + if (!me.start) { + printf("unable to allocate %" PRIu64 " bytes for me\n", me.length); + return 1; + } + +/* + * create MD + */ + md.length = 1024; + md.start = malloc(md.length); + md.options = 0; + md.eq_handle = eq_handle; + md.ct_handle = PTL_CT_NONE; + if (!md.start) { + printf("unable to allocate %" PRIu64 " bytes for md\n", md.length); + return 1; + } + + err = PtlMDBind(ni_handle, &md, &md_handle); + if (err) { + printf("PtlMDBind failed, err = %d\n", err); + return 1; + } + + + + for (j = 0; j < 1000; j++){ + libtest_barrier(); + for (i = 0; i < 20; i++) { + err = PtlMEAppend(ni_handle, pt_index, &me, ptl_list, user_ptr, &me_handles[i]); + if (err) { + printf("PtlMEAppend failed, err = %d\n", err); + return 1; + } + } + + local_offset = 0; + length = 4; + ack_req = PTL_ACK_REQ; + target_id = id; + match_bits = 0; + remote_offset = 0; + hdr_data = 0; + for (i = 0; i < 10; i++) { + err = PtlPut(md_handle, local_offset, length, ack_req, target_id, + pt_index, match_bits, remote_offset, user_ptr, hdr_data); + if (err) { + printf("PtlPut failed, err = %d\n", err); + return 1; + } + } + + for (i = 0; i < 10; i++) { + err = PtlEQWait(eq_handle, &event); + if (err) { + printf("PtlEQWait failed, err = %d\n", err); + return 1; + } + } + + int unlinks = 0; + for (i = 0;i < 20; i++) { + err = PtlMEUnlink(me_handles[i]); + if (err == PTL_OK) { + unlinks++; + } + if (unlinks == 10) { + break; + } + } + // printf("asserts= %d\n",unlinks); + assert(unlinks == 10); + + for (i = 0; i < 10; i++) { + err = PtlEQWait(eq_handle, &event); + if (err) { + printf("PtlEQWait failed, err = %d\n", err); + return 1; + } + } + + } + + /* + * destroy MD + */ + do { + err = PtlMDRelease(md_handle); + if (err && err != PTL_IN_USE) { + printf("PtlMDRelease failed, err = %d\n", err); + return 1; + } + } while(err == PTL_IN_USE); + + libtest_barrier(); + + /* + * destroy portals table entry + */ + err = PtlPTFree(ni_handle, pt_index); + if (err && err != PTL_PT_IN_USE) { + printf("PtlPTFree failed, err = %d\n", err); + return 1; + } + + /* + * destroy an EQ + */ + err = PtlEQFree(eq_handle); + if (err) { + printf("PtlEQFree failed, err = %d\n", err); + return 1; + } + + /* + * destroy an NI + */ + err = PtlNIFini(ni_handle); + if (err) { + printf("PtlNIFini failed, err = %d\n", err); + return 1; + } + + /* + * cleanup portals library + */ + PtlFini(); + + return 0; +}