From eac29ee9d6b5ec8e29262b6fff3e8c3b87f0891f Mon Sep 17 00:00:00 2001 From: Greg Eisenhauer Date: Sun, 3 Nov 2019 07:41:18 -0500 Subject: [PATCH] Fix a notification failure on the read side and a race on the writer side. --- source/adios2/toolkit/sst/cp/cp_reader.c | 2 ++ source/adios2/toolkit/sst/cp/cp_writer.c | 4 ++-- source/adios2/toolkit/sst/dp/evpath_dp.c | 7 ++++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/source/adios2/toolkit/sst/cp/cp_reader.c b/source/adios2/toolkit/sst/cp/cp_reader.c index 81d458760a..a5efd21328 100644 --- a/source/adios2/toolkit/sst/cp/cp_reader.c +++ b/source/adios2/toolkit/sst/cp/cp_reader.c @@ -196,6 +196,8 @@ extern void ReaderConnCloseHandler(CManager cm, CMConnection ClosedConn, CP_verbose(Stream, "Reader-side Rank received a " "connection-close event after close, " "not unexpected\n"); + Stream->DP_Interface->notifyConnFailure(&Svcs, Stream->DP_Stream, + FailedPeerRank); } else { diff --git a/source/adios2/toolkit/sst/cp/cp_writer.c b/source/adios2/toolkit/sst/cp/cp_writer.c index fae59fbd98..6664e562cf 100644 --- a/source/adios2/toolkit/sst/cp/cp_writer.c +++ b/source/adios2/toolkit/sst/cp/cp_writer.c @@ -425,7 +425,6 @@ extern void WriterConnCloseHandler(CManager cm, CMConnection closed_conn, "connection-close event in unexpected " "state %s\n", SSTStreamStatusStr[WSreader->ReaderStatus]); - CP_PeerFailCloseWSReader(WSreader, PeerFailed); } QueueMaintenance(ParentWriterStream); PTHREAD_MUTEX_UNLOCK(&ParentWriterStream->DataLock); @@ -1304,6 +1303,7 @@ static void CP_PeerFailCloseWSReader(WS_ReaderInfo CP_WSR_Stream, } if (NewState == PeerFailed) { + DerefAllSentTimesteps(CP_WSR_Stream->ParentStream, CP_WSR_Stream); CMadd_delayed_task(ParentStream->CPInfo->cm, 2, 0, CloseWSRStream, CP_WSR_Stream); } @@ -1596,7 +1596,7 @@ static void ProcessReaderStatusList(SstStream Stream, CP_verbose(Stream, "Adjusting reader %d status from %s to %s\n", i, SSTStreamStatusStr[Stream->Readers[i]->ReaderStatus], SSTStreamStatusStr[Metadata->ReaderStatus[i]]); - Stream->Readers[i]->ReaderStatus = Metadata->ReaderStatus[i]; + CP_PeerFailCloseWSReader(Stream->Readers[i], Metadata->ReaderStatus[i]); } } PTHREAD_MUTEX_UNLOCK(&Stream->DataLock); diff --git a/source/adios2/toolkit/sst/dp/evpath_dp.c b/source/adios2/toolkit/sst/dp/evpath_dp.c index bfe4314201..b4c8f91da3 100644 --- a/source/adios2/toolkit/sst/dp/evpath_dp.c +++ b/source/adios2/toolkit/sst/dp/evpath_dp.c @@ -384,8 +384,13 @@ static void EvpathReadRequestHandler(CManager cm, CMConnection conn, * Shouldn't ever get here because we should never get a request for a * timestep that we don't have. */ - fprintf(stderr, "Writer rank %d - Failed to read Timestep %ld, not found\n", + fprintf(stderr, "\n\n\n\n"); + fprintf(stderr, "Writer rank %d - Failed to read Timestep %ld, not found. This is an internal inconsistency\n", WSR_Stream->WS_Stream->Rank, ReadRequestMsg->Timestep); + fprintf(stderr, "Writer rank %d - Request came from rank %d, please report this error!\n", + WSR_Stream->WS_Stream->Rank, RequestingRank); + fprintf(stderr, "\n\n\n\n"); + /* * in the interest of not failing a writer on a reader failure, don't * assert(0) here. Probably this sort of error should close the link to