Skip to content

Commit

Permalink
Merge pull request #1855 from eisenhauer/SstRaces
Browse files (browse the repository at this point in the history)
Fix a notification failure on the read side and a race on the writer …
eisenhauer authored Nov 3, 2019
2 parents: 7610e17 + 58f3239; commit 73a4833
Showing 3 changed files with 15 additions and 3 deletions.
2 changes: 2 additions & 0 deletions source/adios2/toolkit/sst/cp/cp_reader.c
Original file line number Diff line number Diff line change
@@ -196,6 +196,8 @@ extern void ReaderConnCloseHandler(CManager cm, CMConnection ClosedConn,
CP_verbose(Stream, "Reader-side Rank received a "
"connection-close event after close, "
"not unexpected\n");
Stream->DP_Interface->notifyConnFailure(&Svcs, Stream->DP_Stream,
FailedPeerRank);
}
else
{
5 changes: 3 additions & 2 deletions source/adios2/toolkit/sst/cp/cp_writer.c
Original file line number Diff line number Diff line change
@@ -425,7 +425,6 @@ extern void WriterConnCloseHandler(CManager cm, CMConnection closed_conn,
"connection-close event in unexpected "
"state %s\n",
SSTStreamStatusStr[WSreader->ReaderStatus]);
CP_PeerFailCloseWSReader(WSreader, PeerFailed);
}
QueueMaintenance(ParentWriterStream);
PTHREAD_MUTEX_UNLOCK(&ParentWriterStream->DataLock);
@@ -1304,6 +1303,7 @@ static void CP_PeerFailCloseWSReader(WS_ReaderInfo CP_WSR_Stream,
}
if (NewState == PeerFailed)
{
DerefAllSentTimesteps(CP_WSR_Stream->ParentStream, CP_WSR_Stream);
CMadd_delayed_task(ParentStream->CPInfo->cm, 2, 0, CloseWSRStream,
CP_WSR_Stream);
}
@@ -1596,7 +1596,8 @@ static void ProcessReaderStatusList(SstStream Stream,
CP_verbose(Stream, "Adjusting reader %d status from %s to %s\n", i,
SSTStreamStatusStr[Stream->Readers[i]->ReaderStatus],
SSTStreamStatusStr[Metadata->ReaderStatus[i]]);
Stream->Readers[i]->ReaderStatus = Metadata->ReaderStatus[i];
CP_PeerFailCloseWSReader(Stream->Readers[i],
Metadata->ReaderStatus[i]);
}
}
PTHREAD_MUTEX_UNLOCK(&Stream->DataLock);
11 changes: 10 additions & 1 deletion source/adios2/toolkit/sst/dp/evpath_dp.c
Original file line number Diff line number Diff line change
@@ -384,8 +384,17 @@ static void EvpathReadRequestHandler(CManager cm, CMConnection conn,
* Shouldn't ever get here because we should never get a request for a
* timestep that we don't have.
*/
fprintf(stderr, "Writer rank %d - Failed to read Timestep %ld, not found\n",
fprintf(stderr, "\n\n\n\n");
fprintf(stderr,
"Writer rank %d - Failed to read Timestep %ld, not found. This is "
"an internal inconsistency\n",
WSR_Stream->WS_Stream->Rank, ReadRequestMsg->Timestep);
fprintf(stderr,
"Writer rank %d - Request came from rank %d, please report this "
"error!\n",
WSR_Stream->WS_Stream->Rank, RequestingRank);
fprintf(stderr, "\n\n\n\n");

/*
* in the interest of not failing a writer on a reader failure, don't
* assert(0) here. Probably this sort of error should close the link to

0 comments on commit 73a4833

Please sign in to comment.