From 0f99b0869cd0cd3ab2a42c0d74b18616eefaaf15 Mon Sep 17 00:00:00 2001 From: Greg Eisenhauer Date: Tue, 4 Apr 2023 11:22:38 -0400 Subject: [PATCH] Prototype DAOS engine --- source/adios2/CMakeLists.txt | 8 +- source/adios2/core/IO.cpp | 13 + .../engine/daos/BP5Writer_TwoLevelShm.cpp | 298 +++ source/adios2/engine/daos/DaosEngine.cpp | 377 +++ source/adios2/engine/daos/DaosEngine.h | 224 ++ source/adios2/engine/daos/DaosReader.cpp | 1430 ++++++++++++ source/adios2/engine/daos/DaosReader.h | 309 +++ source/adios2/engine/daos/DaosReader.tcc | 39 + source/adios2/engine/daos/DaosWriter.cpp | 2014 +++++++++++++++++ source/adios2/engine/daos/DaosWriter.h | 403 ++++ source/adios2/engine/daos/DaosWriter.tcc | 97 + .../daos/DaosWriter_EveryoneWrites_Async.cpp | 357 +++ .../engine/daos/DaosWriter_TwoLevelShm.cpp | 298 +++ .../daos/DaosWriter_TwoLevelShm_Async.cpp | 357 +++ 14 files changed, 6223 insertions(+), 1 deletion(-) create mode 100644 source/adios2/engine/daos/BP5Writer_TwoLevelShm.cpp create mode 100644 source/adios2/engine/daos/DaosEngine.cpp create mode 100644 source/adios2/engine/daos/DaosEngine.h create mode 100644 source/adios2/engine/daos/DaosReader.cpp create mode 100644 source/adios2/engine/daos/DaosReader.h create mode 100644 source/adios2/engine/daos/DaosReader.tcc create mode 100644 source/adios2/engine/daos/DaosWriter.cpp create mode 100644 source/adios2/engine/daos/DaosWriter.h create mode 100644 source/adios2/engine/daos/DaosWriter.tcc create mode 100644 source/adios2/engine/daos/DaosWriter_EveryoneWrites_Async.cpp create mode 100644 source/adios2/engine/daos/DaosWriter_TwoLevelShm.cpp create mode 100644 source/adios2/engine/daos/DaosWriter_TwoLevelShm_Async.cpp diff --git a/source/adios2/CMakeLists.txt b/source/adios2/CMakeLists.txt index 77899e8ecc..9cf7da7dc9 100644 --- a/source/adios2/CMakeLists.txt +++ b/source/adios2/CMakeLists.txt @@ -189,7 +189,13 @@ if (ADIOS2_HAVE_BP5 OR ADIOS2_HAVE_SST) endif() if(ADIOS2_HAVE_DAOS) - target_sources(adios2_core PRIVATE toolkit/transport/file/FileDaos.cpp) + target_sources(adios2_core PRIVATE toolkit/transport/file/FileDaos.cpp + engine/daos/DaosEngine.cpp + engine/daos/DaosWriter_EveryoneWrites_Async.cpp + engine/daos/DaosWriter_TwoLevelShm_Async.cpp + engine/daos/DaosWriter_TwoLevelShm.cpp + engine/daos/DaosReader.cpp engine/daos/DaosReader.tcc + engine/daos/DaosWriter.cpp engine/daos/DaosWriter.tcc) target_link_libraries(adios2_core PRIVATE DAOS::DAOS) endif() diff --git a/source/adios2/core/IO.cpp b/source/adios2/core/IO.cpp index c8ef73e3e6..47d7cd0f88 100644 --- a/source/adios2/core/IO.cpp +++ b/source/adios2/core/IO.cpp @@ -51,6 +51,11 @@ #include "adios2/engine/sst/SstWriter.h" #endif +#ifdef ADIOS2_HAVE_DAOS // external dependencies +#include "adios2/engine/daos/DaosReader.h" +#include "adios2/engine/daos/DaosWriter.h" +#endif + namespace adios2 { namespace core @@ -98,6 +103,14 @@ std::unordered_map Factory = { #else IO::NoEngineEntry("ERROR: this version didn't compile with " "Sst library, can't use Sst engine\n") +#endif + }, + {"daos", +#ifdef ADIOS2_HAVE_DAOS + {IO::MakeEngine, IO::MakeEngine} +#else + IO::NoEngineEntry("ERROR: this version didn't compile with " + "DAOS library, can't use DAOS engine\n") #endif }, {"effis", diff --git a/source/adios2/engine/daos/BP5Writer_TwoLevelShm.cpp b/source/adios2/engine/daos/BP5Writer_TwoLevelShm.cpp new file mode 100644 index 0000000000..9ef11dd74b --- /dev/null +++ b/source/adios2/engine/daos/BP5Writer_TwoLevelShm.cpp @@ -0,0 +1,298 @@ +/* + * Distributed under the OSI-approved 
Apache License, Version 2.0. See + * accompanying file Copyright.txt for details. + * + * DaosWriter.cpp + * + */ + +#include "DaosWriter.h" + +#include "adios2/common/ADIOSMacros.h" +#include "adios2/core/IO.h" +#include "adios2/helper/adiosFunctions.h" //CheckIndexRange, PaddingToAlignOffset +#include "adios2/toolkit/format/buffer/chunk/ChunkV.h" +#include "adios2/toolkit/format/buffer/malloc/MallocV.h" +#include "adios2/toolkit/shm/TokenChain.h" +#include "adios2/toolkit/transport/file/FileFStream.h" +#include + +#include +#include +#include + +namespace adios2 +{ +namespace core +{ +namespace engine +{ + +using namespace adios2::format; + +void DaosWriter::WriteData_TwoLevelShm(format::BufferV *Data) +{ + aggregator::MPIShmChain *a = + dynamic_cast(m_Aggregator); + + // new step writing starts at offset m_DataPos on master aggregator + // other aggregators to the same file will need to wait for the position + // to arrive from the rank below + + // align to PAGE_SIZE (only valid on master aggregator at this point) + m_DataPos += + helper::PaddingToAlignOffset(m_DataPos, m_Parameters.StripeSize); + + // Each aggregator needs to know the total size they write + // This calculation is valid on aggregators only + std::vector mySizes = a->m_Comm.GatherValues(Data->Size()); + uint64_t myTotalSize = 0; + uint64_t maxSize = 0; + for (auto s : mySizes) + { + myTotalSize += s; + if (s > maxSize) + { + maxSize = s; + } + } + + if (a->m_Comm.Size() > 1) + { + size_t alignment_size = sizeof(max_align_t); + if (m_Parameters.DirectIO) + { + alignment_size = m_Parameters.DirectIOAlignOffset; + } + a->CreateShm(static_cast(maxSize), m_Parameters.MaxShmSize, + alignment_size); + } + + shm::TokenChain tokenChain(&a->m_Comm); + + if (a->m_IsAggregator) + { + // In each aggregator chain, send from master down the line + // these total sizes, so every aggregator knows where to start + if (a->m_AggregatorChainComm.Rank() > 0) + { + a->m_AggregatorChainComm.Recv( + &m_DataPos, 1, a->m_AggregatorChainComm.Rank() - 1, 0, + "AggregatorChain token in DaosWriter::WriteData_TwoLevelShm"); + // align to PAGE_SIZE + m_DataPos += helper::PaddingToAlignOffset(m_DataPos, + m_Parameters.StripeSize); + } + m_StartDataPos = m_DataPos; // metadata needs this info + if (a->m_AggregatorChainComm.Rank() < + a->m_AggregatorChainComm.Size() - 1) + { + uint64_t nextWriterPos = m_DataPos + myTotalSize; + a->m_AggregatorChainComm.Isend( + &nextWriterPos, 1, a->m_AggregatorChainComm.Rank() + 1, 0, + "Chain token in DaosWriter::WriteData"); + } + else if (a->m_AggregatorChainComm.Size() > 1) + { + // send back final position from last aggregator in file to master + // aggregator + uint64_t nextWriterPos = m_DataPos + myTotalSize; + a->m_AggregatorChainComm.Isend( + &nextWriterPos, 1, 0, 0, + "Chain token in DaosWriter::WriteData"); + } + + /*std::cout << "Rank " << m_Comm.Rank() + << " aggregator start writing step " << m_WriterStep + << " to subfile " << a->m_SubStreamIndex << " at pos " + << m_DataPos << " totalsize " << myTotalSize << std::endl;*/ + + // Send token to first non-aggregator to start filling shm + // Also informs next process its starting offset (for correct metadata) + uint64_t nextWriterPos = m_DataPos + Data->Size(); + tokenChain.SendToken(nextWriterPos); + + WriteMyOwnData(Data); + + /* Write from shm until every non-aggr sent all data */ + if (a->m_Comm.Size() > 1) + { + WriteOthersData(myTotalSize - Data->Size()); + } + + // Master aggregator needs to know where the last writing ended by the + // last 
aggregator in the chain, so that it can start from the correct + // position at the next output step + if (a->m_AggregatorChainComm.Size() > 1 && + !a->m_AggregatorChainComm.Rank()) + { + a->m_AggregatorChainComm.Recv( + &m_DataPos, 1, a->m_AggregatorChainComm.Size() - 1, 0, + "Chain token in DaosWriter::WriteData"); + } + } + else + { + // non-aggregators fill shared buffer in marching order + // they also receive their starting offset this way + m_StartDataPos = tokenChain.RecvToken(); + + /*std::cout << "Rank " << m_Comm.Rank() + << " non-aggregator recv token to fill shm = " + << m_StartDataPos << std::endl;*/ + + SendDataToAggregator(Data); + + uint64_t nextWriterPos = m_StartDataPos + Data->Size(); + tokenChain.SendToken(nextWriterPos); + } + + if (a->m_Comm.Size() > 1) + { + a->DestroyShm(); + } +} + +void DaosWriter::WriteMyOwnData(format::BufferV *Data) +{ + std::vector DataVec = Data->DataVec(); + m_StartDataPos = m_DataPos; + m_FileDataManager.WriteFileAt(DataVec.data(), DataVec.size(), + m_StartDataPos); + m_DataPos += Data->Size(); +} + +/*std::string DoubleBufferToString(const double *b, int n) +{ + std::ostringstream out; + out.precision(1); + out << std::fixed << "["; + char s[32]; + + for (int i = 0; i < n; ++i) + { + snprintf(s, sizeof(s), "%g", b[i]); + out << s; + if (i < n - 1) + { + out << ", "; + } + } + out << "]"; + return out.str(); +}*/ + +void DaosWriter::SendDataToAggregator(format::BufferV *Data) +{ + /* Only one process is running this function at once + See shmFillerToken in the caller function + + In a loop, copy the local data into the shared memory, alternating + between the two segments. + */ + + aggregator::MPIShmChain *a = + dynamic_cast(m_Aggregator); + + std::vector DataVec = Data->DataVec(); + size_t nBlocks = DataVec.size(); + + // size_t sent = 0; + size_t block = 0; + size_t temp_offset = 0; + while (block < nBlocks) + { + // potentially blocking call waiting on Aggregator + aggregator::MPIShmChain::ShmDataBuffer *b = a->LockProducerBuffer(); + // b->max_size: how much we can copy + // b->actual_size: how much we actually copy + b->actual_size = 0; + while (true) + { + /* Copy n bytes from the current block, current offset to shm + making sure to use up to shm_size bytes + */ + size_t n = DataVec[block].iov_len - temp_offset; + if (n > (b->max_size - b->actual_size)) + { + n = b->max_size - b->actual_size; + } + std::memcpy(&b->buf[b->actual_size], + (const char *)DataVec[block].iov_base + temp_offset, n); + b->actual_size += n; + + /* Have we processed the entire block or staying with it? 
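+               (Worked example with made-up sizes: given two iovec blocks
+               of 5 MB each and shm segments whose max_size is 4 MB, the
+               first lock/unlock pass ships 4 MB of block 0; the second
+               ships the remaining 1 MB of block 0 plus 3 MB of block 1;
+               the third ships the last 2 MB. temp_offset marks the
+               restart point inside a partially copied block.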
*/ + if (n + temp_offset < DataVec[block].iov_len) + { + temp_offset += n; + } + else + { + temp_offset = 0; + ++block; + } + + /* Have we reached the max allowed shm size ?*/ + if (b->actual_size >= b->max_size) + { + break; + } + if (block >= nBlocks) + { + break; + } + } + // sent += b->actual_size; + + /*if (m_RankMPI >= 42) + { + std::cout << "Rank " << m_Comm.Rank() + << " filled shm, data_size = " << b->actual_size + << " block = " << block + << " temp offset = " << temp_offset << " sent = " << sent + << " buf = " << static_cast(b->buf) << " = " + << DoubleBufferToString((double *)b->buf, + b->actual_size / sizeof(double)) + << std::endl; + }*/ + + a->UnlockProducerBuffer(); + } +} +void DaosWriter::WriteOthersData(size_t TotalSize) +{ + /* Only an Aggregator calls this function */ + aggregator::MPIShmChain *a = + dynamic_cast(m_Aggregator); + + size_t wrote = 0; + while (wrote < TotalSize) + { + // potentially blocking call waiting on some non-aggr process + aggregator::MPIShmChain::ShmDataBuffer *b = a->LockConsumerBuffer(); + + /*std::cout << "Rank " << m_Comm.Rank() + << " write from shm, data_size = " << b->actual_size + << " total so far = " << wrote + << " buf = " << static_cast(b->buf) << " = " + << DoubleBufferToString((double *)b->buf, + b->actual_size / sizeof(double)) + << std::endl;*/ + /*<< " buf = " << static_cast(b->buf) << " = [" + << (int)b->buf[0] << (int)b->buf[1] << "..." + << (int)b->buf[b->actual_size - 2] + << (int)b->buf[b->actual_size - 1] << "]" << std::endl;*/ + + // b->actual_size: how much we need to write + m_FileDataManager.WriteFiles(b->buf, b->actual_size); + + wrote += b->actual_size; + + a->UnlockConsumerBuffer(); + } + m_DataPos += TotalSize; +} + +} // end namespace engine +} // end namespace core +} // end namespace adios2 diff --git a/source/adios2/engine/daos/DaosEngine.cpp b/source/adios2/engine/daos/DaosEngine.cpp new file mode 100644 index 0000000000..5b0dd0d37d --- /dev/null +++ b/source/adios2/engine/daos/DaosEngine.cpp @@ -0,0 +1,377 @@ +/* + * Distributed under the OSI-approved Apache License, Version 2.0. See + * accompanying file Copyright.txt for details. + * + * DaosEngine.cpp + * + */ + +#include "DaosEngine.h" + +#include "adios2/common/ADIOSMacros.h" +#include "adios2/common/ADIOSTypes.h" //PathSeparator +#include "adios2/core/IO.h" +#include "adios2/helper/adiosFunctions.h" //CreateDirectory, StringToTimeUnit, + +#include +#include + +namespace adios2 +{ +namespace core +{ +namespace engine +{ + +std::vector +DaosEngine::GetBPMetadataFileNames(const std::vector &names) const + noexcept +{ + std::vector metadataFileNames; + metadataFileNames.reserve(names.size()); + for (const auto &name : names) + { + metadataFileNames.push_back(GetBPMetadataFileName(name)); + } + return metadataFileNames; +} + +std::vector DaosEngine::GetBPMetaMetadataFileNames( + const std::vector &names) const noexcept +{ + std::vector metaMetadataFileNames; + metaMetadataFileNames.reserve(names.size()); + for (const auto &name : names) + { + metaMetadataFileNames.push_back(GetBPMetaMetadataFileName(name)); + } + return metaMetadataFileNames; +} + +std::string DaosEngine::GetBPMetadataFileName(const std::string &name) const + noexcept +{ + const std::string bpName = helper::RemoveTrailingSlash(name); + const size_t index = 0; // global metadata file is generated by rank 0 + /* the name of the metadata file is "md.0" */ + const std::string bpMetaDataRankName(bpName + PathSeparator + "md." 
+ std::to_string(index));
+    return bpMetaDataRankName;
+}
+
+std::string DaosEngine::GetBPMetaMetadataFileName(const std::string &name) const
+    noexcept
+{
+    const std::string bpName = helper::RemoveTrailingSlash(name);
+    const size_t index = 0; // global meta-metadata file is generated by rank 0
+    /* the name of the meta-metadata file is "mmd.0" */
+    const std::string bpMetaMetaDataRankName(bpName + PathSeparator + "mmd." +
+                                             std::to_string(index));
+    return bpMetaMetaDataRankName;
+}
+
+std::vector<std::string> DaosEngine::GetBPMetadataIndexFileNames(
+    const std::vector<std::string> &names) const noexcept
+{
+    std::vector<std::string> metadataIndexFileNames;
+    metadataIndexFileNames.reserve(names.size());
+    for (const auto &name : names)
+    {
+        metadataIndexFileNames.push_back(GetBPMetadataIndexFileName(name));
+    }
+    return metadataIndexFileNames;
+}
+
+std::string
+DaosEngine::GetBPMetadataIndexFileName(const std::string &name) const noexcept
+{
+    const std::string bpName = helper::RemoveTrailingSlash(name);
+    /* the name of the metadata index file is "md.idx" */
+    const std::string bpMetaDataIndexRankName(bpName + PathSeparator +
+                                              "md.idx");
+    return bpMetaDataIndexRankName;
+}
+
+std::vector<std::string>
+DaosEngine::GetBPVersionFileNames(const std::vector<std::string> &names) const
+    noexcept
+{
+    std::vector<std::string> versionFileNames;
+    versionFileNames.reserve(names.size());
+    for (const auto &name : names)
+    {
+        versionFileNames.push_back(GetBPVersionFileName(name));
+    }
+    return versionFileNames;
+}
+
+std::string DaosEngine::GetBPVersionFileName(const std::string &name) const
+    noexcept
+{
+    const std::string bpName = helper::RemoveTrailingSlash(name);
+    /* the name of the version file is ".bpversion" */
+    const std::string bpVersionFileName(bpName + PathSeparator + ".bpversion");
+    return bpVersionFileName;
+}
+
+std::string DaosEngine::GetBPSubStreamName(const std::string &name,
+                                           const size_t id,
+                                           const bool hasSubFiles,
+                                           const bool isReader) const noexcept
+{
+    if (!hasSubFiles)
+    {
+        return name;
+    }
+
+    const std::string bpName = helper::RemoveTrailingSlash(name);
+    /* the name of a data file starts with "data." */
+    const std::string bpRankName(bpName + PathSeparator + "data." 
+ + std::to_string(id)); + return bpRankName; +} + +std::vector +DaosEngine::GetBPSubStreamNames(const std::vector &names, + size_t subFileIndex) const noexcept +{ + std::vector bpNames; + bpNames.reserve(names.size()); + for (const auto &name : names) + { + bpNames.push_back(GetBPSubStreamName(name, subFileIndex)); + } + return bpNames; +} + +void DaosEngine::ParseParams(IO &io, struct DAOSParams &Params) +{ + adios2::Params params_lowercase; + for (auto &p : io.m_Parameters) + { + const std::string key = helper::LowerCase(p.first); + const std::string value = helper::LowerCase(p.second); + params_lowercase[key] = value; + } + + auto lf_SetBoolParameter = [&](const std::string key, bool ¶meter, + bool def) { + const std::string lkey = helper::LowerCase(std::string(key)); + auto itKey = params_lowercase.find(lkey); + parameter = def; + if (itKey != params_lowercase.end()) + { + std::string value = itKey->second; + std::transform(value.begin(), value.end(), value.begin(), + ::tolower); + if (value == "yes" || value == "true" || value == "on") + { + parameter = true; + } + else if (value == "no" || value == "false" || value == "off") + { + parameter = false; + } + else + { + helper::Throw( + "Engine", "DaosEngine", "ParseParams", + "Unknown BP5 Boolean parameter '" + value + "'"); + } + } + }; + + auto lf_SetFloatParameter = [&](const std::string key, float ¶meter, + float def) { + const std::string lkey = helper::LowerCase(std::string(key)); + auto itKey = params_lowercase.find(lkey); + parameter = def; + if (itKey != params_lowercase.end()) + { + std::string value = itKey->second; + parameter = + helper::StringTo(value, " in Parameter key=" + key); + } + }; + + auto lf_SetSizeBytesParameter = [&](const std::string key, + size_t ¶meter, size_t def) { + const std::string lkey = helper::LowerCase(std::string(key)); + auto itKey = params_lowercase.find(lkey); + parameter = def; + if (itKey != params_lowercase.end()) + { + std::string value = itKey->second; + parameter = helper::StringToByteUnits( + value, "for Parameter key=" + key + "in call to Open"); + } + }; + + auto lf_SetIntParameter = [&](const std::string key, int ¶meter, + int def) { + const std::string lkey = helper::LowerCase(std::string(key)); + auto itKey = params_lowercase.find(lkey); + parameter = def; + if (itKey != params_lowercase.end()) + { + parameter = std::stoi(itKey->second); + return true; + } + return false; + }; + + auto lf_SetUIntParameter = [&](const std::string key, + unsigned int ¶meter, unsigned int def) { + const std::string lkey = helper::LowerCase(std::string(key)); + auto itKey = params_lowercase.find(lkey); + parameter = def; + if (itKey != params_lowercase.end()) + { + unsigned long result = std::stoul(itKey->second); + if (result > std::numeric_limits::max()) + { + result = std::numeric_limits::max(); + } + parameter = static_cast(result); + return true; + } + return false; + }; + + auto lf_SetStringParameter = [&](const std::string key, + std::string ¶meter, const char *def) { + const std::string lkey = helper::LowerCase(std::string(key)); + auto itKey = params_lowercase.find(lkey); + parameter = def; + if (itKey != params_lowercase.end()) + { + parameter = itKey->second; + return true; + } + return false; + }; + + auto lf_SetBufferVTypeParameter = [&](const std::string key, int ¶meter, + int def) { + const std::string lkey = helper::LowerCase(std::string(key)); + auto itKey = params_lowercase.find(lkey); + parameter = def; + if (itKey != params_lowercase.end()) + { + std::string value = itKey->second; + 
std::transform(value.begin(), value.end(), value.begin(), + ::tolower); + if (value == "malloc") + { + parameter = (int)BufferVType::MallocVType; + } + else if (value == "chunk") + { + parameter = (int)BufferVType::ChunkVType; + } + else + { + helper::Throw( + "Engine", "DaosEngine", "ParseParams", + "Unknown BP5 BufferVType parameter \"" + value + + "\" (must be \"malloc\" or \"chunk\""); + } + } + }; + + auto lf_SetAggregationTypeParameter = [&](const std::string key, + int ¶meter, int def) { + const std::string lkey = helper::LowerCase(std::string(key)); + auto itKey = params_lowercase.find(lkey); + parameter = def; + if (itKey != params_lowercase.end()) + { + std::string value = itKey->second; + std::transform(value.begin(), value.end(), value.begin(), + ::tolower); + if (value == "everyonewrites" || value == "auto") + { + parameter = (int)AggregationType::EveryoneWrites; + } + else if (value == "everyonewritesserial") + { + parameter = (int)AggregationType::EveryoneWritesSerial; + } + else if (value == "twolevelshm") + { + parameter = (int)AggregationType::TwoLevelShm; + } + else + { + helper::Throw( + "Engine", "DaosEngine", "ParseParams", + "Unknown BP5 AggregationType parameter \"" + value + + "\" (must be \"auto\", \"everyonewrites\" or " + "\"twolevelshm\""); + } + } + }; + + auto lf_SetAsyncWriteParameter = [&](const std::string key, int ¶meter, + int def) { + const std::string lkey = helper::LowerCase(std::string(key)); + auto itKey = params_lowercase.find(lkey); + parameter = def; + if (itKey != params_lowercase.end()) + { + std::string value = itKey->second; + std::transform(value.begin(), value.end(), value.begin(), + ::tolower); + if (value == "guided" || value == "auto" || value == "on" || + value == "true") + { + parameter = (int)AsyncWrite::Guided; + } + else if (value == "sync" || value == "off" || value == "false") + { + parameter = (int)AsyncWrite::Sync; + } + else if (value == "naive") + { + parameter = (int)AsyncWrite::Naive; + } + else + { + helper::Throw( + "Engine", "DaosEngine", "ParseParams", + "Unknown BP5 AsyncWriteMode parameter \"" + value + + "\" (must be \"auto\", \"sync\", \"naive\", " + "\"throttled\" " + "or \"guided\""); + } + } + }; + +#define get_params(Param, Type, Typedecl, Default) \ + lf_Set##Type##Parameter(#Param, Params.Param, Default); + + DAOS_FOREACH_PARAMETER_TYPE_4ARGS(get_params); +#undef get_params + + if (Params.verbose > 0 && !m_RankMPI) + { + std::cout << "---------------- " << io.m_EngineType + << " engine parameters --------------\n"; +#define print_params(Param, Type, Typedecl, Default) \ + lf_Set##Type##Parameter(#Param, Params.Param, Default); \ + if (!m_RankMPI) \ + { \ + std::cout << " " << std::string(#Param) << " = " << Params.Param \ + << " default = " << Default << std::endl; \ + } + + DAOS_FOREACH_PARAMETER_TYPE_4ARGS(print_params); +#undef print_params + std::cout << "-----------------------------------------------------" + << std::endl; + } +}; + +} // namespace engine +} // namespace core +} // namespace adios2 diff --git a/source/adios2/engine/daos/DaosEngine.h b/source/adios2/engine/daos/DaosEngine.h new file mode 100644 index 0000000000..b4801fab1a --- /dev/null +++ b/source/adios2/engine/daos/DaosEngine.h @@ -0,0 +1,224 @@ +/* + * Distributed under the OSI-approved Apache License, Version 2.0. See + * accompanying file Copyright.txt for details. 
+ * + * DaosEngine.h + * + */ + +#ifndef ADIOS2_ENGINE_DAOS_DAOSENGINE_H_ +#define ADIOS2_ENGINE_DAOS_DAOSENGINE_H_ + +#include "adios2/common/ADIOSConfig.h" +#include "adios2/core/Engine.h" +#include "adios2/helper/adiosComm.h" +#include "adios2/toolkit/burstbuffer/FileDrainerSingleThread.h" +#include "adios2/toolkit/format/bp5/BP5Serializer.h" +#include "adios2/toolkit/transportman/TransportMan.h" + +namespace adios2 +{ +namespace core +{ +namespace engine +{ + +class DaosEngine +{ +public: + int m_RankMPI = 0; + /* metadata index table + 0: pos in memory for step (after filtered read) + 1: size of metadata + 2: flush count + 3: pos in index where data offsets are enumerated + 4: abs. pos in metadata File for step + */ + std::unordered_map> m_MetadataIndexTable; + + struct Minifooter + { + std::string VersionTag; + uint64_t PGIndexStart = 0; + uint64_t VarsIndexStart = 0; + uint64_t AttributesIndexStart = 0; + int8_t Version = -1; + bool IsLittleEndian = true; + bool HasSubFiles = false; + }; + + format::BufferSTL m_MetadataIndex; + + /** Positions of flags in Index Table Header that Reader uses */ + static constexpr size_t m_IndexHeaderSize = 64; + static constexpr size_t m_EndianFlagPosition = 36; + static constexpr size_t m_BPVersionPosition = 37; + static constexpr size_t m_BPMinorVersionPosition = 38; + static constexpr size_t m_ActiveFlagPosition = 39; + static constexpr size_t m_ColumnMajorFlagPosition = 40; + static constexpr size_t m_VersionTagPosition = 0; + static constexpr size_t m_VersionTagLength = 32; + + static constexpr uint8_t m_BP5MinorVersion = 2; + + /** Index record types */ + enum IndexRecord + { + StepRecord = 's', + WriterMapRecord = 'w', + }; + + std::vector + GetBPSubStreamNames(const std::vector &names, + size_t subFileIndex) const noexcept; + + std::vector + GetBPMetadataFileNames(const std::vector &names) const + noexcept; + std::vector + GetBPMetaMetadataFileNames(const std::vector &names) const + noexcept; + std::string GetBPMetadataFileName(const std::string &name) const noexcept; + std::string GetBPMetaMetadataFileName(const std::string &name) const + noexcept; + std::vector + GetBPMetadataIndexFileNames(const std::vector &names) const + noexcept; + + std::string GetBPMetadataIndexFileName(const std::string &name) const + noexcept; + + std::string GetBPSubStreamName(const std::string &name, const size_t id, + const bool hasSubFiles = true, + const bool isReader = false) const noexcept; + + std::vector + GetBPVersionFileNames(const std::vector &names) const noexcept; + + std::string GetBPVersionFileName(const std::string &name) const noexcept; + + enum class BufferVType + { + MallocVType, + ChunkVType, + Auto + }; + + BufferVType UseBufferV = BufferVType::ChunkVType; + + enum class AggregationType + { + EveryoneWrites, + EveryoneWritesSerial, + TwoLevelShm, + Auto + }; + + enum class AsyncWrite + { + Sync = 0, // enable using AsyncWriteMode as bool expression + Naive, + Guided + }; + + /** + * sub-block size for min/max calculation of large arrays in number of + * elements (not bytes). 
The default is a huge element count, so
+     * each Put() produces the original single min/max value-pair per block.
+     */
+    const size_t DefaultStatsBlockSize = 1125899906842624ULL;
+
+#define DAOS_FOREACH_PARAMETER_TYPE_4ARGS(MACRO)                               \
+    MACRO(OpenTimeoutSecs, Float, float, -1.0f)                                \
+    MACRO(BeginStepPollingFrequencySecs, Float, float, 1.0f)                   \
+    MACRO(StreamReader, Bool, bool, false)                                     \
+    MACRO(BurstBufferDrain, Bool, bool, true)                                  \
+    MACRO(BurstBufferPath, String, std::string, "")                            \
+    MACRO(NodeLocal, Bool, bool, false)                                        \
+    MACRO(verbose, Int, int, 0)                                                \
+    MACRO(CollectiveMetadata, Bool, bool, true)                                \
+    MACRO(NumAggregators, UInt, unsigned int, 0)                               \
+    MACRO(AggregatorRatio, UInt, unsigned int, 0)                              \
+    MACRO(NumSubFiles, UInt, unsigned int, 0)                                  \
+    MACRO(StripeSize, UInt, unsigned int, 4096)                                \
+    MACRO(DirectIO, Bool, bool, false)                                         \
+    MACRO(DirectIOAlignOffset, UInt, unsigned int, 512)                        \
+    MACRO(DirectIOAlignBuffer, UInt, unsigned int, 0)                          \
+    MACRO(AggregationType, AggregationType, int,                               \
+          (int)AggregationType::TwoLevelShm)                                   \
+    MACRO(AsyncOpen, Bool, bool, true)                                         \
+    MACRO(AsyncWrite, AsyncWrite, int, (int)AsyncWrite::Sync)                  \
+    MACRO(GrowthFactor, Float, float, DefaultBufferGrowthFactor)               \
+    MACRO(InitialBufferSize, SizeBytes, size_t, DefaultInitialBufferSize)      \
+    MACRO(MinDeferredSize, SizeBytes, size_t, DefaultMinDeferredSize)          \
+    MACRO(BufferChunkSize, SizeBytes, size_t, DefaultBufferChunkSize)          \
+    MACRO(MaxShmSize, SizeBytes, size_t, DefaultMaxShmSize)                    \
+    MACRO(BufferVType, BufferVType, int, (int)BufferVType::ChunkVType)         \
+    MACRO(AppendAfterSteps, Int, int, INT_MAX)                                 \
+    MACRO(SelectSteps, String, std::string, "")                                \
+    MACRO(ReaderShortCircuitReads, Bool, bool, false)                          \
+    MACRO(StatsLevel, UInt, unsigned int, 1)                                   \
+    MACRO(StatsBlockSize, SizeBytes, size_t, DefaultStatsBlockSize)            \
+    MACRO(Threads, UInt, unsigned int, 0)                                      \
+    MACRO(UseOneTimeAttributes, Bool, bool, true)                              \
+    MACRO(MaxOpenFilesAtOnce, UInt, unsigned int, UINT_MAX)
+
+    struct DAOSParams
+    {
+#define declare_struct(Param, Type, Typedecl, Default) Typedecl Param;
+        DAOS_FOREACH_PARAMETER_TYPE_4ARGS(declare_struct)
+#undef declare_struct
+    };
+
+    void ParseParams(IO &io, DAOSParams &Params);
+    DAOSParams m_Parameters;
+
+private:
+};
+
+} // namespace engine
+} // namespace core
+} // namespace adios2
+#endif
+
+/*
+ * Data Formats:
+ * MetadataIndex file (md.idx)
+ *    BP5 header for "Index Table" (64 bytes)
+ *    for each Writer, what aggregator writes its data
+ *       uint16_t [WriterCount]
+ *    for each timestep:
+ *       uint64_t 0 : CombinedMetaDataPos
+ *       uint64_t 1 : CombinedMetaDataSize
+ *       uint64_t 2 : FlushCount
+ *       for each Writer
+ *          for each flush before the last:
+ *             uint64_t DataPos (in the file above)
+ *             uint64_t DataSize
+ *          for the final flush:
+ *             uint64_t DataPos (in the file above)
+ *    So, each timestep takes
+ *    sizeof(uint64_t) * (3 + ((FlushCount - 1) * 2 + 1) * WriterCount) bytes
+ *
+ * MetaMetadata file (mmd.0) contains FFS format information
+ *    for each meta metadata item:
+ *       uint64_t MetaMetaIDLen
+ *       uint64_t MetaMetaInfoLen
+ *       char[MetaMetaIDLen] MetaMetaID
+ *       char[MetaMetaInfoLen] MetaMetaInfo
+ * Notes: This file should be quite small, with size dependent upon the
+ * number of different "formats" written by any rank.
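+ *
+ * Worked example for the md.idx step records above (illustrative numbers
+ * only, derived from the size formula given there): with WriterCount = 4
+ * and FlushCount = 2, one step record occupies
+ * sizeof(uint64_t) * (3 + ((2 - 1) * 2 + 1) * 4) = 8 * 15 = 120 bytes.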
+ *
+ *
+ * MetaData file (md.0) contains encoded metadata/attribute data
+ *    BP5 header for "Metadata" (64 bytes)
+ *    for each timestep:
+ *       uint64_t : TotalSize of this metadata block
+ *       uint64_t[WriterCount] : Length of each rank's metadata
+ *       uint64_t[WriterCount] : Length of each rank's attribute data
+ *       FFS-encoded metadata block of the length above
+ *       FFS-encoded attribute data block of the length above
+ *
+ * Data file (data.x) contains a block of data for each timestep, for each
+ * rank
+ */
diff --git a/source/adios2/engine/daos/DaosReader.cpp b/source/adios2/engine/daos/DaosReader.cpp
new file mode 100644
index 0000000000..f0bc8af734
--- /dev/null
+++ b/source/adios2/engine/daos/DaosReader.cpp
@@ -0,0 +1,1430 @@
+/*
+ * Distributed under the OSI-approved Apache License, Version 2.0. See
+ * accompanying file Copyright.txt for details.
+ *
+ * DaosReader.cpp
+ *
+ */
+
+#include "DaosReader.h"
+#include "DaosReader.tcc"
+
+#include "adios2/helper/adiosMath.h" // SetWithinLimit
+#include <adios2-perfstubs-interface.h>
+
+#include <chrono>
+#include <cinttypes>
+#include <future>
+#include <iostream>
+#include <mutex>
+#include <thread>
+
+using TP = std::chrono::high_resolution_clock::time_point;
+#define NOW() std::chrono::high_resolution_clock::now();
+#define DURATION(T1, T2) static_cast<double>((T2 - T1).count()) / 1000000000.0;
+
+namespace adios2
+{
+namespace core
+{
+namespace engine
+{
+
+DaosReader::DaosReader(IO &io, const std::string &name, const Mode mode,
+                       helper::Comm comm)
+: Engine("DaosReader", io, name, mode, std::move(comm)),
+  m_MDFileManager(io, m_Comm), m_DataFileManager(io, m_Comm),
+  m_MDIndexFileManager(io, m_Comm), m_FileMetaMetadataManager(io, m_Comm),
+  m_ActiveFlagFileManager(io, m_Comm)
+{
+    PERFSTUBS_SCOPED_TIMER("DaosReader::Open");
+    Init();
+    m_IsOpen = true;
+}
+
+DaosReader::~DaosReader()
+{
+    if (m_BP5Deserializer)
+        delete m_BP5Deserializer;
+    if (m_IsOpen)
+    {
+        DestructorClose(m_FailVerbose);
+    }
+    m_IsOpen = false;
+}
+
+void DaosReader::DestructorClose(bool Verbose) noexcept
+{
+    // Nothing special needs to be done to "close" a BP5 reader during
+    // shutdown if it hasn't already been Closed
+    m_IsOpen = false;
+}
+
+void DaosReader::InstallMetadataForTimestep(size_t Step)
+{
+    size_t pgstart = m_MetadataIndexTable[Step][0];
+    size_t Position = pgstart + sizeof(uint64_t); // skip total data size
+    const uint64_t WriterCount =
+        m_WriterMap[m_WriterMapIndex[Step]].WriterCount;
+    size_t MDPosition = Position + 2 * sizeof(uint64_t) * WriterCount;
+    for (size_t WriterRank = 0; WriterRank < WriterCount; WriterRank++)
+    {
+        // variable metadata for timestep
+        // size_t ThisMDSize = helper::ReadValue<uint64_t>(
+        //     m_Metadata.m_Buffer, Position, m_Minifooter.IsLittleEndian);
+        // char *ThisMD = m_Metadata.m_Buffer.data() + MDPosition;
+        size_t ThisMDSize;
+        char *ThisMD;
+
+        // DAOS temporary declarations
+        char key[1000];
+        int rc;
+
+        // Query size of a writer rank's metadata
+        // (Step and WriterRank are size_t, so use %zu and a bounded print)
+        snprintf(key, sizeof(key), "step%zu-rank%zu", Step, WriterRank);
+        std::cout << __FILE__ << "::" << __func__ << "(), step: " << Step
+                  << std::endl;
+        std::cout << "key = " << key << std::endl;
+        rc = daos_kv_get(oh, DAOS_TX_NONE, 0, key, &ThisMDSize, NULL, NULL);
+        ASSERT(rc == 0, "daos_kv_get() failed to get size with %d", rc);
+        std::cout << "WriterRank = " << WriterRank
+                  << ", ThisMDSize = " << ThisMDSize << std::endl;
+
+        // Allocate memory
+        ThisMD = new char[ThisMDSize];
+
+        // Read writer's metadata
+        rc = daos_kv_get(oh, DAOS_TX_NONE, 0, key, &ThisMDSize, ThisMD, NULL);
+        ASSERT(rc == 0, "daos_kv_get() failed to read metadata with %d", rc);
+
+        std::cout << "Printing the first 10 bytes of 
Metadata" << std::endl; + char *data = reinterpret_cast(ThisMD); + for (int i = 0; i < 10; i++) + { + // std::cout << std::hex << std::setw(2) << std::setfill('0') << + // static_cast(data[i]) << " "; + std::cout << static_cast(data[i]) << " "; + } + std::cout << std::endl; + + if (m_OpenMode == Mode::ReadRandomAccess) + { + m_BP5Deserializer->InstallMetaData(ThisMD, ThisMDSize, WriterRank, + Step); + } + else + { + m_BP5Deserializer->InstallMetaData(ThisMD, ThisMDSize, WriterRank); + } + // MDPosition += ThisMDSize; + + // delete[] ThisMD; + } + // for (size_t WriterRank = 0; WriterRank < WriterCount; WriterRank++) + //{ + // // attribute metadata for timestep + // size_t ThisADSize = helper::ReadValue( + // m_Metadata.m_Buffer, Position, m_Minifooter.IsLittleEndian); + // char *ThisAD = m_Metadata.m_Buffer.data() + MDPosition; + // if (ThisADSize > 0) + // m_BP5Deserializer->InstallAttributeData(ThisAD, ThisADSize); + // MDPosition += ThisADSize; + //} +} + +StepStatus DaosReader::BeginStep(StepMode mode, const float timeoutSeconds) +{ + PERFSTUBS_SCOPED_TIMER("DaosReader::BeginStep"); + + if (m_OpenMode == Mode::ReadRandomAccess) + { + helper::Throw( + "Engine", "DaosReader", "BeginStep", + "BeginStep called in random access mode"); + } + if (m_BetweenStepPairs) + { + helper::Throw("Engine", "DaosReader", "BeginStep", + "BeginStep() is called a second time " + "without an intervening EndStep()"); + } + + if (mode != StepMode::Read) + { + helper::Throw( + "Engine", "DaosReader", "BeginStep", + "mode is not supported yet, only Read is valid for engine " + "DaosReader, in call to BeginStep"); + } + + StepStatus status = StepStatus::OK; + if (m_FirstStep) + { + if (!m_StepsCount) + { + // not steps was found in Open/Init, check for new steps now + status = CheckForNewSteps(Seconds(timeoutSeconds)); + } + } + else + { + if (m_CurrentStep + 1 >= m_StepsCount) + { + // we processed steps in memory, check for new steps now + status = CheckForNewSteps(Seconds(timeoutSeconds)); + } + } + + if (status == StepStatus::OK) + { + m_BetweenStepPairs = true; + if (m_FirstStep) + { + m_FirstStep = false; + } + else + { + ++m_CurrentStep; + } + + m_IO.m_EngineStep = m_CurrentStep; + // SstBlock AttributeBlockList = + // SstGetAttributeData(m_Input, SstCurrentStep(m_Input)); + // i = 0; + // while (AttributeBlockList && AttributeBlockList[i].BlockData) + // { + // m_IO.RemoveAllAttributes(); + // m_BP5Deserializer->InstallAttributeData( + // AttributeBlockList[i].BlockData, + // AttributeBlockList[i].BlockSize); + // i++; + // } + + m_BP5Deserializer->SetupForStep( + m_CurrentStep, + m_WriterMap[m_WriterMapIndex[m_CurrentStep]].WriterCount); + + /* Remove all existing variables from previous steps + It seems easier than trying to update them */ + // m_IO.RemoveAllVariables(); + + InstallMetadataForTimestep(m_CurrentStep); + m_IO.ResetVariablesStepSelection(false, + "in call to BP5 Reader BeginStep"); + + // caches attributes for each step + // if a variable name is a prefix + // e.g. 
var prefix = {var/v1, var/v2, var/v3} + m_IO.SetPrefixedNames(true); + } + + return status; +} + +size_t DaosReader::CurrentStep() const { return m_CurrentStep; } + +void DaosReader::EndStep() +{ + if (m_OpenMode == Mode::ReadRandomAccess) + { + helper::Throw("Engine", "DaosReader", "EndStep", + "EndStep called in random access mode"); + } + if (!m_BetweenStepPairs) + { + helper::Throw( + "Engine", "DaosReader", "EndStep", + "EndStep() is called without a successful BeginStep()"); + } + m_BetweenStepPairs = false; + PERFSTUBS_SCOPED_TIMER("DaosReader::EndStep"); + PerformGets(); +} + +std::pair +DaosReader::ReadData(adios2::transportman::TransportMan &FileManager, + const size_t maxOpenFiles, const size_t WriterRank, + const size_t Timestep, const size_t StartOffset, + const size_t Length, char *Destination) +{ + /* + * Warning: this function is called by multiple threads + */ + size_t FlushCount = m_MetadataIndexTable[Timestep][2]; + size_t DataPosPos = m_MetadataIndexTable[Timestep][3]; + size_t SubfileNum = static_cast( + m_WriterMap[m_WriterMapIndex[Timestep]].RankToSubfile[WriterRank]); + + // check if subfile is already opened + TP startSubfile = NOW(); + if (FileManager.m_Transports.count(SubfileNum) == 0) + { + const std::string subFileName = GetBPSubStreamName( + m_Name, SubfileNum, m_Minifooter.HasSubFiles, true); + if (FileManager.m_Transports.size() >= maxOpenFiles) + { + auto m = FileManager.m_Transports.begin(); + FileManager.CloseFiles((int)m->first); + } + FileManager.OpenFileID(subFileName, SubfileNum, Mode::Read, + m_IO.m_TransportsParameters[0], + /*{{"transport", "File"}},*/ false); + } + TP endSubfile = NOW(); + double timeSubfile = DURATION(startSubfile, endSubfile); + + /* Each block is in exactly one flush. The StartOffset was calculated + as if all the flushes were in a single contiguous block in file. 
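+       (Worked example with made-up sizes: if this writer recorded
+       FlushCount = 2 flushes of 100 and 60 bytes, a request with
+       StartOffset = 130 is past flush 0 (130 >= 100), so the loop below
+       selects flush 1 and reads at ThisDataPos + (130 - 100).)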
+ */ + TP startRead = NOW(); + size_t InfoStartPos = + DataPosPos + (WriterRank * (2 * FlushCount + 1) * sizeof(uint64_t)); + size_t SumDataSize = 0; // count in contiguous space + for (size_t flush = 0; flush < FlushCount; flush++) + { + size_t ThisDataPos = + helper::ReadValue(m_MetadataIndex.m_Buffer, InfoStartPos, + m_Minifooter.IsLittleEndian); + size_t ThisDataSize = + helper::ReadValue(m_MetadataIndex.m_Buffer, InfoStartPos, + m_Minifooter.IsLittleEndian); + + if (StartOffset < SumDataSize + ThisDataSize) + { + // discount offsets of skipped flushes + size_t Offset = StartOffset - SumDataSize; + FileManager.ReadFile(Destination, Length, ThisDataPos + Offset, + SubfileNum); + TP endRead = NOW(); + double timeRead = DURATION(startRead, endRead); + return std::make_pair(timeSubfile, timeRead); + } + SumDataSize += ThisDataSize; + } + + size_t ThisDataPos = helper::ReadValue( + m_MetadataIndex.m_Buffer, InfoStartPos, m_Minifooter.IsLittleEndian); + size_t Offset = StartOffset - SumDataSize; + FileManager.ReadFile(Destination, Length, ThisDataPos + Offset, SubfileNum); + + TP endRead = NOW(); + double timeRead = DURATION(startRead, endRead); + return std::make_pair(timeSubfile, timeRead); +} + +void DaosReader::PerformGets() +{ + auto lf_CompareReqSubfile = + [&](adios2::format::BP5Deserializer::ReadRequest &r1, + adios2::format::BP5Deserializer::ReadRequest &r2) -> bool { + return (m_WriterMap[m_WriterMapIndex[r1.Timestep]] + .RankToSubfile[r1.WriterRank] < + m_WriterMap[m_WriterMapIndex[r2.Timestep]] + .RankToSubfile[r2.WriterRank]); + }; + + // TP start = NOW(); + PERFSTUBS_SCOPED_TIMER("DaosReader::PerformGets"); + size_t maxReadSize; + + // TP startGenerate = NOW(); + auto ReadRequests = + m_BP5Deserializer->GenerateReadRequests(false, &maxReadSize); + size_t nRequest = ReadRequests.size(); + // TP endGenerate = NOW(); + // double generateTime = DURATION(startGenerate, endGenerate); + + size_t nextRequest = 0; + std::mutex mutexReadRequests; + + auto lf_GetNextRequest = [&]() -> size_t { + std::lock_guard lockGuard(mutexReadRequests); + size_t reqidx = MaxSizeT; + if (nextRequest < nRequest) + { + reqidx = nextRequest; + ++nextRequest; + } + return reqidx; + }; + + auto lf_Reader = [&](const int FileManagerID, const size_t maxOpenFiles) + -> std::tuple { + double copyTotal = 0.0; + double readTotal = 0.0; + double subfileTotal = 0.0; + size_t nReads = 0; + std::vector buf(maxReadSize); + + while (true) + { + const auto reqidx = lf_GetNextRequest(); + if (reqidx > nRequest) + { + break; + } + auto &Req = ReadRequests[reqidx]; + if (!Req.DestinationAddr) + { + Req.DestinationAddr = buf.data(); + } + std::pair t = + ReadData(fileManagers[FileManagerID], maxOpenFiles, + Req.WriterRank, Req.Timestep, Req.StartOffset, + Req.ReadLength, Req.DestinationAddr); + + TP startCopy = NOW(); + m_BP5Deserializer->FinalizeGet(Req, false); + TP endCopy = NOW(); + subfileTotal += t.first; + readTotal += t.second; + copyTotal += DURATION(startCopy, endCopy); + ++nReads; + } + return std::make_tuple(subfileTotal, readTotal, copyTotal, nReads); + }; + + // TP startRead = NOW(); + // double sortTime = 0.0; + if (m_Threads > 1 && nRequest > 1) + { + // TP startSort = NOW(); + std::sort(ReadRequests.begin(), ReadRequests.end(), + lf_CompareReqSubfile); + // TP endSort = NOW(); + // sortTime = DURATION(startSort, endSort); + size_t nThreads = (m_Threads < nRequest ? 
m_Threads : nRequest); + + size_t maxOpenFiles = helper::SetWithinLimit( + (size_t)m_Parameters.MaxOpenFilesAtOnce / nThreads, (size_t)1, + MaxSizeT); + + std::vector>> + futures(nThreads - 1); + + // launch Threads-1 threads to process subsets of requests, + // then main thread process the last subset + for (size_t tid = 0; tid < nThreads - 1; ++tid) + { + futures[tid] = std::async(std::launch::async, lf_Reader, tid + 1, + maxOpenFiles); + } + // main thread runs last subset of reads + /*auto tMain = */ lf_Reader(0, maxOpenFiles); + /*{ + double tSubfile = std::get<0>(tMain); + double tRead = std::get<1>(tMain); + double tCopy = std::get<2>(tMain); + size_t nReads = std::get<3>(tMain); + std::cout << " -> PerformGets() thread MAIN total = " + << tSubfile + tRead + tCopy << "s, subfile = " << tSubfile + << "s, read = " << tRead << "s, copy = " << tCopy + << ", nReads = " << nReads << std::endl; + }*/ + + // wait for all async threads + int tid = 1; + for (auto &f : futures) + { + /*auto t = */ f.get(); + /*double tSubfile = std::get<0>(t); + double tRead = std::get<1>(t); + double tCopy = std::get<2>(t); + size_t nReads = std::get<3>(t); + std::cout << " -> PerformGets() thread " << tid + << " total = " << tSubfile + tRead + tCopy + << "s, subfile = " << tSubfile << "s, read = " << tRead + << "s, copy = " << tCopy << ", nReads = " << nReads + << std::endl;*/ + ++tid; + } + } + else + { + size_t maxOpenFiles = helper::SetWithinLimit( + (size_t)m_Parameters.MaxOpenFilesAtOnce, (size_t)1, MaxSizeT); + std::vector buf(maxReadSize); + for (auto &Req : ReadRequests) + { + if (!Req.DestinationAddr) + { + Req.DestinationAddr = buf.data(); + } + ReadData(m_DataFileManager, maxOpenFiles, Req.WriterRank, + Req.Timestep, Req.StartOffset, Req.ReadLength, + Req.DestinationAddr); + m_BP5Deserializer->FinalizeGet(Req, false); + } + } + + // clear pending requests inside deserializer + { + std::vector empty; + m_BP5Deserializer->FinalizeGets(empty); + } + + /*TP end = NOW(); + double t1 = DURATION(start, end); + double t2 = DURATION(startRead, end); + std::cout << " -> PerformGets() total = " << t1 << "s, Read loop = " << t2 + << "s, sort = " << sortTime << "s, generate = " << generateTime + << ", nRequests = " << nRequest << std::endl;*/ +} + +// PRIVATE +void DaosReader::Init() +{ + if ((m_OpenMode != Mode::Read) && (m_OpenMode != Mode::ReadRandomAccess)) + { + helper::Throw( + "Engine", "DaosReader", "Init", + "BPFileReader only supports OpenMode::Read or " + "OpenMode::ReadRandomAccess from" + + m_Name); + } + + // if IO was involved in reading before this flag may be true now + m_IO.m_ReadStreaming = false; + m_ReaderIsRowMajor = (m_IO.m_ArrayOrder == ArrayOrdering::RowMajor); + InitParameters(); + InitTransports(); + InitDAOS(); + if (!m_Parameters.SelectSteps.empty()) + { + m_SelectedSteps.ParseSelection(m_Parameters.SelectSteps); + } + + /* Do a collective wait for the file(s) to appear within timeout. 
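+       (With the defaults resolved in InitParameters() below, that is
+       OpenTimeoutSecs = 3600 in Read mode and
+       BeginStepPollingFrequencySecs = 1, rank 0 retries the open roughly
+       once per second for up to an hour; users can shorten the wait via
+       io.SetParameter("OpenTimeoutSecs", "60").)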
+ Make sure every process comes to the same conclusion */ + const Seconds timeoutSeconds = Seconds(m_Parameters.OpenTimeoutSecs); + + Seconds pollSeconds = Seconds(m_Parameters.BeginStepPollingFrequencySecs); + if (pollSeconds > timeoutSeconds) + { + pollSeconds = timeoutSeconds; + } + + TimePoint timeoutInstant = Now() + timeoutSeconds; + OpenFiles(timeoutInstant, pollSeconds, timeoutSeconds); + UpdateBuffer(timeoutInstant, pollSeconds / 10, timeoutSeconds); +} + +void DaosReader::InitParameters() +{ + ParseParams(m_IO, m_Parameters); + if (m_Parameters.OpenTimeoutSecs < 0.0f) + { + if (m_OpenMode == Mode::ReadRandomAccess) + { + m_Parameters.OpenTimeoutSecs = 0.0f; + } + else + { + m_Parameters.OpenTimeoutSecs = 3600.0f; + } + } + + m_Threads = m_Parameters.Threads; + if (m_Threads == 0) + { + helper::Comm m_NodeComm = + m_Comm.GroupByShm("creating per-node comm at BP5 Open(read)"); + unsigned int NodeSize = static_cast(m_NodeComm.Size()); + unsigned int NodeThreadSize = helper::NumHardwareThreadsPerNode(); + if (NodeThreadSize > 0) + { + m_Threads = + helper::SetWithinLimit(NodeThreadSize / NodeSize, 1U, 16U); + } + else + { + m_Threads = helper::SetWithinLimit(8U / NodeSize, 1U, 8U); + } + } + + // Create m_Threads-1 extra file managers to be used by threads + // The main thread uses the DataFileManager pushed here to vector[0] + fileManagers.push_back(m_DataFileManager); + for (unsigned int i = 0; i < m_Threads - 1; ++i) + { + fileManagers.push_back(transportman::TransportMan( + transportman::TransportMan(m_IO, singleComm))); + } + + size_t limit = helper::RaiseLimitNoFile(); + if (m_Parameters.MaxOpenFilesAtOnce > limit - 8) + { + m_Parameters.MaxOpenFilesAtOnce = limit - 8; + } +} + +bool DaosReader::SleepOrQuit(const TimePoint &timeoutInstant, + const Seconds &pollSeconds) +{ + auto now = Now(); + if (now >= timeoutInstant) + { + return false; + } + auto remainderTime = timeoutInstant - now; + auto sleepTime = pollSeconds; + if (remainderTime < sleepTime) + { + sleepTime = remainderTime; + } + std::this_thread::sleep_for(sleepTime); + return true; +} + +size_t DaosReader::OpenWithTimeout(transportman::TransportMan &tm, + const std::vector &fileNames, + const TimePoint &timeoutInstant, + const Seconds &pollSeconds, + std::string &lasterrmsg /*INOUT*/) +{ + size_t flag = 1; // 0 = OK, opened file, 1 = timeout, 2 = error + do + { + try + { + errno = 0; + const bool profile = + false; // m_BP4Deserializer.m_Profiler.m_IsActive; + tm.OpenFiles(fileNames, adios2::Mode::Read, + m_IO.m_TransportsParameters, profile); + flag = 0; // found file + break; + } + catch (std::ios_base::failure &e) + { + lasterrmsg = + std::string("errno=" + std::to_string(errno) + ": " + e.what()); + if (errno == ENOENT) + { + flag = 1; // timeout + } + else + { + flag = 2; // fatal error + break; + } + } + } while (SleepOrQuit(timeoutInstant, pollSeconds)); + return flag; +} + +void DaosReader::OpenFiles(TimePoint &timeoutInstant, + const Seconds &pollSeconds, + const Seconds &timeoutSeconds) +{ + /* Poll */ + size_t flag = 1; // 0 = OK, opened file, 1 = timeout, 2 = error + std::string lasterrmsg; + if (m_Comm.Rank() == 0) + { + /* Open the metadata index table */ + const std::string metadataIndexFile(GetBPMetadataIndexFileName(m_Name)); + + flag = OpenWithTimeout(m_MDIndexFileManager, {metadataIndexFile}, + timeoutInstant, pollSeconds, lasterrmsg); + if (flag == 0) + { + /* Open the metadata file */ + const std::string metadataFile(GetBPMetadataFileName(m_Name)); + + /* We found md.idx. 
If we don't find md.0 immediately we should + * wait a little bit hoping for the file system to catch up. + * This slows down finding the error in file reading mode but + * it will be more robust in streaming mode + */ + if (timeoutSeconds == Seconds(0.0)) + { + timeoutInstant += Seconds(5.0); + } + + flag = OpenWithTimeout(m_MDFileManager, {metadataFile}, + timeoutInstant, pollSeconds, lasterrmsg); + if (flag != 0) + { + /* Close the metadata index table */ + m_MDIndexFileManager.CloseFiles(); + } + else + { + /* Open the metametadata file */ + const std::string metametadataFile( + GetBPMetaMetadataFileName(m_Name)); + + /* We found md.idx. If we don't find md.0 immediately we should + * wait a little bit hoping for the file system to catch up. + * This slows down finding the error in file reading mode but + * it will be more robust in streaming mode + */ + if (timeoutSeconds == Seconds(0.0)) + { + timeoutInstant += Seconds(5.0); + } + + flag = OpenWithTimeout(m_FileMetaMetadataManager, + {metametadataFile}, timeoutInstant, + pollSeconds, lasterrmsg); + if (flag != 0) + { + /* Close the metametadata index table */ + m_MDIndexFileManager.CloseFiles(); + m_MDFileManager.CloseFiles(); + } + } + } + } + + flag = m_Comm.BroadcastValue(flag, 0); + if (flag == 2) + { + if (m_Comm.Rank() == 0 && !lasterrmsg.empty()) + { + helper::Throw( + "Engine", "DaosReader", "OpenFiles", + "File " + m_Name + " cannot be opened: " + lasterrmsg); + } + else + { + helper::Throw( + "Engine", "DaosReader", "OpenFiles", + "File " + m_Name + " cannot be opened"); + } + } + else if (flag == 1) + { + if (m_Comm.Rank() == 0) + { + helper::Throw( + "Engine", "DaosReader", "OpenFiles", + "File " + m_Name + " could not be found within the " + + std::to_string(timeoutSeconds.count()) + + "s timeout: " + lasterrmsg); + } + else + { + helper::Throw( + "Engine", "DaosReader", "OpenFiles", + "File " + m_Name + " could not be found within the " + + std::to_string(timeoutSeconds.count()) + "s timeout"); + } + } + + /* At this point we may have an empty index table. + * The writer has created the file but no content may have been stored yet. 
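+     * In that case m_StepsCount stays 0; BeginStep()/CheckForNewSteps()
+     * keep re-reading the index until a step record appears or the
+     * writer's active flag is cleared.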
+ */ +} + +MinVarInfo *DaosReader::MinBlocksInfo(const VariableBase &Var, + const size_t Step) const +{ + return m_BP5Deserializer->MinBlocksInfo(Var, Step); +} + +bool DaosReader::VarShape(const VariableBase &Var, const size_t Step, + Dims &Shape) const +{ + return m_BP5Deserializer->VarShape(Var, Step, Shape); +} + +bool DaosReader::VariableMinMax(const VariableBase &Var, const size_t Step, + MinMaxStruct &MinMax) +{ + return m_BP5Deserializer->VariableMinMax(Var, Step, MinMax); +} + +void DaosReader::InitTransports() +{ + if (m_IO.m_TransportsParameters.empty()) + { + Params defaultTransportParameters; + defaultTransportParameters["transport"] = "File"; + m_IO.m_TransportsParameters.push_back(defaultTransportParameters); + } +} + +void DaosReader::InitDAOS() +{ + // Rank 0 - Connect to DAOS pool, and open container + int rc; + rc = daos_init(); + ASSERT(rc == 0, "daos_init failed with %d", rc); + + std::cout << __func__ << std::endl; + + rc = gethostname(node, sizeof(node)); + ASSERT(rc == 0, "buffer for hostname too small"); + if (m_Comm.Rank() == 0) + { + /** connect to the just created DAOS pool */ + rc = daos_pool_connect(pool_label, DSS_PSETID, + // DAOS_PC_EX , + DAOS_PC_RW /* read write access */, + &poh /* returned pool handle */, + NULL /* returned pool info */, NULL /* event */); + ASSERT(rc == 0, "pool connect failed with %d", rc); + } + + /** share pool handle with peer tasks */ + daos_handle_share(&poh, DaosReader::HANDLE_POOL); + + if (m_Comm.Rank() == 0) + { + /** open container */ + rc = daos_cont_open(poh, cont_label, DAOS_COO_RW, &coh, NULL, NULL); + ASSERT(rc == 0, "container open failed with %d", rc); + } + + /** share container handle with peer tasks */ + daos_handle_share(&coh, HANDLE_CO); + + if (m_Comm.Rank() == 0) + { + FILE *fp = fopen("./share/oid.txt", "r"); + if (fp == NULL) + { + perror("fopen"); + exit(1); + } + if (fscanf(fp, "%" SCNu64 "\n%" SCNu64 "\n", &oid.hi, &oid.lo) != 2) + { + fprintf(stderr, "Error reading OID from file\n"); + exit(1); + } + fclose(fp); + } + + // Rank 0 will broadcast the DAOS KV OID + MPI_Bcast(&oid.hi, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD); + MPI_Bcast(&oid.lo, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD); + + // Open KV object + rc = daos_kv_open(coh, oid, 0, &oh, NULL); + ASSERT(rc == 0, "daos_kv_open failed with %d", rc); +} + +void DaosReader::InstallMetaMetaData(format::BufferSTL buffer) +{ + size_t Position = m_MetaMetaDataFileAlreadyProcessedSize; + while (Position < buffer.m_Buffer.size()) + { + format::BP5Base::MetaMetaInfoBlock MMI; + + MMI.MetaMetaIDLen = helper::ReadValue( + buffer.m_Buffer, Position, m_Minifooter.IsLittleEndian); + MMI.MetaMetaInfoLen = helper::ReadValue( + buffer.m_Buffer, Position, m_Minifooter.IsLittleEndian); + MMI.MetaMetaID = buffer.Data() + Position; + MMI.MetaMetaInfo = buffer.Data() + Position + MMI.MetaMetaIDLen; + m_BP5Deserializer->InstallMetaMetaData(MMI); + Position += MMI.MetaMetaIDLen + MMI.MetaMetaInfoLen; + } + m_MetaMetaDataFileAlreadyProcessedSize = Position; +} + +void DaosReader::UpdateBuffer(const TimePoint &timeoutInstant, + const Seconds &pollSeconds, + const Seconds &timeoutSeconds) +{ + size_t newIdxSize = 0; + m_MetadataIndex.Reset(true, false); + if (m_Comm.Rank() == 0) + { + /* Read metadata index table into memory */ + const size_t metadataIndexFileSize = + m_MDIndexFileManager.GetFileSize(0); + newIdxSize = metadataIndexFileSize - m_MDIndexFileAlreadyReadSize; + if (metadataIndexFileSize > m_MDIndexFileAlreadyReadSize) + { + 
m_MetadataIndex.m_Buffer.resize(newIdxSize); + m_MDIndexFileManager.ReadFile(m_MetadataIndex.m_Buffer.data(), + newIdxSize, + m_MDIndexFileAlreadyReadSize); + } + else + { + m_MetadataIndex.m_Buffer.resize(0); + } + } + + // broadcast metadata index buffer to all ranks from zero + m_Comm.BroadcastVector(m_MetadataIndex.m_Buffer); + newIdxSize = m_MetadataIndex.m_Buffer.size(); + + size_t parsedIdxSize = 0; + const auto stepsBefore = m_StepsCount; + if (newIdxSize > 0) + { + /* Parse metadata index table */ + const bool hasHeader = (!m_MDIndexFileAlreadyReadSize); + parsedIdxSize = ParseMetadataIndex(m_MetadataIndex, 0, hasHeader); + // now we are sure the index header has been parsed, + // first step parsing done + // m_FilteredMetadataInfo is created + + // cut down the index buffer by throwing away the read but unprocessed + // steps + m_MetadataIndex.m_Buffer.resize(parsedIdxSize); + // next time read index file from this position + m_MDIndexFileAlreadyReadSize += parsedIdxSize; + + // At this point first in time we learned the writer's major and we can + // create the serializer object + if (!m_BP5Deserializer) + { + m_BP5Deserializer = new format::BP5Deserializer( + m_WriterIsRowMajor, m_ReaderIsRowMajor, + (m_OpenMode == Mode::ReadRandomAccess)); + m_BP5Deserializer->m_Engine = this; + } + } + + if (m_StepsCount > stepsBefore) + { + m_Metadata.Reset(true, false); + m_MetaMetadata.Reset(true, false); + if (m_Comm.Rank() == 0) + { + // // How much metadata do we need to read? + // size_t fileFilteredSize = 0; + // for (auto p : m_FilteredMetadataInfo) + // { + // fileFilteredSize += p.second; + // } + // + // /* Read metadata file into memory but first make sure + // * it has the content that the index table refers to + // */ + // auto p = m_FilteredMetadataInfo.back(); + // uint64_t expectedMinFileSize = p.first + p.second; + // size_t actualFileSize = 0; + // do + // { + // actualFileSize = m_MDFileManager.GetFileSize(0); + // if (actualFileSize >= expectedMinFileSize) + // { + // break; + // } + // } while (SleepOrQuit(timeoutInstant, pollSeconds)); + // + // if (actualFileSize >= expectedMinFileSize) + // { + // m_Metadata.Resize(fileFilteredSize, + // "allocating metadata buffer, " + // "in call to DaosReader Open"); + // size_t mempos = 0; + // for (auto p : m_FilteredMetadataInfo) + // { + // m_MDFileManager.ReadFile( + // m_Metadata.m_Buffer.data() + mempos, + // p.second, p.first); + // mempos += p.second; + // } + // m_MDFileAlreadyReadSize = expectedMinFileSize; + // } + // else + // { + // helper::Throw( + // "Engine", "DaosReader", "UpdateBuffer", + // "File " + m_Name + + // " was found with an index file but md.0 " + // "has not contained enough data within " + // "the specified timeout of " + + // std::to_string(timeoutSeconds.count()) + + // " seconds. index size = " + + // std::to_string(newIdxSize) + " metadata + // size = " + std::to_string(actualFileSize) + // + " expected size = " + + // std::to_string(expectedMinFileSize) + + // ". 
One reason could be if the reader finds + // old " "data " "while " "the writer is + // creating the new files."); + // } + + /* Read new meta-meta-data into memory and append to existing one in + * memory */ + const size_t metametadataFileSize = + m_FileMetaMetadataManager.GetFileSize(0); + if (metametadataFileSize > m_MetaMetaDataFileAlreadyReadSize) + { + const size_t newMMDSize = + metametadataFileSize - m_MetaMetaDataFileAlreadyReadSize; + m_MetaMetadata.Resize(metametadataFileSize, + "(re)allocating meta-meta-data buffer, " + "in call to DaosReader Open"); + m_FileMetaMetadataManager.ReadFile( + m_MetaMetadata.m_Buffer.data() + + m_MetaMetaDataFileAlreadyReadSize, + newMMDSize, m_MetaMetaDataFileAlreadyReadSize); + m_MetaMetaDataFileAlreadyReadSize += newMMDSize; + } + } + + // broadcast buffer to all ranks from zero + // m_Comm.BroadcastVector(m_Metadata.m_Buffer); + + // broadcast metadata index buffer to all ranks from zero + m_Comm.BroadcastVector(m_MetaMetadata.m_Buffer); + + InstallMetaMetaData(m_MetaMetadata); + + if (m_OpenMode == Mode::ReadRandomAccess) + { + for (size_t Step = 0; Step < m_MetadataIndexTable.size(); Step++) + { + m_BP5Deserializer->SetupForStep( + Step, m_WriterMap[m_WriterMapIndex[Step]].WriterCount); + InstallMetadataForTimestep(Step); + } + } + } +} + +size_t DaosReader::ParseMetadataIndex(format::BufferSTL &bufferSTL, + const size_t absoluteStartPos, + const bool hasHeader) +{ + const auto &buffer = bufferSTL.m_Buffer; + size_t &position = bufferSTL.m_Position; + + if (hasHeader) + { + // Read header (64 bytes) + // long version string + position = m_VersionTagPosition; + m_Minifooter.VersionTag.assign(&buffer[position], m_VersionTagLength); + + position = m_EndianFlagPosition; + const uint8_t endianness = helper::ReadValue(buffer, position); + m_Minifooter.IsLittleEndian = (endianness == 0) ? true : false; +#ifndef ADIOS2_HAVE_ENDIAN_REVERSE + if (helper::IsLittleEndian() != m_Minifooter.IsLittleEndian) + { + helper::Throw( + "Engine", "DaosReader", "ParseMetadataIndex", + "reader found BigEndian bp file, " + "this version of ADIOS2 wasn't compiled " + "with the cmake flag -DADIOS2_USE_Endian_Reverse=ON " + "explicitly, in call to Open"); + } +#endif + + // This has no flag in BP5 header. Always true + m_Minifooter.HasSubFiles = true; + + // BP version + position = m_BPVersionPosition; + m_Minifooter.Version = helper::ReadValue( + buffer, position, m_Minifooter.IsLittleEndian); + if (m_Minifooter.Version != 5) + { + helper::Throw( + "Engine", "DaosReader", "ParseMetadataIndex", + "ADIOS2 BP5 Engine only supports bp format " + "version 5, found " + + std::to_string(m_Minifooter.Version) + " version"); + } + + // BP minor version, unused + position = m_BPMinorVersionPosition; + const uint8_t minorversion = helper::ReadValue( + buffer, position, m_Minifooter.IsLittleEndian); + if (minorversion != m_BP5MinorVersion) + { + helper::Throw( + "Engine", "DaosReader", "ParseMetadataIndex", + "Current ADIOS2 BP5 Engine only supports version 5." + + std::to_string(m_BP5MinorVersion) + ", found 5." + + std::to_string(minorversion) + " version"); + } + + // Writer active flag + position = m_ActiveFlagPosition; + const char activeChar = helper::ReadValue( + buffer, position, m_Minifooter.IsLittleEndian); + m_WriterIsActive = (activeChar == '\1' ? 
true : false);
+
+        position = m_ColumnMajorFlagPosition;
+        const uint8_t val = helper::ReadValue<uint8_t>(
+            buffer, position, m_Minifooter.IsLittleEndian);
+        m_WriterIsRowMajor = val == 'n';
+        // move position to first row
+        position = m_IndexHeaderSize;
+    }
+
+    // set a limit for metadata size in streaming mode
+    size_t maxMetadataSizeInMemory = adios2::MaxSizeT;
+    if (m_OpenMode == Mode::Read)
+    {
+        maxMetadataSizeInMemory = 16777216; // 16MB
+    }
+    size_t metadataSizeToRead = 0;
+
+    // Read each record now
+    uint64_t MetadataPosTotalSkip = 0;
+    m_MetadataIndexTable.clear();
+    m_FilteredMetadataInfo.clear();
+    uint64_t minfo_pos = 0;
+    uint64_t minfo_size = 0;
+    int n = 0;    // a loop counter for current run
+    int nrec = 0; // number of records in current run
+
+    while (position < buffer.size() &&
+           metadataSizeToRead < maxMetadataSizeInMemory)
+    {
+
+        const unsigned char recordID = helper::ReadValue<unsigned char>(
+            buffer, position, m_Minifooter.IsLittleEndian);
+        const uint64_t recordLength = helper::ReadValue<uint64_t>(
+            buffer, position, m_Minifooter.IsLittleEndian);
+        const size_t dbgRecordStartPosition = position;
+
+        switch (recordID)
+        {
+        case IndexRecord::WriterMapRecord:
+        {
+            auto p = m_WriterMap.emplace(m_StepsCount, WriterMapStruct());
+            auto &s = p.first->second;
+            s.WriterCount = helper::ReadValue<uint64_t>(
+                buffer, position, m_Minifooter.IsLittleEndian);
+            s.AggregatorCount = helper::ReadValue<uint64_t>(
+                buffer, position, m_Minifooter.IsLittleEndian);
+            s.SubfileCount = helper::ReadValue<uint64_t>(
+                buffer, position, m_Minifooter.IsLittleEndian);
+            // Get the process -> subfile map
+            s.RankToSubfile.reserve(s.WriterCount);
+            for (uint64_t i = 0; i < s.WriterCount; i++)
+            {
+                const uint64_t subfileIdx = helper::ReadValue<uint64_t>(
+                    buffer, position, m_Minifooter.IsLittleEndian);
+                s.RankToSubfile.push_back(subfileIdx);
+            }
+            m_LastMapStep = m_StepsCount;
+            m_LastWriterCount = s.WriterCount;
+            break;
+        }
+        case IndexRecord::StepRecord:
+        {
+            std::vector<uint64_t> ptrs;
+            const uint64_t MetadataPos = helper::ReadValue<uint64_t>(
+                buffer, position, m_Minifooter.IsLittleEndian);
+            const uint64_t MetadataSize = helper::ReadValue<uint64_t>(
+                buffer, position, m_Minifooter.IsLittleEndian);
+            const uint64_t FlushCount = helper::ReadValue<uint64_t>(
+                buffer, position, m_Minifooter.IsLittleEndian);
+
+            if (!n)
+            {
+                minfo_pos = MetadataPos; // initialize minfo_pos properly
+                MetadataPosTotalSkip = MetadataPos;
+            }
+
+            if (m_SelectedSteps.IsSelected(m_AbsStepsInFile))
+            {
+                m_WriterMapIndex.push_back(m_LastMapStep);
+
+                // pos in metadata in memory
+                ptrs.push_back(MetadataPos - MetadataPosTotalSkip);
+                ptrs.push_back(MetadataSize);
+                ptrs.push_back(FlushCount);
+                ptrs.push_back(position);
+                // absolute pos in file before read
+                ptrs.push_back(MetadataPos);
+                m_MetadataIndexTable[m_StepsCount] = ptrs;
+#ifdef DUMPDATALOCINFO
+                for (uint64_t i = 0; i < m_WriterCount; i++)
+                {
+                    size_t DataPosPos = ptrs[3];
+                    std::cout << "Writer " << i << " data at ";
+                    for (uint64_t j = 0; j < FlushCount; j++)
+                    {
+                        const uint64_t DataPos = helper::ReadValue<uint64_t>(
+                            buffer, DataPosPos, m_Minifooter.IsLittleEndian);
+                        const uint64_t DataSize = helper::ReadValue<uint64_t>(
+                            buffer, DataPosPos, m_Minifooter.IsLittleEndian);
+                        std::cout << "loc:" << DataPos << " siz:" << DataSize
+                                  << "; ";
+                    }
+                    const uint64_t DataPos = helper::ReadValue<uint64_t>(
+                        buffer, DataPosPos, m_Minifooter.IsLittleEndian);
+                    std::cout << "loc:" << DataPos << std::endl;
+                }
+#endif
+                minfo_size += MetadataSize;
+                metadataSizeToRead += MetadataSize;
+                m_StepsCount++;
+            }
+            else
+            {
+                MetadataPosTotalSkip += MetadataSize;
+                if (minfo_size > 0)
+                {
m_FilteredMetadataInfo.push_back( + std::make_pair(minfo_pos, minfo_size)); + } + minfo_pos = MetadataPos + MetadataSize; + minfo_size = 0; + } + + // skip over the writer -> data file offset records + position += + sizeof(uint64_t) * m_LastWriterCount * ((2 * FlushCount) + 1); + ++m_AbsStepsInFile; + ++n; + break; + } + } + // dbg + if ((position - dbgRecordStartPosition) != (size_t)recordLength) + { + helper::Throw( + "Engine", "DaosReader", "ParseMetadataIndex", + "Record " + std::to_string(nrec) + " (id = " + + std::to_string(recordID) + ") has invalid length " + + std::to_string(recordLength) + ". We parsed " + + std::to_string(position - dbgRecordStartPosition) + + " bytes for this record" + + ); + } + ++nrec; + } + if (minfo_size > 0) + { + m_FilteredMetadataInfo.push_back(std::make_pair(minfo_pos, minfo_size)); + } + + return position; +} + +bool DaosReader::ReadActiveFlag(std::vector &buffer) +{ + if (buffer.size() < m_ActiveFlagPosition) + { + helper::Throw( + "Engine", "DaosReader", "ReadActiveFlag", + "called with a buffer smaller than required"); + } + // Writer active flag + size_t position = m_ActiveFlagPosition; + const char activeChar = helper::ReadValue( + buffer, position, m_Minifooter.IsLittleEndian); + m_WriterIsActive = (activeChar == '\1' ? true : false); + return m_WriterIsActive; +} + +bool DaosReader::CheckWriterActive() +{ + size_t flag = 1; + if (m_Comm.Rank() == 0) + { + auto fsize = m_MDIndexFileManager.GetFileSize(0); + if (fsize >= m_IndexHeaderSize) + { + std::vector header(m_IndexHeaderSize, '\0'); + m_MDIndexFileManager.ReadFile(header.data(), m_IndexHeaderSize, 0, + 0); + bool active = ReadActiveFlag(header); + flag = (active ? 1 : 0); + } + } + flag = m_Comm.BroadcastValue(flag, 0); + m_WriterIsActive = (flag > 0); + return m_WriterIsActive; +} + +StepStatus DaosReader::CheckForNewSteps(Seconds timeoutSeconds) +{ + /* Do a collective wait for a step within timeout. + Make sure every reader comes to the same conclusion */ + StepStatus retval = StepStatus::OK; + + if (timeoutSeconds < Seconds::zero()) + { + timeoutSeconds = Seconds(999999999); // max 1 billion seconds wait + } + const TimePoint timeoutInstant = Now() + timeoutSeconds; + + auto pollSeconds = Seconds(m_Parameters.BeginStepPollingFrequencySecs); + if (pollSeconds > timeoutSeconds) + { + pollSeconds = timeoutSeconds; + } + + /* Poll */ + const auto stepsBefore = m_StepsCount; + do + { + UpdateBuffer(timeoutInstant, pollSeconds / 10, timeoutSeconds); + if (m_StepsCount > stepsBefore) + { + break; + } + if (!CheckWriterActive()) + { + /* Race condition: When checking data in UpdateBuffer, new + * step(s) may have not arrived yet. When checking active flag, + * the writer may have completed write and terminated. So we may + * have missed a step or two. 
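+             * The extra UpdateBuffer() call below is the mitigation: it
+             * re-reads the index one last time after the inactive flag is
+             * observed, so steps that arrived between the two checks are
+             * still picked up before polling stops.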
             */
+            UpdateBuffer(timeoutInstant, pollSeconds / 10, timeoutSeconds);
+            break;
+        }
+    } while (SleepOrQuit(timeoutInstant, pollSeconds));
+
+    if (m_StepsCount > stepsBefore)
+    {
+        /* we have got new steps and new metadata in memory */
+        retval = StepStatus::OK;
+    }
+    else
+    {
+        m_IO.m_ReadStreaming = false;
+        if (m_WriterIsActive)
+        {
+            retval = StepStatus::NotReady;
+        }
+        else
+        {
+            retval = StepStatus::EndOfStream;
+        }
+    }
+    return retval;
+}
+
+void DaosReader::DoGetAbsoluteSteps(const VariableBase &variable,
+                                    std::vector<size_t> &keys) const
+{
+    m_BP5Deserializer->GetAbsoluteSteps(variable, keys);
+    return;
+}
+
+#define declare_type(T)                                                        \
+    void DaosReader::DoGetSync(Variable<T> &variable, T *data)                 \
+    {                                                                          \
+        PERFSTUBS_SCOPED_TIMER("DaosReader::Get");                             \
+        GetSyncCommon(variable, data);                                         \
+    }                                                                          \
+    void DaosReader::DoGetDeferred(Variable<T> &variable, T *data)             \
+    {                                                                          \
+        PERFSTUBS_SCOPED_TIMER("DaosReader::Get");                             \
+        GetDeferredCommon(variable, data);                                     \
+    }
+ADIOS2_FOREACH_STDTYPE_1ARG(declare_type)
+#undef declare_type
+
+void DaosReader::DoGetStructSync(VariableStruct &variable, void *data)
+{
+    PERFSTUBS_SCOPED_TIMER("DaosReader::Get");
+    GetSyncCommon(variable, data);
+}
+
+void DaosReader::DoGetStructDeferred(VariableStruct &variable, void *data)
+{
+    PERFSTUBS_SCOPED_TIMER("DaosReader::Get");
+    GetDeferredCommon(variable, data);
+}
+
+void DaosReader::DoClose(const int transportIndex)
+{
+    PERFSTUBS_SCOPED_TIMER("DaosReader::Close");
+    if (m_OpenMode == Mode::ReadRandomAccess)
+    {
+        PerformGets();
+    }
+    else if (m_BetweenStepPairs)
+    {
+        EndStep();
+    }
+    m_DataFileManager.CloseFiles();
+    m_MDFileManager.CloseFiles();
+    m_MDIndexFileManager.CloseFiles();
+    m_FileMetaMetadataManager.CloseFiles();
+    for (unsigned int i = 1; i < m_Threads; ++i)
+    {
+        fileManagers[i].CloseFiles();
+    }
+}
+
+size_t DaosReader::DoSteps() const { return m_StepsCount; }
+
+void DaosReader::NotifyEngineNoVarsQuery()
+{
+    if (!m_BetweenStepPairs)
+    {
+        helper::Throw<std::logic_error>(
+            "Engine", "DaosReader", "NotifyEngineNoVarsQuery",
+            "You've called InquireVariable() when the IO is empty and "
+            "outside a BeginStep/EndStep pair. If this is code that is "
+            "newly transitioning to the BP5 file engine, you may be relying "
+            "upon deprecated behaviour. If you intend to use ADIOS using the "
+            "Begin/EndStep interface, move all InquireVariable calls inside "
+            "the BeginStep/EndStep pair. 
If intending to use " + "random-access " + "file mode, change your Open() mode parameter to " + "Mode::ReadRandomAccess."); + } +} + +void DaosReader::daos_handle_share(daos_handle_t *hdl, int type) +{ + d_iov_t ghdl = {NULL, 0, 0}; + int rc; + + if (m_Comm.Rank() == 0) + { + /** fetch size of global handle */ + if (type == DaosReader::HANDLE_POOL) + rc = daos_pool_local2global(*hdl, &ghdl); + else + rc = daos_cont_local2global(*hdl, &ghdl); + ASSERT(rc == 0, "local2global failed with %d", rc); + } + + /** broadcast size of global handle to all peers */ + MPI_Bcast(&ghdl.iov_buf_len, 1, MPI_UINT64_T, 0, MPI_COMM_WORLD); + + /** allocate buffer for global pool handle */ + ghdl.iov_buf = malloc(ghdl.iov_buf_len); + ghdl.iov_len = ghdl.iov_buf_len; + + if (m_Comm.Rank() == 0) + { + /** generate actual global handle to share with peer tasks */ + if (type == DaosReader::HANDLE_POOL) + rc = daos_pool_local2global(*hdl, &ghdl); + else + rc = daos_cont_local2global(*hdl, &ghdl); + ASSERT(rc == 0, "local2global failed with %d", rc); + } + + /** broadcast global handle to all peers */ + MPI_Bcast(ghdl.iov_buf, ghdl.iov_len, MPI_BYTE, 0, MPI_COMM_WORLD); + + if (m_Comm.Rank() != 0) + { + /** unpack global handle */ + if (type == DaosReader::HANDLE_POOL) + { + /* NB: Only pool_global2local are different */ + rc = daos_pool_global2local(ghdl, hdl); + } + else + { + rc = daos_cont_global2local(poh, ghdl, hdl); + } + ASSERT(rc == 0, "global2local failed with %d", rc); + } + + free(ghdl.iov_buf); + + MPI_Barrier(MPI_COMM_WORLD); +} + +} // end namespace engine +} // end namespace core +} // end namespace adios2 diff --git a/source/adios2/engine/daos/DaosReader.h b/source/adios2/engine/daos/DaosReader.h new file mode 100644 index 0000000000..57e955e8ae --- /dev/null +++ b/source/adios2/engine/daos/DaosReader.h @@ -0,0 +1,309 @@ +/* + * Distributed under the OSI-approved Apache License, Version 2.0. See + * accompanying file Copyright.txt for details. + * + * DaosReader.h + * + */ + +#ifndef ADIOS2_ENGINE_DAOS_DAOSREADER_H_ +#define ADIOS2_ENGINE_DAOS_DAOSREADER_H_ +#define DSS_PSETID "daos_server" + +#include "adios2/common/ADIOSConfig.h" +#include "adios2/core/CoreTypes.h" +#include "adios2/core/Engine.h" +#include "adios2/engine/daos/DaosEngine.h" +#include "adios2/helper/adiosComm.h" +#include "adios2/helper/adiosRangeFilter.h" +#include "adios2/toolkit/format/bp5/BP5Deserializer.h" +#include "adios2/toolkit/transportman/TransportMan.h" + +#include +#include +#include +#include +#include +#include + +#define FAIL(fmt, ...) \ + do \ + { \ + fprintf(stderr, "Process %d(%s): " fmt " aborting\n", m_Comm.Rank(), \ + node, ##__VA_ARGS__); \ + MPI_Abort(MPI_COMM_WORLD, 1); \ + } while (0) +#define ASSERT(cond, ...) 
\ + do \ + { \ + if (!(cond)) \ + FAIL(__VA_ARGS__); \ + } while (0) + +namespace adios2 +{ +namespace core +{ +namespace engine +{ + +class DaosReader : public DaosEngine, public Engine +{ + +public: + /** + * Unique constructor + * @param io + * @param name + * @param openMode only read + * @param comm + */ + DaosReader(IO &io, const std::string &name, const Mode mode, + helper::Comm comm); + + ~DaosReader(); + + StepStatus BeginStep(StepMode mode = StepMode::Read, + const float timeoutSeconds = -1.0) final; + + size_t CurrentStep() const final; + + void EndStep() final; + + void PerformGets() final; + + MinVarInfo *MinBlocksInfo(const VariableBase &, const size_t Step) const; + bool VarShape(const VariableBase &Var, const size_t Step, + Dims &Shape) const; + bool VariableMinMax(const VariableBase &, const size_t Step, + MinMaxStruct &MinMax); + +private: + format::BP5Deserializer *m_BP5Deserializer = nullptr; + /* transport manager for metadata file */ + transportman::TransportMan m_MDFileManager; + /* How many bytes of metadata have we already read in? */ + size_t m_MDFileAlreadyReadSize = 0; + /* How many bytes of metadata have we already processed? + * It is <= m_MDFileAlreadyReadSize, at = we need to read more */ + size_t m_MDFileProcessedSize = 0; + /* The file position of the first byte that is currently + * residing in memory. Needed for skewing positions when + * processing metadata index. + */ + size_t m_MDFileAbsolutePos = 0; + /* m_MDFileAbsolutePos <= m_MDFileProcessedSize <= m_MDFileAlreadyReadSize + */ + + /* transport manager for managing data file(s) */ + transportman::TransportMan m_DataFileManager; + + /* transport manager for managing the metadata index file */ + transportman::TransportMan m_MDIndexFileManager; + /* transport manager for managing the metadata index file */ + transportman::TransportMan m_FileMetaMetadataManager; + /* How many bytes of metadata index have we already read in? */ + size_t m_MDIndexFileAlreadyReadSize = 0; + + /* How many bytes of meta-metadata have we already read in? */ + size_t m_MetaMetaDataFileAlreadyReadSize = 0; + /* How many bytes of meta-metadata have we already processed? 
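+     * (Assumed invariant, mirroring the metadata-file members above:
+     * it is <= m_MetaMetaDataFileAlreadyReadSize, and when the two are
+     * equal more meta-metadata must be read from the file.)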
*/ + size_t m_MetaMetaDataFileAlreadyProcessedSize = 0; + + /* transport manager for managing the active flag file */ + transportman::TransportMan m_ActiveFlagFileManager; + bool m_WriterIsActive = true; + + /* DAOS declarations */ + + uuid_t pool_uuid, cont_uuid; + char *pool_label = "pool_ranjansv"; + char *cont_label = "adios-daos-engine-cont"; + + /* Declare variables for pool and container handles */ + daos_handle_t poh, coh; + + enum DAOS_handleType + { + HANDLE_POOL, + HANDLE_CO, + }; + + /* Declare variables for the KV object */ + daos_handle_t oh; + daos_obj_id_t oid; + + char node[128] = "unknown"; + + /** used for per-step reads, TODO: to be moved to BP5Deserializer */ + size_t m_CurrentStep = 0; + size_t m_StepsCount = 0; + size_t m_AbsStepsInFile = 0; // all steps parsed including unselected + uint64_t m_LastMapStep = 0; // remember last step that had writer map + uint64_t m_LastWriterCount = 0; // remember writer count in that step + bool m_FirstStep = true; + + /** used to filter steps */ + helper::RangeFilter m_SelectedSteps; + + // offset/size pairs to read sections of metadata from file in InitBuffer + std::vector> m_FilteredMetadataInfo; + + Minifooter m_Minifooter; + + void Init(); + void InitParameters(); + void InitTransports(); + + /** DAOS pool connection and container opening */ + void InitDAOS(); + + /* Sleep up to pollSeconds time if we have not reached timeoutInstant. + * Return true if slept + * return false if sleep was not needed because it was overtime + */ + bool SleepOrQuit(const TimePoint &timeoutInstant, + const Seconds &pollSeconds); + /** Open one category of files within timeout. + * @return: 0 = OK, 1 = timeout, 2 = error + * lasterrmsg contains the error message in case of error + */ + size_t OpenWithTimeout(transportman::TransportMan &tm, + const std::vector &fileNames, + const TimePoint &timeoutInstant, + const Seconds &pollSeconds, + std::string &lasterrmsg /*INOUT*/); + + /** Open files within timeout. + * @return True if files are opened, False in case of timeout + */ + void OpenFiles(TimePoint &timeoutInstant, const Seconds &pollSeconds, + const Seconds &timeoutSeconds); + + /** Read in metadata if exist (throwing away old). + * It reads and parses metadata-index, and reads metadata into memory. + * In streaming mode, only a limited size of metadata is read in. + * Changes in m_StepsCount before and after calling can be used to + * track if new steps (after filtering with SelectSteps) are read in + * and are ready to be processed. + */ + void UpdateBuffer(const TimePoint &timeoutInstant, + const Seconds &pollSeconds, + const Seconds &timeoutSeconds); + + bool ReadActiveFlag(std::vector &buffer); + + /* Parse metadata. + * + * Return the size of metadataindex where parsing stopped. In streaming mode + * parsing is limited to read only a certain size of metadata at once. + * + * As a side effect, the following variables are filled out: + * m_MetadataIndexTable + * m_WriterMapIndex + * m_FilteredMetadataInfo + */ + size_t ParseMetadataIndex(format::BufferSTL &bufferSTL, + const size_t absoluteStartPos, + const bool hasHeader); + + /** Process the new metadata coming in (in UpdateBuffer) + * @param newIdxSize: the size of the new content from Index Table + */ + void ProcessMetadataForNewSteps(const size_t newIdxSize); + + /** Check the active status of the writer. + * @return true if writer is still active. + * It sets m_WriterIsActive. 
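+     * Collective call: rank 0 reads the flag from the index header and
+     * broadcasts it, so every rank reaches the same conclusion.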
+     */
+    bool CheckWriterActive();
+
+    /** Check for a step that is already in memory but has not
+     * been processed yet.
+     * @return true: if new step has been found and processed, false otherwise
+     * Used by CheckForNewSteps() to get the next step from memory if there is
+     * one.
+     */
+    bool ProcessNextStepInMemory();
+
+    /** Check for new steps within timeout and only if writer is active.
+     * @return the status flag
+     * Used by BeginStep() to get new steps from file when it reaches the
+     * end of steps in memory.
+     */
+    StepStatus CheckForNewSteps(Seconds timeoutSeconds);
+
+    /** Notify the engine when InquireVariable is called when the IO is empty.
+     * Called from IO.tcc
+     */
+    void NotifyEngineNoVarsQuery();
+
+#define declare_type(T)                                                        \
+    void DoGetSync(Variable<T> &, T *) final;                                  \
+    void DoGetDeferred(Variable<T> &, T *) final;
+    ADIOS2_FOREACH_STDTYPE_1ARG(declare_type)
+#undef declare_type
+
+    void DoClose(const int transportIndex = -1) final;
+
+    void GetSyncCommon(VariableBase &variable, void *data);
+
+    void GetDeferredCommon(VariableBase &variable, void *data);
+
+    void DoGetStructSync(VariableStruct &, void *);
+    void DoGetStructDeferred(VariableStruct &, void *);
+
+    template <class T>
+    void ReadVariableBlocks(Variable<T> &variable);
+
+    size_t DoSteps() const final;
+
+    void DoGetAbsoluteSteps(const VariableBase &variable,
+                            std::vector<size_t> &keys) const final;
+
+    uint32_t m_WriterColumnMajor = 0;
+    bool m_ReaderIsRowMajor = true;
+    bool m_WriterIsRowMajor = true;
+
+    format::BufferSTL m_MetadataIndex;
+    format::BufferSTL m_MetaMetadata;
+    format::BufferSTL m_Metadata;
+
+    void InstallMetaMetaData(format::BufferSTL MetaMetadata);
+    void InstallMetadataForTimestep(size_t Step);
+    std::pair<double, double>
+    ReadData(adios2::transportman::TransportMan &FileManager,
+             const size_t maxOpenFiles, const size_t WriterRank,
+             const size_t Timestep, const size_t StartOffset,
+             const size_t Length, char *Destination);
+
+    struct WriterMapStruct
+    {
+        uint32_t WriterCount = 0;
+        uint32_t AggregatorCount = 0;
+        uint32_t SubfileCount = 0;
+        std::vector<uint64_t> RankToSubfile; // size WriterCount
+    };
+
+    // step -> writermap but not for all steps
+    std::map<uint64_t, WriterMapStruct> m_WriterMap;
+    // step -> writermap index (for all steps)
+    std::vector<uint64_t> m_WriterMapIndex;
+
+    void DestructorClose(bool Verbose) noexcept;
+
+    /* Communicator connecting ranks on each Compute Node.
+       Only used to calculate the number of threads available for reading */
+    helper::Comm m_NodeComm;
+    helper::Comm singleComm;
+    unsigned int m_Threads;
+    std::vector<transportman::TransportMan> fileManagers; // manager per thread
+
+    void daos_handle_share(daos_handle_t *, int);
+};
+
+} // end namespace engine
+} // end namespace core
+} // end namespace adios2
+
+#endif /* ADIOS2_ENGINE_DAOS_DAOSREADER_H_ */
diff --git a/source/adios2/engine/daos/DaosReader.tcc b/source/adios2/engine/daos/DaosReader.tcc
new file mode 100644
index 0000000000..b4d47c9a76
--- /dev/null
+++ b/source/adios2/engine/daos/DaosReader.tcc
@@ -0,0 +1,39 @@
+/*
+ * Distributed under the OSI-approved Apache License, Version 2.0. See
+ * accompanying file Copyright.txt for details.
+ * + * DaosReader.tcc + * + */ + +#ifndef ADIOS2_ENGINE_DAOS_DAOSREADER_TCC_ +#define ADIOS2_ENGINE_DAOS_DAOSREADER_TCC_ + +#include "DaosReader.h" + +#include "adios2/helper/adiosFunctions.h" + +namespace adios2 +{ +namespace core +{ +namespace engine +{ + +inline void DaosReader::GetSyncCommon(VariableBase &variable, void *data) +{ + bool need_sync = m_BP5Deserializer->QueueGet(variable, data); + if (need_sync) + PerformGets(); +} + +void DaosReader::GetDeferredCommon(VariableBase &variable, void *data) +{ + (void)m_BP5Deserializer->QueueGet(variable, data); +} + +} // end namespace engine +} // end namespace core +} // end namespace adios2 + +#endif /* ADIOS2_ENGINE_DAOS_DAOSREADER_TCC_ */ diff --git a/source/adios2/engine/daos/DaosWriter.cpp b/source/adios2/engine/daos/DaosWriter.cpp new file mode 100644 index 0000000000..7adb0a169c --- /dev/null +++ b/source/adios2/engine/daos/DaosWriter.cpp @@ -0,0 +1,2014 @@ +/* + * Distributed under the OSI-approved Apache License, Version 2.0. See + * accompanying file Copyright.txt for details. + * + * DaosWriter.cpp + * + */ + +#include "DaosWriter.h" +#include "DaosWriter.tcc" + +#include "adios2/common/ADIOSMacros.h" +#include "adios2/core/IO.h" +#include "adios2/helper/adiosFunctions.h" //CheckIndexRange +#include "adios2/helper/adiosMath.h" // SetWithinLimit +#include "adios2/helper/adiosMemory.h" // NdCopy +#include "adios2/toolkit/format/buffer/chunk/ChunkV.h" +#include "adios2/toolkit/format/buffer/malloc/MallocV.h" +#include "adios2/toolkit/transport/file/FileFStream.h" +#include + +#include +#include // setw +#include +#include // make_shared + +namespace adios2 +{ +namespace core +{ +namespace engine +{ + +using namespace adios2::format; + +DaosWriter::DaosWriter(IO &io, const std::string &name, const Mode mode, + helper::Comm comm) +: Engine("DaosWriter", io, name, mode, std::move(comm)), m_BP5Serializer(), + m_FileDataManager(io, m_Comm), m_FileMetadataManager(io, m_Comm), + m_FileMetadataIndexManager(io, m_Comm), m_FileMetaMetadataManager(io, m_Comm), + m_Profiler(m_Comm) +{ + m_EngineStart = Now(); + PERFSTUBS_SCOPED_TIMER("DaosWriter::Open"); + m_IO.m_ReadStreaming = false; + + Init(); + m_IsOpen = true; +} + +StepStatus DaosWriter::BeginStep(StepMode mode, const float timeoutSeconds) +{ + if (m_BetweenStepPairs) + { + helper::Throw("Engine", "DaosWriter", "BeginStep", + "BeginStep() is called a second time " + "without an intervening EndStep()"); + } + + Seconds ts = Now() - m_EngineStart; + // std::cout << "BEGIN STEP starts at: " << ts.count() << std::endl; + m_BetweenStepPairs = true; + + if (m_WriterStep > 0) + { + m_LastTimeBetweenSteps = Now() - m_EndStepEnd; + m_TotalTimeBetweenSteps += m_LastTimeBetweenSteps; + m_AvgTimeBetweenSteps = m_TotalTimeBetweenSteps / m_WriterStep; + m_ExpectedTimeBetweenSteps = m_LastTimeBetweenSteps; + if (m_ExpectedTimeBetweenSteps > m_AvgTimeBetweenSteps) + { + m_ExpectedTimeBetweenSteps = m_AvgTimeBetweenSteps; + } + } + + if ((m_WriterStep == 0) && m_Parameters.UseOneTimeAttributes) + { + const auto &attributes = m_IO.GetAttributes(); + + for (const auto &attributePair : attributes) + { + m_BP5Serializer.OnetimeMarshalAttribute(*(attributePair.second)); + } + } + + if (m_Parameters.AsyncWrite) + { + m_AsyncWriteLock.lock(); + m_flagRush = true; + m_AsyncWriteLock.unlock(); + TimePoint wait_start = Now(); + if (m_WriteFuture.valid()) + { + m_Profiler.Start("WaitOnAsync"); + m_WriteFuture.get(); + m_Comm.Barrier(); + AsyncWriteDataCleanup(); + Seconds wait = Now() - wait_start; + if 
(m_Comm.Rank() == 0)
+            {
+                WriteMetadataFileIndex(m_LatestMetaDataPos,
+                                       m_LatestMetaDataSize);
+                if (m_Parameters.verbose > 0)
+                {
+                    std::cout << "BeginStep, wait on async write was = "
+                              << wait.count() << " time since EndStep was = "
+                              << m_LastTimeBetweenSteps.count()
+                              << " expect next one to be = "
+                              << m_ExpectedTimeBetweenSteps.count()
+                              << std::endl;
+                }
+            }
+            m_Profiler.Stop("WaitOnAsync");
+        }
+    }
+
+    if (m_Parameters.BufferVType == (int)BufferVType::MallocVType)
+    {
+        m_BP5Serializer.InitStep(new MallocV(
+            "DaosWriter", false, m_BP5Serializer.m_BufferAlign,
+            m_BP5Serializer.m_BufferBlockSize, m_Parameters.InitialBufferSize,
+            m_Parameters.GrowthFactor));
+    }
+    else
+    {
+        m_BP5Serializer.InitStep(new ChunkV(
+            "DaosWriter", false, m_BP5Serializer.m_BufferAlign,
+            m_BP5Serializer.m_BufferBlockSize, m_Parameters.BufferChunkSize));
+    }
+    m_ThisTimestepDataSize = 0;
+
+    ts = Now() - m_EngineStart;
+    // std::cout << "BEGIN STEP ended at: " << ts.count() << std::endl;
+    return StepStatus::OK;
+}
+
+size_t DaosWriter::CurrentStep() const { return m_WriterStep; }
+
+void DaosWriter::PerformPuts()
+{
+    PERFSTUBS_SCOPED_TIMER("DaosWriter::PerformPuts");
+    m_Profiler.Start("PP");
+    m_BP5Serializer.PerformPuts(m_Parameters.AsyncWrite ||
+                                m_Parameters.DirectIO);
+    m_Profiler.Stop("PP");
+    return;
+}
+
+void DaosWriter::WriteMetaMetadata(
+    const std::vector<format::BP5Base::MetaMetaInfoBlock> MetaMetaBlocks)
+{
+    for (auto &b : MetaMetaBlocks)
+    {
+        m_FileMetaMetadataManager.WriteFiles((char *)&b.MetaMetaIDLen,
+                                             sizeof(size_t));
+        m_FileMetaMetadataManager.WriteFiles((char *)&b.MetaMetaInfoLen,
+                                             sizeof(size_t));
+        m_FileMetaMetadataManager.WriteFiles((char *)b.MetaMetaID,
+                                             b.MetaMetaIDLen);
+        m_FileMetaMetadataManager.WriteFiles((char *)b.MetaMetaInfo,
+                                             b.MetaMetaInfoLen);
+    }
+}
+
+uint64_t
+DaosWriter::WriteMetadata(const std::vector<core::iovec> &MetaDataBlocks,
+                          const std::vector<core::iovec> &AttributeBlocks)
+{
+    uint64_t MDataTotalSize = 0;
+    uint64_t MetaDataSize = 0;
+    std::vector<uint64_t> SizeVector;
+    std::vector<uint64_t> AttrSizeVector;
+    SizeVector.reserve(MetaDataBlocks.size());
+    for (auto &b : MetaDataBlocks)
+    {
+        MDataTotalSize += sizeof(uint64_t) + b.iov_len;
+        SizeVector.push_back(b.iov_len);
+    }
+    for (auto &b : AttributeBlocks)
+    {
+        MDataTotalSize += sizeof(uint64_t) + b.iov_len;
+        AttrSizeVector.push_back(b.iov_len);
+    }
+    MetaDataSize = 0;
+    m_FileMetadataManager.WriteFiles((char *)&MDataTotalSize, sizeof(uint64_t));
+    MetaDataSize += sizeof(uint64_t);
+    m_FileMetadataManager.WriteFiles((char *)SizeVector.data(),
+                                     sizeof(uint64_t) * SizeVector.size());
+    MetaDataSize += sizeof(uint64_t) * SizeVector.size();
+    m_FileMetadataManager.WriteFiles((char *)AttrSizeVector.data(),
+                                     sizeof(uint64_t) * AttrSizeVector.size());
+    MetaDataSize += sizeof(uint64_t) * AttrSizeVector.size();
+    for (auto &b : MetaDataBlocks)
+    {
+        if (!b.iov_base)
+            continue;
+        m_FileMetadataManager.WriteFiles((char *)b.iov_base, b.iov_len);
+        MetaDataSize += b.iov_len;
+    }
+
+    for (auto &b : AttributeBlocks)
+    {
+        if (!b.iov_base)
+            continue;
+        m_FileMetadataManager.WriteFiles((char *)b.iov_base, b.iov_len);
+        MetaDataSize += b.iov_len;
+    }
+
+    m_MetaDataPos += MetaDataSize;
+    return MetaDataSize;
+}
+
+void DaosWriter::AsyncWriteDataCleanup()
+{
+    if (m_Parameters.AsyncWrite)
+    {
+        switch (m_Parameters.AggregationType)
+        {
+        case (int)AggregationType::EveryoneWrites:
+        case (int)AggregationType::EveryoneWritesSerial:
+            AsyncWriteDataCleanup_EveryoneWrites();
+            break;
+        case (int)AggregationType::TwoLevelShm:
+            AsyncWriteDataCleanup_TwoLevelShm();
+            break;
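+        // The EveryoneWrites variants share one cleanup path and
+        // TwoLevelShm has its own; any other aggregation type has no
+        // async buffers to release and falls through to the no-op default.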
default: + break; + } + } +} + +void DaosWriter::WriteData(format::BufferV *Data) +{ + if (m_Parameters.AsyncWrite) + { + switch (m_Parameters.AggregationType) + { + case (int)AggregationType::EveryoneWrites: + WriteData_EveryoneWrites_Async(Data, false); + break; + case (int)AggregationType::EveryoneWritesSerial: + WriteData_EveryoneWrites_Async(Data, true); + break; + case (int)AggregationType::TwoLevelShm: + WriteData_TwoLevelShm_Async(Data); + break; + default: + helper::Throw( + "Engine", "DaosWriter", "WriteData", + "Aggregation method " + + std::to_string(m_Parameters.AggregationType) + + "is not supported in BP5"); + } + } + else + { + switch (m_Parameters.AggregationType) + { + case (int)AggregationType::EveryoneWrites: + WriteData_EveryoneWrites(Data, false); + break; + case (int)AggregationType::EveryoneWritesSerial: + WriteData_EveryoneWrites(Data, true); + break; + case (int)AggregationType::TwoLevelShm: + WriteData_TwoLevelShm(Data); + break; + default: + helper::Throw( + "Engine", "DaosWriter", "WriteData", + "Aggregation method " + + std::to_string(m_Parameters.AggregationType) + + "is not supported in BP5"); + } + delete Data; + } +} + +void DaosWriter::WriteData_EveryoneWrites(format::BufferV *Data, + bool SerializedWriters) +{ + const aggregator::MPIChain *a = + dynamic_cast(m_Aggregator); + + // new step writing starts at offset m_DataPos on aggregator + // others will wait for the position to arrive from the rank below + + if (a->m_Comm.Rank() > 0) + { + a->m_Comm.Recv(&m_DataPos, 1, a->m_Comm.Rank() - 1, 0, + "Chain token in DaosWriter::WriteData"); + } + + // align to PAGE_SIZE + m_DataPos += + helper::PaddingToAlignOffset(m_DataPos, m_Parameters.StripeSize); + m_StartDataPos = m_DataPos; + + if (!SerializedWriters && a->m_Comm.Rank() < a->m_Comm.Size() - 1) + { + /* Send the token before writing so everyone can start writing asap */ + uint64_t nextWriterPos = m_DataPos + Data->Size(); + a->m_Comm.Isend(&nextWriterPos, 1, a->m_Comm.Rank() + 1, 0, + "Chain token in DaosWriter::WriteData"); + } + + m_DataPos += Data->Size(); + std::vector DataVec = Data->DataVec(); + m_FileDataManager.WriteFileAt(DataVec.data(), DataVec.size(), + m_StartDataPos); + + if (SerializedWriters && a->m_Comm.Rank() < a->m_Comm.Size() - 1) + { + /* send token now, effectively serializing the writers in the chain */ + uint64_t nextWriterPos = m_DataPos; + a->m_Comm.Isend(&nextWriterPos, 1, a->m_Comm.Rank() + 1, 0, + "Chain token in DaosWriter::WriteData"); + } + + if (a->m_Comm.Size() > 1) + { + // at the end, last rank sends back the final data pos to first rank + // so it can update its data pos + if (a->m_Comm.Rank() == a->m_Comm.Size() - 1) + { + a->m_Comm.Isend(&m_DataPos, 1, 0, 0, + "Final chain token in DaosWriter::WriteData"); + } + if (a->m_Comm.Rank() == 0) + { + a->m_Comm.Recv(&m_DataPos, 1, a->m_Comm.Size() - 1, 0, + "Chain token in DaosWriter::WriteData"); + } + } +} + +void DaosWriter::WriteMetadataFileIndex(uint64_t MetaDataPos, + uint64_t MetaDataSize) +{ + m_FileMetadataManager.FlushFiles(); + + // bufsize: Step record + size_t bufsize = + 1 + (4 + ((FlushPosSizeInfo.size() * 2) + 1) * m_Comm.Size()) * + sizeof(uint64_t); + if (MetaDataPos == 0) + { + // First time, write the headers + bufsize += m_IndexHeaderSize; + } + if (!m_WriterSubfileMap.empty()) + { + // WriterMap record + bufsize += 1 + (4 + m_Comm.Size()) * sizeof(uint64_t); + } + + std::vector buf(bufsize); + size_t pos = 0; + uint64_t d; + unsigned char record; + + if (MetaDataPos == 0) + { + // First time, write 
the headers + MakeHeader(buf, pos, "Index Table", true); + } + + // WriterMap record + if (!m_WriterSubfileMap.empty()) + { + record = WriterMapRecord; + helper::CopyToBuffer(buf, pos, &record, 1); // record type + d = (3 + m_Comm.Size()) * sizeof(uint64_t); + helper::CopyToBuffer(buf, pos, &d, 1); // record length + d = static_cast(m_Comm.Size()); + helper::CopyToBuffer(buf, pos, &d, 1); + d = static_cast(m_Aggregator->m_NumAggregators); + helper::CopyToBuffer(buf, pos, &d, 1); + d = static_cast(m_Aggregator->m_SubStreams); + helper::CopyToBuffer(buf, pos, &d, 1); + helper::CopyToBuffer(buf, pos, m_WriterSubfileMap.data(), + m_Comm.Size()); + m_WriterSubfileMap.clear(); + } + + // Step record + record = StepRecord; + helper::CopyToBuffer(buf, pos, &record, 1); // record type + d = (3 + ((FlushPosSizeInfo.size() * 2) + 1) * m_Comm.Size()) * + sizeof(uint64_t); + helper::CopyToBuffer(buf, pos, &d, 1); // record length + helper::CopyToBuffer(buf, pos, &MetaDataPos, 1); + helper::CopyToBuffer(buf, pos, &MetaDataSize, 1); + d = static_cast(FlushPosSizeInfo.size()); + helper::CopyToBuffer(buf, pos, &d, 1); + + for (int writer = 0; writer < m_Comm.Size(); writer++) + { + for (size_t flushNum = 0; flushNum < FlushPosSizeInfo.size(); + flushNum++) + { + // add two numbers here + helper::CopyToBuffer(buf, pos, + &FlushPosSizeInfo[flushNum][2 * writer], 2); + } + helper::CopyToBuffer(buf, pos, &m_WriterDataPos[writer], 1); + } + + m_FileMetadataIndexManager.WriteFiles((char *)buf.data(), buf.size()); + +#ifdef DUMPDATALOCINFO + std::cout << "Flush count is :" << FlushPosSizeInfo.size() << std::endl; + std::cout << "Write Index positions = {" << std::endl; + + for (size_t i = 0; i < m_Comm.Size(); ++i) + { + std::cout << "Writer " << i << " has data at: " << std::endl; + uint64_t eachWriterSize = FlushPosSizeInfo.size() * 2 + 1; + for (size_t j = 0; j < FlushPosSizeInfo.size(); ++j) + { + std::cout << "loc:" << buf[3 + eachWriterSize * i + j * 2] + << " siz:" << buf[3 + eachWriterSize * i + j * 2 + 1] + << std::endl; + } + std::cout << "loc:" << buf[3 + eachWriterSize * (i + 1) - 1] + << std::endl; + } + std::cout << "}" << std::endl; +#endif + /* reset for next timestep */ + FlushPosSizeInfo.clear(); +} + +void DaosWriter::NotifyEngineAttribute(std::string name, DataType type) noexcept +{ + helper::Throw( + "DaosWriter", "Engine", "ThrowUp", + "Engine does not support NotifyEngineAttribute"); +} + +void DaosWriter::NotifyEngineAttribute(std::string name, AttributeBase *Attr, + void *data) noexcept +{ + if (!m_Parameters.UseOneTimeAttributes) + { + m_MarshalAttributesNecessary = true; + return; + } + + m_BP5Serializer.OnetimeMarshalAttribute(*Attr); + m_MarshalAttributesNecessary = false; +} + +void DaosWriter::MarshalAttributes() +{ + PERFSTUBS_SCOPED_TIMER_FUNC(); + const auto &attributes = m_IO.GetAttributes(); + + // if there are no new attributes, nothing to do + if (!m_MarshalAttributesNecessary) + { + return; + } + m_MarshalAttributesNecessary = false; + + for (const auto &attributePair : attributes) + { + const std::string name(attributePair.first); + auto baseAttr = &attributePair.second; + const DataType type((*baseAttr)->m_Type); + int element_count = -1; + + if (!attributePair.second->m_IsSingleValue) + { + element_count = (*baseAttr)->m_Elements; + } + + if (type == DataType::None) + { + } + else if (type == helper::GetDataType()) + { + core::Attribute &attribute = + *m_IO.InquireAttribute(name); + void *data_addr; + if (attribute.m_IsSingleValue) + { + data_addr = (void 
*)attribute.m_DataSingleValue.c_str(); + } + else + { + const char **tmp = + (const char **)malloc(sizeof(char *) * element_count); + for (int i = 0; i < element_count; i++) + { + auto str = &attribute.m_DataArray[i]; + tmp[i] = str->c_str(); + } + // tmp will be free'd after final attribute marshalling + data_addr = (void *)tmp; + } + + m_BP5Serializer.MarshalAttribute(name.c_str(), type, sizeof(char *), + element_count, data_addr); + } +#define declare_type(T) \ + else if (type == helper::GetDataType()) \ + { \ + core::Attribute &attribute = *m_IO.InquireAttribute(name); \ + int element_count = -1; \ + void *data_addr = &attribute.m_DataSingleValue; \ + if (!attribute.m_IsSingleValue) \ + { \ + element_count = attribute.m_Elements; \ + data_addr = attribute.m_DataArray.data(); \ + } \ + m_BP5Serializer.MarshalAttribute(attribute.m_Name.c_str(), type, \ + sizeof(T), element_count, data_addr); \ + } + + ADIOS2_FOREACH_PRIMITIVE_STDTYPE_1ARG(declare_type) +#undef declare_type + } +} + +void DaosWriter::EndStep() +{ + /* Seconds ts = Now() - m_EngineStart; + std::cout << "END STEP starts at: " << ts.count() << std::endl; */ + m_BetweenStepPairs = false; + PERFSTUBS_SCOPED_TIMER("DaosWriter::EndStep"); + m_Profiler.Start("endstep"); + + m_Profiler.Start("close_ts"); + MarshalAttributes(); + + // true: advances step + auto TSInfo = m_BP5Serializer.CloseTimestep( + m_WriterStep, m_Parameters.AsyncWrite || m_Parameters.DirectIO); + + /* TSInfo includes NewMetaMetaBlocks, the MetaEncodeBuffer, the + * AttributeEncodeBuffer and the data encode Vector */ + + m_ThisTimestepDataSize += TSInfo.DataBuffer->Size(); + m_Profiler.Stop("close_ts"); + + m_Profiler.Start("AWD"); + // TSInfo destructor would delete the DataBuffer so we need to save it + // for async IO and let the writer free it up when not needed anymore + adios2::format::BufferV *databuf = TSInfo.DataBuffer; + TSInfo.DataBuffer = NULL; + m_AsyncWriteLock.lock(); + m_flagRush = false; + m_AsyncWriteLock.unlock(); + WriteData(databuf); + m_Profiler.Stop("AWD"); + + /* + * Two-step metadata aggregation + */ + m_Profiler.Start("meta_lvl1"); + std::vector MetaBuffer; + // core::iovec m{TSInfo.MetaEncodeBuffer->Data(), + // TSInfo.MetaEncodeBuffer->m_FixedSize}; + core::iovec m{nullptr, 0}; + core::iovec a{nullptr, 0}; + if (TSInfo.AttributeEncodeBuffer) + { + a = {TSInfo.AttributeEncodeBuffer->Data(), + TSInfo.AttributeEncodeBuffer->m_FixedSize}; + } + MetaBuffer = m_BP5Serializer.CopyMetadataToContiguous( + TSInfo.NewMetaMetaBlocks, {m}, {a}, {m_ThisTimestepDataSize}, + {m_StartDataPos}); + + if (m_Aggregator->m_Comm.Size() > 1) + { // level 1 + m_Profiler.Start("meta_gather1"); + size_t LocalSize = MetaBuffer.size(); + std::vector RecvCounts = + m_Aggregator->m_Comm.GatherValues(LocalSize, 0); + std::vector RecvBuffer; + if (m_Aggregator->m_Comm.Rank() == 0) + { + uint64_t TotalSize = 0; + for (auto &n : RecvCounts) + TotalSize += n; + RecvBuffer.resize(TotalSize); + /*std::cout << "MD Lvl-1: rank " << m_Comm.Rank() << " gather " + << TotalSize << " bytes from aggregator group" + << std::endl;*/ + } + m_Aggregator->m_Comm.GathervArrays(MetaBuffer.data(), LocalSize, + RecvCounts.data(), RecvCounts.size(), + RecvBuffer.data(), 0); + m_Profiler.Stop("meta_gather1"); + if (m_Aggregator->m_Comm.Rank() == 0) + { + std::vector + UniqueMetaMetaBlocks; + std::vector DataSizes; + std::vector WriterDataPositions; + std::vector AttributeBlocks; + auto Metadata = m_BP5Serializer.BreakoutContiguousMetadata( + RecvBuffer, RecvCounts, UniqueMetaMetaBlocks, 
AttributeBlocks, + DataSizes, WriterDataPositions); + + MetaBuffer.clear(); + MetaBuffer = m_BP5Serializer.CopyMetadataToContiguous( + UniqueMetaMetaBlocks, Metadata, AttributeBlocks, DataSizes, + WriterDataPositions); + } + } // level 1 + m_Profiler.Stop("meta_lvl1"); + m_Profiler.Start("meta_lvl2"); + // level 2 + if (m_Aggregator->m_Comm.Rank() == 0) + { + std::vector RecvBuffer; + std::vector *buf; + std::vector RecvCounts; + size_t LocalSize = MetaBuffer.size(); + if (m_CommAggregators.Size() > 1) + { + m_Profiler.Start("meta_gather2"); + RecvCounts = m_CommAggregators.GatherValues(LocalSize, 0); + if (m_CommAggregators.Rank() == 0) + { + uint64_t TotalSize = 0; + for (auto &n : RecvCounts) + TotalSize += n; + RecvBuffer.resize(TotalSize); + /*std::cout << "MD Lvl-2: rank " << m_Comm.Rank() << " gather " + << TotalSize << " bytes from aggregator group" + << std::endl;*/ + } + + m_CommAggregators.GathervArrays( + MetaBuffer.data(), LocalSize, RecvCounts.data(), + RecvCounts.size(), RecvBuffer.data(), 0); + buf = &RecvBuffer; + m_Profiler.Stop("meta_gather2"); + } + else + { + buf = &MetaBuffer; + RecvCounts.push_back(LocalSize); + } + + if (m_CommAggregators.Rank() == 0) + { + std::vector + UniqueMetaMetaBlocks; + std::vector DataSizes; + std::vector AttributeBlocks; + m_WriterDataPos.resize(0); + auto Metadata = m_BP5Serializer.BreakoutContiguousMetadata( + *buf, RecvCounts, UniqueMetaMetaBlocks, AttributeBlocks, + DataSizes, m_WriterDataPos); + assert(m_WriterDataPos.size() == + static_cast(m_Comm.Size())); + WriteMetaMetadata(UniqueMetaMetaBlocks); + m_LatestMetaDataPos = m_MetaDataPos; + m_LatestMetaDataSize = WriteMetadata(Metadata, AttributeBlocks); + // m_LatestMetaDataPos = 0; + // m_LatestMetaDataSize = 0; + if (!m_Parameters.AsyncWrite) + { + WriteMetadataFileIndex(m_LatestMetaDataPos, + m_LatestMetaDataSize); + } + } + } // level 2 + m_Profiler.Stop("meta_lvl2"); + + char key[1000]; + int rc; + + sprintf(key, "step%d-rank%d", m_WriterStep, m_Comm.Rank()); + std::cout << __FILE__ << "::" << __func__ << "(), step: " << m_WriterStep + << std::endl; + std::cout << "Rank = " << m_Comm.Rank() + << ", Metadata size = " << TSInfo.MetaEncodeBuffer->m_FixedSize + << std::endl; + std::cout << "key = " << key << std::endl; + std::cout << "Printing the first 10 bytes of Metadata" << std::endl; + char *data = reinterpret_cast(TSInfo.MetaEncodeBuffer->Data()); + for (int i = 0; i < 10; i++) + { + // std::cout << std::hex << std::setw(2) << std::setfill('0') << + // static_cast(data[i]) << " "; + std::cout << static_cast(data[i]) << " "; + } + std::cout << std::endl; + rc = daos_kv_put(oh, DAOS_TX_NONE, 0, key, + TSInfo.MetaEncodeBuffer->m_FixedSize, + TSInfo.MetaEncodeBuffer->Data(), NULL); + ASSERT(rc == 0, "daos_kv_put() failed with %d", rc); + + if (m_Parameters.AsyncWrite) + { + /* Start counting computation blocks between EndStep and next BeginStep + * each time */ + { + m_AsyncWriteLock.lock(); + m_ComputationBlockTimes.clear(); + m_ComputationBlocksLength = 0.0; + m_ComputationBlockID = 0; + m_AsyncWriteLock.unlock(); + } + } + + m_Profiler.Stop("endstep"); + m_WriterStep++; + m_EndStepEnd = Now(); + /* Seconds ts2 = Now() - m_EngineStart; + std::cout << "END STEP ended at: " << ts2.count() << std::endl;*/ +} + +// PRIVATE +void DaosWriter::Init() +{ + m_BP5Serializer.m_Engine = this; + m_RankMPI = m_Comm.Rank(); + InitParameters(); + InitAggregator(); + InitTransports(); + InitDAOS(); + InitBPBuffer(); +} + +void DaosWriter::InitParameters() +{ + ParseParams(m_IO, m_Parameters); + 
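+    // Worked example (hypothetical numbers) for the parameter
+    // normalization below: with 64 ranks, NumAggregators left at 0 and
+    // AggregatorRatio = 8, NumAggregators becomes 64 / 8 = 8 and
+    // NumSubFiles is then clamped to at most 8; if no ratio is given
+    // either, a nonzero NumSubFiles determines the aggregator count.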
m_WriteToBB = !(m_Parameters.BurstBufferPath.empty()); + m_DrainBB = m_WriteToBB && m_Parameters.BurstBufferDrain; + + unsigned int nproc = (unsigned int)m_Comm.Size(); + m_Parameters.NumAggregators = + helper::SetWithinLimit(m_Parameters.NumAggregators, 0U, nproc); + m_Parameters.NumSubFiles = + helper::SetWithinLimit(m_Parameters.NumSubFiles, 0U, nproc); + m_Parameters.AggregatorRatio = + helper::SetWithinLimit(m_Parameters.AggregatorRatio, 0U, nproc); + if (m_Parameters.NumAggregators == 0) + { + if (m_Parameters.AggregatorRatio > 0) + { + m_Parameters.NumAggregators = helper::SetWithinLimit( + nproc / m_Parameters.AggregatorRatio, 0U, nproc); + } + else if (m_Parameters.NumSubFiles > 0) + { + m_Parameters.NumAggregators = + helper::SetWithinLimit(m_Parameters.NumSubFiles, 0U, nproc); + } + } + m_Parameters.NumSubFiles = helper::SetWithinLimit( + m_Parameters.NumSubFiles, 0U, m_Parameters.NumAggregators); + + // Limiting to max 64MB page size + m_Parameters.StripeSize = + helper::SetWithinLimit(m_Parameters.StripeSize, 0U, 67108864U); + if (m_Parameters.StripeSize == 0) + { + m_Parameters.StripeSize = 4096; + } + + if (m_Parameters.DirectIO) + { + if (m_Parameters.DirectIOAlignBuffer == 0) + { + m_Parameters.DirectIOAlignBuffer = m_Parameters.DirectIOAlignOffset; + } + m_BP5Serializer.m_BufferBlockSize = m_Parameters.DirectIOAlignOffset; + m_BP5Serializer.m_BufferAlign = m_Parameters.DirectIOAlignBuffer; + if (m_Parameters.StripeSize % m_Parameters.DirectIOAlignOffset) + { + size_t k = + m_Parameters.StripeSize / m_Parameters.DirectIOAlignOffset + 1; + m_Parameters.StripeSize = k * m_Parameters.DirectIOAlignOffset; + } + if (m_Parameters.BufferChunkSize % m_Parameters.DirectIOAlignOffset) + { + size_t k = m_Parameters.BufferChunkSize / + m_Parameters.DirectIOAlignOffset + + 1; + m_Parameters.BufferChunkSize = k * m_Parameters.DirectIOAlignOffset; + } + } + + m_BP5Serializer.m_StatsLevel = m_Parameters.StatsLevel; +} + +uint64_t DaosWriter::CountStepsInMetadataIndex(format::BufferSTL &bufferSTL) +{ + const auto &buffer = bufferSTL.m_Buffer; + size_t &position = bufferSTL.m_Position; + + if (buffer.size() < m_IndexHeaderSize) + { + m_AppendMetadataPos = 0; + m_AppendMetaMetadataPos = 0; + m_AppendMetadataIndexPos = 0; + m_AppendDataPos.resize(m_Aggregator->m_NumAggregators, + 0ULL); // safe bet + return 0; + } + + // Check endinanness + position = m_EndianFlagPosition; + const uint8_t endianness = helper::ReadValue(buffer, position); + bool IsLittleEndian = (endianness == 0) ? true : false; + if (helper::IsLittleEndian() != IsLittleEndian) + { + std::string m = (IsLittleEndian ? "Little" : "Big"); + + helper::Throw( + "Engine", "DaosWriter", "CountStepsInMetadataIndex", + "ADIOS2 BP5 Engine only supports appending with the same " + "endianness. The existing file is " + + m + "Endian"); + } + + // BP version + position = m_BPVersionPosition; + uint8_t Version = + helper::ReadValue(buffer, position, IsLittleEndian); + if (Version != 5) + { + helper::Throw( + "Engine", "DaosWriter", "CountStepsInMetadataIndex", + "ADIOS2 BP5 Engine only supports bp format " + "version 5, found " + + std::to_string(Version) + " version"); + } + + // BP minor version + position = m_BPMinorVersionPosition; + uint8_t minorVersion = + helper::ReadValue(buffer, position, IsLittleEndian); + if (minorVersion != m_BP5MinorVersion) + { + helper::Throw( + "Engine", "DaosWriter", "CountStepsInMetadataIndex", + "Current ADIOS2 BP5 Engine can only append to bp format 5." 
+ + std::to_string(m_BP5MinorVersion) + " but this file is 5." + + std::to_string(minorVersion) + " version"); + } + + position = m_ColumnMajorFlagPosition; + const uint8_t columnMajor = + helper::ReadValue(buffer, position, IsLittleEndian); + const uint8_t NowColumnMajor = + (m_IO.m_ArrayOrder == ArrayOrdering::ColumnMajor) ? 'y' : 'n'; + if (columnMajor != NowColumnMajor) + { + std::string m = (columnMajor == 'y' ? "column" : "row"); + helper::Throw( + "Engine", "DaosWriter", "CountStepsInMetadataIndex", + "ADIOS2 BP5 Engine only supports appending with the same " + "column/row major settings as it was written." + " Existing file is " + + m + " major"); + } + + position = m_IndexHeaderSize; // after the header + // Just count the steps first + unsigned int availableSteps = 0; + uint64_t nDataFiles = 0; + while (position < buffer.size()) + { + const unsigned char recordID = + helper::ReadValue(buffer, position, IsLittleEndian); + position += sizeof(uint64_t); // recordLength + + switch (recordID) + { + case IndexRecord::WriterMapRecord: + { + m_AppendWriterCount = + helper::ReadValue(buffer, position, IsLittleEndian); + m_AppendAggregatorCount = + helper::ReadValue(buffer, position, IsLittleEndian); + m_AppendSubfileCount = + helper::ReadValue(buffer, position, IsLittleEndian); + if (m_AppendSubfileCount > nDataFiles) + { + nDataFiles = m_AppendSubfileCount; + } + // jump over writermap + position += m_AppendWriterCount * sizeof(uint64_t); + break; + } + case IndexRecord::StepRecord: + { + position += 2 * sizeof(uint64_t); // MetadataPos, MetadataSize + const uint64_t FlushCount = + helper::ReadValue(buffer, position, IsLittleEndian); + // jump over the metadata positions + position += + sizeof(uint64_t) * m_AppendWriterCount * ((2 * FlushCount) + 1); + availableSteps++; + break; + } + } + } + + unsigned int targetStep = 0; + + if (m_Parameters.AppendAfterSteps < 0) + { + // -1 means append after last step + int s = (int)availableSteps + m_Parameters.AppendAfterSteps + 1; + if (s < 0) + { + s = 0; + } + targetStep = static_cast(s); + } + else + { + targetStep = static_cast(m_Parameters.AppendAfterSteps); + } + if (targetStep > availableSteps) + { + targetStep = availableSteps; + } + + m_AppendDataPos.resize(nDataFiles, 0ULL); + + if (!targetStep) + { + // append at 0 is like writing new file + m_AppendMetadataPos = 0; + m_AppendMetaMetadataPos = 0; + m_AppendMetadataIndexPos = 0; + return 0; + } + + m_AppendMetadataPos = MaxSizeT; // size of header + m_AppendMetaMetadataPos = MaxSizeT; + m_AppendMetadataIndexPos = MaxSizeT; + std::fill(m_AppendDataPos.begin(), m_AppendDataPos.end(), MaxSizeT); + + if (targetStep == availableSteps) + { + // append after existing steps + return targetStep; + } + + // append but not at 0 and not after existing steps + // Read each record now completely to get offsets at step+1 + position = m_IndexHeaderSize; + unsigned int currentStep = 0; + std::vector writerToFileMap; + // reading one step beyond target to get correct offsets + while (currentStep <= targetStep && position < buffer.size()) + { + const unsigned char recordID = + helper::ReadValue(buffer, position, IsLittleEndian); + position += sizeof(uint64_t); // recordLength + + switch (recordID) + { + case IndexRecord::WriterMapRecord: + { + m_AppendWriterCount = + helper::ReadValue(buffer, position, IsLittleEndian); + m_AppendAggregatorCount = + helper::ReadValue(buffer, position, IsLittleEndian); + m_AppendSubfileCount = + helper::ReadValue(buffer, position, IsLittleEndian); + + // Get the process -> 
subfile map + writerToFileMap.clear(); + for (uint64_t i = 0; i < m_AppendWriterCount; i++) + { + const uint64_t subfileIdx = helper::ReadValue( + buffer, position, IsLittleEndian); + writerToFileMap.push_back(subfileIdx); + } + break; + } + case IndexRecord::StepRecord: + { + m_AppendMetadataIndexPos = position - sizeof(unsigned char) - + sizeof(uint64_t); // pos of RecordID + const uint64_t MetadataPos = + helper::ReadValue(buffer, position, IsLittleEndian); + position += sizeof(uint64_t); // MetadataSize + const uint64_t FlushCount = + helper::ReadValue(buffer, position, IsLittleEndian); + + m_AppendMetadataPos = static_cast(MetadataPos); + + if (currentStep == targetStep) + { + // we need the very first (smallest) write position to each + // subfile Offsets and sizes, 2*FlushCount + 1 per writer + for (uint64_t i = 0; i < m_AppendWriterCount; i++) + { + // first flush/write position will do + const size_t FirstDataPos = + static_cast(helper::ReadValue( + buffer, position, IsLittleEndian)); + position += + sizeof(uint64_t) * 2 * FlushCount; // no need to read + /* std::cout << "Writer " << i << " subfile " << + writerToFileMap[i] << " first data loc:" << + FirstDataPos << std::endl; */ + if (FirstDataPos < m_AppendDataPos[writerToFileMap[i]]) + { + m_AppendDataPos[writerToFileMap[i]] = FirstDataPos; + } + } + } + else + { + // jump over all data offsets in this step + position += sizeof(uint64_t) * m_AppendWriterCount * + (1 + 2 * FlushCount); + } + currentStep++; + break; + } + } + } + return targetStep; +} + +void DaosWriter::InitAggregator() +{ + // in BP5, aggregation is "always on", but processes may be alone, so + // m_Aggregator.m_IsActive is always true + // m_Aggregator.m_Comm.Rank() will always succeed (not abort) + // m_Aggregator.m_SubFileIndex is always set + + if (m_Parameters.AggregationType == (int)AggregationType::EveryoneWrites || + m_Parameters.AggregationType == + (int)AggregationType::EveryoneWritesSerial) + { + m_Parameters.NumSubFiles = m_Parameters.NumAggregators; + m_AggregatorEveroneWrites.Init(m_Parameters.NumAggregators, + m_Parameters.NumSubFiles, m_Comm); + m_IAmDraining = m_AggregatorEveroneWrites.m_IsAggregator; + m_IAmWritingData = true; + DataWritingComm = &m_AggregatorEveroneWrites.m_Comm; + m_Aggregator = static_cast( + &m_AggregatorEveroneWrites); + } + else + { + size_t numNodes = m_AggregatorTwoLevelShm.PreInit(m_Comm); + (void)numNodes; + m_AggregatorTwoLevelShm.Init(m_Parameters.NumAggregators, + m_Parameters.NumSubFiles, m_Comm); + + /*std::cout << "Rank " << m_RankMPI << " aggr? " + << m_AggregatorTwoLevelShm.m_IsAggregator << " master? " + << m_AggregatorTwoLevelShm.m_IsMasterAggregator + << " aggr size = " << m_AggregatorTwoLevelShm.m_Size + << " rank = " << m_AggregatorTwoLevelShm.m_Rank + << " subfile = " << m_AggregatorTwoLevelShm.m_SubStreamIndex + << " type = " << m_Parameters.AggregationType << std::endl;*/ + + m_IAmDraining = m_AggregatorTwoLevelShm.m_IsMasterAggregator; + m_IAmWritingData = m_AggregatorTwoLevelShm.m_IsAggregator; + DataWritingComm = &m_AggregatorTwoLevelShm.m_AggregatorChainComm; + m_Aggregator = + static_cast(&m_AggregatorTwoLevelShm); + } + + /* comm for Aggregators only. 
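+     * The split color below is each rank's rank within its aggregator
+     * group, so the group leaders (color 0) end up together in
+     * m_CommAggregators, which the level-2 metadata aggregation in
+     * EndStep uses.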
+ * We are only interested in the chain of rank 0s + */ + int color = m_Aggregator->m_Comm.Rank(); + m_CommAggregators = + m_Comm.Split(color, 0, "creating level 2 chain of aggregators at Open"); +} + +void DaosWriter::InitTransports() +{ + if (m_IO.m_TransportsParameters.empty()) + { + Params defaultTransportParameters; + defaultTransportParameters["transport"] = "File"; + m_IO.m_TransportsParameters.push_back(defaultTransportParameters); + } + + if (m_WriteToBB) + { + m_BBName = m_Parameters.BurstBufferPath + PathSeparator + m_Name; + } + else + { + m_BBName = m_Name; + } + /* From this point, engine writes to m_BBName, which points to either + the BB file system if BB is turned on, or to the target file system. + m_Name always points to the target file system, to which the drainer + should write if BB is turned on + */ + + // Names passed to IO AddTransport option with key "Name" + const std::vector transportsNames = + m_FileDataManager.GetFilesBaseNames(m_BBName, + m_IO.m_TransportsParameters); + + // /path/name.bp.dir/name.bp.rank + m_SubStreamNames = + GetBPSubStreamNames(transportsNames, m_Aggregator->m_SubStreamIndex); + + if (m_IAmDraining) + { + // Only (master)aggregators will run draining processes + if (m_DrainBB) + { + const std::vector drainTransportNames = + m_FileDataManager.GetFilesBaseNames( + m_Name, m_IO.m_TransportsParameters); + m_DrainSubStreamNames = GetBPSubStreamNames( + drainTransportNames, m_Aggregator->m_SubStreamIndex); + /* start up BB thread */ + // m_FileDrainer.SetVerbose( + // m_Parameters.BurstBufferVerbose, + // m_Comm.Rank()); + m_FileDrainer.Start(); + } + } + + /* Create the directories either on target or burst buffer if used */ + // m_BP4Serializer.m_Profiler.Start("mkdir"); + + if (m_Comm.Rank() == 0) + { + m_MetadataFileNames = GetBPMetadataFileNames(transportsNames); + m_MetaMetadataFileNames = GetBPMetaMetadataFileNames(transportsNames); + m_MetadataIndexFileNames = GetBPMetadataIndexFileNames(transportsNames); + } + m_FileMetadataManager.MkDirsBarrier(m_MetadataFileNames, + m_IO.m_TransportsParameters, + m_Parameters.NodeLocal || m_WriteToBB); + /* Create the directories on burst buffer if used */ + if (m_DrainBB) + { + /* Create the directories on target anyway by main thread */ + m_FileDataManager.MkDirsBarrier(m_DrainSubStreamNames, + m_IO.m_TransportsParameters, + m_Parameters.NodeLocal); + } + + /* Everyone opens its data file. 
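+       (serialized by passing *DataWritingComm to OpenFiles() below)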
Each aggregation chain opens + one data file and does so in chain, not everyone at once */ + if (m_Parameters.AsyncOpen) + { + for (size_t i = 0; i < m_IO.m_TransportsParameters.size(); ++i) + { + m_IO.m_TransportsParameters[i]["asyncopen"] = "true"; + } + } + + if (m_Parameters.DirectIO) + { + for (size_t i = 0; i < m_IO.m_TransportsParameters.size(); ++i) + { + m_IO.m_TransportsParameters[i]["DirectIO"] = "true"; + } + } + + bool useProfiler = true; + + if (m_IAmWritingData) + { + m_FileDataManager.OpenFiles(m_SubStreamNames, m_OpenMode, + m_IO.m_TransportsParameters, useProfiler, + *DataWritingComm); + } + + if (m_IAmDraining) + { + if (m_DrainBB) + { + for (const auto &name : m_DrainSubStreamNames) + { + m_FileDrainer.AddOperationOpen(name, m_OpenMode); + } + } + } + + if (m_Comm.Rank() == 0) + { + // force turn off directio to metadata files + for (size_t i = 0; i < m_IO.m_TransportsParameters.size(); ++i) + { + m_IO.m_TransportsParameters[i]["DirectIO"] = "false"; + } + m_FileMetaMetadataManager.OpenFiles(m_MetaMetadataFileNames, m_OpenMode, + m_IO.m_TransportsParameters, + useProfiler); + + m_FileMetadataManager.OpenFiles(m_MetadataFileNames, m_OpenMode, + m_IO.m_TransportsParameters, + useProfiler); + + m_FileMetadataIndexManager.OpenFiles( + m_MetadataIndexFileNames, m_OpenMode, m_IO.m_TransportsParameters, + useProfiler); + + if (m_DrainBB) + { + const std::vector drainTransportNames = + m_FileDataManager.GetFilesBaseNames( + m_Name, m_IO.m_TransportsParameters); + m_DrainMetadataFileNames = + GetBPMetadataFileNames(drainTransportNames); + m_DrainMetadataIndexFileNames = + GetBPMetadataIndexFileNames(drainTransportNames); + + for (const auto &name : m_DrainMetadataFileNames) + { + m_FileDrainer.AddOperationOpen(name, m_OpenMode); + } + for (const auto &name : m_DrainMetadataIndexFileNames) + { + m_FileDrainer.AddOperationOpen(name, m_OpenMode); + } + } + } +} + +void DaosWriter::InitDAOS() +{ + // Rank 0 - Connect to DAOS pool, and open container + int rc; + rc = gethostname(node, sizeof(node)); + ASSERT(rc == 0, "buffer for hostname too small"); + if (m_Comm.Rank() == 0) + { + /** connect to the just created DAOS pool */ + rc = daos_pool_connect(pool_label, DSS_PSETID, + // DAOS_PC_EX , + DAOS_PC_RW /* read write access */, + &poh /* returned pool handle */, + NULL /* returned pool info */, NULL /* event */); + ASSERT(rc == 0, "pool connect failed with %d", rc); + } + + /** share pool handle with peer tasks */ + daos_handle_share(&poh, DaosWriter::HANDLE_POOL); + + if (m_Comm.Rank() == 0) + { + /** open container */ + rc = daos_cont_open(poh, cont_label, DAOS_COO_RW, &coh, NULL, NULL); + ASSERT(rc == 0, "container open failed with %d", rc); + } + + /** share container handle with peer tasks */ + daos_handle_share(&coh, HANDLE_CO); + + if (m_Comm.Rank() == 0) + { + /** Open a DAOS KV object */ + rc = daos_obj_generate_oid(coh, &oid, DAOS_OT_KV_HASHED, OC_SX, 0, 0); + ASSERT(rc == 0, "daos_obj_generate_oid failed with %d", rc); + } + + // Rank 0 will broadcast the DAOS KV OID + MPI_Bcast(&oid.hi, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD); + MPI_Bcast(&oid.lo, 1, MPI_UNSIGNED_LONG, 0, MPI_COMM_WORLD); + + // Open KV object + rc = daos_kv_open(coh, oid, 0, &oh, NULL); + ASSERT(rc == 0, "daos_kv_open failed with %d", rc); + FILE *fp = fopen("./share/oid.txt", "w"); + if (fp == NULL) + { + perror("fopen"); + exit(1); + } + fprintf(fp, "%" PRIu64 "\n%" PRIu64 "\n", oid.hi, oid.lo); + fclose(fp); +} + +/*generate the header for the metadata index file*/ +void 
DaosWriter::MakeHeader(std::vector &buffer, size_t &position, + const std::string fileType, const bool isActive) +{ + auto lf_CopyVersionChar = [](const std::string version, + std::vector &buffer, size_t &position) { + helper::CopyToBuffer(buffer, position, version.c_str()); + }; + + // auto &buffer = b.m_Buffer; + // auto &position = b.m_Position; + // auto &absolutePosition = b.m_AbsolutePosition; + if (position > 0) + { + helper::Throw( + "Engine", "DaosWriter", "MakeHeader", + "BP4Serializer::MakeHeader can only be called for an empty " + "buffer. This one for " + + fileType + " already has content of " + + std::to_string(position) + " bytes."); + } + + if (buffer.size() < m_IndexHeaderSize) + { + buffer.resize(m_IndexHeaderSize); + } + + const std::string majorVersion(std::to_string(ADIOS2_VERSION_MAJOR)); + const std::string minorVersion(std::to_string(ADIOS2_VERSION_MINOR)); + const std::string patchVersion(std::to_string(ADIOS2_VERSION_PATCH)); + + // byte 0-31: Readable tag + if (position != m_VersionTagPosition) + { + helper::Throw( + "Engine", "DaosWriter", "MakeHeader", + "ADIOS Coding ERROR in BP4Serializer::MakeHeader. Version Tag " + "position mismatch"); + } + std::string versionLongTag("ADIOS-BP v" + majorVersion + "." + + minorVersion + "." + patchVersion + " "); + size_t maxTypeLen = m_VersionTagLength - versionLongTag.size(); + const std::string fileTypeStr = fileType.substr(0, maxTypeLen); + versionLongTag += fileTypeStr; + const size_t versionLongTagSize = versionLongTag.size(); + if (versionLongTagSize < m_VersionTagLength) + { + helper::CopyToBuffer(buffer, position, versionLongTag.c_str(), + versionLongTagSize); + position += m_VersionTagLength - versionLongTagSize; + } + else if (versionLongTagSize > m_VersionTagLength) + { + helper::CopyToBuffer(buffer, position, versionLongTag.c_str(), + m_VersionTagLength); + } + else + { + helper::CopyToBuffer(buffer, position, versionLongTag.c_str(), + m_VersionTagLength); + } + + // byte 32-35: MAJOR MINOR PATCH Unused + + lf_CopyVersionChar(majorVersion, buffer, position); + lf_CopyVersionChar(minorVersion, buffer, position); + lf_CopyVersionChar(patchVersion, buffer, position); + ++position; + + // Note: Reader does process and use bytes 36-38 in + // BP4Deserialize.cpp::ParseMetadataIndex(). + // Order and position must match there. + + // byte 36: endianness + if (position != m_EndianFlagPosition) + { + helper::Throw( + "Engine", "DaosWriter", "MakeHeader", + "ADIOS Coding ERROR in DaosWriter::MakeHeader. Endian Flag " + "position mismatch"); + } + const uint8_t endianness = helper::IsLittleEndian() ? 0 : 1; + helper::CopyToBuffer(buffer, position, &endianness); + + // byte 37: BP Version 5 + if (position != m_BPVersionPosition) + { + helper::Throw( + "Engine", "DaosWriter", "MakeHeader", + "ADIOS Coding ERROR in DaosWriter::MakeHeader. BP Version " + "position mismatch"); + } + const uint8_t version = 5; + helper::CopyToBuffer(buffer, position, &version); + + // byte 38: BP Minor version 1 + if (position != m_BPMinorVersionPosition) + { + helper::Throw( + "Engine", "DaosWriter", "MakeHeader", + "ADIOS Coding ERROR in DaosWriter::MakeHeader. BP Minor version " + "position mismatch"); + } + const uint8_t minorversion = m_BP5MinorVersion; + helper::CopyToBuffer(buffer, position, &minorversion); + + // byte 39: Active flag (used in Index Table only) + if (position != m_ActiveFlagPosition) + { + helper::Throw( + "Engine", "DaosWriter", "MakeHeader", + "ADIOS Coding ERROR in DaosWriter::MakeHeader. 
Active Flag "
+            "position mismatch");
+    }
+    const uint8_t activeFlag = (isActive ? 1 : 0);
+    helper::CopyToBuffer(buffer, position, &activeFlag);
+
+    // byte 40 columnMajor
+    // write if data is column major in metadata and data
+    const uint8_t columnMajor =
+        (m_IO.m_ArrayOrder == ArrayOrdering::ColumnMajor) ? 'y' : 'n';
+    helper::CopyToBuffer(buffer, position, &columnMajor);
+
+    // byte 41-63: unused
+    position += 23;
+    // absolutePosition = position;
+}
+
+void DaosWriter::UpdateActiveFlag(const bool active)
+{
+    const char activeChar = (active ? '\1' : '\0');
+    m_FileMetadataIndexManager.WriteFileAt(&activeChar, 1,
+                                           m_ActiveFlagPosition);
+    m_FileMetadataIndexManager.FlushFiles();
+    m_FileMetadataIndexManager.SeekToFileEnd();
+    if (m_DrainBB)
+    {
+        for (size_t i = 0; i < m_MetadataIndexFileNames.size(); ++i)
+        {
+            m_FileDrainer.AddOperationWriteAt(m_DrainMetadataIndexFileNames[i],
+                                              m_ActiveFlagPosition, 1,
+                                              &activeChar);
+            m_FileDrainer.AddOperationSeekEnd(m_DrainMetadataIndexFileNames[i]);
+        }
+    }
+}
+
+void DaosWriter::InitBPBuffer()
+{
+    if (m_OpenMode == Mode::Append)
+    {
+        format::BufferSTL preMetadataIndex;
+        size_t preMetadataIndexFileSize;
+
+        if (m_Comm.Rank() == 0)
+        {
+            preMetadataIndexFileSize =
+                m_FileMetadataIndexManager.GetFileSize(0);
+            preMetadataIndex.m_Buffer.resize(preMetadataIndexFileSize);
+            preMetadataIndex.m_Buffer.assign(preMetadataIndex.m_Buffer.size(),
+                                             '\0');
+            preMetadataIndex.m_Position = 0;
+            m_FileMetadataIndexManager.ReadFile(
+                preMetadataIndex.m_Buffer.data(), preMetadataIndexFileSize);
+        }
+        m_Comm.BroadcastVector(preMetadataIndex.m_Buffer);
+        m_WriterStep = CountStepsInMetadataIndex(preMetadataIndex);
+
+        // truncate and seek
+        if (m_Aggregator->m_IsAggregator)
+        {
+            const size_t off = m_AppendDataPos[m_Aggregator->m_SubStreamIndex];
+            if (off < MaxSizeT)
+            {
+                m_FileDataManager.Truncate(off);
+                // Seek is needed since truncate does not seek.
+                // SeekTo instead of SeekToFileEnd in case a transport
+                // does not support actual truncate.
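+                // (If a transport cannot truncate, stale bytes may remain
+                // past 'off'; they should be harmless because the rewritten
+                // index no longer references them.)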
+ m_FileDataManager.SeekTo(off); + m_DataPos = off; + } + else + { + m_DataPos = m_FileDataManager.GetFileSize(0); + } + } + + if (m_Comm.Rank() == 0) + { + // Truncate existing metadata file + if (m_AppendMetadataPos < MaxSizeT) + { + m_MetaDataPos = m_AppendMetadataPos; + m_FileMetadataManager.Truncate(m_MetaDataPos); + m_FileMetadataManager.SeekTo(m_MetaDataPos); + } + else + { + m_MetaDataPos = m_FileMetadataManager.GetFileSize(0); + m_FileMetadataManager.SeekToFileEnd(); + } + + // Truncate existing meta-meta file + if (m_AppendMetaMetadataPos < MaxSizeT) + { + m_FileMetaMetadataManager.Truncate(m_AppendMetaMetadataPos); + m_FileMetaMetadataManager.SeekTo(m_AppendMetaMetadataPos); + } + else + { + m_FileMetadataIndexManager.SeekToFileEnd(); + } + + // Set the flag in the header of metadata index table to 1 again + // to indicate a new run begins + UpdateActiveFlag(true); + + // Truncate existing index file + if (m_AppendMetadataIndexPos < MaxSizeT) + { + m_FileMetadataIndexManager.Truncate(m_AppendMetadataIndexPos); + m_FileMetadataIndexManager.SeekTo(m_AppendMetadataIndexPos); + } + else + { + m_FileMetadataIndexManager.SeekToFileEnd(); + } + } + m_AppendDataPos.clear(); + } + + if (!m_WriterStep) + { + /* This is a new file or append at 0 + * Make headers in data buffer and metadata buffer (but do not write + * them yet so that Open() can stay free of writing to disk) + */ + if (m_Comm.Rank() == 0) + { + m_FileMetadataIndexManager.SeekToFileBegin(); + m_FileMetadataManager.SeekToFileBegin(); + m_FileMetaMetadataManager.SeekToFileBegin(); + } + // last attempt to clean up datafile if called with append mode, + // data existed but index was missing + if (m_Aggregator->m_IsAggregator) + { + m_FileDataManager.SeekTo(0); + } + } + + if (m_Comm.Rank() == 0) + { + m_WriterDataPos.resize(m_Comm.Size()); + } + + if (!m_WriterStep || + m_AppendWriterCount != static_cast(m_Comm.Size()) || + m_AppendAggregatorCount != + static_cast(m_Aggregator->m_NumAggregators) || + m_AppendSubfileCount != + static_cast(m_Aggregator->m_SubStreams)) + { + // new Writer Map is needed, generate now, write later + const uint64_t a = + static_cast(m_Aggregator->m_SubStreamIndex); + m_WriterSubfileMap = m_Comm.GatherValues(a, 0); + } +} + +void DaosWriter::EnterComputationBlock() noexcept +{ + if (m_Parameters.AsyncWrite && !m_BetweenStepPairs) + { + m_ComputationBlockStart = Now(); + { + m_AsyncWriteLock.lock(); + m_InComputationBlock = true; + m_AsyncWriteLock.unlock(); + } + } +} + +void DaosWriter::ExitComputationBlock() noexcept +{ + if (m_Parameters.AsyncWrite && m_InComputationBlock) + { + double t = Seconds(Now() - m_ComputationBlockStart).count(); + { + m_AsyncWriteLock.lock(); + if (t > 0.1) // only register long enough intervals + { + m_ComputationBlockTimes.emplace_back(m_ComputationBlockID, t); + m_ComputationBlocksLength += t; + } + m_InComputationBlock = false; + ++m_ComputationBlockID; + m_AsyncWriteLock.unlock(); + } + } +} + +void DaosWriter::FlushData(const bool isFinal) +{ + BufferV *DataBuf; + if (m_Parameters.BufferVType == (int)BufferVType::MallocVType) + { + DataBuf = m_BP5Serializer.ReinitStepData( + new MallocV("DaosWriter", false, m_BP5Serializer.m_BufferAlign, + m_BP5Serializer.m_BufferBlockSize, + m_Parameters.InitialBufferSize, + m_Parameters.GrowthFactor), + m_Parameters.AsyncWrite || m_Parameters.DirectIO); + } + else + { + DataBuf = m_BP5Serializer.ReinitStepData( + new ChunkV("DaosWriter", false, m_BP5Serializer.m_BufferAlign, + m_BP5Serializer.m_BufferBlockSize, + 
m_Parameters.BufferChunkSize), + m_Parameters.AsyncWrite || m_Parameters.DirectIO); + } + + auto databufsize = DataBuf->Size(); + WriteData(DataBuf); + /* DataBuf is deleted in WriteData() */ + DataBuf = nullptr; + + m_ThisTimestepDataSize += databufsize; + + if (!isFinal) + { + size_t tmp[2]; + // aggregate start pos and data size to rank 0 + tmp[0] = m_StartDataPos; + tmp[1] = databufsize; + + std::vector RecvBuffer; + if (m_Comm.Rank() == 0) + { + RecvBuffer.resize(m_Comm.Size() * 2); + } + m_Comm.GatherArrays(tmp, 2, RecvBuffer.data(), 0); + if (m_Comm.Rank() == 0) + { + FlushPosSizeInfo.push_back(RecvBuffer); + } + } +} + +void DaosWriter::Flush(const int transportIndex) {} + +void DaosWriter::PerformDataWrite() { FlushData(false); } + +void DaosWriter::DestructorClose(bool Verbose) noexcept +{ + if (Verbose) + { + std::cerr << "BP5 Writer \"" << m_Name + << "\" Destroyed without a prior Close()." << std::endl; + std::cerr << "This may result in corrupt output." << std::endl; + } + // close metadata index file + UpdateActiveFlag(false); + m_IsOpen = false; +} + +DaosWriter::~DaosWriter() +{ + if (m_IsOpen) + { + DestructorClose(m_FailVerbose); + } + m_IsOpen = false; +} + +void DaosWriter::DoClose(const int transportIndex) +{ + PERFSTUBS_SCOPED_TIMER("DaosWriter::Close"); + + if ((m_WriterStep == 0) && !m_BetweenStepPairs) + { + /* never did begin step, do one now */ + BeginStep(StepMode::Update); + } + if (m_BetweenStepPairs) + { + EndStep(); + } + + TimePoint wait_start = Now(); + Seconds wait(0.0); + if (m_WriteFuture.valid()) + { + m_Profiler.Start("WaitOnAsync"); + m_AsyncWriteLock.lock(); + m_flagRush = true; + m_AsyncWriteLock.unlock(); + m_WriteFuture.get(); + wait += Now() - wait_start; + m_Profiler.Stop("WaitOnAsync"); + } + + m_FileDataManager.CloseFiles(transportIndex); + // Delete files from temporary storage if draining was on + + if (m_Comm.Rank() == 0) + { + // close metadata file + m_FileMetadataManager.CloseFiles(); + + // close metametadata file + m_FileMetaMetadataManager.CloseFiles(); + } + + if (m_Parameters.AsyncWrite) + { + // wait until all process' writing thread completes + m_Profiler.Start("WaitOnAsync"); + wait_start = Now(); + m_Comm.Barrier(); + AsyncWriteDataCleanup(); + wait += Now() - wait_start; + if (m_Comm.Rank() == 0 && m_Parameters.verbose > 0) + { + std::cout << "Close waited " << wait.count() + << " seconds on async threads" << std::endl; + } + m_Profiler.Stop("WaitOnAsync"); + } + + if (m_Comm.Rank() == 0) + { + if (m_Parameters.AsyncWrite) + { + WriteMetadataFileIndex(m_LatestMetaDataPos, m_LatestMetaDataSize); + } + // close metadata index file + UpdateActiveFlag(false); + m_FileMetadataIndexManager.CloseFiles(); + } + + FlushProfiler(); +} + +void DaosWriter::FlushProfiler() +{ + auto transportTypes = m_FileDataManager.GetTransportsTypes(); + + // find first File type output, where we can write the profile + int fileTransportIdx = -1; + for (size_t i = 0; i < transportTypes.size(); ++i) + { + if (transportTypes[i].compare(0, 4, "File") == 0) + { + fileTransportIdx = static_cast(i); + } + } + + auto transportProfilers = m_FileDataManager.GetTransportsProfilers(); + + auto transportTypesMD = m_FileMetadataManager.GetTransportsTypes(); + auto transportProfilersMD = m_FileMetadataManager.GetTransportsProfilers(); + + transportTypes.insert(transportTypes.end(), transportTypesMD.begin(), + transportTypesMD.end()); + + transportProfilers.insert(transportProfilers.end(), + transportProfilersMD.begin(), + transportProfilersMD.end()); + + // 
m_Profiler.WriteOut(transportTypes, transportProfilers); + + const std::string lineJSON( + m_Profiler.GetRankProfilingJSON(transportTypes, transportProfilers) + + ",\n"); + + const std::vector profilingJSON( + m_Profiler.AggregateProfilingJSON(lineJSON)); + + if (m_RankMPI == 0) + { + // std::cout << "write profiling file!" << std::endl; + std::string profileFileName; + if (m_DrainBB) + { + // auto bpTargetNames = + // m_BP4Serializer.GetBPBaseNames({m_Name}); + std::vector bpTargetNames = {m_Name}; + if (fileTransportIdx > -1) + { + profileFileName = + bpTargetNames[fileTransportIdx] + "/profiling.json"; + } + else + { + profileFileName = bpTargetNames[0] + "_profiling.json"; + } + m_FileDrainer.AddOperationWrite( + profileFileName, profilingJSON.size(), profilingJSON.data()); + } + else + { + transport::FileFStream profilingJSONStream(m_Comm); + // auto bpBaseNames = + // m_BP4Serializer.GetBPBaseNames({m_BBName}); + std::vector bpBaseNames = {m_Name}; + if (fileTransportIdx > -1) + { + profileFileName = + bpBaseNames[fileTransportIdx] + "/profiling.json"; + } + else + { + profileFileName = bpBaseNames[0] + "_profiling.json"; + } + profilingJSONStream.Open(profileFileName, Mode::Write); + profilingJSONStream.Write(profilingJSON.data(), + profilingJSON.size()); + profilingJSONStream.Close(); + } + } +} + +size_t DaosWriter::DebugGetDataBufferSize() const +{ + return m_BP5Serializer.DebugGetDataBufferSize(); +} + +void DaosWriter::PutCommon(VariableBase &variable, const void *values, + bool sync) +{ + if (!m_BetweenStepPairs) + { + BeginStep(StepMode::Update); + } + + // if the user buffer is allocated on the GPU always use sync mode + if (variable.GetMemorySpace(values) != MemorySpace::Host) + sync = true; + + size_t *Shape = NULL; + size_t *Start = NULL; + size_t *Count = NULL; + size_t DimCount = variable.m_Count.size(); + + if (variable.m_ShapeID == ShapeID::GlobalArray) + { + Shape = variable.m_Shape.data(); + Count = variable.m_Count.data(); + Start = variable.m_Start.data(); + } + else if (variable.m_ShapeID == ShapeID::LocalArray) + { + Count = variable.m_Count.data(); + } + else if (variable.m_ShapeID == ShapeID::JoinedArray) + { + Count = variable.m_Count.data(); + Shape = variable.m_Shape.data(); + } + + size_t ObjSize; + if (variable.m_Type == DataType::Struct) + { + ObjSize = variable.m_ElementSize; + } + else + { + ObjSize = helper::GetDataTypeSize(variable.m_Type); + } + + if (!sync) + { + /* If arrays is small, force copying to internal buffer to aggregate + * small writes */ + size_t n = helper::GetTotalSize(variable.m_Count) * ObjSize; + if (n < m_Parameters.MinDeferredSize) + { + sync = true; + } + } + + if (!variable.m_MemoryCount.empty()) + { + int DimCount = variable.m_Count.size(); + std::vector ZeroDims(DimCount); + // get a temporary span then fill with memselection now + format::BufferV::BufferPos bp5span(0, 0, 0); + m_BP5Serializer.Marshal((void *)&variable, variable.m_Name.c_str(), + variable.m_Type, variable.m_ElementSize, + DimCount, Shape, Count, Start, nullptr, false, + &bp5span); + void *ptr = + m_BP5Serializer.GetPtr(bp5span.bufferIdx, bp5span.posInBuffer); + + const bool sourceRowMajor = helper::IsRowMajor(m_IO.m_HostLanguage); + + helper::NdCopy( + (const char *)values, helper::CoreDims(ZeroDims), + variable.m_MemoryCount, sourceRowMajor, false, (char *)ptr, + variable.m_MemoryStart, variable.m_Count, sourceRowMajor, false, + ObjSize, helper::CoreDims(), helper::CoreDims(), helper::CoreDims(), + helper::CoreDims(), false /* safemode */, 
variable.m_MemSpace); + } + else + { + if (variable.m_Type == DataType::String) + { + std::string &source = *(std::string *)values; + void *p = &(source[0]); + m_BP5Serializer.Marshal((void *)&variable, variable.m_Name.c_str(), + variable.m_Type, variable.m_ElementSize, + DimCount, Shape, Count, Start, &p, sync, + nullptr); + } + else + m_BP5Serializer.Marshal((void *)&variable, variable.m_Name.c_str(), + variable.m_Type, variable.m_ElementSize, + DimCount, Shape, Count, Start, values, sync, + nullptr); + } +} + +#define declare_type(T) \ + void DaosWriter::DoPut(Variable &variable, \ + typename Variable::Span &span, \ + const bool initialize, const T &value) \ + { \ + PERFSTUBS_SCOPED_TIMER("DaosWriter::Put"); \ + PutCommonSpan(variable, span, initialize, value); \ + } + +ADIOS2_FOREACH_PRIMITIVE_STDTYPE_1ARG(declare_type) +#undef declare_type + +#define declare_type(T) \ + void DaosWriter::DoPutSync(Variable &variable, const T *data) \ + { \ + PutCommon(variable, data, true); \ + } \ + void DaosWriter::DoPutDeferred(Variable &variable, const T *data) \ + { \ + PutCommon(variable, data, false); \ + } + +ADIOS2_FOREACH_STDTYPE_1ARG(declare_type) +#undef declare_type + +#define declare_type(T, L) \ + T *DaosWriter::DoBufferData_##L(const int bufferIdx, \ + const size_t payloadPosition, \ + const size_t bufferID) noexcept \ + { \ + return reinterpret_cast( \ + m_BP5Serializer.GetPtr(bufferIdx, payloadPosition)); \ + } + +ADIOS2_FOREACH_PRIMITVE_STDTYPE_2ARGS(declare_type) +#undef declare_type + +void DaosWriter::DoPutStructSync(VariableStruct &variable, const void *data) +{ + PutCommon(variable, data, true); +} + +void DaosWriter::DoPutStructDeferred(VariableStruct &variable, const void *data) +{ + PutCommon(variable, data, false); +} + +void DaosWriter::daos_handle_share(daos_handle_t *hdl, int type) +{ + d_iov_t ghdl = {NULL, 0, 0}; + int rc; + + if (m_Comm.Rank() == 0) + { + /** fetch size of global handle */ + if (type == DaosWriter::HANDLE_POOL) + rc = daos_pool_local2global(*hdl, &ghdl); + else + rc = daos_cont_local2global(*hdl, &ghdl); + ASSERT(rc == 0, "local2global failed with %d", rc); + } + + /** broadcast size of global handle to all peers */ + MPI_Bcast(&ghdl.iov_buf_len, 1, MPI_UINT64_T, 0, MPI_COMM_WORLD); + + /** allocate buffer for global pool handle */ + ghdl.iov_buf = malloc(ghdl.iov_buf_len); + ghdl.iov_len = ghdl.iov_buf_len; + + if (m_Comm.Rank() == 0) + { + /** generate actual global handle to share with peer tasks */ + if (type == DaosWriter::HANDLE_POOL) + rc = daos_pool_local2global(*hdl, &ghdl); + else + rc = daos_cont_local2global(*hdl, &ghdl); + ASSERT(rc == 0, "local2global failed with %d", rc); + } + + /** broadcast global handle to all peers */ + MPI_Bcast(ghdl.iov_buf, ghdl.iov_len, MPI_BYTE, 0, MPI_COMM_WORLD); + + if (m_Comm.Rank() != 0) + { + /** unpack global handle */ + if (type == DaosWriter::HANDLE_POOL) + { + /* NB: Only pool_global2local are different */ + rc = daos_pool_global2local(ghdl, hdl); + } + else + { + rc = daos_cont_global2local(poh, ghdl, hdl); + } + ASSERT(rc == 0, "global2local failed with %d", rc); + } + + free(ghdl.iov_buf); + + MPI_Barrier(MPI_COMM_WORLD); +} + +} // end namespace engine +} // end namespace core +} // end namespace adios2 diff --git a/source/adios2/engine/daos/DaosWriter.h b/source/adios2/engine/daos/DaosWriter.h new file mode 100644 index 0000000000..3b1acbf2eb --- /dev/null +++ b/source/adios2/engine/daos/DaosWriter.h @@ -0,0 +1,403 @@ +/* + * Distributed under the OSI-approved Apache License, Version 2.0. 
See + * accompanying file Copyright.txt for details. + * + * DaosWriter.h + * + */ + +#ifndef ADIOS2_ENGINE_DAOS_DAOSWRITER_H_ +#define ADIOS2_ENGINE_DAOS_DAOSWRITER_H_ +#define DSS_PSETID "daos_server" + +#include "adios2/common/ADIOSConfig.h" +#include "adios2/core/CoreTypes.h" +#include "adios2/core/Engine.h" +#include "adios2/engine/daos/DaosEngine.h" +#include "adios2/helper/adiosComm.h" +#include "adios2/helper/adiosMemory.h" // PaddingToAlignOffset +#include "adios2/toolkit/aggregator/mpi/MPIChain.h" +#include "adios2/toolkit/aggregator/mpi/MPIShmChain.h" +#include "adios2/toolkit/burstbuffer/FileDrainerSingleThread.h" +#include "adios2/toolkit/format/bp5/BP5Serializer.h" +#include "adios2/toolkit/format/buffer/BufferV.h" +#include "adios2/toolkit/shm/Spinlock.h" +#include "adios2/toolkit/shm/TokenChain.h" +#include "adios2/toolkit/transportman/TransportMan.h" +#include +#include +#include + +#define FAIL(fmt, ...) \ + do \ + { \ + fprintf(stderr, "Process %d(%s): " fmt " aborting\n", m_Comm.Rank(), \ + node, ##__VA_ARGS__); \ + MPI_Abort(MPI_COMM_WORLD, 1); \ + } while (0) +#define ASSERT(cond, ...) \ + do \ + { \ + if (!(cond)) \ + FAIL(__VA_ARGS__); \ + } while (0) + +namespace adios2 +{ +namespace core +{ +namespace engine +{ + +class DaosWriter : public DaosEngine, public core::Engine +{ + +public: + /** + * Constructor for file Writer in Daos format + * @param name unique name given to the engine + * @param openMode w (supported), r, a from OpenMode in ADIOSTypes.h + * @param comm multi-process communicator + */ + DaosWriter(IO &io, const std::string &name, const Mode mode, + helper::Comm comm); + + ~DaosWriter(); + + StepStatus BeginStep(StepMode mode, + const float timeoutSeconds = -1.0) final; + size_t CurrentStep() const final; + void PerformPuts() final; + void PerformDataWrite() final; + void EndStep() final; + void Flush(const int transportIndex = -1) final; + + size_t DebugGetDataBufferSize() const final; + +private: + /** Single object controlling BP buffering */ + format::BP5Serializer m_BP5Serializer; + + /** Manage BP data files Transports from IO AddTransport */ + transportman::TransportMan m_FileDataManager; + + /** Manages the optional collective metadata files */ + transportman::TransportMan m_FileMetadataManager; + + /* transport manager for managing the metadata index file */ + transportman::TransportMan m_FileMetadataIndexManager; + + transportman::TransportMan m_FileMetaMetadataManager; + + /* DAOS declarations */ + + uuid_t pool_uuid, cont_uuid; + char *pool_label = "pool_ranjansv"; + char *cont_label = "adios-daos-engine-cont"; + + /* Declare variables for pool and container handles */ + daos_handle_t poh, coh; + + enum DAOS_handleType + { + HANDLE_POOL, + HANDLE_CO, + }; + + /* Declare variables for the KV object */ + daos_handle_t oh; + daos_obj_id_t oid; + + char node[128] = "unknown"; + + int64_t m_WriterStep = 0; + /* + * Burst buffer variables + */ + /** true if burst buffer is used to write */ + bool m_WriteToBB = false; + /** true if burst buffer is drained to disk */ + bool m_DrainBB = true; + /** File drainer thread if burst buffer is used */ + burstbuffer::FileDrainerSingleThread m_FileDrainer; + /** m_Name modified with burst buffer path if BB is used, + * == m_Name otherwise. 
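+     * (for illustration: BurstBufferPath="/mnt/bb" with m_Name="run/out.bp"
+     * yields m_BBName="/mnt/bb/run/out.bp")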
+ * m_Name is a constant of Engine and is the user provided target path + */ + std::string m_BBName; + /* Name of subfiles to directly write to (for all transports) + * This is either original target or burst buffer if used */ + std::vector m_SubStreamNames; + /* Name of subfiles on target if burst buffer is used (for all transports) + */ + std::vector m_DrainSubStreamNames; + std::vector m_MetadataFileNames; + std::vector m_DrainMetadataFileNames; + std::vector m_MetaMetadataFileNames; + std::vector m_MetadataIndexFileNames; + std::vector m_DrainMetadataIndexFileNames; + std::vector m_ActiveFlagFileNames; + + bool m_BetweenStepPairs = false; + + void Init() final; + + /** Parses parameters from IO SetParameters */ + void InitParameters() final; + /** Set up the aggregator */ + void InitAggregator(); + /** Complete opening/createing metadata and data files */ + void InitTransports() final; + /** DAOS pool connection and container opening */ + void InitDAOS(); + /** Allocates memory and starts a PG group */ + void InitBPBuffer(); + void NotifyEngineAttribute(std::string name, DataType type) noexcept; + /** Notify the engine when a new attribute is defined or modified. Called + * from IO.tcc + */ + void NotifyEngineAttribute(std::string name, AttributeBase *Attr, + void *data) noexcept; + + void EnterComputationBlock() noexcept; + /** Inform about computation block through User->ADIOS->IO */ + void ExitComputationBlock() noexcept; + +#define declare_type(T) \ + void DoPut(Variable &variable, typename Variable::Span &span, \ + const bool initialize, const T &value) final; + + ADIOS2_FOREACH_PRIMITIVE_STDTYPE_1ARG(declare_type) +#undef declare_type + + template + void PutCommonSpan(Variable &variable, typename Variable::Span &span, + const bool initialize, const T &value); + +#define declare_type(T) \ + void DoPutSync(Variable &, const T *) final; \ + void DoPutDeferred(Variable &, const T *) final; + + ADIOS2_FOREACH_STDTYPE_1ARG(declare_type) +#undef declare_type + + void PutCommon(VariableBase &variable, const void *data, bool sync); + +#define declare_type(T, L) \ + T *DoBufferData_##L(const int bufferIdx, const size_t payloadPosition, \ + const size_t bufferID = 0) noexcept final; + + ADIOS2_FOREACH_PRIMITVE_STDTYPE_2ARGS(declare_type) +#undef declare_type + + void DoPutStructSync(VariableStruct &, const void *) final; + void DoPutStructDeferred(VariableStruct &, const void *) final; + + void PutStruct(VariableStruct &, const void *, bool); + + void FlushData(const bool isFinal = false); + + void DoClose(const int transportIndex = -1) final; + + /** Write a profiling.json file from m_BP1Writer and m_TransportsManager + * profilers*/ + void WriteProfilingJSONFile(); + + void WriteMetaMetadata( + const std::vector MetaMetaBlocks); + + void WriteMetadataFileIndex(uint64_t MetaDataPos, uint64_t MetaDataSize); + + uint64_t WriteMetadata(const std::vector &MetaDataBlocks, + const std::vector &AttributeBlocks); + + /** Write Data to disk, in an aggregator chain */ + void WriteData(format::BufferV *Data); + void WriteData_EveryoneWrites(format::BufferV *Data, + bool SerializedWriters); + void WriteData_EveryoneWrites_Async(format::BufferV *Data, + bool SerializedWriters); + void WriteData_TwoLevelShm(format::BufferV *Data); + void WriteData_TwoLevelShm_Async(format::BufferV *Data); + + void UpdateActiveFlag(const bool active); + + void WriteCollectiveMetadataFile(const bool isFinal = false); + + void MarshalAttributes(); + + /* Two-level-shm aggregator functions */ + void 
WriteMyOwnData(format::BufferV *Data); + void SendDataToAggregator(format::BufferV *Data); + void WriteOthersData(const size_t TotalSize); + + template + void PerformPutCommon(Variable &variable); + + void FlushProfiler(); + + /** manages all communication tasks in aggregation */ + aggregator::MPIAggregator *m_Aggregator; // points to one of these below + aggregator::MPIShmChain m_AggregatorTwoLevelShm; + aggregator::MPIChain m_AggregatorEveroneWrites; + bool m_IAmDraining = false; + bool m_IAmWritingData = false; + helper::Comm *DataWritingComm; // processes that write the same data file + // aggregators only (valid if m_Aggregator->m_Comm.Rank() == 0) + helper::Comm m_CommAggregators; + adios2::profiling::JSONProfiler m_Profiler; + +protected: + virtual void DestructorClose(bool Verbose) noexcept; + +private: + // updated during WriteMetaData + uint64_t m_MetaDataPos = 0; + + /** On every process, at the end of writing, this holds the offset + * where they started writing (needed for global metadata) + */ + uint64_t m_StartDataPos = 0; + /** On aggregators, at the end of writing, this holds the starting offset + * to the next step's writing; otherwise used as temporary offset variable + * during writing on every process and points to the end of the process' + * data block in the file (not used for anything) + */ + uint64_t m_DataPos = 0; + + /* + * Total data written this timestep + */ + uint64_t m_ThisTimestepDataSize = 0; + + /** rank 0 collects m_StartDataPos in this vector for writing it + * to the index file + */ + std::vector m_WriterDataPos; + + bool m_MarshalAttributesNecessary = true; + + std::vector> FlushPosSizeInfo; + + void MakeHeader(std::vector &buffer, size_t &position, + const std::string fileType, const bool isActive); + + std::vector m_WriterSubfileMap; // rank => subfile index + + // Append helper data + std::vector m_AppendDataPos; // each subfile append pos + size_t m_AppendMetadataPos; // metadata file append pos + size_t m_AppendMetaMetadataPos; // meta-metadata file append pos + size_t m_AppendMetadataIndexPos; // index file append pos + uint32_t m_AppendWriterCount; // last active number of writers + unsigned int m_AppendAggregatorCount; // last active number of aggr + unsigned int m_AppendSubfileCount; // last active number of subfiles + /* Process existing index, fill in append variables, + * and return the actual step we land after appending. 
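+     * (e.g. with AppendAfterStep=3, new steps continue after existing step 3
+     * and any steps previously written beyond it are discarded)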
+     * Uses parameter AppendAfterStep
+     * It resets m_Aggregator->m_NumAggregators, so aggregators must be
+     * initialized afterwards
+     */
+    uint64_t CountStepsInMetadataIndex(format::BufferSTL &bufferSTL);
+
+    /* Async write's future */
+    std::future<int> m_WriteFuture;
+    // variables to delay writing to index file
+    uint64_t m_LatestMetaDataPos;
+    uint64_t m_LatestMetaDataSize;
+    Seconds m_LastTimeBetweenSteps = Seconds(0.0);
+    Seconds m_TotalTimeBetweenSteps = Seconds(0.0);
+    Seconds m_AvgTimeBetweenSteps = Seconds(0.0);
+    Seconds m_ExpectedTimeBetweenSteps = Seconds(0.0);
+    TimePoint m_EndStepEnd;
+    TimePoint m_EngineStart;
+    TimePoint m_BeginStepStart;
+    bool m_flagRush; // main thread flips this in Close, async thread watches it
+    bool m_InComputationBlock = false; // main thread flips this in Close
+    TimePoint m_ComputationBlockStart;
+    /* block counter and length in seconds */
+    size_t m_ComputationBlockID = 0;
+
+    struct ComputationBlockInfo
+    {
+        size_t blockID;
+        double length; // seconds
+        ComputationBlockInfo(const size_t id, const double len)
+        : blockID(id), length(len){};
+    };
+
+    std::vector<ComputationBlockInfo> m_ComputationBlockTimes;
+    /* sum of computationBlockTimes at start of async IO */
+    double m_ComputationBlocksLength = 0.0;
+
+    /* struct of data passed from main thread to async write thread at launch */
+    struct AsyncWriteInfo
+    {
+        adios2::aggregator::MPIAggregator *aggregator;
+        int rank_global;
+        helper::Comm comm_chain;
+        int rank_chain;
+        int nproc_chain;
+        TimePoint tstart;
+        adios2::shm::TokenChain<uint64_t> *tokenChain;
+        transportman::TransportMan *tm;
+        adios2::format::BufferV *Data;
+        uint64_t startPos;
+        uint64_t totalSize;
+        double deadline;          // wall-clock time available in seconds
+        bool *flagRush;           // flipped from false to true by main thread
+        bool *inComputationBlock; // flipped back and forth by main thread
+        // comm-free time within deadline in seconds
+        double computationBlocksLength;
+        std::vector<ComputationBlockInfo> expectedComputationBlocks; // a copy
+        std::vector<ComputationBlockInfo>
+            *currentComputationBlocks;     // extended by main thread
+        size_t *currentComputationBlockID; // increased by main thread
+        shm::Spinlock *lock; // race condition over currentComp* variables
+    };
+
+    AsyncWriteInfo *m_AsyncWriteInfo;
+    /* lock to handle race condition over the following currentComp* variables
+       m_InComputationBlock / AsyncWriteInfo::inComputationBlock
+       m_ComputationBlockID / AsyncWriteInfo::currentComputationBlockID
+       m_flagRush / AsyncWriteInfo::flagRush
+       Currently not used:
+       m_ComputationBlockTimes / AsyncWriteInfo::currentComputationBlocks
+       Note: The rush flag does not need protection but the CI TSAN sanitizer
+       screams data race if not protected.
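+       Hence both the main thread and the async thread take m_AsyncWriteLock
+       around every access to these variables.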
+ */ + shm::Spinlock m_AsyncWriteLock; + + /* Static functions that will run in another thread */ + static int AsyncWriteThread_EveryoneWrites(AsyncWriteInfo *info); + static int AsyncWriteThread_TwoLevelShm(AsyncWriteInfo *info); + static void AsyncWriteThread_TwoLevelShm_Aggregator(AsyncWriteInfo *info); + static void AsyncWriteThread_TwoLevelShm_SendDataToAggregator( + aggregator::MPIShmChain *a, format::BufferV *Data); + + /* write own data used by both + EveryoneWrites and TwoLevelShm async threads */ + static void AsyncWriteOwnData(AsyncWriteInfo *info, + std::vector &DataVec, + const size_t totalsize, + const bool seekOnFirstWrite); + enum class ComputationStatus + { + InComp, + NotInComp_ExpectMore, + NoMoreComp + }; + static ComputationStatus IsInComputationBlock(AsyncWriteInfo *info, + size_t &compBlockIdx); + + void AsyncWriteDataCleanup(); + void AsyncWriteDataCleanup_EveryoneWrites(); + void AsyncWriteDataCleanup_TwoLevelShm(); + + void daos_handle_share(daos_handle_t *, int); +}; + +} // end namespace engine +} // end namespace core +} // end namespace adios2 + +#endif /* ADIOS2_ENGINE_DAOS_DAOSWRITER_H_ */ diff --git a/source/adios2/engine/daos/DaosWriter.tcc b/source/adios2/engine/daos/DaosWriter.tcc new file mode 100644 index 0000000000..9b72a65f5b --- /dev/null +++ b/source/adios2/engine/daos/DaosWriter.tcc @@ -0,0 +1,97 @@ +/* + * Distributed under the OSI-approved Apache License, Version 2.0. See + * accompanying file Copyright.txt for details. + * + * DaosWriter.tcc implementation of template functions with known type + * + */ +#ifndef ADIOS2_ENGINE_DAOS_DAOSWRITER_TCC_ +#define ADIOS2_ENGINE_DAOS_DAOSWRITER_TCC_ + +#include "DaosWriter.h" +#include "adios2/helper/adiosMath.h" + +namespace adios2 +{ +namespace core +{ +namespace engine +{ + +template +void DaosWriter::PutCommonSpan(Variable &variable, + typename Variable::Span &span, + const bool initialize, const T &value) +{ + format::BufferV::BufferPos bp5span(0, 0, 0); + + size_t *Shape = NULL; + size_t *Start = NULL; + size_t *Count = NULL; + size_t DimCount = 0; + + if (!m_BetweenStepPairs) + { + BeginStep(StepMode::Update); + } + if (variable.m_ShapeID == ShapeID::GlobalArray) + { + DimCount = variable.m_Shape.size(); + Shape = variable.m_Shape.data(); + Start = variable.m_Start.data(); + Count = variable.m_Count.data(); + } + else if (variable.m_ShapeID == ShapeID::JoinedArray) + { + Shape = variable.m_Shape.data(); + DimCount = variable.m_Count.size(); + Count = variable.m_Count.data(); + } + else if (variable.m_ShapeID == ShapeID::LocalArray) + { + DimCount = variable.m_Count.size(); + Count = variable.m_Count.data(); + } + + if (std::is_same::value) + { + m_BP5Serializer.Marshal((void *)&variable, variable.m_Name.c_str(), + variable.m_Type, variable.m_ElementSize, + DimCount, Shape, Count, Start, nullptr, false, + &bp5span); + } + else + m_BP5Serializer.Marshal((void *)&variable, variable.m_Name.c_str(), + variable.m_Type, variable.m_ElementSize, + DimCount, Shape, Count, Start, nullptr, false, + &bp5span); + + span.m_PayloadPosition = bp5span.posInBuffer; + span.m_BufferIdx = bp5span.bufferIdx; + span.m_Value = value; + + /* initialize buffer if needed */ + if (initialize) + { + const size_t ElemCount = m_BP5Serializer.CalcSize(DimCount, Count); + T *itBegin = reinterpret_cast( + m_BP5Serializer.GetPtr(span.m_BufferIdx, span.m_PayloadPosition)); + + // TODO from BP4: does std::fill_n have a bug in gcc or due to + // optimizations this is impossible due to memory alignment? 
This seg + // faults in Release mode only . Even RelWithDebInfo works, replacing + // with explicit loop below using access operator [] + // std::fill_n(itBegin, blockSize, span->m_Value); + + for (size_t i = 0; i < ElemCount; ++i) + { + itBegin[i] = value; + } + } +} + +} // end namespace engine +} // end namespace core +} // end namespace adios2 + +#endif /* ADIOS2_ENGINE_DAOS_DAOSWRITER_TCC_ */ diff --git a/source/adios2/engine/daos/DaosWriter_EveryoneWrites_Async.cpp b/source/adios2/engine/daos/DaosWriter_EveryoneWrites_Async.cpp new file mode 100644 index 0000000000..4549ed9dc8 --- /dev/null +++ b/source/adios2/engine/daos/DaosWriter_EveryoneWrites_Async.cpp @@ -0,0 +1,357 @@ +/* + * Distributed under the OSI-approved Apache License, Version 2.0. See + * accompanying file Copyright.txt for details. + * + * BP5Writer.cpp + * + */ + +#include "DaosWriter.h" +#include "DaosWriter.tcc" + +#include "adios2/common/ADIOSMacros.h" +#include "adios2/core/IO.h" +#include "adios2/helper/adiosFunctions.h" //CheckIndexRange +#include "adios2/toolkit/format/buffer/chunk/ChunkV.h" +#include "adios2/toolkit/format/buffer/malloc/MallocV.h" +#include "adios2/toolkit/transport/file/FileFStream.h" +#include + +#include // max +#include +#include + +namespace adios2 +{ +namespace core +{ +namespace engine +{ + +using namespace adios2::format; + +DaosWriter::ComputationStatus +DaosWriter::IsInComputationBlock(AsyncWriteInfo *info, size_t &compBlockIdx) +{ + ComputationStatus compStatus = ComputationStatus::NotInComp_ExpectMore; + size_t nExpectedBlocks = info->expectedComputationBlocks.size(); + + if (compBlockIdx >= nExpectedBlocks) + { + compStatus = ComputationStatus::NoMoreComp; + } + else + { + bool inComp = false; + size_t compBlockID = 0; + // access variables modified by main thread to avoid data race + info->lock->lock(); + compBlockID = *info->currentComputationBlockID; + inComp = *info->inComputationBlock; + info->lock->unlock(); + + /* Track which computation block we are in */ + if (inComp) + { + while (compBlockIdx < nExpectedBlocks && + info->expectedComputationBlocks[compBlockIdx].blockID < + compBlockID) + { + ++compBlockIdx; + } + if (info->expectedComputationBlocks[compBlockIdx].blockID > + compBlockID) + { + // the current computation block is a short one that was not + // recorded + compStatus = ComputationStatus::NotInComp_ExpectMore; + } + else + { + compStatus = ComputationStatus::InComp; + } + } + } + return compStatus; +} + +void DaosWriter::AsyncWriteOwnData(AsyncWriteInfo *info, + std::vector &DataVec, + const size_t totalsize, + const bool seekOnFirstWrite) +{ + /* local variables to track variables modified by main thread */ + size_t compBlockIdx = 0; /* position in vector to get length */ + + /* In a loop, write the data in smaller blocks */ + size_t nBlocks = DataVec.size(); + size_t wrote = 0; + size_t block = 0; + size_t temp_offset = 0; + size_t max_size = std::max(1024 * 1024UL, totalsize / 100UL); + + bool firstWrite = seekOnFirstWrite; + while (block < nBlocks) + { + bool doRush = false; + bool doSleep = false; + + info->lock->lock(); + doRush = *info->flagRush; + info->lock->unlock(); + + if (!doRush) + { + ComputationStatus compStatus = + IsInComputationBlock(info, compBlockIdx); + + /* Scheduling decisions: + Cases: + 1. Not in a computation block AND we still expect more + computation blocks down the line ==> Sleep + 2. In computation block ==> Write + 3. We are at the end of a computation block (how close??) 
AND we + still expect more computation blocks down the line 3. ==> Sleep + 4. We are at the end of the LAST computation block ==> Write + 5. No more computation blocks expected ==> Write all at once + 6. Main thread set flagRush ==> Write all at once + -- case 3 not handled yet properly + */ + + switch (compStatus) + { + case ComputationStatus::NotInComp_ExpectMore: + // case 1 + doSleep = true; + break; + case ComputationStatus::NoMoreComp: + // case 5 + doRush = true; + break; + default: + // cases 2, 3, 4 + break; + } + } + + if (doRush) + { + auto vec = std::vector(DataVec.begin() + block, + DataVec.end()); + vec[0].iov_base = + (const char *)DataVec[block].iov_base + temp_offset; + vec[0].iov_len = DataVec[block].iov_len - temp_offset; + size_t pos = MaxSizeT; // <==> no seek inside WriteFileAt + if (firstWrite) + { + pos = info->startPos + wrote; // seek to pos + } + /*std::cout << "Async write on Rank " << info->rank_global + << " write the rest of " << totalsize - wrote + << " bytes at pos " << pos << std::endl;*/ + + info->tm->WriteFileAt(vec.data(), vec.size(), pos); + + break; /* Exit loop after this final write */ + } + + if (doSleep) + { + std::this_thread::sleep_for(core::Seconds(0.01)); + continue; + } + + /* Write next batch of data */ + + /* Get the next n bytes from the current block, current offset */ + size_t n = DataVec[block].iov_len - temp_offset; + if (n > max_size) + { + n = max_size; + } + + if (firstWrite) + { + info->tm->WriteFileAt((const char *)DataVec[block].iov_base + + temp_offset, + n, info->startPos); + firstWrite = false; + } + else + { + info->tm->WriteFiles( + (const char *)DataVec[block].iov_base + temp_offset, n); + } + + /* Have we processed the entire block or staying with it? */ + if (n + temp_offset < DataVec[block].iov_len) + { + temp_offset += n; + } + else + { + temp_offset = 0; + ++block; + } + wrote += n; + } +}; + +int DaosWriter::AsyncWriteThread_EveryoneWrites(AsyncWriteInfo *info) +{ + if (info->tokenChain) + { + if (info->rank_chain > 0) + { + info->tokenChain->RecvToken(); + } + } + + std::vector DataVec = info->Data->DataVec(); + const uint64_t mysize = info->Data->Size(); + AsyncWriteOwnData(info, DataVec, mysize, true); + + if (info->tokenChain) + { + uint64_t t = 1; + info->tokenChain->SendToken(t); + if (!info->rank_chain) + { + info->tokenChain->RecvToken(); + } + } + delete info->Data; + return 1; +}; + +void DaosWriter::WriteData_EveryoneWrites_Async(format::BufferV *Data, + bool SerializedWriters) +{ + + const aggregator::MPIChain *a = + dynamic_cast(m_Aggregator); + + // new step writing starts at offset m_DataPos on aggregator + // others will wait for the position to arrive from the rank below + + if (a->m_Comm.Rank() > 0) + { + a->m_Comm.Recv( + &m_DataPos, 1, a->m_Comm.Rank() - 1, 0, + "Chain token in DaosWriter::WriteData_EveryoneWrites_Async"); + } + + // align to PAGE_SIZE + m_DataPos += + helper::PaddingToAlignOffset(m_DataPos, m_Parameters.StripeSize); + m_StartDataPos = m_DataPos; + + if (a->m_Comm.Rank() < a->m_Comm.Size() - 1) + { + uint64_t nextWriterPos = m_DataPos + Data->Size(); + a->m_Comm.Isend( + &nextWriterPos, 1, a->m_Comm.Rank() + 1, 0, + "Chain token in DaosWriter::WriteData_EveryoneWrites_Async"); + } + + m_DataPos += Data->Size(); + + /* a->comm can span multiple nodes but we need comm inside a node + when doing serialized aggregation */ + m_AsyncWriteInfo = new AsyncWriteInfo(); + m_AsyncWriteInfo->aggregator = nullptr; + m_AsyncWriteInfo->rank_global = m_Comm.Rank(); + if (SerializedWriters) + { + 
m_AsyncWriteInfo->comm_chain = a->m_Comm.GroupByShm(); + m_AsyncWriteInfo->rank_chain = m_AsyncWriteInfo->comm_chain.Rank(); + m_AsyncWriteInfo->nproc_chain = m_AsyncWriteInfo->comm_chain.Size(); + m_AsyncWriteInfo->tokenChain = + new shm::TokenChain(&m_AsyncWriteInfo->comm_chain); + } + else + { + m_AsyncWriteInfo->comm_chain = helper::Comm(); // not needed + m_AsyncWriteInfo->rank_chain = a->m_Comm.Rank(); + m_AsyncWriteInfo->nproc_chain = a->m_Comm.Size(); + m_AsyncWriteInfo->tokenChain = nullptr; + } + m_AsyncWriteInfo->tstart = m_EngineStart; + m_AsyncWriteInfo->tm = &m_FileDataManager; + m_AsyncWriteInfo->Data = Data; + m_AsyncWriteInfo->startPos = m_StartDataPos; + m_AsyncWriteInfo->totalSize = Data->Size(); + m_AsyncWriteInfo->deadline = m_ExpectedTimeBetweenSteps.count(); + m_AsyncWriteInfo->flagRush = &m_flagRush; + m_AsyncWriteInfo->lock = &m_AsyncWriteLock; + + if (m_ComputationBlocksLength > 0.0 && + m_Parameters.AsyncWrite == (int)AsyncWrite::Guided) + { + m_AsyncWriteInfo->inComputationBlock = &m_InComputationBlock; + m_AsyncWriteInfo->computationBlocksLength = m_ComputationBlocksLength; + if (m_AsyncWriteInfo->deadline < m_ComputationBlocksLength) + { + m_AsyncWriteInfo->deadline = m_ComputationBlocksLength; + } + m_AsyncWriteInfo->expectedComputationBlocks = + m_ComputationBlockTimes; // copy! + m_AsyncWriteInfo->currentComputationBlocks = + &m_ComputationBlockTimes; // ptr! + m_AsyncWriteInfo->currentComputationBlockID = &m_ComputationBlockID; + + /* Clear current block tracker now so that async thread does not get + confused with the past info */ + m_ComputationBlockTimes.clear(); + m_ComputationBlocksLength = 0.0; + m_ComputationBlockID = 0; + } + else + { + if (m_Parameters.AsyncWrite == (int)AsyncWrite::Naive) + { + m_AsyncWriteInfo->deadline = 0; + } + m_AsyncWriteInfo->inComputationBlock = nullptr; + m_AsyncWriteInfo->computationBlocksLength = 0.0; + m_AsyncWriteInfo->currentComputationBlocks = nullptr; + m_AsyncWriteInfo->currentComputationBlockID = nullptr; + } + + m_WriteFuture = std::async( + std::launch::async, AsyncWriteThread_EveryoneWrites, m_AsyncWriteInfo); + + // At this point modifying Data in main thread is prohibited !!! + + if (a->m_Comm.Size() > 1) + { + // at the end, last rank sends back the final data pos to first rank + // so it can update its data pos + if (a->m_Comm.Rank() == a->m_Comm.Size() - 1) + { + a->m_Comm.Isend(&m_DataPos, 1, 0, 0, + "Final chain token in " + "DaosWriter::WriteData_EveryoneWrites_Async"); + } + if (a->m_Comm.Rank() == 0) + { + a->m_Comm.Recv( + &m_DataPos, 1, a->m_Comm.Size() - 1, 0, + "Chain token in DaosWriter::WriteData_EveryoneWrites_Async"); + } + } +} + +void DaosWriter::AsyncWriteDataCleanup_EveryoneWrites() +{ + if (m_AsyncWriteInfo->tokenChain) + { + delete m_AsyncWriteInfo->tokenChain; + } + delete m_AsyncWriteInfo; + m_AsyncWriteInfo = nullptr; +} + +} // end namespace engine +} // end namespace core +} // end namespace adios2 diff --git a/source/adios2/engine/daos/DaosWriter_TwoLevelShm.cpp b/source/adios2/engine/daos/DaosWriter_TwoLevelShm.cpp new file mode 100644 index 0000000000..9ef11dd74b --- /dev/null +++ b/source/adios2/engine/daos/DaosWriter_TwoLevelShm.cpp @@ -0,0 +1,298 @@ +/* + * Distributed under the OSI-approved Apache License, Version 2.0. See + * accompanying file Copyright.txt for details. 
+ * + * DaosWriter.cpp + * + */ + +#include "DaosWriter.h" + +#include "adios2/common/ADIOSMacros.h" +#include "adios2/core/IO.h" +#include "adios2/helper/adiosFunctions.h" //CheckIndexRange, PaddingToAlignOffset +#include "adios2/toolkit/format/buffer/chunk/ChunkV.h" +#include "adios2/toolkit/format/buffer/malloc/MallocV.h" +#include "adios2/toolkit/shm/TokenChain.h" +#include "adios2/toolkit/transport/file/FileFStream.h" +#include + +#include +#include +#include + +namespace adios2 +{ +namespace core +{ +namespace engine +{ + +using namespace adios2::format; + +void DaosWriter::WriteData_TwoLevelShm(format::BufferV *Data) +{ + aggregator::MPIShmChain *a = + dynamic_cast(m_Aggregator); + + // new step writing starts at offset m_DataPos on master aggregator + // other aggregators to the same file will need to wait for the position + // to arrive from the rank below + + // align to PAGE_SIZE (only valid on master aggregator at this point) + m_DataPos += + helper::PaddingToAlignOffset(m_DataPos, m_Parameters.StripeSize); + + // Each aggregator needs to know the total size they write + // This calculation is valid on aggregators only + std::vector mySizes = a->m_Comm.GatherValues(Data->Size()); + uint64_t myTotalSize = 0; + uint64_t maxSize = 0; + for (auto s : mySizes) + { + myTotalSize += s; + if (s > maxSize) + { + maxSize = s; + } + } + + if (a->m_Comm.Size() > 1) + { + size_t alignment_size = sizeof(max_align_t); + if (m_Parameters.DirectIO) + { + alignment_size = m_Parameters.DirectIOAlignOffset; + } + a->CreateShm(static_cast(maxSize), m_Parameters.MaxShmSize, + alignment_size); + } + + shm::TokenChain tokenChain(&a->m_Comm); + + if (a->m_IsAggregator) + { + // In each aggregator chain, send from master down the line + // these total sizes, so every aggregator knows where to start + if (a->m_AggregatorChainComm.Rank() > 0) + { + a->m_AggregatorChainComm.Recv( + &m_DataPos, 1, a->m_AggregatorChainComm.Rank() - 1, 0, + "AggregatorChain token in DaosWriter::WriteData_TwoLevelShm"); + // align to PAGE_SIZE + m_DataPos += helper::PaddingToAlignOffset(m_DataPos, + m_Parameters.StripeSize); + } + m_StartDataPos = m_DataPos; // metadata needs this info + if (a->m_AggregatorChainComm.Rank() < + a->m_AggregatorChainComm.Size() - 1) + { + uint64_t nextWriterPos = m_DataPos + myTotalSize; + a->m_AggregatorChainComm.Isend( + &nextWriterPos, 1, a->m_AggregatorChainComm.Rank() + 1, 0, + "Chain token in DaosWriter::WriteData"); + } + else if (a->m_AggregatorChainComm.Size() > 1) + { + // send back final position from last aggregator in file to master + // aggregator + uint64_t nextWriterPos = m_DataPos + myTotalSize; + a->m_AggregatorChainComm.Isend( + &nextWriterPos, 1, 0, 0, + "Chain token in DaosWriter::WriteData"); + } + + /*std::cout << "Rank " << m_Comm.Rank() + << " aggregator start writing step " << m_WriterStep + << " to subfile " << a->m_SubStreamIndex << " at pos " + << m_DataPos << " totalsize " << myTotalSize << std::endl;*/ + + // Send token to first non-aggregator to start filling shm + // Also informs next process its starting offset (for correct metadata) + uint64_t nextWriterPos = m_DataPos + Data->Size(); + tokenChain.SendToken(nextWriterPos); + + WriteMyOwnData(Data); + + /* Write from shm until every non-aggr sent all data */ + if (a->m_Comm.Size() > 1) + { + WriteOthersData(myTotalSize - Data->Size()); + } + + // Master aggregator needs to know where the last writing ended by the + // last aggregator in the chain, so that it can start from the correct + // position at the next 
output step + if (a->m_AggregatorChainComm.Size() > 1 && + !a->m_AggregatorChainComm.Rank()) + { + a->m_AggregatorChainComm.Recv( + &m_DataPos, 1, a->m_AggregatorChainComm.Size() - 1, 0, + "Chain token in DaosWriter::WriteData"); + } + } + else + { + // non-aggregators fill shared buffer in marching order + // they also receive their starting offset this way + m_StartDataPos = tokenChain.RecvToken(); + + /*std::cout << "Rank " << m_Comm.Rank() + << " non-aggregator recv token to fill shm = " + << m_StartDataPos << std::endl;*/ + + SendDataToAggregator(Data); + + uint64_t nextWriterPos = m_StartDataPos + Data->Size(); + tokenChain.SendToken(nextWriterPos); + } + + if (a->m_Comm.Size() > 1) + { + a->DestroyShm(); + } +} + +void DaosWriter::WriteMyOwnData(format::BufferV *Data) +{ + std::vector DataVec = Data->DataVec(); + m_StartDataPos = m_DataPos; + m_FileDataManager.WriteFileAt(DataVec.data(), DataVec.size(), + m_StartDataPos); + m_DataPos += Data->Size(); +} + +/*std::string DoubleBufferToString(const double *b, int n) +{ + std::ostringstream out; + out.precision(1); + out << std::fixed << "["; + char s[32]; + + for (int i = 0; i < n; ++i) + { + snprintf(s, sizeof(s), "%g", b[i]); + out << s; + if (i < n - 1) + { + out << ", "; + } + } + out << "]"; + return out.str(); +}*/ + +void DaosWriter::SendDataToAggregator(format::BufferV *Data) +{ + /* Only one process is running this function at once + See shmFillerToken in the caller function + + In a loop, copy the local data into the shared memory, alternating + between the two segments. + */ + + aggregator::MPIShmChain *a = + dynamic_cast(m_Aggregator); + + std::vector DataVec = Data->DataVec(); + size_t nBlocks = DataVec.size(); + + // size_t sent = 0; + size_t block = 0; + size_t temp_offset = 0; + while (block < nBlocks) + { + // potentially blocking call waiting on Aggregator + aggregator::MPIShmChain::ShmDataBuffer *b = a->LockProducerBuffer(); + // b->max_size: how much we can copy + // b->actual_size: how much we actually copy + b->actual_size = 0; + while (true) + { + /* Copy n bytes from the current block, current offset to shm + making sure to use up to shm_size bytes + */ + size_t n = DataVec[block].iov_len - temp_offset; + if (n > (b->max_size - b->actual_size)) + { + n = b->max_size - b->actual_size; + } + std::memcpy(&b->buf[b->actual_size], + (const char *)DataVec[block].iov_base + temp_offset, n); + b->actual_size += n; + + /* Have we processed the entire block or staying with it? 
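+               If n did not reach the end of DataVec[block], stay on this
+               block and remember the new offset; otherwise move on to the
+               next block.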
*/ + if (n + temp_offset < DataVec[block].iov_len) + { + temp_offset += n; + } + else + { + temp_offset = 0; + ++block; + } + + /* Have we reached the max allowed shm size ?*/ + if (b->actual_size >= b->max_size) + { + break; + } + if (block >= nBlocks) + { + break; + } + } + // sent += b->actual_size; + + /*if (m_RankMPI >= 42) + { + std::cout << "Rank " << m_Comm.Rank() + << " filled shm, data_size = " << b->actual_size + << " block = " << block + << " temp offset = " << temp_offset << " sent = " << sent + << " buf = " << static_cast(b->buf) << " = " + << DoubleBufferToString((double *)b->buf, + b->actual_size / sizeof(double)) + << std::endl; + }*/ + + a->UnlockProducerBuffer(); + } +} +void DaosWriter::WriteOthersData(size_t TotalSize) +{ + /* Only an Aggregator calls this function */ + aggregator::MPIShmChain *a = + dynamic_cast(m_Aggregator); + + size_t wrote = 0; + while (wrote < TotalSize) + { + // potentially blocking call waiting on some non-aggr process + aggregator::MPIShmChain::ShmDataBuffer *b = a->LockConsumerBuffer(); + + /*std::cout << "Rank " << m_Comm.Rank() + << " write from shm, data_size = " << b->actual_size + << " total so far = " << wrote + << " buf = " << static_cast(b->buf) << " = " + << DoubleBufferToString((double *)b->buf, + b->actual_size / sizeof(double)) + << std::endl;*/ + /*<< " buf = " << static_cast(b->buf) << " = [" + << (int)b->buf[0] << (int)b->buf[1] << "..." + << (int)b->buf[b->actual_size - 2] + << (int)b->buf[b->actual_size - 1] << "]" << std::endl;*/ + + // b->actual_size: how much we need to write + m_FileDataManager.WriteFiles(b->buf, b->actual_size); + + wrote += b->actual_size; + + a->UnlockConsumerBuffer(); + } + m_DataPos += TotalSize; +} + +} // end namespace engine +} // end namespace core +} // end namespace adios2 diff --git a/source/adios2/engine/daos/DaosWriter_TwoLevelShm_Async.cpp b/source/adios2/engine/daos/DaosWriter_TwoLevelShm_Async.cpp new file mode 100644 index 0000000000..4c632ff3ff --- /dev/null +++ b/source/adios2/engine/daos/DaosWriter_TwoLevelShm_Async.cpp @@ -0,0 +1,357 @@ +/* + * Distributed under the OSI-approved Apache License, Version 2.0. See + * accompanying file Copyright.txt for details. 
+ * + * DaosWriter.cpp + * + */ + +#include "DaosWriter.h" + +#include "adios2/common/ADIOSMacros.h" +#include "adios2/core/CoreTypes.h" +#include "adios2/core/IO.h" +#include "adios2/helper/adiosFunctions.h" //CheckIndexRange, PaddingToAlignOffset +#include "adios2/toolkit/format/buffer/chunk/ChunkV.h" +#include "adios2/toolkit/format/buffer/malloc/MallocV.h" +#include "adios2/toolkit/transport/file/FileFStream.h" +#include + +#include +#include +#include +#include + +namespace adios2 +{ +namespace core +{ +namespace engine +{ + +using namespace adios2::format; + +/* Aggregator part of the async two level aggregation Guided version + This process is the one writing to disk +*/ +void DaosWriter::AsyncWriteThread_TwoLevelShm_Aggregator(AsyncWriteInfo *info) +{ + aggregator::MPIShmChain *a = + dynamic_cast(info->aggregator); + uint64_t totalSize = info->totalSize; + + /* Write own data first */ + { + std::vector DataVec = info->Data->DataVec(); + const uint64_t mysize = info->Data->Size(); + info->tm->SeekTo(info->startPos); + AsyncWriteOwnData(info, DataVec, mysize, false); + totalSize -= mysize; + } + + /* Write from shm until every non-aggr sent all data */ + std::vector DataVec(1); + size_t wrote = 0; + while (wrote < totalSize) + { + /* Write the next shm block now */ + // potentially blocking call waiting on some non-aggr process + aggregator::MPIShmChain::ShmDataBuffer *b = a->LockConsumerBuffer(); + // b->actual_size: how much we need to write + DataVec[0].iov_base = b->buf; + DataVec[0].iov_len = b->actual_size; + AsyncWriteOwnData(info, DataVec, b->actual_size, false); + wrote += b->actual_size; + a->UnlockConsumerBuffer(); + } +} + +/* Non-aggregator part of the async two level aggregation. + This process passes data to Aggregator through SHM segment. + tokenChain in caller ensures only one process (per aggregator chain) + is running this function at a time +*/ +void DaosWriter::AsyncWriteThread_TwoLevelShm_SendDataToAggregator( + aggregator::MPIShmChain *a, format::BufferV *Data) +{ + /* In a loop, copy the local data into the shared memory, alternating + between the two segments. + */ + + std::vector DataVec = Data->DataVec(); + size_t nBlocks = DataVec.size(); + + // size_t sent = 0; + size_t block = 0; + size_t temp_offset = 0; + while (block < nBlocks) + { + // potentially blocking call waiting on Aggregator + aggregator::MPIShmChain::ShmDataBuffer *b = a->LockProducerBuffer(); + // b->max_size: how much we can copy + // b->actual_size: how much we actually copy + b->actual_size = 0; + while (true) + { + /* Copy n bytes from the current block, current offset to shm + making sure to use up to shm_size bytes + */ + size_t n = DataVec[block].iov_len - temp_offset; + if (n > (b->max_size - b->actual_size)) + { + n = b->max_size - b->actual_size; + } + std::memcpy(&b->buf[b->actual_size], + (const char *)DataVec[block].iov_base + temp_offset, n); + b->actual_size += n; + + /* Have we processed the entire block or staying with it? 
*/ + if (n + temp_offset < DataVec[block].iov_len) + { + temp_offset += n; + } + else + { + temp_offset = 0; + ++block; + } + + /* Have we reached the max allowed shm size ?*/ + if (b->actual_size >= b->max_size) + { + break; + } + if (block >= nBlocks) + { + break; + } + } + // sent += b->actual_size; + a->UnlockProducerBuffer(); + } +} + +int DaosWriter::AsyncWriteThread_TwoLevelShm(AsyncWriteInfo *info) +{ + /* DO NOT use MPI in this separate thread, including destroying + shm segments explicitely (a->DestroyShm) or implicitely (tokenChain) */ + Seconds ts = Now() - info->tstart; + // std::cout << "ASYNC rank " << info->rank_global + // << " starts at: " << ts.count() << std::endl; + aggregator::MPIShmChain *a = + dynamic_cast(info->aggregator); + if (a->m_IsAggregator) + { + // Send token to first non-aggregator to start filling shm + // Also informs next process its starting offset (for correct + // metadata) + uint64_t nextWriterPos = info->startPos + info->Data->Size(); + info->tokenChain->SendToken(nextWriterPos); + AsyncWriteThread_TwoLevelShm_Aggregator(info); + info->tokenChain->RecvToken(); + } + else + { + // non-aggregators fill shared buffer in marching order + // they also receive their starting offset this way + uint64_t startPos = info->tokenChain->RecvToken(); + AsyncWriteThread_TwoLevelShm_SendDataToAggregator(a, info->Data); + uint64_t nextWriterPos = startPos + info->Data->Size(); + info->tokenChain->SendToken(nextWriterPos); + } + delete info->Data; + + ts = Now() - info->tstart; + /*std::cout << "ASYNC " << info->rank_global << " ended at: " << ts.count() + << std::endl;*/ + return 1; +}; + +void DaosWriter::WriteData_TwoLevelShm_Async(format::BufferV *Data) +{ + aggregator::MPIShmChain *a = + dynamic_cast(m_Aggregator); + + // new step writing starts at offset m_DataPos on master aggregator + // other aggregators to the same file will need to wait for the position + // to arrive from the rank below + + // align to PAGE_SIZE (only valid on master aggregator at this point) + m_DataPos += + helper::PaddingToAlignOffset(m_DataPos, m_Parameters.StripeSize); + + // Each aggregator needs to know the total size they write + // This calculation is valid on aggregators only + std::vector mySizes = a->m_Comm.GatherValues(Data->Size()); + uint64_t myTotalSize = 0; + uint64_t maxSize = 0; + for (auto s : mySizes) + { + myTotalSize += s; + if (s > maxSize) + { + maxSize = s; + } + } + + if (a->m_Comm.Size() > 1) + { + size_t alignment_size = sizeof(max_align_t); + if (m_Parameters.DirectIO) + { + alignment_size = m_Parameters.DirectIOAlignOffset; + } + a->CreateShm(static_cast(maxSize), m_Parameters.MaxShmSize, + alignment_size); + } + + if (a->m_IsAggregator) + { + // In each aggregator chain, send from master down the line + // these total sizes, so every aggregator knows where to start + if (a->m_AggregatorChainComm.Rank() > 0) + { + a->m_AggregatorChainComm.Recv( + &m_DataPos, 1, a->m_AggregatorChainComm.Rank() - 1, 0, + "AggregatorChain token in DaosWriter::WriteData_TwoLevelShm"); + // align to PAGE_SIZE + m_DataPos += helper::PaddingToAlignOffset(m_DataPos, + m_Parameters.StripeSize); + } + m_StartDataPos = m_DataPos; // metadata needs this info + if (a->m_AggregatorChainComm.Rank() < + a->m_AggregatorChainComm.Size() - 1) + { + uint64_t nextWriterPos = m_DataPos + myTotalSize; + a->m_AggregatorChainComm.Isend( + &nextWriterPos, 1, a->m_AggregatorChainComm.Rank() + 1, 0, + "Chain token in DaosWriter::WriteData"); + } + else if (a->m_AggregatorChainComm.Size() > 1) + { + 
+
+void DaosWriter::WriteData_TwoLevelShm_Async(format::BufferV *Data)
+{
+    aggregator::MPIShmChain *a =
+        dynamic_cast<aggregator::MPIShmChain *>(m_Aggregator);
+
+    // New step writing starts at offset m_DataPos on the master aggregator.
+    // Other aggregators writing to the same file need to wait for the
+    // position to arrive from the previous rank in the chain.
+
+    // align to the stripe size (only valid on master aggregator at this
+    // point)
+    m_DataPos +=
+        helper::PaddingToAlignOffset(m_DataPos, m_Parameters.StripeSize);
+
+    // Each aggregator needs to know the total size it writes.
+    // This calculation is valid on aggregators only.
+    std::vector<uint64_t> mySizes = a->m_Comm.GatherValues(Data->Size());
+    uint64_t myTotalSize = 0;
+    uint64_t maxSize = 0;
+    for (auto s : mySizes)
+    {
+        myTotalSize += s;
+        if (s > maxSize)
+        {
+            maxSize = s;
+        }
+    }
+
+    if (a->m_Comm.Size() > 1)
+    {
+        size_t alignment_size = sizeof(max_align_t);
+        if (m_Parameters.DirectIO)
+        {
+            alignment_size = m_Parameters.DirectIOAlignOffset;
+        }
+        a->CreateShm(static_cast<size_t>(maxSize), m_Parameters.MaxShmSize,
+                     alignment_size);
+    }
+
+    if (a->m_IsAggregator)
+    {
+        // In each aggregator chain, send these total sizes from the master
+        // down the line, so every aggregator knows where to start
+        if (a->m_AggregatorChainComm.Rank() > 0)
+        {
+            a->m_AggregatorChainComm.Recv(
+                &m_DataPos, 1, a->m_AggregatorChainComm.Rank() - 1, 0,
+                "AggregatorChain token in "
+                "DaosWriter::WriteData_TwoLevelShm_Async");
+            // align to the stripe size
+            m_DataPos += helper::PaddingToAlignOffset(m_DataPos,
+                                                      m_Parameters.StripeSize);
+        }
+        m_StartDataPos = m_DataPos; // metadata needs this info
+        if (a->m_AggregatorChainComm.Rank() <
+            a->m_AggregatorChainComm.Size() - 1)
+        {
+            uint64_t nextWriterPos = m_DataPos + myTotalSize;
+            a->m_AggregatorChainComm.Isend(
+                &nextWriterPos, 1, a->m_AggregatorChainComm.Rank() + 1, 0,
+                "Chain token in DaosWriter::WriteData_TwoLevelShm_Async");
+        }
+        else if (a->m_AggregatorChainComm.Size() > 1)
+        {
+            // send back the final position from the last aggregator in the
+            // file to the master aggregator
+            uint64_t nextWriterPos = m_DataPos + myTotalSize;
+            a->m_AggregatorChainComm.Isend(
+                &nextWriterPos, 1, 0, 0,
+                "Chain token in DaosWriter::WriteData_TwoLevelShm_Async");
+        }
+
+        // The master aggregator needs to know where the last aggregator in
+        // the chain ended its writing, so that it can start from the correct
+        // position at the next output step
+        if (!a->m_AggregatorChainComm.Rank())
+        {
+            if (a->m_AggregatorChainComm.Size() > 1)
+            {
+                a->m_AggregatorChainComm.Recv(
+                    &m_DataPos, 1, a->m_AggregatorChainComm.Size() - 1, 0,
+                    "Chain token in DaosWriter::WriteData_TwoLevelShm_Async");
+            }
+            else
+            {
+                m_DataPos = m_StartDataPos + myTotalSize;
+            }
+        }
+    }
+
+    /*std::cout << "Rank " << m_Comm.Rank() << " start data async "
+              << " to subfile " << a->m_SubStreamIndex << " at pos "
+              << m_StartDataPos << std::endl;*/
+
+    m_AsyncWriteInfo = new AsyncWriteInfo();
+    m_AsyncWriteInfo->aggregator = m_Aggregator;
+    m_AsyncWriteInfo->rank_global = m_Comm.Rank();
+    m_AsyncWriteInfo->rank_chain = a->m_Comm.Rank();
+    m_AsyncWriteInfo->nproc_chain = a->m_Comm.Size();
+    m_AsyncWriteInfo->comm_chain = helper::Comm(); // unused in this aggregation
+    m_AsyncWriteInfo->tstart = m_EngineStart;
+    m_AsyncWriteInfo->tokenChain = new shm::TokenChain<uint64_t>(&a->m_Comm);
+    m_AsyncWriteInfo->tm = &m_FileDataManager;
+    m_AsyncWriteInfo->Data = Data;
+    m_AsyncWriteInfo->flagRush = &m_flagRush;
+    m_AsyncWriteInfo->lock = &m_AsyncWriteLock;
+
+    // Metadata collection needs m_StartDataPos correctly set on
+    // every process before we call the async writing thread
+    if (a->m_IsAggregator)
+    {
+        // Informs the next process of its starting offset (for correct
+        // metadata)
+        uint64_t nextWriterPos = m_StartDataPos + Data->Size();
+        m_AsyncWriteInfo->tokenChain->SendToken(nextWriterPos);
+        m_AsyncWriteInfo->tokenChain->RecvToken();
+    }
+    else
+    {
+        // non-aggregators fill the shared buffer in marching order;
+        // they also receive their starting offset this way
+        m_StartDataPos = m_AsyncWriteInfo->tokenChain->RecvToken();
+        uint64_t nextWriterPos = m_StartDataPos + Data->Size();
+        m_AsyncWriteInfo->tokenChain->SendToken(nextWriterPos);
+    }
+
+    // Launch the data writing thread; m_StartDataPos is valid here.
+    // m_DataPos already points to the end of the write, do not use it here.
+    m_AsyncWriteInfo->startPos = m_StartDataPos;
+    m_AsyncWriteInfo->totalSize = myTotalSize;
+    m_AsyncWriteInfo->deadline = m_ExpectedTimeBetweenSteps.count();
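+
+    /* Configure throttling for the async thread: in Guided mode it receives
+       the deadline, the expected computation-block schedule (a copy), and
+       pointers to the live trackers, so it can place writes inside
+       computation phases; in Naive mode the deadline is zeroed and the data
+       is written out as soon as possible. */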
+    if (m_ComputationBlocksLength > 0.0 &&
+        m_Parameters.AsyncWrite == (int)AsyncWrite::Guided)
+    {
+        m_AsyncWriteInfo->inComputationBlock = &m_InComputationBlock;
+        m_AsyncWriteInfo->computationBlocksLength = m_ComputationBlocksLength;
+        if (m_AsyncWriteInfo->deadline < m_ComputationBlocksLength)
+        {
+            m_AsyncWriteInfo->deadline = m_ComputationBlocksLength;
+        }
+        m_AsyncWriteInfo->expectedComputationBlocks =
+            m_ComputationBlockTimes; // copy!
+        m_AsyncWriteInfo->currentComputationBlocks =
+            &m_ComputationBlockTimes; // ptr!
+        m_AsyncWriteInfo->currentComputationBlockID = &m_ComputationBlockID;
+
+        /* Clear the current block tracker now so that the async thread does
+           not get confused by past info */
+        m_ComputationBlockTimes.clear();
+        m_ComputationBlocksLength = 0.0;
+        m_ComputationBlockID = 0;
+    }
+    else
+    {
+        if (m_Parameters.AsyncWrite == (int)AsyncWrite::Naive)
+        {
+            m_AsyncWriteInfo->deadline = 0;
+        }
+        m_AsyncWriteInfo->inComputationBlock = nullptr;
+        m_AsyncWriteInfo->computationBlocksLength = 0.0;
+        m_AsyncWriteInfo->currentComputationBlocks = nullptr;
+        m_AsyncWriteInfo->currentComputationBlockID = nullptr;
+    }
+
+    m_WriteFuture = std::async(std::launch::async, AsyncWriteThread_TwoLevelShm,
+                               m_AsyncWriteInfo);
+
+    /* At this point it is prohibited in the main thread
+       - to modify Data, which may be deleted in the async thread at any time
+       - to use m_FileDataManager until the next BeginStep, as it is used in
+         the async thread to write the data
+    */
+}
+
+void DaosWriter::AsyncWriteDataCleanup_TwoLevelShm()
+{
+    aggregator::MPIShmChain *a =
+        dynamic_cast<aggregator::MPIShmChain *>(m_AsyncWriteInfo->aggregator);
+    if (a->m_Comm.Size() > 1)
+    {
+        a->DestroyShm();
+    }
+    delete m_AsyncWriteInfo->tokenChain;
+    delete m_AsyncWriteInfo;
+    m_AsyncWriteInfo = nullptr;
+}
+
+} // end namespace engine
+} // end namespace core
+} // end namespace adios2
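+
+/* Division of labor, summarized: the main thread performs all MPI work
+   (offset chains and token exchange) before launching the writer thread;
+   the writer thread only moves bytes (shm and file I/O, no MPI); and
+   AsyncWriteDataCleanup_TwoLevelShm, which destroys the shm segment, runs
+   on the main thread again, presumably after the future returned by
+   std::async has completed at a later step. */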