Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Implement a first draft of the ROCmServices #40642

Merged
merged 1 commit into from
Feb 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions HeterogeneousCore/ROCmServices/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<!-- Build description for the HeterogeneousCore/ROCmServices library.
     Every rule is nested inside <iftool name="rocm">; presumably this makes the whole
     package conditional on the "rocm" tool being available (confirm against the SCRAM docs). -->
<iftool name="rocm">
<use name="rocm"/>
<use name="FWCore/MessageLogger"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/Utilities"/>
<!-- source_only="true": presumably only the headers of ROCmUtilities are used, without linking its library (TODO confirm) -->
<use name="HeterogeneousCore/ROCmUtilities" source_only="true"/>
<export>
<!-- presumably the SCRAM convention for exporting this package's own library to dependent packages (TODO confirm) -->
<lib name="1"/>
</export>
</iftool>
45 changes: 45 additions & 0 deletions HeterogeneousCore/ROCmServices/interface/ROCmService.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#ifndef HeterogeneousCore_ROCmServices_interface_ROCmService_h
#define HeterogeneousCore_ROCmServices_interface_ROCmService_h

#include <utility>
#include <vector>

#include "FWCore/Utilities/interface/StreamID.h"

namespace edm {
class ParameterSet;
class ActivityRegistry;
class ConfigurationDescriptions;
} // namespace edm

// Service exposing information about the ROCm devices: whether the service is enabled,
// how many devices there are, and the compute capability of each of them.
// Only the declaration is visible here: the constructor, the destructor, fillDescriptions()
// and deviceWithMostFreeMemory() are defined elsewhere.
class ROCmService {
public:
// Build the service from its configuration.
ROCmService(edm::ParameterSet const& config);
~ROCmService();

// Presumably describes the configuration parameters accepted by this service
// (the definition is not visible here).
static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);

// Returns the enabled_ flag, which is false unless the implementation sets it.
bool enabled() const { return enabled_; }

// Returns the stored device count (numberOfDevices_, 0 by default).
int numberOfDevices() const { return numberOfDevices_; }

// major, minor
// The lookup uses std::vector::at(), so an index outside of computeCapabilities_
// throws std::out_of_range instead of reading out of bounds.
std::pair<int, int> computeCapability(int device) const { return computeCapabilities_.at(device); }

// Returns the id of device with most free memory. If none is found, returns -1.
int deviceWithMostFreeMemory() const;

private:
// number of devices, as reported by numberOfDevices()
int numberOfDevices_ = 0;
// (major, minor) pairs returned by computeCapability(); presumably indexed by device id (TODO confirm)
std::vector<std::pair<int, int>> computeCapabilities_;
// value returned by enabled()
bool enabled_ = false;
// not read by any code visible in this header
bool verbose_ = false;
};

namespace edm::service {
  // Overload selected for ROCmService pointers: always reports the service as process-wide.
  inline bool isProcessWideService(ROCmService const*) { return true; }
}  // namespace edm::service

#endif // HeterogeneousCore_ROCmServices_interface_ROCmService_h
12 changes: 12 additions & 0 deletions HeterogeneousCore/ROCmServices/plugins/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<!-- Build description for the HeterogeneousCore/ROCmServices plugins.
     Every rule is nested inside <iftool name="rocm">; presumably this makes the plugin
     library conditional on the "rocm" tool being available (confirm against the SCRAM docs). -->
<iftool name="rocm">
<use name="rocm"/>
<use name="DataFormats/Provenance"/>
<use name="FWCore/MessageLogger"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ServiceRegistry"/>
<use name="HeterogeneousCore/ROCmServices"/>
<!-- source_only="true": presumably only the headers of ROCmUtilities are used, without linking its library (TODO confirm) -->
<use name="HeterogeneousCore/ROCmUtilities" source_only="true"/>
<!-- a single library, HeterogeneousCoreROCmServicesPlugins, is built from the files matching *.cc
     and flagged with EDM_PLUGIN="1" -->
<library file="*.cc" name="HeterogeneousCoreROCmServicesPlugins">
<flags EDM_PLUGIN="1"/>
</library>
</iftool>
120 changes: 120 additions & 0 deletions HeterogeneousCore/ROCmServices/plugins/ROCmMonitoringService.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#include <iostream>

#include <hip/hip_runtime.h>

#include "DataFormats/Provenance/interface/ModuleDescription.h"
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
#include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
#include "HeterogeneousCore/ROCmServices/interface/ROCmService.h"
#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h"

namespace edm {
class StreamContext;
}

// Service that prints, through edm::LogPrint, the used and total memory of each ROCm device
// at the framework transitions selected in its configuration.
class ROCmMonitoringService {
public:
// Reads the device count from ROCmService and, if that service is enabled, registers the
// handlers below according to the "memory*" untracked parameters.
ROCmMonitoringService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry);
~ROCmMonitoringService() = default;

// Describes the four untracked boolean parameters and their default values.
static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);

// Each handler logs a header naming the transition (and the module, when one is given),
// followed by one line of memory usage per device.
void postModuleConstruction(edm::ModuleDescription const& desc);
void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc);
void postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc);
void postEvent(edm::StreamContext const& sc);

private:
// number of devices to report on; stays 0 when ROCmService is not enabled
int numberOfDevices_ = 0;
};

// Cache the number of devices and subscribe to the framework signals selected in the
// configuration. When ROCmService is not enabled no handler is registered and the
// device count stays at its default value.
ROCmMonitoringService::ROCmMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
  // make sure that ROCm is initialised, and that the ROCmService destructor is called after this service's destructor
  edm::Service<ROCmService> rocmService;

  if (rocmService->enabled()) {
    numberOfDevices_ = rocmService->numberOfDevices();

    // read one of the untracked boolean switches of this service
    auto const isRequested = [&config](char const* parameter) {
      return config.getUntrackedParameter<bool>(parameter);
    };

    if (isRequested("memoryConstruction"))
      registry.watchPostModuleConstruction(this, &ROCmMonitoringService::postModuleConstruction);

    if (isRequested("memoryBeginStream"))
      registry.watchPostModuleBeginStream(this, &ROCmMonitoringService::postModuleBeginStream);

    if (isRequested("memoryPerModule"))
      registry.watchPostModuleEvent(this, &ROCmMonitoringService::postModuleEvent);

    if (isRequested("memoryPerEvent"))
      registry.watchPostEvent(this, &ROCmMonitoringService::postEvent);
  }
}

// Describe the configuration of the service: four untracked boolean switches, one per
// monitored transition. Only "memoryConstruction" is off by default.
void ROCmMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
  edm::ParameterSetDescription serviceDescription;

  // per-module report after construction (default: off)
  serviceDescription.addUntracked<bool>("memoryConstruction", false)
      ->setComment("Print memory information for each device after the construction of each module");

  // per-module report after beginStream() (default: on)
  serviceDescription.addUntracked<bool>("memoryBeginStream", true)
      ->setComment("Print memory information for each device after the beginStream() of each module");

  // per-module report after each event (default: on)
  serviceDescription.addUntracked<bool>("memoryPerModule", true)
      ->setComment("Print memory information for each device after the event of each module");

  // global report after each event (default: on)
  serviceDescription.addUntracked<bool>("memoryPerEvent", true)
      ->setComment("Print memory information for each device after each event");

  descriptions.add("ROCmMonitoringService", serviceDescription);
  descriptions.setComment(
      "The memory information is the global state of the device. This gets confusing if there are multiple processes "
      "running on the same device. Probably the information retrieval should be re-thought?");
}

// activity handlers
namespace {
  // Stream into `log` one line per device, for the devices 0 .. num-1, with the used and
  // total memory in MB (integer division by 2^20).
  // The device that was current on entry is selected again after the loop.
  // NOTE(review): if hipCheck() throws on failure, the original device is not restored - confirm
  // whether that matters for the callers.
  template <typename T>
  void dumpUsedMemory(T& log, int num) {
    constexpr auto mbytes = 1 << 20;

    // remember the current device, as the loop below changes it
    int previousDevice = 0;
    hipCheck(hipGetDevice(&previousDevice));

    for (int device = 0; device < num; ++device) {
      hipCheck(hipSetDevice(device));

      size_t freeMemory, totalMemory;
      hipCheck(hipMemGetInfo(&freeMemory, &totalMemory));

      auto const usedMemory = totalMemory - freeMemory;
      log << "\n" << device << ": " << usedMemory / mbytes << " MB used / ";
      log << totalMemory / mbytes << " MB total";
    }

    hipCheck(hipSetDevice(previousDevice));
  }
}  // namespace

// Report the memory usage of every device after a module has been constructed.
void ROCmMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
  edm::LogPrint message("ROCmMonitoringService");
  message << "ROCm device memory after construction of ";
  message << desc.moduleLabel() << " (" << desc.moduleName() << ")";
  dumpUsedMemory(message, numberOfDevices_);
}

// Report the memory usage of every device after the beginStream() of a module.
void ROCmMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
  auto const& module = *mcc.moduleDescription();

  edm::LogPrint message("ROCmMonitoringService");
  message << "ROCm device memory after beginStream() of ";
  message << module.moduleLabel() << " (" << module.moduleName() << ")";
  dumpUsedMemory(message, numberOfDevices_);
}

// Report the memory usage of every device after a module has processed an event.
void ROCmMonitoringService::postModuleEvent(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
  auto const& module = *mcc.moduleDescription();

  edm::LogPrint message("ROCmMonitoringService");
  message << "ROCm device memory after processing an event by ";
  message << module.moduleLabel() << " (" << module.moduleName() << ")";
  dumpUsedMemory(message, numberOfDevices_);
}

// Report the memory usage of every device after an event has been processed.
// The stream context is not used.
void ROCmMonitoringService::postEvent(edm::StreamContext const& /* sc */) {
  edm::LogPrint message("ROCmMonitoringService");
  message << "ROCm device memory after event";
  dumpUsedMemory(message, numberOfDevices_);
}

// Presumably registers ROCmMonitoringService with the framework's service plugin system,
// using the default maker (macro from ServiceMaker.h - TODO confirm).
DEFINE_FWK_SERVICE(ROCmMonitoringService);
4 changes: 4 additions & 0 deletions HeterogeneousCore/ROCmServices/plugins/plugins.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#include "HeterogeneousCore/ROCmServices/interface/ROCmService.h"
#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"

// Presumably registers ROCmService as a framework service plugin. ParameterSetMaker is used as
// the maker; ROCmService's constructor takes only an edm::ParameterSet (no ActivityRegistry),
// which looks like the reason for that choice - TODO confirm against ServiceMaker.h.
DEFINE_FWK_SERVICE_MAKER(ROCmService, edm::serviceregistry::ParameterSetMaker<ROCmService>);
Loading