diff --git a/HeterogeneousCore/ROCmServices/BuildFile.xml b/HeterogeneousCore/ROCmServices/BuildFile.xml new file mode 100644 index 0000000000000..0ff47a94f4ebc --- /dev/null +++ b/HeterogeneousCore/ROCmServices/BuildFile.xml @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/HeterogeneousCore/ROCmServices/interface/ROCmService.h b/HeterogeneousCore/ROCmServices/interface/ROCmService.h new file mode 100644 index 0000000000000..c78ec27f51d80 --- /dev/null +++ b/HeterogeneousCore/ROCmServices/interface/ROCmService.h @@ -0,0 +1,45 @@ +#ifndef HeterogeneousCore_ROCmServices_interface_ROCmService_h +#define HeterogeneousCore_ROCmServices_interface_ROCmService_h + +#include +#include + +#include "FWCore/Utilities/interface/StreamID.h" + +namespace edm { + class ParameterSet; + class ActivityRegistry; + class ConfigurationDescriptions; +} // namespace edm + +class ROCmService { +public: + ROCmService(edm::ParameterSet const& config); + ~ROCmService(); + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + bool enabled() const { return enabled_; } + + int numberOfDevices() const { return numberOfDevices_; } + + // major, minor + std::pair computeCapability(int device) const { return computeCapabilities_.at(device); } + + // Returns the id of device with most free memory. If none is found, returns -1. 
+ int deviceWithMostFreeMemory() const; + +private: + int numberOfDevices_ = 0; + std::vector> computeCapabilities_; + bool enabled_ = false; + bool verbose_ = false; +}; + +namespace edm { + namespace service { + inline bool isProcessWideService(ROCmService const*) { return true; } + } // namespace service +} // namespace edm + +#endif // HeterogeneousCore_ROCmServices_interface_ROCmService_h diff --git a/HeterogeneousCore/ROCmServices/plugins/BuildFile.xml b/HeterogeneousCore/ROCmServices/plugins/BuildFile.xml new file mode 100644 index 0000000000000..42f9e3024fc2f --- /dev/null +++ b/HeterogeneousCore/ROCmServices/plugins/BuildFile.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/HeterogeneousCore/ROCmServices/plugins/ROCmMonitoringService.cc b/HeterogeneousCore/ROCmServices/plugins/ROCmMonitoringService.cc new file mode 100644 index 0000000000000..3bd0f2448f1b4 --- /dev/null +++ b/HeterogeneousCore/ROCmServices/plugins/ROCmMonitoringService.cc @@ -0,0 +1,120 @@ +#include + +#include + +#include "DataFormats/Provenance/interface/ModuleDescription.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h" +#include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/ServiceRegistry/interface/ServiceMaker.h" +#include "HeterogeneousCore/ROCmServices/interface/ROCmService.h" +#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h" + +namespace edm { + class StreamContext; +} + +class ROCmMonitoringService { +public: + ROCmMonitoringService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry); + ~ROCmMonitoringService() = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& 
descriptions); + + void postModuleConstruction(edm::ModuleDescription const& desc); + void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc); + void postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc); + void postEvent(edm::StreamContext const& sc); + +private: + int numberOfDevices_ = 0; +}; + +ROCmMonitoringService::ROCmMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) { + // make sure that ROCm is initialised, and that the ROCmService destructor is called after this service's destructor + edm::Service rocmService; + if (!rocmService->enabled()) + return; + numberOfDevices_ = rocmService->numberOfDevices(); + + if (config.getUntrackedParameter("memoryConstruction")) { + registry.watchPostModuleConstruction(this, &ROCmMonitoringService::postModuleConstruction); + } + if (config.getUntrackedParameter("memoryBeginStream")) { + registry.watchPostModuleBeginStream(this, &ROCmMonitoringService::postModuleBeginStream); + } + if (config.getUntrackedParameter("memoryPerModule")) { + registry.watchPostModuleEvent(this, &ROCmMonitoringService::postModuleEvent); + } + if (config.getUntrackedParameter("memoryPerEvent")) { + registry.watchPostEvent(this, &ROCmMonitoringService::postEvent); + } +} + +void ROCmMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.addUntracked("memoryConstruction", false) + ->setComment("Print memory information for each device after the construction of each module"); + desc.addUntracked("memoryBeginStream", true) + ->setComment("Print memory information for each device after the beginStream() of each module"); + desc.addUntracked("memoryPerModule", true) + ->setComment("Print memory information for each device after the event of each module"); + desc.addUntracked("memoryPerEvent", true) + ->setComment("Print memory information for each device after each event"); + + 
descriptions.add("ROCmMonitoringService", desc); + descriptions.setComment( + "The memory information is the global state of the device. This gets confusing if there are multiple processes " + "running on the same device. Probably the information retrieval should be re-thought?"); +} + +// activity handlers +namespace { + template + void dumpUsedMemory(T& log, int num) { + int old = 0; + hipCheck(hipGetDevice(&old)); + constexpr auto mbytes = 1 << 20; + for (int i = 0; i < num; ++i) { + size_t freeMemory, totalMemory; + hipCheck(hipSetDevice(i)); + hipCheck(hipMemGetInfo(&freeMemory, &totalMemory)); + log << "\n" + << i << ": " << (totalMemory - freeMemory) / mbytes << " MB used / " << totalMemory / mbytes << " MB total"; + } + hipCheck(hipSetDevice(old)); + } +} // namespace + +void ROCmMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) { + auto log = edm::LogPrint("ROCmMonitoringService"); + log << "ROCm device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")"; + dumpUsedMemory(log, numberOfDevices_); +} + +void ROCmMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) { + auto log = edm::LogPrint("ROCmMonitoringService"); + log << "ROCm device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" + << mcc.moduleDescription()->moduleName() << ")"; + dumpUsedMemory(log, numberOfDevices_); +} + +void ROCmMonitoringService::postModuleEvent(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) { + auto log = edm::LogPrint("ROCmMonitoringService"); + log << "ROCm device memory after processing an event by " << mcc.moduleDescription()->moduleLabel() << " (" + << mcc.moduleDescription()->moduleName() << ")"; + dumpUsedMemory(log, numberOfDevices_); +} + +void ROCmMonitoringService::postEvent(edm::StreamContext const& sc) { + auto log = edm::LogPrint("ROCmMonitoringService"); + log << "ROCm device memory after 
event"; + dumpUsedMemory(log, numberOfDevices_); +} + +DEFINE_FWK_SERVICE(ROCmMonitoringService); diff --git a/HeterogeneousCore/ROCmServices/plugins/plugins.cc b/HeterogeneousCore/ROCmServices/plugins/plugins.cc new file mode 100644 index 0000000000000..a418eeced333f --- /dev/null +++ b/HeterogeneousCore/ROCmServices/plugins/plugins.cc @@ -0,0 +1,4 @@ +#include "HeterogeneousCore/ROCmServices/interface/ROCmService.h" +#include "FWCore/ServiceRegistry/interface/ServiceMaker.h" + +DEFINE_FWK_SERVICE_MAKER(ROCmService, edm::serviceregistry::ParameterSetMaker); diff --git a/HeterogeneousCore/ROCmServices/src/ROCmService.cc b/HeterogeneousCore/ROCmServices/src/ROCmService.cc new file mode 100644 index 0000000000000..2cabaed127d99 --- /dev/null +++ b/HeterogeneousCore/ROCmServices/src/ROCmService.cc @@ -0,0 +1,382 @@ +#include +#include +#include +#include +#include +#include + +#include +/* +#include +*/ + +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/ResourceInformation.h" +#include "HeterogeneousCore/ROCmServices/interface/ROCmService.h" +#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h" +/* +#include "HeterogeneousCore/ROCmUtilities/interface/nvmlCheck.h" +*/ + +void setHipLimit(hipLimit_t limit, const char* name, size_t request) { + // read the current device + int device; + hipCheck(hipGetDevice(&device)); + // try to set the requested limit + auto result = hipDeviceSetLimit(limit, request); + if (hipErrorUnsupportedLimit == result) { + edm::LogWarning("ROCmService") << "ROCm device " << device << ": unsupported limit \"" << name << "\""; + return; + } + // read back the limit value + size_t value; + result = hipDeviceGetLimit(&value, limit); + if 
(hipSuccess != result) { + edm::LogWarning("ROCmService") << "ROCm device " << device << ": failed to set limit \"" << name << "\" to " + << request << ", current value is " << value; + } else if (value != request) { + edm::LogWarning("ROCmService") << "ROCm device " << device << ": limit \"" << name << "\" set to " << value + << " instead of requested " << request; + } +} + +std::string decodeVersion(int version) { + return std::to_string(version / 1000) + '.' + std::to_string(version % 1000 / 10); +} + +/// Constructor +ROCmService::ROCmService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter("verbose")) { + bool configEnabled = config.getUntrackedParameter("enabled"); + if (not configEnabled) { + edm::LogInfo("ROCmService") << "ROCmService disabled by configuration"; + return; + } + + auto status = hipGetDeviceCount(&numberOfDevices_); + if (hipSuccess != status) { + edm::LogWarning("ROCmService") << "Failed to initialize the ROCm runtime.\n" + << "Disabling the ROCmService."; + return; + } + computeCapabilities_.reserve(numberOfDevices_); + + /* + // AMD system driver version, e.g. 470.57.02 + char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; + nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH)); + nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion, sizeof(systemDriverVersion))); + nvmlCheck(nvmlShutdown()); + */ + + // ROCm driver version, e.g. 11.4 + // the full version, like 11.4.1 or 11.4.100, is not reported + int driverVersion = 0; + hipCheck(hipDriverGetVersion(&driverVersion)); + + // ROCm runtime version, e.g. 
11.4 + // the full version, like 11.4.1 or 11.4.108, is not reported + int runtimeVersion = 0; + hipCheck(hipRuntimeGetVersion(&runtimeVersion)); + + edm::LogInfo log("ROCmService"); + if (verbose_) { + /* + log << "AMD driver: " << systemDriverVersion << '\n'; + */ + log << "ROCm driver API: " << decodeVersion(driverVersion) << /*" (compiled with " << decodeVersion(ROCm_VERSION) + << ")" */ + "\n"; + log << "ROCm runtime API: " << decodeVersion(runtimeVersion) + << /*" (compiled with " << decodeVersion(ROCmRT_VERSION) + << ")" */ + "\n"; + log << "ROCm runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n"; + } else { + log << "ROCm runtime version " << decodeVersion(runtimeVersion) << ", driver version " + << decodeVersion(driverVersion) + /* + << ", AMD driver version " << systemDriverVersion + */ + ; + } + + auto const& limits = config.getUntrackedParameter("limits"); + /* + auto printfFifoSize = limits.getUntrackedParameter("hipLimitPrintfFifoSize"); + */ + auto stackSize = limits.getUntrackedParameter("hipLimitStackSize"); + auto mallocHeapSize = limits.getUntrackedParameter("hipLimitMallocHeapSize"); + /* + auto devRuntimeSyncDepth = limits.getUntrackedParameter("hipLimitDevRuntimeSyncDepth"); + auto devRuntimePendingLaunchCount = limits.getUntrackedParameter("hipLimitDevRuntimePendingLaunchCount"); + */ + + std::set models; + + for (int i = 0; i < numberOfDevices_; ++i) { + // read information about the compute device. + // see the documentation of hipGetDeviceProperties() for more information. + hipDeviceProp_t properties; + hipCheck(hipGetDeviceProperties(&properties, i)); + log << '\n' << "ROCm device " << i << ": " << properties.name; + if (verbose_) { + log << '\n'; + } + models.insert(std::string(properties.name)); + + // compute capabilities + computeCapabilities_.emplace_back(properties.major, properties.minor); + if (verbose_) { + log << " compute capability: " << properties.major << "." 
<< properties.minor; + } + log << " (sm_" << properties.major << properties.minor << ")"; + if (verbose_) { + log << '\n'; + log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n'; + log << " ROCm cores: " << std::setw(28) << "not yet implemented" << '\n'; + /* + log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio + << ":1\n"; + */ + } + + // compute mode + static constexpr const char* computeModeDescription[] = { + "default (shared)", // hipComputeModeDefault + "exclusive (single thread)", // hipComputeModeExclusive + "prohibited", // hipComputeModeProhibited + "exclusive (single process)", // hipComputeModeExclusiveProcess + "unknown"}; + if (verbose_) { + log << " compute mode:" << std::right << std::setw(27) + << computeModeDescription[std::min(properties.computeMode, + static_cast(std::size(computeModeDescription)) - 1)] + << '\n'; + } + + // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with an exception + hipCheck(hipSetDevice(i)); + hipCheck(hipSetDeviceFlags(hipDeviceScheduleAuto | hipDeviceMapHost)); + + // read the free and total amount of memory available for allocation by the device, in bytes. + // see the documentation of hipMemGetInfo() for more information. 
+ if (verbose_) { + size_t freeMemory, totalMemory; + hipCheck(hipMemGetInfo(&freeMemory, &totalMemory)); + log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6) + << totalMemory / (1 << 20) << " MB total\n"; + log << " constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n"; + log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n"; + } + + // L1 cache behaviour + if (verbose_) { + /* + static constexpr const char* l1CacheModeDescription[] = { + "unknown", "local memory", "global memory", "local and global memory"}; + int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported; + log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n'; + log << '\n'; + */ + + log << "Other capabilities\n"; + log << " " << (properties.canMapHostMemory ? "can" : "cannot") + << " map host memory into the ROCm address space for use with hipHostAlloc()/hipHostGetDevicePointer()\n"; + log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support") + << " coherently accessing pageable memory without calling hipHostRegister() on it\n"; + log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot") + << " access pageable memory via the host's page tables\n"; + /* + log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot") + << " access host registered memory at the same virtual address as the host\n"; + log << " " << (properties.unifiedAddressing ? "shares" : "does not share") + << " a unified address space with the host\n"; + */ + log << " " << (properties.managedMemory ? "supports" : "does not support") + << " allocating managed memory on this system\n"; + log << " " << (properties.concurrentManagedAccess ? 
"can" : "cannot") + << " coherently access managed memory concurrently with the host\n"; + log << " " + << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot") + << " directly access managed memory on the device without migration\n"; + log << " " << (properties.cooperativeLaunch ? "supports" : "does not support") + << " launching cooperative kernels via hipLaunchCooperativeKernel()\n"; + log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support") + << " launching cooperative kernels via hipLaunchCooperativeKernelMultiDevice()\n"; + log << '\n'; + } + + // set and read the ROCm device flags. + // see the documentation of hipSetDeviceFlags and hipGetDeviceFlags for more information. + if (verbose_) { + log << "ROCm flags\n"; + unsigned int flags; + hipCheck(hipGetDeviceFlags(&flags)); + switch (flags & hipDeviceScheduleMask) { + case hipDeviceScheduleAuto: + log << " thread policy: default\n"; + break; + case hipDeviceScheduleSpin: + log << " thread policy: spin\n"; + break; + case hipDeviceScheduleYield: + log << " thread policy: yield\n"; + break; + case hipDeviceScheduleBlockingSync: + log << " thread policy: blocking sync\n"; + break; + default: + log << " thread policy: undefined\n"; + } + if (flags & hipDeviceMapHost) { + log << " pinned host memory allocations: enabled\n"; + } else { + log << " pinned host memory allocations: disabled\n"; + } + if (flags & hipDeviceLmemResizeToMax) { + log << " kernel host memory reuse: enabled\n"; + } else { + log << " kernel host memory reuse: disabled\n"; + } + log << '\n'; + } + + // set and read the ROCm resource limits. + // see the documentation of hipDeviceSetLimit() for more information. + + /* + // hipLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the + // printf() device system call. 
+ if (printfFifoSize >= 0) { + setHipLimit(hipLimitPrintfFifoSize, "hipLimitPrintfFifoSize", printfFifoSize); + } + */ + // hipLimitStackSize controls the stack size in bytes of each GPU thread. + if (stackSize >= 0) { + setHipLimit(hipLimitStackSize, "hipLimitStackSize", stackSize); + } + // hipLimitMallocHeapSize controls the size in bytes of the heap used by the malloc() + // and free() device system calls. + if (mallocHeapSize >= 0) { + setHipLimit(hipLimitMallocHeapSize, "hipLimitMallocHeapSize", mallocHeapSize); + } + /* + if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) { + // hipLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which + // a thread can safely call hipDeviceSynchronize(). + if (devRuntimeSyncDepth >= 0) { + setHipLimit(hipLimitDevRuntimeSyncDepth, "hipLimitDevRuntimeSyncDepth", devRuntimeSyncDepth); + } + // hipLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding + // device runtime launches that can be made from the current device. 
+ if (devRuntimePendingLaunchCount >= 0) { + setHipLimit( + hipLimitDevRuntimePendingLaunchCount, "hipLimitDevRuntimePendingLaunchCount", devRuntimePendingLaunchCount); + } + } + */ + + if (verbose_) { + size_t value; + log << "ROCm limits\n"; + /* + hipCheck(hipDeviceGetLimit(&value, hipLimitPrintfFifoSize)); + log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n"; + */ + hipCheck(hipDeviceGetLimit(&value, hipLimitStackSize)); + log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n"; + hipCheck(hipDeviceGetLimit(&value, hipLimitMallocHeapSize)); + log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n"; + /* + if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) { + hipCheck(hipDeviceGetLimit(&value, hipLimitDevRuntimeSyncDepth)); + log << " runtime sync depth: " << std::setw(10) << value << '\n'; + hipCheck(hipDeviceGetLimit(&value, hipLimitDevRuntimePendingLaunchCount)); + log << " runtime pending launch count: " << std::setw(10) << value << '\n'; + } + */ + } + } + + edm::Service resourceInformationService; + if (resourceInformationService.isAvailable()) { + std::vector modelsV(models.begin(), models.end()); + resourceInformationService->setGPUModels(modelsV); + /* + std::string nvidiaDriverVersion{systemDriverVersion}; + resourceInformationService->setNvidiaDriverVersion(nvidiaDriverVersion); + resourceInformationService->setCudaDriverVersion(driverVersion); + resourceInformationService->setCudaRuntimeVersion(runtimeVersion); + */ + } + + if (verbose_) { + log << '\n' << "ROCmService fully initialized"; + } + enabled_ = true; +} + +ROCmService::~ROCmService() { + if (enabled_) { + for (int i = 0; i < numberOfDevices_; ++i) { + hipCheck(hipSetDevice(i)); + hipCheck(hipDeviceSynchronize()); + // Explicitly destroys and cleans up all resources associated with the current device in the + // current process. 
Any subsequent API call to this device will reinitialize the device. + // Useful to check for memory leaks. + hipCheck(hipDeviceReset()); + } + } +} + +void ROCmService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.addUntracked("enabled", true); + desc.addUntracked("verbose", false); + + edm::ParameterSetDescription limits; + /* + limits.addUntracked("hipLimitPrintfFifoSize", -1) + ->setComment("Size in bytes of the shared FIFO used by the printf() device system call."); + */ + limits.addUntracked("hipLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread."); + limits.addUntracked("hipLimitMallocHeapSize", -1) + ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls."); + limits.addUntracked("hipLimitDevRuntimeSyncDepth", -1) + ->setComment("Maximum nesting depth of a grid at which a thread can safely call hipDeviceSynchronize()."); + limits.addUntracked("hipLimitDevRuntimePendingLaunchCount", -1) + ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device."); + desc.addUntracked("limits", limits) + ->setComment( + "See the documentation of hipDeviceSetLimit for more information.\nSetting any of these options to -1 keeps " + "the default value."); + + descriptions.add("ROCmService", desc); +} + +int ROCmService::deviceWithMostFreeMemory() const { + // save the current device + int currentDevice; + hipCheck(hipGetDevice(&currentDevice)); + + size_t maxFreeMemory = 0; + int device = -1; + for (int i = 0; i < numberOfDevices_; ++i) { + size_t freeMemory, totalMemory; + hipCheck(hipSetDevice(i)); + hipCheck(hipMemGetInfo(&freeMemory, &totalMemory)); + edm::LogPrint("ROCmService") << "ROCm device " << i << ": " << freeMemory / (1 << 20) << " MB free / " + << totalMemory / (1 << 20) << " MB total memory"; + if (freeMemory > maxFreeMemory) { + maxFreeMemory = freeMemory; + device = i; + } + } + // restore the 
current device + hipCheck(hipSetDevice(currentDevice)); + return device; +} diff --git a/HeterogeneousCore/ROCmServices/test/BuildFile.xml b/HeterogeneousCore/ROCmServices/test/BuildFile.xml new file mode 100644 index 0000000000000..7fbe8d1931848 --- /dev/null +++ b/HeterogeneousCore/ROCmServices/test/BuildFile.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/HeterogeneousCore/ROCmServices/test/testROCmService.cpp b/HeterogeneousCore/ROCmServices/test/testROCmService.cpp new file mode 100644 index 0000000000000..06b2c90c6db8b --- /dev/null +++ b/HeterogeneousCore/ROCmServices/test/testROCmService.cpp @@ -0,0 +1,155 @@ +#include +#include +#include +#include +#include + +#include + +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSetReader/interface/ParameterSetReader.h" +#include "FWCore/PluginManager/interface/PluginManager.h" +#include "FWCore/PluginManager/interface/standard.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/ServiceRegistry/interface/ServiceRegistry.h" +#include "FWCore/ServiceRegistry/interface/ServiceToken.h" +#include "FWCore/Utilities/interface/Exception.h" +#include "FWCore/Utilities/interface/ResourceInformation.h" +#include "HeterogeneousCore/ROCmServices/interface/ROCmService.h" +#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h" + +namespace { + ROCmService makeROCmService(edm::ParameterSet ps) { + auto desc = edm::ConfigurationDescriptions("Service", "ROCmService"); + ROCmService::fillDescriptions(desc); + desc.validate(ps, "ROCmService"); + return ROCmService(ps); + } +} // namespace + +TEST_CASE("Tests of ROCmService", "[ROCmService]") { + // Test setup: check if a simple ROCm runtime API call fails: + // if so, skip the test with the ROCmService enabled + int deviceCount = 0; + auto ret = hipGetDeviceCount(&deviceCount); + + if (ret 
!= hipSuccess) { + WARN("Unable to query the ROCm capable devices from the ROCm runtime API: (" + << ret << ") " << hipGetErrorString(ret) << ". Running only tests not requiring devices."); + } + + // Make Service system available as ROCmService depends on ResourceInformationService + std::vector psets; + edm::ServiceToken serviceToken = edm::ServiceRegistry::createSet(psets); + edm::ServiceRegistry::Operate operate(serviceToken); + + SECTION("ROCmService enabled") { + edm::ParameterSet ps; + ps.addUntrackedParameter("enabled", true); + SECTION("Enabled only if there are ROCm capable GPUs") { + auto cs = makeROCmService(ps); + if (deviceCount <= 0) { + REQUIRE(cs.enabled() == false); + WARN("ROCmService is disabled as there are no ROCm GPU devices"); + } else { + REQUIRE(cs.enabled() == true); + INFO("ROCmService is enabled"); + } + } + + if (deviceCount <= 0) { + return; + } + + auto cs = makeROCmService(ps); + int driverVersion = 0, runtimeVersion = 0; + ret = hipDriverGetVersion(&driverVersion); + if (ret != hipSuccess) { + FAIL("Unable to query the ROCm driver version from the ROCm runtime API: (" << ret << ") " + << hipGetErrorString(ret)); + } + ret = hipRuntimeGetVersion(&runtimeVersion); + if (ret != hipSuccess) { + FAIL("Unable to query the ROCm runtime API version: (" << ret << ") " << hipGetErrorString(ret)); + } + + SECTION("ROCm Queries") { + WARN("ROCm Driver Version / Runtime Version: " << driverVersion / 1000 << "." << (driverVersion % 100) / 10 + << " / " << runtimeVersion / 1000 << "." 
+ << (runtimeVersion % 100) / 10); + + // Test that the number of devices found by the service + // is the same as detected by the ROCm runtime API + REQUIRE(cs.numberOfDevices() == deviceCount); + WARN("Detected " << cs.numberOfDevices() << " ROCm Capable device(s)"); + + // Test that the compute capabilities of each device + // are the same as detected by the ROCm runtime API + for (int i = 0; i < deviceCount; ++i) { + hipDeviceProp_t deviceProp; + ret = hipGetDeviceProperties(&deviceProp, i); + if (ret != hipSuccess) { + FAIL("Unable to query the ROCm properties for device " << i << " from the ROCm runtime API: (" << ret << ") " + << hipGetErrorString(ret)); + } + + REQUIRE(deviceProp.major == cs.computeCapability(i).first); + REQUIRE(deviceProp.minor == cs.computeCapability(i).second); + INFO("Device " << i << ": " << deviceProp.name << "\n ROCm Capability Major/Minor version number: " + << deviceProp.major << "." << deviceProp.minor); + } + } + + SECTION("ROCmService device free memory") { + size_t mem = 0; + int dev = -1; + for (int i = 0; i < deviceCount; ++i) { + size_t free, tot; + REQUIRE_NOTHROW(hipCheck(hipSetDevice(i))); + REQUIRE_NOTHROW(hipCheck(hipMemGetInfo(&free, &tot))); + WARN("Device " << i << " memory total " << tot << " free " << free); + if (free > mem) { + mem = free; + dev = i; + } + } + WARN("Device with most free memory " << dev << "\n" + << " as given by ROCmService " << cs.deviceWithMostFreeMemory()); + } + + SECTION("With ResourceInformationService available") { + edmplugin::PluginManager::configure(edmplugin::standard::config()); + + std::string const config = R"_(import FWCore.ParameterSet.Config as cms +process = cms.Process('Test') +process.add_(cms.Service('ResourceInformationService')) +)_"; + std::unique_ptr params; + edm::makeParameterSets(config, params); + edm::ServiceToken tempToken(edm::ServiceRegistry::createServicesFromConfig(std::move(params))); + edm::ServiceRegistry::Operate operate2(tempToken); + + auto cs = 
makeROCmService(edm::ParameterSet{}); + REQUIRE(cs.enabled()); + edm::Service ri; + REQUIRE(ri->gpuModels().size() > 0); + /* + REQUIRE(ri->nvidiaDriverVersion().size() > 0); + REQUIRE(ri->cudaDriverVersion() == driverVersion); + REQUIRE(ri->cudaRuntimeVersion() == runtimeVersion); + */ + } + } + + SECTION("Force to be disabled") { + edm::ParameterSet ps; + ps.addUntrackedParameter("enabled", false); + auto cs = makeROCmService(ps); + REQUIRE(cs.enabled() == false); + REQUIRE(cs.numberOfDevices() == 0); + } +} diff --git a/HeterogeneousCore/ROCmServices/test/testROCmService.py b/HeterogeneousCore/ROCmServices/test/testROCmService.py new file mode 100644 index 0000000000000..d96d02f25be44 --- /dev/null +++ b/HeterogeneousCore/ROCmServices/test/testROCmService.py @@ -0,0 +1,20 @@ +import FWCore.ParameterSet.Config as cms + +process = cms.Process( "TEST" ) + +process.options = cms.untracked.PSet( + numberOfThreads = cms.untracked.uint32( 4 ), + numberOfStreams = cms.untracked.uint32( 0 ), +) + +process.load('FWCore.MessageService.MessageLogger_cfi') +process.MessageLogger.ROCmService = {} + +process.load('HeterogeneousCore.ROCmServices.ROCmService_cfi') +process.ROCmService.verbose = True + +process.source = cms.Source("EmptySource") + +process.maxEvents = cms.untracked.PSet( + input = cms.untracked.int32( 0 ) +)