Skip to content

Commit

Permalink
Reduce the ECAL and HCAL GPU memory usage
Browse files Browse the repository at this point in the history
Allocate memory buffers based on the actual number of hits (channels) in
each event, instead of always allocating the configured maximum size.
  • Loading branch information
fwyzard committed Oct 3, 2022
1 parent 462d015 commit eaa3636
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 65 deletions.
24 changes: 13 additions & 11 deletions RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,15 +90,15 @@ namespace ecal {
std::array<uint32_t, 3> kernelMinimizeThreads;

bool shouldRunTimingComputation;

uint32_t maxNumberHitsEB;
uint32_t maxNumberHitsEE;
};

struct EventOutputDataGPU {
UncalibratedRecHit<::calo::common::DevStoragePolicy> recHitsEB, recHitsEE;

void allocate(int sizeEB, int sizeEE, ConfigurationParameters const& configParameters, cudaStream_t cudaStream) {
void allocate(ConfigurationParameters const& configParameters,
uint32_t sizeEB,
uint32_t sizeEE,
cudaStream_t cudaStream) {
recHitsEB.amplitudesAll = cms::cuda::make_device_unique<reco::ComputationScalarType[]>(
sizeEB * EcalDataFrame::MAXSAMPLES, cudaStream);
recHitsEB.amplitude = cms::cuda::make_device_unique<reco::StorageScalarType[]>(sizeEB, cudaStream);
Expand Down Expand Up @@ -163,13 +163,16 @@ namespace ecal {
cms::cuda::device::unique_ptr<SVT[]> timeMax, timeError;
cms::cuda::device::unique_ptr<TimeComputationState[]> tcState;

void allocate(int size, ConfigurationParameters const& configParameters, cudaStream_t cudaStream) {
void allocate(ConfigurationParameters const& configParameters,
uint32_t sizeEB,
uint32_t sizeEE,
cudaStream_t cudaStream) {
constexpr auto svlength = getLength<SampleVector>();
constexpr auto sgvlength = getLength<SampleGainVector>();
constexpr auto smlength = getLength<SampleMatrix>();
constexpr auto pmlength = getLength<PulseMatrixType>();
constexpr auto bxvlength = getLength<BXVectorType>();
auto const size = configParameters.maxNumberHitsEB + configParameters.maxNumberHitsEE;
auto const size = sizeEB + sizeEE;

auto alloc = [cudaStream](auto& var, uint32_t size) {
using element_type = typename std::remove_reference_t<decltype(var)>::element_type;
Expand Down Expand Up @@ -271,24 +274,23 @@ namespace ecal {
uint32_t expanded_v_DB_reco_flagsSize;

uint32_t flagmask;
uint32_t maxNumberHitsEB;
uint32_t maxNumberHitsEE;
};

struct EventOutputDataGPU {
RecHit<::calo::common::DevStoragePolicy> recHitsEB, recHitsEE;

void allocate(ConfigurationParameters const& configParameters, cudaStream_t cudaStream) {
void allocate(ConfigurationParameters const& configParameters,
uint32_t sizeEB,
uint32_t sizeEE,
cudaStream_t cudaStream) {
//---- configParameters -> needed only to decide whether or not to save the timing information
auto const sizeEB = configParameters.maxNumberHitsEB;
recHitsEB.energy = cms::cuda::make_device_unique<::ecal::reco::StorageScalarType[]>(sizeEB, cudaStream);
recHitsEB.time = cms::cuda::make_device_unique<::ecal::reco::StorageScalarType[]>(sizeEB, cudaStream);
recHitsEB.chi2 = cms::cuda::make_device_unique<::ecal::reco::StorageScalarType[]>(sizeEB, cudaStream);
recHitsEB.flagBits = cms::cuda::make_device_unique<uint32_t[]>(sizeEB, cudaStream);
recHitsEB.extra = cms::cuda::make_device_unique<uint32_t[]>(sizeEB, cudaStream);
recHitsEB.did = cms::cuda::make_device_unique<uint32_t[]>(sizeEB, cudaStream);

auto const sizeEE = configParameters.maxNumberHitsEE;
recHitsEE.energy = cms::cuda::make_device_unique<::ecal::reco::StorageScalarType[]>(sizeEE, cudaStream);
recHitsEE.time = cms::cuda::make_device_unique<::ecal::reco::StorageScalarType[]>(sizeEE, cudaStream);
recHitsEE.chi2 = cms::cuda::make_device_unique<::ecal::reco::StorageScalarType[]>(sizeEE, cudaStream);
Expand Down
14 changes: 1 addition & 13 deletions RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,6 @@ void EcalRecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& con
desc.add<double>("EELaserMIN", 0.01);
desc.add<double>("EBLaserMAX", 30.0);
desc.add<double>("EELaserMAX", 30.0);

desc.add<uint32_t>("maxNumberHitsEB", 61200);
desc.add<uint32_t>("maxNumberHitsEE", 14648);
}

EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) {
Expand All @@ -125,10 +122,6 @@ EcalRecHitProducerGPU::EcalRecHitProducerGPU(const edm::ParameterSet& ps) {
configParameters_.EBLaserMAX = ps.getParameter<double>("EBLaserMAX");
configParameters_.EELaserMAX = ps.getParameter<double>("EELaserMAX");

// max number of digis to allocate for
configParameters_.maxNumberHitsEB = ps.getParameter<uint32_t>("maxNumberHitsEB");
configParameters_.maxNumberHitsEE = ps.getParameter<uint32_t>("maxNumberHitsEE");

flagmask_ = 0;
flagmask_ |= 0x1 << EcalRecHit::kNeighboursRecovered;
flagmask_ |= 0x1 << EcalRecHit::kTowerRecovered;
Expand Down Expand Up @@ -182,11 +175,6 @@ void EcalRecHitProducerGPU::acquire(edm::Event const& event,
if (neb_ + nee_ == 0)
return;

if ((neb_ > configParameters_.maxNumberHitsEB) || (nee_ > configParameters_.maxNumberHitsEE)) {
edm::LogError("EcalRecHitProducerGPU")
<< "max number of channels exceeded. See options 'maxNumberHitsEB and maxNumberHitsEE' ";
}

int nchannelsEB = ebUncalibRecHits.size; // --> offsetForInput, first EB and then EE

// conditions
Expand Down Expand Up @@ -227,7 +215,7 @@ void EcalRecHitProducerGPU::acquire(edm::Event const& event,
IntercalibConstantsHandle_->getOffset()};

// dev mem
eventOutputDataGPU_.allocate(configParameters_, ctx.stream());
eventOutputDataGPU_.allocate(configParameters_, neb_, nee_, ctx.stream());

//
// schedule algorithms
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,6 @@ void EcalUncalibRecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptio
desc.add<double>("outOfTimeThresholdGain61mEE", 1000);
desc.add<double>("amplitudeThresholdEB", 10);
desc.add<double>("amplitudeThresholdEE", 10);
desc.add<uint32_t>("maxNumberHitsEB", 61200);
desc.add<uint32_t>("maxNumberHitsEE", 14648);
desc.addUntracked<std::vector<uint32_t>>("kernelMinimizeThreads", {32, 1, 1});
desc.add<bool>("shouldRunTimingComputation", true);
confDesc.addWithDefaultLabel(desc);
Expand Down Expand Up @@ -132,10 +130,6 @@ EcalUncalibRecHitProducerGPU::EcalUncalibRecHitProducerGPU(const edm::ParameterS
auto amplitudeThreshEB = ps.getParameter<double>("amplitudeThresholdEB");
auto amplitudeThreshEE = ps.getParameter<double>("amplitudeThresholdEE");

// max number of digis to allocate for
configParameters_.maxNumberHitsEB = ps.getParameter<uint32_t>("maxNumberHitsEB");
configParameters_.maxNumberHitsEE = ps.getParameter<uint32_t>("maxNumberHitsEE");

// switch to run timing computation kernels
configParameters_.shouldRunTimingComputation = ps.getParameter<bool>("shouldRunTimingComputation");

Expand Down Expand Up @@ -203,13 +197,6 @@ void EcalUncalibRecHitProducerGPU::produce(edm::Event& event, edm::EventSetup co

// stop here if there are no digis
if (neb + nee > 0) {
if ((neb > configParameters_.maxNumberHitsEB) || (nee > configParameters_.maxNumberHitsEE)) {
edm::LogError("EcalUncalibRecHitProducerGPU")
<< "Max number of channels exceeded in barrel or endcap. Number of barrel channels: " << neb
<< " with maxNumberHitsEB=" << configParameters_.maxNumberHitsEB << ", number of endcap channels: " << nee
<< " with maxNumberHitsEE=" << configParameters_.maxNumberHitsEE;
}

// conditions
auto const& timeCalibConstantsData = setup.getData(timeCalibConstantsToken_);
auto const& sampleMaskData = setup.getData(sampleMaskToken_);
Expand Down Expand Up @@ -247,11 +234,11 @@ void EcalUncalibRecHitProducerGPU::produce(edm::Event& event, edm::EventSetup co
multifitParameters};

// dev mem
eventOutputDataGPU.allocate(neb_, nee_, configParameters_, ctx.stream());
eventOutputDataGPU.allocate(configParameters_, neb, nee, ctx.stream());

// scratch mem
ecal::multifit::EventDataForScratchGPU eventDataForScratchGPU;
eventDataForScratchGPU.allocate(neb_ + nee_, configParameters_, ctx.stream());
eventDataForScratchGPU.allocate(configParameters_, neb, nee, ctx.stream());

//
// schedule algorithms
Expand Down
3 changes: 0 additions & 3 deletions RecoLocalCalo/EcalRecProducers/python/ecalRecHitGPU_cfi.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@
recHitsLabelEB = cms.string("EcalRecHitsEB"),
recHitsLabelEE = cms.string("EcalRecHitsEE"),

maxNumberHitsEB = cms.uint32(61200),
maxNumberHitsEE = cms.uint32(14648),

## db statuses to be excluded from reconstruction (some will be recovered)
ChannelStatusToBeExcluded = cms.vstring( 'kDAC',
'kNoisy',
Expand Down
13 changes: 6 additions & 7 deletions RecoLocalCalo/HcalRecProducers/src/DeclsForKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ namespace hcal {
};

struct ConfigParameters {
uint32_t maxChannels;
uint32_t maxTimeSamples;
uint32_t kprep1dChannelsPerBlock;
int sipmQTSShift;
Expand All @@ -93,12 +92,12 @@ namespace hcal {
struct OutputDataGPU {
RecHitCollection<::calo::common::DevStoragePolicy> recHits;

void allocate(ConfigParameters const& config, cudaStream_t cudaStream) {
recHits.energy = cms::cuda::make_device_unique<float[]>(config.maxChannels, cudaStream);
recHits.chi2 = cms::cuda::make_device_unique<float[]>(config.maxChannels, cudaStream);
recHits.energyM0 = cms::cuda::make_device_unique<float[]>(config.maxChannels, cudaStream);
recHits.timeM0 = cms::cuda::make_device_unique<float[]>(config.maxChannels, cudaStream);
recHits.did = cms::cuda::make_device_unique<uint32_t[]>(config.maxChannels, cudaStream);
void allocate(ConfigParameters const& config, uint32_t size, cudaStream_t cudaStream) {
recHits.energy = cms::cuda::make_device_unique<float[]>(size, cudaStream);
recHits.chi2 = cms::cuda::make_device_unique<float[]>(size, cudaStream);
recHits.energyM0 = cms::cuda::make_device_unique<float[]>(size, cudaStream);
recHits.timeM0 = cms::cuda::make_device_unique<float[]>(size, cudaStream);
recHits.did = cms::cuda::make_device_unique<uint32_t[]>(size, cudaStream);
}
};

Expand Down
25 changes: 9 additions & 16 deletions RecoLocalCalo/HcalRecProducers/src/HBHERecHitProducerGPU.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ HBHERecHitProducerGPU::HBHERecHitProducerGPU(edm::ParameterSet const& ps)
sipmCharacteristicsToken_{esConsumes()},
chQualProductToken_{esConsumes()},
pulseOffsetsToken_{esConsumes()} {
configParameters_.maxChannels = ps.getParameter<uint32_t>("maxChannels");
configParameters_.maxTimeSamples = ps.getParameter<uint32_t>("maxTimeSamples");
configParameters_.kprep1dChannelsPerBlock = ps.getParameter<uint32_t>("kprep1dChannelsPerBlock");
configParameters_.sipmQTSShift = ps.getParameter<int>("sipmQTSShift");
Expand Down Expand Up @@ -115,7 +114,6 @@ HBHERecHitProducerGPU::~HBHERecHitProducerGPU() {}

void HBHERecHitProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& cdesc) {
edm::ParameterSetDescription desc;
desc.add<uint32_t>("maxChannels", 10000u);
desc.add<uint32_t>("maxTimeSamples", 10);
desc.add<uint32_t>("kprep1dChannelsPerBlock", 32);
desc.add<edm::InputTag>("digisLabelF01HE", edm::InputTag{"hcalRawToDigiGPU", "f01HEDigisGPU"});
Expand Down Expand Up @@ -156,6 +154,7 @@ void HBHERecHitProducerGPU::acquire(edm::Event const& event,
auto const& f01HEDigis = ctx.get(f01HEProduct);
auto const& f5HBDigis = ctx.get(f5HBProduct);
auto const& f3HBDigis = ctx.get(f3HBProduct);
auto const totalChannels = f01HEDigis.size + f5HBDigis.size + f3HBDigis.size;

hcal::reconstruction::InputDataGPU inputGPU{f01HEDigis, f5HBDigis, f3HBDigis};

Expand Down Expand Up @@ -225,26 +224,20 @@ void HBHERecHitProducerGPU::acquire(edm::Event const& event,

// scratch mem on device
hcal::reconstruction::ScratchDataGPU scratchGPU = {
cms::cuda::make_device_unique<float[]>(configParameters_.maxChannels * configParameters_.maxTimeSamples,
ctx.stream()),
cms::cuda::make_device_unique<float[]>(configParameters_.maxChannels * configParameters_.maxTimeSamples,
ctx.stream()),
cms::cuda::make_device_unique<float[]>(configParameters_.maxChannels * configParameters_.maxTimeSamples,
ctx.stream()),
cms::cuda::make_device_unique<float[]>(totalChannels * configParameters_.maxTimeSamples, ctx.stream()),
cms::cuda::make_device_unique<float[]>(totalChannels * configParameters_.maxTimeSamples, ctx.stream()),
cms::cuda::make_device_unique<float[]>(totalChannels * configParameters_.maxTimeSamples, ctx.stream()),
cms::cuda::make_device_unique<float[]>(
configParameters_.maxChannels * configParameters_.maxTimeSamples * configParameters_.maxTimeSamples,
ctx.stream()),
totalChannels * configParameters_.maxTimeSamples * configParameters_.maxTimeSamples, ctx.stream()),
cms::cuda::make_device_unique<float[]>(
configParameters_.maxChannels * configParameters_.maxTimeSamples * configParameters_.maxTimeSamples,
ctx.stream()),
totalChannels * configParameters_.maxTimeSamples * configParameters_.maxTimeSamples, ctx.stream()),
cms::cuda::make_device_unique<float[]>(
configParameters_.maxChannels * configParameters_.maxTimeSamples * configParameters_.maxTimeSamples,
ctx.stream()),
cms::cuda::make_device_unique<int8_t[]>(configParameters_.maxChannels, ctx.stream()),
totalChannels * configParameters_.maxTimeSamples * configParameters_.maxTimeSamples, ctx.stream()),
cms::cuda::make_device_unique<int8_t[]>(totalChannels, ctx.stream()),
};

// output dev mem
outputGPU_.allocate(configParameters_, ctx.stream());
outputGPU_.allocate(configParameters_, totalChannels, ctx.stream());

hcal::reconstruction::entryPoint(inputGPU, outputGPU_, conditions, scratchGPU, configParameters_, ctx.stream());

Expand Down

0 comments on commit eaa3636

Please sign in to comment.