Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ECAL - Implement follow up issues for Phase 2 GPU reconstruction #39195

Merged
merged 3 commits into from
Sep 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Configuration/PyReleaseValidation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ The offsets currently in use are:
* 0.21: Production-like sequence
* 0.24: 0 Tesla (Run-2, Run-3)
* 0.31: Photon energy corrections with DRN architecture
* 0.61: `phase2_ecal_devel` era
* 0.61: ECAL `phase2_ecal_devel` era, on CPU
* 0.612: ECAL `phase2_ecal_devel` era, with automatic offload to GPU if available
* 0.75: Phase-2 HLT
* 0.91: Track DNN modifier
* 0.97: Premixing stage1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1388,36 +1388,58 @@ def condition(self, fragment, stepList, key, hasHarvest):
offset = 0.6,
)

# ECAL Phase 2 development WF
class UpgradeWorkflow_ecalDevel(UpgradeWorkflow):
def __init__(self, digi = {}, reco = {}, harvest = {}, **kwargs):
# adapt the parameters for the UpgradeWorkflow init method
super(UpgradeWorkflow_ecalDevel, self).__init__(
steps = [
'DigiTrigger',
'RecoGlobal',
'HARVESTGlobal',
],
PU = [
'DigiTrigger',
'RecoGlobal',
'HARVESTGlobal',
],
**kwargs)
self.__digi = digi
self.__reco = reco
self.__harvest = harvest

def setup_(self, step, stepName, stepDict, k, properties):
# temporarily remove trigger & downstream steps
mods = {'--era': stepDict[step][k]['--era']+',phase2_ecal_devel'}
if 'Digi' in step:
mods['-s'] = 'DIGI:pdigi_valid,DIGI2RAW'
mods |= self.__digi
elif 'Reco' in step:
mods['-s'] = 'RAW2DIGI,RECO:reconstruction_ecalOnly,VALIDATION:@ecalOnlyValidation,DQM:@ecalOnly'
mods['--datatier'] = 'GEN-SIM-RECO,DQMIO'
mods['--eventcontent'] = 'FEVTDEBUGHLT,DQM'
mods |= self.__reco
elif 'HARVEST' in step:
mods['-s'] = 'HARVESTING:@ecalOnlyValidation+@ecal'
mods |= self.__harvest
stepDict[stepName][k] = merge([mods, stepDict[step][k]])

def condition(self, fragment, stepList, key, hasHarvest):
return fragment=="TTbar_14TeV" and '2026' in key

# ECAL Phase 2 workflow running on CPU
upgradeWFs['ecalDevel'] = UpgradeWorkflow_ecalDevel(
steps = [
'DigiTrigger',
'RecoGlobal',
'HARVESTGlobal',
],
PU = [
'DigiTrigger',
'RecoGlobal',
'HARVESTGlobal',
],
suffix = '_ecalDevel',
offset = 0.61,
)

# ECAL Phase 2 workflow running on CPU or GPU (if available)
upgradeWFs['ecalDevelGPU'] = UpgradeWorkflow_ecalDevel(
reco = {'--procModifiers': 'gpu'},
suffix = '_ecalDevelGPU',
offset = 0.612,
)

class UpgradeWorkflow_0T(UpgradeWorkflow):
def setup_(self, step, stepName, stepDict, k, properties):
myGT=stepDict[step][k]['--conditions']
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,18 @@

#include "DeclsForKernelsPhase2.h"

class EcalPhase2DigiToGPUProducer : public edm::stream::EDProducer<edm::ExternalWork> {
class EcalPhase2DigiToGPUProducer : public edm::stream::EDProducer<> {
public:
explicit EcalPhase2DigiToGPUProducer(const edm::ParameterSet& ps);
~EcalPhase2DigiToGPUProducer() override = default;
static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);

void acquire(edm::Event const& event, edm::EventSetup const& setup, edm::WaitingTaskWithArenaHolder holder) override;
void produce(edm::Event& evt, edm::EventSetup const& setup) override;

private:
const edm::EDGetTokenT<EBDigiCollectionPh2> digiCollectionToken_;
const edm::EDPutTokenT<cms::cuda::Product<ecal::DigisCollection<calo::common::DevStoragePolicy>>>
digisCollectionToken_;
uint32_t size_;

ecal::DigisCollection<::calo::common::DevStoragePolicy> digis_;

cms::cuda::ContextState cudaState_;
};

void EcalPhase2DigiToGPUProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
Expand All @@ -46,24 +40,24 @@ EcalPhase2DigiToGPUProducer::EcalPhase2DigiToGPUProducer(const edm::ParameterSet
digisCollectionToken_(produces<cms::cuda::Product<ecal::DigisCollection<calo::common::DevStoragePolicy>>>(
ps.getParameter<std::string>("digisLabelEB"))) {}

void EcalPhase2DigiToGPUProducer::acquire(edm::Event const& event,
edm::EventSetup const& setup,
edm::WaitingTaskWithArenaHolder holder) {
cms::cuda::ScopedContextAcquire ctx{event.streamID(), std::move(holder), cudaState_};
void EcalPhase2DigiToGPUProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
cms::cuda::ScopedContextProduce ctx{event.streamID()};

//input data from event
const auto& pdigis = event.get(digiCollectionToken_);

size_ = pdigis.size();
const uint32_t size = pdigis.size();

ecal::DigisCollection<::calo::common::DevStoragePolicy> digis;
digis.size = size;

digis_.size = size_;
//allocate device pointers for output
digis_.ids = cms::cuda::make_device_unique<uint32_t[]>(size_, ctx.stream());
digis_.data = cms::cuda::make_device_unique<uint16_t[]>(size_ * EcalDataFrame_Ph2::MAXSAMPLES, ctx.stream());
digis.ids = cms::cuda::make_device_unique<uint32_t[]>(size, ctx.stream());
digis.data = cms::cuda::make_device_unique<uint16_t[]>(size * EcalDataFrame_Ph2::MAXSAMPLES, ctx.stream());

//allocate host pointers for holding product data and id vectors
auto idstmp = cms::cuda::make_host_unique<uint32_t[]>(size_, ctx.stream());
auto datatmp = cms::cuda::make_host_unique<uint16_t[]>(size_ * EcalDataFrame_Ph2::MAXSAMPLES, ctx.stream());
auto idstmp = cms::cuda::make_host_unique<uint32_t[]>(size, ctx.stream());
auto datatmp = cms::cuda::make_host_unique<uint16_t[]>(size * EcalDataFrame_Ph2::MAXSAMPLES, ctx.stream());

//iterate over digis
uint32_t i = 0;
Expand All @@ -82,22 +76,16 @@ void EcalPhase2DigiToGPUProducer::acquire(edm::Event const& event,
}

//copy output vectors into member variable device pointers for the output struct

cudaCheck(
cudaMemcpyAsync(digis_.ids.get(), idstmp.get(), size_ * sizeof(uint32_t), cudaMemcpyHostToDevice, ctx.stream()));
cudaCheck(cudaMemcpyAsync(digis_.data.get(),
cudaMemcpyAsync(digis.ids.get(), idstmp.get(), size * sizeof(uint32_t), cudaMemcpyHostToDevice, ctx.stream()));
cudaCheck(cudaMemcpyAsync(digis.data.get(),
datatmp.get(),
size_ * EcalDataFrame_Ph2::MAXSAMPLES * sizeof(uint16_t),
size * EcalDataFrame_Ph2::MAXSAMPLES * sizeof(uint16_t),
cudaMemcpyHostToDevice,
ctx.stream()));
}

void EcalPhase2DigiToGPUProducer::produce(edm::Event& event, edm::EventSetup const& setup) {
//get cuda context state for producer
cms::cuda::ScopedContextProduce ctx{cudaState_};

//emplace output in the context
ctx.emplace(event, digisCollectionToken_, std::move(digis_));
ctx.emplace(event, digisCollectionToken_, std::move(digis));
}

DEFINE_FWK_MODULE(EcalPhase2DigiToGPUProducer);
Original file line number Diff line number Diff line change
Expand Up @@ -19,36 +19,39 @@ namespace ecal {
double const* __restrict__ weights,
uint32_t* __restrict__ flags) {
constexpr int nsamples = EcalDataFrame_Ph2::MAXSAMPLES;
int const tx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int nchannels_per_block = blockDim.x;
unsigned int const threadx = threadIdx.x;

if (tx < nchannels) {
extern __shared__ char shared_mem[];
double* shr_weights = (double*)&shared_mem[0];
float* shr_amp = (float*)&shared_mem[nsamples * sizeof(double)];
uint16_t* shr_digis = (uint16_t*)&shared_mem[nsamples * sizeof(double) + nchannels_per_block * sizeof(float)];
for (int i = 0; i < nsamples; ++i)
shr_weights[i] = weights[i];
// copy data from global to shared memory
extern __shared__ char shared_mem[];
double* shr_weights = reinterpret_cast<double*>(shared_mem); // nsamples elements
float* shr_amp = reinterpret_cast<float*>(shr_weights + nsamples); // nchannels_per_block elements
uint16_t* shr_digis = reinterpret_cast<uint16_t*>(shr_amp + nchannels_per_block); // nchannels_per_block elements
for (int i = 0; i < nsamples; ++i)
shr_weights[i] = weights[i];

unsigned int const bx = blockIdx.x; //block index
unsigned int const threadx = threadIdx.x;
unsigned int const blockx = blockIdx.x;

for (int sample = 0; sample < nsamples; ++sample) {
int const idx = threadx * nsamples + sample;
shr_digis[idx] = digis_in[bx * nchannels_per_block * nsamples + idx];
}
shr_amp[threadx] = 0.0;
__syncthreads();
for (int sample = 0; sample < nsamples; ++sample) {
int const idx = threadx * nsamples + sample;
shr_digis[idx] = digis_in[blockx * nchannels_per_block * nsamples + idx];
}
shr_amp[threadx] = 0.;

__syncthreads();

const auto first = threadIdx.x + blockIdx.x * blockDim.x;
const auto stride = blockDim.x * gridDim.x;
for (auto tx = first; tx < nchannels; tx += stride) {
auto const did = DetId{dids[tx]};
CMS_UNROLL_LOOP
for (int sample = 0; sample < nsamples; ++sample) {
const unsigned int idx = threadIdx.x * nsamples + sample;
const unsigned int idx = threadx * nsamples + sample;
const auto shr_digi = shr_digis[idx];
shr_amp[threadx] += (static_cast<float>(ecalLiteDTU::adc(shr_digi)) *
ecalPh2::gains[ecalLiteDTU::gainId(shr_digi)] * shr_weights[sample]);
}
const unsigned int tdx = threadIdx.x * nsamples;
const unsigned int tdx = threadx * nsamples;
amplitude[tx] = shr_amp[threadx];
amplitudeError[tx] = 1.0f;
dids_out[tx] = did.rawId();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,13 @@
#include "EcalUncalibRecHitPhase2WeightsAlgoGPU.h"
#include "DeclsForKernelsPhase2.h"

class EcalUncalibRecHitPhase2WeightsProducerGPU : public edm::stream::EDProducer<edm::ExternalWork> {
class EcalUncalibRecHitPhase2WeightsProducerGPU : public edm::stream::EDProducer<> {
public:
explicit EcalUncalibRecHitPhase2WeightsProducerGPU(edm::ParameterSet const &ps);
~EcalUncalibRecHitPhase2WeightsProducerGPU() override = default;
static void fillDescriptions(edm::ConfigurationDescriptions &);

private:
void acquire(edm::Event const &, edm::EventSetup const &, edm::WaitingTaskWithArenaHolder) override;
void produce(edm::Event &, edm::EventSetup const &) override;

private:
Expand All @@ -31,10 +30,6 @@ class EcalUncalibRecHitPhase2WeightsProducerGPU : public edm::stream::EDProducer

// event data
ecal::weights::EventOutputDataGPU eventOutputDataGPU_;

cms::cuda::ContextState cudaState_;

uint32_t size_;
};

// constructor with initialisation of elements
Expand Down Expand Up @@ -74,43 +69,35 @@ void EcalUncalibRecHitPhase2WeightsProducerGPU::fillDescriptions(edm::Configurat
descriptions.addWithDefaultLabel(desc);
}

// aquire function which initislises objects on host and device to their actual objects and calls kernal
void EcalUncalibRecHitPhase2WeightsProducerGPU::acquire(edm::Event const &event,
edm::EventSetup const &setup,
edm::WaitingTaskWithArenaHolder holder) {
void EcalUncalibRecHitPhase2WeightsProducerGPU::produce(edm::Event &event, const edm::EventSetup &setup) {
// cuda products
auto const &digisProduct = event.get(digisToken_);
// raii
cms::cuda::ScopedContextAcquire ctx{digisProduct, std::move(holder), cudaState_};
cms::cuda::ScopedContextProduce ctx{digisProduct};

// get actual obj
auto const &digis = ctx.get(digisProduct);

size_ = digis.size;

// if no digis stop here
if (size_ == 0)
return;
const uint32_t size = digis.size;

auto weights_d = cms::cuda::make_device_unique<double[]>(EcalDataFrame_Ph2::MAXSAMPLES, ctx.stream());
// do not run the algo if there are no digis
if (size > 0) {
auto weights_d = cms::cuda::make_device_unique<double[]>(EcalDataFrame_Ph2::MAXSAMPLES, ctx.stream());

cudaCheck(cudaMemcpyAsync(weights_d.get(),
weights_.data(),
EcalDataFrame_Ph2::MAXSAMPLES * sizeof(double),
cudaMemcpyHostToDevice,
ctx.stream()));
cudaCheck(cudaMemcpyAsync(weights_d.get(),
weights_.data(),
EcalDataFrame_Ph2::MAXSAMPLES * sizeof(double),
cudaMemcpyHostToDevice,
ctx.stream()));

// output on GPU
eventOutputDataGPU_.allocate(size_, ctx.stream());
// output on GPU
eventOutputDataGPU_.allocate(size, ctx.stream());

ecal::weights::phase2Weights(digis, eventOutputDataGPU_, weights_d, ctx.stream());
}

void EcalUncalibRecHitPhase2WeightsProducerGPU::produce(edm::Event &event, const edm::EventSetup &setup) {
cms::cuda::ScopedContextProduce ctx{cudaState_};
ecal::weights::phase2Weights(digis, eventOutputDataGPU_, weights_d, ctx.stream());
}

// set the size of digis
eventOutputDataGPU_.recHits.size = size_;
eventOutputDataGPU_.recHits.size = size;

// put into the event
ctx.emplace(event, recHitsToken_, std::move(eventOutputDataGPU_.recHits));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@
from RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitConvertGPU2CPUFormat_cfi import ecalUncalibRecHitConvertGPU2CPUFormat as _ecalUncalibRecHitConvertGPU2CPUFormat
gpu.toModify(ecalUncalibRecHitPhase2,
cuda = _ecalUncalibRecHitConvertGPU2CPUFormat.clone(
isPhase2 = cms.bool(True),
recHitsLabelGPUEB = cms.InputTag('ecalUncalibRecHitSoA', 'EcalUncalibRecHitsEB'),
isPhase2 = True,
recHitsLabelGPUEB = ('ecalUncalibRecHitSoA', 'EcalUncalibRecHitsEB'),
recHitsLabelGPUEE = None, # remove unneeded Phase1 parameters
recHitsLabelCPUEE = None
)
Expand Down