Printouts and bug fix on indices access. #3

Merged Jul 8, 2016 · 32 commits
Changes from 4 commits
fb36c28
parasitic Cellular Automaton
felicepantaleo Jul 1, 2016
cf7beb1
added python configuration
felicepantaleo Jul 1, 2016
d99d021
adding plugin generation in module seal
felicepantaleo Jul 1, 2016
d90b523
adding process customizer
felicepantaleo Jul 2, 2016
be171fa
removed printouts
felicepantaleo Jul 2, 2016
6d04a6f
Merged CA_81X from repository felicepantaleo
felicepantaleo Jul 5, 2016
a6e05b3
cache hit pairs across multiple regions
fwyzard Jul 6, 2016
7da7f8c
non working version of a gpu cellular automaton integrated in cmssw
felicepantaleo Jul 7, 2016
af3e5a2
allocate array outside of the loop
fwyzard Jul 7, 2016
24b3406
cleanup
fwyzard Jul 7, 2016
d844292
creating data structures
felicepantaleo Jul 7, 2016
824719a
add public accessor to the indices
fwyzard Jul 7, 2016
89b4ff2
support CUDA version of the Celular Automaton
fwyzard Jul 7, 2016
fba6bb9
Merged CA_81X_cuda from repository fwyzard
fwyzard Jul 7, 2016
c099d7c
Merged hackaton_01 from repository felicepantaleo
fwyzard Jul 7, 2016
7bfea73
work very much in progress
fwyzard Jul 7, 2016
9ac587a
Merged CA_81X_cuda from repository fwyzard
rovere Jul 8, 2016
2aef109
Factorize GPUArena out of main gcc compilation area
rovere Jul 8, 2016
fa22a7f
make GPUArena implicitly use T* instead of T
fwyzard Jul 8, 2016
afadf03
Merge pull request #2 from rovere/CA_81X_cuda
fwyzard Jul 8, 2016
6b35d79
fixup fa22a7f
fwyzard Jul 8, 2016
0717671
fixup fa22a7f
fwyzard Jul 8, 2016
b87fd5c
fixup fa22a7f
fwyzard Jul 8, 2016
22da43b
fixup fa22a7f
fwyzard Jul 8, 2016
f3bd02a
enable first kernel
fwyzard Jul 8, 2016
0856d92
fixup fa22a7f
fwyzard Jul 8, 2016
130b20c
fix return type
fwyzard Jul 8, 2016
5e8c161
enable second kernel
fwyzard Jul 8, 2016
a2dc88b
fix GPUSimpleVector interface
fwyzard Jul 8, 2016
15a4292
fix find_ntuplets()
fwyzard Jul 8, 2016
e81b91d
enable third kernel
fwyzard Jul 8, 2016
3a1d7d6
Printouts and fix wrong indices
rovere Jul 8, 2016
3 changes: 3 additions & 0 deletions RecoPixelVertexing/PixelTriplets/BuildFile.xml
@@ -1,3 +1,4 @@
<use name="cuda"/>
<use name="FWCore/Framework"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/PluginManager"/>
@@ -18,3 +19,5 @@
</export>

<flags CXXFLAGS="-Ofast -fno-math-errno"/>
<flags CUDAFLAGS="-std=c++11"/>

196 changes: 196 additions & 0 deletions RecoPixelVertexing/PixelTriplets/interface/GPUArena.h
@@ -0,0 +1,196 @@
#ifndef GPUARENA_H_
#define GPUARENA_H_

//headers needed by the code below (fprintf/exit, assert, std::array)
#include <array>
#include <cassert>
#include <cstdio>
#include <cstdlib>

#include "cuda_runtime.h"

#define checkCudaError(val) __checkCudaError__ ( (val), #val, __FILE__, __LINE__ )

template <typename T>
inline void __checkCudaError__(T code, const char *func, const char *file, int line)
{
if (code) {
fprintf(stderr, "CUDA error at %s:%d: %s (code=%d)\n",
file, line, cudaGetErrorString(code), (unsigned int)code);
cudaDeviceReset();
exit(EXIT_FAILURE);
}
}

#define checkLastCudaError() checkCudaError( (cudaGetLastError()) )

template <int CHUNK_SIZE, typename T>
class GPUChunk {
//To think about: maybe it would be better to move the
//nextFreeValue and the next pointer out of the chunk
//and make their own lists inside the arena
//if a chunk only consists of CHUNK_SIZE values then each
//thread can load one memory transaction worth of pure data
//whereas the nextFreeValue and next pointers ruin that
public:
T values[CHUNK_SIZE];
int nextFreeValue;
GPUChunk<CHUNK_SIZE, T> *next;

__device__ int num_values_in_chunk() {
if(nextFreeValue > CHUNK_SIZE) {
return CHUNK_SIZE;
} else {
return nextFreeValue;
}
}

__device__ bool push_back(T value) {
int id = atomicAdd(&nextFreeValue, 1);
if(id < CHUNK_SIZE) {
//found space
values[id] = value;
return true;
} else {
//chunk is full and this thread must get a new one
return false;
}
}

__device__ T& get_element_at(int i) {
return values[i];
}
};

//This iterator starts at the head chunk for a given element on a given layer
//and iterates "backwards"
template <int CHUNK_SIZE, typename T>
class GPUArenaIterator {
private:
GPUChunk<CHUNK_SIZE, T> *currentChunk;
int cursorInChunk;

public:
__device__ GPUArenaIterator(GPUChunk<CHUNK_SIZE, T> *head_chunk) {
currentChunk = head_chunk;
if(currentChunk != NULL) {
cursorInChunk = currentChunk->num_values_in_chunk() - 1;
}
}

__device__ bool has_next() {
return currentChunk != NULL && (cursorInChunk >= 0 || currentChunk->next != NULL);
}

__device__ T& get_next() {
if(cursorInChunk < 0) {
//No more elements left in chunk, go to next chunk
//assuming there are more chunks because the caller checked has_next() first
currentChunk = currentChunk->next;
cursorInChunk = currentChunk->num_values_in_chunk() - 1;
}
return currentChunk->get_element_at(cursorInChunk--);
}
};

template <int CHUNK_SIZE, typename T>
__global__ void init_mappings_kernel(GPUChunk<CHUNK_SIZE, T> **mappings, GPUChunk<CHUNK_SIZE, T> *startOfChunks, int offset, int numElements) {
for(int mySlot = threadIdx.x + blockIdx.x * blockDim.x; mySlot < numElements; mySlot += gridDim.x * blockDim.x) {
mappings[mySlot] = startOfChunks + offset + mySlot;
}
}

template <int NumLayers, int CHUNK_SIZE, typename T>
class GPUArena {
private:
//how many elements does the arena store per layer
int numElementsPerLayer[NumLayers];
//a map from an element id (per layer) to the head of the chunk linked list that stores the values
GPUChunk<CHUNK_SIZE, T> **mappingIdToCurrentChunk[NumLayers];
//the shared chunks
GPUChunk<CHUNK_SIZE, T> *chunks;
//how many chunks are there in total
int capacity;
//shared cursor indicating the next free chunk. It does not start out at 0,
//because every element in every layer gets one chunk by default
public:
//there appears to be a bug in nvcc that doesn't allow for atomicAdd on an int field in this class
//(even though the same thing works for GPUChunk above). A work around is to allocate the nextFreeChunk
//integer with a cudaMalloc
int *nextFreeChunk_d;

GPUArena(int _capacity, std::array<int, NumLayers> pNumElementsPerLayer)
: capacity(_capacity)
{
//allocate the main arena storage and set everything to 0 (important
//because the counters in each chunk must start out as 0)
checkCudaError(cudaMalloc(&chunks, sizeof(GPUChunk<CHUNK_SIZE, T>) * capacity));
checkCudaError(cudaMemset(chunks, 0, sizeof(GPUChunk<CHUNK_SIZE, T>) * capacity));
checkCudaError(cudaMalloc(&nextFreeChunk_d, sizeof(int)));
checkCudaError(cudaMemset(nextFreeChunk_d, 0, sizeof(int)));

int offset = 0;
for(int layer = 0; layer < NumLayers; layer++) {
numElementsPerLayer[layer] = pNumElementsPerLayer[layer];
//each element implicitly gets its own initial chunk
size_t mapSizeInBytes = sizeof(GPUChunk<CHUNK_SIZE, T>*) * numElementsPerLayer[layer];
checkCudaError(cudaMalloc(&mappingIdToCurrentChunk[layer], mapSizeInBytes));

init_mappings_kernel<<<64, 16>>>(mappingIdToCurrentChunk[layer], chunks, offset, numElementsPerLayer[layer]);
checkLastCudaError();
cudaDeviceSynchronize();
checkLastCudaError();
offset += numElementsPerLayer[layer];
}
checkCudaError(cudaMemcpy(nextFreeChunk_d, &offset, sizeof(int), cudaMemcpyHostToDevice));
}

__device__ int get_num_elements_per_layer(int layer) {
return numElementsPerLayer[layer];
}

__device__ GPUChunk<CHUNK_SIZE, T>* get_new_chunk() {
int id = atomicAdd(nextFreeChunk_d, 1);

if(id >= capacity) {
printf("PANIC: GPUArena ran out of capacity\n");
assert(false);
return NULL;
}
return &chunks[id];
}

__device__ GPUChunk<CHUNK_SIZE, T>* get_head_chunk(int layer, int elementId) {
return mappingIdToCurrentChunk[layer][elementId];
}

__device__ GPUArenaIterator<CHUNK_SIZE, T> iterator(int layer, int elementId) {
return GPUArenaIterator<CHUNK_SIZE, T>(get_head_chunk(layer, elementId));
}

//the value is taken by copy so that temporaries can be appended as well
__device__ void push_back(int layer, int elementId, T value) {

GPUChunk<CHUNK_SIZE, T> *currentChunk = get_head_chunk(layer, elementId);
assert(currentChunk);

while(true) {
bool status = currentChunk->push_back(value);
if(status == true) {
//we were able to snatch a value spot in the chunk, done
break;
} else {
//chunk is full. Every thread that sees a full chunk allocates a new
//one and tries to publish it; only one of the racing threads can win
GPUChunk<CHUNK_SIZE, T> *newChunk = get_new_chunk();
newChunk->next = currentChunk; //hook up list
//Note: we don't need a threadfence_system here because we are
//either only writing or only reading, never both. And while writing
//nobody cares about the next pointer
GPUChunk<CHUNK_SIZE, T> *oldChunk = (GPUChunk<CHUNK_SIZE, T>*)atomicCAS((unsigned long long int *)&mappingIdToCurrentChunk[layer][elementId], (unsigned long long int)currentChunk, (unsigned long long int)newChunk);
//if our CAS succeeded, oldChunk equals currentChunk and we move on to newChunk immediately to avoid an extra loop;
//if oldChunk differs from currentChunk, somebody else came first and we continue with the chunk returned by the atomicCAS;
//in the latter case newChunk is wasted, but that's the price of avoiding livelocks
currentChunk = (oldChunk == currentChunk) ? newChunk : oldChunk;
}
}
}

};
#endif
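For orientation, a minimal usage sketch of the arena (illustrative only, not part of this pull request's diff; the kernel names and sizes are made up). Host code constructs the arena, one kernel appends values to a per-element list with push_back(), and another kernel walks that list with the iterator:

// a 1-layer arena of ints, 8 values per chunk
__global__ void fill_arena(GPUArena<1, 8, int> arena) {
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < arena.get_num_elements_per_layer(0)) {
    int v = 10 * i;
    arena.push_back(0, i, v); // append v to the list of element i on layer 0
  }
}

__global__ void read_arena(GPUArena<1, 8, int> arena) {
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < arena.get_num_elements_per_layer(0)) {
    GPUArenaIterator<8, int> it = arena.iterator(0, i);
    while (it.has_next())
      printf("element %d holds %d\n", i, it.get_next());
  }
}

// host side: 1024 chunks shared by the 256 elements of the single layer
// GPUArena<1, 8, int> arena(1024, std::array<int, 1>{{256}});
// fill_arena<<<1, 256>>>(arena);
// read_arena<<<1, 256>>>(arena);

Note that the arena is passed to kernels by value: the copy is shallow, so all copies share the same device-side chunk storage.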




248 changes: 248 additions & 0 deletions RecoPixelVertexing/PixelTriplets/interface/GPUCACell.h
@@ -0,0 +1,248 @@
#ifndef GPU_CACELL_H_
#define GPU_CACELL_H_

#include "RecoPixelVertexing/PixelTriplets/interface/GPUHitsAndDoublets.h"
#include "TrackingTools/TransientTrackingRecHit/interface/SeedingLayerSetsHits.h"
#include "RecoTracker/TkTrackingRegions/interface/TrackingRegion.h"
#include "GPUSimpleVector.h"

#include "DataFormats/Math/interface/deltaPhi.h"
#include "GPUArena.h"
#include <cmath>
#include <array>


template<int numberOfLayers>
class GPUCACell {
public:

using CAntuplet = GPUSimpleVector<numberOfLayers, GPUCACell<numberOfLayers>* >;
__device__
GPUCACell()
{

}

__device__
void init(const GPULayerDoublets* doublets, const int layerId, const int doubletId, const int innerHitId, const int outerHitId)
{
theCAState = 0;
theInnerHitId = innerHitId;
theOuterHitId = outerHitId;
hasSameStateNeighbors = 0;
theDoublets = doublets;
theDoubletId = doubletId;
theLayerIdInFourLayers = layerId;

theInnerX = doublets->layers[0].x[doubletId];
theOuterX = doublets->layers[1].x[doubletId];

theInnerY = doublets->layers[0].y[doubletId];
theOuterY = doublets->layers[1].y[doubletId];

theInnerZ = doublets->layers[0].z[doubletId];
theOuterZ = doublets->layers[1].z[doubletId];

theInnerR = hypot(theInnerX, theInnerY);
theOuterR = hypot(theOuterX, theOuterY);

}


__device__
float get_inner_x() const {
return theInnerX;
}
__device__
float get_outer_x() const {
return theOuterX;
}
__device__
float get_inner_y() const {
return theInnerY;
}
__device__
float get_outer_y() const {
return theOuterY;
}
__device__
float get_inner_z() const {
return theInnerZ;
}
__device__
float get_outer_z() const {
return theOuterZ;
}
__device__
float get_inner_r() const {
return theInnerR;
}
__device__
float get_outer_r() const {
return theOuterR;
}
__device__
unsigned int get_inner_hit_id() const {
return theInnerHitId;
}
__device__
unsigned int get_outer_hit_id() const {
return theOuterHitId;
}

__device__
void evolve(GPUArena<numberOfLayers,4,GPUCACell<numberOfLayers>*>& innerNeighbors) {

hasSameStateNeighbors = 0;
//ask the arena for an iterator over this cell's inner neighbors
GPUArenaIterator<4, GPUCACell<numberOfLayers>*> innerNeighborsIterator = innerNeighbors.iterator(theLayerIdInFourLayers, theDoubletId);
GPUCACell<numberOfLayers>* otherCell;
while (innerNeighborsIterator.has_next())
{
otherCell = innerNeighborsIterator.get_next();


if (otherCell->get_CA_state() == theCAState) {

hasSameStateNeighbors = 1;

break;
}
}

}

__device__
bool check_alignment_and_tag(const GPUCACell<numberOfLayers>* innerCell, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float thetaCut, const float phiCut) {

return (are_aligned_RZ(innerCell, ptmin, thetaCut) && have_similar_curvature(innerCell, region_origin_x, region_origin_y, region_origin_radius, phiCut));

}

__device__
bool are_aligned_RZ(const GPUCACell<numberOfLayers>* otherCell, const float ptmin, const float thetaCut) const {

float r1 = otherCell->get_inner_r();
float z1 = otherCell->get_inner_z();
float distance_13_squared = (r1 - theOuterR)*(r1 - theOuterR) + (z1 - theOuterZ)*(z1 - theOuterZ);
float tan_12_13_half = fabs(z1 * (theInnerR - theOuterR) + theInnerZ * (theOuterR - r1) + theOuterZ * (r1 - theInnerR)) / distance_13_squared;
return tan_12_13_half * ptmin <= thetaCut;
}

__device__
bool have_similar_curvature(const GPUCACell<numberOfLayers>* otherCell,
const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float phiCut) const {
auto x1 = otherCell->get_inner_x();
auto y1 = otherCell->get_inner_y();

auto x2 = get_inner_x();
auto y2 = get_inner_y();

auto x3 = get_outer_x();
auto y3 = get_outer_y();

auto precision = 0.5f;
auto offset = x2 * x2 + y2*y2;

auto bc = (x1 * x1 + y1 * y1 - offset) / 2.f;

auto cd = (offset - x3 * x3 - y3 * y3) / 2.f;

auto det = (x1 - x2) * (y2 - y3) - (x2 - x3)* (y1 - y2);

//points are aligned
if (fabs(det) < precision)
return true;

auto idet = 1.f / det;

auto x_center = (bc * (y2 - y3) - cd * (y1 - y2)) * idet;
auto y_center = (cd * (x1 - x2) - bc * (x2 - x3)) * idet;

auto radius = std::sqrt((x2 - x_center)*(x2 - x_center) + (y2 - y_center)*(y2 - y_center));
auto centers_distance_squared = (x_center - region_origin_x)*(x_center - region_origin_x) + (y_center - region_origin_y)*(y_center - region_origin_y);

auto minimumOfIntersectionRange = (radius - region_origin_radius)*(radius - region_origin_radius) - phiCut;

if (centers_distance_squared >= minimumOfIntersectionRange) {
auto maximumOfIntersectionRange = (radius + region_origin_radius)*(radius + region_origin_radius) + phiCut;
return centers_distance_squared <= maximumOfIntersectionRange;
} else {

return false;
}

}
__device__
unsigned int get_CA_state() const {
return theCAState;
}

// if there is at least one left neighbor with the same state (friend), the state has to be increased by 1.
__device__
void update_state() {
theCAState += hasSameStateNeighbors;
}


__device__
bool is_root_cell(const unsigned int minimumCAState) const {
return (theCAState >= minimumCAState);
}

// try to free the track-building process from hard-coded layers, basing the
// traversal of the graph on the neighborhood connections between cells

__device__
void find_ntuplets(GPUSimpleVector<100,CAntuplet>* foundNtuplets, GPUArena<numberOfLayers,4,GPUCACell<numberOfLayers>*>& theInnerNeighbors, CAntuplet& tmpNtuplet, const unsigned int minHitsPerNtuplet) const {

// the building process for a track ends if:
// it has no right neighbor
// it has no compatible neighbor
// the ntuplet is then saved if the number of hits it contains is greater than a threshold
GPUArenaIterator<4, GPUCACell<numberOfLayers>*> innerNeighborsIterator = theInnerNeighbors.iterator(theLayerIdInFourLayers, theDoubletId);
GPUCACell<numberOfLayers>* otherCell;

if (!innerNeighborsIterator.has_next()) {
if (tmpNtuplet.size() >= minHitsPerNtuplet - 1)
//foundNtuplets is shared among all threads: use the thread-safe push
foundNtuplets->push_threadsafe(tmpNtuplet);
else
return;
} else {


while (innerNeighborsIterator.has_next())
{
otherCell = innerNeighborsIterator.get_next();
tmpNtuplet.push(otherCell); //tmpNtuplet is thread-local, the non-atomic push is enough
otherCell->find_ntuplets(foundNtuplets, theInnerNeighbors, tmpNtuplet, minHitsPerNtuplet);
tmpNtuplet.pop_back();

}

}
}


private:

unsigned int theCAState;
unsigned int theInnerHitId;
unsigned int theOuterHitId;
unsigned int hasSameStateNeighbors;
const GPULayerDoublets* theDoublets;
int theDoubletId;
int theLayerIdInFourLayers;
float theInnerX;
float theOuterX;
float theInnerY;
float theOuterY;
float theInnerZ;
float theOuterZ;
float theInnerR;
float theOuterR;


};


#endif /* GPU_CACELL_H_ */
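evolve() and update_state() split one cellular-automaton iteration into a read phase and a commit phase; on a GPU, the global barrier between the two phases has to come from separate kernel launches. A possible sketch of those launches (assumed here, not part of this diff):

template<int numberOfLayers>
__global__ void kernel_evolve(const GPULayerDoublets* gpuDoublets,
    GPUCACell<numberOfLayers>** cells,
    GPUArena<numberOfLayers, 4, GPUCACell<numberOfLayers>*> innerNeighbors)
{
  unsigned int layerPairIndex = blockIdx.y;
  if (layerPairIndex < numberOfLayers - 1)
    for (int i = threadIdx.x + blockIdx.x * blockDim.x;
         i < gpuDoublets[layerPairIndex].size; i += gridDim.x * blockDim.x)
      cells[layerPairIndex][i].evolve(innerNeighbors); // read-only phase
}

template<int numberOfLayers>
__global__ void kernel_update_state(const GPULayerDoublets* gpuDoublets,
    GPUCACell<numberOfLayers>** cells)
{
  unsigned int layerPairIndex = blockIdx.y;
  if (layerPairIndex < numberOfLayers - 1)
    for (int i = threadIdx.x + blockIdx.x * blockDim.x;
         i < gpuDoublets[layerPairIndex].size; i += gridDim.x * blockDim.x)
      cells[layerPairIndex][i].update_state(); // commit phase
}

Launching the two kernels alternately on the same stream provides the required barrier, since kernels on one stream execute in order.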
46 changes: 46 additions & 0 deletions RecoPixelVertexing/PixelTriplets/interface/GPUCellularAutomaton.h
@@ -0,0 +1,46 @@
#ifndef GPUCELLULARAUTOMATON_H_
#define GPUCELLULARAUTOMATON_H_


#include <array>
#include "GPUCACell.h"
#include "TrackingTools/TransientTrackingRecHit/interface/SeedingLayerSetsHits.h"
#include "RecoTracker/TkTrackingRegions/interface/TrackingRegion.h"
#include "RecoTracker/TkHitPairs/interface/RecHitsSortedInPhi.h"


template<unsigned int theNumberOfLayers>
class GPUCellularAutomaton {
public:

GPUCellularAutomaton(std::vector<const HitDoublets*> doublets, const TrackingRegion& region, const float phiCut, const float thetaCut) {
}

void create_and_connect_cells();

void evolve();

void find_ntuplets(std::vector<typename GPUCACell<theNumberOfLayers>::CAntuplet>&, const unsigned int);

private:

//fixed capacity, matching the GPUSimpleVector<100, CAntuplet> used in GPUCACell::find_ntuplets
static constexpr unsigned int maxNumberOfQuadruplets = 100;

GPULayerDoublets* gpuDoublets;
RecHitsSortedInPhi_gpu* hits;
GPUCACell<theNumberOfLayers>** theCells;
RecHitsSortedInPhi_gpu* hitsOnLayers;
GPUArena<theNumberOfLayers-1, 4, GPUCACell<theNumberOfLayers>* >* isOuterHitOfCell;
GPUArena<theNumberOfLayers, 4, GPUCACell<theNumberOfLayers>* >* theInnerNeighbors;
GPUSimpleVector<maxNumberOfQuadruplets, typename GPUCACell<theNumberOfLayers>::CAntuplet>* foundNtuplets;

};


#endif
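Only the interface is declared here; the method bodies are not in this diff. The call sequence implied by the kernels in GPUCellularAutomaton.cu would presumably look like this (hypothetical usage, stated as an assumption):

// hypothetical host-side driver code
GPUCellularAutomaton<4> automaton(doublets, region, phiCut, thetaCut);
automaton.create_and_connect_cells();    // kernel_create + kernel_connect
automaton.evolve();                      // repeated CA state-update rounds
std::vector<GPUCACell<4>::CAntuplet> quadruplets;
automaton.find_ntuplets(quadruplets, 4); // kernel_find_ntuplets + copy back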
80 changes: 80 additions & 0 deletions RecoPixelVertexing/PixelTriplets/interface/GPUSimpleVector.h
@@ -0,0 +1,80 @@
#ifndef GPU_SIMPLEVECTOR_H_
#define GPU_SIMPLEVECTOR_H_

template< int maxSize, class T>
struct GPUSimpleVector
{
// __forceinline__ __host__ __device__ CUDAQueue( ) { }

__inline__ __device__
int push_threadsafe(const T& element) {
auto previousSize = atomicAdd(&m_size, 1);
if(previousSize < maxSize)
{
m_data[previousSize] = element;
return previousSize;
} else {
//the vector is full: undo the reservation and report failure
atomicSub(&m_size, 1);
return -1;
}
};


__inline__ __device__
int push(const T& element) {

auto previousSize = m_size++;
if(previousSize<maxSize)
{
m_data[previousSize] = element;
return previousSize;
} else
return -1;
};

__inline__ __device__
T pop_back_threadsafe() {
#ifdef __CUDACC__
if(m_size > 0)
{
auto previousSize = atomicAdd(&m_size, -1);
return m_data[previousSize-1];
}
#endif
//empty vector: return a default-constructed T instead of falling off the end
return T();
};

__inline__ __device__
T pop_back() {
if(m_size > 0)
{
auto previousSize = m_size--;
return m_data[previousSize-1];
}
return T();
};

__inline__ __host__ __device__
void reset()
{
m_size = 0;
};


__inline__ __host__ __device__
int size() const
{
return m_size;
};



T m_data[maxSize];
int m_size;


};



#endif
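A quick usage note (illustrative, not part of the diff): the struct is deliberately POD-like, so an instance can be cudaMalloc'ed, zeroed with cudaMemset (which also resets m_size) and passed straight to a kernel; push_threadsafe() is the variant to use whenever several threads append to the same instance:

// hypothetical example: collect the even thread indices
__global__ void keep_even(GPUSimpleVector<64, int>* out) {
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i % 2 == 0)
    out->push_threadsafe(i); // all threads share *out, so use the atomic push
}

// host side: cudaMalloc a GPUSimpleVector<64, int>, cudaMemset it to zero,
// launch keep_even, then cudaMemcpy the struct back and read size()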
1 change: 1 addition & 0 deletions RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml
@@ -2,6 +2,7 @@
<use name="RecoPixelVertexing/PixelTriplets"/>
<use name="RecoTracker/TkSeedingLayers"/>
<use name="RecoPixelVertexing/PixelTrackFitting"/>
<use name="cuda"/>
<library file="*.cc" name="RecoPixelVertexingPixelTripletsPlugins">
<flags EDM_PLUGIN="1"/>
</library>
85 changes: 85 additions & 0 deletions RecoPixelVertexing/PixelTriplets/src/GPUCellularAutomaton.cu
@@ -0,0 +1,85 @@
#include "RecoTracker/TkHitPairs/interface/RecHitsSortedInPhi.h"
#include "../interface/GPUCACell.h"
#include "../interface/GPUArena.h"


using CAntuplet = GPUSimpleVector<4, GPUCACell<4>*>;


template<int numberOfLayers>
__global__
void kernel_create(const GPULayerDoublets* gpuDoublets,
GPUCACell<numberOfLayers>** cells, GPUArena<numberOfLayers-1, 4, GPUCACell<numberOfLayers>* > isOuterHitOfCell)
{
unsigned int layerPairIndex = blockIdx.y;
unsigned int cellIndexInLayerPair = threadIdx.x + blockIdx.x * blockDim.x;
if(layerPairIndex < numberOfLayers-1)
{
for(int i = cellIndexInLayerPair; i < gpuDoublets[layerPairIndex].size; i+=gridDim.x * blockDim.x)
{

cells[layerPairIndex][i].init(&gpuDoublets[layerPairIndex], layerPairIndex, i, gpuDoublets[layerPairIndex].indices[2*i], gpuDoublets[layerPairIndex].indices[2*i+1]);
isOuterHitOfCell.push_back(layerPairIndex, cells[layerPairIndex][i].get_outer_hit_id(), &(cells[layerPairIndex][i]));
}
}

}


template<unsigned int numberOfLayers>
__global__
void kernel_connect(const GPULayerDoublets* gpuDoublets, GPUCACell<numberOfLayers>** cells,
GPUArena<numberOfLayers-1,4, GPUCACell<numberOfLayers>* > isOuterHitOfCell,
GPUArena<numberOfLayers-1,4, GPUCACell<numberOfLayers>* > innerNeighbors,float ptmin, float region_origin_x,
float region_origin_y, float region_origin_radius, float thetaCut,
float phiCut)
{

unsigned int layerPairIndex = blockIdx.y;
unsigned int cellIndexInLayerPair = threadIdx.x + blockIdx.x * blockDim.x;
//the first layer pair has no inner neighbors, so start from the second one
if(layerPairIndex > 0 && layerPairIndex < numberOfLayers-1)
{
for (int i = cellIndexInLayerPair; i < gpuDoublets[layerPairIndex].size;
i += gridDim.x * blockDim.x)
{
//cells ending on this cell's inner hit were registered by kernel_create
//under the previous layer pair, keyed by their outer hit id
GPUArenaIterator<4, GPUCACell<numberOfLayers>*> innerCellsIterator = isOuterHitOfCell.iterator(layerPairIndex - 1, cells[layerPairIndex][i].get_inner_hit_id());
GPUCACell<numberOfLayers>* otherCell;
while (innerCellsIterator.has_next())
{
otherCell = innerCellsIterator.get_next();
if(cells[layerPairIndex][i].check_alignment_and_tag(otherCell,
ptmin, region_origin_x, region_origin_y,
region_origin_radius, thetaCut, phiCut))
innerNeighbors.push_back(layerPairIndex, i, otherCell);
}
}
}
}

template<unsigned int numberOfLayers, unsigned int maxNumberOfQuadruplets>
__global__
void kernel_find_ntuplets(const GPULayerDoublets* gpuDoublets,GPUCACell<numberOfLayers>** cells,
GPUSimpleVector<maxNumberOfQuadruplets, CAntuplet>* foundNtuplets,
GPUArena<numberOfLayers,4,GPUCACell<numberOfLayers>* >* theInnerNeighbors, const unsigned int minHitsPerNtuplet)
{
unsigned int cellIndexInLastLayerPair = threadIdx.x
+ blockIdx.x * blockDim.x;
constexpr unsigned int lastLayerPairIndex = numberOfLayers - 2;
CAntuplet tmpNtuplet;

for (int i = cellIndexInLastLayerPair; i < gpuDoublets[lastLayerPairIndex].size;
i += gridDim.x * blockDim.x)
{
tmpNtuplet.reset();
cells[lastLayerPairIndex][i].find_ntuplets(foundNtuplets,
*theInnerNeighbors, tmpNtuplet, minHitsPerNtuplet);

}

}
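The host-side driver that allocates the device buffers and chains these kernels is not part of this diff; a rough sketch of the intended launch sequence, with placeholder variable names and grid sizes, might be:

// hypothetical driver, assuming all device buffers are already set up
constexpr int numLayers = 4;
dim3 blocks(32, numLayers - 1); // blockIdx.y selects the layer pair
dim3 threads(256);

kernel_create<numLayers><<<blocks, threads>>>(gpuDoublets, cells, isOuterHitOfCell);
checkLastCudaError();

kernel_connect<numLayers><<<blocks, threads>>>(gpuDoublets, cells, isOuterHitOfCell,
    innerNeighbors, ptmin, region_origin_x, region_origin_y,
    region_origin_radius, thetaCut, phiCut);
checkLastCudaError();

// theInnerNeighborsDevPtr: device-resident copy of the innerNeighbors arena object
kernel_find_ntuplets<numLayers, 100><<<32, 256>>>(gpuDoublets, cells,
    foundNtuplets, theInnerNeighborsDevPtr, 4);
checkCudaError(cudaDeviceSynchronize());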




1 change: 1 addition & 0 deletions RecoTracker/TkHitPairs/BuildFile.xml
@@ -12,6 +12,7 @@
<use name="TrackingTools/TransientTrackingRecHit"/>
<use name="RecoTracker/TkMSParametrization"/>
<use name="RecoTracker/TkSeedingLayers"/>
<use name="cuda"/>
<export>
<lib name="1"/>
</export>