Printouts and bug fix on indices access. #3

Merged Jul 8, 2016 · 32 commits
Changes from 4 commits
fb36c28
parasitic Cellular Automaton
felicepantaleo Jul 1, 2016
cf7beb1
added python configuration
felicepantaleo Jul 1, 2016
d99d021
adding plugin generation in module seal
felicepantaleo Jul 1, 2016
d90b523
adding process customizer
felicepantaleo Jul 2, 2016
be171fa
removed printouts
felicepantaleo Jul 2, 2016
6d04a6f
Merged CA_81X from repository felicepantaleo
felicepantaleo Jul 5, 2016
a6e05b3
cache hit pairs across multiple regions
fwyzard Jul 6, 2016
7da7f8c
non working version of a gpu cellular automaton integrated in cmssw
felicepantaleo Jul 7, 2016
af3e5a2
allocate array outside of the loop
fwyzard Jul 7, 2016
24b3406
cleanup
fwyzard Jul 7, 2016
d844292
creating data structures
felicepantaleo Jul 7, 2016
824719a
add public accessor to the indices
fwyzard Jul 7, 2016
89b4ff2
support CUDA version of the Celular Automaton
fwyzard Jul 7, 2016
fba6bb9
Merged CA_81X_cuda from repository fwyzard
fwyzard Jul 7, 2016
c099d7c
Merged hackaton_01 from repository felicepantaleo
fwyzard Jul 7, 2016
7bfea73
work very much in progress
fwyzard Jul 7, 2016
9ac587a
Merged CA_81X_cuda from repository fwyzard
rovere Jul 8, 2016
2aef109
Factorize GPUArena out of main gcc compilation area
rovere Jul 8, 2016
fa22a7f
make GPUArena implicitly use T* instead of T
fwyzard Jul 8, 2016
afadf03
Merge pull request #2 from rovere/CA_81X_cuda
fwyzard Jul 8, 2016
6b35d79
fixup fa22a7f
fwyzard Jul 8, 2016
0717671
fixup fa22a7f
fwyzard Jul 8, 2016
b87fd5c
fixup fa22a7f
fwyzard Jul 8, 2016
22da43b
fixup fa22a7f
fwyzard Jul 8, 2016
f3bd02a
enable first kernel
fwyzard Jul 8, 2016
0856d92
fixup fa22a7f
fwyzard Jul 8, 2016
130b20c
fix return type
fwyzard Jul 8, 2016
5e8c161
enable second kernel
fwyzard Jul 8, 2016
a2dc88b
fix GPUSimpleVector interface
fwyzard Jul 8, 2016
15a4292
fix find_ntuplets()
fwyzard Jul 8, 2016
e81b91d
enable third kernel
fwyzard Jul 8, 2016
3a1d7d6
Printouts and fix wrong indices
rovere Jul 8, 2016
3 changes: 3 additions & 0 deletions RecoPixelVertexing/PixelTriplets/BuildFile.xml
@@ -1,3 +1,4 @@
<use name="cuda"/>
<use name="FWCore/Framework"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/PluginManager"/>
@@ -18,3 +19,5 @@
</export>

<flags CXXFLAGS="-Ofast -fno-math-errno"/>
<flags CUDAFLAGS="-std=c++11"/>

196 changes: 196 additions & 0 deletions RecoPixelVertexing/PixelTriplets/interface/GPUArena.h
@@ -0,0 +1,196 @@
#ifndef GPUARENA_H_
#define GPUARENA_H_

//headers needed by the code below (fprintf/exit, assert, std::array)
#include <array>
#include <cassert>
#include <cstdio>
#include <cstdlib>

#include "cuda_runtime.h"

#define checkCudaError(val) __checkCudaError__ ( (val), #val, __FILE__, __LINE__ )

template <typename T>
inline void __checkCudaError__(T code, const char *func, const char *file, int line)
{
if (code) {
fprintf(stderr, "CUDA error at %s:%d: %s (code=%d)\n",
file, line, cudaGetErrorString(code), (unsigned int)code);
cudaDeviceReset();
exit(EXIT_FAILURE);
}
}

#define checkLastCudaError() checkCudaError( (cudaGetLastError()) )

template <int CHUNK_SIZE, typename T>
class GPUChunk {
//To think about: maybe it would be better to move the
//nextFreeValue and the next pointer out of the chunk
//and make their own lists inside the arena
//if a chunk only consists of CHUNK_SIZE values then each
//thread can load one memory transaction worth of pure data
//whereas the nextFreeValue and next pointers ruin that
public:
T values[CHUNK_SIZE];
int nextFreeValue;
GPUChunk<CHUNK_SIZE, T> *next;

__device__ int num_values_in_chunk() {
if(nextFreeValue > CHUNK_SIZE) {
return CHUNK_SIZE;
} else {
return nextFreeValue;
}
}

__device__ bool push_back(T value) {
int id = atomicAdd(&nextFreeValue, 1);
if(id < CHUNK_SIZE) {
//found space
values[id] = value;
return true;
} else {
//chunk is full and this thread must get a new one
return false;
}
}

__device__ T& get_element_at(int i) {
return values[i];
}
};

//This iterator starts at the head chunk for a given element on a given layer
//and iterates "backwards"
template <int CHUNK_SIZE, typename T>
class GPUArenaIterator {
private:
GPUChunk<CHUNK_SIZE, T> *currentChunk;
int cursorInChunk;

public:
__device__ GPUArenaIterator(GPUChunk<CHUNK_SIZE, T> *head_chunk) {
currentChunk = head_chunk;
if(currentChunk != NULL) {
cursorInChunk = currentChunk->num_values_in_chunk() - 1;
}
}

__device__ bool has_next() {
return currentChunk != NULL && (cursorInChunk >= 0 || currentChunk->next != NULL);
}

__device__ T& get_next() {
if(cursorInChunk < 0) {
//No more elements left in chunk, go to next chunk
//assuming there are more chunks because the caller checked has_next() first
currentChunk = currentChunk->next;
cursorInChunk = currentChunk->num_values_in_chunk() - 1;
}
return currentChunk->get_element_at(cursorInChunk--);
}
};

template <int CHUNK_SIZE, typename T>
__global__ void init_mappings_kernel(GPUChunk<CHUNK_SIZE, T> **mappings, GPUChunk<CHUNK_SIZE, T> *startOfChunks, int offset, int numElements) {
for(int mySlot = threadIdx.x + blockIdx.x * blockDim.x; mySlot < numElements; mySlot += gridDim.x * blockDim.x) {
mappings[mySlot] = startOfChunks + offset + mySlot;
}
}

template <int NumLayers, int CHUNK_SIZE, typename T>
class GPUArena {
private:
//how many elements does the arena store per layer
int numElementsPerLayer[NumLayers];
//a map from an element id (per layer) to the head of the chunk linked list that stores the values
GPUChunk<CHUNK_SIZE, T> **mappingIdToCurrentChunk[NumLayers];
//the shared chunks
GPUChunk<CHUNK_SIZE, T> *chunks;
//how many chunks are there in total
int capacity;
//shared cursor indicating the next free chunk. It does not start out at 0,
//because every element in every layer gets one chunk by default
public:
//there appears to be a bug in nvcc that doesn't allow for atomicAdd on an int field in this class
//(even though the same thing works for GPUChunk above). A work around is to allocate the nextFreeChunk
//integer with a cudaMalloc
int *nextFreeChunk_d;

GPUArena(int _capacity, std::array<int, NumLayers> pNumElementsPerLayer)
: capacity(_capacity)
{
//allocate the main arena storage and set everything to 0 (important
//because the counters in each chunk must start out as 0)
checkCudaError(cudaMalloc(&chunks, sizeof(GPUChunk<CHUNK_SIZE, T>) * capacity));
checkCudaError(cudaMemset(chunks, 0, sizeof(GPUChunk<CHUNK_SIZE, T>) * capacity));
checkCudaError(cudaMalloc(&nextFreeChunk_d, sizeof(int)));
checkCudaError(cudaMemset(nextFreeChunk_d, 0, sizeof(int)));

int offset = 0;
for(int layer = 0; layer < NumLayers; layer++) {
numElementsPerLayer[layer] = pNumElementsPerLayer[layer];
//each element implicitly gets its own initial chunk
size_t mapSizeInBytes = sizeof(GPUChunk<CHUNK_SIZE, T>*) * numElementsPerLayer[layer];
checkCudaError(cudaMalloc(&mappingIdToCurrentChunk[layer], mapSizeInBytes));

init_mappings_kernel<<<64, 16>>>(mappingIdToCurrentChunk[layer], chunks, offset, numElementsPerLayer[layer]);
checkLastCudaError();
cudaDeviceSynchronize();
checkLastCudaError();
offset += numElementsPerLayer[layer];
}
checkCudaError(cudaMemcpy(nextFreeChunk_d, &offset, sizeof(int), cudaMemcpyHostToDevice));
}

__device__ int get_num_elements_per_layer(int layer) {
return numElementsPerLayer[layer];
}

__device__ GPUChunk<CHUNK_SIZE, T>* get_new_chunk() {
int id = atomicAdd(nextFreeChunk_d, 1);

if(id >= capacity) {
printf("PANIC: GPUArena ran out of capacity\n");
assert(false);
return NULL;
}
return &chunks[id];
}

__device__ GPUChunk<CHUNK_SIZE, T>* get_head_chunk(int layer, int elementId) {
return mappingIdToCurrentChunk[layer][elementId];
}

__device__ GPUArenaIterator<CHUNK_SIZE, T> iterator(int layer, int elementId) {
return GPUArenaIterator<CHUNK_SIZE, T>(get_head_chunk(layer, elementId));
}

//the value is taken by copy so that temporaries can be appended as well
__device__ void push_back(int layer, int elementId, T value) {

GPUChunk<CHUNK_SIZE, T> *currentChunk = get_head_chunk(layer, elementId);
assert(currentChunk);

while(true) {
bool status = currentChunk->push_back(value);
if(status == true) {
//we were able to snatch a value spot in the chunk, done
break;
} else {
//chunk is full. Every thread that sees a full chunk allocates a new
//one and tries to publish it; only one of the racing threads can win
GPUChunk<CHUNK_SIZE, T> *newChunk = get_new_chunk();
newChunk->next = currentChunk; //hook up list
//Note: we don't need a threadfence_system here because we are
//either only writing or only reading, never both. And while writing
//nobody cares about the next pointer
GPUChunk<CHUNK_SIZE, T> *oldChunk = (GPUChunk<CHUNK_SIZE, T>*)atomicCAS((unsigned long long int *)&mappingIdToCurrentChunk[layer][elementId], (unsigned long long int)currentChunk, (unsigned long long int)newChunk);
//if our CAS succeeded, oldChunk equals currentChunk and we move on to newChunk immediately to avoid an extra loop;
//if oldChunk differs from currentChunk, somebody else came first and we continue with the chunk returned by the atomicCAS;
//in the latter case newChunk is wasted, but that's the price of avoiding livelocks
currentChunk = (oldChunk == currentChunk) ? newChunk : oldChunk;
}
}
}

};
#endif
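For orientation, a minimal usage sketch of the arena (illustrative only, not part of this pull request's diff; the kernel names and sizes are made up). Host code constructs the arena, one kernel appends values to a per-element list with push_back(), and another kernel walks that list with the iterator:

// a 1-layer arena of ints, 8 values per chunk
__global__ void fill_arena(GPUArena<1, 8, int> arena) {
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < arena.get_num_elements_per_layer(0)) {
    int v = 10 * i;
    arena.push_back(0, i, v); // append v to the list of element i on layer 0
  }
}

__global__ void read_arena(GPUArena<1, 8, int> arena) {
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < arena.get_num_elements_per_layer(0)) {
    GPUArenaIterator<8, int> it = arena.iterator(0, i);
    while (it.has_next())
      printf("element %d holds %d\n", i, it.get_next());
  }
}

// host side: 1024 chunks shared by the 256 elements of the single layer
// GPUArena<1, 8, int> arena(1024, std::array<int, 1>{{256}});
// fill_arena<<<1, 256>>>(arena);
// read_arena<<<1, 256>>>(arena);

Note that the arena is passed to kernels by value: the copy is shallow, so all copies share the same device-side chunk storage.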




248 changes: 248 additions & 0 deletions RecoPixelVertexing/PixelTriplets/interface/GPUCACell.h
@@ -0,0 +1,248 @@
#ifndef GPU_CACELL_H_
#define GPU_CACELL_H_

#include "RecoPixelVertexing/PixelTriplets/interface/GPUHitsAndDoublets.h"
#include "TrackingTools/TransientTrackingRecHit/interface/SeedingLayerSetsHits.h"
#include "RecoTracker/TkTrackingRegions/interface/TrackingRegion.h"
#include "GPUSimpleVector.h"

#include "DataFormats/Math/interface/deltaPhi.h"
#include "GPUArena.h"
#include <cmath>
#include <array>


template<int numberOfLayers>
class GPUCACell {
public:

using CAntuplet = GPUSimpleVector<numberOfLayers, GPUCACell<numberOfLayers>* >;
__device__
GPUCACell()
{

}

__device__
void init(const GPULayerDoublets* doublets, const int layerId, const int doubletId, const int innerHitId, const int outerHitId)
{
theCAState = 0;
theInnerHitId = innerHitId;
theOuterHitId = outerHitId;
hasSameStateNeighbors = 0;
theDoublets = doublets;
theDoubletId = doubletId;
theLayerIdInFourLayers = layerId;

theInnerX = doublets->layers[0].x[doubletId];
theOuterX = doublets->layers[1].x[doubletId];

theInnerY = doublets->layers[0].y[doubletId];
theOuterY = doublets->layers[1].y[doubletId];

theInnerZ = doublets->layers[0].z[doubletId];
theOuterZ = doublets->layers[1].z[doubletId];

theInnerR = hypot(theInnerX, theInnerY);
theOuterR = hypot(theOuterX, theOuterY);

}


__device__
float get_inner_x() const {
return theInnerX;
}
__device__
float get_outer_x() const {
return theOuterX;
}
__device__
float get_inner_y() const {
return theInnerY;
}
__device__
float get_outer_y() const {
return theOuterY;
}
__device__
float get_inner_z() const {
return theInnerZ;
}
__device__
float get_outer_z() const {
return theOuterZ;
}
__device__
float get_inner_r() const {
return theInnerR;
}
__device__
float get_outer_r() const {
return theOuterR;
}
__device__
unsigned int get_inner_hit_id() const {
return theInnerHitId;
}
__device__
unsigned int get_outer_hit_id() const {
return theOuterHitId;
}

__device__
void evolve(GPUArena<numberOfLayers,4,GPUCACell<numberOfLayers>*>& innerNeighbors) {

hasSameStateNeighbors = 0;
//ask the arena for an iterator over this cell's inner neighbors
GPUArenaIterator<4, GPUCACell<numberOfLayers>*> innerNeighborsIterator = innerNeighbors.iterator(theLayerIdInFourLayers, theDoubletId);
GPUCACell<numberOfLayers>* otherCell;
while (innerNeighborsIterator.has_next())
{
otherCell = innerNeighborsIterator.get_next();


if (otherCell->get_CA_state() == theCAState) {

hasSameStateNeighbors = 1;

break;
}
}

}

__device__
bool check_alignment_and_tag(const GPUCACell<numberOfLayers>* innerCell, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float thetaCut, const float phiCut) {

return (are_aligned_RZ(innerCell, ptmin, thetaCut) && have_similar_curvature(innerCell, region_origin_x, region_origin_y, region_origin_radius, phiCut));

}

__device__
bool are_aligned_RZ(const GPUCACell<numberOfLayers>* otherCell, const float ptmin, const float thetaCut) const {

float r1 = otherCell->get_inner_r();
float z1 = otherCell->get_inner_z();
float distance_13_squared = (r1 - theOuterR)*(r1 - theOuterR) + (z1 - theOuterZ)*(z1 - theOuterZ);
float tan_12_13_half = fabs(z1 * (theInnerR - theOuterR) + theInnerZ * (theOuterR - r1) + theOuterZ * (r1 - theInnerR)) / distance_13_squared;
return tan_12_13_half * ptmin <= thetaCut;
}

__device__
bool have_similar_curvature(const GPUCACell<numberOfLayers>* otherCell,
const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float phiCut) const {
auto x1 = otherCell->get_inner_x();
auto y1 = otherCell->get_inner_y();

auto x2 = get_inner_x();
auto y2 = get_inner_y();

auto x3 = get_outer_x();
auto y3 = get_outer_y();

auto precision = 0.5f;
auto offset = x2 * x2 + y2*y2;

auto bc = (x1 * x1 + y1 * y1 - offset) / 2.f;

auto cd = (offset - x3 * x3 - y3 * y3) / 2.f;

auto det = (x1 - x2) * (y2 - y3) - (x2 - x3)* (y1 - y2);

//points are aligned
if (fabs(det) < precision)
return true;

auto idet = 1.f / det;

auto x_center = (bc * (y2 - y3) - cd * (y1 - y2)) * idet;
auto y_center = (cd * (x1 - x2) - bc * (x2 - x3)) * idet;

auto radius = std::sqrt((x2 - x_center)*(x2 - x_center) + (y2 - y_center)*(y2 - y_center));
auto centers_distance_squared = (x_center - region_origin_x)*(x_center - region_origin_x) + (y_center - region_origin_y)*(y_center - region_origin_y);

auto minimumOfIntersectionRange = (radius - region_origin_radius)*(radius - region_origin_radius) - phiCut;

if (centers_distance_squared >= minimumOfIntersectionRange) {
auto maximumOfIntersectionRange = (radius + region_origin_radius)*(radius + region_origin_radius) + phiCut;
return centers_distance_squared <= maximumOfIntersectionRange;
} else {

return false;
}

}
__device__
unsigned int get_CA_state() const {
return theCAState;
}

// if there is at least one left neighbor with the same state (friend), the state has to be increased by 1.
__device__
void update_state() {
theCAState += hasSameStateNeighbors;
}


__device__
bool is_root_cell(const unsigned int minimumCAState) const {
return (theCAState >= minimumCAState);
}

// try to free the track-building process from hard-coded layers, basing the
// traversal of the graph on the neighborhood connections between cells

__device__
void find_ntuplets(GPUSimpleVector<100,CAntuplet>* foundNtuplets, GPUArena<numberOfLayers,4,GPUCACell<numberOfLayers>*>& theInnerNeighbors, CAntuplet& tmpNtuplet, const unsigned int minHitsPerNtuplet) const {

// the building process for a track ends if:
// it has no right neighbor
// it has no compatible neighbor
// the ntuplet is then saved if the number of hits it contains is greater than a threshold
GPUArenaIterator<4, GPUCACell<numberOfLayers>*> innerNeighborsIterator = theInnerNeighbors.iterator(theLayerIdInFourLayers, theDoubletId);
GPUCACell<numberOfLayers>* otherCell;

if (!innerNeighborsIterator.has_next()) {
if (tmpNtuplet.size() >= minHitsPerNtuplet - 1)
//foundNtuplets is shared among all threads: use the thread-safe push
foundNtuplets->push_threadsafe(tmpNtuplet);
else
return;
} else {


while (innerNeighborsIterator.has_next())
{
otherCell = innerNeighborsIterator.get_next();
tmpNtuplet.push(otherCell); //tmpNtuplet is thread-local, the non-atomic push is enough
otherCell->find_ntuplets(foundNtuplets, theInnerNeighbors, tmpNtuplet, minHitsPerNtuplet);
tmpNtuplet.pop_back();

}

}
}


private:

unsigned int theCAState;
unsigned int theInnerHitId;
unsigned int theOuterHitId;
unsigned int hasSameStateNeighbors;
const GPULayerDoublets* theDoublets;
int theDoubletId;
int theLayerIdInFourLayers;
float theInnerX;
float theOuterX;
float theInnerY;
float theOuterY;
float theInnerZ;
float theOuterZ;
float theInnerR;
float theOuterR;


};


#endif /* GPU_CACELL_H_ */
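evolve() and update_state() split one cellular-automaton iteration into a read phase and a commit phase; on a GPU, the global barrier between the two phases has to come from separate kernel launches. A possible sketch of those launches (assumed here, not part of this diff):

template<int numberOfLayers>
__global__ void kernel_evolve(const GPULayerDoublets* gpuDoublets,
    GPUCACell<numberOfLayers>** cells,
    GPUArena<numberOfLayers, 4, GPUCACell<numberOfLayers>*> innerNeighbors)
{
  unsigned int layerPairIndex = blockIdx.y;
  if (layerPairIndex < numberOfLayers - 1)
    for (int i = threadIdx.x + blockIdx.x * blockDim.x;
         i < gpuDoublets[layerPairIndex].size; i += gridDim.x * blockDim.x)
      cells[layerPairIndex][i].evolve(innerNeighbors); // read-only phase
}

template<int numberOfLayers>
__global__ void kernel_update_state(const GPULayerDoublets* gpuDoublets,
    GPUCACell<numberOfLayers>** cells)
{
  unsigned int layerPairIndex = blockIdx.y;
  if (layerPairIndex < numberOfLayers - 1)
    for (int i = threadIdx.x + blockIdx.x * blockDim.x;
         i < gpuDoublets[layerPairIndex].size; i += gridDim.x * blockDim.x)
      cells[layerPairIndex][i].update_state(); // commit phase
}

Launching the two kernels alternately on the same stream provides the required barrier, since kernels on one stream execute in order.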
46 changes: 46 additions & 0 deletions RecoPixelVertexing/PixelTriplets/interface/GPUCellularAutomaton.h
@@ -0,0 +1,46 @@
#ifndef GPUCELLULARAUTOMATON_H_
#define GPUCELLULARAUTOMATON_H_


#include <array>
#include "GPUCACell.h"
#include "TrackingTools/TransientTrackingRecHit/interface/SeedingLayerSetsHits.h"
#include "RecoTracker/TkTrackingRegions/interface/TrackingRegion.h"
#include "RecoTracker/TkHitPairs/interface/RecHitsSortedInPhi.h"


template<unsigned int theNumberOfLayers>
class GPUCellularAutomaton {
public:

GPUCellularAutomaton(std::vector<const HitDoublets*> doublets, const TrackingRegion& region, const float phiCut, const float thetaCut) {
}

void create_and_connect_cells();

void evolve();

void find_ntuplets(std::vector<typename GPUCACell<theNumberOfLayers>::CAntuplet>&, const unsigned int);

private:

//fixed capacity, matching the GPUSimpleVector<100, CAntuplet> used in GPUCACell::find_ntuplets
static constexpr unsigned int maxNumberOfQuadruplets = 100;

GPULayerDoublets* gpuDoublets;
RecHitsSortedInPhi_gpu* hits;
GPUCACell<theNumberOfLayers>** theCells;
RecHitsSortedInPhi_gpu* hitsOnLayers;
GPUArena<theNumberOfLayers-1, 4, GPUCACell<theNumberOfLayers>* >* isOuterHitOfCell;
GPUArena<theNumberOfLayers, 4, GPUCACell<theNumberOfLayers>* >* theInnerNeighbors;
GPUSimpleVector<maxNumberOfQuadruplets, typename GPUCACell<theNumberOfLayers>::CAntuplet>* foundNtuplets;

};


#endif
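Only the interface is declared here; the method bodies are not in this diff. The call sequence implied by the kernels in GPUCellularAutomaton.cu would presumably look like this (hypothetical usage, stated as an assumption):

// hypothetical host-side driver code
GPUCellularAutomaton<4> automaton(doublets, region, phiCut, thetaCut);
automaton.create_and_connect_cells();    // kernel_create + kernel_connect
automaton.evolve();                      // repeated CA state-update rounds
std::vector<GPUCACell<4>::CAntuplet> quadruplets;
automaton.find_ntuplets(quadruplets, 4); // kernel_find_ntuplets + copy back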
80 changes: 80 additions & 0 deletions RecoPixelVertexing/PixelTriplets/interface/GPUSimpleVector.h
@@ -0,0 +1,80 @@
#ifndef GPU_SIMPLEVECTOR_H_
#define GPU_SIMPLEVECTOR_H_

template< int maxSize, class T>
struct GPUSimpleVector
{
// __forceinline__ __host__ __device__ CUDAQueue( ) { }

__inline__ __device__
int push_threadsafe(const T& element) {
auto previousSize = atomicAdd(&m_size, 1);
if(previousSize < maxSize)
{
m_data[previousSize] = element;
return previousSize;
} else {
//the vector is full: undo the reservation and report failure
atomicSub(&m_size, 1);
return -1;
}
};


__inline__ __device__
int push(const T& element) {

auto previousSize = m_size++;
if(previousSize<maxSize)
{
m_data[previousSize] = element;
return previousSize;
} else
return -1;
};

__inline__ __device__
T pop_back_threadsafe() {
#ifdef __CUDACC__
if(m_size > 0)
{
auto previousSize = atomicAdd(&m_size, -1);
return m_data[previousSize-1];
}
#endif
//empty vector: return a default-constructed T instead of falling off the end
return T();
};

__inline__ __device__
T pop_back() {
if(m_size > 0)
{
auto previousSize = m_size--;
return m_data[previousSize-1];
}
return T();
};

__inline__ __host__ __device__
void reset()
{
m_size = 0;
};


__inline__ __host__ __device__
int size() const
{
return m_size;
};



T m_data[maxSize];
int m_size;


};



#endif
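A quick usage note (illustrative, not part of the diff): the struct is deliberately POD-like, so an instance can be cudaMalloc'ed, zeroed with cudaMemset (which also resets m_size) and passed straight to a kernel; push_threadsafe() is the variant to use whenever several threads append to the same instance:

// hypothetical example: collect the even thread indices
__global__ void keep_even(GPUSimpleVector<64, int>* out) {
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i % 2 == 0)
    out->push_threadsafe(i); // all threads share *out, so use the atomic push
}

// host side: cudaMalloc a GPUSimpleVector<64, int>, cudaMemset it to zero,
// launch keep_even, then cudaMemcpy the struct back and read size()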
1 change: 1 addition & 0 deletions RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml
@@ -2,6 +2,7 @@
<use name="RecoPixelVertexing/PixelTriplets"/>
<use name="RecoTracker/TkSeedingLayers"/>
<use name="RecoPixelVertexing/PixelTrackFitting"/>
<use name="cuda"/>
<library file="*.cc" name="RecoPixelVertexingPixelTripletsPlugins">
<flags EDM_PLUGIN="1"/>
</library>
85 changes: 85 additions & 0 deletions RecoPixelVertexing/PixelTriplets/src/GPUCellularAutomaton.cu
@@ -0,0 +1,85 @@
#include "RecoTracker/TkHitPairs/interface/RecHitsSortedInPhi.h"
#include "../interface/GPUCACell.h"
#include "../interface/GPUArena.h"


using CAntuplet = GPUSimpleVector<4, GPUCACell<4>*>;


template<int numberOfLayers>
__global__
void kernel_create(const GPULayerDoublets* gpuDoublets,
GPUCACell<numberOfLayers>** cells, GPUArena<numberOfLayers-1, 4, GPUCACell<numberOfLayers>* > isOuterHitOfCell)
{
unsigned int layerPairIndex = blockIdx.y;
unsigned int cellIndexInLayerPair = threadIdx.x + blockIdx.x * blockDim.x;
if(layerPairIndex < numberOfLayers-1)
{
for(int i = cellIndexInLayerPair; i < gpuDoublets[layerPairIndex].size; i+=gridDim.x * blockDim.x)
{

cells[layerPairIndex][i].init(&gpuDoublets[layerPairIndex], layerPairIndex, i, gpuDoublets[layerPairIndex].indices[2*i], gpuDoublets[layerPairIndex].indices[2*i+1]);
isOuterHitOfCell.push_back(layerPairIndex, cells[layerPairIndex][i].get_outer_hit_id(), &(cells[layerPairIndex][i]));
}
}

}


template<unsigned int numberOfLayers>
__global__
void kernel_connect(const GPULayerDoublets* gpuDoublets, GPUCACell<numberOfLayers>** cells,
GPUArena<numberOfLayers-1,4, GPUCACell<numberOfLayers>* > isOuterHitOfCell,
GPUArena<numberOfLayers-1,4, GPUCACell<numberOfLayers>* > innerNeighbors,float ptmin, float region_origin_x,
float region_origin_y, float region_origin_radius, float thetaCut,
float phiCut)
{

unsigned int layerPairIndex = blockIdx.y;
unsigned int cellIndexInLayerPair = threadIdx.x + blockIdx.x * blockDim.x;
//the first layer pair has no inner neighbors, so start from the second one
if(layerPairIndex > 0 && layerPairIndex < numberOfLayers-1)
{
for (int i = cellIndexInLayerPair; i < gpuDoublets[layerPairIndex].size;
i += gridDim.x * blockDim.x)
{
//cells ending on this cell's inner hit were registered by kernel_create
//under the previous layer pair, keyed by their outer hit id
GPUArenaIterator<4, GPUCACell<numberOfLayers>*> innerCellsIterator = isOuterHitOfCell.iterator(layerPairIndex - 1, cells[layerPairIndex][i].get_inner_hit_id());
GPUCACell<numberOfLayers>* otherCell;
while (innerCellsIterator.has_next())
{
otherCell = innerCellsIterator.get_next();
if(cells[layerPairIndex][i].check_alignment_and_tag(otherCell,
ptmin, region_origin_x, region_origin_y,
region_origin_radius, thetaCut, phiCut))
innerNeighbors.push_back(layerPairIndex, i, otherCell);
}
}
}
}

template<unsigned int numberOfLayers, unsigned int maxNumberOfQuadruplets>
__global__
void kernel_find_ntuplets(const GPULayerDoublets* gpuDoublets,GPUCACell<numberOfLayers>** cells,
GPUSimpleVector<maxNumberOfQuadruplets, CAntuplet>* foundNtuplets,
GPUArena<numberOfLayers,4,GPUCACell<numberOfLayers>* >* theInnerNeighbors, const unsigned int minHitsPerNtuplet)
{
unsigned int cellIndexInLastLayerPair = threadIdx.x
+ blockIdx.x * blockDim.x;
constexpr unsigned int lastLayerPairIndex = numberOfLayers - 2;
CAntuplet tmpNtuplet;

for (int i = cellIndexInLastLayerPair; i < gpuDoublets[lastLayerPairIndex].size;
i += gridDim.x * blockDim.x)
{
tmpNtuplet.reset();
cells[lastLayerPairIndex][i].find_ntuplets(foundNtuplets,
*theInnerNeighbors, tmpNtuplet, minHitsPerNtuplet);

}

}
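The host-side driver that allocates the device buffers and chains these kernels is not part of this diff; a rough sketch of the intended launch sequence, with placeholder variable names and grid sizes, might be:

// hypothetical driver, assuming all device buffers are already set up
constexpr int numLayers = 4;
dim3 blocks(32, numLayers - 1); // blockIdx.y selects the layer pair
dim3 threads(256);

kernel_create<numLayers><<<blocks, threads>>>(gpuDoublets, cells, isOuterHitOfCell);
checkLastCudaError();

kernel_connect<numLayers><<<blocks, threads>>>(gpuDoublets, cells, isOuterHitOfCell,
    innerNeighbors, ptmin, region_origin_x, region_origin_y,
    region_origin_radius, thetaCut, phiCut);
checkLastCudaError();

// theInnerNeighborsDevPtr: device-resident copy of the innerNeighbors arena object
kernel_find_ntuplets<numLayers, 100><<<32, 256>>>(gpuDoublets, cells,
    foundNtuplets, theInnerNeighborsDevPtr, 4);
checkCudaError(cudaDeviceSynchronize());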




1 change: 1 addition & 0 deletions RecoTracker/TkHitPairs/BuildFile.xml
@@ -12,6 +12,7 @@
<use name="TrackingTools/TransientTrackingRecHit"/>
<use name="RecoTracker/TkMSParametrization"/>
<use name="RecoTracker/TkSeedingLayers"/>
<use name="cuda"/>
<export>
<lib name="1"/>
</export>