diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h index 1c076e8705aa0..100766324a424 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h @@ -5,6 +5,7 @@
 #include
 #include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h"
+#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
@@ -42,8 +43,26 @@ class TrackSoAHeterogeneousT {
   // this is chi2/ndof as not necessarily all hits are used in the fit
   eigenSoA::ScalarSoA chi2;
+  eigenSoA::ScalarSoA nLayers;
+  constexpr int nHits(int i) const { return detIndices.size(i); }
+  constexpr bool isTriplet(int i) const { return nLayers(i) == 3; }
+
+  constexpr int computeNumberOfLayers(int32_t i) const {
+    // layers are in order and we assume tracks are either forward or backward
+    auto pdet = detIndices.begin(i);
+    int nl = 1;
+    auto ol = phase1PixelTopology::getLayer(*pdet);
+    for (; pdet < detIndices.end(i); ++pdet) {
+      auto il = phase1PixelTopology::getLayer(*pdet);
+      if (il != ol)
+        ++nl;
+      ol = il;
+    }
+    return nl;
+  }
+
   // State at the Beam spot
   // phi,tip,1/pt,cotan(theta),zip
   TrajectoryStateSoAT stateAtBS;
diff --git a/DataFormats/Math/interface/choleskyInversion.h b/DataFormats/Math/interface/choleskyInversion.h index b84e7cc80a120..90d9262957376 100644 --- a/DataFormats/Math/interface/choleskyInversion.h +++ b/DataFormats/Math/interface/choleskyInversion.h @@ -5,7 +5,78 @@
 #include
-/**
+namespace math {
+  namespace cholesky {
+
+    template <typename M1, typename M2, int N = M2::ColsAtCompileTime>
+// without this: either does not compile or compiles and then fails silently at runtime
+#ifdef __CUDACC__
+    __host__ __device__
+#endif
+        inline constexpr void
+        invertNN(M1 const& src, M2& dst) {
+
+      // origin: CERNLIB
+
+      using T = typename M2::Scalar;
+
+      T a[N][N];
+      for (int i = 0; i < N; ++i) {
+        a[i][i] = src(i, i);
+        for (int j = i + 1; j < N; ++j)
+          a[j][i] = src(i, j);
+      }
+
+      for (int j = 0; j < N; ++j) {
+        a[j][j] = T(1.) / a[j][j];
+        int jp1 = j + 1;
+        for (int l = jp1; l < N; ++l) {
+          a[j][l] = a[j][j] * a[l][j];
+          T s1 = -a[l][jp1];
+          for (int i = 0; i < jp1; ++i)
+            s1 += a[l][i] * a[i][jp1];
+          a[l][jp1] = -s1;
+        }
+      }
+
+      if constexpr (N == 1) {
+        dst(0, 0) = a[0][0];
+        return;
+      }
+      a[0][1] = -a[0][1];
+      a[1][0] = a[0][1] * a[1][1];
+      for (int j = 2; j < N; ++j) {
+        int jm1 = j - 1;
+        for (int k = 0; k < jm1; ++k) {
+          T s31 = a[k][j];
+          for (int i = k; i < jm1; ++i)
+            s31 += a[k][i + 1] * a[i + 1][j];
+          a[k][j] = -s31;
+          a[j][k] = -s31 * a[j][j];
+        }
+        a[jm1][j] = -a[jm1][j];
+        a[j][jm1] = a[jm1][j] * a[j][j];
+      }
+
+      int j = 0;
+      while (j < N - 1) {
+        T s33 = a[j][j];
+        for (int i = j + 1; i < N; ++i)
+          s33 += a[j][i] * a[i][j];
+        dst(j, j) = s33;
+
+        ++j;
+        for (int k = 0; k < j; ++k) {
+          T s32 = 0;
+          for (int i = j; i < N; ++i)
+            s32 += a[k][i] * a[i][j];
+          dst(k, j) = dst(j, k) = s32;
+        }
+      }
+      dst(j, j) = a[j][j];
+    }
+
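// --- illustrative aside (not part of the patch) ------------------------------
// Minimal host-side sketch of how the generic invertNN path above can be
// exercised for N > 6, the regime the fully inlined Inverter specializations
// do not cover. The matrix construction here is an assumption, chosen only to
// make the input symmetric positive definite; the test files further down do
// the real validation.
#include "DataFormats/Math/interface/choleskyInversion.h"

#include <Eigen/Core>
#include <iostream>

void exampleInvertNN() {
  constexpr int N = 10;
  using M = Eigen::Matrix<double, N, N>;
  M m = M::Identity();
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < i; ++j)
      m(i, j) = m(j, i) = 0.1 / double(1 + i + j);  // small symmetric off-diagonals keep it positive definite
  M inv;
  math::cholesky::invertNN(m, inv);  // N > 6: takes the CERNLIB-style generic path
  // the residual against the identity should be at machine precision
  std::cout << (m * inv - M::Identity()).cwiseAbs().maxCoeff() << std::endl;
}
// -----------------------------------------------------------------------------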
+
+    /**
 * fully inlined specialized code to perform the inversion of a
 * positive definite matrix of rank up to 6.
 *
@@ -16,8 +87,6 @@
 *
 *
 */
-namespace math {
-  namespace cholesky {
    template <typename M1, typename M2>
    inline constexpr void __attribute__((always_inline)) invert11(M1 const& src, M2& dst) {
@@ -336,12 +405,12 @@ namespace math {
    };
    // Eigen interface
-    template
-    inline constexpr void __attribute__((always_inline))
-    invert(Eigen::DenseBase const& src, Eigen::DenseBase& dst) {
-      using M1 = Eigen::DenseBase;
-      using M2 = Eigen::DenseBase;
-      Inverter::eval(src, dst);
+    template <typename M1, typename M2>
+    inline constexpr void __attribute__((always_inline)) invert(M1 const& src, M2& dst) {
+      if constexpr (M2::ColsAtCompileTime < 7)
+        Inverter<M1, M2, M2::ColsAtCompileTime>::eval(src, dst);
+      else
+        invertNN(src, dst);
    }
  }  // namespace cholesky
diff --git a/DataFormats/Math/test/BuildFile.xml b/DataFormats/Math/test/BuildFile.xml index faa85fb2224ad..ad0058fe0dcb9 100644 --- a/DataFormats/Math/test/BuildFile.xml +++ b/DataFormats/Math/test/BuildFile.xml @@ -127,4 +127,10 @@
+
+
+
+
+
+
diff --git a/DataFormats/Math/test/CholeskyInvert_t.cpp b/DataFormats/Math/test/CholeskyInvert_t.cpp index de048e3d4455b..7794e1d1d0113 100644 --- a/DataFormats/Math/test/CholeskyInvert_t.cpp +++ b/DataFormats/Math/test/CholeskyInvert_t.cpp @@ -138,6 +138,7 @@ int main() {
   go<5>(true);
   go<6>(true);
-  // go<10>();
+  go<10>(false);
+  go<10>(true);
   return 0;
 }
diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu index 558f9296150c7..7faf30ee3d54c 100644 --- a/DataFormats/Math/test/CholeskyInvert_t.cu +++ b/DataFormats/Math/test/CholeskyInvert_t.cu @@ -85,7 +85,8 @@
 template <int N>
 void go(bool soa) {
   constexpr unsigned int DIM = N;
   using MX = MXN;
-  std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl;
+  std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << " in " << (soa ? "SOA" : "AOS")
+            << " mode" << std::endl;
   auto start = std::chrono::high_resolution_clock::now();
   auto delta = start - start;
@@ -197,13 +198,15 @@ int main() {
   go<4>(false);
   go<5>(false);
   go<6>(false);
+  go<7>(false);
+  go<10>(false);
   go<2>(true);
   go<3>(true);
   go<4>(true);
   go<5>(true);
   go<6>(true);
-
-  // go<10>();
+  go<7>(true);
+  go<10>(true);
   return 0;
 }
diff --git a/DataFormats/Math/test/simpleCholeskyTest.cu b/DataFormats/Math/test/simpleCholeskyTest.cu new file mode 100644 index 0000000000000..04a8438524276 --- /dev/null +++ b/DataFormats/Math/test/simpleCholeskyTest.cu @@ -0,0 +1,103 @@
+#include "DataFormats/Math/interface/choleskyInversion.h"
+
+using namespace math::cholesky;
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+
+template <typename M, int N = M::ColsAtCompileTime>
+__global__ void invert(M* mm, int n) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= n)
+    return;
+
+  auto& m = mm[i];
+
+  printf("before %d %f %f %f\n", N, m(0, 0), m(1, 0), m(1, 1));
+
+  invertNN(m, m);
+
+  printf("after %d %f %f %f\n", N, m(0, 0), m(1, 0), m(1, 1));
+}
+
+template <typename M, int N = M::ColsAtCompileTime>
+__global__ void invertE(M* mm, int n) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= n)
+    return;
+
+  auto& m = mm[i];
+
+  printf("before %d %f %f %f\n", N, m(0, 0), m(1, 0), m(1, 1));
+
+  m = m.inverse();
+
+  printf("after %d %f %f %f\n", N, m(0, 0), m(1, 0), m(1, 1));
+}
+
+// generate matrices
+template <typename M>
+void genMatrix(M& m) {
+  using T = typename std::remove_reference<decltype(m(0, 0))>::type;
+  int n = M::ColsAtCompileTime;
+  std::mt19937 eng;
+  // std::mt19937 eng2;
+  std::uniform_real_distribution<T> rgen(0., 1.);
+
+  // generate first diagonal elements
+  for (int i = 0; i < n; ++i) {
+    double maxVal = i * 10000 / (n - 1) + 1;  // max condition is 10^4
+    m(i, i) = maxVal * rgen(eng);
+  }
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < i; ++j) {
+      double v = 0.3 * std::sqrt(m(i, i) * m(j, j));  // this makes the matrix positive definite
+      m(i, j) = v * rgen(eng);
+      m(j, i) = m(i, j);
+    }
+  }
+}
+
+template <typename T, int N>
+using MXN = Eigen::Matrix<T, N, N>;
+
+int main() {
+  cms::cudatest::requireDevices();
+
+  constexpr int DIM = 6;
+
+  using M = MXN<double, DIM>;
+
+  M m;
+
+  genMatrix(m);
+
+  printf("on CPU before %d %f %f %f\n", DIM, m(0, 0), m(1, 0), m(1, 1));
+
+  invertNN(m, m);
+
+  printf("on CPU after %d %f %f %f\n", DIM, m(0, 0), m(1, 0), m(1, 1));
+
+  double* d;
+  cudaMalloc(&d, sizeof(M));
+  cudaMemcpy(d, &m, sizeof(M), cudaMemcpyHostToDevice);
+  invert<<<1, 1>>>((M*)d, 1);
+  cudaCheck(cudaDeviceSynchronize());
+  invertE<<<1, 1>>>((M*)d, 1);
+  cudaCheck(cudaDeviceSynchronize());
+
+  return 0;
+}
diff --git a/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h b/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h index 4e70cd86e1524..27f5b8b573ea7 100644 --- a/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h +++ b/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h @@ -27,17 +27,20 @@ namespace phase1PixelTopology {
   constexpr uint32_t numberOfModules = 1856;
   constexpr uint32_t numberOfLayers = 10;
-  constexpr uint32_t layerStart[numberOfLayers + 1] = {0,
-                                                       96,
-                                                       320,
-                                                       672,  // barrel
-                                                       1184,
-                                                       1296,
-                                                       1408,  // positive endcap
-                                                       1520,
-                                                       1632,
-                                                       1744,  // negative endcap
-                                                       numberOfModules};
+#ifdef __CUDA_ARCH__
+  __device__
+#endif
+      constexpr uint32_t layerStart[numberOfLayers + 1] = {0,
+                                                           96,
+                                                           320,
+                                                           672,  // barrel
+                                                           1184,
+                                                           1296,
+
1408, // positive endcap + 1520, + 1632, + 1744, // negative endcap + numberOfModules}; constexpr char const* layerName[numberOfLayers] = { "BL1", "BL2", @@ -84,8 +87,8 @@ namespace phase1PixelTopology { constexpr uint32_t maxModuleStride = findMaxModuleStride(); - constexpr uint8_t findLayer(uint32_t detId) { - for (uint8_t i = 0; i < std::size(layerStart); ++i) + constexpr uint8_t findLayer(uint32_t detId, uint8_t sl = 0) { + for (uint8_t i = sl; i < std::size(layerStart); ++i) if (detId < layerStart[i + 1]) return i; return std::size(layerStart); @@ -100,7 +103,15 @@ namespace phase1PixelTopology { } constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride; - constexpr std::array layer = map_to_array(findLayerFromCompact); +#ifdef __CUDA_ARCH__ + __device__ +#endif + constexpr std::array + layer = map_to_array(findLayerFromCompact); + + constexpr uint8_t getLayer(uint32_t detId) { + return phase1PixelTopology::layer[detId / phase1PixelTopology::maxModuleStride]; + } constexpr bool validateLayerIndex() { bool res = true; diff --git a/Geometry/TrackerGeometryBuilder/test/BuildFile.xml b/Geometry/TrackerGeometryBuilder/test/BuildFile.xml index d9efc604f4627..238ae24f0f170 100644 --- a/Geometry/TrackerGeometryBuilder/test/BuildFile.xml +++ b/Geometry/TrackerGeometryBuilder/test/BuildFile.xml @@ -40,7 +40,11 @@ - + + + + + diff --git a/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp b/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cu similarity index 86% rename from Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp rename to Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cu index 2dda8ed41500b..7344382de7293 100644 --- a/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp +++ b/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cu @@ -4,6 +4,9 @@ #include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" + namespace { // original code from CMSSW_4_4 @@ -123,7 +126,23 @@ namespace { } // namespace +constexpr void testLayer() { + for (auto i = 0U; i < phase1PixelTopology::numberOfModules; ++i) { + uint32_t layer = phase1PixelTopology::getLayer(i); + uint32_t tLayer = phase1PixelTopology::findLayer(i); + assert(tLayer == layer); + //std::cout << "module " << i << ": " << "layer " << layer << ", \"" << phase1PixelTopology::layerName[layer] << "\", [" << phase1PixelTopology::layerStart[layer] << ", " << phase1PixelTopology::layerStart[layer+1] << ")" << std::endl; + assert(layer < phase1PixelTopology::numberOfLayers); + assert(i >= phase1PixelTopology::layerStart[layer]); + assert(i < phase1PixelTopology::layerStart[layer + 1]); + } +} + +__global__ void kernel_testLayer() { testLayer(); } + int main() { + cms::cudatest::requireDevices(); + for (uint16_t ix = 0; ix < 80 * 2; ++ix) { auto ori = localXori(ix); auto xl = phase1PixelTopology::localX(ix); @@ -144,17 +163,17 @@ int main() { for (auto i = 0U; i < phase1PixelTopology::numberOfLayers; ++i) { std::cout << "layer " << i << ", \"" << phase1PixelTopology::layerName[i] << "\", [" - << phase1PixelTopology::layerStart[i] << ", " << phase1PixelTopology::layerStart[i + 1] << ")" - << std::endl; + << phase1PixelTopology::layerStart[i] << ", " << phase1PixelTopology::layerStart[i + 1] << ") " + << phase1PixelTopology::layerStart[i + 1] - phase1PixelTopology::layerStart[i] << std::endl; } - for (auto i = 0U; i < 
phase1PixelTopology::numberOfModules; ++i) {
-    auto layer = static_cast(phase1PixelTopology::layer[i / phase1PixelTopology::maxModuleStride]);
-    //std::cout << "module " << i << ": " << "layer " << layer << ", \"" << phase1PixelTopology::layerName[layer] << "\", [" << phase1PixelTopology::layerStart[layer] << ", " << phase1PixelTopology::layerStart[layer+1] << ")" << std::endl;
-    assert(layer < phase1PixelTopology::numberOfLayers);
-    assert(i >= phase1PixelTopology::layerStart[layer]);
-    assert(i < phase1PixelTopology::layerStart[layer + 1]);
-  }
+  std::cout << "maxModuleStride layerIndexSize " << phase1PixelTopology::maxModuleStride << ' '
+            << phase1PixelTopology::layerIndexSize << std::endl;
+
+  testLayer();
+
+  kernel_testLayer<<<1, 1>>>();
+  cudaCheck(cudaDeviceSynchronize());
   return 0;
 }
diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h index 4b661bad26681..fbcac4c07836f 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/BrokenLine.h @@ -154,8 +154,17 @@ namespace brokenline {
     riemannFit::Vector2d dVec;
     riemannFit::Vector2d eVec;
-    dVec = hits.block(0, 1, 2, 1) - hits.block(0, 0, 2, 1);
-    eVec = hits.block(0, n - 1, 2, 1) - hits.block(0, n - 2, 2, 1);
+    int mId = 1;
+
+    if constexpr (n > 3) {
+      riemannFit::Vector2d middle = 0.5 * (hits.block(0, n - 1, 2, 1) + hits.block(0, 0, 2, 1));
+      auto d1 = (hits.block(0, n / 2, 2, 1) - middle).squaredNorm();
+      auto d2 = (hits.block(0, n / 2 - 1, 2, 1) - middle).squaredNorm();
+      mId = d1 < d2 ? n / 2 : n / 2 - 1;
+    }
+
+    dVec = hits.block(0, mId, 2, 1) - hits.block(0, 0, 2, 1);
+    eVec = hits.block(0, n - 1, 2, 1) - hits.block(0, mId, 2, 1);
     results.qCharge = riemannFit::cross2D(dVec, eVec) > 0 ? -1 : 1;
     const double slope = -results.qCharge / fast_fit(3);
@@ -249,8 +258,17 @@ namespace brokenline {
   __host__ __device__ inline void fastFit(const M3xN& hits, V4& result) {
     constexpr uint32_t n = M3xN::ColsAtCompileTime;
-    const riemannFit::Vector2d a = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1);
-    const riemannFit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, n / 2, 2, 1);
+    int mId = 1;
+
+    if constexpr (n > 3) {
+      riemannFit::Vector2d middle = 0.5 * (hits.block(0, n - 1, 2, 1) + hits.block(0, 0, 2, 1));
+      auto d1 = (hits.block(0, n / 2, 2, 1) - middle).squaredNorm();
+      auto d2 = (hits.block(0, n / 2 - 1, 2, 1) - middle).squaredNorm();
+      mId = d1 < d2 ?
n / 2 : n / 2 - 1; + } + + const riemannFit::Vector2d a = hits.block(0, mId, 2, 1) - hits.block(0, 0, 2, 1); + const riemannFit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, mId, 2, 1); const riemannFit::Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); auto tmp = 0.5 / riemannFit::cross2D(c, a); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc index bebfe0e08008e..1523640e2ef8f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -3,67 +3,118 @@ void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { assert(tuples_); +#ifdef BROKENLINE_DEBUG + setlinebuf(stdout); +#endif + // Fit internals - auto hitsGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double)); - auto hits_geGPU_ = - std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float)); - auto fast_fit_resultsGPU_ = + auto tkidGPU = std::make_unique(maxNumberOfConcurrentFits_); + auto hitsGPU = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double)); + auto hits_geGPU = + std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<6>) / sizeof(float)); + auto fast_fit_resultsGPU = std::make_unique(maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double)); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets - kernel_BLFastFit<3>( - tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); + kernel_BLFastFit<3>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 3, + 3, + offset); kernel_BLFit<3>(tupleMultiplicity_, bField_, outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); - - // fit quads - kernel_BLFastFit<4>( - tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); - - kernel_BLFit<4>(tupleMultiplicity_, - bField_, - outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); - if (fit5as4_) { - // fit penta (only first 4) - kernel_BLFastFit<4>( - tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + if (fitNas4_) { + // fit all as 4 + kernel_BLFastFit<4>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 4, + 8, + offset); kernel_BLFit<4>(tupleMultiplicity_, bField_, outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); } else { + // fit quads + kernel_BLFastFit<4>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 4, + 4, + offset); + + kernel_BLFit<4>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); // fit penta (all 5) - kernel_BLFastFit<5>( - tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), 
fast_fit_resultsGPU_.get(), 5, offset); + kernel_BLFastFit<5>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 5, + 5, + offset); kernel_BLFit<5>(tupleMultiplicity_, bField_, outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); + // fit sexta and above (as 6) + kernel_BLFastFit<6>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 6, + 8, + offset); + + kernel_BLFit<6>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); } } // loop on concurrent fits diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index d2ca583e86bd0..d99a96b705451 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -11,73 +11,120 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; // Fit internals - auto hitsGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<4>) / sizeof(double), stream); - auto hits_geGPU_ = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6x4f) / sizeof(float), stream); - auto fast_fit_resultsGPU_ = cms::cuda::make_device_unique( + auto tkidGPU = cms::cuda::make_device_unique(maxNumberOfConcurrentFits_, stream); + auto hitsGPU = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double), stream); + auto hits_geGPU = cms::cuda::make_device_unique( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<6>) / sizeof(float), stream); + auto fast_fit_resultsGPU = cms::cuda::make_device_unique( maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets - kernel_BLFastFit<3><<>>( - tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 3, offset); + kernel_BLFastFit<3><<>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 3, + 3, + offset); cudaCheck(cudaGetLastError()); kernel_BLFit<3><<>>(tupleMultiplicity_, bField_, outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 3, - offset); - cudaCheck(cudaGetLastError()); - - // fit quads - kernel_BLFastFit<4><<>>( - tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 4, offset); - cudaCheck(cudaGetLastError()); - - kernel_BLFit<4><<>>(tupleMultiplicity_, - bField_, - outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 4, - offset); + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); cudaCheck(cudaGetLastError()); - if (fit5as4_) { - // fit penta (only first 4) - kernel_BLFastFit<4><<>>( - tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + if (fitNas4_) { + // fit all as 4 + kernel_BLFastFit<4><<>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + 
hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 4, + 8, + offset); cudaCheck(cudaGetLastError()); kernel_BLFit<4><<>>(tupleMultiplicity_, bField_, outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); - cudaCheck(cudaGetLastError()); + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); } else { - // fit penta (all 5) - kernel_BLFastFit<5><<>>( - tuples_, tupleMultiplicity_, hv, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), 5, offset); + // fit quads + kernel_BLFastFit<4><<>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 4, + 4, + offset); cudaCheck(cudaGetLastError()); - kernel_BLFit<5><<>>(tupleMultiplicity_, + kernel_BLFit<4><<>>(tupleMultiplicity_, bField_, outputSoa_, - hitsGPU_.get(), - hits_geGPU_.get(), - fast_fit_resultsGPU_.get(), - 5, - offset); + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); + // fit penta (all 5) + kernel_BLFastFit<5><<>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 5, + 5, + offset); + cudaCheck(cudaGetLastError()); + + kernel_BLFit<5><<<8, blockSize, 0, stream>>>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); + cudaCheck(cudaGetLastError()); + // fit sexta and above (as 6) + kernel_BLFastFit<6><<<4, blockSize, 0, stream>>>(tuples_, + tupleMultiplicity_, + hv, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get(), + 6, + 8, + offset); + cudaCheck(cudaGetLastError()); + + kernel_BLFit<6><<<4, blockSize, 0, stream>>>(tupleMultiplicity_, + bField_, + outputSoa_, + tkidGPU.get(), + hitsGPU.get(), + hits_geGPU.get(), + fast_fit_resultsGPU.get()); cudaCheck(cudaGetLastError()); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index d572c8010cdfb..6ec6afb83cba1 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -19,6 +19,8 @@ using HitsOnGPU = TrackingRecHit2DSOAView; using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; +using tindex_type = caConstants::tindex_type; +constexpr auto invalidTkId = std::numeric_limits::max(); // #define BL_DUMP_HITS @@ -26,41 +28,53 @@ template __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, HitsOnGPU const *__restrict__ hhp, + tindex_type *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit, - uint32_t nHits, - uint32_t offset) { + uint32_t nHitsL, + uint32_t nHitsH, + int32_t offset) { constexpr uint32_t hitsInFit = N; - assert(hitsInFit <= nHits); - + assert(hitsInFit <= nHitsL); + assert(nHitsL <= nHitsH); assert(hhp); + assert(phits); assert(pfast_fit); assert(foundNtuplets); assert(tupleMultiplicity); // look in bin for this hit multiplicity auto local_start = blockIdx.x * blockDim.x + threadIdx.x; + int totTK = tupleMultiplicity->end(nHitsH) - tupleMultiplicity->begin(nHitsL); + assert(totTK <= int(tupleMultiplicity->size())); + assert(totTK >= 0); #ifdef BROKENLINE_DEBUG if (0 == local_start) { - printf("%d total Ntuple\n", foundNtuplets->nOnes()); - printf("%d 
Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); + printf("%d total Ntuple\n", tupleMultiplicity->size()); + printf("%d Ntuple of size %d/%d for %d hits to fit\n", totTK, nHitsL, nHitsH, hitsInFit); } #endif for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { - auto tuple_idx = local_idx + offset; - if (tuple_idx >= tupleMultiplicity->size(nHits)) + int tuple_idx = local_idx + offset; + if (tuple_idx >= totTK) { + ptkids[local_idx] = invalidTkId; break; - + } // get it from the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + auto tkid = *(tupleMultiplicity->begin(nHitsL) + tuple_idx); assert(tkid < foundNtuplets->nOnes()); - assert(foundNtuplets->size(tkid) == nHits); + ptkids[local_idx] = tkid; + + auto nHits = foundNtuplets->size(tkid); + + assert(nHits >= nHitsL); + assert(nHits <= nHitsH); riemannFit::Map3xNd hits(phits + local_idx); riemannFit::Map4d fast_fit(pfast_fit + local_idx); @@ -84,9 +98,18 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, auto dz = hhp->zGlobal(hitId[hitsInFit - 1]) - hhp->zGlobal(hitId[0]); float ux, uy, uz; #endif - for (unsigned int i = 0; i < hitsInFit; ++i) { - auto hit = hitId[i]; + + float incr = std::max(1.f, float(nHits) / float(hitsInFit)); + float n = 0; + for (uint32_t i = 0; i < hitsInFit; ++i) { + int j = int(n + 0.5f); // round + if (hitsInFit - 1 == i) + j = nHits - 1; // force last hit to ensure max lever arm. + assert(j < int(nHits)); + n += incr; + auto hit = hitId[j]; float ge[6]; + #ifdef YERR_FROM_DC auto const &dp = hhp->cpeParams().detParams(hhp->detectorIndex(hit)); auto status = hhp->status(hit); @@ -115,26 +138,21 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, #endif #ifdef BL_DUMP_HITS + bool dump = foundNtuplets->size(tkid) == 5; if (dump) { - printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", + printf("Track id %d %d Hit %d on %d\nGlobal: hits.col(%d) << %f,%f,%f\n", + local_idx, tkid, + hit, hhp->detectorIndex(hit), i, hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit)); - printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", - tkid, - hhp->detetectorIndex(hit), - i, - ge[0], - ge[1], - ge[2], - ge[3], - ge[4], - ge[5]); + printf("Error: hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", i, ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]); } #endif + hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } @@ -152,13 +170,10 @@ template __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, double bField, OutputSoA *results, + tindex_type const *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, - double *__restrict__ pfast_fit, - uint32_t nHits, - uint32_t offset) { - assert(N <= nHits); - + double *__restrict__ pfast_fit) { assert(results); assert(pfast_fit); @@ -168,12 +183,12 @@ __global__ void kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ auto local_start = blockIdx.x * blockDim.x + threadIdx.x; for (int local_idx = local_start, nt = riemannFit::maxNumberOfConcurrentFits; local_idx < nt; local_idx += gridDim.x * blockDim.x) { - auto tuple_idx = local_idx + offset; - if (tuple_idx >= tupleMultiplicity->size(nHits)) + if (invalidTkId == ptkids[local_idx]) break; - // get it for the ntuple container (one to one to helix) - auto 
tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + auto tkid = ptkids[local_idx]; + + assert(tkid < caConstants::maxTuples); riemannFit::Map3xNd hits(phits + local_idx); riemannFit::Map4d fast_fit(pfast_fit + local_idx); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h index c1428a6daaf55..127831e0e2eb7 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h @@ -44,6 +44,7 @@ namespace caConstants { constexpr uint32_t maxNumberOfLayerPairs = 20; constexpr uint32_t maxNumberOfLayers = 10; constexpr uint32_t maxTuples = maxNumberOfTuples; + constexpr int32_t maxHitsOnTrack = 10; // Modules constants constexpr uint32_t max_ladder_bpx0 = 12; @@ -76,7 +77,7 @@ namespace caConstants { using OuterHitOfCellContainer = cms::cuda::VecArray; using TuplesContainer = cms::cuda::OneToManyAssoc; using HitToTuple = cms::cuda::OneToManyAssoc; // 3.5 should be enough - using TupleMultiplicity = cms::cuda::OneToManyAssoc; + using TupleMultiplicity = cms::cuda::OneToManyAssoc; struct OuterHitOfCell { OuterHitOfCellContainer* container; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index c36e6787ef871..bd8151a2a816a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -13,11 +13,6 @@ void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters const *counters) { kernel_printCounters(counters); } -template <> -void CAHitNtupletGeneratorKernelsCPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t) { - kernel_fillHitDetIndices(&tracks_d->hitIndices, hv, &tracks_d->detIndices); -} - template <> void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { auto nhits = hh.nHits(); @@ -85,6 +80,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr template <> void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { auto *tuples_d = &tracks_d->hitIndices; + auto *detId_d = &tracks_d->detIndices; auto *quality_d = tracks_d->qualityData(); assert(tuples_d && quality_d); @@ -128,12 +124,15 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * quality_d, params_.minHitsPerNtuplet_); if (params_.doStats_) - kernel_mark_used(hh.view(), device_theCells_.get(), device_nCells_); + kernel_mark_used(device_theCells_.get(), device_nCells_); cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); + kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d); + kernel_fillNLayers(tracks_d); + // remove duplicates (tracks that share a doublet) - kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, quality_d, params_.dupPassThrough_); + kernel_earlyDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); kernel_countMultiplicity(tuples_d, quality_d, device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(device_tupleMultiplicity_.get(), cudaStream); @@ -160,7 +159,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA } // remove duplicates (tracks that share a doublet) - kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tuples_d, tracks_d, 
params_.dupPassThrough_); + kernel_fastDuplicateRemover(device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_); // fill hit->track "map" if (params_.doSharedHitCut_ || params_.doStats_) { @@ -171,37 +170,21 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // remove duplicates (tracks that share at least one hit) if (params_.doSharedHitCut_) { - kernel_rejectDuplicate(hh.view(), - tuples_d, - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_rejectDuplicate( + tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); kernel_sharedHitCleaner(hh.view(), - tuples_d, tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); if (params_.useSimpleTripletCleaner_) { - kernel_simpleTripletCleaner(hh.view(), - tuples_d, - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_simpleTripletCleaner( + tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } else { - kernel_tripletCleaner(hh.view(), - tuples_d, - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_tripletCleaner( + tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index ae770cd77e740..689cc0afd052b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,23 +1,10 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h" -template <> -void CAHitNtupletGeneratorKernelsGPU::fillHitDetIndices(HitsView const *hv, TkSoA *tracks_d, cudaStream_t cudaStream) { - auto blockSize = 128; - auto numberOfBlocks = (HitContainer::ctCapacity() + blockSize - 1) / blockSize; - - kernel_fillHitDetIndices<<>>( - &tracks_d->hitIndices, hv, &tracks_d->detIndices); - cudaCheck(cudaGetLastError()); -#ifdef GPU_DEBUG - cudaDeviceSynchronize(); - cudaCheck(cudaGetLastError()); -#endif -} - template <> void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA *tracks_d, cudaStream_t cudaStream) { // these are pointer on GPU! 
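// --- illustrative aside (not part of the patch) ------------------------------
// Both the CPU path above and the GPU path below now call kernel_fillNLayers
// right after the tuples are finalized; it caches computeNumberOfLayers() per
// track. The counting simply walks the layer-ordered detector indices and
// counts transitions, so duplicated hits on one layer (e.g. a fishbone pair)
// do not inflate the count. Standalone host-side sketch; getLayer is assumed
// to map a module index to its layer, as phase1PixelTopology::getLayer does:
#include <cstdint>

template <typename F>
int countLayers(uint16_t const* detIndices, int nHits, F getLayer) {
  int nl = 1;  // at least one layer: the first hit's
  auto ol = getLayer(detIndices[0]);
  for (int i = 1; i < nHits; ++i) {
    auto il = getLayer(detIndices[i]);
    if (il != ol)
      ++nl;  // hits are ordered, so each change is a new layer
    ol = il;
  }
  return nl;
}
// e.g. layers {0, 0, 1, 2} give 3 layers for a 4-hit track with a same-layer pair
// -----------------------------------------------------------------------------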
auto *tuples_d = &tracks_d->hitIndices; + auto *detId_d = &tracks_d->detIndices; auto *quality_d = tracks_d->qualityData(); // zero tuples @@ -89,7 +76,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * cudaCheck(cudaGetLastError()); if (params_.doStats_) - kernel_mark_used<<>>(hh.view(), device_theCells_.get(), device_nCells_); + kernel_mark_used<<>>(device_theCells_.get(), device_nCells_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -101,10 +88,15 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize; cms::cuda::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); + kernel_fillHitDetIndices<<>>(tuples_d, hh.view(), detId_d); + cudaCheck(cudaGetLastError()); + kernel_fillNLayers<<>>(tracks_d); + cudaCheck(cudaGetLastError()); + // remove duplicates (tracks that share a doublet) numberOfBlocks = nDoubletBlocks(blockSize); kernel_earlyDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tuples_d, quality_d, params_.dupPassThrough_); + device_theCells_.get(), device_nCells_, tracks_d, quality_d, params_.dupPassThrough_); cudaCheck(cudaGetLastError()); blockSize = 128; @@ -254,7 +246,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // mark duplicates (tracks that share a doublet) numberOfBlocks = nDoubletBlocks(blockSize); kernel_fastDuplicateRemover<<>>( - device_theCells_.get(), device_nCells_, tuples_d, tracks_d, params_.dupPassThrough_); + device_theCells_.get(), device_nCells_, tracks_d, params_.dupPassThrough_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -282,16 +274,10 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // mark duplicates (tracks that share at least one hit) numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; - kernel_rejectDuplicate<<>>(hh.view(), - tuples_d, - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_rejectDuplicate<<>>( + tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); kernel_sharedHitCleaner<<>>(hh.view(), - tuples_d, tracks_d, quality_d, params_.minHitsForSharingCut_, @@ -299,21 +285,11 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA device_hitToTuple_.get()); if (params_.useSimpleTripletCleaner_) { - kernel_simpleTripletCleaner<<>>(hh.view(), - tuples_d, - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_simpleTripletCleaner<<>>( + tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } else { - kernel_tripletCleaner<<>>(hh.view(), - tuples_d, - tracks_d, - quality_d, - params_.minHitsForSharingCut_, - params_.dupPassThrough_, - device_hitToTuple_.get()); + kernel_tripletCleaner<<>>( + tracks_d, quality_d, params_.minHitsForSharingCut_, params_.dupPassThrough_, device_hitToTuple_.get()); } cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 684a54f5d2ed4..10a02309185a3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -59,7 
+59,7 @@ namespace cAHitNtupletGenerator { uint32_t maxNumberOfDoublets, uint16_t minHitsForSharingCuts, bool useRiemannFit, - bool fit5as4, + bool fitNas4, bool includeJumpingForwardDoublets, bool earlyFishbone, bool lateFishbone, @@ -84,7 +84,7 @@ namespace cAHitNtupletGenerator { maxNumberOfDoublets_(maxNumberOfDoublets), minHitsForSharingCut_(minHitsForSharingCuts), useRiemannFit_(useRiemannFit), - fit5as4_(fit5as4), + fitNas4_(fitNas4), includeJumpingForwardDoublets_(includeJumpingForwardDoublets), earlyFishbone_(earlyFishbone), lateFishbone_(lateFishbone), @@ -109,7 +109,7 @@ namespace cAHitNtupletGenerator { const uint32_t maxNumberOfDoublets_; const uint16_t minHitsForSharingCut_; const bool useRiemannFit_; - const bool fit5as4_; + const bool fitNas4_; const bool includeJumpingForwardDoublets_; const bool earlyFishbone_; const bool lateFishbone_; @@ -185,8 +185,6 @@ class CAHitNtupletGeneratorKernels { void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); - void fillHitDetIndices(HitsView const* hv, TkSoA* tuples_d, cudaStream_t cudaStream); - void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); void allocateOnGPU(int32_t nHits, cudaStream_t stream); void cleanup(cudaStream_t cudaStream); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 418fff8d8d67e..86d7ab8b5a1a6 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -78,9 +78,9 @@ __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, } for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { - if (foundNtuplets->size(idx) > 5) + if (foundNtuplets->size(idx) > 7) // current real limit printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); - assert(foundNtuplets->size(idx) < 6); + assert(foundNtuplets->size(idx) <= caConstants::maxHitsOnTrack); for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) assert(int(*ih) < nHits); } @@ -137,12 +137,14 @@ __global__ void kernel_fishboneCleaner(GPUCACell const *cells, uint32_t const *_ // It does not seem to affect efficiency in any way! 
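// --- illustrative aside (not part of the patch) ------------------------------
// The duplicate removers below now rank the candidates sharing a doublet by
// nLayers instead of raw hit count: with fishbone hits, a 5-hit track may span
// only 3 layers and must not outrank a genuine 4-layer track. A host-side
// analogue of the "keep only the longest" marking; inputs are hypothetical and
// assumed non-empty:
#include <algorithm>
#include <cstdint>
#include <vector>

// nLayers[i]: layer count of candidate i sharing this doublet.
std::vector<bool> markShorterAsDuplicates(std::vector<int8_t> const& nLayers) {
  int8_t maxNl = *std::max_element(nLayers.begin(), nLayers.end());
  std::vector<bool> reject(nLayers.size());
  for (std::size_t i = 0; i < nLayers.size(); ++i)
    reject[i] = nLayers[i] < maxNl;  // same constant written everywhere: no race on GPU
  return reject;
}
// -----------------------------------------------------------------------------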
__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, uint32_t const *__restrict__ nCells, - HitContainer *foundNtuplets, + TkSoA const *__restrict__ ptracks, Quality *quality, bool dupPassThrough) { // quality to mark rejected constexpr auto reject = pixelTrack::Quality::edup; /// cannot be loose + auto const &tracks = *ptracks; + assert(nCells); auto first = threadIdx.x + blockIdx.x * blockDim.x; for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { @@ -150,22 +152,21 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, if (thisCell.tracks().size() < 2) continue; - //if (0==thisCell.theUsed) continue; - // if (thisCell.theDoubletId < 0) continue; - uint32_t maxNh = 0; + int8_t maxNl = 0; - // find maxNh + // find maxNl for (auto it : thisCell.tracks()) { - auto nh = foundNtuplets->size(it); - maxNh = std::max(nh, maxNh); + auto nl = tracks.nLayers(it); + maxNl = std::max(nl, maxNl); } + // if (maxNl<4) continue; // quad pass through (leave it her for tests) - // maxNh = std::min(4U, maxNh); + // maxNl = std::min(4, maxNl); for (auto it : thisCell.tracks()) { - if (foundNtuplets->size(it) < maxNh) + if (tracks.nLayers(it) < maxNl) quality[it] = reject; //no race: simple assignment of the same constant } } @@ -174,7 +175,6 @@ __global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, // assume the above (so, short tracks already removed) __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, uint32_t const *__restrict__ nCells, - HitContainer const *__restrict__ foundNtuplets, TkSoA *__restrict__ tracks, bool dupPassThrough) { // quality to mark rejected @@ -188,14 +188,13 @@ __global__ void kernel_fastDuplicateRemover(GPUCACell const *__restrict__ cells, auto const &thisCell = cells[idx]; if (thisCell.tracks().size() < 2) continue; - // if (thisCell.theDoubletId < 0) continue; float mc = maxScore; uint16_t im = tkNotFound; /* chi2 penalize higher-pt tracks (try rescale it?) auto score = [&](auto it) { - return foundNtuplets->size(it) < 4 ? + return tracks->nHits(it) < 4 ? 
std::abs(tracks->tip(it)) : // tip for triplets tracks->chi2(it); //chi2 for quads }; @@ -329,8 +328,8 @@ __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, : dcaCutOuterTriplet, hardCurvCut)) { // FIXME tune cuts oc.addOuterNeighbor(cellIndex, *cellNeighbors); - thisCell.setUsedBit(1); - oc.setUsedBit(1); + thisCell.setStatusBits(GPUCACell::StatusBit::kUsed); + oc.setStatusBits(GPUCACell::StatusBit::kUsed); } } // loop on inner cells } // loop on outer cells @@ -368,14 +367,12 @@ __global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, } } -__global__ void kernel_mark_used(GPUCACell::Hits const *__restrict__ hhp, - GPUCACell *__restrict__ cells, - uint32_t const *nCells) { +__global__ void kernel_mark_used(GPUCACell *__restrict__ cells, uint32_t const *nCells) { auto first = threadIdx.x + blockIdx.x * blockDim.x; for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { auto &thisCell = cells[idx]; if (!thisCell.tracks().empty()) - thisCell.setUsedBit(2); + thisCell.setStatusBits(GPUCACell::StatusBit::kInTrack); } } @@ -390,9 +387,9 @@ __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundN if (quality[it] == pixelTrack::Quality::edup) continue; assert(quality[it] == pixelTrack::Quality::bad); - if (nhits > 5) + if (nhits > 7) // current limit printf("wrong mult %d %d\n", it, nhits); - assert(nhits < 8); + assert(nhits <= caConstants::maxHitsOnTrack); tupleMultiplicity->count(nhits); } } @@ -408,9 +405,9 @@ __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNt if (quality[it] == pixelTrack::Quality::edup) continue; assert(quality[it] == pixelTrack::Quality::bad); - if (nhits > 5) + if (nhits > 7) printf("wrong mult %d %d\n", it, nhits); - assert(nhits < 8); + assert(nhits <= caConstants::maxHitsOnTrack); tupleMultiplicity->fill(nhits, it); } } @@ -560,6 +557,17 @@ __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples } } +__global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks) { + auto &tracks = *ptracks; + auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) { + auto nHits = tracks.nHits(idx); + if (nHits == 0) + break; // this is a guard: maybe we need to move to nTracks... + tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx); + } +} + __global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ hitToTuple, CAHitNtupletGeneratorKernelsGPU::Counters *counters) { auto &c = *counters; @@ -633,9 +641,7 @@ __global__ void kernel_markSharedHit(int const *__restrict__ nshared, } // mostly for very forward triplets..... -__global__ void kernel_rejectDuplicate(TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, - TkSoA const *__restrict__ ptracks, +__global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, Quality *__restrict__ quality, uint16_t nmin, bool dupPassThrough, @@ -644,7 +650,6 @@ __global__ void kernel_rejectDuplicate(TrackingRecHit2DSOAView const *__restrict auto const reject = dupPassThrough ? 
pixelTrack::Quality::loose : pixelTrack::Quality::dup;
   auto &hitToTuple = *phitToTuple;
-  auto const &foundNtuplets = *ptuples;
   auto const &tracks = *ptracks;
   int first = blockDim.x * blockIdx.x + threadIdx.x;
@@ -653,12 +658,12 @@
       continue;
     /* chi2 is bad for large pt
-    auto score = [&](auto it, auto nh) {
-      return nh < 4 ? std::abs(tracks.tip(it)) :  // tip for triplets
+    auto score = [&](auto it, auto nl) {
+      return nl < 4 ? std::abs(tracks.tip(it)) :  // tip for triplets
                       tracks.chi2(it);            //chi2
    };
    */
-    auto score = [&](auto it, auto nh) { return std::abs(tracks.tip(it)); };
+    auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); };
     // full combinatorics
     for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
@@ -670,7 +675,7 @@
       auto e2opi = tracks.stateAtBS.covariance(it)(9);
       auto cti = tracks.stateAtBS.state(it)(3);
       auto e2cti = tracks.stateAtBS.covariance(it)(12);
-      auto nhi = foundNtuplets.size(it);
+      auto nli = tracks.nLayers(it);
       for (auto jp = ip + 1; jp != hitToTuple.end(idx); ++jp) {
         auto const jt = *jp;
         auto qj = quality[jt];
@@ -684,8 +689,8 @@
         auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi);
         if ((opi - opj) * (opi - opj) > dop)
           continue;
-        auto nhj = foundNtuplets.size(jt);
-        if (nhj < nhi || (nhj == nhi && (qj < qi || (qj == qi && score(it, nhi) < score(jt, nhj)))))
+        auto nlj = tracks.nLayers(jt);
+        if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj)))))
           quality[jt] = reject;
         else {
           quality[it] = reject;
@@ -697,10 +702,9 @@
 }
 __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp,
-                                        HitContainer const *__restrict__ ptuples,
                                         TkSoA const *__restrict__ ptracks,
                                         Quality *__restrict__ quality,
-                                        uint16_t nmin,
+                                        int nmin,
                                         bool dupPassThrough,
                                         CAHitNtupletGeneratorKernelsGPU::HitToTuple const *__restrict__ phitToTuple) {
   // quality to mark rejected
@@ -709,8 +713,7 @@
   auto const longTqual = pixelTrack::Quality::highPurity;
   auto &hitToTuple = *phitToTuple;
-  auto const &foundNtuplets = *ptuples;
-  // auto const &tracks = *ptracks;
+  auto const &tracks = *ptracks;
   auto const &hh = *hhp;
   int l1end = hh.hitsLayerStart()[1];
@@ -720,39 +723,38 @@
     if (hitToTuple.size(idx) < 2)
       continue;
-    uint32_t maxNh = 0;
+    int8_t maxNl = 0;
-    // find maxNh
+    // find maxNl
     for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
       if (quality[*it] < longTqual)
         continue;
-      uint32_t nh = foundNtuplets.size(*it);
-      maxNh = std::max(nh, maxNh);
+      // if (tracks.nHits(*it)==3) continue;
+      auto nl = tracks.nLayers(*it);
+      maxNl = std::max(nl, maxNl);
     }
-    if (maxNh < 4)
+    if (maxNl < 4)
       continue;
     // quad pass through (leave for tests)
-    // maxNh = std::min(4U, maxNh);
+    // maxNl = std::min(4, maxNl);
-    // kill all tracks shorter than maxHn (only triplets???
+    // kill all tracks shorter than maxNl (only triplets???
for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - uint32_t nh = foundNtuplets.size(*it); + auto nl = tracks.nLayers(*it); //checking if shared hit is on bpix1 and if the tuple is short enough - if (idx < l1end and nh > nmin) + if (idx < l1end and nl > nmin) continue; - if (nh < maxNh && quality[*it] > reject) + if (nl < maxNl && quality[*it] > reject) quality[*it] = reject; } } } -__global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, - TkSoA const *__restrict__ ptracks, +__global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, Quality *__restrict__ quality, uint16_t nmin, bool dupPassThrough, @@ -763,7 +765,6 @@ __global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict_ auto const good = pixelTrack::Quality::strict; auto &hitToTuple = *phitToTuple; - auto const &foundNtuplets = *ptuples; auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; @@ -773,18 +774,19 @@ __global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict_ float mc = maxScore; uint16_t im = tkNotFound; - uint32_t maxNh = 0; + bool onlyTriplets = true; - // find maxNh + // check if only triplets for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { if (quality[*it] <= good) continue; - uint32_t nh = foundNtuplets.size(*it); - maxNh = std::max(nh, maxNh); + onlyTriplets &= tracks.isTriplet(*it); + if (!onlyTriplets) + break; } // only triplets - if (maxNh != 3) + if (!onlyTriplets) continue; // for triplets choose best tip! (should we first find best quality???) @@ -810,8 +812,6 @@ __global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict_ } __global__ void kernel_simpleTripletCleaner( - TrackingRecHit2DSOAView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, TkSoA const *__restrict__ ptracks, Quality *__restrict__ quality, uint16_t nmin, @@ -823,7 +823,6 @@ __global__ void kernel_simpleTripletCleaner( auto const good = pixelTrack::Quality::loose; auto &hitToTuple = *phitToTuple; - auto const &foundNtuplets = *ptuples; auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; @@ -849,7 +848,7 @@ __global__ void kernel_simpleTripletCleaner( // mark worse ambiguities for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] > reject && foundNtuplets.size(it) == 3 && it != im) + if (quality[it] > reject && tracks.isTriplet(it) && it != im) quality[it] = reject; //no race: simple assignment of the same constant } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index bf16e63b27163..d03e5e88b5699 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -63,7 +63,7 @@ CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg.getParameter("maxNumberOfDoublets"), cfg.getParameter("minHitsForSharingCut"), cfg.getParameter("useRiemannFit"), - cfg.getParameter("fit5as4"), + cfg.getParameter("fitNas4"), cfg.getParameter("includeJumpingForwardDoublets"), cfg.getParameter("earlyFishbone"), cfg.getParameter("lateFishbone"), @@ -149,17 +149,17 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& desc.add("fillStatistics", false); 
desc.add("minHitsPerNtuplet", 4); desc.add("maxNumberOfDoublets", caConstants::maxNumberOfDoublets); - desc.add("minHitsForSharingCut", 5) + desc.add("minHitsForSharingCut", 10) ->setComment("Maximum number of hits in a tuple to clean also if the shared hit is on bpx1"); desc.add("includeJumpingForwardDoublets", false); - desc.add("fit5as4", true); + desc.add("fitNas4", false)->setComment("fit only 4 hits out of N"); desc.add("doClusterCut", true); desc.add("doZ0Cut", true); desc.add("doPtCut", true); desc.add("useRiemannFit", false)->setComment("true for Riemann, false for BrokenLine"); desc.add("doSharedHitCut", true)->setComment("Sharing hit nTuples cleaning"); desc.add("dupPassThrough", false)->setComment("Do not reject duplicate"); - desc.add("useSimpleTripletCleaner", false)->setComment("use alternate implementation"); + desc.add("useSimpleTripletCleaner", true)->setComment("use alternate implementation"); edm::ParameterSetDescription trackQualityCuts; trackQualityCuts.add("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut"); @@ -194,9 +194,8 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH kernels.buildDoublets(hits_d, stream); kernels.launchKernels(hits_d, soa, stream); - kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + HelixFitOnGPU fitter(bfield, m_params.fitNas4_); fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); @@ -226,13 +225,12 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC kernels.buildDoublets(hits_d, nullptr); kernels.launchKernels(hits_d, soa, nullptr); - kernels.fillHitDetIndices(hits_d.view(), soa, nullptr); // in principle needed only if Hits not "available" if (0 == hits_d.nHits()) return tracks; // now fit - HelixFitOnGPU fitter(bfield, m_params.fit5as4_); + HelixFitOnGPU fitter(bfield, m_params.fitNas4_); fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 50cef5b59cb80..2bcfbf401fb14 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -38,20 +38,21 @@ class GPUCACell { using Quality = pixelTrack::Quality; static constexpr auto bad = pixelTrack::Quality::bad; + enum class StatusBit : uint16_t { kUsed = 1, kInTrack = 2, kKilled = 1 << 15 }; + GPUCACell() = default; __device__ __forceinline__ void init(CellNeighborsVector& cellNeighbors, CellTracksVector& cellTracks, Hits const& hh, int layerPairId, - int doubletId, hindex_type innerHitId, hindex_type outerHitId) { theInnerHitId = innerHitId; theOuterHitId = outerHitId; - theDoubletId_ = doubletId; theLayerPairId_ = layerPairId; - theUsed_ = 0; + theStatus_ = 0; + theFishboneId = std::numeric_limits::max(); // optimization that depends on access pattern theInnerZ = hh.zGlobal(innerHitId); @@ -130,8 +131,7 @@ class GPUCACell { constexpr unsigned int outer_hit_id() const { return theOuterHitId; } __device__ void print_cell() const { - printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: %d \n", - theDoubletId_, + printf("printing cell: on layerPair: %d, innerHitId: %d, 
outerHitId: %d \n", theLayerPairId_, theInnerHitId, theOuterHitId); @@ -287,12 +287,13 @@ class GPUCACell { // the ntuplets is then saved if the number of hits it contains is greater // than a threshold - tmpNtuplet.push_back_unsafe(theDoubletId_); + auto doubletId = this - cells; + tmpNtuplet.push_back_unsafe(doubletId); assert(tmpNtuplet.size() <= 4); bool last = true; for (unsigned int otherCell : outerNeighbors()) { - if (cells[otherCell].theDoubletId_ < 0) + if (cells[otherCell].isKilled()) continue; // killed by earlyFishbone last = false; cells[otherCell].find_ntuplets( @@ -306,13 +307,20 @@ class GPUCACell { ((!startAt0) && hole0(hh, cells[tmpNtuplet[0]]))) #endif { - hindex_type hits[6]; + hindex_type hits[8]; auto nh = 0U; + constexpr int maxFB = 2; // for the time being let's limit this + int nfb = 0; for (auto c : tmpNtuplet) { hits[nh++] = cells[c].theInnerHitId; + if (nfb < maxFB && cells[c].hasFishbone()) { + ++nfb; + hits[nh++] = cells[c].theFishboneId; // fishbone hit is always outer than inner hit + } } + assert(nh < caConstants::maxHitsOnTrack); hits[nh] = theOuterHitId; - auto it = foundNtuplets.bulkFill(apc, hits, tmpNtuplet.size() + 1); + auto it = foundNtuplets.bulkFill(apc, hits, nh + 1); if (it >= 0) { // if negative is overflow.... for (auto c : tmpNtuplet) cells[c].addTrack(it, cellTracks); @@ -326,26 +334,32 @@ class GPUCACell { } // Cell status management - __device__ __forceinline__ void kill() { theDoubletId_ = -1; } - __device__ __forceinline__ bool isKilled() const { return theDoubletId_ < 0; } + __device__ __forceinline__ void kill() { theStatus_ |= uint16_t(StatusBit::kKilled); } + __device__ __forceinline__ bool isKilled() const { return theStatus_ & uint16_t(StatusBit::kKilled); } __device__ __forceinline__ int16_t layerPairId() const { return theLayerPairId_; } - __device__ __forceinline__ bool unused() const { return 0 == theUsed_; } - __device__ __forceinline__ void setUsedBit(uint16_t bit) { theUsed_ |= bit; } + __device__ __forceinline__ bool unused() const { return 0 == (uint16_t(StatusBit::kUsed) & theStatus_); } + __device__ __forceinline__ void setStatusBits(StatusBit mask) { theStatus_ |= uint16_t(mask); } + + __device__ __forceinline__ void setFishbone(hindex_type id) { theFishboneId = id; } + __device__ __forceinline__ auto fishboneId() const { return theFishboneId; } + __device__ __forceinline__ bool hasFishbone() const { + return theFishboneId != std::numeric_limits::max(); + } private: CellNeighbors* theOuterNeighbors; CellTracks* theTracks; - int32_t theDoubletId_; int16_t theLayerPairId_; - uint16_t theUsed_; // tbd + uint16_t theStatus_; // tbd float theInnerZ; float theInnerR; hindex_type theInnerHitId; hindex_type theOuterHitId; + hindex_type theFishboneId; }; template <> diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 938994840f8c0..9a9c85970af33 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -40,7 +40,7 @@ class HelixFitOnGPU { using TupleMultiplicity = caConstants::TupleMultiplicity; - explicit HelixFitOnGPU(float bf, bool fit5as4) : bField_(bf), fit5as4_(fit5as4) {} + explicit HelixFitOnGPU(float bf, bool fitNas4) : bField_(bf), fitNas4_(fitNas4) {} ~HelixFitOnGPU() { deallocateOnGPU(); } void setBField(double bField) { bField_ = bField; } @@ -62,7 +62,7 @@ class HelixFitOnGPU { OutputSoA *outputSoa_; float bField_; - const bool fit5as4_; + const bool 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc
index 491dd0df2004f..13d665b597b13 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc
@@ -61,7 +61,7 @@ void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits
                            circle_fit_resultsGPU,
                            offset);

-    if (fit5as4_) {
+    if (fitNas4_) {
       // penta
       kernel_FastFit<4>(
           tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu
index 90af2ac13730b..d8530bac964c1 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu
@@ -74,7 +74,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv,
           offset);
     cudaCheck(cudaGetLastError());

-    if (fit5as4_) {
+    if (fitNas4_) {
       // penta
       kernel_FastFit<4><<<numberOfBlocks, blockSize, 0, stream>>>(
           tuples_, tupleMultiplicity_, 5, hv, hitsGPU.get(), hits_geGPU.get(), fast_fit_resultsGPU.get(), offset);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
index 9a3572df4e313..8eb3faaa677f7 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h
@@ -36,7 +36,7 @@ namespace gpuPixelDoublets {
     float x[maxCellsPerHit], y[maxCellsPerHit], z[maxCellsPerHit], n[maxCellsPerHit];
     uint32_t cc[maxCellsPerHit];
     uint16_t d[maxCellsPerHit];
-    // uint8_t l[maxCellsPerHit];
+    uint8_t l[maxCellsPerHit];

     for (int idy = firstY, nt = nHits - offset; idy < nt; idy += gridDim.y * blockDim.y) {
       auto const& vc = isOuterHitOfCell[idy];
@@ -57,6 +57,7 @@ namespace gpuPixelDoublets {
         if (checkTrack && ci.tracks().empty())
           continue;
         cc[sg] = vc[ic];
+        l[sg] = ci.layerPairId();
         d[sg] = ci.inner_detIndex(hh);
         x[sg] = ci.inner_x(hh) - xo;
         y[sg] = ci.inner_y(hh) - yo;
@@ -71,17 +72,29 @@ namespace gpuPixelDoublets {
         auto& ci = cells[cc[ic]];
         for (auto jc = ic + 1; jc < sg; ++jc) {
           auto& cj = cells[cc[jc]];
-          // must be different detectors (in the same layer)
+          // must be different detectors
           // if (d[ic]==d[jc]) continue;
-          //  || l[ic]!=l[jc]) continue;
           auto cos12 = x[ic] * x[jc] + y[ic] * y[jc] + z[ic] * z[jc];
           if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * n[ic] * n[jc]) {
-            // alligned: kill farthest (prefer consecutive layers)
+            // aligned: kill farthest (prefer consecutive layers)
+            // if same layer prefer farthest (longer lever arm) and make space for the intermediate hit
+            bool sameLayer = l[ic] == l[jc];
             if (n[ic] > n[jc]) {
-              ci.kill();
-              break;
+              if (sameLayer) {
+                cj.kill();  // closest
+                ci.setFishbone(cj.inner_hit_id());
+              } else {
+                ci.kill();  // farthest
+                break;
+              }
             } else {
-              cj.kill();
+              if (!sameLayer) {
+                cj.kill();  // farthest
+              } else {
+                ci.kill();  // closest
+                cj.setFishbone(ci.inner_hit_id());
+                break;
+              }
             }
           }
         }  // cj
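The new branch distinguishes aligned doublet pairs by layer: across layers the farther inner hit is killed as before, while on the same layer the closer one is killed but remembered as a fishbone hit on the survivor (the longer lever arm wins). A host-side sketch of just that decision; the Cell struct and resolveAligned are illustrative, with n standing for the squared distance of the inner hit from the shared outer hit:

```cpp
#include <cassert>
#include <cstdint>
#include <limits>

struct Cell {
  uint32_t innerHit;
  uint32_t fishbone = std::numeric_limits<uint32_t>::max();
  bool killed = false;
  void kill() { killed = true; }
  void setFishbone(uint32_t h) { fishbone = h; }
};

// ni, nj: squared inner-hit distances from the shared outer hit (larger = farther).
void resolveAligned(Cell& ci, Cell& cj, float ni, float nj, bool sameLayer) {
  if (ni > nj) {             // i has the longer lever arm
    if (sameLayer) {
      cj.kill();             // closest
      ci.setFishbone(cj.innerHit);
    } else {
      ci.kill();             // farthest
    }
  } else {
    if (sameLayer) {
      ci.kill();             // closest
      cj.setFishbone(ci.innerHit);
    } else {
      cj.kill();             // farthest
    }
  }
}

int main() {
  Cell a{10}, b{11};
  resolveAligned(a, b, 2.f, 1.f, /*sameLayer=*/true);
  assert(b.killed && a.fishbone == 11);  // survivor remembers the killed inner hit
}
```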
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h
index 37970048b3168..80316d24c748b 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h
@@ -222,7 +222,7 @@ namespace gpuPixelDoublets {
           break;
         }  // move to SimpleVector??
         // int layerPairId, int doubletId, int innerHitId, int outerHitId)
-        cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, ind, i, oi);
+        cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, i, oi);
         isOuterHitOfCell[oi].push_back(ind);
 #ifdef GPU_DEBUG
         if (isOuterHitOfCell[oi].full())
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
index b8568a7bbcfe9..2defd6f5ba2ca 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
@@ -34,7 +34,7 @@ namespace gpuVertexFinder {
       // initialize soa...
       soa->idv[idx] = -1;

-      if (nHits < 4)
+      if (tracks.isTriplet(idx))
         continue;  // no triplets
       if (quality[idx] < pixelTrack::Quality::highPurity)
         continue;
diff --git a/Validation/RecoTrack/python/TrackValidation_cff.py b/Validation/RecoTrack/python/TrackValidation_cff.py
index 0b3b973da0d53..d77e589dbee89 100644
--- a/Validation/RecoTrack/python/TrackValidation_cff.py
+++ b/Validation/RecoTrack/python/TrackValidation_cff.py
@@ -1000,10 +1000,10 @@ def _uniqueFirstLayers(layerList):
     cut = cms.string("")
 )

-cutstring = "numberOfValidHits == 3"
+cutstring = "hitPattern.trackerLayersWithMeasurement == 3"
 pixelTracks3hits = trackRefSelector.clone( cut = cutstring )

-cutstring = "numberOfValidHits >= 4"
+cutstring = "hitPattern.trackerLayersWithMeasurement >= 4"
 pixelTracks4hits = trackRefSelector.clone( cut = cutstring )

 cutstring = "pt > 0.9"
@@ -1014,7 +1014,7 @@ def _uniqueFirstLayers(layerList):
 #pixelTracksFromPVPt09 = generalTracksPt09.clone(quality = ["loose","tight","highPurity"], vertexTag = "pixelVertices", src = "pixelTracksFromPV")
 pixelTracksFromPVPt09 = pixelTracksFromPV.clone(ptMin = 0.9)

-cutstring = "numberOfValidHits >= 4"
+cutstring = "hitPattern.trackerLayersWithMeasurement >= 4"
 #pixelTracksFromPV4hits = trackRefSelector.clone( cut = cutstring, src = "pixelTracksFromPV" )
 pixelTracksFromPV4hits = pixelTracksFromPV.clone( numberOfValidPixelHits = 4 )

@@ -1124,7 +1124,7 @@ def _uniqueFirstLayers(layerList):
         tracksPreValidationPixelTrackingOnly.add(locals()[label])
         #-----------
         label = "pixelTracks4hits"+key
-        cutstring = "numberOfValidHits == 4 & qualityMask <= 7 & qualityMask >= " + str(qualityBit)
+        cutstring = "hitPattern.trackerLayersWithMeasurement >= 4 & qualityMask <= 7 & qualityMask >= " + str(qualityBit)
         locals()[label] = trackRefSelector.clone( cut = cutstring )
         tracksPreValidationPixelTrackingOnly.add(locals()[label])
@@ -1134,7 +1134,7 @@ def _uniqueFirstLayers(layerList):
         tracksPreValidationPixelTrackingOnly.add(locals()[label])
         #--------
         label = "pixelTracks3hits"+key
-        cutstring = "numberOfValidHits == 3 & qualityMask <= 7 & qualityMask >= " + str(qualityBit)
+        cutstring = "hitPattern.trackerLayersWithMeasurement == 3 & qualityMask <= 7 & qualityMask >= " + str(qualityBit)
         locals()[label] = trackRefSelector.clone( cut = cutstring )
         tracksPreValidationPixelTrackingOnly.add(locals()[label])
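The vertex-finder cut switches from hit counting to layer counting for the same reason as the validation cuts in TrackValidation_cff.py: once fishbone hits are attached, a triplet can carry 4 or more hits, so nHits < 4 no longer identifies triplets. A small illustrative check; Track and its fields are stand-ins for the SoA accessors:

```cpp
#include <cassert>

// Stand-in for the track SoA: only the two counts relevant to the cut.
struct Track {
  int nHits;
  int nLayers;
  bool isTriplet() const { return nLayers == 3; }
};

int main() {
  Track t{4, 3};         // 4 hits, but only 3 layers (one hit is a fishbone hit)
  assert(t.nHits >= 4);  // the old "nHits < 4" cut would have kept this track
  assert(t.isTriplet()); // the new layer-based cut still rejects it for vertexing
}
```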