cms-sw · cmsbuild · Dec 1, 2021 · Nov 11, 2021 · Nov 12, 2021 · Nov 12, 2021
diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
@@ -5,6 +5,7 @@
 #include <algorithm>
 
 #include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h"
+#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
 #include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" 
 #include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" 
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 
 #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
@@ -42,8 +43,27 @@ class TrackSoAHeterogeneousT {
   // this is chi2/ndof as not necessarely all hits are used in the fit
   eigenSoA::ScalarSoA<float, S> chi2;
 
+  eigenSoA::ScalarSoA<int8_t, S> nLayers;
+
   constexpr int nHits(int i) const { return detIndices.size(i); }
 
+  constexpr bool isTriplet(int i) const { return nLayers(i) == 3; }
+
+  constexpr int computeNumberOfLayers(int32_t i) const {
+    // layers are in order and we assume tracks are either forward or backward
+    auto pdet = detIndices.begin(i);
+    int nl = 1;
+    auto ol = phase1PixelTopology::getLayer(*pdet);
+    for (; pdet < detIndices.end(i); ++pdet) {
+      auto il = phase1PixelTopology::getLayer(*pdet);  // , ol);
+      if (il != ol)
+        ++nl;
+      ol = il;
+    }
+    assert(nl >= 3);
+    return nl;
+  }
+
   // State at the Beam spot
   // phi,tip,1/pt,cotan(theta),zip
   TrajectoryStateSoAT<S> stateAtBS;

diff --git a/DataFormats/Math/interface/choleskyInversion.h b/DataFormats/Math/interface/choleskyInversion.h
@@ -5,7 +5,79 @@
 
 #include <Eigen/Core>
 
-/**
+namespace math {
+  namespace cholesky {
+
+    template <typename M1, typename M2, int N = M2::ColsAtCompileTime>
+// without this: either does not compile or compiles and then fails silently at runtime
+#ifdef __CUDACC__
+    __host__ __device__
+#endif
+        inline constexpr void
+        invertNN(M1 const& src, M2& dst) {
+
+      // origin: CERNLIB
+
+      using T = typename M2::Scalar;
+
+      T a[N][N];
+      for (int i = 0; i < N; ++i) {
+        a[i][i] = src(i, i);
+        for (int j = i + 1; j < N; ++j)
+          // a[i][j] =
+          a[j][i] = src(i, j);
+      }
+
+      for (int j = 0; j < N; ++j) {
+        a[j][j] = T(1.) / a[j][j];
+        int jp1 = j + 1;
+        for (int l = jp1; l < N; ++l) {
+          a[j][l] = a[j][j] * a[l][j];
+          T s1 = -a[l][jp1];
+          for (int i = 0; i < jp1; ++i)
+            s1 += a[l][i] * a[i][jp1];
+          a[l][jp1] = -s1;
+        }
+      }
+
+      if constexpr (N == 1) {
+        dst(0, 0) = a[0][0];
+        return;
+      }
+      a[0][1] = -a[0][1];
+      a[1][0] = a[0][1] * a[1][1];
+      for (int j = 2; j < N; ++j) {
+        int jm1 = j - 1;
+        for (int k = 0; k < jm1; ++k) {
+          T s31 = a[k][j];
+          for (int i = k; i < jm1; ++i)
+            s31 += a[k][i + 1] * a[i + 1][j];
+          a[k][j] = -s31;
+          a[j][k] = -s31 * a[j][j];
+        }
+        a[jm1][j] = -a[jm1][j];
+        a[j][jm1] = a[jm1][j] * a[j][j];
+      }
+
+      int j = 0;
+      while (j < N - 1) {
+        T s33 = a[j][j];
+        for (int i = j + 1; i < N; ++i)
+          s33 += a[j][i] * a[i][j];
+        dst(j, j) = s33;
+
+        ++j;
+        for (int k = 0; k < j; ++k) {
+          T s32 = 0;
+          for (int i = j; i < N; ++i)
+            s32 += a[k][i] * a[i][j];
+          dst(k, j) = dst(j, k) = s32;
+        }
+      }
+      dst(j, j) = a[j][j];
+    }
+
+    /**
  * fully inlined specialized code to perform the inversion of a
  * positive defined matrix of rank up to 6.
  *
@@ -16,8 +88,6 @@
  *
  *
  */
-namespace math {
-  namespace cholesky {
 
     template <typename M1, typename M2>
     inline constexpr void __attribute__((always_inline)) invert11(M1 const& src, M2& dst) {
@@ -336,12 +406,12 @@ namespace math {
     };
 
     // Eigen interface
-    template <typename D1, typename D2>
-    inline constexpr void __attribute__((always_inline))
-    invert(Eigen::DenseBase<D1> const& src, Eigen::DenseBase<D2>& dst) {
-      using M1 = Eigen::DenseBase<D1>;
-      using M2 = Eigen::DenseBase<D2>;
-      Inverter<M1, M2, M2::ColsAtCompileTime>::eval(src, dst);
+    template <typename M1, typename M2>
+    inline constexpr void __attribute__((always_inline)) invert(M1 const& src, M2& dst) {
+      if constexpr (M2::ColsAtCompileTime < 7)
+        Inverter<M1, M2, M2::ColsAtCompileTime>::eval(src, dst);
+      else
+        invertNN(src, dst);
     }
 
   }  // namespace cholesky

diff --git a/DataFormats/Math/test/BuildFile.xml b/DataFormats/Math/test/BuildFile.xml
@@ -127,4 +127,10 @@
     <flags CUDA_FLAGS="-w"/>
   </bin>
 
+  <bin file="simpleCholeskyTest.cu">
+    <use name="cuda"/>
+    <use name="HeterogeneousCore/CUDAUtilities"/>
+    <flags CUDA_FLAGS="-w"/>
+  </bin>
+
 </iftool>
diff --git a/DataFormats/Math/test/CholeskyInvert_t.cpp b/DataFormats/Math/test/CholeskyInvert_t.cpp
@@ -138,6 +138,7 @@ int main() {
   go<5>(true);
   go<6>(true);
 
-  // go<10>();
+  go<10>(false);
+  go<10>(true);
   return 0;
 }
diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu
@@ -85,7 +85,8 @@ template <int N>
 void go(bool soa) {
   constexpr unsigned int DIM = N;
   using MX = MXN<DIM>;
-  std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl;
+  std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << " in " << (soa ? "SOA" : "AOS")
+            << " mode" << std::endl;
 
   auto start = std::chrono::high_resolution_clock::now();
   auto delta = start - start;
@@ -197,13 +198,15 @@ int main() {
   go<4>(false);
   go<5>(false);
   go<6>(false);
+  go<7>(false);
+  go<10>(false);
 
   go<2>(true);
   go<3>(true);
   go<4>(true);
   go<5>(true);
   go<6>(true);
-
-  // go<10>();
+  go<7>(true);
+  go<10>(true);
   return 0;
 }
diff --git a/DataFormats/Math/test/simpleCholeskyTest.cu b/DataFormats/Math/test/simpleCholeskyTest.cu
@@ -0,0 +1,103 @@
+#include "DataFormats/Math/interface/choleskyInversion.h"
+
+using namespace math::cholesky;
+
+#include <Eigen/Core>
+#include <Eigen/Eigenvalues>
+
+#include <iomanip>
+#include <memory>
+#include <algorithm>
+#include <chrono>
+#include <random>
+
+#include <cassert>
+#include <iostream>
+#include <limits>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+
+template <typename M, int N>
+__global__ void invert(M* mm, int n) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= n)
+    return;
+
+  auto& m = mm[i];
+
+  printf("before %d %f %f %f\n", N, m(0, 0), m(1, 0), m(1, 1));
+
+  invertNN(m, m);
+
+  printf("after %d %f %f %f\n", N, m(0, 0), m(1, 0), m(1, 1));
+}
+
+template <typename M, int N>
+__global__ void invertE(M* mm, int n) {
+  auto i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= n)
+    return;
+
+  auto& m = mm[i];
+
+  printf("before %d %f %f %f\n", N, m(0, 0), m(1, 0), m(1, 1));
+
+  m = m.inverse();
+
+  printf("after %d %f %f %f\n", N, m(0, 0), m(1, 0), m(1, 1));
+}
+
+// generate matrices
+template <class M>
+void genMatrix(M& m) {
+  using T = typename std::remove_reference<decltype(m(0, 0))>::type;
+  int n = M::ColsAtCompileTime;
+  std::mt19937 eng;
+  // std::mt19937 eng2;
+  std::uniform_real_distribution<T> rgen(0., 1.);
+
+  // generate first diagonal elemets
+  for (int i = 0; i < n; ++i) {
+    double maxVal = i * 10000 / (n - 1) + 1;  // max condition is 10^4
+    m(i, i) = maxVal * rgen(eng);
+  }
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < i; ++j) {
+      double v = 0.3 * std::sqrt(m(i, i) * m(j, j));  // this makes the matrix pos defined
+      m(i, j) = v * rgen(eng);
+      m(j, i) = m(i, j);
+    }
+  }
+}
+
+template <int DIM>
+using MXN = Eigen::Matrix<double, DIM, DIM>;
+
+int main() {
+  cms::cudatest::requireDevices();
+
+  constexpr int DIM = 6;
+
+  using M = MXN<DIM>;
+
+  M m;
+
+  genMatrix(m);
+
+  printf("on CPU before %d %f %f %f\n", DIM, m(0, 0), m(1, 0), m(1, 1));
+
+  invertNN(m, m);
+
+  printf("on CPU after %d %f %f %f\n", DIM, m(0, 0), m(1, 0), m(1, 1));
+
+  double* d;
+  cudaMalloc(&d, sizeof(M));
+  cudaMemcpy(d, &m, sizeof(M), cudaMemcpyHostToDevice);
+  invert<M, DIM><<<1, 1>>>((M*)d, 1);
+  cudaCheck(cudaDeviceSynchronize());
+  invertE<M, DIM><<<1, 1>>>((M*)d, 1);
+  cudaCheck(cudaDeviceSynchronize());
+
+  return 0;
+}
diff --git a/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h b/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h
@@ -3,6 +3,7 @@
 
 #include <cstdint>
 #include <array>
+// #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
 
 namespace phase1PixelTopology {
 
@@ -27,17 +28,20 @@ namespace phase1PixelTopology {
 
   constexpr uint32_t numberOfModules = 1856;
   constexpr uint32_t numberOfLayers = 10;
-  constexpr uint32_t layerStart[numberOfLayers + 1] = {0,
-                                                       96,
-                                                       320,
-                                                       672,  // barrel
-                                                       1184,
-                                                       1296,
-                                                       1408,  // positive endcap
-                                                       1520,
-                                                       1632,
-                                                       1744,  // negative endcap
-                                                       numberOfModules};
+#ifdef __CUDA_ARCH__
+  __device__
+#endif
+      constexpr uint32_t layerStart[numberOfLayers + 1] = {0,
+                                                           96,
+                                                           320,
+                                                           672,  // barrel
+                                                           1184,
+                                                           1296,
+                                                           1408,  // positive endcap
+                                                           1520,
+                                                           1632,
+                                                           1744,  // negative endcap
+                                                           numberOfModules};
   constexpr char const* layerName[numberOfLayers] = {
       "BL1",
       "BL2",
@@ -84,8 +88,8 @@ namespace phase1PixelTopology {
 
   constexpr uint32_t maxModuleStride = findMaxModuleStride();
 
-  constexpr uint8_t findLayer(uint32_t detId) {
-    for (uint8_t i = 0; i < std::size(layerStart); ++i)
+  constexpr uint8_t findLayer(uint32_t detId, uint8_t sl = 0) {
+    for (uint8_t i = sl; i < std::size(layerStart); ++i)
       if (detId < layerStart[i + 1])
         return i;
     return std::size(layerStart);
@@ -100,7 +104,15 @@ namespace phase1PixelTopology {
   }
 
   constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride;
-  constexpr std::array<uint8_t, layerIndexSize> layer = map_to_array<layerIndexSize>(findLayerFromCompact);
+#ifdef __CUDA_ARCH__
+  __device__
+#endif
+      constexpr std::array<uint8_t, layerIndexSize>
+          layer = map_to_array<layerIndexSize>(findLayerFromCompact);
+
+  constexpr uint8_t getLayer(uint32_t detId) {
+    return phase1PixelTopology::layer[detId / phase1PixelTopology::maxModuleStride];
+  }
 
   constexpr bool validateLayerIndex() {
     bool res = true;

diff --git a/Geometry/TrackerGeometryBuilder/test/BuildFile.xml b/Geometry/TrackerGeometryBuilder/test/BuildFile.xml
@@ -40,7 +40,11 @@
   <flags EDM_PLUGIN="1"/>
 </library>
 
-<bin file="phase1PixelTopology_t.cpp"/>
+<bin file="phase1PixelTopology_t.cu">
+  <use name="HeterogeneousCore/CUDAUtilities"/>
+  <use name="cuda"/>
+  <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
+</bin>
 
 <bin file="runTests.cpp" name="GeometryTrackerGeometryBuilderTestDriver">
   <use name="FWCore/Utilities"/>