-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fully implement fishbone in patatrack #36215
Changes from 19 commits
67c26fa
654223b
6e3c160
69c6015
626f6fe
6f4b794
fbda253
f76c7af
78b8be1
a5b3e17
8f11205
8d9e72d
088654a
a4a692e
3f4ba47
88cbddc
1c58efd
0842c63
4afbf2c
b2bdc8a
ac07875
eaa9260
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
#include <algorithm> | ||
|
||
#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" | ||
#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" | ||
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" | ||
|
||
#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" | ||
|
@@ -42,8 +43,27 @@ class TrackSoAHeterogeneousT { | |
// this is chi2/ndof as not necessarely all hits are used in the fit | ||
eigenSoA::ScalarSoA<float, S> chi2; | ||
|
||
eigenSoA::ScalarSoA<int8_t, S> nLayers; | ||
|
||
constexpr int nHits(int i) const { return detIndices.size(i); } | ||
|
||
constexpr bool isTriplet(int i) const { return nLayers(i) == 3; } | ||
|
||
constexpr int computeNumberOfLayers(int32_t i) const { | ||
// layers are in order and we assume tracks are either forward or backward | ||
auto pdet = detIndices.begin(i); | ||
int nl = 1; | ||
auto ol = phase1PixelTopology::getLayer(*pdet); | ||
for (; pdet < detIndices.end(i); ++pdet) { | ||
auto il = phase1PixelTopology::getLayer(*pdet); // , ol); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what's the purpose of the commented out part here? ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. was needed when using findLayer, will clean |
||
if (il != ol) | ||
++nl; | ||
ol = il; | ||
} | ||
assert(nl >= 3); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why the assert ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. formally You are right. was just there to catch a possible bug. will delete. |
||
return nl; | ||
} | ||
|
||
// State at the Beam spot | ||
// phi,tip,1/pt,cotan(theta),zip | ||
TrajectoryStateSoAT<S> stateAtBS; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,79 @@ | |
|
||
#include <Eigen/Core> | ||
|
||
/** | ||
namespace math { | ||
namespace cholesky { | ||
|
||
template <typename M1, typename M2, int N = M2::ColsAtCompileTime> | ||
// without this: either does not compile or compiles and then fails silently at runtime | ||
#ifdef __CUDACC__ | ||
__host__ __device__ | ||
#endif | ||
inline constexpr void | ||
invertNN(M1 const& src, M2& dst) { | ||
|
||
// origin: CERNLIB | ||
|
||
using T = typename M2::Scalar; | ||
|
||
T a[N][N]; | ||
for (int i = 0; i < N; ++i) { | ||
a[i][i] = src(i, i); | ||
for (int j = i + 1; j < N; ++j) | ||
// a[i][j] = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is not needed because we only care about the triangular part ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. |
||
a[j][i] = src(i, j); | ||
} | ||
|
||
for (int j = 0; j < N; ++j) { | ||
a[j][j] = T(1.) / a[j][j]; | ||
int jp1 = j + 1; | ||
for (int l = jp1; l < N; ++l) { | ||
a[j][l] = a[j][j] * a[l][j]; | ||
T s1 = -a[l][jp1]; | ||
for (int i = 0; i < jp1; ++i) | ||
s1 += a[l][i] * a[i][jp1]; | ||
a[l][jp1] = -s1; | ||
} | ||
} | ||
|
||
if constexpr (N == 1) { | ||
dst(0, 0) = a[0][0]; | ||
return; | ||
} | ||
a[0][1] = -a[0][1]; | ||
a[1][0] = a[0][1] * a[1][1]; | ||
for (int j = 2; j < N; ++j) { | ||
int jm1 = j - 1; | ||
for (int k = 0; k < jm1; ++k) { | ||
T s31 = a[k][j]; | ||
for (int i = k; i < jm1; ++i) | ||
s31 += a[k][i + 1] * a[i + 1][j]; | ||
a[k][j] = -s31; | ||
a[j][k] = -s31 * a[j][j]; | ||
} | ||
a[jm1][j] = -a[jm1][j]; | ||
a[j][jm1] = a[jm1][j] * a[j][j]; | ||
} | ||
|
||
int j = 0; | ||
while (j < N - 1) { | ||
T s33 = a[j][j]; | ||
for (int i = j + 1; i < N; ++i) | ||
s33 += a[j][i] * a[i][j]; | ||
dst(j, j) = s33; | ||
|
||
++j; | ||
for (int k = 0; k < j; ++k) { | ||
T s32 = 0; | ||
for (int i = j; i < N; ++i) | ||
s32 += a[k][i] * a[i][j]; | ||
dst(k, j) = dst(j, k) = s32; | ||
} | ||
} | ||
dst(j, j) = a[j][j]; | ||
} | ||
|
||
/** | ||
* fully inlined specialized code to perform the inversion of a | ||
* positive defined matrix of rank up to 6. | ||
* | ||
|
@@ -16,8 +88,6 @@ | |
* | ||
* | ||
*/ | ||
namespace math { | ||
namespace cholesky { | ||
|
||
template <typename M1, typename M2> | ||
inline constexpr void __attribute__((always_inline)) invert11(M1 const& src, M2& dst) { | ||
|
@@ -336,12 +406,12 @@ namespace math { | |
}; | ||
|
||
// Eigen interface | ||
template <typename D1, typename D2> | ||
inline constexpr void __attribute__((always_inline)) | ||
invert(Eigen::DenseBase<D1> const& src, Eigen::DenseBase<D2>& dst) { | ||
using M1 = Eigen::DenseBase<D1>; | ||
using M2 = Eigen::DenseBase<D2>; | ||
Inverter<M1, M2, M2::ColsAtCompileTime>::eval(src, dst); | ||
template <typename M1, typename M2> | ||
inline constexpr void __attribute__((always_inline)) invert(M1 const& src, M2& dst) { | ||
if constexpr (M2::ColsAtCompileTime < 7) | ||
Inverter<M1, M2, M2::ColsAtCompileTime>::eval(src, dst); | ||
else | ||
invertNN(src, dst); | ||
} | ||
|
||
} // namespace cholesky | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -138,6 +138,7 @@ int main() { | |
go<5>(true); | ||
go<6>(true); | ||
|
||
// go<10>(); | ||
go<10>(false); | ||
go<10>(true); | ||
return 0; | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
#include "DataFormats/Math/interface/choleskyInversion.h" | ||
|
||
using namespace math::cholesky; | ||
|
||
#include <Eigen/Core> | ||
#include <Eigen/Eigenvalues> | ||
|
||
#include <iomanip> | ||
#include <memory> | ||
#include <algorithm> | ||
#include <chrono> | ||
#include <random> | ||
|
||
#include <cassert> | ||
#include <iostream> | ||
#include <limits> | ||
|
||
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" | ||
#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" | ||
|
||
template <typename M, int N> | ||
__global__ void invert(M* mm, int n) { | ||
auto i = blockIdx.x * blockDim.x + threadIdx.x; | ||
if (i >= n) | ||
return; | ||
|
||
auto& m = mm[i]; | ||
|
||
printf("before %d %f %f %f\n", N, m(0, 0), m(1, 0), m(1, 1)); | ||
|
||
invertNN(m, m); | ||
|
||
printf("after %d %f %f %f\n", N, m(0, 0), m(1, 0), m(1, 1)); | ||
} | ||
|
||
template <typename M, int N> | ||
__global__ void invertE(M* mm, int n) { | ||
auto i = blockIdx.x * blockDim.x + threadIdx.x; | ||
if (i >= n) | ||
return; | ||
|
||
auto& m = mm[i]; | ||
|
||
printf("before %d %f %f %f\n", N, m(0, 0), m(1, 0), m(1, 1)); | ||
|
||
m = m.inverse(); | ||
|
||
printf("after %d %f %f %f\n", N, m(0, 0), m(1, 0), m(1, 1)); | ||
} | ||
|
||
// generate matrices | ||
template <class M> | ||
void genMatrix(M& m) { | ||
using T = typename std::remove_reference<decltype(m(0, 0))>::type; | ||
int n = M::ColsAtCompileTime; | ||
std::mt19937 eng; | ||
// std::mt19937 eng2; | ||
std::uniform_real_distribution<T> rgen(0., 1.); | ||
|
||
// generate first diagonal elemets | ||
for (int i = 0; i < n; ++i) { | ||
double maxVal = i * 10000 / (n - 1) + 1; // max condition is 10^4 | ||
m(i, i) = maxVal * rgen(eng); | ||
} | ||
for (int i = 0; i < n; ++i) { | ||
for (int j = 0; j < i; ++j) { | ||
double v = 0.3 * std::sqrt(m(i, i) * m(j, j)); // this makes the matrix pos defined | ||
m(i, j) = v * rgen(eng); | ||
m(j, i) = m(i, j); | ||
} | ||
} | ||
} | ||
|
||
template <int DIM> | ||
using MXN = Eigen::Matrix<double, DIM, DIM>; | ||
|
||
int main() { | ||
cms::cudatest::requireDevices(); | ||
|
||
constexpr int DIM = 6; | ||
|
||
using M = MXN<DIM>; | ||
|
||
M m; | ||
|
||
genMatrix(m); | ||
|
||
printf("on CPU before %d %f %f %f\n", DIM, m(0, 0), m(1, 0), m(1, 1)); | ||
|
||
invertNN(m, m); | ||
|
||
printf("on CPU after %d %f %f %f\n", DIM, m(0, 0), m(1, 0), m(1, 1)); | ||
|
||
double* d; | ||
cudaMalloc(&d, sizeof(M)); | ||
cudaMemcpy(d, &m, sizeof(M), cudaMemcpyHostToDevice); | ||
invert<M, DIM><<<1, 1>>>((M*)d, 1); | ||
cudaCheck(cudaDeviceSynchronize()); | ||
invertE<M, DIM><<<1, 1>>>((M*)d, 1); | ||
cudaCheck(cudaDeviceSynchronize()); | ||
|
||
return 0; | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
|
||
#include <cstdint> | ||
#include <array> | ||
// #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this needed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not anymore, we prefer NOT to have dependency on HeterogeneousCore in Geometry for the time being |
||
|
||
namespace phase1PixelTopology { | ||
|
||
|
@@ -27,17 +28,20 @@ namespace phase1PixelTopology { | |
|
||
constexpr uint32_t numberOfModules = 1856; | ||
constexpr uint32_t numberOfLayers = 10; | ||
constexpr uint32_t layerStart[numberOfLayers + 1] = {0, | ||
96, | ||
320, | ||
672, // barrel | ||
1184, | ||
1296, | ||
1408, // positive endcap | ||
1520, | ||
1632, | ||
1744, // negative endcap | ||
numberOfModules}; | ||
#ifdef __CUDA_ARCH__ | ||
__device__ | ||
#endif | ||
Comment on lines
+30
to
+32
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like we should "abstract" this approach and make it easier to declare Of course this PR is ok as it is, I'm just thinking how we can improve things later. Maybe something like #ifdef __CUDA_ARCH__
#define HOST_DEVICE_AGGREGATE_CONSTANT __device__ constexpr
#else
#define HOST_DEVICE_AGGREGATE_CONSTANT constexpr
#endif ? Or maybe a simpler name :-) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did not want to introduce dependencies in this file in this moment (is used also in code that does not depend on Heterogeous) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Opened #36311 to remember about this. |
||
constexpr uint32_t layerStart[numberOfLayers + 1] = {0, | ||
96, | ||
320, | ||
672, // barrel | ||
1184, | ||
1296, | ||
1408, // positive endcap | ||
1520, | ||
1632, | ||
1744, // negative endcap | ||
numberOfModules}; | ||
constexpr char const* layerName[numberOfLayers] = { | ||
"BL1", | ||
"BL2", | ||
|
@@ -84,8 +88,8 @@ namespace phase1PixelTopology { | |
|
||
constexpr uint32_t maxModuleStride = findMaxModuleStride(); | ||
|
||
constexpr uint8_t findLayer(uint32_t detId) { | ||
for (uint8_t i = 0; i < std::size(layerStart); ++i) | ||
constexpr uint8_t findLayer(uint32_t detId, uint8_t sl = 0) { | ||
for (uint8_t i = sl; i < std::size(layerStart); ++i) | ||
if (detId < layerStart[i + 1]) | ||
return i; | ||
return std::size(layerStart); | ||
|
@@ -100,7 +104,15 @@ namespace phase1PixelTopology { | |
} | ||
|
||
constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride; | ||
constexpr std::array<uint8_t, layerIndexSize> layer = map_to_array<layerIndexSize>(findLayerFromCompact); | ||
#ifdef __CUDA_ARCH__ | ||
__device__ | ||
#endif | ||
constexpr std::array<uint8_t, layerIndexSize> | ||
layer = map_to_array<layerIndexSize>(findLayerFromCompact); | ||
|
||
constexpr uint8_t getLayer(uint32_t detId) { | ||
return phase1PixelTopology::layer[detId / phase1PixelTopology::maxModuleStride]; | ||
} | ||
|
||
constexpr bool validateLayerIndex() { | ||
bool res = true; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how strict are the rules for the dependencies in the CUDADataFormats?
@makortel please clarify.
Here, a dependence on
Geometry/TrackerGeometryBuilder
seems somewhat out of order.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The
CUDADataFormats
fall in the same category as "transient products".There is already a dependence here
cmssw/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
Line 9 in df8ebdd
so this PR is not making things (much) worse.
But probably (after this PR) it would be necessary to think again its placement, because in the long term I'd assume classes using the
phase1PixelTopology
to be persisted (at which point the relevant classes should be defined inDataFormats
).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Dependency on Geometry is allowed in DataFormat since ages
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
DataFormats
are allowed to depend on some subpackages ofGeometry
, but not everything. Currently the set is limited toGeometry/CaloGeometry
Geometry/CommonDetUnit
Geometry/CommonTopologies
Geometry/GEMGeometry
(from FWLite build set).
Geometry/TrackerGeometryBuilder
itself would bring inDetectorDescription/Core
DetectorDescription/DDCMS
CondFormats/GeometryObjects
FWCore/ParameterSet
(although this might be possible to remove?)dd4hep
These dependencies would be too much for
DataFormats
(although there is already one case depending onFWCore/ParameterSet
).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
for what I am concerned
phase1PixelTopology.h
can move anywhere else: the whole idea is to have geometry constants as constant not as database or DD quantities!Moving elsewhere will need changes in many packages. I propose to open an issue and not hold this specific PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed, I was trying to think for longer term.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I opened an issue #36302