Merge pull request #20 from cerati/mt-devel

Fix standard track building.
slava77 · Jan 12, 2016 · commit 1bc0bef
2 parents: c5f8c4c + 9378b30
Show file tree

Hide file tree

Showing 3 changed files with 46 additions and 34 deletions.
diff --git a/mkFit/MkBuilder.cc b/mkFit/MkBuilder.cc
@@ -504,8 +504,9 @@ void MkBuilder::FindTracks()
         std::cout << "processing lay=" << ilay+1 << std::endl;
 #endif
 
-        //prepare unrolled vector to loop over
+        // prepare unrolled vector to loop over
         std::vector<std::pair<int,int> > seed_cand_idx;
+
         for (int iseed = otd.th_start_seed; iseed != otd.th_end_seed; ++iseed)
         {
           for (int ic = 0; ic < etabin_of_comb_candidates.m_candidates[iseed].size(); ++ic)
@@ -546,7 +547,20 @@ void MkBuilder::FindTracks()
           //fixme find a way to deal only with the candidates needed in this thread
           mkfp->InputTracksAndHitIdx(etabin_of_comb_candidates.m_candidates,
                                      seed_cand_idx, itrack, end,
-                                     true);
+                                     ilay == Config::nlayers_per_seed);
+
+          //propagate to layer
+          if (ilay > Config::nlayers_per_seed)
+          {
+#ifdef DEBUG
+            std::cout << "propagate to lay=" << ilay+1 << " start from x=" << mkfp->getPar(0, 0, 0) << " y=" << mkfp->getPar(0, 0, 1) << " z=" << mkfp->getPar(0, 0, 2)<< " r=" << getHypot(mkfp->getPar(0, 0, 0), mkfp->getPar(0, 0, 1))
+                      << " px=" << mkfp->getPar(0, 0, 3) << " py=" << mkfp->getPar(0, 0, 4) << " pz=" << mkfp->getPar(0, 0, 5) << " pT=" << getHypot(mkfp->getPar(0, 0, 3), mkfp->getPar(0, 0, 4)) << std::endl;
+#endif
+            mkfp->PropagateTracksToR(m_event->geom_.Radius(ilay), end - itrack);
+#ifdef DEBUG
+            std::cout << "propagate to lay=" << ilay+1 << " arrive at x=" << mkfp->getPar(0, 1, 0) << " y=" << mkfp->getPar(0, 1, 1) << " z=" << mkfp->getPar(0, 1, 2)<< " r=" << getHypot(mkfp->getPar(0, 1, 0), mkfp->getPar(0, 1, 1)) << std::endl;
+#endif
+          }
 
 #ifdef DEBUG
           std::cout << "now get hit range" << std::endl;
@@ -562,23 +576,7 @@ void MkBuilder::FindTracks()
           std::cout << "make new candidates" << std::endl;
 #endif
 
-          mkfp->FindCandidates(bunch_of_hits, tmp_candidates, otd.th_start_seed);
-
-          //propagate to layer
-          // This is sort of a silly fix as no-clone-engine code produces
-          // zero good tracks with propagate-at-the-end.
-          // But at least it doesn't crash with uncaught exception :)
-          if (ilay + 1 < Config::nLayers)
-          {
-#ifdef DEBUG
-            std::cout << "propagate to lay=" << ilay+2 << " start from x=" << mkfp->getPar(0, 0, 0) << " y=" << mkfp->getPar(0, 0, 1) << " z=" << mkfp->getPar(0, 0, 2)<< " r=" << getHypot(mkfp->getPar(0, 0, 0), mkfp->getPar(0, 0, 1))
-                      << " px=" << mkfp->getPar(0, 0, 3) << " py=" << mkfp->getPar(0, 0, 4) << " pz=" << mkfp->getPar(0, 0, 5) << " pT=" << getHypot(mkfp->getPar(0, 0, 3), mkfp->getPar(0, 0, 4)) << std::endl;
-#endif
-            mkfp->PropagateTracksToR(m_event->geom_.Radius(ilay+1), end - itrack);
-#ifdef DEBUG
-            std::cout << "propagate to lay=" << ilay+2 << " arrive at x=" << mkfp->getPar(0, 1, 0) << " y=" << mkfp->getPar(0, 1, 1) << " z=" << mkfp->getPar(0, 1, 2)<< " r=" << getHypot(mkfp->getPar(0, 1, 0), mkfp->getPar(0, 1, 1)) << std::endl;
-#endif
-          }
+          mkfp->FindCandidates(bunch_of_hits, tmp_candidates, otd.th_start_seed, end - itrack);
 
         } //end of vectorized loop
 

diff --git a/mkFit/MkFitter.cc b/mkFit/MkFitter.cc
@@ -109,7 +109,6 @@ void MkFitter::InputTracksAndHitIdx(std::vector<std::vector<Track> >& tracks,
   int itrack = 0;
   for (int i = beg; i < end; ++i, ++itrack)
   {
-
     Track &trk = tracks[idxs[i].first][idxs[i].second];
 
     Label(itrack, 0, 0) = trk.label();
@@ -119,7 +118,7 @@ void MkFitter::InputTracksAndHitIdx(std::vector<std::vector<Track> >& tracks,
     Err[iI].CopyIn(itrack, trk.errors().Array());
     Par[iI].CopyIn(itrack, trk.parameters().Array());
 
-    Chg(itrack, 0, 0) = trk.charge();
+    Chg (itrack, 0, 0) = trk.charge();
     Chi2(itrack, 0, 0) = trk.chi2();
 
     for (int hi = 0; hi < Nhits; ++hi)
@@ -914,7 +913,9 @@ void MkFitter::AddBestHit(BunchOfHits &bunch_of_hits)
 
 
 
-void MkFitter::FindCandidates(BunchOfHits &bunch_of_hits, std::vector<std::vector<Track> >& tmp_candidates, int offset)
+void MkFitter::FindCandidates(BunchOfHits &bunch_of_hits,
+                              std::vector<std::vector<Track> >& tmp_candidates,
+                              const int offset, const int N_proc)
 {
 
   const char *varr      = (char*) bunch_of_hits.m_hits;
@@ -923,25 +924,37 @@ void MkFitter::FindCandidates(BunchOfHits &bunch_of_hits, std::vector<std::vecto
   const int   off_param = (char*) bunch_of_hits.m_hits[0].posArray() - varr;
 
   int idx[NN]      __attribute__((aligned(64)));
-  int idx_chew[NN] __attribute__((aligned(64)));
+  // int idx_chew[NN] __attribute__((aligned(64)));
 
   int maxSize = -1;
 
   // Determine maximum number of hits for tracks in the collection.
   // At the same time prefetch the first set of hits to L1 and the second one to L2.
-  for (int it = 0; it < NN; ++it)
+  for (int it = 0; it < N_proc; ++it)
   {
     int off = XHitPos.At(it, 0, 0) * sizeof(Hit);
 
     _mm_prefetch(varr + off, _MM_HINT_T0);
     _mm_prefetch(varr + sizeof(Hit) + off, _MM_HINT_T1);
 
     idx[it]      = off;
-    idx_chew[it] = it*sizeof(Hit);
+    // idx_chew[it] = it*sizeof(Hit);
 
     // XXX There is an intrinsic for that, out of loop.
     maxSize = std::max(maxSize, XHitSize.At(it, 0, 0));
   }
+  // XXXX MT FIXME: Use the limit for:
+  // - SlurpIns, use masked gather for MIC_INTRINSICS
+  // - prefetching loops - DONE
+  // - computeChi2MPlex() -- really hard ... it calls Matriplex functions. This
+  //       should be fine. - DOES NOT NEED TO BE DONE
+  // - hit (valid or invalid) registration loops - DONE
+  // The following loop is not needed then. But I do need a mask for intrinsics slurpin.
+  for (int it = N_proc; it < NN; ++it)
+  {
+    idx[it]      = idx[0];
+    // idx_chew[it] = idx_chew[0];
+  }
 
   // XXXX MT Uber hack to avoid tracks with like 300 hits to process.
   maxSize = std::min(maxSize, Config::maxHitsConsidered);
@@ -950,7 +963,7 @@ void MkFitter::FindCandidates(BunchOfHits &bunch_of_hits, std::vector<std::vecto
   //__m512i vi = _mm512_setr_epi32(idx[ 0], idx[ 1], idx[ 2], idx[ 3], idx[ 4], idx[ 5], idx[ 6], idx[ 7],
   //                               idx[ 8], idx[ 9], idx[10], idx[11], idx[12], idx[13], idx[14], idx[15]);
   __m512i vi      = _mm512_load_epi32(idx);
-  __m512i vi_chew = _mm512_load_epi32(idx_chew);
+  // __m512i vi_chew = _mm512_load_epi32(idx_chew);
 #endif
 
 // Has basically no effect, it seems.
@@ -961,7 +974,7 @@ void MkFitter::FindCandidates(BunchOfHits &bunch_of_hits, std::vector<std::vecto
 
     // Prefetch to L2 the hits we'll process after two loops iterations.
     // Ideally this would be initiated before coming here, for whole bunch_of_hits.m_hits vector.
-    for (int itrack = 0; itrack < NN; ++itrack)
+    for (int itrack = 0; itrack < N_proc; ++itrack)
     {
       _mm_prefetch(varr + 2*sizeof(Hit) + idx[itrack], _MM_HINT_T1);
     }
@@ -992,7 +1005,7 @@ void MkFitter::FindCandidates(BunchOfHits &bunch_of_hits, std::vector<std::vecto
     computeChi2MPlex(Err[iP], Par[iP], Chg, msErr[Nhits], msPar[Nhits], outChi2);
 
     // Prefetch to L1 the hits we'll process in the next loop iteration.
-    for (int itrack = 0; itrack < NN; ++itrack)
+    for (int itrack = 0; itrack < N_proc; ++itrack)
     {
       _mm_prefetch(varr + sizeof(Hit) + idx[itrack], _MM_HINT_T0);
     }
@@ -1001,7 +1014,7 @@ void MkFitter::FindCandidates(BunchOfHits &bunch_of_hits, std::vector<std::vecto
     //this is not needed for candidates the hit is not added to, but it's vectorized so doing it serially below should take the same time
     //still it's a waste of time in case the hit is not added to any of the candidates, so check beforehand that at least one cand needs update
     bool oneCandPassCut = false;
-    for (int itrack = 0; itrack < NN;++itrack)
+    for (int itrack = 0; itrack < N_proc;++itrack)
       {
 	float chi2 = fabs(outChi2[itrack]);//fixme negative chi2 sometimes...
 #ifdef DEBUG
@@ -1026,7 +1039,7 @@ void MkFitter::FindCandidates(BunchOfHits &bunch_of_hits, std::vector<std::vecto
 
       //create candidate with hit in case chi2<Config::chi2Cut
       //fixme: please vectorize me... (not sure it's possible in this case)
-      for (int itrack = 0; itrack < NN; ++itrack)
+      for (int itrack = 0; itrack < N_proc; ++itrack)
 	{
 	  float chi2 = fabs(outChi2[itrack]);//fixme negative chi2 sometimes...
 #ifdef DEBUG
@@ -1051,11 +1064,11 @@ void MkFitter::FindCandidates(BunchOfHits &bunch_of_hits, std::vector<std::vecto
 	      //set the track state to the updated parameters
 	      Err[iC].CopyOut(itrack, newcand.errors_nc().Array());
 	      Par[iC].CopyOut(itrack, newcand.parameters_nc().Array());
-	      
+
 #ifdef DEBUG
 	      std::cout << "updated track parameters x=" << newcand.parameters()[0] << " y=" << newcand.parameters()[1] << std::endl;
 #endif
-	      
+
 	      tmp_candidates[SeedIdx(itrack, 0, 0)-offset].push_back(newcand);
 	    }
 	}
@@ -1065,7 +1078,7 @@ void MkFitter::FindCandidates(BunchOfHits &bunch_of_hits, std::vector<std::vecto
 
   //now add invalid hit
   //fixme: please vectorize me...
-  for (int itrack = 0; itrack < NN;++itrack)
+  for (int itrack = 0; itrack < N_proc; ++itrack)
     {
       if (countInvalidHits(itrack)>0) continue;//check this is ok for vectorization //fixme not optimal
       Track newcand;

diff --git a/mkFit/MkFitter.h b/mkFit/MkFitter.h
@@ -104,7 +104,8 @@ class MkFitter
   void SelectHitRanges(BunchOfHits &bunch_of_hits, const int N_proc);
   void AddBestHit     (BunchOfHits &bunch_of_hits);
 
-  void FindCandidates(BunchOfHits &bunch_of_hits, std::vector<std::vector<Track> >& tmp_candidates, int offset);
+  void FindCandidates(BunchOfHits &bunch_of_hits, std::vector<std::vector<Track> >& tmp_candidates,
+                      const int offset, const int N_proc);
 
   // ================================================================
   // Methods to be used with clone engine