From b1390d522a2170742f2920962db8883cb413f75c Mon Sep 17 00:00:00 2001
From: Lindsey Gray <lindsey.gray@gmail.com>
Date: Wed, 26 Jul 2017 19:04:30 +0200
Subject: [PATCH] vectorized 2D simulated annealing vertexing

---
 .../Configuration/python/RecoVertex_cff.py    |   4 +-
 .../interface/DAClusterizerInZT_vect.h        | 243 +++++
 .../interface/PrimaryVertexProducer.h         |   2 +
 .../plugins/PrimaryVertexProducer.cc          |   3 +
 .../python/TkClusParameters_cff.py            |  16 +
 .../src/DAClusterizerInZT.cc                  |   2 +-
 .../src/DAClusterizerInZT_vect.cc             | 959 ++++++++++++++++++
 7 files changed, 1226 insertions(+), 3 deletions(-)
 create mode 100644 RecoVertex/PrimaryVertexProducer/interface/DAClusterizerInZT_vect.h
 create mode 100644 RecoVertex/PrimaryVertexProducer/src/DAClusterizerInZT_vect.cc

diff --git a/RecoVertex/Configuration/python/RecoVertex_cff.py b/RecoVertex/Configuration/python/RecoVertex_cff.py
index 0c117617b2a13..d0abec70e376e 100644
--- a/RecoVertex/Configuration/python/RecoVertex_cff.py
+++ b/RecoVertex/Configuration/python/RecoVertex_cff.py
@@ -33,14 +33,14 @@
                           )
 
 #timing
-from RecoVertex.PrimaryVertexProducer.TkClusParameters_cff import DA2DParameters
+from RecoVertex.PrimaryVertexProducer.TkClusParameters_cff import DA2D_vectParameters
 unsortedOfflinePrimaryVertices1D = unsortedOfflinePrimaryVertices.clone()
 unsortedOfflinePrimaryVertices1D.TkFilterParameters.minPt = cms.double(0.7)
 offlinePrimaryVertices1D=sortedPrimaryVertices.clone(vertices="unsortedOfflinePrimaryVertices1D", particles="trackRefsForJetsBeforeSorting")
 offlinePrimaryVertices1DWithBS=sortedPrimaryVertices.clone(vertices="unsortedOfflinePrimaryVertices1D:WithBS", particles="trackRefsForJetsBeforeSorting")
-DA2DParameters.TkDAClusParameters.verbose = cms.untracked.bool(False)
+DA2D_vectParameters.TkDAClusParameters.verbose = cms.untracked.bool(False)
 unsortedOfflinePrimaryVertices4D = unsortedOfflinePrimaryVertices.clone( verbose = cms.untracked.bool(False),
-                                                                         TkClusParameters = DA2DParameters )
+                                                                         TkClusParameters = DA2D_vectParameters )
 unsortedOfflinePrimaryVertices4D.TkFilterParameters.minPt = cms.double(0.7)
 unsortedOfflinePrimaryVertices4D.TrackTimesLabel = cms.InputTag("trackTimeValueMapProducer:generalTracksConfigurableFlatResolutionModel")
 unsortedOfflinePrimaryVertices4D.TrackTimeResosLabel = cms.InputTag("trackTimeValueMapProducer:generalTracksConfigurableFlatResolutionModelResolution")
diff --git a/RecoVertex/PrimaryVertexProducer/interface/DAClusterizerInZT_vect.h b/RecoVertex/PrimaryVertexProducer/interface/DAClusterizerInZT_vect.h
new file mode 100644
index 0000000000000..196fd88321d97
--- /dev/null
+++ b/RecoVertex/PrimaryVertexProducer/interface/DAClusterizerInZT_vect.h
@@ -0,0 +1,243 @@
+#ifndef DAClusterizerInZT_vect_h
+#define DAClusterizerInZT_vect_h
+
+/**\class DAClusterizerInZT_vect
+
+ Description: separates event tracks into clusters along the beam line and in time
+
+	Version which auto-vectorizes with gcc 4.6 or newer
+
+ */
+
+#include "RecoVertex/PrimaryVertexProducer/interface/TrackClusterizerInZ.h"
+#include "TrackingTools/TransientTrack/interface/TransientTrack.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include <vector>
+#include "DataFormats/Math/interface/Error.h"
+#include "RecoVertex/VertexTools/interface/VertexDistanceXY.h"
+#include "RecoVertex/VertexPrimitives/interface/TransientVertex.h"
+
+
+class DAClusterizerInZT_vect  final : public TrackClusterizerInZ {
+
+public:
+  // Internal data structure holding the track information as a structure of arrays (for vectorization)
+  struct track_t {
+    
+    void AddItem( double new_z, double new_t, double new_dz2, double new_dt2, const reco::TransientTrack* new_tt, double new_pi   )
+    {
+      z.push_back( new_z );
+      t.push_back( new_t );
+      dz2.push_back( new_dz2 );
+      dt2.push_back( new_dt2 );
+      errsum.push_back( 1./(1./new_dz2 + 1./new_dt2) );
+      tt.push_back( new_tt );
+      
+      pi.push_back( new_pi ); // track weight
+      Z_sum.push_back( 1.0 ); // Z[i]   for DA clustering, initial value as done in ::fill
+    }
+
+    
+    
+    unsigned int GetSize() const
+    {
+      return z.size();
+    }
+    
+    
+    // has to be called every time the items are modified
+    void ExtractRaw()
+    {
+      _z = &z.front();
+      _t = &t.front();
+      _dz2 = &dz2.front();
+      _dt2 = &dt2.front();
+      _errsum = &errsum.front();
+      _Z_sum = &Z_sum.front();
+      _pi = &pi.front();
+    }
+    
+    double * __restrict__ _z; // z-coordinate at point of closest approach to the beamline
+    double * __restrict__ _t; // t-coordinate at point of closest approach to the beamline
+    double * __restrict__ _dz2; // inverse of the squared error of z(pca)
+    double * __restrict__ _dt2; // inverse of the squared error of t(pca)
+    double * __restrict__ _errsum; // inverse of the sum of the squared pca errors, 1./(1./dz2 + 1./dt2)
+    
+    double * __restrict__  _Z_sum; // Z[i]   for DA clustering
+    double * __restrict__  _pi; // track weight
+    
+    std::vector<double> z; // z-coordinate at point of closest approach to the beamline
+    std::vector<double> t; // t-coordinate at point of closest approach to the beamline
+    std::vector<double> dz2; // inverse of the squared error of z(pca)
+    std::vector<double> dt2; // inverse of the squared error of t(pca)
+    std::vector<double> errsum; // inverse of the sum of the squared pca errors
+    std::vector< const reco::TransientTrack* > tt; // a pointer to the Transient Track
+    
+    std::vector<double> Z_sum; // Z[i]   for DA clustering
+    std::vector<double> pi; // track weight
+  };
+  
+  struct vertex_t {
+    std::vector<double> z; //           z coordinate
+    std::vector<double> t; //           t coordinate
+    std::vector<double> pk; //           vertex weight for "constrained" clustering
+    
+    // --- temporary numbers, used during update
+    std::vector<double> ei_cache;
+    std::vector<double> ei;
+    std::vector<double> sw;
+    std::vector<double> swz;
+    std::vector<double> swt;
+    std::vector<double> se;
+    std::vector<double> swE;
+    
+    
+    unsigned int GetSize() const
+    {
+      return z.size();
+    }
+    
+    void AddItem( double new_z, double new_t, double new_pk   )
+    {
+      z.push_back( new_z);
+      t.push_back( new_t);
+      pk.push_back( new_pk);
+      
+      ei_cache.push_back( 0.0 );
+      ei.push_back( 0.0 );
+      sw.push_back( 0.0 );
+      swz.push_back( 0.0);
+      swt.push_back( 0.0);
+      se.push_back( 0.0);
+      swE.push_back( 0.0);
+      
+      ExtractRaw();
+    }
+    
+    void InsertItem( unsigned int i, double new_z, double new_t, double new_pk   )
+    {
+      z.insert(z.begin() + i, new_z);
+      t.insert(t.begin() + i, new_t);
+      pk.insert(pk.begin() + i, new_pk);
+      
+      ei_cache.insert(ei_cache.begin() + i, 0.0 );
+      ei.insert( ei.begin()  + i, 0.0 );
+      sw.insert( sw.begin()  + i, 0.0 );
+      swz.insert(swz.begin() + i, 0.0 );
+      swt.insert(swt.begin() + i, 0.0 );
+      se.insert( se.begin()  + i, 0.0 );
+      swE.insert(swE.begin() + i, 0.0 );
+      
+      ExtractRaw();
+    }
+    
+    void RemoveItem( unsigned int i )
+    {
+      z.erase( z.begin() + i );
+      t.erase( t.begin() + i );
+      pk.erase( pk.begin() + i );
+      
+      ei_cache.erase( ei_cache.begin() + i);
+      ei.erase( ei.begin() + i);
+      sw.erase( sw.begin() + i);
+      swz.erase( swz.begin() + i);
+      swt.erase( swt.begin() + i);
+      se.erase(se.begin() + i);
+      swE.erase(swE.begin() + i);
+      
+      ExtractRaw();
+    }
+    
+    void DebugOut()
+    {
+      std::cout <<  "vertex_t size: " << GetSize() << std::endl;
+      
+      for ( unsigned int i =0; i < GetSize(); ++ i)
+	{
+	  std::cout << " z = " << _z[i] << " t = " << _t[i] << " pk = " << _pk[i] << std::endl;
+	}
+    }
+    
+    // has to be called every time the items are modified
+    void ExtractRaw()
+    {
+      _z = &z.front();
+      _t = &t.front();
+      _pk = &pk.front();
+      
+      _ei = &ei.front();
+      _sw = &sw.front();
+      _swz = &swz.front();
+      _swt = &swt.front();
+      _se = &se.front();
+      _swE = &swE.front();
+      _ei_cache = &ei_cache.front();
+      
+    }
+    
+    double * __restrict__ _z;
+    double * __restrict__ _t;
+    double * __restrict__ _pk;
+    
+    double * __restrict__ _ei_cache;
+    double * __restrict__ _ei;
+    double * __restrict__ _sw;
+    double * __restrict__ _swz;
+    double * __restrict__ _swt;
+    double * __restrict__ _se;
+    double * __restrict__ _swE;
+    
+  };
+  
+  DAClusterizerInZT_vect(const edm::ParameterSet& conf);
+  
+  
+  std::vector<std::vector<reco::TransientTrack> >
+  clusterize(const std::vector<reco::TransientTrack> & tracks) const;
+  
+  
+  std::vector<TransientVertex>
+  vertices(const std::vector<reco::TransientTrack> & tracks,
+	   const int verbosity = 0) const ;
+  
+  track_t	fill(const std::vector<reco::TransientTrack> & tracks) const;
+  
+  double update(double beta, track_t & gtracks,
+		vertex_t & gvertices, bool useRho0, const double & rho0) const;
+
+  void dump(const double beta, const vertex_t & y,
+	    const track_t & tks, const int verbosity = 0) const;
+  bool merge(vertex_t & y, double & beta)const;
+  bool purge(vertex_t &, track_t &, double &,
+	     const double) const;
+  void splitAll( vertex_t & y) const;
+  bool split(const double beta,  track_t &t, vertex_t & y, double threshold = 1. ) const;
+  
+  double beta0(const double betamax, track_t const & tks, vertex_t const & y) const;
+    
+  
+private:
+  bool verbose_;
+  double zdumpcenter_;
+  double zdumpwidth_;
+
+  double vertexSize_;
+  int maxIterations_;
+  double coolingFactor_;
+  double betamax_;
+  double betastop_;
+  double dzCutOff_;
+  double d0CutOff_;
+  bool useTc_;
+
+  double mintrkweight_;
+  double uniquetrkweight_;
+  double zmerge_;
+  double tmerge_;
+  double betapurge_;
+
+};
+
+
+#endif // DAClusterizerInZT_vect_h
diff --git a/RecoVertex/PrimaryVertexProducer/interface/PrimaryVertexProducer.h b/RecoVertex/PrimaryVertexProducer/interface/PrimaryVertexProducer.h
index bafb5b6821b7c..e55489e1b8f84 100644
--- a/RecoVertex/PrimaryVertexProducer/interface/PrimaryVertexProducer.h
+++ b/RecoVertex/PrimaryVertexProducer/interface/PrimaryVertexProducer.h
@@ -34,6 +34,8 @@
 #include "RecoVertex/PrimaryVertexProducer/interface/TrackFilterForPVFindingBase.h"
 #include "RecoVertex/PrimaryVertexProducer/interface/TrackClusterizerInZ.h"
 #include "RecoVertex/PrimaryVertexProducer/interface/DAClusterizerInZ_vect.h"
+#include "RecoVertex/PrimaryVertexProducer/interface/DAClusterizerInZT_vect.h"
+
 
 #include "RecoVertex/PrimaryVertexProducer/interface/TrackFilterForPVFinding.h"
 #include "RecoVertex/PrimaryVertexProducer/interface/HITrackFilterForPVFinding.h"
diff --git a/RecoVertex/PrimaryVertexProducer/plugins/PrimaryVertexProducer.cc b/RecoVertex/PrimaryVertexProducer/plugins/PrimaryVertexProducer.cc
index 3aef3af31403d..12cb6c8f9310b 100644
--- a/RecoVertex/PrimaryVertexProducer/plugins/PrimaryVertexProducer.cc
+++ b/RecoVertex/PrimaryVertexProducer/plugins/PrimaryVertexProducer.cc
@@ -53,6 +53,9 @@ PrimaryVertexProducer::PrimaryVertexProducer(const edm::ParameterSet& conf)
   else if( clusteringAlgorithm=="DA2D" ) {
     theTrackClusterizer = new DAClusterizerInZT(conf.getParameter<edm::ParameterSet>("TkClusParameters").getParameter<edm::ParameterSet>("TkDAClusParameters"));
     f4D = true;
+  } else if( clusteringAlgorithm=="DA2D_vect" ) {
+    theTrackClusterizer = new DAClusterizerInZT_vect(conf.getParameter<edm::ParameterSet>("TkClusParameters").getParameter<edm::ParameterSet>("TkDAClusParameters"));
+    f4D = true;
   }
 
   else{
diff --git a/RecoVertex/PrimaryVertexProducer/python/TkClusParameters_cff.py b/RecoVertex/PrimaryVertexProducer/python/TkClusParameters_cff.py
index 10afac03823ca..8965c98dac5c4 100644
--- a/RecoVertex/PrimaryVertexProducer/python/TkClusParameters_cff.py
+++ b/RecoVertex/PrimaryVertexProducer/python/TkClusParameters_cff.py
@@ -25,3 +25,19 @@
         uniquetrkweight = cms.double(0.8) # require at least two tracks with this weight at T=Tpurge
         )
 )
+
+DA2D_vectParameters = cms.PSet(
+    algorithm   = cms.string("DA2D_vect"),
+    TkDAClusParameters = cms.PSet(
+        coolingFactor = cms.double(0.6),  # moderate annealing speed
+        Tmin = cms.double(2.0),           # end of vertex splitting
+        Tpurge = cms.double(2.0),         # cleaning 
+        Tstop = cms.double(0.5),          # end of annealing 
+        vertexSize = cms.double(0.006),   # added in quadrature to track-z resolutions
+        d0CutOff = cms.double(3.),        # downweight high IP tracks 
+        dzCutOff = cms.double(3.),        # outlier rejection after freeze-out (T<Tmin)
+        zmerge = cms.double(1e-2),        # merge intermediate clusters separated by less than zmerge and tmerge
+        tmerge = cms.double(1e-2),        # merge intermediate clusters separated by less than zmerge and tmerge
+        uniquetrkweight = cms.double(0.8) # require at least two tracks with this weight at T=Tpurge
+        )
+)
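
For context, here is a minimal configuration sketch (not part of the patch) showing how the new parameter set can be attached to a primary-vertex producer so that the "DA2D_vect" branch added in PrimaryVertexProducer.cc is selected. The DA2D_vectParameters PSet and the algorithm string come from this patch; the producer cfi import and the module name offlinePrimaryVertices4DVect are illustrative assumptions.

    import FWCore.ParameterSet.Config as cms
    from RecoVertex.PrimaryVertexProducer.TkClusParameters_cff import DA2D_vectParameters

    # assumed: the standard offlinePrimaryVertices producer cfi; any
    # PrimaryVertexProducer instance with a TkClusParameters PSet would do
    from RecoVertex.PrimaryVertexProducer.OfflinePrimaryVertices_cfi import offlinePrimaryVertices

    # clone the producer and switch the track clusterizer to the vectorized
    # 2D (z,t) simulated-annealing algorithm defined above
    offlinePrimaryVertices4DVect = offlinePrimaryVertices.clone(
        TkClusParameters = DA2D_vectParameters
    )
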
diff --git a/RecoVertex/PrimaryVertexProducer/src/DAClusterizerInZT.cc b/RecoVertex/PrimaryVertexProducer/src/DAClusterizerInZT.cc
index 486adc598c87c..49a8741a7bffc 100644
--- a/RecoVertex/PrimaryVertexProducer/src/DAClusterizerInZT.cc
+++ b/RecoVertex/PrimaryVertexProducer/src/DAClusterizerInZT.cc
@@ -324,7 +324,7 @@ bool DAClusterizerInZT::split( double beta,
       }
     }
     if(w1>0){  z1=z1/w1; t1=t1/w1;} else{ z1=y[ik].z-epsilon; t1=y[ik].t-epsilon; }
-    if(w2>0){  z2=z2/w2; t2=t2/w2;} else{ z2=y[ik].z+epsilon; t2=y[ik].t+epsilon;}
+    if(w2>0){  z2=z2/w2; t2=t2/w2;} else{ z2=y[ik].z+epsilon; t2=y[ik].t+epsilon; }
 
     // reduce split size if there is not enough room
     if( ( ik   > 0       ) && ( y[ik-1].z>=z1 ) ){ z1=0.5*(y[ik].z+y[ik-1].z); t1=0.5*(y[ik].t+y[ik-1].t); }
diff --git a/RecoVertex/PrimaryVertexProducer/src/DAClusterizerInZT_vect.cc b/RecoVertex/PrimaryVertexProducer/src/DAClusterizerInZT_vect.cc
new file mode 100644
index 0000000000000..c759849a4c30d
--- /dev/null
+++ b/RecoVertex/PrimaryVertexProducer/src/DAClusterizerInZT_vect.cc
@@ -0,0 +1,959 @@
+#include "RecoVertex/PrimaryVertexProducer/interface/DAClusterizerInZT_vect.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "DataFormats/GeometryCommonDetAlgo/interface/Measurement1D.h"
+#include "RecoVertex/VertexPrimitives/interface/VertexException.h"
+
+#include <cmath>
+#include <cassert>
+#include <limits>
+#include <iomanip>
+#include "FWCore/Utilities/interface/isFinite.h"
+#include "vdt/vdtMath.h"
+
+using namespace std;
+
+namespace {
+  constexpr double epsilon = 1.0e-3;
+  constexpr double vertexSizeTime = 0.008;
+  constexpr double dtCutOff = 4.0;
+}
+
+DAClusterizerInZT_vect::DAClusterizerInZT_vect(const edm::ParameterSet& conf) {
+
+  // hardcoded parameters
+  maxIterations_ = 100;
+  mintrkweight_ = 0.5; // conf.getParameter<double>("mintrkweight");
+
+
+  // configurable debug output
+  verbose_ = conf.getUntrackedParameter<bool> ("verbose", false);
+  zdumpcenter_ = conf.getUntrackedParameter<double> ("zdumpcenter", 0.);
+  zdumpwidth_ = conf.getUntrackedParameter<double> ("zdumpwidth", 20.);
+  
+  // configurable parameters
+  double Tmin = conf.getParameter<double> ("Tmin");
+  double Tpurge = conf.getParameter<double> ("Tpurge");
+  double Tstop = conf.getParameter<double> ("Tstop");
+  vertexSize_ = conf.getParameter<double> ("vertexSize");
+  coolingFactor_ = conf.getParameter<double> ("coolingFactor");
+  useTc_=true;
+  if(coolingFactor_<0){
+    coolingFactor_=-coolingFactor_; 
+    useTc_=false;
+  }
+  d0CutOff_ = conf.getParameter<double> ("d0CutOff");
+  dzCutOff_ = conf.getParameter<double> ("dzCutOff");
+  uniquetrkweight_ = conf.getParameter<double>("uniquetrkweight");
+  zmerge_ = conf.getParameter<double>("zmerge");
+  tmerge_ = conf.getParameter<double>("tmerge");
+
+  if(verbose_){
+    std::cout << "DAClusterizerinZT_vect: mintrkweight = " << mintrkweight_ << std::endl;
+    std::cout << "DAClusterizerinZT_vect: uniquetrkweight = " << uniquetrkweight_ << std::endl;
+    std::cout << "DAClusterizerinZT_vect: zmerge = " << zmerge_ << std::endl;
+    std::cout << "DAClusterizerinZT_vect: tmerge = " << tmerge_ << std::endl;
+    std::cout << "DAClusterizerinZT_vect: Tmin = " << Tmin << std::endl;
+    std::cout << "DAClusterizerinZT_vect: Tpurge = " << Tpurge << std::endl;
+    std::cout << "DAClusterizerinZT_vect: Tstop = " << Tstop << std::endl;
+    std::cout << "DAClusterizerinZT_vect: vertexSize = " << vertexSize_ << std::endl;
+    std::cout << "DAClusterizerinZT_vect: coolingFactor = " << coolingFactor_ << std::endl;
+    std::cout << "DAClusterizerinZT_vect: d0CutOff = " << d0CutOff_ << std::endl;
+    std::cout << "DAClusterizerinZT_vect: dzCutOff = " << dzCutOff_ << std::endl;
+    std::cout << "DAClusterizerinZT_vect: dtCutOff = " << dtCutOff << std::endl;
+  }
+
+
+  if (Tmin == 0) {
+    edm::LogWarning("DAClusterizerinZT_vectorized") << "DAClusterizerInZT: invalid Tmin " << Tmin
+						   << "  reset to default " << 1. / betamax_;
+  } else {
+    betamax_ = 1. / Tmin;
+  }
+
+
+  if ((Tpurge > Tmin) || (Tpurge == 0)) {
+    edm::LogWarning("DAClusterizerinZT_vectorized") << "DAClusterizerInZT: invalid Tpurge " << Tpurge
+						   << "  set to " << Tmin;
+    Tpurge =  Tmin;
+  }
+  betapurge_ = 1./Tpurge;
+  
+
+  if ((Tstop > Tpurge) || (Tstop == 0)) {
+    edm::LogWarning("DAClusterizerinZT_vectorized") << "DAClusterizerInZT: invalid Tstop " << Tstop
+						   << "  set to  " << max(1., Tpurge);
+    Tstop = max(1., Tpurge) ;
+  }
+  betastop_ = 1./Tstop;
+  
+}
+
+
+namespace {
+  inline double local_exp( double const& inp) {
+    return vdt::fast_exp( inp );
+  }
+  
+  inline void local_exp_list( double const * __restrict__ arg_inp, double * __restrict__ arg_out, const int arg_arr_size) {
+    for(int i=0; i!=arg_arr_size; ++i ) arg_out[i]=vdt::fast_exp(arg_inp[i]);
+  }
+
+}
+
+// TODO: take advantage of C++11 move semantics here
+DAClusterizerInZT_vect::track_t 
+DAClusterizerInZT_vect::fill(const vector<reco::TransientTrack> & tracks) const {
+
+  // prepare track data for clustering
+  track_t tks;
+  for (auto it = tracks.begin(); it!= tracks.end(); it++){
+    if (!(*it).isValid()) continue;
+    double t_pi=1.;
+    double t_z = ((*it).stateAtBeamLine().trackStateAtPCA()).position().z();
+    double t_t = it->timeExt();
+    if (std::fabs(t_z) > 1000.) continue;
+    auto const & t_mom = (*it).stateAtBeamLine().trackStateAtPCA().momentum();
+    //  get the beam-spot
+    reco::BeamSpot beamspot = (it->stateAtBeamLine()).beamSpot();
+    double t_dz2 = 
+      std::pow((*it).track().dzError(), 2) // track error
+      + (std::pow(beamspot.BeamWidthX()*t_mom.x(),2)+std::pow(beamspot.BeamWidthY()*t_mom.y(),2))*std::pow(t_mom.z(),2)/std::pow(t_mom.perp2(),2) // beam spot width
+      + std::pow(vertexSize_, 2); // intrinsic vertex size, safer for outliers and short lived decays
+    t_dz2 = 1./ t_dz2;
+    double t_dt2 = std::pow((*it).dtErrorExt(),2.) + std::pow(vertexSizeTime,2.); // the track timing error, plus a small minimum vertex size in time
+    t_dt2 = 1./t_dt2;
+    if (edm::isNotFinite(t_dz2) || t_dz2 < std::numeric_limits<double>::min() ) continue;
+    if (d0CutOff_ > 0) {
+      Measurement1D atIP =
+	(*it).stateAtBeamLine().transverseImpactParameter();// error contains beamspot
+      t_pi = 1. / (1. + local_exp(std::pow(atIP.value() / atIP.error(), 2) - std::pow(d0CutOff_, 2))); // reduce weight for high ip tracks
+      if (edm::isNotFinite(t_pi) ||  t_pi < std::numeric_limits<double>::epsilon())  continue; // usually is > 0.99
+    }
+    LogTrace("DAClusterizerinZT_vectorized") << t_z << ' ' << t_t <<' '<< t_dz2 << ' ' << t_dt2 <<' '<< t_pi;
+    tks.AddItem(t_z, t_t, t_dz2, t_dt2, &(*it), t_pi);
+  }
+  tks.ExtractRaw();
+  
+  if (verbose_) {
+    std::cout << "Track count " << tks.GetSize() << std::endl;
+  }
+  
+  return tks;
+}
+
+
+namespace {
+  inline
+  double Eik(double t_z, double k_z, double t_dz2, double t_t, double k_t, double t_dt2) {
+    return std::pow(t_z - k_z, 2) * t_dz2 + std::pow(t_t - k_t,2) * t_dt2;
+  }
+}
+
+double DAClusterizerInZT_vect::update(double beta, track_t & gtracks,
+				     vertex_t & gvertices, bool useRho0, const double & rho0) const {
+
+  //update weights and vertex positions
+  // mass constrained annealing without noise
+  // returns the squared sum of changes of vertex positions
+  
+  const unsigned int nt = gtracks.GetSize();
+  const unsigned int nv = gvertices.GetSize();
+  
+  //initialize sums
+  double sumpi = 0;
+  
+  // to return how much the prototype moved
+  double delta = 0;
+  
+
+  // initial value of the sum Z
+  double Z_init = 0;
+  // independent of the track loop
+  if ( useRho0 )
+    {
+      Z_init = rho0 * local_exp(-beta * dzCutOff_ * dzCutOff_); // cut-off
+    }
+  
+  // define kernels
+  auto kernel_calc_exp_arg = [ beta, nv ] ( const unsigned int itrack,
+					     track_t const& tracks,
+					     vertex_t const& vertices ) {
+    const double track_z = tracks._z[itrack];
+    const double track_t = tracks._t[itrack];
+    const double botrack_dz2 = -beta*tracks._dz2[itrack];
+    const double botrack_dt2 = -beta*tracks._dt2[itrack];
+
+    // auto-vectorized
+    for ( unsigned int ivertex = 0; ivertex < nv; ++ivertex) {
+      const auto mult_resz =  track_z - vertices._z[ivertex];
+      const auto mult_rest =  track_t - vertices._t[ivertex];
+      vertices._ei_cache[ivertex] = botrack_dz2 * ( mult_resz * mult_resz ) + botrack_dt2 * ( mult_rest * mult_rest );
+    }
+  };
+  
+  auto kernel_add_Z = [ nv, Z_init ] (vertex_t const& vertices) -> double
+    {
+      double ZTemp = Z_init;
+      for (unsigned int ivertex = 0; ivertex < nv; ++ivertex) {	
+	ZTemp += vertices._pk[ivertex] * vertices._ei[ivertex];
+      }
+      return ZTemp;
+    };
+
+  auto kernel_calc_normalization = [ beta, nv ] (const unsigned int track_num,
+						  track_t & tks_vec,
+						  vertex_t & y_vec ) {
+    auto tmp_trk_pi = tks_vec._pi[track_num];
+    auto o_trk_Z_sum = 1./tks_vec._Z_sum[track_num];
+    //auto o_trk_dz2 = tks_vec._dz2[track_num];
+    //auto o_trk_dt2 = tks_vec._dt2[track_num];
+    auto o_trk_err_sum = tks_vec._errsum[track_num];
+    auto tmp_trk_z = tks_vec._z[track_num];
+    auto tmp_trk_t = tks_vec._t[track_num];
+    auto obeta =  -1./beta;
+    
+    // auto-vectorized
+    for (unsigned int k = 0; k < nv; ++k) {
+      y_vec._se[k] +=  y_vec._ei[k] * (tmp_trk_pi* o_trk_Z_sum);
+      // parens are important for numerical stability
+      auto w = tmp_trk_pi * (y_vec._pk[k] * y_vec._ei[k] * o_trk_Z_sum) * o_trk_err_sum; 
+      y_vec._sw[k]  += w;
+      y_vec._swz[k] += w * tmp_trk_z;
+      y_vec._swt[k] += w * tmp_trk_t;
+      y_vec._swE[k] += w * y_vec._ei_cache[k]*obeta;
+    }
+  };
+  
+  
+  for (auto ivertex = 0U; ivertex < nv; ++ivertex) {
+    gvertices._se[ivertex] = 0.0;
+    gvertices._sw[ivertex] = 0.0;
+    gvertices._swz[ivertex] = 0.0;
+    gvertices._swt[ivertex] = 0.0;
+    gvertices._swE[ivertex] = 0.0;
+  }
+  
+  
+  
+  // loop over tracks
+  for (auto itrack = 0U; itrack < nt; ++itrack) {
+    kernel_calc_exp_arg(itrack, gtracks, gvertices);
+    local_exp_list(gvertices._ei_cache, gvertices._ei, nv);
+    
+    gtracks._Z_sum[itrack] = kernel_add_Z(gvertices);
+    if (edm::isNotFinite(gtracks._Z_sum[itrack])) gtracks._Z_sum[itrack] = 0.0;
+    // used in the next major loop to follow
+    sumpi += gtracks._pi[itrack];
+    
+    if (gtracks._Z_sum[itrack] > 1.e-100){
+      kernel_calc_normalization(itrack, gtracks, gvertices);
+    }
+  }
+  
+  // now update z and pk
+  auto kernel_calc_zt = [  sumpi, nv, this, useRho0 ] (vertex_t & vertices ) -> double {
+    
+    double delta=0;
+    // does not vectorize
+    for (unsigned int ivertex = 0; ivertex < nv; ++ ivertex ) {
+      if (vertices._sw[ivertex] > 0) {
+	auto znew = vertices._swz[ ivertex ] / vertices._sw[ ivertex ];
+	// the if above prevents auto-vectorization
+	delta += std::pow( vertices._z[ ivertex ] - znew, 2 );
+	vertices._z[ ivertex ] = znew;
+        auto tnew = vertices._swt[ ivertex ] / vertices._sw[ ivertex ];
+	// the if above prevents auto-vectorization
+	delta += std::pow( vertices._t[ ivertex ] - tnew, 2 );
+        vertices._t[ ivertex ] = tnew;
+      }
+#ifdef VI_DEBUG
+      else {
+	edm::LogInfo("sumw") << "invalid sum of weights in fit: " << vertices._sw[ivertex] << endl;
+	if (this->verbose_) {
+	  std::cout  << " a cluster melted away ?  pk=" << vertices._pk[ ivertex ] << " sumw="
+						   << vertices._sw[ivertex] << endl;
+	}
+      }
+#endif
+    }
+
+    auto osumpi = 1./sumpi;
+    for (unsigned int ivertex = 0; ivertex < nv; ++ ivertex )
+      vertices._pk[ ivertex ] = vertices._pk[ ivertex ] * vertices._se[ ivertex ] * osumpi;
+
+    return delta;
+  };
+  
+  delta += kernel_calc_zt(gvertices);
+  
+  // return how much the prototypes moved
+  return delta;
+}
+
+
+
+
+
+bool DAClusterizerInZT_vect::merge(vertex_t & y, double & beta)const{
+  // merge clusters that collapsed or never separated,
+  // only merge if the estimated critical temperature of the merged vertex is below the current temperature
+  // return true if vertices were merged, false otherwise
+  const unsigned int nv = y.GetSize();
+
+  if (nv < 2)
+    return false;
+
+  // merge the smallest distance clusters first
+  std::vector<std::pair<double, unsigned int> > critical;
+  for (unsigned int k = 0; (k + 1) < nv; k++) {
+    if ( std::fabs(y._z[k + 1] - y._z[k]) < zmerge_ &&
+         std::fabs(y._t[k + 1] - y._t[k]) < tmerge_    ) {
+      auto dt2 = std::pow(y._t[k + 1] - y._t[k],2);
+      auto dz2 = std::pow(y._z[k + 1] - y._z[k],2);
+      critical.push_back( make_pair( dz2 + dt2, k ) );
+    }
+  }
+  if (critical.size() == 0) return false;
+
+  std::stable_sort(critical.begin(), critical.end(), std::less<std::pair<double, unsigned int> >() );
+
+
+  for (unsigned int ik=0; ik < critical.size(); ik++){
+    unsigned int k = critical[ik].second;
+    double rho = y._pk[k]+y._pk[k+1];
+    double swE = y._swE[k]+y._swE[k+1]-y._pk[k]*y._pk[k+1] / rho*( std::pow(y._z[k+1]-y._z[k],2) + std::pow(y._t[k+1]-y._t[k],2) );
+    double Tc = 2*swE / (y._sw[k]+y._sw[k+1]);
+
+    if(Tc*beta < 1){
+      if(verbose_){ std::cout << "merging (" << y._z[k + 1] << ',' << y._t[k + 1] << ") and (" <<  y._z[k] << ',' << y._t[k] << ")  Tc = " << Tc <<  "  sw = "  << y._sw[k]+y._sw[k+1]  <<std::endl;}
+      if(rho > 0){
+	y._z[k] = (y._pk[k]*y._z[k] + y._pk[k+1]*y._z[k + 1])/rho;
+        y._t[k] = (y._pk[k]*y._t[k] + y._pk[k+1]*y._t[k + 1])/rho;
+      }else{
+	y._z[k] = 0.5 * (y._z[k] + y._z[k + 1]);
+        y._t[k] = 0.5 * (y._t[k] + y._t[k + 1]);
+      }
+      y._pk[k] = rho;
+      y._sw[k] += y._sw[k+1];
+      y._swE[k] = swE;
+      y.RemoveItem(k+1);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
+
+
+bool 
+DAClusterizerInZT_vect::purge(vertex_t & y, track_t & tks, double & rho0, const double beta) const {
+  // eliminate clusters with only one significant/unique track
+  const unsigned int nv = y.GetSize();
+  const unsigned int nt = tks.GetSize();
+
+  if (nv < 2)
+    return false;
+  
+  double sumpmin = nt;
+  unsigned int k0 = nv;
+  
+  for (unsigned int k = 0; k < nv; k++) {
+    
+    int nUnique = 0;
+    double sump = 0;
+
+    double pmax = y._pk[k] / (y._pk[k] + rho0 * local_exp(-beta * dzCutOff_* dzCutOff_));
+    for (unsigned int i = 0; i < nt; i++) {
+      if (tks._Z_sum[i] > 1.e-100) {
+	double p = y._pk[k] * local_exp(-beta * Eik(tks._z[i], y._z[k], tks._dz2[i], tks._t[i], y._t[k], tks._dt2[i])) / tks._Z_sum[i];
+	sump += p;
+	if ((p > uniquetrkweight_ * pmax) && (tks._pi[i] > 0)) {
+	  nUnique++;
+	}
+      }
+    }
+
+    if ((nUnique < 2) && (sump < sumpmin)) {
+      sumpmin = sump;
+      k0 = k;
+    }
+
+  }
+  
+  if (k0 != nv) {
+    if (verbose_) {
+      std::cout  << "eliminating prototype at " << std::setw(10) << std::setprecision(4) << y._z[k0] 
+		 << " with sump=" << sumpmin
+		 << "  rho*nt =" << y._pk[k0]*nt
+		 << endl;
+    }
+    y.RemoveItem(k0);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+
+
+
+double 
+DAClusterizerInZT_vect::beta0(double betamax, track_t const  & tks, vertex_t const & y) const {
+  
+  double T0 = 0; // max Tc for beta=0
+  // estimate critical temperature from beta=0 (T=inf)
+  const unsigned int nt = tks.GetSize();
+  const unsigned int nv = y.GetSize();
+  
+  for (unsigned int k = 0; k < nv; k++) {
+    
+    // vertex fit at T=inf
+    double sumwz = 0;
+    double sumwt = 0;
+    double sumw = 0;
+    for (unsigned int i = 0; i < nt; i++) {
+      double w = tks._pi[i] * tks._errsum[i];
+      sumwz += w * tks._z[i];
+      sumwt += w * tks._t[i];
+      sumw += w;
+    }
+    y._z[k] = sumwz / sumw;
+    y._t[k] = sumwt / sumw;
+    
+    // estimate Tcrit, eventually do this in the same loop
+    double a = 0, b = 0;
+    for (unsigned int i = 0; i < nt; i++) {
+      double dx = tks._z[i] - y._z[k];
+      double dt = tks._t[i] - y._t[k];
+      double w = tks._pi[i] * tks._errsum[i];
+      a += w * (std::pow(dx, 2) * tks._dz2[i] + std::pow(dt, 2) * tks._dt2[i]);
+      b += w;
+    }
+    double Tc = 2. * a / b; // the critical temperature of this vertex
+    if (Tc > T0) T0 = Tc;
+  }// vertex loop (normally there should be only one vertex at beta=0)
+  
+  if(verbose_){
+    std::cout << "DAClusterizerInZT_vect.beta0:   Tc = " << T0 << std::endl;
+    int coolingsteps =  1 - int(std::log(T0 * betamax) / std::log(coolingFactor_));
+    std::cout << "DAClusterizerInZT_vect.beta0:   nstep = " << coolingsteps << std::endl;
+  }
+
+
+  if (T0 > 1. / betamax) {
+    return betamax / std::pow(coolingFactor_, int(std::log(T0 * betamax) / std::log(coolingFactor_)) - 1);
+  } else {
+    // ensure at least one annealing step
+    return betamax * coolingFactor_;
+  }
+}
+
+
+  
+bool 
+DAClusterizerInZT_vect::split(const double beta,  track_t &tks, vertex_t & y, double threshold ) const{
+  // split only critical vertices (Tc >~ T=1/beta   <==>   beta*Tc>~1)
+  // an update must have been made just before doing this (same beta, no merging)
+  // returns true if at least one cluster was split
+  
+  constexpr double epsilonz=1e-3;      // minimum split size z
+  constexpr double epsilont=1e-3;      // minimum split size t
+  unsigned int nv = y.GetSize();
+  
+  // avoid left-right biases by splitting highest Tc first
+  
+  std::vector<std::pair<double, unsigned int> > critical;
+  for(unsigned int k=0; k<nv; k++){
+    double Tc= 2*y._swE[k]/y._sw[k];
+    if (beta*Tc > threshold){
+      critical.push_back( make_pair(Tc, k));
+    }
+  }
+  if (critical.size()==0) return false;
+
+
+  std::stable_sort(critical.begin(), critical.end(), std::greater<std::pair<double, unsigned int> >() );
+  
+  
+  bool split=false;
+  const unsigned int nt = tks.GetSize();
+
+  for(unsigned int ic=0; ic<critical.size(); ic++){
+    unsigned int k=critical[ic].second;
+
+    // estimate subcluster positions and weight
+    double p1=0, z1=0, t1=0, w1=0;
+    double p2=0, z2=0, t2=0, w2=0;
+    for(unsigned int i=0; i<nt; ++i){
+      if (tks._Z_sum[i] > 1.e-100) {
+	// winner-takes-all, usually overestimates splitting
+	double tl = tks._z[i] < y._z[k] ? 1.: 0.;
+	double tr = 1. - tl;
+
+	 // soften it, especially at low T
+	double arg = ( (tks._z[i] - y._z[k]) + (tks._t[i] - y._t[k]) ) * sqrt(beta * tks._errsum[i]);
+	if(std::fabs(arg) < 20){
+	  double t = local_exp(-arg);
+	  tl = t/(t+1.);
+	  tr = 1/(t+1.);
+	}
+
+	double p = y._pk[k] * tks._pi[i] * local_exp(-beta * Eik(tks._z[i], y._z[k], tks._dz2[i], 
+                                                                 tks._t[i], y._t[k], tks._dt2[i])) / tks._Z_sum[i];
+	double w = p*tks._errsum[i];
+	p1 += p*tl ; z1 += w*tl*tks._z[i]; t1 += w*tl*tks._t[i]; w1 += w*tl;
+	p2 += p*tr;  z2 += w*tr*tks._z[i]; t2 += w*tr*tks._t[i]; w2 += w*tr;
+      }
+    }
+
+    if(w1>0){z1 = z1/w1; t1 = t1/w1;} else {z1=y._z[k]-epsilonz; t1=y._t[k]-epsilont;}
+    if(w2>0){z2 = z2/w2; t2 = t2/w2;} else {z2=y._z[k]+epsilonz; t2=y._t[k]+epsilont;}
+
+    // reduce split size if there is not enough room
+    if( ( k   > 0 ) && ( z1 < (0.6*y._z[k] + 0.4*y._z[k-1])) ){ z1 = 0.6*y._z[k] + 0.4*y._z[k-1]; }
+    if( ( k+1 < nv) && ( z2 > (0.6*y._z[k] + 0.4*y._z[k+1])) ){ z2 = 0.6*y._z[k] + 0.4*y._z[k+1]; }
+
+    if(verbose_){
+      if (std::fabs(y._z[k] - zdumpcenter_) < zdumpwidth_){
+	std::cout << " T= " << std::setw(8) << 1./beta 
+		  << " Tc= " << critical[ic].first 
+		  << "    splitting " << std::fixed << std::setprecision(4) << y._z[k] 
+		  << " --> (" << z1 << ',' << t1<< "),(" << z2 << ',' << t2 
+		  << ")     [" << p1 << "," << p2 << "]" ;
+	if (std::fabs(z2-z1) > epsilonz || std::fabs(t2-t1) > epsilont){
+	  std::cout << std::endl;
+	}else{
+	  std::cout <<  "  rejected " << std::endl;
+	}
+      }
+    }
+
+    // split if the new subclusters are significantly separated
+    if( std::fabs(z2-z1) > epsilonz || std::fabs(t2-t1) > epsilont){
+      split = true;
+      double pk1 = p1*y._pk[k]/(p1+p2);
+      double pk2 = p2*y._pk[k]/(p1+p2);
+      y._z[k]  =  z2;
+      y._t[k]  =  t2;
+      y._pk[k] = pk2;
+      y.InsertItem(k, z1, t1, pk1);
+      nv++;
+
+      // adjust the indices of the remaining critical vertices
+      for(unsigned int jc=ic; jc < critical.size(); jc++){
+        if (critical[jc].second > k) {critical[jc].second++;}
+      }
+    }
+  }
+  return split;
+}
+
+
+
+void DAClusterizerInZT_vect::splitAll( vertex_t & y) const {
+
+  const unsigned int nv = y.GetSize();
+  
+  constexpr double epsilonz = 1e-3; // split all single vertices by 10 um
+  constexpr double epsilont = 1e-2; // split all single vertices by 10 ps
+  constexpr double zsep = 2 * epsilonz; // split vertices that are isolated by at least zsep (vertices that haven't collapsed)
+  constexpr double tsep = 2 * epsilont; 
+  vertex_t y1;
+  
+  if (verbose_) {
+    std::cout << "Before Split "<< std::endl;
+    y.DebugOut();
+  }
+
+  for (unsigned int k = 0; k < nv; k++) {
+    if (
+	( ( (k == 0)       	|| ( y._z[k - 1]	< (y._z[k] - zsep)) ) &&
+          ( ((k + 1) == nv)	|| ( y._z[k + 1] 	> (y._z[k] + zsep)) )    ) || 
+        ( ( (k == 0)       	|| ( y._t[k - 1]	< (y._t[k] - tsep)) ) &&
+          ( ((k + 1) == nv)	|| ( y._t[k + 1] 	> (y._t[k] + tsep)) )    )    )
+      {
+	// isolated prototype, split
+	double new_z = y.z[k] - epsilonz;
+        double new_t = y.t[k] - epsilont;
+	y.z[k] = y.z[k] + epsilonz;
+        y.t[k] = y.t[k] + epsilont;
+	
+	double new_pk = 0.5 * y.pk[k];
+	y.pk[k] = 0.5 * y.pk[k];
+	
+	y1.AddItem(new_z, new_t, new_pk);
+	y1.AddItem(y._z[k], y._t[k], y._pk[k]);	
+      }
+    else if ( (y1.GetSize() == 0 ) ||
+	      (y1._z[y1.GetSize() - 1] <  (y._z[k] - zsep)  ) ||
+              (y1._t[y1.GetSize() - 1] <  (y._t[k] - tsep)  ))
+      {
+	y1.AddItem(y._z[k], y._t[k], y._pk[k]);
+      }
+    else
+      {
+	y1._z[y1.GetSize() - 1] = y1._z[y1.GetSize() - 1] - epsilonz;
+        y1._t[y1.GetSize() - 1] = y1._t[y1.GetSize() - 1] - epsilont;
+	y._z[k] = y._z[k] + epsilonz;
+        y._t[k] = y._t[k] + epsilont;
+	y1.AddItem( y._z[k], y._t[k] , y._pk[k]);
+      }
+  }// vertex loop
+
+  y = y1;
+  y.ExtractRaw();
+  
+  if (verbose_) {
+    std::cout << "After split " << std::endl;
+    y.DebugOut();
+  }
+}
+
+
+
+vector<TransientVertex> 
+DAClusterizerInZT_vect::vertices(const vector<reco::TransientTrack> & tracks, const int verbosity) const {
+  track_t && tks = fill(tracks);
+  tks.ExtractRaw();
+  
+  unsigned int nt = tks.GetSize();
+  double rho0 = 0.0; // start with no outlier rejection
+  
+  vector<TransientVertex> clusters;
+  if (tks.GetSize() == 0) return clusters;
+  
+  vertex_t y; // the vertex prototypes
+  
+  // initialize: a single vertex at infinite temperature
+  y.AddItem( 0, 0, 1.0);
+  
+  int niter = 0; // number of iterations
+  
+  
+  // estimate first critical temperature
+  double beta = beta0(betamax_, tks, y);
+  if ( verbose_) std::cout << "Beta0 is " << beta << std::endl;
+  
+  niter = 0;
+  while ((update(beta, tks, y, false, rho0) > 1.e-6) && 
+	 (niter++ < maxIterations_)) {}
+
+  // annealing loop, stop when T<Tmin  (i.e. beta>1/Tmin)
+
+  double betafreeze = betamax_ * sqrt(coolingFactor_);
+
+  while (beta < betafreeze) {
+    if(useTc_){
+      update(beta, tks,y, false, rho0);
+      while(merge(y, beta)){update(beta, tks, y, false, rho0);}
+      split(beta, tks, y);
+      beta=beta/coolingFactor_;
+    }else{
+      beta=beta/coolingFactor_;
+      splitAll(y);
+    }
+    
+    // make sure we are not too far from equilibrium before cooling further
+    niter = 0;
+    while ((update(beta, tks, y, false, rho0) > 1.e-6) && 
+	   (niter++ < maxIterations_)) {}
+
+    if(verbose_){ dump( beta, y, tks, 0); }
+  }
+  
+
+  if(useTc_){
+    //last round of splitting, make sure no critical clusters are left
+    if(verbose_){ std::cout << "last splitting at " << 1./beta << std::endl; }
+    update(beta, tks,y, false, rho0);// make sure Tc is up-to-date
+    while(merge(y,beta)){update(beta, tks,y, false, rho0);}
+    unsigned int ntry=0;
+    double threshold = 1.0;
+    while( split(beta, tks, y, threshold) && (ntry++<10) ){
+      niter=0; 
+      while((update(beta, tks,y, false, rho0)>1.e-6)  && (niter++ < maxIterations_)){}
+      while(merge(y,beta)){update(beta, tks,y, false, rho0);}
+      if(verbose_){ 
+	std::cout << "after final splitting,  try " <<  ntry << std::endl; 
+	dump(beta, y, tks, 2); 
+      }
+      // relax splitting a bit to reduce multiple split-merge cycles of the same cluster
+      threshold *= 1.1; 
+    }
+  }else{
+    // merge collapsed clusters 
+    while(merge(y,beta)){update(beta, tks,y, false, rho0);}  
+  }
+  
+  if (verbose_) {
+    update(beta, tks,y, false, rho0);
+    std::cout  << "dump after 1st merging " << endl;
+    dump(beta, y, tks, 2);
+  }
+  
+  
+  // switch on outlier rejection at T=Tmin
+  if(dzCutOff_ > 0){
+    rho0 = 1./nt;
+    for(unsigned int a=0; a<10; a++){ update(beta, tks, y, true, a*rho0/10);} // adiabatic turn-on
+  }
+
+  niter=0;
+  while ((update(beta, tks, y, true, rho0) > 1.e-8) && (niter++ < maxIterations_)) {};
+  if (verbose_) {
+    std::cout  << "dump after noise-suppression, rho0=" << rho0  << "  niter = " << niter << endl;
+    dump(beta, y, tks, 2);
+  }
+
+  // merge again  (some cluster split by outliers collapse here)
+  while (merge(y, beta)) {update(beta, tks, y, true, rho0); }
+  if (verbose_) {
+    std::cout  << "dump after merging " << endl;
+    dump(beta, y, tks, 2);
+  }
+
+  // go down to the purging temperature (if it is lower than Tmin)
+  while( beta < betapurge_ ){
+    beta = min( beta/coolingFactor_, betapurge_);
+    niter = 0;
+    while ((update(beta, tks, y, false, rho0) > 1.e-8) && (niter++ < maxIterations_)) {}
+  }
+
+
+  // eliminate insignificant vertices; this is more restrictive at higher T
+  while (purge(y, tks, rho0, beta)) {
+    niter = 0;
+    while ((update(beta, tks, y, true, rho0) > 1.e-6) && (niter++ < maxIterations_)) {}
+  }
+
+  if (verbose_) {
+    update(beta, tks,y, true, rho0);
+    std::cout  << " after purging " << std::endl;
+    dump(beta, y, tks, 2);
+  }
+
+  // optionally cool some more without doing anything, to make the assignment harder
+  while( beta < betastop_ ){
+    beta = min( beta/coolingFactor_, betastop_);
+    niter =0;
+    while ((update(beta, tks, y, true, rho0) > 1.e-8) && (niter++ < maxIterations_)) {}
+  }
+
+
+  if (verbose_) {
+    std::cout  << "Final result, rho0=" << std::scientific << rho0 << endl;
+    dump(beta, y, tks, 2);
+  }
+
+  // select significant tracks and use a TransientVertex as a container
+  GlobalError dummyError(0.01, 0, 0.01, 0., 0., 0.01);
+  
+  // ensure correct normalization of probabilities; this should make double assignment essentially impossible
+  const unsigned int nv = y.GetSize();
+  for (unsigned int k = 0; k < nv; k++)
+     if ( edm::isNotFinite(y._pk[k]) || edm::isNotFinite(y._z[k]) ) { y._pk[k]=0; y._z[k]=0;}
+
+  for (unsigned int i = 0; i < nt; i++) // initialize
+    tks._Z_sum[i] = rho0 * local_exp(-beta * dzCutOff_ * dzCutOff_);
+
+  // loop order chosen to improve vectorization (no reduction required)
+  for (unsigned int k = 0; k < nv; k++) {
+     for (unsigned int i = 0; i < nt; i++)  
+       tks._Z_sum[i] += y._pk[k] * local_exp(-beta * Eik(tks._z[i], y._z[k],tks._dz2[i], tks._t[i], y._t[k],tks._dt2[i]));
+  }
+
+
+  for (unsigned int k = 0; k < nv; k++) {
+    GlobalPoint pos(0, 0, y._z[k]);
+    
+    vector<reco::TransientTrack> vertexTracks;
+    for (unsigned int i = 0; i < nt; i++) {
+      if (tks._Z_sum[i] > 1e-100) {
+	
+	double p = y._pk[k] * local_exp(-beta * Eik(tks._z[i], y._z[k], tks._dz2[i],
+                                                    tks._t[i], y._t[k], tks._dt2[i] )) / tks._Z_sum[i];
+	if ((tks._pi[i] > 0) && (p > mintrkweight_)) {
+	  vertexTracks.push_back(*(tks.tt[i]));
+	  tks._Z_sum[i] = 0; // setting Z=0 excludes double assignment
+	}
+      }
+    }
+    TransientVertex v(pos, dummyError, vertexTracks, 0);
+    clusters.push_back(v);
+  }
+
+  return clusters;
+
+}
+
+vector<vector<reco::TransientTrack> > DAClusterizerInZT_vect::clusterize(
+		const vector<reco::TransientTrack> & tracks) const {
+  
+  if (verbose_) {
+    std::cout  << "###################################################" << endl;
+    std::cout  << "# vectorized DAClusterizerInZT_vect::clusterize   nt=" << tracks.size() << endl;
+    std::cout  << "###################################################" << endl;
+  }
+  
+  vector<vector<reco::TransientTrack> > clusters;
+  vector<TransientVertex> && pv = vertices(tracks);
+  
+  if (verbose_) {
+    std::cout  << "# DAClusterizerInZT::clusterize   pv.size=" << pv.size()
+					     << endl;
+  }
+  if (pv.size() == 0) {
+    return clusters;
+  }
+  
+  // fill into clusters and merge
+  vector<reco::TransientTrack> aCluster = pv.begin()->originalTracks();
+  
+  for (auto k = pv.begin() + 1; k != pv.end(); k++) {
+    if ( std::abs(k->position().z() - (k - 1)->position().z()) > (2 * vertexSize_)) {
+      // close a cluster
+      if (aCluster.size()>1){
+	clusters.push_back(aCluster);
+      }else{
+	if(verbose_){
+	  std::cout << " one track cluster at " << k->position().z() << "  suppressed" << std::endl;
+	}
+      }
+      aCluster.clear();
+    }
+    for (unsigned int i = 0; i < k->originalTracks().size(); i++) {
+      aCluster.push_back(k->originalTracks()[i]);
+    }
+    
+  }
+  clusters.emplace_back(std::move(aCluster));
+  
+  return clusters;
+
+}
+
+
+
+void DAClusterizerInZT_vect::dump(const double beta, const vertex_t & y,
+		const track_t & tks, int verbosity) const {
+
+	const unsigned int nv = y.GetSize();
+	const unsigned int nt = tks.GetSize();
+	
+	std::vector< unsigned int > iz;
+	for(unsigned int j=0; j<nt; j++){ iz.push_back(j); }
+	std::sort(iz.begin(), iz.end(), [tks](unsigned int a, unsigned int b){ return tks._z[a]<tks._z[b];} ); 
+	std::cout  << std::endl;
+	std::cout  << "-----DAClusterizerInZT::dump ----" <<  nv << "  clusters " << std::endl;
+	std::cout  << "                                                                z= ";
+	std::cout  << setprecision(4);
+	for (unsigned int ivertex = 0; ivertex < nv; ++ ivertex) {
+	  if (std::fabs(y._z[ivertex]-zdumpcenter_) < zdumpwidth_){
+		std::cout  << setw(8) << fixed << y._z[ivertex];
+	  }
+	}
+	std::cout  << endl << "T=" << setw(15) << 1. / beta
+		   << " Tmin =" << setw(10) << 1./betamax_
+			<< "                             Tc= ";
+	for (unsigned int ivertex = 0; ivertex < nv; ++ ivertex) {
+	  if (std::fabs(y._z[ivertex]-zdumpcenter_) < zdumpwidth_){
+	    double Tc = 2*y._swE[ivertex]/y._sw[ivertex];
+	    std::cout  << setw(8) << fixed << setprecision(1) <<  Tc;
+	  }
+	}
+	std::cout  << endl;
+
+	std::cout  <<  "                                                               pk= ";
+	double sumpk = 0;
+	for (unsigned int ivertex = 0; ivertex < nv; ++ ivertex) {
+		sumpk += y._pk[ivertex];
+		if  (std::fabs(y._z[ivertex] - zdumpcenter_) > zdumpwidth_) continue;
+		std::cout  << setw(8) << setprecision(4) << fixed << y._pk[ivertex];
+	}
+	std::cout  << endl;
+
+	std::cout << "                                                               nt= ";
+	for (unsigned int ivertex = 0; ivertex < nv; ++ ivertex) {
+		sumpk += y._pk[ivertex];
+		if  (std::fabs(y._z[ivertex] - zdumpcenter_) > zdumpwidth_) continue;
+		std::cout  << setw(8) << setprecision(1) << fixed << y._pk[ivertex]*nt;
+	}
+	std::cout  << endl;
+
+	if (verbosity > 0) {
+		double E = 0, F = 0;
+		std::cout  << endl;
+		std::cout 
+		<< "----        z +/- dz                ip +/-dip       pt    phi  eta    weights  ----"
+		<< endl;
+		std::cout  << setprecision(4);
+		for (unsigned int i0 = 0; i0 < nt; i0++) {
+		  unsigned int i = iz[i0];
+			if (tks._Z_sum[i] > 0) {
+				F -= std::log(tks._Z_sum[i]) / beta;
+			}
+			double tz = tks._z[i];
+
+			if( std::fabs(tz - zdumpcenter_) > zdumpwidth_) continue;
+			std::cout  << setw(4) << i << ")" << setw(8) << fixed << setprecision(4)
+			 << tz << " +/-" << setw(6) << sqrt(1./tks._dz2[i]);
+
+			if (tks.tt[i]->track().quality(reco::TrackBase::highPurity)) {
+				std::cout  << " *";
+			} else {
+				std::cout  << "  ";
+			}
+			if (tks.tt[i]->track().hitPattern().hasValidHitInPixelLayer(PixelSubdetector::SubDetector::PixelBarrel, 1)) {
+				std::cout  << "+";
+			} else {
+				std::cout  << "-";
+			}
+			std::cout  << setw(1)
+			 << tks.tt[i]->track().hitPattern().pixelBarrelLayersWithMeasurement(); // see DataFormats/TrackReco/interface/HitPattern.h
+			std::cout  << setw(1)
+			 << tks.tt[i]->track().hitPattern().pixelEndcapLayersWithMeasurement();
+			std::cout  << setw(1) << hex
+					<< tks.tt[i]->track().hitPattern().trackerLayersWithMeasurement()
+					- tks.tt[i]->track().hitPattern().pixelLayersWithMeasurement()
+					<< dec;
+			std::cout  << "=" << setw(1) << hex
+					<< tks.tt[i]->track().hitPattern().numberOfHits(reco::HitPattern::MISSING_OUTER_HITS)
+					<< dec;
+
+			Measurement1D IP =
+					tks.tt[i]->stateAtBeamLine().transverseImpactParameter();
+			std::cout  << setw(8) << IP.value() << "+/-" << setw(6) << IP.error();
+			std::cout  << " " << setw(6) << setprecision(2)
+			 << tks.tt[i]->track().pt() * tks.tt[i]->track().charge();
+			std::cout  << " " << setw(5) << setprecision(2)
+			 << tks.tt[i]->track().phi() << " " << setw(5)
+			 << setprecision(2) << tks.tt[i]->track().eta();
+
+			double sump = 0.;
+			for (unsigned int ivertex = 0; ivertex < nv; ++ ivertex) {
+			  if  (std::fabs(y._z[ivertex]-zdumpcenter_) > zdumpwidth_) continue;
+
+				if ((tks._pi[i] > 0) && (tks._Z_sum[i] > 0)) {
+					//double p=pik(beta,tks[i],*k);
+                                  double p = y._pk[ivertex]  * exp(-beta * Eik(tks._z[i], y._z[ivertex], tks._dz2[i],
+                                                                               tks._t[i], y._t[ivertex], tks._dt2[i] )) / tks._Z_sum[i];
+					if (p > 0.0001) {
+						std::cout  << setw(8) << setprecision(3) << p;
+					} else {
+						std::cout  << "    .   ";
+					}
+					E += p * Eik(tks._z[i], y._z[ivertex], tks._dz2[i],
+                                                     tks._t[i], y._t[ivertex], tks._dt2[i] );
+					sump += p;
+				} else {
+					std::cout  << "        ";
+				}
+			}
+			std::cout  << endl;
+		}
+		std::cout  << endl << "T=" << 1 / beta << " E=" << E << " n=" << y.GetSize()
+			 << "  F= " << F << endl << "----------" << endl;
+	}
+}
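
As a cross-check of the annealing kernels above, here is a minimal NumPy sketch (not part of the patch) of what a single DAClusterizerInZT_vect::update() pass computes, ignoring the rho0 outlier term, the Z_i > 1e-100 guard and the sw == 0 corner case. The arrays wz and wt stand for the inverse squared errors stored in the dz2/dt2 fields; all names are illustrative.

    import numpy as np

    def da_update(beta, z, t, wz, wt, pi, vz, vt, pk):
        # E_ik = (z_i - z_k)^2 / sigma_z^2 + (t_i - t_k)^2 / sigma_t^2   (cf. Eik)
        E = (z[:, None] - vz[None, :])**2 * wz[:, None] \
          + (t[:, None] - vt[None, :])**2 * wt[:, None]
        g = pk[None, :] * np.exp(-beta * E)        # cf. kernel_calc_exp_arg + local_exp_list
        Z = g.sum(axis=1)                          # per-track partition function Z_i (kernel_add_Z)
        p = pi[:, None] * g / Z[:, None]           # soft track-to-vertex assignment probabilities
        errsum = 1.0 / (1.0 / wz + 1.0 / wt)       # combined per-track weight (track_t::errsum)
        w = p * errsum[:, None]                    # cf. kernel_calc_normalization
        vz_new = (w * z[:, None]).sum(axis=0) / w.sum(axis=0)  # swz / sw
        vt_new = (w * t[:, None]).sum(axis=0) / w.sum(axis=0)  # swt / sw
        pk_new = p.sum(axis=0) / pi.sum()                       # mass-constrained pk update
        delta = ((vz - vz_new)**2 + (vt - vt_new)**2).sum()     # squared prototype movement
        return vz_new, vt_new, pk_new, delta
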