From 4a753a65d505f3f9e4c5c428dd2c03fd3433f830 Mon Sep 17 00:00:00 2001
From: Steve Lantz <steve.lantz@cornell.edu>
Date: Wed, 28 Oct 2020 18:06:35 -0700
Subject: [PATCH 1/3] remove all deprecated GPU code

---
 BinInfoUtils.h                     |   4 -
 Config.cc                          |   5 +-
 Config.h                           |  14 +-
 Hit.h                              |   6 -
 Makefile                           |   2 +-
 Makefile.config                    |  33 --
 Matrix.h                           |   2 -
 README.md                          |   3 +-
 Track.h                            |  46 +--
 gpu_scripts/gpu_setup.py           |  76 -----
 gpu_scripts/runBenchmark_cuda.py   |  36 --
 gpu_scripts/run_cleanup_only.py    |  26 --
 gpu_scripts/run_setup_only.py      |  29 --
 mkFit/BuilderCU.cu                 | 111 -------
 mkFit/BuilderCU.h                  |  55 ----
 mkFit/FitterCU-imp.h               | 446 -------------------------
 mkFit/FitterCU.h                   | 149 ---------
 mkFit/GPlex.h                      | 149 ---------
 mkFit/GeometryCU.h                 |  30 --
 mkFit/HitStructuresCU.cu           | 394 ----------------------
 mkFit/HitStructuresCU.h            | 211 ------------
 mkFit/KalmanUtilsMPlex.h           |   4 -
 mkFit/KalmanUtilsMPlex.icc         |  23 --
 mkFit/Makefile                     |  35 +-
 mkFit/MkBuilder.h                  |   4 -
 mkFit/MkFitter.cc                  |  52 +--
 mkFit/MkFitter.h                   |   5 -
 mkFit/PropagationMPlex.icc         |   3 -
 mkFit/accessors_cu.h               |  40 ---
 mkFit/array_algorithms_cu.h        |  86 -----
 mkFit/atomic_utils.h               | 102 ------
 mkFit/best_hit_kernels.cu          | 298 -----------------
 mkFit/best_hit_kernels.h           |  51 ---
 mkFit/build_tracks_kernels.cu      | 203 ------------
 mkFit/build_tracks_kernels.h       |  35 --
 mkFit/buildtestMPlex.cc            | 170 ----------
 mkFit/buildtestMPlex.h             |  11 -
 mkFit/check_gpu_hit_structures.cu  | 144 --------
 mkFit/check_gpu_hit_structures.h   |  12 -
 mkFit/clone_engine_cuda_helpers.cu |  46 ---
 mkFit/clone_engine_cuda_helpers.h  |  69 ----
 mkFit/clone_engine_kernels.cu      | 258 ---------------
 mkFit/clone_engine_kernels.h       |  26 --
 mkFit/clone_engine_kernels_seed.cu | 182 ----------
 mkFit/clone_engine_kernels_seed.h  |  26 --
 mkFit/computeChi2_kernels.cu       | 144 --------
 mkFit/computeChi2_kernels.h        |  35 --
 mkFit/device_array_view.h          |  46 ---
 mkFit/device_bundle.h              |  88 -----
 mkFit/device_vector.h              | 118 -------
 mkFit/fittestMPlex.cc              | 106 ------
 mkFit/fittestMPlex.h               |   9 -
 mkFit/fittracks_kernels.cu         |  51 ---
 mkFit/fittracks_kernels.h          |  17 -
 mkFit/gpu_utils.cu                 |   9 -
 mkFit/gpu_utils.h                  |  63 ----
 mkFit/index_selection_kernels.cu   | 153 ---------
 mkFit/index_selection_kernels.h    |  15 -
 mkFit/kalmanUpdater_kernels.cu     | 402 -----------------------
 mkFit/kalmanUpdater_kernels.h      |  38 ---
 mkFit/mkFit.cc                     | 105 +-----
 mkFit/propagation_kernels.cu       | 316 ------------------
 mkFit/propagation_kernels.h        |  36 --
 mkFit/reorganize_gplex.cu          | 510 -----------------------------
 mkFit/reorganize_gplex.h           | 126 -------
 65 files changed, 26 insertions(+), 6073 deletions(-)
 delete mode 100644 gpu_scripts/gpu_setup.py
 delete mode 100755 gpu_scripts/runBenchmark_cuda.py
 delete mode 100755 gpu_scripts/run_cleanup_only.py
 delete mode 100755 gpu_scripts/run_setup_only.py
 delete mode 100644 mkFit/BuilderCU.cu
 delete mode 100644 mkFit/BuilderCU.h
 delete mode 100644 mkFit/FitterCU-imp.h
 delete mode 100644 mkFit/FitterCU.h
 delete mode 100644 mkFit/GPlex.h
 delete mode 100644 mkFit/GeometryCU.h
 delete mode 100644 mkFit/HitStructuresCU.cu
 delete mode 100644 mkFit/HitStructuresCU.h
 delete mode 100644 mkFit/accessors_cu.h
 delete mode 100644 mkFit/array_algorithms_cu.h
 delete mode 100644 mkFit/atomic_utils.h
 delete mode 100644 mkFit/best_hit_kernels.cu
 delete mode 100644 mkFit/best_hit_kernels.h
 delete mode 100644 mkFit/build_tracks_kernels.cu
 delete mode 100644 mkFit/build_tracks_kernels.h
 delete mode 100644 mkFit/check_gpu_hit_structures.cu
 delete mode 100644 mkFit/check_gpu_hit_structures.h
 delete mode 100644 mkFit/clone_engine_cuda_helpers.cu
 delete mode 100644 mkFit/clone_engine_cuda_helpers.h
 delete mode 100644 mkFit/clone_engine_kernels.cu
 delete mode 100644 mkFit/clone_engine_kernels.h
 delete mode 100644 mkFit/clone_engine_kernels_seed.cu
 delete mode 100644 mkFit/clone_engine_kernels_seed.h
 delete mode 100644 mkFit/computeChi2_kernels.cu
 delete mode 100644 mkFit/computeChi2_kernels.h
 delete mode 100644 mkFit/device_array_view.h
 delete mode 100644 mkFit/device_bundle.h
 delete mode 100644 mkFit/device_vector.h
 delete mode 100644 mkFit/fittracks_kernels.cu
 delete mode 100644 mkFit/fittracks_kernels.h
 delete mode 100644 mkFit/gpu_utils.cu
 delete mode 100644 mkFit/gpu_utils.h
 delete mode 100644 mkFit/index_selection_kernels.cu
 delete mode 100644 mkFit/index_selection_kernels.h
 delete mode 100644 mkFit/kalmanUpdater_kernels.cu
 delete mode 100644 mkFit/kalmanUpdater_kernels.h
 delete mode 100644 mkFit/propagation_kernels.cu
 delete mode 100644 mkFit/propagation_kernels.h
 delete mode 100644 mkFit/reorganize_gplex.cu
 delete mode 100644 mkFit/reorganize_gplex.h

diff --git a/BinInfoUtils.h b/BinInfoUtils.h
index d5eddad2..bd6f6fc6 100644
--- a/BinInfoUtils.h
+++ b/BinInfoUtils.h
@@ -17,21 +17,18 @@ typedef std::pair<int, int>                 BinInfo;
 typedef std::vector<std::vector<BinInfo>>   BinInfoLayerMap;
 typedef std::vector<BinInfoLayerMap>        BinInfoMap;
 
-CUDA_CALLABLE
 inline float downPhi(float phi)
 {
   while (phi >= Config::PI) {phi-=Config::TwoPI;}
   return phi;
 }
 	
-CUDA_CALLABLE
 inline float upPhi(float phi)
 {
   while (phi <= -Config::PI) {phi+=Config::TwoPI;}
   return phi;
 }
 
-CUDA_CALLABLE
 inline float normalizedPhi(float phi)
 {
   //  return std::fmod(phi, (float) Config::PI); // return phi +pi out of phase for |phi| beyond boundary! 
@@ -39,7 +36,6 @@ inline float normalizedPhi(float phi)
   return phi;
 }
 
-CUDA_CALLABLE
 inline int getPhiPartition(float phi)
 {
   //assume phi is between -PI and PI
diff --git a/Config.cc b/Config.cc
index 9ac789b4..93c697d5 100644
--- a/Config.cc
+++ b/Config.cc
@@ -24,11 +24,8 @@ namespace Config
 
   // Multi threading and Clone engine configuration
   int   numThreadsFinder = 1;
-  
-  // GPU computations
   int   numThreadsEvents = 1;
-  int   numThreadsReorg = 1;
-
+  
 #if defined(__MIC__) || defined(__AVX512F__)
   int   numThreadsSimulation = 60;
 #else
diff --git a/Config.h b/Config.h
index e84ca618..cc0702e0 100644
--- a/Config.h
+++ b/Config.h
@@ -6,12 +6,6 @@
 #include <string> // won't compile on clang gcc for mac OS w/o this!
 #include <map>
 
-#if defined(__CUDACC__)
-  #define CUDA_CALLABLE __host__ __device__
-#else
-  #define CUDA_CALLABLE 
-#endif
-
 namespace mkfit {
 
 // Cram this in here for now ...
@@ -259,7 +253,7 @@ namespace Config
   // Config for Hit and BinInfoUtils
   constexpr int   nPhiPart   = 1260;
   constexpr float fPhiFactor = nPhiPart / TwoPI;
-  constexpr int   nEtaPart   = 11;  // 1 is better for GPU best_hit
+  constexpr int   nEtaPart   = 11;
   constexpr int   nEtaBin    = 2 * nEtaPart - 1;
 
   constexpr float        fEtaFull  = 2 * Config::fEtaDet;
@@ -363,10 +357,7 @@ namespace Config
   // Threading
   extern int    numThreadsFinder;
   extern int    numThreadsSimulation;
-
-  // For GPU computations
   extern int    numThreadsEvents;
-  extern int    numThreadsReorg;
 
   extern int    finderReportBestOutOfN;
 
@@ -409,7 +400,6 @@ namespace Config
 
   void RecalculateDependentConstants();
   
-  CUDA_CALLABLE
   inline float BfieldFromZR(const float z, const float r)
   {
     return (Config::mag_b0*z*z + Config::mag_b1*z + Config::mag_c1)*(Config::mag_a*r*r + 1.f);
@@ -420,8 +410,6 @@ namespace Config
   #ifndef MPT_SIZE
     #if defined(__MIC__) || defined(__AVX512F__)
       #define MPT_SIZE 16
-    #elif defined USE_CUDA
-      #define MPT_SIZE 8
     #elif defined(__AVX__) || defined(__AVX2__)
       #define MPT_SIZE 8
     #else
diff --git a/Hit.h b/Hit.h
index c9a077b1..56fb7c1f 100644
--- a/Hit.h
+++ b/Hit.h
@@ -61,13 +61,11 @@ inline float getInvRad2(float x, float y){
   return 1.0f/(x*x + y*y);
 }
 
-CUDA_CALLABLE
 inline float getPhi(float x, float y)
 {
   return std::atan2(y,x); 
 }
 
-CUDA_CALLABLE
 inline float getTheta(float r, float z){
   return std::atan2(r,z);
 }
@@ -201,10 +199,6 @@ class Hit
 
   const float* posArray() const {return state_.pos_.Array();}
   const float* errArray() const {return state_.err_.Array();}
-#if __CUDACC__
-  __device__ float* posArrayCU();
-  __device__ float* errArrayCU();
-#endif
 
   // Non-const versions needed for CopyOut of Matriplex.
   SVector3&     parameters_nc() {return state_.pos_;}
diff --git a/Makefile b/Makefile
index d16bbbc6..b0b49084 100644
--- a/Makefile
+++ b/Makefile
@@ -57,7 +57,7 @@ distclean: clean-local
 
 ${LIB_CORE}: ${CORE_OBJS}
 	@mkdir -p $(@D)
-	${CXX} ${CXXFLAGS} ${VEC_HOST} ${CORE_OBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS_CU} ${LDFLAGS}
+	${CXX} ${CXXFLAGS} ${VEC_HOST} ${CORE_OBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS}
 
 main: ${AUTO_TGTS} ${LIB_CORE} main.o
 	${CXX} ${CXXFLAGS} ${VEC_HOST} -o $@ main.o ${LDFLAGS_HOST} ${LDFLAGS} -Llib -lMicCore -Wl,-rpath,lib
diff --git a/Makefile.config b/Makefile.config
index 886bd61b..4dac05f5 100644
--- a/Makefile.config
+++ b/Makefile.config
@@ -47,25 +47,6 @@ else ifdef OSXMPCLANG
   TBB_PREFIX := /opt/local
 endif
 
-# 2.1 Use nvcc to compile cuda code
-# Using the CUB library for standard GPU algorithm http://nvlabs.github.io/cub/
-# It's header only and potentially exported by the environment
-# Maybe it is good enough to have:
-# CUBROOT?=Undefined
-# CUDAINCDIR and CUDALIBDIR also need to be defined
-ifneq (,$(realpath /home/ml15/tools/cub))
-	CUBROOT?=/home/ml15/tools/cub
-else ifneq (,$(realpath /nfs/opt/cuda-8-0/include))
-	CUBROOT?=/nfs/opt/cuda-8-0/include
-else ifneq (,$(realpath /usr/local/cuda/include))
-	CUBROOT?=/usr/local/cuda/include
-endif
-NV := nvcc -prec-sqrt=true -I${CUBROOT} 
-#-g -G -lineinfo
-# Comment out to compile for CPU
-#USE_CUDA := 1
-# For CUDA: Also need to change maxCandsPerSeed to 8 and nEtaPart to 1
-
 # 3. Optimization
 # -O3 implies vectorization and simd (but not AVX)
 OPT := -g -O3
@@ -143,20 +124,6 @@ CXXFLAGS := -fPIC ${OPT} ${OSX_CXXFLAGS}
 LDFLAGS_HOST := 
 LDFLAGS_MIC  := -static-intel
 
-ifdef USE_CUDA
-	CPPFLAGS += -DUSE_CUDA -I${CUBROOT} -I${CUDAINCDIR} #-g -G -lineinfo
-	LDFLAGS_HOST += -L${CUDALIBDIR}
-	ifeq ($(CXX),icpc)
-	  CXXFLAGS += -qopenmp-simd
-	  LDFLAGS  += -qopenmp-simd
-	else
-	  CXXFLAGS += -fopenmp-simd
-	  LDFLAGS  += -fopenmp-simd
-	endif
-endif
-#CXXFLAGS += -qopenmp
-#LDFLAGS += -qopenmp
-
 CPPFLAGS += ${USE_STATE_VALIDITY_CHECKS} ${USE_SCATTERING} ${USE_LINEAR_INTERPOLATION} ${ENDTOEND} ${INWARD_FIT} 
 
 ifdef USE_VTUNE_NOTIFY
diff --git a/Matrix.h b/Matrix.h
index 5aecfd9f..baef4568 100644
--- a/Matrix.h
+++ b/Matrix.h
@@ -81,13 +81,11 @@ inline double dtime()
     return( tseconds );
 }
 
-CUDA_CALLABLE
 inline float hipo(float x, float y)
 {
   return std::sqrt(x*x + y*y);
 }
 
-CUDA_CALLABLE
 inline void sincos4(const float x, float& sin, float& cos)
 {
    // Had this writen with explicit division by factorial.
diff --git a/README.md b/README.md
index 07630a97..47e6affb 100644
--- a/README.md
+++ b/README.md
@@ -52,7 +52,6 @@
 - **phi3.t2.ucsd.edu**: [Intel Xeon Gold 6130 Processor](https://ark.intel.com/products/120492/Intel-Xeon-Gold-6130-Processor-22M-Cache-2_10-GHz) _Skylake Scalable Performance_ (referred to as SKL-Au, SKL-SP, phi3)
 - **lnx4108.classe.cornell.edu**: [Intel Xeon Silver 4116 Processor](https://ark.intel.com/products/120481/Intel-Xeon-Silver-4116-Processor-16_5M-Cache-2_10-GHz) _Skylake Scalable Performance_ (referred to as SKL-Ag, SKL-SP, lnx4108, LNX-S)
 - **lnx7188.classe.cornell.edu**: [Intel Xeon Gold 6142 Processor](https://ark.intel.com/content/www/us/en/ark/products/120487/intel-xeon-gold-6142-processor-22m-cache-2-60-ghz.html) _Skylake Scalable Performance_ (referred to as lnx7188,LNX-G)
-- **GPUs**: to be filled out
 
 phi1, phi2, and phi3 are all managed across a virtual login server and therefore the home user spaces are shared. phi1, phi2, phi3, lnx7188, and lnx4108 also have /cvmfs mounted so you can source the environment needed to run the code.
 
@@ -420,7 +419,7 @@ Described in validation manifesto. See Section 8 for more info on manifesto.
 ### TO DO
 
 - flesh out sections as needed
-- GPU specific code
+- GPU specific code?
 
 ### Vestigial code
 
diff --git a/Track.h b/Track.h
index 5278960d..d5463a0e 100644
--- a/Track.h
+++ b/Track.h
@@ -71,7 +71,6 @@ typedef std::vector<ReducedTrack> RedTrackVec;
 struct TrackState //  possible to add same accessors as track? 
 {
 public:
-  CUDA_CALLABLE
   TrackState() : valid(true) {}
   TrackState(int charge, const SVector3& pos, const SVector3& mom, const SMatrixSym66& err) :
     parameters(SVector6(pos.At(0),pos.At(1),pos.At(2),mom.At(0),mom.At(1),mom.At(2))),
@@ -152,17 +151,13 @@ class TrackBase
   ~TrackBase() {}
 
   const TrackState&  state() const { return state_; }
-  CUDA_CALLABLE void setState(const TrackState& newState) { state_ = newState; }
+  void setState(const TrackState& newState) { state_ = newState; }
 
   const SVector6&     parameters() const {return state_.parameters;}
   const SMatrixSym66& errors()     const {return state_.errors;}
 
   const float* posArray() const {return state_.parameters.Array();}
   const float* errArray() const {return state_.errors.Array();}
-#if __CUDACC__
-  __device__ float* posArrayCU();
-  __device__ float* errArrayCU();
-#endif
 
   // Non-const versions needed for CopyOut of Matriplex.
   SVector6&     parameters_nc() {return state_.parameters;}
@@ -196,15 +191,15 @@ class TrackBase
 
   // ------------------------------------------------------------------------
 
-  CUDA_CALLABLE int   charge() const { return state_.charge; }
-  CUDA_CALLABLE float chi2()   const { return chi2_; }
-  CUDA_CALLABLE float score()  const { return score_; }
-  CUDA_CALLABLE int   label()  const { return label_; }
+  int   charge() const { return state_.charge; }
+  float chi2()   const { return chi2_; }
+  float score()  const { return score_; }
+  int   label()  const { return label_; }
 
-  CUDA_CALLABLE void  setCharge(int chg)  { state_.charge = chg; }
-  CUDA_CALLABLE void  setChi2(float chi2) { chi2_ = chi2; }
-  CUDA_CALLABLE void  setScore(float s)   { score_ = s; }
-  CUDA_CALLABLE void  setLabel(int lbl)   { label_ = lbl; }
+  void  setCharge(int chg)  { state_.charge = chg; }
+  void  setChi2(float chi2) { chi2_ = chi2; }
+  void  setScore(float s)   { score_ = s; }
+  void  setLabel(int lbl)   { label_ = lbl; }
 
   bool  hasSillyValues(bool dump, bool fix, const char* pref="");
 
@@ -358,7 +353,6 @@ class TrackBase
 class Track : public TrackBase
 {
 public:
-  CUDA_CALLABLE
   Track() {}
 
   explicit Track(const TrackBase& base) :
@@ -369,7 +363,6 @@ class Track : public TrackBase
     nFoundHits_ =  0;
   }
 
-  CUDA_CALLABLE
   Track(const TrackState& state, float chi2, int label, int nHits, const HitOnTrack* hits) :
     TrackBase(state, chi2, label)
   {
@@ -390,7 +383,6 @@ class Track : public TrackBase
     hitsOnTrk_ (t.hitsOnTrk_)
   {}
 
-  CUDA_CALLABLE
   ~Track(){}
 
   // used for swimming cmssw rec tracks to mkFit position
@@ -434,7 +426,6 @@ class Track : public TrackBase
   // out of TrackBase. If we do it.
   void reserveHits(int nHits) { hitsOnTrk_.reserve(nHits); }
 
-  CUDA_CALLABLE
   void resetHits() { lastHitIdx_ = -1; nFoundHits_ =  0; hitsOnTrk_.clear(); }
 
   // Hackish for MkFinder::copy_out ... to be reviewed
@@ -443,7 +434,6 @@ class Track : public TrackBase
 
   void resizeHitsForInput();
 
-  CUDA_CALLABLE
   void addHitIdx(int hitIdx, int hitLyr, float chi2)
   {
     hitsOnTrk_.push_back( { hitIdx, hitLyr } );
@@ -462,12 +452,12 @@ class Track : public TrackBase
 
   HitOnTrack getHitOnTrack(int posHitIdx) const { return hitsOnTrk_[posHitIdx]; }
 
-  CUDA_CALLABLE int getHitIdx(int posHitIdx) const { return hitsOnTrk_[posHitIdx].index; }
-  CUDA_CALLABLE int getHitLyr(int posHitIdx) const { return hitsOnTrk_[posHitIdx].layer; }
+  int getHitIdx(int posHitIdx) const { return hitsOnTrk_[posHitIdx].index; }
+  int getHitLyr(int posHitIdx) const { return hitsOnTrk_[posHitIdx].layer; }
 
-  CUDA_CALLABLE HitOnTrack getLastHitOnTrack() const { return hitsOnTrk_[lastHitIdx_]; }
-  CUDA_CALLABLE int        getLastHitIdx()     const { return hitsOnTrk_[lastHitIdx_].index;  }
-  CUDA_CALLABLE int        getLastHitLyr()     const { return hitsOnTrk_[lastHitIdx_].layer;  }
+  HitOnTrack getLastHitOnTrack() const { return hitsOnTrk_[lastHitIdx_]; }
+  int        getLastHitIdx()     const { return hitsOnTrk_[lastHitIdx_].index;  }
+  int        getLastHitLyr()     const { return hitsOnTrk_[lastHitIdx_].layer;  }
 
   int getLastFoundHitPos() const
   {
@@ -506,12 +496,10 @@ class Track : public TrackBase
 
   HitOnTrack* BeginHitsOnTrack_nc() { return hitsOnTrk_.data(); }
 
-  CUDA_CALLABLE
   void setHitIdx(int posHitIdx, int newIdx) {
     hitsOnTrk_[posHitIdx].index = newIdx;
   }
 
-  CUDA_CALLABLE
   void setHitIdxLyr(int posHitIdx, int newIdx, int newLyr) {
     hitsOnTrk_[posHitIdx] = { newIdx, newLyr };
   }
@@ -523,8 +511,8 @@ class Track : public TrackBase
     }
   }
 
-  CUDA_CALLABLE int  nFoundHits() const { return nFoundHits_; }
-  CUDA_CALLABLE int  nTotalHits() const { return lastHitIdx_ + 1; }
+  int  nFoundHits() const { return nFoundHits_; }
+  int  nTotalHits() const { return lastHitIdx_ + 1; }
 
   int nInsideMinusOneHits() const
   {
@@ -578,7 +566,7 @@ class Track : public TrackBase
     return layers;
   }
 
-  CUDA_CALLABLE Track clone() const { return Track(*this); }
+  Track clone() const { return Track(*this); }
 
 
 private:
diff --git a/gpu_scripts/gpu_setup.py b/gpu_scripts/gpu_setup.py
deleted file mode 100644
index 0113dc7a..00000000
--- a/gpu_scripts/gpu_setup.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import os
-import shutil
-import subprocess
-
-
-def backup_original_files(files):
-    for f in files:
-        shutil.copy(f, f + ".bck")
-
-
-def replace_in_file(file_name, regexes, suffix=''):
-    for regex in regexes:
-        subprocess.check_call(['sed', '-i' + suffix, regex, file_name])
-
-
-def restore_file(file_name, suffix='.bck'):
-    backup = file_name + '.bck'
-    if (os.path.exists(backup)):
-        shutil.move(backup, file_name)
-
-
-def check_env():
-    if not 'CUBROOT' in os.environ:
-        raise EnvironmentError(
-              "CUB (https://github.com/NVlabs/cub) should be downloaded and "
-              "the environment variable 'CUBROOT' set to where it lies. "
-              "No installation required: it's header only.")
-    try:
-        subprocess.check_output(["nvcc", "--version"])
-    except:
-       raise EnvironmentError("nvcc should be available on the system PATH")
-
-
-def setup_files(makefile, config_h):
-    replace_in_file(makefile,
-                    [r's/^\s*KNC_BUILD\s*/# &/',
-                     r's/^\s*ICC.*/ICC := icpc/',
-                     r's/^#\(USE_CUDA := yes\)/\1/'])
-            
-    replace_in_file(config_h,
-                    [r's/\(\s*constexpr\s*int\s*nEtaPart\s*= \).*\(\s*;.*\)/\1 1\2/',
-                     r's/\(\s*constexpr\s*int\s*maxCandsPerSeed\s*= \).*\(\s*;\s*\/\/.*\)/\1 8\2/'])
-
-
-def cleanup(makefile, config_h):
-    restore_file(makefile)
-    restore_file(config_h)
-
-
-def make_distclean(rootdir):
-    saved_path = os.path.abspath(os.path.curdir)
-    os.chdir(rootdir)
-    subprocess.check_call(["make", "-f", "Makefile", "distclean"])
-    os.chdir(saved_path)
-
-
-def make_all(rootdir):
-    saved_path = os.path.abspath(os.path.curdir)
-    os.chdir(rootdir)
-    subprocess.check_call(["make", "-j", "24", "-f", "Makefile", "all"])
-    os.chdir(saved_path)
-
-
-def get_file_path_safe(rootdir, file_name):
-    path = os.path.join(rootdir, file_name)
-    if not os.path.isfile(path):
-        raise RuntimeError('%s should exist, check the path' % path)
-    return path
-
-
-def get_file_paths(rootdir):
-    makefile = get_file_path_safe(rootdir, 'Makefile.config')
-    config_h = get_file_path_safe(rootdir, 'Config.h')
-    return (makefile, config_h)
-
-
diff --git a/gpu_scripts/runBenchmark_cuda.py b/gpu_scripts/runBenchmark_cuda.py
deleted file mode 100755
index f1c98e13..00000000
--- a/gpu_scripts/runBenchmark_cuda.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env python
-
-import argparse
-import os
-
-import gpu_setup
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--rootdir', dest='rootdir',
-                        action='store', default='.',
-                        help='Where Makefile.config and Config.h are.')
-    args = parser.parse_args()
-    if not os.path.isdir(args.rootdir):
-        raise RuntimeError('--rootdir should be a directory.')
-    return args
-
-
-def main():
-    args = get_args()
-    makefile, config_h = gpu_setup.get_file_paths(os.path.abspath(args.rootdir))
-    gpu_setup.check_env()
-    gpu_setup.backup_original_files([makefile, config_h])
-
-    gpu_setup.setup_files(makefile, config_h)
-    try:
-        pass
-        gpu_setup.make_distclean(args.rootdir)
-        gpu_setup.make_all(args.rootdir)
-    finally:
-        gpu_setup.cleanup(makefile, config_h)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/gpu_scripts/run_cleanup_only.py b/gpu_scripts/run_cleanup_only.py
deleted file mode 100755
index 6690a224..00000000
--- a/gpu_scripts/run_cleanup_only.py
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/env python
-
-import os
-import argparse
-import gpu_setup
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--rootdir', dest='rootdir',
-                        action='store', default='.',
-                        help='Where Makefile.config and Config.h are.')
-    args = parser.parse_args()
-    if not os.path.isdir(args.rootdir):
-        raise RuntimeError('--rootdir should be a directory.')
-    return args
-
-
-def main():
-    args = get_args()
-    makefile, config_h = gpu_setup.get_file_paths(os.path.abspath(args.rootdir))
-    gpu_setup.cleanup(makefile, config_h)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/gpu_scripts/run_setup_only.py b/gpu_scripts/run_setup_only.py
deleted file mode 100755
index 78d3d9cf..00000000
--- a/gpu_scripts/run_setup_only.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env python
-
-import os
-import argparse
-import gpu_setup
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--rootdir', dest='rootdir',
-                        action='store', default='.',
-                        help='Where Makefile.config and Config.h are.')
-    args = parser.parse_args()
-    if not os.path.isdir(args.rootdir):
-        raise RuntimeError('--rootdir should be a directory.')
-    return args
-
-
-def main():
-    args = get_args()
-    makefile, config_h = gpu_setup.get_file_paths(os.path.abspath(args.rootdir))
-    gpu_setup.check_env()
-    gpu_setup.backup_original_files([makefile, config_h])
-
-    gpu_setup.setup_files(makefile, config_h)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/mkFit/BuilderCU.cu b/mkFit/BuilderCU.cu
deleted file mode 100644
index 4ec1ac60..00000000
--- a/mkFit/BuilderCU.cu
+++ /dev/null
@@ -1,111 +0,0 @@
-#include "BuilderCU.h"
-
-#include "gpu_utils.h"
-#include "HitStructures.h"
-#include "HitStructuresCU.h"
-#include "GeometryCU.h"
-#include "FitterCU.h"
-#include "Event.h"
-
-
-BuilderCU::BuilderCU() {}
-
-
-BuilderCU::BuilderCU(FitterCU<float> *fitter) {
-  cuFitter = fitter;
-}
-
-
-void BuilderCU::setUpFitter(int gplex_size)
-{
-  cuFitter = new FitterCU<float> (gplex_size);
-  cuFitter->allocateDevice();
-  cuFitter->allocate_extra_addBestHit();
-  cuFitter->createStream();
-  cuFitter->setNumberTracks(gplex_size);
-}
-
-
-void BuilderCU::tearDownFitter()
-{
-  cuFitter->destroyStream();
-  cuFitter->free_extra_addBestHit();
-  cuFitter->freeDevice();
-  delete cuFitter;
-}
-
-
-BuilderCU::~BuilderCU() {
-  geom_cu.deallocate();
-}
-
-
-void BuilderCU::allocateGeometry(const Geometry& geom)
-{
-  std::vector<float> radii (Config::nLayers);
-  for (int ilay = Config::nlayers_per_seed; ilay < Config::nLayers; ++ilay) {
-    radii[ilay] = geom.Radius(ilay);
-  }
-  geom_cu.allocate();
-  geom_cu.getRadiiFromCPU(&radii[0]);
-}
-
-
-void BuilderCU::setUpBH(const EventOfHits& event_of_hits, const Event* event,
-                      const EventOfCandidates& event_of_cands)
-{
-  event_of_hits_cu.reserve_layers(event_of_hits, 2.f);
-  event_of_hits_cu.copyFromCPU(event_of_hits, cuFitter->get_stream());
-  event_of_cands_cu.allocGPU(event_of_cands);
-}
-
-
-void BuilderCU::tearDownBH() {
-  event_of_cands_cu.deallocGPU();
-}
-
-
-void BuilderCU::allocateCE(const EventOfHits& event_of_hits, const Event* event,
-                           const EventOfCombCandidates& event_of_cands)
-{
-  event_of_hits_cu.reserve_layers(event_of_hits, 2.f);
-  event_of_comb_cands_cu.allocate(event_of_cands);
-}
-
-
-void BuilderCU::setUpCE(const EventOfHits& event_of_hits, const Event* event,
-                      const EventOfCombCandidates& event_of_cands)
-{
-  event_of_hits_cu.copyFromCPU(event_of_hits, cuFitter->get_stream());
-}
-
-
-void BuilderCU::tearDownCE() {
-  /*event_of_comb_cands_cu.free();*/
-  /*event_of_hits_cu.deallocGPU();*/
-}
-
-
-void BuilderCU::FindTracksBestHit(EventOfCandidates& event_of_cands) 
-{
-  event_of_cands_cu.copyFromCPU(event_of_cands, cuFitter->get_stream());
-
-  cuFitter->addBestHit(event_of_hits_cu, geom_cu, event_of_cands_cu);
-
-  event_of_cands_cu.copyToCPU(event_of_cands, cuFitter->get_stream());
-  cudaStreamSynchronize(cuFitter->get_stream());
-  //cudaCheckError();
-}
-
-
-void BuilderCU::FindTracksCloneEngine(EventOfCombCandidates& event_of_comb_cands,
-    bool seed_based)
-{
-  event_of_comb_cands_cu.copyFromCPU(event_of_comb_cands, cuFitter->get_stream());
-
-  cuFitter->FindTracksInLayers(event_of_hits_cu.m_layers_of_hits.data(),
-                               event_of_comb_cands_cu, geom_cu, seed_based);
-
-  event_of_comb_cands_cu.copyToCPU(event_of_comb_cands, cuFitter->get_stream());
-  cudaStreamSynchronize(cuFitter->get_stream());
-}
diff --git a/mkFit/BuilderCU.h b/mkFit/BuilderCU.h
deleted file mode 100644
index 0b54973d..00000000
--- a/mkFit/BuilderCU.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef BUILDER_CU_H
-#define BUILDER_CU_H 
-
-#include "FitterCU.h"
-#include "HitStructures.h"
-#include "HitStructuresCU.h"
-#include "GeometryCU.h"
-#include "Geometry.h"
-#include "Event.h"
-
-namespace mkfit {
-
-// FIXME: Design Issue
-//        What to do, allocation in ctor, free in dtor?
-//            not exception-safe
-//            but manage mem 
-//        or in separate function?
-// FIXME: A lot of duplication between BH/CE. Remove it after CtD
-class BuilderCU
-{
-public:
-  BuilderCU();
-  BuilderCU(FitterCU<float> *fitter);
-  ~BuilderCU();
-
-  void setUpBH(const EventOfHits& event_of_hits, const Event* event,
-               const EventOfCandidates& event_of_cands);
-  void tearDownBH();
-
-  void allocateCE(const EventOfHits& event_of_hits, const Event* event,
-                  const EventOfCombCandidates& event_of_cands);
-  void setUpCE(const EventOfHits& event_of_hits, const Event* event,
-               const EventOfCombCandidates& event_of_cands);
-  void tearDownCE();
-
-  void FindTracksBestHit(EventOfCandidates& event_of_cands);
-
-  void setUpFitter(int gplex_size);
-  void tearDownFitter();
-
-  void allocateGeometry(const Geometry& geom);
-
-  void FindTracksCloneEngine(EventOfCombCandidates& event_of_cands,
-                             bool seed_based=false);
-private:
-  FitterCU<float> *cuFitter;
-  EventOfHitsCU event_of_hits_cu;
-  EventOfCandidatesCU event_of_cands_cu;
-  EventOfCombCandidatesCU event_of_comb_cands_cu;
-  GeometryCU geom_cu;
-};
-
-
-} // end namespace mkfit
-#endif /* ifndef BUILDER_CU_H */
diff --git a/mkFit/FitterCU-imp.h b/mkFit/FitterCU-imp.h
deleted file mode 100644
index db1658c4..00000000
--- a/mkFit/FitterCU-imp.h
+++ /dev/null
@@ -1,446 +0,0 @@
-#include <cstdlib>
-#include "Config.h"
-#include "GeometryCU.h"
-#include "reorganize_gplex.h"
-#include "fittracks_kernels.h"
-#include "build_tracks_kernels.h"
-
-#include "clone_engine_kernels.h"
-#include "clone_engine_kernels_seed.h"
-
-#include "Track.h"
-
-template <typename T>
-void FitterCU<T>::FindTracksInLayers(LayerOfHitsCU *layers, 
-                                     EventOfCombCandidatesCU& event_of_cands_cu,
-                                     GeometryCU &geom, bool seed_based)
-{
-  auto f = seed_based ? findInLayers_wrapper_seed : findInLayers_wrapper;
-
-  //findInLayers_wrapper(stream, layers, event_of_cands_cu,
-  //findInLayers_wrapper_seed(stream, layers, event_of_cands_cu,
-  f (stream, layers, event_of_cands_cu,
-     d_XHitSize, d_XHitArr, d_Err_iP, d_par_iP,
-     d_msErr_arr, d_msPar_arr, d_Err_iC, d_par_iC,
-     d_outChi2, d_Chi2, d_HitsIdx_arr, d_inChg, d_Label,
-     d_SeedIdx, d_CandIdx, d_Valid, geom, d_maxSize, N);
-}
-
-template <typename T>
-void FitterCU<T>::UpdateWithLastHit(LayerOfHitsCU& layer, int ilay, int N)
-{
-  UpdateWithLastHit_wrapper(stream, layer, d_HitsIdx[ilay],
-     d_msPar[ilay], d_msErr[ilay], d_par_iP, d_Err_iP, d_par_iC, d_Err_iC, N); 
-}
-
-template <typename T>
-void FitterCU<T>::UpdateWithLastHit_standalone(
-    LayerOfHitsCU& layer_cu, MPlexQI& HitsIdx, 
-    MPlexLS &Err_iP, MPlexLV& Par_iP, MPlexQI &Chg,
-    MPlexHV& msPar, MPlexHS& msErr, 
-    MPlexLS &Err_iC, MPlexLV& Par_iC,
-    int Nhits, int N_proc)
-{
-  d_HitsIdx[Nhits-1].copyAsyncFromHost(stream, HitsIdx);
-  d_Err_iP.copyAsyncFromHost(stream, Err_iP);
-  d_par_iP.copyAsyncFromHost(stream, Par_iP);
-  d_msErr[Nhits-1].copyAsyncFromHost(stream, msErr);
-  d_msPar[Nhits-1].copyAsyncFromHost(stream, msPar);
-
-  UpdateWithLastHit_wrapper(stream, layer_cu, d_HitsIdx[Nhits-1],
-     d_msPar[Nhits-1], d_msErr[Nhits-1], d_par_iP, d_Err_iP, d_par_iC, d_Err_iC, N_proc); 
-
-  d_par_iC.copyAsyncToHost(stream, Par_iC);
-  d_Err_iC.copyAsyncToHost(stream, Err_iC);
-  d_msErr[Nhits-1].copyAsyncToHost(stream, msErr);
-  d_msPar[Nhits-1].copyAsyncToHost(stream, msPar);
-
-  cudaStreamSynchronize(stream);
-}
-template <typename T>
-void FitterCU<T>::GetHitFromLayer_standalone(LayerOfHitsCU& layer_cu,
-    MPlexQI& HitsIdx, MPlexHV& msPar, MPlexHS& msErr, int hit_idx, int N)
-{
-  d_HitsIdx[hit_idx].copyAsyncFromHost(stream, HitsIdx);
-
-  getHitFromLayer_wrappper(stream, layer_cu, d_HitsIdx[hit_idx], 
-      d_msPar[hit_idx], d_msErr[hit_idx], N);
-
-  d_msErr[hit_idx].copyAsyncToHost(stream, msErr);
-  d_msPar[hit_idx].copyAsyncToHost(stream, msPar);
-}
-template <typename T>
-void FitterCU<T>::UpdateMissingHits_standalone(
-    MPlexLS& Err_iP, MPlexLV& Par_iP, 
-    MPlexLS& Err_iC, MPlexLV& Par_iC, 
-    MPlexQI& HitsIdx, 
-    int hit_idx, int N)
-{
-  d_HitsIdx[hit_idx].copyAsyncFromHost(stream, HitsIdx);
-  d_Err_iP.copyAsyncFromHost(stream, Err_iP);
-  d_par_iP.copyAsyncFromHost(stream, Par_iP);
-
-  UpdateMissingHits_wrapper(stream, d_HitsIdx[hit_idx],
-      d_par_iP, d_Err_iP, d_par_iC, d_Err_iC, N);
-
-  d_Err_iC.copyAsyncToHost(stream, Err_iC);
-  d_par_iC.copyAsyncToHost(stream, Par_iC);
-  cudaStreamSynchronize(stream);
-}
-
-template <typename T>
-void FitterCU<T>::setNumberTracks(const idx_t Ntracks) {
-  N = Ntracks;
-
-  // Raise an exceptioin when the FitterCU instance is too small
-  // This should not happen as the loop over tracks in runFittestGPU 
-  // takes care of it.
-  if (Ntracks > Nalloc) {
-    throw std::length_error("FitterCU: Ntracks should be smaller than Nalloc");
-  }
-}
-
-template <typename T>
-void FitterCU<T>::createStream() {
-  cudaStreamCreate(&stream);
-}
-
-template <typename T>
-void FitterCU<T>::destroyStream() {
-  cudaStreamDestroy(stream);
-}
-
-template <typename T>
-void FitterCU<T>::allocateDevice() {
-  d_par_iP.allocate(Nalloc);
-  d_par_iC.allocate(Nalloc);
-
-  d_Err_iP.allocate(Nalloc);
-  d_Err_iC.allocate(Nalloc);
-
-  d_inChg.allocate(Nalloc);
-  d_errorProp.allocate(Nalloc);
-
-  cudaMalloc((void**)&d_msPar_arr, Config::nLayers * sizeof(GPlexHV));
-  cudaMalloc((void**)&d_msErr_arr, Config::nLayers * sizeof(GPlexHS));
-  for (int hi = 0; hi < Config::nLayers; ++hi) {
-    d_msPar[hi].allocate(Nalloc);
-    d_msErr[hi].allocate(Nalloc);
-  }
-  cudaMemcpy(d_msPar_arr, d_msPar, Config::nLayers*sizeof(GPlexHV), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_msErr_arr, d_msErr, Config::nLayers*sizeof(GPlexHS), cudaMemcpyHostToDevice);
-  cudaMalloc((void**)&d_maxSize, sizeof(int));  //  global maximum
-  cudaCheckError();
-}
-
-template <typename T>
-void FitterCU<T>::freeDevice() {
-  d_par_iC.free();
-  d_inChg.free();
-  d_par_iP.free();
-  d_errorProp.free();
-  d_Err_iP.free();
-  d_Err_iC.free();
-  for (int hi = 0; hi < Config::nLayers; ++hi) {
-    d_msPar[hi].free();
-    d_msErr[hi].free();
-  }
-  cudaFree(d_msPar_arr);
-  cudaFree(d_msErr_arr);
-  cudaFree(d_maxSize);
-  cudaCheckError();
-}
-
-template <typename T>
-void FitterCU<T>::kalmanUpdateMerged(const int hit_idx) {
-  kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr[hit_idx],
-                       d_par_iP, d_msPar[hit_idx], d_par_iC, d_Err_iC, N);
-}
-
-template <typename T>
-void FitterCU<T>::kalmanUpdate_standalone(
-    const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg,
-    const MPlexHS &msErr, const MPlexHV& msPar,
-    MPlexLS &outErr, MPlexLV& outPar,
-    const int hit_idx, const int N_proc)
-{
-  d_Err_iP.copyAsyncFromHost(stream, psErr);
-  d_msErr[hit_idx].copyAsyncFromHost(stream, msErr);
-  d_par_iP.copyAsyncFromHost(stream, psPar);
-  d_msPar[hit_idx].copyAsyncFromHost(stream, msPar);
-
-  kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr[hit_idx],
-                       d_par_iP, d_msPar[hit_idx], d_par_iC, d_Err_iC, N_proc);
-
-  d_par_iC.copyAsyncToHost(stream, outPar);
-  d_Err_iC.copyAsyncToHost(stream, outErr);
-}
-
-template <typename T>
-void FitterCU<T>::propagationMerged(const int hit_idx) {
-  propagation_wrapper(stream, d_msPar[hit_idx], d_Err_iC, d_par_iC, d_inChg,
-                      d_par_iP, d_Err_iP, false, N);
-}
-
-// FIXME: Temporary. Separate allocations / transfers
-template <typename T>
-void FitterCU<T>::allocate_extra_addBestHit() {
-  d_outChi2.allocate(Nalloc);
-  d_XHitPos.allocate(Nalloc);
-  d_XHitSize.allocate(Nalloc);
-  d_XHitArr.allocate(Nalloc);
-  cudaMalloc((void**)&d_HitsIdx_arr, Config::nLayers * sizeof(GPlexQI));
-  for (int hi = 0; hi < Config::nLayers; ++hi) {
-    d_HitsIdx[hi].allocate(Nalloc);
-  }
-  cudaMemcpy(d_HitsIdx_arr, d_HitsIdx, Config::nLayers*sizeof(GPlexQI), cudaMemcpyHostToDevice);
-  d_Chi2.allocate(Nalloc);
-  d_Label.allocate(Nalloc);
-  cudaCheckError();
-}
-
-template <typename T>
-void FitterCU<T>::free_extra_addBestHit() {
-  for (int hi = 0; hi < Config::nLayers; ++hi) {
-    d_HitsIdx[hi].free(); cudaCheckError();
-  }
-  cudaFree(d_HitsIdx_arr);
-  d_Label.free(); cudaCheckError();
-  d_Chi2.free(); cudaCheckError();
-
-  d_XHitArr.free(); cudaCheckError();
-  d_XHitSize.free(); cudaCheckError();
-  d_XHitPos.free(); cudaCheckError();
-  d_outChi2.free(); cudaCheckError();
-}
-
-
-template <typename T>
-void FitterCU<T>::allocate_extra_combinatorial() {
-  d_SeedIdx.allocate(Nalloc);
-  d_CandIdx.allocate(Nalloc);
-  d_Valid.allocate(Nalloc);
-}
-
-
-template <typename T>
-void FitterCU<T>::free_extra_combinatorial() {
-  d_SeedIdx.free();
-  d_CandIdx.free();
-  d_Valid.free();
-}
-
-
-template <typename T>
-void FitterCU<T>::setHitsIdxToZero(const int hit_idx) {
-  cudaMemset(d_HitsIdx[hit_idx].ptr, 0, Nalloc*sizeof(int));
-}
-
-template <typename T>
-void FitterCU<T>::addBestHit(EventOfHitsCU &event, GeometryCU &geom_cu,
-                             EventOfCandidatesCU &event_of_cands_cu) {
-    findBestHit_wrapper(stream, event.m_layers_of_hits.data(),
-                        event_of_cands_cu,
-                        d_XHitSize, d_XHitArr,
-                        d_Err_iP, d_par_iP, 
-                        d_msErr_arr, d_msPar_arr,
-                        d_Err_iC, d_par_iC, d_outChi2,
-                        d_Chi2, d_HitsIdx_arr,
-                        d_inChg, d_Label, geom_cu,
-                        d_maxSize, N);
-}   
-
-template <typename T>
-void FitterCU<T>::InputTracksAndHitIdx(const EtaBinOfCandidatesCU &etaBin,
-                              const int beg, const int end, const bool inputProp) {
-  InputTracksCU_wrapper(stream, etaBin, d_Err_iP, d_par_iP,
-                        d_inChg, d_Chi2, d_Label, d_HitsIdx_arr,
-                        beg, end, inputProp, N);
-}
-
-template <typename T>
-void FitterCU<T>::OutputTracksAndHitIdx(EtaBinOfCandidatesCU &etaBin,
-                               const int beg, const int end, const bool outputProp) {
-  OutputTracksCU_wrapper(stream, etaBin, d_Err_iP, d_par_iP,
-                         d_inChg, d_Chi2, d_Label, d_HitsIdx_arr,
-                         beg, end, outputProp, N);
-  cudaStreamSynchronize(stream);
-  cudaCheckError();
-}
-
-
-template <typename T>
-void FitterCU<T>::InputTracksAndHitIdxComb(const EtaBinOfCombCandidatesCU &etaBin,
-      const int Nhits, const int beg, const int end, const bool inputProp) {
-  InputTracksAndHitIdxComb_wrapper(stream, 
-                                   etaBin,
-                                   d_Err_iP, d_par_iP,
-                                   d_inChg, d_Chi2,
-                                   d_Label, d_HitsIdx,
-                                   d_SeedIdx, d_CandIdx,
-                                   d_Valid, Nhits,
-                                   beg, end, 
-                                   inputProp, N);
-}
-
-
-template <typename T>
-void FitterCU<T>::propagateTracksToR(const float radius, const int N) {
-  propagationForBuilding_wrapper(stream, d_Err_iC, d_par_iC, d_inChg, 
-                                 radius, d_Err_iP, d_par_iP, N); 
-}
-
-
-#if 1
-template <typename T>
-void FitterCU<T>::propagateTracksToR_standalone(const float radius, const int N,
-    const MPlexLS& Err_iC, const MPlexLV& Par_iC, const MPlexQI& inChg, 
-    MPlexLS& Err_iP, MPlexLV& Par_iP) {
-  d_Err_iC.copyAsyncFromHost(stream, Err_iC);
-  d_par_iC.copyAsyncFromHost(stream, Par_iC);
-  d_inChg.copyAsyncFromHost(stream, inChg);
-
-  propagateTracksToR(radius, N);
-
-  d_Err_iP.copyAsyncToHost(stream, Err_iP);
-  d_par_iP.copyAsyncToHost(stream, Par_iP);
-  cudaStreamSynchronize(stream);
-}
-
-template <typename T>
-void FitterCU<T>::FitTracks(Track *tracks_cu, int num_tracks,
-                            EventOfHitsCU &events_of_hits_cu,
-                            int Nhits, const bool useParamBfield)
-{
-  float etime;
-  cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-
-  cudaEventRecord(start, 0);
- 
-  for (int itrack = 0; itrack < num_tracks; itrack += Nalloc)
-  {
-    int beg = itrack;
-    int end = std::min(itrack + Nalloc, num_tracks);
-    setNumberTracks(end-beg);
-
-    InputTracksAndHitsCU_wrapper(stream, tracks_cu, events_of_hits_cu,
-                                 d_Err_iC, d_par_iC,
-                                 d_msErr_arr, d_msPar_arr,
-                                 d_inChg, d_Chi2, d_Label,
-                                 d_HitsIdx_arr, beg, end, false, N);
-    fittracks_wrapper(stream, d_Err_iP, d_par_iP, d_msErr_arr, d_msPar_arr,
-                      d_Err_iC, d_par_iC, d_errorProp, d_inChg, useParamBfield,
-                      Nhits, N);
-    OutputFittedTracksCU_wrapper(stream, tracks_cu, 
-                                 d_Err_iC, d_par_iC,
-                                 d_inChg, d_Chi2, d_Label,
-                                 beg, end, N);
-  }
-
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
-
-  cudaEventElapsedTime(&etime, start, stop);
-  std::cerr << "CUDA etime: " << etime << " ms.\n";
-  
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
-}
-#else
-template <typename T>
-void FitterCU<T>::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC,
-                            MPlexHV* msPar, MPlexHS* msErr, int Nhits,
-                            std::vector<Track> &tracks, int beg, int end,
-                            std::vector<HitVec> &layerHits) {
-  float etime;
-  cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-
-  //launch everything in a stream to enable concurrent execution of events
-  createStream();
-
-  // allocateDevice();  -> moved to mkFit/mkFit.cc
-
-  setNumberTracks(end-beg);
-
-  Track *tracks_cu;
-  cudaMalloc((void**)&tracks_cu, tracks.size()*sizeof(Track));
-  cudaMemcpy(tracks_cu, &tracks[0], tracks.size()*sizeof(Track), cudaMemcpyHostToDevice);
-  allocate_extra_addBestHit();
-
-  InputTracksAndHitsCU_wrapper(stream, tracks_cu, d_Err_iC, d_par_iC, d_inChg,
-                               d_Chi2, d_Label, d_HitsIdx_arr, beg, end, false, N);
-
-
-  //d_inChg.copyAsyncFromHost(stream, Chg);
-  //d_par_iC.copyAsyncFromHost(stream, par_iC);
-  //d_Err_iC.copyAsyncFromHost(stream, err_iC);
-
-  cudaEventRecord(start, 0);
- 
-  double total_reorg = 0.;
-  for (int hi = 0; hi < Nhits; ++hi)
-  {
-    // Switch outPut and inPut parameters and errors
-    // similar to iC <-> iP in the CPU code.
-    d_par_iP.copyAsyncFromDevice(stream, d_par_iC); 
-    d_Err_iP.copyAsyncFromDevice(stream, d_Err_iC);
-    
-#if 0
-    double time_input = dtime();
-    int itrack;
-    //omp_set_num_threads(Config::numThreadsReorg);
-//#pragma omp parallel for
-    for (int i = beg; i < end; ++i) {
-      itrack = i - beg;
-      Track &trk = tracks[i];
-
-      const int hidx = trk.getHitIdx(hi);
-      const Hit &hit = layerHits[hi][hidx];
-
-      msErr[hi].CopyIn(itrack, hit.errArray());
-      msPar[hi].CopyIn(itrack, hit.posArray());
-    }
-    total_reorg += (dtime() - time_input)*1e3;
-#endif
-
-    d_msPar[hi].copyAsyncFromHost(stream, msPar[hi]);
-    d_msErr[hi].copyAsyncFromHost(stream, msErr[hi]);
-
-    propagationMerged(hi);
-    //MPlexLS  err_iP;
-    //MPlexLV par_iP;
-    //d_par_iC.copyAsyncToHost(stream, par_iC);
-    //d_par_iP.copyAsyncToHost(stream, par_iP);
-    //d_Err_iP.copyAsyncToHost(stream, err_iP);
-    //propagation_wrapper(stream, d_msPar[hi], d_Err_iC, d_par_iC, d_inChg,
-                      //d_par_iP, d_errorProp, d_Err_iP, N);
-    kalmanUpdateMerged(hi);
-    //fittracks_wrapper(stream, d_Err_iP, d_par_iP, d_msErr, d_msPar,
-                      //d_Err_iC, d_par_iC, d_errorProp, d_inChg,
-                      //hi, N);
-  }
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
-
-  cudaEventElapsedTime(&etime, start, stop);
-  //std::cerr << "CUDA etime: " << etime << " ms.\n";
-  //std::cerr << "Total reorg: " << total_reorg << " ms.\n";
-  
-  free_extra_addBestHit();
-  cudaFree(tracks_cu);
-
-  d_par_iC.copyAsyncToHost(stream, par_iC);
-  d_Err_iC.copyAsyncToHost(stream, err_iC);
-  
-  cudaStreamSynchronize(stream);
-  // freeDevice(); -> moved to mkFit/mkFit.cc
-  destroyStream();
-
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
-}
-#endif
diff --git a/mkFit/FitterCU.h b/mkFit/FitterCU.h
deleted file mode 100644
index 1ed9d5b9..00000000
--- a/mkFit/FitterCU.h
+++ /dev/null
@@ -1,149 +0,0 @@
-#ifndef _PROPAGATOR_CU_H_
-#define _PROPAGATOR_CU_H_
-
-#include <cuda_runtime.h>
-
-#include "Matrix.h"
-
-#include "HitStructuresCU.h"
-#include "GPlex.h"
-#include "GeometryCU.h"
-#include "gpu_utils.h"
-
-#include "propagation_kernels.h"
-#include "kalmanUpdater_kernels.h"
-#include "computeChi2_kernels.h"
-#include "index_selection_kernels.h"
-#include "best_hit_kernels.h"
-
-//#include <omp.h>
-#include <stdexcept>
-
-#define BLOCK_SIZE_X 256
-
-namespace mkfit {
-
-using idx_t = Matriplex::idx_t;
-
-template <typename T>
-class FitterCU {
- public:
-  FitterCU (idx_t Nalloc) : Nalloc(Nalloc) {};
-  virtual ~FitterCU () {};
-
-  void allocateDevice();
-  void freeDevice();
-
-  void createStream();
-  void destroyStream();
-  cudaStream_t& get_stream() { return stream; }
-
-  int get_Nalloc() const { return Nalloc; }
-  void setNumberTracks(const idx_t Ntracks);
-
-  void propagationMerged(const int hit_idx);
-  void kalmanUpdateMerged(const int hit_idx);
-  void kalmanUpdate_standalone(
-      const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg,
-      const MPlexHS &msErr, const MPlexHV& msPar,
-      MPlexLS &outErr, MPlexLV& outPar,
-      const int hit_idx, const int N_proc);
-
-  void allocate_extra_addBestHit();
-  void free_extra_addBestHit();
-  void allocate_extra_combinatorial();
-  void free_extra_combinatorial();
-  void setHitsIdxToZero(const int hit_idx);
-
-  void addBestHit(EventOfHitsCU& event, GeometryCU &geom_cu,
-                  EventOfCandidatesCU &event_of_cands_cu);
-
-  void propagateTracksToR(const float radius, const int N);
-  void propagateTracksToR_standalone(const float radius, const int N,
-      const MPlexLS& Err_iC, const MPlexLV& par_iC, 
-      const MPlexQI& inChg, 
-      MPlexLS& Err_iP, MPlexLV& Par_iP);
-
-  // fitting higher order methods
-  void FitTracks(Track *tracks_cu, int num_tracks,
-                 EventOfHitsCU &events_of_hits_cu,
-                 int Nhits, const bool useParamBfield=false);
-  void InputTracksAndHitIdx(const EtaBinOfCandidatesCU &etaBin,
-                   const int beg, const int end, const bool inputProp);
-  void OutputTracksAndHitIdx(EtaBinOfCandidatesCU &etaBin,
-                    const int beg, const int end, const bool outputProp);
-
-  void InputTracksAndHitIdxComb(const EtaBinOfCombCandidatesCU &etaBin,
-      const int Nhits, const int beg, const int end, const bool inputProp);
-
-  //====================
-  // TEMP FCT; copy stuffs
-  //====================
-  void UpdateWithLastHit(LayerOfHitsCU& layer, int ilay, int N);
-  void GetHitFromLayer_standalone(LayerOfHitsCU& layer_cu,
-    MPlexQI& HitsIdx, MPlexHV& msPar, MPlexHS& msErr, int hit_idx, int N);
-  void UpdateMissingHits_standalone(
-      MPlexLS& Err_iP, MPlexLV& Par_iP, 
-      MPlexLS& Err_iC, MPlexLV& Par_iC, 
-      MPlexQI& HitsIdx, 
-      int hit_idx, int N);
-  void UpdateWithLastHit_standalone(
-      LayerOfHitsCU& layer_cu, MPlexQI& HitsIdx, 
-      MPlexLS &Err_iP, MPlexLV& Par_iP, MPlexQI &inChg,
-      MPlexHV& msPar, MPlexHS& msErr, 
-      MPlexLS &Err_iC, MPlexLV& Par_iC,
-      int Nhits, int N_proc);
-
-  void FindTracksInLayers(LayerOfHitsCU *layers, 
-                          EventOfCombCandidatesCU& event_of_cands_cu,
-                          GeometryCU &geom, bool seed_based=false);
-  
-  //====================
- private:
-  // N is the actual size, Nalloc should be >= N, as it is intended
-  // to allocated arrays that can be used for several sets of tracks.
-  idx_t Nalloc;
-  idx_t N;
-
-  /* data */
-  GPlexLV d_par_iP; // LV
-  GPlexLV d_par_iC; // LV
-
-  GPlexLS d_Err_iP; // LS
-  GPlexLS d_Err_iC; // LS
-
-  GPlexQI d_inChg;  // QI
-  GPlexQF d_msRad;  // QF
-  GPlexLL d_errorProp;  // LL
-
-  GPlexHV *d_msPar_arr;  // completely on the GPU
-  GPlexHV d_msPar[Config::nLayers];  // on the CPU, with arrays on the GPU
-  GPlexHS *d_msErr_arr;
-  GPlexHS d_msErr[Config::nLayers];
-  
-  GPlexQI d_XHitPos;  // QI : 1D arrary following itracks
-  GPlexQI d_XHitSize;  // QI : " "
-  GPlexHitIdx d_XHitArr;
-
-  GPlexQF d_outChi2;
-  GPlexQI *d_HitsIdx_arr;
-  GPlexQI d_HitsIdx[Config::nLayers];
-  GPlexQF d_Chi2;
-  GPlexQI d_Label;
-
-  GPlexQI d_SeedIdx;
-  GPlexQI d_CandIdx;
-
-  GPlexQB d_Valid;
-
-  int *d_maxSize;  // max number of tracks for AddBestHit
-
-  // everything run in a stream so multiple instance of FitterCU can
-  // run concurrently on the GPU.
-  cudaStream_t stream;
-};
-
-#include "FitterCU-imp.h"
-
-} // end namespace mkfit
-#endif  // _PROPAGATOR_CU_H_
diff --git a/mkFit/GPlex.h b/mkFit/GPlex.h
deleted file mode 100644
index 18606b31..00000000
--- a/mkFit/GPlex.h
+++ /dev/null
@@ -1,149 +0,0 @@
-#ifndef _GPLEX_H_
-#define _GPLEX_H_
-
-#include <cuda_runtime.h>
-#include <stdio.h>
-
-#include "gpu_utils.h"
-#include "Matrix.h"
-
-namespace mkfit {
-
-//#include "gpu_constants.h"
-__device__ __constant__ static int gplexSymOffsets[7][36] =
-{
-  {}, 
-  {},
-  { 0, 1, 1, 2 },
-  { 0, 1, 3, 1, 2, 4, 3, 4, 5 }, // 3
-  {},
-  {},
-  { 0, 1, 3, 6, 10, 15, 1, 2, 4, 7, 11, 16, 3, 4, 5, 8, 12, 17, 6, 7, 8, 9, 13, 18, 10, 11, 12, 13, 14, 19, 15, 16, 17, 18, 19, 20 }
-};
-
-
-// GPU implementation of a Matriplex-like structure
-// The number of tracks is the fast dimension and is padded in order to have
-// consecutive and aligned memory accesses. For cached reads, this result in a
-// single memory transaction for the 32 threads of a warp to access 32 floats.
-// See:
-// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-3-0
-// In practice, The number of tracks (ntracks) is set to be MPT_SIZE
-template <typename M>
-struct GPlex { 
-  using T = typename M::value_type;
-  using value_type = T;
-
-  size_t kRows = M::kRows;
-  size_t kCols = M::kCols;
-  size_t kSize = M::kSize;
-
-  T* ptr;
-  size_t pitch, stride, N;
-
-  __device__ T  operator[](int xx) const { return ptr[xx]; }
-  __device__ T& operator[](int xx)       { return ptr[xx]; }
-
-  __device__ T& operator()(int n, int i, int j)       { return ptr[n + (i*kCols + j)*stride]; }
-  __device__ T  operator()(int n, int i, int j) const { return ptr[n + (i*kCols + j)*stride]; }
-
-  void allocate(size_t ntracks) {
-    N = ntracks;
-    cudaMallocPitch((void**)&ptr, &pitch, N*sizeof(T), kSize);
-    stride = pitch/sizeof(T);  // Number of elements
-  }
-  void free() {
-    cudaFree(ptr);
-    N = 0; pitch = 0; stride = 0;
-  }
-  //cudaMemcpy2D(d_msErr.ptr, d_msErr.pitch, msErr.fArray, N*sizeof(T),
-               //N*sizeof(T), HS, cudaMemcpyHostToDevice);
-
-  void copyAsyncFromHost(cudaStream_t& stream, const M& mplex) {
-    cudaMemcpy2DAsync(ptr, pitch, mplex.fArray, N*sizeof(T),
-                      N*sizeof(T), kSize, cudaMemcpyHostToDevice, stream);
-    cudaCheckError();
-  }
-  void copyAsyncToHost(cudaStream_t& stream, M& mplex) {
-    cudaMemcpy2DAsync(mplex.fArray, N*sizeof(T), ptr, pitch,
-                      N*sizeof(T), kSize, cudaMemcpyDeviceToHost, stream);
-    cudaCheckError();
-  }
-  void copyAsyncFromDevice(cudaStream_t& stream, GPlex<M>& gplex) {
-    cudaMemcpy2DAsync(ptr, pitch, gplex.ptr, gplex.pitch,
-                      N*sizeof(T), kSize, cudaMemcpyDeviceToDevice, stream);
-    cudaCheckError();
-  }
-};
-
-
-template <typename M>
-struct GPlexSym : GPlex<M> {
-  using T = typename GPlex<M>::T;
-  using GPlex<M>::kRows;
-  using GPlex<M>::kCols;
-  using GPlex<M>::stride;
-  using GPlex<M>::ptr;
-  __device__ size_t Off(size_t i) const { return gplexSymOffsets[kRows][i]; }
-  // Note: convenient but noticeably slower due to the indirection
-  __device__ T& operator()(int n, int i, int j)       { return ptr[n + Off(i*kCols + j)*stride]; }
-  __device__ T  operator()(int n, int i, int j) const { return ptr[n + Off(i*kCols + j)*stride]; }
-  //__device__ T& operator()(int n, int i, int j)       { return ptr[n + i*stride]; }
-  //__device__ T  operator()(int n, int i, int j) const { return ptr[n + i*stride]; }
-};
-
-using GPlexLL = GPlex<MPlexLL>;
-using GPlexLV = GPlex<MPlexLV>;
-using GPlexLS = GPlexSym<MPlexLS>;
-
-using GPlexHH = GPlex<MPlexHH>;
-using GPlexHV = GPlex<MPlexHV>;
-using GPlexHS = GPlexSym<MPlexHS>;
-
-using GPlexLH = GPlex<MPlexLH>;
-
-using GPlexQF = GPlex<MPlexQF>;
-using GPlexQI = GPlex<MPlexQI>;
-using GPlexQB = GPlex<MPlexQB>;
-
-const int GPlexHitIdxMax = 16;  // FIXME: copied and past from MkFitter.h
-using GPlexHitIdx = GPlex<Matriplex::Matriplex<int, GPlexHitIdxMax, 1, MPT_SIZE>>;
-
-template <typename M>
-struct GPlexReg {
-  using T = typename M::value_type;
-  using value_type = T;
-
-  size_t kRows = M::kRows;
-  size_t kCols = M::kCols;
-  size_t kSize = M::kSize;
-
-  __device__ T  operator[](int xx) const { return arr[xx]; }
-  __device__ T& operator[](int xx)       { return arr[xx]; }
-
-  __device__ T& operator()(int n, int i, int j)       { return arr[i*kCols + j]; }
-  __device__ T  operator()(int n, int i, int j) const { return arr[i*kCols + j]; }
-
-  __device__ void SetVal(T v)
-  {
-     for (int i = 0; i < kSize; ++i)
-     {
-        arr[i] = v;
-     }
-  }
-
-  T arr[M::kSize];
-};
-
-using GPlexRegLL = GPlexReg<MPlexLL>;
-using GPlexRegLH = GPlexReg<MPlexLH>;
-using GPlexRegHH = GPlexReg<MPlexHH>;
-using GPlexRegLV = GPlexReg<MPlexLV>;
-using GPlexRegHS = GPlexReg<MPlexHS>;
-using GPlexRegHV = GPlexReg<MPlexHV>;
-using GPlexReg2V = GPlexReg<MPlex2V>;
-using GPlexReg2S = GPlexReg<MPlex2S>;
-using GPlexRegQF = GPlexReg<MPlexQF>;
-
-} // end namespace mkfit
-#endif  // _GPLEX_H_
diff --git a/mkFit/GeometryCU.h b/mkFit/GeometryCU.h
deleted file mode 100644
index da6347ff..00000000
--- a/mkFit/GeometryCU.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef GEOMETRY_CU_H
-#define GEOMETRY_CU_H
-
-#include "gpu_utils.h"
-
-namespace mkfit {
-
-struct GeometryCU {
-  float *radii = nullptr;
-
-  void allocate() {
-    if (radii == nullptr)
-      cudaMalloc((void**)&radii, Config::nLayers * sizeof(float));
-    //cudaCheckError();
-  }
-  void deallocate() {
-    if (radii != nullptr) {
-      cudaFree(radii);
-      cudaCheckError();
-      radii = nullptr;
-    }
-  }
-  void getRadiiFromCPU(const float *h_radii) {
-    cudaMemcpy(radii, h_radii, Config::nLayers * sizeof(float), cudaMemcpyHostToDevice);
-    //cudaCheckError();
-  }
-};
-
-} // end namespace mkfit
-#endif /* ifndef GEOMETRY_CU_H */
diff --git a/mkFit/HitStructuresCU.cu b/mkFit/HitStructuresCU.cu
deleted file mode 100644
index c42b6e03..00000000
--- a/mkFit/HitStructuresCU.cu
+++ /dev/null
@@ -1,394 +0,0 @@
-
-#include "HitStructuresCU.h"
-
-#include <vector>
-#include <algorithm>
-
-#include "HitStructures.h"
-#include "gpu_utils.h"
-
-///////////////////////////////////////////////////////////////////////////////
-/// LayerOfHitsCU
-///////////////////////////////////////////////////////////////////////////////
-
-void LayerOfHitsCU::copy_layer_values(const LayerOfHits &layer) {
-  Tracer tracer("copyLayer", tracer_colors["Blue"]);
-  m_zmin = layer.m_zmin;
-  m_zmax = layer.m_zmax;
-  m_fz = layer.m_fz;
-}
-
-
-///////////////////////////////////////////////////////////////////////////////
-/// EventOfHitsCU
-///////////////////////////////////////////////////////////////////////////////
-
-void EventOfHitsCU::reserve_all_hits(const EventOfHits &event_of_hits, float factor)
-{
-  auto largest_layer = std::max_element(event_of_hits.m_layers_of_hits.begin(),
-                                        event_of_hits.m_layers_of_hits.end(),
-                                        [](const LayerOfHits &a, const LayerOfHits &b)
-                                        {
-                                          return a.m_capacity < b.m_capacity;
-                                        });
-  auto m_max_hits_layer = largest_layer->m_capacity;
-
-  all_hits.reserve(m_n_layers, m_max_hits_layer*factor);
-}
-
-
-void EventOfHitsCU::reserve_all_hits(const std::vector<HitVec> &layers_of_hits, float factor)
-{
-  auto largest_layer = std::max_element(layers_of_hits.begin(),
-                                        layers_of_hits.end(),
-                                        [](const HitVec &a, const HitVec &b)
-                                        {
-                                          return a.size() < b.size();
-                                        });
-  auto m_max_hits_layer = largest_layer->size();
-
-  all_hits.reserve(m_n_layers, m_max_hits_layer*factor);
-}
-
-
-void EventOfHitsCU::reserve_all_phi_bins(const EventOfHits &event_of_hits, float factor)
-{
-  auto binnest_layer = std::max_element(event_of_hits.m_layers_of_hits.begin(),
-                                        event_of_hits.m_layers_of_hits.end(),
-                                        [](const LayerOfHits &a, const LayerOfHits &b)
-                                        {
-                                          return a.m_nz < b.m_nz;
-                                        });
-  auto m_max_nz_layer = binnest_layer->m_nz;
-
-  all_phi_bin_infos.reserve(m_n_layers, m_max_nz_layer*Config::m_nphi*factor);
-}
-
-
-void EventOfHitsCU::set_layer_hit_views(const EventOfHits& event_of_hits,
-                                    float factor)
-{
-  for (int i = 0; i < m_n_layers; ++i) {
-    m_layers_of_hits_alloc[i].m_hits.set_view(all_hits.get_ptr_to_part(i),
-                                              event_of_hits.m_layers_of_hits[i].m_capacity);
-    m_layers_of_hits_alloc[i].m_capacity = event_of_hits.m_layers_of_hits[i].m_capacity;
-    m_layers_of_hits_alloc[i].m_capacity_alloc =
-        event_of_hits.m_layers_of_hits[i].m_capacity * factor;
-  }
-}
-
-
-void EventOfHitsCU::set_layer_bin_views(const EventOfHits& event_of_hits,
-                                        float factor)
-{
-  for (int i = 0; i < m_n_layers; ++i) {
-    m_layers_of_hits_alloc[i].m_phi_bin_infos.set_view(all_phi_bin_infos.get_ptr_to_part(i),
-                                                       event_of_hits.m_layers_of_hits[i].m_nz * Config::m_nphi);
-    m_layers_of_hits_alloc[i].m_nz = event_of_hits.m_layers_of_hits[i].m_nz;
-    m_layers_of_hits_alloc[i].m_nz_alloc = event_of_hits.m_layers_of_hits[i].m_nz * factor;
-    m_layers_of_hits_alloc[i].m_nphi_alloc = Config::m_nphi;
-  }
-}
-
-
-void EventOfHitsCU::reserve_layers(const EventOfHits &event_of_hits, float factor)
-{
-  if (m_n_layers <= 0) {  // is this still needed?
-    m_n_layers = event_of_hits.m_n_layers;
-    m_layers_of_hits.reserve(m_n_layers);
-    m_layers_of_hits_alloc.reserve(m_n_layers);
-  } else {
-    assert(m_n_layers == event_of_hits.m_n_layers);
-  }
-  reserve_all_hits(event_of_hits, factor);
-  reserve_all_phi_bins(event_of_hits, factor);
-
-  set_layer_hit_views(event_of_hits, factor);
-  set_layer_bin_views(event_of_hits, factor);
-}
-
-
-void EventOfHitsCU::reserve_layers(const std::vector<HitVec> &layerHits)
-{
-  m_n_layers = layerHits.size();
-  // Allocate GPU array.
-  // Members's address  of array's elements are in the GPU space
-  m_layers_of_hits.reserve(m_n_layers);
-  // Allocate CPU array. 
-  // Members's address  of array's elements are in the CPU space
-  // This allows to call allocate for each array's element.
-  m_layers_of_hits_alloc.reserve(m_n_layers);
-
-  reserve_all_hits(layerHits, 1.f);
-}
-
-
-void EventOfHitsCU::prepare_all_host_hits(const EventOfHits& event_of_hits) {
-  all_host_hits.reserve(all_hits.global_capacity());
-
-  for (auto i = 0; i < m_n_layers; ++i) {
-    auto it1 = event_of_hits.m_layers_of_hits[i].m_hits;
-    auto it2 = event_of_hits.m_layers_of_hits[i].m_hits
-        + event_of_hits.m_layers_of_hits[i].m_capacity;
-    std::copy(it1, it2, &all_host_hits[all_hits.local_capacity() * i]);
-  }
-}
-
-
-void EventOfHitsCU::prepare_all_host_bins(const EventOfHits& event_of_hits) {
-  all_host_bins.reserve(all_phi_bin_infos.global_capacity());
-
-  for (int i = 0; i < event_of_hits.m_n_layers; i++) {
-    size_t offset_layer = i * all_phi_bin_infos.local_capacity();
-    for (auto j = 0; j < event_of_hits.m_layers_of_hits[i].m_nz; ++j) {
-      auto it1 = event_of_hits.m_layers_of_hits[i].m_phi_bin_infos[j].begin();
-      auto it2 = event_of_hits.m_layers_of_hits[i].m_phi_bin_infos[j].end();
-      size_t offset = offset_layer + j * Config::m_nphi;
-      std::copy(it1, it2, &all_host_bins[offset]);
-    }
-  }
-}
-
-
-void EventOfHitsCU::copyFromCPU(const EventOfHits& event_of_hits,
-                                const cudaStream_t &stream) {
-  Tracer tracer("copyEventOfHits", tracer_colors["PaleGreen"]);
-
-  prepare_all_host_hits(event_of_hits);
-  all_hits.copy_from_cpu(all_host_hits.data(), stream);
-
-  prepare_all_host_bins(event_of_hits);
-  all_phi_bin_infos.copy_from_cpu((PairIntsCU *)all_host_bins.data(), stream);
-
-  for (int i = 0; i < event_of_hits.m_n_layers; i++) {
-    m_layers_of_hits_alloc[i].copy_layer_values(event_of_hits.m_layers_of_hits[i]);
-  }
-  m_layers_of_hits.resize(m_n_layers);
-  m_layers_of_hits.copy_from_cpu(m_layers_of_hits_alloc.data(), stream);
-}
-
-
-void EventOfHitsCU::copyFromCPU(const std::vector<HitVec> &layerHits,
-                                const cudaStream_t &stream) {
-  all_host_hits.reserve(all_hits.global_capacity());
-
-  for (auto i = 0; i < m_n_layers; ++i) {
-    auto it1 = layerHits[i].begin();
-    auto it2 = layerHits[i].end();
-    std::copy(it1, it2, &all_host_hits[all_hits.local_capacity() * i]);
-  }
-  all_hits.copy_from_cpu(all_host_hits.data(), stream);
-
-  for (int i = 0; i < m_n_layers; ++i) {
-    m_layers_of_hits_alloc[i].m_hits.set_ptr(all_hits.get_ptr_to_part(i));
-  }
-  m_layers_of_hits.resize(m_n_layers);
-  m_layers_of_hits.copy_from_cpu(m_layers_of_hits_alloc.data(), stream);
-}
-
-
-// ============================================================================
-
-void EtaBinOfCandidatesCU::alloc_tracks(const int ntracks) {
-  m_real_size = ntracks;
-  m_fill_index = 0;
-
-  cudaMalloc((void**)&m_candidates, sizeof(Track)*m_real_size);
-  /*cudaCheckError();*/
-}
-
-
-void EtaBinOfCandidatesCU::free_tracks() {
-  cudaFree(m_candidates);
-  m_real_size = 0;
-  m_fill_index = 0;
-}
-
-
-void EtaBinOfCandidatesCU::copyFromCPU(const EtaBinOfCandidates &eta_bin, 
-                                       const cudaStream_t &stream) {
-  assert (eta_bin.m_fill_index < m_real_size); // or something
-  m_fill_index = eta_bin.m_fill_index;
-
-  cudaMemcpyAsync(m_candidates, &eta_bin.m_candidates[0],
-                  sizeof(Track)*m_fill_index, cudaMemcpyHostToDevice, stream);
-  /*cudaCheckError();*/
-}
-
-
-void EtaBinOfCandidatesCU::copyToCPU(EtaBinOfCandidates &eta_bin,
-                                     const cudaStream_t &stream) const {
-  assert (eta_bin.m_fill_index < m_real_size); // or something
-
-  cudaMemcpyAsync(&eta_bin.m_candidates[0], m_candidates,
-                  sizeof(Track)*m_fill_index, cudaMemcpyDeviceToHost, stream);
-  /*cudaCheckError();*/
-}
-
-// ============================================================================
-
-void EventOfCandidatesCU::allocGPU(const EventOfCandidates &event_of_cands) {
-  m_n_etabins = Config::nEtaBin;
-  // Allocate GPU array. 
-  // Members's address  of array's elements are in the GPU space
-  cudaMalloc((void**)&m_etabins_of_candidates, m_n_etabins*sizeof(EtaBinOfCandidatesCU));
-  /*cudaCheckError();*/
-  // Allocate CPU array. 
-  // Members's address  of array's elements are in the CPU space
-  // This allows to call allocate for each array's element.
-  m_etabins_of_candidates_alloc = new EtaBinOfCandidatesCU[m_n_etabins];
-  for (int i = 0; i < m_n_etabins; ++i) {
-    const EtaBinOfCandidates& h_etabin = event_of_cands.m_etabins_of_candidates[i];
-    m_etabins_of_candidates_alloc[i].alloc_tracks(h_etabin.m_real_size);
-  }
-  /*cudaCheckError();*/
-}
-
-
-void EventOfCandidatesCU::deallocGPU() {
-  for (int i = 0; i < m_n_etabins; ++i) {
-    /*cudaCheckError();*/
-    m_etabins_of_candidates_alloc[i].free_tracks();
-    /*cudaCheckError();*/
-  }
-  cudaFree(m_etabins_of_candidates);
-  /*cudaCheckError();*/
-  delete[] m_etabins_of_candidates_alloc;
-}
-
-
-void EventOfCandidatesCU::copyFromCPU(const EventOfCandidates& event_of_cands,
-                                      const cudaStream_t &stream) {
-  for (int i = 0; i < m_n_etabins; i++) {
-    m_etabins_of_candidates_alloc[i].copyFromCPU(event_of_cands.m_etabins_of_candidates[i], stream);
-  }
-  cudaCheckError();
-  cudaMemcpyAsync(m_etabins_of_candidates, m_etabins_of_candidates_alloc, 
-      m_n_etabins*sizeof(EtaBinOfCandidatesCU), 
-      cudaMemcpyHostToDevice, stream);
-  cudaCheckError();
-}
-
-
-void EventOfCandidatesCU::copyToCPU(EventOfCandidates& event_of_cands, 
-                                    const cudaStream_t &stream) const {
-  for (int i = 0; i < m_n_etabins; i++) {
-    m_etabins_of_candidates_alloc[i].copyToCPU(event_of_cands.m_etabins_of_candidates[i], stream);
-  }
-  /*cudaCheckError();*/
-  // We do not need to copy the array of pointers to EventOfCandidatesCU back
-}
-
-// ============================================================================
-
-void EtaBinOfCombCandidatesCU::allocate(const int nseed, const int factor)
-{
-  m_fill_index = 0;
-  m_nseed = nseed;
-
-  if (m_nseed_alloc < nseed * factor) {
-    m_nseed_alloc = nseed * factor;
-    m_real_size = m_nseed_alloc * Config::maxCandsPerSeed;
-  }
-  m_candidates.reserve(m_real_size);
-  m_ntracks_per_seed.reserve(m_nseed_alloc);
-}
-
-
-void EtaBinOfCombCandidatesCU::copyFromCPU(
-    const EtaBinOfCombCandidates& eta_bin, const cudaStream_t& stream)
-{
-  Tracer("etaBinFromCPU", tracer_colors["Chocolate"]);
-
-  assert (eta_bin.m_fill_index < m_real_size); // or something
-  m_fill_index = eta_bin.m_fill_index * Config::maxCandsPerSeed;
-
-  std::vector<int> track_per_seed_array (m_nseed);
-  std::vector<Track> tracks_tmp (m_real_size);
-
-  for (auto i = 0; i < eta_bin.m_fill_index; ++i)
-  {
-    // That is to be general, because ntracks should be one, for each seed
-    // when the building is launched.
-    int ntracks = std::min(Config::maxCandsPerSeed, 
-                           static_cast<int>(eta_bin.m_candidates[i].size()));
-
-    auto it1 = eta_bin.m_candidates[i].begin();
-    auto it2 = eta_bin.m_candidates[i].end();
-    std::move(it1, it2, &tracks_tmp[i*Config::maxCandsPerSeed]);
-
-    track_per_seed_array[i] = ntracks;
-  }
-  m_candidates.resize(m_real_size);
-  m_candidates.copy_from_cpu(tracks_tmp.data(), stream);
-
-  m_ntracks_per_seed.resize(m_nseed);
-  m_ntracks_per_seed.copy_from_cpu(track_per_seed_array.data(), stream);
-
-  cudaStreamSynchronize(stream);
-}
-
-
-void EtaBinOfCombCandidatesCU::copyToCPU(
-    EtaBinOfCombCandidates& eta_bin, const cudaStream_t& stream) const
-{
-  Tracer("etaBinToCPU", tracer_colors["Pink"]);
-
-  assert (eta_bin.m_fill_index < m_real_size); // or something
-
-  std::vector<int> ntracks (eta_bin.m_fill_index);
-  std::vector<Track> tracks_tmp (m_real_size);
-
-  m_ntracks_per_seed.copy_to_cpu(ntracks.data(), stream);
-  m_candidates.copy_to_cpu(tracks_tmp.data(), stream);
-  cudaStreamSynchronize(stream);
-
-  for (auto i = 0; i < eta_bin.m_fill_index; ++i)
-  {
-    auto it1 = &tracks_tmp[i*Config::maxCandsPerSeed];
-    auto it2 = &tracks_tmp[(i+1)*Config::maxCandsPerSeed];
-    std::move(it1, it2, eta_bin.m_candidates[i].begin());
-    eta_bin.m_candidates[i].resize(ntracks[i]);
-  }
-}
-
-// ============================================================================
-
-void EventOfCombCandidatesCU::allocate(
-    const EventOfCombCandidates &event_of_cands, const float factor)
-{
-  m_n_etabins = Config::nEtaBin;
-  m_etabins_of_comb_candidates.reserve_and_resize(m_n_etabins);
-  m_etabins_of_comb_candidates_alloc.resize(m_n_etabins);
-
-  for (int i = 0; i < m_n_etabins; ++i) {
-    const EtaBinOfCombCandidates& h_etabin = event_of_cands.m_etabins_of_comb_candidates[i];
-    m_etabins_of_comb_candidates_alloc[i].allocate(h_etabin.m_fill_index, factor);
-  }
-}
-
-
-void EventOfCombCandidatesCU::copyFromCPU(
-    const EventOfCombCandidates &event_of_cands,
-    const cudaStream_t &stream)
-{
-  Tracer tracer("eventCandsFromCPU", tracer_colors["Orange"]);
-  for (int i = 0; i < m_n_etabins; i++) {
-    m_etabins_of_comb_candidates_alloc[i].copyFromCPU(
-        event_of_cands.m_etabins_of_comb_candidates[i], stream);
-  }
-  m_etabins_of_comb_candidates.resize(m_n_etabins);  // useless but reassuring
-  m_etabins_of_comb_candidates.copy_from_cpu(m_etabins_of_comb_candidates_alloc.data(), stream);
-}
-
-
-void EventOfCombCandidatesCU::copyToCPU(
-    EventOfCombCandidates &event_of_cands,
-    const cudaStream_t &stream) const
-{
-   Tracer tracer("eventCandsToCPU", tracer_colors["Red"]);
-
-  for (int i = 0; i < m_n_etabins; i++) {
-    m_etabins_of_comb_candidates_alloc[i].copyToCPU(event_of_cands.m_etabins_of_comb_candidates[i], stream);
-  }
-}
diff --git a/mkFit/HitStructuresCU.h b/mkFit/HitStructuresCU.h
deleted file mode 100644
index cd457c66..00000000
--- a/mkFit/HitStructuresCU.h
+++ /dev/null
@@ -1,211 +0,0 @@
-#ifndef _HIT_STRUCTURES_H_
-#define _HIT_STRUCTURES_H_
-
-#include "Config.h"
-#include "HitStructures.h"
-
-#include "device_vector.h"
-#include "device_bundle.h"
-#include "device_array_view.h"
-#include "gpu_utils.h"
-
-namespace mkfit {
-
-template <typename T1, typename T2>
-struct PairCU {
-  T1 first;
-  T2 second;
-};
-
-using PairIntsCU = PairCU<int, int>;
-
-class LayerOfHitsCU {
- public:
-  DeviceArrayView<Hit> m_hits;
-  DeviceArrayView<PairIntsCU> m_phi_bin_infos;
-
-  float m_zmin = 0, m_zmax, m_fz = 0;
-  int m_nz = 0;
-  int m_capacity = 0;
-  
-  int m_capacity_alloc = 0;
-  int m_nz_alloc = 0;
-  int m_nphi_alloc = 0;
- 
-  //fixme, these are copies of the ones above, need to merge with a more generic name
-  float m_rmin, m_rmax, m_fr;
-  int   m_nr = 0;
-
-  // This could be a parameter, layer dependent.
-  //static constexpr int   m_nphi = 1024;
-  // Testing bin filling
-  //  static constexpr int   m_nphi = 16;
-  static constexpr float m_fphi = Config::m_nphi / Config::TwoPI;
-  static constexpr int   m_phi_mask = 0x3ff;
-
-  // As above
-  //static constexpr float m_max_dz   = 1;
-  //static constexpr float m_max_dphi = 0.02;
-
-  void copy_layer_values(const LayerOfHits &layer);
-
-#ifdef __CUDACC__
-  __device__
-#endif
-  int GetZBin(const float z)    const { return (z - m_zmin) * m_fz; }
-    
-#ifdef __CUDACC__
-  __device__
-#endif
-  int GetZBinChecked(float z) const {
-    int zb = GetZBin(z);
-    if (zb < 0) { 
-      zb = 0;
-    } else {
-      if (zb >= m_nz) zb = m_nz - 1;
-    }
-    return zb; 
-  }
-
-  // if you don't pass phi in (-pi, +pi), mask away the upper bits using m_phi_mask
-#ifdef __CUDACC__
-  __device__
-#endif
-  int   GetPhiBin(float phi) const {
-    return floorf(m_fphi * (phi + Config::PI)); 
-  }
-};
-
-
-class EventOfHitsCU 
-{
-public:
-  DeviceVector<LayerOfHitsCU> m_layers_of_hits;
-  int            m_n_layers = 0;
-
-  // The following array is to be able to allocate GPU arrays from
-  // the CPU and then copy the address of the GPU ptr to the GPU structure
-  std::vector<LayerOfHitsCU> m_layers_of_hits_alloc;
-  
-  EventOfHitsCU() : m_n_layers{} {};
-
-  void reserve_layers(const EventOfHits &event_of_hits, float factor=1.f);
-  void reserve_layers(const std::vector<HitVec> &layerHits);  // fittest
-  void copyFromCPU(const EventOfHits& event_of_hits,
-                   const cudaStream_t &stream);
-  void copyFromCPU(const std::vector<HitVec> &layerHits,
-                   const cudaStream_t &stream);
-
-private:
-  void prepare_all_host_hits(const EventOfHits &event_of_hits);
-  void prepare_all_host_bins(const EventOfHits &event_of_hits);
-
-  void reserve_all_hits(const EventOfHits &event_of_hits, float factor);
-  void reserve_all_hits(const std::vector<HitVec> &layers_of_hits, float factor);
-  void reserve_all_phi_bins(const EventOfHits &event_of_hits, float factor);
-
-  void set_layer_hit_views(const EventOfHits& event_of_hits, float factor);
-  void set_layer_bin_views(const EventOfHits& event_of_hits, float factor);
-
-  // Large 'arrays' to help with data transfers -- Fortran 77's style.
-  // The whole trick is to flatten the memory layout to minimize the number of transfers
-  // Where we move host data to be consecutive (with padding)
-  std::vector<PhiBinInfo_t> all_host_bins;
-  std::vector<Hit> all_host_hits;
-  // Where we store device data. Layers are pointers to parts of the array
-  DeviceBundle<Hit> all_hits;
-  DeviceBundle<PairIntsCU> all_phi_bin_infos;
-};
-
-// ============================================================================
-
-class EtaBinOfCandidatesCU 
-{
-public:
-  Track *m_candidates;
-  
-  int m_real_size;
-  int m_fill_index;
-
-  void alloc_tracks(const int ntracks);
-  void free_tracks();
-
-  void copyFromCPU(const EtaBinOfCandidates &eta_bin,
-                   const cudaStream_t &stream);
-  void copyToCPU(EtaBinOfCandidates &eta_bin,
-                  const cudaStream_t &stream) const;
-};
-
-
-class EventOfCandidatesCU 
-{
-public:
-  int m_n_etabins;
-  EtaBinOfCandidatesCU *m_etabins_of_candidates;  // device array
-  
-  // Host array. For allocation and transfer purposes
-  EtaBinOfCandidatesCU *m_etabins_of_candidates_alloc;
-
-  EventOfCandidatesCU() : m_n_etabins{},
-                          m_etabins_of_candidates{nullptr},
-                          m_etabins_of_candidates_alloc{nullptr} {}
-
-  void allocGPU(const EventOfCandidates &event_of_cands);
-  void deallocGPU();
-  void copyFromCPU(const EventOfCandidates &event_of_cands,
-                   const cudaStream_t &stream=0);
-  void copyToCPU(EventOfCandidates &event_of_cands,
-                 const cudaStream_t &stream=0) const;
-};
-
-// ============================================================================
-
-class EtaBinOfCombCandidatesCU
-{
-public:
-  // CPU code: std::vector<std::vector<Track> > m_candidates;
-  // GPU code: Array [maxCandsPerSeed*numSeedPerEtaBin]
-  //      -> We don't actually care about numSeedPerEtaBin, it's all known
-  //      from the CPU side, use EtaBinOfCombCandidates.m_fill_index
-  //      or EtaBinOfCombCandidates.m_real_size
-  // More trouble to come: We cannot do the same as for EtaBinOfCandidatesCU
-  // for copying from host mem to dev. mem. The non contiguous host mem
-  // for m_candidates forces a loop over the seeds.
-  DeviceVector<Track> m_candidates;
-  DeviceVector<int> m_ntracks_per_seed;  // array of m_candidates[i].size()
-
-  int m_real_size = 0;
-  int m_fill_index = 0;
-  int m_nseed = 0;
-  int m_nseed_alloc = 0;
-
-  void allocate(const int nseed, const int factor=1.f);
-  void copyFromCPU(const EtaBinOfCombCandidates& eta_bin,
-                   const cudaStream_t& stream);
-  void copyToCPU(EtaBinOfCombCandidates& eta_bin,
-                 const cudaStream_t& stream) const;
-};
-
-
-// TODO: The comb and non-comb version are quite similar: refactor?
-class EventOfCombCandidatesCU
-{
-public:
-  DeviceVector<EtaBinOfCombCandidatesCU> m_etabins_of_comb_candidates;
-  int m_n_etabins = 0;
-
-  // Host array. For allocation and transfer purposes
-  std::vector<EtaBinOfCombCandidatesCU> m_etabins_of_comb_candidates_alloc;
-
-  EventOfCombCandidatesCU() : m_n_etabins{} {};
-
-  void allocate(const EventOfCombCandidates &event_of_cands, const float factor=1.f);
-  void copyFromCPU(const EventOfCombCandidates &event_of_cands,
-                   const cudaStream_t &stream);
-  void copyToCPU(EventOfCombCandidates &event_of_cands,
-                 const cudaStream_t &stream) const;
-};
-
-// end namespace mkfit
-#endif  // _HIT_STRUCTURES_H_
-
diff --git a/mkFit/KalmanUtilsMPlex.h b/mkFit/KalmanUtilsMPlex.h
index 30301127..748f8bb5 100644
--- a/mkFit/KalmanUtilsMPlex.h
+++ b/mkFit/KalmanUtilsMPlex.h
@@ -4,10 +4,6 @@
 #include "Track.h"
 #include "Matrix.h"
 
-#ifdef USE_CUDA
-#include "FitterCU.h"
-#endif
-
 namespace mkfit {
 
 //------------------------------------------------------------------------------
diff --git a/mkFit/KalmanUtilsMPlex.icc b/mkFit/KalmanUtilsMPlex.icc
index b9ccd70f..87a324ad 100644
--- a/mkFit/KalmanUtilsMPlex.icc
+++ b/mkFit/KalmanUtilsMPlex.icc
@@ -3,9 +3,6 @@
 ///////////////////////////////////////////////////////////////////////////////
 
 template <typename TfLH, typename TfQF1, typename TfQF2, typename TfLL>
-#ifdef __CUDACC__
-__device__
-#endif
 static inline void KHMult_imp(const TfLH& a, const TfQF1& b00, const TfQF2&  b01, TfLL& c,
                               const int nmin, const int nmax)
 {
@@ -56,21 +53,13 @@ static inline void KHMult_imp(const TfLH& a, const TfQF1& b00, const TfQF2&  b01
 ///////////////////////////////////////////////////////////////////////////////
 
 template <typename TfLV1, typename TfLV2, typename TfLL>
-#ifdef __CUDACC__
-__device__
-#endif
 static inline void ConvertToCCS_imp(const TfLV1& a, TfLV2& b, TfLL& c,
                                     const int nmin, const int nmax)
 {
 #pragma omp simd
   for (int n = nmin; n < nmax; ++n)
   {
-    // TODO Check why getHypot and getHypot_fn have to be different
-#ifdef __CUDACC__
-    const float pt = getHypot_fn(a(n, 0,  3), a(n, 0,  4));
-#else
     const float pt = getHypot(a(n, 0,  3), a(n, 0,  4));
-#endif
     const float p2 = pt*pt + a(n, 0,  5)*a(n, 0,  5);
     //
     b(n, 0,  0) = a(n, 0,  0);
@@ -124,9 +113,6 @@ static inline void ConvertToCCS_imp(const TfLV1& a, TfLV2& b, TfLL& c,
 ///////////////////////////////////////////////////////////////////////////////
 
 template <typename TfLV1, typename TfLV2, typename TfLL>
-#ifdef __CUDACC__
-__device__
-#endif
 static inline void ConvertToCartesian_imp(const TfLV1& a, TfLV2& b, TfLL& c,
                                           const int nmin, const int nmax)
 {
@@ -189,9 +175,6 @@ static inline void ConvertToCartesian_imp(const TfLV1& a, TfLV2& b, TfLL& c,
 ///////////////////////////////////////////////////////////////////////////////
 
 template <typename TfLH, typename TfLV1, typename Tf2V, typename TfLV2>
-#ifdef __CUDACC__
-__device__
-#endif
 static inline void MultResidualsAdd_imp(const TfLH& a, const TfLV1& b,
                                        const Tf2V& c, TfLV2& d,
                                        const int nmin, const int nmax)
@@ -213,9 +196,6 @@ static inline void MultResidualsAdd_imp(const TfLH& a, const TfLV1& b,
 ///////////////////////////////////////////////////////////////////////////////
 
 template <typename TfLS, typename TfHS1, typename TfHS2>
-#ifdef __CUDACC_
-__device__
-#endif
 static inline void AddIntoUpperLeft3x3_imp(const TfLS& A, const TfHS1& B, TfHS2& C,
                                            const int aN, const int bN, const int cN,
                                            const int nmin, const int nmax)
@@ -228,9 +208,6 @@ static inline void AddIntoUpperLeft3x3_imp(const TfLS& A, const TfHS1& B, TfHS2&
 ///////////////////////////////////////////////////////////////////////////////
 
 template <typename TfQF1, typename TfQF2, typename TfHV, typename Tf2V>
-#ifdef __CUDACC__
-__device__
-#endif
 static inline void RotateResidulsOnTangentPlane_impl(const TfQF1& r00,
     const TfQF2& r01, const TfHV& a, Tf2V& b, const int nmin, const int nmax)
 {
diff --git a/mkFit/Makefile b/mkFit/Makefile
index 8d607316..87b48808 100644
--- a/mkFit/Makefile
+++ b/mkFit/Makefile
@@ -51,7 +51,6 @@ echo:
 	@echo "CXXFLAGS = ${CXXFLAGS}"
 	@echo "LDFLAGS  = ${LDFLAGS}"
 	@echo "EXES     = ${EXES}"
-	@echo "CU_OBJ   = ${CU_OBJS}"
 
 
 ################################################################
@@ -62,30 +61,7 @@ MKFHDRS := $(wildcard *.h)
 MKFOBJS := $(MKFSRCS:.cc=.o)
 MKFDEPS := $(MKFSRCS:.cc=.d)
 
-ifdef USE_CUDA
-CU_SRCS := $(wildcard *.cu)
-CU_OBJS := $(CU_SRCS:.cu=.o)
-
-LDFLAGS_CU := -lcudart
-LDFLAGS_CU += -lnvToolsExt
-
-# To share __device__ function across several source files
-# 1) compile with --device-c (works as -c should be)
-# 2) Create some kind of dictionary with all the .cu.o
-# 3) The dictionary AND the original .o files should be used
-# -- Works only for CUDA_VERSION >= 5
-# TODO: Clean the "-I.. -std=c++11"
-${CU_OBJS}: %.o: %.cu
-	${NV} --device-c -o $@ $< -I.. -std=c++11 -DUSE_MATRIPLEX -Wno-deprecated-gpu-targets -lineinfo 
-
-# -g -G
-
-CU_LINK := cu_link.o
-${CU_LINK}: ${CU_OBJS}
-	${NV} --device-link $^ --output-file $@
-endif
-
-ALLOBJS := ${MKFOBJS} ${CU_OBJS} ${CU_LINK}
+ALLOBJS := ${MKFOBJS}
 LIBOBJS := $(filter-out mkFit.o, ${ALLOBJS})
 
 ${MKFDEPS}: auto-genmplex
@@ -95,10 +71,10 @@ include ${MKFDEPS}
 endif
 
 mkFit: ${ALLOBJS}
-	${CXX} ${CXXFLAGS} ${VEC_HOST} ${LDFLAGS} ${ALLOBJS} -o $@ ${LDFLAGS_HOST} ${LDFLAGS_CU} -L../lib -lMicCore -Wl,-rpath,../lib,-rpath,./lib
+	${CXX} ${CXXFLAGS} ${VEC_HOST} ${LDFLAGS} ${ALLOBJS} -o $@ ${LDFLAGS_HOST} -L../lib -lMicCore -Wl,-rpath,../lib,-rpath,./lib
 
 ${LIB_MKFIT}: ${LIBOBJS}
-	${CXX} ${CXXFLAGS} ${VEC_HOST} ${LIBOBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS_CU} ${LDFLAGS}
+	${CXX} ${CXXFLAGS} ${VEC_HOST} ${LIBOBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS}
 
 ifdef WITH_ROOT
 fittestMPlex.o : CPPFLAGS += $(shell root-config --cflags)
@@ -110,11 +86,6 @@ TTree.h:
 	echo "Using dummy rule for TTree.h"
 endif
 
-ifdef CUBROOT
-cub/util_debug.cuh:
-	echo "Using dummy rule for cub/util_debug.cuh"
-endif
-
 ${MKFOBJS}: %.o: %.cc %.d
 	${CXX} ${CPPFLAGS} ${CXXFLAGS} ${VEC_HOST} -c -o $@ $<
 
diff --git a/mkFit/MkBuilder.h b/mkFit/MkBuilder.h
index 5339c638..98fdc32e 100644
--- a/mkFit/MkBuilder.h
+++ b/mkFit/MkBuilder.h
@@ -210,10 +210,6 @@ class MkBuilder
   void BackwardFit();
   void fit_cands(MkFinder *mkfndr, int start_cand, int end_cand, int region);
 
-#ifdef USE_CUDA
-  const Event* get_event() const { return m_event; }
-  const EventOfHits& get_event_of_hits() const { return m_event_of_hits; }
-#endif
 };
 
 } // end namespace mkfit
diff --git a/mkFit/MkFitter.cc b/mkFit/MkFitter.cc
index 3d4c14bd..a50be1ab 100644
--- a/mkFit/MkFitter.cc
+++ b/mkFit/MkFitter.cc
@@ -4,10 +4,6 @@
 #include "ConformalUtilsMPlex.h"
 #include "MatriplexPackers.h"
 
-#ifdef USE_CUDA
-//#include "FitterCU.h"
-#endif
-
 //#define DEBUG
 #include "Debug.h"
 
@@ -68,15 +64,6 @@ void MkFitter::InputTracksAndHits(const std::vector<Track>&  tracks,
 
   int itrack = 0;
 
-// FIXME: uncomment when track building is ported to GPU.
-//#ifdef USE_CUDA
-  // This openmp loop brings some performances when using
-  // a single thread to fit all events.
-  // However, it is more advantageous to use the threads to
-  // parallelize over Events.
-//  omp_set_num_threads(Config::numThreadsReorg);
-//#pragma omp parallel for private(itrack)
-//#endif
   for (int i = beg; i < end; ++i, ++itrack)
   {
     const Track &trk = tracks[i];
@@ -89,11 +76,6 @@ void MkFitter::InputTracksAndHits(const std::vector<Track>&  tracks,
     Label(itrack, 0, 0) = trk.label();
 
 // CopyIn seems fast enough, but indirections are quite slow.
-// For GPU computations, it has been moved in between kernels
-// in an attempt to overlap CPU and GPU computations.
-// FIXME: uncomment when track building is ported to GPU.
-#if 1
-//#ifndef USE_CUDA
     for (int hi = 0; hi < Nhits; ++hi)
     {
       HoTArr[hi](itrack, 0, 0) = trk.getHitOnTrack(hi);
@@ -105,7 +87,6 @@ void MkFitter::InputTracksAndHits(const std::vector<Track>&  tracks,
       msErr[hi].CopyIn(itrack, hit.errArray());
       msPar[hi].CopyIn(itrack, hit.posArray());
     }
-#endif
   }
 }
 
@@ -119,14 +100,7 @@ void MkFitter::InputTracksAndHits(const std::vector<Track>&  tracks,
   // assert(end - beg == NN);
 
   int itrack;
-//#ifdef USE_CUDA
-  // This openmp loop brings some performances when using
-  // a single thread to fit all events.
-  // However, it is more advantageous to use the threads to
-  // parallelize over Events.
-//  omp_set_num_threads(Config::numThreadsReorg);
-//#pragma omp parallel for private(itrack)
-//#endif
+
   for (int i = beg; i < end; ++i) {
     itrack = i - beg;
     const Track &trk = tracks[i];
@@ -140,10 +114,6 @@ void MkFitter::InputTracksAndHits(const std::vector<Track>&  tracks,
     Chi2(itrack, 0, 0) = trk.chi2();
 
 // CopyIn seems fast enough, but indirections are quite slow.
-// For GPU computations, it has been moved in between kernels
-// in an attempt to overlap CPU and GPU computations.
-//#ifndef USE_CUDA
-#if 1
     for (int hi = 0; hi < Nhits; ++hi)
     {
       const int hidx = trk.getHitIdx(hi);
@@ -155,7 +125,6 @@ void MkFitter::InputTracksAndHits(const std::vector<Track>&  tracks,
 
       HoTArr[hi](itrack, 0, 0) = trk.getHitOnTrack(hi);
     }
-#endif
   }
 }
 
@@ -170,14 +139,6 @@ void MkFitter::SlurpInTracksAndHits(const std::vector<Track>&  tracks,
 
   MatriplexTrackPacker mtp(tracks[beg]);
 
-//#ifdef USE_CUDA
-  // This openmp loop brings some performances when using
-  // a single thread to fit all events.
-  // However, it is more advantageous to use the threads to
-  // parallelize over Events.
-//  omp_set_num_threads(Config::numThreadsReorg);
-//#pragma omp parallel for private(itrack)
-//#endif
   for (int i = beg; i < end; ++i)
   {
     int itrack = i - beg;
@@ -194,11 +155,6 @@ void MkFitter::SlurpInTracksAndHits(const std::vector<Track>&  tracks,
   mtp.Pack(Err[iC], Par[iC]);
 
 // CopyIn seems fast enough, but indirections are quite slow.
-// For GPU computations, it has been moved in between kernels
-// in an attempt to overlap CPU and GPU computations.
-//#ifndef USE_CUDA
-#if 1
-
   for (int hi = 0; hi < Nhits; ++hi)
   {
     MatriplexHitPacker mhp(layerHits[hi][0]);
@@ -215,7 +171,6 @@ void MkFitter::SlurpInTracksAndHits(const std::vector<Track>&  tracks,
 
     mhp.Pack(msErr[hi], msPar[hi]);
   }
-#endif
 }
 
 void MkFitter::InputTracksAndHitIdx(const std::vector<Track>& tracks,
@@ -310,10 +265,6 @@ void MkFitter::InputSeedsTracksAndHits(const std::vector<Track>&  seeds,
     const Track &trk = tracks[see.label()];
 
 // CopyIn seems fast enough, but indirections are quite slow.
-// For GPU computations, it has been moved in between kernels
-// in an attempt to overlap CPU and GPU computations.
-//#ifndef USE_CUDA
-#if 1
     for (int hi = 0; hi < Nhits; ++hi)
     {
       HoTArr[hi](itrack, 0, 0) = trk.getHitOnTrack(hi);
@@ -325,7 +276,6 @@ void MkFitter::InputSeedsTracksAndHits(const std::vector<Track>&  seeds,
       msErr[hi].CopyIn(itrack, hit.errArray());
       msPar[hi].CopyIn(itrack, hit.posArray());
     }
-#endif
   }
 }
 
diff --git a/mkFit/MkFitter.h b/mkFit/MkFitter.h
index e234bb3d..80643a2b 100644
--- a/mkFit/MkFitter.h
+++ b/mkFit/MkFitter.h
@@ -8,11 +8,6 @@
 
 #include "HitStructures.h"
 
-#if USE_CUDA
-#include "FitterCU.h"
-#include "HitStructuresCU.h"
-#endif
-
 //#define DEBUG 1
 
 //#define USE_BOHS
diff --git a/mkFit/PropagationMPlex.icc b/mkFit/PropagationMPlex.icc
index 5571df88..b7ac5f9b 100644
--- a/mkFit/PropagationMPlex.icc
+++ b/mkFit/PropagationMPlex.icc
@@ -3,9 +3,6 @@
 ///////////////////////////////////////////////////////////////////////////////
 
 template<typename Tf, typename Ti, typename TfLL1, typename Tf11, typename TfLLL>
-#ifdef __CUDACC__
-__device__
-#endif
 static inline void helixAtRFromIterativeCCS_impl(const    Tf& __restrict__ inPar,
                                                  const    Ti& __restrict__ inChg,
                                                  const  Tf11& __restrict__ msRad,
diff --git a/mkFit/accessors_cu.h b/mkFit/accessors_cu.h
deleted file mode 100644
index f2bb9d9b..00000000
--- a/mkFit/accessors_cu.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef ACCESSORS_CU_H
-#define ACCESSORS_CU_H 
-
-template <>
-__device__ float* SVector3::ArrayCU() {
-  return fArray; 
-}
-
-template <>
-__device__ float* SVector6::ArrayCU() {
-  return fArray; 
-}
-
-template <>
-__device__ float* ROOT::Math::MatRepSym<float,6>::ArrayCU() {
-  return fArray;
-}
-
-template <>
-__device__ float* SMatrixSym66::ArrayCU() {
-  return fRep.ArrayCU(); 
-}
-
-__device__ float *Hit::posArrayCU() {
-  return state_.pos_.ArrayCU();
-}
-
-__device__ float *Hit::errArrayCU() {
-  return state_.err_.ArrayCU();
-}
-
-__device__ float *Track::posArrayCU() {
-  return state_.parameters.ArrayCU();
-}
-
-__device__ float *Track::errArrayCU() {
-  return state_.errors.ArrayCU();
-}
-
-#endif /* ifndef ACCESSORS_CU_H */
diff --git a/mkFit/array_algorithms_cu.h b/mkFit/array_algorithms_cu.h
deleted file mode 100644
index 68dde020..00000000
--- a/mkFit/array_algorithms_cu.h
+++ /dev/null
@@ -1,86 +0,0 @@
-#ifndef ARRAY_ALGORITHMS_CU_H
-#define ARRAY_ALGORITHMS_CU_H 
-
-#include <cub/cub.cuh>
-//#include "atomic_utils.h"
-#include "gpu_utils.h"
-
-namespace mkfit {
-
-template <typename T,
-          int BLOCK_THREADS,
-          int ITEMS_PER_THREAD,
-          cub::BlockReduceAlgorithm ALGORITHM>
-__device__ void reduceMax_fn(const T *d_in, const int in_size, T *d_max) {
-  // Specialize BlockReduce type for our thread block
-  typedef cub::BlockReduce<T, BLOCK_THREADS, ALGORITHM> BlockReduceT;
-  // Shared memory
-  __shared__ typename BlockReduceT::TempStorage temp_storage;
-
-  for (int i = threadIdx.x + blockIdx.x*blockDim.x;
-           i < in_size;
-           i += blockDim.x * gridDim.x) {
-    // Per-thread tile data
-    T data[ITEMS_PER_THREAD];
-    int block_offset = i - threadIdx.x;
-    cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, 
-                                          d_in + block_offset, //blockIdx.x*blockDim.x,
-                                          data);
-    // Compute sum for a single thread block
-    T aggregate = BlockReduceT(temp_storage).Reduce(data, cub::Max());
-
-    // FIXME: Is reduction over block enough, or do we need device-wise reduction
-    //        CPU code reduces on NN (typically 8 or 16) values. So block-wise should
-    //        be good enough.
-    //        Device-wise reductions with atomics are performance killers
-    //if (threadIdx.x == 0) {
-      //AtomicMax(d_max, aggregate);
-    //}
-    *d_max = aggregate;
-  }
-}
-
-template <typename T,
-          int BLOCK_THREADS,
-          int ITEMS_PER_THREAD,
-          cub::BlockReduceAlgorithm ALGORITHM>
-__global__ void reduceMax_kernel(T *d_in, int in_size, T *d_max) {
-  reduceMax_fn<T, BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>(d_in, in_size, d_max);
-}
-
-template <typename T, 
-          int BLOCK_THREADS,
-          int ITEMS_PER_THREAD>
-void max_element_wrapper(T *d_v, int num_elems, T *d_max) {
-  dim3 block (BLOCK_THREADS, 1, 1);
-  dim3 grid (std::min(max_blocks_x, (num_elems-1)/BLOCK_THREADS + 1), 1, 1) ;
-  reduceMax_kernel<T, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_REDUCE_RAKING>
-       <<< grid, block >>>
-       (d_v, num_elems, d_max);
-}
-
-template <typename T,
-          int BLOCK_THREADS,
-          int ITEMS_PER_THREAD,
-          cub::BlockReduceAlgorithm ALGORITHM>
-__device__ void reduceMaxPartial_fn(const T *d_in, const int start_idx,
-    const int in_size, T *d_max) {
-  // Specialize BlockReduce type for our thread block
-  typedef cub::BlockReduce<T, BLOCK_THREADS, ALGORITHM> BlockReduceT;
-  // Shared memory
-  __shared__ typename BlockReduceT::TempStorage temp_storage;
-
-  int i = threadIdx.x;
-  if (i < in_size) {
-    // Per-thread tile data
-    T data[ITEMS_PER_THREAD];
-    cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, 
-                                          d_in + start_idx,
-                                          data);
-    // Compute sum for a single thread block
-    T aggregate = BlockReduceT(temp_storage).Reduce(data, cub::Max());
-    *d_max = aggregate;
-  }
-}
-} // end namespace mkfit
-#endif /* ifndef ARRAY_ALGORITHMS_CU_H */
diff --git a/mkFit/atomic_utils.h b/mkFit/atomic_utils.h
deleted file mode 100644
index c5f82d99..00000000
--- a/mkFit/atomic_utils.h
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef ATOMIC_UTILS_H
-#define ATOMIC_UTILS_H
-
-/* Copyright (c) 2012, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *  * Neither the name of NVIDIA CORPORATION nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-// AtomicMAX code from:
-// https://github.com/parallel-forall/code-samples/blob/master/posts/cuda-aware-mpi-example/src/Device.cu
-
-#define uint64 unsigned long long
-
-  template <typename T>
-__device__ void AtomicMax(T * const address, const T value)
-{
-  atomicMax(address, value);
-}
-
-/**
- * @brief Compute the maximum of 2 single-precision floating point values using an atomic operation
- *
- * @param[in]addressThe address of the reference value which might get updated with the maximum
- * @param[in]valueThe value that is compared to the reference in order to determine the maximum
- */
-  template <>
-__device__ void AtomicMax(float * const address, const float value)
-{
-  if (* address >= value)
-  {
-    return;
-  }
-
-  int * const address_as_i = (int *)address;
-  int old = * address_as_i, assumed;
-
-  do 
-  {
-    assumed = old;
-    if (__int_as_float(assumed) >= value)
-    {
-      break;
-    }
-    // The value stored at address_as_i might have changed since it has been loaded
-    // into old and into assumed.
-    old = atomicCAS(address_as_i, assumed, __float_as_int(value));
-  } while (assumed != old);  // the other threads did not change address_as_i, so the
-                             // atomicCAS return action makes sense.
-}
-
-/**
- * @brief Compute the maximum of 2 double-precision floating point values using an atomic operation
- *
- * @param[in]addressThe address of the reference value which might get updated with the maximum
- * @param[in]valueThe value that is compared to the reference in order to determine the maximum
- */
-  template <>
-__device__ void AtomicMax(double * const address, const double value)
-{
-  if (* address >= value)
-  {
-    return;
-  }
-
-  uint64 * const address_as_i = (uint64 *)address;
-  uint64 old = * address_as_i, assumed;
-
-  do 
-  {
-    assumed = old;
-    if (__longlong_as_double(assumed) >= value)
-    {
-      break;
-    }
-
-    old = atomicCAS(address_as_i, assumed, __double_as_longlong(value));
-  } while (assumed != old);
-}
-
-#endif /* ifndef ATOMIC_UTILS_H */
diff --git a/mkFit/best_hit_kernels.cu b/mkFit/best_hit_kernels.cu
deleted file mode 100644
index 9d98aa6b..00000000
--- a/mkFit/best_hit_kernels.cu
+++ /dev/null
@@ -1,298 +0,0 @@
-#include "best_hit_kernels.h"
-
-#include "computeChi2_kernels.h"
-#include "index_selection_kernels.h"
-#include "reorganize_gplex.h"
-#include "array_algorithms_cu.h"
-#include "kalmanUpdater_kernels.h"
-#include "propagation_kernels.h"
-
-#include <thrust/device_vector.h>
-#include <thrust/fill.h>
-#include <thrust/reduce.h>
-#include <thrust/functional.h>
-
-#define BLOCK_SIZE_X 256
-
-
-__device__ void getNewBestHitChi2_fn(
-    const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr,
-    const float *outChi2, float &minChi2,
-    int &bestHit, const int hit_cnt, const int N) {
-  int itrack = threadIdx.x + blockDim.x*blockIdx.x;
-
-  if (itrack < N) {
-    if (hit_cnt < XHitSize[itrack]) {
-      float chi2 = fabs(outChi2[itrack]);
-      if (chi2 < minChi2) {
-        minChi2 = chi2;
-        bestHit = XHitArr(itrack, hit_cnt, 0);
-      }
-    }
-  }
-}
-
-
-__global__ void getNewBestHitChi2_kernel(
-    const GPlexQI XHitSize, const GPlexHitIdx XHitArr,
-    const float *outChi2, float *minChi2,
-    int *bestHit, const int hit_cnt, const int N) {
-  int itrack = threadIdx.x + blockDim.x*blockIdx.x;
-  if (itrack < N) {
-    getNewBestHitChi2_fn(XHitSize, XHitArr, outChi2, 
-        minChi2[itrack], bestHit[itrack], hit_cnt, N);
-  }
-}
-
-
-void getNewBestHitChi2_wrapper(const cudaStream_t &stream,
-    const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr,
-    const GPlexQF &outChi2, float *minChi2, int *bestHit, 
-    const int hit_cnt, const int N) {
-  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
-                       max_blocks_x);
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-  getNewBestHitChi2_kernel <<< grid, block, 0, stream >>>
-    (XHitSize, XHitArr, outChi2.ptr, minChi2, bestHit, hit_cnt, N);
-}
-
-
-void fill_array_cu(float *array, const int size, const float value) {
-  thrust::device_ptr<float> d_ptr(array);
-  thrust::fill(d_ptr, d_ptr + size, value);
-}
-
-
-__device__ void updateTracksWithBestHit_fn(Hit *hits, 
-    const float minChi2, const int bestHit,
-    GPlexHS &msErr, GPlexHV &msPar, const GPlexLV &propPar, 
-    GPlexQF &Chi2, GPlexQI &HitsIdx, const int N) {
-  int itrack = threadIdx.x + blockDim.x*blockIdx.x;
-  if (itrack < N) {
-    if (bestHit >= 0)
-    {
-      Hit   &hit  = hits[ bestHit ];
-      const float &chi2_local = minChi2;
-	  
-      for (int i = 0; i < msErr.kSize; ++i) {
-        msErr[itrack + i*msErr.stride] = hit.errArrayCU()[i];
-      }
-      for (int i = 0; i < msPar.kSize; ++i) {
-        msPar(itrack, i, 0) = hit.posArrayCU()[i];
-      }
-      Chi2[itrack] += chi2_local;
-      HitsIdx[itrack] = bestHit;
-    }
-    else
-    {
-      msErr[itrack+ 0*msErr.stride] = 666;
-      msErr[itrack+ 1*msErr.stride] = 0;
-      msErr[itrack+ 2*msErr.stride] = 666;
-      msErr[itrack+ 3*msErr.stride] = 0;
-      msErr[itrack+ 4*msErr.stride] = 0;
-      msErr[itrack+ 5*msErr.stride] = 666;
-
-      for (int i = 0; i < msPar.kSize; ++i) {
-        msPar(itrack, i, 0) = propPar(itrack, i, 0);
-      }
-      HitsIdx[itrack] = -1;
-      // Don't update chi2
-    }
-  }
-}
-
-
-__global__ void updateTracksWithBestHit_kernel(Hit *hits, 
-    const float *minChi2, const int *bestHit,
-    GPlexHS msErr, GPlexHV msPar, const GPlexLV propPar, 
-    GPlexQF Chi2, GPlexQI HitsIdx, const int N) {
-  int itrack = threadIdx.x + blockDim.x*blockIdx.x;
-  if (itrack < N) {
-    updateTracksWithBestHit_fn
-        (hits, minChi2[itrack], bestHit[itrack],
-         msErr, msPar, propPar, Chi2, HitsIdx, N);
-  }
-}
-
-
-void updateTracksWithBestHit_wrapper(const cudaStream_t &stream,
-    LayerOfHitsCU &layer, const float *minChi2, const int *best_hit, 
-    GPlexHS &msErr, GPlexHV &msPar, const GPlexLV &propPar,
-    GPlexQF &Chi2, GPlexQI &HitsIdx, const int N) {
-  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
-                       max_blocks_x);
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-  updateTracksWithBestHit_kernel <<< grid, block, 0, stream >>>
-      (layer.m_hits.data(), minChi2, best_hit, msErr, msPar, propPar, Chi2, HitsIdx, N);
-}
-
-
-int getMaxNumHits_wrapper(const GPlexQI d_XHitSize, const int N) {
-  thrust::device_ptr<int> d_ptr(d_XHitSize.ptr);
-  int maxSize=  thrust::reduce(d_ptr, d_ptr + N, -1, thrust::maximum<int>());
-  maxSize = std::min(maxSize, Config::maxHitsConsidered);
-
-  return maxSize;
-}
-
-
-__device__ void bestHit_fn(
-    Hit *hits, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, 
-    const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar,
-    const GPlexLV &propPar, GPlexQF &outChi2,
-    GPlexQF &Chi2, GPlexQI &HitsIdx,
-    const int maxSize, const int itrack, const int N) {
-
-  /*int itrack = threadIdx.x + blockDim.x*blockIdx.x;*/
-  int bestHit_reg = -1;
-  float minChi2_reg = Config::chi2Cut;
-
-  if (itrack < N)
-    HitsIdx[itrack] = 0;
-
-  for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt)
-  {
-    HitToMs_fn(msErr, msPar, hits, XHitSize, XHitArr, HitsIdx, hit_cnt, itrack, N);
-#if 0
-      // TODO: add CMSGeom
-      if (Config::useCMSGeom) {
-        //propagateHelixToRMPlex(psErr,  psPar, inChg,  msPar, propErr, propPar);
-        throw std::runtime_error("useCMSGeom not implemented yet for GPU");
-      } else {}
-#endif
-    computeChi2_fn(propErr, msErr, msPar, propPar, outChi2, itrack, N);
-    getNewBestHitChi2_fn(XHitSize, XHitArr, outChi2.ptr, minChi2_reg, bestHit_reg, hit_cnt, N);
-  }
-  updateTracksWithBestHit_fn
-      (hits, 
-       minChi2_reg, bestHit_reg,
-       msErr, msPar, propPar,
-       Chi2, HitsIdx,
-       N);
-}
-
-
-__global__ void bestHit_kernel(
-    Hit *hits, const GPlexQI XHitSize, const GPlexHitIdx XHitArr, 
-    const GPlexLS propErr, GPlexHS msErr, GPlexHV msPar,
-    const GPlexLV propPar, GPlexQF outChi2,
-    GPlexQF Chi2, GPlexQI HitsIdx,
-    const int maxSize, const int N) {
-  int itrack = threadIdx.x + blockDim.x*blockIdx.x;
-  bestHit_fn(hits, XHitSize, XHitArr, 
-    propErr, msErr, msPar,
-    propPar, outChi2,
-    Chi2, HitsIdx,
-    maxSize, itrack, N);
-}
-
-
-void bestHit_wrapper(const cudaStream_t &stream,
-    LayerOfHitsCU &layer, const GPlexQI &XHitSize,  const GPlexHitIdx &XHitArr,
-    const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar,
-    const GPlexLV &propPar, GPlexQF &outChi2,
-    GPlexQF &Chi2, GPlexQI &HitsIdx,
-    const int maxSize, const int N) {
-  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
-                       max_blocks_x);
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-
-  bestHit_kernel <<< grid, block, 0, stream >>>
-    (layer.m_hits.data(), XHitSize, XHitArr,
-     propErr, msErr, msPar, propPar, outChi2,
-     Chi2, HitsIdx,
-     maxSize, N);
-}
-
-
-template <int BLOCK_THREADS>
-__global__ void findBestHit_kernel(LayerOfHitsCU *layers,
-                                   EtaBinOfCandidatesCU *etabin_of_cands,
-                                   GPlexQI XHitSize, GPlexHitIdx XHitArr,
-                                   GPlexLS Err_iP, GPlexLV Par_iP,
-                                   GPlexHS *msErr_arr, GPlexHV *msPar_arr,
-                                   GPlexLS Err_iC, GPlexLV Par_iC,
-                                   GPlexQF outChi2,
-                                   GPlexQF Chi2, GPlexQI *HitsIdx_arr,
-                                   GPlexQI inChg, GPlexQI Label, GeometryCU geom, 
-                                   int *maxSize, int gplex_size,
-                                   const int nlayers_per_seed) {
-  for (int ebin = 0; ebin != Config::nEtaBin; ++ebin) {
-    for (int beg = 0; beg < etabin_of_cands[ebin].m_fill_index; beg += gplex_size) {
-      int end = min(beg + gplex_size, etabin_of_cands[ebin].m_fill_index);
-      int N = end - beg; 
-
-      int tidx = threadIdx.x + blockDim.x*blockIdx.x;
-      int itrack = beg + tidx;
-
-      if (itrack < end) {
-
-        InputTracksCU_fn(etabin_of_cands[ebin].m_candidates, Err_iP, Par_iP,
-            inChg, Chi2, Label, HitsIdx_arr, beg, end, tidx, N);
-
-        for (int ilay = nlayers_per_seed; ilay < Config::nLayers; ++ilay)
-        {
-          int hit_idx = ilay;
-          GPlexHS &msErr = msErr_arr[hit_idx];
-          GPlexHV &msPar = msPar_arr[hit_idx];
-          GPlexQI &HitsIdx = HitsIdx_arr[hit_idx];
-
-          float *radii = geom.radii;
-
-          LayerOfHitsCU &layer = layers[ilay];
-
-          int maxSize_block;
-          selectHitIndices_fn(layer, Err_iP, Par_iP, XHitSize, XHitArr, tidx, N);
-          reduceMax_fn<int, BLOCK_THREADS, 1, cub::BLOCK_REDUCE_WARP_REDUCTIONS>
-            (XHitSize.ptr, XHitSize.N, &maxSize_block);
-          bestHit_fn(layer.m_hits.data(), XHitSize, XHitArr,
-                     Err_iP, msErr, msPar, Par_iP, outChi2,
-                     Chi2, HitsIdx, maxSize_block, tidx, N);
-          kalmanUpdate_fn( Err_iP, msErr, Par_iP, msPar, Par_iC, Err_iC, tidx, N);
-          if (ilay+1 < Config::nLayers) {
-            float radius = radii[ilay+1];
-            propagationForBuilding_fn(Err_iC, Par_iC, inChg, radius, Err_iP, Par_iP, tidx, N);
-          }
-        }
-        OutputTracksCU_fn(etabin_of_cands[ebin].m_candidates, 
-            Err_iP, Par_iP, inChg, Chi2, Label, HitsIdx_arr, beg, end, tidx, N);
-      }
-    }
-  }
-}
-
-
-void findBestHit_wrapper(cudaStream_t &stream,
-    LayerOfHitsCU *layers,
-    EventOfCandidatesCU &event_of_cands_cu,
-    GPlexQI &XHitSize, GPlexHitIdx &XHitArr,
-    GPlexLS &Err_iP, GPlexLV &Par_iP,
-    GPlexHS *msErr, GPlexHV *msPar,
-    GPlexLS &Err_iC, GPlexLV &Par_iC,
-    GPlexQF &outChi2,
-    GPlexQF &Chi2, GPlexQI *HitsIdx,
-    GPlexQI &inChg, GPlexQI &Label,
-    GeometryCU &geom, 
-    int *maxSize, int N) {
-  int gridx = (N-1)/BLOCK_SIZE_X + 1;
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-
-  if (gridx > max_blocks_x) {
-    throw std::runtime_error("The matriplex size should be chosen such "
-                             "that gplex_size*BLOCK_SIZE_X <= max_blocks_x");
-  }
-  // The loop over tracks is taking care of the case where there is more tracks
-  // than available threads.
-  // We should actually not throw an exception, GPlex should be allocated with
-  // a smaller size in MkFitter.
-
-  findBestHit_kernel<BLOCK_SIZE_X> <<< grid, block, 0, stream >>>
-    (layers, event_of_cands_cu.m_etabins_of_candidates,
-     XHitSize, XHitArr, Err_iP, Par_iP, msErr, msPar,
-     Err_iC, Par_iC, outChi2, Chi2, HitsIdx, inChg, Label, geom, maxSize, N,
-     Config::nlayers_per_seed);
-}
diff --git a/mkFit/best_hit_kernels.h b/mkFit/best_hit_kernels.h
deleted file mode 100644
index 2a6fb16d..00000000
--- a/mkFit/best_hit_kernels.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef BEST_HIT_KERNELS_H
-#define BEST_HIT_KERNELS_H 
-
-
-#include "GPlex.h"
-#include "HitStructuresCU.h"
-#include "GeometryCU.h"
-
-namespace mkfit {
-
-void getNewBestHitChi2_wrapper(const cudaStream_t &stream,
-    const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr,
-    const GPlexQF &outChi2, float *minChi2, int *bestHit,
-    const int hit_cnt, const int N);
-
-
-void fill_array_cu(float *array, const int size, const float value);
-
-
-void updateTracksWithBestHit_wrapper(const cudaStream_t &stream,
-    LayerOfHitsCU &, const float *minChi2, const int *best_hit, 
-    GPlexHS &msErr, GPlexHV &msPar, const GPlexLV &propPar,
-    GPlexQF &Chi2, GPlexQI& HitsIdx, const int N);
-
-int getMaxNumHits_wrapper(const GPlexQI d_XHitSize, const int N);
-
-
-void bestHit_wrapper(const cudaStream_t &stream,
-    LayerOfHitsCU &layer, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr,
-    const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar,
-    const GPlexLV &propPar, GPlexQF &outChi2,
-    GPlexQF &Chi2, GPlexQI& HitsIdx,
-    const int maxSize, const int N);
-
-
-void findBestHit_wrapper(cudaStream_t &stream,
-                         LayerOfHitsCU *layers, 
-                         EventOfCandidatesCU &event_of_cands_cu,
-                         GPlexQI &XHitSize, GPlexHitIdx &XHitArr,
-                         GPlexLS &Err_iP, GPlexLV &Par_iP,
-                         GPlexHS *msErr, GPlexHV *msPar,
-                         GPlexLS &Err_iC, GPlexLV &Par_iC,
-                         GPlexQF &outChi2,
-                         GPlexQF &Chi2, GPlexQI *HitsIdx,
-                         GPlexQI &inChg, GPlexQI &Label,
-                         GeometryCU &geom, 
-                         int *maxSize, int N);
-
-
-} // end namespace mkfit
-#endif  /* ifndef BEST_HIT_KERNELS_H */
diff --git a/mkFit/build_tracks_kernels.cu b/mkFit/build_tracks_kernels.cu
deleted file mode 100644
index e0ef847b..00000000
--- a/mkFit/build_tracks_kernels.cu
+++ /dev/null
@@ -1,203 +0,0 @@
-#include "build_tracks_kernels.h"
-
-#include "GPlex.h"
-#include "reorganize_gplex.h"
-#include "kalmanUpdater_kernels.h"
-#include "computeChi2_kernels.h"
-
-constexpr int BLOCK_SIZE_X = 16;
-
-
-__device__ void getHitFromLayer_fn(LayerOfHitsCU& layer_of_hits, 
-    GPlexQI& HitsIdx, GPlexHV& msPar, GPlexHS& msErr, int itrack_plex, int N)
-{
-  if (itrack_plex < N)
-  {
-    int hit_idx = HitsIdx[itrack_plex];
-    if (hit_idx >= 0)
-    {
-      Hit &hit = layer_of_hits.m_hits[hit_idx];
-
-      GetHitErr(msErr, (char *)hit.errArrayCU(), 0, N);
-      GetHitPar(msPar, (char *)hit.posArrayCU(), 0, N);
-    }
-  }
-}
-
-__global__ void getHitFromLayer_kernel(LayerOfHitsCU& layer_of_hits, 
-    GPlexQI HitsIdx, GPlexHV msPar, GPlexHS msErr, int N)
-{
-  int itrack_plex = threadIdx.x + blockDim.x * blockIdx.x;
-  getHitFromLayer_fn(layer_of_hits, HitsIdx, msPar, msErr, itrack_plex, N);
-}
-
-void getHitFromLayer_wrappper( const cudaStream_t& stream,
-    LayerOfHitsCU& layer_cu, GPlexQI& HitsIdx, 
-    GPlexHV& msPar, GPlexHS& msErr, int N)
-{
-  int gridx = (N-1)/BLOCK_SIZE_X + 1;
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-
-  getHitFromLayer_kernel <<< grid, block, 0, stream >>>
-    (layer_cu, HitsIdx, msPar, msErr, N);
-}
-
-
-__device__ void updateMissingHits_fn(GPlexQI& HitsIdx, 
-    GPlexLV& Par_iP, GPlexLS& Err_iP,
-    GPlexLV& Par_iC, GPlexLS& Err_iC, int i, int N)
-{
-  if (i < N)
-  {
-    if (HitsIdx[i] < 0)
-    {
-      for (auto j = 0; j < Err_iC.kSize; ++j) {
-        Err_iC[i + j * Err_iC.stride] = Err_iP[i + j * Err_iP.stride];
-      }
-      for (auto j = 0; j < Par_iC.kSize; ++j) {
-        Par_iC[i + j * Par_iC.stride] = Par_iP[i + j * Par_iP.stride];
-      }
-    }
-  }
-}
-
-
-__global__ void updateMissingHits_kernel(GPlexQI HitsIdx, 
-    GPlexLV Par_iP, GPlexLS Err_iP,
-    GPlexLV Par_iC, GPlexLS Err_iC, int N)
-{
-  int i = threadIdx.x + blockDim.x * blockIdx.x;
-  updateMissingHits_fn(HitsIdx, Par_iP, Err_iP, Par_iC, Err_iC, i, N);
-}
-
-
-void UpdateMissingHits_wrapper(
-    const cudaStream_t& stream, GPlexQI& HitsIdx, 
-    GPlexLV& Par_iP, GPlexLS& Err_iP,
-    GPlexLV& Par_iC, GPlexLS& Err_iC,
-    int N)
-{
-  int gridx = (N-1)/BLOCK_SIZE_X + 1;
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-
-  updateMissingHits_kernel <<< grid, block, 0, stream >>>
-    (HitsIdx, Par_iP, Err_iP, Par_iC, Err_iC, N);
-}
-
-
-__device__
-void UpdateWithLastHit_fn(
-    LayerOfHitsCU& layer_of_hits, GPlexQI& HitsIdx, 
-    GPlexHV& msPar,  GPlexHS& msErr,
-    GPlexLV& Par_iP, GPlexLS& Err_iP,
-    GPlexLV& Par_iC, GPlexLS& Err_iC,
-    int itrack_plex, int N)
-{
-  getHitFromLayer_fn(layer_of_hits, HitsIdx, msPar, msErr, itrack_plex, N);
-  kalmanUpdate_fn(Err_iP, msErr, Par_iP, msPar, Par_iC, Err_iC, itrack_plex, N);
-  updateMissingHits_fn(HitsIdx, Par_iP, Err_iP, Par_iC, Err_iC, itrack_plex, N);
-}
-
-
-__global__
-void UpdateWithLastHit_kernel(
-    LayerOfHitsCU& layer_of_hits, GPlexQI HitsIdx, 
-    GPlexHV msPar, GPlexHS msErr,
-    GPlexLV Par_iP, GPlexLS Err_iP,
-    GPlexLV Par_iC, GPlexLS Err_iC,
-    int N)
-{
-  int i = threadIdx.x + blockDim.x * blockIdx.x;
-  UpdateWithLastHit_fn(layer_of_hits, HitsIdx, msPar, msErr, 
-                       Par_iP, Err_iP, Par_iC, Err_iC, i, N);
-}
-
-
-void UpdateWithLastHit_wrapper(
-    const cudaStream_t& stream,
-    LayerOfHitsCU& layer_cu, GPlexQI& HitsIdx, 
-    GPlexHV& msPar, GPlexHS& msErr,
-    GPlexLV& Par_iP, GPlexLS& Err_iP,
-    GPlexLV& Par_iC, GPlexLS& Err_iC,
-    int N)
-{
-  int gridx = (N-1)/BLOCK_SIZE_X + 1;
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-
-  UpdateWithLastHit_kernel <<< grid, block, 0, stream >>>
-    (layer_cu, HitsIdx, msPar, msErr,
-     Par_iP, Err_iP, Par_iC, Err_iC, N);
-}
-
-
-#if 0
-__device__ void findCandidates_fn(
-    Hit *hits, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, 
-    const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar,
-    const GPlexLV &propPar, GPlexQF &outChi2,
-    GPlexQF &Chi2, GPlexQI &HitsIdx,
-    const int maxSize, const int itrack, const int N) {
-
-  // FIXME: We probably don't want to find candidates for
-  //        non-existing tracks. 
-  //        However, Valid will need to be updated at some point.
-  //    This is just a note to remember to do it.
-  //if (!Valid[itrack]) return;
-
-  if (itrack < N)
-    HitsIdx[itrack] = 0;
-
-  for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt)
-  {
-    HitToMs_fn(msErr, msPar, hits, XHitSize, XHitArr, HitsIdx, hit_cnt, itrack, N);
-    computeChi2_fn(propErr, msErr, msPar, propPar, outChi2, itrack, N);
-    // get the maxCandsPerSeed best cands for this TRACK
-    //getNewBestHitChi2_fn(XHitSize, XHitArr, outChi2.ptr, minChi2_reg, bestHit_reg, hit_cnt, N);
-  }
-  // TODO
-  // get the maxCandsPerSeed best cands;
-  // Eventually add to the list of cands for this SEED
-  // ...
-}
-
-
-__global__ void findCandidates_kernel(
-    Hit *hits, const GPlexQI XHitSize, const GPlexHitIdx XHitArr, 
-    const GPlexLS propErr, GPlexHS msErr, GPlexHV msPar,
-    const GPlexLV propPar, GPlexQF outChi2,
-    GPlexQF Chi2, GPlexQI HitsIdx,
-    const int maxSize, const int N) {
-  int itrack = threadIdx.x + blockDim.x*blockIdx.x;
-  findCandidates_fn(hits, XHitSize, XHitArr, 
-    propErr, msErr, msPar,
-    propPar, outChi2,
-    Chi2, HitsIdx,
-    maxSize, itrack, N);
-}
-
-
-void findCandidates_wrapper(const cudaStream_t &stream,
-    LayerOfHitsCU &layer, const GPlexQI &XHitSize,  const GPlexHitIdx &XHitArr,
-    const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar,
-    const GPlexLV &propPar, GPlexQF &outChi2,
-    GPlexQF &Chi2, GPlexQI &HitsIdx,
-    const int maxSize, const int N) {
-  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
-                       max_blocks_x);
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-
-  findCandidates_kernel <<< grid, block, 0, stream >>>
-    (layer.m_hits, XHitSize, XHitArr,
-     propErr, msErr, msPar, propPar, outChi2,
-     /*propErr.ptr, propErr.stride,*/
-     /*msErr.ptr, msErr.stride, msErr.kSize,*/
-     /*msPar.ptr, msPar.stride, msPar.kSize,*/
-     /*outChi2.ptr, outChi2.stride,*/
-     Chi2, HitsIdx,
-     maxSize, N);
-}
-#endif
diff --git a/mkFit/build_tracks_kernels.h b/mkFit/build_tracks_kernels.h
deleted file mode 100644
index ae45a1a4..00000000
--- a/mkFit/build_tracks_kernels.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef _BUILD_TRACKS_KERNELS_
-#define _BUILD_TRACKS_KERNELS_
-
-#include "GPlex.h"
-#include "HitStructuresCU.h"
-
-namespace mkfit {
-
-void getHitFromLayer_wrappper( const cudaStream_t& stream,
-    LayerOfHitsCU& layer_cu, GPlexQI& HitsIdx, 
-    GPlexHV& msPar, GPlexHS& msErr, int N);
-void UpdateMissingHits_wrapper(
-    const cudaStream_t& stream, GPlexQI& HitsIdx, 
-    GPlexLV& Par_iP, GPlexLS& Err_iP,
-    GPlexLV& Par_iC, GPlexLS& Err_iC,
-    int N);
-
-__device__
-void UpdateWithLastHit_fn(
-    LayerOfHitsCU& layer_of_hits, GPlexQI& HitsIdx, 
-    GPlexHV& msPar,  GPlexHS& msErr,
-    GPlexLV& Par_iP, GPlexLS& Err_iP,
-    GPlexLV& Par_iC, GPlexLS& Err_iC,
-    int itrack_plex, int N);
-
-void UpdateWithLastHit_wrapper(
-    const cudaStream_t& stream,
-    LayerOfHitsCU& layer_cu, GPlexQI& HitsIdx, 
-    GPlexHV& msPar, GPlexHS& msErr,
-    GPlexLV& Par_iP, GPlexLS& Err_iP,
-    GPlexLV& Par_iC, GPlexLS& Err_iC,
-    int N);
-
-} // end namespace mkfit
-#endif  /* ifndef _BUILD_TRACKS_KERNELS_ */ 
diff --git a/mkFit/buildtestMPlex.cc b/mkFit/buildtestMPlex.cc
index c01c8b93..21cd1818 100644
--- a/mkFit/buildtestMPlex.cc
+++ b/mkFit/buildtestMPlex.cc
@@ -7,12 +7,6 @@
 
 #include "MkBuilder.h"
 
-#ifdef USE_CUDA
-#include "FitterCU.h"
-#include "BuilderCU.h"
-#include "check_gpu_hit_structures.h"
-#endif
-
 #if defined(USE_VTUNE_PAUSE)
 #include "ittnotify.h"
 #endif
@@ -159,22 +153,9 @@ double runBuildingTestPlexBestHit(Event& ev, MkBuilder& builder)
   __itt_resume();
 #endif
 
-#if USE_CUDA
-  //check_event_of_hits_gpu(builder.get_event_of_hits());
-  //check_event_of_cands_gpu(event_of_cands);
-  BuilderCU builder_cu;
-  builder_cu.setUpBH(builder.get_event_of_hits(), builder.get_event(),
-                     event_of_cands);
-#endif
-
   double time = dtime();
 
-#if USE_CUDA
-  builder_cu.FindTracksBestHit(event_of_cands);
-  builder_cu.tearDownBH();
-#else
   builder.FindTracksBestHit();
-#endif
 
   time = dtime() - time;
 
@@ -212,42 +193,6 @@ double runBuildingTestPlexBestHit(Event& ev, MkBuilder& builder)
   return time;
 }
 
-#if USE_CUDA
-double runBuildingTestPlexBestHitGPU(Event& ev, MkBuilder& builder,
-                                     BuilderCU& builder_cu)
-{
-  builder.begin_event(&ev, 0, __func__);
-
-  if   (Config::seedInput == findSeeds) {builder.find_seeds();}
-  else                                  {builder.map_seed_hits();} // all other simulated seeds need to have hit indices line up in LOH for seed fit
-
-  builder.fit_seeds();
-
-  EventOfCandidates event_of_cands;
-  builder.find_tracks_load_seeds(event_of_cands);
-  // Allocate event specific arrays
-  builder_cu.setUpBH(builder.get_event_of_hits(), builder.get_event(),
-                     event_of_cands);
-
-  double time = dtime();
-
-  builder_cu.FindTracksBestHit(event_of_cands);
-  // Deallocate event specific arrays 
-  builder_cu.tearDownBH();
-
-  time = dtime() - time;
-  if   (Config::quality_val) {
-    if (!Config::silent) builder.quality_val_BH(event_of_cands);
-  } else {
-    builder.sim_val_BH(event_of_cands);
-  }
-
-  builder.end_event();
-  
-  return time;
-}
-#endif
-
 
 //==============================================================================
 // runBuildTestPlex Combinatorial: Standard TBB
@@ -441,119 +386,4 @@ double runBuildingTestPlexFV(Event& ev, MkBuilder& builder)
   return time;
 }
 
-#if USE_CUDA
-double runBuildingTestPlexCloneEngineGPU(Event& ev, EventTmp& ev_tmp, 
-                                         MkBuilder& builder,
-                                         BuilderCU& builder_cu,
-                                         bool seed_based)
-{
-  EventOfCombCandidates &event_of_comb_cands = ev_tmp.m_event_of_comb_cands;
-  event_of_comb_cands.Reset();
-
-  builder.begin_event(&ev, &ev_tmp, __func__);
-
-  if   (Config::seedInput == findSeeds) {builder.find_seeds();}
-  else                                  {builder.map_seed_hits();} // all other simulated seeds need to have hit indices line up in LOH for seed fit
-
-  builder.fit_seeds();
-
-  builder.find_tracks_load_seeds();
-
-#ifdef USE_VTUNE_PAUSE
-  __itt_resume();
-#endif
-
-  //builder_cu.setUpFitterCE(-1 [> does not matter for now <]);
-  builder_cu.allocateCE(builder.get_event_of_hits(), builder.get_event(),
-                      event_of_comb_cands);
-  builder_cu.setUpCE(builder.get_event_of_hits(), builder.get_event(),
-                     event_of_comb_cands);
-
-  double time = dtime();
-
-  //builder.FindTracksCloneEngine();
-  builder_cu.FindTracksCloneEngine(event_of_comb_cands, seed_based);
-
-  time = dtime() - time;
-
-  builder_cu.tearDownCE();
-#ifdef USE_VTUNE_PAUSE
-  __itt_pause();
-#endif
-
-  if (Config::quality_val) {
-    if (!Config::silent) builder.quality_val_COMB();
-  } else {builder.sim_val_COMB();}
-
-  builder.end_event();
-
-  return time;
-}
-#endif
-
-//==============================================================================
-// runAllBuildTestPlexBestHitGPU
-//==============================================================================
-
-#if USE_CUDA
-double runAllBuildingTestPlexBestHitGPU(std::vector<Event> &events)
-{
-
-  int num_builders = events.size();
-  std::vector<std::unique_ptr<MkBuilder>> builder_ptrs(num_builders);
-  std::vector<EventOfCandidates> event_of_cands_vec(num_builders);
-  std::vector<BuilderCU> builder_cu_vec(num_builders);
-
-  for (int i = 0; i < builder_ptrs.size(); ++i) {
-    Event &ev = events[i];
-    builder_ptrs[i] = std::unique_ptr<MkBuilder> (MkBuilder::make_builder());
-
-    MkBuilder &builder = * builder_ptrs[i].get();
-
-    builder.begin_event(&ev, 0, __func__);
-
-    if   (Config::seedInput == findSeeds) {builder.find_seeds();}
-    else                                  {builder.map_seed_hits();} // all other simulated seeds need to have hit indices line up in LOH for seed fit
-
-    builder.fit_seeds();
-
-    EventOfCandidates &event_of_cands = event_of_cands_vec[i];
-    builder.find_tracks_load_seeds(event_of_cands);
-
-    BuilderCU &builder_cu = builder_cu_vec[i];
-    builder_cu.setUpBH(builder.get_event_of_hits(), builder.get_event(),
-                       event_of_cands);
-  }
-
-  //omp_set_num_threads(Config::numThreadsEvents);
-  //std::cerr << "num threads "<< omp_get_num_threads() << std::endl;
-//#pragma omp parallel for reduction(+:total_time)
-  //for (int i = 0; i < builder_ptrs.size(); ++i) {
-  double time = dtime();
-  tbb::parallel_for(size_t(0), builder_ptrs.size(), [&](size_t i) {
-    EventOfCandidates &event_of_cands = event_of_cands_vec[i];
-    BuilderCU &builder_cu = builder_cu_vec[i];
-    MkBuilder &builder = * builder_ptrs[i].get();
-
-    builder_cu.FindTracksBestHit(event_of_cands);
-  });
-  time = dtime() - time;
-
-  for (int i = 0; i < builder_ptrs.size(); ++i) {
-    EventOfCandidates &event_of_cands = event_of_cands_vec[i];
-    BuilderCU &builder_cu = builder_cu_vec[i];
-    builder_cu.tearDownBH();
-    MkBuilder &builder = * builder_ptrs[i].get();
-    if   (!Config::sim_val && !Config::cmssw_val) {
-      if (!Config::silent) builder.quality_val();
-    } else if (Config::sim_val) {
-      builder.root_val();
-    }
-
-    builder.end_event();
-  }
-  
-  return time;
-}
-#endif
 } // end namespace mkfit
diff --git a/mkFit/buildtestMPlex.h b/mkFit/buildtestMPlex.h
index a0764f12..f655e31b 100644
--- a/mkFit/buildtestMPlex.h
+++ b/mkFit/buildtestMPlex.h
@@ -3,9 +3,6 @@
 
 #include "Event.h"
 #include "Track.h"
-#ifdef USE_CUDA
-#include "BuilderCU.h"
-#endif
 
 namespace mkfit {
 
@@ -17,13 +14,5 @@ double runBuildingTestPlexStandard(Event& ev, MkBuilder& builder);
 double runBuildingTestPlexCloneEngine(Event& ev, MkBuilder& builder);
 double runBuildingTestPlexFV(Event& ev, MkBuilder& builder);
 
-#if USE_CUDA
-double runBuildingTestPlexBestHitGPU(Event& ev, MkBuilder& builder,
-                                     BuilderCU& builder_cu);
-double runBuildingTestPlexCloneEngineGPU(Event& ev, EventTmp& ev_tmp, MkBuilder& builder,
-                                         BuilderCU& builder_cu, bool seed_based=false);
-double runAllBuildingTestPlexBestHitGPU(std::vector<Event> &events);
-#endif
-
 } // end namespace mkfit
 #endif
diff --git a/mkFit/check_gpu_hit_structures.cu b/mkFit/check_gpu_hit_structures.cu
deleted file mode 100644
index b5a3234a..00000000
--- a/mkFit/check_gpu_hit_structures.cu
+++ /dev/null
@@ -1,144 +0,0 @@
-#include "check_gpu_hit_structures.h"
-
-#include "Hit.h"
-#include "HitStructures.h"
-#include "HitStructuresCU.h"
-#include "reorganize_gplex.h"
-#include "gpu_utils.h"
-
-#include <iostream>
-
-
-__global__ void get_hit_pos_and_err(LayerOfHitsCU *layers,
-    int ilay, int hit_idx, float *pos, float *err, int pos_size, int err_size) {
-  if (threadIdx.x + blockDim.x * blockIdx.x == 0) {
-    LayerOfHitsCU &layer = layers[ilay];
-    Hit &hit = layer.m_hits[hit_idx];
-    float *posArray = get_posArray(hit);
-    float *errArray = get_errArray(hit);
-    for (int i = 0; i < pos_size; ++i) {
-      pos[i] = posArray[i];
-    }
-    for (int i = 0; i < err_size; ++i) {
-      err[i] = errArray[i];
-    }
-  }
-}
-
-
-void compare_carrays(const float *h_a, const float *d_a, 
-                     const float prec, const int n) 
-{
-  for (int i = 0; i < n; ++i) {
-    // should be relative comparison, verify if div by 0 will happen
-    if (std::abs(h_a[i] - d_a[i]) > prec) {
-      std::cerr << i << " : " << h_a[i] << " / " << d_a[i] << std::endl;
-    }
-  }
-}
-
-
-void check_event_of_hits_gpu(const EventOfHits& event_of_hits)
-{
-  EventOfHitsCU event_of_hits_cu;
-  event_of_hits_cu.reserve_layers(event_of_hits);
-  event_of_hits_cu.copyFromCPU(event_of_hits, 0);
-
-  constexpr int pos_size = 3;
-  constexpr int err_size = 6;
-
-  float *d_pos, *d_err;
-  float pos[pos_size], err[err_size];
-
-  cudaMalloc((void**)&d_pos, pos_size*sizeof(float));
-  cudaMalloc((void**)&d_err, err_size*sizeof(float));
-
-  dim3 grid(1, 1, 1);
-  dim3 block(1, 1, 1);
-
-  int ilay = 2;
-  int hit_idx = 3;
-
-  get_hit_pos_and_err <<< grid, block >>>
-    (event_of_hits_cu.m_layers_of_hits.data(), ilay, hit_idx, d_pos, d_err, pos_size, err_size);
-
-  cudaMemcpy(pos, d_pos, pos_size*sizeof(float), cudaMemcpyDeviceToHost);
-  cudaMemcpy(err, d_err, err_size*sizeof(float), cudaMemcpyDeviceToHost);
-
-  //std::cerr << "pos ......................\n";
-  compare_carrays(event_of_hits.m_layers_of_hits[ilay].m_hits[hit_idx].posArray(),
-                  pos, 1e-3, pos_size);
-  //std::cerr << "err ......................\n";
-  compare_carrays(event_of_hits.m_layers_of_hits[ilay].m_hits[hit_idx].errArray(),
-                  err, 1e-3, err_size);
-
-  cudaFree(d_pos);
-  cudaFree(d_err);
-}
-
-
-__global__ void get_cand_pos_and_err(EtaBinOfCandidatesCU *etabin_of_cands,
-    const int ebin, const int itrack, float *pos, float *err,
-    const int pos_size, const int err_size)
-{
-  if (threadIdx.x + blockDim.x * blockIdx.x == 0) {
-    Track &track = etabin_of_cands[ebin].m_candidates[itrack];
-    float *posArray = get_posArray(track);
-    float *errArray = get_errArray(track);
-
-    for (int i = 0; i < pos_size; ++i) {
-      pos[i] = posArray[i];
-    }
-    for (int i = 0; i < err_size; ++i) {
-      err[i] = errArray[i];
-    }
-  }
-}
-
-
-void check_event_of_cands_gpu(const EventOfCandidates& event_of_cands)
-{
-  EventOfCandidatesCU event_of_cands_cu;
-  event_of_cands_cu.allocGPU(event_of_cands);
-  event_of_cands_cu.copyFromCPU(event_of_cands);
-
-  constexpr int pos_size = 6;
-  constexpr int err_size = 21;
-
-  float *d_pos, *d_err;
-  float pos[pos_size], err[err_size];
-
-  cudaMalloc((void**)&d_pos, pos_size*sizeof(float));
-  cudaMalloc((void**)&d_err, err_size*sizeof(float));
-
-  dim3 grid(1, 1, 1);
-  dim3 block(1, 1, 1);
-
-  int etabin = std::min(2, Config::nEtaBin-1);
-  int itrack = 3;
-
-  get_cand_pos_and_err <<< grid, block >>>
-      (event_of_cands_cu.m_etabins_of_candidates,
-       etabin, itrack, d_pos, d_err, pos_size, err_size);
-  cudaCheckErrorSync();
-
-  cudaMemcpy(pos, d_pos, pos_size*sizeof(float), cudaMemcpyDeviceToHost);
-  cudaCheckErrorSync();
-  cudaMemcpy(err, d_err, err_size*sizeof(float), cudaMemcpyDeviceToHost);
-  cudaCheckErrorSync();
-  
-  /*std::cerr << "pos ......................\n";*/
-  compare_carrays(event_of_cands.m_etabins_of_candidates[etabin].m_candidates[itrack].posArray(),
-                  pos, 1e-3, pos_size);
-  /*std::cerr << "err ......................\n";*/
-  compare_carrays(event_of_cands.m_etabins_of_candidates[etabin].m_candidates[itrack].errArray(),
-                  err, 1e-3, err_size);
-
-  cudaFree(d_pos);
-  cudaFree(d_err);
-
-  //event_of_cands_cu.copyToCPU(event_of_cands);
-  event_of_cands_cu.deallocGPU();
-
-  cudaCheckErrorSync();
-}
diff --git a/mkFit/check_gpu_hit_structures.h b/mkFit/check_gpu_hit_structures.h
deleted file mode 100644
index fe80c21f..00000000
--- a/mkFit/check_gpu_hit_structures.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef CHECK_GPU_HIT_STRUCTURE_H
-#define CHECK_GPU_HIT_STRUCTURE_H 
-
-#include "HitStructures.h"
-
-namespace mkfit {
-
-void check_event_of_hits_gpu(const EventOfHits& event_of_hits);
-void check_event_of_cands_gpu(const EventOfCandidates& event_of_cands);
-
-} // end namespace mkfit
-#endif /* ifndef CHECK_GPU_HIT_STRUCTURE_H */
diff --git a/mkFit/clone_engine_cuda_helpers.cu b/mkFit/clone_engine_cuda_helpers.cu
deleted file mode 100644
index af4fed8c..00000000
--- a/mkFit/clone_engine_cuda_helpers.cu
+++ /dev/null
@@ -1,46 +0,0 @@
-#include "clone_engine_cuda_helpers.h"
-#include "GPlex.h"
-
-#include "BestCands.h"
-
-
-__device__ int countValidHits_fn(const int itrack, const int end_hit, 
-                                 const GPlexQI* HitsIdx_arr)
-{
-  int result = {};
-  for (int hi = 0; hi < end_hit; ++hi)
-    {
-      if (HitsIdx_arr[hi](itrack, 0, 0) >= 0) result++;
-    }
-  return result;
-}
-
-
-__device__ int countInvalidHits_fn(const int itrack, const int end_hit, 
-                                   const GPlexQI* HitsIdx_arr)
-{
-  int result = {};
-  for (int hi = 0; hi < end_hit; ++hi)
-    {
-      if (HitsIdx_arr[hi](itrack, 0, 0) == -1) result++;
-    }
-  return result;
-}
-
-
-// Return (Valid Hits, Invalid Hits)
-__device__ int2 countHits_fn(const int itrack, const int end_hit, 
-                             const GPlexQI* HitsIdx_arr)
-{
-  auto result = int2 {};
-  for (int hi = 0; hi < end_hit; ++hi) {
-    auto idx = HitsIdx_arr[hi](itrack, 0, 0);
-    if (idx >=0) {
-      ++result.x;
-    } else if (idx == -1) {
-      ++result.y;
-    }
-
-  }
-  return result;
-}
diff --git a/mkFit/clone_engine_cuda_helpers.h b/mkFit/clone_engine_cuda_helpers.h
deleted file mode 100644
index f8d3dc49..00000000
--- a/mkFit/clone_engine_cuda_helpers.h
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef CLONE_ENGINE_CUDA_HELPERS_H
-#define CLONE_ENGINE_CUDA_HELPERS_H
-
-#include "GPlex.h"
-#include "BestCands.h"
-#include "reorganize_gplex.h"
-#include "computeChi2_kernels.h"
-
-namespace mkfit {
-
-__device__ int countValidHits_fn(const int itrack, const int end_hit,
-                                 const GPlexQI *HitsIdx_arr);
-
-__device__ int countInvalidHits_fn(const int itrack, const int end_hit, 
-                                   const GPlexQI *HitsIdx_arr);
-
-__device__ int2 countHits_fn(const int itrack, const int end_hit, 
-                            const GPlexQI *HitsIdx_arr);
-
-template <int MAX_CANDS_PER_SEED, int BLOCK_SIZE>
-__device__ void findTrackCands_fn(const int ilay,
-    Hit *hits, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, 
-    const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar,
-    const GPlexLV &propPar, GPlexQF &outChi2,
-    GPlexQF &Chi2, GPlexQI *HitsIdx_arr,
-    const int maxSize, const int itrack, const int icand, const int N,
-    CandsGPU::BestCands<MAX_CANDS_PER_SEED, BLOCK_SIZE>& cand_list)
-{
-  // Note: cand_list should initially be filled with sentinels
-  int itrack_block = threadIdx.x;  // for SM arrays
-  float chi2_track = Chi2[itrack];
-  int XHitSize_track = XHitSize[itrack];
-
-  GPlexQI &HitsIdx = HitsIdx_arr[ilay];
-
-  // Note: 0 < itrack < N = matriplex_width
-  HitsIdx[itrack] = 0;  // reset
-
-  for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt) {
-    HitToMs_fn(msErr, msPar, hits, XHitSize, XHitArr, 
-               HitsIdx, hit_cnt, itrack, N);
-    // TODO: add CMSGeom
-    computeChi2_fn(propErr, msErr, msPar, propPar, outChi2, itrack, N);
-
-    // make sure the hit was in the compatiblity window for the candidate
-    if (hit_cnt < XHitSize_track)
-    {
-      float chi2 = fabs(outChi2[itrack]); //fixme negative chi2 sometimes...
-      if (chi2 < Config::chi2Cut)
-      {
-        int cand_hitIdx = XHitArr(itrack, hit_cnt, 0);
-        int cand_nhits = countValidHits_fn(itrack, ilay, HitsIdx_arr) + 1; 
-        float cand_chi2 = chi2_track + chi2;
-        cand_list.update(itrack_block, icand, cand_hitIdx,
-                         cand_nhits, cand_chi2);
-      }
-    }
-  }
-  // now add invalid hit 
-  // if icand has enough 'good' hits, update will discard invalid hit.
-  auto num_good_bad_hits = countHits_fn(itrack, ilay, HitsIdx_arr);
-  int hit_idx = num_good_bad_hits.y < Config::maxHolesPerCand ? -1 : -2;
-  cand_list.update(itrack_block, icand, hit_idx, 
-                   num_good_bad_hits.x, 
-                   chi2_track);
-}
-
-} // end namespace mkfit
-#endif  // CLONE_ENGINE_CUDA_HELPERS_H
diff --git a/mkFit/clone_engine_kernels.cu b/mkFit/clone_engine_kernels.cu
deleted file mode 100644
index 60d814af..00000000
--- a/mkFit/clone_engine_kernels.cu
+++ /dev/null
@@ -1,258 +0,0 @@
-#include "clone_engine_kernels.h"
-#include "clone_engine_cuda_helpers.h"
-
-#include "BestCands.h"
-
-#include "computeChi2_kernels.h"
-#include "index_selection_kernels.h"
-#include "reorganize_gplex.h"
-#include "build_tracks_kernels.h"
-#include "array_algorithms_cu.h"
-#include "propagation_kernels.h"
-
-
-constexpr int BLOCK_SIZE_X = 256;
-
-
-__device__ void findCandidates_fn(const int ilay,
-    Hit *hits, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, 
-    const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar,
-    const GPlexLV &propPar, GPlexQF &outChi2,
-    GPlexQF &Chi2, GPlexQI *HitsIdx_arr,
-    GPlexQB &Valid,
-    const int maxSize, const int itrack, const int N,
-    CandsGPU::BestCands<Config::maxCandsPerSeed, BLOCK_SIZE_X>& cand_list)
-{
-  if (itrack >= N) return;
-  if (!Valid[itrack]) return;
-
-  const int icand = itrack % Config::maxCandsPerSeed;
-
-  findTrackCands_fn(ilay, hits, XHitSize, XHitArr, 
-    propErr, msErr, msPar, propPar, outChi2, Chi2, HitsIdx_arr,
-    maxSize, itrack, icand, N, cand_list);
-
-  // At this point, each track has the best candidates organized in a stack
-  //__syncthreads(); no need for that, sync hapeen in merge
-  // sorting garbbage is fine (i.e. threads with no valid tracks to start with)
-  // because of the default sentinel values.
-  int itrack_block = threadIdx.x;
-  int iseed_block = itrack_block / Config::maxCandsPerSeed;
-  int icand_block = itrack_block % Config::maxCandsPerSeed;
-  cand_list.merge_cands_for_seed(iseed_block, icand_block);
-  // At this point the best overall candidates for seed iseed are
-  // stored in the first list (icand == 0)
-}
-
-
-__device__ void updateCandsWithInfo(Track *candidates, int *ntracks_per_seed,
-                                    int my_trkIdx, int my_hitIdx, int my_nhits,
-                                    float my_chi2, 
-                                    int beg, int itrack)
-{
-  int itrack_ev = beg + itrack;
-  int iseed_ev = itrack_ev / Config::maxCandsPerSeed;
-  int icand_ev = itrack_ev % Config::maxCandsPerSeed; 
-
-  int ncands_seed = ntracks_per_seed[iseed_ev];
-
-  Track tmp_track;  // BAD: too big to fit in registers
-
-  if (0 <= my_trkIdx && my_trkIdx < ncands_seed) {
-    // Get candidate of previous step that match this trkIdx
-    Track& base_track = candidates[iseed_ev*Config::maxCandsPerSeed + my_trkIdx];
-
-    tmp_track = base_track;
-    tmp_track.addHitIdx(my_hitIdx, 0.f /*chi2*/);
-    tmp_track.setChi2(my_chi2);
-  }
-
-  // Before writing the new track to the EtaBinOfCombCandidatesCU
-  // makes sure everybody has a copy of it.
-  // CUDA 8; synchronize only cands for seed;
-  __syncthreads();
-
-  if (0 <= my_trkIdx && my_trkIdx < ncands_seed) {
-    Track& my_track = candidates[iseed_ev*Config::maxCandsPerSeed + icand_ev];
-    my_track = tmp_track;
-  }
-}
-
- 
-__global__ void findInLayers_kernel(LayerOfHitsCU *layers, 
-    EtaBinOfCombCandidatesCU* etabins_of_comb_candidates, 
-    GPlexQI XHitSize, GPlexHitIdx XHitArr,  
-    GPlexLS Err_iP, GPlexLV Par_iP, 
-    GPlexHS *msErr_arr, GPlexHV *msPar_arr,
-    GPlexLS Err_iC, GPlexLV Par_iC,
-    GPlexQF outChi2,
-    GPlexQF Chi2, GPlexQI *HitsIdx_arr,
-    GPlexQI inChg, GPlexQI Label, 
-    GPlexQI SeedIdx, GPlexQI CandIdx,
-    GPlexQB Valid, GeometryCU geom, 
-    int *maxSize, int gplex_size,
-    const int nlayers_per_seed)
-{
-  constexpr int max_cands_per_block = (BLOCK_SIZE_X / Config::maxCandsPerSeed) * Config::maxCandsPerSeed;
-
-  for (int ebin = 0; ebin < Config::nEtaBin; ebin++) {
-    EtaBinOfCombCandidatesCU& etabin_of_cands = etabins_of_comb_candidates[ebin];
-    int nseed = etabin_of_cands.m_nseed;
-    if (nseed == 0) continue;
-
-    int max_nseed_plex = gplex_size/Config::maxCandsPerSeed;
-
-    for (int beg_seed = 0; beg_seed < nseed; beg_seed += max_nseed_plex) {
-      int beg = beg_seed * Config::maxCandsPerSeed;
-      int end_seed = min(beg_seed + max_nseed_plex, nseed);
-      int end = end_seed * Config::maxCandsPerSeed;
-
-      int itrack_plex = threadIdx.x + blockIdx.x * max_cands_per_block;  // starts at 0. Fits a gplex 
-      int itrack /* itrack_ev */  = beg + itrack_plex;  // absolute track idx. If 'num_tracks' > gplex_size
-      int iseed_ev = itrack / Config::maxCandsPerSeed;
-      int icand_ev = itrack % Config::maxCandsPerSeed;
-      int N = end - beg;
-
-      int itrack_block = threadIdx.x;
-      int iseed_block = itrack_block / Config::maxCandsPerSeed;
-      int icand_block = itrack_block % Config::maxCandsPerSeed;
-
-      int maxSize_block;
-
-      __shared__ CandsGPU::BestCands<Config::maxCandsPerSeed, BLOCK_SIZE_X> cand_list;
-
-      // NOTE: maxCandsPerSeed should divide BLOCK_SIZE_X because
-      //       padding is handled differently in GPlex and CandList (in SM)
-      //       One easy way to deal with that is to allocate GPlexes a bit wider and
-      //       let the last threads of blocks being idle; and referencing arrays with:
-      //            tid = tidx + bidx * BLOCK_SIZE_X;
-      //       we still need itrack_plex for the guard.
-      //       We might has well not do it, and take advantages of these thread to
-      //       enlarge the number of candidates for each seed.
-      if (itrack_block < max_cands_per_block && itrack_plex < N) {
-          for (int ilay = nlayers_per_seed; ilay <= Config::nLayers; ++ilay) {
-            Track *tracks = etabin_of_cands.m_candidates.data();
-            int *tracks_per_seed = etabin_of_cands.m_ntracks_per_seed.data();
-
-            int Nhits = ilay;
-            InputTracksAndHitIdxComb_fn(tracks, tracks_per_seed, 
-                Err_iP, Par_iP,
-                inChg, Chi2, Label, HitsIdx_arr, 
-                SeedIdx, CandIdx, Valid, Nhits ,
-                beg, end, itrack_plex, N);
-
-            // no cands at this position for this layer
-            if (Valid[itrack_plex]) {
-              if (ilay > nlayers_per_seed) {
-                UpdateWithLastHit_fn(layers[ilay-1], HitsIdx_arr[ilay-1],
-                    msPar_arr[ilay-1], msErr_arr[ilay-1],
-                    Par_iP, Err_iP, Par_iC, Err_iC, itrack_plex, N);
-                if (ilay < Config::nLayers) {
-                  propagationForBuilding_fn(Err_iC, Par_iC, inChg, geom.radii[ilay], 
-                      Err_iP, Par_iP, itrack_plex, N);
-                  OutputParErrCU_fn(tracks, Err_iP, Par_iP,
-                      beg, end, itrack_plex, N);
-                } else {
-                  OutputParErrCU_fn(tracks, Err_iC, Par_iC,
-                      beg, end, itrack_plex, N);
-                  break;
-                }
-              }
-            } else {
-              if (ilay >= Config::nLayers) {  // for those invalid hits
-                break;
-              }
-            }
-            XHitSize[itrack_plex] = 0;  // reset here has well, because of invalid tracks
-            if (Valid[itrack_plex]) {
-              selectHitIndices_fn(layers[ilay], Err_iP, Par_iP, XHitSize, XHitArr, itrack_plex, N);
-            }
-            reduceMaxPartial_fn<int, BLOCK_SIZE_X, 1, cub::BLOCK_REDUCE_WARP_REDUCTIONS>
-              (XHitSize.ptr, 
-               max_cands_per_block * blockIdx.x,
-               max_cands_per_block,
-               &maxSize_block);
-
-            cand_list.reset(itrack_block);
-
-            findCandidates_fn(ilay,
-                layers[ilay].m_hits.data(), XHitSize, XHitArr,
-                Err_iP, msErr_arr[ilay], msPar_arr[ilay], Par_iP, 
-                outChi2, Chi2, HitsIdx_arr, Valid,
-                maxSize_block, itrack_plex, N, cand_list);
-            // cand_list: each seed has the best cands 
-            // It's similar to CandCloner::ProcessSeddRange (called from end_smthg)
-            // except than everything is already sorted.
-            // Here we cannot avoid bank conflicts, because we scatter (as in MPI_Scatter)
-            // a column (representing the list of overall best candidates) to different=
-            // thread. This is a consequence of the BestCand structure that
-            // was designed to minimized bank conflicts when performing the more 
-            // frequent stack insertion operation.
-
-            // Don't check if Valid as a previously inValid track might pick a correct value
-            int my_seed_track_idx = iseed_block * Config::maxCandsPerSeed;
-            int my_trkIdx, my_hitIdx, my_nhits;
-            float my_chi2;
-
-            cand_list.get_cand_info(my_seed_track_idx, icand_block,
-                my_trkIdx, my_hitIdx, my_nhits, my_chi2);
-            // Needs to count how many non sentinel there is:
-            if (!icand_ev) {
-              tracks_per_seed[iseed_ev] = cand_list.count_valid_cands(itrack_block);
-            }
-            __syncthreads();
-
-            // Now, every thread has the info of its own new best cand.
-            // -> Feed the etabin with that
-            updateCandsWithInfo(tracks, tracks_per_seed, 
-                my_trkIdx, my_hitIdx, my_nhits, my_chi2, 
-                beg, itrack_plex);
-          }  // ilay
-          // final sorting, 
-      }
-    }
-  }
-}
-
-__global__ void print_stuffs(Track* tracks, int*ntracks_per_seed) {
-  if (threadIdx.x + blockIdx.x * blockDim.x == 0) {
-    int idx = 0 + 1*Config::maxCandsPerSeed ;
-    int val = tracks[idx].getHitIdx(0);
-    printf("tracks[%d] = %d\n", idx, val);
-    val = tracks[idx].getHitIdx(1);
-    printf("tracks[%d] = %d\n", idx, val);
-    val = tracks[idx].getHitIdx(2);
-    printf("tracks[%d] = %d\n", idx, val);
-    val = ntracks_per_seed[idx]; 
-    printf("tracks[%d] = %d\n", idx, val);
-  }
-}
-
-void findInLayers_wrapper(cudaStream_t &stream,
-    LayerOfHitsCU *layers, 
-    EventOfCombCandidatesCU &event_of_cands_cu,
-    GPlexQI &XHitSize, GPlexHitIdx &XHitArr,
-    GPlexLS &Err_iP, GPlexLV &Par_iP,
-    GPlexHS *msErr, GPlexHV *msPar,
-    GPlexLS &Err_iC, GPlexLV &Par_iC,
-    GPlexQF &outChi2,
-    GPlexQF &Chi2, GPlexQI *HitsIdx_arr,
-    GPlexQI &inChg, GPlexQI &Label,
-    GPlexQI &SeedIdx, GPlexQI &CandIdx,
-    GPlexQB& Valid,
-    GeometryCU &geom, 
-    int *maxSize, int N)
-{
-  int gridx = (N-1)/BLOCK_SIZE_X + 1;
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-
-  findInLayers_kernel <<<grid, block, 0, stream>>>
-      (layers, 
-       event_of_cands_cu.m_etabins_of_comb_candidates.data(),
-       XHitSize, XHitArr, Err_iP, Par_iP, msErr, msPar,
-       Err_iC, Par_iC, outChi2, Chi2, HitsIdx_arr, inChg, Label, 
-       SeedIdx, CandIdx, Valid, geom, maxSize, N, Config::nlayers_per_seed);
-  
-  /*cudaCheckErrorSync();*/
-}
diff --git a/mkFit/clone_engine_kernels.h b/mkFit/clone_engine_kernels.h
deleted file mode 100644
index 22f0eead..00000000
--- a/mkFit/clone_engine_kernels.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef CLONE_ENGINE_KERNELS_H
-#define CLONE_ENGINE_KERNELS_H 
-
-#include "GPlex.h"
-#include "HitStructuresCU.h"
-#include "GeometryCU.h"
-
-namespace mkfit {
-
-void findInLayers_wrapper(cudaStream_t &stream,
-                          LayerOfHitsCU *layers, 
-                          EventOfCombCandidatesCU &event_of_cands_cu,
-                          GPlexQI &XHitSize, GPlexHitIdx &XHitArr,
-                          GPlexLS &Err_iP, GPlexLV &Par_iP,
-                          GPlexHS *msErr, GPlexHV *msPar,
-                          GPlexLS &Err_iC, GPlexLV &Par_iC,
-                          GPlexQF &outChi2,
-                          GPlexQF &Chi2, GPlexQI *HitsIdx,
-                          GPlexQI &inChg, GPlexQI &Label,
-                          GPlexQI &SeedIdx, GPlexQI &CandIdx,
-                          GPlexQB& Valid,
-                          GeometryCU &geom, 
-                          int *maxSize, int N);
-
-} // end namespace mkfit
-#endif  /* ifndef CLONE_ENGINE_KERNELS_H */
diff --git a/mkFit/clone_engine_kernels_seed.cu b/mkFit/clone_engine_kernels_seed.cu
deleted file mode 100644
index 5968c9df..00000000
--- a/mkFit/clone_engine_kernels_seed.cu
+++ /dev/null
@@ -1,182 +0,0 @@
-#include "clone_engine_kernels_seed.h"
-#include "clone_engine_cuda_helpers.h"
-
-#include "BestCands.h"
-
-#include "computeChi2_kernels.h"
-#include "index_selection_kernels.h"
-#include "reorganize_gplex.h"
-#include "build_tracks_kernels.h"
-#include "array_algorithms_cu.h"
-#include "propagation_kernels.h"
-
-#include <cstdio>
-
-constexpr int BLOCK_SIZE_X = 256;
-
-
-__device__ void findCandidates_fn_seed(const int ilay,
-    Hit *hits, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, 
-    const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar,
-    const GPlexLV &propPar, GPlexQF &outChi2,
-    GPlexQF &Chi2, GPlexQI *HitsIdx_arr,
-    GPlexQB &Valid,
-    const int maxSize, const int itrack, const int icand, const int N,
-    CandsGPU::BestCands<Config::maxCandsPerSeed, BLOCK_SIZE_X>& cand_list)
-{
-  if (itrack >= N) return;
-
-  findTrackCands_fn(ilay, hits, XHitSize, XHitArr, 
-    propErr, msErr, msPar, propPar, outChi2, Chi2, HitsIdx_arr,
-    maxSize, itrack, icand, N, cand_list);
-}
-
-
-__device__ void updateCandsWithInfo_seed(Track *candidates, int *ntracks_per_seed,
-    int iseed_ev, CandsGPU::BestCands<Config::maxCandsPerSeed, BLOCK_SIZE_X>& cand_list)
-{
-  int iseed_block = threadIdx.x;
-  Track new_cands[Config::maxCandsPerSeed];  // too big too be __shared__
-
-  for (int icand = 0; icand < ntracks_per_seed[iseed_ev]; ++icand) {
-    Track &tmp_track = new_cands[icand];
-
-    int my_trkIdx, my_hitIdx, my_nhits;
-    float my_chi2;
-
-    cand_list.get_cand_info(iseed_block, icand, my_trkIdx, 
-                            my_hitIdx, my_nhits, my_chi2);
-
-    if (0 <= my_trkIdx && my_trkIdx < ntracks_per_seed[iseed_ev]) {
-      Track& base_track = candidates[iseed_ev*Config::maxCandsPerSeed + my_trkIdx];
-
-      tmp_track = base_track;
-      tmp_track.addHitIdx(my_hitIdx, 0.f /*chi2*/);
-      tmp_track.setChi2(my_chi2);
-    }
-  }  // icand
-
-  // Second loop required: several new candidates can come from the same old one.
-  for (int icand = 0; icand < ntracks_per_seed[iseed_ev]; ++icand) {
-    candidates[iseed_ev*Config::maxCandsPerSeed + icand] = new_cands[icand];
-  }
-}
-
- 
-__global__ void findInLayers_kernel_seed (LayerOfHitsCU *layers, 
-    EtaBinOfCombCandidatesCU* etabins_of_comb_candidates, 
-    GPlexQI XHitSize, GPlexHitIdx XHitArr,  
-    GPlexLS Err_iP, GPlexLV Par_iP, 
-    GPlexHS *msErr_arr, GPlexHV *msPar_arr,
-    GPlexLS Err_iC, GPlexLV Par_iC,
-    GPlexQF outChi2,
-    GPlexQF Chi2, GPlexQI *HitsIdx_arr,
-    GPlexQI inChg, GPlexQI Label, 
-    GPlexQI SeedIdx, GPlexQI CandIdx,
-    GPlexQB Valid, GeometryCU geom, 
-    int *maxSize, int gplex_size,
-    const int nlayers_per_seed)
-{
-  int iseed_plex = threadIdx.x + blockIdx.x * blockDim.x;
-
-  for (int ebin = 0; ebin < Config::nEtaBin; ebin++) {
-    EtaBinOfCombCandidatesCU& etabin_of_cands = etabins_of_comb_candidates[ebin];
-    int nseed = etabin_of_cands.m_nseed;
-    if (nseed == 0) continue;
-
-    //int max_nseed_plex = gplex_size; // no more /Config::maxCandsPerSeed; -> one thread 
-    int N = gplex_size;
-    int maxSize_block;
-
-    // we need both iseed_plex and iseed_ev:
-    //   * iseed_plex to access the matriplex structures, used for the algorithm
-    //   * iseed_ev to access the other structures (tracks, mostly), used to 
-    //     interact with the outside world.
-    for (int iseed_ev = iseed_plex; 
-             iseed_ev < nseed; 
-             iseed_ev += gridDim.x * blockDim.x) {
-      int iseed_block = threadIdx.x;
-
-      __shared__ CandsGPU::BestCands<Config::maxCandsPerSeed, BLOCK_SIZE_X> cand_list;
-
-      Track *tracks = etabin_of_cands.m_candidates.data();
-      int *tracks_per_seed = etabin_of_cands.m_ntracks_per_seed.data();
-
-      for (int ilay = nlayers_per_seed; ilay <= Config::nLayers; ++ilay) {
-        cand_list.reset(iseed_block);
-        int Nhits = ilay;
-
-        for (int icand_seed = 0; icand_seed < tracks_per_seed[iseed_ev]; ++icand_seed) {
-          InputTracksAndHitIdxComb_fn_seed(tracks, tracks_per_seed, 
-              Err_iP, Par_iP,
-              inChg, Chi2, Label, HitsIdx_arr, 
-              SeedIdx, CandIdx, Valid, Nhits ,
-              iseed_ev, icand_seed, iseed_plex, N);
-
-          // no cands at this position for this layer
-          if (ilay > nlayers_per_seed) {
-            UpdateWithLastHit_fn(layers[ilay-1], HitsIdx_arr[ilay-1],
-                msPar_arr[ilay-1], msErr_arr[ilay-1],
-                Par_iP, Err_iP, Par_iC, Err_iC, iseed_plex, N);
-            if (ilay < Config::nLayers) {
-              propagationForBuilding_fn(Err_iC, Par_iC, inChg, geom.radii[ilay], 
-                  Err_iP, Par_iP, iseed_plex, N);
-              OutputParErrCU_fn_seed(tracks, Err_iP, Par_iP,
-                  iseed_ev, icand_seed, N);
-            } else {
-              OutputParErrCU_fn_seed(tracks, Err_iC, Par_iC,
-                  iseed_ev, icand_seed, N);
-              return;
-            }
-          }
-          XHitSize[iseed_plex] = 0;
-          selectHitIndices_fn(layers[ilay], Err_iP, Par_iP, 
-                              XHitSize, XHitArr, iseed_plex, N);
-          reduceMaxPartial_fn<int, BLOCK_SIZE_X, 1, cub::BLOCK_REDUCE_WARP_REDUCTIONS>
-            (XHitSize.ptr, blockDim.x * blockIdx.x, blockDim.x, &maxSize_block);
-
-          findCandidates_fn_seed(ilay,
-              layers[ilay].m_hits.data(), XHitSize, XHitArr,
-              Err_iP, msErr_arr[ilay], msPar_arr[ilay], Par_iP, 
-              outChi2, Chi2, HitsIdx_arr, Valid,
-              maxSize_block, iseed_plex, icand_seed, N, cand_list);
-        }  // icand_seed 
-        
-        cand_list.heap_sort(iseed_block, Config::maxCandsPerSeed);
-        tracks_per_seed[iseed_ev] = cand_list.count_valid_cands(iseed_block);
-
-        updateCandsWithInfo_seed(tracks, tracks_per_seed, iseed_ev, cand_list);
-      }  // ilay
-    }  // iseed
-  }  // eta
-}
-
-
-void findInLayers_wrapper_seed(cudaStream_t &stream,
-    LayerOfHitsCU *layers, 
-    EventOfCombCandidatesCU &event_of_cands_cu,
-    GPlexQI &XHitSize, GPlexHitIdx &XHitArr,
-    GPlexLS &Err_iP, GPlexLV &Par_iP,
-    GPlexHS *msErr, GPlexHV *msPar,
-    GPlexLS &Err_iC, GPlexLV &Par_iC,
-    GPlexQF &outChi2,
-    GPlexQF &Chi2, GPlexQI *HitsIdx_arr,
-    GPlexQI &inChg, GPlexQI &Label,
-    GPlexQI &SeedIdx, GPlexQI &CandIdx,
-    GPlexQB& Valid,
-    GeometryCU &geom, 
-    int *maxSize, int N)
-{
-  int gridx = (N-1)/BLOCK_SIZE_X + 1;
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-
-  findInLayers_kernel_seed <<<grid, block, 0, stream>>>
-      (layers, 
-       event_of_cands_cu.m_etabins_of_comb_candidates.data(),
-       XHitSize, XHitArr, Err_iP, Par_iP, msErr, msPar,
-       Err_iC, Par_iC, outChi2, Chi2, HitsIdx_arr, inChg, Label, 
-       SeedIdx, CandIdx, Valid, geom, maxSize, N, Config::nlayers_per_seed);
-  
-  /*cudaCheckErrorSync();*/
-}
diff --git a/mkFit/clone_engine_kernels_seed.h b/mkFit/clone_engine_kernels_seed.h
deleted file mode 100644
index 27c1d377..00000000
--- a/mkFit/clone_engine_kernels_seed.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef CLONE_ENGINE_KERNELS_SEED_H
-#define CLONE_ENGINE_KERNELS_SEED_H 
-
-#include "GPlex.h"
-#include "HitStructuresCU.h"
-#include "GeometryCU.h"
-
-namespace mkfit {
-
-void findInLayers_wrapper_seed(cudaStream_t &stream,
-                          LayerOfHitsCU *layers, 
-                          EventOfCombCandidatesCU &event_of_cands_cu,
-                          GPlexQI &XHitSize, GPlexHitIdx &XHitArr,
-                          GPlexLS &Err_iP, GPlexLV &Par_iP,
-                          GPlexHS *msErr, GPlexHV *msPar,
-                          GPlexLS &Err_iC, GPlexLV &Par_iC,
-                          GPlexQF &outChi2,
-                          GPlexQF &Chi2, GPlexQI *HitsIdx,
-                          GPlexQI &inChg, GPlexQI &Label,
-                          GPlexQI &SeedIdx, GPlexQI &CandIdx,
-                          GPlexQB& Valid,
-                          GeometryCU &geom, 
-                          int *maxSize, int N);
-
-} // end namespace mkfit
-#endif  /* ifndef CLONE_ENGINE_KERNELS_SEED_H */
diff --git a/mkFit/computeChi2_kernels.cu b/mkFit/computeChi2_kernels.cu
deleted file mode 100644
index 46aaee59..00000000
--- a/mkFit/computeChi2_kernels.cu
+++ /dev/null
@@ -1,144 +0,0 @@
-#include "computeChi2_kernels.h"
-
-#include "GPlex.h"
-#include "kalmanUpdater_kernels.h"
-#include "gpu_utils.h"
-
-#include <cstdio>
-
-#define L 6
-#define HS 6
-#define HV 3
-#define BLOCK_SIZE_X 256
-
-#include "KalmanUtilsMPlex.icc"
-
-__device__ void chi2Similarity_fn(
-    const GPlexReg2V &a,
-    const GPlexReg2S &c, // in registers
-    float *d, const size_t dN,
-    const int n) {
-
-  //int n = threadIdx.x + blockIdx.x * blockDim.x;
-
-  // manually subrtact into local vars -- 3 of them
-  /*float x0 = a[0 * aN + n] - b[0 * aN + n];*/
-  /*float x1 = a[1 * aN + n] - b[1 * aN + n];*/
-  /*float x2 = a[2 * aN + n] - b[2 * aN + n];*/
-  /*d[0 * dN + n] = c[0]*x0*x0 + c[2]*x1*x1 + c[5]*x2*x2 +*/
-              /*2*( c[1]*x1*x0 + c[3]*x2*x0 + c[4]*x1*x2);*/
-  d[0 * dN + n] = c[0]*a[0]*a[0]
-                + c[2]*a[1]*a[1] 
-            + 2*( c[1]*a[1]*a[0]);
-}
-
-
-__device__ void RotateResidulsOnTangentPlane_fn(const GPlexRegQF& r00,//r00
-				  const GPlexRegQF& r01,//r01
-				  const GPlexRegHV &a  ,//res_glo
-          GPlexReg2V &b  )//res_loc
-{
-   // res_loc = rotT * res_glo
-   //   B     = N R   *    A   
-   RotateResidulsOnTangentPlane_impl(r00, r01, a, b, 0, 1);
-}
-
-
-__device__ void ProjectResErr_fn(const GPlexRegQF& a00,
-		   const GPlexRegQF& a01,
-		   const GPlexRegHS &b, 
-       GPlexRegHH &c)
-{
-  // C = A * B, C is 3x3, A is 3x3 , B is 3x3 sym
-
-  // Based on script generation and adapted to custom sizes.
-      c[ 0] = a00[0]*b[ 0] + a01[0]*b[ 1];
-      c[ 1] = a00[0]*b[ 1] + a01[0]*b[ 2];
-      c[ 2] = a00[0]*b[ 3] + a01[0]*b[ 4];
-      c[ 3] = b[ 3];
-      c[ 4] = b[ 4];
-      c[ 5] = b[ 5];
-      c[ 6] = a01[0]*b[ 0] - a00[0]*b[ 1];
-      c[ 7] = a01[0]*b[ 1] - a00[0]*b[ 2];
-      c[ 8] = a01[0]*b[ 3] - a00[0]*b[ 4];
-}
-
-
-__device__ void ProjectResErrTransp_fn(const GPlexRegQF& a00,
-			 const GPlexRegQF& a01, const GPlexRegHH& b, GPlexReg2S& c)
-{
-  // C = A * B, C is 3x3 sym, A is 3x3 , B is 3x3
-
-  // Based on script generation and adapted to custom sizes.
-      c[ 0] = b[ 0]*a00[0] + b[ 1]*a01[0];
-      c[ 1] = b[ 3]*a00[0] + b[ 4]*a01[0];
-      c[ 2] = b[ 5];
-}
-
-
-__device__ void computeChi2_fn(
-    const GPlexLS &propErr, const GPlexHS &msErr, const GPlexHV &msPar,
-    const GPlexLV &propPar, GPlexQF &outChi2, const int n, const int N) {
-  //int n = threadIdx.x + blockIdx.x * blockDim.x;
-  /*float resErr_reg[HS]; // ~ resErr_glo*/
-  GPlexRegHS resErr_reg;
-
-  if (n < N) {
-    // coordinate change
-    GPlexRegQF rotT00;
-    GPlexRegQF rotT01;
-    const float r = hipo(msPar(n, 0, 0), msPar(n, 1, 0));
-    rotT00[0] = -(msPar(n, 1, 0) + propPar(n, 1, 0))/(2*r);
-    rotT01[0] =  (msPar(n, 0, 0) + propPar(n, 0, 0))/(2*r);
-
-    GPlexRegHV res_glo;
-    subtractFirst3_fn(msPar, propPar, res_glo, N, n);
-
-    for (int j = 0; j < HS; ++j) {
-      resErr_reg[j] = 0; //resErr[j*resErr_stride + n];
-    }
-    addIntoUpperLeft3x3_fn(propErr, msErr, resErr_reg, N, n);
-
-    GPlexReg2V res_loc;   //position residual in local coordinates
-    RotateResidulsOnTangentPlane_fn(rotT00,rotT01,res_glo,res_loc);
-    /*MPlex2S resErr_loc;//covariance sum in local position coordinates*/
-    /*MPlexHH tempHH;*/
-    GPlexReg2S resErr_loc; // 2x2 sym
-    GPlexRegHH tempHH;  // 3*3 sym
-    ProjectResErr_fn  (rotT00, rotT01, resErr_reg, tempHH);
-    ProjectResErrTransp_fn(rotT00, rotT01, tempHH, resErr_loc);
-
-    /*invertCramerSym_fn(resErr_reg);*/
-    invertCramerSym2x2_fn(resErr_loc);
-
-    chi2Similarity_fn(res_loc, resErr_loc, outChi2.ptr, outChi2.stride, n);
-  }
-}
-
-
-__global__ void computeChi2_kernel(
-    const GPlexLS propErr, const GPlexHS msErr, const GPlexHV msPar, 
-    const GPlexLV propPar, GPlexQF outChi2, const int N) {
-  int grid_width = blockDim.x * gridDim.x;
-  int itrack = threadIdx.x + blockDim.x*blockIdx.x;
-  for (int z = 0; z < (N-1)/grid_width  +1; z++) {
-    itrack += z*grid_width;
-
-    if (itrack < N) {
-      computeChi2_fn (propErr, msErr, msPar, propPar, outChi2, itrack, N);
-    }
-  }
-}
-
-
-void computeChi2_wrapper(cudaStream_t &stream, 
-    const GPlexLS &propErr, const GPlexHS &msErr,
-    const GPlexHV &msPar, const GPlexLV &propPar, GPlexQF &outChi2,
-    const int N) {
-  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
-                       max_blocks_x);
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-  computeChi2_kernel <<< grid, block, 0, stream >>>
-    (propErr, msErr, msPar, propPar, outChi2, N);
- }
diff --git a/mkFit/computeChi2_kernels.h b/mkFit/computeChi2_kernels.h
deleted file mode 100644
index 145a6304..00000000
--- a/mkFit/computeChi2_kernels.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef _COMPUTE_CHI2_KERNELS_H_
-#define _COMPUTE_CHI2_KERNELS_H_
-
-#include "HitStructuresCU.h"
-#include "GPlex.h"
-#include "GeometryCU.h"
-
-namespace mkfit {
-
-__device__ void computeChi2_fn(const GPlexLS &propErr, const GPlexHS &msErr,
-    const GPlexHV &msPar, const GPlexLV &propPar, GPlexQF &outChi2, 
-    const int itrack, const int N);
-
-
-void computeChi2_wrapper(cudaStream_t &stream, 
-    const GPlexLS &propErr, const GPlexHS &msErr, const GPlexHV &msPar, 
-    const GPlexLV &propPar, GPlexQF &outChi2, const int N);
-
-#if 1
-void HitToMs_wrapper(cudaStream_t& stream,
-    GPlexHS &msErr, GPlexHV &msPar, LayerOfHitsCU &layer, GPlexQI &XHitSize, 
-    GPlexHitIdx &XHitArr, GPlexQI &HitsIdx, int hit_cnt, int N);
-#endif
-
-__device__ void RotateResidulsOnTangentPlane_fn(const GPlexRegQF& r00, 
-    const GPlexRegQF& r01, const GPlexRegHV &a, GPlexReg2V &b);
-
-__device__ void ProjectResErr_fn(const GPlexRegQF& a00, const GPlexRegQF& a01,
-                                 const GPlexRegHS &b, GPlexRegHH &c);
-
-__device__ void ProjectResErrTransp_fn(const GPlexRegQF& a00, const GPlexRegQF& a01, 
-                                       const GPlexRegHH &b, GPlexReg2S &c);
-
-} // end namespace mkfit
-#endif
diff --git a/mkFit/device_array_view.h b/mkFit/device_array_view.h
deleted file mode 100644
index 0ebccfde..00000000
--- a/mkFit/device_array_view.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef DEVICE_ARRAY_VIEW_H_
-#define DEVICE_ARRAY_VIEW_H_
-
-#include "Config.h"
-#include <cstddef>
-
-namespace mkfit {
-
-template <typename T>
-class DeviceArrayView {
- public:
-  using value_type = T;
-  using size_type = std::size_t;
-  using reference = value_type&;
-  using const_reference = const value_type&;
-  using pointer = value_type*;
-  using const_pointer = const pointer;
-
-
-  DeviceArrayView() : size_(0), data_(nullptr) {}
-  DeviceArrayView(pointer ptr, size_type count) : size_(count), data_(ptr) {}
-
-  CUDA_CALLABLE const_reference operator[](int i) const {return data_[i];}
-  CUDA_CALLABLE       reference operator[](int i)       {return data_[i];}
-
-  void set_view(pointer ptr, size_type count) {
-    data_ = ptr;
-    size_ = count;
-  }
-  void set_ptr(pointer ptr) {
-    // when we don't care about size_ : needs to be fixed?
-    data_ = ptr;
-    //    size_ = count;
-  }
-
-  CUDA_CALLABLE const_pointer data() const {return data_;}
-  CUDA_CALLABLE       pointer data()       {return data_;}
-  CUDA_CALLABLE size_t size() {return size_;}
- private:
-  size_type size_;
-  pointer data_;
-};
-
-
-} // end namespace mkfit
-#endif /* DEVICE_ARRAY_VIEW_H_ */
diff --git a/mkFit/device_bundle.h b/mkFit/device_bundle.h
deleted file mode 100644
index 9eb4efc4..00000000
--- a/mkFit/device_bundle.h
+++ /dev/null
@@ -1,88 +0,0 @@
-#ifndef BUNDLE_VECTOR_H
-#define BUNDLE_VECTOR_H
-
-#include "device_vector.h"
-
-namespace mkfit {
-
-template <typename T>
-class DeviceBundle {
-public:
-  using value_type = T;
-  using size_type = size_t;
-  using pointer = value_type*;
-  using const_pointer = const pointer;
-  using reference = value_type&;
-  using const_reference = const value_type&;
-
-  DeviceBundle(size_type num_part=0, size_type local_count=0);
-
-  const_reference operator[](int i) const { return data_[i]; }
-        reference operator[](int i)       { return data_[i]; }
-
-  void reserve(size_type num_parts, size_type local_count);
-        pointer data();
-  const_pointer data() const;
-  pointer get_ptr_to_part(size_type part_idx);
-
-  size_type global_capacity() { return global_capacity_; }
-  size_type local_capacity() { return local_capacity_; }
-
-  void copy_from_cpu(const pointer from, const cudaStream_t &stream);
-private:
-  DeviceVector<T> data_;
-
-  size_type num_parts_;
-  size_type local_capacity_;
-  size_type global_capacity_;
-};
-
-
-///////////////////////////////////////////////////////////////////////////////
-/// Implementation
-///////////////////////////////////////////////////////////////////////////////
-
-template<typename T>
-inline DeviceBundle<T>::DeviceBundle(size_type num_part,
-                                     size_type local_count) {
-  reserve(num_part, local_count);
-}
-
-
-template<typename T>
-inline void DeviceBundle<T>::reserve(size_type num_parts, size_type local_count) {
-  num_parts_ = num_parts;
-  local_capacity_ = local_count;
-  global_capacity_ = num_parts_ * local_capacity_;
-  data_.reserve(global_capacity_);
-}
-
-
-template<typename T>
-inline typename DeviceBundle<T>::pointer DeviceBundle<T>::data() {
-  return data_.data();
-}
-
-
-template<typename T>
-inline typename DeviceBundle<T>::const_pointer DeviceBundle<T>::data() const {
-  return data_.data();
-}
-
-
-template<typename T>
-inline typename DeviceBundle<T>::pointer DeviceBundle<T>::get_ptr_to_part(size_type part_idx) {
-  return &data_[part_idx * local_capacity_];
-}
-
-
-template <typename T>
-inline void DeviceBundle<T>::copy_from_cpu(const pointer from,
-                                           const cudaStream_t &stream)
-{
-  cudaMemcpyAsync(data(), from, global_capacity()*sizeof(value_type),
-                  cudaMemcpyHostToDevice, stream);
-}
-
-} // end namespace mkfit
-#endif  // BUNDLE_VECTOR_H
diff --git a/mkFit/device_vector.h b/mkFit/device_vector.h
deleted file mode 100644
index 7a0c2afe..00000000
--- a/mkFit/device_vector.h
+++ /dev/null
@@ -1,118 +0,0 @@
-#ifndef DEVICE_VECTOR_H_
-#define DEVICE_VECTOR_H_
-
-#include "gpu_utils.h"
-#include "Config.h"
-#include "cuda_runtime.h"
-
-#include <cassert>
-
-namespace mkfit {
-
-template <typename T>
-class DeviceVector {
- public:
-  using value_type = T;
-  using size_type = size_t;
-  using pointer = value_type*;
-  using const_pointer = const pointer;
-  using reference = value_type&;
-  using const_reference = const value_type&;
-
-  DeviceVector() {}
-  DeviceVector(size_type count) { alloc(count); }
-  ~DeviceVector() { free(); }
-
-  CUDA_CALLABLE const_reference operator[](int i) const { return data_[i]; }
-  CUDA_CALLABLE       reference operator[](int i)       { return data_[i]; }
-
-  CUDA_CALLABLE       pointer data()       {return data_;}
-  CUDA_CALLABLE const_pointer data() const {return data_;}
-
-
-  CUDA_CALLABLE size_type size() {return size_;}
-  CUDA_CALLABLE size_type capacity() {return capacity_;}
-
-   void reserve(size_type count);
-   void resize(size_type Newsize);
-
-   void reserve_and_resize(size_type count);
-   void copy_from_cpu(const_pointer from, const cudaStream_t &stream);
-   void copy_to_cpu(pointer to, const cudaStream_t &stream) const;
- private:
-  void alloc(size_type count);
-  void free();
-
-  T* data_ = nullptr;
-  size_t size_ = 0;
-  size_t capacity_ = 0;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-/// Implementation
-///////////////////////////////////////////////////////////////////////////////
-
-template<typename T>
-inline void DeviceVector<T>::reserve(size_type count) {
-  if (count <= capacity_) return;
-  if (data_ != nullptr) free();
-  alloc(count);
-  capacity_ = count;
-}
-
-
-template<typename T>
-inline void DeviceVector<T>::resize(size_type new_size) {
-  assert(new_size <= capacity_);
-  size_ = new_size;
-}
-
-
-template<typename T>
-inline void DeviceVector<T>::reserve_and_resize(size_type count) {
-  reserve(count);
-  resize(count);
-}
-
-
-template <typename T>
-inline void DeviceVector<T>::copy_from_cpu(const pointer from,
-                                           const cudaStream_t& stream)
-{
-  cudaMemcpyAsync(data_, from, size_*sizeof(value_type),
-                  cudaMemcpyHostToDevice, stream);
-}
-
-
-template <typename T>
-inline void DeviceVector<T>::copy_to_cpu(pointer to,
-                                         const cudaStream_t& stream) const
-{
-  cudaMemcpyAsync(to, data_, size_*sizeof(value_type),
-                  cudaMemcpyDeviceToHost, stream);
-}
-
-
-template<typename T>
-inline void DeviceVector<T>::alloc(size_type count) {
-  if (count > 0) {
-    cudaMalloc((void**)&data_, count*sizeof(T));
-  }
-  cudaCheckError();
-  capacity_ = count;
-}
-
-
-template<typename T>
-inline void DeviceVector<T>::free() {
-  if (data_ != nullptr) {
-    cudaFree(data_);
-    cudaCheckError();
-  }
-  capacity_ = 0;
-  size_ = 0;
-  data_ = nullptr;
-}
-
-} // end namespace mkfit
-#endif /* DEVICE_VECTOR_H_ */
diff --git a/mkFit/fittestMPlex.cc b/mkFit/fittestMPlex.cc
index 7c81e226..c443cb17 100644
--- a/mkFit/fittestMPlex.cc
+++ b/mkFit/fittestMPlex.cc
@@ -9,11 +9,6 @@
 #include <Debug.h>
 
 #include "MkFitter.h"
-#if USE_CUDA
-#include "fittestMPlex.h"
-#include "FitterCU.h"
-//#include <omp.h>
-#endif
 
 #ifndef NO_ROOT
 #include "TFile.h"
@@ -120,105 +115,4 @@ double runFittingTestPlex(Event& ev, std::vector<Track>& rectracks)
    return time;
 }
 
-#ifdef USE_CUDA
-void runAllEventsFittingTestPlexGPU(std::vector<Event>& events)
-{
-  double s_tmp = 0.0;
-#if 0
-  In principle, the warmup loop should not be required.
-  The separate_first_call_for_meaningful_profiling_numbers() function
-  should be enough.
-  // Warmup loop
-  for (int i = 0; i < 1; ++i) {
-    FitterCU<float> cuFitter(NN);
-    cuFitter.allocateDevice();
-    Event &ev = events[0];
-    std::vector<Track> plex_tracks_ev;
-    plex_tracks_ev.resize(ev.simTracks_.size());
-
-    if (g_run_fit_std) runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev);
-    cuFitter.freeDevice();
-  }
-#endif
-  separate_first_call_for_meaningful_profiling_numbers();
-
-  // Reorganization (copyIn) can eventually be multithreaded.
-// FIXME: revisit multithreading when track building is ported to GPU.
-//  omp_set_nested(1);
-      
-//  omp_set_num_threads(Config::numThreadsEvents);
-  double total_gpu_time = dtime();
-//#pragma omp parallel reduction(+:s_tmp)
-  {
-//  int numThreadsEvents = omp_get_num_threads();
-//  int thr_idx = omp_get_thread_num();
-  int numThreadsEvents = 1;
-  int thr_idx = 0;
-
-  // FitterCU is declared here to share allocations and deallocations
-  // between the multiple events processed by a single thread.
-  int gplex_size = 10000;
-  FitterCU<float> cuFitter(gplex_size);
-  cuFitter.allocateDevice();
-  cuFitter.allocate_extra_addBestHit();
-
-    for (int evt = thr_idx+1; evt <= Config::nEvents; evt+= numThreadsEvents) {
-      int idx = thr_idx;
-      printf("==============================================================\n");
-      printf("Processing event %d with thread %d\n", evt, idx);
-      Event &ev = events[evt-1];
-      std::vector<Track> plex_tracks_ev;
-      plex_tracks_ev.resize(ev.simTracks_.size());
-      double tmp = 0, tmp2bh = 0, tmp2 = 0, tmp2ce = 0;
-
-      //if (g_run_fit_std) tmp = runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev);
-      runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev);
-
-      printf("Matriplex fit = %.5f  -------------------------------------", tmp);
-      printf("\n");
-      s_tmp    += tmp;
-#if 0  // 0 for timing, 1 for validation
-      // Validation crashes for multiple threads.
-      // It is something in relation to ROOT. Not sure what. 
-      //if (omp_get_num_threads() <= 1) {
-        //if (g_run_fit_std) {
-          std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root";
-        //}
-      //}
-#endif
-    }
-    cuFitter.free_extra_addBestHit();
-    cuFitter.freeDevice();
-  }
-  std::cerr << "###### [Fitting] Total GPU time: " << dtime() - total_gpu_time << " ######\n";
-}
-
-
-double runFittingTestPlexGPU(FitterCU<float> &cuFitter, 
-    Event& ev, std::vector<Track>& rectracks)
-{
-  std::vector<Track>& simtracks = ev.simTracks_;
-
-  Track *tracks_cu;
-  cudaMalloc((void**)&tracks_cu, simtracks.size()*sizeof(Track));
-  cudaMemcpyAsync(tracks_cu, &simtracks[0], simtracks.size()*sizeof(Track),
-                  cudaMemcpyHostToDevice, cuFitter.get_stream());
-
-  EventOfHitsCU events_of_hits_cu;
-  events_of_hits_cu.reserve_layers(ev.layerHits_);
-  events_of_hits_cu.copyFromCPU(ev.layerHits_, cuFitter.get_stream());
-
-  double time = dtime();
-
-  cuFitter.FitTracks(tracks_cu, simtracks.size(), events_of_hits_cu, Config::nLayers, true);
-
-  cudaMemcpy(&rectracks[0], tracks_cu, simtracks.size()*sizeof(Track), cudaMemcpyDeviceToHost);
-
-  time = dtime() - time;
-
-  cudaFree(tracks_cu);
-
-  return time;
-}
-#endif
 } // end namespace mkfit
diff --git a/mkFit/fittestMPlex.h b/mkFit/fittestMPlex.h
index 3f932e7a..8925167d 100644
--- a/mkFit/fittestMPlex.h
+++ b/mkFit/fittestMPlex.h
@@ -4,18 +4,9 @@
 #include "Event.h"
 #include "Track.h"
 
-#ifdef USE_CUDA
-#include "FitterCU.h"
-#endif
-
 namespace mkfit {
 
 double runFittingTestPlex(Event& ev, std::vector<Track>& rectracks);
 
-#ifdef USE_CUDA
-void runAllEventsFittingTestPlexGPU(std::vector<Event>& events);
-double runFittingTestPlexGPU(FitterCU<float> &cuFitter, Event& ev, std::vector<Track>& rectracks);
-#endif
-
 } // end namespace mkfit
 #endif
diff --git a/mkFit/fittracks_kernels.cu b/mkFit/fittracks_kernels.cu
deleted file mode 100644
index 6a042d8b..00000000
--- a/mkFit/fittracks_kernels.cu
+++ /dev/null
@@ -1,51 +0,0 @@
-#include "fittracks_kernels.h"
-
-#include "kalmanUpdater_kernels.h"
-#include "propagation_kernels.h"
-
-constexpr int BLOCK_SIZE_X = 256;
-
-__global__ void fittracks_kernel(
-      GPlexLV par_iP, GPlexLS Err_iP,
-      GPlexHV *msPar_arr, GPlexHS *msErr_arr,
-      GPlexLV par_iC, GPlexLS Err_iC,
-      GPlexLL errorProp, GPlexQI inChg,
-      const bool useParamBfield,
-      const int Nhits, int N)
-{
-  int grid_width = blockDim.x * gridDim.x;
-  int n = threadIdx.x + blockIdx.x * blockDim.x;
-  for (int hi = 0; hi < Nhits; ++hi) {
-    GPlexHV &msPar = msPar_arr[hi];
-    GPlexHS &msErr = msErr_arr[hi];
-
-    for (int z = 0; z < (N-1)/grid_width  +1; z++) {
-      n += z*grid_width;
-
-      propagation_fn(Err_iC, par_iC, inChg, msPar, Err_iP, par_iP, useParamBfield, n, N);
-      kalmanUpdate_fn(Err_iP, msErr, par_iP, msPar, par_iC, Err_iC, n, N);
-    }
-  }
-}
-
-void fittracks_wrapper(cudaStream_t &stream,
-                       GPlexLS &Err_iP, GPlexLV &par_iP, 
-                       GPlexHS *msErr_arr, GPlexHV *msPar_arr,
-                       GPlexLS &Err_iC, GPlexLV &par_iC,
-                       GPlexLL &errorProp, GPlexQI &inChg,
-                       const bool useParamBfield,
-                       const int Nhits, const int N)
-{
-  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
-                       max_blocks_x);
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-
-  fittracks_kernel <<< grid, block, 0, stream >>>
-      (par_iP, Err_iP, 
-       msPar_arr, msErr_arr,
-       par_iC, Err_iC,
-       errorProp, inChg, 
-       useParamBfield,
-       Nhits, N);
-}
diff --git a/mkFit/fittracks_kernels.h b/mkFit/fittracks_kernels.h
deleted file mode 100644
index 9afeb13b..00000000
--- a/mkFit/fittracks_kernels.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef FITTRACKS_KERNELS_H_G3FDJYTX
-#define FITTRACKS_KERNELS_H_G3FDJYTX
-
-#include "GPlex.h"
-
-namespace mkfit {
-
-void fittracks_wrapper(cudaStream_t &stream,
-                       GPlexLS &Err_iP, GPlexLV &par_iP, 
-                       GPlexHS *msErr, GPlexHV *msPar,
-                       GPlexLS &Err_iC, GPlexLV &par_iC,
-                       GPlexLL &errorProp, GPlexQI &inChg,
-                       const bool useParamBfield,
-                       const int hit_idx, const int N);
-
-} // end namespace mkfit
-#endif /* end of include guard: FITTRACKS_KERNELS_H_G3FDJYTX */
diff --git a/mkFit/gpu_utils.cu b/mkFit/gpu_utils.cu
deleted file mode 100644
index 2800aa39..00000000
--- a/mkFit/gpu_utils.cu
+++ /dev/null
@@ -1,9 +0,0 @@
-#include "gpu_utils.h"
-
-void sync_gpu() {
-  cudaCheckErrorSync();
-}
-
-void separate_first_call_for_meaningful_profiling_numbers() {
-  sync_gpu();
-}
diff --git a/mkFit/gpu_utils.h b/mkFit/gpu_utils.h
deleted file mode 100644
index 6f84d800..00000000
--- a/mkFit/gpu_utils.h
+++ /dev/null
@@ -1,63 +0,0 @@
-#ifndef GPU_UTILS_H
-#define GPU_UTILS_H 
-
-#include <cub/util_debug.cuh>
-#include <nvToolsExt.h>
-
-#include <cstdint>
-#include <unordered_map>
-
-#define cudaCheckError()               \
-  do {                                 \
-    cudaError_t e=cudaGetLastError();  \
-    CubDebugExit(e);                  \
-  } while(0)  
-
-#define cudaCheckErrorSync()           \
-  do {                                 \
-    cudaDeviceSynchronize();           \
-    cudaCheckError();                  \
-  } while(0)
-
-namespace mkfit {
-
-// CUDA specific:
-// Maximum number of blocks in the X direction of the thread grid.
-constexpr int max_blocks_x = INT32_MAX;
-
-// The first call to a CUDA API function takes the initialization hit.
-void separate_first_call_for_meaningful_profiling_numbers();
-
-void sync_gpu();
-
-
-class Tracer {
- public:
-  Tracer(const char* name, unsigned int color) {
-    nvtxEventAttributes_t eventAttrib = {0};
-    eventAttrib.version = NVTX_VERSION;
-    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
-    eventAttrib.colorType = NVTX_COLOR_ARGB;
-    eventAttrib.color = color;
-    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
-    eventAttrib.message.ascii = name;
-    nvtxRangePushEx(&eventAttrib);
-  }
-  ~Tracer() {
-    nvtxRangePop();
-  }
-};
-
-static std::unordered_map<std::string, unsigned int> tracer_colors = {
-    {"Blue",            0x0000FF},
-    {"Orange",          0xFFA500},
-    {"Red",             0xFF0000},
-    {"Pink",            0xFFC0CB},
-    {"Green",           0x008000},
-    {"PaleGreen",       0x98FB98},
-    {"Chocolate",       0xD2691E}
-};
-
-} // end namespace mkfit
-
-#endif /* ifndef GPU_UTILS_H */
diff --git a/mkFit/index_selection_kernels.cu b/mkFit/index_selection_kernels.cu
deleted file mode 100644
index 8116ad59..00000000
--- a/mkFit/index_selection_kernels.cu
+++ /dev/null
@@ -1,153 +0,0 @@
-#include "index_selection_kernels.h"
-#include "Config.h"
-#include "HitStructures.h"
-#include "gpu_utils.h"
-
-#include "stdio.h"
-
-#define BLOCK_SIZE_X 32
-
-constexpr bool tmp_useCMSGeom = false;
-
-__device__ void selectHitIndices_fn(const LayerOfHitsCU &layer_of_hits,
-    const GPlexLS &Err, const GPlexLV &Par, GPlexQI &XHitSize, 
-    GPlexHitIdx &XHitArr, const int itrack, const int N)
-{
-  if (itrack < N) {
-    bool dump = false;
-    const float nSigmaPhi = 3;
-    const float nSigmaZ   = 3;
-
-    int xhitsize_tmp = XHitSize[itrack];
-    XHitSize[itrack] = 0;
-
-    float z, phi, dz, dphi;
-    {
-      const float x = Par(itrack, 0, 0);
-      const float y = Par(itrack, 1, 0);
-
-      const float r2 = x*x + y*y;
-
-      z   = Par(itrack, 2, 0);
-      phi = getPhi(x, y);
-      dz  = nSigmaZ * sqrtf(Err[5*Err.stride + itrack]);
-
-      const float dphidx = -y/r2, dphidy = x/r2;
-      const float dphi2  = dphidx * dphidx * Err[0*Err.stride + itrack] +
-                           dphidy * dphidy * Err[2*Err.stride + itrack] +
-                       2 * dphidx * dphidy * Err[1*Err.stride + itrack];
-
-#ifdef HARD_CHECK
-      assert(dphi2 >= 0);
-#endif
-
-      dphi = nSigmaPhi * sqrtf(fabs(dphi2));
-
-      if (tmp_useCMSGeom)
-      {
-        //now correct for bending and for layer thickness unsing linear approximation
-        const float deltaR = Config::cmsDeltaRad; //fixme! using constant value, to be taken from layer properties
-        const float r  = sqrt(r2);
-        //here alpha is the difference between posPhi and momPhi
-        const float alpha = phi - Par(itrack, 4, 0);
-        float cosA, sinA;
-        if (Config::useTrigApprox) {
-          sincos4(alpha, sinA, cosA);
-        } else {
-          cosA = cos(alpha);
-          sinA = sin(alpha);
-        }
-        //take abs so that we always inflate the window
-        const float dist = fabs(deltaR*sinA/cosA);
-
-        dphi += dist / r;
-      }
-    }
-
-    const LayerOfHitsCU &L = layer_of_hits;
-
-    if (fabs(dz)   > Config::m_max_dz)   dz   = Config::m_max_dz;
-    if (fabs(dphi) > Config::m_max_dphi) dphi = Config::m_max_dphi;
-
-    const int zb1 = L.GetZBinChecked(z - dz);
-    const int zb2 = L.GetZBinChecked(z + dz) + 1;
-    const int pb1 = L.GetPhiBin(phi - dphi);
-    const int pb2 = L.GetPhiBin(phi + dphi) + 1;
-    // MT: The extra phi bins give us ~1.5% more good tracks at expense of 10% runtime.
-    // const int pb1 = L.GetPhiBin(phi - dphi) - 1;
-    // const int pb2 = L.GetPhiBin(phi + dphi) + 2;
-
-    if (dump)
-      printf("LayerOfHitsCU::SelectHitIndices %6.3f %6.3f %6.6f %7.5f %3d %3d %4d %4d\n",
-             z, phi, dz, dphi, zb1, zb2, pb1, pb2);
-
-    // MT: One could iterate in "spiral" order, to pick hits close to the center.
-    // http://stackoverflow.com/questions/398299/looping-in-a-spiral
-    // This would then work best with relatively small bin sizes.
-    // Or, set them up so I can always take 3x3 array around the intersection.
-    for (int zi = zb1; zi < zb2; ++zi)
-    {
-      for (int pi = pb1; pi < pb2; ++pi)
-      {
-        const int pb = pi & L.m_phi_mask;
-
-        // MT: The following line is the biggest hog (4% total run time).
-        // This comes from cache misses, I presume.
-        // It might make sense to make first loop to extract bin indices
-        // and issue prefetches at the same time.
-        // Then enter vectorized loop to actually collect the hits in proper order.
-
-        /*for (int hi = L.m_phi_bin_infos[zi][pb].first; hi < L.m_phi_bin_infos[zi][pb].second; ++hi)*/
-        for (int hi = L.m_phi_bin_infos[zi*Config::m_nphi + pb].first; 
-                 hi < L.m_phi_bin_infos[zi*Config::m_nphi + pb].second; ++hi)
-        {
-          // MT: Access into m_hit_zs and m_hit_phis is 1% run-time each.
-
-#ifdef LOH_USE_PHI_Z_ARRAYS
-          float ddz   = fabs(z   - L.m_hit_zs[hi]);
-          float ddphi = fabs(phi - L.m_hit_phis[hi]);
-          if (ddphi > Config::PI) ddphi = Config::TwoPI - ddphi;
-
-          if (dump)
-            printf("     SHI %3d %4d %4d %5d  %6.3f %6.3f %6.4f %7.5f   %s\n",
-                   zi, pi, pb, hi,
-                   L.m_hit_zs[hi], L.m_hit_phis[hi], ddz, ddphi,
-                   (ddz < dz && ddphi < dphi) ? "PASS" : "FAIL");
-
-          // MT: Commenting this check out gives full efficiency ...
-          //     and means our error estimations are wrong!
-          // Avi says we should have *minimal* search windows per layer.
-          // Also ... if bins are sufficiently small, we do not need the extra
-          // checks, see above.
-          // if (ddz < dz && ddphi < dphi && XHitSize[itrack] < MPlexHitIdxMax)
-#endif
-          // MT: The following check also makes more sense with spiral traversal,
-          // we'd be taking in closest hits first.
-          if (XHitSize[itrack] < GPlexHitIdxMax)
-          {
-            XHitArr(itrack, XHitSize[itrack]++, 0) = hi;
-          }
-        }
-      }
-    }
-  }
-}
-
-
-__global__ void selectHitIndices_kernel(const LayerOfHitsCU layer_of_hits,
-    const GPlexLS Err, const GPlexLV Par, GPlexQI XHitSize, GPlexHitIdx XHitArr, const int N) {
-  int itrack = threadIdx.x + blockDim.x*blockIdx.x;
-  selectHitIndices_fn(layer_of_hits, Err, Par, XHitSize, XHitArr, itrack, N);
-}
-
-
-void selectHitIndices_wrapper(const cudaStream_t& stream,
-    const LayerOfHitsCU& layer_of_hits, const GPlexLS& Err, const GPlexLV& Par, 
-    GPlexQI& XHitSize, GPlexHitIdx& XHitArr, const int N) {
-  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
-                       max_blocks_x);
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-  selectHitIndices_kernel <<< grid, block, 0, stream >>>
-    (layer_of_hits, Err, Par, XHitSize, XHitArr, N);
-}
diff --git a/mkFit/index_selection_kernels.h b/mkFit/index_selection_kernels.h
deleted file mode 100644
index 801b5a4b..00000000
--- a/mkFit/index_selection_kernels.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _INDEX_SELECTION_KERNELS_H_
-#define _INDEX_SELECTION_KERNELS_H_
-
-#include "HitStructuresCU.h"
-#include "GPlex.h"
-
-void selectHitIndices_wrapper(const cudaStream_t& stream,
-    const LayerOfHitsCU& layer_of_hits, const GPlexLS& Err, const GPlexLV& Par, 
-    GPlexQI& XHitSize, GPlexHitIdx& XHitArr, const int N);
-
-__device__ void selectHitIndices_fn(const LayerOfHitsCU &layer_of_hits,
-    const GPlexLS &Err, const GPlexLV &Par, GPlexQI &XHitSize,
-    GPlexHitIdx &XHitArr, const int itrack, const int N);
-
-#endif  // _INDEX_SELECTION_KERNELS_H_
diff --git a/mkFit/kalmanUpdater_kernels.cu b/mkFit/kalmanUpdater_kernels.cu
deleted file mode 100644
index 18155c8c..00000000
--- a/mkFit/kalmanUpdater_kernels.cu
+++ /dev/null
@@ -1,402 +0,0 @@
-#include "Config.h"
-#include "Hit.h"
-#include "kalmanUpdater_kernels.h"
-#include "computeChi2_kernels.h"
-#include "gpu_utils.h"
-
-// TODO: Clean all the hard-coded #define
-#define LS 21
-#define HS 6
-#define LH 18
-#define HV 3
-
-#define BLOCK_SIZE_X 32
-
-
-__device__ void subtract_matrix(const float *a, const int aN, 
-                                const float *b, const int bN, 
-                                      float *c, const int cN,
-                                const int size, const int n) {
-  for (int i = 0; i < size; ++i) {
-    c[i*cN + n] = a[i*aN + n] - b[i*bN + n];
-    
-  }
-}
-
-__device__ float getHypot_fn(const float x, const float y)
-{
-  return sqrt(x*x + y*y);
-}
-
-__device__
-void KalmanHTG_fn(const GPlexRegQF& a00, const GPlexRegQF& a01,
-                  const GPlexReg2S &b, GPlexRegHH &c)
-{
-
-   // HTG  = rot * res_loc
-   //   C  =  A  *    B   
-
-   // Based on script generation and adapted to custom sizes.
-      c[ 0] = a00[0]*b[ 0];
-      c[ 1] = a00[0]*b[ 1];
-      c[ 2] = 0.;
-      c[ 3] = a01[0]*b[ 0];
-      c[ 4] = a01[0]*b[ 1];
-      c[ 5] = 0.;
-      c[ 6] = b[ 1];
-      c[ 7] = b[ 2];
-      c[ 8] = 0.;
-}
-
-__device__
-void KalmanGain_fn(const GPlexLS &A, const GPlexRegHH &b, GPlexRegLH &c, const int n)
-{
-  // C = A * B, C is 6x3, A is 6x6 sym , B is 6x3
-  using T = float;
-  float *a = A.ptr;
-  int aN = A.stride; int an = n;  // Global array
-  int bN = 1;        int bn = 0;  // Register array
-  int cN = 1;        int cn = 0;
-
-#include "KalmanGain.ah"
-}
-
-#include "KalmanUtilsMPlex.icc"
-
-__device__
-void KHMult_fn(const GPlexRegLH &A, 
-               const GPlexRegQF& B00,
-               const GPlexRegQF& B01,
-               GPlexRegLL &C)
-{
-  KHMult_imp(A, B00, B01, C, 0, 1);
-}
-
-__device__
-void KHC_fn(const GPlexRegLL &a, const GPlexLS &B, GPlexLS &C, const int n)
-{
-  // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym
-  using T = float;
-                 int aN = 1; int an = 0;  // Register array
-  T *b = B.ptr;  int bN = B.stride;  int bn = n;
-  T *c = C.ptr;  int cN = C.stride;  int cn = n;
-#include "KHC.ah"
-}
-
-// 
-__device__
-void ConvertToCCS_fn(const GPlexLV &a, GPlexRegLV &b, GPlexRegLL &c, const int n)
-{
-  ConvertToCCS_imp(a, b, c, n, n+1);
-}
-
-__device__
-void PolarErr_fn(const GPlexRegLL &a, const float *b, int bN, GPlexRegLL &c, const int n)
-{
-  // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym
- 
-  // Generated code access arrays with variables cN, cn
-  // c[i*cN+cn]  
-  int aN = 1; int an = 0;  // Register array
-              int bn = n;  // Global array
-  int cN = 1; int cn = 0;
-#include "CCSErr.ah"
-}
-
-__device__
-void PolarErrTransp_fn(const GPlexRegLL &a, const GPlexRegLL &b, GPlexLS &C, const int n)
-{
-  // C = A * B, C is sym, A is 6x6 , B is 6x6
-  using T = float;
-                 int aN = 1;         int an = 0;
-                 int bN = 1;         int bn = 0;
-  T *c = C.ptr;  int cN = C.stride;  int cn = n;
-#include "CCSErrTransp.ah"
-}
-
-__device__
-void ConvertToCartesian_fn(const GPlexRegLV &a, GPlexLV& b, GPlexRegLL &c, const int n)
-{
-  ConvertToCartesian_imp(a, b, c, n, n+1); 
-}
-
-__device__
-void CartesianErr_fn(const GPlexRegLL &a, const float *b, const int bN, GPlexRegLL &c, const int n)
-{
-  // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym
-  int aN = 1; int an = 0;
-              int bn = n;
-  int cN = 1; int cn = 0;
-
-#include "CartesianErr.ah"
-}
-
-__device__
-void CartesianErrTransp_fn(const GPlexRegLL &a, const GPlexRegLL &b, GPlexLS &C, const int n)
-{
-  // C = A * B, C is sym, A is 6x6 , B is 6x6
-  using T = float;
-  int aN = 1; int an = 0;
-  int bN = 1; int bn = 0;
-  T *c = C.ptr;  int cN = C.stride;  int cn = n;
-
-#include "CartesianErrTransp.ah"
-}
-
-
-/// MultKalmanGain ////////////////////////////////////////////////////////////
-
-__device__ void upParam_MultKalmanGain_fn(
-    const float* __restrict__ a, const size_t aN,
-    const float* b_reg, float *c, const int N, const int n) {
-  // const T* __restrict__ tells the compiler that it can uses the read-only
-  // cache, without worrying about coherency.
-  // c -> kalmanGain, in register
-
-  /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/
-  // use registers to store values of 'a' that are use multiple times
-  // To reduce the number of registers and avoid spilling interlace
-  // read and compute instructions.
-  float a_reg[3];
-
-  int j;
-  j = 0; a_reg[0] = a[n + j*aN];
-  j = 1; a_reg[1] = a[n + j*aN];
-  j = 3; a_reg[2] = a[n + j*aN];
-
-  c[ 0] = a_reg[ 0]*b_reg[ 0] + a_reg[ 1]*b_reg[ 1] + a_reg[ 2]*b_reg[ 3];
-  c[ 1] = a_reg[ 0]*b_reg[ 1] + a_reg[ 1]*b_reg[ 2] + a_reg[ 2]*b_reg[ 4];
-  c[ 2] = a_reg[ 0]*b_reg[ 3] + a_reg[ 1]*b_reg[ 4] + a_reg[ 2]*b_reg[ 5];
-
-  j = 1; a_reg[0] = a[n + j*aN];
-  j = 2; a_reg[1] = a[n + j*aN];
-  j = 4; a_reg[2] = a[n + j*aN];
-
-  c[ 3] = a_reg[ 0]*b_reg[ 0] + a_reg[ 1]*b_reg[ 1] + a_reg[ 2]*b_reg[ 3];
-  c[ 4] = a_reg[ 0]*b_reg[ 1] + a_reg[ 1]*b_reg[ 2] + a_reg[ 2]*b_reg[ 4];
-  c[ 5] = a_reg[ 0]*b_reg[ 3] + a_reg[ 1]*b_reg[ 4] + a_reg[ 2]*b_reg[ 5];
-
-  j = 3; a_reg[0] = a[n + j*aN];
-  j = 4; a_reg[1] = a[n + j*aN];
-  j = 5; a_reg[2] = a[n + j*aN];
-
-  c[ 6] = a_reg[ 0]*b_reg[ 0] + a_reg[ 1]*b_reg[ 1] + a_reg[ 2]*b_reg[ 3];
-  c[ 7] = a_reg[ 0]*b_reg[ 1] + a_reg[ 1]*b_reg[ 2] + a_reg[ 2]*b_reg[ 4];
-  c[ 8] = a_reg[ 0]*b_reg[ 3] + a_reg[ 1]*b_reg[ 4] + a_reg[ 2]*b_reg[ 5];
-
-  j = 6; a_reg[0] = a[n + j*aN];
-  j = 7; a_reg[1] = a[n + j*aN];
-  j = 8; a_reg[2] = a[n + j*aN];
-
-  c[ 9] = a_reg[ 0]*b_reg[ 0] + a_reg[ 1]*b_reg[ 1] + a_reg[ 2]*b_reg[ 3];
-  c[10] = a_reg[ 0]*b_reg[ 1] + a_reg[ 1]*b_reg[ 2] + a_reg[ 2]*b_reg[ 4];
-  c[11] = a_reg[ 0]*b_reg[ 3] + a_reg[ 1]*b_reg[ 4] + a_reg[ 2]*b_reg[ 5];
-
-  j = 10; a_reg[0] = a[n + j*aN];
-  j = 11; a_reg[1] = a[n + j*aN];
-  j = 12; a_reg[2] = a[n + j*aN];
-
-  c[12] = a_reg[ 0]*b_reg[ 0] + a_reg[ 1]*b_reg[ 1] + a_reg[ 2]*b_reg[ 3];
-  c[13] = a_reg[ 0]*b_reg[ 1] + a_reg[ 1]*b_reg[ 2] + a_reg[ 2]*b_reg[ 4];
-  c[14] = a_reg[ 0]*b_reg[ 3] + a_reg[ 1]*b_reg[ 4] + a_reg[ 2]*b_reg[ 5];
-
-  j = 15; a_reg[0] = a[n + j*aN];
-  j = 16; a_reg[1] = a[n + j*aN];
-  j = 17; a_reg[2] = a[n + j*aN];
-
-  c[15] = a_reg[ 0]*b_reg[ 0] + a_reg[ 1]*b_reg[ 1] + a_reg[ 2]*b_reg[ 3];
-  c[16] = a_reg[ 0]*b_reg[ 1] + a_reg[ 1]*b_reg[ 2] + a_reg[ 2]*b_reg[ 4];
-  c[17] = a_reg[ 0]*b_reg[ 3] + a_reg[ 1]*b_reg[ 4] + a_reg[ 2]*b_reg[ 5];
-}
-
-/// Invert Cramer Symetric ////////////////////////////////////////////////////
-__device__ void invertCramerSym_fn(float *a) {
-  // a is in registers.
-  // to use global memory, a stride will be required and accesses would be:
-  // a[n + stride_a * i];
-  typedef float TT;
-
-  const TT c00 = a[2] * a[5] - a[4] * a[4];
-  const TT c01 = a[4] * a[3] - a[1] * a[5];
-  const TT c02 = a[1] * a[4] - a[2] * a[3];
-  const TT c11 = a[5] * a[0] - a[3] * a[3];
-  const TT c12 = a[3] * a[1] - a[4] * a[0];
-  const TT c22 = a[0] * a[2] - a[1] * a[1];
-
-  const TT det = a[0] * c00 + a[1] * c01 + a[3] * c02;
-
-  const TT s = TT(1) / det;
-
-  a[0] = s*c00;
-  a[1] = s*c01;
-  a[2] = s*c11;
-  a[3] = s*c02;
-  a[4] = s*c12;
-  a[5] = s*c22;
-}
-
-__device__ void invertCramerSym2x2_fn(GPlexReg2S &a) {
-  float det = a[0] * a[2] - a[1] * a[1];
-  const float s   = float(1) / det;
-  const float tmp = s * a[2];
-  a[1] *= -s;
-  a[2]  = s * a[0];
-  a[0]  = tmp;
-}
-
-__device__ void subtractFirst3_fn(const GPlexHV __restrict__ &A,
-                                  const GPlexLV __restrict__ &B,
-                                  GPlexRegHV &C, const int N, int n) {
-  using T = float;
-  const T *a = A.ptr;  int aN = A.stride;
-  const T *b = B.ptr;  int bN = B.stride;
-        T *c = C.arr;
-  /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/
-  
-  if(n < N) {
-    c[0] = a[0*aN+n] - b[0*bN+n];
-    c[1] = a[1*aN+n] - b[1*bN+n];
-    c[2] = a[2*aN+n] - b[2*bN+n];
-  }
-}
-
-/// AddIntoUpperLeft3x3  //////////////////////////////////////////////////////
-__device__ void addIntoUpperLeft3x3_fn(const GPlexLS __restrict__ &A,
-                                       const GPlexHS __restrict__ &B,
-                                       GPlexRegHS &C, const int N, const int n) {
-  using T = float;
-  const T *a = A.ptr;  int aN = A.stride;
-  const T *b = B.ptr;  int bN = B.stride;
-        T *c = C.arr;
-  /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/
-  
-  if(n < N) {
-    c[0] = a[0*aN+n] + b[0*bN+n];
-    c[1] = a[1*aN+n] + b[1*bN+n];
-    c[2] = a[2*aN+n] + b[2*bN+n];
-    c[3] = a[3*aN+n] + b[3*bN+n];
-    c[4] = a[4*aN+n] + b[4*bN+n];
-    c[5] = a[5*aN+n] + b[5*bN+n];
-  }
-}
-
-
-/// MultResidualsAdd //////////////////////////////////////////////////////////
-__device__ void multResidualsAdd_fn(
-    const GPlexRegLH &reg_a,
-    const GPlexLV __restrict__ &B, 
-    const GPlexReg2V &c,
-          GPlexLV &D,
-    const int N, const int n) {
-
-  MultResidualsAdd_imp(reg_a, B, c, D, n, n+1);
-}
-
-__device__
-void MultResidualsAdd_all_reg(const GPlexRegLH &a,
-		      const GPlexRegLV &b,
-		      const GPlexReg2V &c,
-          GPlexRegLV &d)
-{
-   // outPar = psPar + kalmanGain*(dPar)
-   //   D    =   B         A         C
-   // where right half of kalman gain is 0 
-
-   // XXX Regenerate with a script.
-      // generate loop (can also write it manually this time, it's not much)
-      d[0] = b[0] + a[ 0] * c[0] + a[ 1] * c[1];
-      d[1] = b[1] + a[ 3] * c[0] + a[ 4] * c[1];
-      d[2] = b[2] + a[ 6] * c[0] + a[ 7] * c[1];
-      d[3] = b[3] + a[ 9] * c[0] + a[10] * c[1];
-      d[4] = b[4] + a[12] * c[0] + a[13] * c[1];
-      d[5] = b[5] + a[15] * c[0] + a[16] * c[1];
-}
-
-
-__device__ void kalmanUpdate_fn(
-    GPlexLS &propErr, const GPlexHS __restrict__ &msErr,
-    const GPlexLV __restrict__ &par_iP, const GPlexHV __restrict__ &msPar,
-    GPlexLV &par_iC, GPlexLS &outErr, const int n, const int N) {
-  // Note: similar results with propErr kept in registers.
-  //       It is read-only so using the read-only cache yields more flexibility
-  //       wrt block size without increasing the pressure on registers to much.
-  // There is no need to keep resErr and kalmanGain as global memory arrays.
-  GPlexRegHS resErr_reg;
-
-  // If there is more matrices than max_blocks_x * BLOCK_SIZE_X 
-  if (n < N) {
-    for (int j = 0; j < HS; ++j) {
-      resErr_reg[j] = 0; //resErr[j*resErr_stride + n];
-    }
-
-    // FIXME: Add useCMSGeom -> port propagateHelixToRMPlex
-#if 0
-    if (Config::useCMSGeom) {
-      propagateHelixToRMPlex(psErr,  psPar, inChg,  msPar, propErr, propPar);
-    } else {
-      propErr = psErr;
-      propPar = psPar;
-    }
-#endif
-    GPlexRegQF rotT00;
-    GPlexRegQF rotT01;
-    const float r = hipo(msPar(n, 0, 0), msPar(n, 1, 0));
-    rotT00[0] = -(msPar(n, 1, 0) + par_iP(n, 1, 0))/(2*r);
-    rotT01[0] =  (msPar(n, 0, 0) + par_iP(n, 0, 0))/(2*r);
-
-    GPlexRegHV res_glo;
-    subtractFirst3_fn(msPar, par_iP, res_glo, N, n);
-
-    addIntoUpperLeft3x3_fn(propErr, msErr, resErr_reg, N, n);
-    GPlexReg2V res_loc;   //position residual in local coordinates
-    RotateResidulsOnTangentPlane_fn(rotT00,rotT01,res_glo,res_loc);
-    GPlexReg2S resErr_loc; // 2x2 sym
-    GPlexRegHH tempHH;  // 3*3 sym
-    ProjectResErr_fn  (rotT00, rotT01, resErr_reg, tempHH);
-    ProjectResErrTransp_fn(rotT00, rotT01, tempHH, resErr_loc);
-
-    /*invertCramerSym_fn(resErr_reg);*/
-    invertCramerSym2x2_fn(resErr_loc);
-
-    // Kalman update in "polar" coordinates
-    GPlexRegLH K;
-    KalmanHTG_fn(rotT00, rotT01, resErr_loc, tempHH);
-    KalmanGain_fn(propErr, tempHH, K, n);
-
-    multResidualsAdd_fn(K, par_iP, res_loc, par_iC, N, n);// propPar_pol is now the updated parameters in "polar" coordinates
-    GPlexRegLL tempLL;
-
-    KHMult_fn(K, rotT00, rotT01, tempLL);
-    KHC_fn(tempLL, propErr, outErr, n);
-    subtract_matrix(propErr.ptr, propErr.stride, outErr.ptr, outErr.stride, 
-                    outErr.ptr, outErr.stride, LS, n);
-
-  }
-}
-
-__global__ void kalmanUpdate_kernel(
-    GPlexLS propErr, const GPlexHS __restrict__ msErr,
-    const GPlexLV __restrict__ par_iP, const GPlexHV __restrict__ msPar,
-    GPlexLV par_iC, GPlexLS outErr, const int N) {
-  int grid_width = blockDim.x * gridDim.x;
-  int n = threadIdx.x + blockIdx.x * blockDim.x;
-
-  for (int z = 0; z < (N-1)/grid_width  +1; z++) {
-    n += z*grid_width;
-    kalmanUpdate_fn(propErr, msErr, par_iP, msPar, par_iC, outErr, n, N);
-  }
-}
-
-void kalmanUpdate_wrapper(const cudaStream_t& stream,
-    GPlexLS& d_propErr, const GPlexHS& d_msErr,
-    GPlexLV& d_par_iP, const GPlexHV& d_msPar,
-    GPlexLV& d_par_iC, GPlexLS& d_outErr,
-    const int N) {
-  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
-                       max_blocks_x);
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-  kalmanUpdate_kernel <<<grid, block, 0, stream >>>
-      (d_propErr, d_msErr, d_par_iP, d_msPar, d_par_iC, d_outErr, N);
-}
-
diff --git a/mkFit/kalmanUpdater_kernels.h b/mkFit/kalmanUpdater_kernels.h
deleted file mode 100644
index 112462eb..00000000
--- a/mkFit/kalmanUpdater_kernels.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef _KALMAN_UPDATER_KERNELS_H_
-#define _KALMAN_UPDATER_KERNELS_H_
-
-#include "GPlex.h"
-
-void kalmanUpdate_wrapper(const cudaStream_t& stream,
-    GPlexLS& d_propErr, const GPlexHS& d_msErr,
-    GPlexLV& d_par_iP, const GPlexHV& d_msPar,
-    GPlexLV& d_par_iC, GPlexLS& d_outErr,
-    const int N);
-
-//void reorganizeMs_wrapper(cudaStream_t& stream, GPlexQF& msPar,
-//    float *full_posArray, GPlexHS& msErr, 
-//    float *full_errArray, int *full_hitIdx, int hi, int maxHits,
-//    int N, int hs, int hv, int Nhits);
-
-__global__ void kalmanUpdate_kernel(
-    GPlexLS propErr, const GPlexHS __restrict__ msErr,
-    const GPlexLV __restrict__ par_iP, const GPlexHV __restrict__ msPar,
-    GPlexLV par_iC, GPlexLS outErr, const int N);
-
-__device__ void kalmanUpdate_fn(
-    GPlexLS &propErr, const GPlexHS __restrict__ &msErr,
-    const GPlexLV __restrict__ &par_iP, const GPlexHV __restrict__ &msPar,
-    GPlexLV &par_iC, GPlexLS &outErr, const int itrack, const int N);
-
-__device__ void addIntoUpperLeft3x3_fn(const GPlexLS __restrict__ &A,
-                                       const GPlexHS __restrict__ &B,
-                                       GPlexRegHS &c, const int N, const int n);
-
-__device__ void subtractFirst3_fn(const GPlexHV __restrict__ &A,
-                                  const GPlexLV __restrict__ &B,
-                                  GPlexRegHV &C, const int N, const int n);
-
-__device__ void invertCramerSym_fn(float *a);
-__device__ void invertCramerSym2x2_fn(GPlexReg2S &a);
-
-#endif  // _KALMAN_UPDATER_KERNELS_H_
diff --git a/mkFit/mkFit.cc b/mkFit/mkFit.cc
index 18bd9ae2..c57c01c1 100644
--- a/mkFit/mkFit.cc
+++ b/mkFit/mkFit.cc
@@ -23,12 +23,6 @@
 #include "Validation.h"
 #endif
 
-#ifdef USE_CUDA
-#include "FitterCU.h"
-#include "BuilderCU.h"
-#include "gpu_utils.h"
-#endif
-
 #include <cstdlib>
 //#define DEBUG
 #include "Debug.h"
@@ -89,8 +83,6 @@ namespace
   bool  g_run_build_ce  = false;
   bool  g_run_build_fv  = false;
 
-  bool  g_seed_based    = false;
-
   std::string g_operation = "simulate_and_process";;
   std::string g_input_file = "";
   std::string g_output_file = "";
@@ -292,46 +284,6 @@ void test_standard()
   double t_skip[NT] = {0};
   double time = dtime();
 
-#if USE_CUDA_OLD
-  tbb::task_scheduler_init tbb_init(Config::numThreadsFinder);
-  //tbb::task_scheduler_init tbb_init(tbb::task_scheduler_init::automatic);
-
-  //omp_set_num_threads(Config::numThreadsFinder);
-  // fittest time. Sum of all events. In case of multiple events
-  // being run simultaneously in different streams this time will
-  // be larger than the elapsed time.
-
-  std::vector<Event> events;
-  std::vector<Validation> validations(Config::nEvents);
-
-  events.reserve(Config::nEvents);
-  // simulations are all performed before the fitting loop.
-  // It is mandatory in order to see the benefits of running
-  // multiple streams.
-  for (int evt = 1; evt <= Config::nEvents; ++evt) {
-    printf("Simulating event %d\n", evt);
-    events.emplace_back(geom, val, evt);
-    events.back().Simulate();
-    dprint("Event #" << events.back().evtID() << " simtracks " << events.back().simTracks_.size() << " layerhits " << events.back().layerHits_.size());
-  }
-
-  // The first call to a GPU function always take a very long time.
-  // Everything needs to be initialized.
-  // Nothing is done in this function, except calling an harmless
-  // CUDA function. These function can be changed to another one
-  // if it becomes important to time (e.g. if you want the profiler to
-  // tell you how much time is spend running cudaDeviceSynchronize(),
-  // use another function).
-  separate_first_call_for_meaningful_profiling_numbers();
-
-  if (g_run_fit_std) runAllEventsFittingTestPlexGPU(events);
-
-  if (g_run_build_all || g_run_build_bh) {
-    double total_best_hit_time = 0.;
-    total_best_hit_time = runAllBuildingTestPlexBestHitGPU(events);
-    std::cout << "Total best hit time (GPU): " << total_best_hit_time << std::endl;
-  }
-#else
   std::atomic<int> nevt{g_start_event};
   std::atomic<int> seedstot{0}, simtrackstot{0}, candstot{0};
   std::atomic<int> maxHits_all{0}, maxLayer_all{0};
@@ -344,13 +296,6 @@ void test_standard()
   std::vector<std::shared_ptr<FILE>>       fps;
   fps.reserve(Config::numThreadsEvents);
 
-#if USE_CUDA
-  separate_first_call_for_meaningful_profiling_numbers();
-
-  std::vector<std::unique_ptr<FitterCU<float>>> cuFitters(Config::numThreadsEvents);
-  std::vector<std::unique_ptr<BuilderCU>> cuBuilders(Config::numThreadsEvents);
-#endif
-
   const std::string valfile("valtree");
 
   for (int i = 0; i < Config::numThreadsEvents; ++i) {
@@ -362,18 +307,6 @@ void test_standard()
     if (g_operation == "read") {
       fps.emplace_back(fopen(g_input_file.c_str(), "r"), [](FILE* fp) { if (fp) fclose(fp); });
     }
-#if USE_CUDA
-    constexpr int gplex_width = 10000;
-    cuFitters[i].reset(new FitterCU<float>(gplex_width));
-    cuFitters[i].get()->allocateDevice();
-    cuFitters[i].get()->allocate_extra_addBestHit();
-    cuFitters[i].get()->allocate_extra_combinatorial();
-    cuFitters[i].get()->createStream();
-    cuFitters[i].get()->setNumberTracks(gplex_width);
-
-    cuBuilders[i].reset(new BuilderCU(cuFitters[i].get()));
-    cuBuilders[i].get()->allocateGeometry(geom);
-#endif
   }
 
   tbb::task_scheduler_init tbb_init(Config::numThreadsFinder);
@@ -398,11 +331,6 @@ void test_standard()
     int evstart = thisthread*events_per_thread;
     int evend   = std::min(Config::nEvents, evstart+events_per_thread);
 
-#if USE_CUDA
-    auto& cuFitter = *cuFitters[thisthread].get();
-    auto& cuBuilder = *cuBuilders[thisthread].get();
-#endif
-
     dprint("thisthread " << thisthread << " events " << Config::nEvents << " events/thread " << events_per_thread
                          << " range " << evstart << ":" << evend);
 
@@ -440,18 +368,11 @@ void test_standard()
       int maxLayer_thisthread = 0;
       for (int b = 0; b < Config::finderReportBestOutOfN; ++b)
       {
-  #ifndef USE_CUDA
         t_cur[0] = (g_run_fit_std) ? runFittingTestPlex(ev, plex_tracks) : 0;
         t_cur[1] = (g_run_build_all || g_run_build_bh)  ? runBuildingTestPlexBestHit(ev, mkb) : 0;
         t_cur[3] = (g_run_build_all || g_run_build_ce)  ? runBuildingTestPlexCloneEngine(ev, mkb) : 0;
         t_cur[4] = (g_run_build_all || g_run_build_fv)  ? runBuildingTestPlexFV(ev, mkb) : 0;
 	if (g_run_build_all || g_run_build_cmssw) runBuildingTestPlexDumbCMSSW(ev, mkb);
-  #else
-        t_cur[0] = (g_run_fit_std) ? runFittingTestPlexGPU(cuFitter, ev, plex_tracks) : 0;
-        t_cur[1] = (g_run_build_all || g_run_build_bh)  ? runBuildingTestPlexBestHitGPU(ev, mkb, cuBuilder) : 0;
-        // XXXX MT note for Matthieu: ev_tmp no longer exists ----------------------------------v
-        t_cur[3] = (g_run_build_all || g_run_build_ce)  ? runBuildingTestPlexCloneEngineGPU(ev, ev_tmp, mkb, cuBuilder, g_seed_based) : 0;
-  #endif
         t_cur[2] = (g_run_build_all || g_run_build_std) ? runBuildingTestPlexStandard(ev, mkb) : 0;
         if (g_run_build_ce){
           ncands_thisthread = mkb.total_cands();
@@ -495,7 +416,6 @@ void test_standard()
     }
   }, tbb::simple_partitioner());
 
-#endif
   time = dtime() - time;
 
   printf("\n");
@@ -520,14 +440,6 @@ void test_standard()
     val->fillConfigTree();
     val->saveTTrees();
   }
-#if USE_CUDA
-  for (int i = 0; i < Config::numThreadsEvents; ++i) {
-    cuFitters[i].get()->freeDevice();
-    cuFitters[i].get()->free_extra_addBestHit();
-    cuFitters[i].get()->free_extra_combinatorial();
-    cuFitters[i].get()->destroyStream();
-  }
-#endif
 }
 
 //==============================================================================
@@ -713,10 +625,6 @@ int main(int argc, const char *argv[])
 	"                             == --cmssw-val --read-cmssw-tracks --cmssw-match-fw %s --cmssw-match-bk %s\n"
 	"                             must enable: --cmssw-pureseeds --backward-fit-pca\n"
 	"\n----------------------------------------------------------------------------------------------------------\n\n"
-        "GPU specific options: \n\n"
-        "  --num-thr-reorg <num>    number of threads to run the hits reorganization (def: %d)\n"
-        "  --seed-based             For CE. Switch to 1 CUDA thread per seed (def: %s)\n"
-	"\n----------------------------------------------------------------------------------------------------------\n\n"
         ,
         argv[0],
 
@@ -785,10 +693,8 @@ int main(int argc, const char *argv[])
 	getOpt(hitBased, g_match_opts).c_str(), getOpt(trkParamBased, g_match_opts).c_str(), 
 	getOpt(trkParamBased, g_match_opts).c_str(), getOpt(hitBased, g_match_opts).c_str(), 
 	getOpt(trkParamBased, g_match_opts).c_str(), getOpt(trkParamBased, g_match_opts).c_str(), 
-	getOpt(labelBased, g_match_opts).c_str(), getOpt(labelBased, g_match_opts).c_str(), 
+	getOpt(labelBased, g_match_opts).c_str(), getOpt(labelBased, g_match_opts).c_str()
 
-	Config::numThreadsReorg,
-	b2a(g_seed_based)
       );
 
       printf("List of options for string based inputs \n");
@@ -1102,15 +1008,6 @@ int main(int argc, const char *argv[])
       Config::cmsswMatchingFW = labelBased;
       Config::cmsswMatchingBK = labelBased;
     }
-    else if (*i == "--num-thr-reorg")
-    {
-      next_arg_or_die(mArgs, i);
-      Config::numThreadsReorg = atoi(i->c_str());
-    }
-    else if (*i == "--seed-based")
-    {
-      g_seed_based = true;
-    }
     else
     {
       fprintf(stderr, "Error: Unknown option/argument '%s'.\n", i->c_str());
diff --git a/mkFit/propagation_kernels.cu b/mkFit/propagation_kernels.cu
deleted file mode 100644
index 74691be6..00000000
--- a/mkFit/propagation_kernels.cu
+++ /dev/null
@@ -1,316 +0,0 @@
-#include "Config.h"
-#include "Debug.h"
-#include "propagation_kernels.h"
-#include <stdio.h>
-#include "gpu_utils.h"
-
-
-// values from 32 to 512 give good results.
-// 32 gives slightly better results (on a K40)
-constexpr int BLOCK_SIZE_X = 32;
-
-
-__device__
-void MultHelixProp_fn(const GPlexRegLL& a, const GPlexLS& b, GPlexRegLL& c, const int n)
-{
-   // C = A * B
-
-   typedef float T;
-   /*const idx_t N  = NN;*/
-
-   /*const T *a = A.fArray; ASSUME_ALIGNED(a, 64);*/
-   /*const T *b = B.fArray; ASSUME_ALIGNED(b, 64);*/
-         /*T *c = C.fArray; ASSUME_ALIGNED(c, 64);*/
-  /*float *a = A.ptr;*/
-  int aN = 1       ; int an = 0;  // Register array
-  int bN = b.stride; int bn = n;  // Global array
-  int cN = 1;        int cn = 0;
-
-#include "MultHelixProp.ah"
-}
-
-__device__
-void MultHelixPropTransp_fn(const GPlexRegLL& a, const GPlexRegLL& b, GPlexLS& c, const int n)
-{
-   // C = B * AT;
-
-   typedef float T;
-  int aN = 1       ; int an = 0;  // Register array
-  int bN = 1       ; int bn = 0;  // Global array
-  int cN = c.stride; int cn = n;
-
-#include "MultHelixPropTransp.ah"
-}
-
-// Not passing msRad.stride, as QF == 1 (second dim f msRad)
-__device__ void computeMsRad_fn(const GPlexHV& __restrict__ msPar,
-    GPlexRegQF &msRad, const int N, const int n) {
-  /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/
-  if (n < N) {
-    msRad(n, 0, 0) = hipo(msPar(n, 0, 0), msPar(n, 1, 0));
-  }
-}
-
-
-#include "PropagationMPlex.icc"
-
-__device__ void helixAtRFromIterativeCCS_fn(const GPlexLV& inPar, 
-    const GPlexQI& inChg, GPlexLV& outPar_global, const GPlexRegQF& msRad, 
-    GPlexRegLL& errorProp,const bool useParamBfield, const int N, const int n)
-{
-
-  GPlexRegLV outPar;
-
-  if (n < N) {
-    for (int j = 0; j < 5; ++j) {
-      outPar[j] = outPar_global(n, j, 0);
-    }
-    errorProp.SetVal(0);
-
-    helixAtRFromIterativeCCS_impl(inPar, inChg, outPar, msRad, errorProp, n, n+1, -1, useParamBfield);
-
-    // Once computations are done. Get values from registers to global memory.
-    for (int j = 0; j < 5; ++j) {
-      outPar_global(n, j, 0) = outPar[j];
-    }
-  }
-}
-
-/// Similarity ////////////////////////////////////////////////////////////////
-__device__ void similarity_fn(GPlexRegLL &a, GPlexLS &b, int N, int n) {
-
-  size_t bN = b.stride;
-  
-  // Keep most values in registers.
-  //float b_reg[LL2];
-  GPlexRegLL b_reg;
-  // To avoid using too many registers, tmp[] as a limited size and is reused.
-  float tmp[6];
-
-  if (n < N) {
-    for (int j = 0; j < b.kSize; j++) {
-      b_reg[j] = b[n + j*bN];
-    }
-
-    tmp[ 0] = a[0]*b_reg[ 0] + a[1]*b_reg[ 1] + a[3]*b_reg[ 6] + a[4]*b_reg[10];
-    tmp[ 1] = a[0]*b_reg[ 1] + a[1]*b_reg[ 2] + a[3]*b_reg[ 7] + a[4]*b_reg[11];
-    /*tmp[ 2] = a[0]*b_reg[ 3] + a[1]*b_reg[ 4] + a[3]*b_reg[ 8] + a[4]*b_reg[12];*/
-    tmp[ 3] = a[0]*b_reg[ 6] + a[1]*b_reg[ 7] + a[3]*b_reg[ 9] + a[4]*b_reg[13];
-    tmp[ 4] = a[0]*b_reg[10] + a[1]*b_reg[11] + a[3]*b_reg[13] + a[4]*b_reg[14];
-    /*tmp[ 5] = a[0]*b_reg[15] + a[1]*b_reg[16] + a[3]*b_reg[18] + a[4]*b_reg[19];*/
-
-    b[ 0*bN+n] = tmp[ 0]*a[0] + tmp[ 1]*a[1] + tmp[ 3]*a[3] + tmp[ 4]*a[4];
-
-
-    tmp[ 0] = a[6]*b_reg[ 0] + a[7]*b_reg[ 1] + a[9]*b_reg[ 6] + a[10]*b_reg[10];
-    tmp[ 1] = a[6]*b_reg[ 1] + a[7]*b_reg[ 2] + a[9]*b_reg[ 7] + a[10]*b_reg[11];
-    /*tmp[ 8] = a[6]*b_reg[ 3] + a[7]*b_reg[ 4] + a[9]*b_reg[ 8] + a[10]*b_reg[12];*/
-    tmp[ 3] = a[6]*b_reg[ 6] + a[7]*b_reg[ 7] + a[9]*b_reg[ 9] + a[10]*b_reg[13];
-    tmp[ 4] = a[6]*b_reg[10] + a[7]*b_reg[11] + a[9]*b_reg[13] + a[10]*b_reg[14];
-    /*tmp[11] = a[6]*b_reg[15] + a[7]*b_reg[16] + a[9]*b_reg[18] + a[10]*b_reg[19];*/
-
-    b[ 1*bN+n] = tmp[ 0]*a[0] + tmp[ 1]*a[1] + tmp[ 3]*a[3] + tmp[ 4]*a[4];
-    b[ 2*bN+n] = tmp[ 0]*a[6] + tmp[ 1]*a[7] + tmp[ 3]*a[9] + tmp[ 4]*a[10];
-
-
-    tmp[ 0] = a[12]*b_reg[ 0] + a[13]*b_reg[ 1] + b_reg[ 3] + a[15]*b_reg[ 6] + a[16]*b_reg[10] + a[17]*b_reg[15];
-    tmp[ 1] = a[12]*b_reg[ 1] + a[13]*b_reg[ 2] + b_reg[ 4] + a[15]*b_reg[ 7] + a[16]*b_reg[11] + a[17]*b_reg[16];
-    tmp[ 2] = a[12]*b_reg[ 3] + a[13]*b_reg[ 4] + b_reg[ 5] + a[15]*b_reg[ 8] + a[16]*b_reg[12] + a[17]*b_reg[17];
-    tmp[ 3] = a[12]*b_reg[ 6] + a[13]*b_reg[ 7] + b_reg[ 8] + a[15]*b_reg[ 9] + a[16]*b_reg[13] + a[17]*b_reg[18];
-    tmp[ 4] = a[12]*b_reg[10] + a[13]*b_reg[11] + b_reg[12] + a[15]*b_reg[13] + a[16]*b_reg[14] + a[17]*b_reg[19];
-    tmp[ 5] = a[12]*b_reg[15] + a[13]*b_reg[16] + b_reg[17] + a[15]*b_reg[18] + a[16]*b_reg[19] + a[17]*b_reg[20];
-
-    b[ 3*bN+n] = tmp[ 0]*a[0] + tmp[ 1]*a[1]           + tmp[ 3]*a[3] + tmp[ 4]*a[4];
-    b[ 4*bN+n] = tmp[ 0]*a[6] + tmp[ 1]*a[7]           + tmp[ 3]*a[9] + tmp[ 4]*a[10];
-    b[ 5*bN+n] = tmp[ 0]*a[12] + tmp[ 1]*a[13] + tmp[ 2] + tmp[ 3]*a[15] + tmp[ 4]*a[16] + tmp[ 5]*a[17];
-
-
-    tmp[ 0] = a[18]*b_reg[ 0] + a[19]*b_reg[ 1] + a[21]*b_reg[ 6] + a[22]*b_reg[10];
-    tmp[ 1] = a[18]*b_reg[ 1] + a[19]*b_reg[ 2] + a[21]*b_reg[ 7] + a[22]*b_reg[11];
-    tmp[ 2] = a[18]*b_reg[ 3] + a[19]*b_reg[ 4] + a[21]*b_reg[ 8] + a[22]*b_reg[12];
-    tmp[ 3] = a[18]*b_reg[ 6] + a[19]*b_reg[ 7] + a[21]*b_reg[ 9] + a[22]*b_reg[13];
-    tmp[ 4] = a[18]*b_reg[10] + a[19]*b_reg[11] + a[21]*b_reg[13] + a[22]*b_reg[14];
-    tmp[ 5] = a[18]*b_reg[15] + a[19]*b_reg[16] + a[21]*b_reg[18] + a[22]*b_reg[19];
-
-    b[ 6*bN+n] = tmp[ 0]*a[0] + tmp[ 1]*a[1]           + tmp[ 3]*a[3] + tmp[ 4]*a[4];
-    b[ 7*bN+n] = tmp[ 0]*a[6] + tmp[ 1]*a[7]           + tmp[ 3]*a[9] + tmp[ 4]*a[10];
-    b[ 8*bN+n] = tmp[ 0]*a[12] + tmp[ 1]*a[13] + tmp[ 2] + tmp[ 3]*a[15] + tmp[ 4]*a[16] + tmp[ 5]*a[17];
-    b[ 9*bN+n] = tmp[ 0]*a[18] + tmp[ 1]*a[19]           + tmp[ 3]*a[21] + tmp[ 4]*a[22];
-
-
-    tmp[ 0] = a[24]*b_reg[ 0] + a[25]*b_reg[ 1] + a[27]*b_reg[ 6] + a[28]*b_reg[10];
-    tmp[ 1] = a[24]*b_reg[ 1] + a[25]*b_reg[ 2] + a[27]*b_reg[ 7] + a[28]*b_reg[11];
-    tmp[ 2] = a[24]*b_reg[ 3] + a[25]*b_reg[ 4] + a[27]*b_reg[ 8] + a[28]*b_reg[12];
-    tmp[ 3] = a[24]*b_reg[ 6] + a[25]*b_reg[ 7] + a[27]*b_reg[ 9] + a[28]*b_reg[13];
-    tmp[ 4] = a[24]*b_reg[10] + a[25]*b_reg[11] + a[27]*b_reg[13] + a[28]*b_reg[14];
-    tmp[ 5] = a[24]*b_reg[15] + a[25]*b_reg[16] + a[27]*b_reg[18] + a[28]*b_reg[19];
-
-    b[10*bN+n] = tmp[ 0]*a[0] + tmp[ 1]*a[1]           + tmp[ 3]*a[3] + tmp[ 4]*a[4];
-    b[11*bN+n] = tmp[ 0]*a[6] + tmp[ 1]*a[7]           + tmp[ 3]*a[9] + tmp[ 4]*a[10];
-    b[12*bN+n] = tmp[ 0]*a[12] + tmp[ 1]*a[13] + tmp[ 2] + tmp[ 3]*a[15] + tmp[ 4]*a[16] + tmp[ 5]*a[17];
-    b[13*bN+n] = tmp[ 0]*a[18] + tmp[ 1]*a[19]           + tmp[ 3]*a[21] + tmp[ 4]*a[22];
-    b[14*bN+n] = tmp[ 0]*a[24] + tmp[ 1]*a[25]           + tmp[ 3]*a[27] + tmp[ 4]*a[28];
-
-    tmp[ 0] = b_reg[15];
-    tmp[ 1] = b_reg[16];
-    tmp[ 2] = b_reg[17];
-    tmp[ 3] = b_reg[18];
-    tmp[ 4] = b_reg[19];
-    tmp[ 5] = b_reg[20];
-
-    // MultHelixPropTransp
-    b[15*bN+n] = tmp[ 0]*a[0] + tmp[ 1]*a[1]           + tmp[ 3]*a[3] + tmp[ 4]*a[4];
-    b[16*bN+n] = tmp[ 0]*a[6] + tmp[ 1]*a[7]           + tmp[ 3]*a[9] + tmp[ 4]*a[10];
-    b[17*bN+n] = tmp[ 0]*a[12] + tmp[ 1]*a[13] + tmp[ 2] + tmp[ 3]*a[15] + tmp[ 4]*a[16] + tmp[ 5]*a[17];
-    b[18*bN+n] = tmp[ 0]*a[18] + tmp[ 1]*a[19]           + tmp[ 3]*a[21] + tmp[ 4]*a[22];
-    b[19*bN+n] = tmp[ 0]*a[24] + tmp[ 1]*a[25]           + tmp[ 3]*a[27] + tmp[ 4]*a[28];
-    b[20*bN+n] = tmp[ 5];
-  }
-}
-
-
-// PropagationMPlex.cc:propagateHelixToRMPlex, first version with 6 arguments 
-__device__ void propagation_fn(
-    GPlexLS &inErr, GPlexLV &inPar, 
-    GPlexQI &inChg, GPlexHV &msPar,
-    GPlexLS &outErr, GPlexLV &outPar,
-    const bool useParamBfield,
-    int n, int N) {
-
-  GPlexRegQF msRad_reg;
-  // Using registers instead of shared memory is ~ 30% faster.
-  GPlexRegLL errorProp_reg;
-  // If there is more matrices than max_blocks_x * BLOCK_SIZE_X 
-  if (n < N) {
-    for (int i = 0; i < inErr.kSize; ++i) {
-      outErr[n + i*outErr.stride] = inErr[n + i*inErr.stride];
-    }
-    for (int i = 0; i < inPar.kSize; ++i) {
-      outPar[n + i*outPar.stride] = inPar[n + i*inPar.stride];
-    }
-    for (int i = 0; i < 36; ++i) {
-      errorProp_reg[i] = 0.0;
-    }
-    computeMsRad_fn(msPar, msRad_reg, N, n);
-    helixAtRFromIterativeCCS_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, useParamBfield, N, n);
-    GPlexRegLL temp;
-    MultHelixProp_fn      (errorProp_reg, outErr, temp, n);
-    MultHelixPropTransp_fn(errorProp_reg, temp,   outErr, n);
-  }
-}
-
-
-__global__ void propagation_kernel(
-    GPlexLS inErr,
-    GPlexHV msPar,
-    GPlexLV inPar, GPlexQI inChg,
-    GPlexLV outPar,
-    GPlexLS outErr, 
-    const bool useParamBfield,
-    int N)
-{
-  int grid_width = blockDim.x * gridDim.x;
-  int n = threadIdx.x + blockIdx.x * blockDim.x;
-  for (int z = 0; z < (N-1)/grid_width  +1; z++) {
-    n += z*grid_width;
-    propagation_fn(inErr, inPar, inChg, msPar, outErr, outPar, useParamBfield, n, N);
-  }
-}
-
-
-void propagation_wrapper(const cudaStream_t& stream,
-    GPlexHV& msPar, GPlexLS& inErr,
-    GPlexLV& inPar, GPlexQI& inChg,
-    GPlexLV& outPar,
-    GPlexLS& outErr, 
-    const bool useParamBfield,
-    const int N) {
-  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
-                       max_blocks_x);
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-  propagation_kernel <<<grid, block, 0, stream >>>
-    (inErr, msPar, inPar, inChg, outPar, outErr, useParamBfield, N);
-}
-
-
-// PropagationMPlex.cc:propagateHelixToRMPlex, second version with 7 arguments 
-// Imposes the radius
-__device__ void propagationForBuilding_fn(
-    const GPlexLS &inErr, const GPlexLV &inPar,
-    const GPlexQI &inChg, const float radius,
-    GPlexLS &outErr, GPlexLV &outPar, 
-    const int n, const int N) {
-  GPlexRegQF msRad_reg;
-  // Using registers instead of shared memory is ~ 30% faster.
-  GPlexRegLL errorProp_reg;
-  // If there is more matrices than max_blocks_x * BLOCK_SIZE_X 
-  if (n < N) {
-
-    for (int i = 0; i < inErr.kSize; ++i) {
-      outErr[n + i*outErr.stride] = inErr[n + i*inErr.stride];
-    }
-    for (int i = 0; i < inPar.kSize; ++i) {
-      outPar[n + i*outPar.stride] = inPar[n + i*inPar.stride];
-    }
-    for (int i = 0; i < 36; ++i) {
-      errorProp_reg[i] = 0.0;
-    }
-
-    msRad_reg(n, 0, 0) = radius;
-
-    bool useParamBfield = false;  // The propagation with radius as an arg do not use this parameter
-    helixAtRFromIterativeCCS_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, useParamBfield, N, n);
-    // TODO: port me
-    /*if (Config::useCMSGeom) {*/
-    /*MPlexQF hitsRl;*/
-    /*MPlexQF hitsXi;*/
-    /*for (int n = 0; n < NN; ++n) {*/
-    /*hitsRl.At(n, 0, 0) = getRlVal(r, outPar.ConstAt(n, 2, 0));*/
-    /*hitsXi.At(n, 0, 0) = getXiVal(r, outPar.ConstAt(n, 2, 0));*/
-    /*}*/
-    /*applyMaterialEffects(hitsRl, hitsXi, outErr, outPar, N_proc);*/
-    /*}*/
-    /*similarity_fn(errorProp_reg, outErr, N, n);*/
-
-    // Matriplex version of:
-    // result.errors = ROOT::Math::Similarity(errorProp, outErr);
-
-    //MultHelixProp can be optimized for polar coordinates, see GenMPlexOps.pl
-    /*MPlexLL temp;*/
-    /*MultHelixProp      (errorProp, outErr, temp);*/
-    /*MultHelixPropTransp(errorProp, temp,   outErr);*/
-    GPlexRegLL temp;
-    MultHelixProp_fn      (errorProp_reg, outErr, temp, n);
-    MultHelixPropTransp_fn(errorProp_reg, temp,   outErr, n);
-
-  }
-}
-
-__global__ void propagationForBuilding_kernel(
-    const GPlexLS inErr, const GPlexLV inPar,
-    const GPlexQI inChg, const float radius,
-    GPlexLS outErr, GPlexLV outPar, 
-    const int N) {
-  int grid_width = blockDim.x * gridDim.x;
-  int n = threadIdx.x + blockIdx.x * blockDim.x;
-
-  for (int z = 0; z < (N-1)/grid_width  +1; z++) {
-    n += z*grid_width;
-    propagationForBuilding_fn( inErr, inPar, inChg, radius, outErr, outPar, n, N);
-  }
-}
-
-void propagationForBuilding_wrapper(const cudaStream_t& stream,
-    const GPlexLS& inErr, const GPlexLV& inPar,
-    const GPlexQI& inChg, const float radius,
-    GPlexLS& outErr, GPlexLV& outPar, 
-    const int N) {
-  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
-                       max_blocks_x);
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-  propagationForBuilding_kernel<<<grid, block, 0, stream >>>
-    (inErr, inPar, inChg, radius, outErr, outPar, N);
-}
-
diff --git a/mkFit/propagation_kernels.h b/mkFit/propagation_kernels.h
deleted file mode 100644
index a8818038..00000000
--- a/mkFit/propagation_kernels.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef _PROPAGATION_KERNELS_H_
-#define _PROPAGATION_KERNELS_H_
-
-#include "GPlex.h"
-
-namespace mkfit {
-
-void propagation_wrapper(const cudaStream_t& stream,
-    GPlexHV& msPar, GPlexLS& inErr,
-    GPlexLV& inPar, GPlexQI& inChg,
-    GPlexLV& outPar,
-    GPlexLS& outErr, 
-    const bool useParamBfield,
-    const int N);
-
-void propagationForBuilding_wrapper(const cudaStream_t& stream,
-    const GPlexLS& inErr, const GPlexLV& inPar,
-    const GPlexQI& inChg, const float radius,
-    GPlexLS& outErr, GPlexLV& outPar, 
-    const int N);
-
-__device__ void propagation_fn(
-    GPlexLS &inErr, GPlexLV &inPar, 
-    GPlexQI &inChg, GPlexHV &msPar,
-    GPlexLS &outErr, GPlexLV &outPar,
-    const bool useParamBfield,
-    int n, int N);
-
-__device__ void propagationForBuilding_fn(
-    const GPlexLS &inErr, const GPlexLV &inPar,
-    const GPlexQI &inChg, const float radius,
-    GPlexLS &outErr, GPlexLV &outPar, 
-    const int n, const int N);
-
-} // end namespace mkfit
-#endif  // _PROPAGATION_KERNELS_H_
diff --git a/mkFit/reorganize_gplex.cu b/mkFit/reorganize_gplex.cu
deleted file mode 100644
index 7efea439..00000000
--- a/mkFit/reorganize_gplex.cu
+++ /dev/null
@@ -1,510 +0,0 @@
-#include "reorganize_gplex.h"
-#include <stdio.h>
-
-#include "FitterCU.h"
-#include "accessors_cu.h"
-#include "Track.h"
-#include "gpu_utils.h"
-
-
-__device__ float *get_posArray(Hit &hit) {
-    return hit.posArrayCU();
-}
-
-
-__device__ float *get_errArray(Hit &hit) {
-    return hit.errArrayCU();
-}
-
-
-__device__ float *get_posArray(Track &track) {
-    return track.posArrayCU();
-}
-
-
-__device__ float *get_errArray(Track &track) {
-    return track.errArrayCU();
-}
-
-
-template <typename GPlexObj>
-__device__ void SlurpIn_fn(GPlexObj to, // float *fArray, int stride, int kSize, 
-                           const char *arr, const int *vi, const int N) {
-  int j = threadIdx.x + blockDim.x * blockIdx.x;
-  if (j<N) {
-    const int *XHitPos = vi;
-    const int off = XHitPos[j] * sizeof(Hit);
-    for (int i = 0; i < to.kSize; ++i) { // plex_size
-      to[j + to.stride*i] = *(decltype(to.ptr)) (arr + i*sizeof(decltype(*to.ptr)) + off);
-    }
-  }
-}
-
-
-template <typename GPlexObj>
-__device__ void SlurpInIdx_fn(GPlexObj to,
-                             const char *arr, const int idx, const int N) {
-  int j = threadIdx.x + blockDim.x * blockIdx.x;
-  if (j<N) {
-    for (int i = 0; i < to.kSize; ++i) { // plex_size
-      auto tmp = *(decltype(to.ptr)) (arr + i*sizeof(decltype(*to.ptr)) + idx);
-      to[j + to.stride*i] = tmp;
-    }
-  }
-}
-
-
-template <typename GPlexObj>
-__device__ void SlurpOutIdx_fn(GPlexObj from, // float *fArray, int stride, int kSize, 
-                               const char *arr, const int idx, const int N) {
-  int j = threadIdx.x + blockDim.x * blockIdx.x;
-  if (j<N) {
-    for (int i = 0; i < from.kSize; ++i) { // plex_size
-      * (decltype(from.ptr)) (arr + i*sizeof(decltype(*from.ptr)) + idx) = from[j + from.stride*i];
-    }
-  }
-}
-
-
-__device__
-void GetHitErr(GPlexHS& msErr, const char* array, const int beg, const int end)
-{
-      SlurpInIdx_fn(msErr, array, beg, end);
-}
-
-
-__device__
-void GetHitPar(GPlexHV& msPar, const char* array, const int beg, const int end)
-{
-      SlurpInIdx_fn(msPar, array, beg, end);
-}
-
-__device__ void HitToMs_fn(GPlexHS &msErr, GPlexHV &msPar,
-                           Hit *hits, const GPlexQI &XHitSize, 
-                           const GPlexHitIdx &XHitArr, 
-                           GPlexQI &HitsIdx, const int hit_cnt, 
-                           const int itrack, const int N) {
-  if (itrack < N) {
-
-    const char *varr      = (char*) hits;
-    const int   off_error = (char*) hits[0].errArrayCU() - varr;
-    const int   off_param = (char*) hits[0].posArrayCU() - varr;
-
-    if (hit_cnt < XHitSize[itrack]) {
-      HitsIdx[itrack] = XHitArr(itrack, hit_cnt, 0) * sizeof(Hit);
-    }
-    SlurpInIdx_fn(msErr, varr + off_error, HitsIdx[itrack], N);
-    SlurpInIdx_fn(msPar, varr + off_param, HitsIdx[itrack], N);
-  }
-}
-
-
-__global__ void HitToMs_kernel(GPlexHS msErr, GPlexHV msPar, Hit *hits,
-                               const GPlexQI XHitSize, const GPlexHitIdx XHitArr,
-                               GPlexQI HitsIdx, const int hit_cnt, const int N) {
-  int itrack = threadIdx.x + blockDim.x * blockIdx.x;
-  HitToMs_fn(msErr, msPar, hits, XHitSize, XHitArr, HitsIdx, hit_cnt, itrack, N);
-}
-
-
-void HitToMs_wrapper(const cudaStream_t& stream,
-                     GPlexHS &msErr, GPlexHV &msPar, LayerOfHitsCU &layer, 
-                     const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr,
-                     GPlexQI &HitsIdx, int hit_cnt, const int N) {
-  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
-                       max_blocks_x);
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-  HitToMs_kernel <<< grid, block, 0 , stream >>>
-    (msErr, msPar, layer.m_hits.data(), XHitSize, XHitArr, HitsIdx, hit_cnt, N);
-  /*cudaDeviceSynchronize();*/
-}
-
-
-__device__ void InputTracksCU_fn (Track *tracks, 
-                                  GPlexLS &Err_iP, GPlexLV &Par_iP,
-                                  GPlexQI &Chg, GPlexQF &Chi2,
-                                  GPlexQI &Label, GPlexQI *HitsIdx,
-                                  const int beg, const int end, 
-                                  const int itrack, const int N) {
-  if (itrack < (end-beg) && itrack < N) {
-    Track &trk = tracks[beg];
-    const char *varr       = (char*) &trk;
-    int   off_error = (char*) trk.errArrayCU() - varr;
-    int   off_param = (char*) trk.posArrayCU() - varr;
-
-    int i= itrack + beg;
-    const Track &trk_i = tracks[i];
-    int idx = (char*) &trk_i - varr;
-
-    Label(itrack, 0, 0) = tracks[i].label();
-    Chg(itrack, 0, 0) = tracks[i].charge();
-    Chi2(itrack, 0, 0) = tracks[i].chi2();
-    SlurpInIdx_fn(Err_iP, varr + off_error, idx, N);
-    SlurpInIdx_fn(Par_iP, varr + off_param, idx, N);
-
-    for (int hi = 0; hi < 3; ++hi)
-      HitsIdx[hi](itrack, 0, 0) = tracks[i].getHitIdx(hi);//dummy value for now
-  }
-}
-
-
-__global__ void InputTracksCU_kernel(Track *tracks, 
-                                     GPlexLS Err_iP, GPlexLV Par_iP,
-                                     GPlexQI Chg, GPlexQF Chi2, GPlexQI Label,
-                                     GPlexQI *HitsIdx,
-                                     int beg, int end, int N) {
-  int itrack = threadIdx.x + blockDim.x*blockIdx.x;
-  InputTracksCU_fn(tracks, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, itrack, N);
-}
-
-
-void InputTracksCU_wrapper(const cudaStream_t &stream, 
-                           const EtaBinOfCandidatesCU &etaBin,
-                           GPlexLS &Err_iP, GPlexLV &Par_iP,
-                           GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label,
-                           GPlexQI *HitsIdx,
-                           const int beg, const int end, const bool inputProp, int N) {
-  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
-                       max_blocks_x);
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-
-  InputTracksCU_kernel <<< grid, block, 0, stream >>>
-    (etaBin.m_candidates, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx,
-     beg, end, N);
-}
-
-
-__device__ void InputTracksAndHitsCU_fn (Track *tracks, LayerOfHitsCU *layerHits,
-                                         GPlexLS &Err_iP, GPlexLV &Par_iP,
-                                         GPlexHS *msErr_arr, GPlexHV *msPar_arr,
-                                         GPlexQI &Chg, GPlexQF &Chi2,
-                                         GPlexQI &Label, GPlexQI *HitsIdx,
-                                         const int beg, const int end, 
-                                         const int itrack, const int N) {
-  if (itrack < (end-beg) && itrack < N) {
-    Track &trk = tracks[beg];
-    const char *varr       = (char*) &trk;
-    int   off_error = (char*) trk.errArrayCU() - varr;
-    int   off_param = (char*) trk.posArrayCU() - varr;
-
-    int i= itrack + beg;
-    const Track &trk_i = tracks[i];
-    int idx = (char*) &trk_i - varr;
-
-    Label(itrack, 0, 0) = tracks[i].label();
-    Chg(itrack, 0, 0) = tracks[i].charge();
-    Chi2(itrack, 0, 0) = tracks[i].chi2();
-    SlurpInIdx_fn(Err_iP, varr + off_error, idx, N);
-    SlurpInIdx_fn(Par_iP, varr + off_param, idx, N);
-
-    // Note Config::nLayers -- not suitable for building
-    for (int hi = 0; hi < Config::nLayers; ++hi) {
-      int hidx = tracks[i].getHitIdx(hi);
-      Hit &hit = layerHits[hi].m_hits[hidx];
-
-      HitsIdx[hi](itrack, 0, 0) = idx;
-      if (hidx < 0) continue;
-
-      SlurpInIdx_fn(msErr_arr[hi], (char *)hit.errArrayCU(), 0, N);
-      SlurpInIdx_fn(msPar_arr[hi], (char *)hit.posArrayCU(), 0, N);
-    }
-  }
-}
-
-
-__global__ void InputTracksAndHitsCU_kernel(Track *tracks, LayerOfHitsCU *layers,
-                                            GPlexLS Err_iP, GPlexLV Par_iP,
-                                            GPlexHS *msErr_arr, GPlexHV *msPar_arr,
-                                            GPlexQI Chg, GPlexQF Chi2, GPlexQI Label,
-                                            GPlexQI *HitsIdx,
-                                            int beg, int end, int N) {
-  int itrack = threadIdx.x + blockDim.x*blockIdx.x;
-  InputTracksAndHitsCU_fn(tracks, layers, Err_iP, Par_iP, msErr_arr, msPar_arr,
-                          Chg, Chi2, Label, HitsIdx, beg, end, itrack, N);
-}
-
-
-void InputTracksAndHitsCU_wrapper(const cudaStream_t &stream, 
-                                  Track *tracks, EventOfHitsCU &event_of_hits,
-                                  GPlexLS &Err_iP, GPlexLV &Par_iP,
-                                  GPlexHS *msErr_arr, GPlexHV *msPar_arr,
-                                  GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label,
-                                  GPlexQI *HitsIdx,
-                                  const int beg, const int end, 
-                                  const bool inputProp, int N) {
-  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
-                       max_blocks_x);
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-
-  InputTracksAndHitsCU_kernel <<< grid, block, 0, stream >>>
-    (tracks, event_of_hits.m_layers_of_hits.data(),
-     Err_iP, Par_iP, 
-     msErr_arr, msPar_arr, 
-     Chg, Chi2, Label, HitsIdx,
-     beg, end, N);
-}
-
-
-__device__ void OutputParErrCU_fn(Track *tracks, 
-                                  const GPlexLS &Err, const GPlexLV &Par,
-                                  const int beg, const int end, 
-                                  const int itrack_plex, const int N) {
-  Track &trk = tracks[beg];
-  const char *varr       = (char*) &trk;
-  int   off_error = (char*) trk.errArrayCU() - varr;
-  int   off_param = (char*) trk.posArrayCU() - varr;
-
-  int i= itrack_plex + beg;
-  const Track &trk_i = tracks[i];
-  int idx = (char*) &trk_i - varr;
-
-  SlurpOutIdx_fn(Err, varr + off_error, idx, N);
-  SlurpOutIdx_fn(Par, varr + off_param, idx, N);
-}
-
-__device__ void OutputParErrCU_fn_seed(Track *tracks, 
-                                  const GPlexLS &Err, const GPlexLV &Par,
-                                  const int iseed_ev,
-                                  const int icand_ev,
-                                  int N) {
-  Track &trk = tracks[0];
-  const char *varr       = (char*) &trk;
-  int   off_error = (char*) trk.errArrayCU() - varr;
-  int   off_param = (char*) trk.posArrayCU() - varr;
-
-  int i= iseed_ev * Config::maxCandsPerSeed + icand_ev;
-  const Track &trk_i = tracks[i];
-  int idx = (char*) &trk_i - varr;
-
-  SlurpOutIdx_fn(Err, varr + off_error, idx, N);
-  SlurpOutIdx_fn(Par, varr + off_param, idx, N);
-}
-
-
-__device__ void OutputTracksCU_fn(Track *tracks, 
-                                  const GPlexLS &Err_iP, const GPlexLV &Par_iP,
-                                  const GPlexQI &Chg, const GPlexQF &Chi2,
-                                  const GPlexQI &Label, const GPlexQI *HitsIdx,
-                                  const int beg, const int end, 
-                                  const int itrack, const int N,
-                                  const bool update_hit_idx) {
-  if (itrack < (end-beg) && itrack < N) {
-    Track &trk = tracks[beg];
-    const char *varr       = (char*) &trk;
-    int   off_error = (char*) trk.errArrayCU() - varr;
-    int   off_param = (char*) trk.posArrayCU() - varr;
-
-    int i= itrack + beg;
-    const Track &trk_i = tracks[i];
-    int idx = (char*) &trk_i - varr;
-
-    SlurpOutIdx_fn(Err_iP, varr + off_error, idx, N);
-    SlurpOutIdx_fn(Par_iP, varr + off_param, idx, N);
-    tracks[i].setCharge(Chg(itrack, 0, 0));
-    tracks[i].setChi2(Chi2(itrack, 0, 0));
-    tracks[i].setLabel(Label(itrack, 0, 0));
-
-    if (update_hit_idx) {
-      tracks[i].resetHits();
-      /*int nGoodItIdx = 0;*/
-      for (int hi = 0; hi < Config::nLayers; ++hi) {
-        tracks[i].addHitIdx(HitsIdx[hi](itrack, 0, 0),0.);
-        // FIXME: We probably want to use registers instead of going for gmem class members:
-        /*int hit_idx = HitsIdx[hi](itrack, 0, 0);*/
-        /*tracks[i].setHitIdx(hi, hit_idx);*/
-        /*if (hit_idx >= 0) {*/
-        /*nGoodItIdx++; */
-        /*}*/
-      }
-      /*tracks[i].setNGoodHitIdx(nGoodItIdx);*/
-      /*tracks[i].setChi2(0.);*/
-    }
-  }
-}
-
-__global__ void OutputTracksCU_kernel(Track *tracks, 
-                                     GPlexLS Err_iP, GPlexLV Par_iP,
-                                     GPlexQI Chg, GPlexQF Chi2, GPlexQI Label,
-                                     GPlexQI *HitsIdx,
-                                     int beg, int end, int N,
-                                     const bool update_hit_idx=true) {
-  int itrack = threadIdx.x + blockDim.x*blockIdx.x;
-  OutputTracksCU_fn(tracks, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx,
-                    beg, end, itrack, N, update_hit_idx);
-}
-
-
-void OutputTracksCU_wrapper(const cudaStream_t &stream,
-                            EtaBinOfCandidatesCU &etaBin,
-                            GPlexLS &Err_iP, GPlexLV &Par_iP,
-                            GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label,
-                            GPlexQI *HitsIdx,
-                            const int beg, const int end, const bool outputProp, int N) {
-  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
-                       max_blocks_x);
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-
-  OutputTracksCU_kernel <<< grid, block, 0, stream >>>
-    (etaBin.m_candidates, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, N);
-}
-
-
-void OutputFittedTracksCU_wrapper(const cudaStream_t &stream,
-                                  Track *tracks_cu, 
-                                  GPlexLS &Err_iP, GPlexLV &Par_iP,
-                                  GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label,
-                                  const int beg, const int end, int N) {
-  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
-                       max_blocks_x);
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-
-  OutputTracksCU_kernel <<< grid, block, 0, stream >>>
-    (tracks_cu, Err_iP, Par_iP, Chg, Chi2, Label, nullptr, beg, end, N, false);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////
-
-// m_tracks_per_seed: play the same role than seed_cand_idx in the cpu code
-__device__ void InputTracksAndHitIdxComb_fn(Track *tracks, int *m_tracks_per_seed,
-                                            GPlexLS &Err_iP, GPlexLV &Par_iP,
-                                            GPlexQI &Chg, GPlexQF &Chi2,
-                                            GPlexQI &Label, GPlexQI *HitsIdx,
-                                            GPlexQI &SeedIdx, GPlexQI &CandIdx,
-                                            GPlexQB &Valid,
-                                            const int Nhits,
-                                            const int beg, const int end, 
-                                            const int itrack_plex, const int N)
-{
-  if (itrack_plex < N) {
-    int itrack_ev = beg + itrack_plex;
-
-    // TODO:: make sure that the width of the FitterCU is a multiple of
-    //        Config::maxCandsPerSeed;
-    int iseed_ev = itrack_ev / Config::maxCandsPerSeed;
-    int icand_ev = itrack_ev % Config::maxCandsPerSeed;
-    // | o : o : x : x : x |
-    //  iseed
-    //  <----> m_tracks_per_seed[iseed]
-    //  <------------------> maxCandsPerSeed
-    Valid(itrack_plex, 0, 0) = icand_ev < m_tracks_per_seed[iseed_ev]
-                             && m_tracks_per_seed[iseed_ev] != 0;
-    if (!Valid(itrack_plex, 0, 0)) {
-      return;
-    }
-
-    Track &trk = tracks[beg];
-    const char *varr       = (char*) &trk;
-    int   off_error = (char*) trk.errArrayCU() - varr;
-    int   off_param = (char*) trk.posArrayCU() - varr;
-
-    int i= itrack_plex + beg;  // TODO: i == itrack_ev
-    const Track &trk_i = tracks[i];
-    int idx = (char*) &trk_i - varr;
-
-    Label(itrack_plex, 0, 0) = tracks[i].label();
-    SeedIdx(itrack_plex, 0, 0) = iseed_ev; 
-    CandIdx(itrack_plex, 0, 0) = icand_ev;
-
-    SlurpInIdx_fn(Err_iP, varr + off_error, idx, N);
-    SlurpInIdx_fn(Par_iP, varr + off_param, idx, N);
-
-    Chg(itrack_plex, 0, 0) = tracks[i].charge();
-    Chi2(itrack_plex, 0, 0) = tracks[i].chi2();
-    // Note Config::nLayers -- not suitable for building
-    for (int hi = 0; hi < Nhits; ++hi) {
-      HitsIdx[hi][itrack_plex] = tracks[i].getHitIdx(hi); 
-
-      int hit_idx = HitsIdx[hi][itrack_plex];
-    }
-  }
-}
-
-__global__ 
-void InputTracksAndHitIdxComb_kernel(Track *tracks, int *m_tracks_per_seed,
-                                     GPlexLS Err_iP, GPlexLV Par_iP,
-                                     GPlexQI Chg, GPlexQF Chi2,
-                                     GPlexQI Label, GPlexQI *HitsIdx,
-                                     GPlexQI SeedIdx, GPlexQI CandIdx,
-                                     GPlexQB Valid, const int Nhits,
-                                     const int beg, const int end, 
-                                     const int N)
-{
-  int itrack = threadIdx.x + blockDim.x*blockIdx.x;
-  InputTracksAndHitIdxComb_fn(tracks, m_tracks_per_seed, 
-                              Err_iP, Par_iP,
-                              Chg, Chi2, Label, HitsIdx, 
-                              SeedIdx, CandIdx, Valid, Nhits ,
-                              beg, end, itrack, N);
-}
-
-void InputTracksAndHitIdxComb_wrapper(const cudaStream_t &stream, 
-                                      const EtaBinOfCombCandidatesCU &etaBin,
-                                     GPlexLS &Err_iP, GPlexLV &Par_iP,
-                                     GPlexQI &Chg, GPlexQF &Chi2, 
-                                     GPlexQI &Label, GPlexQI *HitsIdx,
-                                     GPlexQI &SeedIdx, GPlexQI &CandIdx,
-                                     GPlexQB &Valid, const int Nhits,
-                                     const int beg, const int end,
-                                     const bool inputProp, int N) {
-  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
-                       max_blocks_x);
-  dim3 grid(gridx, 1, 1);
-  dim3 block(BLOCK_SIZE_X, 1, 1);
-
-  InputTracksAndHitIdxComb_kernel<<< grid, block, 0, stream >>>
-    (etaBin.m_candidates.data(), etaBin.m_ntracks_per_seed.data(),
-     Err_iP, Par_iP, 
-     Chg, Chi2, Label, HitsIdx,
-     SeedIdx, CandIdx, Valid, Nhits,
-     beg, end, N);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-__device__ void InputTracksAndHitIdxComb_fn_seed(Track *tracks, int *m_tracks_per_seed,
-                                            GPlexLS &Err_iP, GPlexLV &Par_iP,
-                                            GPlexQI &Chg, GPlexQF &Chi2,
-                                            GPlexQI &Label, GPlexQI *HitsIdx,
-                                            GPlexQI &SeedIdx, GPlexQI &CandIdx,
-                                            GPlexQB &Valid,
-                                            const int Nhits,
-                                            const int iseed_ev, 
-                                            const int icand_ev,
-                                            const int iseed_plex,
-                                            const int N)
-{
-  if (iseed_plex < N) {
-    // seed-based algorithm do not depend on Valid
-    Valid(iseed_plex, 0, 0) = 1;
-
-    Track &trk = tracks[0];
-    const char *varr       = (char*) &trk;
-    int   off_error = (char*) trk.errArrayCU() - varr;
-    int   off_param = (char*) trk.posArrayCU() - varr;
-    
-    int i = iseed_ev * Config::maxCandsPerSeed + icand_ev;
-    const Track &trk_i = tracks[i];
-    int idx = (char*) &trk_i - varr;
-
-    Label(iseed_plex, 0, 0) = tracks[i].label();
-    SeedIdx(iseed_plex, 0, 0) = iseed_ev; 
-    CandIdx(iseed_plex, 0, 0) = icand_ev;
-
-    SlurpInIdx_fn(Err_iP, varr + off_error, idx, N);
-    SlurpInIdx_fn(Par_iP, varr + off_param, idx, N);
-
-    Chg(iseed_plex, 0, 0) = tracks[i].charge();
-    Chi2(iseed_plex, 0, 0) = tracks[i].chi2();
-    // Note Config::nLayers -- not suitable for building
-    for (int hi = 0; hi < Nhits; ++hi) {
-      HitsIdx[hi][iseed_plex] = tracks[i].getHitIdx(hi); 
-    }
-  }
-}
diff --git a/mkFit/reorganize_gplex.h b/mkFit/reorganize_gplex.h
deleted file mode 100644
index c5976dde..00000000
--- a/mkFit/reorganize_gplex.h
+++ /dev/null
@@ -1,126 +0,0 @@
-#ifndef REORGANIZE_GPLEX_H
-#define REORGANIZE_GPLEX_H
-
-#include "GPlex.h"
-#include "Hit.h"
-#include "HitStructuresCU.h"
-#include "Track.h"
-
-namespace mkfit {
-
-__device__ float *get_posArray(Hit &hit);
-__device__ float *get_errArray(Hit &hit);
-__device__ float *get_posArray(Track &track);
-__device__ float *get_errArray(Track &track);
-
-__device__ void HitToMs_fn(GPlexHS &msErr, GPlexHV &msPar,
-                           Hit *hits, const GPlexQI &XHitSize,
-                           const GPlexHitIdx &XHitArr, 
-                           GPlexQI &HitsIdx, const int hit_cnt,
-                           const int itrack, const int N);
-
-__global__ void HitToMs_kernel(GPlexHS msErr, GPlexHV msPar, Hit *hits, 
-                               const GPlexQI XHitSize, const GPlexHitIdx XHitArr, 
-                               GPlexQI HitsIdx, const int hit_cnt, const int N);
-
-void HitToMs_wrapper(const cudaStream_t& stream,
-                     GPlexHS &msErr, GPlexHV &msPar, LayerOfHitsCU &layer, 
-                     const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, 
-                     GPlexQI &HitsIdx, const int hit_cnt, const int N);
-
-__device__ void InputTracksCU_fn(Track *tracks, 
-                                 GPlexLS &Err_iP, GPlexLV &Par_iP,
-                                 GPlexQI &Chg, GPlexQF &Chi2,
-                                 GPlexQI &Label, GPlexQI *HitsIdx,
-                                 const int beg, const int end,
-                                 const int itrack, const int N);
-
-__device__ void OutputParErrCU_fn(Track *tracks, 
-                                  const GPlexLS &Err, const GPlexLV &Par,
-                                  const int beg, const int end, 
-                                  const int itrack, const int N);
-
-__device__ void OutputTracksCU_fn(Track *tracks, 
-                                  const GPlexLS &Err_iP, const GPlexLV &Par_iP,
-                                  const GPlexQI &Chg, const GPlexQF &Chi2,
-                                  const GPlexQI &Label, const GPlexQI *HitsIdx,
-                                  const int beg, const int end, 
-                                  const int itrack, const int N,
-                                  const bool update_hit_idx=true);
-
-void InputTracksCU_wrapper(const cudaStream_t &stream, 
-                           const EtaBinOfCandidatesCU &etaBin,
-                           GPlexLS &Err_iP, GPlexLV &Par_iP,
-                           GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label,
-                           GPlexQI *HitsIdx,
-                           const int beg, const int end, const bool inputProp, int N);
-
-void InputTracksAndHitsCU_wrapper(const cudaStream_t &stream, 
-                                  Track *tracks, EventOfHitsCU &event_of_hits,
-                                  GPlexLS &Err_iP, GPlexLV &Par_iP,
-                                  GPlexHS *msErr_arr, GPlexHV *msPar_arr,
-                                  GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label,
-                                  GPlexQI *HitsIdx,
-                                  const int beg, const int end,
-                                  const bool inputProp, int N);
-
-void InputTracksAndHitIdxComb_wrapper(const cudaStream_t &stream, 
-                                      const EtaBinOfCombCandidatesCU &etaBin,
-                                      GPlexLS &Err_iP, GPlexLV &Par_iP,
-                                      GPlexQI &Chg, GPlexQF &Chi2,
-                                      GPlexQI &Label, GPlexQI *HitsIdx,
-                                      GPlexQI &SeedIdx, GPlexQI &CandIdx,
-                                      GPlexQB &Valid, const int Nhits,
-                                      const int beg, const int end, 
-                                      const bool inputProp, int N);
-
-void OutputTracksCU_wrapper(const cudaStream_t &stream,
-                            EtaBinOfCandidatesCU &etaBin,
-                            GPlexLS &Err_iP, GPlexLV &Par_iP,
-                            GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label,
-                            GPlexQI *HitsIdx,
-                            const int beg, const int end, bool outputProp, int N);
-
-
-void OutputFittedTracksCU_wrapper(const cudaStream_t &stream,
-                                  Track *tracks_cu, 
-                                  GPlexLS &Err_iP, GPlexLV &Par_iP,
-                                  GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label,
-                                  const int beg, const int end, int N);
-
-__device__
-void GetHitErr(GPlexHS& msErr, const char* array, const int beg, const int end);
-__device__
-void GetHitPar(GPlexHV& msPar, const char* array, const int beg, const int end);
-
-
-__device__ void InputTracksAndHitIdxComb_fn(Track *tracks, int *m_tracks_per_seed,
-                                            GPlexLS &Err_iP, GPlexLV &Par_iP,
-                                            GPlexQI &Chg, GPlexQF &Chi2,
-                                            GPlexQI &Label, GPlexQI *HitsIdx,
-                                            GPlexQI &SeedIdx, GPlexQI &CandIdx,
-                                            GPlexQB &Valid,
-                                            const int Nhits,
-                                            const int beg, const int end, 
-                                            const int itrack, const int N);
-
-__device__ void InputTracksAndHitIdxComb_fn_seed(Track *tracks, int *m_tracks_per_seed,
-                                            GPlexLS &Err_iP, GPlexLV &Par_iP,
-                                            GPlexQI &Chg, GPlexQF &Chi2,
-                                            GPlexQI &Label, GPlexQI *HitsIdx,
-                                            GPlexQI &SeedIdx, GPlexQI &CandIdx,
-                                            GPlexQB &Valid,
-                                            const int Nhits,
-                                            const int iseed_ev, 
-                                            const int icand_ev,
-                                            const int iseed_plex,
-                                            const int N);
-
-__device__ void OutputParErrCU_fn_seed(Track *tracks, 
-                                  const GPlexLS &Err, const GPlexLV &Par,
-                                  const int iseed_ev,
-                                  const int icand_ev,
-                                  int N);
-
-} // end namespace mkfit
-#endif  // REORGANIZE_GPLEX_H

From b6e451cc05f8a3af16508c3144de4b4fbef86ab6 Mon Sep 17 00:00:00 2001
From: Matevz Tadel <mtadel@ucsd.edu>
Date: Thu, 29 Oct 2020 11:48:37 -0700
Subject: [PATCH 2/3] GenMul.pl - do not generate CUDA code unless explicitly
 requested.

---
 Matriplex/GenMul.pm | 46 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/Matriplex/GenMul.pm b/Matriplex/GenMul.pm
index 15bb3924..3bd06e85 100644
--- a/Matriplex/GenMul.pm
+++ b/Matriplex/GenMul.pm
@@ -777,6 +777,51 @@ sub dump_multiply_std_and_intrinsic
     select FF;
   }
 
+  print <<"FNORD";
+#ifdef MPLEX_INTRINSICS
+
+   for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T))
+   {
+FNORD
+
+  $S->multiply_intrinsic($a, $b, $c);
+
+  print <<"FNORD";
+   }
+
+#else
+
+#pragma omp simd
+   for (int n = 0; n < N; ++n)
+   {
+FNORD
+
+  $S->multiply_standard($a, $b, $c);
+
+  print <<"FNORD";
+   }
+#endif
+FNORD
+
+  unless ($fname eq '-')
+  {
+    close FF;
+    select STDOUT;
+  }
+}
+
+# ----------------------------------------------------------------------
+
+sub dump_multiply_std_and_intrinsic_and_gpu
+{
+  my ($S, $fname, $a, $b, $c) = @_;
+
+  unless ($fname eq '-')
+  {
+    open FF, ">$fname";
+    select FF;
+  }
+
   print <<"FNORD";
 #ifndef __CUDACC__
 #ifdef MPLEX_INTRINSICS
@@ -809,7 +854,6 @@ FNORD
 #endif  // __CUDACC__
 FNORD
 
-  
   unless ($fname eq '-')
   {
     close FF;

From 0d3a44c69d387be85efa5b4f26c8b1562b652f0e Mon Sep 17 00:00:00 2001
From: Steve Lantz <steve.lantz@cornell.edu>
Date: Fri, 30 Oct 2020 14:04:57 -0700
Subject: [PATCH 3/3] clean out __CUDACC__ ifdef's in from-root/Math

---
 from-root/Math/MatrixRepresentationsStatic.h | 16 +---------------
 from-root/Math/SMatrix.h                     |  9 ---------
 from-root/Math/SVector.h                     |  9 ---------
 3 files changed, 1 insertion(+), 33 deletions(-)

diff --git a/from-root/Math/MatrixRepresentationsStatic.h b/from-root/Math/MatrixRepresentationsStatic.h
index e32b7f47..78c6155c 100644
--- a/from-root/Math/MatrixRepresentationsStatic.h
+++ b/from-root/Math/MatrixRepresentationsStatic.h
@@ -215,9 +215,6 @@ namespace Math {
 
    public:
 
-#if __CUDACC__
-   __host__ __device__
-#endif
     /* constexpr */ inline MatRepSym(){}
 
     typedef T  value_type;
@@ -241,17 +238,9 @@ namespace Math {
        return fArray[off(i)];
      }
 
-#if __CUDACC__
-   __host__ __device__
-#endif
      inline T* Array() { return fArray; }
-#if __CUDACC__
-   __host__ __device__
-#endif
+
      inline const T* Array() const { return fArray; }
-#ifdef __CUDACC__
-     T* ArrayCU();
-#endif
 
       /**
          assignment : only symmetric to symmetric allowed
@@ -263,9 +252,6 @@ namespace Math {
          return *this;
       }
 
-#if __CUDACC__
-   __host__ __device__
-#endif
       inline MatRepSym<T, D>& operator=(const MatRepSym& rhs) {
          #pragma omp simd
          for(unsigned int i=0; i<kSize; ++i) fArray[i] = rhs.Array()[i];
diff --git a/from-root/Math/SMatrix.h b/from-root/Math/SMatrix.h
index 1dc9ac94..b14edfcc 100644
--- a/from-root/Math/SMatrix.h
+++ b/from-root/Math/SMatrix.h
@@ -147,9 +147,6 @@ class SMatrix {
    /**
       Default constructor:
    */
-#if __CUDACC__
-   __host__ __device__
-#endif
    SMatrix();
    /// 
    /** 
@@ -159,9 +156,6 @@ class SMatrix {
    /** 
        copy constructor (from a matrix of the same representation 
    */ 
-#if __CUDACC__
-   __host__ __device__
-#endif
    SMatrix(const SMatrix<T,D1,D2,R>& rhs);
    /**
       construct from a matrix with different representation.
@@ -278,9 +272,6 @@ class SMatrix {
    const T* Array() const;
    /// return pointer to internal array
    T* Array();
-#ifdef __CUDACC__
-   T* ArrayCU();
-#endif
 
    /** @name --- STL-like interface --- 
        The iterators access the matrix element in the order how they are 
diff --git a/from-root/Math/SVector.h b/from-root/Math/SVector.h
index 89ee73c8..7de0c830 100644
--- a/from-root/Math/SVector.h
+++ b/from-root/Math/SVector.h
@@ -93,17 +93,11 @@ class SVector {
    /**
       Default constructor: vector filled with zero values 
     */
-#if __CUDACC__
-   __host__ __device__
-#endif
    SVector();
    /// contruct from a vector expression
    template <class A>
    SVector(const VecExpr<A,T,D>& rhs);
    /// copy contructor
-#if __CUDACC__
-   __host__ __device__
-#endif
    SVector(const SVector<T,D>& rhs);
       
    // new constructs using STL iterator interface
@@ -191,9 +185,6 @@ class SVector {
    const T* Array() const;
    /// return non-const pointer to internal array
    T* Array();
-#ifdef __CUDACC__
-   T* ArrayCU();
-#endif
    
    /** @name --- STL-like interface --- */