trackreco · dan131riley · Nov 9, 2020 · Oct 29, 2020 · Oct 29, 2020 · Oct 29, 2020
diff --git a/BinInfoUtils.h b/BinInfoUtils.h
@@ -17,29 +17,25 @@ typedef std::pair<int, int>                 BinInfo;
 typedef std::vector<std::vector<BinInfo>>   BinInfoLayerMap;
 typedef std::vector<BinInfoLayerMap>        BinInfoMap;
 
-CUDA_CALLABLE
 // Lower phi in steps of Config::TwoPI until it is strictly below Config::PI.
 // A value that is already below Config::PI is returned unchanged.
 inline float downPhi(float phi)
 {
   for (; phi >= Config::PI; phi -= Config::TwoPI) {}
   return phi;
 }
 
-CUDA_CALLABLE
 // Raise phi in steps of Config::TwoPI until it is strictly above -Config::PI.
 // A value that is already above -Config::PI is returned unchanged.
 inline float upPhi(float phi)
 {
   for (; phi <= -Config::PI; phi += Config::TwoPI) {}
   return phi;
 }
 
-CUDA_CALLABLE
 // Wrap phi when |phi| >= Config::PI: positive values go through downPhi(),
 // all others through upPhi(). Anything else is handed back untouched.
 inline float normalizedPhi(float phi)
 {
   // std::fmod(phi, (float) Config::PI) is deliberately not used here: it
   // returns phi +pi out of phase for |phi| beyond the boundary.
   if (!(std::abs(phi) >= Config::PI)) return phi;
   return (phi > 0) ? downPhi(phi) : upPhi(phi);
 }
 
-CUDA_CALLABLE
 inline int getPhiPartition(float phi)
 {
   //assume phi is between -PI and PI

diff --git a/Config.cc b/Config.cc
@@ -24,11 +24,8 @@ namespace Config
 
   // Multi threading and Clone engine configuration
   int   numThreadsFinder = 1;
-
-  // GPU computations
   int   numThreadsEvents = 1;
-  int   numThreadsReorg = 1;
-
+
 #if defined(__MIC__) || defined(__AVX512F__)
   int   numThreadsSimulation = 60;
 #else

diff --git a/Config.h b/Config.h
@@ -6,12 +6,6 @@
 #include <string> // won't compile on clang gcc for mac OS w/o this!
 #include <map>
 
-#if defined(__CUDACC__)
-  #define CUDA_CALLABLE __host__ __device__
-#else
-  #define CUDA_CALLABLE 
-#endif
-
 namespace mkfit {
 
 // Cram this in here for now ...
@@ -259,7 +253,7 @@ namespace Config
   // Config for Hit and BinInfoUtils
   constexpr int   nPhiPart   = 1260;
   constexpr float fPhiFactor = nPhiPart / TwoPI;
-  constexpr int   nEtaPart   = 11;  // 1 is better for GPU best_hit
+  constexpr int   nEtaPart   = 11;
   constexpr int   nEtaBin    = 2 * nEtaPart - 1;
 
   constexpr float        fEtaFull  = 2 * Config::fEtaDet;
@@ -363,10 +357,7 @@ namespace Config
   // Threading
   extern int    numThreadsFinder;
   extern int    numThreadsSimulation;
-
-  // For GPU computations
   extern int    numThreadsEvents;
-  extern int    numThreadsReorg;
 
   extern int    finderReportBestOutOfN;
 
@@ -409,7 +400,6 @@ namespace Config
 
   void RecalculateDependentConstants();
 
-  CUDA_CALLABLE
   inline float BfieldFromZR(const float z, const float r)
   {
     return (Config::mag_b0*z*z + Config::mag_b1*z + Config::mag_c1)*(Config::mag_a*r*r + 1.f);
@@ -420,8 +410,6 @@ namespace Config
   #ifndef MPT_SIZE
     #if defined(__MIC__) || defined(__AVX512F__)
       #define MPT_SIZE 16
-    #elif defined USE_CUDA
-      #define MPT_SIZE 8
     #elif defined(__AVX__) || defined(__AVX2__)
       #define MPT_SIZE 8
     #else

diff --git a/Hit.h b/Hit.h
@@ -61,13 +61,11 @@ inline float getInvRad2(float x, float y){
   return 1.0f/(x*x + y*y);
 }
 
-CUDA_CALLABLE
 // Angle of the point (x, y) measured from the +x axis, as returned by
 // std::atan2(y, x).
 inline float getPhi(float x, float y)
 {
   const float azimuth = std::atan2(y, x);
   return azimuth;
 }
 
-CUDA_CALLABLE
 // Angle of the point with coordinates (z, r) measured from the +z axis,
 // as returned by std::atan2(r, z).
 inline float getTheta(float r, float z)
 {
   const float theta = std::atan2(r, z);
   return theta;
 }
@@ -201,10 +199,6 @@ class Hit
 
   const float* posArray() const {return state_.pos_.Array();}
   const float* errArray() const {return state_.err_.Array();}
-#if __CUDACC__
-  __device__ float* posArrayCU();
-  __device__ float* errArrayCU();
-#endif
 
   // Non-const versions needed for CopyOut of Matriplex.
   SVector3&     parameters_nc() {return state_.pos_;}

diff --git a/Makefile b/Makefile
@@ -57,7 +57,7 @@ distclean: clean-local
 
 ${LIB_CORE}: ${CORE_OBJS}
 	@mkdir -p $(@D)
-	${CXX} ${CXXFLAGS} ${VEC_HOST} ${CORE_OBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS_CU} ${LDFLAGS}
+	${CXX} ${CXXFLAGS} ${VEC_HOST} ${CORE_OBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS}
 
 main: ${AUTO_TGTS} ${LIB_CORE} main.o
 	${CXX} ${CXXFLAGS} ${VEC_HOST} -o $@ main.o ${LDFLAGS_HOST} ${LDFLAGS} -Llib -lMicCore -Wl,-rpath,lib

diff --git a/Makefile.config b/Makefile.config
@@ -47,25 +47,6 @@ else ifdef OSXMPCLANG
   TBB_PREFIX := /opt/local
 endif
 
-# 2.1 Use nvcc to compile cuda code
-# Using the CUB library for standard GPU algorithm http://nvlabs.github.io/cub/
-# It's header only and potentially exported by the environment
-# Maybe it is good enough to have:
-# CUBROOT?=Undefined
-# CUDAINCDIR and CUDALIBDIR also need to be defined
-ifneq (,$(realpath /home/ml15/tools/cub))
-	CUBROOT?=/home/ml15/tools/cub
-else ifneq (,$(realpath /nfs/opt/cuda-8-0/include))
-	CUBROOT?=/nfs/opt/cuda-8-0/include
-else ifneq (,$(realpath /usr/local/cuda/include))
-	CUBROOT?=/usr/local/cuda/include
-endif
-NV := nvcc -prec-sqrt=true -I${CUBROOT} 
-#-g -G -lineinfo
-# Comment out to compile for CPU
-#USE_CUDA := 1
-# For CUDA: Also need to change maxCandsPerSeed to 8 and nEtaPart to 1
-
 # 3. Optimization
 # -O3 implies vectorization and simd (but not AVX)
 OPT := -g -O3
@@ -143,20 +124,6 @@ CXXFLAGS := -fPIC ${OPT} ${OSX_CXXFLAGS}
 LDFLAGS_HOST := 
 LDFLAGS_MIC  := -static-intel
 
-ifdef USE_CUDA
-	CPPFLAGS += -DUSE_CUDA -I${CUBROOT} -I${CUDAINCDIR} #-g -G -lineinfo
-	LDFLAGS_HOST += -L${CUDALIBDIR}
-	ifeq ($(CXX),icpc)
-	  CXXFLAGS += -qopenmp-simd
-	  LDFLAGS  += -qopenmp-simd
-	else
-	  CXXFLAGS += -fopenmp-simd
-	  LDFLAGS  += -fopenmp-simd
-	endif
-endif
-#CXXFLAGS += -qopenmp
-#LDFLAGS += -qopenmp
-
 CPPFLAGS += ${USE_STATE_VALIDITY_CHECKS} ${USE_SCATTERING} ${USE_LINEAR_INTERPOLATION} ${ENDTOEND} ${INWARD_FIT} 
 
 ifdef USE_VTUNE_NOTIFY

diff --git a/Matriplex/GenMul.pm b/Matriplex/GenMul.pm
@@ -777,6 +777,51 @@ sub dump_multiply_std_and_intrinsic
     select FF;
   }
 
+  print <<"FNORD";
+#ifdef MPLEX_INTRINSICS
+
+   for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T))
+   {
+FNORD
+
+  $S->multiply_intrinsic($a, $b, $c);
+
+  print <<"FNORD";
+   }
+
+#else
+
+#pragma omp simd
+   for (int n = 0; n < N; ++n)
+   {
+FNORD
+
+  $S->multiply_standard($a, $b, $c);
+
+  print <<"FNORD";
+   }
+#endif
+FNORD
+
+  unless ($fname eq '-')
+  {
+    close FF;
+    select STDOUT;
+  }
+}
+
+# ----------------------------------------------------------------------
+
+sub dump_multiply_std_and_intrinsic_and_gpu
+{
+  my ($S, $fname, $a, $b, $c) = @_;
+
+  unless ($fname eq '-')
+  {
+    open FF, ">$fname";
+    select FF;
+  }
+
   print <<"FNORD";
 #ifndef __CUDACC__
 #ifdef MPLEX_INTRINSICS
@@ -809,7 +854,6 @@ FNORD
 #endif  // __CUDACC__
 FNORD
 
-
   unless ($fname eq '-')
   {
     close FF;

diff --git a/Matrix.h b/Matrix.h
@@ -81,13 +81,11 @@ inline double dtime()
     return( tseconds );
 }
 
-CUDA_CALLABLE
 // Length of the 2D vector (x, y), computed directly as the square root of
 // the sum of the squared components.
 inline float hipo(float x, float y)
 {
   const float sumOfSquares = x*x + y*y;
   return std::sqrt(sumOfSquares);
 }
 
-CUDA_CALLABLE
 inline void sincos4(const float x, float& sin, float& cos)
 {
    // Had this written with explicit division by factorial.

diff --git a/README.md b/README.md
@@ -52,7 +52,6 @@
 - **phi3.t2.ucsd.edu**: [Intel Xeon Gold 6130 Processor](https://ark.intel.com/products/120492/Intel-Xeon-Gold-6130-Processor-22M-Cache-2_10-GHz) _Skylake Scalable Performance_ (referred to as SKL-Au, SKL-SP, phi3)
 - **lnx4108.classe.cornell.edu**: [Intel Xeon Silver 4116 Processor](https://ark.intel.com/products/120481/Intel-Xeon-Silver-4116-Processor-16_5M-Cache-2_10-GHz) _Skylake Scalable Performance_ (referred to as SKL-Ag, SKL-SP, lnx4108, LNX-S)
 - **lnx7188.classe.cornell.edu**: [Intel Xeon Gold 6142 Processor](https://ark.intel.com/content/www/us/en/ark/products/120487/intel-xeon-gold-6142-processor-22m-cache-2-60-ghz.html) _Skylake Scalable Performance_ (referred to as lnx7188,LNX-G)
-- **GPUs**: to be filled out
 
 phi1, phi2, and phi3 are all managed across a virtual login server and therefore the home user spaces are shared. phi1, phi2, phi3, lnx7188, and lnx4108 also have /cvmfs mounted so you can source the environment needed to run the code.
 
@@ -420,7 +419,7 @@ Described in validation manifesto. See Section 8 for more info on manifesto.
 ### TO DO
 
 - flesh out sections as needed
-- GPU specific code
+- GPU specific code?
 
 ### Vestigial code