From 7bfd16f26cb80d8337f8665bcdfe88796b2be403 Mon Sep 17 00:00:00 2001
From: Steve Lantz <steve.lantz@cornell.edu>
Date: Wed, 7 Oct 2020 09:30:07 -0700
Subject: [PATCH] comment out all OpenMP refs except simd; change icc (gcc) flag
 to -qopenmp-simd (-fopenmp-simd)

---
 Makefile.config       | 12 ++++++------
 mkFit/FitterCU-imp.h  |  4 ++--
 mkFit/FitterCU.h      |  2 +-
 mkFit/MkFitter.cc     | 21 +++++++++------------
 mkFit/fittestMPlex.cc | 21 ++++++++++++---------
 5 files changed, 30 insertions(+), 30 deletions(-)
diff --git a/Makefile.config b/Makefile.config
index 0192e1dd..886bd61b 100644
--- a/Makefile.config
+++ b/Makefile.config
@@ -147,11 +147,11 @@ ifdef USE_CUDA
 	CPPFLAGS += -DUSE_CUDA -I${CUBROOT} -I${CUDAINCDIR} #-g -G -lineinfo
 	LDFLAGS_HOST += -L${CUDALIBDIR}
 	ifeq ($(CXX),icpc)
-	  CXXFLAGS += -qopenmp
-	  LDFLAGS  += -qopenmp
+	  CXXFLAGS += -qopenmp-simd
+	  LDFLAGS  += -qopenmp-simd
 	else
-	  CXXFLAGS += -fopenmp
-	  LDFLAGS  += -fopenmp
+	  CXXFLAGS += -fopenmp-simd
+	  LDFLAGS  += -fopenmp-simd
 	endif
 endif
 #CXXFLAGS += -qopenmp
@@ -174,14 +174,14 @@ endif
 
 ifeq (${CXX}, ${ICC})
   VEC_HOST := ${VEC_ICC}
-  CXXFLAGS += -qopt-report=5 -qopenmp -qopt-report-phase=all
+  CXXFLAGS += -qopt-report=5 -qopenmp-simd -qopt-report-phase=all
 else
   VEC_HOST := ${VEC_GCC}
 endif
 
 ifeq ($(CXX), g++)
   CXXFLAGS += -std=c++1z -ftree-vectorize -Werror=main -Werror=pointer-arith -Werror=overlength-strings -Wno-vla -Werror=overflow -Wstrict-overflow -Werror=array-bounds -Werror=format-contains-nul -Werror=type-limits -fvisibility-inlines-hidden -fno-math-errno --param vect-max-version-for-alias-checks=50 -Xassembler --compress-debug-sections -felide-constructors -fmessage-length=0 -Wall -Wno-non-template-friend -Wno-long-long -Wreturn-type -Wunused -Wparentheses -Wno-deprecated -Werror=return-type -Werror=missing-braces -Werror=unused-value -Werror=address -Werror=format -Werror=sign-compare -Werror=write-strings -Werror=delete-non-virtual-dtor -Wstrict-aliasing -Werror=narrowing -Werror=unused-but-set-variable -Werror=reorder -Werror=unused-variable -Werror=conversion-null -Werror=return-local-addr -Wnon-virtual-dtor -Werror=switch -fdiagnostics-show-option -Wno-unused-local-typedefs -Wno-attributes -Wno-psabi
-  CXXFLAGS += -fdiagnostics-color=always -fdiagnostics-show-option -pthread -pipe -fopenmp
+  CXXFLAGS += -fdiagnostics-color=always -fdiagnostics-show-option -pthread -pipe -fopenmp-simd
 endif
 
 # Try to find a new enough TBB
diff --git a/mkFit/FitterCU-imp.h b/mkFit/FitterCU-imp.h
index ffefc6ce..db1658c4 100644
--- a/mkFit/FitterCU-imp.h
+++ b/mkFit/FitterCU-imp.h
@@ -392,8 +392,8 @@ void FitterCU<T>::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC,
 #if 0
     double time_input = dtime();
     int itrack;
-    omp_set_num_threads(Config::numThreadsReorg);
-#pragma omp parallel for
+    //omp_set_num_threads(Config::numThreadsReorg);
+//#pragma omp parallel for
     for (int i = beg; i < end; ++i) {
       itrack = i - beg;
       Track &trk = tracks[i];
diff --git a/mkFit/FitterCU.h b/mkFit/FitterCU.h
index 3dc26039..1ed9d5b9 100644
--- a/mkFit/FitterCU.h
+++ b/mkFit/FitterCU.h
@@ -16,7 +16,7 @@
 #include "index_selection_kernels.h"
 #include "best_hit_kernels.h"
 
-#include <omp.h>
+//#include <omp.h>
 #include <stdexcept>
 
 #define BLOCK_SIZE_X 256
diff --git a/mkFit/MkFitter.cc b/mkFit/MkFitter.cc
index 69a63f78..3d4c14bd 100644
--- a/mkFit/MkFitter.cc
+++ b/mkFit/MkFitter.cc
@@ -69,15 +69,14 @@ void MkFitter::InputTracksAndHits(const std::vector<Track>&  tracks,
   int itrack = 0;
 
 // FIXME: uncomment when track building is ported to GPU.
-#if USE_CUDA_NOT_YET
 //#ifdef USE_CUDA
   // This openmp loop brings some performances when using
   // a single thread to fit all events.
   // However, it is more advantageous to use the threads to
   // parallelize over Events.
-  omp_set_num_threads(Config::numThreadsReorg);
-#pragma omp parallel for private(itrack)
-#endif
+//  omp_set_num_threads(Config::numThreadsReorg);
+//#pragma omp parallel for private(itrack)
+//#endif
   for (int i = beg; i < end; ++i, ++itrack)
   {
     const Track &trk = tracks[i];
@@ -121,14 +120,13 @@ void MkFitter::InputTracksAndHits(const std::vector<Track>&  tracks,
 
   int itrack;
 //#ifdef USE_CUDA
-#if 0
   // This openmp loop brings some performances when using
   // a single thread to fit all events.
   // However, it is more advantageous to use the threads to
   // parallelize over Events.
-  omp_set_num_threads(Config::numThreadsReorg);
-#pragma omp parallel for private(itrack)
-#endif
+//  omp_set_num_threads(Config::numThreadsReorg);
+//#pragma omp parallel for private(itrack)
+//#endif
   for (int i = beg; i < end; ++i) {
     itrack = i - beg;
     const Track &trk = tracks[i];
@@ -173,14 +171,13 @@ void MkFitter::SlurpInTracksAndHits(const std::vector<Track>&  tracks,
   MatriplexTrackPacker mtp(tracks[beg]);
 
 //#ifdef USE_CUDA
-#if 0
   // This openmp loop brings some performances when using
   // a single thread to fit all events.
   // However, it is more advantageous to use the threads to
   // parallelize over Events.
-  omp_set_num_threads(Config::numThreadsReorg);
-#pragma omp parallel for private(itrack)
-#endif
+//  omp_set_num_threads(Config::numThreadsReorg);
+//#pragma omp parallel for private(itrack)
+//#endif
   for (int i = beg; i < end; ++i)
   {
     int itrack = i - beg;
diff --git a/mkFit/fittestMPlex.cc b/mkFit/fittestMPlex.cc
index cb5b2ba7..7c81e226 100644
--- a/mkFit/fittestMPlex.cc
+++ b/mkFit/fittestMPlex.cc
@@ -12,7 +12,7 @@
 #if USE_CUDA
 #include "fittestMPlex.h"
 #include "FitterCU.h"
-#include <omp.h>
+//#include <omp.h>
 #endif
 
 #ifndef NO_ROOT
@@ -142,15 +142,18 @@ void runAllEventsFittingTestPlexGPU(std::vector<Event>& events)
 #endif
   separate_first_call_for_meaningful_profiling_numbers();
 
-  // Reorgnanization (copyIn) can eventually be multithreaded.
-  omp_set_nested(1);
+  // Reorganization (copyIn) can eventually be multithreaded.
+// FIXME: revisit multithreading when track building is ported to GPU.
+//  omp_set_nested(1);
       
-  omp_set_num_threads(Config::numThreadsEvents);
+//  omp_set_num_threads(Config::numThreadsEvents);
   double total_gpu_time = dtime();
-#pragma omp parallel reduction(+:s_tmp)
+//#pragma omp parallel reduction(+:s_tmp)
   {
-  int numThreadsEvents = omp_get_num_threads();
-  int thr_idx = omp_get_thread_num();
+//  int numThreadsEvents = omp_get_num_threads();
+//  int thr_idx = omp_get_thread_num();
+  int numThreadsEvents = 1;
+  int thr_idx = 0;
 
   // FitterCU is declared here to share allocations and deallocations
   // between the multiple events processed by a single thread.
@@ -177,11 +180,11 @@ void runAllEventsFittingTestPlexGPU(std::vector<Event>& events)
 #if 0  // 0 for timing, 1 for validation
       // Validation crashes for multiple threads.
       // It is something in relation to ROOT. Not sure what. 
-      if (omp_get_num_threads() <= 1) {
+      //if (omp_get_num_threads() <= 1) {
         //if (g_run_fit_std) {
           std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root";
         //}
-      }
+      //}
 #endif
     }
     cuFitter.free_extra_addBestHit();