From 7bfd16f26cb80d8337f8665bcdfe88796b2be403 Mon Sep 17 00:00:00 2001 From: Steve Lantz Date: Wed, 7 Oct 2020 09:30:07 -0700 Subject: [PATCH] comment out all omp refs but simd, change icc(gcc) flag to -q(f)openmp-simd --- Makefile.config | 12 ++++++------ mkFit/FitterCU-imp.h | 4 ++-- mkFit/FitterCU.h | 2 +- mkFit/MkFitter.cc | 21 +++++++++------------ mkFit/fittestMPlex.cc | 21 ++++++++++++--------- 5 files changed, 30 insertions(+), 30 deletions(-) diff --git a/Makefile.config b/Makefile.config index 0192e1dd..886bd61b 100644 --- a/Makefile.config +++ b/Makefile.config @@ -147,11 +147,11 @@ ifdef USE_CUDA CPPFLAGS += -DUSE_CUDA -I${CUBROOT} -I${CUDAINCDIR} #-g -G -lineinfo LDFLAGS_HOST += -L${CUDALIBDIR} ifeq ($(CXX),icpc) - CXXFLAGS += -qopenmp - LDFLAGS += -qopenmp + CXXFLAGS += -qopenmp-simd + LDFLAGS += -qopenmp-simd else - CXXFLAGS += -fopenmp - LDFLAGS += -fopenmp + CXXFLAGS += -fopenmp-simd + LDFLAGS += -fopenmp-simd endif endif #CXXFLAGS += -qopenmp @@ -174,14 +174,14 @@ endif ifeq (${CXX}, ${ICC}) VEC_HOST := ${VEC_ICC} - CXXFLAGS += -qopt-report=5 -qopenmp -qopt-report-phase=all + CXXFLAGS += -qopt-report=5 -qopenmp-simd -qopt-report-phase=all else VEC_HOST := ${VEC_GCC} endif ifeq ($(CXX), g++) CXXFLAGS += -std=c++1z -ftree-vectorize -Werror=main -Werror=pointer-arith -Werror=overlength-strings -Wno-vla -Werror=overflow -Wstrict-overflow -Werror=array-bounds -Werror=format-contains-nul -Werror=type-limits -fvisibility-inlines-hidden -fno-math-errno --param vect-max-version-for-alias-checks=50 -Xassembler --compress-debug-sections -felide-constructors -fmessage-length=0 -Wall -Wno-non-template-friend -Wno-long-long -Wreturn-type -Wunused -Wparentheses -Wno-deprecated -Werror=return-type -Werror=missing-braces -Werror=unused-value -Werror=address -Werror=format -Werror=sign-compare -Werror=write-strings -Werror=delete-non-virtual-dtor -Wstrict-aliasing -Werror=narrowing -Werror=unused-but-set-variable -Werror=reorder -Werror=unused-variable -Werror=conversion-null -Werror=return-local-addr -Wnon-virtual-dtor -Werror=switch -fdiagnostics-show-option -Wno-unused-local-typedefs -Wno-attributes -Wno-psabi - CXXFLAGS += -fdiagnostics-color=always -fdiagnostics-show-option -pthread -pipe -fopenmp + CXXFLAGS += -fdiagnostics-color=always -fdiagnostics-show-option -pthread -pipe -fopenmp-simd endif # Try to find a new enough TBB diff --git a/mkFit/FitterCU-imp.h b/mkFit/FitterCU-imp.h index ffefc6ce..db1658c4 100644 --- a/mkFit/FitterCU-imp.h +++ b/mkFit/FitterCU-imp.h @@ -392,8 +392,8 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, #if 0 double time_input = dtime(); int itrack; - omp_set_num_threads(Config::numThreadsReorg); -#pragma omp parallel for + //omp_set_num_threads(Config::numThreadsReorg); +//#pragma omp parallel for for (int i = beg; i < end; ++i) { itrack = i - beg; Track &trk = tracks[i]; diff --git a/mkFit/FitterCU.h b/mkFit/FitterCU.h index 3dc26039..1ed9d5b9 100644 --- a/mkFit/FitterCU.h +++ b/mkFit/FitterCU.h @@ -16,7 +16,7 @@ #include "index_selection_kernels.h" #include "best_hit_kernels.h" -#include +//#include #include #define BLOCK_SIZE_X 256 diff --git a/mkFit/MkFitter.cc b/mkFit/MkFitter.cc index 69a63f78..3d4c14bd 100644 --- a/mkFit/MkFitter.cc +++ b/mkFit/MkFitter.cc @@ -69,15 +69,14 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, int itrack = 0; // FIXME: uncomment when track building is ported to GPU. -#if USE_CUDA_NOT_YET //#ifdef USE_CUDA // This openmp loop brings some performances when using // a single thread to fit all events. // However, it is more advantageous to use the threads to // parallelize over Events. - omp_set_num_threads(Config::numThreadsReorg); -#pragma omp parallel for private(itrack) -#endif +// omp_set_num_threads(Config::numThreadsReorg); +//#pragma omp parallel for private(itrack) +//#endif for (int i = beg; i < end; ++i, ++itrack) { const Track &trk = tracks[i]; @@ -121,14 +120,13 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, int itrack; //#ifdef USE_CUDA -#if 0 // This openmp loop brings some performances when using // a single thread to fit all events. // However, it is more advantageous to use the threads to // parallelize over Events. - omp_set_num_threads(Config::numThreadsReorg); -#pragma omp parallel for private(itrack) -#endif +// omp_set_num_threads(Config::numThreadsReorg); +//#pragma omp parallel for private(itrack) +//#endif for (int i = beg; i < end; ++i) { itrack = i - beg; const Track &trk = tracks[i]; @@ -173,14 +171,13 @@ void MkFitter::SlurpInTracksAndHits(const std::vector& tracks, MatriplexTrackPacker mtp(tracks[beg]); //#ifdef USE_CUDA -#if 0 // This openmp loop brings some performances when using // a single thread to fit all events. // However, it is more advantageous to use the threads to // parallelize over Events. - omp_set_num_threads(Config::numThreadsReorg); -#pragma omp parallel for private(itrack) -#endif +// omp_set_num_threads(Config::numThreadsReorg); +//#pragma omp parallel for private(itrack) +//#endif for (int i = beg; i < end; ++i) { int itrack = i - beg; diff --git a/mkFit/fittestMPlex.cc b/mkFit/fittestMPlex.cc index cb5b2ba7..7c81e226 100644 --- a/mkFit/fittestMPlex.cc +++ b/mkFit/fittestMPlex.cc @@ -12,7 +12,7 @@ #if USE_CUDA #include "fittestMPlex.h" #include "FitterCU.h" -#include +//#include #endif #ifndef NO_ROOT @@ -142,15 +142,18 @@ void runAllEventsFittingTestPlexGPU(std::vector& events) #endif separate_first_call_for_meaningful_profiling_numbers(); - // Reorgnanization (copyIn) can eventually be multithreaded. - omp_set_nested(1); + // Reorganization (copyIn) can eventually be multithreaded. +// FIXME: revisit multithreading when track building is ported to GPU. +// omp_set_nested(1); - omp_set_num_threads(Config::numThreadsEvents); +// omp_set_num_threads(Config::numThreadsEvents); double total_gpu_time = dtime(); -#pragma omp parallel reduction(+:s_tmp) +//#pragma omp parallel reduction(+:s_tmp) { - int numThreadsEvents = omp_get_num_threads(); - int thr_idx = omp_get_thread_num(); +// int numThreadsEvents = omp_get_num_threads(); +// int thr_idx = omp_get_thread_num(); + int numThreadsEvents = 1; + int thr_idx = 0; // FitterCU is declared here to share allocations and deallocations // between the multiple events processed by a single thread. @@ -177,11 +180,11 @@ void runAllEventsFittingTestPlexGPU(std::vector& events) #if 0 // 0 for timing, 1 for validation // Validation crashes for multiple threads. // It is something in relation to ROOT. Not sure what. - if (omp_get_num_threads() <= 1) { + //if (omp_get_num_threads() <= 1) { //if (g_run_fit_std) { std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root"; //} - } + //} #endif } cuFitter.free_extra_addBestHit();