Skip to content

Commit

Permalink
comment out all omp refs but simd, change icc(gcc) flag to -q(f)openm…
Browse files Browse the repository at this point in the history
…p-simd
  • Loading branch information
srlantz committed Oct 9, 2020
1 parent b767b4d commit 7bfd16f
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 30 deletions.
12 changes: 6 additions & 6 deletions Makefile.config
Original file line number Diff line number Diff line change
Expand Up @@ -147,11 +147,11 @@ ifdef USE_CUDA
CPPFLAGS += -DUSE_CUDA -I${CUBROOT} -I${CUDAINCDIR} #-g -G -lineinfo
LDFLAGS_HOST += -L${CUDALIBDIR}
ifeq ($(CXX),icpc)
CXXFLAGS += -qopenmp
LDFLAGS += -qopenmp
CXXFLAGS += -qopenmp-simd
LDFLAGS += -qopenmp-simd
else
CXXFLAGS += -fopenmp
LDFLAGS += -fopenmp
CXXFLAGS += -fopenmp-simd
LDFLAGS += -fopenmp-simd
endif
endif
#CXXFLAGS += -qopenmp
Expand All @@ -174,14 +174,14 @@ endif

ifeq (${CXX}, ${ICC})
VEC_HOST := ${VEC_ICC}
CXXFLAGS += -qopt-report=5 -qopenmp -qopt-report-phase=all
CXXFLAGS += -qopt-report=5 -qopenmp-simd -qopt-report-phase=all
else
VEC_HOST := ${VEC_GCC}
endif

ifeq ($(CXX), g++)
CXXFLAGS += -std=c++1z -ftree-vectorize -Werror=main -Werror=pointer-arith -Werror=overlength-strings -Wno-vla -Werror=overflow -Wstrict-overflow -Werror=array-bounds -Werror=format-contains-nul -Werror=type-limits -fvisibility-inlines-hidden -fno-math-errno --param vect-max-version-for-alias-checks=50 -Xassembler --compress-debug-sections -felide-constructors -fmessage-length=0 -Wall -Wno-non-template-friend -Wno-long-long -Wreturn-type -Wunused -Wparentheses -Wno-deprecated -Werror=return-type -Werror=missing-braces -Werror=unused-value -Werror=address -Werror=format -Werror=sign-compare -Werror=write-strings -Werror=delete-non-virtual-dtor -Wstrict-aliasing -Werror=narrowing -Werror=unused-but-set-variable -Werror=reorder -Werror=unused-variable -Werror=conversion-null -Werror=return-local-addr -Wnon-virtual-dtor -Werror=switch -fdiagnostics-show-option -Wno-unused-local-typedefs -Wno-attributes -Wno-psabi
CXXFLAGS += -fdiagnostics-color=always -fdiagnostics-show-option -pthread -pipe -fopenmp
CXXFLAGS += -fdiagnostics-color=always -fdiagnostics-show-option -pthread -pipe -fopenmp-simd
endif

# Try to find a new enough TBB
Expand Down
4 changes: 2 additions & 2 deletions mkFit/FitterCU-imp.h
Original file line number Diff line number Diff line change
Expand Up @@ -392,8 +392,8 @@ void FitterCU<T>::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC,
#if 0
double time_input = dtime();
int itrack;
omp_set_num_threads(Config::numThreadsReorg);
#pragma omp parallel for
//omp_set_num_threads(Config::numThreadsReorg);
//#pragma omp parallel for
for (int i = beg; i < end; ++i) {
itrack = i - beg;
Track &trk = tracks[i];
Expand Down
2 changes: 1 addition & 1 deletion mkFit/FitterCU.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#include "index_selection_kernels.h"
#include "best_hit_kernels.h"

#include <omp.h>
//#include <omp.h>
#include <stdexcept>

#define BLOCK_SIZE_X 256
Expand Down
21 changes: 9 additions & 12 deletions mkFit/MkFitter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,14 @@ void MkFitter::InputTracksAndHits(const std::vector<Track>& tracks,
int itrack = 0;

// FIXME: uncomment when track building is ported to GPU.
#if USE_CUDA_NOT_YET
//#ifdef USE_CUDA
// This openmp loop brings some performances when using
// a single thread to fit all events.
// However, it is more advantageous to use the threads to
// parallelize over Events.
omp_set_num_threads(Config::numThreadsReorg);
#pragma omp parallel for private(itrack)
#endif
// omp_set_num_threads(Config::numThreadsReorg);
//#pragma omp parallel for private(itrack)
//#endif
for (int i = beg; i < end; ++i, ++itrack)
{
const Track &trk = tracks[i];
Expand Down Expand Up @@ -121,14 +120,13 @@ void MkFitter::InputTracksAndHits(const std::vector<Track>& tracks,

int itrack;
//#ifdef USE_CUDA
#if 0
// This openmp loop brings some performances when using
// a single thread to fit all events.
// However, it is more advantageous to use the threads to
// parallelize over Events.
omp_set_num_threads(Config::numThreadsReorg);
#pragma omp parallel for private(itrack)
#endif
// omp_set_num_threads(Config::numThreadsReorg);
//#pragma omp parallel for private(itrack)
//#endif
for (int i = beg; i < end; ++i) {
itrack = i - beg;
const Track &trk = tracks[i];
Expand Down Expand Up @@ -173,14 +171,13 @@ void MkFitter::SlurpInTracksAndHits(const std::vector<Track>& tracks,
MatriplexTrackPacker mtp(tracks[beg]);

//#ifdef USE_CUDA
#if 0
// This openmp loop brings some performances when using
// a single thread to fit all events.
// However, it is more advantageous to use the threads to
// parallelize over Events.
omp_set_num_threads(Config::numThreadsReorg);
#pragma omp parallel for private(itrack)
#endif
// omp_set_num_threads(Config::numThreadsReorg);
//#pragma omp parallel for private(itrack)
//#endif
for (int i = beg; i < end; ++i)
{
int itrack = i - beg;
Expand Down
21 changes: 12 additions & 9 deletions mkFit/fittestMPlex.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#if USE_CUDA
#include "fittestMPlex.h"
#include "FitterCU.h"
#include <omp.h>
//#include <omp.h>
#endif

#ifndef NO_ROOT
Expand Down Expand Up @@ -142,15 +142,18 @@ void runAllEventsFittingTestPlexGPU(std::vector<Event>& events)
#endif
separate_first_call_for_meaningful_profiling_numbers();

// Reorgnanization (copyIn) can eventually be multithreaded.
omp_set_nested(1);
// Reorganization (copyIn) can eventually be multithreaded.
// FIXME: revisit multithreading when track building is ported to GPU.
// omp_set_nested(1);

omp_set_num_threads(Config::numThreadsEvents);
// omp_set_num_threads(Config::numThreadsEvents);
double total_gpu_time = dtime();
#pragma omp parallel reduction(+:s_tmp)
//#pragma omp parallel reduction(+:s_tmp)
{
int numThreadsEvents = omp_get_num_threads();
int thr_idx = omp_get_thread_num();
// int numThreadsEvents = omp_get_num_threads();
// int thr_idx = omp_get_thread_num();
int numThreadsEvents = 1;
int thr_idx = 0;

// FitterCU is declared here to share allocations and deallocations
// between the multiple events processed by a single thread.
Expand All @@ -177,11 +180,11 @@ void runAllEventsFittingTestPlexGPU(std::vector<Event>& events)
#if 0 // 0 for timing, 1 for validation
// Validation crashes for multiple threads.
// It is something in relation to ROOT. Not sure what.
if (omp_get_num_threads() <= 1) {
//if (omp_get_num_threads() <= 1) {
//if (g_run_fit_std) {
std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root";
//}
}
//}
#endif
}
cuFitter.free_extra_addBestHit();
Expand Down

0 comments on commit 7bfd16f

Please sign in to comment.