Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove old cuda #283

Merged
merged 4 commits into from
Nov 9, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions BinInfoUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,29 +17,25 @@ typedef std::pair<int, int> BinInfo;
typedef std::vector<std::vector<BinInfo>> BinInfoLayerMap;
typedef std::vector<BinInfoLayerMap> BinInfoMap;

CUDA_CALLABLE
// Lowers phi by whole turns (Config::TwoPI) until it is strictly below Config::PI.
// Values that are already below Config::PI are returned unchanged.
inline float downPhi(float phi)
{
  for (; phi >= Config::PI; phi -= Config::TwoPI)
  {
    // all the work happens in the loop header
  }
  return phi;
}

CUDA_CALLABLE
// Raises phi by whole turns (Config::TwoPI) until it is strictly above -Config::PI.
// Values that are already above -Config::PI are returned unchanged.
inline float upPhi(float phi)
{
  for (; phi <= -Config::PI; phi += Config::TwoPI)
  {
    // all the work happens in the loop header
  }
  return phi;
}

CUDA_CALLABLE
// Wraps an azimuthal angle back towards the principal range by whole turns.
// Angles with |phi| < Config::PI are returned untouched; positive out-of-range
// angles go through downPhi(), all others through upPhi().
inline float normalizedPhi(float phi)
{
  // Not implemented as std::fmod(phi, (float) Config::PI): per the original note,
  // that returns phi +pi out of phase for |phi| beyond the boundary.
  const bool outOfRange = std::abs(phi) >= Config::PI;
  if (!outOfRange)
  {
    return phi;
  }
  return (phi > 0) ? downPhi(phi) : upPhi(phi);
}

CUDA_CALLABLE
inline int getPhiPartition(float phi)
{
//assume phi is between -PI and PI
Expand Down
5 changes: 1 addition & 4 deletions Config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,8 @@ namespace Config

// Multi threading and Clone engine configuration
int numThreadsFinder = 1;

// GPU computations
int numThreadsEvents = 1;
int numThreadsReorg = 1;


#if defined(__MIC__) || defined(__AVX512F__)
int numThreadsSimulation = 60;
#else
Expand Down
14 changes: 1 addition & 13 deletions Config.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,6 @@
#include <string> // won't compile on clang gcc for mac OS w/o this!
#include <map>

// CUDA_CALLABLE is a function-declaration prefix used on inline helpers.
// When __CUDACC__ is defined it expands to `__host__ __device__`;
// otherwise it expands to nothing, leaving the declaration unchanged.
#if defined(__CUDACC__)
#define CUDA_CALLABLE __host__ __device__
#else
#define CUDA_CALLABLE
#endif

namespace mkfit {

// Cram this in here for now ...
Expand Down Expand Up @@ -259,7 +253,7 @@ namespace Config
// Config for Hit and BinInfoUtils
constexpr int nPhiPart = 1260;
constexpr float fPhiFactor = nPhiPart / TwoPI;
constexpr int nEtaPart = 11; // 1 is better for GPU best_hit
constexpr int nEtaPart = 11;
constexpr int nEtaBin = 2 * nEtaPart - 1;

constexpr float fEtaFull = 2 * Config::fEtaDet;
Expand Down Expand Up @@ -363,10 +357,7 @@ namespace Config
// Threading
extern int numThreadsFinder;
extern int numThreadsSimulation;

// For GPU computations
extern int numThreadsEvents;
extern int numThreadsReorg;

extern int finderReportBestOutOfN;

Expand Down Expand Up @@ -409,7 +400,6 @@ namespace Config

void RecalculateDependentConstants();

CUDA_CALLABLE
inline float BfieldFromZR(const float z, const float r)
{
return (Config::mag_b0*z*z + Config::mag_b1*z + Config::mag_c1)*(Config::mag_a*r*r + 1.f);
Expand All @@ -420,8 +410,6 @@ namespace Config
#ifndef MPT_SIZE
#if defined(__MIC__) || defined(__AVX512F__)
#define MPT_SIZE 16
#elif defined USE_CUDA
#define MPT_SIZE 8
#elif defined(__AVX__) || defined(__AVX2__)
#define MPT_SIZE 8
#else
Expand Down
6 changes: 0 additions & 6 deletions Hit.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,11 @@ inline float getInvRad2(float x, float y){
return 1.0f/(x*x + y*y);
}

CUDA_CALLABLE
// Returns the angle of the point (x, y) as computed by std::atan2(y, x), in radians.
inline float getPhi(float x, float y)
{
  const float phi = std::atan2(y, x);
  return phi;
}

CUDA_CALLABLE
// Returns the angle computed by std::atan2(r, z), in radians.
inline float getTheta(float r, float z)
{
  const float theta = std::atan2(r, z);
  return theta;
}
Expand Down Expand Up @@ -201,10 +199,6 @@ class Hit

const float* posArray() const {return state_.pos_.Array();}
const float* errArray() const {return state_.err_.Array();}
#if __CUDACC__
__device__ float* posArrayCU();
__device__ float* errArrayCU();
#endif

// Non-const versions needed for CopyOut of Matriplex.
SVector3& parameters_nc() {return state_.pos_;}
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ distclean: clean-local

${LIB_CORE}: ${CORE_OBJS}
@mkdir -p $(@D)
${CXX} ${CXXFLAGS} ${VEC_HOST} ${CORE_OBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS_CU} ${LDFLAGS}
${CXX} ${CXXFLAGS} ${VEC_HOST} ${CORE_OBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS}

main: ${AUTO_TGTS} ${LIB_CORE} main.o
${CXX} ${CXXFLAGS} ${VEC_HOST} -o $@ main.o ${LDFLAGS_HOST} ${LDFLAGS} -Llib -lMicCore -Wl,-rpath,lib
Expand Down
33 changes: 0 additions & 33 deletions Makefile.config
Original file line number Diff line number Diff line change
Expand Up @@ -47,25 +47,6 @@ else ifdef OSXMPCLANG
TBB_PREFIX := /opt/local
endif

# 2.1 Use nvcc to compile cuda code
# Using the CUB library for standard GPU algorithm http://nvlabs.github.io/cub/
# It's header only and potentially exported by the environment
# Maybe it is good enough to have:
# CUBROOT?=Undefined
# CUDAINCDIR and CUDALIBDIR also need to be defined
ifneq (,$(realpath /home/ml15/tools/cub))
CUBROOT?=/home/ml15/tools/cub
else ifneq (,$(realpath /nfs/opt/cuda-8-0/include))
CUBROOT?=/nfs/opt/cuda-8-0/include
else ifneq (,$(realpath /usr/local/cuda/include))
CUBROOT?=/usr/local/cuda/include
endif
NV := nvcc -prec-sqrt=true -I${CUBROOT}
#-g -G -lineinfo
# Comment out to compile for CPU
#USE_CUDA := 1
# For CUDA: Also need to change maxCandsPerSeed to 8 and nEtaPart to 1

# 3. Optimization
# -O3 implies vectorization and simd (but not AVX)
OPT := -g -O3
Expand Down Expand Up @@ -143,20 +124,6 @@ CXXFLAGS := -fPIC ${OPT} ${OSX_CXXFLAGS}
LDFLAGS_HOST :=
LDFLAGS_MIC := -static-intel

ifdef USE_CUDA
CPPFLAGS += -DUSE_CUDA -I${CUBROOT} -I${CUDAINCDIR} #-g -G -lineinfo
LDFLAGS_HOST += -L${CUDALIBDIR}
ifeq ($(CXX),icpc)
CXXFLAGS += -qopenmp-simd
LDFLAGS += -qopenmp-simd
else
CXXFLAGS += -fopenmp-simd
LDFLAGS += -fopenmp-simd
endif
endif
#CXXFLAGS += -qopenmp
#LDFLAGS += -qopenmp

CPPFLAGS += ${USE_STATE_VALIDITY_CHECKS} ${USE_SCATTERING} ${USE_LINEAR_INTERPOLATION} ${ENDTOEND} ${INWARD_FIT}

ifdef USE_VTUNE_NOTIFY
Expand Down
46 changes: 45 additions & 1 deletion Matriplex/GenMul.pm
Original file line number Diff line number Diff line change
Expand Up @@ -777,6 +777,51 @@ sub dump_multiply_std_and_intrinsic
select FF;
}

print <<"FNORD";
#ifdef MPLEX_INTRINSICS

for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T))
{
FNORD

$S->multiply_intrinsic($a, $b, $c);

print <<"FNORD";
}

#else

#pragma omp simd
for (int n = 0; n < N; ++n)
{
FNORD

$S->multiply_standard($a, $b, $c);

print <<"FNORD";
}
#endif
FNORD

unless ($fname eq '-')
{
close FF;
select STDOUT;
}
}

# ----------------------------------------------------------------------

sub dump_multiply_std_and_intrinsic_and_gpu
{
my ($S, $fname, $a, $b, $c) = @_;

unless ($fname eq '-')
{
open FF, ">$fname";
select FF;
}

print <<"FNORD";
#ifndef __CUDACC__
#ifdef MPLEX_INTRINSICS
Expand Down Expand Up @@ -809,7 +854,6 @@ FNORD
#endif // __CUDACC__
FNORD


unless ($fname eq '-')
{
close FF;
Expand Down
2 changes: 0 additions & 2 deletions Matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,11 @@ inline double dtime()
return( tseconds );
}

CUDA_CALLABLE
// Returns sqrt(x^2 + y^2), i.e. the length of the 2D vector (x, y).
inline float hipo(float x, float y)
{
  const float sumOfSquares = x*x + y*y;
  return std::sqrt(sumOfSquares);
}

CUDA_CALLABLE
inline void sincos4(const float x, float& sin, float& cos)
{
// Had this written with explicit division by factorial.
Expand Down
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
- **phi3.t2.ucsd.edu**: [Intel Xeon Gold 6130 Processor](https://ark.intel.com/products/120492/Intel-Xeon-Gold-6130-Processor-22M-Cache-2_10-GHz) _Skylake Scalable Performance_ (referred to as SKL-Au, SKL-SP, phi3)
- **lnx4108.classe.cornell.edu**: [Intel Xeon Silver 4116 Processor](https://ark.intel.com/products/120481/Intel-Xeon-Silver-4116-Processor-16_5M-Cache-2_10-GHz) _Skylake Scalable Performance_ (referred to as SKL-Ag, SKL-SP, lnx4108, LNX-S)
- **lnx7188.classe.cornell.edu**: [Intel Xeon Gold 6142 Processor](https://ark.intel.com/content/www/us/en/ark/products/120487/intel-xeon-gold-6142-processor-22m-cache-2-60-ghz.html) _Skylake Scalable Performance_ (referred to as lnx7188,LNX-G)
- **GPUs**: to be filled out

phi1, phi2, and phi3 are all managed across a virtual login server and therefore the home user spaces are shared. phi1, phi2, phi3, lnx7188, and lnx4108 also have /cvmfs mounted so you can source the environment needed to run the code.

Expand Down Expand Up @@ -420,7 +419,7 @@ Described in validation manifesto. See Section 8 for more info on manifesto.
### TO DO

- flesh out sections as needed
- GPU specific code
- GPU specific code?

### Vestigial code

Expand Down
Loading