Skip to content

Commit

Permalink
Merge pull request #283 from srlantz/remove-old-cuda
Browse files Browse the repository at this point in the history
Remove old cuda
  • Loading branch information
dan131riley authored Nov 9, 2020
2 parents b6a67e6 + 0d3a44c commit 10c2aaf
Show file tree
Hide file tree
Showing 69 changed files with 72 additions and 6,107 deletions.
4 changes: 0 additions & 4 deletions BinInfoUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,29 +17,25 @@ typedef std::pair<int, int> BinInfo;
typedef std::vector<std::vector<BinInfo>> BinInfoLayerMap;
typedef std::vector<BinInfoLayerMap> BinInfoMap;

CUDA_CALLABLE
// Wrap an azimuthal angle downward: while phi >= Config::PI, subtract
// Config::TwoPI (project constants, presumably pi and 2*pi — confirm in Config.h).
// Result is strictly below Config::PI; values already below are returned unchanged.
inline float downPhi(float phi)
{
while (phi >= Config::PI) {phi-=Config::TwoPI;}
return phi;
}

CUDA_CALLABLE
// Wrap an azimuthal angle upward: while phi <= -Config::PI, add
// Config::TwoPI. Mirror of downPhi(); result is strictly above -Config::PI.
inline float upPhi(float phi)
{
while (phi <= -Config::PI) {phi+=Config::TwoPI;}
return phi;
}

CUDA_CALLABLE
// Normalize phi into (-Config::PI, Config::PI): delegates to downPhi() for
// values >= PI and upPhi() for values <= -PI; in-range values pass through.
inline float normalizedPhi(float phi)
{
// return std::fmod(phi, (float) Config::PI); // return phi +pi out of phase for |phi| beyond boundary!
if (std::abs(phi)>=Config::PI) {phi = (phi>0 ? downPhi(phi) : upPhi(phi));}
return phi;
}

CUDA_CALLABLE
inline int getPhiPartition(float phi)
{
//assume phi is between -PI and PI
Expand Down
5 changes: 1 addition & 4 deletions Config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,8 @@ namespace Config

// Multi threading and Clone engine configuration
int numThreadsFinder = 1;

// GPU computations
int numThreadsEvents = 1;
int numThreadsReorg = 1;


#if defined(__MIC__) || defined(__AVX512F__)
int numThreadsSimulation = 60;
#else
Expand Down
14 changes: 1 addition & 13 deletions Config.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,6 @@
#include <string> // won't compile on clang gcc for mac OS w/o this!
#include <map>

#if defined(__CUDACC__)
#define CUDA_CALLABLE __host__ __device__
#else
#define CUDA_CALLABLE
#endif

namespace mkfit {

// Cram this in here for now ...
Expand Down Expand Up @@ -259,7 +253,7 @@ namespace Config
// Config for Hit and BinInfoUtils
constexpr int nPhiPart = 1260;
constexpr float fPhiFactor = nPhiPart / TwoPI;
constexpr int nEtaPart = 11; // 1 is better for GPU best_hit
constexpr int nEtaPart = 11;
constexpr int nEtaBin = 2 * nEtaPart - 1;

constexpr float fEtaFull = 2 * Config::fEtaDet;
Expand Down Expand Up @@ -363,10 +357,7 @@ namespace Config
// Threading
extern int numThreadsFinder;
extern int numThreadsSimulation;

// For GPU computations
extern int numThreadsEvents;
extern int numThreadsReorg;

extern int finderReportBestOutOfN;

Expand Down Expand Up @@ -409,7 +400,6 @@ namespace Config

void RecalculateDependentConstants();

CUDA_CALLABLE
inline float BfieldFromZR(const float z, const float r)
{
return (Config::mag_b0*z*z + Config::mag_b1*z + Config::mag_c1)*(Config::mag_a*r*r + 1.f);
Expand All @@ -420,8 +410,6 @@ namespace Config
#ifndef MPT_SIZE
#if defined(__MIC__) || defined(__AVX512F__)
#define MPT_SIZE 16
#elif defined USE_CUDA
#define MPT_SIZE 8
#elif defined(__AVX__) || defined(__AVX2__)
#define MPT_SIZE 8
#else
Expand Down
6 changes: 0 additions & 6 deletions Hit.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,11 @@ inline float getInvRad2(float x, float y){
return 1.0f/(x*x + y*y);
}

CUDA_CALLABLE
// Azimuthal angle of the point (x, y) via std::atan2, in (-pi, pi].
inline float getPhi(float x, float y)
{
return std::atan2(y,x);
}

CUDA_CALLABLE
// Polar angle from transverse radius r and longitudinal coordinate z,
// measured from the z axis: atan2(r, z).
inline float getTheta(float r, float z){
return std::atan2(r,z);
}
Expand Down Expand Up @@ -201,10 +199,6 @@ class Hit

const float* posArray() const {return state_.pos_.Array();}
const float* errArray() const {return state_.err_.Array();}
#if __CUDACC__
__device__ float* posArrayCU();
__device__ float* errArrayCU();
#endif

// Non-const versions needed for CopyOut of Matriplex.
SVector3& parameters_nc() {return state_.pos_;}
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ distclean: clean-local

${LIB_CORE}: ${CORE_OBJS}
@mkdir -p $(@D)
${CXX} ${CXXFLAGS} ${VEC_HOST} ${CORE_OBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS_CU} ${LDFLAGS}
${CXX} ${CXXFLAGS} ${VEC_HOST} ${CORE_OBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS}

main: ${AUTO_TGTS} ${LIB_CORE} main.o
${CXX} ${CXXFLAGS} ${VEC_HOST} -o $@ main.o ${LDFLAGS_HOST} ${LDFLAGS} -Llib -lMicCore -Wl,-rpath,lib
Expand Down
33 changes: 0 additions & 33 deletions Makefile.config
Original file line number Diff line number Diff line change
Expand Up @@ -47,25 +47,6 @@ else ifdef OSXMPCLANG
TBB_PREFIX := /opt/local
endif

# 2.1 Use nvcc to compile cuda code
# Using the CUB library for standard GPU algorithm http://nvlabs.github.io/cub/
# It's header only and potentially exported by the environment
# Maybe it is good enough to have:
# CUBROOT?=Undefined
# CUDAINCDIR and CUDALIBDIR also need to be defined
ifneq (,$(realpath /home/ml15/tools/cub))
CUBROOT?=/home/ml15/tools/cub
else ifneq (,$(realpath /nfs/opt/cuda-8-0/include))
CUBROOT?=/nfs/opt/cuda-8-0/include
else ifneq (,$(realpath /usr/local/cuda/include))
CUBROOT?=/usr/local/cuda/include
endif
NV := nvcc -prec-sqrt=true -I${CUBROOT}
#-g -G -lineinfo
# Comment out to compile for CPU
#USE_CUDA := 1
# For CUDA: Also need to change maxCandsPerSeed to 8 and nEtaPart to 1

# 3. Optimization
# -O3 implies vectorization and simd (but not AVX)
OPT := -g -O3
Expand Down Expand Up @@ -143,20 +124,6 @@ CXXFLAGS := -fPIC ${OPT} ${OSX_CXXFLAGS}
LDFLAGS_HOST :=
LDFLAGS_MIC := -static-intel

ifdef USE_CUDA
CPPFLAGS += -DUSE_CUDA -I${CUBROOT} -I${CUDAINCDIR} #-g -G -lineinfo
LDFLAGS_HOST += -L${CUDALIBDIR}
ifeq ($(CXX),icpc)
CXXFLAGS += -qopenmp-simd
LDFLAGS += -qopenmp-simd
else
CXXFLAGS += -fopenmp-simd
LDFLAGS += -fopenmp-simd
endif
endif
#CXXFLAGS += -qopenmp
#LDFLAGS += -qopenmp

CPPFLAGS += ${USE_STATE_VALIDITY_CHECKS} ${USE_SCATTERING} ${USE_LINEAR_INTERPOLATION} ${ENDTOEND} ${INWARD_FIT}

ifdef USE_VTUNE_NOTIFY
Expand Down
46 changes: 45 additions & 1 deletion Matriplex/GenMul.pm
Original file line number Diff line number Diff line change
Expand Up @@ -777,6 +777,51 @@ sub dump_multiply_std_and_intrinsic
select FF;
}

print <<"FNORD";
#ifdef MPLEX_INTRINSICS
for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T))
{
FNORD

$S->multiply_intrinsic($a, $b, $c);

print <<"FNORD";
}
#else
#pragma omp simd
for (int n = 0; n < N; ++n)
{
FNORD

$S->multiply_standard($a, $b, $c);

print <<"FNORD";
}
#endif
FNORD

unless ($fname eq '-')
{
close FF;
select STDOUT;
}
}

# ----------------------------------------------------------------------

sub dump_multiply_std_and_intrinsic_and_gpu
{
my ($S, $fname, $a, $b, $c) = @_;

unless ($fname eq '-')
{
open FF, ">$fname";
select FF;
}

print <<"FNORD";
#ifndef __CUDACC__
#ifdef MPLEX_INTRINSICS
Expand Down Expand Up @@ -809,7 +854,6 @@ FNORD
#endif // __CUDACC__
FNORD


unless ($fname eq '-')
{
close FF;
Expand Down
2 changes: 0 additions & 2 deletions Matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,11 @@ inline double dtime()
return( tseconds );
}

CUDA_CALLABLE
// Hypotenuse sqrt(x*x + y*y). Note: unlike std::hypot, no overflow/underflow
// protection — fine for the detector-scale coordinates used here.
inline float hipo(float x, float y)
{
return std::sqrt(x*x + y*y);
}

CUDA_CALLABLE
inline void sincos4(const float x, float& sin, float& cos)
{
// Had this written with explicit division by factorial.
Expand Down
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
- **phi3.t2.ucsd.edu**: [Intel Xeon Gold 6130 Processor](https://ark.intel.com/products/120492/Intel-Xeon-Gold-6130-Processor-22M-Cache-2_10-GHz) _Skylake Scalable Performance_ (referred to as SKL-Au, SKL-SP, phi3)
- **lnx4108.classe.cornell.edu**: [Intel Xeon Silver 4116 Processor](https://ark.intel.com/products/120481/Intel-Xeon-Silver-4116-Processor-16_5M-Cache-2_10-GHz) _Skylake Scalable Performance_ (referred to as SKL-Ag, SKL-SP, lnx4108, LNX-S)
- **lnx7188.classe.cornell.edu**: [Intel Xeon Gold 6142 Processor](https://ark.intel.com/content/www/us/en/ark/products/120487/intel-xeon-gold-6142-processor-22m-cache-2-60-ghz.html) _Skylake Scalable Performance_ (referred to as lnx7188,LNX-G)
- **GPUs**: to be filled out

phi1, phi2, and phi3 are all managed across a virtual login server and therefore the home user spaces are shared. phi1, phi2, phi3, lnx7188, and lnx4108 also have /cvmfs mounted so you can source the environment needed to run the code.

Expand Down Expand Up @@ -420,7 +419,7 @@ Described in validation manifesto. See Section 8 for more info on manifesto.
### TO DO

- flesh out sections as needed
- GPU specific code
- GPU specific code?

### Vestigial code

Expand Down
Loading

0 comments on commit 10c2aaf

Please sign in to comment.