From 4a753a65d505f3f9e4c5c428dd2c03fd3433f830 Mon Sep 17 00:00:00 2001 From: Steve Lantz Date: Wed, 28 Oct 2020 18:06:35 -0700 Subject: [PATCH 1/3] remove all deprecated GPU code --- BinInfoUtils.h | 4 - Config.cc | 5 +- Config.h | 14 +- Hit.h | 6 - Makefile | 2 +- Makefile.config | 33 -- Matrix.h | 2 - README.md | 3 +- Track.h | 46 +-- gpu_scripts/gpu_setup.py | 76 ----- gpu_scripts/runBenchmark_cuda.py | 36 -- gpu_scripts/run_cleanup_only.py | 26 -- gpu_scripts/run_setup_only.py | 29 -- mkFit/BuilderCU.cu | 111 ------- mkFit/BuilderCU.h | 55 ---- mkFit/FitterCU-imp.h | 446 ------------------------- mkFit/FitterCU.h | 149 --------- mkFit/GPlex.h | 149 --------- mkFit/GeometryCU.h | 30 -- mkFit/HitStructuresCU.cu | 394 ---------------------- mkFit/HitStructuresCU.h | 211 ------------ mkFit/KalmanUtilsMPlex.h | 4 - mkFit/KalmanUtilsMPlex.icc | 23 -- mkFit/Makefile | 35 +- mkFit/MkBuilder.h | 4 - mkFit/MkFitter.cc | 52 +-- mkFit/MkFitter.h | 5 - mkFit/PropagationMPlex.icc | 3 - mkFit/accessors_cu.h | 40 --- mkFit/array_algorithms_cu.h | 86 ----- mkFit/atomic_utils.h | 102 ------ mkFit/best_hit_kernels.cu | 298 ----------------- mkFit/best_hit_kernels.h | 51 --- mkFit/build_tracks_kernels.cu | 203 ------------ mkFit/build_tracks_kernels.h | 35 -- mkFit/buildtestMPlex.cc | 170 ---------- mkFit/buildtestMPlex.h | 11 - mkFit/check_gpu_hit_structures.cu | 144 -------- mkFit/check_gpu_hit_structures.h | 12 - mkFit/clone_engine_cuda_helpers.cu | 46 --- mkFit/clone_engine_cuda_helpers.h | 69 ---- mkFit/clone_engine_kernels.cu | 258 --------------- mkFit/clone_engine_kernels.h | 26 -- mkFit/clone_engine_kernels_seed.cu | 182 ---------- mkFit/clone_engine_kernels_seed.h | 26 -- mkFit/computeChi2_kernels.cu | 144 -------- mkFit/computeChi2_kernels.h | 35 -- mkFit/device_array_view.h | 46 --- mkFit/device_bundle.h | 88 ----- mkFit/device_vector.h | 118 ------- mkFit/fittestMPlex.cc | 106 ------ mkFit/fittestMPlex.h | 9 - mkFit/fittracks_kernels.cu | 51 --- mkFit/fittracks_kernels.h | 17 - mkFit/gpu_utils.cu | 9 - mkFit/gpu_utils.h | 63 ---- mkFit/index_selection_kernels.cu | 153 --------- mkFit/index_selection_kernels.h | 15 - mkFit/kalmanUpdater_kernels.cu | 402 ----------------------- mkFit/kalmanUpdater_kernels.h | 38 --- mkFit/mkFit.cc | 105 +----- mkFit/propagation_kernels.cu | 316 ------------------ mkFit/propagation_kernels.h | 36 -- mkFit/reorganize_gplex.cu | 510 ----------------------------- mkFit/reorganize_gplex.h | 126 ------- 65 files changed, 26 insertions(+), 6073 deletions(-) delete mode 100644 gpu_scripts/gpu_setup.py delete mode 100755 gpu_scripts/runBenchmark_cuda.py delete mode 100755 gpu_scripts/run_cleanup_only.py delete mode 100755 gpu_scripts/run_setup_only.py delete mode 100644 mkFit/BuilderCU.cu delete mode 100644 mkFit/BuilderCU.h delete mode 100644 mkFit/FitterCU-imp.h delete mode 100644 mkFit/FitterCU.h delete mode 100644 mkFit/GPlex.h delete mode 100644 mkFit/GeometryCU.h delete mode 100644 mkFit/HitStructuresCU.cu delete mode 100644 mkFit/HitStructuresCU.h delete mode 100644 mkFit/accessors_cu.h delete mode 100644 mkFit/array_algorithms_cu.h delete mode 100644 mkFit/atomic_utils.h delete mode 100644 mkFit/best_hit_kernels.cu delete mode 100644 mkFit/best_hit_kernels.h delete mode 100644 mkFit/build_tracks_kernels.cu delete mode 100644 mkFit/build_tracks_kernels.h delete mode 100644 mkFit/check_gpu_hit_structures.cu delete mode 100644 mkFit/check_gpu_hit_structures.h delete mode 100644 mkFit/clone_engine_cuda_helpers.cu delete mode 100644 mkFit/clone_engine_cuda_helpers.h delete mode 100644 mkFit/clone_engine_kernels.cu delete mode 100644 mkFit/clone_engine_kernels.h delete mode 100644 mkFit/clone_engine_kernels_seed.cu delete mode 100644 mkFit/clone_engine_kernels_seed.h delete mode 100644 mkFit/computeChi2_kernels.cu delete mode 100644 mkFit/computeChi2_kernels.h delete mode 100644 mkFit/device_array_view.h delete mode 100644 mkFit/device_bundle.h delete mode 100644 mkFit/device_vector.h delete mode 100644 mkFit/fittracks_kernels.cu delete mode 100644 mkFit/fittracks_kernels.h delete mode 100644 mkFit/gpu_utils.cu delete mode 100644 mkFit/gpu_utils.h delete mode 100644 mkFit/index_selection_kernels.cu delete mode 100644 mkFit/index_selection_kernels.h delete mode 100644 mkFit/kalmanUpdater_kernels.cu delete mode 100644 mkFit/kalmanUpdater_kernels.h delete mode 100644 mkFit/propagation_kernels.cu delete mode 100644 mkFit/propagation_kernels.h delete mode 100644 mkFit/reorganize_gplex.cu delete mode 100644 mkFit/reorganize_gplex.h diff --git a/BinInfoUtils.h b/BinInfoUtils.h index d5eddad2..bd6f6fc6 100644 --- a/BinInfoUtils.h +++ b/BinInfoUtils.h @@ -17,21 +17,18 @@ typedef std::pair BinInfo; typedef std::vector> BinInfoLayerMap; typedef std::vector BinInfoMap; -CUDA_CALLABLE inline float downPhi(float phi) { while (phi >= Config::PI) {phi-=Config::TwoPI;} return phi; } -CUDA_CALLABLE inline float upPhi(float phi) { while (phi <= -Config::PI) {phi+=Config::TwoPI;} return phi; } -CUDA_CALLABLE inline float normalizedPhi(float phi) { // return std::fmod(phi, (float) Config::PI); // return phi +pi out of phase for |phi| beyond boundary! @@ -39,7 +36,6 @@ inline float normalizedPhi(float phi) return phi; } -CUDA_CALLABLE inline int getPhiPartition(float phi) { //assume phi is between -PI and PI diff --git a/Config.cc b/Config.cc index 9ac789b4..93c697d5 100644 --- a/Config.cc +++ b/Config.cc @@ -24,11 +24,8 @@ namespace Config // Multi threading and Clone engine configuration int numThreadsFinder = 1; - - // GPU computations int numThreadsEvents = 1; - int numThreadsReorg = 1; - + #if defined(__MIC__) || defined(__AVX512F__) int numThreadsSimulation = 60; #else diff --git a/Config.h b/Config.h index e84ca618..cc0702e0 100644 --- a/Config.h +++ b/Config.h @@ -6,12 +6,6 @@ #include // won't compile on clang gcc for mac OS w/o this! #include -#if defined(__CUDACC__) - #define CUDA_CALLABLE __host__ __device__ -#else - #define CUDA_CALLABLE -#endif - namespace mkfit { // Cram this in here for now ... @@ -259,7 +253,7 @@ namespace Config // Config for Hit and BinInfoUtils constexpr int nPhiPart = 1260; constexpr float fPhiFactor = nPhiPart / TwoPI; - constexpr int nEtaPart = 11; // 1 is better for GPU best_hit + constexpr int nEtaPart = 11; constexpr int nEtaBin = 2 * nEtaPart - 1; constexpr float fEtaFull = 2 * Config::fEtaDet; @@ -363,10 +357,7 @@ namespace Config // Threading extern int numThreadsFinder; extern int numThreadsSimulation; - - // For GPU computations extern int numThreadsEvents; - extern int numThreadsReorg; extern int finderReportBestOutOfN; @@ -409,7 +400,6 @@ namespace Config void RecalculateDependentConstants(); - CUDA_CALLABLE inline float BfieldFromZR(const float z, const float r) { return (Config::mag_b0*z*z + Config::mag_b1*z + Config::mag_c1)*(Config::mag_a*r*r + 1.f); @@ -420,8 +410,6 @@ namespace Config #ifndef MPT_SIZE #if defined(__MIC__) || defined(__AVX512F__) #define MPT_SIZE 16 - #elif defined USE_CUDA - #define MPT_SIZE 8 #elif defined(__AVX__) || defined(__AVX2__) #define MPT_SIZE 8 #else diff --git a/Hit.h b/Hit.h index c9a077b1..56fb7c1f 100644 --- a/Hit.h +++ b/Hit.h @@ -61,13 +61,11 @@ inline float getInvRad2(float x, float y){ return 1.0f/(x*x + y*y); } -CUDA_CALLABLE inline float getPhi(float x, float y) { return std::atan2(y,x); } -CUDA_CALLABLE inline float getTheta(float r, float z){ return std::atan2(r,z); } @@ -201,10 +199,6 @@ class Hit const float* posArray() const {return state_.pos_.Array();} const float* errArray() const {return state_.err_.Array();} -#if __CUDACC__ - __device__ float* posArrayCU(); - __device__ float* errArrayCU(); -#endif // Non-const versions needed for CopyOut of Matriplex. SVector3& parameters_nc() {return state_.pos_;} diff --git a/Makefile b/Makefile index d16bbbc6..b0b49084 100644 --- a/Makefile +++ b/Makefile @@ -57,7 +57,7 @@ distclean: clean-local ${LIB_CORE}: ${CORE_OBJS} @mkdir -p $(@D) - ${CXX} ${CXXFLAGS} ${VEC_HOST} ${CORE_OBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS_CU} ${LDFLAGS} + ${CXX} ${CXXFLAGS} ${VEC_HOST} ${CORE_OBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS} main: ${AUTO_TGTS} ${LIB_CORE} main.o ${CXX} ${CXXFLAGS} ${VEC_HOST} -o $@ main.o ${LDFLAGS_HOST} ${LDFLAGS} -Llib -lMicCore -Wl,-rpath,lib diff --git a/Makefile.config b/Makefile.config index 886bd61b..4dac05f5 100644 --- a/Makefile.config +++ b/Makefile.config @@ -47,25 +47,6 @@ else ifdef OSXMPCLANG TBB_PREFIX := /opt/local endif -# 2.1 Use nvcc to compile cuda code -# Using the CUB library for standard GPU algorithm http://nvlabs.github.io/cub/ -# It's header only and potentially exported by the environment -# Maybe it is good enough to have: -# CUBROOT?=Undefined -# CUDAINCDIR and CUDALIBDIR also need to be defined -ifneq (,$(realpath /home/ml15/tools/cub)) - CUBROOT?=/home/ml15/tools/cub -else ifneq (,$(realpath /nfs/opt/cuda-8-0/include)) - CUBROOT?=/nfs/opt/cuda-8-0/include -else ifneq (,$(realpath /usr/local/cuda/include)) - CUBROOT?=/usr/local/cuda/include -endif -NV := nvcc -prec-sqrt=true -I${CUBROOT} -#-g -G -lineinfo -# Comment out to compile for CPU -#USE_CUDA := 1 -# For CUDA: Also need to change maxCandsPerSeed to 8 and nEtaPart to 1 - # 3. Optimization # -O3 implies vectorization and simd (but not AVX) OPT := -g -O3 @@ -143,20 +124,6 @@ CXXFLAGS := -fPIC ${OPT} ${OSX_CXXFLAGS} LDFLAGS_HOST := LDFLAGS_MIC := -static-intel -ifdef USE_CUDA - CPPFLAGS += -DUSE_CUDA -I${CUBROOT} -I${CUDAINCDIR} #-g -G -lineinfo - LDFLAGS_HOST += -L${CUDALIBDIR} - ifeq ($(CXX),icpc) - CXXFLAGS += -qopenmp-simd - LDFLAGS += -qopenmp-simd - else - CXXFLAGS += -fopenmp-simd - LDFLAGS += -fopenmp-simd - endif -endif -#CXXFLAGS += -qopenmp -#LDFLAGS += -qopenmp - CPPFLAGS += ${USE_STATE_VALIDITY_CHECKS} ${USE_SCATTERING} ${USE_LINEAR_INTERPOLATION} ${ENDTOEND} ${INWARD_FIT} ifdef USE_VTUNE_NOTIFY diff --git a/Matrix.h b/Matrix.h index 5aecfd9f..baef4568 100644 --- a/Matrix.h +++ b/Matrix.h @@ -81,13 +81,11 @@ inline double dtime() return( tseconds ); } -CUDA_CALLABLE inline float hipo(float x, float y) { return std::sqrt(x*x + y*y); } -CUDA_CALLABLE inline void sincos4(const float x, float& sin, float& cos) { // Had this writen with explicit division by factorial. diff --git a/README.md b/README.md index 07630a97..47e6affb 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,6 @@ - **phi3.t2.ucsd.edu**: [Intel Xeon Gold 6130 Processor](https://ark.intel.com/products/120492/Intel-Xeon-Gold-6130-Processor-22M-Cache-2_10-GHz) _Skylake Scalable Performance_ (referred to as SKL-Au, SKL-SP, phi3) - **lnx4108.classe.cornell.edu**: [Intel Xeon Silver 4116 Processor](https://ark.intel.com/products/120481/Intel-Xeon-Silver-4116-Processor-16_5M-Cache-2_10-GHz) _Skylake Scalable Performance_ (referred to as SKL-Ag, SKL-SP, lnx4108, LNX-S) - **lnx7188.classe.cornell.edu**: [Intel Xeon Gold 6142 Processor](https://ark.intel.com/content/www/us/en/ark/products/120487/intel-xeon-gold-6142-processor-22m-cache-2-60-ghz.html) _Skylake Scalable Performance_ (referred to as lnx7188,LNX-G) -- **GPUs**: to be filled out phi1, phi2, and phi3 are all managed across a virtual login server and therefore the home user spaces are shared. phi1, phi2, phi3, lnx7188, and lnx4108 also have /cvmfs mounted so you can source the environment needed to run the code. @@ -420,7 +419,7 @@ Described in validation manifesto. See Section 8 for more info on manifesto. ### TO DO - flesh out sections as needed -- GPU specific code +- GPU specific code? ### Vestigial code diff --git a/Track.h b/Track.h index 5278960d..d5463a0e 100644 --- a/Track.h +++ b/Track.h @@ -71,7 +71,6 @@ typedef std::vector RedTrackVec; struct TrackState // possible to add same accessors as track? { public: - CUDA_CALLABLE TrackState() : valid(true) {} TrackState(int charge, const SVector3& pos, const SVector3& mom, const SMatrixSym66& err) : parameters(SVector6(pos.At(0),pos.At(1),pos.At(2),mom.At(0),mom.At(1),mom.At(2))), @@ -152,17 +151,13 @@ class TrackBase ~TrackBase() {} const TrackState& state() const { return state_; } - CUDA_CALLABLE void setState(const TrackState& newState) { state_ = newState; } + void setState(const TrackState& newState) { state_ = newState; } const SVector6& parameters() const {return state_.parameters;} const SMatrixSym66& errors() const {return state_.errors;} const float* posArray() const {return state_.parameters.Array();} const float* errArray() const {return state_.errors.Array();} -#if __CUDACC__ - __device__ float* posArrayCU(); - __device__ float* errArrayCU(); -#endif // Non-const versions needed for CopyOut of Matriplex. SVector6& parameters_nc() {return state_.parameters;} @@ -196,15 +191,15 @@ class TrackBase // ------------------------------------------------------------------------ - CUDA_CALLABLE int charge() const { return state_.charge; } - CUDA_CALLABLE float chi2() const { return chi2_; } - CUDA_CALLABLE float score() const { return score_; } - CUDA_CALLABLE int label() const { return label_; } + int charge() const { return state_.charge; } + float chi2() const { return chi2_; } + float score() const { return score_; } + int label() const { return label_; } - CUDA_CALLABLE void setCharge(int chg) { state_.charge = chg; } - CUDA_CALLABLE void setChi2(float chi2) { chi2_ = chi2; } - CUDA_CALLABLE void setScore(float s) { score_ = s; } - CUDA_CALLABLE void setLabel(int lbl) { label_ = lbl; } + void setCharge(int chg) { state_.charge = chg; } + void setChi2(float chi2) { chi2_ = chi2; } + void setScore(float s) { score_ = s; } + void setLabel(int lbl) { label_ = lbl; } bool hasSillyValues(bool dump, bool fix, const char* pref=""); @@ -358,7 +353,6 @@ class TrackBase class Track : public TrackBase { public: - CUDA_CALLABLE Track() {} explicit Track(const TrackBase& base) : @@ -369,7 +363,6 @@ class Track : public TrackBase nFoundHits_ = 0; } - CUDA_CALLABLE Track(const TrackState& state, float chi2, int label, int nHits, const HitOnTrack* hits) : TrackBase(state, chi2, label) { @@ -390,7 +383,6 @@ class Track : public TrackBase hitsOnTrk_ (t.hitsOnTrk_) {} - CUDA_CALLABLE ~Track(){} // used for swimming cmssw rec tracks to mkFit position @@ -434,7 +426,6 @@ class Track : public TrackBase // out of TrackBase. If we do it. void reserveHits(int nHits) { hitsOnTrk_.reserve(nHits); } - CUDA_CALLABLE void resetHits() { lastHitIdx_ = -1; nFoundHits_ = 0; hitsOnTrk_.clear(); } // Hackish for MkFinder::copy_out ... to be reviewed @@ -443,7 +434,6 @@ class Track : public TrackBase void resizeHitsForInput(); - CUDA_CALLABLE void addHitIdx(int hitIdx, int hitLyr, float chi2) { hitsOnTrk_.push_back( { hitIdx, hitLyr } ); @@ -462,12 +452,12 @@ class Track : public TrackBase HitOnTrack getHitOnTrack(int posHitIdx) const { return hitsOnTrk_[posHitIdx]; } - CUDA_CALLABLE int getHitIdx(int posHitIdx) const { return hitsOnTrk_[posHitIdx].index; } - CUDA_CALLABLE int getHitLyr(int posHitIdx) const { return hitsOnTrk_[posHitIdx].layer; } + int getHitIdx(int posHitIdx) const { return hitsOnTrk_[posHitIdx].index; } + int getHitLyr(int posHitIdx) const { return hitsOnTrk_[posHitIdx].layer; } - CUDA_CALLABLE HitOnTrack getLastHitOnTrack() const { return hitsOnTrk_[lastHitIdx_]; } - CUDA_CALLABLE int getLastHitIdx() const { return hitsOnTrk_[lastHitIdx_].index; } - CUDA_CALLABLE int getLastHitLyr() const { return hitsOnTrk_[lastHitIdx_].layer; } + HitOnTrack getLastHitOnTrack() const { return hitsOnTrk_[lastHitIdx_]; } + int getLastHitIdx() const { return hitsOnTrk_[lastHitIdx_].index; } + int getLastHitLyr() const { return hitsOnTrk_[lastHitIdx_].layer; } int getLastFoundHitPos() const { @@ -506,12 +496,10 @@ class Track : public TrackBase HitOnTrack* BeginHitsOnTrack_nc() { return hitsOnTrk_.data(); } - CUDA_CALLABLE void setHitIdx(int posHitIdx, int newIdx) { hitsOnTrk_[posHitIdx].index = newIdx; } - CUDA_CALLABLE void setHitIdxLyr(int posHitIdx, int newIdx, int newLyr) { hitsOnTrk_[posHitIdx] = { newIdx, newLyr }; } @@ -523,8 +511,8 @@ class Track : public TrackBase } } - CUDA_CALLABLE int nFoundHits() const { return nFoundHits_; } - CUDA_CALLABLE int nTotalHits() const { return lastHitIdx_ + 1; } + int nFoundHits() const { return nFoundHits_; } + int nTotalHits() const { return lastHitIdx_ + 1; } int nInsideMinusOneHits() const { @@ -578,7 +566,7 @@ class Track : public TrackBase return layers; } - CUDA_CALLABLE Track clone() const { return Track(*this); } + Track clone() const { return Track(*this); } private: diff --git a/gpu_scripts/gpu_setup.py b/gpu_scripts/gpu_setup.py deleted file mode 100644 index 0113dc7a..00000000 --- a/gpu_scripts/gpu_setup.py +++ /dev/null @@ -1,76 +0,0 @@ -import os -import shutil -import subprocess - - -def backup_original_files(files): - for f in files: - shutil.copy(f, f + ".bck") - - -def replace_in_file(file_name, regexes, suffix=''): - for regex in regexes: - subprocess.check_call(['sed', '-i' + suffix, regex, file_name]) - - -def restore_file(file_name, suffix='.bck'): - backup = file_name + '.bck' - if (os.path.exists(backup)): - shutil.move(backup, file_name) - - -def check_env(): - if not 'CUBROOT' in os.environ: - raise EnvironmentError( - "CUB (https://github.com/NVlabs/cub) should be downloaded and " - "the environment variable 'CUBROOT' set to where it lies. " - "No installation required: it's header only.") - try: - subprocess.check_output(["nvcc", "--version"]) - except: - raise EnvironmentError("nvcc should be available on the system PATH") - - -def setup_files(makefile, config_h): - replace_in_file(makefile, - [r's/^\s*KNC_BUILD\s*/# &/', - r's/^\s*ICC.*/ICC := icpc/', - r's/^#\(USE_CUDA := yes\)/\1/']) - - replace_in_file(config_h, - [r's/\(\s*constexpr\s*int\s*nEtaPart\s*= \).*\(\s*;.*\)/\1 1\2/', - r's/\(\s*constexpr\s*int\s*maxCandsPerSeed\s*= \).*\(\s*;\s*\/\/.*\)/\1 8\2/']) - - -def cleanup(makefile, config_h): - restore_file(makefile) - restore_file(config_h) - - -def make_distclean(rootdir): - saved_path = os.path.abspath(os.path.curdir) - os.chdir(rootdir) - subprocess.check_call(["make", "-f", "Makefile", "distclean"]) - os.chdir(saved_path) - - -def make_all(rootdir): - saved_path = os.path.abspath(os.path.curdir) - os.chdir(rootdir) - subprocess.check_call(["make", "-j", "24", "-f", "Makefile", "all"]) - os.chdir(saved_path) - - -def get_file_path_safe(rootdir, file_name): - path = os.path.join(rootdir, file_name) - if not os.path.isfile(path): - raise RuntimeError('%s should exist, check the path' % path) - return path - - -def get_file_paths(rootdir): - makefile = get_file_path_safe(rootdir, 'Makefile.config') - config_h = get_file_path_safe(rootdir, 'Config.h') - return (makefile, config_h) - - diff --git a/gpu_scripts/runBenchmark_cuda.py b/gpu_scripts/runBenchmark_cuda.py deleted file mode 100755 index f1c98e13..00000000 --- a/gpu_scripts/runBenchmark_cuda.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python - -import argparse -import os - -import gpu_setup - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--rootdir', dest='rootdir', - action='store', default='.', - help='Where Makefile.config and Config.h are.') - args = parser.parse_args() - if not os.path.isdir(args.rootdir): - raise RuntimeError('--rootdir should be a directory.') - return args - - -def main(): - args = get_args() - makefile, config_h = gpu_setup.get_file_paths(os.path.abspath(args.rootdir)) - gpu_setup.check_env() - gpu_setup.backup_original_files([makefile, config_h]) - - gpu_setup.setup_files(makefile, config_h) - try: - pass - gpu_setup.make_distclean(args.rootdir) - gpu_setup.make_all(args.rootdir) - finally: - gpu_setup.cleanup(makefile, config_h) - - -if __name__ == '__main__': - main() diff --git a/gpu_scripts/run_cleanup_only.py b/gpu_scripts/run_cleanup_only.py deleted file mode 100755 index 6690a224..00000000 --- a/gpu_scripts/run_cleanup_only.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python - -import os -import argparse -import gpu_setup - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--rootdir', dest='rootdir', - action='store', default='.', - help='Where Makefile.config and Config.h are.') - args = parser.parse_args() - if not os.path.isdir(args.rootdir): - raise RuntimeError('--rootdir should be a directory.') - return args - - -def main(): - args = get_args() - makefile, config_h = gpu_setup.get_file_paths(os.path.abspath(args.rootdir)) - gpu_setup.cleanup(makefile, config_h) - - -if __name__ == '__main__': - main() diff --git a/gpu_scripts/run_setup_only.py b/gpu_scripts/run_setup_only.py deleted file mode 100755 index 78d3d9cf..00000000 --- a/gpu_scripts/run_setup_only.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python - -import os -import argparse -import gpu_setup - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--rootdir', dest='rootdir', - action='store', default='.', - help='Where Makefile.config and Config.h are.') - args = parser.parse_args() - if not os.path.isdir(args.rootdir): - raise RuntimeError('--rootdir should be a directory.') - return args - - -def main(): - args = get_args() - makefile, config_h = gpu_setup.get_file_paths(os.path.abspath(args.rootdir)) - gpu_setup.check_env() - gpu_setup.backup_original_files([makefile, config_h]) - - gpu_setup.setup_files(makefile, config_h) - - -if __name__ == '__main__': - main() diff --git a/mkFit/BuilderCU.cu b/mkFit/BuilderCU.cu deleted file mode 100644 index 4ec1ac60..00000000 --- a/mkFit/BuilderCU.cu +++ /dev/null @@ -1,111 +0,0 @@ -#include "BuilderCU.h" - -#include "gpu_utils.h" -#include "HitStructures.h" -#include "HitStructuresCU.h" -#include "GeometryCU.h" -#include "FitterCU.h" -#include "Event.h" - - -BuilderCU::BuilderCU() {} - - -BuilderCU::BuilderCU(FitterCU *fitter) { - cuFitter = fitter; -} - - -void BuilderCU::setUpFitter(int gplex_size) -{ - cuFitter = new FitterCU (gplex_size); - cuFitter->allocateDevice(); - cuFitter->allocate_extra_addBestHit(); - cuFitter->createStream(); - cuFitter->setNumberTracks(gplex_size); -} - - -void BuilderCU::tearDownFitter() -{ - cuFitter->destroyStream(); - cuFitter->free_extra_addBestHit(); - cuFitter->freeDevice(); - delete cuFitter; -} - - -BuilderCU::~BuilderCU() { - geom_cu.deallocate(); -} - - -void BuilderCU::allocateGeometry(const Geometry& geom) -{ - std::vector radii (Config::nLayers); - for (int ilay = Config::nlayers_per_seed; ilay < Config::nLayers; ++ilay) { - radii[ilay] = geom.Radius(ilay); - } - geom_cu.allocate(); - geom_cu.getRadiiFromCPU(&radii[0]); -} - - -void BuilderCU::setUpBH(const EventOfHits& event_of_hits, const Event* event, - const EventOfCandidates& event_of_cands) -{ - event_of_hits_cu.reserve_layers(event_of_hits, 2.f); - event_of_hits_cu.copyFromCPU(event_of_hits, cuFitter->get_stream()); - event_of_cands_cu.allocGPU(event_of_cands); -} - - -void BuilderCU::tearDownBH() { - event_of_cands_cu.deallocGPU(); -} - - -void BuilderCU::allocateCE(const EventOfHits& event_of_hits, const Event* event, - const EventOfCombCandidates& event_of_cands) -{ - event_of_hits_cu.reserve_layers(event_of_hits, 2.f); - event_of_comb_cands_cu.allocate(event_of_cands); -} - - -void BuilderCU::setUpCE(const EventOfHits& event_of_hits, const Event* event, - const EventOfCombCandidates& event_of_cands) -{ - event_of_hits_cu.copyFromCPU(event_of_hits, cuFitter->get_stream()); -} - - -void BuilderCU::tearDownCE() { - /*event_of_comb_cands_cu.free();*/ - /*event_of_hits_cu.deallocGPU();*/ -} - - -void BuilderCU::FindTracksBestHit(EventOfCandidates& event_of_cands) -{ - event_of_cands_cu.copyFromCPU(event_of_cands, cuFitter->get_stream()); - - cuFitter->addBestHit(event_of_hits_cu, geom_cu, event_of_cands_cu); - - event_of_cands_cu.copyToCPU(event_of_cands, cuFitter->get_stream()); - cudaStreamSynchronize(cuFitter->get_stream()); - //cudaCheckError(); -} - - -void BuilderCU::FindTracksCloneEngine(EventOfCombCandidates& event_of_comb_cands, - bool seed_based) -{ - event_of_comb_cands_cu.copyFromCPU(event_of_comb_cands, cuFitter->get_stream()); - - cuFitter->FindTracksInLayers(event_of_hits_cu.m_layers_of_hits.data(), - event_of_comb_cands_cu, geom_cu, seed_based); - - event_of_comb_cands_cu.copyToCPU(event_of_comb_cands, cuFitter->get_stream()); - cudaStreamSynchronize(cuFitter->get_stream()); -} diff --git a/mkFit/BuilderCU.h b/mkFit/BuilderCU.h deleted file mode 100644 index 0b54973d..00000000 --- a/mkFit/BuilderCU.h +++ /dev/null @@ -1,55 +0,0 @@ -#ifndef BUILDER_CU_H -#define BUILDER_CU_H - -#include "FitterCU.h" -#include "HitStructures.h" -#include "HitStructuresCU.h" -#include "GeometryCU.h" -#include "Geometry.h" -#include "Event.h" - -namespace mkfit { - -// FIXME: Design Issue -// What to do, allocation in ctor, free in dtor? -// not exception-safe -// but manage mem -// or in separate function? -// FIXME: A lot of duplication between BH/CE. Remove it after CtD -class BuilderCU -{ -public: - BuilderCU(); - BuilderCU(FitterCU *fitter); - ~BuilderCU(); - - void setUpBH(const EventOfHits& event_of_hits, const Event* event, - const EventOfCandidates& event_of_cands); - void tearDownBH(); - - void allocateCE(const EventOfHits& event_of_hits, const Event* event, - const EventOfCombCandidates& event_of_cands); - void setUpCE(const EventOfHits& event_of_hits, const Event* event, - const EventOfCombCandidates& event_of_cands); - void tearDownCE(); - - void FindTracksBestHit(EventOfCandidates& event_of_cands); - - void setUpFitter(int gplex_size); - void tearDownFitter(); - - void allocateGeometry(const Geometry& geom); - - void FindTracksCloneEngine(EventOfCombCandidates& event_of_cands, - bool seed_based=false); -private: - FitterCU *cuFitter; - EventOfHitsCU event_of_hits_cu; - EventOfCandidatesCU event_of_cands_cu; - EventOfCombCandidatesCU event_of_comb_cands_cu; - GeometryCU geom_cu; -}; - - -} // end namespace mkfit -#endif /* ifndef BUILDER_CU_H */ diff --git a/mkFit/FitterCU-imp.h b/mkFit/FitterCU-imp.h deleted file mode 100644 index db1658c4..00000000 --- a/mkFit/FitterCU-imp.h +++ /dev/null @@ -1,446 +0,0 @@ -#include -#include "Config.h" -#include "GeometryCU.h" -#include "reorganize_gplex.h" -#include "fittracks_kernels.h" -#include "build_tracks_kernels.h" - -#include "clone_engine_kernels.h" -#include "clone_engine_kernels_seed.h" - -#include "Track.h" - -template -void FitterCU::FindTracksInLayers(LayerOfHitsCU *layers, - EventOfCombCandidatesCU& event_of_cands_cu, - GeometryCU &geom, bool seed_based) -{ - auto f = seed_based ? findInLayers_wrapper_seed : findInLayers_wrapper; - - //findInLayers_wrapper(stream, layers, event_of_cands_cu, - //findInLayers_wrapper_seed(stream, layers, event_of_cands_cu, - f (stream, layers, event_of_cands_cu, - d_XHitSize, d_XHitArr, d_Err_iP, d_par_iP, - d_msErr_arr, d_msPar_arr, d_Err_iC, d_par_iC, - d_outChi2, d_Chi2, d_HitsIdx_arr, d_inChg, d_Label, - d_SeedIdx, d_CandIdx, d_Valid, geom, d_maxSize, N); -} - -template -void FitterCU::UpdateWithLastHit(LayerOfHitsCU& layer, int ilay, int N) -{ - UpdateWithLastHit_wrapper(stream, layer, d_HitsIdx[ilay], - d_msPar[ilay], d_msErr[ilay], d_par_iP, d_Err_iP, d_par_iC, d_Err_iC, N); -} - -template -void FitterCU::UpdateWithLastHit_standalone( - LayerOfHitsCU& layer_cu, MPlexQI& HitsIdx, - MPlexLS &Err_iP, MPlexLV& Par_iP, MPlexQI &Chg, - MPlexHV& msPar, MPlexHS& msErr, - MPlexLS &Err_iC, MPlexLV& Par_iC, - int Nhits, int N_proc) -{ - d_HitsIdx[Nhits-1].copyAsyncFromHost(stream, HitsIdx); - d_Err_iP.copyAsyncFromHost(stream, Err_iP); - d_par_iP.copyAsyncFromHost(stream, Par_iP); - d_msErr[Nhits-1].copyAsyncFromHost(stream, msErr); - d_msPar[Nhits-1].copyAsyncFromHost(stream, msPar); - - UpdateWithLastHit_wrapper(stream, layer_cu, d_HitsIdx[Nhits-1], - d_msPar[Nhits-1], d_msErr[Nhits-1], d_par_iP, d_Err_iP, d_par_iC, d_Err_iC, N_proc); - - d_par_iC.copyAsyncToHost(stream, Par_iC); - d_Err_iC.copyAsyncToHost(stream, Err_iC); - d_msErr[Nhits-1].copyAsyncToHost(stream, msErr); - d_msPar[Nhits-1].copyAsyncToHost(stream, msPar); - - cudaStreamSynchronize(stream); -} -template -void FitterCU::GetHitFromLayer_standalone(LayerOfHitsCU& layer_cu, - MPlexQI& HitsIdx, MPlexHV& msPar, MPlexHS& msErr, int hit_idx, int N) -{ - d_HitsIdx[hit_idx].copyAsyncFromHost(stream, HitsIdx); - - getHitFromLayer_wrappper(stream, layer_cu, d_HitsIdx[hit_idx], - d_msPar[hit_idx], d_msErr[hit_idx], N); - - d_msErr[hit_idx].copyAsyncToHost(stream, msErr); - d_msPar[hit_idx].copyAsyncToHost(stream, msPar); -} -template -void FitterCU::UpdateMissingHits_standalone( - MPlexLS& Err_iP, MPlexLV& Par_iP, - MPlexLS& Err_iC, MPlexLV& Par_iC, - MPlexQI& HitsIdx, - int hit_idx, int N) -{ - d_HitsIdx[hit_idx].copyAsyncFromHost(stream, HitsIdx); - d_Err_iP.copyAsyncFromHost(stream, Err_iP); - d_par_iP.copyAsyncFromHost(stream, Par_iP); - - UpdateMissingHits_wrapper(stream, d_HitsIdx[hit_idx], - d_par_iP, d_Err_iP, d_par_iC, d_Err_iC, N); - - d_Err_iC.copyAsyncToHost(stream, Err_iC); - d_par_iC.copyAsyncToHost(stream, Par_iC); - cudaStreamSynchronize(stream); -} - -template -void FitterCU::setNumberTracks(const idx_t Ntracks) { - N = Ntracks; - - // Raise an exceptioin when the FitterCU instance is too small - // This should not happen as the loop over tracks in runFittestGPU - // takes care of it. - if (Ntracks > Nalloc) { - throw std::length_error("FitterCU: Ntracks should be smaller than Nalloc"); - } -} - -template -void FitterCU::createStream() { - cudaStreamCreate(&stream); -} - -template -void FitterCU::destroyStream() { - cudaStreamDestroy(stream); -} - -template -void FitterCU::allocateDevice() { - d_par_iP.allocate(Nalloc); - d_par_iC.allocate(Nalloc); - - d_Err_iP.allocate(Nalloc); - d_Err_iC.allocate(Nalloc); - - d_inChg.allocate(Nalloc); - d_errorProp.allocate(Nalloc); - - cudaMalloc((void**)&d_msPar_arr, Config::nLayers * sizeof(GPlexHV)); - cudaMalloc((void**)&d_msErr_arr, Config::nLayers * sizeof(GPlexHS)); - for (int hi = 0; hi < Config::nLayers; ++hi) { - d_msPar[hi].allocate(Nalloc); - d_msErr[hi].allocate(Nalloc); - } - cudaMemcpy(d_msPar_arr, d_msPar, Config::nLayers*sizeof(GPlexHV), cudaMemcpyHostToDevice); - cudaMemcpy(d_msErr_arr, d_msErr, Config::nLayers*sizeof(GPlexHS), cudaMemcpyHostToDevice); - cudaMalloc((void**)&d_maxSize, sizeof(int)); // global maximum - cudaCheckError(); -} - -template -void FitterCU::freeDevice() { - d_par_iC.free(); - d_inChg.free(); - d_par_iP.free(); - d_errorProp.free(); - d_Err_iP.free(); - d_Err_iC.free(); - for (int hi = 0; hi < Config::nLayers; ++hi) { - d_msPar[hi].free(); - d_msErr[hi].free(); - } - cudaFree(d_msPar_arr); - cudaFree(d_msErr_arr); - cudaFree(d_maxSize); - cudaCheckError(); -} - -template -void FitterCU::kalmanUpdateMerged(const int hit_idx) { - kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr[hit_idx], - d_par_iP, d_msPar[hit_idx], d_par_iC, d_Err_iC, N); -} - -template -void FitterCU::kalmanUpdate_standalone( - const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, - const MPlexHS &msErr, const MPlexHV& msPar, - MPlexLS &outErr, MPlexLV& outPar, - const int hit_idx, const int N_proc) -{ - d_Err_iP.copyAsyncFromHost(stream, psErr); - d_msErr[hit_idx].copyAsyncFromHost(stream, msErr); - d_par_iP.copyAsyncFromHost(stream, psPar); - d_msPar[hit_idx].copyAsyncFromHost(stream, msPar); - - kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr[hit_idx], - d_par_iP, d_msPar[hit_idx], d_par_iC, d_Err_iC, N_proc); - - d_par_iC.copyAsyncToHost(stream, outPar); - d_Err_iC.copyAsyncToHost(stream, outErr); -} - -template -void FitterCU::propagationMerged(const int hit_idx) { - propagation_wrapper(stream, d_msPar[hit_idx], d_Err_iC, d_par_iC, d_inChg, - d_par_iP, d_Err_iP, false, N); -} - -// FIXME: Temporary. Separate allocations / transfers -template -void FitterCU::allocate_extra_addBestHit() { - d_outChi2.allocate(Nalloc); - d_XHitPos.allocate(Nalloc); - d_XHitSize.allocate(Nalloc); - d_XHitArr.allocate(Nalloc); - cudaMalloc((void**)&d_HitsIdx_arr, Config::nLayers * sizeof(GPlexQI)); - for (int hi = 0; hi < Config::nLayers; ++hi) { - d_HitsIdx[hi].allocate(Nalloc); - } - cudaMemcpy(d_HitsIdx_arr, d_HitsIdx, Config::nLayers*sizeof(GPlexQI), cudaMemcpyHostToDevice); - d_Chi2.allocate(Nalloc); - d_Label.allocate(Nalloc); - cudaCheckError(); -} - -template -void FitterCU::free_extra_addBestHit() { - for (int hi = 0; hi < Config::nLayers; ++hi) { - d_HitsIdx[hi].free(); cudaCheckError(); - } - cudaFree(d_HitsIdx_arr); - d_Label.free(); cudaCheckError(); - d_Chi2.free(); cudaCheckError(); - - d_XHitArr.free(); cudaCheckError(); - d_XHitSize.free(); cudaCheckError(); - d_XHitPos.free(); cudaCheckError(); - d_outChi2.free(); cudaCheckError(); -} - - -template -void FitterCU::allocate_extra_combinatorial() { - d_SeedIdx.allocate(Nalloc); - d_CandIdx.allocate(Nalloc); - d_Valid.allocate(Nalloc); -} - - -template -void FitterCU::free_extra_combinatorial() { - d_SeedIdx.free(); - d_CandIdx.free(); - d_Valid.free(); -} - - -template -void FitterCU::setHitsIdxToZero(const int hit_idx) { - cudaMemset(d_HitsIdx[hit_idx].ptr, 0, Nalloc*sizeof(int)); -} - -template -void FitterCU::addBestHit(EventOfHitsCU &event, GeometryCU &geom_cu, - EventOfCandidatesCU &event_of_cands_cu) { - findBestHit_wrapper(stream, event.m_layers_of_hits.data(), - event_of_cands_cu, - d_XHitSize, d_XHitArr, - d_Err_iP, d_par_iP, - d_msErr_arr, d_msPar_arr, - d_Err_iC, d_par_iC, d_outChi2, - d_Chi2, d_HitsIdx_arr, - d_inChg, d_Label, geom_cu, - d_maxSize, N); -} - -template -void FitterCU::InputTracksAndHitIdx(const EtaBinOfCandidatesCU &etaBin, - const int beg, const int end, const bool inputProp) { - InputTracksCU_wrapper(stream, etaBin, d_Err_iP, d_par_iP, - d_inChg, d_Chi2, d_Label, d_HitsIdx_arr, - beg, end, inputProp, N); -} - -template -void FitterCU::OutputTracksAndHitIdx(EtaBinOfCandidatesCU &etaBin, - const int beg, const int end, const bool outputProp) { - OutputTracksCU_wrapper(stream, etaBin, d_Err_iP, d_par_iP, - d_inChg, d_Chi2, d_Label, d_HitsIdx_arr, - beg, end, outputProp, N); - cudaStreamSynchronize(stream); - cudaCheckError(); -} - - -template -void FitterCU::InputTracksAndHitIdxComb(const EtaBinOfCombCandidatesCU &etaBin, - const int Nhits, const int beg, const int end, const bool inputProp) { - InputTracksAndHitIdxComb_wrapper(stream, - etaBin, - d_Err_iP, d_par_iP, - d_inChg, d_Chi2, - d_Label, d_HitsIdx, - d_SeedIdx, d_CandIdx, - d_Valid, Nhits, - beg, end, - inputProp, N); -} - - -template -void FitterCU::propagateTracksToR(const float radius, const int N) { - propagationForBuilding_wrapper(stream, d_Err_iC, d_par_iC, d_inChg, - radius, d_Err_iP, d_par_iP, N); -} - - -#if 1 -template -void FitterCU::propagateTracksToR_standalone(const float radius, const int N, - const MPlexLS& Err_iC, const MPlexLV& Par_iC, const MPlexQI& inChg, - MPlexLS& Err_iP, MPlexLV& Par_iP) { - d_Err_iC.copyAsyncFromHost(stream, Err_iC); - d_par_iC.copyAsyncFromHost(stream, Par_iC); - d_inChg.copyAsyncFromHost(stream, inChg); - - propagateTracksToR(radius, N); - - d_Err_iP.copyAsyncToHost(stream, Err_iP); - d_par_iP.copyAsyncToHost(stream, Par_iP); - cudaStreamSynchronize(stream); -} - -template -void FitterCU::FitTracks(Track *tracks_cu, int num_tracks, - EventOfHitsCU &events_of_hits_cu, - int Nhits, const bool useParamBfield) -{ - float etime; - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cudaEventRecord(start, 0); - - for (int itrack = 0; itrack < num_tracks; itrack += Nalloc) - { - int beg = itrack; - int end = std::min(itrack + Nalloc, num_tracks); - setNumberTracks(end-beg); - - InputTracksAndHitsCU_wrapper(stream, tracks_cu, events_of_hits_cu, - d_Err_iC, d_par_iC, - d_msErr_arr, d_msPar_arr, - d_inChg, d_Chi2, d_Label, - d_HitsIdx_arr, beg, end, false, N); - fittracks_wrapper(stream, d_Err_iP, d_par_iP, d_msErr_arr, d_msPar_arr, - d_Err_iC, d_par_iC, d_errorProp, d_inChg, useParamBfield, - Nhits, N); - OutputFittedTracksCU_wrapper(stream, tracks_cu, - d_Err_iC, d_par_iC, - d_inChg, d_Chi2, d_Label, - beg, end, N); - } - - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - - cudaEventElapsedTime(&etime, start, stop); - std::cerr << "CUDA etime: " << etime << " ms.\n"; - - cudaEventDestroy(start); - cudaEventDestroy(stop); -} -#else -template -void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, - MPlexHV* msPar, MPlexHS* msErr, int Nhits, - std::vector &tracks, int beg, int end, - std::vector &layerHits) { - float etime; - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - //launch everything in a stream to enable concurrent execution of events - createStream(); - - // allocateDevice(); -> moved to mkFit/mkFit.cc - - setNumberTracks(end-beg); - - Track *tracks_cu; - cudaMalloc((void**)&tracks_cu, tracks.size()*sizeof(Track)); - cudaMemcpy(tracks_cu, &tracks[0], tracks.size()*sizeof(Track), cudaMemcpyHostToDevice); - allocate_extra_addBestHit(); - - InputTracksAndHitsCU_wrapper(stream, tracks_cu, d_Err_iC, d_par_iC, d_inChg, - d_Chi2, d_Label, d_HitsIdx_arr, beg, end, false, N); - - - //d_inChg.copyAsyncFromHost(stream, Chg); - //d_par_iC.copyAsyncFromHost(stream, par_iC); - //d_Err_iC.copyAsyncFromHost(stream, err_iC); - - cudaEventRecord(start, 0); - - double total_reorg = 0.; - for (int hi = 0; hi < Nhits; ++hi) - { - // Switch outPut and inPut parameters and errors - // similar to iC <-> iP in the CPU code. - d_par_iP.copyAsyncFromDevice(stream, d_par_iC); - d_Err_iP.copyAsyncFromDevice(stream, d_Err_iC); - -#if 0 - double time_input = dtime(); - int itrack; - //omp_set_num_threads(Config::numThreadsReorg); -//#pragma omp parallel for - for (int i = beg; i < end; ++i) { - itrack = i - beg; - Track &trk = tracks[i]; - - const int hidx = trk.getHitIdx(hi); - const Hit &hit = layerHits[hi][hidx]; - - msErr[hi].CopyIn(itrack, hit.errArray()); - msPar[hi].CopyIn(itrack, hit.posArray()); - } - total_reorg += (dtime() - time_input)*1e3; -#endif - - d_msPar[hi].copyAsyncFromHost(stream, msPar[hi]); - d_msErr[hi].copyAsyncFromHost(stream, msErr[hi]); - - propagationMerged(hi); - //MPlexLS err_iP; - //MPlexLV par_iP; - //d_par_iC.copyAsyncToHost(stream, par_iC); - //d_par_iP.copyAsyncToHost(stream, par_iP); - //d_Err_iP.copyAsyncToHost(stream, err_iP); - //propagation_wrapper(stream, d_msPar[hi], d_Err_iC, d_par_iC, d_inChg, - //d_par_iP, d_errorProp, d_Err_iP, N); - kalmanUpdateMerged(hi); - //fittracks_wrapper(stream, d_Err_iP, d_par_iP, d_msErr, d_msPar, - //d_Err_iC, d_par_iC, d_errorProp, d_inChg, - //hi, N); - } - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - - cudaEventElapsedTime(&etime, start, stop); - //std::cerr << "CUDA etime: " << etime << " ms.\n"; - //std::cerr << "Total reorg: " << total_reorg << " ms.\n"; - - free_extra_addBestHit(); - cudaFree(tracks_cu); - - d_par_iC.copyAsyncToHost(stream, par_iC); - d_Err_iC.copyAsyncToHost(stream, err_iC); - - cudaStreamSynchronize(stream); - // freeDevice(); -> moved to mkFit/mkFit.cc - destroyStream(); - - cudaEventDestroy(start); - cudaEventDestroy(stop); -} -#endif diff --git a/mkFit/FitterCU.h b/mkFit/FitterCU.h deleted file mode 100644 index 1ed9d5b9..00000000 --- a/mkFit/FitterCU.h +++ /dev/null @@ -1,149 +0,0 @@ -#ifndef _PROPAGATOR_CU_H_ -#define _PROPAGATOR_CU_H_ - -#include - -#include "Matrix.h" - -#include "HitStructuresCU.h" -#include "GPlex.h" -#include "GeometryCU.h" -#include "gpu_utils.h" - -#include "propagation_kernels.h" -#include "kalmanUpdater_kernels.h" -#include "computeChi2_kernels.h" -#include "index_selection_kernels.h" -#include "best_hit_kernels.h" - -//#include -#include - -#define BLOCK_SIZE_X 256 - -namespace mkfit { - -using idx_t = Matriplex::idx_t; - -template -class FitterCU { - public: - FitterCU (idx_t Nalloc) : Nalloc(Nalloc) {}; - virtual ~FitterCU () {}; - - void allocateDevice(); - void freeDevice(); - - void createStream(); - void destroyStream(); - cudaStream_t& get_stream() { return stream; } - - int get_Nalloc() const { return Nalloc; } - void setNumberTracks(const idx_t Ntracks); - - void propagationMerged(const int hit_idx); - void kalmanUpdateMerged(const int hit_idx); - void kalmanUpdate_standalone( - const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, - const MPlexHS &msErr, const MPlexHV& msPar, - MPlexLS &outErr, MPlexLV& outPar, - const int hit_idx, const int N_proc); - - void allocate_extra_addBestHit(); - void free_extra_addBestHit(); - void allocate_extra_combinatorial(); - void free_extra_combinatorial(); - void setHitsIdxToZero(const int hit_idx); - - void addBestHit(EventOfHitsCU& event, GeometryCU &geom_cu, - EventOfCandidatesCU &event_of_cands_cu); - - void propagateTracksToR(const float radius, const int N); - void propagateTracksToR_standalone(const float radius, const int N, - const MPlexLS& Err_iC, const MPlexLV& par_iC, - const MPlexQI& inChg, - MPlexLS& Err_iP, MPlexLV& Par_iP); - - // fitting higher order methods - void FitTracks(Track *tracks_cu, int num_tracks, - EventOfHitsCU &events_of_hits_cu, - int Nhits, const bool useParamBfield=false); - void InputTracksAndHitIdx(const EtaBinOfCandidatesCU &etaBin, - const int beg, const int end, const bool inputProp); - void OutputTracksAndHitIdx(EtaBinOfCandidatesCU &etaBin, - const int beg, const int end, const bool outputProp); - - void InputTracksAndHitIdxComb(const EtaBinOfCombCandidatesCU &etaBin, - const int Nhits, const int beg, const int end, const bool inputProp); - - //==================== - // TEMP FCT; copy stuffs - //==================== - void UpdateWithLastHit(LayerOfHitsCU& layer, int ilay, int N); - void GetHitFromLayer_standalone(LayerOfHitsCU& layer_cu, - MPlexQI& HitsIdx, MPlexHV& msPar, MPlexHS& msErr, int hit_idx, int N); - void UpdateMissingHits_standalone( - MPlexLS& Err_iP, MPlexLV& Par_iP, - MPlexLS& Err_iC, MPlexLV& Par_iC, - MPlexQI& HitsIdx, - int hit_idx, int N); - void UpdateWithLastHit_standalone( - LayerOfHitsCU& layer_cu, MPlexQI& HitsIdx, - MPlexLS &Err_iP, MPlexLV& Par_iP, MPlexQI &inChg, - MPlexHV& msPar, MPlexHS& msErr, - MPlexLS &Err_iC, MPlexLV& Par_iC, - int Nhits, int N_proc); - - void FindTracksInLayers(LayerOfHitsCU *layers, - EventOfCombCandidatesCU& event_of_cands_cu, - GeometryCU &geom, bool seed_based=false); - - //==================== - private: - // N is the actual size, Nalloc should be >= N, as it is intended - // to allocated arrays that can be used for several sets of tracks. - idx_t Nalloc; - idx_t N; - - /* data */ - GPlexLV d_par_iP; // LV - GPlexLV d_par_iC; // LV - - GPlexLS d_Err_iP; // LS - GPlexLS d_Err_iC; // LS - - GPlexQI d_inChg; // QI - GPlexQF d_msRad; // QF - GPlexLL d_errorProp; // LL - - GPlexHV *d_msPar_arr; // completely on the GPU - GPlexHV d_msPar[Config::nLayers]; // on the CPU, with arrays on the GPU - GPlexHS *d_msErr_arr; - GPlexHS d_msErr[Config::nLayers]; - - GPlexQI d_XHitPos; // QI : 1D arrary following itracks - GPlexQI d_XHitSize; // QI : " " - GPlexHitIdx d_XHitArr; - - GPlexQF d_outChi2; - GPlexQI *d_HitsIdx_arr; - GPlexQI d_HitsIdx[Config::nLayers]; - GPlexQF d_Chi2; - GPlexQI d_Label; - - GPlexQI d_SeedIdx; - GPlexQI d_CandIdx; - - GPlexQB d_Valid; - - int *d_maxSize; // max number of tracks for AddBestHit - - // everything run in a stream so multiple instance of FitterCU can - // run concurrently on the GPU. - cudaStream_t stream; -}; - -#include "FitterCU-imp.h" - -} // end namespace mkfit -#endif // _PROPAGATOR_CU_H_ diff --git a/mkFit/GPlex.h b/mkFit/GPlex.h deleted file mode 100644 index 18606b31..00000000 --- a/mkFit/GPlex.h +++ /dev/null @@ -1,149 +0,0 @@ -#ifndef _GPLEX_H_ -#define _GPLEX_H_ - -#include -#include - -#include "gpu_utils.h" -#include "Matrix.h" - -namespace mkfit { - -//#include "gpu_constants.h" -__device__ __constant__ static int gplexSymOffsets[7][36] = -{ - {}, - {}, - { 0, 1, 1, 2 }, - { 0, 1, 3, 1, 2, 4, 3, 4, 5 }, // 3 - {}, - {}, - { 0, 1, 3, 6, 10, 15, 1, 2, 4, 7, 11, 16, 3, 4, 5, 8, 12, 17, 6, 7, 8, 9, 13, 18, 10, 11, 12, 13, 14, 19, 15, 16, 17, 18, 19, 20 } -}; - - -// GPU implementation of a Matriplex-like structure -// The number of tracks is the fast dimension and is padded in order to have -// consecutive and aligned memory accesses. For cached reads, this result in a -// single memory transaction for the 32 threads of a warp to access 32 floats. -// See: -// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-3-0 -// In practice, The number of tracks (ntracks) is set to be MPT_SIZE -template -struct GPlex { - using T = typename M::value_type; - using value_type = T; - - size_t kRows = M::kRows; - size_t kCols = M::kCols; - size_t kSize = M::kSize; - - T* ptr; - size_t pitch, stride, N; - - __device__ T operator[](int xx) const { return ptr[xx]; } - __device__ T& operator[](int xx) { return ptr[xx]; } - - __device__ T& operator()(int n, int i, int j) { return ptr[n + (i*kCols + j)*stride]; } - __device__ T operator()(int n, int i, int j) const { return ptr[n + (i*kCols + j)*stride]; } - - void allocate(size_t ntracks) { - N = ntracks; - cudaMallocPitch((void**)&ptr, &pitch, N*sizeof(T), kSize); - stride = pitch/sizeof(T); // Number of elements - } - void free() { - cudaFree(ptr); - N = 0; pitch = 0; stride = 0; - } - //cudaMemcpy2D(d_msErr.ptr, d_msErr.pitch, msErr.fArray, N*sizeof(T), - //N*sizeof(T), HS, cudaMemcpyHostToDevice); - - void copyAsyncFromHost(cudaStream_t& stream, const M& mplex) { - cudaMemcpy2DAsync(ptr, pitch, mplex.fArray, N*sizeof(T), - N*sizeof(T), kSize, cudaMemcpyHostToDevice, stream); - cudaCheckError(); - } - void copyAsyncToHost(cudaStream_t& stream, M& mplex) { - cudaMemcpy2DAsync(mplex.fArray, N*sizeof(T), ptr, pitch, - N*sizeof(T), kSize, cudaMemcpyDeviceToHost, stream); - cudaCheckError(); - } - void copyAsyncFromDevice(cudaStream_t& stream, GPlex& gplex) { - cudaMemcpy2DAsync(ptr, pitch, gplex.ptr, gplex.pitch, - N*sizeof(T), kSize, cudaMemcpyDeviceToDevice, stream); - cudaCheckError(); - } -}; - - -template -struct GPlexSym : GPlex { - using T = typename GPlex::T; - using GPlex::kRows; - using GPlex::kCols; - using GPlex::stride; - using GPlex::ptr; - __device__ size_t Off(size_t i) const { return gplexSymOffsets[kRows][i]; } - // Note: convenient but noticeably slower due to the indirection - __device__ T& operator()(int n, int i, int j) { return ptr[n + Off(i*kCols + j)*stride]; } - __device__ T operator()(int n, int i, int j) const { return ptr[n + Off(i*kCols + j)*stride]; } - //__device__ T& operator()(int n, int i, int j) { return ptr[n + i*stride]; } - //__device__ T operator()(int n, int i, int j) const { return ptr[n + i*stride]; } -}; - -using GPlexLL = GPlex; -using GPlexLV = GPlex; -using GPlexLS = GPlexSym; - -using GPlexHH = GPlex; -using GPlexHV = GPlex; -using GPlexHS = GPlexSym; - -using GPlexLH = GPlex; - -using GPlexQF = GPlex; -using GPlexQI = GPlex; -using GPlexQB = GPlex; - -const int GPlexHitIdxMax = 16; // FIXME: copied and past from MkFitter.h -using GPlexHitIdx = GPlex>; - -template -struct GPlexReg { - using T = typename M::value_type; - using value_type = T; - - size_t kRows = M::kRows; - size_t kCols = M::kCols; - size_t kSize = M::kSize; - - __device__ T operator[](int xx) const { return arr[xx]; } - __device__ T& operator[](int xx) { return arr[xx]; } - - __device__ T& operator()(int n, int i, int j) { return arr[i*kCols + j]; } - __device__ T operator()(int n, int i, int j) const { return arr[i*kCols + j]; } - - __device__ void SetVal(T v) - { - for (int i = 0; i < kSize; ++i) - { - arr[i] = v; - } - } - - T arr[M::kSize]; -}; - -using GPlexRegLL = GPlexReg; -using GPlexRegLH = GPlexReg; -using GPlexRegHH = GPlexReg; -using GPlexRegLV = GPlexReg; -using GPlexRegHS = GPlexReg; -using GPlexRegHV = GPlexReg; -using GPlexReg2V = GPlexReg; -using GPlexReg2S = GPlexReg; -using GPlexRegQF = GPlexReg; - -} // end namespace mkfit -#endif // _GPLEX_H_ diff --git a/mkFit/GeometryCU.h b/mkFit/GeometryCU.h deleted file mode 100644 index da6347ff..00000000 --- a/mkFit/GeometryCU.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef GEOMETRY_CU_H -#define GEOMETRY_CU_H - -#include "gpu_utils.h" - -namespace mkfit { - -struct GeometryCU { - float *radii = nullptr; - - void allocate() { - if (radii == nullptr) - cudaMalloc((void**)&radii, Config::nLayers * sizeof(float)); - //cudaCheckError(); - } - void deallocate() { - if (radii != nullptr) { - cudaFree(radii); - cudaCheckError(); - radii = nullptr; - } - } - void getRadiiFromCPU(const float *h_radii) { - cudaMemcpy(radii, h_radii, Config::nLayers * sizeof(float), cudaMemcpyHostToDevice); - //cudaCheckError(); - } -}; - -} // end namespace mkfit -#endif /* ifndef GEOMETRY_CU_H */ diff --git a/mkFit/HitStructuresCU.cu b/mkFit/HitStructuresCU.cu deleted file mode 100644 index c42b6e03..00000000 --- a/mkFit/HitStructuresCU.cu +++ /dev/null @@ -1,394 +0,0 @@ - -#include "HitStructuresCU.h" - -#include -#include - -#include "HitStructures.h" -#include "gpu_utils.h" - -/////////////////////////////////////////////////////////////////////////////// -/// LayerOfHitsCU -/////////////////////////////////////////////////////////////////////////////// - -void LayerOfHitsCU::copy_layer_values(const LayerOfHits &layer) { - Tracer tracer("copyLayer", tracer_colors["Blue"]); - m_zmin = layer.m_zmin; - m_zmax = layer.m_zmax; - m_fz = layer.m_fz; -} - - -/////////////////////////////////////////////////////////////////////////////// -/// EventOfHitsCU -/////////////////////////////////////////////////////////////////////////////// - -void EventOfHitsCU::reserve_all_hits(const EventOfHits &event_of_hits, float factor) -{ - auto largest_layer = std::max_element(event_of_hits.m_layers_of_hits.begin(), - event_of_hits.m_layers_of_hits.end(), - [](const LayerOfHits &a, const LayerOfHits &b) - { - return a.m_capacity < b.m_capacity; - }); - auto m_max_hits_layer = largest_layer->m_capacity; - - all_hits.reserve(m_n_layers, m_max_hits_layer*factor); -} - - -void EventOfHitsCU::reserve_all_hits(const std::vector &layers_of_hits, float factor) -{ - auto largest_layer = std::max_element(layers_of_hits.begin(), - layers_of_hits.end(), - [](const HitVec &a, const HitVec &b) - { - return a.size() < b.size(); - }); - auto m_max_hits_layer = largest_layer->size(); - - all_hits.reserve(m_n_layers, m_max_hits_layer*factor); -} - - -void EventOfHitsCU::reserve_all_phi_bins(const EventOfHits &event_of_hits, float factor) -{ - auto binnest_layer = std::max_element(event_of_hits.m_layers_of_hits.begin(), - event_of_hits.m_layers_of_hits.end(), - [](const LayerOfHits &a, const LayerOfHits &b) - { - return a.m_nz < b.m_nz; - }); - auto m_max_nz_layer = binnest_layer->m_nz; - - all_phi_bin_infos.reserve(m_n_layers, m_max_nz_layer*Config::m_nphi*factor); -} - - -void EventOfHitsCU::set_layer_hit_views(const EventOfHits& event_of_hits, - float factor) -{ - for (int i = 0; i < m_n_layers; ++i) { - m_layers_of_hits_alloc[i].m_hits.set_view(all_hits.get_ptr_to_part(i), - event_of_hits.m_layers_of_hits[i].m_capacity); - m_layers_of_hits_alloc[i].m_capacity = event_of_hits.m_layers_of_hits[i].m_capacity; - m_layers_of_hits_alloc[i].m_capacity_alloc = - event_of_hits.m_layers_of_hits[i].m_capacity * factor; - } -} - - -void EventOfHitsCU::set_layer_bin_views(const EventOfHits& event_of_hits, - float factor) -{ - for (int i = 0; i < m_n_layers; ++i) { - m_layers_of_hits_alloc[i].m_phi_bin_infos.set_view(all_phi_bin_infos.get_ptr_to_part(i), - event_of_hits.m_layers_of_hits[i].m_nz * Config::m_nphi); - m_layers_of_hits_alloc[i].m_nz = event_of_hits.m_layers_of_hits[i].m_nz; - m_layers_of_hits_alloc[i].m_nz_alloc = event_of_hits.m_layers_of_hits[i].m_nz * factor; - m_layers_of_hits_alloc[i].m_nphi_alloc = Config::m_nphi; - } -} - - -void EventOfHitsCU::reserve_layers(const EventOfHits &event_of_hits, float factor) -{ - if (m_n_layers <= 0) { // is this still needed? - m_n_layers = event_of_hits.m_n_layers; - m_layers_of_hits.reserve(m_n_layers); - m_layers_of_hits_alloc.reserve(m_n_layers); - } else { - assert(m_n_layers == event_of_hits.m_n_layers); - } - reserve_all_hits(event_of_hits, factor); - reserve_all_phi_bins(event_of_hits, factor); - - set_layer_hit_views(event_of_hits, factor); - set_layer_bin_views(event_of_hits, factor); -} - - -void EventOfHitsCU::reserve_layers(const std::vector &layerHits) -{ - m_n_layers = layerHits.size(); - // Allocate GPU array. - // Members's address of array's elements are in the GPU space - m_layers_of_hits.reserve(m_n_layers); - // Allocate CPU array. - // Members's address of array's elements are in the CPU space - // This allows to call allocate for each array's element. - m_layers_of_hits_alloc.reserve(m_n_layers); - - reserve_all_hits(layerHits, 1.f); -} - - -void EventOfHitsCU::prepare_all_host_hits(const EventOfHits& event_of_hits) { - all_host_hits.reserve(all_hits.global_capacity()); - - for (auto i = 0; i < m_n_layers; ++i) { - auto it1 = event_of_hits.m_layers_of_hits[i].m_hits; - auto it2 = event_of_hits.m_layers_of_hits[i].m_hits - + event_of_hits.m_layers_of_hits[i].m_capacity; - std::copy(it1, it2, &all_host_hits[all_hits.local_capacity() * i]); - } -} - - -void EventOfHitsCU::prepare_all_host_bins(const EventOfHits& event_of_hits) { - all_host_bins.reserve(all_phi_bin_infos.global_capacity()); - - for (int i = 0; i < event_of_hits.m_n_layers; i++) { - size_t offset_layer = i * all_phi_bin_infos.local_capacity(); - for (auto j = 0; j < event_of_hits.m_layers_of_hits[i].m_nz; ++j) { - auto it1 = event_of_hits.m_layers_of_hits[i].m_phi_bin_infos[j].begin(); - auto it2 = event_of_hits.m_layers_of_hits[i].m_phi_bin_infos[j].end(); - size_t offset = offset_layer + j * Config::m_nphi; - std::copy(it1, it2, &all_host_bins[offset]); - } - } -} - - -void EventOfHitsCU::copyFromCPU(const EventOfHits& event_of_hits, - const cudaStream_t &stream) { - Tracer tracer("copyEventOfHits", tracer_colors["PaleGreen"]); - - prepare_all_host_hits(event_of_hits); - all_hits.copy_from_cpu(all_host_hits.data(), stream); - - prepare_all_host_bins(event_of_hits); - all_phi_bin_infos.copy_from_cpu((PairIntsCU *)all_host_bins.data(), stream); - - for (int i = 0; i < event_of_hits.m_n_layers; i++) { - m_layers_of_hits_alloc[i].copy_layer_values(event_of_hits.m_layers_of_hits[i]); - } - m_layers_of_hits.resize(m_n_layers); - m_layers_of_hits.copy_from_cpu(m_layers_of_hits_alloc.data(), stream); -} - - -void EventOfHitsCU::copyFromCPU(const std::vector &layerHits, - const cudaStream_t &stream) { - all_host_hits.reserve(all_hits.global_capacity()); - - for (auto i = 0; i < m_n_layers; ++i) { - auto it1 = layerHits[i].begin(); - auto it2 = layerHits[i].end(); - std::copy(it1, it2, &all_host_hits[all_hits.local_capacity() * i]); - } - all_hits.copy_from_cpu(all_host_hits.data(), stream); - - for (int i = 0; i < m_n_layers; ++i) { - m_layers_of_hits_alloc[i].m_hits.set_ptr(all_hits.get_ptr_to_part(i)); - } - m_layers_of_hits.resize(m_n_layers); - m_layers_of_hits.copy_from_cpu(m_layers_of_hits_alloc.data(), stream); -} - - -// ============================================================================ - -void EtaBinOfCandidatesCU::alloc_tracks(const int ntracks) { - m_real_size = ntracks; - m_fill_index = 0; - - cudaMalloc((void**)&m_candidates, sizeof(Track)*m_real_size); - /*cudaCheckError();*/ -} - - -void EtaBinOfCandidatesCU::free_tracks() { - cudaFree(m_candidates); - m_real_size = 0; - m_fill_index = 0; -} - - -void EtaBinOfCandidatesCU::copyFromCPU(const EtaBinOfCandidates &eta_bin, - const cudaStream_t &stream) { - assert (eta_bin.m_fill_index < m_real_size); // or something - m_fill_index = eta_bin.m_fill_index; - - cudaMemcpyAsync(m_candidates, &eta_bin.m_candidates[0], - sizeof(Track)*m_fill_index, cudaMemcpyHostToDevice, stream); - /*cudaCheckError();*/ -} - - -void EtaBinOfCandidatesCU::copyToCPU(EtaBinOfCandidates &eta_bin, - const cudaStream_t &stream) const { - assert (eta_bin.m_fill_index < m_real_size); // or something - - cudaMemcpyAsync(&eta_bin.m_candidates[0], m_candidates, - sizeof(Track)*m_fill_index, cudaMemcpyDeviceToHost, stream); - /*cudaCheckError();*/ -} - -// ============================================================================ - -void EventOfCandidatesCU::allocGPU(const EventOfCandidates &event_of_cands) { - m_n_etabins = Config::nEtaBin; - // Allocate GPU array. - // Members's address of array's elements are in the GPU space - cudaMalloc((void**)&m_etabins_of_candidates, m_n_etabins*sizeof(EtaBinOfCandidatesCU)); - /*cudaCheckError();*/ - // Allocate CPU array. - // Members's address of array's elements are in the CPU space - // This allows to call allocate for each array's element. - m_etabins_of_candidates_alloc = new EtaBinOfCandidatesCU[m_n_etabins]; - for (int i = 0; i < m_n_etabins; ++i) { - const EtaBinOfCandidates& h_etabin = event_of_cands.m_etabins_of_candidates[i]; - m_etabins_of_candidates_alloc[i].alloc_tracks(h_etabin.m_real_size); - } - /*cudaCheckError();*/ -} - - -void EventOfCandidatesCU::deallocGPU() { - for (int i = 0; i < m_n_etabins; ++i) { - /*cudaCheckError();*/ - m_etabins_of_candidates_alloc[i].free_tracks(); - /*cudaCheckError();*/ - } - cudaFree(m_etabins_of_candidates); - /*cudaCheckError();*/ - delete[] m_etabins_of_candidates_alloc; -} - - -void EventOfCandidatesCU::copyFromCPU(const EventOfCandidates& event_of_cands, - const cudaStream_t &stream) { - for (int i = 0; i < m_n_etabins; i++) { - m_etabins_of_candidates_alloc[i].copyFromCPU(event_of_cands.m_etabins_of_candidates[i], stream); - } - cudaCheckError(); - cudaMemcpyAsync(m_etabins_of_candidates, m_etabins_of_candidates_alloc, - m_n_etabins*sizeof(EtaBinOfCandidatesCU), - cudaMemcpyHostToDevice, stream); - cudaCheckError(); -} - - -void EventOfCandidatesCU::copyToCPU(EventOfCandidates& event_of_cands, - const cudaStream_t &stream) const { - for (int i = 0; i < m_n_etabins; i++) { - m_etabins_of_candidates_alloc[i].copyToCPU(event_of_cands.m_etabins_of_candidates[i], stream); - } - /*cudaCheckError();*/ - // We do not need to copy the array of pointers to EventOfCandidatesCU back -} - -// ============================================================================ - -void EtaBinOfCombCandidatesCU::allocate(const int nseed, const int factor) -{ - m_fill_index = 0; - m_nseed = nseed; - - if (m_nseed_alloc < nseed * factor) { - m_nseed_alloc = nseed * factor; - m_real_size = m_nseed_alloc * Config::maxCandsPerSeed; - } - m_candidates.reserve(m_real_size); - m_ntracks_per_seed.reserve(m_nseed_alloc); -} - - -void EtaBinOfCombCandidatesCU::copyFromCPU( - const EtaBinOfCombCandidates& eta_bin, const cudaStream_t& stream) -{ - Tracer("etaBinFromCPU", tracer_colors["Chocolate"]); - - assert (eta_bin.m_fill_index < m_real_size); // or something - m_fill_index = eta_bin.m_fill_index * Config::maxCandsPerSeed; - - std::vector track_per_seed_array (m_nseed); - std::vector tracks_tmp (m_real_size); - - for (auto i = 0; i < eta_bin.m_fill_index; ++i) - { - // That is to be general, because ntracks should be one, for each seed - // when the building is launched. - int ntracks = std::min(Config::maxCandsPerSeed, - static_cast(eta_bin.m_candidates[i].size())); - - auto it1 = eta_bin.m_candidates[i].begin(); - auto it2 = eta_bin.m_candidates[i].end(); - std::move(it1, it2, &tracks_tmp[i*Config::maxCandsPerSeed]); - - track_per_seed_array[i] = ntracks; - } - m_candidates.resize(m_real_size); - m_candidates.copy_from_cpu(tracks_tmp.data(), stream); - - m_ntracks_per_seed.resize(m_nseed); - m_ntracks_per_seed.copy_from_cpu(track_per_seed_array.data(), stream); - - cudaStreamSynchronize(stream); -} - - -void EtaBinOfCombCandidatesCU::copyToCPU( - EtaBinOfCombCandidates& eta_bin, const cudaStream_t& stream) const -{ - Tracer("etaBinToCPU", tracer_colors["Pink"]); - - assert (eta_bin.m_fill_index < m_real_size); // or something - - std::vector ntracks (eta_bin.m_fill_index); - std::vector tracks_tmp (m_real_size); - - m_ntracks_per_seed.copy_to_cpu(ntracks.data(), stream); - m_candidates.copy_to_cpu(tracks_tmp.data(), stream); - cudaStreamSynchronize(stream); - - for (auto i = 0; i < eta_bin.m_fill_index; ++i) - { - auto it1 = &tracks_tmp[i*Config::maxCandsPerSeed]; - auto it2 = &tracks_tmp[(i+1)*Config::maxCandsPerSeed]; - std::move(it1, it2, eta_bin.m_candidates[i].begin()); - eta_bin.m_candidates[i].resize(ntracks[i]); - } -} - -// ============================================================================ - -void EventOfCombCandidatesCU::allocate( - const EventOfCombCandidates &event_of_cands, const float factor) -{ - m_n_etabins = Config::nEtaBin; - m_etabins_of_comb_candidates.reserve_and_resize(m_n_etabins); - m_etabins_of_comb_candidates_alloc.resize(m_n_etabins); - - for (int i = 0; i < m_n_etabins; ++i) { - const EtaBinOfCombCandidates& h_etabin = event_of_cands.m_etabins_of_comb_candidates[i]; - m_etabins_of_comb_candidates_alloc[i].allocate(h_etabin.m_fill_index, factor); - } -} - - -void EventOfCombCandidatesCU::copyFromCPU( - const EventOfCombCandidates &event_of_cands, - const cudaStream_t &stream) -{ - Tracer tracer("eventCandsFromCPU", tracer_colors["Orange"]); - for (int i = 0; i < m_n_etabins; i++) { - m_etabins_of_comb_candidates_alloc[i].copyFromCPU( - event_of_cands.m_etabins_of_comb_candidates[i], stream); - } - m_etabins_of_comb_candidates.resize(m_n_etabins); // useless but reassuring - m_etabins_of_comb_candidates.copy_from_cpu(m_etabins_of_comb_candidates_alloc.data(), stream); -} - - -void EventOfCombCandidatesCU::copyToCPU( - EventOfCombCandidates &event_of_cands, - const cudaStream_t &stream) const -{ - Tracer tracer("eventCandsToCPU", tracer_colors["Red"]); - - for (int i = 0; i < m_n_etabins; i++) { - m_etabins_of_comb_candidates_alloc[i].copyToCPU(event_of_cands.m_etabins_of_comb_candidates[i], stream); - } -} diff --git a/mkFit/HitStructuresCU.h b/mkFit/HitStructuresCU.h deleted file mode 100644 index cd457c66..00000000 --- a/mkFit/HitStructuresCU.h +++ /dev/null @@ -1,211 +0,0 @@ -#ifndef _HIT_STRUCTURES_H_ -#define _HIT_STRUCTURES_H_ - -#include "Config.h" -#include "HitStructures.h" - -#include "device_vector.h" -#include "device_bundle.h" -#include "device_array_view.h" -#include "gpu_utils.h" - -namespace mkfit { - -template -struct PairCU { - T1 first; - T2 second; -}; - -using PairIntsCU = PairCU; - -class LayerOfHitsCU { - public: - DeviceArrayView m_hits; - DeviceArrayView m_phi_bin_infos; - - float m_zmin = 0, m_zmax, m_fz = 0; - int m_nz = 0; - int m_capacity = 0; - - int m_capacity_alloc = 0; - int m_nz_alloc = 0; - int m_nphi_alloc = 0; - - //fixme, these are copies of the ones above, need to merge with a more generic name - float m_rmin, m_rmax, m_fr; - int m_nr = 0; - - // This could be a parameter, layer dependent. - //static constexpr int m_nphi = 1024; - // Testing bin filling - // static constexpr int m_nphi = 16; - static constexpr float m_fphi = Config::m_nphi / Config::TwoPI; - static constexpr int m_phi_mask = 0x3ff; - - // As above - //static constexpr float m_max_dz = 1; - //static constexpr float m_max_dphi = 0.02; - - void copy_layer_values(const LayerOfHits &layer); - -#ifdef __CUDACC__ - __device__ -#endif - int GetZBin(const float z) const { return (z - m_zmin) * m_fz; } - -#ifdef __CUDACC__ - __device__ -#endif - int GetZBinChecked(float z) const { - int zb = GetZBin(z); - if (zb < 0) { - zb = 0; - } else { - if (zb >= m_nz) zb = m_nz - 1; - } - return zb; - } - - // if you don't pass phi in (-pi, +pi), mask away the upper bits using m_phi_mask -#ifdef __CUDACC__ - __device__ -#endif - int GetPhiBin(float phi) const { - return floorf(m_fphi * (phi + Config::PI)); - } -}; - - -class EventOfHitsCU -{ -public: - DeviceVector m_layers_of_hits; - int m_n_layers = 0; - - // The following array is to be able to allocate GPU arrays from - // the CPU and then copy the address of the GPU ptr to the GPU structure - std::vector m_layers_of_hits_alloc; - - EventOfHitsCU() : m_n_layers{} {}; - - void reserve_layers(const EventOfHits &event_of_hits, float factor=1.f); - void reserve_layers(const std::vector &layerHits); // fittest - void copyFromCPU(const EventOfHits& event_of_hits, - const cudaStream_t &stream); - void copyFromCPU(const std::vector &layerHits, - const cudaStream_t &stream); - -private: - void prepare_all_host_hits(const EventOfHits &event_of_hits); - void prepare_all_host_bins(const EventOfHits &event_of_hits); - - void reserve_all_hits(const EventOfHits &event_of_hits, float factor); - void reserve_all_hits(const std::vector &layers_of_hits, float factor); - void reserve_all_phi_bins(const EventOfHits &event_of_hits, float factor); - - void set_layer_hit_views(const EventOfHits& event_of_hits, float factor); - void set_layer_bin_views(const EventOfHits& event_of_hits, float factor); - - // Large 'arrays' to help with data transfers -- Fortran 77's style. - // The whole trick is to flatten the memory layout to minimize the number of transfers - // Where we move host data to be consecutive (with padding) - std::vector all_host_bins; - std::vector all_host_hits; - // Where we store device data. Layers are pointers to parts of the array - DeviceBundle all_hits; - DeviceBundle all_phi_bin_infos; -}; - -// ============================================================================ - -class EtaBinOfCandidatesCU -{ -public: - Track *m_candidates; - - int m_real_size; - int m_fill_index; - - void alloc_tracks(const int ntracks); - void free_tracks(); - - void copyFromCPU(const EtaBinOfCandidates &eta_bin, - const cudaStream_t &stream); - void copyToCPU(EtaBinOfCandidates &eta_bin, - const cudaStream_t &stream) const; -}; - - -class EventOfCandidatesCU -{ -public: - int m_n_etabins; - EtaBinOfCandidatesCU *m_etabins_of_candidates; // device array - - // Host array. For allocation and transfer purposes - EtaBinOfCandidatesCU *m_etabins_of_candidates_alloc; - - EventOfCandidatesCU() : m_n_etabins{}, - m_etabins_of_candidates{nullptr}, - m_etabins_of_candidates_alloc{nullptr} {} - - void allocGPU(const EventOfCandidates &event_of_cands); - void deallocGPU(); - void copyFromCPU(const EventOfCandidates &event_of_cands, - const cudaStream_t &stream=0); - void copyToCPU(EventOfCandidates &event_of_cands, - const cudaStream_t &stream=0) const; -}; - -// ============================================================================ - -class EtaBinOfCombCandidatesCU -{ -public: - // CPU code: std::vector > m_candidates; - // GPU code: Array [maxCandsPerSeed*numSeedPerEtaBin] - // -> We don't actually care about numSeedPerEtaBin, it's all known - // from the CPU side, use EtaBinOfCombCandidates.m_fill_index - // or EtaBinOfCombCandidates.m_real_size - // More trouble to come: We cannot do the same as for EtaBinOfCandidatesCU - // for copying from host mem to dev. mem. The non contiguous host mem - // for m_candidates forces a loop over the seeds. - DeviceVector m_candidates; - DeviceVector m_ntracks_per_seed; // array of m_candidates[i].size() - - int m_real_size = 0; - int m_fill_index = 0; - int m_nseed = 0; - int m_nseed_alloc = 0; - - void allocate(const int nseed, const int factor=1.f); - void copyFromCPU(const EtaBinOfCombCandidates& eta_bin, - const cudaStream_t& stream); - void copyToCPU(EtaBinOfCombCandidates& eta_bin, - const cudaStream_t& stream) const; -}; - - -// TODO: The comb and non-comb version are quite similar: refactor? -class EventOfCombCandidatesCU -{ -public: - DeviceVector m_etabins_of_comb_candidates; - int m_n_etabins = 0; - - // Host array. For allocation and transfer purposes - std::vector m_etabins_of_comb_candidates_alloc; - - EventOfCombCandidatesCU() : m_n_etabins{} {}; - - void allocate(const EventOfCombCandidates &event_of_cands, const float factor=1.f); - void copyFromCPU(const EventOfCombCandidates &event_of_cands, - const cudaStream_t &stream); - void copyToCPU(EventOfCombCandidates &event_of_cands, - const cudaStream_t &stream) const; -}; - -// end namespace mkfit -#endif // _HIT_STRUCTURES_H_ - diff --git a/mkFit/KalmanUtilsMPlex.h b/mkFit/KalmanUtilsMPlex.h index 30301127..748f8bb5 100644 --- a/mkFit/KalmanUtilsMPlex.h +++ b/mkFit/KalmanUtilsMPlex.h @@ -4,10 +4,6 @@ #include "Track.h" #include "Matrix.h" -#ifdef USE_CUDA -#include "FitterCU.h" -#endif - namespace mkfit { //------------------------------------------------------------------------------ diff --git a/mkFit/KalmanUtilsMPlex.icc b/mkFit/KalmanUtilsMPlex.icc index b9ccd70f..87a324ad 100644 --- a/mkFit/KalmanUtilsMPlex.icc +++ b/mkFit/KalmanUtilsMPlex.icc @@ -3,9 +3,6 @@ /////////////////////////////////////////////////////////////////////////////// template -#ifdef __CUDACC__ -__device__ -#endif static inline void KHMult_imp(const TfLH& a, const TfQF1& b00, const TfQF2& b01, TfLL& c, const int nmin, const int nmax) { @@ -56,21 +53,13 @@ static inline void KHMult_imp(const TfLH& a, const TfQF1& b00, const TfQF2& b01 /////////////////////////////////////////////////////////////////////////////// template -#ifdef __CUDACC__ -__device__ -#endif static inline void ConvertToCCS_imp(const TfLV1& a, TfLV2& b, TfLL& c, const int nmin, const int nmax) { #pragma omp simd for (int n = nmin; n < nmax; ++n) { - // TODO Check why getHypot and getHypot_fn have to be different -#ifdef __CUDACC__ - const float pt = getHypot_fn(a(n, 0, 3), a(n, 0, 4)); -#else const float pt = getHypot(a(n, 0, 3), a(n, 0, 4)); -#endif const float p2 = pt*pt + a(n, 0, 5)*a(n, 0, 5); // b(n, 0, 0) = a(n, 0, 0); @@ -124,9 +113,6 @@ static inline void ConvertToCCS_imp(const TfLV1& a, TfLV2& b, TfLL& c, /////////////////////////////////////////////////////////////////////////////// template -#ifdef __CUDACC__ -__device__ -#endif static inline void ConvertToCartesian_imp(const TfLV1& a, TfLV2& b, TfLL& c, const int nmin, const int nmax) { @@ -189,9 +175,6 @@ static inline void ConvertToCartesian_imp(const TfLV1& a, TfLV2& b, TfLL& c, /////////////////////////////////////////////////////////////////////////////// template -#ifdef __CUDACC__ -__device__ -#endif static inline void MultResidualsAdd_imp(const TfLH& a, const TfLV1& b, const Tf2V& c, TfLV2& d, const int nmin, const int nmax) @@ -213,9 +196,6 @@ static inline void MultResidualsAdd_imp(const TfLH& a, const TfLV1& b, /////////////////////////////////////////////////////////////////////////////// template -#ifdef __CUDACC_ -__device__ -#endif static inline void AddIntoUpperLeft3x3_imp(const TfLS& A, const TfHS1& B, TfHS2& C, const int aN, const int bN, const int cN, const int nmin, const int nmax) @@ -228,9 +208,6 @@ static inline void AddIntoUpperLeft3x3_imp(const TfLS& A, const TfHS1& B, TfHS2& /////////////////////////////////////////////////////////////////////////////// template -#ifdef __CUDACC__ -__device__ -#endif static inline void RotateResidulsOnTangentPlane_impl(const TfQF1& r00, const TfQF2& r01, const TfHV& a, Tf2V& b, const int nmin, const int nmax) { diff --git a/mkFit/Makefile b/mkFit/Makefile index 8d607316..87b48808 100644 --- a/mkFit/Makefile +++ b/mkFit/Makefile @@ -51,7 +51,6 @@ echo: @echo "CXXFLAGS = ${CXXFLAGS}" @echo "LDFLAGS = ${LDFLAGS}" @echo "EXES = ${EXES}" - @echo "CU_OBJ = ${CU_OBJS}" ################################################################ @@ -62,30 +61,7 @@ MKFHDRS := $(wildcard *.h) MKFOBJS := $(MKFSRCS:.cc=.o) MKFDEPS := $(MKFSRCS:.cc=.d) -ifdef USE_CUDA -CU_SRCS := $(wildcard *.cu) -CU_OBJS := $(CU_SRCS:.cu=.o) - -LDFLAGS_CU := -lcudart -LDFLAGS_CU += -lnvToolsExt - -# To share __device__ function across several source files -# 1) compile with --device-c (works as -c should be) -# 2) Create some kind of dictionary with all the .cu.o -# 3) The dictionary AND the original .o files should be used -# -- Works only for CUDA_VERSION >= 5 -# TODO: Clean the "-I.. -std=c++11" -${CU_OBJS}: %.o: %.cu - ${NV} --device-c -o $@ $< -I.. -std=c++11 -DUSE_MATRIPLEX -Wno-deprecated-gpu-targets -lineinfo - -# -g -G - -CU_LINK := cu_link.o -${CU_LINK}: ${CU_OBJS} - ${NV} --device-link $^ --output-file $@ -endif - -ALLOBJS := ${MKFOBJS} ${CU_OBJS} ${CU_LINK} +ALLOBJS := ${MKFOBJS} LIBOBJS := $(filter-out mkFit.o, ${ALLOBJS}) ${MKFDEPS}: auto-genmplex @@ -95,10 +71,10 @@ include ${MKFDEPS} endif mkFit: ${ALLOBJS} - ${CXX} ${CXXFLAGS} ${VEC_HOST} ${LDFLAGS} ${ALLOBJS} -o $@ ${LDFLAGS_HOST} ${LDFLAGS_CU} -L../lib -lMicCore -Wl,-rpath,../lib,-rpath,./lib + ${CXX} ${CXXFLAGS} ${VEC_HOST} ${LDFLAGS} ${ALLOBJS} -o $@ ${LDFLAGS_HOST} -L../lib -lMicCore -Wl,-rpath,../lib,-rpath,./lib ${LIB_MKFIT}: ${LIBOBJS} - ${CXX} ${CXXFLAGS} ${VEC_HOST} ${LIBOBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS_CU} ${LDFLAGS} + ${CXX} ${CXXFLAGS} ${VEC_HOST} ${LIBOBJS} -shared -o $@ ${LDFLAGS_HOST} ${LDFLAGS} ifdef WITH_ROOT fittestMPlex.o : CPPFLAGS += $(shell root-config --cflags) @@ -110,11 +86,6 @@ TTree.h: echo "Using dummy rule for TTree.h" endif -ifdef CUBROOT -cub/util_debug.cuh: - echo "Using dummy rule for cub/util_debug.cuh" -endif - ${MKFOBJS}: %.o: %.cc %.d ${CXX} ${CPPFLAGS} ${CXXFLAGS} ${VEC_HOST} -c -o $@ $< diff --git a/mkFit/MkBuilder.h b/mkFit/MkBuilder.h index 5339c638..98fdc32e 100644 --- a/mkFit/MkBuilder.h +++ b/mkFit/MkBuilder.h @@ -210,10 +210,6 @@ class MkBuilder void BackwardFit(); void fit_cands(MkFinder *mkfndr, int start_cand, int end_cand, int region); -#ifdef USE_CUDA - const Event* get_event() const { return m_event; } - const EventOfHits& get_event_of_hits() const { return m_event_of_hits; } -#endif }; } // end namespace mkfit diff --git a/mkFit/MkFitter.cc b/mkFit/MkFitter.cc index 3d4c14bd..a50be1ab 100644 --- a/mkFit/MkFitter.cc +++ b/mkFit/MkFitter.cc @@ -4,10 +4,6 @@ #include "ConformalUtilsMPlex.h" #include "MatriplexPackers.h" -#ifdef USE_CUDA -//#include "FitterCU.h" -#endif - //#define DEBUG #include "Debug.h" @@ -68,15 +64,6 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, int itrack = 0; -// FIXME: uncomment when track building is ported to GPU. -//#ifdef USE_CUDA - // This openmp loop brings some performances when using - // a single thread to fit all events. - // However, it is more advantageous to use the threads to - // parallelize over Events. -// omp_set_num_threads(Config::numThreadsReorg); -//#pragma omp parallel for private(itrack) -//#endif for (int i = beg; i < end; ++i, ++itrack) { const Track &trk = tracks[i]; @@ -89,11 +76,6 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, Label(itrack, 0, 0) = trk.label(); // CopyIn seems fast enough, but indirections are quite slow. -// For GPU computations, it has been moved in between kernels -// in an attempt to overlap CPU and GPU computations. -// FIXME: uncomment when track building is ported to GPU. -#if 1 -//#ifndef USE_CUDA for (int hi = 0; hi < Nhits; ++hi) { HoTArr[hi](itrack, 0, 0) = trk.getHitOnTrack(hi); @@ -105,7 +87,6 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, msErr[hi].CopyIn(itrack, hit.errArray()); msPar[hi].CopyIn(itrack, hit.posArray()); } -#endif } } @@ -119,14 +100,7 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, // assert(end - beg == NN); int itrack; -//#ifdef USE_CUDA - // This openmp loop brings some performances when using - // a single thread to fit all events. - // However, it is more advantageous to use the threads to - // parallelize over Events. -// omp_set_num_threads(Config::numThreadsReorg); -//#pragma omp parallel for private(itrack) -//#endif + for (int i = beg; i < end; ++i) { itrack = i - beg; const Track &trk = tracks[i]; @@ -140,10 +114,6 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, Chi2(itrack, 0, 0) = trk.chi2(); // CopyIn seems fast enough, but indirections are quite slow. -// For GPU computations, it has been moved in between kernels -// in an attempt to overlap CPU and GPU computations. -//#ifndef USE_CUDA -#if 1 for (int hi = 0; hi < Nhits; ++hi) { const int hidx = trk.getHitIdx(hi); @@ -155,7 +125,6 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, HoTArr[hi](itrack, 0, 0) = trk.getHitOnTrack(hi); } -#endif } } @@ -170,14 +139,6 @@ void MkFitter::SlurpInTracksAndHits(const std::vector& tracks, MatriplexTrackPacker mtp(tracks[beg]); -//#ifdef USE_CUDA - // This openmp loop brings some performances when using - // a single thread to fit all events. - // However, it is more advantageous to use the threads to - // parallelize over Events. -// omp_set_num_threads(Config::numThreadsReorg); -//#pragma omp parallel for private(itrack) -//#endif for (int i = beg; i < end; ++i) { int itrack = i - beg; @@ -194,11 +155,6 @@ void MkFitter::SlurpInTracksAndHits(const std::vector& tracks, mtp.Pack(Err[iC], Par[iC]); // CopyIn seems fast enough, but indirections are quite slow. -// For GPU computations, it has been moved in between kernels -// in an attempt to overlap CPU and GPU computations. -//#ifndef USE_CUDA -#if 1 - for (int hi = 0; hi < Nhits; ++hi) { MatriplexHitPacker mhp(layerHits[hi][0]); @@ -215,7 +171,6 @@ void MkFitter::SlurpInTracksAndHits(const std::vector& tracks, mhp.Pack(msErr[hi], msPar[hi]); } -#endif } void MkFitter::InputTracksAndHitIdx(const std::vector& tracks, @@ -310,10 +265,6 @@ void MkFitter::InputSeedsTracksAndHits(const std::vector& seeds, const Track &trk = tracks[see.label()]; // CopyIn seems fast enough, but indirections are quite slow. -// For GPU computations, it has been moved in between kernels -// in an attempt to overlap CPU and GPU computations. -//#ifndef USE_CUDA -#if 1 for (int hi = 0; hi < Nhits; ++hi) { HoTArr[hi](itrack, 0, 0) = trk.getHitOnTrack(hi); @@ -325,7 +276,6 @@ void MkFitter::InputSeedsTracksAndHits(const std::vector& seeds, msErr[hi].CopyIn(itrack, hit.errArray()); msPar[hi].CopyIn(itrack, hit.posArray()); } -#endif } } diff --git a/mkFit/MkFitter.h b/mkFit/MkFitter.h index e234bb3d..80643a2b 100644 --- a/mkFit/MkFitter.h +++ b/mkFit/MkFitter.h @@ -8,11 +8,6 @@ #include "HitStructures.h" -#if USE_CUDA -#include "FitterCU.h" -#include "HitStructuresCU.h" -#endif - //#define DEBUG 1 //#define USE_BOHS diff --git a/mkFit/PropagationMPlex.icc b/mkFit/PropagationMPlex.icc index 5571df88..b7ac5f9b 100644 --- a/mkFit/PropagationMPlex.icc +++ b/mkFit/PropagationMPlex.icc @@ -3,9 +3,6 @@ /////////////////////////////////////////////////////////////////////////////// template -#ifdef __CUDACC__ -__device__ -#endif static inline void helixAtRFromIterativeCCS_impl(const Tf& __restrict__ inPar, const Ti& __restrict__ inChg, const Tf11& __restrict__ msRad, diff --git a/mkFit/accessors_cu.h b/mkFit/accessors_cu.h deleted file mode 100644 index f2bb9d9b..00000000 --- a/mkFit/accessors_cu.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef ACCESSORS_CU_H -#define ACCESSORS_CU_H - -template <> -__device__ float* SVector3::ArrayCU() { - return fArray; -} - -template <> -__device__ float* SVector6::ArrayCU() { - return fArray; -} - -template <> -__device__ float* ROOT::Math::MatRepSym::ArrayCU() { - return fArray; -} - -template <> -__device__ float* SMatrixSym66::ArrayCU() { - return fRep.ArrayCU(); -} - -__device__ float *Hit::posArrayCU() { - return state_.pos_.ArrayCU(); -} - -__device__ float *Hit::errArrayCU() { - return state_.err_.ArrayCU(); -} - -__device__ float *Track::posArrayCU() { - return state_.parameters.ArrayCU(); -} - -__device__ float *Track::errArrayCU() { - return state_.errors.ArrayCU(); -} - -#endif /* ifndef ACCESSORS_CU_H */ diff --git a/mkFit/array_algorithms_cu.h b/mkFit/array_algorithms_cu.h deleted file mode 100644 index 68dde020..00000000 --- a/mkFit/array_algorithms_cu.h +++ /dev/null @@ -1,86 +0,0 @@ -#ifndef ARRAY_ALGORITHMS_CU_H -#define ARRAY_ALGORITHMS_CU_H - -#include -//#include "atomic_utils.h" -#include "gpu_utils.h" - -namespace mkfit { - -template -__device__ void reduceMax_fn(const T *d_in, const int in_size, T *d_max) { - // Specialize BlockReduce type for our thread block - typedef cub::BlockReduce BlockReduceT; - // Shared memory - __shared__ typename BlockReduceT::TempStorage temp_storage; - - for (int i = threadIdx.x + blockIdx.x*blockDim.x; - i < in_size; - i += blockDim.x * gridDim.x) { - // Per-thread tile data - T data[ITEMS_PER_THREAD]; - int block_offset = i - threadIdx.x; - cub::LoadDirectStriped(threadIdx.x, - d_in + block_offset, //blockIdx.x*blockDim.x, - data); - // Compute sum for a single thread block - T aggregate = BlockReduceT(temp_storage).Reduce(data, cub::Max()); - - // FIXME: Is reduction over block enough, or do we need device-wise reduction - // CPU code reduces on NN (typically 8 or 16) values. So block-wise should - // be good enough. - // Device-wise reductions with atomics are performance killers - //if (threadIdx.x == 0) { - //AtomicMax(d_max, aggregate); - //} - *d_max = aggregate; - } -} - -template -__global__ void reduceMax_kernel(T *d_in, int in_size, T *d_max) { - reduceMax_fn(d_in, in_size, d_max); -} - -template -void max_element_wrapper(T *d_v, int num_elems, T *d_max) { - dim3 block (BLOCK_THREADS, 1, 1); - dim3 grid (std::min(max_blocks_x, (num_elems-1)/BLOCK_THREADS + 1), 1, 1) ; - reduceMax_kernel - <<< grid, block >>> - (d_v, num_elems, d_max); -} - -template -__device__ void reduceMaxPartial_fn(const T *d_in, const int start_idx, - const int in_size, T *d_max) { - // Specialize BlockReduce type for our thread block - typedef cub::BlockReduce BlockReduceT; - // Shared memory - __shared__ typename BlockReduceT::TempStorage temp_storage; - - int i = threadIdx.x; - if (i < in_size) { - // Per-thread tile data - T data[ITEMS_PER_THREAD]; - cub::LoadDirectStriped(threadIdx.x, - d_in + start_idx, - data); - // Compute sum for a single thread block - T aggregate = BlockReduceT(temp_storage).Reduce(data, cub::Max()); - *d_max = aggregate; - } -} -} // end namespace mkfit -#endif /* ifndef ARRAY_ALGORITHMS_CU_H */ diff --git a/mkFit/atomic_utils.h b/mkFit/atomic_utils.h deleted file mode 100644 index c5f82d99..00000000 --- a/mkFit/atomic_utils.h +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef ATOMIC_UTILS_H -#define ATOMIC_UTILS_H - -/* Copyright (c) 2012, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -// AtomicMAX code from: -// https://github.com/parallel-forall/code-samples/blob/master/posts/cuda-aware-mpi-example/src/Device.cu - -#define uint64 unsigned long long - - template -__device__ void AtomicMax(T * const address, const T value) -{ - atomicMax(address, value); -} - -/** - * @brief Compute the maximum of 2 single-precision floating point values using an atomic operation - * - * @param[in]addressThe address of the reference value which might get updated with the maximum - * @param[in]valueThe value that is compared to the reference in order to determine the maximum - */ - template <> -__device__ void AtomicMax(float * const address, const float value) -{ - if (* address >= value) - { - return; - } - - int * const address_as_i = (int *)address; - int old = * address_as_i, assumed; - - do - { - assumed = old; - if (__int_as_float(assumed) >= value) - { - break; - } - // The value stored at address_as_i might have changed since it has been loaded - // into old and into assumed. - old = atomicCAS(address_as_i, assumed, __float_as_int(value)); - } while (assumed != old); // the other threads did not change address_as_i, so the - // atomicCAS return action makes sense. -} - -/** - * @brief Compute the maximum of 2 double-precision floating point values using an atomic operation - * - * @param[in]addressThe address of the reference value which might get updated with the maximum - * @param[in]valueThe value that is compared to the reference in order to determine the maximum - */ - template <> -__device__ void AtomicMax(double * const address, const double value) -{ - if (* address >= value) - { - return; - } - - uint64 * const address_as_i = (uint64 *)address; - uint64 old = * address_as_i, assumed; - - do - { - assumed = old; - if (__longlong_as_double(assumed) >= value) - { - break; - } - - old = atomicCAS(address_as_i, assumed, __double_as_longlong(value)); - } while (assumed != old); -} - -#endif /* ifndef ATOMIC_UTILS_H */ diff --git a/mkFit/best_hit_kernels.cu b/mkFit/best_hit_kernels.cu deleted file mode 100644 index 9d98aa6b..00000000 --- a/mkFit/best_hit_kernels.cu +++ /dev/null @@ -1,298 +0,0 @@ -#include "best_hit_kernels.h" - -#include "computeChi2_kernels.h" -#include "index_selection_kernels.h" -#include "reorganize_gplex.h" -#include "array_algorithms_cu.h" -#include "kalmanUpdater_kernels.h" -#include "propagation_kernels.h" - -#include -#include -#include -#include - -#define BLOCK_SIZE_X 256 - - -__device__ void getNewBestHitChi2_fn( - const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, - const float *outChi2, float &minChi2, - int &bestHit, const int hit_cnt, const int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - - if (itrack < N) { - if (hit_cnt < XHitSize[itrack]) { - float chi2 = fabs(outChi2[itrack]); - if (chi2 < minChi2) { - minChi2 = chi2; - bestHit = XHitArr(itrack, hit_cnt, 0); - } - } - } -} - - -__global__ void getNewBestHitChi2_kernel( - const GPlexQI XHitSize, const GPlexHitIdx XHitArr, - const float *outChi2, float *minChi2, - int *bestHit, const int hit_cnt, const int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - if (itrack < N) { - getNewBestHitChi2_fn(XHitSize, XHitArr, outChi2, - minChi2[itrack], bestHit[itrack], hit_cnt, N); - } -} - - -void getNewBestHitChi2_wrapper(const cudaStream_t &stream, - const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, - const GPlexQF &outChi2, float *minChi2, int *bestHit, - const int hit_cnt, const int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - max_blocks_x); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - getNewBestHitChi2_kernel <<< grid, block, 0, stream >>> - (XHitSize, XHitArr, outChi2.ptr, minChi2, bestHit, hit_cnt, N); -} - - -void fill_array_cu(float *array, const int size, const float value) { - thrust::device_ptr d_ptr(array); - thrust::fill(d_ptr, d_ptr + size, value); -} - - -__device__ void updateTracksWithBestHit_fn(Hit *hits, - const float minChi2, const int bestHit, - GPlexHS &msErr, GPlexHV &msPar, const GPlexLV &propPar, - GPlexQF &Chi2, GPlexQI &HitsIdx, const int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - if (itrack < N) { - if (bestHit >= 0) - { - Hit &hit = hits[ bestHit ]; - const float &chi2_local = minChi2; - - for (int i = 0; i < msErr.kSize; ++i) { - msErr[itrack + i*msErr.stride] = hit.errArrayCU()[i]; - } - for (int i = 0; i < msPar.kSize; ++i) { - msPar(itrack, i, 0) = hit.posArrayCU()[i]; - } - Chi2[itrack] += chi2_local; - HitsIdx[itrack] = bestHit; - } - else - { - msErr[itrack+ 0*msErr.stride] = 666; - msErr[itrack+ 1*msErr.stride] = 0; - msErr[itrack+ 2*msErr.stride] = 666; - msErr[itrack+ 3*msErr.stride] = 0; - msErr[itrack+ 4*msErr.stride] = 0; - msErr[itrack+ 5*msErr.stride] = 666; - - for (int i = 0; i < msPar.kSize; ++i) { - msPar(itrack, i, 0) = propPar(itrack, i, 0); - } - HitsIdx[itrack] = -1; - // Don't update chi2 - } - } -} - - -__global__ void updateTracksWithBestHit_kernel(Hit *hits, - const float *minChi2, const int *bestHit, - GPlexHS msErr, GPlexHV msPar, const GPlexLV propPar, - GPlexQF Chi2, GPlexQI HitsIdx, const int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - if (itrack < N) { - updateTracksWithBestHit_fn - (hits, minChi2[itrack], bestHit[itrack], - msErr, msPar, propPar, Chi2, HitsIdx, N); - } -} - - -void updateTracksWithBestHit_wrapper(const cudaStream_t &stream, - LayerOfHitsCU &layer, const float *minChi2, const int *best_hit, - GPlexHS &msErr, GPlexHV &msPar, const GPlexLV &propPar, - GPlexQF &Chi2, GPlexQI &HitsIdx, const int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - max_blocks_x); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - updateTracksWithBestHit_kernel <<< grid, block, 0, stream >>> - (layer.m_hits.data(), minChi2, best_hit, msErr, msPar, propPar, Chi2, HitsIdx, N); -} - - -int getMaxNumHits_wrapper(const GPlexQI d_XHitSize, const int N) { - thrust::device_ptr d_ptr(d_XHitSize.ptr); - int maxSize= thrust::reduce(d_ptr, d_ptr + N, -1, thrust::maximum()); - maxSize = std::min(maxSize, Config::maxHitsConsidered); - - return maxSize; -} - - -__device__ void bestHit_fn( - Hit *hits, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, - const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, - const GPlexLV &propPar, GPlexQF &outChi2, - GPlexQF &Chi2, GPlexQI &HitsIdx, - const int maxSize, const int itrack, const int N) { - - /*int itrack = threadIdx.x + blockDim.x*blockIdx.x;*/ - int bestHit_reg = -1; - float minChi2_reg = Config::chi2Cut; - - if (itrack < N) - HitsIdx[itrack] = 0; - - for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt) - { - HitToMs_fn(msErr, msPar, hits, XHitSize, XHitArr, HitsIdx, hit_cnt, itrack, N); -#if 0 - // TODO: add CMSGeom - if (Config::useCMSGeom) { - //propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); - throw std::runtime_error("useCMSGeom not implemented yet for GPU"); - } else {} -#endif - computeChi2_fn(propErr, msErr, msPar, propPar, outChi2, itrack, N); - getNewBestHitChi2_fn(XHitSize, XHitArr, outChi2.ptr, minChi2_reg, bestHit_reg, hit_cnt, N); - } - updateTracksWithBestHit_fn - (hits, - minChi2_reg, bestHit_reg, - msErr, msPar, propPar, - Chi2, HitsIdx, - N); -} - - -__global__ void bestHit_kernel( - Hit *hits, const GPlexQI XHitSize, const GPlexHitIdx XHitArr, - const GPlexLS propErr, GPlexHS msErr, GPlexHV msPar, - const GPlexLV propPar, GPlexQF outChi2, - GPlexQF Chi2, GPlexQI HitsIdx, - const int maxSize, const int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - bestHit_fn(hits, XHitSize, XHitArr, - propErr, msErr, msPar, - propPar, outChi2, - Chi2, HitsIdx, - maxSize, itrack, N); -} - - -void bestHit_wrapper(const cudaStream_t &stream, - LayerOfHitsCU &layer, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, - const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, - const GPlexLV &propPar, GPlexQF &outChi2, - GPlexQF &Chi2, GPlexQI &HitsIdx, - const int maxSize, const int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - max_blocks_x); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - - bestHit_kernel <<< grid, block, 0, stream >>> - (layer.m_hits.data(), XHitSize, XHitArr, - propErr, msErr, msPar, propPar, outChi2, - Chi2, HitsIdx, - maxSize, N); -} - - -template -__global__ void findBestHit_kernel(LayerOfHitsCU *layers, - EtaBinOfCandidatesCU *etabin_of_cands, - GPlexQI XHitSize, GPlexHitIdx XHitArr, - GPlexLS Err_iP, GPlexLV Par_iP, - GPlexHS *msErr_arr, GPlexHV *msPar_arr, - GPlexLS Err_iC, GPlexLV Par_iC, - GPlexQF outChi2, - GPlexQF Chi2, GPlexQI *HitsIdx_arr, - GPlexQI inChg, GPlexQI Label, GeometryCU geom, - int *maxSize, int gplex_size, - const int nlayers_per_seed) { - for (int ebin = 0; ebin != Config::nEtaBin; ++ebin) { - for (int beg = 0; beg < etabin_of_cands[ebin].m_fill_index; beg += gplex_size) { - int end = min(beg + gplex_size, etabin_of_cands[ebin].m_fill_index); - int N = end - beg; - - int tidx = threadIdx.x + blockDim.x*blockIdx.x; - int itrack = beg + tidx; - - if (itrack < end) { - - InputTracksCU_fn(etabin_of_cands[ebin].m_candidates, Err_iP, Par_iP, - inChg, Chi2, Label, HitsIdx_arr, beg, end, tidx, N); - - for (int ilay = nlayers_per_seed; ilay < Config::nLayers; ++ilay) - { - int hit_idx = ilay; - GPlexHS &msErr = msErr_arr[hit_idx]; - GPlexHV &msPar = msPar_arr[hit_idx]; - GPlexQI &HitsIdx = HitsIdx_arr[hit_idx]; - - float *radii = geom.radii; - - LayerOfHitsCU &layer = layers[ilay]; - - int maxSize_block; - selectHitIndices_fn(layer, Err_iP, Par_iP, XHitSize, XHitArr, tidx, N); - reduceMax_fn - (XHitSize.ptr, XHitSize.N, &maxSize_block); - bestHit_fn(layer.m_hits.data(), XHitSize, XHitArr, - Err_iP, msErr, msPar, Par_iP, outChi2, - Chi2, HitsIdx, maxSize_block, tidx, N); - kalmanUpdate_fn( Err_iP, msErr, Par_iP, msPar, Par_iC, Err_iC, tidx, N); - if (ilay+1 < Config::nLayers) { - float radius = radii[ilay+1]; - propagationForBuilding_fn(Err_iC, Par_iC, inChg, radius, Err_iP, Par_iP, tidx, N); - } - } - OutputTracksCU_fn(etabin_of_cands[ebin].m_candidates, - Err_iP, Par_iP, inChg, Chi2, Label, HitsIdx_arr, beg, end, tidx, N); - } - } - } -} - - -void findBestHit_wrapper(cudaStream_t &stream, - LayerOfHitsCU *layers, - EventOfCandidatesCU &event_of_cands_cu, - GPlexQI &XHitSize, GPlexHitIdx &XHitArr, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexHS *msErr, GPlexHV *msPar, - GPlexLS &Err_iC, GPlexLV &Par_iC, - GPlexQF &outChi2, - GPlexQF &Chi2, GPlexQI *HitsIdx, - GPlexQI &inChg, GPlexQI &Label, - GeometryCU &geom, - int *maxSize, int N) { - int gridx = (N-1)/BLOCK_SIZE_X + 1; - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - - if (gridx > max_blocks_x) { - throw std::runtime_error("The matriplex size should be chosen such " - "that gplex_size*BLOCK_SIZE_X <= max_blocks_x"); - } - // The loop over tracks is taking care of the case where there is more tracks - // than available threads. - // We should actually not throw an exception, GPlex should be allocated with - // a smaller size in MkFitter. - - findBestHit_kernel <<< grid, block, 0, stream >>> - (layers, event_of_cands_cu.m_etabins_of_candidates, - XHitSize, XHitArr, Err_iP, Par_iP, msErr, msPar, - Err_iC, Par_iC, outChi2, Chi2, HitsIdx, inChg, Label, geom, maxSize, N, - Config::nlayers_per_seed); -} diff --git a/mkFit/best_hit_kernels.h b/mkFit/best_hit_kernels.h deleted file mode 100644 index 2a6fb16d..00000000 --- a/mkFit/best_hit_kernels.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef BEST_HIT_KERNELS_H -#define BEST_HIT_KERNELS_H - - -#include "GPlex.h" -#include "HitStructuresCU.h" -#include "GeometryCU.h" - -namespace mkfit { - -void getNewBestHitChi2_wrapper(const cudaStream_t &stream, - const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, - const GPlexQF &outChi2, float *minChi2, int *bestHit, - const int hit_cnt, const int N); - - -void fill_array_cu(float *array, const int size, const float value); - - -void updateTracksWithBestHit_wrapper(const cudaStream_t &stream, - LayerOfHitsCU &, const float *minChi2, const int *best_hit, - GPlexHS &msErr, GPlexHV &msPar, const GPlexLV &propPar, - GPlexQF &Chi2, GPlexQI& HitsIdx, const int N); - -int getMaxNumHits_wrapper(const GPlexQI d_XHitSize, const int N); - - -void bestHit_wrapper(const cudaStream_t &stream, - LayerOfHitsCU &layer, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, - const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, - const GPlexLV &propPar, GPlexQF &outChi2, - GPlexQF &Chi2, GPlexQI& HitsIdx, - const int maxSize, const int N); - - -void findBestHit_wrapper(cudaStream_t &stream, - LayerOfHitsCU *layers, - EventOfCandidatesCU &event_of_cands_cu, - GPlexQI &XHitSize, GPlexHitIdx &XHitArr, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexHS *msErr, GPlexHV *msPar, - GPlexLS &Err_iC, GPlexLV &Par_iC, - GPlexQF &outChi2, - GPlexQF &Chi2, GPlexQI *HitsIdx, - GPlexQI &inChg, GPlexQI &Label, - GeometryCU &geom, - int *maxSize, int N); - - -} // end namespace mkfit -#endif /* ifndef BEST_HIT_KERNELS_H */ diff --git a/mkFit/build_tracks_kernels.cu b/mkFit/build_tracks_kernels.cu deleted file mode 100644 index e0ef847b..00000000 --- a/mkFit/build_tracks_kernels.cu +++ /dev/null @@ -1,203 +0,0 @@ -#include "build_tracks_kernels.h" - -#include "GPlex.h" -#include "reorganize_gplex.h" -#include "kalmanUpdater_kernels.h" -#include "computeChi2_kernels.h" - -constexpr int BLOCK_SIZE_X = 16; - - -__device__ void getHitFromLayer_fn(LayerOfHitsCU& layer_of_hits, - GPlexQI& HitsIdx, GPlexHV& msPar, GPlexHS& msErr, int itrack_plex, int N) -{ - if (itrack_plex < N) - { - int hit_idx = HitsIdx[itrack_plex]; - if (hit_idx >= 0) - { - Hit &hit = layer_of_hits.m_hits[hit_idx]; - - GetHitErr(msErr, (char *)hit.errArrayCU(), 0, N); - GetHitPar(msPar, (char *)hit.posArrayCU(), 0, N); - } - } -} - -__global__ void getHitFromLayer_kernel(LayerOfHitsCU& layer_of_hits, - GPlexQI HitsIdx, GPlexHV msPar, GPlexHS msErr, int N) -{ - int itrack_plex = threadIdx.x + blockDim.x * blockIdx.x; - getHitFromLayer_fn(layer_of_hits, HitsIdx, msPar, msErr, itrack_plex, N); -} - -void getHitFromLayer_wrappper( const cudaStream_t& stream, - LayerOfHitsCU& layer_cu, GPlexQI& HitsIdx, - GPlexHV& msPar, GPlexHS& msErr, int N) -{ - int gridx = (N-1)/BLOCK_SIZE_X + 1; - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - - getHitFromLayer_kernel <<< grid, block, 0, stream >>> - (layer_cu, HitsIdx, msPar, msErr, N); -} - - -__device__ void updateMissingHits_fn(GPlexQI& HitsIdx, - GPlexLV& Par_iP, GPlexLS& Err_iP, - GPlexLV& Par_iC, GPlexLS& Err_iC, int i, int N) -{ - if (i < N) - { - if (HitsIdx[i] < 0) - { - for (auto j = 0; j < Err_iC.kSize; ++j) { - Err_iC[i + j * Err_iC.stride] = Err_iP[i + j * Err_iP.stride]; - } - for (auto j = 0; j < Par_iC.kSize; ++j) { - Par_iC[i + j * Par_iC.stride] = Par_iP[i + j * Par_iP.stride]; - } - } - } -} - - -__global__ void updateMissingHits_kernel(GPlexQI HitsIdx, - GPlexLV Par_iP, GPlexLS Err_iP, - GPlexLV Par_iC, GPlexLS Err_iC, int N) -{ - int i = threadIdx.x + blockDim.x * blockIdx.x; - updateMissingHits_fn(HitsIdx, Par_iP, Err_iP, Par_iC, Err_iC, i, N); -} - - -void UpdateMissingHits_wrapper( - const cudaStream_t& stream, GPlexQI& HitsIdx, - GPlexLV& Par_iP, GPlexLS& Err_iP, - GPlexLV& Par_iC, GPlexLS& Err_iC, - int N) -{ - int gridx = (N-1)/BLOCK_SIZE_X + 1; - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - - updateMissingHits_kernel <<< grid, block, 0, stream >>> - (HitsIdx, Par_iP, Err_iP, Par_iC, Err_iC, N); -} - - -__device__ -void UpdateWithLastHit_fn( - LayerOfHitsCU& layer_of_hits, GPlexQI& HitsIdx, - GPlexHV& msPar, GPlexHS& msErr, - GPlexLV& Par_iP, GPlexLS& Err_iP, - GPlexLV& Par_iC, GPlexLS& Err_iC, - int itrack_plex, int N) -{ - getHitFromLayer_fn(layer_of_hits, HitsIdx, msPar, msErr, itrack_plex, N); - kalmanUpdate_fn(Err_iP, msErr, Par_iP, msPar, Par_iC, Err_iC, itrack_plex, N); - updateMissingHits_fn(HitsIdx, Par_iP, Err_iP, Par_iC, Err_iC, itrack_plex, N); -} - - -__global__ -void UpdateWithLastHit_kernel( - LayerOfHitsCU& layer_of_hits, GPlexQI HitsIdx, - GPlexHV msPar, GPlexHS msErr, - GPlexLV Par_iP, GPlexLS Err_iP, - GPlexLV Par_iC, GPlexLS Err_iC, - int N) -{ - int i = threadIdx.x + blockDim.x * blockIdx.x; - UpdateWithLastHit_fn(layer_of_hits, HitsIdx, msPar, msErr, - Par_iP, Err_iP, Par_iC, Err_iC, i, N); -} - - -void UpdateWithLastHit_wrapper( - const cudaStream_t& stream, - LayerOfHitsCU& layer_cu, GPlexQI& HitsIdx, - GPlexHV& msPar, GPlexHS& msErr, - GPlexLV& Par_iP, GPlexLS& Err_iP, - GPlexLV& Par_iC, GPlexLS& Err_iC, - int N) -{ - int gridx = (N-1)/BLOCK_SIZE_X + 1; - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - - UpdateWithLastHit_kernel <<< grid, block, 0, stream >>> - (layer_cu, HitsIdx, msPar, msErr, - Par_iP, Err_iP, Par_iC, Err_iC, N); -} - - -#if 0 -__device__ void findCandidates_fn( - Hit *hits, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, - const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, - const GPlexLV &propPar, GPlexQF &outChi2, - GPlexQF &Chi2, GPlexQI &HitsIdx, - const int maxSize, const int itrack, const int N) { - - // FIXME: We probably don't want to find candidates for - // non-existing tracks. - // However, Valid will need to be updated at some point. - // This is just a note to remember to do it. - //if (!Valid[itrack]) return; - - if (itrack < N) - HitsIdx[itrack] = 0; - - for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt) - { - HitToMs_fn(msErr, msPar, hits, XHitSize, XHitArr, HitsIdx, hit_cnt, itrack, N); - computeChi2_fn(propErr, msErr, msPar, propPar, outChi2, itrack, N); - // get the maxCandsPerSeed best cands for this TRACK - //getNewBestHitChi2_fn(XHitSize, XHitArr, outChi2.ptr, minChi2_reg, bestHit_reg, hit_cnt, N); - } - // TODO - // get the maxCandsPerSeed best cands; - // Eventually add to the list of cands for this SEED - // ... -} - - -__global__ void findCandidates_kernel( - Hit *hits, const GPlexQI XHitSize, const GPlexHitIdx XHitArr, - const GPlexLS propErr, GPlexHS msErr, GPlexHV msPar, - const GPlexLV propPar, GPlexQF outChi2, - GPlexQF Chi2, GPlexQI HitsIdx, - const int maxSize, const int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - findCandidates_fn(hits, XHitSize, XHitArr, - propErr, msErr, msPar, - propPar, outChi2, - Chi2, HitsIdx, - maxSize, itrack, N); -} - - -void findCandidates_wrapper(const cudaStream_t &stream, - LayerOfHitsCU &layer, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, - const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, - const GPlexLV &propPar, GPlexQF &outChi2, - GPlexQF &Chi2, GPlexQI &HitsIdx, - const int maxSize, const int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - max_blocks_x); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - - findCandidates_kernel <<< grid, block, 0, stream >>> - (layer.m_hits, XHitSize, XHitArr, - propErr, msErr, msPar, propPar, outChi2, - /*propErr.ptr, propErr.stride,*/ - /*msErr.ptr, msErr.stride, msErr.kSize,*/ - /*msPar.ptr, msPar.stride, msPar.kSize,*/ - /*outChi2.ptr, outChi2.stride,*/ - Chi2, HitsIdx, - maxSize, N); -} -#endif diff --git a/mkFit/build_tracks_kernels.h b/mkFit/build_tracks_kernels.h deleted file mode 100644 index ae45a1a4..00000000 --- a/mkFit/build_tracks_kernels.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef _BUILD_TRACKS_KERNELS_ -#define _BUILD_TRACKS_KERNELS_ - -#include "GPlex.h" -#include "HitStructuresCU.h" - -namespace mkfit { - -void getHitFromLayer_wrappper( const cudaStream_t& stream, - LayerOfHitsCU& layer_cu, GPlexQI& HitsIdx, - GPlexHV& msPar, GPlexHS& msErr, int N); -void UpdateMissingHits_wrapper( - const cudaStream_t& stream, GPlexQI& HitsIdx, - GPlexLV& Par_iP, GPlexLS& Err_iP, - GPlexLV& Par_iC, GPlexLS& Err_iC, - int N); - -__device__ -void UpdateWithLastHit_fn( - LayerOfHitsCU& layer_of_hits, GPlexQI& HitsIdx, - GPlexHV& msPar, GPlexHS& msErr, - GPlexLV& Par_iP, GPlexLS& Err_iP, - GPlexLV& Par_iC, GPlexLS& Err_iC, - int itrack_plex, int N); - -void UpdateWithLastHit_wrapper( - const cudaStream_t& stream, - LayerOfHitsCU& layer_cu, GPlexQI& HitsIdx, - GPlexHV& msPar, GPlexHS& msErr, - GPlexLV& Par_iP, GPlexLS& Err_iP, - GPlexLV& Par_iC, GPlexLS& Err_iC, - int N); - -} // end namespace mkfit -#endif /* ifndef _BUILD_TRACKS_KERNELS_ */ diff --git a/mkFit/buildtestMPlex.cc b/mkFit/buildtestMPlex.cc index c01c8b93..21cd1818 100644 --- a/mkFit/buildtestMPlex.cc +++ b/mkFit/buildtestMPlex.cc @@ -7,12 +7,6 @@ #include "MkBuilder.h" -#ifdef USE_CUDA -#include "FitterCU.h" -#include "BuilderCU.h" -#include "check_gpu_hit_structures.h" -#endif - #if defined(USE_VTUNE_PAUSE) #include "ittnotify.h" #endif @@ -159,22 +153,9 @@ double runBuildingTestPlexBestHit(Event& ev, MkBuilder& builder) __itt_resume(); #endif -#if USE_CUDA - //check_event_of_hits_gpu(builder.get_event_of_hits()); - //check_event_of_cands_gpu(event_of_cands); - BuilderCU builder_cu; - builder_cu.setUpBH(builder.get_event_of_hits(), builder.get_event(), - event_of_cands); -#endif - double time = dtime(); -#if USE_CUDA - builder_cu.FindTracksBestHit(event_of_cands); - builder_cu.tearDownBH(); -#else builder.FindTracksBestHit(); -#endif time = dtime() - time; @@ -212,42 +193,6 @@ double runBuildingTestPlexBestHit(Event& ev, MkBuilder& builder) return time; } -#if USE_CUDA -double runBuildingTestPlexBestHitGPU(Event& ev, MkBuilder& builder, - BuilderCU& builder_cu) -{ - builder.begin_event(&ev, 0, __func__); - - if (Config::seedInput == findSeeds) {builder.find_seeds();} - else {builder.map_seed_hits();} // all other simulated seeds need to have hit indices line up in LOH for seed fit - - builder.fit_seeds(); - - EventOfCandidates event_of_cands; - builder.find_tracks_load_seeds(event_of_cands); - // Allocate event specific arrays - builder_cu.setUpBH(builder.get_event_of_hits(), builder.get_event(), - event_of_cands); - - double time = dtime(); - - builder_cu.FindTracksBestHit(event_of_cands); - // Deallocate event specific arrays - builder_cu.tearDownBH(); - - time = dtime() - time; - if (Config::quality_val) { - if (!Config::silent) builder.quality_val_BH(event_of_cands); - } else { - builder.sim_val_BH(event_of_cands); - } - - builder.end_event(); - - return time; -} -#endif - //============================================================================== // runBuildTestPlex Combinatorial: Standard TBB @@ -441,119 +386,4 @@ double runBuildingTestPlexFV(Event& ev, MkBuilder& builder) return time; } -#if USE_CUDA -double runBuildingTestPlexCloneEngineGPU(Event& ev, EventTmp& ev_tmp, - MkBuilder& builder, - BuilderCU& builder_cu, - bool seed_based) -{ - EventOfCombCandidates &event_of_comb_cands = ev_tmp.m_event_of_comb_cands; - event_of_comb_cands.Reset(); - - builder.begin_event(&ev, &ev_tmp, __func__); - - if (Config::seedInput == findSeeds) {builder.find_seeds();} - else {builder.map_seed_hits();} // all other simulated seeds need to have hit indices line up in LOH for seed fit - - builder.fit_seeds(); - - builder.find_tracks_load_seeds(); - -#ifdef USE_VTUNE_PAUSE - __itt_resume(); -#endif - - //builder_cu.setUpFitterCE(-1 [> does not matter for now <]); - builder_cu.allocateCE(builder.get_event_of_hits(), builder.get_event(), - event_of_comb_cands); - builder_cu.setUpCE(builder.get_event_of_hits(), builder.get_event(), - event_of_comb_cands); - - double time = dtime(); - - //builder.FindTracksCloneEngine(); - builder_cu.FindTracksCloneEngine(event_of_comb_cands, seed_based); - - time = dtime() - time; - - builder_cu.tearDownCE(); -#ifdef USE_VTUNE_PAUSE - __itt_pause(); -#endif - - if (Config::quality_val) { - if (!Config::silent) builder.quality_val_COMB(); - } else {builder.sim_val_COMB();} - - builder.end_event(); - - return time; -} -#endif - -//============================================================================== -// runAllBuildTestPlexBestHitGPU -//============================================================================== - -#if USE_CUDA -double runAllBuildingTestPlexBestHitGPU(std::vector &events) -{ - - int num_builders = events.size(); - std::vector> builder_ptrs(num_builders); - std::vector event_of_cands_vec(num_builders); - std::vector builder_cu_vec(num_builders); - - for (int i = 0; i < builder_ptrs.size(); ++i) { - Event &ev = events[i]; - builder_ptrs[i] = std::unique_ptr (MkBuilder::make_builder()); - - MkBuilder &builder = * builder_ptrs[i].get(); - - builder.begin_event(&ev, 0, __func__); - - if (Config::seedInput == findSeeds) {builder.find_seeds();} - else {builder.map_seed_hits();} // all other simulated seeds need to have hit indices line up in LOH for seed fit - - builder.fit_seeds(); - - EventOfCandidates &event_of_cands = event_of_cands_vec[i]; - builder.find_tracks_load_seeds(event_of_cands); - - BuilderCU &builder_cu = builder_cu_vec[i]; - builder_cu.setUpBH(builder.get_event_of_hits(), builder.get_event(), - event_of_cands); - } - - //omp_set_num_threads(Config::numThreadsEvents); - //std::cerr << "num threads "<< omp_get_num_threads() << std::endl; -//#pragma omp parallel for reduction(+:total_time) - //for (int i = 0; i < builder_ptrs.size(); ++i) { - double time = dtime(); - tbb::parallel_for(size_t(0), builder_ptrs.size(), [&](size_t i) { - EventOfCandidates &event_of_cands = event_of_cands_vec[i]; - BuilderCU &builder_cu = builder_cu_vec[i]; - MkBuilder &builder = * builder_ptrs[i].get(); - - builder_cu.FindTracksBestHit(event_of_cands); - }); - time = dtime() - time; - - for (int i = 0; i < builder_ptrs.size(); ++i) { - EventOfCandidates &event_of_cands = event_of_cands_vec[i]; - BuilderCU &builder_cu = builder_cu_vec[i]; - builder_cu.tearDownBH(); - MkBuilder &builder = * builder_ptrs[i].get(); - if (!Config::sim_val && !Config::cmssw_val) { - if (!Config::silent) builder.quality_val(); - } else if (Config::sim_val) { - builder.root_val(); - } - - builder.end_event(); - } - - return time; -} -#endif } // end namespace mkfit diff --git a/mkFit/buildtestMPlex.h b/mkFit/buildtestMPlex.h index a0764f12..f655e31b 100644 --- a/mkFit/buildtestMPlex.h +++ b/mkFit/buildtestMPlex.h @@ -3,9 +3,6 @@ #include "Event.h" #include "Track.h" -#ifdef USE_CUDA -#include "BuilderCU.h" -#endif namespace mkfit { @@ -17,13 +14,5 @@ double runBuildingTestPlexStandard(Event& ev, MkBuilder& builder); double runBuildingTestPlexCloneEngine(Event& ev, MkBuilder& builder); double runBuildingTestPlexFV(Event& ev, MkBuilder& builder); -#if USE_CUDA -double runBuildingTestPlexBestHitGPU(Event& ev, MkBuilder& builder, - BuilderCU& builder_cu); -double runBuildingTestPlexCloneEngineGPU(Event& ev, EventTmp& ev_tmp, MkBuilder& builder, - BuilderCU& builder_cu, bool seed_based=false); -double runAllBuildingTestPlexBestHitGPU(std::vector &events); -#endif - } // end namespace mkfit #endif diff --git a/mkFit/check_gpu_hit_structures.cu b/mkFit/check_gpu_hit_structures.cu deleted file mode 100644 index b5a3234a..00000000 --- a/mkFit/check_gpu_hit_structures.cu +++ /dev/null @@ -1,144 +0,0 @@ -#include "check_gpu_hit_structures.h" - -#include "Hit.h" -#include "HitStructures.h" -#include "HitStructuresCU.h" -#include "reorganize_gplex.h" -#include "gpu_utils.h" - -#include - - -__global__ void get_hit_pos_and_err(LayerOfHitsCU *layers, - int ilay, int hit_idx, float *pos, float *err, int pos_size, int err_size) { - if (threadIdx.x + blockDim.x * blockIdx.x == 0) { - LayerOfHitsCU &layer = layers[ilay]; - Hit &hit = layer.m_hits[hit_idx]; - float *posArray = get_posArray(hit); - float *errArray = get_errArray(hit); - for (int i = 0; i < pos_size; ++i) { - pos[i] = posArray[i]; - } - for (int i = 0; i < err_size; ++i) { - err[i] = errArray[i]; - } - } -} - - -void compare_carrays(const float *h_a, const float *d_a, - const float prec, const int n) -{ - for (int i = 0; i < n; ++i) { - // should be relative comparison, verify if div by 0 will happen - if (std::abs(h_a[i] - d_a[i]) > prec) { - std::cerr << i << " : " << h_a[i] << " / " << d_a[i] << std::endl; - } - } -} - - -void check_event_of_hits_gpu(const EventOfHits& event_of_hits) -{ - EventOfHitsCU event_of_hits_cu; - event_of_hits_cu.reserve_layers(event_of_hits); - event_of_hits_cu.copyFromCPU(event_of_hits, 0); - - constexpr int pos_size = 3; - constexpr int err_size = 6; - - float *d_pos, *d_err; - float pos[pos_size], err[err_size]; - - cudaMalloc((void**)&d_pos, pos_size*sizeof(float)); - cudaMalloc((void**)&d_err, err_size*sizeof(float)); - - dim3 grid(1, 1, 1); - dim3 block(1, 1, 1); - - int ilay = 2; - int hit_idx = 3; - - get_hit_pos_and_err <<< grid, block >>> - (event_of_hits_cu.m_layers_of_hits.data(), ilay, hit_idx, d_pos, d_err, pos_size, err_size); - - cudaMemcpy(pos, d_pos, pos_size*sizeof(float), cudaMemcpyDeviceToHost); - cudaMemcpy(err, d_err, err_size*sizeof(float), cudaMemcpyDeviceToHost); - - //std::cerr << "pos ......................\n"; - compare_carrays(event_of_hits.m_layers_of_hits[ilay].m_hits[hit_idx].posArray(), - pos, 1e-3, pos_size); - //std::cerr << "err ......................\n"; - compare_carrays(event_of_hits.m_layers_of_hits[ilay].m_hits[hit_idx].errArray(), - err, 1e-3, err_size); - - cudaFree(d_pos); - cudaFree(d_err); -} - - -__global__ void get_cand_pos_and_err(EtaBinOfCandidatesCU *etabin_of_cands, - const int ebin, const int itrack, float *pos, float *err, - const int pos_size, const int err_size) -{ - if (threadIdx.x + blockDim.x * blockIdx.x == 0) { - Track &track = etabin_of_cands[ebin].m_candidates[itrack]; - float *posArray = get_posArray(track); - float *errArray = get_errArray(track); - - for (int i = 0; i < pos_size; ++i) { - pos[i] = posArray[i]; - } - for (int i = 0; i < err_size; ++i) { - err[i] = errArray[i]; - } - } -} - - -void check_event_of_cands_gpu(const EventOfCandidates& event_of_cands) -{ - EventOfCandidatesCU event_of_cands_cu; - event_of_cands_cu.allocGPU(event_of_cands); - event_of_cands_cu.copyFromCPU(event_of_cands); - - constexpr int pos_size = 6; - constexpr int err_size = 21; - - float *d_pos, *d_err; - float pos[pos_size], err[err_size]; - - cudaMalloc((void**)&d_pos, pos_size*sizeof(float)); - cudaMalloc((void**)&d_err, err_size*sizeof(float)); - - dim3 grid(1, 1, 1); - dim3 block(1, 1, 1); - - int etabin = std::min(2, Config::nEtaBin-1); - int itrack = 3; - - get_cand_pos_and_err <<< grid, block >>> - (event_of_cands_cu.m_etabins_of_candidates, - etabin, itrack, d_pos, d_err, pos_size, err_size); - cudaCheckErrorSync(); - - cudaMemcpy(pos, d_pos, pos_size*sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrorSync(); - cudaMemcpy(err, d_err, err_size*sizeof(float), cudaMemcpyDeviceToHost); - cudaCheckErrorSync(); - - /*std::cerr << "pos ......................\n";*/ - compare_carrays(event_of_cands.m_etabins_of_candidates[etabin].m_candidates[itrack].posArray(), - pos, 1e-3, pos_size); - /*std::cerr << "err ......................\n";*/ - compare_carrays(event_of_cands.m_etabins_of_candidates[etabin].m_candidates[itrack].errArray(), - err, 1e-3, err_size); - - cudaFree(d_pos); - cudaFree(d_err); - - //event_of_cands_cu.copyToCPU(event_of_cands); - event_of_cands_cu.deallocGPU(); - - cudaCheckErrorSync(); -} diff --git a/mkFit/check_gpu_hit_structures.h b/mkFit/check_gpu_hit_structures.h deleted file mode 100644 index fe80c21f..00000000 --- a/mkFit/check_gpu_hit_structures.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef CHECK_GPU_HIT_STRUCTURE_H -#define CHECK_GPU_HIT_STRUCTURE_H - -#include "HitStructures.h" - -namespace mkfit { - -void check_event_of_hits_gpu(const EventOfHits& event_of_hits); -void check_event_of_cands_gpu(const EventOfCandidates& event_of_cands); - -} // end namespace mkfit -#endif /* ifndef CHECK_GPU_HIT_STRUCTURE_H */ diff --git a/mkFit/clone_engine_cuda_helpers.cu b/mkFit/clone_engine_cuda_helpers.cu deleted file mode 100644 index af4fed8c..00000000 --- a/mkFit/clone_engine_cuda_helpers.cu +++ /dev/null @@ -1,46 +0,0 @@ -#include "clone_engine_cuda_helpers.h" -#include "GPlex.h" - -#include "BestCands.h" - - -__device__ int countValidHits_fn(const int itrack, const int end_hit, - const GPlexQI* HitsIdx_arr) -{ - int result = {}; - for (int hi = 0; hi < end_hit; ++hi) - { - if (HitsIdx_arr[hi](itrack, 0, 0) >= 0) result++; - } - return result; -} - - -__device__ int countInvalidHits_fn(const int itrack, const int end_hit, - const GPlexQI* HitsIdx_arr) -{ - int result = {}; - for (int hi = 0; hi < end_hit; ++hi) - { - if (HitsIdx_arr[hi](itrack, 0, 0) == -1) result++; - } - return result; -} - - -// Return (Valid Hits, Invalid Hits) -__device__ int2 countHits_fn(const int itrack, const int end_hit, - const GPlexQI* HitsIdx_arr) -{ - auto result = int2 {}; - for (int hi = 0; hi < end_hit; ++hi) { - auto idx = HitsIdx_arr[hi](itrack, 0, 0); - if (idx >=0) { - ++result.x; - } else if (idx == -1) { - ++result.y; - } - - } - return result; -} diff --git a/mkFit/clone_engine_cuda_helpers.h b/mkFit/clone_engine_cuda_helpers.h deleted file mode 100644 index f8d3dc49..00000000 --- a/mkFit/clone_engine_cuda_helpers.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef CLONE_ENGINE_CUDA_HELPERS_H -#define CLONE_ENGINE_CUDA_HELPERS_H - -#include "GPlex.h" -#include "BestCands.h" -#include "reorganize_gplex.h" -#include "computeChi2_kernels.h" - -namespace mkfit { - -__device__ int countValidHits_fn(const int itrack, const int end_hit, - const GPlexQI *HitsIdx_arr); - -__device__ int countInvalidHits_fn(const int itrack, const int end_hit, - const GPlexQI *HitsIdx_arr); - -__device__ int2 countHits_fn(const int itrack, const int end_hit, - const GPlexQI *HitsIdx_arr); - -template -__device__ void findTrackCands_fn(const int ilay, - Hit *hits, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, - const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, - const GPlexLV &propPar, GPlexQF &outChi2, - GPlexQF &Chi2, GPlexQI *HitsIdx_arr, - const int maxSize, const int itrack, const int icand, const int N, - CandsGPU::BestCands& cand_list) -{ - // Note: cand_list should initially be filled with sentinels - int itrack_block = threadIdx.x; // for SM arrays - float chi2_track = Chi2[itrack]; - int XHitSize_track = XHitSize[itrack]; - - GPlexQI &HitsIdx = HitsIdx_arr[ilay]; - - // Note: 0 < itrack < N = matriplex_width - HitsIdx[itrack] = 0; // reset - - for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt) { - HitToMs_fn(msErr, msPar, hits, XHitSize, XHitArr, - HitsIdx, hit_cnt, itrack, N); - // TODO: add CMSGeom - computeChi2_fn(propErr, msErr, msPar, propPar, outChi2, itrack, N); - - // make sure the hit was in the compatiblity window for the candidate - if (hit_cnt < XHitSize_track) - { - float chi2 = fabs(outChi2[itrack]); //fixme negative chi2 sometimes... - if (chi2 < Config::chi2Cut) - { - int cand_hitIdx = XHitArr(itrack, hit_cnt, 0); - int cand_nhits = countValidHits_fn(itrack, ilay, HitsIdx_arr) + 1; - float cand_chi2 = chi2_track + chi2; - cand_list.update(itrack_block, icand, cand_hitIdx, - cand_nhits, cand_chi2); - } - } - } - // now add invalid hit - // if icand has enough 'good' hits, update will discard invalid hit. - auto num_good_bad_hits = countHits_fn(itrack, ilay, HitsIdx_arr); - int hit_idx = num_good_bad_hits.y < Config::maxHolesPerCand ? -1 : -2; - cand_list.update(itrack_block, icand, hit_idx, - num_good_bad_hits.x, - chi2_track); -} - -} // end namespace mkfit -#endif // CLONE_ENGINE_CUDA_HELPERS_H diff --git a/mkFit/clone_engine_kernels.cu b/mkFit/clone_engine_kernels.cu deleted file mode 100644 index 60d814af..00000000 --- a/mkFit/clone_engine_kernels.cu +++ /dev/null @@ -1,258 +0,0 @@ -#include "clone_engine_kernels.h" -#include "clone_engine_cuda_helpers.h" - -#include "BestCands.h" - -#include "computeChi2_kernels.h" -#include "index_selection_kernels.h" -#include "reorganize_gplex.h" -#include "build_tracks_kernels.h" -#include "array_algorithms_cu.h" -#include "propagation_kernels.h" - - -constexpr int BLOCK_SIZE_X = 256; - - -__device__ void findCandidates_fn(const int ilay, - Hit *hits, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, - const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, - const GPlexLV &propPar, GPlexQF &outChi2, - GPlexQF &Chi2, GPlexQI *HitsIdx_arr, - GPlexQB &Valid, - const int maxSize, const int itrack, const int N, - CandsGPU::BestCands& cand_list) -{ - if (itrack >= N) return; - if (!Valid[itrack]) return; - - const int icand = itrack % Config::maxCandsPerSeed; - - findTrackCands_fn(ilay, hits, XHitSize, XHitArr, - propErr, msErr, msPar, propPar, outChi2, Chi2, HitsIdx_arr, - maxSize, itrack, icand, N, cand_list); - - // At this point, each track has the best candidates organized in a stack - //__syncthreads(); no need for that, sync hapeen in merge - // sorting garbbage is fine (i.e. threads with no valid tracks to start with) - // because of the default sentinel values. - int itrack_block = threadIdx.x; - int iseed_block = itrack_block / Config::maxCandsPerSeed; - int icand_block = itrack_block % Config::maxCandsPerSeed; - cand_list.merge_cands_for_seed(iseed_block, icand_block); - // At this point the best overall candidates for seed iseed are - // stored in the first list (icand == 0) -} - - -__device__ void updateCandsWithInfo(Track *candidates, int *ntracks_per_seed, - int my_trkIdx, int my_hitIdx, int my_nhits, - float my_chi2, - int beg, int itrack) -{ - int itrack_ev = beg + itrack; - int iseed_ev = itrack_ev / Config::maxCandsPerSeed; - int icand_ev = itrack_ev % Config::maxCandsPerSeed; - - int ncands_seed = ntracks_per_seed[iseed_ev]; - - Track tmp_track; // BAD: too big to fit in registers - - if (0 <= my_trkIdx && my_trkIdx < ncands_seed) { - // Get candidate of previous step that match this trkIdx - Track& base_track = candidates[iseed_ev*Config::maxCandsPerSeed + my_trkIdx]; - - tmp_track = base_track; - tmp_track.addHitIdx(my_hitIdx, 0.f /*chi2*/); - tmp_track.setChi2(my_chi2); - } - - // Before writing the new track to the EtaBinOfCombCandidatesCU - // makes sure everybody has a copy of it. - // CUDA 8; synchronize only cands for seed; - __syncthreads(); - - if (0 <= my_trkIdx && my_trkIdx < ncands_seed) { - Track& my_track = candidates[iseed_ev*Config::maxCandsPerSeed + icand_ev]; - my_track = tmp_track; - } -} - - -__global__ void findInLayers_kernel(LayerOfHitsCU *layers, - EtaBinOfCombCandidatesCU* etabins_of_comb_candidates, - GPlexQI XHitSize, GPlexHitIdx XHitArr, - GPlexLS Err_iP, GPlexLV Par_iP, - GPlexHS *msErr_arr, GPlexHV *msPar_arr, - GPlexLS Err_iC, GPlexLV Par_iC, - GPlexQF outChi2, - GPlexQF Chi2, GPlexQI *HitsIdx_arr, - GPlexQI inChg, GPlexQI Label, - GPlexQI SeedIdx, GPlexQI CandIdx, - GPlexQB Valid, GeometryCU geom, - int *maxSize, int gplex_size, - const int nlayers_per_seed) -{ - constexpr int max_cands_per_block = (BLOCK_SIZE_X / Config::maxCandsPerSeed) * Config::maxCandsPerSeed; - - for (int ebin = 0; ebin < Config::nEtaBin; ebin++) { - EtaBinOfCombCandidatesCU& etabin_of_cands = etabins_of_comb_candidates[ebin]; - int nseed = etabin_of_cands.m_nseed; - if (nseed == 0) continue; - - int max_nseed_plex = gplex_size/Config::maxCandsPerSeed; - - for (int beg_seed = 0; beg_seed < nseed; beg_seed += max_nseed_plex) { - int beg = beg_seed * Config::maxCandsPerSeed; - int end_seed = min(beg_seed + max_nseed_plex, nseed); - int end = end_seed * Config::maxCandsPerSeed; - - int itrack_plex = threadIdx.x + blockIdx.x * max_cands_per_block; // starts at 0. Fits a gplex - int itrack /* itrack_ev */ = beg + itrack_plex; // absolute track idx. If 'num_tracks' > gplex_size - int iseed_ev = itrack / Config::maxCandsPerSeed; - int icand_ev = itrack % Config::maxCandsPerSeed; - int N = end - beg; - - int itrack_block = threadIdx.x; - int iseed_block = itrack_block / Config::maxCandsPerSeed; - int icand_block = itrack_block % Config::maxCandsPerSeed; - - int maxSize_block; - - __shared__ CandsGPU::BestCands cand_list; - - // NOTE: maxCandsPerSeed should divide BLOCK_SIZE_X because - // padding is handled differently in GPlex and CandList (in SM) - // One easy way to deal with that is to allocate GPlexes a bit wider and - // let the last threads of blocks being idle; and referencing arrays with: - // tid = tidx + bidx * BLOCK_SIZE_X; - // we still need itrack_plex for the guard. - // We might has well not do it, and take advantages of these thread to - // enlarge the number of candidates for each seed. - if (itrack_block < max_cands_per_block && itrack_plex < N) { - for (int ilay = nlayers_per_seed; ilay <= Config::nLayers; ++ilay) { - Track *tracks = etabin_of_cands.m_candidates.data(); - int *tracks_per_seed = etabin_of_cands.m_ntracks_per_seed.data(); - - int Nhits = ilay; - InputTracksAndHitIdxComb_fn(tracks, tracks_per_seed, - Err_iP, Par_iP, - inChg, Chi2, Label, HitsIdx_arr, - SeedIdx, CandIdx, Valid, Nhits , - beg, end, itrack_plex, N); - - // no cands at this position for this layer - if (Valid[itrack_plex]) { - if (ilay > nlayers_per_seed) { - UpdateWithLastHit_fn(layers[ilay-1], HitsIdx_arr[ilay-1], - msPar_arr[ilay-1], msErr_arr[ilay-1], - Par_iP, Err_iP, Par_iC, Err_iC, itrack_plex, N); - if (ilay < Config::nLayers) { - propagationForBuilding_fn(Err_iC, Par_iC, inChg, geom.radii[ilay], - Err_iP, Par_iP, itrack_plex, N); - OutputParErrCU_fn(tracks, Err_iP, Par_iP, - beg, end, itrack_plex, N); - } else { - OutputParErrCU_fn(tracks, Err_iC, Par_iC, - beg, end, itrack_plex, N); - break; - } - } - } else { - if (ilay >= Config::nLayers) { // for those invalid hits - break; - } - } - XHitSize[itrack_plex] = 0; // reset here has well, because of invalid tracks - if (Valid[itrack_plex]) { - selectHitIndices_fn(layers[ilay], Err_iP, Par_iP, XHitSize, XHitArr, itrack_plex, N); - } - reduceMaxPartial_fn - (XHitSize.ptr, - max_cands_per_block * blockIdx.x, - max_cands_per_block, - &maxSize_block); - - cand_list.reset(itrack_block); - - findCandidates_fn(ilay, - layers[ilay].m_hits.data(), XHitSize, XHitArr, - Err_iP, msErr_arr[ilay], msPar_arr[ilay], Par_iP, - outChi2, Chi2, HitsIdx_arr, Valid, - maxSize_block, itrack_plex, N, cand_list); - // cand_list: each seed has the best cands - // It's similar to CandCloner::ProcessSeddRange (called from end_smthg) - // except than everything is already sorted. - // Here we cannot avoid bank conflicts, because we scatter (as in MPI_Scatter) - // a column (representing the list of overall best candidates) to different= - // thread. This is a consequence of the BestCand structure that - // was designed to minimized bank conflicts when performing the more - // frequent stack insertion operation. - - // Don't check if Valid as a previously inValid track might pick a correct value - int my_seed_track_idx = iseed_block * Config::maxCandsPerSeed; - int my_trkIdx, my_hitIdx, my_nhits; - float my_chi2; - - cand_list.get_cand_info(my_seed_track_idx, icand_block, - my_trkIdx, my_hitIdx, my_nhits, my_chi2); - // Needs to count how many non sentinel there is: - if (!icand_ev) { - tracks_per_seed[iseed_ev] = cand_list.count_valid_cands(itrack_block); - } - __syncthreads(); - - // Now, every thread has the info of its own new best cand. - // -> Feed the etabin with that - updateCandsWithInfo(tracks, tracks_per_seed, - my_trkIdx, my_hitIdx, my_nhits, my_chi2, - beg, itrack_plex); - } // ilay - // final sorting, - } - } - } -} - -__global__ void print_stuffs(Track* tracks, int*ntracks_per_seed) { - if (threadIdx.x + blockIdx.x * blockDim.x == 0) { - int idx = 0 + 1*Config::maxCandsPerSeed ; - int val = tracks[idx].getHitIdx(0); - printf("tracks[%d] = %d\n", idx, val); - val = tracks[idx].getHitIdx(1); - printf("tracks[%d] = %d\n", idx, val); - val = tracks[idx].getHitIdx(2); - printf("tracks[%d] = %d\n", idx, val); - val = ntracks_per_seed[idx]; - printf("tracks[%d] = %d\n", idx, val); - } -} - -void findInLayers_wrapper(cudaStream_t &stream, - LayerOfHitsCU *layers, - EventOfCombCandidatesCU &event_of_cands_cu, - GPlexQI &XHitSize, GPlexHitIdx &XHitArr, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexHS *msErr, GPlexHV *msPar, - GPlexLS &Err_iC, GPlexLV &Par_iC, - GPlexQF &outChi2, - GPlexQF &Chi2, GPlexQI *HitsIdx_arr, - GPlexQI &inChg, GPlexQI &Label, - GPlexQI &SeedIdx, GPlexQI &CandIdx, - GPlexQB& Valid, - GeometryCU &geom, - int *maxSize, int N) -{ - int gridx = (N-1)/BLOCK_SIZE_X + 1; - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - - findInLayers_kernel <<>> - (layers, - event_of_cands_cu.m_etabins_of_comb_candidates.data(), - XHitSize, XHitArr, Err_iP, Par_iP, msErr, msPar, - Err_iC, Par_iC, outChi2, Chi2, HitsIdx_arr, inChg, Label, - SeedIdx, CandIdx, Valid, geom, maxSize, N, Config::nlayers_per_seed); - - /*cudaCheckErrorSync();*/ -} diff --git a/mkFit/clone_engine_kernels.h b/mkFit/clone_engine_kernels.h deleted file mode 100644 index 22f0eead..00000000 --- a/mkFit/clone_engine_kernels.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef CLONE_ENGINE_KERNELS_H -#define CLONE_ENGINE_KERNELS_H - -#include "GPlex.h" -#include "HitStructuresCU.h" -#include "GeometryCU.h" - -namespace mkfit { - -void findInLayers_wrapper(cudaStream_t &stream, - LayerOfHitsCU *layers, - EventOfCombCandidatesCU &event_of_cands_cu, - GPlexQI &XHitSize, GPlexHitIdx &XHitArr, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexHS *msErr, GPlexHV *msPar, - GPlexLS &Err_iC, GPlexLV &Par_iC, - GPlexQF &outChi2, - GPlexQF &Chi2, GPlexQI *HitsIdx, - GPlexQI &inChg, GPlexQI &Label, - GPlexQI &SeedIdx, GPlexQI &CandIdx, - GPlexQB& Valid, - GeometryCU &geom, - int *maxSize, int N); - -} // end namespace mkfit -#endif /* ifndef CLONE_ENGINE_KERNELS_H */ diff --git a/mkFit/clone_engine_kernels_seed.cu b/mkFit/clone_engine_kernels_seed.cu deleted file mode 100644 index 5968c9df..00000000 --- a/mkFit/clone_engine_kernels_seed.cu +++ /dev/null @@ -1,182 +0,0 @@ -#include "clone_engine_kernels_seed.h" -#include "clone_engine_cuda_helpers.h" - -#include "BestCands.h" - -#include "computeChi2_kernels.h" -#include "index_selection_kernels.h" -#include "reorganize_gplex.h" -#include "build_tracks_kernels.h" -#include "array_algorithms_cu.h" -#include "propagation_kernels.h" - -#include - -constexpr int BLOCK_SIZE_X = 256; - - -__device__ void findCandidates_fn_seed(const int ilay, - Hit *hits, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, - const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, - const GPlexLV &propPar, GPlexQF &outChi2, - GPlexQF &Chi2, GPlexQI *HitsIdx_arr, - GPlexQB &Valid, - const int maxSize, const int itrack, const int icand, const int N, - CandsGPU::BestCands& cand_list) -{ - if (itrack >= N) return; - - findTrackCands_fn(ilay, hits, XHitSize, XHitArr, - propErr, msErr, msPar, propPar, outChi2, Chi2, HitsIdx_arr, - maxSize, itrack, icand, N, cand_list); -} - - -__device__ void updateCandsWithInfo_seed(Track *candidates, int *ntracks_per_seed, - int iseed_ev, CandsGPU::BestCands& cand_list) -{ - int iseed_block = threadIdx.x; - Track new_cands[Config::maxCandsPerSeed]; // too big too be __shared__ - - for (int icand = 0; icand < ntracks_per_seed[iseed_ev]; ++icand) { - Track &tmp_track = new_cands[icand]; - - int my_trkIdx, my_hitIdx, my_nhits; - float my_chi2; - - cand_list.get_cand_info(iseed_block, icand, my_trkIdx, - my_hitIdx, my_nhits, my_chi2); - - if (0 <= my_trkIdx && my_trkIdx < ntracks_per_seed[iseed_ev]) { - Track& base_track = candidates[iseed_ev*Config::maxCandsPerSeed + my_trkIdx]; - - tmp_track = base_track; - tmp_track.addHitIdx(my_hitIdx, 0.f /*chi2*/); - tmp_track.setChi2(my_chi2); - } - } // icand - - // Second loop required: several new candidates can come from the same old one. - for (int icand = 0; icand < ntracks_per_seed[iseed_ev]; ++icand) { - candidates[iseed_ev*Config::maxCandsPerSeed + icand] = new_cands[icand]; - } -} - - -__global__ void findInLayers_kernel_seed (LayerOfHitsCU *layers, - EtaBinOfCombCandidatesCU* etabins_of_comb_candidates, - GPlexQI XHitSize, GPlexHitIdx XHitArr, - GPlexLS Err_iP, GPlexLV Par_iP, - GPlexHS *msErr_arr, GPlexHV *msPar_arr, - GPlexLS Err_iC, GPlexLV Par_iC, - GPlexQF outChi2, - GPlexQF Chi2, GPlexQI *HitsIdx_arr, - GPlexQI inChg, GPlexQI Label, - GPlexQI SeedIdx, GPlexQI CandIdx, - GPlexQB Valid, GeometryCU geom, - int *maxSize, int gplex_size, - const int nlayers_per_seed) -{ - int iseed_plex = threadIdx.x + blockIdx.x * blockDim.x; - - for (int ebin = 0; ebin < Config::nEtaBin; ebin++) { - EtaBinOfCombCandidatesCU& etabin_of_cands = etabins_of_comb_candidates[ebin]; - int nseed = etabin_of_cands.m_nseed; - if (nseed == 0) continue; - - //int max_nseed_plex = gplex_size; // no more /Config::maxCandsPerSeed; -> one thread - int N = gplex_size; - int maxSize_block; - - // we need both iseed_plex and iseed_ev: - // * iseed_plex to access the matriplex structures, used for the algorithm - // * iseed_ev to access the other structures (tracks, mostly), used to - // interact with the outside world. - for (int iseed_ev = iseed_plex; - iseed_ev < nseed; - iseed_ev += gridDim.x * blockDim.x) { - int iseed_block = threadIdx.x; - - __shared__ CandsGPU::BestCands cand_list; - - Track *tracks = etabin_of_cands.m_candidates.data(); - int *tracks_per_seed = etabin_of_cands.m_ntracks_per_seed.data(); - - for (int ilay = nlayers_per_seed; ilay <= Config::nLayers; ++ilay) { - cand_list.reset(iseed_block); - int Nhits = ilay; - - for (int icand_seed = 0; icand_seed < tracks_per_seed[iseed_ev]; ++icand_seed) { - InputTracksAndHitIdxComb_fn_seed(tracks, tracks_per_seed, - Err_iP, Par_iP, - inChg, Chi2, Label, HitsIdx_arr, - SeedIdx, CandIdx, Valid, Nhits , - iseed_ev, icand_seed, iseed_plex, N); - - // no cands at this position for this layer - if (ilay > nlayers_per_seed) { - UpdateWithLastHit_fn(layers[ilay-1], HitsIdx_arr[ilay-1], - msPar_arr[ilay-1], msErr_arr[ilay-1], - Par_iP, Err_iP, Par_iC, Err_iC, iseed_plex, N); - if (ilay < Config::nLayers) { - propagationForBuilding_fn(Err_iC, Par_iC, inChg, geom.radii[ilay], - Err_iP, Par_iP, iseed_plex, N); - OutputParErrCU_fn_seed(tracks, Err_iP, Par_iP, - iseed_ev, icand_seed, N); - } else { - OutputParErrCU_fn_seed(tracks, Err_iC, Par_iC, - iseed_ev, icand_seed, N); - return; - } - } - XHitSize[iseed_plex] = 0; - selectHitIndices_fn(layers[ilay], Err_iP, Par_iP, - XHitSize, XHitArr, iseed_plex, N); - reduceMaxPartial_fn - (XHitSize.ptr, blockDim.x * blockIdx.x, blockDim.x, &maxSize_block); - - findCandidates_fn_seed(ilay, - layers[ilay].m_hits.data(), XHitSize, XHitArr, - Err_iP, msErr_arr[ilay], msPar_arr[ilay], Par_iP, - outChi2, Chi2, HitsIdx_arr, Valid, - maxSize_block, iseed_plex, icand_seed, N, cand_list); - } // icand_seed - - cand_list.heap_sort(iseed_block, Config::maxCandsPerSeed); - tracks_per_seed[iseed_ev] = cand_list.count_valid_cands(iseed_block); - - updateCandsWithInfo_seed(tracks, tracks_per_seed, iseed_ev, cand_list); - } // ilay - } // iseed - } // eta -} - - -void findInLayers_wrapper_seed(cudaStream_t &stream, - LayerOfHitsCU *layers, - EventOfCombCandidatesCU &event_of_cands_cu, - GPlexQI &XHitSize, GPlexHitIdx &XHitArr, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexHS *msErr, GPlexHV *msPar, - GPlexLS &Err_iC, GPlexLV &Par_iC, - GPlexQF &outChi2, - GPlexQF &Chi2, GPlexQI *HitsIdx_arr, - GPlexQI &inChg, GPlexQI &Label, - GPlexQI &SeedIdx, GPlexQI &CandIdx, - GPlexQB& Valid, - GeometryCU &geom, - int *maxSize, int N) -{ - int gridx = (N-1)/BLOCK_SIZE_X + 1; - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - - findInLayers_kernel_seed <<>> - (layers, - event_of_cands_cu.m_etabins_of_comb_candidates.data(), - XHitSize, XHitArr, Err_iP, Par_iP, msErr, msPar, - Err_iC, Par_iC, outChi2, Chi2, HitsIdx_arr, inChg, Label, - SeedIdx, CandIdx, Valid, geom, maxSize, N, Config::nlayers_per_seed); - - /*cudaCheckErrorSync();*/ -} diff --git a/mkFit/clone_engine_kernels_seed.h b/mkFit/clone_engine_kernels_seed.h deleted file mode 100644 index 27c1d377..00000000 --- a/mkFit/clone_engine_kernels_seed.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef CLONE_ENGINE_KERNELS_SEED_H -#define CLONE_ENGINE_KERNELS_SEED_H - -#include "GPlex.h" -#include "HitStructuresCU.h" -#include "GeometryCU.h" - -namespace mkfit { - -void findInLayers_wrapper_seed(cudaStream_t &stream, - LayerOfHitsCU *layers, - EventOfCombCandidatesCU &event_of_cands_cu, - GPlexQI &XHitSize, GPlexHitIdx &XHitArr, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexHS *msErr, GPlexHV *msPar, - GPlexLS &Err_iC, GPlexLV &Par_iC, - GPlexQF &outChi2, - GPlexQF &Chi2, GPlexQI *HitsIdx, - GPlexQI &inChg, GPlexQI &Label, - GPlexQI &SeedIdx, GPlexQI &CandIdx, - GPlexQB& Valid, - GeometryCU &geom, - int *maxSize, int N); - -} // end namespace mkfit -#endif /* ifndef CLONE_ENGINE_KERNELS_SEED_H */ diff --git a/mkFit/computeChi2_kernels.cu b/mkFit/computeChi2_kernels.cu deleted file mode 100644 index 46aaee59..00000000 --- a/mkFit/computeChi2_kernels.cu +++ /dev/null @@ -1,144 +0,0 @@ -#include "computeChi2_kernels.h" - -#include "GPlex.h" -#include "kalmanUpdater_kernels.h" -#include "gpu_utils.h" - -#include - -#define L 6 -#define HS 6 -#define HV 3 -#define BLOCK_SIZE_X 256 - -#include "KalmanUtilsMPlex.icc" - -__device__ void chi2Similarity_fn( - const GPlexReg2V &a, - const GPlexReg2S &c, // in registers - float *d, const size_t dN, - const int n) { - - //int n = threadIdx.x + blockIdx.x * blockDim.x; - - // manually subrtact into local vars -- 3 of them - /*float x0 = a[0 * aN + n] - b[0 * aN + n];*/ - /*float x1 = a[1 * aN + n] - b[1 * aN + n];*/ - /*float x2 = a[2 * aN + n] - b[2 * aN + n];*/ - /*d[0 * dN + n] = c[0]*x0*x0 + c[2]*x1*x1 + c[5]*x2*x2 +*/ - /*2*( c[1]*x1*x0 + c[3]*x2*x0 + c[4]*x1*x2);*/ - d[0 * dN + n] = c[0]*a[0]*a[0] - + c[2]*a[1]*a[1] - + 2*( c[1]*a[1]*a[0]); -} - - -__device__ void RotateResidulsOnTangentPlane_fn(const GPlexRegQF& r00,//r00 - const GPlexRegQF& r01,//r01 - const GPlexRegHV &a ,//res_glo - GPlexReg2V &b )//res_loc -{ - // res_loc = rotT * res_glo - // B = N R * A - RotateResidulsOnTangentPlane_impl(r00, r01, a, b, 0, 1); -} - - -__device__ void ProjectResErr_fn(const GPlexRegQF& a00, - const GPlexRegQF& a01, - const GPlexRegHS &b, - GPlexRegHH &c) -{ - // C = A * B, C is 3x3, A is 3x3 , B is 3x3 sym - - // Based on script generation and adapted to custom sizes. - c[ 0] = a00[0]*b[ 0] + a01[0]*b[ 1]; - c[ 1] = a00[0]*b[ 1] + a01[0]*b[ 2]; - c[ 2] = a00[0]*b[ 3] + a01[0]*b[ 4]; - c[ 3] = b[ 3]; - c[ 4] = b[ 4]; - c[ 5] = b[ 5]; - c[ 6] = a01[0]*b[ 0] - a00[0]*b[ 1]; - c[ 7] = a01[0]*b[ 1] - a00[0]*b[ 2]; - c[ 8] = a01[0]*b[ 3] - a00[0]*b[ 4]; -} - - -__device__ void ProjectResErrTransp_fn(const GPlexRegQF& a00, - const GPlexRegQF& a01, const GPlexRegHH& b, GPlexReg2S& c) -{ - // C = A * B, C is 3x3 sym, A is 3x3 , B is 3x3 - - // Based on script generation and adapted to custom sizes. - c[ 0] = b[ 0]*a00[0] + b[ 1]*a01[0]; - c[ 1] = b[ 3]*a00[0] + b[ 4]*a01[0]; - c[ 2] = b[ 5]; -} - - -__device__ void computeChi2_fn( - const GPlexLS &propErr, const GPlexHS &msErr, const GPlexHV &msPar, - const GPlexLV &propPar, GPlexQF &outChi2, const int n, const int N) { - //int n = threadIdx.x + blockIdx.x * blockDim.x; - /*float resErr_reg[HS]; // ~ resErr_glo*/ - GPlexRegHS resErr_reg; - - if (n < N) { - // coordinate change - GPlexRegQF rotT00; - GPlexRegQF rotT01; - const float r = hipo(msPar(n, 0, 0), msPar(n, 1, 0)); - rotT00[0] = -(msPar(n, 1, 0) + propPar(n, 1, 0))/(2*r); - rotT01[0] = (msPar(n, 0, 0) + propPar(n, 0, 0))/(2*r); - - GPlexRegHV res_glo; - subtractFirst3_fn(msPar, propPar, res_glo, N, n); - - for (int j = 0; j < HS; ++j) { - resErr_reg[j] = 0; //resErr[j*resErr_stride + n]; - } - addIntoUpperLeft3x3_fn(propErr, msErr, resErr_reg, N, n); - - GPlexReg2V res_loc; //position residual in local coordinates - RotateResidulsOnTangentPlane_fn(rotT00,rotT01,res_glo,res_loc); - /*MPlex2S resErr_loc;//covariance sum in local position coordinates*/ - /*MPlexHH tempHH;*/ - GPlexReg2S resErr_loc; // 2x2 sym - GPlexRegHH tempHH; // 3*3 sym - ProjectResErr_fn (rotT00, rotT01, resErr_reg, tempHH); - ProjectResErrTransp_fn(rotT00, rotT01, tempHH, resErr_loc); - - /*invertCramerSym_fn(resErr_reg);*/ - invertCramerSym2x2_fn(resErr_loc); - - chi2Similarity_fn(res_loc, resErr_loc, outChi2.ptr, outChi2.stride, n); - } -} - - -__global__ void computeChi2_kernel( - const GPlexLS propErr, const GPlexHS msErr, const GPlexHV msPar, - const GPlexLV propPar, GPlexQF outChi2, const int N) { - int grid_width = blockDim.x * gridDim.x; - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - for (int z = 0; z < (N-1)/grid_width +1; z++) { - itrack += z*grid_width; - - if (itrack < N) { - computeChi2_fn (propErr, msErr, msPar, propPar, outChi2, itrack, N); - } - } -} - - -void computeChi2_wrapper(cudaStream_t &stream, - const GPlexLS &propErr, const GPlexHS &msErr, - const GPlexHV &msPar, const GPlexLV &propPar, GPlexQF &outChi2, - const int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - max_blocks_x); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - computeChi2_kernel <<< grid, block, 0, stream >>> - (propErr, msErr, msPar, propPar, outChi2, N); - } diff --git a/mkFit/computeChi2_kernels.h b/mkFit/computeChi2_kernels.h deleted file mode 100644 index 145a6304..00000000 --- a/mkFit/computeChi2_kernels.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef _COMPUTE_CHI2_KERNELS_H_ -#define _COMPUTE_CHI2_KERNELS_H_ - -#include "HitStructuresCU.h" -#include "GPlex.h" -#include "GeometryCU.h" - -namespace mkfit { - -__device__ void computeChi2_fn(const GPlexLS &propErr, const GPlexHS &msErr, - const GPlexHV &msPar, const GPlexLV &propPar, GPlexQF &outChi2, - const int itrack, const int N); - - -void computeChi2_wrapper(cudaStream_t &stream, - const GPlexLS &propErr, const GPlexHS &msErr, const GPlexHV &msPar, - const GPlexLV &propPar, GPlexQF &outChi2, const int N); - -#if 1 -void HitToMs_wrapper(cudaStream_t& stream, - GPlexHS &msErr, GPlexHV &msPar, LayerOfHitsCU &layer, GPlexQI &XHitSize, - GPlexHitIdx &XHitArr, GPlexQI &HitsIdx, int hit_cnt, int N); -#endif - -__device__ void RotateResidulsOnTangentPlane_fn(const GPlexRegQF& r00, - const GPlexRegQF& r01, const GPlexRegHV &a, GPlexReg2V &b); - -__device__ void ProjectResErr_fn(const GPlexRegQF& a00, const GPlexRegQF& a01, - const GPlexRegHS &b, GPlexRegHH &c); - -__device__ void ProjectResErrTransp_fn(const GPlexRegQF& a00, const GPlexRegQF& a01, - const GPlexRegHH &b, GPlexReg2S &c); - -} // end namespace mkfit -#endif diff --git a/mkFit/device_array_view.h b/mkFit/device_array_view.h deleted file mode 100644 index 0ebccfde..00000000 --- a/mkFit/device_array_view.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef DEVICE_ARRAY_VIEW_H_ -#define DEVICE_ARRAY_VIEW_H_ - -#include "Config.h" -#include - -namespace mkfit { - -template -class DeviceArrayView { - public: - using value_type = T; - using size_type = std::size_t; - using reference = value_type&; - using const_reference = const value_type&; - using pointer = value_type*; - using const_pointer = const pointer; - - - DeviceArrayView() : size_(0), data_(nullptr) {} - DeviceArrayView(pointer ptr, size_type count) : size_(count), data_(ptr) {} - - CUDA_CALLABLE const_reference operator[](int i) const {return data_[i];} - CUDA_CALLABLE reference operator[](int i) {return data_[i];} - - void set_view(pointer ptr, size_type count) { - data_ = ptr; - size_ = count; - } - void set_ptr(pointer ptr) { - // when we don't care about size_ : needs to be fixed? - data_ = ptr; - // size_ = count; - } - - CUDA_CALLABLE const_pointer data() const {return data_;} - CUDA_CALLABLE pointer data() {return data_;} - CUDA_CALLABLE size_t size() {return size_;} - private: - size_type size_; - pointer data_; -}; - - -} // end namespace mkfit -#endif /* DEVICE_ARRAY_VIEW_H_ */ diff --git a/mkFit/device_bundle.h b/mkFit/device_bundle.h deleted file mode 100644 index 9eb4efc4..00000000 --- a/mkFit/device_bundle.h +++ /dev/null @@ -1,88 +0,0 @@ -#ifndef BUNDLE_VECTOR_H -#define BUNDLE_VECTOR_H - -#include "device_vector.h" - -namespace mkfit { - -template -class DeviceBundle { -public: - using value_type = T; - using size_type = size_t; - using pointer = value_type*; - using const_pointer = const pointer; - using reference = value_type&; - using const_reference = const value_type&; - - DeviceBundle(size_type num_part=0, size_type local_count=0); - - const_reference operator[](int i) const { return data_[i]; } - reference operator[](int i) { return data_[i]; } - - void reserve(size_type num_parts, size_type local_count); - pointer data(); - const_pointer data() const; - pointer get_ptr_to_part(size_type part_idx); - - size_type global_capacity() { return global_capacity_; } - size_type local_capacity() { return local_capacity_; } - - void copy_from_cpu(const pointer from, const cudaStream_t &stream); -private: - DeviceVector data_; - - size_type num_parts_; - size_type local_capacity_; - size_type global_capacity_; -}; - - -/////////////////////////////////////////////////////////////////////////////// -/// Implementation -/////////////////////////////////////////////////////////////////////////////// - -template -inline DeviceBundle::DeviceBundle(size_type num_part, - size_type local_count) { - reserve(num_part, local_count); -} - - -template -inline void DeviceBundle::reserve(size_type num_parts, size_type local_count) { - num_parts_ = num_parts; - local_capacity_ = local_count; - global_capacity_ = num_parts_ * local_capacity_; - data_.reserve(global_capacity_); -} - - -template -inline typename DeviceBundle::pointer DeviceBundle::data() { - return data_.data(); -} - - -template -inline typename DeviceBundle::const_pointer DeviceBundle::data() const { - return data_.data(); -} - - -template -inline typename DeviceBundle::pointer DeviceBundle::get_ptr_to_part(size_type part_idx) { - return &data_[part_idx * local_capacity_]; -} - - -template -inline void DeviceBundle::copy_from_cpu(const pointer from, - const cudaStream_t &stream) -{ - cudaMemcpyAsync(data(), from, global_capacity()*sizeof(value_type), - cudaMemcpyHostToDevice, stream); -} - -} // end namespace mkfit -#endif // BUNDLE_VECTOR_H diff --git a/mkFit/device_vector.h b/mkFit/device_vector.h deleted file mode 100644 index 7a0c2afe..00000000 --- a/mkFit/device_vector.h +++ /dev/null @@ -1,118 +0,0 @@ -#ifndef DEVICE_VECTOR_H_ -#define DEVICE_VECTOR_H_ - -#include "gpu_utils.h" -#include "Config.h" -#include "cuda_runtime.h" - -#include - -namespace mkfit { - -template -class DeviceVector { - public: - using value_type = T; - using size_type = size_t; - using pointer = value_type*; - using const_pointer = const pointer; - using reference = value_type&; - using const_reference = const value_type&; - - DeviceVector() {} - DeviceVector(size_type count) { alloc(count); } - ~DeviceVector() { free(); } - - CUDA_CALLABLE const_reference operator[](int i) const { return data_[i]; } - CUDA_CALLABLE reference operator[](int i) { return data_[i]; } - - CUDA_CALLABLE pointer data() {return data_;} - CUDA_CALLABLE const_pointer data() const {return data_;} - - - CUDA_CALLABLE size_type size() {return size_;} - CUDA_CALLABLE size_type capacity() {return capacity_;} - - void reserve(size_type count); - void resize(size_type Newsize); - - void reserve_and_resize(size_type count); - void copy_from_cpu(const_pointer from, const cudaStream_t &stream); - void copy_to_cpu(pointer to, const cudaStream_t &stream) const; - private: - void alloc(size_type count); - void free(); - - T* data_ = nullptr; - size_t size_ = 0; - size_t capacity_ = 0; -}; - -/////////////////////////////////////////////////////////////////////////////// -/// Implementation -/////////////////////////////////////////////////////////////////////////////// - -template -inline void DeviceVector::reserve(size_type count) { - if (count <= capacity_) return; - if (data_ != nullptr) free(); - alloc(count); - capacity_ = count; -} - - -template -inline void DeviceVector::resize(size_type new_size) { - assert(new_size <= capacity_); - size_ = new_size; -} - - -template -inline void DeviceVector::reserve_and_resize(size_type count) { - reserve(count); - resize(count); -} - - -template -inline void DeviceVector::copy_from_cpu(const pointer from, - const cudaStream_t& stream) -{ - cudaMemcpyAsync(data_, from, size_*sizeof(value_type), - cudaMemcpyHostToDevice, stream); -} - - -template -inline void DeviceVector::copy_to_cpu(pointer to, - const cudaStream_t& stream) const -{ - cudaMemcpyAsync(to, data_, size_*sizeof(value_type), - cudaMemcpyDeviceToHost, stream); -} - - -template -inline void DeviceVector::alloc(size_type count) { - if (count > 0) { - cudaMalloc((void**)&data_, count*sizeof(T)); - } - cudaCheckError(); - capacity_ = count; -} - - -template -inline void DeviceVector::free() { - if (data_ != nullptr) { - cudaFree(data_); - cudaCheckError(); - } - capacity_ = 0; - size_ = 0; - data_ = nullptr; -} - -} // end namespace mkfit -#endif /* DEVICE_VECTOR_H_ */ diff --git a/mkFit/fittestMPlex.cc b/mkFit/fittestMPlex.cc index 7c81e226..c443cb17 100644 --- a/mkFit/fittestMPlex.cc +++ b/mkFit/fittestMPlex.cc @@ -9,11 +9,6 @@ #include #include "MkFitter.h" -#if USE_CUDA -#include "fittestMPlex.h" -#include "FitterCU.h" -//#include -#endif #ifndef NO_ROOT #include "TFile.h" @@ -120,105 +115,4 @@ double runFittingTestPlex(Event& ev, std::vector& rectracks) return time; } -#ifdef USE_CUDA -void runAllEventsFittingTestPlexGPU(std::vector& events) -{ - double s_tmp = 0.0; -#if 0 - In principle, the warmup loop should not be required. - The separate_first_call_for_meaningful_profiling_numbers() function - should be enough. - // Warmup loop - for (int i = 0; i < 1; ++i) { - FitterCU cuFitter(NN); - cuFitter.allocateDevice(); - Event &ev = events[0]; - std::vector plex_tracks_ev; - plex_tracks_ev.resize(ev.simTracks_.size()); - - if (g_run_fit_std) runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); - cuFitter.freeDevice(); - } -#endif - separate_first_call_for_meaningful_profiling_numbers(); - - // Reorganization (copyIn) can eventually be multithreaded. -// FIXME: revisit multithreading when track building is ported to GPU. -// omp_set_nested(1); - -// omp_set_num_threads(Config::numThreadsEvents); - double total_gpu_time = dtime(); -//#pragma omp parallel reduction(+:s_tmp) - { -// int numThreadsEvents = omp_get_num_threads(); -// int thr_idx = omp_get_thread_num(); - int numThreadsEvents = 1; - int thr_idx = 0; - - // FitterCU is declared here to share allocations and deallocations - // between the multiple events processed by a single thread. - int gplex_size = 10000; - FitterCU cuFitter(gplex_size); - cuFitter.allocateDevice(); - cuFitter.allocate_extra_addBestHit(); - - for (int evt = thr_idx+1; evt <= Config::nEvents; evt+= numThreadsEvents) { - int idx = thr_idx; - printf("==============================================================\n"); - printf("Processing event %d with thread %d\n", evt, idx); - Event &ev = events[evt-1]; - std::vector plex_tracks_ev; - plex_tracks_ev.resize(ev.simTracks_.size()); - double tmp = 0, tmp2bh = 0, tmp2 = 0, tmp2ce = 0; - - //if (g_run_fit_std) tmp = runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); - runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); - - printf("Matriplex fit = %.5f -------------------------------------", tmp); - printf("\n"); - s_tmp += tmp; -#if 0 // 0 for timing, 1 for validation - // Validation crashes for multiple threads. - // It is something in relation to ROOT. Not sure what. - //if (omp_get_num_threads() <= 1) { - //if (g_run_fit_std) { - std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root"; - //} - //} -#endif - } - cuFitter.free_extra_addBestHit(); - cuFitter.freeDevice(); - } - std::cerr << "###### [Fitting] Total GPU time: " << dtime() - total_gpu_time << " ######\n"; -} - - -double runFittingTestPlexGPU(FitterCU &cuFitter, - Event& ev, std::vector& rectracks) -{ - std::vector& simtracks = ev.simTracks_; - - Track *tracks_cu; - cudaMalloc((void**)&tracks_cu, simtracks.size()*sizeof(Track)); - cudaMemcpyAsync(tracks_cu, &simtracks[0], simtracks.size()*sizeof(Track), - cudaMemcpyHostToDevice, cuFitter.get_stream()); - - EventOfHitsCU events_of_hits_cu; - events_of_hits_cu.reserve_layers(ev.layerHits_); - events_of_hits_cu.copyFromCPU(ev.layerHits_, cuFitter.get_stream()); - - double time = dtime(); - - cuFitter.FitTracks(tracks_cu, simtracks.size(), events_of_hits_cu, Config::nLayers, true); - - cudaMemcpy(&rectracks[0], tracks_cu, simtracks.size()*sizeof(Track), cudaMemcpyDeviceToHost); - - time = dtime() - time; - - cudaFree(tracks_cu); - - return time; -} -#endif } // end namespace mkfit diff --git a/mkFit/fittestMPlex.h b/mkFit/fittestMPlex.h index 3f932e7a..8925167d 100644 --- a/mkFit/fittestMPlex.h +++ b/mkFit/fittestMPlex.h @@ -4,18 +4,9 @@ #include "Event.h" #include "Track.h" -#ifdef USE_CUDA -#include "FitterCU.h" -#endif - namespace mkfit { double runFittingTestPlex(Event& ev, std::vector& rectracks); -#ifdef USE_CUDA -void runAllEventsFittingTestPlexGPU(std::vector& events); -double runFittingTestPlexGPU(FitterCU &cuFitter, Event& ev, std::vector& rectracks); -#endif - } // end namespace mkfit #endif diff --git a/mkFit/fittracks_kernels.cu b/mkFit/fittracks_kernels.cu deleted file mode 100644 index 6a042d8b..00000000 --- a/mkFit/fittracks_kernels.cu +++ /dev/null @@ -1,51 +0,0 @@ -#include "fittracks_kernels.h" - -#include "kalmanUpdater_kernels.h" -#include "propagation_kernels.h" - -constexpr int BLOCK_SIZE_X = 256; - -__global__ void fittracks_kernel( - GPlexLV par_iP, GPlexLS Err_iP, - GPlexHV *msPar_arr, GPlexHS *msErr_arr, - GPlexLV par_iC, GPlexLS Err_iC, - GPlexLL errorProp, GPlexQI inChg, - const bool useParamBfield, - const int Nhits, int N) -{ - int grid_width = blockDim.x * gridDim.x; - int n = threadIdx.x + blockIdx.x * blockDim.x; - for (int hi = 0; hi < Nhits; ++hi) { - GPlexHV &msPar = msPar_arr[hi]; - GPlexHS &msErr = msErr_arr[hi]; - - for (int z = 0; z < (N-1)/grid_width +1; z++) { - n += z*grid_width; - - propagation_fn(Err_iC, par_iC, inChg, msPar, Err_iP, par_iP, useParamBfield, n, N); - kalmanUpdate_fn(Err_iP, msErr, par_iP, msPar, par_iC, Err_iC, n, N); - } - } -} - -void fittracks_wrapper(cudaStream_t &stream, - GPlexLS &Err_iP, GPlexLV &par_iP, - GPlexHS *msErr_arr, GPlexHV *msPar_arr, - GPlexLS &Err_iC, GPlexLV &par_iC, - GPlexLL &errorProp, GPlexQI &inChg, - const bool useParamBfield, - const int Nhits, const int N) -{ - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - max_blocks_x); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - - fittracks_kernel <<< grid, block, 0, stream >>> - (par_iP, Err_iP, - msPar_arr, msErr_arr, - par_iC, Err_iC, - errorProp, inChg, - useParamBfield, - Nhits, N); -} diff --git a/mkFit/fittracks_kernels.h b/mkFit/fittracks_kernels.h deleted file mode 100644 index 9afeb13b..00000000 --- a/mkFit/fittracks_kernels.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef FITTRACKS_KERNELS_H_G3FDJYTX -#define FITTRACKS_KERNELS_H_G3FDJYTX - -#include "GPlex.h" - -namespace mkfit { - -void fittracks_wrapper(cudaStream_t &stream, - GPlexLS &Err_iP, GPlexLV &par_iP, - GPlexHS *msErr, GPlexHV *msPar, - GPlexLS &Err_iC, GPlexLV &par_iC, - GPlexLL &errorProp, GPlexQI &inChg, - const bool useParamBfield, - const int hit_idx, const int N); - -} // end namespace mkfit -#endif /* end of include guard: FITTRACKS_KERNELS_H_G3FDJYTX */ diff --git a/mkFit/gpu_utils.cu b/mkFit/gpu_utils.cu deleted file mode 100644 index 2800aa39..00000000 --- a/mkFit/gpu_utils.cu +++ /dev/null @@ -1,9 +0,0 @@ -#include "gpu_utils.h" - -void sync_gpu() { - cudaCheckErrorSync(); -} - -void separate_first_call_for_meaningful_profiling_numbers() { - sync_gpu(); -} diff --git a/mkFit/gpu_utils.h b/mkFit/gpu_utils.h deleted file mode 100644 index 6f84d800..00000000 --- a/mkFit/gpu_utils.h +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef GPU_UTILS_H -#define GPU_UTILS_H - -#include -#include - -#include -#include - -#define cudaCheckError() \ - do { \ - cudaError_t e=cudaGetLastError(); \ - CubDebugExit(e); \ - } while(0) - -#define cudaCheckErrorSync() \ - do { \ - cudaDeviceSynchronize(); \ - cudaCheckError(); \ - } while(0) - -namespace mkfit { - -// CUDA specific: -// Maximum number of blocks in the X direction of the thread grid. -constexpr int max_blocks_x = INT32_MAX; - -// The first call to a CUDA API function takes the initialization hit. -void separate_first_call_for_meaningful_profiling_numbers(); - -void sync_gpu(); - - -class Tracer { - public: - Tracer(const char* name, unsigned int color) { - nvtxEventAttributes_t eventAttrib = {0}; - eventAttrib.version = NVTX_VERSION; - eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; - eventAttrib.colorType = NVTX_COLOR_ARGB; - eventAttrib.color = color; - eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; - eventAttrib.message.ascii = name; - nvtxRangePushEx(&eventAttrib); - } - ~Tracer() { - nvtxRangePop(); - } -}; - -static std::unordered_map tracer_colors = { - {"Blue", 0x0000FF}, - {"Orange", 0xFFA500}, - {"Red", 0xFF0000}, - {"Pink", 0xFFC0CB}, - {"Green", 0x008000}, - {"PaleGreen", 0x98FB98}, - {"Chocolate", 0xD2691E} -}; - -} // end namespace mkfit - -#endif /* ifndef GPU_UTILS_H */ diff --git a/mkFit/index_selection_kernels.cu b/mkFit/index_selection_kernels.cu deleted file mode 100644 index 8116ad59..00000000 --- a/mkFit/index_selection_kernels.cu +++ /dev/null @@ -1,153 +0,0 @@ -#include "index_selection_kernels.h" -#include "Config.h" -#include "HitStructures.h" -#include "gpu_utils.h" - -#include "stdio.h" - -#define BLOCK_SIZE_X 32 - -constexpr bool tmp_useCMSGeom = false; - -__device__ void selectHitIndices_fn(const LayerOfHitsCU &layer_of_hits, - const GPlexLS &Err, const GPlexLV &Par, GPlexQI &XHitSize, - GPlexHitIdx &XHitArr, const int itrack, const int N) -{ - if (itrack < N) { - bool dump = false; - const float nSigmaPhi = 3; - const float nSigmaZ = 3; - - int xhitsize_tmp = XHitSize[itrack]; - XHitSize[itrack] = 0; - - float z, phi, dz, dphi; - { - const float x = Par(itrack, 0, 0); - const float y = Par(itrack, 1, 0); - - const float r2 = x*x + y*y; - - z = Par(itrack, 2, 0); - phi = getPhi(x, y); - dz = nSigmaZ * sqrtf(Err[5*Err.stride + itrack]); - - const float dphidx = -y/r2, dphidy = x/r2; - const float dphi2 = dphidx * dphidx * Err[0*Err.stride + itrack] + - dphidy * dphidy * Err[2*Err.stride + itrack] + - 2 * dphidx * dphidy * Err[1*Err.stride + itrack]; - -#ifdef HARD_CHECK - assert(dphi2 >= 0); -#endif - - dphi = nSigmaPhi * sqrtf(fabs(dphi2)); - - if (tmp_useCMSGeom) - { - //now correct for bending and for layer thickness unsing linear approximation - const float deltaR = Config::cmsDeltaRad; //fixme! using constant value, to be taken from layer properties - const float r = sqrt(r2); - //here alpha is the difference between posPhi and momPhi - const float alpha = phi - Par(itrack, 4, 0); - float cosA, sinA; - if (Config::useTrigApprox) { - sincos4(alpha, sinA, cosA); - } else { - cosA = cos(alpha); - sinA = sin(alpha); - } - //take abs so that we always inflate the window - const float dist = fabs(deltaR*sinA/cosA); - - dphi += dist / r; - } - } - - const LayerOfHitsCU &L = layer_of_hits; - - if (fabs(dz) > Config::m_max_dz) dz = Config::m_max_dz; - if (fabs(dphi) > Config::m_max_dphi) dphi = Config::m_max_dphi; - - const int zb1 = L.GetZBinChecked(z - dz); - const int zb2 = L.GetZBinChecked(z + dz) + 1; - const int pb1 = L.GetPhiBin(phi - dphi); - const int pb2 = L.GetPhiBin(phi + dphi) + 1; - // MT: The extra phi bins give us ~1.5% more good tracks at expense of 10% runtime. - // const int pb1 = L.GetPhiBin(phi - dphi) - 1; - // const int pb2 = L.GetPhiBin(phi + dphi) + 2; - - if (dump) - printf("LayerOfHitsCU::SelectHitIndices %6.3f %6.3f %6.6f %7.5f %3d %3d %4d %4d\n", - z, phi, dz, dphi, zb1, zb2, pb1, pb2); - - // MT: One could iterate in "spiral" order, to pick hits close to the center. - // http://stackoverflow.com/questions/398299/looping-in-a-spiral - // This would then work best with relatively small bin sizes. - // Or, set them up so I can always take 3x3 array around the intersection. - for (int zi = zb1; zi < zb2; ++zi) - { - for (int pi = pb1; pi < pb2; ++pi) - { - const int pb = pi & L.m_phi_mask; - - // MT: The following line is the biggest hog (4% total run time). - // This comes from cache misses, I presume. - // It might make sense to make first loop to extract bin indices - // and issue prefetches at the same time. - // Then enter vectorized loop to actually collect the hits in proper order. - - /*for (int hi = L.m_phi_bin_infos[zi][pb].first; hi < L.m_phi_bin_infos[zi][pb].second; ++hi)*/ - for (int hi = L.m_phi_bin_infos[zi*Config::m_nphi + pb].first; - hi < L.m_phi_bin_infos[zi*Config::m_nphi + pb].second; ++hi) - { - // MT: Access into m_hit_zs and m_hit_phis is 1% run-time each. - -#ifdef LOH_USE_PHI_Z_ARRAYS - float ddz = fabs(z - L.m_hit_zs[hi]); - float ddphi = fabs(phi - L.m_hit_phis[hi]); - if (ddphi > Config::PI) ddphi = Config::TwoPI - ddphi; - - if (dump) - printf(" SHI %3d %4d %4d %5d %6.3f %6.3f %6.4f %7.5f %s\n", - zi, pi, pb, hi, - L.m_hit_zs[hi], L.m_hit_phis[hi], ddz, ddphi, - (ddz < dz && ddphi < dphi) ? "PASS" : "FAIL"); - - // MT: Commenting this check out gives full efficiency ... - // and means our error estimations are wrong! - // Avi says we should have *minimal* search windows per layer. - // Also ... if bins are sufficiently small, we do not need the extra - // checks, see above. - // if (ddz < dz && ddphi < dphi && XHitSize[itrack] < MPlexHitIdxMax) -#endif - // MT: The following check also makes more sense with spiral traversal, - // we'd be taking in closest hits first. - if (XHitSize[itrack] < GPlexHitIdxMax) - { - XHitArr(itrack, XHitSize[itrack]++, 0) = hi; - } - } - } - } - } -} - - -__global__ void selectHitIndices_kernel(const LayerOfHitsCU layer_of_hits, - const GPlexLS Err, const GPlexLV Par, GPlexQI XHitSize, GPlexHitIdx XHitArr, const int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - selectHitIndices_fn(layer_of_hits, Err, Par, XHitSize, XHitArr, itrack, N); -} - - -void selectHitIndices_wrapper(const cudaStream_t& stream, - const LayerOfHitsCU& layer_of_hits, const GPlexLS& Err, const GPlexLV& Par, - GPlexQI& XHitSize, GPlexHitIdx& XHitArr, const int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - max_blocks_x); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - selectHitIndices_kernel <<< grid, block, 0, stream >>> - (layer_of_hits, Err, Par, XHitSize, XHitArr, N); -} diff --git a/mkFit/index_selection_kernels.h b/mkFit/index_selection_kernels.h deleted file mode 100644 index 801b5a4b..00000000 --- a/mkFit/index_selection_kernels.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _INDEX_SELECTION_KERNELS_H_ -#define _INDEX_SELECTION_KERNELS_H_ - -#include "HitStructuresCU.h" -#include "GPlex.h" - -void selectHitIndices_wrapper(const cudaStream_t& stream, - const LayerOfHitsCU& layer_of_hits, const GPlexLS& Err, const GPlexLV& Par, - GPlexQI& XHitSize, GPlexHitIdx& XHitArr, const int N); - -__device__ void selectHitIndices_fn(const LayerOfHitsCU &layer_of_hits, - const GPlexLS &Err, const GPlexLV &Par, GPlexQI &XHitSize, - GPlexHitIdx &XHitArr, const int itrack, const int N); - -#endif // _INDEX_SELECTION_KERNELS_H_ diff --git a/mkFit/kalmanUpdater_kernels.cu b/mkFit/kalmanUpdater_kernels.cu deleted file mode 100644 index 18155c8c..00000000 --- a/mkFit/kalmanUpdater_kernels.cu +++ /dev/null @@ -1,402 +0,0 @@ -#include "Config.h" -#include "Hit.h" -#include "kalmanUpdater_kernels.h" -#include "computeChi2_kernels.h" -#include "gpu_utils.h" - -// TODO: Clean all the hard-coded #define -#define LS 21 -#define HS 6 -#define LH 18 -#define HV 3 - -#define BLOCK_SIZE_X 32 - - -__device__ void subtract_matrix(const float *a, const int aN, - const float *b, const int bN, - float *c, const int cN, - const int size, const int n) { - for (int i = 0; i < size; ++i) { - c[i*cN + n] = a[i*aN + n] - b[i*bN + n]; - - } -} - -__device__ float getHypot_fn(const float x, const float y) -{ - return sqrt(x*x + y*y); -} - -__device__ -void KalmanHTG_fn(const GPlexRegQF& a00, const GPlexRegQF& a01, - const GPlexReg2S &b, GPlexRegHH &c) -{ - - // HTG = rot * res_loc - // C = A * B - - // Based on script generation and adapted to custom sizes. - c[ 0] = a00[0]*b[ 0]; - c[ 1] = a00[0]*b[ 1]; - c[ 2] = 0.; - c[ 3] = a01[0]*b[ 0]; - c[ 4] = a01[0]*b[ 1]; - c[ 5] = 0.; - c[ 6] = b[ 1]; - c[ 7] = b[ 2]; - c[ 8] = 0.; -} - -__device__ -void KalmanGain_fn(const GPlexLS &A, const GPlexRegHH &b, GPlexRegLH &c, const int n) -{ - // C = A * B, C is 6x3, A is 6x6 sym , B is 6x3 - using T = float; - float *a = A.ptr; - int aN = A.stride; int an = n; // Global array - int bN = 1; int bn = 0; // Register array - int cN = 1; int cn = 0; - -#include "KalmanGain.ah" -} - -#include "KalmanUtilsMPlex.icc" - -__device__ -void KHMult_fn(const GPlexRegLH &A, - const GPlexRegQF& B00, - const GPlexRegQF& B01, - GPlexRegLL &C) -{ - KHMult_imp(A, B00, B01, C, 0, 1); -} - -__device__ -void KHC_fn(const GPlexRegLL &a, const GPlexLS &B, GPlexLS &C, const int n) -{ - // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym - using T = float; - int aN = 1; int an = 0; // Register array - T *b = B.ptr; int bN = B.stride; int bn = n; - T *c = C.ptr; int cN = C.stride; int cn = n; -#include "KHC.ah" -} - -// -__device__ -void ConvertToCCS_fn(const GPlexLV &a, GPlexRegLV &b, GPlexRegLL &c, const int n) -{ - ConvertToCCS_imp(a, b, c, n, n+1); -} - -__device__ -void PolarErr_fn(const GPlexRegLL &a, const float *b, int bN, GPlexRegLL &c, const int n) -{ - // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym - - // Generated code access arrays with variables cN, cn - // c[i*cN+cn] - int aN = 1; int an = 0; // Register array - int bn = n; // Global array - int cN = 1; int cn = 0; -#include "CCSErr.ah" -} - -__device__ -void PolarErrTransp_fn(const GPlexRegLL &a, const GPlexRegLL &b, GPlexLS &C, const int n) -{ - // C = A * B, C is sym, A is 6x6 , B is 6x6 - using T = float; - int aN = 1; int an = 0; - int bN = 1; int bn = 0; - T *c = C.ptr; int cN = C.stride; int cn = n; -#include "CCSErrTransp.ah" -} - -__device__ -void ConvertToCartesian_fn(const GPlexRegLV &a, GPlexLV& b, GPlexRegLL &c, const int n) -{ - ConvertToCartesian_imp(a, b, c, n, n+1); -} - -__device__ -void CartesianErr_fn(const GPlexRegLL &a, const float *b, const int bN, GPlexRegLL &c, const int n) -{ - // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym - int aN = 1; int an = 0; - int bn = n; - int cN = 1; int cn = 0; - -#include "CartesianErr.ah" -} - -__device__ -void CartesianErrTransp_fn(const GPlexRegLL &a, const GPlexRegLL &b, GPlexLS &C, const int n) -{ - // C = A * B, C is sym, A is 6x6 , B is 6x6 - using T = float; - int aN = 1; int an = 0; - int bN = 1; int bn = 0; - T *c = C.ptr; int cN = C.stride; int cn = n; - -#include "CartesianErrTransp.ah" -} - - -/// MultKalmanGain //////////////////////////////////////////////////////////// - -__device__ void upParam_MultKalmanGain_fn( - const float* __restrict__ a, const size_t aN, - const float* b_reg, float *c, const int N, const int n) { - // const T* __restrict__ tells the compiler that it can uses the read-only - // cache, without worrying about coherency. - // c -> kalmanGain, in register - - /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ - // use registers to store values of 'a' that are use multiple times - // To reduce the number of registers and avoid spilling interlace - // read and compute instructions. - float a_reg[3]; - - int j; - j = 0; a_reg[0] = a[n + j*aN]; - j = 1; a_reg[1] = a[n + j*aN]; - j = 3; a_reg[2] = a[n + j*aN]; - - c[ 0] = a_reg[ 0]*b_reg[ 0] + a_reg[ 1]*b_reg[ 1] + a_reg[ 2]*b_reg[ 3]; - c[ 1] = a_reg[ 0]*b_reg[ 1] + a_reg[ 1]*b_reg[ 2] + a_reg[ 2]*b_reg[ 4]; - c[ 2] = a_reg[ 0]*b_reg[ 3] + a_reg[ 1]*b_reg[ 4] + a_reg[ 2]*b_reg[ 5]; - - j = 1; a_reg[0] = a[n + j*aN]; - j = 2; a_reg[1] = a[n + j*aN]; - j = 4; a_reg[2] = a[n + j*aN]; - - c[ 3] = a_reg[ 0]*b_reg[ 0] + a_reg[ 1]*b_reg[ 1] + a_reg[ 2]*b_reg[ 3]; - c[ 4] = a_reg[ 0]*b_reg[ 1] + a_reg[ 1]*b_reg[ 2] + a_reg[ 2]*b_reg[ 4]; - c[ 5] = a_reg[ 0]*b_reg[ 3] + a_reg[ 1]*b_reg[ 4] + a_reg[ 2]*b_reg[ 5]; - - j = 3; a_reg[0] = a[n + j*aN]; - j = 4; a_reg[1] = a[n + j*aN]; - j = 5; a_reg[2] = a[n + j*aN]; - - c[ 6] = a_reg[ 0]*b_reg[ 0] + a_reg[ 1]*b_reg[ 1] + a_reg[ 2]*b_reg[ 3]; - c[ 7] = a_reg[ 0]*b_reg[ 1] + a_reg[ 1]*b_reg[ 2] + a_reg[ 2]*b_reg[ 4]; - c[ 8] = a_reg[ 0]*b_reg[ 3] + a_reg[ 1]*b_reg[ 4] + a_reg[ 2]*b_reg[ 5]; - - j = 6; a_reg[0] = a[n + j*aN]; - j = 7; a_reg[1] = a[n + j*aN]; - j = 8; a_reg[2] = a[n + j*aN]; - - c[ 9] = a_reg[ 0]*b_reg[ 0] + a_reg[ 1]*b_reg[ 1] + a_reg[ 2]*b_reg[ 3]; - c[10] = a_reg[ 0]*b_reg[ 1] + a_reg[ 1]*b_reg[ 2] + a_reg[ 2]*b_reg[ 4]; - c[11] = a_reg[ 0]*b_reg[ 3] + a_reg[ 1]*b_reg[ 4] + a_reg[ 2]*b_reg[ 5]; - - j = 10; a_reg[0] = a[n + j*aN]; - j = 11; a_reg[1] = a[n + j*aN]; - j = 12; a_reg[2] = a[n + j*aN]; - - c[12] = a_reg[ 0]*b_reg[ 0] + a_reg[ 1]*b_reg[ 1] + a_reg[ 2]*b_reg[ 3]; - c[13] = a_reg[ 0]*b_reg[ 1] + a_reg[ 1]*b_reg[ 2] + a_reg[ 2]*b_reg[ 4]; - c[14] = a_reg[ 0]*b_reg[ 3] + a_reg[ 1]*b_reg[ 4] + a_reg[ 2]*b_reg[ 5]; - - j = 15; a_reg[0] = a[n + j*aN]; - j = 16; a_reg[1] = a[n + j*aN]; - j = 17; a_reg[2] = a[n + j*aN]; - - c[15] = a_reg[ 0]*b_reg[ 0] + a_reg[ 1]*b_reg[ 1] + a_reg[ 2]*b_reg[ 3]; - c[16] = a_reg[ 0]*b_reg[ 1] + a_reg[ 1]*b_reg[ 2] + a_reg[ 2]*b_reg[ 4]; - c[17] = a_reg[ 0]*b_reg[ 3] + a_reg[ 1]*b_reg[ 4] + a_reg[ 2]*b_reg[ 5]; -} - -/// Invert Cramer Symetric //////////////////////////////////////////////////// -__device__ void invertCramerSym_fn(float *a) { - // a is in registers. - // to use global memory, a stride will be required and accesses would be: - // a[n + stride_a * i]; - typedef float TT; - - const TT c00 = a[2] * a[5] - a[4] * a[4]; - const TT c01 = a[4] * a[3] - a[1] * a[5]; - const TT c02 = a[1] * a[4] - a[2] * a[3]; - const TT c11 = a[5] * a[0] - a[3] * a[3]; - const TT c12 = a[3] * a[1] - a[4] * a[0]; - const TT c22 = a[0] * a[2] - a[1] * a[1]; - - const TT det = a[0] * c00 + a[1] * c01 + a[3] * c02; - - const TT s = TT(1) / det; - - a[0] = s*c00; - a[1] = s*c01; - a[2] = s*c11; - a[3] = s*c02; - a[4] = s*c12; - a[5] = s*c22; -} - -__device__ void invertCramerSym2x2_fn(GPlexReg2S &a) { - float det = a[0] * a[2] - a[1] * a[1]; - const float s = float(1) / det; - const float tmp = s * a[2]; - a[1] *= -s; - a[2] = s * a[0]; - a[0] = tmp; -} - -__device__ void subtractFirst3_fn(const GPlexHV __restrict__ &A, - const GPlexLV __restrict__ &B, - GPlexRegHV &C, const int N, int n) { - using T = float; - const T *a = A.ptr; int aN = A.stride; - const T *b = B.ptr; int bN = B.stride; - T *c = C.arr; - /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ - - if(n < N) { - c[0] = a[0*aN+n] - b[0*bN+n]; - c[1] = a[1*aN+n] - b[1*bN+n]; - c[2] = a[2*aN+n] - b[2*bN+n]; - } -} - -/// AddIntoUpperLeft3x3 ////////////////////////////////////////////////////// -__device__ void addIntoUpperLeft3x3_fn(const GPlexLS __restrict__ &A, - const GPlexHS __restrict__ &B, - GPlexRegHS &C, const int N, const int n) { - using T = float; - const T *a = A.ptr; int aN = A.stride; - const T *b = B.ptr; int bN = B.stride; - T *c = C.arr; - /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ - - if(n < N) { - c[0] = a[0*aN+n] + b[0*bN+n]; - c[1] = a[1*aN+n] + b[1*bN+n]; - c[2] = a[2*aN+n] + b[2*bN+n]; - c[3] = a[3*aN+n] + b[3*bN+n]; - c[4] = a[4*aN+n] + b[4*bN+n]; - c[5] = a[5*aN+n] + b[5*bN+n]; - } -} - - -/// MultResidualsAdd ////////////////////////////////////////////////////////// -__device__ void multResidualsAdd_fn( - const GPlexRegLH ®_a, - const GPlexLV __restrict__ &B, - const GPlexReg2V &c, - GPlexLV &D, - const int N, const int n) { - - MultResidualsAdd_imp(reg_a, B, c, D, n, n+1); -} - -__device__ -void MultResidualsAdd_all_reg(const GPlexRegLH &a, - const GPlexRegLV &b, - const GPlexReg2V &c, - GPlexRegLV &d) -{ - // outPar = psPar + kalmanGain*(dPar) - // D = B A C - // where right half of kalman gain is 0 - - // XXX Regenerate with a script. - // generate loop (can also write it manually this time, it's not much) - d[0] = b[0] + a[ 0] * c[0] + a[ 1] * c[1]; - d[1] = b[1] + a[ 3] * c[0] + a[ 4] * c[1]; - d[2] = b[2] + a[ 6] * c[0] + a[ 7] * c[1]; - d[3] = b[3] + a[ 9] * c[0] + a[10] * c[1]; - d[4] = b[4] + a[12] * c[0] + a[13] * c[1]; - d[5] = b[5] + a[15] * c[0] + a[16] * c[1]; -} - - -__device__ void kalmanUpdate_fn( - GPlexLS &propErr, const GPlexHS __restrict__ &msErr, - const GPlexLV __restrict__ &par_iP, const GPlexHV __restrict__ &msPar, - GPlexLV &par_iC, GPlexLS &outErr, const int n, const int N) { - // Note: similar results with propErr kept in registers. - // It is read-only so using the read-only cache yields more flexibility - // wrt block size without increasing the pressure on registers to much. - // There is no need to keep resErr and kalmanGain as global memory arrays. - GPlexRegHS resErr_reg; - - // If there is more matrices than max_blocks_x * BLOCK_SIZE_X - if (n < N) { - for (int j = 0; j < HS; ++j) { - resErr_reg[j] = 0; //resErr[j*resErr_stride + n]; - } - - // FIXME: Add useCMSGeom -> port propagateHelixToRMPlex -#if 0 - if (Config::useCMSGeom) { - propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); - } else { - propErr = psErr; - propPar = psPar; - } -#endif - GPlexRegQF rotT00; - GPlexRegQF rotT01; - const float r = hipo(msPar(n, 0, 0), msPar(n, 1, 0)); - rotT00[0] = -(msPar(n, 1, 0) + par_iP(n, 1, 0))/(2*r); - rotT01[0] = (msPar(n, 0, 0) + par_iP(n, 0, 0))/(2*r); - - GPlexRegHV res_glo; - subtractFirst3_fn(msPar, par_iP, res_glo, N, n); - - addIntoUpperLeft3x3_fn(propErr, msErr, resErr_reg, N, n); - GPlexReg2V res_loc; //position residual in local coordinates - RotateResidulsOnTangentPlane_fn(rotT00,rotT01,res_glo,res_loc); - GPlexReg2S resErr_loc; // 2x2 sym - GPlexRegHH tempHH; // 3*3 sym - ProjectResErr_fn (rotT00, rotT01, resErr_reg, tempHH); - ProjectResErrTransp_fn(rotT00, rotT01, tempHH, resErr_loc); - - /*invertCramerSym_fn(resErr_reg);*/ - invertCramerSym2x2_fn(resErr_loc); - - // Kalman update in "polar" coordinates - GPlexRegLH K; - KalmanHTG_fn(rotT00, rotT01, resErr_loc, tempHH); - KalmanGain_fn(propErr, tempHH, K, n); - - multResidualsAdd_fn(K, par_iP, res_loc, par_iC, N, n);// propPar_pol is now the updated parameters in "polar" coordinates - GPlexRegLL tempLL; - - KHMult_fn(K, rotT00, rotT01, tempLL); - KHC_fn(tempLL, propErr, outErr, n); - subtract_matrix(propErr.ptr, propErr.stride, outErr.ptr, outErr.stride, - outErr.ptr, outErr.stride, LS, n); - - } -} - -__global__ void kalmanUpdate_kernel( - GPlexLS propErr, const GPlexHS __restrict__ msErr, - const GPlexLV __restrict__ par_iP, const GPlexHV __restrict__ msPar, - GPlexLV par_iC, GPlexLS outErr, const int N) { - int grid_width = blockDim.x * gridDim.x; - int n = threadIdx.x + blockIdx.x * blockDim.x; - - for (int z = 0; z < (N-1)/grid_width +1; z++) { - n += z*grid_width; - kalmanUpdate_fn(propErr, msErr, par_iP, msPar, par_iC, outErr, n, N); - } -} - -void kalmanUpdate_wrapper(const cudaStream_t& stream, - GPlexLS& d_propErr, const GPlexHS& d_msErr, - GPlexLV& d_par_iP, const GPlexHV& d_msPar, - GPlexLV& d_par_iC, GPlexLS& d_outErr, - const int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - max_blocks_x); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - kalmanUpdate_kernel <<>> - (d_propErr, d_msErr, d_par_iP, d_msPar, d_par_iC, d_outErr, N); -} - diff --git a/mkFit/kalmanUpdater_kernels.h b/mkFit/kalmanUpdater_kernels.h deleted file mode 100644 index 112462eb..00000000 --- a/mkFit/kalmanUpdater_kernels.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef _KALMAN_UPDATER_KERNELS_H_ -#define _KALMAN_UPDATER_KERNELS_H_ - -#include "GPlex.h" - -void kalmanUpdate_wrapper(const cudaStream_t& stream, - GPlexLS& d_propErr, const GPlexHS& d_msErr, - GPlexLV& d_par_iP, const GPlexHV& d_msPar, - GPlexLV& d_par_iC, GPlexLS& d_outErr, - const int N); - -//void reorganizeMs_wrapper(cudaStream_t& stream, GPlexQF& msPar, -// float *full_posArray, GPlexHS& msErr, -// float *full_errArray, int *full_hitIdx, int hi, int maxHits, -// int N, int hs, int hv, int Nhits); - -__global__ void kalmanUpdate_kernel( - GPlexLS propErr, const GPlexHS __restrict__ msErr, - const GPlexLV __restrict__ par_iP, const GPlexHV __restrict__ msPar, - GPlexLV par_iC, GPlexLS outErr, const int N); - -__device__ void kalmanUpdate_fn( - GPlexLS &propErr, const GPlexHS __restrict__ &msErr, - const GPlexLV __restrict__ &par_iP, const GPlexHV __restrict__ &msPar, - GPlexLV &par_iC, GPlexLS &outErr, const int itrack, const int N); - -__device__ void addIntoUpperLeft3x3_fn(const GPlexLS __restrict__ &A, - const GPlexHS __restrict__ &B, - GPlexRegHS &c, const int N, const int n); - -__device__ void subtractFirst3_fn(const GPlexHV __restrict__ &A, - const GPlexLV __restrict__ &B, - GPlexRegHV &C, const int N, const int n); - -__device__ void invertCramerSym_fn(float *a); -__device__ void invertCramerSym2x2_fn(GPlexReg2S &a); - -#endif // _KALMAN_UPDATER_KERNELS_H_ diff --git a/mkFit/mkFit.cc b/mkFit/mkFit.cc index 18bd9ae2..c57c01c1 100644 --- a/mkFit/mkFit.cc +++ b/mkFit/mkFit.cc @@ -23,12 +23,6 @@ #include "Validation.h" #endif -#ifdef USE_CUDA -#include "FitterCU.h" -#include "BuilderCU.h" -#include "gpu_utils.h" -#endif - #include //#define DEBUG #include "Debug.h" @@ -89,8 +83,6 @@ namespace bool g_run_build_ce = false; bool g_run_build_fv = false; - bool g_seed_based = false; - std::string g_operation = "simulate_and_process";; std::string g_input_file = ""; std::string g_output_file = ""; @@ -292,46 +284,6 @@ void test_standard() double t_skip[NT] = {0}; double time = dtime(); -#if USE_CUDA_OLD - tbb::task_scheduler_init tbb_init(Config::numThreadsFinder); - //tbb::task_scheduler_init tbb_init(tbb::task_scheduler_init::automatic); - - //omp_set_num_threads(Config::numThreadsFinder); - // fittest time. Sum of all events. In case of multiple events - // being run simultaneously in different streams this time will - // be larger than the elapsed time. - - std::vector events; - std::vector validations(Config::nEvents); - - events.reserve(Config::nEvents); - // simulations are all performed before the fitting loop. - // It is mandatory in order to see the benefits of running - // multiple streams. - for (int evt = 1; evt <= Config::nEvents; ++evt) { - printf("Simulating event %d\n", evt); - events.emplace_back(geom, val, evt); - events.back().Simulate(); - dprint("Event #" << events.back().evtID() << " simtracks " << events.back().simTracks_.size() << " layerhits " << events.back().layerHits_.size()); - } - - // The first call to a GPU function always take a very long time. - // Everything needs to be initialized. - // Nothing is done in this function, except calling an harmless - // CUDA function. These function can be changed to another one - // if it becomes important to time (e.g. if you want the profiler to - // tell you how much time is spend running cudaDeviceSynchronize(), - // use another function). - separate_first_call_for_meaningful_profiling_numbers(); - - if (g_run_fit_std) runAllEventsFittingTestPlexGPU(events); - - if (g_run_build_all || g_run_build_bh) { - double total_best_hit_time = 0.; - total_best_hit_time = runAllBuildingTestPlexBestHitGPU(events); - std::cout << "Total best hit time (GPU): " << total_best_hit_time << std::endl; - } -#else std::atomic nevt{g_start_event}; std::atomic seedstot{0}, simtrackstot{0}, candstot{0}; std::atomic maxHits_all{0}, maxLayer_all{0}; @@ -344,13 +296,6 @@ void test_standard() std::vector> fps; fps.reserve(Config::numThreadsEvents); -#if USE_CUDA - separate_first_call_for_meaningful_profiling_numbers(); - - std::vector>> cuFitters(Config::numThreadsEvents); - std::vector> cuBuilders(Config::numThreadsEvents); -#endif - const std::string valfile("valtree"); for (int i = 0; i < Config::numThreadsEvents; ++i) { @@ -362,18 +307,6 @@ void test_standard() if (g_operation == "read") { fps.emplace_back(fopen(g_input_file.c_str(), "r"), [](FILE* fp) { if (fp) fclose(fp); }); } -#if USE_CUDA - constexpr int gplex_width = 10000; - cuFitters[i].reset(new FitterCU(gplex_width)); - cuFitters[i].get()->allocateDevice(); - cuFitters[i].get()->allocate_extra_addBestHit(); - cuFitters[i].get()->allocate_extra_combinatorial(); - cuFitters[i].get()->createStream(); - cuFitters[i].get()->setNumberTracks(gplex_width); - - cuBuilders[i].reset(new BuilderCU(cuFitters[i].get())); - cuBuilders[i].get()->allocateGeometry(geom); -#endif } tbb::task_scheduler_init tbb_init(Config::numThreadsFinder); @@ -398,11 +331,6 @@ void test_standard() int evstart = thisthread*events_per_thread; int evend = std::min(Config::nEvents, evstart+events_per_thread); -#if USE_CUDA - auto& cuFitter = *cuFitters[thisthread].get(); - auto& cuBuilder = *cuBuilders[thisthread].get(); -#endif - dprint("thisthread " << thisthread << " events " << Config::nEvents << " events/thread " << events_per_thread << " range " << evstart << ":" << evend); @@ -440,18 +368,11 @@ void test_standard() int maxLayer_thisthread = 0; for (int b = 0; b < Config::finderReportBestOutOfN; ++b) { - #ifndef USE_CUDA t_cur[0] = (g_run_fit_std) ? runFittingTestPlex(ev, plex_tracks) : 0; t_cur[1] = (g_run_build_all || g_run_build_bh) ? runBuildingTestPlexBestHit(ev, mkb) : 0; t_cur[3] = (g_run_build_all || g_run_build_ce) ? runBuildingTestPlexCloneEngine(ev, mkb) : 0; t_cur[4] = (g_run_build_all || g_run_build_fv) ? runBuildingTestPlexFV(ev, mkb) : 0; if (g_run_build_all || g_run_build_cmssw) runBuildingTestPlexDumbCMSSW(ev, mkb); - #else - t_cur[0] = (g_run_fit_std) ? runFittingTestPlexGPU(cuFitter, ev, plex_tracks) : 0; - t_cur[1] = (g_run_build_all || g_run_build_bh) ? runBuildingTestPlexBestHitGPU(ev, mkb, cuBuilder) : 0; - // XXXX MT note for Matthieu: ev_tmp no longer exists ----------------------------------v - t_cur[3] = (g_run_build_all || g_run_build_ce) ? runBuildingTestPlexCloneEngineGPU(ev, ev_tmp, mkb, cuBuilder, g_seed_based) : 0; - #endif t_cur[2] = (g_run_build_all || g_run_build_std) ? runBuildingTestPlexStandard(ev, mkb) : 0; if (g_run_build_ce){ ncands_thisthread = mkb.total_cands(); @@ -495,7 +416,6 @@ void test_standard() } }, tbb::simple_partitioner()); -#endif time = dtime() - time; printf("\n"); @@ -520,14 +440,6 @@ void test_standard() val->fillConfigTree(); val->saveTTrees(); } -#if USE_CUDA - for (int i = 0; i < Config::numThreadsEvents; ++i) { - cuFitters[i].get()->freeDevice(); - cuFitters[i].get()->free_extra_addBestHit(); - cuFitters[i].get()->free_extra_combinatorial(); - cuFitters[i].get()->destroyStream(); - } -#endif } //============================================================================== @@ -713,10 +625,6 @@ int main(int argc, const char *argv[]) " == --cmssw-val --read-cmssw-tracks --cmssw-match-fw %s --cmssw-match-bk %s\n" " must enable: --cmssw-pureseeds --backward-fit-pca\n" "\n----------------------------------------------------------------------------------------------------------\n\n" - "GPU specific options: \n\n" - " --num-thr-reorg number of threads to run the hits reorganization (def: %d)\n" - " --seed-based For CE. Switch to 1 CUDA thread per seed (def: %s)\n" - "\n----------------------------------------------------------------------------------------------------------\n\n" , argv[0], @@ -785,10 +693,8 @@ int main(int argc, const char *argv[]) getOpt(hitBased, g_match_opts).c_str(), getOpt(trkParamBased, g_match_opts).c_str(), getOpt(trkParamBased, g_match_opts).c_str(), getOpt(hitBased, g_match_opts).c_str(), getOpt(trkParamBased, g_match_opts).c_str(), getOpt(trkParamBased, g_match_opts).c_str(), - getOpt(labelBased, g_match_opts).c_str(), getOpt(labelBased, g_match_opts).c_str(), + getOpt(labelBased, g_match_opts).c_str(), getOpt(labelBased, g_match_opts).c_str() - Config::numThreadsReorg, - b2a(g_seed_based) ); printf("List of options for string based inputs \n"); @@ -1102,15 +1008,6 @@ int main(int argc, const char *argv[]) Config::cmsswMatchingFW = labelBased; Config::cmsswMatchingBK = labelBased; } - else if (*i == "--num-thr-reorg") - { - next_arg_or_die(mArgs, i); - Config::numThreadsReorg = atoi(i->c_str()); - } - else if (*i == "--seed-based") - { - g_seed_based = true; - } else { fprintf(stderr, "Error: Unknown option/argument '%s'.\n", i->c_str()); diff --git a/mkFit/propagation_kernels.cu b/mkFit/propagation_kernels.cu deleted file mode 100644 index 74691be6..00000000 --- a/mkFit/propagation_kernels.cu +++ /dev/null @@ -1,316 +0,0 @@ -#include "Config.h" -#include "Debug.h" -#include "propagation_kernels.h" -#include -#include "gpu_utils.h" - - -// values from 32 to 512 give good results. -// 32 gives slightly better results (on a K40) -constexpr int BLOCK_SIZE_X = 32; - - -__device__ -void MultHelixProp_fn(const GPlexRegLL& a, const GPlexLS& b, GPlexRegLL& c, const int n) -{ - // C = A * B - - typedef float T; - /*const idx_t N = NN;*/ - - /*const T *a = A.fArray; ASSUME_ALIGNED(a, 64);*/ - /*const T *b = B.fArray; ASSUME_ALIGNED(b, 64);*/ - /*T *c = C.fArray; ASSUME_ALIGNED(c, 64);*/ - /*float *a = A.ptr;*/ - int aN = 1 ; int an = 0; // Register array - int bN = b.stride; int bn = n; // Global array - int cN = 1; int cn = 0; - -#include "MultHelixProp.ah" -} - -__device__ -void MultHelixPropTransp_fn(const GPlexRegLL& a, const GPlexRegLL& b, GPlexLS& c, const int n) -{ - // C = B * AT; - - typedef float T; - int aN = 1 ; int an = 0; // Register array - int bN = 1 ; int bn = 0; // Global array - int cN = c.stride; int cn = n; - -#include "MultHelixPropTransp.ah" -} - -// Not passing msRad.stride, as QF == 1 (second dim f msRad) -__device__ void computeMsRad_fn(const GPlexHV& __restrict__ msPar, - GPlexRegQF &msRad, const int N, const int n) { - /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ - if (n < N) { - msRad(n, 0, 0) = hipo(msPar(n, 0, 0), msPar(n, 1, 0)); - } -} - - -#include "PropagationMPlex.icc" - -__device__ void helixAtRFromIterativeCCS_fn(const GPlexLV& inPar, - const GPlexQI& inChg, GPlexLV& outPar_global, const GPlexRegQF& msRad, - GPlexRegLL& errorProp,const bool useParamBfield, const int N, const int n) -{ - - GPlexRegLV outPar; - - if (n < N) { - for (int j = 0; j < 5; ++j) { - outPar[j] = outPar_global(n, j, 0); - } - errorProp.SetVal(0); - - helixAtRFromIterativeCCS_impl(inPar, inChg, outPar, msRad, errorProp, n, n+1, -1, useParamBfield); - - // Once computations are done. Get values from registers to global memory. - for (int j = 0; j < 5; ++j) { - outPar_global(n, j, 0) = outPar[j]; - } - } -} - -/// Similarity //////////////////////////////////////////////////////////////// -__device__ void similarity_fn(GPlexRegLL &a, GPlexLS &b, int N, int n) { - - size_t bN = b.stride; - - // Keep most values in registers. - //float b_reg[LL2]; - GPlexRegLL b_reg; - // To avoid using too many registers, tmp[] as a limited size and is reused. - float tmp[6]; - - if (n < N) { - for (int j = 0; j < b.kSize; j++) { - b_reg[j] = b[n + j*bN]; - } - - tmp[ 0] = a[0]*b_reg[ 0] + a[1]*b_reg[ 1] + a[3]*b_reg[ 6] + a[4]*b_reg[10]; - tmp[ 1] = a[0]*b_reg[ 1] + a[1]*b_reg[ 2] + a[3]*b_reg[ 7] + a[4]*b_reg[11]; - /*tmp[ 2] = a[0]*b_reg[ 3] + a[1]*b_reg[ 4] + a[3]*b_reg[ 8] + a[4]*b_reg[12];*/ - tmp[ 3] = a[0]*b_reg[ 6] + a[1]*b_reg[ 7] + a[3]*b_reg[ 9] + a[4]*b_reg[13]; - tmp[ 4] = a[0]*b_reg[10] + a[1]*b_reg[11] + a[3]*b_reg[13] + a[4]*b_reg[14]; - /*tmp[ 5] = a[0]*b_reg[15] + a[1]*b_reg[16] + a[3]*b_reg[18] + a[4]*b_reg[19];*/ - - b[ 0*bN+n] = tmp[ 0]*a[0] + tmp[ 1]*a[1] + tmp[ 3]*a[3] + tmp[ 4]*a[4]; - - - tmp[ 0] = a[6]*b_reg[ 0] + a[7]*b_reg[ 1] + a[9]*b_reg[ 6] + a[10]*b_reg[10]; - tmp[ 1] = a[6]*b_reg[ 1] + a[7]*b_reg[ 2] + a[9]*b_reg[ 7] + a[10]*b_reg[11]; - /*tmp[ 8] = a[6]*b_reg[ 3] + a[7]*b_reg[ 4] + a[9]*b_reg[ 8] + a[10]*b_reg[12];*/ - tmp[ 3] = a[6]*b_reg[ 6] + a[7]*b_reg[ 7] + a[9]*b_reg[ 9] + a[10]*b_reg[13]; - tmp[ 4] = a[6]*b_reg[10] + a[7]*b_reg[11] + a[9]*b_reg[13] + a[10]*b_reg[14]; - /*tmp[11] = a[6]*b_reg[15] + a[7]*b_reg[16] + a[9]*b_reg[18] + a[10]*b_reg[19];*/ - - b[ 1*bN+n] = tmp[ 0]*a[0] + tmp[ 1]*a[1] + tmp[ 3]*a[3] + tmp[ 4]*a[4]; - b[ 2*bN+n] = tmp[ 0]*a[6] + tmp[ 1]*a[7] + tmp[ 3]*a[9] + tmp[ 4]*a[10]; - - - tmp[ 0] = a[12]*b_reg[ 0] + a[13]*b_reg[ 1] + b_reg[ 3] + a[15]*b_reg[ 6] + a[16]*b_reg[10] + a[17]*b_reg[15]; - tmp[ 1] = a[12]*b_reg[ 1] + a[13]*b_reg[ 2] + b_reg[ 4] + a[15]*b_reg[ 7] + a[16]*b_reg[11] + a[17]*b_reg[16]; - tmp[ 2] = a[12]*b_reg[ 3] + a[13]*b_reg[ 4] + b_reg[ 5] + a[15]*b_reg[ 8] + a[16]*b_reg[12] + a[17]*b_reg[17]; - tmp[ 3] = a[12]*b_reg[ 6] + a[13]*b_reg[ 7] + b_reg[ 8] + a[15]*b_reg[ 9] + a[16]*b_reg[13] + a[17]*b_reg[18]; - tmp[ 4] = a[12]*b_reg[10] + a[13]*b_reg[11] + b_reg[12] + a[15]*b_reg[13] + a[16]*b_reg[14] + a[17]*b_reg[19]; - tmp[ 5] = a[12]*b_reg[15] + a[13]*b_reg[16] + b_reg[17] + a[15]*b_reg[18] + a[16]*b_reg[19] + a[17]*b_reg[20]; - - b[ 3*bN+n] = tmp[ 0]*a[0] + tmp[ 1]*a[1] + tmp[ 3]*a[3] + tmp[ 4]*a[4]; - b[ 4*bN+n] = tmp[ 0]*a[6] + tmp[ 1]*a[7] + tmp[ 3]*a[9] + tmp[ 4]*a[10]; - b[ 5*bN+n] = tmp[ 0]*a[12] + tmp[ 1]*a[13] + tmp[ 2] + tmp[ 3]*a[15] + tmp[ 4]*a[16] + tmp[ 5]*a[17]; - - - tmp[ 0] = a[18]*b_reg[ 0] + a[19]*b_reg[ 1] + a[21]*b_reg[ 6] + a[22]*b_reg[10]; - tmp[ 1] = a[18]*b_reg[ 1] + a[19]*b_reg[ 2] + a[21]*b_reg[ 7] + a[22]*b_reg[11]; - tmp[ 2] = a[18]*b_reg[ 3] + a[19]*b_reg[ 4] + a[21]*b_reg[ 8] + a[22]*b_reg[12]; - tmp[ 3] = a[18]*b_reg[ 6] + a[19]*b_reg[ 7] + a[21]*b_reg[ 9] + a[22]*b_reg[13]; - tmp[ 4] = a[18]*b_reg[10] + a[19]*b_reg[11] + a[21]*b_reg[13] + a[22]*b_reg[14]; - tmp[ 5] = a[18]*b_reg[15] + a[19]*b_reg[16] + a[21]*b_reg[18] + a[22]*b_reg[19]; - - b[ 6*bN+n] = tmp[ 0]*a[0] + tmp[ 1]*a[1] + tmp[ 3]*a[3] + tmp[ 4]*a[4]; - b[ 7*bN+n] = tmp[ 0]*a[6] + tmp[ 1]*a[7] + tmp[ 3]*a[9] + tmp[ 4]*a[10]; - b[ 8*bN+n] = tmp[ 0]*a[12] + tmp[ 1]*a[13] + tmp[ 2] + tmp[ 3]*a[15] + tmp[ 4]*a[16] + tmp[ 5]*a[17]; - b[ 9*bN+n] = tmp[ 0]*a[18] + tmp[ 1]*a[19] + tmp[ 3]*a[21] + tmp[ 4]*a[22]; - - - tmp[ 0] = a[24]*b_reg[ 0] + a[25]*b_reg[ 1] + a[27]*b_reg[ 6] + a[28]*b_reg[10]; - tmp[ 1] = a[24]*b_reg[ 1] + a[25]*b_reg[ 2] + a[27]*b_reg[ 7] + a[28]*b_reg[11]; - tmp[ 2] = a[24]*b_reg[ 3] + a[25]*b_reg[ 4] + a[27]*b_reg[ 8] + a[28]*b_reg[12]; - tmp[ 3] = a[24]*b_reg[ 6] + a[25]*b_reg[ 7] + a[27]*b_reg[ 9] + a[28]*b_reg[13]; - tmp[ 4] = a[24]*b_reg[10] + a[25]*b_reg[11] + a[27]*b_reg[13] + a[28]*b_reg[14]; - tmp[ 5] = a[24]*b_reg[15] + a[25]*b_reg[16] + a[27]*b_reg[18] + a[28]*b_reg[19]; - - b[10*bN+n] = tmp[ 0]*a[0] + tmp[ 1]*a[1] + tmp[ 3]*a[3] + tmp[ 4]*a[4]; - b[11*bN+n] = tmp[ 0]*a[6] + tmp[ 1]*a[7] + tmp[ 3]*a[9] + tmp[ 4]*a[10]; - b[12*bN+n] = tmp[ 0]*a[12] + tmp[ 1]*a[13] + tmp[ 2] + tmp[ 3]*a[15] + tmp[ 4]*a[16] + tmp[ 5]*a[17]; - b[13*bN+n] = tmp[ 0]*a[18] + tmp[ 1]*a[19] + tmp[ 3]*a[21] + tmp[ 4]*a[22]; - b[14*bN+n] = tmp[ 0]*a[24] + tmp[ 1]*a[25] + tmp[ 3]*a[27] + tmp[ 4]*a[28]; - - tmp[ 0] = b_reg[15]; - tmp[ 1] = b_reg[16]; - tmp[ 2] = b_reg[17]; - tmp[ 3] = b_reg[18]; - tmp[ 4] = b_reg[19]; - tmp[ 5] = b_reg[20]; - - // MultHelixPropTransp - b[15*bN+n] = tmp[ 0]*a[0] + tmp[ 1]*a[1] + tmp[ 3]*a[3] + tmp[ 4]*a[4]; - b[16*bN+n] = tmp[ 0]*a[6] + tmp[ 1]*a[7] + tmp[ 3]*a[9] + tmp[ 4]*a[10]; - b[17*bN+n] = tmp[ 0]*a[12] + tmp[ 1]*a[13] + tmp[ 2] + tmp[ 3]*a[15] + tmp[ 4]*a[16] + tmp[ 5]*a[17]; - b[18*bN+n] = tmp[ 0]*a[18] + tmp[ 1]*a[19] + tmp[ 3]*a[21] + tmp[ 4]*a[22]; - b[19*bN+n] = tmp[ 0]*a[24] + tmp[ 1]*a[25] + tmp[ 3]*a[27] + tmp[ 4]*a[28]; - b[20*bN+n] = tmp[ 5]; - } -} - - -// PropagationMPlex.cc:propagateHelixToRMPlex, first version with 6 arguments -__device__ void propagation_fn( - GPlexLS &inErr, GPlexLV &inPar, - GPlexQI &inChg, GPlexHV &msPar, - GPlexLS &outErr, GPlexLV &outPar, - const bool useParamBfield, - int n, int N) { - - GPlexRegQF msRad_reg; - // Using registers instead of shared memory is ~ 30% faster. - GPlexRegLL errorProp_reg; - // If there is more matrices than max_blocks_x * BLOCK_SIZE_X - if (n < N) { - for (int i = 0; i < inErr.kSize; ++i) { - outErr[n + i*outErr.stride] = inErr[n + i*inErr.stride]; - } - for (int i = 0; i < inPar.kSize; ++i) { - outPar[n + i*outPar.stride] = inPar[n + i*inPar.stride]; - } - for (int i = 0; i < 36; ++i) { - errorProp_reg[i] = 0.0; - } - computeMsRad_fn(msPar, msRad_reg, N, n); - helixAtRFromIterativeCCS_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, useParamBfield, N, n); - GPlexRegLL temp; - MultHelixProp_fn (errorProp_reg, outErr, temp, n); - MultHelixPropTransp_fn(errorProp_reg, temp, outErr, n); - } -} - - -__global__ void propagation_kernel( - GPlexLS inErr, - GPlexHV msPar, - GPlexLV inPar, GPlexQI inChg, - GPlexLV outPar, - GPlexLS outErr, - const bool useParamBfield, - int N) -{ - int grid_width = blockDim.x * gridDim.x; - int n = threadIdx.x + blockIdx.x * blockDim.x; - for (int z = 0; z < (N-1)/grid_width +1; z++) { - n += z*grid_width; - propagation_fn(inErr, inPar, inChg, msPar, outErr, outPar, useParamBfield, n, N); - } -} - - -void propagation_wrapper(const cudaStream_t& stream, - GPlexHV& msPar, GPlexLS& inErr, - GPlexLV& inPar, GPlexQI& inChg, - GPlexLV& outPar, - GPlexLS& outErr, - const bool useParamBfield, - const int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - max_blocks_x); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - propagation_kernel <<>> - (inErr, msPar, inPar, inChg, outPar, outErr, useParamBfield, N); -} - - -// PropagationMPlex.cc:propagateHelixToRMPlex, second version with 7 arguments -// Imposes the radius -__device__ void propagationForBuilding_fn( - const GPlexLS &inErr, const GPlexLV &inPar, - const GPlexQI &inChg, const float radius, - GPlexLS &outErr, GPlexLV &outPar, - const int n, const int N) { - GPlexRegQF msRad_reg; - // Using registers instead of shared memory is ~ 30% faster. - GPlexRegLL errorProp_reg; - // If there is more matrices than max_blocks_x * BLOCK_SIZE_X - if (n < N) { - - for (int i = 0; i < inErr.kSize; ++i) { - outErr[n + i*outErr.stride] = inErr[n + i*inErr.stride]; - } - for (int i = 0; i < inPar.kSize; ++i) { - outPar[n + i*outPar.stride] = inPar[n + i*inPar.stride]; - } - for (int i = 0; i < 36; ++i) { - errorProp_reg[i] = 0.0; - } - - msRad_reg(n, 0, 0) = radius; - - bool useParamBfield = false; // The propagation with radius as an arg do not use this parameter - helixAtRFromIterativeCCS_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, useParamBfield, N, n); - // TODO: port me - /*if (Config::useCMSGeom) {*/ - /*MPlexQF hitsRl;*/ - /*MPlexQF hitsXi;*/ - /*for (int n = 0; n < NN; ++n) {*/ - /*hitsRl.At(n, 0, 0) = getRlVal(r, outPar.ConstAt(n, 2, 0));*/ - /*hitsXi.At(n, 0, 0) = getXiVal(r, outPar.ConstAt(n, 2, 0));*/ - /*}*/ - /*applyMaterialEffects(hitsRl, hitsXi, outErr, outPar, N_proc);*/ - /*}*/ - /*similarity_fn(errorProp_reg, outErr, N, n);*/ - - // Matriplex version of: - // result.errors = ROOT::Math::Similarity(errorProp, outErr); - - //MultHelixProp can be optimized for polar coordinates, see GenMPlexOps.pl - /*MPlexLL temp;*/ - /*MultHelixProp (errorProp, outErr, temp);*/ - /*MultHelixPropTransp(errorProp, temp, outErr);*/ - GPlexRegLL temp; - MultHelixProp_fn (errorProp_reg, outErr, temp, n); - MultHelixPropTransp_fn(errorProp_reg, temp, outErr, n); - - } -} - -__global__ void propagationForBuilding_kernel( - const GPlexLS inErr, const GPlexLV inPar, - const GPlexQI inChg, const float radius, - GPlexLS outErr, GPlexLV outPar, - const int N) { - int grid_width = blockDim.x * gridDim.x; - int n = threadIdx.x + blockIdx.x * blockDim.x; - - for (int z = 0; z < (N-1)/grid_width +1; z++) { - n += z*grid_width; - propagationForBuilding_fn( inErr, inPar, inChg, radius, outErr, outPar, n, N); - } -} - -void propagationForBuilding_wrapper(const cudaStream_t& stream, - const GPlexLS& inErr, const GPlexLV& inPar, - const GPlexQI& inChg, const float radius, - GPlexLS& outErr, GPlexLV& outPar, - const int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - max_blocks_x); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - propagationForBuilding_kernel<<>> - (inErr, inPar, inChg, radius, outErr, outPar, N); -} - diff --git a/mkFit/propagation_kernels.h b/mkFit/propagation_kernels.h deleted file mode 100644 index a8818038..00000000 --- a/mkFit/propagation_kernels.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef _PROPAGATION_KERNELS_H_ -#define _PROPAGATION_KERNELS_H_ - -#include "GPlex.h" - -namespace mkfit { - -void propagation_wrapper(const cudaStream_t& stream, - GPlexHV& msPar, GPlexLS& inErr, - GPlexLV& inPar, GPlexQI& inChg, - GPlexLV& outPar, - GPlexLS& outErr, - const bool useParamBfield, - const int N); - -void propagationForBuilding_wrapper(const cudaStream_t& stream, - const GPlexLS& inErr, const GPlexLV& inPar, - const GPlexQI& inChg, const float radius, - GPlexLS& outErr, GPlexLV& outPar, - const int N); - -__device__ void propagation_fn( - GPlexLS &inErr, GPlexLV &inPar, - GPlexQI &inChg, GPlexHV &msPar, - GPlexLS &outErr, GPlexLV &outPar, - const bool useParamBfield, - int n, int N); - -__device__ void propagationForBuilding_fn( - const GPlexLS &inErr, const GPlexLV &inPar, - const GPlexQI &inChg, const float radius, - GPlexLS &outErr, GPlexLV &outPar, - const int n, const int N); - -} // end namespace mkfit -#endif // _PROPAGATION_KERNELS_H_ diff --git a/mkFit/reorganize_gplex.cu b/mkFit/reorganize_gplex.cu deleted file mode 100644 index 7efea439..00000000 --- a/mkFit/reorganize_gplex.cu +++ /dev/null @@ -1,510 +0,0 @@ -#include "reorganize_gplex.h" -#include - -#include "FitterCU.h" -#include "accessors_cu.h" -#include "Track.h" -#include "gpu_utils.h" - - -__device__ float *get_posArray(Hit &hit) { - return hit.posArrayCU(); -} - - -__device__ float *get_errArray(Hit &hit) { - return hit.errArrayCU(); -} - - -__device__ float *get_posArray(Track &track) { - return track.posArrayCU(); -} - - -__device__ float *get_errArray(Track &track) { - return track.errArrayCU(); -} - - -template -__device__ void SlurpIn_fn(GPlexObj to, // float *fArray, int stride, int kSize, - const char *arr, const int *vi, const int N) { - int j = threadIdx.x + blockDim.x * blockIdx.x; - if (j -__device__ void SlurpInIdx_fn(GPlexObj to, - const char *arr, const int idx, const int N) { - int j = threadIdx.x + blockDim.x * blockIdx.x; - if (j -__device__ void SlurpOutIdx_fn(GPlexObj from, // float *fArray, int stride, int kSize, - const char *arr, const int idx, const int N) { - int j = threadIdx.x + blockDim.x * blockIdx.x; - if (j>> - (msErr, msPar, layer.m_hits.data(), XHitSize, XHitArr, HitsIdx, hit_cnt, N); - /*cudaDeviceSynchronize();*/ -} - - -__device__ void InputTracksCU_fn (Track *tracks, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexQI &Chg, GPlexQF &Chi2, - GPlexQI &Label, GPlexQI *HitsIdx, - const int beg, const int end, - const int itrack, const int N) { - if (itrack < (end-beg) && itrack < N) { - Track &trk = tracks[beg]; - const char *varr = (char*) &trk; - int off_error = (char*) trk.errArrayCU() - varr; - int off_param = (char*) trk.posArrayCU() - varr; - - int i= itrack + beg; - const Track &trk_i = tracks[i]; - int idx = (char*) &trk_i - varr; - - Label(itrack, 0, 0) = tracks[i].label(); - Chg(itrack, 0, 0) = tracks[i].charge(); - Chi2(itrack, 0, 0) = tracks[i].chi2(); - SlurpInIdx_fn(Err_iP, varr + off_error, idx, N); - SlurpInIdx_fn(Par_iP, varr + off_param, idx, N); - - for (int hi = 0; hi < 3; ++hi) - HitsIdx[hi](itrack, 0, 0) = tracks[i].getHitIdx(hi);//dummy value for now - } -} - - -__global__ void InputTracksCU_kernel(Track *tracks, - GPlexLS Err_iP, GPlexLV Par_iP, - GPlexQI Chg, GPlexQF Chi2, GPlexQI Label, - GPlexQI *HitsIdx, - int beg, int end, int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - InputTracksCU_fn(tracks, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, itrack, N); -} - - -void InputTracksCU_wrapper(const cudaStream_t &stream, - const EtaBinOfCandidatesCU &etaBin, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, - GPlexQI *HitsIdx, - const int beg, const int end, const bool inputProp, int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - max_blocks_x); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - - InputTracksCU_kernel <<< grid, block, 0, stream >>> - (etaBin.m_candidates, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, - beg, end, N); -} - - -__device__ void InputTracksAndHitsCU_fn (Track *tracks, LayerOfHitsCU *layerHits, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexHS *msErr_arr, GPlexHV *msPar_arr, - GPlexQI &Chg, GPlexQF &Chi2, - GPlexQI &Label, GPlexQI *HitsIdx, - const int beg, const int end, - const int itrack, const int N) { - if (itrack < (end-beg) && itrack < N) { - Track &trk = tracks[beg]; - const char *varr = (char*) &trk; - int off_error = (char*) trk.errArrayCU() - varr; - int off_param = (char*) trk.posArrayCU() - varr; - - int i= itrack + beg; - const Track &trk_i = tracks[i]; - int idx = (char*) &trk_i - varr; - - Label(itrack, 0, 0) = tracks[i].label(); - Chg(itrack, 0, 0) = tracks[i].charge(); - Chi2(itrack, 0, 0) = tracks[i].chi2(); - SlurpInIdx_fn(Err_iP, varr + off_error, idx, N); - SlurpInIdx_fn(Par_iP, varr + off_param, idx, N); - - // Note Config::nLayers -- not suitable for building - for (int hi = 0; hi < Config::nLayers; ++hi) { - int hidx = tracks[i].getHitIdx(hi); - Hit &hit = layerHits[hi].m_hits[hidx]; - - HitsIdx[hi](itrack, 0, 0) = idx; - if (hidx < 0) continue; - - SlurpInIdx_fn(msErr_arr[hi], (char *)hit.errArrayCU(), 0, N); - SlurpInIdx_fn(msPar_arr[hi], (char *)hit.posArrayCU(), 0, N); - } - } -} - - -__global__ void InputTracksAndHitsCU_kernel(Track *tracks, LayerOfHitsCU *layers, - GPlexLS Err_iP, GPlexLV Par_iP, - GPlexHS *msErr_arr, GPlexHV *msPar_arr, - GPlexQI Chg, GPlexQF Chi2, GPlexQI Label, - GPlexQI *HitsIdx, - int beg, int end, int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - InputTracksAndHitsCU_fn(tracks, layers, Err_iP, Par_iP, msErr_arr, msPar_arr, - Chg, Chi2, Label, HitsIdx, beg, end, itrack, N); -} - - -void InputTracksAndHitsCU_wrapper(const cudaStream_t &stream, - Track *tracks, EventOfHitsCU &event_of_hits, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexHS *msErr_arr, GPlexHV *msPar_arr, - GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, - GPlexQI *HitsIdx, - const int beg, const int end, - const bool inputProp, int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - max_blocks_x); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - - InputTracksAndHitsCU_kernel <<< grid, block, 0, stream >>> - (tracks, event_of_hits.m_layers_of_hits.data(), - Err_iP, Par_iP, - msErr_arr, msPar_arr, - Chg, Chi2, Label, HitsIdx, - beg, end, N); -} - - -__device__ void OutputParErrCU_fn(Track *tracks, - const GPlexLS &Err, const GPlexLV &Par, - const int beg, const int end, - const int itrack_plex, const int N) { - Track &trk = tracks[beg]; - const char *varr = (char*) &trk; - int off_error = (char*) trk.errArrayCU() - varr; - int off_param = (char*) trk.posArrayCU() - varr; - - int i= itrack_plex + beg; - const Track &trk_i = tracks[i]; - int idx = (char*) &trk_i - varr; - - SlurpOutIdx_fn(Err, varr + off_error, idx, N); - SlurpOutIdx_fn(Par, varr + off_param, idx, N); -} - -__device__ void OutputParErrCU_fn_seed(Track *tracks, - const GPlexLS &Err, const GPlexLV &Par, - const int iseed_ev, - const int icand_ev, - int N) { - Track &trk = tracks[0]; - const char *varr = (char*) &trk; - int off_error = (char*) trk.errArrayCU() - varr; - int off_param = (char*) trk.posArrayCU() - varr; - - int i= iseed_ev * Config::maxCandsPerSeed + icand_ev; - const Track &trk_i = tracks[i]; - int idx = (char*) &trk_i - varr; - - SlurpOutIdx_fn(Err, varr + off_error, idx, N); - SlurpOutIdx_fn(Par, varr + off_param, idx, N); -} - - -__device__ void OutputTracksCU_fn(Track *tracks, - const GPlexLS &Err_iP, const GPlexLV &Par_iP, - const GPlexQI &Chg, const GPlexQF &Chi2, - const GPlexQI &Label, const GPlexQI *HitsIdx, - const int beg, const int end, - const int itrack, const int N, - const bool update_hit_idx) { - if (itrack < (end-beg) && itrack < N) { - Track &trk = tracks[beg]; - const char *varr = (char*) &trk; - int off_error = (char*) trk.errArrayCU() - varr; - int off_param = (char*) trk.posArrayCU() - varr; - - int i= itrack + beg; - const Track &trk_i = tracks[i]; - int idx = (char*) &trk_i - varr; - - SlurpOutIdx_fn(Err_iP, varr + off_error, idx, N); - SlurpOutIdx_fn(Par_iP, varr + off_param, idx, N); - tracks[i].setCharge(Chg(itrack, 0, 0)); - tracks[i].setChi2(Chi2(itrack, 0, 0)); - tracks[i].setLabel(Label(itrack, 0, 0)); - - if (update_hit_idx) { - tracks[i].resetHits(); - /*int nGoodItIdx = 0;*/ - for (int hi = 0; hi < Config::nLayers; ++hi) { - tracks[i].addHitIdx(HitsIdx[hi](itrack, 0, 0),0.); - // FIXME: We probably want to use registers instead of going for gmem class members: - /*int hit_idx = HitsIdx[hi](itrack, 0, 0);*/ - /*tracks[i].setHitIdx(hi, hit_idx);*/ - /*if (hit_idx >= 0) {*/ - /*nGoodItIdx++; */ - /*}*/ - } - /*tracks[i].setNGoodHitIdx(nGoodItIdx);*/ - /*tracks[i].setChi2(0.);*/ - } - } -} - -__global__ void OutputTracksCU_kernel(Track *tracks, - GPlexLS Err_iP, GPlexLV Par_iP, - GPlexQI Chg, GPlexQF Chi2, GPlexQI Label, - GPlexQI *HitsIdx, - int beg, int end, int N, - const bool update_hit_idx=true) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - OutputTracksCU_fn(tracks, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, - beg, end, itrack, N, update_hit_idx); -} - - -void OutputTracksCU_wrapper(const cudaStream_t &stream, - EtaBinOfCandidatesCU &etaBin, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, - GPlexQI *HitsIdx, - const int beg, const int end, const bool outputProp, int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - max_blocks_x); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - - OutputTracksCU_kernel <<< grid, block, 0, stream >>> - (etaBin.m_candidates, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, N); -} - - -void OutputFittedTracksCU_wrapper(const cudaStream_t &stream, - Track *tracks_cu, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, - const int beg, const int end, int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - max_blocks_x); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - - OutputTracksCU_kernel <<< grid, block, 0, stream >>> - (tracks_cu, Err_iP, Par_iP, Chg, Chi2, Label, nullptr, beg, end, N, false); -} - -/////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////// - -// m_tracks_per_seed: play the same role than seed_cand_idx in the cpu code -__device__ void InputTracksAndHitIdxComb_fn(Track *tracks, int *m_tracks_per_seed, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexQI &Chg, GPlexQF &Chi2, - GPlexQI &Label, GPlexQI *HitsIdx, - GPlexQI &SeedIdx, GPlexQI &CandIdx, - GPlexQB &Valid, - const int Nhits, - const int beg, const int end, - const int itrack_plex, const int N) -{ - if (itrack_plex < N) { - int itrack_ev = beg + itrack_plex; - - // TODO:: make sure that the width of the FitterCU is a multiple of - // Config::maxCandsPerSeed; - int iseed_ev = itrack_ev / Config::maxCandsPerSeed; - int icand_ev = itrack_ev % Config::maxCandsPerSeed; - // | o : o : x : x : x | - // iseed - // <----> m_tracks_per_seed[iseed] - // <------------------> maxCandsPerSeed - Valid(itrack_plex, 0, 0) = icand_ev < m_tracks_per_seed[iseed_ev] - && m_tracks_per_seed[iseed_ev] != 0; - if (!Valid(itrack_plex, 0, 0)) { - return; - } - - Track &trk = tracks[beg]; - const char *varr = (char*) &trk; - int off_error = (char*) trk.errArrayCU() - varr; - int off_param = (char*) trk.posArrayCU() - varr; - - int i= itrack_plex + beg; // TODO: i == itrack_ev - const Track &trk_i = tracks[i]; - int idx = (char*) &trk_i - varr; - - Label(itrack_plex, 0, 0) = tracks[i].label(); - SeedIdx(itrack_plex, 0, 0) = iseed_ev; - CandIdx(itrack_plex, 0, 0) = icand_ev; - - SlurpInIdx_fn(Err_iP, varr + off_error, idx, N); - SlurpInIdx_fn(Par_iP, varr + off_param, idx, N); - - Chg(itrack_plex, 0, 0) = tracks[i].charge(); - Chi2(itrack_plex, 0, 0) = tracks[i].chi2(); - // Note Config::nLayers -- not suitable for building - for (int hi = 0; hi < Nhits; ++hi) { - HitsIdx[hi][itrack_plex] = tracks[i].getHitIdx(hi); - - int hit_idx = HitsIdx[hi][itrack_plex]; - } - } -} - -__global__ -void InputTracksAndHitIdxComb_kernel(Track *tracks, int *m_tracks_per_seed, - GPlexLS Err_iP, GPlexLV Par_iP, - GPlexQI Chg, GPlexQF Chi2, - GPlexQI Label, GPlexQI *HitsIdx, - GPlexQI SeedIdx, GPlexQI CandIdx, - GPlexQB Valid, const int Nhits, - const int beg, const int end, - const int N) -{ - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - InputTracksAndHitIdxComb_fn(tracks, m_tracks_per_seed, - Err_iP, Par_iP, - Chg, Chi2, Label, HitsIdx, - SeedIdx, CandIdx, Valid, Nhits , - beg, end, itrack, N); -} - -void InputTracksAndHitIdxComb_wrapper(const cudaStream_t &stream, - const EtaBinOfCombCandidatesCU &etaBin, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexQI &Chg, GPlexQF &Chi2, - GPlexQI &Label, GPlexQI *HitsIdx, - GPlexQI &SeedIdx, GPlexQI &CandIdx, - GPlexQB &Valid, const int Nhits, - const int beg, const int end, - const bool inputProp, int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - max_blocks_x); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - - InputTracksAndHitIdxComb_kernel<<< grid, block, 0, stream >>> - (etaBin.m_candidates.data(), etaBin.m_ntracks_per_seed.data(), - Err_iP, Par_iP, - Chg, Chi2, Label, HitsIdx, - SeedIdx, CandIdx, Valid, Nhits, - beg, end, N); -} - -/////////////////////////////////////////////////////////////////////////////// - -__device__ void InputTracksAndHitIdxComb_fn_seed(Track *tracks, int *m_tracks_per_seed, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexQI &Chg, GPlexQF &Chi2, - GPlexQI &Label, GPlexQI *HitsIdx, - GPlexQI &SeedIdx, GPlexQI &CandIdx, - GPlexQB &Valid, - const int Nhits, - const int iseed_ev, - const int icand_ev, - const int iseed_plex, - const int N) -{ - if (iseed_plex < N) { - // seed-based algorithm do not depend on Valid - Valid(iseed_plex, 0, 0) = 1; - - Track &trk = tracks[0]; - const char *varr = (char*) &trk; - int off_error = (char*) trk.errArrayCU() - varr; - int off_param = (char*) trk.posArrayCU() - varr; - - int i = iseed_ev * Config::maxCandsPerSeed + icand_ev; - const Track &trk_i = tracks[i]; - int idx = (char*) &trk_i - varr; - - Label(iseed_plex, 0, 0) = tracks[i].label(); - SeedIdx(iseed_plex, 0, 0) = iseed_ev; - CandIdx(iseed_plex, 0, 0) = icand_ev; - - SlurpInIdx_fn(Err_iP, varr + off_error, idx, N); - SlurpInIdx_fn(Par_iP, varr + off_param, idx, N); - - Chg(iseed_plex, 0, 0) = tracks[i].charge(); - Chi2(iseed_plex, 0, 0) = tracks[i].chi2(); - // Note Config::nLayers -- not suitable for building - for (int hi = 0; hi < Nhits; ++hi) { - HitsIdx[hi][iseed_plex] = tracks[i].getHitIdx(hi); - } - } -} diff --git a/mkFit/reorganize_gplex.h b/mkFit/reorganize_gplex.h deleted file mode 100644 index c5976dde..00000000 --- a/mkFit/reorganize_gplex.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef REORGANIZE_GPLEX_H -#define REORGANIZE_GPLEX_H - -#include "GPlex.h" -#include "Hit.h" -#include "HitStructuresCU.h" -#include "Track.h" - -namespace mkfit { - -__device__ float *get_posArray(Hit &hit); -__device__ float *get_errArray(Hit &hit); -__device__ float *get_posArray(Track &track); -__device__ float *get_errArray(Track &track); - -__device__ void HitToMs_fn(GPlexHS &msErr, GPlexHV &msPar, - Hit *hits, const GPlexQI &XHitSize, - const GPlexHitIdx &XHitArr, - GPlexQI &HitsIdx, const int hit_cnt, - const int itrack, const int N); - -__global__ void HitToMs_kernel(GPlexHS msErr, GPlexHV msPar, Hit *hits, - const GPlexQI XHitSize, const GPlexHitIdx XHitArr, - GPlexQI HitsIdx, const int hit_cnt, const int N); - -void HitToMs_wrapper(const cudaStream_t& stream, - GPlexHS &msErr, GPlexHV &msPar, LayerOfHitsCU &layer, - const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, - GPlexQI &HitsIdx, const int hit_cnt, const int N); - -__device__ void InputTracksCU_fn(Track *tracks, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexQI &Chg, GPlexQF &Chi2, - GPlexQI &Label, GPlexQI *HitsIdx, - const int beg, const int end, - const int itrack, const int N); - -__device__ void OutputParErrCU_fn(Track *tracks, - const GPlexLS &Err, const GPlexLV &Par, - const int beg, const int end, - const int itrack, const int N); - -__device__ void OutputTracksCU_fn(Track *tracks, - const GPlexLS &Err_iP, const GPlexLV &Par_iP, - const GPlexQI &Chg, const GPlexQF &Chi2, - const GPlexQI &Label, const GPlexQI *HitsIdx, - const int beg, const int end, - const int itrack, const int N, - const bool update_hit_idx=true); - -void InputTracksCU_wrapper(const cudaStream_t &stream, - const EtaBinOfCandidatesCU &etaBin, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, - GPlexQI *HitsIdx, - const int beg, const int end, const bool inputProp, int N); - -void InputTracksAndHitsCU_wrapper(const cudaStream_t &stream, - Track *tracks, EventOfHitsCU &event_of_hits, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexHS *msErr_arr, GPlexHV *msPar_arr, - GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, - GPlexQI *HitsIdx, - const int beg, const int end, - const bool inputProp, int N); - -void InputTracksAndHitIdxComb_wrapper(const cudaStream_t &stream, - const EtaBinOfCombCandidatesCU &etaBin, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexQI &Chg, GPlexQF &Chi2, - GPlexQI &Label, GPlexQI *HitsIdx, - GPlexQI &SeedIdx, GPlexQI &CandIdx, - GPlexQB &Valid, const int Nhits, - const int beg, const int end, - const bool inputProp, int N); - -void OutputTracksCU_wrapper(const cudaStream_t &stream, - EtaBinOfCandidatesCU &etaBin, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, - GPlexQI *HitsIdx, - const int beg, const int end, bool outputProp, int N); - - -void OutputFittedTracksCU_wrapper(const cudaStream_t &stream, - Track *tracks_cu, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, - const int beg, const int end, int N); - -__device__ -void GetHitErr(GPlexHS& msErr, const char* array, const int beg, const int end); -__device__ -void GetHitPar(GPlexHV& msPar, const char* array, const int beg, const int end); - - -__device__ void InputTracksAndHitIdxComb_fn(Track *tracks, int *m_tracks_per_seed, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexQI &Chg, GPlexQF &Chi2, - GPlexQI &Label, GPlexQI *HitsIdx, - GPlexQI &SeedIdx, GPlexQI &CandIdx, - GPlexQB &Valid, - const int Nhits, - const int beg, const int end, - const int itrack, const int N); - -__device__ void InputTracksAndHitIdxComb_fn_seed(Track *tracks, int *m_tracks_per_seed, - GPlexLS &Err_iP, GPlexLV &Par_iP, - GPlexQI &Chg, GPlexQF &Chi2, - GPlexQI &Label, GPlexQI *HitsIdx, - GPlexQI &SeedIdx, GPlexQI &CandIdx, - GPlexQB &Valid, - const int Nhits, - const int iseed_ev, - const int icand_ev, - const int iseed_plex, - const int N); - -__device__ void OutputParErrCU_fn_seed(Track *tracks, - const GPlexLS &Err, const GPlexLV &Par, - const int iseed_ev, - const int icand_ev, - int N); - -} // end namespace mkfit -#endif // REORGANIZE_GPLEX_H From b6e451cc05f8a3af16508c3144de4b4fbef86ab6 Mon Sep 17 00:00:00 2001 From: Matevz Tadel Date: Thu, 29 Oct 2020 11:48:37 -0700 Subject: [PATCH 2/3] GenMul.pl - do not generate CUDA code unless explicitly requested. --- Matriplex/GenMul.pm | 46 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/Matriplex/GenMul.pm b/Matriplex/GenMul.pm index 15bb3924..3bd06e85 100644 --- a/Matriplex/GenMul.pm +++ b/Matriplex/GenMul.pm @@ -777,6 +777,51 @@ sub dump_multiply_std_and_intrinsic select FF; } + print <<"FNORD"; +#ifdef MPLEX_INTRINSICS + + for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T)) + { +FNORD + + $S->multiply_intrinsic($a, $b, $c); + + print <<"FNORD"; + } + +#else + +#pragma omp simd + for (int n = 0; n < N; ++n) + { +FNORD + + $S->multiply_standard($a, $b, $c); + + print <<"FNORD"; + } +#endif +FNORD + + unless ($fname eq '-') + { + close FF; + select STDOUT; + } +} + +# ---------------------------------------------------------------------- + +sub dump_multiply_std_and_intrinsic_and_gpu +{ + my ($S, $fname, $a, $b, $c) = @_; + + unless ($fname eq '-') + { + open FF, ">$fname"; + select FF; + } + print <<"FNORD"; #ifndef __CUDACC__ #ifdef MPLEX_INTRINSICS @@ -809,7 +854,6 @@ FNORD #endif // __CUDACC__ FNORD - unless ($fname eq '-') { close FF; From 0d3a44c69d387be85efa5b4f26c8b1562b652f0e Mon Sep 17 00:00:00 2001 From: Steve Lantz Date: Fri, 30 Oct 2020 14:04:57 -0700 Subject: [PATCH 3/3] clean out __CUDACC__ ifdef's in from-root/Math --- from-root/Math/MatrixRepresentationsStatic.h | 16 +--------------- from-root/Math/SMatrix.h | 9 --------- from-root/Math/SVector.h | 9 --------- 3 files changed, 1 insertion(+), 33 deletions(-) diff --git a/from-root/Math/MatrixRepresentationsStatic.h b/from-root/Math/MatrixRepresentationsStatic.h index e32b7f47..78c6155c 100644 --- a/from-root/Math/MatrixRepresentationsStatic.h +++ b/from-root/Math/MatrixRepresentationsStatic.h @@ -215,9 +215,6 @@ namespace Math { public: -#if __CUDACC__ - __host__ __device__ -#endif /* constexpr */ inline MatRepSym(){} typedef T value_type; @@ -241,17 +238,9 @@ namespace Math { return fArray[off(i)]; } -#if __CUDACC__ - __host__ __device__ -#endif inline T* Array() { return fArray; } -#if __CUDACC__ - __host__ __device__ -#endif + inline const T* Array() const { return fArray; } -#ifdef __CUDACC__ - T* ArrayCU(); -#endif /** assignment : only symmetric to symmetric allowed @@ -263,9 +252,6 @@ namespace Math { return *this; } -#if __CUDACC__ - __host__ __device__ -#endif inline MatRepSym& operator=(const MatRepSym& rhs) { #pragma omp simd for(unsigned int i=0; i& rhs); /** construct from a matrix with different representation. @@ -278,9 +272,6 @@ class SMatrix { const T* Array() const; /// return pointer to internal array T* Array(); -#ifdef __CUDACC__ - T* ArrayCU(); -#endif /** @name --- STL-like interface --- The iterators access the matrix element in the order how they are diff --git a/from-root/Math/SVector.h b/from-root/Math/SVector.h index 89ee73c8..7de0c830 100644 --- a/from-root/Math/SVector.h +++ b/from-root/Math/SVector.h @@ -93,17 +93,11 @@ class SVector { /** Default constructor: vector filled with zero values */ -#if __CUDACC__ - __host__ __device__ -#endif SVector(); /// contruct from a vector expression template SVector(const VecExpr& rhs); /// copy contructor -#if __CUDACC__ - __host__ __device__ -#endif SVector(const SVector& rhs); // new constructs using STL iterator interface @@ -191,9 +185,6 @@ class SVector { const T* Array() const; /// return non-const pointer to internal array T* Array(); -#ifdef __CUDACC__ - T* ArrayCU(); -#endif /** @name --- STL-like interface --- */