fix cmake error when compiling without cuda

studouglas · Apr 14, 2016 · 5554eff · 5554eff
1 parent 1f92833
commit 5554eff
Show file tree

Hide file tree

Showing 6 changed files with 164 additions and 281 deletions.
diff --git a/Documentation/FinalPresentation/Presentation.pdf b/Documentation/FinalPresentation/Presentation.pdf
diff --git a/Documentation/FinalPresentation/Presentation.tex b/Documentation/FinalPresentation/Presentation.tex
@@ -5,6 +5,7 @@
 \usepackage[font={footnotesize}]{caption}
 \usepackage{textcomp}
 \usepackage{listings}
+\lstset{language=C++,basicstyle=\footnotesize\ttfamily,keywordstyle=\color{red}}
 \newcommand{\textapprox}{\raisebox{0.5ex}{\texttildelow}}
 \setcounter{tocdepth}{2}
 \setbeamertemplate{navigation symbols}{}
@@ -40,6 +41,9 @@ \subsection{Brief Project Overview}
 \begin{frame}
 \frametitle{Brief Project Overview}
 Take an existing particle simulation toolkit - Geant4 - and have some functions run on a GPU device to improve performance.
+\begin{block}{Geant}
+Geant4
+\end{block}
 \end{frame}
 
 \subsection{Explanation of Terms}
@@ -107,6 +111,7 @@ \section{Features}
 \begin{itemize}
 \item GPU acceleration available on an ``opt-in'' basis
 \item Easy to enable/disable GPU acceleration
+\item If GPU acceleration is enabled, some methods will run on GPU
 \item Same results whether acceleration enabled or disabled
 \end{itemize}
 \end{frame}
@@ -121,34 +126,38 @@ \subsection{Easily Enable/Disable GPU Acceleration}
 \end{itemize}
 \end{frame}
 
-%\begin{lstlisting}[language=C++,basicstyle=\ttfamily,keywordstyle=\color{red}]
-%  inline G4double GetY(G4double x)
-%  {
-%    #if GEANT4_ENABLE_CUDA
-%      return cudaVector->GetXsec(x);
-%    #else
-%      return GetXsec(x);
-%    #endif
-%  }
-%\end{lstlisting}
-
-\begin{frame}
-\frametitle{Implementation}
+\begin{frame}[fragile]
+\frametitle{Easily Enable/Disable GPU Acceleration}
+Method calls to \texttt{G4ParticleHPVector} forwarded to GPU-based implementation
 \begin{itemize}
-\item Header forwards function calls to GPU or CPU Implementation
-\item This decision is made at compile time
+\item This decision is made at compile time based on \texttt{cmake} flag
 \end{itemize}
+
+\begin{block}{Example of Forwarding}
+\begin{lstlisting}
+inline G4double GetY(G4double x)
+{
+  #if GEANT4_ENABLE_CUDA
+    return cudaVector->GetXsec(x);
+  #else
+    return GetXsec(x);
+  #endif
+}
+\end{lstlisting}
+\end{block}
 \end{frame}
 
 \begin{frame}
 \frametitle{Accelerating Module on GPU}
-\begin{itemize}
-\item a
-\end{itemize}
+Existing module \texttt{G4ParticleHPVector} ported to GPU using CUDA\\~\\
+
+\begin{block}{Definition: CUDA}
+CUDA is a GP-GPU programming model developed by NVIDIA, for use with NVIDIA graphics cards
+\end{block}
 \end{frame}
 
 \begin{frame}
-\frametitle{Why \texttt{G4ParticleHPVector}}
+\frametitle{Why \texttt{G4ParticleHPVector}?}
 \begin{itemize}
 \item Represents empirically-found probabilities of collisions for different particles based on their energy
 \item Identified as starting point by relevant stakeholders
@@ -201,20 +210,16 @@ \subsection{Impl. 1: Existing Module in GPU Memory}
 
 \subsubsection{Implementation of Select Methods on GPU}
 \begin{frame}
-\frametitle{Implementation -- \texttt{Times}}
-\end{frame}
-
-\begin{frame}
-\frametitle{Implementation -- \texttt{GetXSec}}
+\frametitle{Impl. 1 -- \texttt{Times}}
 \end{frame}
 
 \begin{frame}
-\frametitle{Implementation -- \texttt{SampleLin}}
+\frametitle{Impl. 1 -- \texttt{GetXSec}}
 \end{frame}
 
-\subsubsection{Performance}
+\subsubsection{Impl. 1: Performance}
 \begin{frame}
-\frametitle{Performance Results Summary}
+\frametitle{Impl. 1: Performance Results Summary}
 \begin{itemize}
 \item Most methods slower on GPU until \textapprox 10,000 entries in data vector
 \item Most \emph{commonly-used} methods significantly slower on GPU, even with large data vector
@@ -226,7 +231,7 @@ \subsubsection{Performance}
 \end{frame}
 
 \begin{frame}
-\frametitle{Performance Results -- \texttt{Times}}
+\frametitle{Impl. 1: Performance Results -- \texttt{Times}}
 \begin{itemize}
 \item Multiplies each point in vector by factor
 \end{itemize}
@@ -238,19 +243,19 @@ \subsubsection{Performance}
 \end{frame}
 
 \begin{frame}
-\frametitle{Performance Results -- \texttt{GetXSec}}
+\frametitle{Impl. 1: Performance Results -- \texttt{GetXSec}}
 \end{frame}
 
 \begin{frame}
-\frametitle{Performance Results -- \texttt{SampleLin}}
+\frametitle{Impl. 1: Performance Results -- \texttt{SampleLin}}
 \end{frame}
 
 \begin{frame}
-\frametitle{Performance Results -- System Tests}
+\frametitle{Impl. 1: Performance Results -- System Tests}
 \end{frame}
 
 \begin{frame}
-\frametitle{Performance Discussion}
+\frametitle{Impl. 1: Performance Discussion}
 \end{frame}
 
 
@@ -272,28 +277,28 @@ \subsection{Impl. 2: Add New GPU-Accelerated Methods to Interface}
 \end{frame}
 
 \begin{frame}
-\frametitle{Implementation -- \texttt{GetXSecList}}
+\frametitle{Impl. 2: \texttt{GetXSecList}}
 \begin{itemize}
 \item Fill an array of energies we want to get xSec values for 
 \item Send the array to the GPU to work on
 \end{itemize}
 \end{frame}
 
-\subsubsection{Performance}
+\subsubsection{Impl. 2: Performance}
 \begin{frame}
-\frametitle{Performance Results Summary}
+\frametitle{Impl. 2: Performance Results Summary}
 \end{frame}
 
 \begin{frame}
-\frametitle{Performance Results -- \texttt{GetXSecList}}
+\frametitle{Impl. 2: Performance Results -- \texttt{GetXSecList}}
 \end{frame}
 
 \begin{frame}
-\frametitle{Performance Results -- System Tests}
+\frametitle{Impl. 2: Performance Results -- System Tests}
 \end{frame}
 
 \begin{frame}
-\frametitle{Performance Discussion}
+\frametitle{Impl. 2: Performance Discussion}
 \end{frame}
 
 \subsection{Accuracy / Testing}
@@ -317,7 +322,7 @@ \subsection{Accuracy / Testing}
 \end{frame}
 
 \begin{frame}
-\frametitle{Accuracy Discussion}
+\frametitle{Accuracy}
 \begin{itemize}
 \item The deviations in SampleLin and Sample can be 
 attributed to the functions' use of random numbers
@@ -340,13 +345,48 @@ \subsection{Accuracy / Testing}
 \end{itemize}
 \end{frame}
 
-\begin{frame}
+\begin{frame}[fragile]
 \frametitle{\texttt{GenerateTestResults} Details}
 \begin{itemize}
-\item Outputs simple results directly
+\item Includes testing version number in results file for analysis stage
+\item Outputs simple results directly to results file
 \item For vectors, calculates a hash of the vector and outputs it
-\item 
+\item Outputs timing data to separate file
+\end{itemize}
+\begin{block}{Example: Snippet of Generated Test Results}
+\begin{lstlisting}
+#void G4ParticleHPVector_CUDA::GetXsecBuffer(
+  G4double * queryList, G4int length)_6
+@numQueries=10
+hash: 16548307878283220284
+@numQueries=50
+hash: 3204132713354913775
+\end{lstlisting}
+\end{block}
+\end{frame}
+
+\begin{frame}
+\frametitle{\texttt{AnalyzeTestResults} Details}
+Two main functions:
+\begin{enumerate}
+\item Compare results for each test case, printing status to \texttt{stdout}
+\begin{itemize}
+\item If test failed, output differing values
+\item Summarize test results at the end with number passed
+\end{itemize}
+\item Generate \texttt{.csv} file from timing data
+\begin{itemize}
+\item One row per unique method call, columns show CPU time, GPU time, method name and parameters
+\item Can use Excel to analyze performance results
 \end{itemize}
+\end{enumerate}
+\end{frame}
+
+\begin{frame}
+\frametitle{Demonstration}
+\begin{center}
+\emph{Demonstration of Generating and Analyzing Test Results}
+\end{center}
 \end{frame}
 
 \section{Conclusion}

diff --git a/geant4.10.02/source/externals/cuda/src/G4ParticleHPVector_CUDA.cu b/geant4.10.02/source/externals/cuda/src/G4ParticleHPVector_CUDA.cu
@@ -1,152 +1,3 @@
-// #include <time.h>
-// #include <sys/time.h>
-// #include <cuda.h>
-// #include <cuda_runtime.h>
-// #include "G4ParticleHPVector_CUDA.hh"
-// #include <thrust/device_vector.h>
-// #include <stdio.h>
-// #include <iostream>
-// #include <math.h>
-
-// __global__ void SetArrayTo(int *resArray, int numQueries, int setValue)
-// {
-//   int idx = blockDim.x*blockIdx.x + threadIdx.x;
-//   if (idx < numQueries) {
-//     resArray[idx] = setValue;
-//   }
-// }
-
-// __global__ void findMinArray2(G4ParticleHPDataPoint *theData_d, G4double *queryArray_d, int *resArray_d, int numThreads, int numQueries, int nEntries)
-// {
-//   int idx = blockDim.x*blockIdx.x + threadIdx.x;
-//   for (int i = 0; i < numQueries; i++) {
-//     G4double queryEnergy = queryArray_d[i];
-
-//     // search through data points in thread's range 
-//     for (int j = idx; j <= nEntries; j+= numThreads) {
-//       if (theData_d[j].energy >  queryEnergy) {
-//         atomicMin(&resArray_d[i], j);
-//         break;
-//       }
-//     }
-
-//   }
-
-//   // slower, 13s for highest test (and seg fault too)
-//   // int start = blockIdx.x * queriesPerBlock;
-//   // int i = start;
-//   // do {
-//   //   G4double queryEnergy = queryArray_d[i];
-
-//   //   for (int j = idx; j < nEntries; j += numThreads) {
-//   //     if (theData_d[j].energy >  queryEnergy) {
-//   //       atomicMin(&resArray_d[i], j);
-//   //       break;
-//   //     }
-//   //   }
-//   //   i = ++i % numQueries;
-//   // } while (i != start);
-// }
-
-// /***********************************************
-// *   Device Methods
-// ***********************************************/
-// void G4ParticleHPVector_CUDA::SetInterpolationManager(G4InterpolationManager & aManager) {
-//   theManager = aManager;
-// }
-// void G4ParticleHPVector_CUDA::SetInterpolationManager(const G4InterpolationManager & aManager) {
-//   theManager = aManager;
-// }
-
-// double getWallTime() {
-//   struct timeval time;
-//   gettimeofday(&time, NULL);
-//   return (double)time.tv_sec + (double)time.tv_usec * 0.000001;
-// }
-
-// /***********************************************
-// *   Host Methods
-// ***********************************************/
-// void G4ParticleHPVector_CUDA::GetXsecList(G4double* energiesIn_xSecsOut, G4int numQueries, G4ParticleHPDataPoint* theData, G4int nEntries) {  
-//   if (nEntries == 0) {
-//     for (int i = 0; i < numQueries; i++) {
-//       energiesIn_xSecsOut[i] = 0.0;
-//     }
-//     return;
-//   }
-
-//   G4ParticleHPDataPoint * d_theData;
-//   G4double              * d_energiesIn_xSecsOut;
-//   G4int                 * d_minIndices;
-
-//   cudaMalloc((void**)&d_theData,             sizeof(G4ParticleHPDataPoint)            * nEntries);
-//   cudaMalloc((void**)&d_energiesIn_xSecsOut, sizeof(G4double)            * numQueries);
-//   cudaMalloc((void**)&d_minIndices,          sizeof(G4int)               * numQueries);
-//   G4int *minIndices = (G4int*)malloc(numQueries * sizeof(G4int));
-
-//   cudaMemcpy(d_theData, theData, sizeof(G4ParticleHPDataPoint) * nEntries, cudaMemcpyHostToDevice);
-//   cudaMemcpy(d_energiesIn_xSecsOut, energiesIn_xSecsOut, sizeof(G4double) * numQueries, cudaMemcpyHostToDevice);
-
-//   int queryBlocks = numQueries/THREADS_PER_BLOCK + (numQueries % THREADS_PER_BLOCK == 0 ? 0:1);
-//   int dataChunk = 1;
-//   int threadNum = nEntries/dataChunk;
-//   int arrayBlocks = threadNum/THREADS_PER_BLOCK + (threadNum % THREADS_PER_BLOCK == 0 ? 0:1);
-//   int queriesPerBlock = numQueries / arrayBlocks;
-
-//   double a = getWallTime();
-//   SetArrayTo <<< queryBlocks, THREADS_PER_BLOCK >>>(d_minIndices, numQueries, nEntries-1);
-//   findMinArray2 <<< arrayBlocks, THREADS_PER_BLOCK >>> (d_theData, d_energiesIn_xSecsOut, d_minIndices, threadNum, numQueries, nEntries);
-//   cudaDeviceSynchronize();
-//   printf("Time (nEntries = %d, numQueries = %d): %f\n", nEntries, numQueries, getWallTime() - a);
-
-//   cudaMemcpy(minIndices, d_minIndices, numQueries * sizeof(G4int), cudaMemcpyDeviceToHost);
-
-//   for (int i = 0; i < numQueries; i++) {
-//     int minIndex = minIndices[i];
-
-//     G4int low = minIndex - 1;
-//     G4int high = minIndex;
-//     G4double e = energiesIn_xSecsOut[i];
-
-//     if (minIndex == 0)
-//     {
-//       low = 0;
-//       high = 1;
-//     }
-//     else if (minIndex == nEntries)
-//     {
-//       low = nEntries - 2;
-//       high = nEntries - 1;
-//     }
-
-//     if (e < theData[nEntries-1].GetX())
-//     {
-//       if (theData[high].GetX() != 0 
-//         && (std::abs((theData[high].GetX() - theData[low].GetX()) / theData[high].GetX()) < 0.000001))
-//       {
-//         energiesIn_xSecsOut[i] = theData[low].GetY();
-//       }
-//       else
-//       {
-//         energiesIn_xSecsOut[i] = 
-//           theInt.Interpolate(theManager.GetScheme(high), e, 
-//                              theData[low].GetX(), theData[high].GetX(),
-//                              theData[low].GetY(), theData[high].GetY());
-//       }
-//     }
-//     else
-//     {
-//       energiesIn_xSecsOut[i] = theData[nEntries-1].GetY();
-//     }
-//   }
-
-//   cudaFree(d_theData);
-//   cudaFree(d_energiesIn_xSecsOut);
-//   cudaFree(d_minIndices);
-//   free(minIndices);
-// }
-
-
 #include <time.h>
 #include <sys/time.h>
 #include <cuda.h>