Skip to content

Commit

Permalink
fix cmake error when compiling without cuda
Browse files Browse the repository at this point in the history
  • Loading branch information
studouglas committed Apr 14, 2016
1 parent 1f92833 commit 5554eff
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 281 deletions.
Binary file modified Documentation/FinalPresentation/Presentation.pdf
Binary file not shown.
124 changes: 82 additions & 42 deletions Documentation/FinalPresentation/Presentation.tex
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
\usepackage[font={footnotesize}]{caption}
\usepackage{textcomp}
\usepackage{listings}
\lstset{language=C++,basicstyle=\footnotesize\ttfamily,keywordstyle=\color{red}}
\newcommand{\textapprox}{\raisebox{0.5ex}{\texttildelow}}
\setcounter{tocdepth}{2}
\setbeamertemplate{navigation symbols}{}
Expand Down Expand Up @@ -40,6 +41,9 @@ \subsection{Brief Project Overview}
\begin{frame}
\frametitle{Brief Project Overview}
Take an existing particle simulation toolkit --- Geant4 --- and run some of its functions on a GPU to improve performance.
\begin{block}{Geant}
Geant4
\end{block}
\end{frame}

\subsection{Explanation of Terms}
Expand Down Expand Up @@ -107,6 +111,7 @@ \section{Features}
\begin{itemize}
\item GPU acceleration available on an ``opt-in'' basis
\item Easy to enable/disable GPU acceleration
\item If GPU acceleration is enabled, some methods will run on GPU
\item Same results whether acceleration enabled or disabled
\end{itemize}
\end{frame}
Expand All @@ -121,34 +126,38 @@ \subsection{Easily Enable/Disable GPU Acceleration}
\end{itemize}
\end{frame}

%\begin{lstlisting}[language=C++,basicstyle=\ttfamily,keywordstyle=\color{red}]
% inline G4double GetY(G4double x)
% {
% #if GEANT4_ENABLE_CUDA
% return cudaVector->GetXsec(x);
% #else
% return GetXsec(x);
% #endif
% }
%\end{lstlisting}

\begin{frame}
\frametitle{Implementation}
\begin{frame}[fragile]
\frametitle{Easily Enable/Disable GPU Acceleration}
Method calls to \texttt{G4ParticleHPVector} forwarded to GPU-based implementation
\begin{itemize}
\item Header forwards function calls to GPU or CPU Implementation
\item This decision is made at compile time
\item This decision is made at compile time based on \texttt{cmake} flag
\end{itemize}

\begin{block}{Example of Forwarding}
\begin{lstlisting}
inline G4double GetY(G4double x)
{
#if GEANT4_ENABLE_CUDA
return cudaVector->GetXsec(x);
#else
return GetXsec(x);
#endif
}
\end{lstlisting}
\end{block}
\end{frame}

\begin{frame}
\frametitle{Accelerating Module on GPU}
\begin{itemize}
\item a
\end{itemize}
Existing module \texttt{G4ParticleHPVector} ported to GPU using CUDA\\~\\

\begin{block}{Definition: CUDA}
CUDA is a general-purpose GPU (GPGPU) programming model developed by NVIDIA for use with NVIDIA graphics cards.
\end{block}
\end{frame}

\begin{frame}
\frametitle{Why \texttt{G4ParticleHPVector}}
\frametitle{Why \texttt{G4ParticleHPVector}?}
\begin{itemize}
\item Represents empirically-found probabilities of collisions for different particles based on their energy
\item Identified as starting point by relevant stakeholders
Expand Down Expand Up @@ -201,20 +210,16 @@ \subsection{Impl. 1: Existing Module in GPU Memory}

\subsubsection{Implementation of Select Methods on GPU}
\begin{frame}
\frametitle{Implementation -- \texttt{Times}}
\end{frame}

\begin{frame}
\frametitle{Implementation -- \texttt{GetXSec}}
\frametitle{Impl. 1 -- \texttt{Times}}
\end{frame}

\begin{frame}
\frametitle{Implementation -- \texttt{SampleLin}}
\frametitle{Impl. 1 -- \texttt{GetXSec}}
\end{frame}

\subsubsection{Performance}
\subsubsection{Impl. 1: Performance}
\begin{frame}
\frametitle{Performance Results Summary}
\frametitle{Impl. 1: Performance Results Summary}
\begin{itemize}
\item Most methods slower on GPU until \textapprox 10,000 entries in data vector
\item Most \emph{commonly-used} methods significantly slower on GPU, even with large data vector
Expand All @@ -226,7 +231,7 @@ \subsubsection{Performance}
\end{frame}

\begin{frame}
\frametitle{Performance Results -- \texttt{Times}}
\frametitle{Impl. 1: Performance Results -- \texttt{Times}}
\begin{itemize}
\item Multiplies each point in vector by factor
\end{itemize}
Expand All @@ -238,19 +243,19 @@ \subsubsection{Performance}
\end{frame}

\begin{frame}
\frametitle{Performance Results -- \texttt{GetXSec}}
\frametitle{Impl. 1: Performance Results -- \texttt{GetXSec}}
\end{frame}

\begin{frame}
\frametitle{Performance Results -- \texttt{SampleLin}}
\frametitle{Impl. 1: Performance Results -- \texttt{SampleLin}}
\end{frame}

\begin{frame}
\frametitle{Performance Results -- System Tests}
\frametitle{Impl. 1: Performance Results -- System Tests}
\end{frame}

\begin{frame}
\frametitle{Performance Discussion}
\frametitle{Impl. 1: Performance Discussion}
\end{frame}


Expand All @@ -272,28 +277,28 @@ \subsection{Impl. 2: Add New GPU-Accelerated Methods to Interface}
\end{frame}

\begin{frame}
\frametitle{Implementation -- \texttt{GetXSecList}}
\frametitle{Impl. 2: \texttt{GetXSecList}}
\begin{itemize}
\item Fill an array of energies we want to get xSec values for
\item Send the array to the GPU to work on
\end{itemize}
\end{frame}

\subsubsection{Performance}
\subsubsection{Impl. 2: Performance}
\begin{frame}
\frametitle{Performance Results Summary}
\frametitle{Impl. 2: Performance Results Summary}
\end{frame}

\begin{frame}
\frametitle{Performance Results -- \texttt{GetXSecList}}
\frametitle{Impl. 2: Performance Results -- \texttt{GetXSecList}}
\end{frame}

\begin{frame}
\frametitle{Performance Results -- System Tests}
\frametitle{Impl. 2: Performance Results -- System Tests}
\end{frame}

\begin{frame}
\frametitle{Performance Discussion}
\frametitle{Impl. 2: Performance Discussion}
\end{frame}

\subsection{Accuracy / Testing}
Expand All @@ -317,7 +322,7 @@ \subsection{Accuracy / Testing}
\end{frame}

\begin{frame}
\frametitle{Accuracy Discussion}
\frametitle{Accuracy}
\begin{itemize}
\item The deviations in SampleLin and Sample can be
attributed to the functions' use of random numbers
Expand All @@ -340,13 +345,48 @@ \subsection{Accuracy / Testing}
\end{itemize}
\end{frame}

\begin{frame}
\begin{frame}[fragile]
\frametitle{\texttt{GenerateTestResults} Details}
\begin{itemize}
\item Outputs simple results directly
\item Includes testing version number in results file for analysis stage
\item Outputs simple results directly to results file
\item For vectors, calculates a hash of the vector and outputs it
\item
\item Outputs timing data to separate file
\end{itemize}
\begin{block}{Example: Snippet of Generated Test Results}
\begin{lstlisting}
#void G4ParticleHPVector_CUDA::GetXsecBuffer(
G4double * queryList, G4int length)_6
@numQueries=10
hash: 16548307878283220284
@numQueries=50
hash: 3204132713354913775
\end{lstlisting}
\end{block}
\end{frame}

\begin{frame}
\frametitle{\texttt{AnalyzeTestResults} Details}
Two main functions:
\begin{enumerate}
\item Compare results for each test case, printing status to \texttt{stdout}
\begin{itemize}
\item If test failed, output differing values
\item Summarize test results at the end with number passed
\end{itemize}
\item Generate \texttt{.csv} file from timing data
\begin{itemize}
\item One row per unique method call, columns show CPU time, GPU time, method name and parameters
\item Can use Excel to analyze performance results
\end{itemize}
\end{enumerate}
\end{frame}

\begin{frame}
\frametitle{Demonstration}
\begin{center}
\emph{Demonstration of Generating and Analyzing Test Results}
\end{center}
\end{frame}

\section{Conclusion}
Expand Down
149 changes: 0 additions & 149 deletions geant4.10.02/source/externals/cuda/src/G4ParticleHPVector_CUDA.cu
Original file line number Diff line number Diff line change
@@ -1,152 +1,3 @@
// #include <time.h>
// #include <sys/time.h>
// #include <cuda.h>
// #include <cuda_runtime.h>
// #include "G4ParticleHPVector_CUDA.hh"
// #include <thrust/device_vector.h>
// #include <stdio.h>
// #include <iostream>
// #include <math.h>

// __global__ void SetArrayTo(int *resArray, int numQueries, int setValue)
// {
// int idx = blockDim.x*blockIdx.x + threadIdx.x;
// if (idx < numQueries) {
// resArray[idx] = setValue;
// }
// }

// __global__ void findMinArray2(G4ParticleHPDataPoint *theData_d, G4double *queryArray_d, int *resArray_d, int numThreads, int numQueries, int nEntries)
// {
// int idx = blockDim.x*blockIdx.x + threadIdx.x;
// for (int i = 0; i < numQueries; i++) {
// G4double queryEnergy = queryArray_d[i];

// // search through data points in thread's range
// for (int j = idx; j <= nEntries; j+= numThreads) {
// if (theData_d[j].energy > queryEnergy) {
// atomicMin(&resArray_d[i], j);
// break;
// }
// }

// }

// // slower, 13s for highest test (and seg fault too)
// // int start = blockIdx.x * queriesPerBlock;
// // int i = start;
// // do {
// // G4double queryEnergy = queryArray_d[i];

// // for (int j = idx; j < nEntries; j += numThreads) {
// // if (theData_d[j].energy > queryEnergy) {
// // atomicMin(&resArray_d[i], j);
// // break;
// // }
// // }
// // i = ++i % numQueries;
// // } while (i != start);
// }

// /***********************************************
// * Device Methods
// ***********************************************/
// void G4ParticleHPVector_CUDA::SetInterpolationManager(G4InterpolationManager & aManager) {
// theManager = aManager;
// }
// void G4ParticleHPVector_CUDA::SetInterpolationManager(const G4InterpolationManager & aManager) {
// theManager = aManager;
// }

// double getWallTime() {
// struct timeval time;
// gettimeofday(&time, NULL);
// return (double)time.tv_sec + (double)time.tv_usec * 0.000001;
// }

// /***********************************************
// * Host Methods
// ***********************************************/
// void G4ParticleHPVector_CUDA::GetXsecList(G4double* energiesIn_xSecsOut, G4int numQueries, G4ParticleHPDataPoint* theData, G4int nEntries) {
// if (nEntries == 0) {
// for (int i = 0; i < numQueries; i++) {
// energiesIn_xSecsOut[i] = 0.0;
// }
// return;
// }

// G4ParticleHPDataPoint * d_theData;
// G4double * d_energiesIn_xSecsOut;
// G4int * d_minIndices;

// cudaMalloc((void**)&d_theData, sizeof(G4ParticleHPDataPoint) * nEntries);
// cudaMalloc((void**)&d_energiesIn_xSecsOut, sizeof(G4double) * numQueries);
// cudaMalloc((void**)&d_minIndices, sizeof(G4int) * numQueries);
// G4int *minIndices = (G4int*)malloc(numQueries * sizeof(G4int));

// cudaMemcpy(d_theData, theData, sizeof(G4ParticleHPDataPoint) * nEntries, cudaMemcpyHostToDevice);
// cudaMemcpy(d_energiesIn_xSecsOut, energiesIn_xSecsOut, sizeof(G4double) * numQueries, cudaMemcpyHostToDevice);

// int queryBlocks = numQueries/THREADS_PER_BLOCK + (numQueries % THREADS_PER_BLOCK == 0 ? 0:1);
// int dataChunk = 1;
// int threadNum = nEntries/dataChunk;
// int arrayBlocks = threadNum/THREADS_PER_BLOCK + (threadNum % THREADS_PER_BLOCK == 0 ? 0:1);
// int queriesPerBlock = numQueries / arrayBlocks;

// double a = getWallTime();
// SetArrayTo <<< queryBlocks, THREADS_PER_BLOCK >>>(d_minIndices, numQueries, nEntries-1);
// findMinArray2 <<< arrayBlocks, THREADS_PER_BLOCK >>> (d_theData, d_energiesIn_xSecsOut, d_minIndices, threadNum, numQueries, nEntries);
// cudaDeviceSynchronize();
// printf("Time (nEntries = %d, numQueries = %d): %f\n", nEntries, numQueries, getWallTime() - a);

// cudaMemcpy(minIndices, d_minIndices, numQueries * sizeof(G4int), cudaMemcpyDeviceToHost);

// for (int i = 0; i < numQueries; i++) {
// int minIndex = minIndices[i];

// G4int low = minIndex - 1;
// G4int high = minIndex;
// G4double e = energiesIn_xSecsOut[i];

// if (minIndex == 0)
// {
// low = 0;
// high = 1;
// }
// else if (minIndex == nEntries)
// {
// low = nEntries - 2;
// high = nEntries - 1;
// }

// if (e < theData[nEntries-1].GetX())
// {
// if (theData[high].GetX() != 0
// && (std::abs((theData[high].GetX() - theData[low].GetX()) / theData[high].GetX()) < 0.000001))
// {
// energiesIn_xSecsOut[i] = theData[low].GetY();
// }
// else
// {
// energiesIn_xSecsOut[i] =
// theInt.Interpolate(theManager.GetScheme(high), e,
// theData[low].GetX(), theData[high].GetX(),
// theData[low].GetY(), theData[high].GetY());
// }
// }
// else
// {
// energiesIn_xSecsOut[i] = theData[nEntries-1].GetY();
// }
// }

// cudaFree(d_theData);
// cudaFree(d_energiesIn_xSecsOut);
// cudaFree(d_minIndices);
// free(minIndices);
// }


#include <time.h>
#include <sys/time.h>
#include <cuda.h>
Expand Down
Loading

0 comments on commit 5554eff

Please sign in to comment.