From 4eafc92922785546c796697f8040cbe9208879f4 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 26 Jan 2016 20:14:33 -0600 Subject: [PATCH 001/700] Initial commit for GPUOpen Launch --- samples/0_Intro/bit_extract/Makefile | 24 +++ samples/0_Intro/bit_extract/README.md | 4 + samples/0_Intro/bit_extract/bit_extract.cpp | 102 +++++++++++++ samples/0_Intro/square/Makefile | 18 +++ samples/0_Intro/square/README.md | 15 ++ samples/0_Intro/square/square.cu | 94 ++++++++++++ samples/0_Intro/square/square.hipref.cpp | 94 ++++++++++++ samples/1_Utils/hipInfo/Makefile | 16 ++ samples/1_Utils/hipInfo/README.md | 6 + samples/1_Utils/hipInfo/hipInfo.cpp | 154 ++++++++++++++++++++ 10 files changed, 527 insertions(+) create mode 100644 samples/0_Intro/bit_extract/Makefile create mode 100644 samples/0_Intro/bit_extract/README.md create mode 100644 samples/0_Intro/bit_extract/bit_extract.cpp create mode 100644 samples/0_Intro/square/Makefile create mode 100644 samples/0_Intro/square/README.md create mode 100644 samples/0_Intro/square/square.cu create mode 100644 samples/0_Intro/square/square.hipref.cpp create mode 100644 samples/1_Utils/hipInfo/Makefile create mode 100644 samples/1_Utils/hipInfo/README.md create mode 100644 samples/1_Utils/hipInfo/hipInfo.cpp diff --git a/samples/0_Intro/bit_extract/Makefile b/samples/0_Intro/bit_extract/Makefile new file mode 100644 index 000000000..d0a2d4bc7 --- /dev/null +++ b/samples/0_Intro/bit_extract/Makefile @@ -0,0 +1,24 @@ +#Dependencies : [MYHIP]/bin must be in user's path. 
+ +HIP_PATH?=$(shell hipconfig --path) +HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) +HIPCC=$(HIP_PATH)/bin/hipcc + +# Show how to use PLATFORM to specify different options for each compiler: +ifeq (${HIP_PLATFORM}, nvcc) + HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 +endif +ifeq (${HIP_PLATFORM}, hcc) + HIPCC_FLAGS = +endif + + +EXE=bit_extract + +$(EXE): bit_extract.cpp + $(HIPCC) $(HIPCC_FLAGS) $< -o $@ + + +clean: + rm -f *.o $(EXE) + diff --git a/samples/0_Intro/bit_extract/README.md b/samples/0_Intro/bit_extract/README.md new file mode 100644 index 000000000..025b7d307 --- /dev/null +++ b/samples/0_Intro/bit_extract/README.md @@ -0,0 +1,4 @@ +# bit_extract + +Show an application written directly in HIP which uses platform-specific check on __HIP_PLATFORM_HCC__ to enable use of +an instruction that only exists on the HCC platform. diff --git a/samples/0_Intro/bit_extract/bit_extract.cpp b/samples/0_Intro/bit_extract/bit_extract.cpp new file mode 100644 index 000000000..d32b65c15 --- /dev/null +++ b/samples/0_Intro/bit_extract/bit_extract.cpp @@ -0,0 +1,102 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include +#include +#include + + +#define CHECK(error) \ + if (error != hipSuccess) { \ + fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \ + exit(EXIT_FAILURE);\ + } + +void __global__ +bit_extract_kernel(hipLaunchParm lp, uint32_t *C_d, const uint32_t *A_d, size_t N) +{ + KERNELBEGIN; + + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (size_t i=offset; i> 8); +#endif + } + + KERNELEND; +} + + +int main(int argc, char *argv[]) +{ + uint32_t *A_d, *C_d; + uint32_t *A_h, *C_h; + size_t N = 1000000; + size_t Nbytes = N * sizeof(uint32_t); + + int deviceId; + CHECK (hipGetDevice(&deviceId)); + hipDeviceProp_t props; + CHECK(hipDeviceGetProperties(&props, deviceId)); + printf ("info: running on device #%d %s\n", deviceId, props.name); + + + printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); + A_h = (uint32_t*)malloc(Nbytes); + CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); + C_h = (uint32_t*)malloc(Nbytes); + CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); + + for (size_t i=0; i> 8); + if (C_h[i] != Agold) { + fprintf (stderr, "mismatch detected.\n"); + printf ("%zu: %08x =? 
%08x (Ain=%08x)\n", i, C_h[i], Agold, A_h[i]); + CHECK(hipErrorUnknown); + } + } +} diff --git a/samples/0_Intro/square/Makefile b/samples/0_Intro/square/Makefile new file mode 100644 index 000000000..9fb03d867 --- /dev/null +++ b/samples/0_Intro/square/Makefile @@ -0,0 +1,18 @@ +HIP_PATH?=$(shell hipconfig --path) +HIPCC=$(HIP_PATH)/bin/hipcc + +all: square.hip.out + +square.cuda.out : square.cu + nvcc square.cu -o $@ + +#hipify square.cu > square.cpp +# Then review & finish port in square.cpp + +square.hip.out: square.hipref.cpp + $(HIPCC) square.hipref.cpp -o $@ + + + +clean: + rm -f *.o *.out diff --git a/samples/0_Intro/square/README.md b/samples/0_Intro/square/README.md new file mode 100644 index 000000000..56b558e5b --- /dev/null +++ b/samples/0_Intro/square/README.md @@ -0,0 +1,15 @@ +# square.md + +Simple test which shows how to use hipify to port CUDA code to HIP. Covered in more detail in blog. + +1. Add hip/bin path to the PATH : + export PATH=$PATH:[MYHIP]/bin + +2. hipify square.cu > square.cpp + +3. Manually edit square.cpp to add hipLaunchParms lp to kernel parms: + vector_square(hipLaunchParm lp, T *C_d, const T *A_d, size_t N) + + (see square.hipref.cpp for the correct output after running hipify and the above manual step) + +4. make diff --git a/samples/0_Intro/square/square.cu b/samples/0_Intro/square/square.cu new file mode 100644 index 000000000..996344ed4 --- /dev/null +++ b/samples/0_Intro/square/square.cu @@ -0,0 +1,94 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include +#include + +#define CHECK(error) \ + if (error != cudaSuccess) { \ + fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \ + exit(EXIT_FAILURE);\ + } + + +/* + * Square each element in the array A and write to array C. + */ +template +__global__ void +vector_square(T *C_d, const T *A_d, size_t N) +{ + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x ; + + for (size_t i=offset; i>> (C_d, A_d, N); + + printf ("info: copy Device2Host\n"); + CHECK ( cudaMemcpy(C_h, C_d, Nbytes, cudaMemcpyDeviceToHost)); + + printf ("info: check result\n"); + for (size_t i=0; i +#include + +#define CHECK(error) \ + if (error != hipSuccess) { \ + fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \ + exit(EXIT_FAILURE);\ + } + + +/* + * Square each element in the array A and write to array C. 
+ */ +template +__global__ void +vector_square(hipLaunchParm lp, T *C_d, const T *A_d, size_t N) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (size_t i=offset; i +#include +#include "hip_runtime.h" + +#define KNRM "\x1B[0m" +#define KRED "\x1B[31m" +#define KGRN "\x1B[32m" +#define KYEL "\x1B[33m" +#define KBLU "\x1B[34m" +#define KMAG "\x1B[35m" +#define KCYN "\x1B[36m" +#define KWHT "\x1B[37m" + +#define failed(...) \ + printf ("%serror: ", KRED);\ + printf (__VA_ARGS__);\ + printf ("\n");\ + printf ("error: TEST FAILED\n%s", KNRM );\ + exit(EXIT_FAILURE); + +#define HIPCHECK(error) \ + if (error != hipSuccess) { \ + printf("%serror: '%s'(%d) at %s:%d%s\n", \ + KRED,hipGetErrorString(error), error,\ + __FILE__, __LINE__,KNRM); \ + failed("API returned error code.");\ + } + +void printCompilerInfo () +{ +#ifdef __HCC__ + printf ("compiler: hcc version=%s, workweek (YYWWD) = %u\n", __hcc_version__, __hcc_workweek__); +#endif +#ifdef __NVCC__ + printf ("compiler: nvcc\n"); +#endif +} + + +double bytesToGB(size_t s) +{ + return (double)s / (1024.0*1024.0*1024.0); +} + +void printDeviceProp (int deviceId) +{ + using namespace std; + + const int w1 = 30; + + cout << left; + + cout << setw(w1) << "--------------------------------------------------------------------------------" << endl; + cout << setw(w1) << "device#" << deviceId << endl; + + hipDeviceProp_t props; + HIPCHECK(hipDeviceGetProperties(&props, deviceId)); + + + cout << setw(w1) << "Name: " << props.name << endl; + cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; + cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; + cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0<< " Mhz" << endl; + cout << setw(w1) << "totalGlobalMem" << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl << defaultfloat; 
+ + cout << setw(w1) << "sharedMemPerBlock" << (float)props.sharedMemPerBlock / 1024.0 << " KB" << endl; + cout << setw(w1) << "regsPerBlock" << props.regsPerBlock << endl; + cout << setw(w1) << "warpSize" << props.warpSize << endl; + cout << setw(w1) << "maxThreadsPerBlock" << props.maxThreadsPerBlock << endl; + cout << setw(w1) << "maxThreadsDim.x" << props.maxThreadsDim[0] << endl; + cout << setw(w1) << "maxThreadsDim.y" << props.maxThreadsDim[1] << endl; + cout << setw(w1) << "maxThreadsDim.z" << props.maxThreadsDim[2] << endl; + + cout << setw(w1) << "maxGridSize.x" << props.maxGridSize[0] << endl; + cout << setw(w1) << "maxGridSize.y" << props.maxGridSize[1] << endl; + cout << setw(w1) << "maxGridSize.z" << props.maxGridSize[2] << endl; + + + cout << setw(w1) << "totalConstMem" << props.totalConstMem << endl; + cout << setw(w1) << "major" << props.major << endl; + cout << setw(w1) << "minor" << props.minor << endl; + cout << setw(w1) << "l2CacheSize" << props.l2CacheSize << endl; + cout << setw(w1) << "maxThreadsPerMultiProcessor" << props.maxThreadsPerMultiProcessor << endl; + cout << setw(w1) << "computeMode" << props.computeMode << endl; + + cout << setw(w1) << "arch.hasGlobalInt32Atomics" << props.arch.hasGlobalInt32Atomics << endl; + cout << setw(w1) << "arch.hasGlobalFloatAtomicExch" << props.arch.hasGlobalFloatAtomicExch << endl; + cout << setw(w1) << "arch.hasSharedInt32Atomics" << props.arch.hasSharedInt32Atomics << endl; + cout << setw(w1) << "arch.hasSharedFloatAtomicExch" << props.arch.hasSharedFloatAtomicExch << endl; + cout << setw(w1) << "arch.hasFloatAtomicAdd" << props.arch.hasFloatAtomicAdd << endl; + cout << setw(w1) << "arch.hasGlobalInt64Atomics" << props.arch.hasGlobalInt64Atomics << endl; + cout << setw(w1) << "arch.hasSharedInt64Atomics" << props.arch.hasSharedInt64Atomics << endl; + cout << setw(w1) << "arch.hasDoubles" << props.arch.hasDoubles << endl; + cout << setw(w1) << "arch.hasWarpVote" << props.arch.hasWarpVote << endl; + cout 
<< setw(w1) << "arch.hasWarpBallot" << props.arch.hasWarpBallot << endl; + cout << setw(w1) << "arch.hasWarpShuffle" << props.arch.hasWarpShuffle << endl; + cout << setw(w1) << "arch.hasFunnelShift" << props.arch.hasFunnelShift << endl; + cout << setw(w1) << "arch.hasThreadFenceSystem" << props.arch.hasThreadFenceSystem << endl; + cout << setw(w1) << "arch.hasSyncThreadsExt" << props.arch.hasSyncThreadsExt << endl; + cout << setw(w1) << "arch.hasSurfaceFuncs" << props.arch.hasSurfaceFuncs << endl; + cout << setw(w1) << "arch.has3dGrid" << props.arch.has3dGrid << endl; + cout << setw(w1) << "arch.hasDynamicParallelism" << props.arch.hasDynamicParallelism << endl; + + cout << endl; + + size_t free, total; + + hipMemGetInfo(&free, &total); + + cout << fixed << setprecision(2); + cout << setw(w1) << "memInfo.total " << bytesToGB(total) << " GB" << endl; + cout << setw(w1) << "memInfo.free " << bytesToGB(free) << " GB (" << setprecision(0) << (float)free/total * 100.0 << "%)" << endl; + +} + +int main(int argc, char *argv[]) +{ + using namespace std; + + cout << endl; + + printCompilerInfo(); + + int deviceCnt; + + HIPCHECK(hipGetDeviceCount(&deviceCnt)); + + for (int i=0; i< deviceCnt; i++) { + printDeviceProp(i); + } + + std::cout << std::endl; +} From 6e2721890a117b4b6e022c12531dd10f6dc57fe5 Mon Sep 17 00:00:00 2001 From: Aditya Avinash Atluri Date: Tue, 26 Jan 2016 10:40:06 -0500 Subject: [PATCH 002/700] Corrected compilation error --- samples/1_Utils/hipInfo/hipInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index da2ec17ec..bff2114f9 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -82,7 +82,7 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; cout << setw(w1) << 
"clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0<< " Mhz" << endl; - cout << setw(w1) << "totalGlobalMem" << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl << defaultfloat; + cout << setw(w1) << "totalGlobalMem" << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl; cout << setw(w1) << "sharedMemPerBlock" << (float)props.sharedMemPerBlock / 1024.0 << " KB" << endl; cout << setw(w1) << "regsPerBlock" << props.regsPerBlock << endl; From 9c7e2593e9bb17a88cf81cacb6395052a38e76ad Mon Sep 17 00:00:00 2001 From: Aditya Avinash Atluri Date: Tue, 26 Jan 2016 10:43:41 -0500 Subject: [PATCH 003/700] Update README.md --- samples/0_Intro/square/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/samples/0_Intro/square/README.md b/samples/0_Intro/square/README.md index 56b558e5b..59f59e716 100644 --- a/samples/0_Intro/square/README.md +++ b/samples/0_Intro/square/README.md @@ -1,14 +1,14 @@ -# square.md +# Square.md Simple test which shows how to use hipify to port CUDA code to HIP. Covered in more detail in blog. 1. Add hip/bin path to the PATH : - export PATH=$PATH:[MYHIP]/bin + export PATH=$PATH:[MYHIP]/bin -2. hipify square.cu > square.cpp +2. Do $ hipify square.cu > square.cpp 3. 
Manually edit square.cpp to add hipLaunchParms lp to kernel parms: - vector_square(hipLaunchParm lp, T *C_d, const T *A_d, size_t N) + vector_square(hipLaunchParm lp, T *C_d, const T *A_d, size_t N) (see square.hipref.cpp for the correct output after running hipify and the above manual step) From afac6a2c4a1bb259c32ce6444f23bdf5440a501e Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 27 Jan 2016 00:23:47 -0600 Subject: [PATCH 004/700] Update links in docs to GPUOpen and to Doxygen --- samples/0_Intro/bit_extract/README.md | 2 ++ samples/0_Intro/square/README.md | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/samples/0_Intro/bit_extract/README.md b/samples/0_Intro/bit_extract/README.md index 025b7d307..76f890659 100644 --- a/samples/0_Intro/bit_extract/README.md +++ b/samples/0_Intro/bit_extract/README.md @@ -2,3 +2,5 @@ Show an application written directly in HIP which uses platform-specific check on __HIP_PLATFORM_HCC__ to enable use of an instruction that only exists on the HCC platform. + +See related [blog](http://gpuopen.com/platform-aware-coding-inside-hip/) demonstrating platform specialization. diff --git a/samples/0_Intro/square/README.md b/samples/0_Intro/square/README.md index 59f59e716..8f9aec73c 100644 --- a/samples/0_Intro/square/README.md +++ b/samples/0_Intro/square/README.md @@ -1,6 +1,7 @@ # Square.md -Simple test which shows how to use hipify to port CUDA code to HIP. Covered in more detail in blog. +Simple test which shows how to use hipify to port CUDA code to HIP. +See related [blog](http://gpuopen.com/hip-to-be-squared-an-introductory-hip-tutorial) that explains the example. 1. Add hip/bin path to the PATH : export PATH=$PATH:[MYHIP]/bin From 8cb885f03ae902b5caefeaaa44c7e7896ada49cc Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 21:30:43 -0600 Subject: [PATCH 005/700] Add Bus Bandwidth test, leveraged from SHOC. 
--- samples/1_Utils/hipBusBandwidth/LICENSE.txt | 27 + samples/1_Utils/hipBusBandwidth/Makefile | 16 + .../hipBusBandwidth/ResultDatabase.cpp | 520 ++++++++++++++++++ .../1_Utils/hipBusBandwidth/ResultDatabase.h | 100 ++++ .../hipBusBandwidth/hipBusBandwidth.cpp | 170 ++++++ 5 files changed, 833 insertions(+) create mode 100644 samples/1_Utils/hipBusBandwidth/LICENSE.txt create mode 100644 samples/1_Utils/hipBusBandwidth/Makefile create mode 100644 samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp create mode 100644 samples/1_Utils/hipBusBandwidth/ResultDatabase.h create mode 100644 samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp diff --git a/samples/1_Utils/hipBusBandwidth/LICENSE.txt b/samples/1_Utils/hipBusBandwidth/LICENSE.txt new file mode 100644 index 000000000..5d0d60323 --- /dev/null +++ b/samples/1_Utils/hipBusBandwidth/LICENSE.txt @@ -0,0 +1,27 @@ + +Copyright (c) 2011, UT-Battelle, LLC +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +* Neither the name of Oak Ridge National Laboratory, nor UT-Battelle, LLC, nor + the names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/samples/1_Utils/hipBusBandwidth/Makefile b/samples/1_Utils/hipBusBandwidth/Makefile new file mode 100644 index 000000000..d23321631 --- /dev/null +++ b/samples/1_Utils/hipBusBandwidth/Makefile @@ -0,0 +1,16 @@ +HIP_PATH?=$(shell hipconfig -p) +HIPCC=$(HIP_PATH)/bin/hipcc + +EXE=hipBusBandwidth + +all: install + +$(EXE): hipBusBandwidth.cpp ResultDatabase.cpp + $(HIPCC) $^ -o $@ + +install: $(EXE) + cp $(EXE) $(HIP_PATH)/bin + + +clean: + rm -f *.o $(EXE) diff --git a/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp b/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp new file mode 100644 index 000000000..f57aed11b --- /dev/null +++ b/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp @@ -0,0 +1,520 @@ +#include "ResultDatabase.h" + +#include +#include +#include +#include + +using namespace std; + +bool ResultDatabase::Result::operator<(const Result &rhs) const +{ + if (test < rhs.test) + return true; + if (test > rhs.test) + return false; + if (atts < rhs.atts) + return true; + if (atts > rhs.atts) + return false; + return false; // less-operator returns false on equal +} + +double ResultDatabase::Result::GetMin() const +{ + double r = FLT_MAX; + for (int i=0; i= 100) + return value[n-1]; + + double index = ((n + 1.) * q / 100.) - 1; + + vector sorted = value; + sort(sorted.begin(), sorted.end()); + + if (n == 2) + return (sorted[0] * (1 - q/100.) 
+ sorted[1] * (q/100.)); + + int index_lo = int(index); + double frac = index - index_lo; + if (frac == 0) + return sorted[index_lo]; + + double lo = sorted[index_lo]; + double hi = sorted[index_lo + 1]; + return lo + (hi-lo)*frac; +} + +double ResultDatabase::Result::GetMean() const +{ + double r = 0; + for (int i=0; i &values) +{ + for (int i=0; i= results.size()) + { + Result r; + r.test = test; + r.atts = atts; + r.unit = unit; + results.push_back(r); + } + + results[index].value.push_back(value); +} + +// **************************************************************************** +// Method: ResultDatabase::DumpDetailed +// +// Purpose: +// Writes the full results, including all trials. +// +// Arguments: +// out where to print +// +// Programmer: Jeremy Meredith +// Creation: August 14, 2009 +// +// Modifications: +// Jeremy Meredith, Wed Nov 10 14:25:17 EST 2010 +// Renamed to DumpDetailed to make room for a DumpSummary. +// +// Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010 +// Added note about (*) missing value tag. +// +// Jeremy Meredith, Tue Nov 23 13:57:02 EST 2010 +// Changed note about missing values to be worded a little better. +// +// **************************************************************************** +void ResultDatabase::DumpDetailed(ostream &out) +{ + vector sorted(results); + + sort(sorted.begin(), sorted.end()); + + int maxtrials = 1; + for (int i=0; i maxtrials) + maxtrials = sorted[i].value.size(); + } + + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... + out << "test\t" + << "atts\t" + << "units\t" + << "median\t" + << "mean\t" + << "stddev\t" + << "min\t" + << "max\t"; + for (int i=0; i sorted(results); + + sort(sorted.begin(), sorted.end()); + + out << std::fixed << right << std::setw(9) << std::setprecision(4); + + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... 
+ out << "test\t" + << "atts\t" + << "units\t" + << "median\t" + << "mean\t" + << "stddev\t" + << "min\t" + << "max\t"; + out << endl; + + for (int i=0; i sorted(results); + + sort(sorted.begin(), sorted.end()); + + //Check to see if the file is empty - if so, add the headers + emptyFile = this->IsFileEmpty(fileName); + + //Open file and append by default + ofstream out; + out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app); + + //Add headers only for empty files + if(emptyFile) + { + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... + out << "test, " + << "atts, " + << "units, " + << "median, " + << "mean, " + << "stddev, " + << "min, " + << "max, "; + out << endl; + } + + for (int i=0; i +ResultDatabase::GetResultsForTest(const string &test) +{ + // get only the given test results + vector retval; + for (int i=0; i & +ResultDatabase::GetResults() const +{ + return results; +} diff --git a/samples/1_Utils/hipBusBandwidth/ResultDatabase.h b/samples/1_Utils/hipBusBandwidth/ResultDatabase.h new file mode 100644 index 000000000..4b63a02a1 --- /dev/null +++ b/samples/1_Utils/hipBusBandwidth/ResultDatabase.h @@ -0,0 +1,100 @@ +#ifndef RESULT_DATABASE_H +#define RESULT_DATABASE_H + +#include +#include +#include +#include +#include +using std::string; +using std::vector; +using std::ostream; +using std::ofstream; +using std::ifstream; + + +// **************************************************************************** +// Class: ResultDatabase +// +// Purpose: +// Track numerical results as they are generated. +// Print statistics of raw results. +// +// Programmer: Jeremy Meredith +// Creation: June 12, 2009 +// +// Modifications: +// Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010 +// Split timing reports into detailed and summary. E.g. for serial code, +// we might report all trial values, but skip them in parallel. 
+// +// Jeremy Meredith, Thu Nov 11 11:40:18 EST 2010 +// Added check for missing value tag. +// +// Jeremy Meredith, Mon Nov 22 13:37:10 EST 2010 +// Added percentile statistic. +// +// Jeremy Meredith, Fri Dec 3 16:30:31 EST 2010 +// Added a method to extract a subset of results based on test name. Also, +// the Result class is now public, so that clients can use them directly. +// Added a GetResults method as well, and made several functions const. +// +// **************************************************************************** +class ResultDatabase +{ + public: + // + // A performance result for a single SHOC benchmark run. + // + struct Result + { + string test; // e.g. "readback" + string atts; // e.g. "pagelocked 4k^2" + string unit; // e.g. "MB/sec" + vector value; // e.g. "837.14" + double GetMin() const; + double GetMax() const; + double GetMedian() const; + double GetPercentile(double q) const; + double GetMean() const; + double GetStdDev() const; + + bool operator<(const Result &rhs) const; + + bool HadAnyFLTMAXValues() const + { + for (int i=0; i= FLT_MAX) + return true; + } + return false; + } + }; + + protected: + vector results; + + public: + void AddResult(const string &test, + const string &atts, + const string &unit, + double value); + void AddResults(const string &test, + const string &atts, + const string &unit, + const vector &values); + vector GetResultsForTest(const string &test); + const vector &GetResults() const; + void ClearAllResults(); + void DumpDetailed(ostream&); + void DumpSummary(ostream&); + void DumpCsv(string fileName); + + private: + bool IsFileEmpty(string fileName); + +}; + + +#endif diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp new file mode 100644 index 000000000..8481476fc --- /dev/null +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -0,0 +1,170 @@ +#include +#include +#include + +#include "ResultDatabase.h" + +// Cmdline parms: 
+const bool p_verbose = false; +const bool p_pinned = true; +const unsigned int p_iters = 10; + +#define CHECK_HIP_ERROR() \ +{ \ + hipError_t err = hipGetLastError(); \ + if (err != hipSuccess) \ + { \ + printf("error=%d name=%s at " \ + "ln: %d\n ",err,hipGetErrorString(err),__LINE__); \ + exit(EXIT_FAILURE); \ + } \ +} + + +// **************************************************************************** +// Function: runBenchmark +// +// Purpose: +// Measures the bandwidth of the bus connecting the host processor to the +// OpenCL device. This benchmark repeatedly transfers data chunks of various +// sizes across the bus to the OpenCL device, and calculates the bandwidth. +// +// +// Arguments: +// +// Returns: nothing +// +// Programmer: Jeremy Meredith +// Creation: September 08, 2009 +// +// Modifications: +// Jeremy Meredith, Wed Dec 1 17:05:27 EST 2010 +// Added calculation of latency estimate. +// Ben Sander - moved to standalone test +// +// **************************************************************************** +void RunBenchmark(ResultDatabase &resultDB) +{ + // Sizes are in kb + int nSizes = 20; + int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, + 32768,65536,131072,262144,524288}; + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + + // Create some host memory pattern + float *hostMem = NULL; + if (p_pinned) + { + hipMallocHost((void**)&hostMem, sizeof(float) * numMaxFloats); + while (hipGetLastError() != hipSuccess) + { + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating pinned mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any pinned buffer\n"; + return; + } + numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + hipMallocHost((void**)&hostMem, sizeof(float) * numMaxFloats); + } + } + else + { + hostMem = new float[numMaxFloats]; + } + + for (int i = 0; i < numMaxFloats; i++) + { + hostMem[i] = i % 77; + } + + float *device; + 
hipMalloc((void**)&device, sizeof(float) * numMaxFloats); + while (hipGetLastError() != hipSuccess) + { + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating device mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any device buffer\n"; + return; + } + numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + hipMalloc((void**)&device, sizeof(float) * numMaxFloats); + } + + + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + CHECK_HIP_ERROR(); + + // Three passes, forward and backward both + for (int pass = 0; pass < p_iters; pass++) + { + // store the times temporarily to estimate latency + //float times[nSizes]; + // Step through sizes forward on even passes and backward on odd + for (int i = 0; i < nSizes; i++) + { + int sizeIndex; + if ((pass % 2) == 0) + sizeIndex = i; + else + sizeIndex = (nSizes - 1) - i; + + int nbytes = sizes[sizeIndex] * 1024; + + hipEventRecord(start, 0); + hipMemcpy(device, hostMem, nbytes, hipMemcpyHostToDevice); + hipEventRecord(stop, 0); + hipEventSynchronize(stop); + float t = 0; + hipEventElapsedTime(&t, start, stop); + //times[sizeIndex] = t; + + // Convert to GB/sec + if (p_verbose) + { + std::cerr << "size " << sizes[sizeIndex] << "k took " << t << + " ms\n"; + } + + double speed = (double(sizes[sizeIndex]) * 1024. 
/ (1000*1000)) / t; + char sizeStr[256]; + sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]); + resultDB.AddResult("DownloadSpeed", sizeStr, "GB/sec", speed); + resultDB.AddResult("DownloadTime", sizeStr, "ms", t); + } + } + + // Cleanup + hipFree((void*)device); + CHECK_HIP_ERROR(); + if (p_pinned) + { + hipFreeHost((void*)hostMem); + CHECK_HIP_ERROR(); + } + else + { + delete[] hostMem; + } + hipEventDestroy(start); + hipEventDestroy(stop); +} + + + +int main(int argc, char *argv[]) +{ + ResultDatabase resultDB; + RunBenchmark(resultDB); + + resultDB.DumpSummary(std::cout); + + resultDB.DumpDetailed(std::cout); +} From 42257c5d71e4cb95332979e62c71dbaafa78d191 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 22:46:34 -0600 Subject: [PATCH 006/700] Add D2H test --- .../hipBusBandwidth/hipBusBandwidth.cpp | 235 +++++++++++++++++- 1 file changed, 226 insertions(+), 9 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 8481476fc..c908fa655 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -5,9 +5,15 @@ #include "ResultDatabase.h" // Cmdline parms: -const bool p_verbose = false; -const bool p_pinned = true; -const unsigned int p_iters = 10; +bool p_verbose = false; +bool p_pinned = true; +int p_iterations = 10; +int p_device = 0; +int p_detailed = 0; + +bool p_h2d = true; +bool p_d2h = true; + #define CHECK_HIP_ERROR() \ { \ @@ -43,7 +49,7 @@ const unsigned int p_iters = 10; // Ben Sander - moved to standalone test // // **************************************************************************** -void RunBenchmark(ResultDatabase &resultDB) +void RunBenchmark_H2D(ResultDatabase &resultDB) { // Sizes are in kb int nSizes = 20; @@ -51,6 +57,8 @@ void RunBenchmark(ResultDatabase &resultDB) 32768,65536,131072,262144,524288}; long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + 
hipSetDevice(p_device); + // Create some host memory pattern float *hostMem = NULL; if (p_pinned) @@ -103,7 +111,7 @@ void RunBenchmark(ResultDatabase &resultDB) CHECK_HIP_ERROR(); // Three passes, forward and backward both - for (int pass = 0; pass < p_iters; pass++) + for (int pass = 0; pass < p_iterations; pass++) { // store the times temporarily to estimate latency //float times[nSizes]; @@ -158,13 +166,222 @@ void RunBenchmark(ResultDatabase &resultDB) } +void RunBenchmark_D2H(ResultDatabase &resultDB) +{ + + // Sizes are in kb + int nSizes = 20; + int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, + 32768,65536,131072,262144,524288}; + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + + // Create some host memory pattern + float *hostMem1; + float *hostMem2; + if (p_pinned) + { + hipMallocHost((void**)&hostMem1, sizeof(float)*numMaxFloats); + hipError_t err1 = hipGetLastError(); + hipMallocHost((void**)&hostMem2, sizeof(float)*numMaxFloats); + hipError_t err2 = hipGetLastError(); + while (err1 != hipSuccess || err2 != hipSuccess) + { + // free the first buffer if only the second failed + if (err1 == hipSuccess) + hipFreeHost((void*)hostMem1); + + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating pinned mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any pinned buffer\n"; + return; + } + numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + hipMallocHost((void**)&hostMem1, sizeof(float)*numMaxFloats); + err1 = hipGetLastError(); + hipMallocHost((void**)&hostMem2, sizeof(float)*numMaxFloats); + err2 = hipGetLastError(); + } + } + else + { + hostMem1 = new float[numMaxFloats]; + hostMem2 = new float[numMaxFloats]; + } + for (int i=0; i= argc || !parseInt(argv[i], &p_iterations)) { + failed("Bad iterations argument"); + } + } else if (!strcmp(arg, "--device") || (!strcmp(arg, "-d"))) { + if (++i >= argc || !parseInt(argv[i], &p_device)) { + failed("Bad device 
argument"); + } + } else if (!strcmp(arg, "--unpinned")) { + p_pinned = 0; + } else if (!strcmp(arg, "--h2d")) { + p_h2d = true; + p_d2h = false; + + } else if (!strcmp(arg, "--d2h")) { + p_h2d = false; + p_d2h = true; + + } else if (!strcmp(arg, "--help") || (!strcmp(arg, "-h"))) { + help(); + + } else if (!strcmp(arg, "--verbose")) { + p_verbose = 1; + } else if (!strcmp(arg, "--detailed")) { + p_detailed = 1; + } else { + failed("Bad argument '%s'", arg); + } + } + + return 0; +}; + + int main(int argc, char *argv[]) { - ResultDatabase resultDB; - RunBenchmark(resultDB); + parseStandardArguments(argc, argv); + + if (p_h2d) { + ResultDatabase resultDB; + RunBenchmark_H2D(resultDB); - resultDB.DumpSummary(std::cout); + resultDB.DumpSummary(std::cout); - resultDB.DumpDetailed(std::cout); + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } + } + + if (p_d2h) { + ResultDatabase resultDB; + RunBenchmark_D2H(resultDB); + + resultDB.DumpSummary(std::cout); + + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } + } } From 63d79554ab5010dd2b138f7e465348fb75bdbe23 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 Feb 2016 22:47:26 -0600 Subject: [PATCH 007/700] Add D2H test --- samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index c908fa655..b847f8db4 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -144,8 +144,8 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) double speed = (double(sizes[sizeIndex]) * 1024. 
/ (1000*1000)) / t; char sizeStr[256]; sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]); - resultDB.AddResult("DownloadSpeed", sizeStr, "GB/sec", speed); - resultDB.AddResult("DownloadTime", sizeStr, "ms", t); + resultDB.AddResult("H2D_Bandwidth", sizeStr, "GB/sec", speed); + resultDB.AddResult("H2D_Time", sizeStr, "ms", t); } } @@ -273,8 +273,8 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) double speed = (double(sizes[sizeIndex]) * 1024. / (1000*1000)) / t; char sizeStr[256]; sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]); - resultDB.AddResult("ReadbackSpeed", sizeStr, "GB/sec", speed); - resultDB.AddResult("ReadbackTime", sizeStr, "ms", t); + resultDB.AddResult("D2H_Bandwidth", sizeStr, "GB/sec", speed); + resultDB.AddResult("D2H_Time", sizeStr, "ms", t); } //resultDB.AddResult("ReadbackLatencyEstimate", "1-2kb", "ms", times[0]-(times[1]-times[0])/1.); //resultDB.AddResult("ReadbackLatencyEstimate", "1-4kb", "ms", times[0]-(times[2]-times[0])/3.); From b5743abae250e668d808a6062a8b52756605bb3d Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 13 Feb 2016 01:14:01 -0600 Subject: [PATCH 008/700] Result formatting --- samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp b/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp index f57aed11b..7d2f3aef8 100644 --- a/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp +++ b/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp @@ -278,13 +278,16 @@ void ResultDatabase::DumpSummary(ostream &out) { vector sorted(results); + int testW = 15 ; + const int fieldW = 9; + sort(sorted.begin(), sorted.end()); - out << std::fixed << right << std::setw(9) << std::setprecision(4); + out << std::fixed << right << std::setprecision(4); // TODO: in big parallel runs, the "trials" are the procs // and we really don't want to print them all out.... 
- out << "test\t" + out << setw(testW) << "test\t" << setw(fieldW) << "atts\t" << "units\t" << "median\t" @@ -297,7 +300,7 @@ void ResultDatabase::DumpSummary(ostream &out) for (int i=0; i Date: Sat, 13 Feb 2016 03:17:42 -0600 Subject: [PATCH 009/700] Enable -O3, style points on array size --- samples/1_Utils/hipBusBandwidth/Makefile | 3 ++- samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/Makefile b/samples/1_Utils/hipBusBandwidth/Makefile index d23321631..77a92fb1a 100644 --- a/samples/1_Utils/hipBusBandwidth/Makefile +++ b/samples/1_Utils/hipBusBandwidth/Makefile @@ -2,11 +2,12 @@ HIP_PATH?=$(shell hipconfig -p) HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipBusBandwidth +CXXFLAGS = -O3 -g all: install $(EXE): hipBusBandwidth.cpp ResultDatabase.cpp - $(HIPCC) $^ -o $@ + $(HIPCC) $(CXXFLAGS) $^ -o $@ install: $(EXE) cp $(EXE) $(HIP_PATH)/bin diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index b847f8db4..d27672592 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -52,9 +52,9 @@ bool p_d2h = true; void RunBenchmark_H2D(ResultDatabase &resultDB) { // Sizes are in kb - int nSizes = 20; - int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, - 32768,65536,131072,262144,524288}; + int sizes[] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, 32768,65536,131072,262144,524288}; + int nSizes = sizeof(sizes) / sizeof(int); + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; hipSetDevice(p_device); From 12fcf1d0ffa1dcaee59286f4af6cd4cc6bff79ab Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 15 Feb 2016 13:16:05 +0300 Subject: [PATCH 010/700] Formatting, no functional changes. 
--- samples/1_Utils/hipInfo/hipInfo.cpp | 55 ++++++++++++----------------- 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index bff2114f9..de73aabab 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -41,9 +41,9 @@ THE SOFTWARE. #define HIPCHECK(error) \ if (error != hipSuccess) { \ - printf("%serror: '%s'(%d) at %s:%d%s\n", \ - KRED,hipGetErrorString(error), error,\ - __FILE__, __LINE__,KNRM); \ + printf("%serror: '%s'(%d) at %s:%d%s\n", \ + KRED, hipGetErrorString(error), error,\ + __FILE__, __LINE__,KNRM);\ failed("API returned error code.");\ } @@ -53,12 +53,11 @@ void printCompilerInfo () printf ("compiler: hcc version=%s, workweek (YYWWD) = %u\n", __hcc_version__, __hcc_workweek__); #endif #ifdef __NVCC__ - printf ("compiler: nvcc\n"); + printf ("compiler: nvcc\n"); #endif } - -double bytesToGB(size_t s) +double bytesToGB(size_t s) { return (double)s / (1024.0*1024.0*1024.0); } @@ -66,7 +65,6 @@ double bytesToGB(size_t s) void printDeviceProp (int deviceId) { using namespace std; - const int w1 = 30; cout << left; @@ -77,33 +75,27 @@ void printDeviceProp (int deviceId) hipDeviceProp_t props; HIPCHECK(hipDeviceGetProperties(&props, deviceId)); - cout << setw(w1) << "Name: " << props.name << endl; cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0<< " Mhz" << endl; - cout << setw(w1) << "totalGlobalMem" << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl; - - cout << setw(w1) << "sharedMemPerBlock" << (float)props.sharedMemPerBlock / 1024.0 << " KB" << endl; - cout << setw(w1) << "regsPerBlock" << props.regsPerBlock << endl; - cout << setw(w1) << "warpSize" << props.warpSize << endl; - cout << 
setw(w1) << "maxThreadsPerBlock" << props.maxThreadsPerBlock << endl; - cout << setw(w1) << "maxThreadsDim.x" << props.maxThreadsDim[0] << endl; - cout << setw(w1) << "maxThreadsDim.y" << props.maxThreadsDim[1] << endl; - cout << setw(w1) << "maxThreadsDim.z" << props.maxThreadsDim[2] << endl; - - cout << setw(w1) << "maxGridSize.x" << props.maxGridSize[0] << endl; - cout << setw(w1) << "maxGridSize.y" << props.maxGridSize[1] << endl; - cout << setw(w1) << "maxGridSize.z" << props.maxGridSize[2] << endl; - - - cout << setw(w1) << "totalConstMem" << props.totalConstMem << endl; - cout << setw(w1) << "major" << props.major << endl; - cout << setw(w1) << "minor" << props.minor << endl; - cout << setw(w1) << "l2CacheSize" << props.l2CacheSize << endl; - cout << setw(w1) << "maxThreadsPerMultiProcessor" << props.maxThreadsPerMultiProcessor << endl; - cout << setw(w1) << "computeMode" << props.computeMode << endl; - + cout << setw(w1) << "totalGlobalMem" << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl; + cout << setw(w1) << "sharedMemPerBlock" << (float)props.sharedMemPerBlock / 1024.0 << " KB" << endl; + cout << setw(w1) << "regsPerBlock" << props.regsPerBlock << endl; + cout << setw(w1) << "warpSize" << props.warpSize << endl; + cout << setw(w1) << "maxThreadsPerBlock" << props.maxThreadsPerBlock << endl; + cout << setw(w1) << "maxThreadsDim.x" << props.maxThreadsDim[0] << endl; + cout << setw(w1) << "maxThreadsDim.y" << props.maxThreadsDim[1] << endl; + cout << setw(w1) << "maxThreadsDim.z" << props.maxThreadsDim[2] << endl; + cout << setw(w1) << "maxGridSize.x" << props.maxGridSize[0] << endl; + cout << setw(w1) << "maxGridSize.y" << props.maxGridSize[1] << endl; + cout << setw(w1) << "maxGridSize.z" << props.maxGridSize[2] << endl; + cout << setw(w1) << "totalConstMem" << props.totalConstMem << endl; + cout << setw(w1) << "major" << props.major << endl; + cout << setw(w1) << "minor" << props.minor << endl; + cout << setw(w1) << 
"l2CacheSize" << props.l2CacheSize << endl; + cout << setw(w1) << "maxThreadsPerMultiProcessor" << props.maxThreadsPerMultiProcessor << endl; + cout << setw(w1) << "computeMode" << props.computeMode << endl; cout << setw(w1) << "arch.hasGlobalInt32Atomics" << props.arch.hasGlobalInt32Atomics << endl; cout << setw(w1) << "arch.hasGlobalFloatAtomicExch" << props.arch.hasGlobalFloatAtomicExch << endl; cout << setw(w1) << "arch.hasSharedInt32Atomics" << props.arch.hasSharedInt32Atomics << endl; @@ -121,17 +113,14 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "arch.hasSurfaceFuncs" << props.arch.hasSurfaceFuncs << endl; cout << setw(w1) << "arch.has3dGrid" << props.arch.has3dGrid << endl; cout << setw(w1) << "arch.hasDynamicParallelism" << props.arch.hasDynamicParallelism << endl; - cout << endl; size_t free, total; - hipMemGetInfo(&free, &total); cout << fixed << setprecision(2); cout << setw(w1) << "memInfo.total " << bytesToGB(total) << " GB" << endl; cout << setw(w1) << "memInfo.free " << bytesToGB(free) << " GB (" << setprecision(0) << (float)free/total * 100.0 << "%)" << endl; - } int main(int argc, char *argv[]) From ff22a6eb28ac2f2f7464f0fa0b999a19048969a5 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 18 Feb 2016 15:08:55 +0300 Subject: [PATCH 011/700] hipInfo sample update with new Device Properties. 
--- samples/1_Utils/hipInfo/hipInfo.cpp | 78 +++++++++++++++-------------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index de73aabab..c8979b1cc 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -76,51 +76,55 @@ void printDeviceProp (int deviceId) HIPCHECK(hipDeviceGetProperties(&props, deviceId)); cout << setw(w1) << "Name: " << props.name << endl; + cout << setw(w1) << "pciBusID: " << props.pciBusID << endl; + cout << setw(w1) << "pciDeviceID: " << props.pciDeviceID << endl; cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; + cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor << endl; cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; - cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0<< " Mhz" << endl; - cout << setw(w1) << "totalGlobalMem" << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl; - cout << setw(w1) << "sharedMemPerBlock" << (float)props.sharedMemPerBlock / 1024.0 << " KB" << endl; - cout << setw(w1) << "regsPerBlock" << props.regsPerBlock << endl; - cout << setw(w1) << "warpSize" << props.warpSize << endl; - cout << setw(w1) << "maxThreadsPerBlock" << props.maxThreadsPerBlock << endl; - cout << setw(w1) << "maxThreadsDim.x" << props.maxThreadsDim[0] << endl; - cout << setw(w1) << "maxThreadsDim.y" << props.maxThreadsDim[1] << endl; - cout << setw(w1) << "maxThreadsDim.z" << props.maxThreadsDim[2] << endl; - cout << setw(w1) << "maxGridSize.x" << props.maxGridSize[0] << endl; - cout << setw(w1) << "maxGridSize.y" << props.maxGridSize[1] << endl; - cout << setw(w1) << "maxGridSize.z" << props.maxGridSize[2] << endl; - cout << setw(w1) << "totalConstMem" << props.totalConstMem << endl; - cout << setw(w1) << "major" << props.major << endl; - 
cout << setw(w1) << "minor" << props.minor << endl; - cout << setw(w1) << "l2CacheSize" << props.l2CacheSize << endl; - cout << setw(w1) << "maxThreadsPerMultiProcessor" << props.maxThreadsPerMultiProcessor << endl; - cout << setw(w1) << "computeMode" << props.computeMode << endl; - cout << setw(w1) << "arch.hasGlobalInt32Atomics" << props.arch.hasGlobalInt32Atomics << endl; - cout << setw(w1) << "arch.hasGlobalFloatAtomicExch" << props.arch.hasGlobalFloatAtomicExch << endl; - cout << setw(w1) << "arch.hasSharedInt32Atomics" << props.arch.hasSharedInt32Atomics << endl; - cout << setw(w1) << "arch.hasSharedFloatAtomicExch" << props.arch.hasSharedFloatAtomicExch << endl; - cout << setw(w1) << "arch.hasFloatAtomicAdd" << props.arch.hasFloatAtomicAdd << endl; - cout << setw(w1) << "arch.hasGlobalInt64Atomics" << props.arch.hasGlobalInt64Atomics << endl; - cout << setw(w1) << "arch.hasSharedInt64Atomics" << props.arch.hasSharedInt64Atomics << endl; - cout << setw(w1) << "arch.hasDoubles" << props.arch.hasDoubles << endl; - cout << setw(w1) << "arch.hasWarpVote" << props.arch.hasWarpVote << endl; - cout << setw(w1) << "arch.hasWarpBallot" << props.arch.hasWarpBallot << endl; - cout << setw(w1) << "arch.hasWarpShuffle" << props.arch.hasWarpShuffle << endl; - cout << setw(w1) << "arch.hasFunnelShift" << props.arch.hasFunnelShift << endl; - cout << setw(w1) << "arch.hasThreadFenceSystem" << props.arch.hasThreadFenceSystem << endl; - cout << setw(w1) << "arch.hasSyncThreadsExt" << props.arch.hasSyncThreadsExt << endl; - cout << setw(w1) << "arch.hasSurfaceFuncs" << props.arch.hasSurfaceFuncs << endl; - cout << setw(w1) << "arch.has3dGrid" << props.arch.has3dGrid << endl; - cout << setw(w1) << "arch.hasDynamicParallelism" << props.arch.hasDynamicParallelism << endl; + cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0<< " Mhz" << endl; + cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2) << 
bytesToGB(props.totalGlobalMem) << " GB" << endl; + cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) << bytesToGB(props.maxSharedMemoryPerMultiProcessor) << " GB" << endl; + cout << setw(w1) << "totalConstMem: " << props.totalConstMem << endl; + cout << setw(w1) << "sharedMemPerBlock: " << (float)props.sharedMemPerBlock / 1024.0 << " KB" << endl; + cout << setw(w1) << "regsPerBlock: " << props.regsPerBlock << endl; + cout << setw(w1) << "warpSize: " << props.warpSize << endl; + cout << setw(w1) << "l2CacheSize: " << props.l2CacheSize << endl; + cout << setw(w1) << "computeMode: " << props.computeMode << endl; + cout << setw(w1) << "maxThreadsPerBlock: " << props.maxThreadsPerBlock << endl; + cout << setw(w1) << "maxThreadsDim.x: " << props.maxThreadsDim[0] << endl; + cout << setw(w1) << "maxThreadsDim.y: " << props.maxThreadsDim[1] << endl; + cout << setw(w1) << "maxThreadsDim.z: " << props.maxThreadsDim[2] << endl; + cout << setw(w1) << "maxGridSize.x: " << props.maxGridSize[0] << endl; + cout << setw(w1) << "maxGridSize.y: " << props.maxGridSize[1] << endl; + cout << setw(w1) << "maxGridSize.z: " << props.maxGridSize[2] << endl; + cout << setw(w1) << "major: " << props.major << endl; + cout << setw(w1) << "minor: " << props.minor << endl; + cout << setw(w1) << "concurrentKernels: " << props.concurrentKernels << endl; + cout << setw(w1) << "arch.hasGlobalInt32Atomics: " << props.arch.hasGlobalInt32Atomics << endl; + cout << setw(w1) << "arch.hasGlobalFloatAtomicExch: " << props.arch.hasGlobalFloatAtomicExch << endl; + cout << setw(w1) << "arch.hasSharedInt32Atomics: " << props.arch.hasSharedInt32Atomics << endl; + cout << setw(w1) << "arch.hasSharedFloatAtomicExch: " << props.arch.hasSharedFloatAtomicExch << endl; + cout << setw(w1) << "arch.hasFloatAtomicAdd: " << props.arch.hasFloatAtomicAdd << endl; + cout << setw(w1) << "arch.hasGlobalInt64Atomics: " << props.arch.hasGlobalInt64Atomics << endl; + cout << setw(w1) << 
"arch.hasSharedInt64Atomics: " << props.arch.hasSharedInt64Atomics << endl; + cout << setw(w1) << "arch.hasDoubles: " << props.arch.hasDoubles << endl; + cout << setw(w1) << "arch.hasWarpVote: " << props.arch.hasWarpVote << endl; + cout << setw(w1) << "arch.hasWarpBallot: " << props.arch.hasWarpBallot << endl; + cout << setw(w1) << "arch.hasWarpShuffle: " << props.arch.hasWarpShuffle << endl; + cout << setw(w1) << "arch.hasFunnelShift: " << props.arch.hasFunnelShift << endl; + cout << setw(w1) << "arch.hasThreadFenceSystem: " << props.arch.hasThreadFenceSystem << endl; + cout << setw(w1) << "arch.hasSyncThreadsExt: " << props.arch.hasSyncThreadsExt << endl; + cout << setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << endl; + cout << setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << endl; + cout << setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << endl; cout << endl; size_t free, total; hipMemGetInfo(&free, &total); cout << fixed << setprecision(2); - cout << setw(w1) << "memInfo.total " << bytesToGB(total) << " GB" << endl; - cout << setw(w1) << "memInfo.free " << bytesToGB(free) << " GB (" << setprecision(0) << (float)free/total * 100.0 << "%)" << endl; + cout << setw(w1) << "memInfo.total: " << bytesToGB(total) << " GB" << endl; + cout << setw(w1) << "memInfo.free: " << bytesToGB(free) << " GB (" << setprecision(0) << (float)free/total * 100.0 << "%)" << endl; } int main(int argc, char *argv[]) From 383310d9d002670ede8e3d1603c46508d68ca3df Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 18 Feb 2016 17:25:28 +0300 Subject: [PATCH 012/700] Device property memoryClockRate implementation. + Device property memoryClockRate is added to hipDeviceProp_t struct. + Device attribute hipDeviceAttributeMemoryClockRate is added to hipDeviceAttribute_t struct. + Tests update. + Rename hipDevAttrConcurrentKernels to hipDeviceAttributeConcurrentKernels. 
--- samples/1_Utils/hipInfo/hipInfo.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index c8979b1cc..18d9176a0 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -81,7 +81,8 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor << endl; cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; - cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0<< " Mhz" << endl; + cout << setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" << endl; + cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0 << " Mhz" << endl; cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl; cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) << bytesToGB(props.maxSharedMemoryPerMultiProcessor) << " GB" << endl; cout << setw(w1) << "totalConstMem: " << props.totalConstMem << endl; From 83612db41b780c4dee1bc58677aaa2b49f901b23 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 18 Feb 2016 18:15:01 +0300 Subject: [PATCH 013/700] Device property memoryBusWidth implementation. + Device property memoryBusWidth is added to hipDeviceProp_t struct. + Device attribute hipDeviceAttributeMemoryBusWidth is added to hipDeviceAttribute_t struct. + Tests update. 
--- samples/1_Utils/hipInfo/hipInfo.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index 18d9176a0..9c3d2c1b5 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -80,6 +80,7 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "pciDeviceID: " << props.pciDeviceID << endl; cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor << endl; + cout << setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << endl; cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; cout << setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" << endl; cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0 << " Mhz" << endl; From bdb6c4f423f3929fe8c03874aaf64ab771665eee Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 19 Feb 2016 13:27:03 +0300 Subject: [PATCH 014/700] Guard #ifdef USE_ROCR_20 is added for ROCR_20 device properties (memoryClockRate, memoryBusWidth) By default isn't defined. 
To add ROCR_20 support HIP have to be compiled as follows: make CXX_DEFINES+=-DUSE_ROCR_20 --- samples/1_Utils/hipInfo/hipInfo.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index 9c3d2c1b5..c7b298705 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -80,9 +80,11 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "pciDeviceID: " << props.pciDeviceID << endl; cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor << endl; - cout << setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << endl; cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; +#ifdef USE_ROCR_20 cout << setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" << endl; + cout << setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << endl; +#endif cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0 << " Mhz" << endl; cout << setw(w1) << "totalGlobalMem: " << fixed << setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl; cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) << bytesToGB(props.maxSharedMemoryPerMultiProcessor) << " GB" << endl; From 11b75c38de4a9448771b465c83f053e73e2cfeca Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 25 Feb 2016 23:44:39 +0300 Subject: [PATCH 015/700] Attribute hipDeviceAttributeIsMultiGpuBoard for obtaining Device property isMultiGpuBoard is added. On HIP path property obtaining done through hsa_iterate_agents and counting the devices of HSA_DEVICE_TYPE_GPU type. P.S. On multi-boards systems it might be problems with detection what board a GPU plugged into (not tested). 
--- samples/1_Utils/hipInfo/hipInfo.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index c7b298705..19e8cfc21 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -80,6 +80,7 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "pciDeviceID: " << props.pciDeviceID << endl; cout << setw(w1) << "multiProcessorCount: " << props.multiProcessorCount << endl; cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor << endl; + cout << setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << endl; cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; #ifdef USE_ROCR_20 cout << setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" << endl; From 2fb97ae4d849b7c1307fd2f6b5f50a7f1b310915 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 26 Feb 2016 05:25:30 -0600 Subject: [PATCH 016/700] fixes for titan platform --- samples/1_Utils/hipInfo/hipInfo.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index 19e8cfc21..824ab17d3 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -82,10 +82,8 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "maxThreadsPerMultiProcessor: " << props.maxThreadsPerMultiProcessor << endl; cout << setw(w1) << "isMultiGpuBoard: " << props.isMultiGpuBoard << endl; cout << setw(w1) << "clockRate: " << (float)props.clockRate / 1000.0 << " Mhz" << endl; -#ifdef USE_ROCR_20 cout << setw(w1) << "memoryClockRate: " << (float)props.memoryClockRate / 1000.0 << " Mhz" << endl; cout << setw(w1) << "memoryBusWidth: " << props.memoryBusWidth << endl; -#endif cout << setw(w1) << "clockInstructionRate: " << (float)props.clockInstructionRate / 1000.0 << " Mhz" << endl; cout << setw(w1) << "totalGlobalMem: " << fixed << 
setprecision(2) << bytesToGB(props.totalGlobalMem) << " GB" << endl; cout << setw(w1) << "maxSharedMemoryPerMultiProcessor: " << fixed << setprecision(2) << bytesToGB(props.maxSharedMemoryPerMultiProcessor) << " GB" << endl; From 0155e5b879dc8a61e580945404c9b1d696fbc892 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Sun, 6 Mar 2016 08:31:04 -0600 Subject: [PATCH 017/700] corrected hipDeviceGetProperties to hipGetDeviceProperties - not docs --- samples/0_Intro/bit_extract/bit_extract.cpp | 2 +- samples/0_Intro/square/square.hipref.cpp | 2 +- samples/1_Utils/hipInfo/hipInfo.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/samples/0_Intro/bit_extract/bit_extract.cpp b/samples/0_Intro/bit_extract/bit_extract.cpp index d32b65c15..14b5be66d 100644 --- a/samples/0_Intro/bit_extract/bit_extract.cpp +++ b/samples/0_Intro/bit_extract/bit_extract.cpp @@ -60,7 +60,7 @@ int main(int argc, char *argv[]) int deviceId; CHECK (hipGetDevice(&deviceId)); hipDeviceProp_t props; - CHECK(hipDeviceGetProperties(&props, deviceId)); + CHECK(hipGetDeviceProperties(&props, deviceId)); printf ("info: running on device #%d %s\n", deviceId, props.name); diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp index ed2a93827..5d53a8d58 100644 --- a/samples/0_Intro/square/square.hipref.cpp +++ b/samples/0_Intro/square/square.hipref.cpp @@ -53,7 +53,7 @@ int main(int argc, char *argv[]) size_t Nbytes = N * sizeof(float); hipDeviceProp_t props; - CHECK(hipDeviceGetProperties(&props, 0/*deviceID*/)); + CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/)); printf ("info: running on device %s\n", props.name); printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index 824ab17d3..146d17e01 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -73,7 +73,7 @@ void printDeviceProp (int 
deviceId) cout << setw(w1) << "device#" << deviceId << endl; hipDeviceProp_t props; - HIPCHECK(hipDeviceGetProperties(&props, deviceId)); + HIPCHECK(hipGetDeviceProperties(&props, deviceId)); cout << setw(w1) << "Name: " << props.name << endl; cout << setw(w1) << "pciBusID: " << props.pciBusID << endl; From f5849f462e949759b5d04400e8ae8f3d99db2fc3 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 13 Mar 2016 09:41:06 -0500 Subject: [PATCH 018/700] refactor, add support for speccing xfers in bytes --- .../hipBusBandwidth/hipBusBandwidth.cpp | 94 ++++++++++++------- 1 file changed, 58 insertions(+), 36 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index d27672592..369fa9e37 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -1,5 +1,7 @@ #include #include +#include +#include #include #include "ResultDatabase.h" @@ -27,8 +29,35 @@ bool p_d2h = true; } + +// **************************************************************************** +int sizeToBytes(int size) { + return (size < 0) ? 
-size : size * 1024; +} + + +// **************************************************************************** +std::string sizeToString(int size) +{ + using namespace std; + stringstream ss; + if (size < 0) { + ss << char(0x1) << setfill('0') << setw(3) << -size << "B"; + } else { + ss << size << "kB"; + } + return ss.str(); +} + + // **************************************************************************** -// Function: runBenchmark +// -sizes are in bytes, +sizes are in kb, last size must be largest +int sizes[] = {-64, -256, -512, 1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, 32768,65536,131072,262144,524288}; +int nSizes = sizeof(sizes) / sizeof(int); + + +// **************************************************************************** +// Function: RunBenchmark_H2D // // Purpose: // Measures the bandwidth of the bus connecting the host processor to the @@ -51,10 +80,6 @@ bool p_d2h = true; // **************************************************************************** void RunBenchmark_H2D(ResultDatabase &resultDB) { - // Sizes are in kb - int sizes[] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, 32768,65536,131072,262144,524288}; - int nSizes = sizeof(sizes) / sizeof(int); - long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; hipSetDevice(p_device); @@ -66,15 +91,15 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) hipMallocHost((void**)&hostMem, sizeof(float) * numMaxFloats); while (hipGetLastError() != hipSuccess) { - // drop the size and try again - if (p_verbose) std::cout << " - dropping size allocating pinned mem\n"; - --nSizes; - if (nSizes < 1) - { - std::cerr << "Error: Couldn't allocated any pinned buffer\n"; - return; - } - numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating pinned mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any pinned buffer\n"; + return; + } + numMaxFloats = 1024 * 
(sizes[nSizes-1]) / 4; hipMallocHost((void**)&hostMem, sizeof(float) * numMaxFloats); } } @@ -92,15 +117,15 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) hipMalloc((void**)&device, sizeof(float) * numMaxFloats); while (hipGetLastError() != hipSuccess) { - // drop the size and try again - if (p_verbose) std::cout << " - dropping size allocating device mem\n"; - --nSizes; - if (nSizes < 1) - { - std::cerr << "Error: Couldn't allocated any device buffer\n"; - return; - } - numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating device mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any device buffer\n"; + return; + } + numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; hipMalloc((void**)&device, sizeof(float) * numMaxFloats); } @@ -124,7 +149,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) else sizeIndex = (nSizes - 1) - i; - int nbytes = sizes[sizeIndex] * 1024; + int nbytes = sizeToBytes(sizes[sizeIndex]); hipEventRecord(start, 0); hipMemcpy(device, hostMem, nbytes, hipMemcpyHostToDevice); @@ -137,13 +162,13 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) // Convert to GB/sec if (p_verbose) { - std::cerr << "size " << sizes[sizeIndex] << "k took " << t << + std::cerr << "size " << sizeToString(sizes[sizeIndex]) << " took " << t << " ms\n"; } - double speed = (double(sizes[sizeIndex]) * 1024. 
/ (1000*1000)) / t; + double speed = (double(sizeToBytes(sizes[sizeIndex])) / (1000*1000)) / t; char sizeStr[256]; - sprintf(sizeStr, "% 7dkB", sizes[sizeIndex]); + sprintf(sizeStr, "%9s", sizeToString(sizes[sizeIndex]).c_str()); resultDB.AddResult("H2D_Bandwidth", sizeStr, "GB/sec", speed); resultDB.AddResult("H2D_Time", sizeStr, "ms", t); } @@ -166,13 +191,10 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) } + +// **************************************************************************** void RunBenchmark_D2H(ResultDatabase &resultDB) { - - // Sizes are in kb - int nSizes = 20; - int sizes[20] = {1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, - 32768,65536,131072,262144,524288}; long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; // Create some host memory pattern @@ -252,7 +274,7 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) else sizeIndex = (nSizes - 1) - i; - int nbytes = sizes[sizeIndex] * 1024; + int nbytes = sizeToBytes(sizes[sizeIndex]); hipEventRecord(start, 0); hipMemcpy(hostMem2, device, @@ -266,13 +288,13 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) // Convert to GB/sec if (p_verbose) { - std::cerr << "size " < Date: Mon, 14 Mar 2016 14:39:23 -0500 Subject: [PATCH 019/700] Add Bidir copy test and help. 
--- .../hipBusBandwidth/hipBusBandwidth.cpp | 209 +++++++++++++++++- 1 file changed, 198 insertions(+), 11 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 369fa9e37..88d547e2c 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -12,9 +12,11 @@ bool p_pinned = true; int p_iterations = 10; int p_device = 0; int p_detailed = 0; +bool p_async = 0; -bool p_h2d = true; -bool p_d2h = true; +bool p_h2d = true; +bool p_d2h = true; +bool p_bidir = true; #define CHECK_HIP_ERROR() \ @@ -42,6 +44,7 @@ std::string sizeToString(int size) using namespace std; stringstream ss; if (size < 0) { + // char (01) sorts before " " so will cause Byte values to be displayed before kB. ss << char(0x1) << setfill('0') << setw(3) << -size << "B"; } else { ss << size << "kB"; @@ -50,6 +53,18 @@ std::string sizeToString(int size) } +// **************************************************************************** +hipError_t memcopy(void * dst, const void *src, size_t sizeBytes, enum hipMemcpyKind kind) +{ + if (p_async) { + return hipMemcpyAsync(dst, src, sizeBytes, kind, NULL); + } else { + return hipMemcpy(dst, src, sizeBytes, kind); + } +} + + + // **************************************************************************** // -sizes are in bytes, +sizes are in kb, last size must be largest int sizes[] = {-64, -256, -512, 1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384, 32768,65536,131072,262144,524288}; @@ -152,7 +167,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) int nbytes = sizeToBytes(sizes[sizeIndex]); hipEventRecord(start, 0); - hipMemcpy(device, hostMem, nbytes, hipMemcpyHostToDevice); + memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice); hipEventRecord(stop, 0); hipEventSynchronize(stop); float t = 0; @@ -251,8 +266,7 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) hipMalloc((void**)&device, 
sizeof(float) * numMaxFloats); } - hipMemcpy(device, hostMem1, - numMaxFloats*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(device, hostMem1, numMaxFloats*sizeof(float), hipMemcpyHostToDevice); hipDeviceSynchronize(); hipEvent_t start, stop; @@ -277,8 +291,7 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) int nbytes = sizeToBytes(sizes[sizeIndex]); hipEventRecord(start, 0); - hipMemcpy(hostMem2, device, - nbytes, hipMemcpyDeviceToHost); + memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost); hipEventRecord(stop, 0); hipEventSynchronize(stop); float t = 0; @@ -323,6 +336,147 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) } +void RunBenchmark_Bidir(ResultDatabase &resultDB) +{ + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + + hipSetDevice(p_device); + + hipStream_t stream[2]; + + + // Create some host memory pattern + float *hostMem[2] = {NULL, NULL}; + if (p_pinned) + { + while (1) + { + hipError_t e1 = hipMallocHost((void**)&hostMem[0], sizeof(float) * numMaxFloats); + hipError_t e2 = hipMallocHost((void**)&hostMem[1], sizeof(float) * numMaxFloats); + + if ((e1 == hipSuccess) && (e2 == hipSuccess)) { + break; + } else { + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating pinned mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any pinned buffer\n"; + return; + } + numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + } + } + } + else + { + hostMem[0] = new float[numMaxFloats]; + hostMem[1] = new float[numMaxFloats]; + } + + for (int i = 0; i < numMaxFloats; i++) + { + hostMem[0][i] = i % 77; + } + + float *deviceMem[2]; + while (1) { + hipError_t e1 = hipMalloc((void**)&deviceMem[0], sizeof(float) * numMaxFloats); + hipError_t e2 = hipMalloc((void**)&deviceMem[1], sizeof(float) * numMaxFloats); + + if ((e1 == hipSuccess) && (e2 == hipSuccess)) { + break; + } else { + if (e1) { + // First alloc succeeded, so free it before trying again + hipFree(&deviceMem[0]); + } + // 
drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating device mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any device buffer\n"; + return; + } + numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + } + }; + + + hipMemset(deviceMem[1], 0xFA, numMaxFloats); + + + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + CHECK_HIP_ERROR(); + hipStreamCreate(&stream[0]); + hipStreamCreate(&stream[1]); + + // Three passes, forward and backward both + for (int pass = 0; pass < p_iterations; pass++) + { + // store the times temporarily to estimate latency + //float times[nSizes]; + // Step through sizes forward on even passes and backward on odd + for (int i = 0; i < nSizes; i++) + { + int sizeIndex; + if ((pass % 2) == 0) + sizeIndex = i; + else + sizeIndex = (nSizes - 1) - i; + + int nbytes = sizeToBytes(sizes[sizeIndex]); + + hipEventRecord(start, 0); + hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]); + hipMemcpyAsync(hostMem[1], deviceMem[1], nbytes, hipMemcpyDeviceToHost, stream[1]); + hipEventRecord(stop, 0); + hipEventSynchronize(stop); + float t = 0; + hipEventElapsedTime(&t, start, stop); + //times[sizeIndex] = t; + + // Convert to GB/sec + if (p_verbose) + { + std::cerr << "size " << sizeToString(sizes[sizeIndex]) << " took " << t << + " ms\n"; + } + + double speed = (double(sizeToBytes(sizes[sizeIndex])) / (1000*1000)) / t; + char sizeStr[256]; + sprintf(sizeStr, "%9s", sizeToString(sizes[sizeIndex]).c_str()); + resultDB.AddResult("Bidir_Bandwidth", sizeStr, "GB/sec", speed); + resultDB.AddResult("Bidir_Time", sizeStr, "ms", t); + } + } + + // Cleanup + hipFree((void*)deviceMem[0]); + hipFree((void*)deviceMem[1]); + CHECK_HIP_ERROR(); + if (p_pinned) + { + hipFreeHost((void*)hostMem[0]); + hipFreeHost((void*)hostMem[1]); + CHECK_HIP_ERROR(); + } + else + { + delete[] hostMem[0]; + delete[] hostMem[1]; + } + hipEventDestroy(start); + 
hipEventDestroy(stop); + hipStreamDestroy(stream[0]); + hipStreamDestroy(stream[1]); +} + + #define failed(...) \ printf ("error: ");\ printf (__VA_ARGS__);\ @@ -337,6 +491,17 @@ int parseInt(const char *str, int *output) } void help() { + printf ("Usage: hipBusBandwidth [OPTIONS]\n"); + printf (" --iterations, -i : Number of copy iterations to run.\n"); + printf (" --device, -d : Device ID to use (0..numDevices).\n"); + printf (" --unpinned : Use unpinned host memory.\n"); + printf (" --d2h : Run only device-to-host test.\n"); + printf (" --h2d : Run only host-to-device test.\n"); + printf (" --bidir : Run only bidir copy test.\n"); + printf (" --verbose : Print verbose status messages as test is run.\n"); + printf (" --detailed : Print detailed report (including all trials).\n"); + printf (" --async : Use hipMemcpyAsync(with NULL stream) for H2D/D2H. Default uses hipMemcpy.\n"); + }; int parseStandardArguments(int argc, char *argv[]) @@ -357,18 +522,28 @@ int parseStandardArguments(int argc, char *argv[]) } else if (!strcmp(arg, "--unpinned")) { p_pinned = 0; } else if (!strcmp(arg, "--h2d")) { - p_h2d = true; - p_d2h = false; + p_h2d = true; + p_d2h = false; + p_bidir = false; } else if (!strcmp(arg, "--d2h")) { - p_h2d = false; - p_d2h = true; + p_h2d = false; + p_d2h = true; + p_bidir = false; + + } else if (!strcmp(arg, "--bidir")) { + p_h2d = false; + p_d2h = false; + p_bidir = true; } else if (!strcmp(arg, "--help") || (!strcmp(arg, "-h"))) { help(); + exit(EXIT_SUCCESS); } else if (!strcmp(arg, "--verbose")) { p_verbose = 1; + } else if (!strcmp(arg, "--async")) { + p_async = 1; } else if (!strcmp(arg, "--detailed")) { p_detailed = 1; } else { @@ -406,4 +581,16 @@ int main(int argc, char *argv[]) resultDB.DumpDetailed(std::cout); } } + + + if (p_bidir) { + ResultDatabase resultDB; + RunBenchmark_Bidir(resultDB); + + resultDB.DumpSummary(std::cout); + + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } + } } From 
6b34ae47973671aa7e254e31270679072726b0ef Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 14 Mar 2016 23:02:49 -0500 Subject: [PATCH 020/700] print device config info --- samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 88d547e2c..a43bee77e 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -490,6 +490,14 @@ int parseInt(const char *str, int *output) return !strlen(next); } + +void printConfig() { + hipDeviceProp_t props; + hipGetDeviceProperties(&props, p_device); + + printf ("Device:%s Mem=%.1fGB #CUs=%d Freq=%.0fMhz Pinned=%s\n", props.name, props.totalGlobalMem/1024.0/1024.0/1024.0, props.multiProcessorCount, props.clockRate/1000.0, p_pinned ? "YES" : "NO"); +} + void help() { printf ("Usage: hipBusBandwidth [OPTIONS]\n"); printf (" --iterations, -i : Number of copy iterations to run.\n"); @@ -560,6 +568,8 @@ int main(int argc, char *argv[]) { parseStandardArguments(argc, argv); + printConfig(); + if (p_h2d) { ResultDatabase resultDB; RunBenchmark_H2D(resultDB); From e376b1baec93ad87c239053529628e9dd3476ebd Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 15 Mar 2016 13:39:15 -0500 Subject: [PATCH 021/700] v2 deprecating hipMallocHost with hipHostAlloc --- samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 88d547e2c..3d2aa659a 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -103,7 +103,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) float *hostMem = NULL; if (p_pinned) { - hipMallocHost((void**)&hostMem, sizeof(float) * numMaxFloats); + 
hipHostAlloc((void**)&hostMem, sizeof(float) * numMaxFloats, hipHostAllocDefault); while (hipGetLastError() != hipSuccess) { // drop the size and try again @@ -115,7 +115,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) return; } numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; - hipMallocHost((void**)&hostMem, sizeof(float) * numMaxFloats); + hipHostAlloc((void**)&hostMem, sizeof(float) * numMaxFloats, hipHostAllocDefault); } } else @@ -217,9 +217,9 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) float *hostMem2; if (p_pinned) { - hipMallocHost((void**)&hostMem1, sizeof(float)*numMaxFloats); + hipHostAlloc((void**)&hostMem1, sizeof(float)*numMaxFloats, hipHostAllocDefault); hipError_t err1 = hipGetLastError(); - hipMallocHost((void**)&hostMem2, sizeof(float)*numMaxFloats); + hipHostAlloc((void**)&hostMem2, sizeof(float)*numMaxFloats, hipHostAllocDefault); hipError_t err2 = hipGetLastError(); while (err1 != hipSuccess || err2 != hipSuccess) { From 93c30afe2f8cc84a70eec27cdb10eaafdb46eca4 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 15 Mar 2016 12:37:24 -0500 Subject: [PATCH 022/700] added performance metrics for kernel dispatch --- samples/1_Utils/hipDispatchLatency/Makefile | 15 +++ .../hipDispatchLatency/hipDispatchLatency.cpp | 120 ++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 samples/1_Utils/hipDispatchLatency/Makefile create mode 100644 samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp diff --git a/samples/1_Utils/hipDispatchLatency/Makefile b/samples/1_Utils/hipDispatchLatency/Makefile new file mode 100644 index 000000000..b4543e6f9 --- /dev/null +++ b/samples/1_Utils/hipDispatchLatency/Makefile @@ -0,0 +1,15 @@ +HIP_PATH?=$(shell hipconfig -p) +HIPCC=$(HIP_PATH)/bin/hipcc + +EXE=hipDispatchLatency + +all: install + +$(EXE): hipDispatchLatency.cpp + $(HIPCC) hipDispatchLatency.cpp -o $@ + +install: $(EXE) + cp $(EXE) $(HIP_PATH)/bin + +clean: + rm -f *.o $(EXE) diff --git 
a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp new file mode 100644 index 000000000..fbdbced06 --- /dev/null +++ b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp @@ -0,0 +1,120 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include"hip_runtime.h" +#include +#include + +#define check(msg, status) \ +if(status != hipSuccess){ \ + printf("%s failed.\n",#msg); \ + exit(1); \ +} + +#define LEN 1024*1024 +#define SIZE LEN * sizeof(float) +#define ITER 10000 + +__global__ void One(hipLaunchParm lp, float* Ad){ +} + +int main(){ + +hipError_t err; +float *A, *Ad; + +A = new float[LEN]; + +for(int i=0;i Date: Tue, 15 Mar 2016 21:05:15 -0500 Subject: [PATCH 023/700] Added single kernel launch to sample --- .../hipDispatchLatency/hipDispatchLatency.cpp | 173 ++++++++++-------- 1 file changed, 92 insertions(+), 81 deletions(-) diff --git a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp index fbdbced06..a47931958 100644 --- a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp +++ b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp @@ -36,85 +36,96 @@ __global__ void One(hipLaunchParm lp, float* Ad){ int main(){ -hipError_t err; -float *A, *Ad; - -A = new float[LEN]; - -for(int i=0;i Date: Tue, 15 Mar 2016 14:22:00 -0500 Subject: [PATCH 024/700] corrected first and second kernel dispatch --- samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp index a47931958..9827b9422 100644 --- a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp +++ b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp @@ -64,13 +64,13 @@ int main(){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); - std::cout<<"First Kernel Launch: \t\t"< Date: Fri, 18 Mar 2016 03:09:52 -0500 Subject: [PATCH 025/700] Supported --aliged mode. Add results check for H2D and D2H. 
--- .../hipBusBandwidth/hipBusBandwidth.cpp | 56 +++++++++++++++---- 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index a43bee77e..6e875667f 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -13,6 +13,7 @@ int p_iterations = 10; int p_device = 0; int p_detailed = 0; bool p_async = 0; +bool p_alignedhost = 1; bool p_h2d = true; bool p_d2h = true; @@ -120,7 +121,11 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) } else { - hostMem = new float[numMaxFloats]; + if (p_alignedhost) { + hostMem = (float*)aligned_alloc(64, numMaxFloats*sizeof(float)); + } else { + hostMem = new float[numMaxFloats]; + } } for (int i = 0; i < numMaxFloats; i++) @@ -189,6 +194,21 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) } } + // Check. First reset the host memory, then copy-back result. Then compare against original ref value. + for (int i = 0; i < numMaxFloats; i++) + { + hostMem[i] = 0; + } + hipMemcpy(hostMem, device, numMaxFloats*sizeof(float), hipMemcpyDeviceToHost); + for (int i = 0; i < numMaxFloats; i++) + { + float ref = i % 77; + if (ref != hostMem[i]) { + printf ("error: H2D. 
i=%d reference:%6.f != copyback:%6.2f\n", i, ref, hostMem[i]); + } + } + + // Cleanup hipFree((void*)device); CHECK_HIP_ERROR(); @@ -199,7 +219,11 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) } else { - delete[] hostMem; + if (p_alignedhost) { + delete[] hostMem; + } else { + free(hostMem); + } } hipEventDestroy(start); hipEventDestroy(stop); @@ -254,15 +278,15 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) hipMalloc((void**)&device, sizeof(float) * numMaxFloats); while (hipGetLastError() != hipSuccess) { - // drop the size and try again - if (p_verbose) std::cout << " - dropping size allocating device mem\n"; - --nSizes; - if (nSizes < 1) - { - std::cerr << "Error: Couldn't allocated any device buffer\n"; - return; - } - numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating device mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocated any device buffer\n"; + return; + } + numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; hipMalloc((void**)&device, sizeof(float) * numMaxFloats); } @@ -316,6 +340,16 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) //resultDB.AddResult("ReadbackLatencyEstimate", "2-4kb", "ms", times[1]-(times[2]-times[1])/1.); } + + // Check. First reset the host memory, then copy-back result. Then compare against original ref value. + for (int i = 0; i < numMaxFloats; i++) + { + float ref = i % 77; + if (ref != hostMem2[i]) { + printf ("error: D2H. 
i=%d reference:%6.f != copyback:%6.2f\n", i, ref, hostMem2[i]); + } + } + // Cleanup hipFree((void*)device); CHECK_HIP_ERROR(); From 62fb06f54e54416aa7f676a1d6ecfb3c760c334f Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 18 Mar 2016 21:28:29 -0500 Subject: [PATCH 026/700] Print Pinned or Unpinned in result summary --- samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 6e875667f..7e12da9f3 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -10,6 +10,7 @@ bool p_verbose = false; bool p_pinned = true; int p_iterations = 10; +int p_beatsperiteration=1; int p_device = 0; int p_detailed = 0; bool p_async = 0; @@ -189,8 +190,8 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) double speed = (double(sizeToBytes(sizes[sizeIndex])) / (1000*1000)) / t; char sizeStr[256]; sprintf(sizeStr, "%9s", sizeToString(sizes[sizeIndex]).c_str()); - resultDB.AddResult("H2D_Bandwidth", sizeStr, "GB/sec", speed); - resultDB.AddResult("H2D_Time", sizeStr, "ms", t); + resultDB.AddResult(std::string("H2D_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("H2D_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t); } } @@ -332,8 +333,8 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) double speed = (double(sizeToBytes(sizes[sizeIndex])) / (1000*1000)) / t; char sizeStr[256]; sprintf(sizeStr, "%9s", sizeToString(sizes[sizeIndex]).c_str()); - resultDB.AddResult("D2H_Bandwidth", sizeStr, "GB/sec", speed); - resultDB.AddResult("D2H_Time", sizeStr, "ms", t); + resultDB.AddResult(std::string("D2H_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("D2H_Time") + (p_pinned ? 
"_Pinned" : "_Unpinned"), sizeStr, "ms", t); } //resultDB.AddResult("ReadbackLatencyEstimate", "1-2kb", "ms", times[0]-(times[1]-times[0])/1.); //resultDB.AddResult("ReadbackLatencyEstimate", "1-4kb", "ms", times[0]-(times[2]-times[0])/3.); @@ -484,8 +485,8 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB) double speed = (double(sizeToBytes(sizes[sizeIndex])) / (1000*1000)) / t; char sizeStr[256]; sprintf(sizeStr, "%9s", sizeToString(sizes[sizeIndex]).c_str()); - resultDB.AddResult("Bidir_Bandwidth", sizeStr, "GB/sec", speed); - resultDB.AddResult("Bidir_Time", sizeStr, "ms", t); + resultDB.AddResult(std::string("Bidir_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("Bidir_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t); } } From f7e2c254dfa27b2e9d5ae43b092b5af661dddca3 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 18 Mar 2016 23:43:04 -0500 Subject: [PATCH 027/700] Improve formatting - line up cols --- samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp b/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp index 7d2f3aef8..b6ca68e57 100644 --- a/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp +++ b/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp @@ -278,7 +278,8 @@ void ResultDatabase::DumpSummary(ostream &out) { vector sorted(results); - int testW = 15 ; + const int testNameW = 24 ; + const int attW = 15; const int fieldW = 9; sort(sorted.begin(), sorted.end()); @@ -287,8 +288,9 @@ void ResultDatabase::DumpSummary(ostream &out) // TODO: in big parallel runs, the "trials" are the procs // and we really don't want to print them all out.... 
- out << setw(testW) << "test\t" << setw(fieldW) - << "atts\t" + out << setw(testNameW) << "test\t" + << setw(attW) << "atts\t" + << setw(fieldW) << "units\t" << "median\t" << "mean\t" @@ -300,9 +302,9 @@ void ResultDatabase::DumpSummary(ostream &out) for (int i=0; i Date: Sat, 19 Mar 2016 02:43:04 -0500 Subject: [PATCH 028/700] Add beastperiteration and onesize for testing. onesize allows running tests at one specific size. --- .../hipBusBandwidth/ResultDatabase.cpp | 24 ++-- .../hipBusBandwidth/hipBusBandwidth.cpp | 105 ++++++++++++------ 2 files changed, 85 insertions(+), 44 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp b/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp index b6ca68e57..2ec686f26 100644 --- a/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp +++ b/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp @@ -189,9 +189,13 @@ void ResultDatabase::AddResult(const string &test_orig, void ResultDatabase::DumpDetailed(ostream &out) { vector sorted(results); - sort(sorted.begin(), sorted.end()); + const int testNameW = 24 ; + const int attW = 12; + const int fieldW = 11; + out << std::fixed << right << std::setprecision(4); + int maxtrials = 1; for (int i=0; i sorted(results); + sort(sorted.begin(), sorted.end()); const int testNameW = 24 ; - const int attW = 15; + const int attW = 12; const int fieldW = 9; - - sort(sorted.begin(), sorted.end()); - out << std::fixed << right << std::setprecision(4); // TODO: in big parallel runs, the "trials" are the procs diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 7e12da9f3..1ac299008 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -14,13 +14,16 @@ int p_beatsperiteration=1; int p_device = 0; int p_detailed = 0; bool p_async = 0; -bool p_alignedhost = 1; +int p_alignedhost = 0; // align host allocs to this granularity, in bytes. 
64 or 4096 are good values to try. +int p_onesize = 0; bool p_h2d = true; bool p_d2h = true; bool p_bidir = true; + + #define CHECK_HIP_ERROR() \ { \ hipError_t err = hipGetLastError(); \ @@ -46,8 +49,8 @@ std::string sizeToString(int size) using namespace std; stringstream ss; if (size < 0) { - // char (01) sorts before " " so will cause Byte values to be displayed before kB. - ss << char(0x1) << setfill('0') << setw(3) << -size << "B"; + // char (09, horiz tab) lexically sorts before " " so will cause Byte values to be displayed before kB. + ss << char(0x09)/*tab*/ << setfill('0') << setw(3) << -size << "B"; } else { ss << size << "kB"; } @@ -123,7 +126,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) else { if (p_alignedhost) { - hostMem = (float*)aligned_alloc(64, numMaxFloats*sizeof(float)); + hostMem = (float*)aligned_alloc(p_alignedhost, numMaxFloats*sizeof(float)); } else { hostMem = new float[numMaxFloats]; } @@ -170,10 +173,13 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) else sizeIndex = (nSizes - 1) - i; - int nbytes = sizeToBytes(sizes[sizeIndex]); + const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); hipEventRecord(start, 0); - memcopy(device, hostMem, nbytes, hipMemcpyHostToDevice); + for (int j=0;j1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } resultDB.AddResult(std::string("H2D_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed); resultDB.AddResult(std::string("H2D_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t); + + if (p_onesize) { + break; + } } } + if (p_onesize) { + numMaxFloats = sizeToBytes(p_onesize) / sizeof(float); + } + // Check. First reset the host memory, then copy-back result. Then compare against original ref value. 
for (int i = 0; i < numMaxFloats; i++) { @@ -313,10 +330,13 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) else sizeIndex = (nSizes - 1) - i; - int nbytes = sizeToBytes(sizes[sizeIndex]); + const int thisSize = p_onesize ? p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); hipEventRecord(start, 0); - memcopy(hostMem2, device, nbytes, hipMemcpyDeviceToHost); + for (int j=0;j1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } resultDB.AddResult(std::string("D2H_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed); resultDB.AddResult(std::string("D2H_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t); + if (p_onesize) { + break; + } } - //resultDB.AddResult("ReadbackLatencyEstimate", "1-2kb", "ms", times[0]-(times[1]-times[0])/1.); - //resultDB.AddResult("ReadbackLatencyEstimate", "1-4kb", "ms", times[0]-(times[2]-times[0])/3.); - //resultDB.AddResult("ReadbackLatencyEstimate", "2-4kb", "ms", times[1]-(times[2]-times[1])/1.); } - + if (p_onesize) { + numMaxFloats = sizeToBytes(p_onesize) / sizeof(float); + } // Check. First reset the host memory, then copy-back result. Then compare against original ref value. for (int i = 0; i < numMaxFloats; i++) { @@ -464,7 +491,8 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB) else sizeIndex = (nSizes - 1) - i; - int nbytes = sizeToBytes(sizes[sizeIndex]); + const int thisSize = p_onesize ? 
p_onesize : sizes[sizeIndex]; + const int nbytes = sizeToBytes(thisSize); hipEventRecord(start, 0); hipMemcpyAsync(deviceMem[0], hostMem[0], nbytes, hipMemcpyHostToDevice, stream[0]); @@ -473,18 +501,17 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB) hipEventSynchronize(stop); float t = 0; hipEventElapsedTime(&t, start, stop); - //times[sizeIndex] = t; // Convert to GB/sec if (p_verbose) { - std::cerr << "size " << sizeToString(sizes[sizeIndex]) << " took " << t << + std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; } - double speed = (double(sizeToBytes(sizes[sizeIndex])) / (1000*1000)) / t; + double speed = (double(sizeToBytes(thisSize)) / (1000*1000)) / t; char sizeStr[256]; - sprintf(sizeStr, "%9s", sizeToString(sizes[sizeIndex]).c_str()); + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); resultDB.AddResult(std::string("Bidir_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed); resultDB.AddResult(std::string("Bidir_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t); } @@ -535,15 +562,18 @@ void printConfig() { void help() { printf ("Usage: hipBusBandwidth [OPTIONS]\n"); - printf (" --iterations, -i : Number of copy iterations to run.\n"); - printf (" --device, -d : Device ID to use (0..numDevices).\n"); - printf (" --unpinned : Use unpinned host memory.\n"); - printf (" --d2h : Run only device-to-host test.\n"); - printf (" --h2d : Run only host-to-device test.\n"); - printf (" --bidir : Run only bidir copy test.\n"); - printf (" --verbose : Print verbose status messages as test is run.\n"); - printf (" --detailed : Print detailed report (including all trials).\n"); - printf (" --async : Use hipMemcpyAsync(with NULL stream) for H2D/D2H. 
Default uses hipMemcpy.\n"); + printf (" --iterations, -i : Number of copy iterations to run.\n"); + printf (" --beatsperiterations, -b : Number of beats (back-to-back copies of same size) per iteration to run.\n"); + printf (" --device, -d : Device ID to use (0..numDevices).\n"); + printf (" --unpinned : Use unpinned host memory.\n"); + printf (" --d2h : Run only device-to-host test.\n"); + printf (" --h2d : Run only host-to-device test.\n"); + printf (" --bidir : Run only bidir copy test.\n"); + printf (" --verbose : Print verbose status messages as test is run.\n"); + printf (" --detailed : Print detailed report (including all trials).\n"); + + printf (" --async : Use hipMemcpyAsync(with NULL stream) for H2D/D2H. Default uses hipMemcpy.\n"); + printf (" --onesize, -o : Only run one measurement, at specified size (in KB, or if negative in bytes)\n"); }; @@ -558,10 +588,18 @@ int parseStandardArguments(int argc, char *argv[]) if (++i >= argc || !parseInt(argv[i], &p_iterations)) { failed("Bad iterations argument"); } + } else if (!strcmp(arg, "--beatsperiteration") || (!strcmp(arg, "-b"))) { + if (++i >= argc || !parseInt(argv[i], &p_beatsperiteration)) { + failed("Bad beatsperiteration argument"); + } } else if (!strcmp(arg, "--device") || (!strcmp(arg, "-d"))) { if (++i >= argc || !parseInt(argv[i], &p_device)) { failed("Bad device argument"); } + } else if (!strcmp(arg, "--onesize") || (!strcmp(arg, "-o"))) { + if (++i >= argc || !parseInt(argv[i], &p_onesize)) { + failed("Bad onesize argument"); + } } else if (!strcmp(arg, "--unpinned")) { p_pinned = 0; } else if (!strcmp(arg, "--h2d")) { @@ -583,6 +621,7 @@ int parseStandardArguments(int argc, char *argv[]) help(); exit(EXIT_SUCCESS); + } else if (!strcmp(arg, "--verbose")) { p_verbose = 1; } else if (!strcmp(arg, "--async")) { From 3b3bae37721ba904d0dd8f1ef59c3882dc741c8f Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 22 Mar 2016 02:30:10 -0500 Subject: [PATCH 029/700] hipHostRegister and hipHostMalloc 
refactor. Note hipHostMalloc (not hipHostAlloc or hipMallocHost). - the hipHost* is used for all HIP APIs dealing with Host memory. (including hipHostMalloc, hipHostFree, hipHostRegister, hipHostUnregister, hipHostGetFlags, hipHostGetDevicePointer). - hipMallocHost is consistent with "hipMalloc" for allocating device memory. Enumerations hipHostMalloc* also used as optional flags parm to hipHostMalloc. --- .../hipBusBandwidth/hipBusBandwidth.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 25b14305d..2230a8eef 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -108,7 +108,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) float *hostMem = NULL; if (p_pinned) { - hipHostAlloc((void**)&hostMem, sizeof(float) * numMaxFloats, hipHostAllocDefault); + hipHostMalloc((void**)&hostMem, sizeof(float) * numMaxFloats); while (hipGetLastError() != hipSuccess) { // drop the size and try again @@ -120,7 +120,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) return; } numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; - hipHostAlloc((void**)&hostMem, sizeof(float) * numMaxFloats, hipHostAllocDefault); + hipHostMalloc((void**)&hostMem, sizeof(float) * numMaxFloats); } } else @@ -232,7 +232,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) CHECK_HIP_ERROR(); if (p_pinned) { - hipFreeHost((void*)hostMem); + hipHostFree((void*)hostMem); CHECK_HIP_ERROR(); } else @@ -259,15 +259,15 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) float *hostMem2; if (p_pinned) { - hipHostAlloc((void**)&hostMem1, sizeof(float)*numMaxFloats, hipHostAllocDefault); + hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats); hipError_t err1 = hipGetLastError(); - hipHostAlloc((void**)&hostMem2, sizeof(float)*numMaxFloats, hipHostAllocDefault); + 
hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats); hipError_t err2 = hipGetLastError(); while (err1 != hipSuccess || err2 != hipSuccess) { // free the first buffer if only the second failed if (err1 == hipSuccess) - hipFreeHost((void*)hostMem1); + hipHostFree((void*)hostMem1); // drop the size and try again if (p_verbose) std::cout << " - dropping size allocating pinned mem\n"; @@ -383,9 +383,9 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) CHECK_HIP_ERROR(); if (p_pinned) { - hipFreeHost((void*)hostMem1); + hipHostFree((void*)hostMem1); CHECK_HIP_ERROR(); - hipFreeHost((void*)hostMem2); + hipHostFree((void*)hostMem2); CHECK_HIP_ERROR(); } else @@ -523,8 +523,8 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB) CHECK_HIP_ERROR(); if (p_pinned) { - hipFreeHost((void*)hostMem[0]); - hipFreeHost((void*)hostMem[1]); + hipHostFree((void*)hostMem[0]); + hipHostFree((void*)hostMem[1]); CHECK_HIP_ERROR(); } else From c1dd930c92626bf8cdafa1533922d449de82641b Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 22 Mar 2016 09:27:10 -0500 Subject: [PATCH 030/700] Only include activity logger if CodeXL installed. Fix hipHostMalloc in hipBusBandwidth. 
--- samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 2230a8eef..faff9ba6e 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -278,9 +278,9 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) return; } numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; - hipMallocHost((void**)&hostMem1, sizeof(float)*numMaxFloats); + hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats); err1 = hipGetLastError(); - hipMallocHost((void**)&hostMem2, sizeof(float)*numMaxFloats); + hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats); err2 = hipGetLastError(); } } @@ -413,8 +413,8 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB) { while (1) { - hipError_t e1 = hipMallocHost((void**)&hostMem[0], sizeof(float) * numMaxFloats); - hipError_t e2 = hipMallocHost((void**)&hostMem[1], sizeof(float) * numMaxFloats); + hipError_t e1 = hipHostMalloc((void**)&hostMem[0], sizeof(float) * numMaxFloats); + hipError_t e2 = hipHostMalloc((void**)&hostMem[1], sizeof(float) * numMaxFloats); if ((e1 == hipSuccess) && (e2 == hipSuccess)) { break; From 4992ccfea6cbfe8e869442a680d3392478747087 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 23 Mar 2016 11:39:57 -0500 Subject: [PATCH 031/700] Logging dispatch latency through database util --- samples/1_Utils/hipDispatchLatency/Makefile | 2 +- .../hipDispatchLatency/ResultDatabase.cpp | 527 ++++++++++++++++++ .../hipDispatchLatency/ResultDatabase.h | 100 ++++ .../hipDispatchLatency/hipDispatchLatency.cpp | 37 +- 4 files changed, 655 insertions(+), 11 deletions(-) create mode 100644 samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp create mode 100644 samples/1_Utils/hipDispatchLatency/ResultDatabase.h diff --git a/samples/1_Utils/hipDispatchLatency/Makefile 
b/samples/1_Utils/hipDispatchLatency/Makefile index b4543e6f9..4b33a0ff6 100644 --- a/samples/1_Utils/hipDispatchLatency/Makefile +++ b/samples/1_Utils/hipDispatchLatency/Makefile @@ -6,7 +6,7 @@ EXE=hipDispatchLatency all: install $(EXE): hipDispatchLatency.cpp - $(HIPCC) hipDispatchLatency.cpp -o $@ + $(HIPCC) hipDispatchLatency.cpp ResultDatabase.cpp -o $@ install: $(EXE) cp $(EXE) $(HIP_PATH)/bin diff --git a/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp b/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp new file mode 100644 index 000000000..2ec686f26 --- /dev/null +++ b/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp @@ -0,0 +1,527 @@ +#include "ResultDatabase.h" + +#include +#include +#include +#include + +using namespace std; + +bool ResultDatabase::Result::operator<(const Result &rhs) const +{ + if (test < rhs.test) + return true; + if (test > rhs.test) + return false; + if (atts < rhs.atts) + return true; + if (atts > rhs.atts) + return false; + return false; // less-operator returns false on equal +} + +double ResultDatabase::Result::GetMin() const +{ + double r = FLT_MAX; + for (int i=0; i= 100) + return value[n-1]; + + double index = ((n + 1.) * q / 100.) - 1; + + vector sorted = value; + sort(sorted.begin(), sorted.end()); + + if (n == 2) + return (sorted[0] * (1 - q/100.) 
+ sorted[1] * (q/100.)); + + int index_lo = int(index); + double frac = index - index_lo; + if (frac == 0) + return sorted[index_lo]; + + double lo = sorted[index_lo]; + double hi = sorted[index_lo + 1]; + return lo + (hi-lo)*frac; +} + +double ResultDatabase::Result::GetMean() const +{ + double r = 0; + for (int i=0; i &values) +{ + for (int i=0; i= results.size()) + { + Result r; + r.test = test; + r.atts = atts; + r.unit = unit; + results.push_back(r); + } + + results[index].value.push_back(value); +} + +// **************************************************************************** +// Method: ResultDatabase::DumpDetailed +// +// Purpose: +// Writes the full results, including all trials. +// +// Arguments: +// out where to print +// +// Programmer: Jeremy Meredith +// Creation: August 14, 2009 +// +// Modifications: +// Jeremy Meredith, Wed Nov 10 14:25:17 EST 2010 +// Renamed to DumpDetailed to make room for a DumpSummary. +// +// Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010 +// Added note about (*) missing value tag. +// +// Jeremy Meredith, Tue Nov 23 13:57:02 EST 2010 +// Changed note about missing values to be worded a little better. +// +// **************************************************************************** +void ResultDatabase::DumpDetailed(ostream &out) +{ + vector sorted(results); + sort(sorted.begin(), sorted.end()); + + const int testNameW = 24 ; + const int attW = 12; + const int fieldW = 11; + out << std::fixed << right << std::setprecision(4); + + int maxtrials = 1; + for (int i=0; i maxtrials) + maxtrials = sorted[i].value.size(); + } + + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... 
+ out << setw(testNameW) << "test\t" + << setw(attW) << "atts\t" + << setw(fieldW) + << "median\t" + << "mean\t" + << "stddev\t" + << "min\t" + << "max\t"; + for (int i=0; i sorted(results); + sort(sorted.begin(), sorted.end()); + + const int testNameW = 24 ; + const int attW = 12; + const int fieldW = 9; + out << std::fixed << right << std::setprecision(4); + + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... + out << setw(testNameW) << "test\t" + << setw(attW) << "atts\t" + << setw(fieldW) + << "units\t" + << "median\t" + << "mean\t" + << "stddev\t" + << "min\t" + << "max\t"; + out << endl; + + for (int i=0; i sorted(results); + + sort(sorted.begin(), sorted.end()); + + //Check to see if the file is empty - if so, add the headers + emptyFile = this->IsFileEmpty(fileName); + + //Open file and append by default + ofstream out; + out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app); + + //Add headers only for empty files + if(emptyFile) + { + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... 
+ out << "test, " + << "atts, " + << "units, " + << "median, " + << "mean, " + << "stddev, " + << "min, " + << "max, "; + out << endl; + } + + for (int i=0; i +ResultDatabase::GetResultsForTest(const string &test) +{ + // get only the given test results + vector retval; + for (int i=0; i & +ResultDatabase::GetResults() const +{ + return results; +} diff --git a/samples/1_Utils/hipDispatchLatency/ResultDatabase.h b/samples/1_Utils/hipDispatchLatency/ResultDatabase.h new file mode 100644 index 000000000..4b63a02a1 --- /dev/null +++ b/samples/1_Utils/hipDispatchLatency/ResultDatabase.h @@ -0,0 +1,100 @@ +#ifndef RESULT_DATABASE_H +#define RESULT_DATABASE_H + +#include +#include +#include +#include +#include +using std::string; +using std::vector; +using std::ostream; +using std::ofstream; +using std::ifstream; + + +// **************************************************************************** +// Class: ResultDatabase +// +// Purpose: +// Track numerical results as they are generated. +// Print statistics of raw results. +// +// Programmer: Jeremy Meredith +// Creation: June 12, 2009 +// +// Modifications: +// Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010 +// Split timing reports into detailed and summary. E.g. for serial code, +// we might report all trial values, but skip them in parallel. +// +// Jeremy Meredith, Thu Nov 11 11:40:18 EST 2010 +// Added check for missing value tag. +// +// Jeremy Meredith, Mon Nov 22 13:37:10 EST 2010 +// Added percentile statistic. +// +// Jeremy Meredith, Fri Dec 3 16:30:31 EST 2010 +// Added a method to extract a subset of results based on test name. Also, +// the Result class is now public, so that clients can use them directly. +// Added a GetResults method as well, and made several functions const. +// +// **************************************************************************** +class ResultDatabase +{ + public: + // + // A performance result for a single SHOC benchmark run. + // + struct Result + { + string test; // e.g. 
"readback" + string atts; // e.g. "pagelocked 4k^2" + string unit; // e.g. "MB/sec" + vector value; // e.g. "837.14" + double GetMin() const; + double GetMax() const; + double GetMedian() const; + double GetPercentile(double q) const; + double GetMean() const; + double GetStdDev() const; + + bool operator<(const Result &rhs) const; + + bool HadAnyFLTMAXValues() const + { + for (int i=0; i= FLT_MAX) + return true; + } + return false; + } + }; + + protected: + vector results; + + public: + void AddResult(const string &test, + const string &atts, + const string &unit, + double value); + void AddResults(const string &test, + const string &atts, + const string &unit, + const vector &values); + vector GetResultsForTest(const string &test); + const vector &GetResults() const; + void ClearAllResults(); + void DumpDetailed(ostream&); + void DumpSummary(ostream&); + void DumpCsv(string fileName); + + private: + bool IsFileEmpty(string fileName); + +}; + + +#endif diff --git a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp index 9827b9422..212fc6b3b 100644 --- a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp +++ b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp @@ -20,6 +20,7 @@ THE SOFTWARE. 
#include"hip_runtime.h" #include #include +#include"ResultDatabase.h" #define check(msg, status) \ if(status != hipSuccess){ \ @@ -60,18 +61,22 @@ int main(){ hipEventCreate(&start); hipEventCreate(&stop); + ResultDatabase resultDB[8]; + hipEventRecord(start); hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); - std::cout<<"First Kernel Launch: \t\t"< Date: Tue, 29 Mar 2016 16:02:09 +0800 Subject: [PATCH 032/700] change makefile for samples --- samples/0_Intro/bit_extract/Makefile | 2 +- samples/0_Intro/square/Makefile | 2 +- samples/1_Utils/hipBusBandwidth/Makefile | 2 +- samples/1_Utils/hipDispatchLatency/Makefile | 2 +- samples/1_Utils/hipInfo/Makefile | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/samples/0_Intro/bit_extract/Makefile b/samples/0_Intro/bit_extract/Makefile index d0a2d4bc7..39fb5cf8c 100644 --- a/samples/0_Intro/bit_extract/Makefile +++ b/samples/0_Intro/bit_extract/Makefile @@ -1,6 +1,6 @@ #Dependencies : [MYHIP]/bin must be in user's path. -HIP_PATH?=$(shell hipconfig --path) +HIP_PATH=../../.. HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) HIPCC=$(HIP_PATH)/bin/hipcc diff --git a/samples/0_Intro/square/Makefile b/samples/0_Intro/square/Makefile index 9fb03d867..98ee0be4f 100644 --- a/samples/0_Intro/square/Makefile +++ b/samples/0_Intro/square/Makefile @@ -1,4 +1,4 @@ -HIP_PATH?=$(shell hipconfig --path) +HIP_PATH=../../.. HIPCC=$(HIP_PATH)/bin/hipcc all: square.hip.out diff --git a/samples/1_Utils/hipBusBandwidth/Makefile b/samples/1_Utils/hipBusBandwidth/Makefile index 77a92fb1a..a713379d8 100644 --- a/samples/1_Utils/hipBusBandwidth/Makefile +++ b/samples/1_Utils/hipBusBandwidth/Makefile @@ -1,4 +1,4 @@ -HIP_PATH?=$(shell hipconfig -p) +HIP_PATH=../../.. 
HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipBusBandwidth diff --git a/samples/1_Utils/hipDispatchLatency/Makefile b/samples/1_Utils/hipDispatchLatency/Makefile index 4b33a0ff6..9b2d55811 100644 --- a/samples/1_Utils/hipDispatchLatency/Makefile +++ b/samples/1_Utils/hipDispatchLatency/Makefile @@ -1,4 +1,4 @@ -HIP_PATH?=$(shell hipconfig -p) +HIP_PATH=../../.. HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipDispatchLatency diff --git a/samples/1_Utils/hipInfo/Makefile b/samples/1_Utils/hipInfo/Makefile index a36b16be5..f38f157bc 100644 --- a/samples/1_Utils/hipInfo/Makefile +++ b/samples/1_Utils/hipInfo/Makefile @@ -1,4 +1,4 @@ -HIP_PATH?=$(shell hipconfig -p) +HIP_PATH=../../.. HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipInfo From 2433bca2b15c5bee1129a549f8b1647bc70c8ee8 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 4 Apr 2016 14:47:02 +0530 Subject: [PATCH 033/700] Remove deprecated KERNELBEGIN and KERNELEND from bit_extract sample --- samples/0_Intro/bit_extract/bit_extract.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/samples/0_Intro/bit_extract/bit_extract.cpp b/samples/0_Intro/bit_extract/bit_extract.cpp index 14b5be66d..5545a99c0 100644 --- a/samples/0_Intro/bit_extract/bit_extract.cpp +++ b/samples/0_Intro/bit_extract/bit_extract.cpp @@ -33,8 +33,6 @@ THE SOFTWARE. void __global__ bit_extract_kernel(hipLaunchParm lp, uint32_t *C_d, const uint32_t *A_d, size_t N) { - KERNELBEGIN; - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); size_t stride = hipBlockDim_x * hipGridDim_x ; @@ -45,8 +43,6 @@ bit_extract_kernel(hipLaunchParm lp, uint32_t *C_d, const uint32_t *A_d, size_t C_d[i] = ((A_d[i] & 0xf00) >> 8); #endif } - - KERNELEND; } From 3fbad4bca5695211b2e39327b3399bced663b7e0 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 8 Apr 2016 02:15:46 -0500 Subject: [PATCH 034/700] Print peers in hipConfig. Also include peer APIs in vim hilighting. 
--- samples/1_Utils/hipInfo/hipInfo.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index 146d17e01..581194f62 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -120,8 +120,25 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << endl; cout << setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << endl; cout << setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << endl; + + int deviceCnt; + hipGetDeviceCount(&deviceCnt); + cout << setw(w1) << "peers: "; + for (int i=0; i Date: Fri, 8 Apr 2016 02:17:29 -0500 Subject: [PATCH 035/700] Use HIP_PATH if set else use relative ../... --- samples/0_Intro/bit_extract/Makefile | 2 +- samples/0_Intro/square/Makefile | 2 +- samples/1_Utils/hipBusBandwidth/Makefile | 2 +- samples/1_Utils/hipDispatchLatency/Makefile | 2 +- samples/1_Utils/hipInfo/Makefile | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/samples/0_Intro/bit_extract/Makefile b/samples/0_Intro/bit_extract/Makefile index 39fb5cf8c..cdf793363 100644 --- a/samples/0_Intro/bit_extract/Makefile +++ b/samples/0_Intro/bit_extract/Makefile @@ -1,6 +1,6 @@ #Dependencies : [MYHIP]/bin must be in user's path. -HIP_PATH=../../.. +HIP_PATH=?../../.. HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) HIPCC=$(HIP_PATH)/bin/hipcc diff --git a/samples/0_Intro/square/Makefile b/samples/0_Intro/square/Makefile index 98ee0be4f..817c556b2 100644 --- a/samples/0_Intro/square/Makefile +++ b/samples/0_Intro/square/Makefile @@ -1,4 +1,4 @@ -HIP_PATH=../../.. +HIP_PATH?=../../.. 
HIPCC=$(HIP_PATH)/bin/hipcc all: square.hip.out diff --git a/samples/1_Utils/hipBusBandwidth/Makefile b/samples/1_Utils/hipBusBandwidth/Makefile index a713379d8..4599cacba 100644 --- a/samples/1_Utils/hipBusBandwidth/Makefile +++ b/samples/1_Utils/hipBusBandwidth/Makefile @@ -1,4 +1,4 @@ -HIP_PATH=../../.. +HIP_PATH?=../../.. HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipBusBandwidth diff --git a/samples/1_Utils/hipDispatchLatency/Makefile b/samples/1_Utils/hipDispatchLatency/Makefile index 9b2d55811..87e707923 100644 --- a/samples/1_Utils/hipDispatchLatency/Makefile +++ b/samples/1_Utils/hipDispatchLatency/Makefile @@ -1,4 +1,4 @@ -HIP_PATH=../../.. +HIP_PATH?=../../.. HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipDispatchLatency diff --git a/samples/1_Utils/hipInfo/Makefile b/samples/1_Utils/hipInfo/Makefile index f38f157bc..d69067388 100644 --- a/samples/1_Utils/hipInfo/Makefile +++ b/samples/1_Utils/hipInfo/Makefile @@ -1,4 +1,4 @@ -HIP_PATH=../../.. +HIP_PATH?=../../.. HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipInfo From c37a4c3d04d7fa0ae60692e8c7ad11e8a7edf486 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 9 Apr 2016 04:35:06 -0500 Subject: [PATCH 036/700] Remove stray debug msgs, hipInfo don't print self as peer. --- samples/1_Utils/hipInfo/hipInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index 581194f62..9151d5058 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -127,7 +127,7 @@ void printDeviceProp (int deviceId) for (int i=0; i Date: Fri, 8 Apr 2016 02:15:46 -0500 Subject: [PATCH 037/700] Print peers in hipConfig. Also include peer APIs in vim hilighting. 
--- samples/1_Utils/hipInfo/hipInfo.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index 146d17e01..581194f62 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -120,8 +120,25 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << endl; cout << setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << endl; cout << setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << endl; + + int deviceCnt; + hipGetDeviceCount(&deviceCnt); + cout << setw(w1) << "peers: "; + for (int i=0; i Date: Fri, 8 Apr 2016 02:17:29 -0500 Subject: [PATCH 038/700] Use HIP_PATH if set else use relative ../... --- samples/0_Intro/bit_extract/Makefile | 2 +- samples/0_Intro/square/Makefile | 2 +- samples/1_Utils/hipBusBandwidth/Makefile | 2 +- samples/1_Utils/hipDispatchLatency/Makefile | 2 +- samples/1_Utils/hipInfo/Makefile | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/samples/0_Intro/bit_extract/Makefile b/samples/0_Intro/bit_extract/Makefile index 39fb5cf8c..cdf793363 100644 --- a/samples/0_Intro/bit_extract/Makefile +++ b/samples/0_Intro/bit_extract/Makefile @@ -1,6 +1,6 @@ #Dependencies : [MYHIP]/bin must be in user's path. -HIP_PATH=../../.. +HIP_PATH=?../../.. HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) HIPCC=$(HIP_PATH)/bin/hipcc diff --git a/samples/0_Intro/square/Makefile b/samples/0_Intro/square/Makefile index 98ee0be4f..817c556b2 100644 --- a/samples/0_Intro/square/Makefile +++ b/samples/0_Intro/square/Makefile @@ -1,4 +1,4 @@ -HIP_PATH=../../.. +HIP_PATH?=../../.. 
HIPCC=$(HIP_PATH)/bin/hipcc all: square.hip.out diff --git a/samples/1_Utils/hipBusBandwidth/Makefile b/samples/1_Utils/hipBusBandwidth/Makefile index a713379d8..4599cacba 100644 --- a/samples/1_Utils/hipBusBandwidth/Makefile +++ b/samples/1_Utils/hipBusBandwidth/Makefile @@ -1,4 +1,4 @@ -HIP_PATH=../../.. +HIP_PATH?=../../.. HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipBusBandwidth diff --git a/samples/1_Utils/hipDispatchLatency/Makefile b/samples/1_Utils/hipDispatchLatency/Makefile index 9b2d55811..87e707923 100644 --- a/samples/1_Utils/hipDispatchLatency/Makefile +++ b/samples/1_Utils/hipDispatchLatency/Makefile @@ -1,4 +1,4 @@ -HIP_PATH=../../.. +HIP_PATH?=../../.. HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipDispatchLatency diff --git a/samples/1_Utils/hipInfo/Makefile b/samples/1_Utils/hipInfo/Makefile index f38f157bc..d69067388 100644 --- a/samples/1_Utils/hipInfo/Makefile +++ b/samples/1_Utils/hipInfo/Makefile @@ -1,4 +1,4 @@ -HIP_PATH=../../.. +HIP_PATH?=../../.. HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipInfo From 565300acb0317991450730654afe196c8ba5c91a Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 9 Apr 2016 04:35:06 -0500 Subject: [PATCH 039/700] Remove stray debug msgs, hipInfo don't print self as peer. --- samples/1_Utils/hipInfo/hipInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index 581194f62..9151d5058 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -127,7 +127,7 @@ void printDeviceProp (int deviceId) for (int i=0; i Date: Mon, 11 Apr 2016 12:52:18 -0500 Subject: [PATCH 040/700] P2p checkpoint. - set USE_PEER_TO_PEER=3 (requires HCC "am_memtracker_update_peers") - when enabling peer, turn it on for previously allocated memory. - hipDeviceCanAccessPeer is no longer self-ware (self does not qualify as a peer) - device peerlist always includes self, so when we call allow_access we never remove self access. 
- hipDeviceReset() removes old peer mappings. --- samples/1_Utils/hipInfo/hipInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index 9151d5058..581194f62 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -127,7 +127,7 @@ void printDeviceProp (int deviceId) for (int i=0; i Date: Mon, 11 Apr 2016 07:47:22 -0500 Subject: [PATCH 041/700] fix peer query order --- samples/1_Utils/hipInfo/hipInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index 581194f62..2b6e12990 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -126,7 +126,7 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "peers: "; for (int i=0; i Date: Mon, 11 Apr 2016 12:52:18 -0500 Subject: [PATCH 042/700] P2p checkpoint. - set USE_PEER_TO_PEER=3 (requires HCC "am_memtracker_update_peers") - when enabling peer, turn it on for previously allocated memory. - hipDeviceCanAccessPeer is no longer self-ware (self does not qualify as a peer) - device peerlist always includes self, so when we call allow_access we never remove self access. - hipDeviceReset() removes old peer mappings. 
--- samples/1_Utils/hipInfo/hipInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index 9151d5058..581194f62 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -127,7 +127,7 @@ void printDeviceProp (int deviceId) for (int i=0; i Date: Wed, 13 Apr 2016 17:32:38 -0500 Subject: [PATCH 043/700] add hcc dialects sample --- samples/0_Intro/hcc_dialects/Makefile | 66 +++++++++++++++++++ .../hcc_dialects/vadd_amp_arrayview.cpp | 48 ++++++++++++++ samples/0_Intro/hcc_dialects/vadd_hc_am.cpp | 59 +++++++++++++++++ .../0_Intro/hcc_dialects/vadd_hc_array.cpp | 53 +++++++++++++++ samples/0_Intro/hcc_dialects/vadd_hc_array.hc | 33 ++++++++++ .../hcc_dialects/vadd_hc_arrayview.cpp | 48 ++++++++++++++ samples/0_Intro/hcc_dialects/vadd_hip.cpp | 51 ++++++++++++++ 7 files changed, 358 insertions(+) create mode 100644 samples/0_Intro/hcc_dialects/Makefile create mode 100644 samples/0_Intro/hcc_dialects/vadd_amp_arrayview.cpp create mode 100644 samples/0_Intro/hcc_dialects/vadd_hc_am.cpp create mode 100644 samples/0_Intro/hcc_dialects/vadd_hc_array.cpp create mode 100644 samples/0_Intro/hcc_dialects/vadd_hc_array.hc create mode 100644 samples/0_Intro/hcc_dialects/vadd_hc_arrayview.cpp create mode 100644 samples/0_Intro/hcc_dialects/vadd_hip.cpp diff --git a/samples/0_Intro/hcc_dialects/Makefile b/samples/0_Intro/hcc_dialects/Makefile new file mode 100644 index 000000000..108d30201 --- /dev/null +++ b/samples/0_Intro/hcc_dialects/Makefile @@ -0,0 +1,66 @@ +HCC_HOME?=/opt/rocm/hcc +HCC = $(HCC_HOME)/bin/hcc + +HCC_CFLAGS= `$(HCC_HOME)/bin/hcc-config --cxxflags` +HCC_LDFLAGS= `$(HCC_HOME)/bin/hcc-config --ldflags` + +CPPAMP_CFLAGS= -std=c++amp -stdlib=libc++ -I/opt/hcc/include +CPPAMP_LDFLAGS= -std=c++amp -L/opt/hcc/lib -Wl,--rpath=/opt/hcc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive + 
+HIP_PATH?=/opt/rocm/hip +HIPCC=$(HIP_PATH)/bin/hipcc +HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) + +ifneq (${HIP_PLATFORM}, hcc) +$(error hcc_dialects requires hcc compiler and only runs on hcc platform) +endif + + +TARGETS=vadd_hc_arrayview vadd_hc_array vadd_amp_arrayview vadd_hip + +all: $(TARGETS) + +clean: + rm -f $(TARGETS) *.o + +run: $(TARGETS) + @for t in $(TARGETS); do\ + echo "Running $$t"; \ + ./$$t; \ + done + + +# HCC version: +vadd_hc_arrayview.o: vadd_hc_arrayview.cpp + $(HCC) $(HCC_CFLAGS) -c $< -o $@ +vadd_hc_arrayview: vadd_hc_arrayview.o + $(HCC) $(HCC_LDFLAGS) $< -o $@ + + +# HCC version, using explicit arrays: +vadd_hc_array.o: vadd_hc_array.cpp + $(HCC) $(HCC_CFLAGS) -c $< -o $@ +vadd_hc_array: vadd_hc_array.o + $(HCC) $(HCC_LDFLAGS) $< -o $@ + + +# HCC version, using AM (accelerator memory) pointer +vadd_hc_am.o: vadd_hc_am.cpp + $(HCC) $(HCC_CFLAGS) -c $< -o $@ +vadd_hc_am: vadd_hc_am.o + $(HCC) $(HCC_LDFLAGS) $< -o $@ + + + +# HIP version: +vadd_hip.o: vadd_hip.cpp + $(HIPCC) -c $< -o $@ +vadd_hip: vadd_hip.o + $(HIPCC) $< -o $@ + + +# AMP version: +vadd_amp_arrayview.o: vadd_amp_arrayview.cpp + $(HCC) $(CPPAMP_CFLAGS) -c $< -o $@ +vadd_amp_arrayview: vadd_amp_arrayview.o + $(HCC) $(CPPAMP_LDFLAGS) $< -o $@ diff --git a/samples/0_Intro/hcc_dialects/vadd_amp_arrayview.cpp b/samples/0_Intro/hcc_dialects/vadd_amp_arrayview.cpp new file mode 100644 index 000000000..6fdea5d83 --- /dev/null +++ b/samples/0_Intro/hcc_dialects/vadd_amp_arrayview.cpp @@ -0,0 +1,48 @@ +// Simple test showing how to use C++AMP syntax with array_view. +// The code uses AMP's array_view class, which provides automatic data synchronization +// of data between the host and the accelerator. As noted below, the HCC runtime +// will automatically copy data to and from the host, without the user needing +// to manually perform such copies. 
This is an excellent mode for developers +// new to GPU programming and matches the memory models provided by recent systems where +// CPU and GPU share the same memory pool. Advanced programmers may prefer +// more explicit control over the data movement - shown in the other vadd_hc_array and +// vadd_hc_am examples. +// This example shows the similarity between C++AMP and and HC for simple cases where +// implicit data transfer is used - really the only difference is the namespace. +// Other examples show some of the more advanced controls. + +#include + +int main(int argc, char *argv[]) +{ + int sizeElements = 1000000; + + // Allocate auto-managed host/device views of data: + concurrency::array_view A(sizeElements); + concurrency::array_view B(sizeElements); + concurrency::array_view C(sizeElements); + + // Initialize host data + for (int i=0; i (sizeElements), + [=] (concurrency::index<1> idx) restrict(amp) { + int i = idx[0]; + C[i] = A[i] + B[i]; + }); + + for (int i=0; i +#include + +int main(int argc, char *argv[]) +{ + int sizeElements = 1000000; + size_t sizeBytes = sizeElements * sizeof(float); + + // Allocate host memory + float *A_h = (float*)malloc(sizeBytes); + float *B_h = (float*)malloc(sizeBytes); + float *C_h = (float*)malloc(sizeBytes); + + // Allocate device pointers: + // Unlike array_view, these must be explicitly managed by user: + hc::accelerator acc; // grab default accelerator where we want to allocate memory: + hc::accelerator_view av = acc.get_default_view(); + + float *A_d, *B_d, *C_d; + A_d = hc::am_alloc(sizeBytes, acc, 0); + B_d = hc::am_alloc(sizeBytes, acc, 0); + C_d = hc::am_alloc(sizeBytes, acc, 0); + + // Initialize host data + for (int i=0; i (sizeElements), + [&] (hc::index<1> idx) [[hc]] { + int i = idx[0]; + C_d[i] = A_d[i] + B_d[i]; + }); + + + // This copy is in same AV as the kernel and thus will wait for the kernel to finish before executing. 
+ av.copy(C_d, C_h); // C++ copy D2H + + + for (int i=0; i + +int main(int argc, char *argv[]) +{ + int sizeElements = 1000000; + size_t sizeBytes = sizeElements * sizeof(float); + + // Allocate host memory + float *A_h = (float*)malloc(sizeBytes); + float *B_h = (float*)malloc(sizeBytes); + float *C_h = (float*)malloc(sizeBytes); + + // Allocate device arrays<> + // Unlike array_view, these must be explicitly managed by user: + hc::array A_d(sizeElements); + hc::array B_d(sizeElements); + hc::array C_d(sizeElements); + + // Initialize host data + for (int i=0; i types are not implicitly copied, so we performed copies above. + hc::parallel_for_each(hc::extent<1> (sizeElements), + [&] (hc::index<1> idx) [[hc]] { + int i = idx[0]; + C_d[i] = A_d[i] + B_d[i]; + }); + + // HCC runtime knows that C_d depends on previous PFE and will force the copy to wait for the PFE to complte. + hc::copy(C_d, C_h); // C++ copy D2H + + + for (int i=0; i + +int main(int argc, char *argv[]) +{ + int size = 1000000; + + // Allocate auto-managed host/device views of data: + hc::array_view A(size); + hc::array_view B(size); + hc::array_view C(size); + + // Initialize host data + for (int i=0; i (size), + [=] (hc::index<1> idx) [[hc]] { + int i = idx[0]; + C[i] = A[i] + B[i]; + }); + + for (int i=0; i + +int main(int argc, char *argv[]) +{ + int sizeElements = 1000000; + + // Allocate auto-managed host/device views of data: + hc::array_view A(sizeElements); + hc::array_view B(sizeElements); + hc::array_view C(sizeElements); + + // Initialize host data + for (int i=0; i (sizeElements), + [=] (hc::index<1> idx) [[hc]] { + int i = idx[0]; + C[i] = A[i] + B[i]; + }); + + for (int i=0; i + +__global__ void vadd_hip(hipLaunchParm lp, const float *a, const float *b, float *c, int N) +{ + int idx = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + + if (idx < N) { + c[idx] = a[idx] + b[idx]; + } +} + + +int main(int argc, char *argv[]) +{ + int sizeElements = 1000000; + size_t sizeBytes = 
sizeElements * sizeof(float); + + // Allocate host memory + float *A_h = (float*)malloc(sizeBytes); + float *B_h = (float*)malloc(sizeBytes); + float *C_h = (float*)malloc(sizeBytes); + + // Allocate device memory: + float *A_d, *B_d, *C_d; + hipMalloc(&A_d, sizeBytes); + hipMalloc(&B_d, sizeBytes); + hipMalloc(&C_d, sizeBytes); + + // Initialize host data + for (int i=0; i Date: Wed, 13 Apr 2016 17:37:39 -0500 Subject: [PATCH 044/700] Fix HIP_PATH, CHECK macro in samples. --- samples/0_Intro/bit_extract/Makefile | 2 +- samples/0_Intro/bit_extract/bit_extract.cpp | 7 +++++-- samples/0_Intro/square/square.cu | 7 +++++-- samples/0_Intro/square/square.hipref.cpp | 7 +++++-- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/samples/0_Intro/bit_extract/Makefile b/samples/0_Intro/bit_extract/Makefile index cdf793363..b71828f5f 100644 --- a/samples/0_Intro/bit_extract/Makefile +++ b/samples/0_Intro/bit_extract/Makefile @@ -1,6 +1,6 @@ #Dependencies : [MYHIP]/bin must be in user's path. -HIP_PATH=?../../.. +HIP_PATH?=../../.. HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) HIPCC=$(HIP_PATH)/bin/hipcc diff --git a/samples/0_Intro/bit_extract/bit_extract.cpp b/samples/0_Intro/bit_extract/bit_extract.cpp index 5545a99c0..bdc8182c3 100644 --- a/samples/0_Intro/bit_extract/bit_extract.cpp +++ b/samples/0_Intro/bit_extract/bit_extract.cpp @@ -24,11 +24,14 @@ THE SOFTWARE. #include -#define CHECK(error) \ +#define CHECK(cmd) \ +{\ + hipError_t error = cmd;\ if (error != hipSuccess) { \ fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \ exit(EXIT_FAILURE);\ - } + }\ +} void __global__ bit_extract_kernel(hipLaunchParm lp, uint32_t *C_d, const uint32_t *A_d, size_t N) diff --git a/samples/0_Intro/square/square.cu b/samples/0_Intro/square/square.cu index 996344ed4..8b6980cd0 100644 --- a/samples/0_Intro/square/square.cu +++ b/samples/0_Intro/square/square.cu @@ -22,11 +22,14 @@ THE SOFTWARE. 
#include #include -#define CHECK(error) \ +#define CHECK(cmd) \ +{\ + hipError_t error = cmd;\ if (error != cudaSuccess) { \ fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \ exit(EXIT_FAILURE);\ - } + }\ +} /* diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp index 5d53a8d58..aa1407773 100644 --- a/samples/0_Intro/square/square.hipref.cpp +++ b/samples/0_Intro/square/square.hipref.cpp @@ -22,11 +22,14 @@ THE SOFTWARE. #include #include -#define CHECK(error) \ +#define CHECK(cmd) \ +{\ + hipError_t error = cmd;\ if (error != hipSuccess) { \ fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \ exit(EXIT_FAILURE);\ - } + }\ +} /* From 4092a1efe8e5089e05e9f06a2efea6aee57c147a Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 15 Apr 2016 12:56:31 +0530 Subject: [PATCH 045/700] Replace /opt/hcc -> /opt/rocm/hcc and /opt/hsa -> /opt/rocm/hsa --- samples/0_Intro/hcc_dialects/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/0_Intro/hcc_dialects/Makefile b/samples/0_Intro/hcc_dialects/Makefile index 108d30201..b6e5c10af 100644 --- a/samples/0_Intro/hcc_dialects/Makefile +++ b/samples/0_Intro/hcc_dialects/Makefile @@ -4,8 +4,8 @@ HCC = $(HCC_HOME)/bin/hcc HCC_CFLAGS= `$(HCC_HOME)/bin/hcc-config --cxxflags` HCC_LDFLAGS= `$(HCC_HOME)/bin/hcc-config --ldflags` -CPPAMP_CFLAGS= -std=c++amp -stdlib=libc++ -I/opt/hcc/include -CPPAMP_LDFLAGS= -std=c++amp -L/opt/hcc/lib -Wl,--rpath=/opt/hcc/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive +CPPAMP_CFLAGS= -std=c++amp -stdlib=libc++ -I$(HCC_HOME)/include +CPPAMP_LDFLAGS= -std=c++amp -L$(HCC_HOME)/lib -Wl,--rpath=$(HCC_HOME)/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive HIP_PATH?=/opt/rocm/hip HIPCC=$(HIP_PATH)/bin/hipcc From 307b24b9d57259e4463bf6b5c46b8f41e2e3d3a2 Mon Sep 
17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 18 Apr 2016 10:15:35 +0530 Subject: [PATCH 046/700] Fix makefiles in samples --- samples/0_Intro/bit_extract/Makefile | 5 ++++- samples/0_Intro/hcc_dialects/Makefile | 5 ++++- samples/0_Intro/square/Makefile | 5 ++++- samples/1_Utils/hipBusBandwidth/Makefile | 5 ++++- samples/1_Utils/hipDispatchLatency/Makefile | 5 ++++- samples/1_Utils/hipInfo/Makefile | 5 ++++- 6 files changed, 24 insertions(+), 6 deletions(-) diff --git a/samples/0_Intro/bit_extract/Makefile b/samples/0_Intro/bit_extract/Makefile index b71828f5f..a01f60646 100644 --- a/samples/0_Intro/bit_extract/Makefile +++ b/samples/0_Intro/bit_extract/Makefile @@ -1,6 +1,9 @@ #Dependencies : [MYHIP]/bin must be in user's path. -HIP_PATH?=../../.. +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) HIPCC=$(HIP_PATH)/bin/hipcc diff --git a/samples/0_Intro/hcc_dialects/Makefile b/samples/0_Intro/hcc_dialects/Makefile index b6e5c10af..fb8fcc0c3 100644 --- a/samples/0_Intro/hcc_dialects/Makefile +++ b/samples/0_Intro/hcc_dialects/Makefile @@ -7,7 +7,10 @@ HCC_LDFLAGS= `$(HCC_HOME)/bin/hcc-config --ldflags` CPPAMP_CFLAGS= -std=c++amp -stdlib=libc++ -I$(HCC_HOME)/include CPPAMP_LDFLAGS= -std=c++amp -L$(HCC_HOME)/lib -Wl,--rpath=$(HCC_HOME)/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive -HIP_PATH?=/opt/rocm/hip +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif HIPCC=$(HIP_PATH)/bin/hipcc HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) diff --git a/samples/0_Intro/square/Makefile b/samples/0_Intro/square/Makefile index 817c556b2..89921c207 100644 --- a/samples/0_Intro/square/Makefile +++ b/samples/0_Intro/square/Makefile @@ -1,4 +1,7 @@ -HIP_PATH?=../../.. +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. 
+endif HIPCC=$(HIP_PATH)/bin/hipcc all: square.hip.out diff --git a/samples/1_Utils/hipBusBandwidth/Makefile b/samples/1_Utils/hipBusBandwidth/Makefile index 4599cacba..418f25f8e 100644 --- a/samples/1_Utils/hipBusBandwidth/Makefile +++ b/samples/1_Utils/hipBusBandwidth/Makefile @@ -1,4 +1,7 @@ -HIP_PATH?=../../.. +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipBusBandwidth diff --git a/samples/1_Utils/hipDispatchLatency/Makefile b/samples/1_Utils/hipDispatchLatency/Makefile index 87e707923..387cb9aac 100644 --- a/samples/1_Utils/hipDispatchLatency/Makefile +++ b/samples/1_Utils/hipDispatchLatency/Makefile @@ -1,4 +1,7 @@ -HIP_PATH?=../../.. +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipDispatchLatency diff --git a/samples/1_Utils/hipInfo/Makefile b/samples/1_Utils/hipInfo/Makefile index d69067388..53bad55e4 100644 --- a/samples/1_Utils/hipInfo/Makefile +++ b/samples/1_Utils/hipInfo/Makefile @@ -1,4 +1,7 @@ -HIP_PATH?=../../.. +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipInfo From fb88bb1c175976ebad3970640fb610ad2338a629 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 3 May 2016 14:19:25 +0530 Subject: [PATCH 047/700] bit_extract reports PASSED when passed --- samples/0_Intro/bit_extract/bit_extract.cpp | 65 +++++++++++---------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/samples/0_Intro/bit_extract/bit_extract.cpp b/samples/0_Intro/bit_extract/bit_extract.cpp index bdc8182c3..53302b722 100644 --- a/samples/0_Intro/bit_extract/bit_extract.cpp +++ b/samples/0_Intro/bit_extract/bit_extract.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. 
if (error != hipSuccess) { \ fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \ exit(EXIT_FAILURE);\ - }\ + }\ } void __global__ @@ -43,59 +43,60 @@ bit_extract_kernel(hipLaunchParm lp, uint32_t *C_d, const uint32_t *A_d, size_t #ifdef __HIP_PLATFORM_HCC__ C_d[i] = hc::__bitextract_u32(A_d[i], 8, 4); #else /* defined __HIP_PLATFORM_NVCC__ or other path */ - C_d[i] = ((A_d[i] & 0xf00) >> 8); + C_d[i] = ((A_d[i] & 0xf00) >> 8); #endif - } + } } int main(int argc, char *argv[]) { - uint32_t *A_d, *C_d; - uint32_t *A_h, *C_h; - size_t N = 1000000; - size_t Nbytes = N * sizeof(uint32_t); + uint32_t *A_d, *C_d; + uint32_t *A_h, *C_h; + size_t N = 1000000; + size_t Nbytes = N * sizeof(uint32_t); int deviceId; CHECK (hipGetDevice(&deviceId)); - hipDeviceProp_t props; - CHECK(hipGetDeviceProperties(&props, deviceId)); - printf ("info: running on device #%d %s\n", deviceId, props.name); + hipDeviceProp_t props; + CHECK(hipGetDeviceProperties(&props, deviceId)); + printf ("info: running on device #%d %s\n", deviceId, props.name); - printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); - A_h = (uint32_t*)malloc(Nbytes); - CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); - C_h = (uint32_t*)malloc(Nbytes); - CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); + printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); + A_h = (uint32_t*)malloc(Nbytes); + CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); + C_h = (uint32_t*)malloc(Nbytes); + CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); for (size_t i=0; i> 8); - if (C_h[i] != Agold) { + if (C_h[i] != Agold) { fprintf (stderr, "mismatch detected.\n"); printf ("%zu: %08x =? 
%08x (Ain=%08x)\n", i, C_h[i], Agold, A_h[i]); - CHECK(hipErrorUnknown); - } - } + CHECK(hipErrorUnknown); + } + } + printf ("PASSED!\n"); } From 07026bfdea86ae1874192bb30a00a59a546aed03 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 3 May 2016 14:32:59 +0530 Subject: [PATCH 048/700] hcc_dialects report PASSED when passed --- samples/0_Intro/hcc_dialects/vadd_amp_arrayview.cpp | 3 +++ samples/0_Intro/hcc_dialects/vadd_hc_am.cpp | 3 +++ samples/0_Intro/hcc_dialects/vadd_hc_array.cpp | 3 +++ samples/0_Intro/hcc_dialects/vadd_hc_array.hc | 3 +++ samples/0_Intro/hcc_dialects/vadd_hc_arrayview.cpp | 3 +++ samples/0_Intro/hcc_dialects/vadd_hip.cpp | 3 +++ 6 files changed, 18 insertions(+) diff --git a/samples/0_Intro/hcc_dialects/vadd_amp_arrayview.cpp b/samples/0_Intro/hcc_dialects/vadd_amp_arrayview.cpp index 6fdea5d83..485b64f68 100644 --- a/samples/0_Intro/hcc_dialects/vadd_amp_arrayview.cpp +++ b/samples/0_Intro/hcc_dialects/vadd_amp_arrayview.cpp @@ -16,6 +16,7 @@ int main(int argc, char *argv[]) { int sizeElements = 1000000; + bool pass = true; // Allocate auto-managed host/device views of data: concurrency::array_view A(sizeElements); @@ -43,6 +44,8 @@ int main(int argc, char *argv[]) // Because C is an array_view, the HCC runtime will copy C back to host at first access here: if (C[i] != ref) { printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C[i], ref); + pass = false; } }; + if (pass) printf ("PASSED!\n"); } diff --git a/samples/0_Intro/hcc_dialects/vadd_hc_am.cpp b/samples/0_Intro/hcc_dialects/vadd_hc_am.cpp index df6a831fd..5cb2e8c98 100644 --- a/samples/0_Intro/hcc_dialects/vadd_hc_am.cpp +++ b/samples/0_Intro/hcc_dialects/vadd_hc_am.cpp @@ -11,6 +11,7 @@ int main(int argc, char *argv[]) { int sizeElements = 1000000; size_t sizeBytes = sizeElements * sizeof(float); + bool pass = true; // Allocate host memory float *A_h = (float*)malloc(sizeBytes); @@ -54,6 +55,8 @@ int main(int argc, char *argv[]) float ref= 1.618f * i + 3.142f * i; if 
(C_h[i] != ref) { printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref); + pass = false; } }; + if (pass) printf ("PASSED!\n"); } diff --git a/samples/0_Intro/hcc_dialects/vadd_hc_array.cpp b/samples/0_Intro/hcc_dialects/vadd_hc_array.cpp index 63f455a61..bda3adf37 100644 --- a/samples/0_Intro/hcc_dialects/vadd_hc_array.cpp +++ b/samples/0_Intro/hcc_dialects/vadd_hc_array.cpp @@ -11,6 +11,7 @@ int main(int argc, char *argv[]) { int sizeElements = 1000000; size_t sizeBytes = sizeElements * sizeof(float); + bool pass = true; // Allocate host memory float *A_h = (float*)malloc(sizeBytes); @@ -48,6 +49,8 @@ int main(int argc, char *argv[]) float ref= 1.618f * i + 3.142f * i; if (C_h[i] != ref) { printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref); + pass = false; } }; + if (pass) printf ("PASSED!\n"); } diff --git a/samples/0_Intro/hcc_dialects/vadd_hc_array.hc b/samples/0_Intro/hcc_dialects/vadd_hc_array.hc index 8a9f8e71e..d57b9a7e1 100644 --- a/samples/0_Intro/hcc_dialects/vadd_hc_array.hc +++ b/samples/0_Intro/hcc_dialects/vadd_hc_array.hc @@ -3,6 +3,7 @@ int main(int argc, char *argv[]) { int size = 1000000; + bool pass = true; // Allocate auto-managed host/device views of data: hc::array_view A(size); @@ -28,6 +29,8 @@ int main(int argc, char *argv[]) float ref= 1.618f * i + 3.142f * i; if (C[i] != ref) { printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C[i], ref); + pass = false; } }; + if (pass) printf ("PASSED!\n"); } diff --git a/samples/0_Intro/hcc_dialects/vadd_hc_arrayview.cpp b/samples/0_Intro/hcc_dialects/vadd_hc_arrayview.cpp index 41d9124dd..2585f4700 100644 --- a/samples/0_Intro/hcc_dialects/vadd_hc_arrayview.cpp +++ b/samples/0_Intro/hcc_dialects/vadd_hc_arrayview.cpp @@ -16,6 +16,7 @@ int main(int argc, char *argv[]) { int sizeElements = 1000000; + bool pass = true; // Allocate auto-managed host/device views of data: hc::array_view A(sizeElements); @@ -43,6 +44,8 @@ int main(int argc, char *argv[]) // 
Because C is an array_view, the HCC runtime will copy C back to host at first access here: if (C[i] != ref) { printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C[i], ref); + pass = false; } }; + if (pass) printf ("PASSED!\n"); } diff --git a/samples/0_Intro/hcc_dialects/vadd_hip.cpp b/samples/0_Intro/hcc_dialects/vadd_hip.cpp index 31751b341..9d223ba27 100644 --- a/samples/0_Intro/hcc_dialects/vadd_hip.cpp +++ b/samples/0_Intro/hcc_dialects/vadd_hip.cpp @@ -14,6 +14,7 @@ int main(int argc, char *argv[]) { int sizeElements = 1000000; size_t sizeBytes = sizeElements * sizeof(float); + bool pass = true; // Allocate host memory float *A_h = (float*)malloc(sizeBytes); @@ -46,6 +47,8 @@ int main(int argc, char *argv[]) float ref= 1.618f * i + 3.142f * i; if (C_h[i] != ref) { printf ("error:%d computed=%6.2f, reference=%6.2f\n", i, C_h[i], ref); + pass = false; } }; + if (pass) printf ("PASSED!\n"); } From f6544a376bbfa07d5eaadcac9d73d06b388cc49a Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 12 May 2016 10:13:07 +0530 Subject: [PATCH 049/700] Fix square.cu to use cudaError_t instead of hipError_t Change-Id: If3314910d1c03122741d3e0a45e14a4412c473b3 --- samples/0_Intro/square/square.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/0_Intro/square/square.cu b/samples/0_Intro/square/square.cu index 8b6980cd0..ec8ca12fb 100644 --- a/samples/0_Intro/square/square.cu +++ b/samples/0_Intro/square/square.cu @@ -24,7 +24,7 @@ THE SOFTWARE. 
#define CHECK(cmd) \ {\ - hipError_t error = cmd;\ + cudaError_t error = cmd;\ if (error != cudaSuccess) { \ fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \ exit(EXIT_FAILURE);\ From e89fba7fe12ede0ef61fad2fb45dedf3eb1c8af0 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 20 May 2016 10:15:33 +0530 Subject: [PATCH 050/700] Fix bit_extract sample Change-Id: I933f932bac26d9a9469d5d069973af166e11cbcd --- samples/0_Intro/bit_extract/Makefile | 2 +- samples/0_Intro/bit_extract/bit_extract.cpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/samples/0_Intro/bit_extract/Makefile b/samples/0_Intro/bit_extract/Makefile index a01f60646..0965ae729 100644 --- a/samples/0_Intro/bit_extract/Makefile +++ b/samples/0_Intro/bit_extract/Makefile @@ -12,7 +12,7 @@ ifeq (${HIP_PLATFORM}, nvcc) HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 endif ifeq (${HIP_PLATFORM}, hcc) - HIPCC_FLAGS = + HIPCC_FLAGS = -stdlib=libc++ endif diff --git a/samples/0_Intro/bit_extract/bit_extract.cpp b/samples/0_Intro/bit_extract/bit_extract.cpp index 53302b722..746e1012b 100644 --- a/samples/0_Intro/bit_extract/bit_extract.cpp +++ b/samples/0_Intro/bit_extract/bit_extract.cpp @@ -22,6 +22,9 @@ THE SOFTWARE. 
#include #include #include +#ifdef __HIP_PLATFORM_HCC__ +#include +#endif #define CHECK(cmd) \ From a685f7dc797550b8443d7726375d74333887fbf9 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 13 Jun 2016 14:23:51 +0530 Subject: [PATCH 051/700] hipDispatchLatency: reduce iterations to 5120 Change-Id: I94ae4993ff5058cf15f9487a5a528fc24c1ad5fa --- samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp index 212fc6b3b..1c15ab51d 100644 --- a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp +++ b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp @@ -30,7 +30,7 @@ if(status != hipSuccess){ \ #define LEN 1024*1024 #define SIZE LEN * sizeof(float) -#define ITER 10000 +#define ITER 5120 __global__ void One(hipLaunchParm lp, float* Ad){ } From 339590da90a964446ba1f26460c8eac1ffb4ad29 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 12 Aug 2016 13:50:22 +0530 Subject: [PATCH 052/700] Add simple hipblas saxpy sample Change-Id: I67ae83e1e5397d5191a3c644aba068f06ff97830 --- samples/2_Advanced/hipblas_saxpy/Makefile | 34 +++++++ .../2_Advanced/hipblas_saxpy/saxpy.cublas.cpp | 94 +++++++++++++++++++ .../hipblas_saxpy/saxpy.hipblasref.cpp | 94 +++++++++++++++++++ 3 files changed, 222 insertions(+) create mode 100644 samples/2_Advanced/hipblas_saxpy/Makefile create mode 100644 samples/2_Advanced/hipblas_saxpy/saxpy.cublas.cpp create mode 100644 samples/2_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp diff --git a/samples/2_Advanced/hipblas_saxpy/Makefile b/samples/2_Advanced/hipblas_saxpy/Makefile new file mode 100644 index 000000000..ed88be2dd --- /dev/null +++ b/samples/2_Advanced/hipblas_saxpy/Makefile @@ -0,0 +1,34 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. 
+endif +HIPCC=$(HIP_PATH)/bin/hipcc + +HIPCC_FLAGS += -std=c++11 +HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) +ifeq (${HIP_PLATFORM}, nvcc) + LIBS = -lcublas +endif +ifeq (${HIP_PLATFORM}, hcc) + HCBLAS_ROOT?= $(wildcard /opt/rocm/hcblas) + HIPCC_FLAGS += -stdlib=libc++ -I$(HCBLAS_ROOT)/include + LIBS = -L$(HCBLAS_ROOT)/lib -lhcblas +endif + + +all: saxpy.hipblas.out + +saxpy.cublas.out : saxpy.cublas.cpp + nvcc -std=c++11 -I$(CUDA_HOME)/include saxpy.cublas.cpp -o $@ -L$(CUDA_HOME)/lib64 -lcublas + +# $HIPBLAS_ROOT/bin/hipifyblas ./saxpy.cublas.cpp > ./saxpy.hipblas.cpp +# Then review & finish port in saxpy.hipblas.cpp + +saxpy.hipblasref.o: saxpy.hipblasref.cpp + $(HIPCC) $(HIPCC_FLAGS) -c $< -o $@ + +saxpy.hipblas.out: saxpy.hipblasref.o + $(HIPCC) $< -o $@ $(LIBS) + +clean: + rm -f *.o *.out diff --git a/samples/2_Advanced/hipblas_saxpy/saxpy.cublas.cpp b/samples/2_Advanced/hipblas_saxpy/saxpy.cublas.cpp new file mode 100644 index 000000000..03a38f3fb --- /dev/null +++ b/samples/2_Advanced/hipblas_saxpy/saxpy.cublas.cpp @@ -0,0 +1,94 @@ + +#include +#include +#include +#include + +// header file for the GPU API +#include +#include + +#define N (1024 * 500) + +#define CHECK(cmd) \ +{\ + cudaError_t error = cmd; \ + if (error != cudaSuccess) { \ + fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \ + exit(EXIT_FAILURE);\ + }\ +} + +#define CHECK_BLAS(cmd) \ +{\ + cublasStatus_t error = cmd;\ + if (error != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "error: (%d) at %s:%d\n", error,__FILE__, __LINE__); \ + exit(EXIT_FAILURE);\ + }\ +} + +int main() { + + const float a = 100.0f; + float x[N]; + float y[N], y_cpu_res[N], y_gpu_res[N]; + + // initialize the input data + std::default_random_engine random_gen; + std::uniform_real_distribution distribution(-N, N); + std::generate_n(x, N, [&]() { return distribution(random_gen); }); + std::generate_n(y, N, [&]() { return distribution(random_gen); }); + 
std::copy_n(y, N, y_cpu_res); + + // Explicit GPU code: + + size_t Nbytes = N*sizeof(float); + float *x_gpu, *y_gpu; + + cublasHandle_t handle; + + cudaDeviceProp props; + CHECK(cudaGetDeviceProperties(&props, 0/*deviceID*/)); + printf ("info: running on device %s\n", props.name); + + printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); + printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); + CHECK(cudaMalloc(&x_gpu, Nbytes)); + CHECK(cudaMalloc(&y_gpu, Nbytes)); + + // Initialize the blas library + CHECK_BLAS ( cublasCreate(&handle)); + + // copy n elements from a vector in host memory space to a vector in GPU memory space + printf ("info: copy Host2Device\n"); + CHECK_BLAS ( cublasSetVector(N, sizeof(*x), x, 1, x_gpu, 1)); + CHECK_BLAS ( cublasSetVector(N, sizeof(*y), y, 1, y_gpu, 1)); + + printf ("info: launch 'saxpy' kernel\n"); + CHECK_BLAS ( cublasSaxpy(handle, N, &a, x_gpu, 1, y_gpu, 1)); + + cudaDeviceSynchronize(); + + printf ("info: copy Device2Host\n"); + CHECK_BLAS ( cublasGetVector(N, sizeof(*y_gpu_res), y_gpu, 1, y_gpu_res, 1)); + + // CPU implementation of saxpy + for (int i = 0; i < N; i++) { + y_cpu_res[i] = a * x[i] + y[i]; + } + + // verify the results + int errors = 0; + for (int i = 0; i < N; i++) { + if (fabs(y_cpu_res[i] - y_gpu_res[i]) > fabs(y_cpu_res[i] * 0.0001f)) + errors++; + } + std::cout << errors << " errors" << std::endl; + + CHECK( cudaFree(x_gpu)); + CHECK( cudaFree(y_gpu)); + CHECK_BLAS( cublasDestroy(handle)); + + return errors; +} diff --git a/samples/2_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp b/samples/2_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp new file mode 100644 index 000000000..3f20c8a7c --- /dev/null +++ b/samples/2_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp @@ -0,0 +1,94 @@ + +#include +#include +#include +#include + +// header file for the GPU API +#include +#include + +#define N (1024 * 500) + +#define CHECK(cmd) \ +{\ + hipError_t error = cmd; \ + if (error != 
hipSuccess) { \ + fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \ + exit(EXIT_FAILURE);\ + }\ +} + +#define CHECK_BLAS(cmd) \ +{\ + hipblasStatus_t error = cmd;\ + if (error != HIPBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "error: (%d) at %s:%d\n", error,__FILE__, __LINE__); \ + exit(EXIT_FAILURE);\ + }\ +} + +int main() { + + const float a = 100.0f; + float x[N]; + float y[N], y_cpu_res[N], y_gpu_res[N]; + + // initialize the input data + std::default_random_engine random_gen; + std::uniform_real_distribution distribution(-N, N); + std::generate_n(x, N, [&]() { return distribution(random_gen); }); + std::generate_n(y, N, [&]() { return distribution(random_gen); }); + std::copy_n(y, N, y_cpu_res); + + // Explicit GPU code: + + size_t Nbytes = N*sizeof(float); + float *x_gpu, *y_gpu; + + hipblasHandle_t handle; + + hipDeviceProp_t props; + CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/)); + printf ("info: running on device %s\n", props.name); + + printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); + printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); + CHECK(hipMalloc(&x_gpu, Nbytes)); + CHECK(hipMalloc(&y_gpu, Nbytes)); + + // Initialize the blas library + CHECK_BLAS ( hipblasCreate(&handle)); + + // copy n elements from a vector in host memory space to a vector in GPU memory space + printf ("info: copy Host2Device\n"); + CHECK_BLAS ( hipblasSetVector(N, sizeof(*x), x, 1, x_gpu, 1)); + CHECK_BLAS ( hipblasSetVector(N, sizeof(*y), y, 1, y_gpu, 1)); + + printf ("info: launch 'saxpy' kernel\n"); + CHECK_BLAS ( hipblasSaxpy(handle, N, &a, x_gpu, 1, y_gpu, 1)); + + hipDeviceSynchronize(); + + printf ("info: copy Device2Host\n"); + CHECK_BLAS ( hipblasGetVector(N, sizeof(*y_gpu_res), y_gpu, 1, y_gpu_res, 1)); + + // CPU implementation of saxpy + for (int i = 0; i < N; i++) { + y_cpu_res[i] = a * x[i] + y[i]; + } + + // verify the results + int errors = 0; + for (int i = 
0; i < N; i++) { + if (fabs(y_cpu_res[i] - y_gpu_res[i]) > fabs(y_cpu_res[i] * 0.0001f)) + errors++; + } + std::cout << errors << " errors" << std::endl; + + CHECK( hipFree(x_gpu)); + CHECK( hipFree(y_gpu)); + CHECK_BLAS( hipblasDestroy(handle)); + + return errors; +} From 6ad8ac5d95e6079840feb428a91e6e329aa8fe57 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 18 Aug 2016 12:40:30 +0530 Subject: [PATCH 053/700] Rename 2_Advanced to 7_Advanced Change-Id: I51e5fa7f4c1dbf467f2d7182ec69d12d5fe548d0 --- samples/{2_Advanced => 7_Advanced}/hipblas_saxpy/Makefile | 0 samples/{2_Advanced => 7_Advanced}/hipblas_saxpy/saxpy.cublas.cpp | 0 .../{2_Advanced => 7_Advanced}/hipblas_saxpy/saxpy.hipblasref.cpp | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename samples/{2_Advanced => 7_Advanced}/hipblas_saxpy/Makefile (100%) rename samples/{2_Advanced => 7_Advanced}/hipblas_saxpy/saxpy.cublas.cpp (100%) rename samples/{2_Advanced => 7_Advanced}/hipblas_saxpy/saxpy.hipblasref.cpp (100%) diff --git a/samples/2_Advanced/hipblas_saxpy/Makefile b/samples/7_Advanced/hipblas_saxpy/Makefile similarity index 100% rename from samples/2_Advanced/hipblas_saxpy/Makefile rename to samples/7_Advanced/hipblas_saxpy/Makefile diff --git a/samples/2_Advanced/hipblas_saxpy/saxpy.cublas.cpp b/samples/7_Advanced/hipblas_saxpy/saxpy.cublas.cpp similarity index 100% rename from samples/2_Advanced/hipblas_saxpy/saxpy.cublas.cpp rename to samples/7_Advanced/hipblas_saxpy/saxpy.cublas.cpp diff --git a/samples/2_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp b/samples/7_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp similarity index 100% rename from samples/2_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp rename to samples/7_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp From c1b1086c71be30f30fb55a76cd98d06be15044af Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 31 Aug 2016 13:56:07 -0500 Subject: [PATCH 054/700] added sample for how-to-use pre-compiled kernels1. 
Corrected the exit output of kernel compilation by hipcc 2. Added sample which loads/run kernel binary during runtime? Change-Id: I26ccaca1f844fee317592e26c9e654ce548b96a8 --- samples/0_Intro/module_api/Makefile | 19 ++++ samples/0_Intro/module_api/runKernel.cpp | 105 +++++++++++++++++++++++ samples/0_Intro/module_api/vcpy_isa.cpp | 9 ++ samples/0_Intro/module_api/vcpy_isa.cu | 6 ++ samples/0_Intro/module_api/vcpy_isa.ptx | 38 ++++++++ 5 files changed, 177 insertions(+) create mode 100644 samples/0_Intro/module_api/Makefile create mode 100644 samples/0_Intro/module_api/runKernel.cpp create mode 100644 samples/0_Intro/module_api/vcpy_isa.cpp create mode 100644 samples/0_Intro/module_api/vcpy_isa.cu create mode 100644 samples/0_Intro/module_api/vcpy_isa.ptx diff --git a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile new file mode 100644 index 000000000..b58882b0f --- /dev/null +++ b/samples/0_Intro/module_api/Makefile @@ -0,0 +1,19 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif +HIPCC=$(HIP_PATH)/bin/hipcc + +all: vcpy_isa.compile runKernel.hip.out + +runKernel.cuda.out: runKernel.cpp + nvcc runKernel.cpp -o $@ + +vcpy_isa.compile: vcpy_isa.cpp + $(HIPCC) --genisa --target-isa=fiji vcpy_isa.cpp -o vcpy_isa.co + +runKernel.hip.out: runKernel.cpp + $(HIPCC) runKernel.cpp -o runKernel.hip.out + +clean: + rm -f *.co *.out diff --git a/samples/0_Intro/module_api/runKernel.cpp b/samples/0_Intro/module_api/runKernel.cpp new file mode 100644 index 000000000..e4fa1b6d9 --- /dev/null +++ b/samples/0_Intro/module_api/runKernel.cpp @@ -0,0 +1,105 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include +#include +#include +#include + +#define LEN 64 +#define SIZE LEN<<2 + +#ifdef __HIP_PLATFORM_HCC__ +#define fileName "vcpy_isa.co" +#define kernel_name "ZN12_GLOBAL__N_146_Z11hello_world16grid_launch_parmPfS0__functor19__cxxamp_trampolineEiiiiiiPKfPf" +#endif + +#ifdef __HIP_PLATFORM_NVCC__ +#define fileName "vcpy_isa.ptx" +#define kernel_name "hello_world" +#endif + +int main(){ + float *A, *B; + hipDeviceptr_t Ad, Bd; + A = new float[LEN]; + B = new float[LEN]; + + for(uint32_t i=0;iargBuffer(5); + uint32_t *ptr32_t = (uint32_t*)&argBuffer[0]; + memcpy(ptr32_t + 0, &one, sizeof(uint32_t)); + memcpy(ptr32_t + 1, &one, sizeof(uint32_t)); + memcpy(ptr32_t + 2, &one, sizeof(uint32_t)); + memcpy(ptr32_t + 3, &len, sizeof(uint32_t)); + memcpy(ptr32_t + 4, &one, sizeof(uint32_t)); + memcpy(ptr32_t + 5, &one, sizeof(uint32_t)); + memcpy(&argBuffer[3], &Ad, sizeof(void*)); + memcpy(&argBuffer[4], &Bd, sizeof(void*)); + + + + size_t size = argBuffer.size()*sizeof(void*); + + void *config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0], + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, + HIP_LAUNCH_PARAM_END + }; + + hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config); + + hipMemcpyDtoH(B, Bd, SIZE); + for(uint32_t i=LEN-4;i + +__global__ void hello_world(hipLaunchParm lp, float *a, float *b) +{ + int tx = hipThreadIdx_x; + b[tx] = a[tx]; +} + +int main(){} diff --git a/samples/0_Intro/module_api/vcpy_isa.cu b/samples/0_Intro/module_api/vcpy_isa.cu new file mode 100644 index 000000000..d2a083860 --- /dev/null +++ b/samples/0_Intro/module_api/vcpy_isa.cu @@ -0,0 +1,6 @@ + +extern "C" __global__ void hello_world(float *a, float *b) +{ + int tx = threadIdx.x; + b[tx] = a[tx]; +} diff --git a/samples/0_Intro/module_api/vcpy_isa.ptx b/samples/0_Intro/module_api/vcpy_isa.ptx new file mode 100644 index 000000000..62eb3f63d --- /dev/null +++ b/samples/0_Intro/module_api/vcpy_isa.ptx @@ -0,0 +1,38 @@ +// +// Generated by NVIDIA 
NVVM Compiler +// +// Compiler Build ID: CL-19856038 +// Cuda compilation tools, release 7.5, V7.5.17 +// Based on LLVM 3.4svn +// + +.version 4.3 +.target sm_20 +.address_size 64 + + // .globl hello_world + +.visible .entry hello_world( + .param .u64 hello_world_param_0, + .param .u64 hello_world_param_1 +) +{ + .reg .f32 %f<2>; + .reg .b32 %r<2>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd1, [hello_world_param_0]; + ld.param.u64 %rd2, [hello_world_param_1]; + cvta.to.global.u64 %rd3, %rd2; + cvta.to.global.u64 %rd4, %rd1; + mov.u32 %r1, %tid.x; + mul.wide.s32 %rd5, %r1, 4; + add.s64 %rd6, %rd4, %rd5; + ld.global.f32 %f1, [%rd6]; + add.s64 %rd7, %rd3, %rd5; + st.global.f32 [%rd7], %f1; + ret; +} + + From 52e3d0e799ced8572d15a4ec5ff9f953ed3cb0a3 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 1 Sep 2016 12:10:31 +0530 Subject: [PATCH 055/700] module_api/Makefile: Use gencodeobject instead of genisa Change-Id: I7e3523810f5603ad727b1fda7ff2d0dc53ec72d7 --- samples/0_Intro/module_api/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile index b58882b0f..582899b6d 100644 --- a/samples/0_Intro/module_api/Makefile +++ b/samples/0_Intro/module_api/Makefile @@ -10,7 +10,7 @@ runKernel.cuda.out: runKernel.cpp nvcc runKernel.cpp -o $@ vcpy_isa.compile: vcpy_isa.cpp - $(HIPCC) --genisa --target-isa=fiji vcpy_isa.cpp -o vcpy_isa.co + $(HIPCC) --gencodeobject --target-isa=fiji vcpy_isa.cpp -o vcpy_isa.co runKernel.hip.out: runKernel.cpp $(HIPCC) runKernel.cpp -o runKernel.hip.out From 176c74af6a53f35a3bff6d84c324fee4d02799dd Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 1 Sep 2016 15:11:12 +0530 Subject: [PATCH 056/700] Fixed module_api/Makefile to set flags based on HIP_PLATFORM Change-Id: I2fa9a556e0c4f25f4963ecef1d25eb922f9af1b9 --- samples/0_Intro/module_api/Makefile | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git 
a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile index 582899b6d..25ae7b841 100644 --- a/samples/0_Intro/module_api/Makefile +++ b/samples/0_Intro/module_api/Makefile @@ -3,14 +3,16 @@ ifeq (,$(HIP_PATH)) HIP_PATH=../../.. endif HIPCC=$(HIP_PATH)/bin/hipcc +HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler) -all: vcpy_isa.compile runKernel.hip.out +ifeq (${HIP_PLATFORM}, hcc) + GENCODEOBJECT_FLAGS=--target-isa-fiji +endif -runKernel.cuda.out: runKernel.cpp - nvcc runKernel.cpp -o $@ +all: vcpy_isa.compile runKernel.hip.out vcpy_isa.compile: vcpy_isa.cpp - $(HIPCC) --gencodeobject --target-isa=fiji vcpy_isa.cpp -o vcpy_isa.co + $(HIPCC) --gencodeobject $(GENCODEOBJECT_FLAGS) vcpy_isa.cpp -o vcpy_isa.co runKernel.hip.out: runKernel.cpp $(HIPCC) runKernel.cpp -o runKernel.hip.out From ebc17b0d6ee17658b5293333729c47b6395d1069 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 1 Sep 2016 10:39:14 -0500 Subject: [PATCH 057/700] Fixed offline kernel compilation 1. Removed vcpy_isa.ptx as it should be generated during make 2. Made argument padding specific to hcc path 3. Renamed --gencodeobject to --genco 4. 
Changed Makefile to work on both nvcc and hcc path Change-Id: Ifd053d541085d9ce4fd37bc21b07674786c7163e --- samples/0_Intro/module_api/Makefile | 24 +++++++++++---- samples/0_Intro/module_api/runKernel.cpp | 11 +++++-- samples/0_Intro/module_api/vcpy_isa.ptx | 38 ------------------------ 3 files changed, 27 insertions(+), 46 deletions(-) delete mode 100644 samples/0_Intro/module_api/vcpy_isa.ptx diff --git a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile index 25ae7b841..db270beaa 100644 --- a/samples/0_Intro/module_api/Makefile +++ b/samples/0_Intro/module_api/Makefile @@ -6,16 +6,28 @@ HIPCC=$(HIP_PATH)/bin/hipcc HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler) ifeq (${HIP_PLATFORM}, hcc) - GENCODEOBJECT_FLAGS=--target-isa-fiji + GENCODEOBJECT_FLAGS=--target-isa=fiji + +vcpy_isa.compile: vcpy_isa.cpp + $(HIPCC) --genco $(GENCODEOBJECT_FLAGS) vcpy_isa.cpp -o vcpy_isa.co + +clean: + rm -f *.co *.out + endif -all: vcpy_isa.compile runKernel.hip.out +ifeq (${HIP_PLATFORM}, nvcc) -vcpy_isa.compile: vcpy_isa.cpp - $(HIPCC) --gencodeobject $(GENCODEOBJECT_FLAGS) vcpy_isa.cpp -o vcpy_isa.co +vcpy_isa.compile: vcpy_isa.cu + $(HIPCC) --genco vcpy_isa.cu -o vcpy_isa.ptx + +clean: + rm -f *.ptx *.out + +endif + +all: vcpy_isa.compile runKernel.hip.out runKernel.hip.out: runKernel.cpp $(HIPCC) runKernel.cpp -o runKernel.hip.out -clean: - rm -f *.co *.out diff --git a/samples/0_Intro/module_api/runKernel.cpp b/samples/0_Intro/module_api/runKernel.cpp index e4fa1b6d9..dcbaa4cd3 100644 --- a/samples/0_Intro/module_api/runKernel.cpp +++ b/samples/0_Intro/module_api/runKernel.cpp @@ -66,8 +66,9 @@ int main(){ hipModuleLoad(&Module, fileName); hipModuleGetFunction(&Function, Module, kernel_name); - uint32_t len = LEN; - uint32_t one = 1; +#ifdef __HIP_PLATFORM_HCC__ + uint32_t len = LEN; + uint32_t one = 1; std::vectorargBuffer(5); uint32_t *ptr32_t = (uint32_t*)&argBuffer[0]; @@ -79,7 +80,13 @@ int main(){ memcpy(ptr32_t + 5, &one, 
sizeof(uint32_t)); memcpy(&argBuffer[3], &Ad, sizeof(void*)); memcpy(&argBuffer[4], &Bd, sizeof(void*)); +#endif +#ifdef __HIP_PLATFORM_NVCC__ + std::vectorargBuffer(2); + memcpy(&argBuffer[0], &Ad, sizeof(void*)); + memcpy(&argBuffer[1], &Bd, sizeof(void*)); +#endif size_t size = argBuffer.size()*sizeof(void*); diff --git a/samples/0_Intro/module_api/vcpy_isa.ptx b/samples/0_Intro/module_api/vcpy_isa.ptx deleted file mode 100644 index 62eb3f63d..000000000 --- a/samples/0_Intro/module_api/vcpy_isa.ptx +++ /dev/null @@ -1,38 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// -// Compiler Build ID: CL-19856038 -// Cuda compilation tools, release 7.5, V7.5.17 -// Based on LLVM 3.4svn -// - -.version 4.3 -.target sm_20 -.address_size 64 - - // .globl hello_world - -.visible .entry hello_world( - .param .u64 hello_world_param_0, - .param .u64 hello_world_param_1 -) -{ - .reg .f32 %f<2>; - .reg .b32 %r<2>; - .reg .b64 %rd<8>; - - - ld.param.u64 %rd1, [hello_world_param_0]; - ld.param.u64 %rd2, [hello_world_param_1]; - cvta.to.global.u64 %rd3, %rd2; - cvta.to.global.u64 %rd4, %rd1; - mov.u32 %r1, %tid.x; - mul.wide.s32 %rd5, %r1, 4; - add.s64 %rd6, %rd4, %rd5; - ld.global.f32 %f1, [%rd6]; - add.s64 %rd7, %rd3, %rd5; - st.global.f32 [%rd7], %f1; - ret; -} - - From 2341e48842d9ae8ab4315e15eb2f625f24f5b70a Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 29 Aug 2016 22:03:00 -0500 Subject: [PATCH 058/700] enable hc_am example in hcc_ddialects example Change-Id: Iec2f9eb05f95cb025c157fee8fd284aab844d1a2 --- samples/0_Intro/hcc_dialects/Makefile | 9 +++++---- samples/0_Intro/hcc_dialects/vadd_hc_am.cpp | 15 +++++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/samples/0_Intro/hcc_dialects/Makefile b/samples/0_Intro/hcc_dialects/Makefile index fb8fcc0c3..3b5ceca7f 100644 --- a/samples/0_Intro/hcc_dialects/Makefile +++ b/samples/0_Intro/hcc_dialects/Makefile @@ -1,8 +1,9 @@ HCC_HOME?=/opt/rocm/hcc HCC = $(HCC_HOME)/bin/hcc -HCC_CFLAGS= 
`$(HCC_HOME)/bin/hcc-config --cxxflags` -HCC_LDFLAGS= `$(HCC_HOME)/bin/hcc-config --ldflags` +OPT=-O2 +HCC_CFLAGS= `$(HCC_HOME)/bin/hcc-config --cxxflags` ${OPT} +HCC_LDFLAGS= `$(HCC_HOME)/bin/hcc-config --ldflags` ${OPT} CPPAMP_CFLAGS= -std=c++amp -stdlib=libc++ -I$(HCC_HOME)/include CPPAMP_LDFLAGS= -std=c++amp -L$(HCC_HOME)/lib -Wl,--rpath=$(HCC_HOME)/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive @@ -19,7 +20,7 @@ $(error hcc_dialects requires hcc compiler and only runs on hcc platform) endif -TARGETS=vadd_hc_arrayview vadd_hc_array vadd_amp_arrayview vadd_hip +TARGETS=vadd_hc_arrayview vadd_hc_array vadd_hc_am vadd_amp_arrayview vadd_hip all: $(TARGETS) @@ -51,7 +52,7 @@ vadd_hc_array: vadd_hc_array.o vadd_hc_am.o: vadd_hc_am.cpp $(HCC) $(HCC_CFLAGS) -c $< -o $@ vadd_hc_am: vadd_hc_am.o - $(HCC) $(HCC_LDFLAGS) $< -o $@ + $(HCC) $(HCC_LDFLAGS) -lhc_am $< -o $@ diff --git a/samples/0_Intro/hcc_dialects/vadd_hc_am.cpp b/samples/0_Intro/hcc_dialects/vadd_hc_am.cpp index 5cb2e8c98..3685e688c 100644 --- a/samples/0_Intro/hcc_dialects/vadd_hc_am.cpp +++ b/samples/0_Intro/hcc_dialects/vadd_hc_am.cpp @@ -32,23 +32,30 @@ int main(int argc, char *argv[]) for (int i=0; i (sizeElements), [&] (hc::index<1> idx) [[hc]] { int i = idx[0]; - C_d[i] = A_d[i] + B_d[i]; + //C_d[i] = A_d[i] + B_d[i]; + C_d[0] = A_d[1] + B_d[2]; }); + cf.wait(); +#endif + // This copy is in same AV as the kernel and thus will wait for the kernel to finish before executing. 
- av.copy(C_d, C_h); // C++ copy D2H + av.copy(C_d, C_h, sizeBytes); // C++ copy D2H for (int i=0; i Date: Thu, 1 Sep 2016 14:00:46 -0500 Subject: [PATCH 059/700] Fix av::copy in dialects to use capture-by-value Change-Id: Ibce1488a1326f66b92b4d5b351230666b691ed31 --- samples/0_Intro/hcc_dialects/vadd_hc_am.cpp | 9 ++------- samples/0_Intro/hcc_dialects/vadd_hip.cpp | 7 +++++-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/samples/0_Intro/hcc_dialects/vadd_hc_am.cpp b/samples/0_Intro/hcc_dialects/vadd_hc_am.cpp index 3685e688c..53a137f74 100644 --- a/samples/0_Intro/hcc_dialects/vadd_hc_am.cpp +++ b/samples/0_Intro/hcc_dialects/vadd_hc_am.cpp @@ -41,17 +41,12 @@ int main(int argc, char *argv[]) // Launch kernel onto AV. // Because the kernel PFE and the copies are submitted to same AV, they will execute in order // and we don't need additional synchronization to ensure the copies complete before the PFE begins. -#if 1 hc::completion_future cf= hc::parallel_for_each(av, hc::extent<1> (sizeElements), - [&] (hc::index<1> idx) [[hc]] { + [=] (hc::index<1> idx) [[hc]] { int i = idx[0]; - //C_d[i] = A_d[i] + B_d[i]; - C_d[0] = A_d[1] + B_d[2]; + C_d[i] = A_d[i] + B_d[i]; }); - cf.wait(); -#endif - // This copy is in same AV as the kernel and thus will wait for the kernel to finish before executing. 
diff --git a/samples/0_Intro/hcc_dialects/vadd_hip.cpp b/samples/0_Intro/hcc_dialects/vadd_hip.cpp index 9d223ba27..9684bef14 100644 --- a/samples/0_Intro/hcc_dialects/vadd_hip.cpp +++ b/samples/0_Intro/hcc_dialects/vadd_hip.cpp @@ -27,22 +27,25 @@ int main(int argc, char *argv[]) hipMalloc(&B_d, sizeBytes); hipMalloc(&C_d, sizeBytes); - // Initialize host data + // Initialize host memory for (int i=0; i Date: Fri, 2 Sep 2016 12:47:25 -0500 Subject: [PATCH 060/700] Fix double-lock of stream on hipModuleLaunchKernel Change-Id: I4ca164971c25f4eb8fbcca11d6258367bb3d2ab4 --- samples/0_Intro/module_api/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile index db270beaa..8981938e9 100644 --- a/samples/0_Intro/module_api/Makefile +++ b/samples/0_Intro/module_api/Makefile @@ -3,11 +3,14 @@ ifeq (,$(HIP_PATH)) HIP_PATH=../../.. endif HIPCC=$(HIP_PATH)/bin/hipcc +OPT= HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler) ifeq (${HIP_PLATFORM}, hcc) GENCODEOBJECT_FLAGS=--target-isa=fiji +all: runKernel.hip.out + vcpy_isa.compile: vcpy_isa.cpp $(HIPCC) --genco $(GENCODEOBJECT_FLAGS) vcpy_isa.cpp -o vcpy_isa.co @@ -29,5 +32,5 @@ endif all: vcpy_isa.compile runKernel.hip.out runKernel.hip.out: runKernel.cpp - $(HIPCC) runKernel.cpp -o runKernel.hip.out + $(HIPCC) $(OPT) runKernel.cpp -o runKernel.hip.out From 6ca7a87e0e733ab0459399c9e979e86ceb1e2e4d Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 2 Sep 2016 13:17:17 -0500 Subject: [PATCH 061/700] corrected offline kernel compilation on hipcc path 1. hipgenisa.sh now adds int main(){} during kernel compilation. User does not have to put it there 2. Renamed vcpy_isa.cpp to vcpy_kernel.cpp 3. Removed vcpy_isa.cu as the kernel code should be common for both paths 4. 
Changed Makefile and runkernel.cpp to work with above changes Change-Id: I9f8c84706b44bb500bc493a68e959762b55a0142 --- samples/0_Intro/module_api/Makefile | 10 +++++----- samples/0_Intro/module_api/runKernel.cpp | 8 ++++---- samples/0_Intro/module_api/vcpy_isa.cu | 6 ------ .../module_api/{vcpy_isa.cpp => vcpy_kernel.cpp} | 1 - 4 files changed, 9 insertions(+), 16 deletions(-) delete mode 100644 samples/0_Intro/module_api/vcpy_isa.cu rename samples/0_Intro/module_api/{vcpy_isa.cpp => vcpy_kernel.cpp} (91%) diff --git a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile index 8981938e9..99cff6bc0 100644 --- a/samples/0_Intro/module_api/Makefile +++ b/samples/0_Intro/module_api/Makefile @@ -11,8 +11,8 @@ ifeq (${HIP_PLATFORM}, hcc) all: runKernel.hip.out -vcpy_isa.compile: vcpy_isa.cpp - $(HIPCC) --genco $(GENCODEOBJECT_FLAGS) vcpy_isa.cpp -o vcpy_isa.co +vcpy_kernel.compile: vcpy_kernel.cpp + $(HIPCC) --genco $(GENCODEOBJECT_FLAGS) vcpy_kernel.cpp -o vcpy_kernel.co clean: rm -f *.co *.out @@ -21,15 +21,15 @@ endif ifeq (${HIP_PLATFORM}, nvcc) -vcpy_isa.compile: vcpy_isa.cu - $(HIPCC) --genco vcpy_isa.cu -o vcpy_isa.ptx +vcpy_kernel.compile: vcpy_kernel.cpp + $(HIPCC) --genco vcpy_kernel.cpp -o vcpy_kernel.ptx clean: rm -f *.ptx *.out endif -all: vcpy_isa.compile runKernel.hip.out +all: vcpy_kernel.compile runKernel.hip.out runKernel.hip.out: runKernel.cpp $(HIPCC) $(OPT) runKernel.cpp -o runKernel.hip.out diff --git a/samples/0_Intro/module_api/runKernel.cpp b/samples/0_Intro/module_api/runKernel.cpp index dcbaa4cd3..1156b8ddb 100644 --- a/samples/0_Intro/module_api/runKernel.cpp +++ b/samples/0_Intro/module_api/runKernel.cpp @@ -27,12 +27,12 @@ THE SOFTWARE. 
#define SIZE LEN<<2 #ifdef __HIP_PLATFORM_HCC__ -#define fileName "vcpy_isa.co" +#define fileName "vcpy_kernel.co" #define kernel_name "ZN12_GLOBAL__N_146_Z11hello_world16grid_launch_parmPfS0__functor19__cxxamp_trampolineEiiiiiiPKfPf" #endif #ifdef __HIP_PLATFORM_NVCC__ -#define fileName "vcpy_isa.ptx" +#define fileName "vcpy_kernel.ptx" #define kernel_name "hello_world" #endif @@ -67,8 +67,8 @@ int main(){ hipModuleGetFunction(&Function, Module, kernel_name); #ifdef __HIP_PLATFORM_HCC__ - uint32_t len = LEN; - uint32_t one = 1; + uint32_t len = LEN; + uint32_t one = 1; std::vectorargBuffer(5); uint32_t *ptr32_t = (uint32_t*)&argBuffer[0]; diff --git a/samples/0_Intro/module_api/vcpy_isa.cu b/samples/0_Intro/module_api/vcpy_isa.cu deleted file mode 100644 index d2a083860..000000000 --- a/samples/0_Intro/module_api/vcpy_isa.cu +++ /dev/null @@ -1,6 +0,0 @@ - -extern "C" __global__ void hello_world(float *a, float *b) -{ - int tx = threadIdx.x; - b[tx] = a[tx]; -} diff --git a/samples/0_Intro/module_api/vcpy_isa.cpp b/samples/0_Intro/module_api/vcpy_kernel.cpp similarity index 91% rename from samples/0_Intro/module_api/vcpy_isa.cpp rename to samples/0_Intro/module_api/vcpy_kernel.cpp index ead325311..640cf5b1b 100644 --- a/samples/0_Intro/module_api/vcpy_isa.cpp +++ b/samples/0_Intro/module_api/vcpy_kernel.cpp @@ -6,4 +6,3 @@ __global__ void hello_world(hipLaunchParm lp, float *a, float *b) b[tx] = a[tx]; } -int main(){} From 49971e8c9e94252f4ceb3d27f8818153a23e6192 Mon Sep 17 00:00:00 2001 From: pensun Date: Sat, 3 Sep 2016 21:06:58 -0500 Subject: [PATCH 062/700] For module_api sample, use vcpy_kernel.cu to generate ptx file for NV path. 
Change-Id: Id0033678834288c4eaa56b12e7d447119be99deb --- samples/0_Intro/module_api/Makefile | 4 ++-- samples/0_Intro/module_api/vcpy_kernel.cu | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 samples/0_Intro/module_api/vcpy_kernel.cu diff --git a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile index 99cff6bc0..2570290f0 100644 --- a/samples/0_Intro/module_api/Makefile +++ b/samples/0_Intro/module_api/Makefile @@ -21,8 +21,8 @@ endif ifeq (${HIP_PLATFORM}, nvcc) -vcpy_kernel.compile: vcpy_kernel.cpp - $(HIPCC) --genco vcpy_kernel.cpp -o vcpy_kernel.ptx +vcpy_kernel.compile: vcpy_kernel.cu + $(HIPCC) --genco vcpy_kernel.cu -o vcpy_kernel.ptx clean: rm -f *.ptx *.out diff --git a/samples/0_Intro/module_api/vcpy_kernel.cu b/samples/0_Intro/module_api/vcpy_kernel.cu new file mode 100644 index 000000000..ead7d7031 --- /dev/null +++ b/samples/0_Intro/module_api/vcpy_kernel.cu @@ -0,0 +1,6 @@ +extern "C" __global__ void hello_world(float *a, float *b) +{ + int tx = threadIdx.x; + b[tx] = a[tx]; + +} From 6618c010b53934758a9244f91fd95d288614cc4b Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Sun, 4 Sep 2016 12:35:08 +0530 Subject: [PATCH 063/700] module_api: workaround to use vcpy_kernel.cpp for NV path Change-Id: Ib4868bf02c64070e846c19427c39289609909466 --- samples/0_Intro/module_api/Makefile | 19 ++++++++----------- samples/0_Intro/module_api/runKernel.cpp | 9 ++++++--- samples/0_Intro/module_api/vcpy_kernel.cpp | 8 +++++++- samples/0_Intro/module_api/vcpy_kernel.cu | 6 ------ 4 files changed, 21 insertions(+), 21 deletions(-) delete mode 100644 samples/0_Intro/module_api/vcpy_kernel.cu diff --git a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile index 2570290f0..bda0a7dd9 100644 --- a/samples/0_Intro/module_api/Makefile +++ b/samples/0_Intro/module_api/Makefile @@ -6,13 +6,15 @@ HIPCC=$(HIP_PATH)/bin/hipcc OPT= HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler) -ifeq 
(${HIP_PLATFORM}, hcc) - GENCODEOBJECT_FLAGS=--target-isa=fiji +all: vcpy_kernel.compile runKernel.hip.out + +runKernel.hip.out: runKernel.cpp + $(HIPCC) $(OPT) runKernel.cpp -o runKernel.hip.out -all: runKernel.hip.out +ifeq (${HIP_PLATFORM}, hcc) vcpy_kernel.compile: vcpy_kernel.cpp - $(HIPCC) --genco $(GENCODEOBJECT_FLAGS) vcpy_kernel.cpp -o vcpy_kernel.co + $(HIPCC) --genco --target-isa=fiji vcpy_kernel.cpp -o vcpy_kernel.co clean: rm -f *.co *.out @@ -21,16 +23,11 @@ endif ifeq (${HIP_PLATFORM}, nvcc) -vcpy_kernel.compile: vcpy_kernel.cu - $(HIPCC) --genco vcpy_kernel.cu -o vcpy_kernel.ptx +vcpy_kernel.compile: vcpy_kernel.cpp + $(HIPCC) --genco vcpy_kernel.cpp -o vcpy_kernel.ptx clean: rm -f *.ptx *.out endif -all: vcpy_kernel.compile runKernel.hip.out - -runKernel.hip.out: runKernel.cpp - $(HIPCC) $(OPT) runKernel.cpp -o runKernel.hip.out - diff --git a/samples/0_Intro/module_api/runKernel.cpp b/samples/0_Intro/module_api/runKernel.cpp index 1156b8ddb..ea372773c 100644 --- a/samples/0_Intro/module_api/runKernel.cpp +++ b/samples/0_Intro/module_api/runKernel.cpp @@ -83,9 +83,12 @@ int main(){ #endif #ifdef __HIP_PLATFORM_NVCC__ - std::vectorargBuffer(2); - memcpy(&argBuffer[0], &Ad, sizeof(void*)); - memcpy(&argBuffer[1], &Bd, sizeof(void*)); + uint32_t one = 1; + std::vectorargBuffer(3); + uint32_t *ptr32_t = (uint32_t*)&argBuffer[0]; + memcpy(ptr32_t + 0, &one, sizeof(uint32_t)); + memcpy(&argBuffer[1], &Ad, sizeof(void*)); + memcpy(&argBuffer[2], &Bd, sizeof(void*)); #endif diff --git a/samples/0_Intro/module_api/vcpy_kernel.cpp b/samples/0_Intro/module_api/vcpy_kernel.cpp index 640cf5b1b..ebb6d066a 100644 --- a/samples/0_Intro/module_api/vcpy_kernel.cpp +++ b/samples/0_Intro/module_api/vcpy_kernel.cpp @@ -1,6 +1,12 @@ #include -__global__ void hello_world(hipLaunchParm lp, float *a, float *b) +#ifdef __HIP_PLATFORM_NVCC__ +#define EXTERN_C extern "C" +#else +#define EXTERN_C +#endif + +EXTERN_C __global__ void hello_world(hipLaunchParm lp, float *a, float 
*b) { int tx = hipThreadIdx_x; b[tx] = a[tx]; diff --git a/samples/0_Intro/module_api/vcpy_kernel.cu b/samples/0_Intro/module_api/vcpy_kernel.cu deleted file mode 100644 index ead7d7031..000000000 --- a/samples/0_Intro/module_api/vcpy_kernel.cu +++ /dev/null @@ -1,6 +0,0 @@ -extern "C" __global__ void hello_world(float *a, float *b) -{ - int tx = threadIdx.x; - b[tx] = a[tx]; - -} From c63944fc080c46fc7f3349b2e1118bde006465b9 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Sun, 4 Sep 2016 13:49:43 +0530 Subject: [PATCH 064/700] module_api sample: no longer need EXTERN_C workaround Change-Id: Ida087d832df8e1f3620b38f920ec2853aad641c8 --- samples/0_Intro/module_api/vcpy_kernel.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/samples/0_Intro/module_api/vcpy_kernel.cpp b/samples/0_Intro/module_api/vcpy_kernel.cpp index ebb6d066a..0e051f76f 100644 --- a/samples/0_Intro/module_api/vcpy_kernel.cpp +++ b/samples/0_Intro/module_api/vcpy_kernel.cpp @@ -1,12 +1,6 @@ -#include +#include "hip/hip_runtime.h" -#ifdef __HIP_PLATFORM_NVCC__ -#define EXTERN_C extern "C" -#else -#define EXTERN_C -#endif - -EXTERN_C __global__ void hello_world(hipLaunchParm lp, float *a, float *b) +extern "C" __global__ void hello_world(hipLaunchParm lp, float *a, float *b) { int tx = hipThreadIdx_x; b[tx] = a[tx]; From b250c5a7b336aec5d0e6a9063c25531690a108c6 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Sun, 4 Sep 2016 16:26:16 +0530 Subject: [PATCH 065/700] module_api: HCC path no longer needs mangled kernel name Change-Id: I4c1cb218bfdd05c9fba57276167e3e4205b93614 --- samples/0_Intro/module_api/runKernel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/0_Intro/module_api/runKernel.cpp b/samples/0_Intro/module_api/runKernel.cpp index ea372773c..1c0dd9f34 100644 --- a/samples/0_Intro/module_api/runKernel.cpp +++ b/samples/0_Intro/module_api/runKernel.cpp @@ -28,7 +28,7 @@ THE SOFTWARE. 
#ifdef __HIP_PLATFORM_HCC__ #define fileName "vcpy_kernel.co" -#define kernel_name "ZN12_GLOBAL__N_146_Z11hello_world16grid_launch_parmPfS0__functor19__cxxamp_trampolineEiiiiiiPKfPf" +#define kernel_name "hello_world" #endif #ifdef __HIP_PLATFORM_NVCC__ From 71736d2ed2eeafa42afcca71837e69ba32b933a2 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Sun, 4 Sep 2016 20:37:29 +0530 Subject: [PATCH 066/700] Removed NVCC check for hipCtxXXX functions in module_api/runKernel.cpp Change-Id: I2bdd4fadf41063ec60626f1850e16f8307ebe6b5 --- samples/0_Intro/module_api/runKernel.cpp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/samples/0_Intro/module_api/runKernel.cpp b/samples/0_Intro/module_api/runKernel.cpp index 1c0dd9f34..8c1ed1598 100644 --- a/samples/0_Intro/module_api/runKernel.cpp +++ b/samples/0_Intro/module_api/runKernel.cpp @@ -47,14 +47,11 @@ int main(){ B[i] = 0.0f; } - -#ifdef __HIP_PLATFORM_NVCC__ hipInit(0); - hipDevice_t device; - hipCtx_t context; - hipDeviceGet(&device, 0); - hipCtxCreate(&context, 0, device); -#endif + hipDevice_t device; + hipCtx_t context; + hipDeviceGet(&device, 0); + hipCtxCreate(&context, 0, device); hipMalloc((void**)&Ad, SIZE); hipMalloc((void**)&Bd, SIZE); @@ -107,9 +104,6 @@ int main(){ std::cout< Date: Sun, 4 Sep 2016 21:25:14 +0530 Subject: [PATCH 067/700] module_api sample: Remove unnecessary platform checks Change-Id: I1d531264d51ff952a3a68d554672b6d293e23379 --- samples/0_Intro/module_api/Makefile | 27 +++++++----------------- samples/0_Intro/module_api/runKernel.cpp | 9 +------- 2 files changed, 9 insertions(+), 27 deletions(-) diff --git a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile index bda0a7dd9..81e876ddb 100644 --- a/samples/0_Intro/module_api/Makefile +++ b/samples/0_Intro/module_api/Makefile @@ -3,31 +3,20 @@ ifeq (,$(HIP_PATH)) HIP_PATH=../../.. 
endif HIPCC=$(HIP_PATH)/bin/hipcc -OPT= HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler) -all: vcpy_kernel.compile runKernel.hip.out - -runKernel.hip.out: runKernel.cpp - $(HIPCC) $(OPT) runKernel.cpp -o runKernel.hip.out - ifeq (${HIP_PLATFORM}, hcc) - -vcpy_kernel.compile: vcpy_kernel.cpp - $(HIPCC) --genco --target-isa=fiji vcpy_kernel.cpp -o vcpy_kernel.co - -clean: - rm -f *.co *.out - + GENCO_FLAGS=--target-isa=fiji endif -ifeq (${HIP_PLATFORM}, nvcc) +all: vcpy_kernel.code runKernel.hip.out -vcpy_kernel.compile: vcpy_kernel.cpp - $(HIPCC) --genco vcpy_kernel.cpp -o vcpy_kernel.ptx +runKernel.hip.out: runKernel.cpp + $(HIPCC) $(HIPCC_FLAGS) $< -o $@ -clean: - rm -f *.ptx *.out +vcpy_kernel.code: vcpy_kernel.cpp + $(HIPCC) --genco $(GENCO_FLAGS) $< -o $@ -endif +clean: + rm -f *.code *.out diff --git a/samples/0_Intro/module_api/runKernel.cpp b/samples/0_Intro/module_api/runKernel.cpp index 8c1ed1598..c6bba63b0 100644 --- a/samples/0_Intro/module_api/runKernel.cpp +++ b/samples/0_Intro/module_api/runKernel.cpp @@ -26,15 +26,8 @@ THE SOFTWARE. 
#define LEN 64 #define SIZE LEN<<2 -#ifdef __HIP_PLATFORM_HCC__ -#define fileName "vcpy_kernel.co" +#define fileName "vcpy_kernel.code" #define kernel_name "hello_world" -#endif - -#ifdef __HIP_PLATFORM_NVCC__ -#define fileName "vcpy_kernel.ptx" -#define kernel_name "hello_world" -#endif int main(){ float *A, *B; From 2145e2ba61b69ec0b275d01df4d3fb6636050f42 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 6 Sep 2016 17:47:10 +0530 Subject: [PATCH 068/700] module_api/Makefile: Update as per newer hipgenisa.sh Change-Id: I479c74eae00d7521434f2740ce5930e326ea05cf --- samples/0_Intro/module_api/Makefile | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile index 81e876ddb..f2c0ce555 100644 --- a/samples/0_Intro/module_api/Makefile +++ b/samples/0_Intro/module_api/Makefile @@ -5,17 +5,13 @@ endif HIPCC=$(HIP_PATH)/bin/hipcc HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler) -ifeq (${HIP_PLATFORM}, hcc) - GENCO_FLAGS=--target-isa=fiji -endif - all: vcpy_kernel.code runKernel.hip.out runKernel.hip.out: runKernel.cpp $(HIPCC) $(HIPCC_FLAGS) $< -o $@ vcpy_kernel.code: vcpy_kernel.cpp - $(HIPCC) --genco $(GENCO_FLAGS) $< -o $@ + $(HIPCC) --genco $(GENCO_FLAGS) $^ -o $@ clean: rm -f *.code *.out From be3fd69bd1f20f74fb7181fd8fdba9b4271c3daa Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 15 Sep 2016 11:40:17 -0500 Subject: [PATCH 069/700] added vimrc for current project 1. Added vimrc config file for HIP 2. 
Corrected square sample indent Change-Id: I3e1d92403571148fe6825db6ad63ad925ae69519 --- samples/0_Intro/square/square.hipref.cpp | 70 ++++++++++++------------ 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp index aa1407773..7ca3a7500 100644 --- a/samples/0_Intro/square/square.hipref.cpp +++ b/samples/0_Intro/square/square.hipref.cpp @@ -31,7 +31,6 @@ THE SOFTWARE. }\ } - /* * Square each element in the array A and write to array C. */ @@ -43,55 +42,54 @@ vector_square(hipLaunchParm lp, T *C_d, const T *A_d, size_t N) size_t stride = hipBlockDim_x * hipGridDim_x ; for (size_t i=offset; i Date: Thu, 22 Sep 2016 12:24:55 -0500 Subject: [PATCH 070/700] Sample improvements. - Enable -O3 for hipDispatchLatency. - Use nearly-null kernel to prevent it from being optimized away. - Formatting for hipDispatchLatency. - Formatting for hipInfo. --- samples/1_Utils/hipDispatchLatency/Makefile | 8 ++-- .../hipDispatchLatency/ResultDatabase.cpp | 14 ++++--- .../hipDispatchLatency/hipDispatchLatency.cpp | 39 +++++++++++-------- samples/1_Utils/hipInfo/hipInfo.cpp | 2 +- 4 files changed, 38 insertions(+), 25 deletions(-) diff --git a/samples/1_Utils/hipDispatchLatency/Makefile b/samples/1_Utils/hipDispatchLatency/Makefile index 387cb9aac..3b69c4a33 100644 --- a/samples/1_Utils/hipDispatchLatency/Makefile +++ b/samples/1_Utils/hipDispatchLatency/Makefile @@ -6,10 +6,12 @@ HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipDispatchLatency -all: install +CXXFLAGS = -O3 -$(EXE): hipDispatchLatency.cpp - $(HIPCC) hipDispatchLatency.cpp ResultDatabase.cpp -o $@ +all: ${EXE} + +$(EXE): hipDispatchLatency.cpp ResultDatabase.cpp + $(HIPCC) $(CXXFLAGS) hipDispatchLatency.cpp ResultDatabase.cpp -o $@ install: $(EXE) cp $(EXE) $(HIP_PATH)/bin diff --git a/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp b/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp index 2ec686f26..d207154e3 100644 --- 
a/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp +++ b/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp @@ -253,10 +253,12 @@ void ResultDatabase::DumpDetailed(ostream &out) out << endl; } - out << endl - << "Note: Any results marked with (*) had missing values." << endl - << " This can occur on systems with a mixture of" << endl - << " device types or architectural capabilities." << endl; + if (0) { + out << endl + << "Note: Any results marked with (*) had missing values." << endl + << " This can occur on systems with a mixture of" << endl + << " device types or architectural capabilities." << endl; + } } @@ -330,9 +332,11 @@ void ResultDatabase::DumpSummary(ostream &out) out << endl; } - out << endl + if (0) { + out << endl << "Note: results marked with (*) had missing values such as" << endl << "might occur with a mixture of architectural capabilities." << endl; + } } // **************************************************************************** diff --git a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp index 1c15ab51d..65e8603a4 100644 --- a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp +++ b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp @@ -30,15 +30,22 @@ if(status != hipSuccess){ \ #define LEN 1024*1024 #define SIZE LEN * sizeof(float) -#define ITER 5120 +#define ITER 10120 -__global__ void One(hipLaunchParm lp, float* Ad){ + +// HCC optimizes away fully NULL kernel calls, so run one that is nearly null: +__global__ void NearlyNull(hipLaunchParm lp, float* Ad){ + if (Ad) { + Ad[0] = 42; + } } + int main(){ hipError_t err; - float *A, *Ad; + float *A; + float *Ad = NULL; A = new float[LEN]; @@ -50,11 +57,10 @@ int main(){ err = hipStreamCreate(&stream); check("Creating stream",err); - err = hipMalloc(&Ad, SIZE); - check("Allocating Ad memory on device", err); - - err = hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); - check("Doing memory copy from A to 
Ad", err); + //err = hipMalloc(&Ad, SIZE); + //check("Allocating Ad memory on device", err); + //err = hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); + //check("Doing memory copy from A to Ad", err); float mS = 0; hipEvent_t start, stop; @@ -63,15 +69,16 @@ int main(){ ResultDatabase resultDB[8]; + hipEventRecord(start); - hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); + hipLaunchKernel(NearlyNull, dim3(LEN/512), dim3(512), 0, 0, Ad); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[0].AddResult(std::string("First Kernel Launch"), "", "uS", mS*1000); // std::cout<<"First Kernel Launch: \t\t"< Date: Thu, 22 Sep 2016 17:51:52 -0500 Subject: [PATCH 071/700] Small tool, doc, sample enhancements. - Expand message when HIP version mismatch detected. - Doc touchup. - change sorting of hipBusBandwidth so byte results shown at top. - Change-Id: Ifb4e44a5fdfb65d59c4994b11e5f13385705f7e0 --- samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp | 14 +++++++++++--- .../1_Utils/hipBusBandwidth/hipBusBandwidth.cpp | 4 ++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp b/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp index 2ec686f26..4be2ea258 100644 --- a/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp +++ b/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp @@ -7,16 +7,22 @@ using namespace std; +#define SORT_RETAIN_ATTS_ORDER 1 + + bool ResultDatabase::Result::operator<(const Result &rhs) const { if (test < rhs.test) return true; if (test > rhs.test) return false; +#if (SORT_RETAIN_ATTS_ORDER == 0) + // For ties, sort by the value of the attribute: if (atts < rhs.atts) return true; if (atts > rhs.atts) return false; +#endif return false; // less-operator returns false on equal } @@ -189,7 +195,8 @@ void ResultDatabase::AddResult(const string &test_orig, void ResultDatabase::DumpDetailed(ostream &out) { vector sorted(results); - sort(sorted.begin(), 
sorted.end()); + + stable_sort(sorted.begin(), sorted.end()); const int testNameW = 24 ; const int attW = 12; @@ -281,7 +288,8 @@ void ResultDatabase::DumpDetailed(ostream &out) void ResultDatabase::DumpSummary(ostream &out) { vector sorted(results); - sort(sorted.begin(), sorted.end()); + + stable_sort(sorted.begin(), sorted.end()); const int testNameW = 24 ; const int attW = 12; @@ -377,7 +385,7 @@ void ResultDatabase::DumpCsv(string fileName) bool emptyFile; vector sorted(results); - sort(sorted.begin(), sorted.end()); + stable_sort(sorted.begin(), sorted.end()); //Check to see if the file is empty - if so, add the headers emptyFile = this->IsFileEmpty(fileName); diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index faff9ba6e..a42a561ac 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -49,8 +49,8 @@ std::string sizeToString(int size) using namespace std; stringstream ss; if (size < 0) { - // char (09, horiz tab) lexically sorts before " " so will cause Byte values to be displayed before kB. - ss << char(0x09)/*tab*/ << setfill('0') << setw(3) << -size << "B"; + // char (-) lexically sorts before " " so will cause Byte values to be displayed before kB. 
+ ss << "+" << setfill('0') << setw(3) << -size << "By"; } else { ss << size << "kB"; } From db7e460626301bd16ea25464aeaef5013badb994 Mon Sep 17 00:00:00 2001 From: sandeep kumar Date: Wed, 7 Sep 2016 17:16:12 +0530 Subject: [PATCH 072/700] Add 2_Cookbook Change-Id: I10bbbd4bcb80a5900fe6af466c8f4c94ea5efe9a --- samples/2_Cookbook/0_MatrixTranspose/Makefile | 36 ++++ .../0_MatrixTranspose/MatrixTranspose.cpp | 137 ++++++++++++++ .../2_Cookbook/0_MatrixTranspose/Readme.md | 100 ++++++++++ samples/2_Cookbook/1_hipEvent/Makefile | 36 ++++ samples/2_Cookbook/1_hipEvent/Readme.md | 74 ++++++++ samples/2_Cookbook/1_hipEvent/hipEvent.cpp | 174 ++++++++++++++++++ samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile | 36 ++++ .../2_HIP_ATP_MARKER/MatrixTranspose.cpp | 174 ++++++++++++++++++ samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md | 51 +++++ samples/2_Cookbook/3_shared_memory/Makefile | 36 ++++ .../3_shared_memory/sharedMemory.cpp | 144 +++++++++++++++ samples/2_Cookbook/4_shfl/Makefile | 36 ++++ samples/2_Cookbook/4_shfl/shfl.cpp | 143 ++++++++++++++ samples/2_Cookbook/5_2dshfl/2dshfl.cpp | 139 ++++++++++++++ samples/2_Cookbook/5_2dshfl/Makefile | 36 ++++ 15 files changed, 1352 insertions(+) create mode 100644 samples/2_Cookbook/0_MatrixTranspose/Makefile create mode 100644 samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp create mode 100644 samples/2_Cookbook/0_MatrixTranspose/Readme.md create mode 100644 samples/2_Cookbook/1_hipEvent/Makefile create mode 100644 samples/2_Cookbook/1_hipEvent/Readme.md create mode 100644 samples/2_Cookbook/1_hipEvent/hipEvent.cpp create mode 100644 samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile create mode 100644 samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp create mode 100644 samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md create mode 100644 samples/2_Cookbook/3_shared_memory/Makefile create mode 100644 samples/2_Cookbook/3_shared_memory/sharedMemory.cpp create mode 100644 samples/2_Cookbook/4_shfl/Makefile create mode 100644 
samples/2_Cookbook/4_shfl/shfl.cpp create mode 100644 samples/2_Cookbook/5_2dshfl/2dshfl.cpp create mode 100644 samples/2_Cookbook/5_2dshfl/Makefile diff --git a/samples/2_Cookbook/0_MatrixTranspose/Makefile b/samples/2_Cookbook/0_MatrixTranspose/Makefile new file mode 100644 index 000000000..ffb442e44 --- /dev/null +++ b/samples/2_Cookbook/0_MatrixTranspose/Makefile @@ -0,0 +1,36 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = MatrixTranspose.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./exe + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o + diff --git a/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp new file mode 100644 index 000000000..c43785f5c --- /dev/null +++ b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp @@ -0,0 +1,137 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include + +// hip header file +#include "hip_runtime.h" + + +#define WIDTH 1024 +#define HEIGHT 1024 + +#define NUM (WIDTH*HEIGHT) + +#define THREADS_PER_BLOCK_X 16 +#define THREADS_PER_BLOCK_Y 16 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width, + const int height) +{ + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + out[y * width + x] = in[x * height + y]; +} + +// CPU implementation of matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width, + const unsigned int height) +{ + for(unsigned int j=0; j < height; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*height + j] = input[j*width + i]; + } + } +} + +int main() { + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + int i; + int errors; + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, 
Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Lauching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT); + + // verify the results + errors = 0; + double eps = 1.0E-6; + for (i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) { + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git a/samples/2_Cookbook/0_MatrixTranspose/Readme.md b/samples/2_Cookbook/0_MatrixTranspose/Readme.md new file mode 100644 index 000000000..b1c0b261b --- /dev/null +++ b/samples/2_Cookbook/0_MatrixTranspose/Readme.md @@ -0,0 +1,100 @@ +## Writing first HIP program ### + +This tutorial shows how to get write simple HIP application. We will write the simplest Matrix Transpose program. + +## HIP Introduction: + +HIP is a C++ runtime API and kernel language that allows developers to create portable applications that can run on AMD and other GPU’s. Our goal was to rise above the lowest-common-denominator paths and deliver a solution that allows you, the developer, to use essential hardware features and maximize your application’s performance on GPU hardware. 
+ +## Requirement: +For hardware requirements and software installation, see [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## Prerequisite knowledge: + +Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You chose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming. + +## Simple Matrix Transpose + +Here is a simple example showing how to write your first program in HIP. +In order to use the HIP framework, we need to add the "hip_runtime.h" header file. Since it is a C++ API, you can add any header file you have been using earlier while writing your c/c++ program. For gpgpu programming, we have the host (microprocessor) and the device (gpu). + +## Device-side code +We will work on the device-side code first. Here is a simple example showing a snippet of HIP device-side code: + +`__global__ void matrixTranspose(hipLaunchParm lp, ` +` float *out, ` +` float *in, ` +` const int width, ` +` const int height) ` +`{ ` +` int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; ` +` int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; ` +` ` +` out[y * width + x] = in[x * height + y]; ` +`} ` + +The `__global__` keyword is a function-type qualifier; it is used with functions that are executed on the device and are called/launched from the host. +Other function-type qualifiers are: +`__device__` functions are executed on the device and called from the device only +`__host__` functions are executed on the host and called from the host + +`__host__` can combine with `__device__`, in which case the function compiles for both the host and device. These functions cannot use the HIP grid coordinate functions (for example, "hipThreadIdx_x", which we will talk about later). A possible workaround is to pass the necessary coordinate info as an argument to the function. +`__host__` cannot combine with `__global__`. 
+ +`__global__` functions are often referred to as *kernels*, and calling one is termed *launching the kernel*. + +The next keyword is `void`. HIP `__global__` functions must have a `void` return type, and the first parameter to a HIP `__global__` function must have the type `hipLaunchParm`, which is for execution configuration. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute. + +After `hipLaunchParm`, the kernel arguments follow (i.e., `float *out, float *in, const int width, const int height`). + +The kernel function begins with +` int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;` +` int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;` +Here hipThreadIdx_x, hipThreadIdx_y and hipThreadIdx_z (not used here) are the built-ins that identify a thread within its block, while hipBlockIdx_x, hipBlockIdx_y and hipBlockIdx_z (not used here) identify the block within the grid. hipBlockDim_x, hipBlockDim_y and hipBlockDim_z (not used here) give the dimensions of the block. + +We are familiar with the rest of the code on the device side. + +## Host-side code + +Now, we'll see how to call the kernel from the host. Inside the main() function, we first define the pointers (for both the host side and the device). The declaration of a device pointer is similar to that of a host pointer. Next, we have `hipDeviceProp_t`, which is the pre-defined struct for hip device properties. This is followed by `hipGetDeviceProperties(&devProp, 0)`, which is used to extract the device information. The first parameter is the struct, the second parameter is the device number to get properties for. The next line prints the name of the device. + +We allocated memory to the Matrix on host side by using malloc and initialized it. 
To allocate memory on the device side we will be using `hipMalloc`, which is quite similar to malloc. After this, we will copy the data to the allocated memory on the device side using `hipMemcpy`. +` hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);` +Here the first parameter is the destination pointer, the second is the source pointer, the third is the size of the memory copy and the last specifies the direction of the memory copy (which in this case is from host to device). To transfer memory from device to host, use `hipMemcpyDeviceToHost`, and for device to device memory copy use `hipMemcpyDeviceToDevice`. + +Now, we'll see how to launch the kernel. +` hipLaunchKernel(matrixTranspose, ` +` dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), ` +` dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), ` +` 0, 0, ` +` gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); ` + +HIP introduces a standard C++ calling convention to pass the execution configuration to the kernel (this convention replaces the `Cuda <<< >>>` syntax). In HIP, +- Kernels launch with the `"hipLaunchKernel"` function +- The first five parameters to hipLaunchKernel are the following: + - **symbol kernelName**: the name of the kernel to launch. To support template kernels which contain "," use the HIP_KERNEL_NAME macro. In the current application it's "matrixTranspose". + - **dim3 gridDim**: 3D-grid dimensions specifying the number of blocks to launch. In the MatrixTranspose sample, it's "dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y)". + - **dim3 blockDim**: 3D-block dimensions specifying the number of threads in each block. In the MatrixTranspose sample, it's "dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y)". + - **size_t dynamicShared**: amount of additional shared memory to allocate when launching the kernel. In the MatrixTranspose sample, it's '0'. + - **hipStream_t**: stream where the kernel should execute. 
A value of 0 corresponds to the NULL stream. In the MatrixTranspose sample, it's '0'. +- Kernel arguments follow these first five parameters. Here, these are "gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT". + +Next, we'll copy the computed values/data back to the host using `hipMemcpy`. Here the last parameter will be `hipMemcpyDeviceToHost`. + +After copying the data from device to host memory, we will verify it against the one we computed with the cpu reference function. + +Finally, we will free the memory allocated earlier by using free() for the host, while for the device we will use `hipFree`. + +## How to build and run: +Use the make command and execute it using ./exe +Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia. + +## More Info: +- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL) +- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/1_hipEvent/Makefile b/samples/2_Cookbook/1_hipEvent/Makefile new file mode 100644 index 000000000..dc0f7db2e --- /dev/null +++ b/samples/2_Cookbook/1_hipEvent/Makefile @@ -0,0 +1,36 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. 
+endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = hipEvent.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./exe + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o + diff --git a/samples/2_Cookbook/1_hipEvent/Readme.md b/samples/2_Cookbook/1_hipEvent/Readme.md new file mode 100644 index 000000000..16285120f --- /dev/null +++ b/samples/2_Cookbook/1_hipEvent/Readme.md @@ -0,0 +1,74 @@ +## Using hipEvents to measure performance ### + +This tutorial is follow-up of the previous one where we learn how to write our first hip program, in which we compute Matrix Transpose. In this tutorial, we'll explain how to use the hipEvent to get the performance score for memory transfer and kernel execution time. + +## Introduction: + +Memory transfer and kernel execution are the most important parameter in parallel computing (specially HPC and machine learning). Memory bottlenecks is the main problem why we are not able to get the highest performance, therefore obtaining the memory transfer timing and kernel execution timing plays key role in application optimization. + +## Requirement: +For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## prerequiste knowledge: + +Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming. + +## Simple Matrix Transpose + +We will be using the Simple Matrix Transpose application from the previous tutorial and modify it to learn how to get the performance score for memory transfer and kernel execution time. 
+ +## hipEvent_t + +We'll learn how to use the event management functionality of the HIP runtime API. In the same source code we used for MatrixTranspose, we will declare the following events: + +` hipEvent_t start, stop;` + +We'll create the events with the help of the following code: + +` hipEventCreate(&start);` +` hipEventCreate(&stop);` + +We'll use the "eventMs" variable to store the time taken value: +` float eventMs = 1.0f;` + +## Time taken measurement by using hipEvents: + +We'll start the timer by calling: +` hipEventRecord(start, NULL);` +Here the first parameter is the hipEvent_t, which will mark the start of the time from which the measurement has to be performed, while the second parameter has to be of the type hipStream_t. In the current situation, we have passed NULL (the default stream). We will learn about the `hipStream_t` in more detail later. + +Now, we'll have the operation for which we need to compute the time taken. For the case of memory transfer, we'll place the `hipMemcpy`: +` hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);` + +and for kernel execution time we'll use `hipLaunchKernel`: +` hipLaunchKernel(matrixTranspose, ` +` dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), ` +` dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), ` +` 0, 0, ` +` gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); ` + +Now to mark the end of the eventRecord, we will again use the hipEventRecord by passing the stop event: +` hipEventRecord(stop, NULL);` + +We will synchronize on the event with the help of: +` hipEventSynchronize(stop);` + +In order to calculate the time taken by measuring the difference between the occurrences marked by the start and stop events, we'll use: +` hipEventElapsedTime(&eventMs, start, stop);` +Here the first parameter will store the time taken value, the second parameter is the starting marker for the event while the third one is marking the end. 
+ +We can print the value of the time taken comfortably since eventMs is a float variable. + +## How to build and run: +Use the make command and execute it using ./exe +Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia. + +## More Info: +- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL) +- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/1_hipEvent/hipEvent.cpp b/samples/2_Cookbook/1_hipEvent/hipEvent.cpp new file mode 100644 index 000000000..b6bc4d1db --- /dev/null +++ b/samples/2_Cookbook/1_hipEvent/hipEvent.cpp @@ -0,0 +1,174 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include + +// hip header file +#include "hip_runtime.h" + +#define WIDTH 1024 +#define HEIGHT 1024 + +#define NUM (WIDTH*HEIGHT) + +#define THREADS_PER_BLOCK_X 16 +#define THREADS_PER_BLOCK_Y 16 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width, + const int height) +{ + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + out[y * width + x] = in[x * height + y]; +} + +// CPU implementation of matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width, + const unsigned int height) +{ + for(unsigned int j=0; j < height; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*height + j] = input[j*width + i]; + } + } +} + +int main() { + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + float eventMs = 1.0f; + + int i; + int errors; + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Record the stop event + hipEventRecord(stop, 
NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyHostToDevice time taken = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Lauching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("kernel Execution time = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT); + + // verify the results + errors = 0; + double eps = 1.0E-6; + for (i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) { + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile b/samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile new file mode 100644 index 000000000..ffb442e44 --- /dev/null +++ b/samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile @@ -0,0 +1,36 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. 
+endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = MatrixTranspose.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./exe + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o + diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp b/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp new file mode 100644 index 000000000..b6bc4d1db --- /dev/null +++ b/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp @@ -0,0 +1,174 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include + +// hip header file +#include "hip_runtime.h" + +#define WIDTH 1024 +#define HEIGHT 1024 + +#define NUM (WIDTH*HEIGHT) + +#define THREADS_PER_BLOCK_X 16 +#define THREADS_PER_BLOCK_Y 16 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width, + const int height) +{ + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + out[y * width + x] = in[x * height + y]; +} + +// CPU implementation of matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width, + const unsigned int height) +{ + for(unsigned int j=0; j < height; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*height + j] = input[j*width + i]; + } + } +} + +int main() { + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + float eventMs = 1.0f; + + int i; + int errors; + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Record the stop event + hipEventRecord(stop, 
NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyHostToDevice time taken = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Lauching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("kernel Execution time = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT); + + // verify the results + errors = 0; + double eps = 1.0E-6; + for (i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) { + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md b/samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md new file mode 100644 index 000000000..2bba31d34 --- /dev/null +++ b/samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md @@ -0,0 +1,51 @@ +## Using hipEvents to measure performance ### + +This tutorial is follow-up of the previous two 
tutorials, where we learned how to write our first hip program, in which we compute Matrix Transpose, and in the second one, we added a feature to measure the time taken for memory transfer and kernel execution. In this tutorial, we won't make any changes to the source code. We'll explain how to use the codexl/rocm-profiler for hip timeline tracing. + + +## Introduction: + +CodeXL and rocm-profiler are tools used for profiling the application, which is of prominent use in optimizing the application by means of finding memory bottlenecks, etc. + +## Requirement: +[CodeXL Installation](http://gpuopen.com/compute-product/codexl/) + +## Prerequisite knowledge: + +Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You chose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming. + +## Simple Matrix Transpose + +We will be using the Simple Matrix Transpose source code from the previous tutorial as it is. + +## Using CodeXL markers for HIP Functions + +HIP can generate markers at function begin/end which are displayed on the CodeXL timeline view. To do this, you need to install ROCm-Profiler and enable HIP to generate the markers: + +1. Install ROCm-Profiler. Installing HIP from the rocm pre-built packages installs the ROCm-Profiler as well. Alternatively, you can build ROCm-Profiler using the instructions given below. + +2. Build HIP with ATP markers enabled. HIP pre-built packages are enabled with ATP marker support by default. To enable ATP marker support when building HIP from source, use the option -DCOMPILE_HIP_ATP_MARKER=1 during the cmake configure step. + +3. Set HIP_ATP_MARKER +`export HIP_ATP_MARKER=1` + +4. Recompile the target application + +5. Run with profiler enabled to generate ATP file. 
+`/opt/rocm/bin/rocm-profiler -o <outputATPFileName> -A <applicationName> <applicationArguments>` + +## Using HIP_TRACE_API + +You can also print the HIP function strings to stderr using the HIP_TRACE_API environment variable. This can also be combined with the more detailed debug information provided by the HIP_DB switch. For example: +`HIP_TRACE_API=1 HIP_DB=0x2 ./myHipApp` +Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors. + +## More Info: +- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL) +- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/3_shared_memory/Makefile b/samples/2_Cookbook/3_shared_memory/Makefile new file mode 100644 index 000000000..5e9ce4721 --- /dev/null +++ b/samples/2_Cookbook/3_shared_memory/Makefile @@ -0,0 +1,36 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. 
+endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = sharedMemory.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./exe + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o + diff --git a/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp b/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp new file mode 100644 index 000000000..1106d454f --- /dev/null +++ b/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp @@ -0,0 +1,144 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include + +// hip header file +#include "hip_runtime.h" + + +#define WIDTH 1024 +#define HEIGHT 1024 + +#define NUM (WIDTH*HEIGHT) + +#define THREADS_PER_BLOCK_X 16 +#define THREADS_PER_BLOCK_Y 16 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width, + const int height) +{ + __shared__ float sharedMem[16*16]; + + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + sharedMem[y * width + x] = in[x * height + y]; + + __syncthreads(); + + out[y * width + x] = sharedMem[y * width + x]; +} + +// CPU implementation of matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width, + const unsigned int height) +{ + for(unsigned int j=0; j < height; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*height + j] = input[j*width + i]; + } + } +} + +int main() { + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + int i; + int errors; + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Lauching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(WIDTH/THREADS_PER_BLOCK_X, 
HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT); + + // verify the results + errors = 0; + double eps = 1.0E-6; + for (i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) { + printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git a/samples/2_Cookbook/4_shfl/Makefile b/samples/2_Cookbook/4_shfl/Makefile new file mode 100644 index 000000000..1d30c7874 --- /dev/null +++ b/samples/2_Cookbook/4_shfl/Makefile @@ -0,0 +1,36 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = shfl.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./exe + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o + diff --git a/samples/2_Cookbook/4_shfl/shfl.cpp b/samples/2_Cookbook/4_shfl/shfl.cpp new file mode 100644 index 000000000..f43809b01 --- /dev/null +++ b/samples/2_Cookbook/4_shfl/shfl.cpp @@ -0,0 +1,143 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include + +// hip header file +#include "hip_runtime.h" + + +#define WIDTH 4 +#define HEIGHT 4 + +#define NUM (WIDTH*HEIGHT) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width, + const int height) +{ + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + //int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + float val = in[x]; + + for(int i=0;i 0 ) { + printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git a/samples/2_Cookbook/5_2dshfl/2dshfl.cpp b/samples/2_Cookbook/5_2dshfl/2dshfl.cpp new file mode 100644 index 000000000..85bc3be2a --- /dev/null +++ b/samples/2_Cookbook/5_2dshfl/2dshfl.cpp @@ -0,0 +1,139 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include + +// hip header file +#include "hip_runtime.h" + + +#define WIDTH 4 +#define HEIGHT 4 + +#define NUM (WIDTH*HEIGHT) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width, + const int height) +{ + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + float val = in[y*width + x]; + + out[x*height + y] = __shfl(val,y*width + x); +} + +// CPU implementation of matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width, + const unsigned int height) +{ + for(unsigned int j=0; j < height; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*height + j] = input[j*width + i]; + } + } +} + +int main() { + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + int i; + int errors; + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + 
+ // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Lauching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(1), + dim3(THREADS_PER_BLOCK_X , THREADS_PER_BLOCK_Y), + 0, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT); + + // verify the results + errors = 0; + double eps = 1.0E-6; + for (i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) { + printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git a/samples/2_Cookbook/5_2dshfl/Makefile b/samples/2_Cookbook/5_2dshfl/Makefile new file mode 100644 index 000000000..502d2948b --- /dev/null +++ b/samples/2_Cookbook/5_2dshfl/Makefile @@ -0,0 +1,36 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. 
+endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = 2dshfl.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./exe + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o + From d25f665837e444f263d84f0228649c8cf1b70157 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 2 Oct 2016 05:57:02 -0500 Subject: [PATCH 073/700] small typo fix Change-Id: I01906b330be8e6ec149bcdfe82def73e15931c89 --- samples/2_Cookbook/1_hipEvent/Readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/2_Cookbook/1_hipEvent/Readme.md b/samples/2_Cookbook/1_hipEvent/Readme.md index 16285120f..bf5df9c76 100644 --- a/samples/2_Cookbook/1_hipEvent/Readme.md +++ b/samples/2_Cookbook/1_hipEvent/Readme.md @@ -17,7 +17,7 @@ Programmers familiar with CUDA, OpenCL will be able to quickly learn and start c We will be using the Simple Matrix Transpose application from the previous tutorial and modify it to learn how to get the performance score for memory transfer and kernel execution time. -## hipEnvent_t +## hipEvent_t We'll learn how to use the event management functionality of HIP runtime api. 
In the same sourcecode, we used for MatrixTranspose we will declare the following events as follows: From 937c0389cb74444a35b256397b892dfee922d01d Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 4 Oct 2016 22:21:10 +0530 Subject: [PATCH 074/700] samples: Updated to use new hip include path Change-Id: I53a1385a17f13a997ea21d14315f15a3ad851dab --- samples/0_Intro/bit_extract/bit_extract.cpp | 2 +- samples/0_Intro/hcc_dialects/vadd_hip.cpp | 2 +- samples/0_Intro/module_api/runKernel.cpp | 4 ++-- samples/0_Intro/square/square.hipref.cpp | 2 +- samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp | 2 +- samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp | 2 +- samples/1_Utils/hipInfo/hipInfo.cpp | 2 +- samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp | 2 +- samples/2_Cookbook/1_hipEvent/hipEvent.cpp | 2 +- samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp | 2 +- samples/2_Cookbook/3_shared_memory/sharedMemory.cpp | 2 +- samples/2_Cookbook/4_shfl/shfl.cpp | 2 +- samples/2_Cookbook/5_2dshfl/2dshfl.cpp | 2 +- samples/7_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp | 2 +- 14 files changed, 15 insertions(+), 15 deletions(-) diff --git a/samples/0_Intro/bit_extract/bit_extract.cpp b/samples/0_Intro/bit_extract/bit_extract.cpp index 746e1012b..06ca34996 100644 --- a/samples/0_Intro/bit_extract/bit_extract.cpp +++ b/samples/0_Intro/bit_extract/bit_extract.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. 
*/ #include #include -#include +#include "hip/hip_runtime.h" #ifdef __HIP_PLATFORM_HCC__ #include #endif diff --git a/samples/0_Intro/hcc_dialects/vadd_hip.cpp b/samples/0_Intro/hcc_dialects/vadd_hip.cpp index 9684bef14..c8f425ff9 100644 --- a/samples/0_Intro/hcc_dialects/vadd_hip.cpp +++ b/samples/0_Intro/hcc_dialects/vadd_hip.cpp @@ -1,4 +1,4 @@ -#include +#include "hip/hip_runtime.h" __global__ void vadd_hip(hipLaunchParm lp, const float *a, const float *b, float *c, int N) { diff --git a/samples/0_Intro/module_api/runKernel.cpp b/samples/0_Intro/module_api/runKernel.cpp index c6bba63b0..5f16677fc 100644 --- a/samples/0_Intro/module_api/runKernel.cpp +++ b/samples/0_Intro/module_api/runKernel.cpp @@ -17,8 +17,8 @@ OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include -#include +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" #include #include #include diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp index 7ca3a7500..2955c6ee3 100644 --- a/samples/0_Intro/square/square.hipref.cpp +++ b/samples/0_Intro/square/square.hipref.cpp @@ -20,7 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ #include -#include +#include "hip/hip_runtime.h" #define CHECK(cmd) \ {\ diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index a42a561ac..7cb3e7908 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include "hip/hip_runtime.h" #include "ResultDatabase.h" diff --git a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp index 65e8603a4..e686c0768 100644 --- a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp +++ b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp @@ -17,7 +17,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include"hip_runtime.h" +#include "hip/hip_runtime.h" #include #include #include"ResultDatabase.h" diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index e14006d86..46741a9c9 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -21,7 +21,7 @@ THE SOFTWARE. */ #include #include -#include "hip_runtime.h" +#include "hip/hip_runtime.h" #define KNRM "\x1B[0m" #define KRED "\x1B[31m" diff --git a/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp index c43785f5c..42445374b 100644 --- a/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp +++ b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. 
#include // hip header file -#include "hip_runtime.h" +#include "hip/hip_runtime.h" #define WIDTH 1024 diff --git a/samples/2_Cookbook/1_hipEvent/hipEvent.cpp b/samples/2_Cookbook/1_hipEvent/hipEvent.cpp index b6bc4d1db..76688a7b0 100644 --- a/samples/2_Cookbook/1_hipEvent/hipEvent.cpp +++ b/samples/2_Cookbook/1_hipEvent/hipEvent.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. #include // hip header file -#include "hip_runtime.h" +#include "hip/hip_runtime.h" #define WIDTH 1024 #define HEIGHT 1024 diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp b/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp index b6bc4d1db..76688a7b0 100644 --- a/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp +++ b/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. #include // hip header file -#include "hip_runtime.h" +#include "hip/hip_runtime.h" #define WIDTH 1024 #define HEIGHT 1024 diff --git a/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp b/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp index 1106d454f..433fada9d 100644 --- a/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp +++ b/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. #include // hip header file -#include "hip_runtime.h" +#include "hip/hip_runtime.h" #define WIDTH 1024 diff --git a/samples/2_Cookbook/4_shfl/shfl.cpp b/samples/2_Cookbook/4_shfl/shfl.cpp index f43809b01..2819b1f04 100644 --- a/samples/2_Cookbook/4_shfl/shfl.cpp +++ b/samples/2_Cookbook/4_shfl/shfl.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. #include // hip header file -#include "hip_runtime.h" +#include "hip/hip_runtime.h" #define WIDTH 4 diff --git a/samples/2_Cookbook/5_2dshfl/2dshfl.cpp b/samples/2_Cookbook/5_2dshfl/2dshfl.cpp index 85bc3be2a..783879b05 100644 --- a/samples/2_Cookbook/5_2dshfl/2dshfl.cpp +++ b/samples/2_Cookbook/5_2dshfl/2dshfl.cpp @@ -22,7 +22,7 @@ THE SOFTWARE. 
#include // hip header file -#include "hip_runtime.h" +#include "hip/hip_runtime.h" #define WIDTH 4 diff --git a/samples/7_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp b/samples/7_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp index 3f20c8a7c..4610a612d 100644 --- a/samples/7_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp +++ b/samples/7_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp @@ -5,7 +5,7 @@ #include // header file for the GPU API -#include +#include "hip/hip_runtime.h" #include #define N (1024 * 500) From 1ec9ecbf9c4d6b48e09f689b853cf81aa6b1f471 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 5 Oct 2016 23:07:14 +0530 Subject: [PATCH 075/700] clang-hipify -> hipify-clang in documentation Change-Id: I86ebc8112477db0d3e09f240beb3f9222d909ee6 --- samples/2_Cookbook/0_MatrixTranspose/Readme.md | 2 +- samples/2_Cookbook/1_hipEvent/Readme.md | 2 +- samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/samples/2_Cookbook/0_MatrixTranspose/Readme.md b/samples/2_Cookbook/0_MatrixTranspose/Readme.md index b1c0b261b..5e9483b59 100644 --- a/samples/2_Cookbook/0_MatrixTranspose/Readme.md +++ b/samples/2_Cookbook/0_MatrixTranspose/Readme.md @@ -95,6 +95,6 @@ Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia - [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) - [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) - [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) -- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [hipify-clang](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/hipify-clang/README.md) - [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) 
- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/1_hipEvent/Readme.md b/samples/2_Cookbook/1_hipEvent/Readme.md index bf5df9c76..e3ec8ad78 100644 --- a/samples/2_Cookbook/1_hipEvent/Readme.md +++ b/samples/2_Cookbook/1_hipEvent/Readme.md @@ -69,6 +69,6 @@ Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia - [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) - [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) - [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) -- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [hipify-clang](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/hipify-clang/README.md) - [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) - [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md b/samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md index 2bba31d34..de1a80057 100644 --- a/samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md +++ b/samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md @@ -46,6 +46,6 @@ Note this trace mode uses colors. 
"less -r" can handle raw control characters an - [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) - [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) - [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) -- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [hipify-clang](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/hipify-clang/README.md) - [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) - [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) From 1eb1c6573fa3e4827b1fc500886006f083481515 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 12 Oct 2016 19:23:48 -0500 Subject: [PATCH 076/700] added copyright to module sample kernel file Change-Id: If57e0761df63c902e1677084ff85106ec49df5de --- samples/0_Intro/module_api/vcpy_kernel.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/samples/0_Intro/module_api/vcpy_kernel.cpp b/samples/0_Intro/module_api/vcpy_kernel.cpp index 0e051f76f..0418cb4b0 100644 --- a/samples/0_Intro/module_api/vcpy_kernel.cpp +++ b/samples/0_Intro/module_api/vcpy_kernel.cpp @@ -1,3 +1,22 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + #include "hip/hip_runtime.h" extern "C" __global__ void hello_world(hipLaunchParm lp, float *a, float *b) From 435c13ef7ddf4c1ec5e0b5f421732d0797986874 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 12 Oct 2016 19:26:59 -0500 Subject: [PATCH 077/700] indent correction for square.cu Change-Id: I2ca008e260b920ac3a503ad2a4bb28cd32300c98 --- samples/0_Intro/square/square.cu | 76 ++++++++++++++++---------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/samples/0_Intro/square/square.cu b/samples/0_Intro/square/square.cu index ec8ca12fb..5f6260df7 100644 --- a/samples/0_Intro/square/square.cu +++ b/samples/0_Intro/square/square.cu @@ -26,9 +26,9 @@ THE SOFTWARE. 
{\ cudaError_t error = cmd;\ if (error != cudaSuccess) { \ - fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \ - exit(EXIT_FAILURE);\ - }\ + fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \ + exit(EXIT_FAILURE);\ + }\ } @@ -43,55 +43,55 @@ vector_square(T *C_d, const T *A_d, size_t N) size_t stride = blockDim.x * gridDim.x ; for (size_t i=offset; i>> (C_d, A_d, N); + printf ("info: launch 'vector_square' kernel\n"); + vector_square <<>> (C_d, A_d, N); - printf ("info: copy Device2Host\n"); + printf ("info: copy Device2Host\n"); CHECK ( cudaMemcpy(C_h, C_d, Nbytes, cudaMemcpyDeviceToHost)); - printf ("info: check result\n"); + printf ("info: check result\n"); for (size_t i=0; i Date: Fri, 14 Oct 2016 23:19:25 -0500 Subject: [PATCH 078/700] Refactor module API test. - Add PASSED/FAIL indication. - Set args using struct rather than void* array. Change-Id: Ic924f88c49cc46979b12b7fef8650081e3b5f58c --- samples/0_Intro/module_api/runKernel.cpp | 55 +++++++++++++++--------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/samples/0_Intro/module_api/runKernel.cpp b/samples/0_Intro/module_api/runKernel.cpp index 5f16677fc..90b081e09 100644 --- a/samples/0_Intro/module_api/runKernel.cpp +++ b/samples/0_Intro/module_api/runKernel.cpp @@ -60,32 +60,37 @@ int main(){ uint32_t len = LEN; uint32_t one = 1; - std::vectorargBuffer(5); - uint32_t *ptr32_t = (uint32_t*)&argBuffer[0]; - memcpy(ptr32_t + 0, &one, sizeof(uint32_t)); - memcpy(ptr32_t + 1, &one, sizeof(uint32_t)); - memcpy(ptr32_t + 2, &one, sizeof(uint32_t)); - memcpy(ptr32_t + 3, &len, sizeof(uint32_t)); - memcpy(ptr32_t + 4, &one, sizeof(uint32_t)); - memcpy(ptr32_t + 5, &one, sizeof(uint32_t)); - memcpy(&argBuffer[3], &Ad, sizeof(void*)); - memcpy(&argBuffer[4], &Bd, sizeof(void*)); + struct { + uint32_t _hidden[6]; + void * _Ad; + void * _Bd; + } args; + + for (int i=0; i<6; i++) { + args._hidden[i] 
= 0; + } + args._Ad = Ad; + args._Bd = Bd; + #endif #ifdef __HIP_PLATFORM_NVCC__ - uint32_t one = 1; - std::vectorargBuffer(3); - uint32_t *ptr32_t = (uint32_t*)&argBuffer[0]; - memcpy(ptr32_t + 0, &one, sizeof(uint32_t)); - memcpy(&argBuffer[1], &Ad, sizeof(void*)); - memcpy(&argBuffer[2], &Bd, sizeof(void*)); + struct { + uint32_t _hidden[1]; + void * _Ad; + void * _Bd; + } args; + + args._hidden[0] = 0; + args._Ad = Ad; + args._Bd = Bd; #endif - size_t size = argBuffer.size()*sizeof(void*); + size_t size = sizeof(args); void *config[] = { - HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0], + HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END }; @@ -93,10 +98,20 @@ int main(){ hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config); hipMemcpyDtoH(B, Bd, SIZE); - for(uint32_t i=LEN-4;i Date: Sat, 15 Oct 2016 23:05:04 +0530 Subject: [PATCH 079/700] samples: Updated copyright header Change-Id: I821f514ced5e34d492cb167b65d7273e26ed7b84 --- samples/0_Intro/bit_extract/bit_extract.cpp | 1 + .../hcc_dialects/vadd_amp_arrayview.cpp | 22 +++++++++++++++++++ samples/0_Intro/hcc_dialects/vadd_hc_am.cpp | 22 +++++++++++++++++++ .../0_Intro/hcc_dialects/vadd_hc_array.cpp | 22 +++++++++++++++++++ samples/0_Intro/hcc_dialects/vadd_hc_array.hc | 22 +++++++++++++++++++ .../hcc_dialects/vadd_hc_arrayview.cpp | 22 +++++++++++++++++++ samples/0_Intro/hcc_dialects/vadd_hip.cpp | 22 +++++++++++++++++++ samples/0_Intro/module_api/runKernel.cpp | 15 ++++++++----- samples/0_Intro/module_api/vcpy_kernel.cpp | 3 +++ samples/0_Intro/square/square.cu | 1 + samples/0_Intro/square/square.hipref.cpp | 1 + .../hipDispatchLatency/hipDispatchLatency.cpp | 3 +++ samples/1_Utils/hipInfo/hipInfo.cpp | 1 + .../0_MatrixTranspose/MatrixTranspose.cpp | 1 + samples/2_Cookbook/1_hipEvent/hipEvent.cpp | 1 + .../2_HIP_ATP_MARKER/MatrixTranspose.cpp | 1 + .../3_shared_memory/sharedMemory.cpp | 1 + samples/2_Cookbook/4_shfl/shfl.cpp | 1 + 
samples/2_Cookbook/5_2dshfl/2dshfl.cpp | 1 + 19 files changed, 157 insertions(+), 6 deletions(-) diff --git a/samples/0_Intro/bit_extract/bit_extract.cpp b/samples/0_Intro/bit_extract/bit_extract.cpp index 06ca34996..1535d2bd9 100644 --- a/samples/0_Intro/bit_extract/bit_extract.cpp +++ b/samples/0_Intro/bit_extract/bit_extract.cpp @@ -19,6 +19,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + #include #include #include "hip/hip_runtime.h" diff --git a/samples/0_Intro/hcc_dialects/vadd_amp_arrayview.cpp b/samples/0_Intro/hcc_dialects/vadd_amp_arrayview.cpp index 485b64f68..a3162bccb 100644 --- a/samples/0_Intro/hcc_dialects/vadd_amp_arrayview.cpp +++ b/samples/0_Intro/hcc_dialects/vadd_amp_arrayview.cpp @@ -1,3 +1,25 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + // Simple test showing how to use C++AMP syntax with array_view. // The code uses AMP's array_view class, which provides automatic data synchronization // of data between the host and the accelerator. As noted below, the HCC runtime diff --git a/samples/0_Intro/hcc_dialects/vadd_hc_am.cpp b/samples/0_Intro/hcc_dialects/vadd_hc_am.cpp index 53a137f74..c83051da2 100644 --- a/samples/0_Intro/hcc_dialects/vadd_hc_am.cpp +++ b/samples/0_Intro/hcc_dialects/vadd_hc_am.cpp @@ -1,3 +1,25 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + // Simple test showing how to use HC syntax with AM (accelerator memory). // AM provides a set of c-style memory management routines for allocating, // freeing, and copying memory. 
am_alloc returns a device pointer diff --git a/samples/0_Intro/hcc_dialects/vadd_hc_array.cpp b/samples/0_Intro/hcc_dialects/vadd_hc_array.cpp index bda3adf37..b076b926e 100644 --- a/samples/0_Intro/hcc_dialects/vadd_hc_array.cpp +++ b/samples/0_Intro/hcc_dialects/vadd_hc_array.cpp @@ -1,3 +1,25 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + // Simple test showing how to use HC syntax with array. // Array provides a type-safe C++ mechanism to allocate accelerator memory. // Like array_view, hc::array provides multi-dimensional indexing capability, diff --git a/samples/0_Intro/hcc_dialects/vadd_hc_array.hc b/samples/0_Intro/hcc_dialects/vadd_hc_array.hc index d57b9a7e1..9ed016c7a 100644 --- a/samples/0_Intro/hcc_dialects/vadd_hc_array.hc +++ b/samples/0_Intro/hcc_dialects/vadd_hc_array.hc @@ -1,3 +1,25 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + #include int main(int argc, char *argv[]) diff --git a/samples/0_Intro/hcc_dialects/vadd_hc_arrayview.cpp b/samples/0_Intro/hcc_dialects/vadd_hc_arrayview.cpp index 2585f4700..15f5de4ab 100644 --- a/samples/0_Intro/hcc_dialects/vadd_hc_arrayview.cpp +++ b/samples/0_Intro/hcc_dialects/vadd_hc_arrayview.cpp @@ -1,3 +1,25 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + // Simple test showing how to use HC syntax with array_view. // The code uses AMP's array_view class, which provides automatic data synchronization // of data between the host and the accelerator. As noted below, the HCC runtime diff --git a/samples/0_Intro/hcc_dialects/vadd_hip.cpp b/samples/0_Intro/hcc_dialects/vadd_hip.cpp index c8f425ff9..f2afa378e 100644 --- a/samples/0_Intro/hcc_dialects/vadd_hip.cpp +++ b/samples/0_Intro/hcc_dialects/vadd_hip.cpp @@ -1,3 +1,25 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + #include "hip/hip_runtime.h" __global__ void vadd_hip(hipLaunchParm lp, const float *a, const float *b, float *c, int N) diff --git a/samples/0_Intro/module_api/runKernel.cpp b/samples/0_Intro/module_api/runKernel.cpp index 90b081e09..b91507aaa 100644 --- a/samples/0_Intro/module_api/runKernel.cpp +++ b/samples/0_Intro/module_api/runKernel.cpp @@ -1,19 +1,22 @@ /* Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR -IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ diff --git a/samples/0_Intro/module_api/vcpy_kernel.cpp b/samples/0_Intro/module_api/vcpy_kernel.cpp index 0418cb4b0..0375eee34 100644 --- a/samples/0_Intro/module_api/vcpy_kernel.cpp +++ b/samples/0_Intro/module_api/vcpy_kernel.cpp @@ -1,13 +1,16 @@ /* Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE diff --git a/samples/0_Intro/square/square.cu b/samples/0_Intro/square/square.cu index 5f6260df7..82b31db14 100644 --- a/samples/0_Intro/square/square.cu +++ b/samples/0_Intro/square/square.cu @@ -19,6 +19,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + #include #include diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp index 2955c6ee3..3c863b8b7 100644 --- a/samples/0_Intro/square/square.hipref.cpp +++ b/samples/0_Intro/square/square.hipref.cpp @@ -19,6 +19,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + #include #include "hip/hip_runtime.h" diff --git a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp index e686c0768..b343386b5 100644 --- a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp +++ b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp @@ -1,13 +1,16 @@ /* Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index 46741a9c9..0403162bd 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -19,6 +19,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + #include #include #include "hip/hip_runtime.h" diff --git a/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp index 42445374b..91733c025 100644 --- a/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp +++ b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp @@ -19,6 +19,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + #include // hip header file diff --git a/samples/2_Cookbook/1_hipEvent/hipEvent.cpp b/samples/2_Cookbook/1_hipEvent/hipEvent.cpp index 76688a7b0..1abe1180d 100644 --- a/samples/2_Cookbook/1_hipEvent/hipEvent.cpp +++ b/samples/2_Cookbook/1_hipEvent/hipEvent.cpp @@ -19,6 +19,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + #include // hip header file diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp b/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp index 76688a7b0..1abe1180d 100644 --- a/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp +++ b/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp @@ -19,6 +19,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + #include // hip header file diff --git a/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp b/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp index 433fada9d..9950b8d02 100644 --- a/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp +++ b/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp @@ -19,6 +19,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + #include // hip header file diff --git a/samples/2_Cookbook/4_shfl/shfl.cpp b/samples/2_Cookbook/4_shfl/shfl.cpp index 2819b1f04..07d5cd42d 100644 --- a/samples/2_Cookbook/4_shfl/shfl.cpp +++ b/samples/2_Cookbook/4_shfl/shfl.cpp @@ -19,6 +19,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + #include // hip header file diff --git a/samples/2_Cookbook/5_2dshfl/2dshfl.cpp b/samples/2_Cookbook/5_2dshfl/2dshfl.cpp index 783879b05..16e5c7489 100644 --- a/samples/2_Cookbook/5_2dshfl/2dshfl.cpp +++ b/samples/2_Cookbook/5_2dshfl/2dshfl.cpp @@ -19,6 +19,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + #include // hip header file From 04af19866f9e2ce1b7d168f18058a517c76aeb8b Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Fri, 14 Oct 2016 18:00:26 +0530 Subject: [PATCH 080/700] Add more apps to 2_Cookbook Change-Id: Iafe462df9726a32f450bd240a2de3eaa73a10057 --- samples/2_Cookbook/0_MatrixTranspose/Makefile | 2 +- .../0_MatrixTranspose/MatrixTranspose.cpp | 28 ++-- samples/2_Cookbook/1_hipEvent/Makefile | 2 +- samples/2_Cookbook/1_hipEvent/hipEvent.cpp | 27 ++-- samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile | 2 +- .../2_HIP_ATP_MARKER/MatrixTranspose.cpp | 27 ++-- samples/2_Cookbook/3_shared_memory/Makefile | 2 +- samples/2_Cookbook/3_shared_memory/Readme.md | 42 +++++ .../3_shared_memory/sharedMemory.cpp | 31 ++-- samples/2_Cookbook/4_shfl/Makefile | 4 +- samples/2_Cookbook/4_shfl/Readme.md | 51 ++++++ samples/2_Cookbook/4_shfl/shfl.cpp | 23 ++- samples/2_Cookbook/5_2dshfl/2dshfl.cpp | 21 ++- samples/2_Cookbook/5_2dshfl/Makefile | 2 +- samples/2_Cookbook/5_2dshfl/Readme.md | 51 ++++++ samples/2_Cookbook/6_dynamic_shared/Makefile | 36 +++++ samples/2_Cookbook/6_dynamic_shared/Readme.md | 47 ++++++ 
.../6_dynamic_shared/dynamic_shared.cpp | 141 +++++++++++++++++ samples/2_Cookbook/7_streams/Makefile | 36 +++++ samples/2_Cookbook/7_streams/Readme.md | 57 +++++++ samples/2_Cookbook/7_streams/stream.cpp | 148 ++++++++++++++++++ 21 files changed, 686 insertions(+), 94 deletions(-) create mode 100644 samples/2_Cookbook/3_shared_memory/Readme.md create mode 100644 samples/2_Cookbook/4_shfl/Readme.md create mode 100644 samples/2_Cookbook/5_2dshfl/Readme.md create mode 100644 samples/2_Cookbook/6_dynamic_shared/Makefile create mode 100644 samples/2_Cookbook/6_dynamic_shared/Readme.md create mode 100644 samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp create mode 100644 samples/2_Cookbook/7_streams/Makefile create mode 100644 samples/2_Cookbook/7_streams/Readme.md create mode 100644 samples/2_Cookbook/7_streams/stream.cpp diff --git a/samples/2_Cookbook/0_MatrixTranspose/Makefile b/samples/2_Cookbook/0_MatrixTranspose/Makefile index ffb442e44..d3630a1c1 100644 --- a/samples/2_Cookbook/0_MatrixTranspose/Makefile +++ b/samples/2_Cookbook/0_MatrixTranspose/Makefile @@ -10,7 +10,7 @@ TARGET=hcc SOURCES = MatrixTranspose.cpp OBJECTS = $(SOURCES:.cpp=.o) -EXECUTABLE=./exe +EXECUTABLE=./MatrixTranspose .PHONY: test diff --git a/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp index 91733c025..264fcbed5 100644 --- a/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp +++ b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp @@ -27,12 +27,12 @@ THE SOFTWARE. #define WIDTH 1024 -#define HEIGHT 1024 -#define NUM (WIDTH*HEIGHT) -#define THREADS_PER_BLOCK_X 16 -#define THREADS_PER_BLOCK_Y 16 +#define NUM (WIDTH*WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 #define THREADS_PER_BLOCK_Z 1 // Device (Kernel) function, it must be void @@ -40,27 +40,25 @@ THE SOFTWARE. 
__global__ void matrixTranspose(hipLaunchParm lp, float *out, float *in, - const int width, - const int height) + const int width) { int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - out[y * width + x] = in[x * height + y]; + out[y * width + x] = in[x * width + y]; } // CPU implementation of matrix transpose void matrixTransposeCPUReference( float * output, float * input, - const unsigned int width, - const unsigned int height) + const unsigned int width) { - for(unsigned int j=0; j < height; j++) + for(unsigned int j=0; j < width; j++) { for(unsigned int i=0; i < width; i++) { - output[i*height + j] = input[j*width + i]; + output[i*width + j] = input[j*width + i]; } } } @@ -100,22 +98,22 @@ int main() { // Lauching kernel from host hipLaunchKernel(matrixTranspose, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, - gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); + gpuTransposeMatrix , gpuMatrix, WIDTH); // Memory transfer from device to host hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); // CPU MatrixTranspose computation - matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT); + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); // verify the results errors = 0; double eps = 1.0E-6; for (i = 0; i < NUM; i++) { - if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { errors++; } } diff --git a/samples/2_Cookbook/1_hipEvent/Makefile b/samples/2_Cookbook/1_hipEvent/Makefile index dc0f7db2e..be4bc2169 100644 --- a/samples/2_Cookbook/1_hipEvent/Makefile +++ b/samples/2_Cookbook/1_hipEvent/Makefile @@ -10,7 +10,7 @@ TARGET=hcc SOURCES = hipEvent.cpp OBJECTS = $(SOURCES:.cpp=.o) -EXECUTABLE=./exe +EXECUTABLE=./hipEvent .PHONY: test 
diff --git a/samples/2_Cookbook/1_hipEvent/hipEvent.cpp b/samples/2_Cookbook/1_hipEvent/hipEvent.cpp index 1abe1180d..f2aea146e 100644 --- a/samples/2_Cookbook/1_hipEvent/hipEvent.cpp +++ b/samples/2_Cookbook/1_hipEvent/hipEvent.cpp @@ -26,12 +26,11 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #define WIDTH 1024 -#define HEIGHT 1024 -#define NUM (WIDTH*HEIGHT) +#define NUM (WIDTH*WIDTH) -#define THREADS_PER_BLOCK_X 16 -#define THREADS_PER_BLOCK_Y 16 +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 #define THREADS_PER_BLOCK_Z 1 // Device (Kernel) function, it must be void @@ -39,27 +38,25 @@ THE SOFTWARE. __global__ void matrixTranspose(hipLaunchParm lp, float *out, float *in, - const int width, - const int height) + const int width) { int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - out[y * width + x] = in[x * height + y]; + out[y * width + x] = in[x * width + y]; } // CPU implementation of matrix transpose void matrixTransposeCPUReference( float * output, float * input, - const unsigned int width, - const unsigned int height) + const unsigned int width) { - for(unsigned int j=0; j < height; j++) + for(unsigned int j=0; j < width; j++) { for(unsigned int i=0; i < width; i++) { - output[i*height + j] = input[j*width + i]; + output[i*width + j] = input[j*width + i]; } } } @@ -118,10 +115,10 @@ int main() { // Lauching kernel from host hipLaunchKernel(matrixTranspose, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, - gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); + gpuTransposeMatrix , gpuMatrix, WIDTH); // Record the stop event hipEventRecord(stop, NULL); @@ -146,13 +143,13 @@ int main() { printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs); // CPU MatrixTranspose computation - matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, 
HEIGHT); + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); // verify the results errors = 0; double eps = 1.0E-6; for (i = 0; i < NUM; i++) { - if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { errors++; } } diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile b/samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile index ffb442e44..d3630a1c1 100644 --- a/samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile +++ b/samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile @@ -10,7 +10,7 @@ TARGET=hcc SOURCES = MatrixTranspose.cpp OBJECTS = $(SOURCES:.cpp=.o) -EXECUTABLE=./exe +EXECUTABLE=./MatrixTranspose .PHONY: test diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp b/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp index 1abe1180d..f2aea146e 100644 --- a/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp +++ b/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp @@ -26,12 +26,11 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #define WIDTH 1024 -#define HEIGHT 1024 -#define NUM (WIDTH*HEIGHT) +#define NUM (WIDTH*WIDTH) -#define THREADS_PER_BLOCK_X 16 -#define THREADS_PER_BLOCK_Y 16 +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 #define THREADS_PER_BLOCK_Z 1 // Device (Kernel) function, it must be void @@ -39,27 +38,25 @@ THE SOFTWARE. 
__global__ void matrixTranspose(hipLaunchParm lp, float *out, float *in, - const int width, - const int height) + const int width) { int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - out[y * width + x] = in[x * height + y]; + out[y * width + x] = in[x * width + y]; } // CPU implementation of matrix transpose void matrixTransposeCPUReference( float * output, float * input, - const unsigned int width, - const unsigned int height) + const unsigned int width) { - for(unsigned int j=0; j < height; j++) + for(unsigned int j=0; j < width; j++) { for(unsigned int i=0; i < width; i++) { - output[i*height + j] = input[j*width + i]; + output[i*width + j] = input[j*width + i]; } } } @@ -118,10 +115,10 @@ int main() { // Lauching kernel from host hipLaunchKernel(matrixTranspose, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, - gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); + gpuTransposeMatrix , gpuMatrix, WIDTH); // Record the stop event hipEventRecord(stop, NULL); @@ -146,13 +143,13 @@ int main() { printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs); // CPU MatrixTranspose computation - matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT); + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); // verify the results errors = 0; double eps = 1.0E-6; for (i = 0; i < NUM; i++) { - if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { errors++; } } diff --git a/samples/2_Cookbook/3_shared_memory/Makefile b/samples/2_Cookbook/3_shared_memory/Makefile index 5e9ce4721..24aafcd15 100644 --- a/samples/2_Cookbook/3_shared_memory/Makefile +++ b/samples/2_Cookbook/3_shared_memory/Makefile @@ -10,7 +10,7 @@ TARGET=hcc SOURCES = sharedMemory.cpp OBJECTS = $(SOURCES:.cpp=.o) 
-EXECUTABLE=./exe +EXECUTABLE=./sharedMemory .PHONY: test diff --git a/samples/2_Cookbook/3_shared_memory/Readme.md b/samples/2_Cookbook/3_shared_memory/Readme.md new file mode 100644 index 000000000..6b9393397 --- /dev/null +++ b/samples/2_Cookbook/3_shared_memory/Readme.md @@ -0,0 +1,42 @@ +## Using shared memory ### + +Earlier we learned how to write our first HIP program, in which we compute Matrix Transpose. In this tutorial, we'll explain how to use the shared memory to improve the performance. + +## Introduction: + +As we mentioned earlier, memory bottlenecks are the main reason we are not able to get the highest performance; therefore, minimizing the latency of memory access plays a prominent role in application optimization. In this tutorial, we'll learn how to use static shared memory and will explain the dynamic one later. + +## Requirement: +For hardware requirements and software installation, see [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## Prerequisite knowledge: + +Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You chose to start with the best one. We'll be explaining everything assuming you are completely new to GPGPU programming. + +## Simple Matrix Transpose + +We will be using the Simple Matrix Transpose application from the previous tutorial and modify it to learn how to use shared memory. + +## Shared Memory + +Shared memory is much faster than global and constant memory and is accessible to all the threads in the block. If the size of shared memory is known at compile time, we can specify the size and will use the static shared memory. 
In the same source code, we will use the `__shared__` variable type qualifier as follows: + +` __shared__ float sharedMem[WIDTH*WIDTH];` + +Be careful while using shared memory: since all threads within the block can access the shared memory, we need to synchronize the operation of individual threads by using: + +` __syncthreads();` + +## How to build and run: +Use the make command and execute it using ./sharedMemory +Use hipcc to build the application, which is using hcc on AMD and nvcc on NVIDIA. + +## More Info: +- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL) +- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp b/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp index 9950b8d02..9b51aba44 100644 --- a/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp +++ b/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp @@ -26,13 +26,12 @@ THE SOFTWARE. 
#include "hip/hip_runtime.h" -#define WIDTH 1024 -#define HEIGHT 1024 +#define WIDTH 64 -#define NUM (WIDTH*HEIGHT) +#define NUM (WIDTH*WIDTH) -#define THREADS_PER_BLOCK_X 16 -#define THREADS_PER_BLOCK_Y 16 +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 #define THREADS_PER_BLOCK_Z 1 // Device (Kernel) function, it must be void @@ -40,15 +39,14 @@ THE SOFTWARE. __global__ void matrixTranspose(hipLaunchParm lp, float *out, float *in, - const int width, - const int height) + const int width) { - __shared__ float sharedMem[16*16]; + __shared__ float sharedMem[WIDTH*WIDTH]; int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - sharedMem[y * width + x] = in[x * height + y]; + sharedMem[y * width + x] = in[x * width + y]; __syncthreads(); @@ -59,14 +57,13 @@ __global__ void matrixTranspose(hipLaunchParm lp, void matrixTransposeCPUReference( float * output, float * input, - const unsigned int width, - const unsigned int height) + const unsigned int width) { - for(unsigned int j=0; j < height; j++) + for(unsigned int j=0; j < width; j++) { for(unsigned int i=0; i < width; i++) { - output[i*height + j] = input[j*width + i]; + output[i*width + j] = input[j*width + i]; } } } @@ -106,22 +103,22 @@ int main() { // Lauching kernel from host hipLaunchKernel(matrixTranspose, - dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, - gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); + gpuTransposeMatrix , gpuMatrix, WIDTH); // Memory transfer from device to host hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); // CPU MatrixTranspose computation - matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT); + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); // verify the results errors = 0; double eps = 1.0E-6; for (i = 0; i < 
NUM; i++) { - if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]); errors++; } diff --git a/samples/2_Cookbook/4_shfl/Makefile b/samples/2_Cookbook/4_shfl/Makefile index 1d30c7874..3383cf2bf 100644 --- a/samples/2_Cookbook/4_shfl/Makefile +++ b/samples/2_Cookbook/4_shfl/Makefile @@ -10,7 +10,7 @@ TARGET=hcc SOURCES = shfl.cpp OBJECTS = $(SOURCES:.cpp=.o) -EXECUTABLE=./exe +EXECUTABLE=./shfl .PHONY: test @@ -22,7 +22,7 @@ CXX=$(HIPCC) $(EXECUTABLE): $(OBJECTS) - $(HIPCC) $(OBJECTS) -o $@ + $(HIPCC) $(OBJECTS) -o $@ test: $(EXECUTABLE) diff --git a/samples/2_Cookbook/4_shfl/Readme.md b/samples/2_Cookbook/4_shfl/Readme.md new file mode 100644 index 000000000..da6290185 --- /dev/null +++ b/samples/2_Cookbook/4_shfl/Readme.md @@ -0,0 +1,51 @@ +## Warp shfl operations ### + +In this tutorial, we'll explain how to use the warp shfl operations to improve the performance. + +## Introduction: + +Let's talk about Warp first. The kernel code is executed in groups of fixed number of threads known as Warp. For nvidia WarpSize is 32 while for AMD, 32 for Polaris architecture and 64 for rest. Threads in a warp are referred to as lanes and are numbered from 0 to warpSize -1. With the help of shfl ops, we can directly exchange values of variable between threads without using any memory ops within a warp. 
There are four types of shfl ops: +` int __shfl (int var, int srcLane, int width=warpSize); ` +` float __shfl (float var, int srcLane, int width=warpSize); ` +` int __shfl_up (int var, unsigned int delta, int width=warpSize); ` +` float __shfl_up (float var, unsigned int delta, int width=warpSize); ` +` int __shfl_down (int var, unsigned int delta, int width=warpSize); ` +` float __shfl_down (float var, unsigned int delta, int width=warpSize); ` +` int __shfl_xor (int var, int laneMask, int width=warpSize); ` +` float __shfl_xor (float var, int laneMask, int width=warpSize); ` + +## Requirement: +For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## Prerequisite knowledge: + +Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming. + +## Simple Matrix Transpose + +We will be using the Simple Matrix Transpose application from the previous tutorial and modify it to learn how to use warp shfl operations. + +## __shfl ops + +In this tutorial, we'll use `__shfl()` ops. In the same source code that we used for MatrixTranspose, we'll add the following: + +` out[i*width + j] = __shfl(val,j*width + i);` + +Be careful while using shfl operations, since exchanges are possible only between the threads of the same warp. + +## How to build and run: +Use the make command and execute it using ./shfl +Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia. + +## Requirement for nvidia +Please make sure you have a device of compute capability 3.0 or higher in order to use warp shfl operations, and add the `-gencode arch=compute_30,code=sm_30` nvcc flag in the Makefile while using this application. 
+ +## More Info: +- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL) +- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/4_shfl/shfl.cpp b/samples/2_Cookbook/4_shfl/shfl.cpp index 07d5cd42d..e0f4c2120 100644 --- a/samples/2_Cookbook/4_shfl/shfl.cpp +++ b/samples/2_Cookbook/4_shfl/shfl.cpp @@ -27,9 +27,8 @@ THE SOFTWARE. #define WIDTH 4 -#define HEIGHT 4 -#define NUM (WIDTH*HEIGHT) +#define NUM (WIDTH*WIDTH) #define THREADS_PER_BLOCK_X 4 #define THREADS_PER_BLOCK_Y 4 @@ -40,17 +39,16 @@ THE SOFTWARE. 
__global__ void matrixTranspose(hipLaunchParm lp, float *out, float *in, - const int width, - const int height) + const int width) { int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - //int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + float val = in[x]; for(int i=0;i 0 ) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]); errors++; } diff --git a/samples/2_Cookbook/5_2dshfl/2dshfl.cpp b/samples/2_Cookbook/5_2dshfl/2dshfl.cpp index 16e5c7489..1b22a0c29 100644 --- a/samples/2_Cookbook/5_2dshfl/2dshfl.cpp +++ b/samples/2_Cookbook/5_2dshfl/2dshfl.cpp @@ -27,9 +27,8 @@ THE SOFTWARE. #define WIDTH 4 -#define HEIGHT 4 -#define NUM (WIDTH*HEIGHT) +#define NUM (WIDTH*WIDTH) #define THREADS_PER_BLOCK_X 4 #define THREADS_PER_BLOCK_Y 4 @@ -40,28 +39,26 @@ THE SOFTWARE. __global__ void matrixTranspose(hipLaunchParm lp, float *out, float *in, - const int width, - const int height) + const int width) { int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; float val = in[y*width + x]; - out[x*height + y] = __shfl(val,y*width + x); + out[x*width + y] = __shfl(val,y*width + x); } // CPU implementation of matrix transpose void matrixTransposeCPUReference( float * output, float * input, - const unsigned int width, - const unsigned int height) + const unsigned int width) { - for(unsigned int j=0; j < height; j++) + for(unsigned int j=0; j < width; j++) { for(unsigned int i=0; i < width; i++) { - output[i*height + j] = input[j*width + i]; + output[i*width + j] = input[j*width + i]; } } } @@ -104,19 +101,19 @@ int main() { dim3(1), dim3(THREADS_PER_BLOCK_X , THREADS_PER_BLOCK_Y), 0, 0, - gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); + gpuTransposeMatrix , gpuMatrix, WIDTH); // Memory transfer from device to host hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); // CPU 
MatrixTranspose computation - matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT); + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); // verify the results errors = 0; double eps = 1.0E-6; for (i = 0; i < NUM; i++) { - if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) { printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]); errors++; } diff --git a/samples/2_Cookbook/5_2dshfl/Makefile b/samples/2_Cookbook/5_2dshfl/Makefile index 502d2948b..b742bbf80 100644 --- a/samples/2_Cookbook/5_2dshfl/Makefile +++ b/samples/2_Cookbook/5_2dshfl/Makefile @@ -10,7 +10,7 @@ TARGET=hcc SOURCES = 2dshfl.cpp OBJECTS = $(SOURCES:.cpp=.o) -EXECUTABLE=./exe +EXECUTABLE=./2dshfl .PHONY: test diff --git a/samples/2_Cookbook/5_2dshfl/Readme.md b/samples/2_Cookbook/5_2dshfl/Readme.md new file mode 100644 index 000000000..fba114152 --- /dev/null +++ b/samples/2_Cookbook/5_2dshfl/Readme.md @@ -0,0 +1,51 @@ +## Warp shfl operations in 2D ### + +This tutorial is follow-up of the previous tutorial, where we learned how to use shfl ops. In this tutorial, we'll explain how to scale similar kind of operations to multi-dimensional space by using previous tutorial source-code. + +## Introduction: + +Let's talk about Warp first. The kernel code is executed in groups of fixed number of threads known as Warp. For nvidia WarpSize is 32 while for AMD, 32 for Polaris architecture and 64 for rest. Threads in a warp are referred to as lanes and are numbered from 0 to warpSize -1. With the help of shfl ops, we can directly exchange values of variable between threads without using any memory ops within a warp. 
There are four types of shfl ops: +` int __shfl (int var, int srcLane, int width=warpSize); ` +` float __shfl (float var, int srcLane, int width=warpSize); ` +` int __shfl_up (int var, unsigned int delta, int width=warpSize); ` +` float __shfl_up (float var, unsigned int delta, int width=warpSize); ` +` int __shfl_down (int var, unsigned int delta, int width=warpSize); ` +` float __shfl_down (float var, unsigned int delta, int width=warpSize); ` +` int __shfl_xor (int var, int laneMask, int width=warpSize); ` +` float __shfl_xor (float var, int laneMask, int width=warpSize); ` + +## Requirement: +For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## Prerequisite knowledge: + +Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming. + +## Simple Matrix Transpose + +We will be using the Simple Matrix Transpose application from the previous tutorial and modify it to learn how to use warp shfl operations in 2D. + +## __shfl ops in 2D + +In the same source code that we used for MatrixTranspose, we'll add the following: +` int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; ` +` out[x*width + y] = __shfl(val,y*width + x); ` + +With the help of this application, we can say that kernel code can be converted into multi-dimensional threads with ease. + +## How to build and run: +Use the make command and execute it using ./2dshfl +Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia. + +## Requirement for nvidia +Please make sure you have a device of compute capability 3.0 or higher in order to use warp shfl operations, and add the `-gencode arch=compute_30,code=sm_30` nvcc flag in the Makefile while using this application. 
+ +## More Info: +- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL) +- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/6_dynamic_shared/Makefile b/samples/2_Cookbook/6_dynamic_shared/Makefile new file mode 100644 index 000000000..5d867a58c --- /dev/null +++ b/samples/2_Cookbook/6_dynamic_shared/Makefile @@ -0,0 +1,36 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = dynamic_shared.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./dynamic_shared + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o + diff --git a/samples/2_Cookbook/6_dynamic_shared/Readme.md b/samples/2_Cookbook/6_dynamic_shared/Readme.md new file mode 100644 index 000000000..a10fd56a9 --- /dev/null +++ b/samples/2_Cookbook/6_dynamic_shared/Readme.md @@ -0,0 +1,47 @@ +## Using Dynamic shared memory ### + +Earlier we learned how to use static shared memory. 
In this tutorial, we'll explain how to use the dynamic version of shared memory to improve the performance. + +## Introduction: + +As we mentioned earlier, memory bottlenecks are the main reason we are not able to get the highest performance; therefore, minimizing the latency of memory accesses plays a prominent role in application optimization. In this tutorial, we'll learn how to use dynamic shared memory. + +## Requirement: +For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## Prerequisite knowledge: + +Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming. + +## Simple Matrix Transpose + +We will be using the Simple Matrix Transpose application from the previous tutorial and modify it to learn how to use shared memory. + +## Shared Memory + +Shared memory is much faster than global and constant memory and is accessible to all the threads in the block. In the same source code, we will use the `HIP_DYNAMIC_SHARED` keyword to declare dynamic shared memory as follows: + +` HIP_DYNAMIC_SHARED(float, sharedMem) ` +Here the first parameter is the data type, while the second one is the variable name. + +The other important change is: +` hipLaunchKernel(matrixTranspose, ` + dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + sizeof(float)*WIDTH*WIDTH, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH); +Here we replaced the 4th parameter with the amount of additional shared memory to allocate when launching the kernel. + +## How to build and run: +Use the make command and execute it using ./dynamic_shared +Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia. 
+ +## More Info: +- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL) +- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp b/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp new file mode 100644 index 000000000..22d7eb962 --- /dev/null +++ b/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp @@ -0,0 +1,141 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include + +// hip header file +#include "hip/hip_runtime.h" + +#define WIDTH 16 + +#define NUM (WIDTH*WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width) +{ + // declare dynamic shared memory + HIP_DYNAMIC_SHARED(float, sharedMem); + + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + sharedMem[y * width + x] = in[x * width + y]; + + __syncthreads(); + + out[y * width + x] = sharedMem[y * width + x]; +} + +// CPU implementation of matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width) +{ + for(unsigned int j=0; j < width; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*width + j] = input[j*width + i]; + } + } +} + +int main() { + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + int i; + int errors; + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; 
i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Lauching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + sizeof(float)*WIDTH*WIDTH, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); + + // verify the results + errors = 0; + double eps = 1.0E-6; + for (i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { + printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("dynamic_shared PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git a/samples/2_Cookbook/7_streams/Makefile b/samples/2_Cookbook/7_streams/Makefile new file mode 100644 index 000000000..64b0f0e09 --- /dev/null +++ b/samples/2_Cookbook/7_streams/Makefile @@ -0,0 +1,36 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. 
+endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = stream.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./stream + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o + diff --git a/samples/2_Cookbook/7_streams/Readme.md b/samples/2_Cookbook/7_streams/Readme.md new file mode 100644 index 000000000..a75149925 --- /dev/null +++ b/samples/2_Cookbook/7_streams/Readme.md @@ -0,0 +1,57 @@ +## Streams ### + +In all earlier tutorials we used a single stream. In this tutorial, we'll explain how to launch multiple streams. + +## Introduction: + +The various instances of kernel to be executed on device in exact launch order defined by Host are called streams. We can launch multiple streams on a single device. We will learn how to launch two streams, which can be scaled with ease. + +## Requirement: +For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## Prerequisite knowledge: + +Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming. + +## Simple Matrix Transpose + +We will be using the Simple Matrix Transpose application from the previous tutorial and modify it to learn how to launch multiple streams. + +## Streams + +In this tutorial, we'll use both instances of shared memory (i.e., static and dynamic) as different streams. 
We declare stream as follows: +` hipStream_t streams[num_streams]; ` + +and create stream using `hipStreamCreate` as follows: +` for(int i=0;i +#include + +#define WIDTH 32 + +#define NUM (WIDTH*WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +using namespace std; + +__global__ void matrixTranspose_static_shared(hipLaunchParm lp, + float *out, + float *in, + const int width) +{ + __shared__ float sharedMem[WIDTH*WIDTH]; + + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + sharedMem[y * width + x] = in[x * width + y]; + + __syncthreads(); + + out[y * width + x] = sharedMem[y * width + x]; +} + +__global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp, + float *out, + float *in, + const int width) +{ + // declare dynamic shared memory + HIP_DYNAMIC_SHARED(float, sharedMem) + + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + sharedMem[y * width + x] = in[x * width + y]; + + __syncthreads(); + + out[y * width + x] = sharedMem[y * width + x]; +} + +void MultipleStream (float **data, float *randArray, float **gpuTransposeMatrix, float **TransposeMatrix, int width) +{ + const int num_streams = 2; + hipStream_t streams[num_streams]; + + for(int i=0;i eps ) { + printf("%d stream0: %f stream1 %f\n",i,TransposeMatrix[0][i],TransposeMatrix[1][i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("stream PASSED!\n"); + } + + free(randArray); + for(int i=0;i<2;i++){ + hipFree(data[i]); + hipFree(gpuTransposeMatrix[i]); + free(TransposeMatrix[i]); + } + + hipDeviceReset(); + return 0; +} From 23dc2f02cd361bfc3d0d410b8493f037c06ed81e Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 26 Oct 2016 10:30:42 -0500 Subject: [PATCH 081/700] Rename HIP_ATP_MARKER and profiling vars HIP_PROFILE_API HIP_DB_START_API HIP_DB_STOP_API 
Change-Id: I6c4da67212ff8217e6356a2622d4c6278a188c34 --- samples/2_Cookbook/{2_HIP_ATP_MARKER => 2_CodeXL_ATP}/Makefile | 0 .../{2_HIP_ATP_MARKER => 2_CodeXL_ATP}/MatrixTranspose.cpp | 0 samples/2_Cookbook/{2_HIP_ATP_MARKER => 2_CodeXL_ATP}/Readme.md | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename samples/2_Cookbook/{2_HIP_ATP_MARKER => 2_CodeXL_ATP}/Makefile (100%) rename samples/2_Cookbook/{2_HIP_ATP_MARKER => 2_CodeXL_ATP}/MatrixTranspose.cpp (100%) rename samples/2_Cookbook/{2_HIP_ATP_MARKER => 2_CodeXL_ATP}/Readme.md (100%) diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile b/samples/2_Cookbook/2_CodeXL_ATP/Makefile similarity index 100% rename from samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile rename to samples/2_Cookbook/2_CodeXL_ATP/Makefile diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp b/samples/2_Cookbook/2_CodeXL_ATP/MatrixTranspose.cpp similarity index 100% rename from samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp rename to samples/2_Cookbook/2_CodeXL_ATP/MatrixTranspose.cpp diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md b/samples/2_Cookbook/2_CodeXL_ATP/Readme.md similarity index 100% rename from samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md rename to samples/2_Cookbook/2_CodeXL_ATP/Readme.md From f8c4fa982a7ccb60967a8ab383fa71931c58bb4e Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 27 Oct 2016 20:37:46 -0500 Subject: [PATCH 082/700] Add new hipdemangleatp and snapshot sample update for new functionality Change-Id: Ie19c683b2b0bdfeb0c3fcf89444c2e21b7f606e7 --- samples/0_Intro/square/Makefile | 6 +- .../2_CodeXL_ATP/MatrixTranspose.cpp | 172 --------------- .../{2_CodeXL_ATP => 2_Profiler}/Makefile | 22 +- .../2_Cookbook/2_Profiler/MatrixTranspose.cpp | 195 ++++++++++++++++++ .../{2_CodeXL_ATP => 2_Profiler}/Readme.md | 14 +- 5 files changed, 218 insertions(+), 191 deletions(-) delete mode 100644 samples/2_Cookbook/2_CodeXL_ATP/MatrixTranspose.cpp rename 
samples/2_Cookbook/{2_CodeXL_ATP => 2_Profiler}/Makefile (52%) create mode 100644 samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp rename samples/2_Cookbook/{2_CodeXL_ATP => 2_Profiler}/Readme.md (83%) diff --git a/samples/0_Intro/square/Makefile b/samples/0_Intro/square/Makefile index 89921c207..1e8cdba08 100644 --- a/samples/0_Intro/square/Makefile +++ b/samples/0_Intro/square/Makefile @@ -1,7 +1,4 @@ HIP_PATH?= $(wildcard /opt/rocm/hip) -ifeq (,$(HIP_PATH)) - HIP_PATH=../../.. -endif HIPCC=$(HIP_PATH)/bin/hipcc all: square.hip.out @@ -11,9 +8,10 @@ square.cuda.out : square.cu #hipify square.cu > square.cpp # Then review & finish port in square.cpp +# square.hip.out: square.hipref.cpp - $(HIPCC) square.hipref.cpp -o $@ + $(HIPCC) $(CXXFLAGS) square.hipref.cpp -o $@ diff --git a/samples/2_Cookbook/2_CodeXL_ATP/MatrixTranspose.cpp b/samples/2_Cookbook/2_CodeXL_ATP/MatrixTranspose.cpp deleted file mode 100644 index f2aea146e..000000000 --- a/samples/2_Cookbook/2_CodeXL_ATP/MatrixTranspose.cpp +++ /dev/null @@ -1,172 +0,0 @@ -/* -Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include - -// hip header file -#include "hip/hip_runtime.h" - -#define WIDTH 1024 - -#define NUM (WIDTH*WIDTH) - -#define THREADS_PER_BLOCK_X 4 -#define THREADS_PER_BLOCK_Y 4 -#define THREADS_PER_BLOCK_Z 1 - -// Device (Kernel) function, it must be void -// hipLaunchParm provides the execution configuration -__global__ void matrixTranspose(hipLaunchParm lp, - float *out, - float *in, - const int width) -{ - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - - out[y * width + x] = in[x * width + y]; -} - -// CPU implementation of matrix transpose -void matrixTransposeCPUReference( - float * output, - float * input, - const unsigned int width) -{ - for(unsigned int j=0; j < width; j++) - { - for(unsigned int i=0; i < width; i++) - { - output[i*width + j] = input[j*width + i]; - } - } -} - -int main() { - - float* Matrix; - float* TransposeMatrix; - float* cpuTransposeMatrix; - - float* gpuMatrix; - float* gpuTransposeMatrix; - - hipDeviceProp_t devProp; - hipGetDeviceProperties(&devProp, 0); - - std::cout << "Device name " << devProp.name << std::endl; - - hipEvent_t start, stop; - hipEventCreate(&start); - hipEventCreate(&stop); - float eventMs = 1.0f; - - int i; - int errors; - - Matrix = (float*)malloc(NUM * sizeof(float)); - TransposeMatrix = (float*)malloc(NUM * sizeof(float)); - cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); - - // initialize the input data - for (i = 0; i < NUM; i++) { - Matrix[i] = (float)i*10.0f; - } - - // allocate the memory on the device side - hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); - hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); - - // Record the start event - 
hipEventRecord(start, NULL); - - // Memory transfer from host to device - hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); - - // Record the stop event - hipEventRecord(stop, NULL); - hipEventSynchronize(stop); - - hipEventElapsedTime(&eventMs, start, stop); - - printf ("hipMemcpyHostToDevice time taken = %6.3fms\n", eventMs); - - // Record the start event - hipEventRecord(start, NULL); - - // Lauching kernel from host - hipLaunchKernel(matrixTranspose, - dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - gpuTransposeMatrix , gpuMatrix, WIDTH); - - // Record the stop event - hipEventRecord(stop, NULL); - hipEventSynchronize(stop); - - hipEventElapsedTime(&eventMs, start, stop); - - printf ("kernel Execution time = %6.3fms\n", eventMs); - - // Record the start event - hipEventRecord(start, NULL); - - // Memory transfer from device to host - hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); - - // Record the stop event - hipEventRecord(stop, NULL); - hipEventSynchronize(stop); - - hipEventElapsedTime(&eventMs, start, stop); - - printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs); - - // CPU MatrixTranspose computation - matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); - - // verify the results - errors = 0; - double eps = 1.0E-6; - for (i = 0; i < NUM; i++) { - if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { - errors++; - } - } - if (errors!=0) { - printf("FAILED: %d errors\n",errors); - } else { - printf ("PASSED!\n"); - } - - //free the resources on device side - hipFree(gpuMatrix); - hipFree(gpuTransposeMatrix); - - //free the resources on host side - free(Matrix); - free(TransposeMatrix); - free(cpuTransposeMatrix); - - return errors; -} diff --git a/samples/2_Cookbook/2_CodeXL_ATP/Makefile b/samples/2_Cookbook/2_Profiler/Makefile similarity index 52% rename from 
samples/2_Cookbook/2_CodeXL_ATP/Makefile rename to samples/2_Cookbook/2_Profiler/Makefile index d3630a1c1..4b9a063f3 100644 --- a/samples/2_Cookbook/2_CodeXL_ATP/Makefile +++ b/samples/2_Cookbook/2_Profiler/Makefile @@ -1,10 +1,12 @@ HIP_PATH?= $(wildcard /opt/rocm/hip) -ifeq (,$(HIP_PATH)) - HIP_PATH=../../.. -endif HIPCC=$(HIP_PATH)/bin/hipcc + +HIPPROFILER=/opt/rocm/bin/rocm-profiler +PROFILER_OPT=-A -o MT.atp -e HIP_PROFILE_API=1 +HIPPROFILER_POST_CMD=$(HIP_PATH)/bin/hipdemangleatp MT.atp + TARGET=hcc SOURCES = MatrixTranspose.cpp @@ -15,9 +17,12 @@ EXECUTABLE=./MatrixTranspose .PHONY: test -all: $(EXECUTABLE) test +all: $(EXECUTABLE) profile + -CXXFLAGS =-g + +OPT =-g +CXXFLAGS =$(OPT) CXX=$(HIPCC) @@ -25,7 +30,12 @@ $(EXECUTABLE): $(OBJECTS) $(HIPCC) $(OBJECTS) -o $@ -test: $(EXECUTABLE) +profile: $(EXECUTABLE) + $(HIPPROFILER) $(PROFILER_OPT) $(EXECUTABLE) + $(HIPPROFILER_POST_CMD) + + +run: $(EXECUTABLE) $(EXECUTABLE) diff --git a/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp b/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp new file mode 100644 index 000000000..7500957df --- /dev/null +++ b/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp @@ -0,0 +1,195 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + +// hip header file +#include "hip/hip_runtime.h" +#include "hip/hip_profile.h" + +#define WIDTH 1024 + +#define NUM (WIDTH*WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width) +{ + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + out[y * width + x] = in[x * width + y]; +} + +// CPU implementation of matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width) +{ + for(unsigned int j=0; j < width; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*width + j] = input[j*width + i]; + } + } +} + +int main() { + + //HIP_SCOPED_MARKER(__func__, "MainFunc"); + HIP_BEGIN_MARKER(__func__, "MainFunc"); + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + hipEvent_t start, stop; + float eventMs = 1.0f; + { + // Show example of how to create a "scoped marker". 
+ // The scoped marker records the time spent inside the { scope } of the marker - the begin timestamp is at the + // beginning of the code scope, and the end is recorded when the SCOPE exits. This can be viewed in CodeXL + // timeline relative to other GPU and CPU events. + // This marker captures the time spent in setup including host allocation, initialization, and device memory allocation. + HIP_SCOPED_MARKER("Setup", "App"); + + hipEventCreate(&start); + hipEventCreate(&stop); + + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (int i = 0; i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + + + + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + } + + { + HIP_SCOPED_MARKER("Loop", "App"); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyHostToDevice time taken = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Lauching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + hipEventElapsedTime(&eventMs, start, stop); + } + + int errors = 0; + { + HIP_SCOPED_MARKER("Teardown", "App"); + + + printf ("kernel Execution time = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from 
device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); + + // verify the results + double eps = 1.0E-6; + for (int i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + } + + HIP_END_MARKER(); + + return errors; +} diff --git a/samples/2_Cookbook/2_CodeXL_ATP/Readme.md b/samples/2_Cookbook/2_Profiler/Readme.md similarity index 83% rename from samples/2_Cookbook/2_CodeXL_ATP/Readme.md rename to samples/2_Cookbook/2_Profiler/Readme.md index de1a80057..92a8be228 100644 --- a/samples/2_Cookbook/2_CodeXL_ATP/Readme.md +++ b/samples/2_Cookbook/2_Profiler/Readme.md @@ -1,6 +1,6 @@ ## Using hipEvents to measure performance ### -This tutorial is follow-up of the previous two tutorial where we learn how to write our first hip program, in which we compute Matrix Transpose and in second one, we added feature to measure time taken for memory transfer and kernel execution. In this tutorial, we won't make amy changes to the source code. We'll explain how to use the codexl/rocm-profiler for hip timeline tracing. +This tutorial is follow-up of the previous two tutorial where we learn how to write our first hip program, in which we compute Matrix Transpose and in second one, we added feature to measure time taken for memory transfer and kernel execution. 
In this tutorial, we'll explain how to use the codexl/rocm-profiler for hip timeline tracing. Also, we will augment the source code with additional markers so we can see the high-level application flow alongside the information that CodeXL automatically collects. ## Introduction: @@ -24,15 +24,11 @@ HIP can generate markers at function being/end which are displayed on the CodeXL 1. Install ROCm-Profiler Installing HIP from the rocm pre-built packages, installs the ROCm-Profiler as well. Alternatively, you can build ROCm-Profiler using the instructions given below. -2. Build HIP with ATP markers enabled HIP pre-built packages are enabled with ATP marker support by default. To enable ATP marker support when building HIP from source, use the option -DCOMPILE_HIP_ATP_MARKER=1 during the cmake configure step. -3. Set HIP_ATP_MARKER -`export HIP_ATP_MARKER=1` - -4. Recompile the target application - -5. Run with profiler enabled to generate ATP file. -`/opt/rocm/bin/rocm-profiler -o -A ` +2. Run with profiler enabled to generate ATP file. +(These steps are also captured in the Makefile) +The HIP_PROFILE_API enables display of the HIP APIs on the CodeXL timeline view. +`/opt/rocm/bin/rocm-profiler -o -A -e HIP_PROFILE_API=1 ` ##Using HIP_TRACE_API From e2fbab109d117de9aecf809bf515f0f1f38d5a15 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 27 Oct 2016 21:26:28 -0500 Subject: [PATCH 083/700] show how to use variety of HIP_PROFILE features Change-Id: I6edd66ac4c068b64e1dc3787d7f1f69ab3238469 --- .../2_Cookbook/2_Profiler/MatrixTranspose.cpp | 175 ++++++++++-------- 1 file changed, 95 insertions(+), 80 deletions(-) diff --git a/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp b/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp index 7500957df..b6a6b141d 100644 --- a/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp +++ b/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp @@ -34,6 +34,8 @@ THE SOFTWARE. 
#define THREADS_PER_BLOCK_Y 4 #define THREADS_PER_BLOCK_Z 1 +#define ITERATIONS 10 + // Device (Kernel) function, it must be void // hipLaunchParm provides the execution configuration __global__ void matrixTranspose(hipLaunchParm lp, @@ -62,10 +64,72 @@ void matrixTransposeCPUReference( } } -int main() { - //HIP_SCOPED_MARKER(__func__, "MainFunc"); - HIP_BEGIN_MARKER(__func__, "MainFunc"); +// Use a separate function to demonstrate how to use function name as part of scoped marker: +void runGPU(float *Matrix, float *TransposeMatrix, + float* gpuMatrix, float* gpuTransposeMatrix) { + + // __func__ is a standard C++ macro which expands to the name of the function, in this case "runGPU" + HIP_SCOPED_MARKER(__func__, "MyGroup"); + + for (int i=0; i eps ) { - errors++; - } - } - if (errors!=0) { - printf("FAILED: %d errors\n",errors); - } else { - printf ("PASSED!\n"); - } + // verify the results + double eps = 1.0E-6; + for (int i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } - //free the resources on device side - hipFree(gpuMatrix); - hipFree(gpuTransposeMatrix); + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); - //free the resources on host side - free(Matrix); - free(TransposeMatrix); - free(cpuTransposeMatrix); - } + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); HIP_END_MARKER(); From b5b7663c51013d58fff9334ee21e433d0d18faeb Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 27 Oct 2016 22:05:52 -0500 Subject: [PATCH 084/700] Add initial hipProfileStart/Stop And modify sample to show how to use. Still needs some work to understand interaction with CXL. 
Change-Id: I2579824d2dd7863ea23874d34f0dabb3cb305d3e --- samples/2_Cookbook/2_Profiler/Makefile | 7 +++++ .../2_Cookbook/2_Profiler/MatrixTranspose.cpp | 27 +++++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/samples/2_Cookbook/2_Profiler/Makefile b/samples/2_Cookbook/2_Profiler/Makefile index 4b9a063f3..db2d00818 100644 --- a/samples/2_Cookbook/2_Profiler/Makefile +++ b/samples/2_Cookbook/2_Profiler/Makefile @@ -35,6 +35,13 @@ profile: $(EXECUTABLE) $(HIPPROFILER_POST_CMD) +# Pass option to control start and stop iterations for profiling - see MatrixTranspose.cpp for implementation: +# Note we start profiler in --startdisabled mode - no timing collected until app enabled it via hipProfilerStart() +profile_trigger: $(EXECUTABLE) + $(HIPPROFILER) $(PROFILER_OPT) --startdisabled $(EXECUTABLE) 3 6 + $(HIPPROFILER_POST_CMD) + + run: $(EXECUTABLE) $(EXECUTABLE) diff --git a/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp b/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp index b6a6b141d..3747bb4ec 100644 --- a/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp +++ b/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp @@ -36,6 +36,10 @@ THE SOFTWARE. 
#define ITERATIONS 10 +// Cmdline parms to control start and stop triggers +int startTriggerIteration=-1; +int stopTriggerIteration=-1; + // Device (Kernel) function, it must be void // hipLaunchParm provides the execution configuration __global__ void matrixTranspose(hipLaunchParm lp, @@ -74,6 +78,13 @@ void runGPU(float *Matrix, float *TransposeMatrix, for (int i=0; i= 2) { + startTriggerIteration = atoi(argv[1]); + printf ("info : will start tracing at iteration:%d\n", startTriggerIteration); + } + if (argc >= 3) { + stopTriggerIteration = atoi(argv[2]); + printf ("info : will stop tracing at iteration:%d\n", stopTriggerIteration); + } float* Matrix; float* TransposeMatrix; @@ -166,6 +186,8 @@ int main() { // allocate the memory on the device side hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + + // FYI, the scoped-marker will be destroyed here when the scope exits, and will record its "end" timestamp. } runGPU(Matrix, TransposeMatrix, gpuMatrix, gpuTransposeMatrix); @@ -204,7 +226,8 @@ int main() { free(TransposeMatrix); free(cpuTransposeMatrix); - HIP_END_MARKER(); + // This ends the last marker started in this thread, in this case "Check&TearDown" + HIP_END_MARKER(); return errors; } From ab73e76987c41f5ad0ab5d0bba732b385df697e8 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 4 Nov 2016 06:34:07 -0500 Subject: [PATCH 085/700] Print non-peers too Change-Id: I2a6905edcdf144aa732ae3120c17780477f232ac --- samples/1_Utils/hipInfo/hipInfo.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index 0403162bd..42a879e73 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -133,6 +133,15 @@ void printDeviceProp (int deviceId) } } cout << endl; + cout << setw(w1) << "non-peers: "; + for (int i=0; i Date: Sun, 6 Nov 2016 04:07:51 -0600 Subject: [PATCH 086/700] Update gitignore 
for some common output files Change-Id: I9cd60f042af4dba07fe0fdbd2ee442936ff8c7bd --- samples/0_Intro/hcc_dialects/.gitignore | 5 +++++ samples/0_Intro/module_api/.gitignore | 5 +++++ 2 files changed, 10 insertions(+) create mode 100644 samples/0_Intro/hcc_dialects/.gitignore create mode 100644 samples/0_Intro/module_api/.gitignore diff --git a/samples/0_Intro/hcc_dialects/.gitignore b/samples/0_Intro/hcc_dialects/.gitignore new file mode 100644 index 000000000..bce1cdf19 --- /dev/null +++ b/samples/0_Intro/hcc_dialects/.gitignore @@ -0,0 +1,5 @@ +vadd_amp_arrayview +vadd_hc_am +vadd_hc_array +vadd_hc_arrayview +vadd_hip diff --git a/samples/0_Intro/module_api/.gitignore b/samples/0_Intro/module_api/.gitignore new file mode 100644 index 000000000..c1d81e043 --- /dev/null +++ b/samples/0_Intro/module_api/.gitignore @@ -0,0 +1,5 @@ +runKernel.hip.out +vcpy_isa.code +vcpy_isa.hsaco +vcpy_kernel.co +vcpy_kernel.code From aaf1547bff0a29418d8b5f5a77815a5be6ac6bc7 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 10 Nov 2016 11:27:28 +0530 Subject: [PATCH 087/700] hcc_dialects/Makefile: use clamp-config Change-Id: I86df82f75b75125825e22d0545209a19386d9936 --- samples/0_Intro/hcc_dialects/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/0_Intro/hcc_dialects/Makefile b/samples/0_Intro/hcc_dialects/Makefile index 3b5ceca7f..4a514b669 100644 --- a/samples/0_Intro/hcc_dialects/Makefile +++ b/samples/0_Intro/hcc_dialects/Makefile @@ -5,8 +5,8 @@ OPT=-O2 HCC_CFLAGS= `$(HCC_HOME)/bin/hcc-config --cxxflags` ${OPT} HCC_LDFLAGS= `$(HCC_HOME)/bin/hcc-config --ldflags` ${OPT} -CPPAMP_CFLAGS= -std=c++amp -stdlib=libc++ -I$(HCC_HOME)/include -CPPAMP_LDFLAGS= -std=c++amp -L$(HCC_HOME)/lib -Wl,--rpath=$(HCC_HOME)/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive +CPPAMP_CFLAGS= `$(HCC_HOME)/bin/clamp-config --cxxflags` +CPPAMP_LDFLAGS= `$(HCC_HOME)/bin/clamp-config --ldflags` HIP_PATH?= $(wildcard 
/opt/rocm/hip) ifeq (,$(HIP_PATH)) From 8829e2626c2b37c87a690e783dd0559971338ffe Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Wed, 9 Nov 2016 12:06:45 +0530 Subject: [PATCH 088/700] Add p2p for cookbook Change-Id: Id2e77ab31123ef95885d665efe34bc0d4596733a (cherry picked from commit 6fbd0352713ca36e399b1ed4f17c486207a53875) --- samples/2_Cookbook/8_peer2peer/Makefile | 36 +++ samples/2_Cookbook/8_peer2peer/peer2peer.cpp | 241 +++++++++++++++++++ 2 files changed, 277 insertions(+) create mode 100644 samples/2_Cookbook/8_peer2peer/Makefile create mode 100644 samples/2_Cookbook/8_peer2peer/peer2peer.cpp diff --git a/samples/2_Cookbook/8_peer2peer/Makefile b/samples/2_Cookbook/8_peer2peer/Makefile new file mode 100644 index 000000000..a1dad7d1d --- /dev/null +++ b/samples/2_Cookbook/8_peer2peer/Makefile @@ -0,0 +1,36 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = peer2peer.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./peer2peer + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o + diff --git a/samples/2_Cookbook/8_peer2peer/peer2peer.cpp b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp new file mode 100644 index 000000000..624de56cb --- /dev/null +++ b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp @@ -0,0 +1,241 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include +#define WIDTH 32 + +#define NUM (WIDTH*WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +using namespace std; + +#define KNRM "\x1B[0m" +#define KRED "\x1B[31m" + +#define failed(...) 
\ + printf ("%serror: ", KRED);\ + printf (__VA_ARGS__);\ + printf ("\n");\ + printf ("error: TEST FAILED\n%s", KNRM );\ + abort(); + +#define HIPCHECK(error) \ +{\ + hipError_t localError = error; \ + if (localError != hipSuccess) { \ + printf("%serror: '%s'(%d) from %s at %s:%d%s\n", \ + KRED, hipGetErrorString(localError), localError,\ + #error,__FILE__, __LINE__, KNRM); \ + failed("API returned error code.");\ + }\ +} + +void checkPeer2PeerSupport() +{ + int gpuCount; + int canAccessPeer; + int p2pCapableDeviceCount=0; + + HIPCHECK(hipGetDeviceCount(&gpuCount)); + + if (gpuCount < 2) + printf("Peer2Peer application requires atleast 2 gpu devices"); + + for (int currentGpu=0; currentGpu eps ) { + printf("%d cpu: %f gpu peered data %f\n",i,randArray[i],TransposeMatrix[1][i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("Peer2Peer PASSED!\n"); + } + + free(randArray); + for(int i=0;i<2;i++){ + hipFree(data[i]); + hipFree(gpuTransposeMatrix[i]); + free(TransposeMatrix[i]); + } + + HIPCHECK(hipSetDevice(peerGpu)); + HIPCHECK(hipDeviceReset()); + + HIPCHECK(hipSetDevice(currentGpu)); + HIPCHECK(hipDeviceReset()); + + return 0; +} From 5e86e5f565df320877cf29f47f85b45b9cd54656 Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Tue, 15 Nov 2016 12:41:05 +0530 Subject: [PATCH 089/700] fix_format Change-Id: I34e265de434263a11654e5deba044c3f21e86578 --- samples/2_Cookbook/8_peer2peer/Makefile | 71 ++++++++++---------- samples/2_Cookbook/8_peer2peer/peer2peer.cpp | 12 ++-- 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/samples/2_Cookbook/8_peer2peer/Makefile b/samples/2_Cookbook/8_peer2peer/Makefile index a1dad7d1d..5cb747392 100644 --- a/samples/2_Cookbook/8_peer2peer/Makefile +++ b/samples/2_Cookbook/8_peer2peer/Makefile @@ -1,36 +1,35 @@ -HIP_PATH?= $(wildcard /opt/rocm/hip) -ifeq (,$(HIP_PATH)) - HIP_PATH=../../.. 
-endif - -HIPCC=$(HIP_PATH)/bin/hipcc - -TARGET=hcc - -SOURCES = peer2peer.cpp -OBJECTS = $(SOURCES:.cpp=.o) - -EXECUTABLE=./peer2peer - -.PHONY: test - - -all: $(EXECUTABLE) test - -CXXFLAGS =-g -CXX=$(HIPCC) - - -$(EXECUTABLE): $(OBJECTS) - $(HIPCC) $(OBJECTS) -o $@ - - -test: $(EXECUTABLE) - $(EXECUTABLE) - - -clean: - rm -f $(EXECUTABLE) - rm -f $(OBJECTS) - rm -f $(HIP_PATH)/src/*.o - +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = peer2peer.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./peer2peer + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: +rm -f $(EXECUTABLE) +rm -f $(OBJECTS) +rm -f $(HIP_PATH)/src/*.o + diff --git a/samples/2_Cookbook/8_peer2peer/peer2peer.cpp b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp index 624de56cb..990599e1c 100644 --- a/samples/2_Cookbook/8_peer2peer/peer2peer.cpp +++ b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp @@ -108,7 +108,7 @@ void disablePeer2Peer(int currentGpu, int peerGpu) HIPCHECK(hipSetDevice(currentGpu)); hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu); - + if(canAccessPeer==1){ HIPCHECK(hipDeviceDisablePeerAccess(peerGpu)); } @@ -155,7 +155,7 @@ __global__ void matrixTranspose_dynamic_shared(hipLaunchParm lp, int main(){ checkPeer2PeerSupport(); - + int gpuCount; int currentGpu, peerGpu; @@ -191,10 +191,10 @@ int main(){ 0, 0, gpuTransposeMatrix[0], data[0], width); - HIPCHECK(hipSetDevice(peerGpu)); - TransposeMatrix[1] = (float*)malloc(NUM * sizeof(float)); - hipMalloc((void**)&gpuTransposeMatrix[1], NUM * sizeof(float)); - hipMalloc((void**)&data[1], NUM * sizeof(float)); + HIPCHECK(hipSetDevice(peerGpu)); + TransposeMatrix[1] = (float*)malloc(NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix[1], NUM * sizeof(float)); + hipMalloc((void**)&data[1], NUM * 
sizeof(float)); hipMemcpy(data[1], gpuTransposeMatrix[0], NUM * sizeof(float), hipMemcpyDeviceToDevice); hipLaunchKernel(matrixTranspose_dynamic_shared, From c590cd6865d234b544e775e365663d36c3c3a84c Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 26 Nov 2016 08:55:51 -0600 Subject: [PATCH 090/700] Add more debug info --- samples/1_Utils/hipCommander/LICENSE.txt | 27 + samples/1_Utils/hipCommander/Makefile | 33 + .../1_Utils/hipCommander/ResultDatabase.cpp | 527 ++++++++ samples/1_Utils/hipCommander/ResultDatabase.h | 100 ++ samples/1_Utils/hipCommander/TODO | 50 + samples/1_Utils/hipCommander/c.cmd | 3 + samples/1_Utils/hipCommander/classic.cmd | 1 + samples/1_Utils/hipCommander/hipCommander.cpp | 1096 +++++++++++++++++ samples/1_Utils/hipCommander/l2.hcm | 3 + samples/1_Utils/hipCommander/loop.hcm | 3 + samples/1_Utils/hipCommander/loop2.hcm | 2 + .../1_Utils/hipCommander/nullkernel.hip.cpp | 7 + samples/1_Utils/hipCommander/nullkernel.hsaco | Bin 0 -> 10265 bytes .../1_Utils/hipCommander/perf/latency2.hcm | 10 + .../hipCommander/perf/latency_hostsync.hcm | 8 + .../hipCommander/perf/latency_nosync.hcm | 5 + .../hipCommander/perf/latency_nullstream.hcm | 7 + .../perf/modulelaunch_latency.hcm | 5 + samples/1_Utils/hipCommander/setstream.hcm | 3 + samples/1_Utils/hipCommander/testcase.cpp | 21 + 20 files changed, 1911 insertions(+) create mode 100644 samples/1_Utils/hipCommander/LICENSE.txt create mode 100644 samples/1_Utils/hipCommander/Makefile create mode 100644 samples/1_Utils/hipCommander/ResultDatabase.cpp create mode 100644 samples/1_Utils/hipCommander/ResultDatabase.h create mode 100644 samples/1_Utils/hipCommander/TODO create mode 100644 samples/1_Utils/hipCommander/c.cmd create mode 100644 samples/1_Utils/hipCommander/classic.cmd create mode 100644 samples/1_Utils/hipCommander/hipCommander.cpp create mode 100644 samples/1_Utils/hipCommander/l2.hcm create mode 100644 samples/1_Utils/hipCommander/loop.hcm create mode 100644 
samples/1_Utils/hipCommander/loop2.hcm create mode 100644 samples/1_Utils/hipCommander/nullkernel.hip.cpp create mode 100755 samples/1_Utils/hipCommander/nullkernel.hsaco create mode 100644 samples/1_Utils/hipCommander/perf/latency2.hcm create mode 100644 samples/1_Utils/hipCommander/perf/latency_hostsync.hcm create mode 100644 samples/1_Utils/hipCommander/perf/latency_nosync.hcm create mode 100644 samples/1_Utils/hipCommander/perf/latency_nullstream.hcm create mode 100644 samples/1_Utils/hipCommander/perf/modulelaunch_latency.hcm create mode 100644 samples/1_Utils/hipCommander/setstream.hcm create mode 100644 samples/1_Utils/hipCommander/testcase.cpp diff --git a/samples/1_Utils/hipCommander/LICENSE.txt b/samples/1_Utils/hipCommander/LICENSE.txt new file mode 100644 index 000000000..5d0d60323 --- /dev/null +++ b/samples/1_Utils/hipCommander/LICENSE.txt @@ -0,0 +1,27 @@ + +Copyright (c) 2011, UT-Battelle, LLC +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +* Neither the name of Oak Ridge National Laboratory, nor UT-Battelle, LLC, nor + the names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/samples/1_Utils/hipCommander/Makefile b/samples/1_Utils/hipCommander/Makefile new file mode 100644 index 000000000..e770c636a --- /dev/null +++ b/samples/1_Utils/hipCommander/Makefile @@ -0,0 +1,33 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif +HIPCC=$(HIP_PATH)/bin/hipcc + +EXE=hipCommander +OPT=-O3 +#CXXFLAGS = -O3 -g +CXXFLAGS = $(OPT) --std=c++11 + +HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) +ifeq (${HIP_PLATFORM}, hcc) + CXXFLAGS += " -stdlib=libc++" +endif + +CODE_OBJECTS=nullkernel.hsaco + +all: ${EXE} ${CODE_OBJECTS} + +$(EXE): hipCommander.cpp + $(HIPCC) $(CXXFLAGS) $^ -o $@ + +nullkernel.hsaco : nullkernel.hip.cpp + $(HIPCC) --genco nullkernel.hip -o nullkernel.hsaco + + +install: $(EXE) + cp $(EXE) $(HIP_PATH)/bin + + +clean: + rm -f *.o *.co $(EXE) diff --git a/samples/1_Utils/hipCommander/ResultDatabase.cpp b/samples/1_Utils/hipCommander/ResultDatabase.cpp new file mode 100644 index 000000000..2ec686f26 --- /dev/null +++ b/samples/1_Utils/hipCommander/ResultDatabase.cpp @@ -0,0 +1,527 @@ +#include "ResultDatabase.h" + +#include +#include +#include +#include + +using namespace std; + +bool ResultDatabase::Result::operator<(const Result &rhs) const +{ + if (test < rhs.test) + return true; + if (test > rhs.test) + return false; + if (atts < rhs.atts) + return true; + if (atts > rhs.atts) + return false; + return false; // less-operator returns false on equal +} + +double 
ResultDatabase::Result::GetMin() const +{ + double r = FLT_MAX; + for (int i=0; i= 100) + return value[n-1]; + + double index = ((n + 1.) * q / 100.) - 1; + + vector sorted = value; + sort(sorted.begin(), sorted.end()); + + if (n == 2) + return (sorted[0] * (1 - q/100.) + sorted[1] * (q/100.)); + + int index_lo = int(index); + double frac = index - index_lo; + if (frac == 0) + return sorted[index_lo]; + + double lo = sorted[index_lo]; + double hi = sorted[index_lo + 1]; + return lo + (hi-lo)*frac; +} + +double ResultDatabase::Result::GetMean() const +{ + double r = 0; + for (int i=0; i &values) +{ + for (int i=0; i= results.size()) + { + Result r; + r.test = test; + r.atts = atts; + r.unit = unit; + results.push_back(r); + } + + results[index].value.push_back(value); +} + +// **************************************************************************** +// Method: ResultDatabase::DumpDetailed +// +// Purpose: +// Writes the full results, including all trials. +// +// Arguments: +// out where to print +// +// Programmer: Jeremy Meredith +// Creation: August 14, 2009 +// +// Modifications: +// Jeremy Meredith, Wed Nov 10 14:25:17 EST 2010 +// Renamed to DumpDetailed to make room for a DumpSummary. +// +// Jeremy Meredith, Thu Nov 11 11:39:57 EST 2010 +// Added note about (*) missing value tag. +// +// Jeremy Meredith, Tue Nov 23 13:57:02 EST 2010 +// Changed note about missing values to be worded a little better. +// +// **************************************************************************** +void ResultDatabase::DumpDetailed(ostream &out) +{ + vector sorted(results); + sort(sorted.begin(), sorted.end()); + + const int testNameW = 24 ; + const int attW = 12; + const int fieldW = 11; + out << std::fixed << right << std::setprecision(4); + + int maxtrials = 1; + for (int i=0; i maxtrials) + maxtrials = sorted[i].value.size(); + } + + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... 
+ out << setw(testNameW) << "test\t" + << setw(attW) << "atts\t" + << setw(fieldW) + << "median\t" + << "mean\t" + << "stddev\t" + << "min\t" + << "max\t"; + for (int i=0; i sorted(results); + sort(sorted.begin(), sorted.end()); + + const int testNameW = 24 ; + const int attW = 12; + const int fieldW = 9; + out << std::fixed << right << std::setprecision(4); + + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... + out << setw(testNameW) << "test\t" + << setw(attW) << "atts\t" + << setw(fieldW) + << "units\t" + << "median\t" + << "mean\t" + << "stddev\t" + << "min\t" + << "max\t"; + out << endl; + + for (int i=0; i sorted(results); + + sort(sorted.begin(), sorted.end()); + + //Check to see if the file is empty - if so, add the headers + emptyFile = this->IsFileEmpty(fileName); + + //Open file and append by default + ofstream out; + out.open(fileName.c_str(), std::ofstream::out | std::ofstream::app); + + //Add headers only for empty files + if(emptyFile) + { + // TODO: in big parallel runs, the "trials" are the procs + // and we really don't want to print them all out.... 
+ out << "test, " + << "atts, " + << "units, " + << "median, " + << "mean, " + << "stddev, " + << "min, " + << "max, "; + out << endl; + } + + for (int i=0; i +ResultDatabase::GetResultsForTest(const string &test) +{ + // get only the given test results + vector retval; + for (int i=0; i & +ResultDatabase::GetResults() const +{ + return results; +} diff --git a/samples/1_Utils/hipCommander/ResultDatabase.h b/samples/1_Utils/hipCommander/ResultDatabase.h new file mode 100644 index 000000000..4b63a02a1 --- /dev/null +++ b/samples/1_Utils/hipCommander/ResultDatabase.h @@ -0,0 +1,100 @@ +#ifndef RESULT_DATABASE_H +#define RESULT_DATABASE_H + +#include +#include +#include +#include +#include +using std::string; +using std::vector; +using std::ostream; +using std::ofstream; +using std::ifstream; + + +// **************************************************************************** +// Class: ResultDatabase +// +// Purpose: +// Track numerical results as they are generated. +// Print statistics of raw results. +// +// Programmer: Jeremy Meredith +// Creation: June 12, 2009 +// +// Modifications: +// Jeremy Meredith, Wed Nov 10 14:20:47 EST 2010 +// Split timing reports into detailed and summary. E.g. for serial code, +// we might report all trial values, but skip them in parallel. +// +// Jeremy Meredith, Thu Nov 11 11:40:18 EST 2010 +// Added check for missing value tag. +// +// Jeremy Meredith, Mon Nov 22 13:37:10 EST 2010 +// Added percentile statistic. +// +// Jeremy Meredith, Fri Dec 3 16:30:31 EST 2010 +// Added a method to extract a subset of results based on test name. Also, +// the Result class is now public, so that clients can use them directly. +// Added a GetResults method as well, and made several functions const. +// +// **************************************************************************** +class ResultDatabase +{ + public: + // + // A performance result for a single SHOC benchmark run. + // + struct Result + { + string test; // e.g. 
"readback" + string atts; // e.g. "pagelocked 4k^2" + string unit; // e.g. "MB/sec" + vector value; // e.g. "837.14" + double GetMin() const; + double GetMax() const; + double GetMedian() const; + double GetPercentile(double q) const; + double GetMean() const; + double GetStdDev() const; + + bool operator<(const Result &rhs) const; + + bool HadAnyFLTMAXValues() const + { + for (int i=0; i= FLT_MAX) + return true; + } + return false; + } + }; + + protected: + vector results; + + public: + void AddResult(const string &test, + const string &atts, + const string &unit, + double value); + void AddResults(const string &test, + const string &atts, + const string &unit, + const vector &values); + vector GetResultsForTest(const string &test); + const vector &GetResults() const; + void ClearAllResults(); + void DumpDetailed(ostream&); + void DumpSummary(ostream&); + void DumpCsv(string fileName); + + private: + bool IsFileEmpty(string fileName); + +}; + + +#endif diff --git a/samples/1_Utils/hipCommander/TODO b/samples/1_Utils/hipCommander/TODO new file mode 100644 index 000000000..4c835cfce --- /dev/null +++ b/samples/1_Utils/hipCommander/TODO @@ -0,0 +1,50 @@ +_ Add AQL kernel. +_ Fix &*kernel command so the kernel name/type is an argument not a new command. + +_ Add command to parse only. +_ Add regression to parse all the hcm files. + +_ Partition HCC, HIP, HSA, OpenCL commands into separate files. + + +_ Show time for back-to-back copies. +_ Add variables. + %loopcnt + + ./hipCommander %loopcnt=4 + +_ Add datasize command. + + +_ Add ( ) to parsing. +_ Add argument parsing and checking. + +_ Add verbose option to print each step of setup. + - print deliniater between setup and run. Add run start message. + + - print sizes of all buffers. + - print each command before running. + - show start/stop of timer routine. + +_ +_ Clear documentation on what each oepration does. +_ Add time instrumentation for each command. +_ Add pcie atomic. 
+ + +_ Add tests for negative cases, ie endloop w/o opening loop. + + +README tips +--- +- HIP_API_TRACE combined with -v is useful to track the exact commands generates by hipCommander. + + +Other ideas: +--- +[ ] Perf guide : stream creation very slow on HCC and should be avoided. + + +Scratch: + + diff --git a/samples/1_Utils/hipCommander/c.cmd b/samples/1_Utils/hipCommander/c.cmd new file mode 100644 index 000000000..db1107120 --- /dev/null +++ b/samples/1_Utils/hipCommander/c.cmd @@ -0,0 +1,3 @@ +loop,1000; H2D; NullKernel; D2H; endloop; +streamsync; +printTiming, 1000 diff --git a/samples/1_Utils/hipCommander/classic.cmd b/samples/1_Utils/hipCommander/classic.cmd new file mode 100644 index 000000000..c149eec5f --- /dev/null +++ b/samples/1_Utils/hipCommander/classic.cmd @@ -0,0 +1 @@ +H2D; NullKernel, D2H, streamsync diff --git a/samples/1_Utils/hipCommander/hipCommander.cpp b/samples/1_Utils/hipCommander/hipCommander.cpp new file mode 100644 index 000000000..9c07d066b --- /dev/null +++ b/samples/1_Utils/hipCommander/hipCommander.cpp @@ -0,0 +1,1096 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#ifdef __HIP_PLATFORM_HCC__ +#include +#include +#include +#include +#endif + +#include + +#include "ResultDatabase.h" +#include "nullkernel.hip.cpp" + +bool g_printedTiming = false; + +// Cmdline parms: +int p_device = 0; +const char* p_command = "H2D; NullKernel; D2H"; +const char* p_file = nullptr; +unsigned p_verbose = 0x0; +unsigned p_db = 0x0; +unsigned p_blockingSync = 0x0; + +//--- +int p_iterations = 1; + +#define KNRM "\x1B[0m" +#define KRED "\x1B[31m" +#define KGRN "\x1B[32m" + + +#define failed(...) 
\ + printf ("error: ");\ + printf (__VA_ARGS__);\ + printf ("\n");\ + abort(); + + +#define HIPCHECK(error) \ +{\ + hipError_t localError = error; \ + if (localError != hipSuccess) { \ + printf("%serror: '%s'(%d) from %s at %s:%d%s\n", \ + KRED,hipGetErrorString(localError), localError,\ + #error,\ + __FILE__, __LINE__,KNRM); \ + failed("API returned error code.");\ + }\ +} +#define HIPASSERT(condition, msg) \ + if (! (condition) ) { \ + failed("%sassertion %s at %s:%d: %s%s\n", \ + KRED, #condition,\ + __FILE__, __LINE__,msg, KNRM); \ + } + + + + + + + +int parseInt(const char *str, int *output) +{ + char *next; + *output = strtol(str, &next, 0); + return !strlen(next); +} + + +void printConfig() { + hipDeviceProp_t props; + HIPCHECK(hipGetDeviceProperties(&props, p_device)); + + printf ("Device:%s Mem=%.1fGB #CUs=%d Freq=%.0fMhz\n", props.name, props.totalGlobalMem/1024.0/1024.0/1024.0, props.multiProcessorCount, props.clockRate/1000.0); +} + + + + +void help() { + printf ("Usage: hipBusBandwidth [OPTIONS]\n"); + printf (" --file, -f : Read string of commands from file\n"); + printf (" --command, -c : String specifying commands to run.\n"); + printf (" --iterations, -i : Number of copy iterations to run.\n"); + printf (" --device, -d : Device ID to use (0..numDevices).\n"); + printf (" --verbose, -v : Verbose printing of status. Fore more info, combine with HIP_TRACE_API on ROCm\n"); +}; + + + +int parseStandardArguments(int argc, char *argv[]) +{ + for (int i = 1; i < argc; i++) { + const char *arg = argv[i]; + + if (!strcmp(arg, " ")) { + // skip NULL args. 
+ } else if (!strcmp(arg, "--iterations") || (!strcmp(arg, "-i"))) { + if (++i >= argc || !parseInt(argv[i], &p_iterations)) { + failed("Bad --iterations argument"); + } + + } else if (!strcmp(arg, "--device") || (!strcmp(arg, "-d"))) { + if (++i >= argc || !parseInt(argv[i], &p_device)) { + failed("Bad --device argument"); + } + + } else if (!strcmp(arg, "--file") || (!strcmp(arg, "-f"))) { + if (++i >= argc) { + failed("Bad --file argument"); + } else { + p_file = argv[i]; + } + + } else if (!strcmp(arg, "--commands") || (!strcmp(arg, "-c"))) { + if (++i >= argc) { + failed("Bad --commands argument"); + } else { + p_command = argv[i]; + } + + } else if (!strcmp(arg, "--verbose") || (!strcmp(arg, "-v"))) { + p_verbose = 1; + + } else if (!strcmp(arg, "--blockingSync") || (!strcmp(arg, "-B"))) { + p_blockingSync = 1; + + + } else if (!strcmp(arg, "--help") || (!strcmp(arg, "-h"))) { + help(); + exit(EXIT_SUCCESS); + + } else { + failed("Bad argument '%s'", arg); + } + } + + return 0; +}; + +// Returns the current system time in microseconds +inline long long get_time() +{ + struct timeval tv; + gettimeofday(&tv, 0); + return (tv.tv_sec * 1000000) + tv.tv_usec; +} + + +class Command; + + +//================================================================================================= +// A stream of commands , specified as a string. +class CommandStream { +public: + // State that is inherited by sub-blocks: + struct CommandStreamState { + hipStream_t _currentStream; + std::vector _streams; + vector _subBlocks; + }; +public: + CommandStream(std::string commandStreamString, int iterations); + ~CommandStream(); + + hipStream_t currentStream() const { return _state._currentStream; }; + + void print(const std::string &indent="") const; + void printBrief(std::ostream &s=std::cout) const ; + void run(); + void recordTime(); + void printTiming(int iterations=0); + + CommandStream *currentCommandStream() { + return _parseInSubBlock ? 
_state._subBlocks.back() : this; + }; + + void enterSubBlock(CommandStream *commandStream) { + _parseInSubBlock = true; + _state._subBlocks.push_back(commandStream); + }; + + void exitSubBlock() { + _parseInSubBlock = false; + }; + + + void setParent(CommandStream *parentCmdStream) + { + _parentCommandStream = parentCmdStream; + _state = parentCmdStream->_state; + }; + CommandStream * getParent() { return _parentCommandStream; }; + + void setStream(int streamIndex); + + CommandStreamState &getState() { return _state; }; + +private: + static void tokenize(const std::string &s, char delim, std::vector &tokens); + void parse(const std::string fullCmd); + +protected: + CommandStreamState _state; +private: + + + // List of commands to run in this stream: + std::vector _commands; + + + + // Number of iterations to run the command loop + int _iterations; + + + + + // Us to run the the command-stream. Only valid after run is called. + long long _startTime; + double _elapsedUs; + + // Track nested loop of command streams: + CommandStream *_parentCommandStream; + + // Track if we are parsing commands in the subblock. + bool _parseInSubBlock; + +}; + + +//================================================================================================= +class Command { +public: + + // @p minArgs : Minimum arguments for command. -1 = don't check. + // @p maxArgs : Minimum arguments for command. 0 means min=max, ie exact #arguments expected. -1 = don't check max. + Command(CommandStream *cmdStream, const std::vector &args, int minArgs=0, int maxArgs=0) + : _commandStream(cmdStream), + _args(args) + { + int numArgs = args.size() - 1; + + if ((minArgs != -1 ) && (numArgs < minArgs)) { + // TODO - print full command here. + failed ("Not enough arguments for command %s. 
(Expected %d, got %d)", args[0].c_str(), minArgs, numArgs); + } + + // Check for an exact number of arguments: + if (maxArgs == 0) { + maxArgs = minArgs; + } + if ((maxArgs != -1 ) && (numArgs > maxArgs)) { + failed ("Too many arguments for command %s. (Expected %d, got %d)", args[0].c_str(), maxArgs, numArgs); + } + }; + + void printBrief(std::ostream &s=std::cout) const + { + s << _args[0]; + } + + virtual ~Command() {}; + + virtual void print(const std::string &indent = "") const { + std::cout << indent << "["; + std::for_each(_args.begin(), _args.end(), [] (const std::string &s) { + std::cout << s; + }); + std::cout << "]"; + }; + + + virtual void run() = 0; + +protected: + int readIntArg(int argIndex, const std::string &argName) + { + // TODO - catch references to non-existant arguments here. + int argVal; + try { + argVal = std::stoi(_args[argIndex]); + } catch (std::invalid_argument) { + failed ("Command %s has bad %s argument ('%s')", _args[0].c_str(), argName.c_str(), _args[argIndex].c_str()); + } + return argVal; + } +protected: + CommandStream *_commandStream; + std::vector _args; +}; + + +#define FILENAME "nullkernel.hsaco" +#define KERNEL_NAME "NullKernel" + + +#ifdef __HIP_PLATFORM_HCC__ +//================================================================================================= +// Use Aql to launch the NULL kernel. 
+class AqlKernelCommand : public Command +{ +public: + AqlKernelCommand(CommandStream *cmdStream, const std::vector args) : + Command(cmdStream, args) + { + hc::accelerator_view *av; + HIPCHECK(hipHccGetAcceleratorView(cmdStream->currentStream(), &av)); + + hc::accelerator acc = av->get_accelerator(); + + hsa_region_t systemRegion = *(hsa_region_t*)acc.get_hsa_am_system_region(); + + _hsaAgent = *(hsa_agent_t*) acc.get_hsa_agent(); + + std::ifstream file(FILENAME, std::ios::binary | std::ios::ate); + std::streamsize fsize = file.tellg(); + file.seekg(0, std::ios::beg); + + std::vector buffer(fsize); + if (file.read(buffer.data(), fsize)) + { + uint64_t elfSize = ElfSize(&buffer[0]); + + assert(fsize == elfSize); + + //TODO - replace module load code with explicit module load and unload. + + hipModule_t module; + HIPCHECK(hipModuleLoadData(&module, &buffer[0])); + HIPCHECK(hipModuleGetFunction(&_function, module, KERNEL_NAME)); + + } else { + failed("could not open code object '%s'\n", FILENAME); + } + }; + + ~AqlKernelCommand() {}; + + void run() override { +#define LEN 64 + uint32_t len = LEN; + uint32_t one = 1; + + float *Ad = NULL; + + size_t argSize = 36; + char argBuffer[argSize]; + *(uint32_t*) (&argBuffer[0]) = len; + *(uint32_t*) (&argBuffer[4]) = one; + *(uint32_t*) (&argBuffer[8]) = one; + *(uint32_t*) (&argBuffer[12]) = len; + *(uint32_t*) (&argBuffer[16]) = one; + *(uint32_t*) (&argBuffer[20]) = one; + *(float**) (&argBuffer[24]) = Ad; // Ad pointer argument + + + void *config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0], + HIP_LAUNCH_PARAM_BUFFER_SIZE, &argSize, + HIP_LAUNCH_PARAM_END + }; + + hipModuleLaunchKernel(_function, len, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config); + }; + + +public: + hsa_queue_t _hsaQueue; + hsa_agent_t _hsaAgent; + + hipFunction_t _function; + +private: + static uint64_t ElfSize(const void *emi){ + const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)emi; + const Elf64_Shdr *shdr = (const Elf64_Shdr*)((char*)emi + 
ehdr->e_shoff); + + uint64_t max_offset = ehdr->e_shoff; + uint64_t total_size = max_offset + ehdr->e_shentsize * ehdr->e_shnum; + + for(uint16_t i=0;i < ehdr->e_shnum;++i){ + uint64_t cur_offset = static_cast(shdr[i].sh_offset); + if(max_offset < cur_offset){ + max_offset = cur_offset; + total_size = max_offset; + if(SHT_NOBITS != shdr[i].sh_type){ + total_size += static_cast(shdr[i].sh_size); + } + } + } + return total_size; + } +}; +#endif + +//================================================================================================= +// HCC optimizes away fully NULL kernel calls, so run one that is nearly null: +class ModuleKernelCommand : public Command +{ +public: + ModuleKernelCommand(CommandStream *cmdStream, const std::vector args) : + Command(cmdStream, args), + _stream (cmdStream->currentStream()) { + + hipModule_t module; + HIPCHECK(hipModuleLoad(&module, FILENAME)); + HIPCHECK(hipModuleGetFunction(&_function, module, KERNEL_NAME)); + }; + ~ModuleKernelCommand() {}; + + void run() override { +#define LEN 64 + uint32_t len = LEN; + uint32_t one = 1; + + float *Ad = NULL; + + size_t argSize = 36; + char argBuffer[argSize]; + *(uint32_t*) (&argBuffer[0]) = len; + *(uint32_t*) (&argBuffer[4]) = one; + *(uint32_t*) (&argBuffer[8]) = one; + *(uint32_t*) (&argBuffer[12]) = len; + *(uint32_t*) (&argBuffer[16]) = one; + *(uint32_t*) (&argBuffer[20]) = one; + *(float**) (&argBuffer[24]) = Ad; // Ad pointer argument + + + void *config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, &argBuffer[0], + HIP_LAUNCH_PARAM_BUFFER_SIZE, &argSize, + HIP_LAUNCH_PARAM_END + }; + + hipModuleLaunchKernel(_function, len, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config); + }; + + +public: + hipFunction_t _function; + hipStream_t _stream; +}; + + +class KernelCommand : public Command +{ +public: + enum Type {Null, VectorAdd}; + KernelCommand(CommandStream *cmdStream, const std::vector args, Type kind) : + Command(cmdStream, args), + _kind(kind), + _stream (cmdStream->currentStream()) 
{ + }; + ~KernelCommand() {}; + + + void run() override { + static const int gridX = 64; + static const int groupX = 64; + + switch (_kind) { + case Null: + hipLaunchKernel(NullKernel, dim3(gridX/groupX), dim3(gridX), 0, _stream, nullptr); + break; + case VectorAdd: + assert(0); // TODO + break; + }; + } +private: + Type _kind; + hipStream_t _stream; +}; + + +#ifdef __HIP_PLATFORM_HCC__ +//================================================================================================= +class PfeCommand : public Command +{ +public: + PfeCommand(CommandStream *cmdStream, const std::vector args, hipStream_t stream = 0) : + Command(cmdStream, args) + { + HIPCHECK(hipHccGetAcceleratorView(stream, &_av)); + } + + ~PfeCommand() { } + + + void run() override { + static const int gridX = 64; + static const int groupX = 64; + auto cf = hc::parallel_for_each(*_av, hc::extent<1>(gridX).tile(groupX), + [=](hc::index<1>& idx) __HC__ { + }); + } +private: + hc::accelerator_view *_av; +}; +#endif + + + +//================================================================================================= +class CopyCommand : public Command +{ +enum MemType {PinnedHost, UnpinnedHost, Device} ; + +public: + CopyCommand(CommandStream *cmdStream, const std::vector &args, hipMemcpyKind kind, bool isAsync, bool isPinnedHost) ; + + ~CopyCommand() + { + if (_dst) { + dealloc(_dst, _dstType); + _dst = NULL; + }; + + if (_src) { + dealloc(_src, _srcType); + _src = NULL; + } + } + + + void run() override { + if (_isAsync) { + HIPCHECK(hipMemcpyAsync(_dst, _src, _sizeBytes, _kind, _stream)); + } else { + HIPCHECK(hipMemcpy(_dst, _src, _sizeBytes, _kind)); + } + }; + +private: + void * alloc(size_t size, MemType memType) { + void * p; + if (memType == Device) { + HIPCHECK(hipMalloc(&p, size)); + + } else if (memType == PinnedHost) { + HIPCHECK(hipHostMalloc(&p, size)); + + } else if (memType == UnpinnedHost) { + p = (char*)malloc(size); + HIPASSERT(p, "malloc failed"); + + } else { + 
HIPASSERT(0, "unsupported memType"); + } + + return p; + }; + + + void dealloc(void *p, MemType memType) { + if (memType == Device) { + HIPCHECK(hipFree(p)); + } else if (memType == PinnedHost) { + HIPCHECK(hipHostFree(p)); + } else if (memType == UnpinnedHost) { + free(p); + } else { + HIPASSERT(0, "unsupported memType"); + } + } + + +private: + bool _isAsync; + hipStream_t _stream; + hipMemcpyKind _kind; + + size_t _sizeBytes; + void *_dst; + MemType _dstType; + + void *_src; + MemType _srcType; +}; + + +//================================================================================================= +class DeviceSyncCommand : public Command +{ +public: + DeviceSyncCommand(CommandStream *cmdStream, const std::vector &args) : + Command(cmdStream, args) {}; + + void run() override { + HIPCHECK(hipDeviceSynchronize()); + }; +}; + + +//================================================================================================= +class StreamSyncCommand : public Command +{ +public: + StreamSyncCommand(CommandStream *cmdStream, const std::vector &args) : + Command(cmdStream, args), + _stream(cmdStream->currentStream()) + {}; + + const char *help() { + return "synchronizes the current stream"; + }; + + + + void run() override { + HIPCHECK(hipStreamSynchronize(_stream)); + }; + +private: + hipStream_t _stream; +}; + + +//================================================================================================= + +//================================================================================================= +class LoopCommand : public Command +{ +public: + LoopCommand(CommandStream *parentCmdStream, const std::vector &args) : + Command(parentCmdStream, args, 1) + { + int loopCnt; + try { + loopCnt = std::stoi(args[1]); + } catch (std::invalid_argument) { + failed ("bad LOOP_CNT=%s", args[1].c_str()); + } + + _commandStream = new CommandStream("", loopCnt); + _commandStream->setParent(parentCmdStream); + parentCmdStream->enterSubBlock(_commandStream); + + 
}; + + + void print(const std::string &indent = "") const override { + Command::print(); + _commandStream->print (indent + " "); + }; + + void run() override { + _commandStream->run(); + }; +}; + + +//================================================================================================= +class EndBlockCommand : public Command +{ +public: + EndBlockCommand(CommandStream *blockCmdStream, CommandStream *parentCmdStream, const std::vector &args) : + Command(parentCmdStream, args, 0, 1), + _blockCmdStream(blockCmdStream), + _printTiming(0) + { + int argCnt = args.size()-1; + if (argCnt >= 1 ) { + _printTiming = readIntArg(1, "PRINT_TIMING"); + } + + if (parentCmdStream == nullptr) { + failed ("%s without corresponding command to start block", args[0].c_str()); + } + parentCmdStream->exitSubBlock(); + }; + + void run() override { + if (_printTiming) { + _blockCmdStream->printTiming(); + } + + }; +private: + + CommandStream *_blockCmdStream; + + // print the stream when loop exits. + int _printTiming; +}; + + +//================================================================================================= +class SetStreamCommand : public Command +{ +public: + SetStreamCommand(CommandStream *cmdStream, const std::vector &args) : + Command(cmdStream, args, 1) + { + int streamIndex = readIntArg(1, "STREAM_INDEX"); + + cmdStream->setStream(streamIndex); + + }; + + void run() override { + }; +}; + + +//================================================================================================= +class PrintTimingCommand : public Command +{ +public: + PrintTimingCommand(CommandStream *cmdStream, const std::vector &args) + : Command(cmdStream, args, 1) + { + _iterations = readIntArg(1, "ITERATIONS"); + }; + + void run() override { + _commandStream->printTiming(_iterations); + }; + +private: + int _iterations; +}; + + +//================================================================================================= +CopyCommand::CopyCommand(CommandStream 
*cmdStream, const std::vector &args, + hipMemcpyKind kind, bool isAsync, bool isPinnedHost) : + Command(cmdStream, args) , + _isAsync(isAsync), + _kind(kind), + _stream(cmdStream->currentStream()) + { + switch (kind) { + case hipMemcpyDeviceToHost: + _srcType = Device; + _dstType = isPinnedHost ? PinnedHost : UnpinnedHost; + break; + case hipMemcpyHostToDevice: + _srcType = isPinnedHost ? PinnedHost : UnpinnedHost; + _dstType = Device; + break; + default: + HIPASSERT(0, "Unknown hipMemcpyKind"); + }; + + _sizeBytes = 64; //TODO, support reading from arg. + + _dst = alloc(_sizeBytes, _dstType); + _src = alloc(_sizeBytes, _srcType); + }; + + +//================================================================================================= +//================================================================================================= +// Implementations: +//================================================================================================= + +//================================================================================================= +CommandStream::CommandStream(std::string commandStreamString, int iterations) + : _iterations(iterations), + _startTime(0), + _elapsedUs(0.0), + _parentCommandStream(nullptr), + _parseInSubBlock(false) +{ + std::vector tokens; + tokenize(commandStreamString, ';', tokens); + + + std::for_each(tokens.begin(), tokens.end(), [&] (const std::string s) { + this->parse(s); + }); + + setStream(0); +} + + +CommandStream::~CommandStream() +{ + std::for_each(_state._streams.begin(), _state._streams.end(), [&] (hipStream_t s) { + if (s) { + HIPCHECK(hipStreamDestroy(s)); + } + }); + + std::for_each(_commands.begin(), _commands.end(), [&] (Command *c) { + delete c; + }); + + +} + + +void CommandStream::setStream(int streamIndex) +{ + + if (streamIndex >= _state._streams.size()) { + _state._streams.resize(streamIndex+1); + } + + if (streamIndex && (_state._streams[streamIndex] == nullptr)) { + // Create new stream: + 
hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + _state._streams[streamIndex] = stream; + _state._currentStream = stream; + } else { + // Use existing stream: + + _state._currentStream = _state._streams[streamIndex]; + } + +} + + +void CommandStream::tokenize(const std::string &s, char delim, std::vector &tokens) +{ + std::stringstream ss; + ss.str(s); + std::string item; + while (getline(ss, item, delim)) { + item.erase (std::remove (item.begin(), item.end(), ' '), item.end()); // remove whitespace. + tokens.push_back(item); + } +} + +void trim(std::string *s) +{ + // trim whitespace from begin and end: + const char *t = "\t\n\r\f\v"; + s->erase(0, s->find_first_not_of(t)); + s->erase(s->find_last_not_of(t)+1); +} + +void ltrim(std::string *s) +{ + // trim whitespace from begin and end: + const char *t = "\t\n\r\f\v"; + s->erase(0, s->find_first_not_of(t)); +} + +void CommandStream::parse(std::string fullCmd) +{ + //convert to lower-case: + std::transform(fullCmd.begin(), fullCmd.end(), fullCmd.begin(), ::tolower); + trim(&fullCmd); + + if (p_db) { + printf ("parse: <%s>\n", fullCmd.c_str()); + } + + + + std::string c; + std::vector args; + size_t leftParenZ = fullCmd.find_first_of('('); + if (leftParenZ == string::npos) { + c = fullCmd; + args.push_back(c); + } else { + c = fullCmd.substr(0, leftParenZ); + args.push_back(c); + size_t rightParenZ = fullCmd.find_first_of(')', leftParenZ); + std::string argStr = fullCmd.substr(leftParenZ+1, rightParenZ-leftParenZ-1); + //printf ("c=%s argstr='%s' leftParenZ=%zu rightParenZ=%zu\n", c.c_str(), argStr.c_str(), leftParenZ, rightParenZ); + tokenize(argStr, ',', args); + + } + + + + + + if ((args.size()==0) || (fullCmd.c_str()[0] == '#') ) { + if (p_db) { + printf (" skip comment\n"); + } + return; + } + + + + Command *cmd = NULL; + CommandStream *cmdStream = currentCommandStream(); + + if (c == "h2d") { + cmd = new CopyCommand(cmdStream, args, hipMemcpyHostToDevice, true/*isAsync*/, true/*isPinned*/); + //= 
h2d + //= Performs an async host-to-device copy of array A_h to A_d. + //= The size of these arrays may be set with the datasize command. + + } else if (c == "d2h") { + cmd = new CopyCommand(cmdStream, args, hipMemcpyDeviceToHost, true/*isAsync*/, true/*isPinned*/); + //= d2h + //= Performs an async device-to-host copy of array A_d to A_h. + //= The size of these arrays may be set with the datasize command. + + } else if (c == "modulekernel") { + cmd = new ModuleKernelCommand(cmdStream, args); + + } else if (c == "nullkernel") { + cmd = new KernelCommand(cmdStream, args, KernelCommand::Null); + //= nullkernel + //= Dispatches a null kernel to the device. + + } else if (c == "vectoraddkernel") { + cmd = new KernelCommand(cmdStream, args, KernelCommand::VectorAdd); + +#ifdef __HIP_PLATFORM_HCC__ + } else if (c == "nullpfe") { + cmd = new PfeCommand(cmdStream, args); + + } else if (c == "aqlkernel") { + cmd = new AqlKernelCommand(cmdStream, args); +#endif + + } else if (c == "devicesync") { + cmd = new DeviceSyncCommand(cmdStream, args); + + } else if (c == "streamsync") { + //= streamsync + //= Execute hipStreamSynchronize. + //= This will cause the host thread to wait until the current stream + //= completes all pending operations. + cmd = new StreamSyncCommand(cmdStream, args); + + } else if (c == "setstream") { + //= setstream(STREAM_INDEX); + //= Set current stream used by subsequent commands. + //= STREAM_INDEX is index starting from 0...N. + //= This function will create new stream on first call to setstream or re-use previous + //= stream if setstream has already been called with STREAM_INDEX. + //= STREAM_INDEX=0 will use the default "null" stream associated with the device, and will not create a new stream. + //= The default stream has special, conservative synchronization properties. 
+ + cmd = new SetStreamCommand(cmdStream, args); + + } else if (c == "printtiming") { + cmd = new PrintTimingCommand(cmdStream, args); + + } else if (c == "loop") { + //= loop(LOOP_CNT) + //= Loop over next set of commands (until 'endloop' command) for LOOP_CNT iterations. + //= Loops can be nested. + + cmd = new LoopCommand(cmdStream, args); + + } else if (c == "endloop") { + //= endloop + //= End a looped sequence. Must be paired with a preceding loop command. + //= Command between the `loop` and `endloop` must be executed + + CommandStream * parentCmdStream = cmdStream->getParent() ; + cmd = new EndBlockCommand(cmdStream, parentCmdStream, args); + cmdStream = parentCmdStream; + + } else { + std::cerr << "error: Bad command '" << fullCmd << "\n"; + HIPASSERT(0, "bad command in command-stream"); + } + + if (cmd) { + cmdStream->_commands.push_back(cmd); + } +} + + + + +void CommandStream::print(const std::string &indent) const +{ + for (auto cmdI = _commands.begin(); cmdI != _commands.end(); cmdI++) { + (*cmdI)->print(indent); + }; +} + + +void CommandStream::printBrief(std::ostream &s) const +{ + for (auto cmdI = _commands.begin(); cmdI != _commands.end(); cmdI++) { + (*cmdI)->printBrief(s); + s << ";"; + }; +} + +void CommandStream::run() +{ + _startTime = get_time(); + for (int i=0; i<_iterations; i++) { + for (auto cmdI = _commands.begin(); cmdI != _commands.end(); cmdI++) { + if (p_verbose) { + (*cmdI)->print(); + } + (*cmdI)->run(); + } + } + + // Record time, if not already stored. 
(an earlier printTime command will also store the time) + recordTime(); +}; + +void CommandStream::recordTime() +{ + if (_elapsedUs == 0.0) { + auto stopTime = get_time(); + _elapsedUs = stopTime - _startTime; + } +} + + +void CommandStream::printTiming(int iterations) +{ + + if ((_state._subBlocks.size() == 1) && (_commands.size()==1)) { + //printf ("print just the loop\n"); + _state._subBlocks.front()->printTiming(iterations); + } else { + g_printedTiming = true; + + recordTime(); + if (iterations == 0) { + iterations = _iterations; + } + std::cout << "command<"; printBrief(std::cout); + std::cout << ">," ; + printf (" iterations,%d, total_time,%6.3f, time/iteration,%6.3f\n", iterations, _elapsedUs, _elapsedUs/iterations); + } +}; + + + + + +//================================================================================================= +int main(int argc, char *argv[]) +{ + parseStandardArguments(argc, argv); + + printConfig(); + + CommandStream *cs; + + if (p_blockingSync) { +#ifdef __HIP_PLATFORM_HCC__ + printf ("setting BlockingSync for AMD\n"); + setenv("HIP_BLOCKING_SYNC", "1", 1); + +#endif +#ifdef __HIP_PLATFORM_NVCC__ + printf ("setting cudaDeviceBlockingSync\n"); + HIPCHECK(hipSetDeviceFlags(cudaDeviceBlockingSync)); +#endif + }; + + + if (p_file) { + // TODO - catch exception on file IO here: + std::ifstream file(p_file); + std::string str; + std::string file_contents; + while (std::getline(file, str)) + { + file_contents += str; + } + + cs = new CommandStream(file_contents, p_iterations); + + } else { + cs = new CommandStream(p_command, p_iterations); + } + + cs->print(); + printf ("------\n"); + + cs->run(); + if (!g_printedTiming) { + cs->printTiming(); + } + + delete cs; +} + + + +// TODO - add error checking for arguments. 
diff --git a/samples/1_Utils/hipCommander/l2.hcm b/samples/1_Utils/hipCommander/l2.hcm new file mode 100644 index 000000000..b541bd6a6 --- /dev/null +++ b/samples/1_Utils/hipCommander/l2.hcm @@ -0,0 +1,3 @@ +setstream,1; +NullKernel; streamsync; +loop,10000; H2D; NullKernel; streamsync; endloop,1; diff --git a/samples/1_Utils/hipCommander/loop.hcm b/samples/1_Utils/hipCommander/loop.hcm new file mode 100644 index 000000000..db1107120 --- /dev/null +++ b/samples/1_Utils/hipCommander/loop.hcm @@ -0,0 +1,3 @@ +loop,1000; H2D; NullKernel; D2H; endloop; +streamsync; +printTiming, 1000 diff --git a/samples/1_Utils/hipCommander/loop2.hcm b/samples/1_Utils/hipCommander/loop2.hcm new file mode 100644 index 000000000..b8a14aa15 --- /dev/null +++ b/samples/1_Utils/hipCommander/loop2.hcm @@ -0,0 +1,2 @@ +setstream,1; +loop,1000; NullKernel; syncstream; endloop,1, diff --git a/samples/1_Utils/hipCommander/nullkernel.hip.cpp b/samples/1_Utils/hipCommander/nullkernel.hip.cpp new file mode 100644 index 000000000..890e9bdc1 --- /dev/null +++ b/samples/1_Utils/hipCommander/nullkernel.hip.cpp @@ -0,0 +1,7 @@ +#include "hip/hip_runtime.h" + +extern "C" __global__ void NullKernel(hipLaunchParm lp, float* Ad){ + if (Ad) { + Ad[0] = 42; + } +} diff --git a/samples/1_Utils/hipCommander/nullkernel.hsaco b/samples/1_Utils/hipCommander/nullkernel.hsaco new file mode 100755 index 0000000000000000000000000000000000000000..585b55cce587ed928acc1308d8a76361cdd87e57 GIT binary patch literal 10265 zcmeHNO>7%Q6n-9W){e7rDA1N4A(nuuP*E$pN!(QGAwP9XMB;{~MIu7lxa&>h;Gf9e zRH;N?J4e){oH(I02gIQ#BrYIvLE_l(-t3J3NKt=6)k-_E=gs?@ zo&DZ>E6;i^Cq_@QKmcS5;-JO>3<$-{8`6Sg{Sd;w9U`EjSi$drP&d;xF`1-cVMk)} zyI<=RN=e~4&yxIFXejt*dOYOWNhdcY$op{GQ5pX!{nq9&&*`z#++QqalPqA>_pO> zv5lj4Avc$G+)N>lv$^v{CoyiAWIf>)om9?p<U<1^Z(sJ!sobsZ=PAn7uXqa53X% z>{NCrRdOtMBst;sSeDV3n|GasL?SVM)|wcya*5=*Y;vO8G7LA9b<$QQpLP~TMl%MQ zB&^++pBuOB-IqT5^e=sOwi;c3;3ME8;3ME8;3ME8;3ME8;3ME8@T?;swu88Lqn#`6 
z>ZIU4H?{=tzzx*3;oiT#y;#TW=X2ErilXf0mbNzW9*h{UP1@I9Vd^GLcNwm4z)o~! zX9pl04S#{CnCrkgBk;@OtiE=u99~;H2WuNqO(`#&gwLu%#VEL9i;PS%*?aek}+q+w3*kPhaH};D@b>%AgWfdococln}#6tZWBckhX-) zV2cvRz4jf}imN4Gd;t=y`u?=V(3h!dZ&dwi5%1+rWjR3aNRZydXyV*q21;A_5EJ7i zC1T!I!Y0kx(h{M4JfAzNhU4U~gdz}CRa347<9Aq5)x${RNaGdt(?!x%*v$a04d=PK z9aV7sfy%9QCETe&#B673IS|tNSpdvHE9+NTv>bt-7Xha3w%|LOS*WbeAx34mckQp8 zYeZMe;d`q9OB)E`6KgWvA2k1$A0Yn^Kx@PLpS}Foy!_X8&wmT^U#rXiE02=@FN^#~ z`pEf@zDtYKP|40gfr}70iFFF7GwWdtMK1RD)hE z{&)UkePaG=SYs-DwvEpOtm_rdN41FA!L)LPDdkYR*1xWXJ45jK_Sd{tmTv{CwV{*& z=I&y-1Jy*REn=$sw0?j;ws>{2<<&A1b;9>wbO{vLU|F*D~{|R>qy=PVHNl$xYG9Mf} zLBF|qBl)ZNrC*99SMWpu*-?`+l%Ei$H#jE$k`eNO>-gO@T zu1)JGAd~KxfuDHKG4n`Y`dF{ilRnaKPxtkhaeZ|3gE7ykbTR)nK&*5r=cdjhb&DRI zk)!mbJSrhJlPb+Xj5xOlG5j0Y3^n9MnicmN%x^lVpDJW zx9)MddLfRbpA-`>V=rYyzkSQ&WFFG{Fv48oFY}Myi0pR#xkH{my{W2iO>DiYVeHu{ zaQ*oPv52(-XPe~h+)RP*_BYyl$6XZoq)E + +static const int BLOCKSIZEX=32; +static const int BLOCKSIZEY=16; + +__global__ void fails(hipLaunchParm lp, float* pErrorI) +{ + if(pErrorI!=0) + { + pErrorI[0]=1; + } +} + +int main() +{ + dim3 blocks(1,1); + dim3 threads(BLOCKSIZEX,BLOCKSIZEY); + float error; + + hipLaunchKernel(HIP_KERNEL_NAME(fails), blocks, threads, 0, 0, &error); +} From 70716bee425f1268b67ef8ac84f1d1a3188ed6de Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Wed, 14 Dec 2016 15:49:40 +0530 Subject: [PATCH 091/700] Fixes in Makefile of couple of samples - modified Makefile for hipblas_saxpy to replaced hcblas.so with hipblas.so as part of HCSWAP-100 - Resolved missing separator issue in peer2peer cookbook Makefile Change-Id: I678fea267eee1481f02da09379339ed78d3f95f2 --- samples/2_Cookbook/8_peer2peer/Makefile | 13 +++++++------ samples/7_Advanced/hipblas_saxpy/Makefile | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/samples/2_Cookbook/8_peer2peer/Makefile b/samples/2_Cookbook/8_peer2peer/Makefile index 5cb747392..0bf9e6f93 100644 --- a/samples/2_Cookbook/8_peer2peer/Makefile +++ 
b/samples/2_Cookbook/8_peer2peer/Makefile @@ -1,6 +1,6 @@ HIP_PATH?= $(wildcard /opt/rocm/hip) ifeq (,$(HIP_PATH)) - HIP_PATH=../../.. + HIP_PATH=../../.. endif HIPCC=$(HIP_PATH)/bin/hipcc @@ -22,14 +22,15 @@ CXX=$(HIPCC) $(EXECUTABLE): $(OBJECTS) - $(HIPCC) $(OBJECTS) -o $@ + $(HIPCC) $(OBJECTS) -o $@ + test: $(EXECUTABLE) - $(EXECUTABLE) + $(EXECUTABLE) clean: -rm -f $(EXECUTABLE) -rm -f $(OBJECTS) -rm -f $(HIP_PATH)/src/*.o + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o diff --git a/samples/7_Advanced/hipblas_saxpy/Makefile b/samples/7_Advanced/hipblas_saxpy/Makefile index ed88be2dd..8586e75d2 100644 --- a/samples/7_Advanced/hipblas_saxpy/Makefile +++ b/samples/7_Advanced/hipblas_saxpy/Makefile @@ -12,7 +12,7 @@ endif ifeq (${HIP_PLATFORM}, hcc) HCBLAS_ROOT?= $(wildcard /opt/rocm/hcblas) HIPCC_FLAGS += -stdlib=libc++ -I$(HCBLAS_ROOT)/include - LIBS = -L$(HCBLAS_ROOT)/lib -lhcblas + LIBS = -L$(HCBLAS_ROOT)/lib -lhipblas -rpath $(HIP_PATH)/lib endif From 5ae4e8bd6783b63a4d6974c97e1d37b3d41b31fb Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Wed, 14 Dec 2016 15:49:40 +0530 Subject: [PATCH 092/700] Fixes in Makefile of couple of samples - modified Makefile for hipblas_saxpy to replaced hcblas.so with hipblas.so as part of HCSWAP-100 - Resolved missing separator issue in peer2peer cookbook Makefile Change-Id: I678fea267eee1481f02da09379339ed78d3f95f2 --- samples/2_Cookbook/8_peer2peer/Makefile | 13 +++++++------ samples/7_Advanced/hipblas_saxpy/Makefile | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/samples/2_Cookbook/8_peer2peer/Makefile b/samples/2_Cookbook/8_peer2peer/Makefile index 5cb747392..0bf9e6f93 100644 --- a/samples/2_Cookbook/8_peer2peer/Makefile +++ b/samples/2_Cookbook/8_peer2peer/Makefile @@ -1,6 +1,6 @@ HIP_PATH?= $(wildcard /opt/rocm/hip) ifeq (,$(HIP_PATH)) - HIP_PATH=../../.. + HIP_PATH=../../.. 
endif HIPCC=$(HIP_PATH)/bin/hipcc @@ -22,14 +22,15 @@ CXX=$(HIPCC) $(EXECUTABLE): $(OBJECTS) - $(HIPCC) $(OBJECTS) -o $@ + $(HIPCC) $(OBJECTS) -o $@ + test: $(EXECUTABLE) - $(EXECUTABLE) + $(EXECUTABLE) clean: -rm -f $(EXECUTABLE) -rm -f $(OBJECTS) -rm -f $(HIP_PATH)/src/*.o + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o diff --git a/samples/7_Advanced/hipblas_saxpy/Makefile b/samples/7_Advanced/hipblas_saxpy/Makefile index ed88be2dd..8586e75d2 100644 --- a/samples/7_Advanced/hipblas_saxpy/Makefile +++ b/samples/7_Advanced/hipblas_saxpy/Makefile @@ -12,7 +12,7 @@ endif ifeq (${HIP_PLATFORM}, hcc) HCBLAS_ROOT?= $(wildcard /opt/rocm/hcblas) HIPCC_FLAGS += -stdlib=libc++ -I$(HCBLAS_ROOT)/include - LIBS = -L$(HCBLAS_ROOT)/lib -lhcblas + LIBS = -L$(HCBLAS_ROOT)/lib -lhipblas -rpath $(HIP_PATH)/lib endif From 054fc61f6e3beb7cd87c21fadf43ff4395bad410 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 15 Dec 2016 14:41:27 -0600 Subject: [PATCH 093/700] remove TODO file --- samples/1_Utils/hipCommander/TODO | 50 ------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 samples/1_Utils/hipCommander/TODO diff --git a/samples/1_Utils/hipCommander/TODO b/samples/1_Utils/hipCommander/TODO deleted file mode 100644 index 4c835cfce..000000000 --- a/samples/1_Utils/hipCommander/TODO +++ /dev/null @@ -1,50 +0,0 @@ -_ Add AQL kernel. -_ Fix &*kernel command so the kernel name/type is an argument not a new command. - -_ Add command to parse only. -_ Add regression to parse all the hcm files. - -_ Partition HCC, HIP, HSA, OpenCL commands into separate files. - - -_ Show time for back-to-back copies. -_ Add variables. - %loopcnt - - ./hipCommander %loopcnt=4 - -_ Add datasize command. - - -_ Add ( ) to parsing. -_ Add argument parsing and checking. - -_ Add verbose option to print each step of setup. - - print deliniater between setup and run. Add run start message. - - - print sizes of all buffers. - - print each command before running. 
- - show start/stop of timer routine. - -_ -_ Clear documentation on what each oepration does. -_ Add time instrumentation for each command. -_ Add pcie atomic. - - -_ Add tests for negative cases, ie endloop w/o opening loop. - - -README tips ---- -- HIP_API_TRACE combined with -v is useful to track the exact commands generates by hipCommander. - - -Other ideas: ---- -[ ] Perf guide : stream creation very slow on HCC and should be avoided. - - -Scratch: - - From cee24a20f2edb40eea955e223729dd922944dac0 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 16 Dec 2016 08:55:11 -0600 Subject: [PATCH 094/700] Print limits on CUDA devices --- samples/1_Utils/hipInfo/hipInfo.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index 42a879e73..cf4660eae 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -63,6 +63,14 @@ double bytesToGB(size_t s) return (double)s / (1024.0*1024.0*1024.0); } +#define printLimit(w1, limit, units) \ +{\ + size_t val;\ + cudaDeviceGetLimit(&val, limit);\ + std::cout << setw(w1) << #limit": " << val << " " << units << std::endl;\ +} + + void printDeviceProp (int deviceId) { using namespace std; @@ -144,6 +152,17 @@ void printDeviceProp (int deviceId) cout << endl; +#ifdef __HIP_PLATFORM_NVCC__ + // Limits: + cout << endl; + printLimit(w1, cudaLimitStackSize, "bytes/thread"); + printLimit(w1, cudaLimitPrintfFifoSize, "bytes/device"); + printLimit(w1, cudaLimitMallocHeapSize, "bytes/device"); + printLimit(w1, cudaLimitDevRuntimeSyncDepth, "grids"); + printLimit(w1, cudaLimitDevRuntimePendingLaunchCount, "launches"); +#endif + + cout << endl; From ba8fe1675fe087bf44c6d37d7108246286bcd333 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Thu, 19 Jan 2017 15:04:32 +0530 Subject: [PATCH 095/700] Fixed hipcommander default execution for HCSWAP-106 Change-Id: I9fbd10dfaeeb4928b2ec23ceed131b5200a658f9 --- 
samples/1_Utils/hipCommander/hipCommander.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/1_Utils/hipCommander/hipCommander.cpp b/samples/1_Utils/hipCommander/hipCommander.cpp index 9c07d066b..0add1ce3e 100644 --- a/samples/1_Utils/hipCommander/hipCommander.cpp +++ b/samples/1_Utils/hipCommander/hipCommander.cpp @@ -23,7 +23,7 @@ bool g_printedTiming = false; // Cmdline parms: int p_device = 0; -const char* p_command = "H2D; NullKernel; D2H"; +const char* p_command = "setstream(1); H2D; NullKernel; D2H;"; const char* p_file = nullptr; unsigned p_verbose = 0x0; unsigned p_db = 0x0; From 0390b12175aea6190b021c3758a675a4dffe4ad4 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 19 Jan 2017 12:33:06 -0600 Subject: [PATCH 096/700] Doc update - describe debug techniques Also tweak sample to remove unneeded HIP_KERNEL_NAME. Comment update --- samples/0_Intro/square/square.hipref.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp index 3c863b8b7..0073c1399 100644 --- a/samples/0_Intro/square/square.hipref.cpp +++ b/samples/0_Intro/square/square.hipref.cpp @@ -81,7 +81,7 @@ int main(int argc, char *argv[]) const unsigned threadsPerBlock = 256; printf ("info: launch 'vector_square' kernel\n"); - hipLaunchKernel(HIP_KERNEL_NAME(vector_square), dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N); + hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N); printf ("info: copy Device2Host\n"); CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); From dc2f5fcc7d9969d322bc4c7ae55556afc6234dc0 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Tue, 7 Feb 2017 15:03:46 +0530 Subject: [PATCH 097/700] Command scripts for latency measurements Change-Id: I8c28765a09fb0358447367939de524b12699a317 --- .../hipCommander/perf/scripts/latency_2_d2h_h2d.hcm | 9 +++++++++ 
.../hipCommander/perf/scripts/latency_2_d2h_kernel.hcm | 9 +++++++++ .../perf/scripts/latency_2_d2h_sync_h2d.hcm | 9 +++++++++ .../perf/scripts/latency_2_d2h_sync_kernel.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_2_h2d_d2h.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_2_h2d_kernel.hcm | 9 +++++++++ .../perf/scripts/latency_2_h2d_sync_d2h.hcm | 9 +++++++++ .../perf/scripts/latency_2_h2d_sync_kernel.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_2_kernel_d2h.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_2_kernel_h2d.hcm | 9 +++++++++ .../perf/scripts/latency_2_kernel_sync_d2h.hcm | 9 +++++++++ .../perf/scripts/latency_2_kernel_sync_h2d.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_2_sync.hcm | 9 +++++++++ .../1_Utils/hipCommander/perf/scripts/latency_2d2h.hcm | 10 ++++++++++ .../hipCommander/perf/scripts/latency_2d2h_wosync.hcm | 10 ++++++++++ .../1_Utils/hipCommander/perf/scripts/latency_2h2d.hcm | 10 ++++++++++ .../hipCommander/perf/scripts/latency_2h2d_kernel.hcm | 9 +++++++++ .../perf/scripts/latency_2h2d_kernel_wosync.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_2h2d_wosync.hcm | 10 ++++++++++ .../hipCommander/perf/scripts/latency_2kernels.hcm | 10 ++++++++++ .../perf/scripts/latency_2kernels_wosync.hcm | 10 ++++++++++ .../hipCommander/perf/scripts/latency_3_d2h_h2d.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_3_d2h_kernel.hcm | 9 +++++++++ .../perf/scripts/latency_3_d2h_sync_h2d.hcm | 9 +++++++++ .../perf/scripts/latency_3_d2h_sync_kernel.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_3_h2d_d2h.hcm | 10 ++++++++++ .../hipCommander/perf/scripts/latency_3_h2d_kernel.hcm | 9 +++++++++ .../perf/scripts/latency_3_h2d_sync_d2h.hcm | 9 +++++++++ .../perf/scripts/latency_3_h2d_sync_kernel.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_3_kernel_d2h.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_3_kernel_h2d.hcm | 9 +++++++++ .../perf/scripts/latency_3_kernel_sync_d2h.hcm 
| 9 +++++++++ .../perf/scripts/latency_3_kernel_sync_h2d.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_3_sync.hcm | 9 +++++++++ .../1_Utils/hipCommander/perf/scripts/latency_3d2h.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_3d2h_wosync.hcm | 9 +++++++++ .../1_Utils/hipCommander/perf/scripts/latency_3h2d.hcm | 10 ++++++++++ .../hipCommander/perf/scripts/latency_3h2d_wosync.hcm | 10 ++++++++++ .../hipCommander/perf/scripts/latency_3kernels.hcm | 10 ++++++++++ .../perf/scripts/latency_3kernels_wosync.hcm | 10 ++++++++++ .../hipCommander/perf/scripts/latency_4kernels.hcm | 10 ++++++++++ .../hipCommander/perf/scripts/latency_5kernels.hcm | 10 ++++++++++ .../hipCommander/perf/scripts/latency_6kernels.hcm | 10 ++++++++++ .../1_Utils/hipCommander/perf/scripts/latency_d2h.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_d2h_h2d.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_d2h_kernel.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_d2h_sync_h2d.hcm | 9 +++++++++ .../perf/scripts/latency_d2h_sync_kernel.hcm | 9 +++++++++ .../1_Utils/hipCommander/perf/scripts/latency_h2d.hcm | 10 ++++++++++ .../hipCommander/perf/scripts/latency_h2d_10.hcm | 2 ++ .../hipCommander/perf/scripts/latency_h2d_d2h.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_h2d_kernel.hcm | 9 +++++++++ .../perf/scripts/latency_h2d_kernel_d2h.hcm | 9 +++++++++ .../perf/scripts/latency_h2d_kernel_d2h_wosync.hcm | 9 +++++++++ .../perf/scripts/latency_h2d_kernel_wosync.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_h2d_sync_d2h.hcm | 9 +++++++++ .../perf/scripts/latency_h2d_sync_kernel_sync.hcm | 9 +++++++++ .../perf/scripts/latency_h2d_sync_kernel_sync_d2h.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_kernel.hcm | 10 ++++++++++ .../perf/scripts/latency_kernel_barrier.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_kernel_d2h.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_kernel_h2d.hcm | 9 +++++++++ 
.../perf/scripts/latency_kernel_sync_d2h.hcm | 9 +++++++++ .../perf/scripts/latency_kernel_sync_h2d.hcm | 9 +++++++++ .../hipCommander/perf/scripts/latency_streamcreate.hcm | 2 ++ .../1_Utils/hipCommander/perf/scripts/latency_sync.hcm | 9 +++++++++ 66 files changed, 596 insertions(+) create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2_d2h_h2d.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2_d2h_kernel.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2_d2h_sync_h2d.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2_d2h_sync_kernel.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2_h2d_d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2_h2d_kernel.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2_h2d_sync_d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2_h2d_sync_kernel.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2_kernel_d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2_kernel_h2d.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2_kernel_sync_d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2_kernel_sync_h2d.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2_sync.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2d2h_wosync.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2h2d.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2h2d_kernel.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2h2d_kernel_wosync.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2h2d_wosync.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2kernels.hcm 
create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_2kernels_wosync.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3_d2h_h2d.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3_d2h_kernel.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3_d2h_sync_h2d.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3_d2h_sync_kernel.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3_h2d_d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3_h2d_kernel.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3_h2d_sync_d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3_h2d_sync_kernel.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3_kernel_d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3_kernel_h2d.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3_kernel_sync_d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3_kernel_sync_h2d.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3_sync.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3d2h_wosync.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3h2d.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3h2d_wosync.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3kernels.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_3kernels_wosync.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_4kernels.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_5kernels.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_6kernels.hcm create mode 100644 
samples/1_Utils/hipCommander/perf/scripts/latency_d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_d2h_h2d.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_d2h_kernel.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_d2h_sync_h2d.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_d2h_sync_kernel.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_h2d.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_h2d_10.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_h2d_d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_h2d_kernel.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_h2d_kernel_d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_h2d_kernel_d2h_wosync.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_h2d_kernel_wosync.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_h2d_sync_d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_h2d_sync_kernel_sync.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_h2d_sync_kernel_sync_d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_kernel.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_kernel_barrier.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_kernel_d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_kernel_h2d.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_kernel_sync_d2h.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_kernel_sync_h2d.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_streamcreate.hcm create mode 100644 samples/1_Utils/hipCommander/perf/scripts/latency_sync.hcm diff --git 
a/samples/1_Utils/hipCommander/perf/scripts/latency_2_d2h_h2d.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2_d2h_h2d.hcm new file mode 100644 index 000000000..640bb2be7 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2_d2h_h2d.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); D2H; H2D; streamsync;D2H; H2D; streamsync; endloop(1); +loop(10); D2H; H2D; streamsync;D2H; H2D; streamsync; endloop(1); +loop(100); D2H; H2D; streamsync;D2H; H2D; streamsync; endloop(1); +loop(100); D2H; H2D; streamsync;D2H; H2D; streamsync; endloop(1); +loop(1000); D2H; H2D; streamsync;D2H; H2D; streamsync; endloop(1); +loop(1000); D2H; H2D; streamsync;D2H; H2D; streamsync; endloop(1); +loop(10000); D2H; H2D; streamsync;D2H; H2D; streamsync; endloop(1); +loop(10000); D2H; H2D; streamsync;D2H; H2D; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2_d2h_kernel.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2_d2h_kernel.hcm new file mode 100644 index 000000000..c1bc0f670 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2_d2h_kernel.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync; endloop(1); +loop(10); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync; endloop(1); +loop(100); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync; endloop(1); +loop(100); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync; endloop(1); +loop(1000); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync; endloop(1); +loop(1000); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync; endloop(1); +loop(10000); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync; endloop(1); +loop(10000); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2_d2h_sync_h2d.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2_d2h_sync_h2d.hcm new file mode 100644 index 
000000000..0e787f9bd --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2_d2h_sync_h2d.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; endloop(1); +loop(10); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; endloop(1); +loop(100); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; endloop(1); +loop(100); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; endloop(1); +loop(1000); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; endloop(1); +loop(1000); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; endloop(1); +loop(10000); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; endloop(1); +loop(10000); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2_d2h_sync_kernel.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2_d2h_sync_kernel.hcm new file mode 100644 index 000000000..8d7fddc14 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2_d2h_sync_kernel.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; endloop(1); +loop(10); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; endloop(1); +loop(100); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; endloop(1); +loop(100); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; endloop(1); +loop(1000); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; endloop(1); +loop(1000); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; endloop(1); +loop(10000); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; endloop(1); +loop(10000); D2H; streamsync; NullKernel; streamsync;D2H; 
streamsync; NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2_h2d_d2h.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2_h2d_d2h.hcm new file mode 100644 index 000000000..7d845d03a --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2_h2d_d2h.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); H2D; D2H; streamsync;H2D; D2H; streamsync; endloop(1); +loop(10); H2D; D2H; streamsync;H2D; D2H; streamsync; endloop(1); +loop(100); H2D; D2H; streamsync;H2D; D2H; streamsync; endloop(1); +loop(100); H2D; D2H; streamsync;H2D; D2H; streamsync; endloop(1); +loop(1000); H2D; D2H; streamsync;H2D; D2H; streamsync; endloop(1); +loop(1000); H2D; D2H; streamsync;H2D; D2H; streamsync; endloop(1); +loop(10000); H2D; D2H; streamsync;H2D; D2H; streamsync; endloop(1); +loop(10000); H2D; D2H; streamsync;H2D; D2H; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2_h2d_kernel.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2_h2d_kernel.hcm new file mode 100644 index 000000000..49c0d7714 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2_h2d_kernel.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync; endloop(1); +loop(10); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync;endloop(1); +loop(100); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync;endloop(1); +loop(100); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync; endloop(1); +loop(1000); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync; endloop(1); +loop(1000); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync; endloop(1); +loop(10000); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync; endloop(1); +loop(10000); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2_h2d_sync_d2h.hcm 
b/samples/1_Utils/hipCommander/perf/scripts/latency_2_h2d_sync_d2h.hcm new file mode 100644 index 000000000..fe1f14bee --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2_h2d_sync_d2h.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; endloop(1); +loop(10); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; endloop(1); +loop(100); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; endloop(1); +loop(100); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; endloop(1); +loop(1000); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; endloop(1); +loop(1000); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; endloop(1); +loop(10000); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; endloop(1); +loop(10000); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2_h2d_sync_kernel.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2_h2d_sync_kernel.hcm new file mode 100644 index 000000000..0762001da --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2_h2d_sync_kernel.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1); +loop(10); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1); +loop(100); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1); +loop(100); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1); +loop(1000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1); +loop(1000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1); +loop(10000); H2D; streamsync; NullKernel; streamsync;H2D; 
streamsync; NullKernel; streamsync; endloop(1); +loop(10000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2_kernel_d2h.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2_kernel_d2h.hcm new file mode 100644 index 000000000..88003ba47 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2_kernel_d2h.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; endloop(1); +loop(10); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; endloop(1); +loop(100); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; endloop(1); +loop(100); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; endloop(1); +loop(1000); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; endloop(1); +loop(1000); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; endloop(1); +loop(10000); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; endloop(1); +loop(10000); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2_kernel_h2d.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2_kernel_h2d.hcm new file mode 100644 index 000000000..01913f848 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2_kernel_h2d.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; endloop(1); +loop(10); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; endloop(1); +loop(100); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; endloop(1); +loop(100); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; endloop(1); +loop(1000); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; endloop(1); +loop(1000); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; endloop(1); +loop(10000); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; 
endloop(1); +loop(10000); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2_kernel_sync_d2h.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2_kernel_sync_d2h.hcm new file mode 100644 index 000000000..530eb8f68 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2_kernel_sync_d2h.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(10); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(100); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(100); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2_kernel_sync_h2d.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2_kernel_sync_h2d.hcm new file mode 100644 index 000000000..6d83ee87c --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2_kernel_sync_h2d.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(10); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(100); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(100); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; 
streamsync; endloop(1); +loop(1000); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2_sync.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2_sync.hcm new file mode 100644 index 000000000..8b9e233a9 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2_sync.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); streamsync; streamsync; endloop(1); +loop(10); streamsync; streamsync; endloop(1); +loop(100); streamsync; streamsync; endloop(1); +loop(100); streamsync; streamsync; endloop(1); +loop(1000); streamsync; streamsync; endloop(1); +loop(1000); streamsync; streamsync; endloop(1); +loop(10000); streamsync; streamsync; endloop(1); +loop(10000); streamsync; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2d2h.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2d2h.hcm new file mode 100644 index 000000000..83cdc4ff7 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2d2h.hcm @@ -0,0 +1,10 @@ +setstream(1); +loop(10); D2H; streamsync; D2H; streamsync; endloop(1); +loop(10); D2H; streamsync; D2H; streamsync; endloop(1); +loop(100); D2H; streamsync; D2H; streamsync; endloop(1); +loop(100); D2H; streamsync; D2H; streamsync; endloop(1); +loop(1000); D2H;streamsync; D2H; streamsync; endloop(1); +loop(1000); D2H; streamsync; D2H; streamsync; endloop(1); +loop(1000); D2H; streamsync; D2H; streamsync; endloop(1); +loop(10000); D2H; streamsync; D2H; streamsync; endloop(1); +loop(10000); D2H; streamsync; D2H; streamsync; endloop(1); diff --git 
a/samples/1_Utils/hipCommander/perf/scripts/latency_2d2h_wosync.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2d2h_wosync.hcm new file mode 100644 index 000000000..4b9140358 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2d2h_wosync.hcm @@ -0,0 +1,10 @@ +setstream(1); +loop(10); D2H; D2H; streamsync; endloop(1); +loop(10); D2H; D2H; streamsync; endloop(1); +loop(100); D2H; D2H; streamsync; endloop(1); +loop(100); D2H; D2H; streamsync; endloop(1); +loop(1000); D2H; D2H; streamsync; endloop(1); +loop(1000); D2H; D2H; streamsync; endloop(1); +loop(1000); D2H; D2H; streamsync; endloop(1); +loop(10000); D2H; D2H; streamsync; endloop(1); +loop(10000); D2H; D2H; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2h2d.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2h2d.hcm new file mode 100644 index 000000000..a2e4311bf --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2h2d.hcm @@ -0,0 +1,10 @@ +setstream(1); +loop(10); H2D; streamsync; H2D; streamsync; endloop(1); +loop(10); H2D; streamsync; H2D; streamsync; endloop(1); +loop(100); H2D; streamsync; H2D; streamsync; endloop(1); +loop(100); H2D; streamsync; H2D; streamsync; endloop(1); +loop(1000); H2D;streamsync; H2D; streamsync; endloop(1); +loop(1000); H2D; streamsync; H2D; streamsync; endloop(1); +loop(1000); H2D; streamsync; H2D; streamsync; endloop(1); +loop(10000); H2D; streamsync; H2D; streamsync; endloop(1); +loop(10000); H2D; streamsync; H2D; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2h2d_kernel.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2h2d_kernel.hcm new file mode 100644 index 000000000..0c622614c --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2h2d_kernel.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel; streamsync;endloop(1); +loop(10); H2D; streamsync; NullKernel; 
streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1); +loop(100); H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel; streamsync;endloop(1); +loop(100); H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel; streamsync;endloop(1); +loop(1000); H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel; streamsync;endloop(1); +loop(1000); H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel; streamsync;endloop(1); +loop(10000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; endloop(1); +loop(10000); H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel; streamsync;endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2h2d_kernel_wosync.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2h2d_kernel_wosync.hcm new file mode 100644 index 000000000..d73467da1 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2h2d_kernel_wosync.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync; endloop(1); +loop(10); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync;endloop(1); +loop(100); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync;endloop(1); +loop(100); H2D; NullKernel; streamsync; H2D; NullKernel; streamsync;endloop(1); +loop(1000); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync; endloop(1); +loop(1000); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync; endloop(1); +loop(10000); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync; endloop(1); +loop(10000); H2D ; NullKernel; streamsync;H2D; NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2h2d_wosync.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2h2d_wosync.hcm new file mode 100644 index 000000000..35f5e6852 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2h2d_wosync.hcm @@ -0,0 +1,10 @@ +setstream(1); 
+loop(10); H2D; H2D; streamsync; endloop(1); +loop(10); H2D; H2D; streamsync; endloop(1); +loop(100); H2D; H2D; streamsync; endloop(1); +loop(100); H2D; H2D; streamsync; endloop(1); +loop(1000); H2D; H2D; streamsync; endloop(1); +loop(1000); H2D; H2D; streamsync; endloop(1); +loop(1000); H2D; H2D; streamsync; endloop(1); +loop(10000); H2D; H2D; streamsync; endloop(1); +loop(10000); H2D; H2D; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2kernels.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2kernels.hcm new file mode 100644 index 000000000..3b85c6bef --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2kernels.hcm @@ -0,0 +1,10 @@ +setstream(1); +loop(10); NullKernel; streamsync; NullKernel; streamsync; endloop(1); +loop(10); NullKernel; streamsync; NullKernel; streamsync; endloop(1); +loop(100); NullKernel; streamsync; NullKernel; streamsync; endloop(1); +loop(100); NullKernel; streamsync; NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; NullKernel; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; NullKernel; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_2kernels_wosync.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_2kernels_wosync.hcm new file mode 100644 index 000000000..584d6b802 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_2kernels_wosync.hcm @@ -0,0 +1,10 @@ +setstream(1); +loop(10); NullKernel; NullKernel; streamsync; endloop(1); +loop(10); NullKernel; NullKernel; streamsync; endloop(1); +loop(100); NullKernel; NullKernel; streamsync; endloop(1); +loop(100); NullKernel; NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; NullKernel; streamsync; endloop(1); 
+loop(1000); NullKernel; NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; NullKernel; streamsync; endloop(1); +loop(10000); NullKernel; NullKernel; streamsync; endloop(1); +loop(10000); NullKernel; NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3_d2h_h2d.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3_d2h_h2d.hcm new file mode 100644 index 000000000..7f0fce96c --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3_d2h_h2d.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); D2H; H2D; streamsync;D2H; H2D; streamsync; D2H; H2D; streamsync; endloop(1); +loop(10); D2H; H2D; streamsync;D2H; H2D; streamsync; D2H; H2D; streamsync; endloop(1); +loop(100); D2H; H2D; streamsync;D2H; H2D; streamsync; D2H; H2D; streamsync; endloop(1); +loop(100); D2H; H2D; streamsync;D2H; H2D; streamsync; D2H; H2D; streamsync; endloop(1); +loop(1000); D2H; H2D; streamsync;D2H; H2D; streamsync; D2H; H2D; streamsync; endloop(1); +loop(1000); D2H; H2D; streamsync;D2H; H2D; streamsync; D2H; H2D; streamsync; endloop(1); +loop(10000); D2H; H2D; streamsync;D2H; H2D; streamsync; D2H; H2D; streamsync; endloop(1); +loop(10000); D2H; H2D; streamsync;D2H; H2D; streamsync; D2H; H2D; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3_d2h_kernel.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3_d2h_kernel.hcm new file mode 100644 index 000000000..a384439b5 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3_d2h_kernel.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync;streamsync; D2H; NullKernel; streamsync; endloop(1); +loop(10); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync;streamsync; D2H; NullKernel; streamsync; endloop(1); +loop(100); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync;streamsync; D2H; NullKernel; streamsync; endloop(1); +loop(100); D2H; NullKernel; streamsync;D2H; 
NullKernel; streamsync;streamsync; D2H; NullKernel; streamsync; endloop(1); +loop(1000); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync;streamsync; D2H; NullKernel; streamsync; endloop(1); +loop(1000); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync;streamsync; D2H; NullKernel; streamsync; endloop(1); +loop(10000); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync;streamsync; D2H; NullKernel; streamsync; endloop(1); +loop(10000); D2H; NullKernel; streamsync;D2H; NullKernel; streamsync;streamsync; D2H; NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3_d2h_sync_h2d.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3_d2h_sync_h2d.hcm new file mode 100644 index 000000000..1cab6ff0d --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3_d2h_sync_h2d.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; D2H; streamsync; H2D;streamsync; endloop(1); +loop(10); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; D2H; streamsync; H2D;streamsync; endloop(1); +loop(100); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; D2H; streamsync; H2D;streamsync; endloop(1); +loop(100); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; D2H; streamsync; H2D;streamsync; endloop(1); +loop(1000); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; D2H; streamsync; H2D;streamsync; endloop(1); +loop(1000); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; D2H; streamsync; H2D;streamsync; endloop(1); +loop(10000); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; D2H; streamsync; H2D;streamsync; endloop(1); +loop(10000); D2H; streamsync; H2D; streamsync;D2H; streamsync; H2D; streamsync; D2H; streamsync; H2D;streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3_d2h_sync_kernel.hcm 
b/samples/1_Utils/hipCommander/perf/scripts/latency_3_d2h_sync_kernel.hcm new file mode 100644 index 000000000..ff5b09a3d --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3_d2h_sync_kernel.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; D2H; streamsync; NullKernel;streamsync; endloop(1); +loop(10); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; D2H; streamsync; NullKernel;streamsync; endloop(1); +loop(100); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; D2H; streamsync; NullKernel;streamsync; endloop(1); +loop(100); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; D2H; streamsync; NullKernel;streamsync; endloop(1); +loop(1000); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; D2H; streamsync; NullKernel;streamsync; endloop(1); +loop(1000); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; D2H; streamsync; NullKernel;streamsync; endloop(1); +loop(10000); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; D2H; streamsync; NullKernel;streamsync; endloop(1); +loop(10000); D2H; streamsync; NullKernel; streamsync;D2H; streamsync; NullKernel; streamsync; D2H; streamsync; NullKernel;streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3_h2d_d2h.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3_h2d_d2h.hcm new file mode 100644 index 000000000..d8921a64e --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3_h2d_d2h.hcm @@ -0,0 +1,10 @@ +setstream(1); +loop(10); H2D; D2H; streamsync;H2D; D2H; streamsync; H2D; D2H; streamsync; endloop(1); +loop(10); H2D; D2H; streamsync;H2D; D2H; streamsync; H2D; D2H; streamsync; endloop(1); +loop(100); H2D; D2H; streamsync;H2D; D2H; streamsync; H2D; D2H; streamsync; endloop(1); +loop(100); H2D; 
D2H; streamsync;H2D; D2H; streamsync; H2D; D2H; streamsync; endloop(1); +loop(1000); H2D; D2H; streamsync;H2D; D2H; streamsync; H2D; D2H; streamsync; endloop(1); +loop(1000); H2D; D2H; streamsync;H2D; D2H; streamsync; H2D; D2H; streamsync; endloop(1); +loop(10000); H2D; D2H; streamsync;H2D; D2H; streamsync; H2D; D2H; streamsync; endloop(1); +loop(10000); H2D; D2H; streamsync;H2D; D2H; streamsync; H2D; D2H; streamsync; endloop(1); + diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3_h2d_kernel.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3_h2d_kernel.hcm new file mode 100644 index 000000000..4ccbf9a83 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3_h2d_kernel.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync;streamsync; H2D; NullKernel; streamsync; endloop(1); +loop(10); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync;streamsync; H2D; NullKernel; streamsync; endloop(1); +loop(100); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync;streamsync; H2D; NullKernel; streamsync; endloop(1); +loop(100); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync;streamsync; H2D; NullKernel; streamsync; endloop(1); +loop(1000); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync;streamsync; H2D; NullKernel; streamsync; endloop(1); +loop(1000); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync;streamsync; H2D; NullKernel; streamsync; endloop(1); +loop(10000); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync;streamsync; H2D; NullKernel; streamsync; endloop(1); +loop(10000); H2D; NullKernel; streamsync;H2D; NullKernel; streamsync;streamsync; H2D; NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3_h2d_sync_d2h.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3_h2d_sync_d2h.hcm new file mode 100644 index 000000000..a3d9a282f --- /dev/null +++ 
b/samples/1_Utils/hipCommander/perf/scripts/latency_3_h2d_sync_d2h.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; H2D; streamsync; D2H;streamsync; endloop(1); +loop(10); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; H2D; streamsync; D2H;streamsync; endloop(1); +loop(100); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; H2D; streamsync; D2H;streamsync; endloop(1); +loop(100); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; H2D; streamsync; D2H;streamsync; endloop(1); +loop(1000); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; H2D; streamsync; D2H;streamsync; endloop(1); +loop(1000); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; H2D; streamsync; D2H;streamsync; endloop(1); +loop(10000); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; H2D; streamsync; D2H;streamsync; endloop(1); +loop(10000); H2D; streamsync; D2H; streamsync;H2D; streamsync; D2H; streamsync; H2D; streamsync; D2H;streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3_h2d_sync_kernel.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3_h2d_sync_kernel.hcm new file mode 100644 index 000000000..56554d15f --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3_h2d_sync_kernel.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel;streamsync; endloop(1); +loop(10); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel;streamsync; endloop(1); +loop(100); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel;streamsync; endloop(1); +loop(100); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel;streamsync; 
endloop(1); +loop(1000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel;streamsync; endloop(1); +loop(1000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel;streamsync; endloop(1); +loop(10000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel;streamsync; endloop(1); +loop(10000); H2D; streamsync; NullKernel; streamsync;H2D; streamsync; NullKernel; streamsync; H2D; streamsync; NullKernel;streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3_kernel_d2h.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3_kernel_d2h.hcm new file mode 100644 index 000000000..a6e3a683d --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3_kernel_d2h.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; NullKernel; D2H; streamsync;endloop(1); +loop(10); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; NullKernel; D2H; streamsync;endloop(1); +loop(100); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; NullKernel; D2H; streamsync;endloop(1); +loop(100); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; NullKernel; D2H; streamsync;endloop(1); +loop(1000); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; NullKernel; D2H; streamsync;endloop(1); +loop(1000); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; NullKernel; D2H; streamsync;endloop(1); +loop(10000); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; NullKernel; D2H; streamsync;endloop(1); +loop(10000); NullKernel; D2H; streamsync;NullKernel; D2H; streamsync; NullKernel; D2H; streamsync;endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3_kernel_h2d.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3_kernel_h2d.hcm new file mode 100644 index 000000000..eae3eadc5 --- /dev/null 
+++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3_kernel_h2d.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; NullKernel; H2D; streamsync;endloop(1); +loop(10); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; NullKernel; H2D; streamsync;endloop(1); +loop(100); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; NullKernel; H2D; streamsync;endloop(1); +loop(100); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; NullKernel; H2D; streamsync;endloop(1); +loop(1000); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; NullKernel; H2D; streamsync;endloop(1); +loop(1000); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; NullKernel; H2D; streamsync;endloop(1); +loop(10000); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; NullKernel; H2D; streamsync;endloop(1); +loop(10000); NullKernel; H2D; streamsync;NullKernel; H2D; streamsync; NullKernel; H2D; streamsync;endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3_kernel_sync_d2h.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3_kernel_sync_d2h.hcm new file mode 100644 index 000000000..9e21709b0 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3_kernel_sync_d2h.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(10); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(100); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(100); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync;NullKernel; 
streamsync; D2H; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync;NullKernel; streamsync; D2H; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3_kernel_sync_h2d.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3_kernel_sync_h2d.hcm new file mode 100644 index 000000000..b1ef7ef9f --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3_kernel_sync_h2d.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(10); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(100); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(100); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync;NullKernel; streamsync; H2D; streamsync; endloop(1); diff --git 
a/samples/1_Utils/hipCommander/perf/scripts/latency_3_sync.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3_sync.hcm new file mode 100644 index 000000000..bc8d21c59 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3_sync.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10);streamsync; streamsync; streamsync; endloop(1); +loop(10);streamsync; streamsync; streamsync; endloop(1); +loop(100);streamsync; streamsync; streamsync; endloop(1); +loop(100);streamsync; streamsync; streamsync; endloop(1); +loop(1000);streamsync; streamsync; streamsync; endloop(1); +loop(1000);streamsync; streamsync; streamsync; endloop(1); +loop(10000);streamsync; streamsync; streamsync; endloop(1); +loop(10000);streamsync; streamsync; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3d2h.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3d2h.hcm new file mode 100644 index 000000000..4e07574b9 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3d2h.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); D2H; streamsync; D2H; streamsync; D2H; streamsync; endloop(1); +loop(10); D2H; streamsync; D2H; streamsync; D2H; streamsync; endloop(1); +loop(100); D2H; streamsync; D2H; streamsync; D2H; streamsync; endloop(1); +loop(100); D2H; streamsync; D2H; streamsync; D2H; streamsync; endloop(1); +loop(1000); D2H;streamsync; D2H; streamsync; D2H; streamsync; endloop(1); +loop(1000); D2H; streamsync; D2H; streamsync; D2H; streamsync; endloop(1); +loop(10000); D2H; streamsync; D2H; streamsync; D2H; streamsync; endloop(1); +loop(10000); D2H; streamsync; D2H; streamsync; D2H; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3d2h_wosync.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3d2h_wosync.hcm new file mode 100644 index 000000000..e96707fed --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3d2h_wosync.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); D2H; D2H; D2H; 
streamsync; endloop(1); +loop(10); D2H; D2H; D2H; streamsync; endloop(1); +loop(100); D2H; D2H; D2H; streamsync; endloop(1); +loop(100); D2H; D2H; D2H; streamsync; endloop(1); +loop(1000); D2H; D2H; D2H; streamsync; endloop(1); +loop(1000); D2H; D2H; D2H; streamsync; endloop(1); +loop(10000); D2H; D2H; D2H; streamsync; endloop(1); +loop(10000); D2H; D2H; D2H;streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3h2d.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3h2d.hcm new file mode 100644 index 000000000..82151adb8 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3h2d.hcm @@ -0,0 +1,10 @@ +setstream(1); +loop(10); H2D; streamsync;H2D;streamsync; H2D; streamsync; endloop(1); +loop(10); H2D; streamsync;H2D;streamsync; H2D; streamsync; endloop(1); +loop(100); H2D; streamsync;H2D; streamsync;H2D; streamsync; endloop(1); +loop(100); H2D;streamsync; H2D; streamsync;H2D; streamsync; endloop(1); +loop(1000); H2D;streamsync; H2D;streamsync; H2D; streamsync; endloop(1); +loop(1000); H2D;streamsync; H2D; streamsync;H2D; streamsync; endloop(1); +loop(1000); H2D;streamsync; H2D; streamsync;H2D; streamsync; endloop(1); +loop(10000); H2D;streamsync; H2D; streamsync;H2D; streamsync; endloop(1); +loop(10000); H2D;streamsync; H2D;streamsync; H2D; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3h2d_wosync.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3h2d_wosync.hcm new file mode 100644 index 000000000..7d96bfcfa --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3h2d_wosync.hcm @@ -0,0 +1,10 @@ +setstream(1); +loop(10); H2D; H2D; H2D; streamsync; endloop(1); +loop(10); H2D; H2D; H2D; streamsync; endloop(1); +loop(100); H2D; H2D; H2D; streamsync; endloop(1); +loop(100); H2D; H2D; H2D; streamsync; endloop(1); +loop(1000); H2D; H2D; H2D; streamsync; endloop(1); +loop(1000); H2D; H2D; H2D; streamsync; endloop(1); +loop(1000); H2D; H2D; H2D; streamsync; 
endloop(1); +loop(10000); H2D; H2D; H2D; streamsync; endloop(1); +loop(10000); H2D; H2D; H2D; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3kernels.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3kernels.hcm new file mode 100644 index 000000000..2e8306dfd --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3kernels.hcm @@ -0,0 +1,10 @@ +setstream(1); +loop(10); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1); +loop(10); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1); +loop(100); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1); +loop(100); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; NullKernel; streamsync; NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_3kernels_wosync.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_3kernels_wosync.hcm new file mode 100644 index 000000000..85cd0dd4d --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_3kernels_wosync.hcm @@ -0,0 +1,10 @@ +setstream(1); +loop(10); NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(10); NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(100); NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(100); NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; 
NullKernel; NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(10000); NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(10000); NullKernel; NullKernel; NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_4kernels.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_4kernels.hcm new file mode 100644 index 000000000..48a822362 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_4kernels.hcm @@ -0,0 +1,10 @@ +setstream(1); +loop(10); NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(10); NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(100); NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1); +loop(100); NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(10000); NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(10000); NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_5kernels.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_5kernels.hcm new file mode 100644 index 000000000..70ad00c24 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_5kernels.hcm @@ -0,0 +1,10 @@ +setstream(1); +loop(10); NullKernel; NullKernel; NullKernel; NullKernel;NullKernel; streamsync; endloop(1); +loop(10); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(100); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1); +loop(100); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; 
streamsync; endloop(1); +loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(10000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1); +loop(10000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_6kernels.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_6kernels.hcm new file mode 100644 index 000000000..1bbb5694b --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_6kernels.hcm @@ -0,0 +1,10 @@ +setstream(1); +loop(10); NullKernel; NullKernel; NullKernel; NullKernel;NullKernel;NullKernel; streamsync; endloop(1); +loop(10); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1); +loop(100); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;NullKernel;streamsync; endloop(1); +loop(100); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1); +loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1); +loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1); +loop(1000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1); +loop(10000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1); +loop(10000); NullKernel; NullKernel; NullKernel; NullKernel; NullKernel; NullKernel;streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_d2h.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_d2h.hcm new file mode 100644 index 000000000..54f06a348 --- /dev/null +++ 
b/samples/1_Utils/hipCommander/perf/scripts/latency_d2h.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); D2H; streamsync; endloop(1); +loop(10); D2H; streamsync; endloop(1); +loop(100); D2H; streamsync; endloop(1); +loop(100); D2H; streamsync; endloop(1); +loop(1000); D2H; streamsync; endloop(1); +loop(1000); D2H; streamsync; endloop(1); +loop(10000); D2H; streamsync; endloop(1); +loop(10000); D2H; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_d2h_h2d.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_d2h_h2d.hcm new file mode 100644 index 000000000..6667ba95f --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_d2h_h2d.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); D2H; H2D; streamsync; endloop(1); +loop(10); D2H; H2D; streamsync; endloop(1); +loop(100); D2H; H2D; streamsync; endloop(1); +loop(100); D2H; H2D; streamsync; endloop(1); +loop(1000); D2H; H2D; streamsync; endloop(1); +loop(1000); D2H; H2D; streamsync; endloop(1); +loop(10000); D2H; H2D; streamsync; endloop(1); +loop(10000); D2H; H2D; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_d2h_kernel.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_d2h_kernel.hcm new file mode 100644 index 000000000..fe770c5e9 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_d2h_kernel.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); D2H; NullKernel; streamsync; endloop(1); +loop(10); D2H; NullKernel; streamsync; endloop(1); +loop(100); D2H; NullKernel; streamsync; endloop(1); +loop(100); D2H; NullKernel; streamsync; endloop(1); +loop(1000); D2H; NullKernel; streamsync; endloop(1); +loop(1000); D2H; NullKernel; streamsync; endloop(1); +loop(10000); D2H; NullKernel; streamsync; endloop(1); +loop(10000); D2H; NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_d2h_sync_h2d.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_d2h_sync_h2d.hcm new file mode 
100644 index 000000000..20ec95150 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_d2h_sync_h2d.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); D2H; streamsync; H2D; streamsync; endloop(1); +loop(10); D2H; streamsync; H2D; streamsync; endloop(1); +loop(100); D2H; streamsync; H2D; streamsync; endloop(1); +loop(100); D2H; streamsync; H2D; streamsync; endloop(1); +loop(1000); D2H; streamsync; H2D; streamsync; endloop(1); +loop(1000); D2H; streamsync; H2D; streamsync; endloop(1); +loop(10000); D2H; streamsync; H2D; streamsync; endloop(1); +loop(10000); D2H; streamsync; H2D; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_d2h_sync_kernel.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_d2h_sync_kernel.hcm new file mode 100644 index 000000000..77e483b3d --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_d2h_sync_kernel.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); D2H; streamsync;NullKernel; streamsync; endloop(1); +loop(10); D2H; streamsync;NullKernel; streamsync; endloop(1); +loop(100); D2H; streamsync;NullKernel; streamsync; endloop(1); +loop(100); D2H; streamsync;NullKernel; streamsync; endloop(1); +loop(1000); D2H; streamsync;NullKernel; streamsync; endloop(1); +loop(1000); D2H; streamsync;NullKernel; streamsync; endloop(1); +loop(10000); D2H; streamsync;NullKernel; streamsync; endloop(1); +loop(10000); D2H; streamsync;NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_h2d.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d.hcm new file mode 100644 index 000000000..f5642bfdf --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d.hcm @@ -0,0 +1,10 @@ +setstream(1); +loop(10); H2D; streamsync; endloop(1); +loop(10); H2D; streamsync; endloop(1); +loop(100); H2D; streamsync; endloop(1); +loop(100); H2D; streamsync; endloop(1); +loop(1000); H2D; streamsync; endloop(1); +loop(1000); H2D; streamsync; endloop(1); 
+loop(1000); H2D; streamsync; endloop(1); +loop(10000); H2D; streamsync; endloop(1); +loop(10000); H2D; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_10.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_10.hcm new file mode 100644 index 000000000..05452b9c8 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_10.hcm @@ -0,0 +1,2 @@ +setstream(1); +loop(10); H2D; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_d2h.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_d2h.hcm new file mode 100644 index 000000000..dad9fc743 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_d2h.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); H2D; D2H; streamsync; endloop(1); +loop(10); H2D; D2H; streamsync; endloop(1); +loop(100); H2D; D2H; streamsync; endloop(1); +loop(100); H2D; D2H; streamsync; endloop(1); +loop(1000); H2D; D2H; streamsync; endloop(1); +loop(1000); H2D; D2H; streamsync; endloop(1); +loop(10000); H2D; D2H; streamsync; endloop(1); +loop(10000); H2D; D2H; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_kernel.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_kernel.hcm new file mode 100644 index 000000000..1b60640b9 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_kernel.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); H2D; NullKernel; streamsync; endloop(1); +loop(10); H2D; NullKernel; streamsync; endloop(1); +loop(100); H2D; NullKernel; streamsync; endloop(1); +loop(100); H2D; NullKernel; streamsync; endloop(1); +loop(1000); H2D; NullKernel; streamsync; endloop(1); +loop(1000); H2D; NullKernel; streamsync; endloop(1); +loop(10000); H2D; NullKernel; streamsync; endloop(1); +loop(10000); H2D; NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_kernel_d2h.hcm 
b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_kernel_d2h.hcm new file mode 100644 index 000000000..6e4e9f354 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_kernel_d2h.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10);H2D; streamsync; NullKernel; D2H; streamsync;endloop(1); +loop(10); H2D; streamsync; NullKernel; D2H; streamsync; endloop(1); +loop(100); H2D; streamsync; NullKernel; D2H; streamsync; endloop(1); +loop(100); H2D; streamsync; NullKernel; D2H; streamsync; endloop(1); +loop(1000); H2D; streamsync; NullKernel; D2H; streamsync; endloop(1); +loop(1000); H2D; streamsync; NullKernel; D2H; streamsync; endloop(1); +loop(10000); H2D; streamsync; NullKernel; D2H; streamsync; endloop(1); +loop(10000); H2D; streamsync; NullKernel; D2H; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_kernel_d2h_wosync.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_kernel_d2h_wosync.hcm new file mode 100644 index 000000000..4e94a26eb --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_kernel_d2h_wosync.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10);H2D; NullKernel; D2H; streamsync;endloop(1); +loop(10); H2D; NullKernel; D2H; streamsync; endloop(1); +loop(100); H2D; NullKernel; D2H; streamsync; endloop(1); +loop(100); H2D; NullKernel; D2H; streamsync; endloop(1); +loop(1000); H2D; NullKernel; D2H; streamsync; endloop(1); +loop(1000); H2D; NullKernel; D2H; streamsync; endloop(1); +loop(10000); H2D; NullKernel; D2H; streamsync; endloop(1); +loop(10000); H2D; NullKernel; D2H; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_kernel_wosync.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_kernel_wosync.hcm new file mode 100644 index 000000000..b3b40d319 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_kernel_wosync.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); H2D; NullKernel; streamsync; endloop(1); 
+loop(10); H2D; NullKernel; streamsync; endloop(1); +loop(100); H2D; NullKernel; streamsync; endloop(1); +loop(100); H2D; NullKernel; streamsync; endloop(1); +loop(1000); H2D; NullKernel; streamsync; endloop(1); +loop(1000); H2D; NullKernel; streamsync; endloop(1); +loop(10000); H2D; NullKernel; streamsync; endloop(1); +loop(10000); H2D ; NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_sync_d2h.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_sync_d2h.hcm new file mode 100644 index 000000000..030213d1b --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_sync_d2h.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); H2D; streamsync; D2H; streamsync; endloop(1); +loop(10); H2D; streamsync; D2H; streamsync; endloop(1); +loop(100); H2D; streamsync; D2H; streamsync; endloop(1); +loop(100); H2D; streamsync; D2H; streamsync; endloop(1); +loop(1000); H2D; streamsync; D2H; streamsync; endloop(1); +loop(1000); H2D; streamsync; D2H; streamsync; endloop(1); +loop(10000); H2D; streamsync; D2H; streamsync; endloop(1); +loop(10000); H2D; streamsync; D2H; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_sync_kernel_sync.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_sync_kernel_sync.hcm new file mode 100644 index 000000000..146c74bca --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_sync_kernel_sync.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); H2D; streamsync; NullKernel; streamsync; endloop(1); +loop(10); H2D; streamsync; NullKernel; streamsync; endloop(1); +loop(100); H2D; streamsync; NullKernel; streamsync; endloop(1); +loop(100); H2D; streamsync; NullKernel; streamsync; endloop(1); +loop(1000); H2D; streamsync; NullKernel; streamsync; endloop(1); +loop(1000); H2D; streamsync; NullKernel; streamsync; endloop(1); +loop(10000); H2D; streamsync; NullKernel; streamsync; endloop(1); +loop(10000); H2D; streamsync; NullKernel; 
streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_sync_kernel_sync_d2h.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_sync_kernel_sync_d2h.hcm new file mode 100644 index 000000000..366d04f46 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_h2d_sync_kernel_sync_d2h.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10);H2D; streamsync; NullKernel;streamsync; D2H; streamsync;endloop(1); +loop(10); H2D; streamsync; NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(100); H2D; streamsync; NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(100); H2D; streamsync; NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(1000); H2D; streamsync; NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(1000); H2D; streamsync; NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(10000); H2D; streamsync; NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(10000); H2D; streamsync; NullKernel; streamsync; D2H; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_kernel.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_kernel.hcm new file mode 100644 index 000000000..027d89aad --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_kernel.hcm @@ -0,0 +1,10 @@ +setstream(1); +loop(10); NullKernel; streamsync; endloop(1); +loop(10); NullKernel; streamsync; endloop(1); +loop(100); NullKernel; streamsync; endloop(1); +loop(100); NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_kernel_barrier.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_kernel_barrier.hcm new file mode 100644 index 000000000..fb6a867e7 --- /dev/null +++ 
b/samples/1_Utils/hipCommander/perf/scripts/latency_kernel_barrier.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); NullKernel; streamsync; streamsync; endloop(1); +loop(10); NullKernel; streamsync; streamsync; endloop(1); +loop(100); NullKernel; streamsync; streamsync; endloop(1); +loop(100); NullKernel; streamsync; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_kernel_d2h.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_kernel_d2h.hcm new file mode 100644 index 000000000..2e64472db --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_kernel_d2h.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); NullKernel; D2H; streamsync; endloop(1); +loop(10); NullKernel; D2H; streamsync; endloop(1); +loop(100); NullKernel; D2H; streamsync; endloop(1); +loop(100); NullKernel; D2H; streamsync; endloop(1); +loop(1000); NullKernel; D2H; streamsync; endloop(1); +loop(1000); NullKernel; D2H; streamsync; endloop(1); +loop(10000); NullKernel; D2H; streamsync; endloop(1); +loop(10000); NullKernel; D2H; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_kernel_h2d.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_kernel_h2d.hcm new file mode 100644 index 000000000..b220a69c6 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_kernel_h2d.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); NullKernel; H2D; streamsync; endloop(1); +loop(10); NullKernel; H2D; streamsync; endloop(1); +loop(100); NullKernel; H2D; streamsync; endloop(1); +loop(100); NullKernel; H2D; streamsync; endloop(1); +loop(1000); NullKernel; H2D; streamsync; endloop(1); +loop(1000); NullKernel; H2D; streamsync; endloop(1); +loop(10000); NullKernel; H2D; streamsync; 
endloop(1); +loop(10000); NullKernel; H2D; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_kernel_sync_d2h.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_kernel_sync_d2h.hcm new file mode 100644 index 000000000..48b332b1c --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_kernel_sync_d2h.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(10); NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(100); NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(100); NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; D2H; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; D2H; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_kernel_sync_h2d.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_kernel_sync_h2d.hcm new file mode 100644 index 000000000..5a45d5537 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_kernel_sync_h2d.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(10); NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(100); NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(100); NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(1000); NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; H2D; streamsync; endloop(1); +loop(10000); NullKernel; streamsync; H2D; streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_streamcreate.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_streamcreate.hcm new file mode 100644 index 000000000..1e6aef5dc --- /dev/null +++ 
b/samples/1_Utils/hipCommander/perf/scripts/latency_streamcreate.hcm @@ -0,0 +1,2 @@ +setstream(1); +loop(10);setstream(1);setstream(2);setstream(3);setstream(4);setstream(5);streamsync; endloop(1); diff --git a/samples/1_Utils/hipCommander/perf/scripts/latency_sync.hcm b/samples/1_Utils/hipCommander/perf/scripts/latency_sync.hcm new file mode 100644 index 000000000..a784013b1 --- /dev/null +++ b/samples/1_Utils/hipCommander/perf/scripts/latency_sync.hcm @@ -0,0 +1,9 @@ +setstream(1); +loop(10); streamsync; endloop(1); +loop(10); streamsync; endloop(1); +loop(100); streamsync; endloop(1); +loop(100); streamsync; endloop(1); +loop(1000); streamsync; endloop(1); +loop(1000); streamsync; endloop(1); +loop(10000); streamsync; endloop(1); +loop(10000); streamsync; endloop(1); From b5e3f593773648236a3c61ea6540b075cbf1ab67 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 17 Feb 2017 08:51:02 -0600 Subject: [PATCH 098/700] removed hipblas samples as it is not yet supported Change-Id: I354b710e652ce0d0413d670530ceb8b70f4993d5 --- samples/7_Advanced/hipblas_saxpy/Makefile | 34 ------- .../7_Advanced/hipblas_saxpy/saxpy.cublas.cpp | 94 ------------------- .../hipblas_saxpy/saxpy.hipblasref.cpp | 94 ------------------- 3 files changed, 222 deletions(-) delete mode 100644 samples/7_Advanced/hipblas_saxpy/Makefile delete mode 100644 samples/7_Advanced/hipblas_saxpy/saxpy.cublas.cpp delete mode 100644 samples/7_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp diff --git a/samples/7_Advanced/hipblas_saxpy/Makefile b/samples/7_Advanced/hipblas_saxpy/Makefile deleted file mode 100644 index 8586e75d2..000000000 --- a/samples/7_Advanced/hipblas_saxpy/Makefile +++ /dev/null @@ -1,34 +0,0 @@ -HIP_PATH?= $(wildcard /opt/rocm/hip) -ifeq (,$(HIP_PATH)) - HIP_PATH=../../.. 
-endif -HIPCC=$(HIP_PATH)/bin/hipcc - -HIPCC_FLAGS += -std=c++11 -HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) -ifeq (${HIP_PLATFORM}, nvcc) - LIBS = -lcublas -endif -ifeq (${HIP_PLATFORM}, hcc) - HCBLAS_ROOT?= $(wildcard /opt/rocm/hcblas) - HIPCC_FLAGS += -stdlib=libc++ -I$(HCBLAS_ROOT)/include - LIBS = -L$(HCBLAS_ROOT)/lib -lhipblas -rpath $(HIP_PATH)/lib -endif - - -all: saxpy.hipblas.out - -saxpy.cublas.out : saxpy.cublas.cpp - nvcc -std=c++11 -I$(CUDA_HOME)/include saxpy.cublas.cpp -o $@ -L$(CUDA_HOME)/lib64 -lcublas - -# $HIPBLAS_ROOT/bin/hipifyblas ./saxpy.cublas.cpp > ./saxpy.hipblas.cpp -# Then review & finish port in saxpy.hipblas.cpp - -saxpy.hipblasref.o: saxpy.hipblasref.cpp - $(HIPCC) $(HIPCC_FLAGS) -c $< -o $@ - -saxpy.hipblas.out: saxpy.hipblasref.o - $(HIPCC) $< -o $@ $(LIBS) - -clean: - rm -f *.o *.out diff --git a/samples/7_Advanced/hipblas_saxpy/saxpy.cublas.cpp b/samples/7_Advanced/hipblas_saxpy/saxpy.cublas.cpp deleted file mode 100644 index 03a38f3fb..000000000 --- a/samples/7_Advanced/hipblas_saxpy/saxpy.cublas.cpp +++ /dev/null @@ -1,94 +0,0 @@ - -#include -#include -#include -#include - -// header file for the GPU API -#include -#include - -#define N (1024 * 500) - -#define CHECK(cmd) \ -{\ - cudaError_t error = cmd; \ - if (error != cudaSuccess) { \ - fprintf(stderr, "error: '%s'(%d) at %s:%d\n", cudaGetErrorString(error), error,__FILE__, __LINE__); \ - exit(EXIT_FAILURE);\ - }\ -} - -#define CHECK_BLAS(cmd) \ -{\ - cublasStatus_t error = cmd;\ - if (error != CUBLAS_STATUS_SUCCESS) { \ - fprintf(stderr, "error: (%d) at %s:%d\n", error,__FILE__, __LINE__); \ - exit(EXIT_FAILURE);\ - }\ -} - -int main() { - - const float a = 100.0f; - float x[N]; - float y[N], y_cpu_res[N], y_gpu_res[N]; - - // initialize the input data - std::default_random_engine random_gen; - std::uniform_real_distribution distribution(-N, N); - std::generate_n(x, N, [&]() { return distribution(random_gen); }); - std::generate_n(y, N, [&]() { return 
distribution(random_gen); }); - std::copy_n(y, N, y_cpu_res); - - // Explicit GPU code: - - size_t Nbytes = N*sizeof(float); - float *x_gpu, *y_gpu; - - cublasHandle_t handle; - - cudaDeviceProp props; - CHECK(cudaGetDeviceProperties(&props, 0/*deviceID*/)); - printf ("info: running on device %s\n", props.name); - - printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); - printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); - CHECK(cudaMalloc(&x_gpu, Nbytes)); - CHECK(cudaMalloc(&y_gpu, Nbytes)); - - // Initialize the blas library - CHECK_BLAS ( cublasCreate(&handle)); - - // copy n elements from a vector in host memory space to a vector in GPU memory space - printf ("info: copy Host2Device\n"); - CHECK_BLAS ( cublasSetVector(N, sizeof(*x), x, 1, x_gpu, 1)); - CHECK_BLAS ( cublasSetVector(N, sizeof(*y), y, 1, y_gpu, 1)); - - printf ("info: launch 'saxpy' kernel\n"); - CHECK_BLAS ( cublasSaxpy(handle, N, &a, x_gpu, 1, y_gpu, 1)); - - cudaDeviceSynchronize(); - - printf ("info: copy Device2Host\n"); - CHECK_BLAS ( cublasGetVector(N, sizeof(*y_gpu_res), y_gpu, 1, y_gpu_res, 1)); - - // CPU implementation of saxpy - for (int i = 0; i < N; i++) { - y_cpu_res[i] = a * x[i] + y[i]; - } - - // verify the results - int errors = 0; - for (int i = 0; i < N; i++) { - if (fabs(y_cpu_res[i] - y_gpu_res[i]) > fabs(y_cpu_res[i] * 0.0001f)) - errors++; - } - std::cout << errors << " errors" << std::endl; - - CHECK( cudaFree(x_gpu)); - CHECK( cudaFree(y_gpu)); - CHECK_BLAS( cublasDestroy(handle)); - - return errors; -} diff --git a/samples/7_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp b/samples/7_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp deleted file mode 100644 index 4610a612d..000000000 --- a/samples/7_Advanced/hipblas_saxpy/saxpy.hipblasref.cpp +++ /dev/null @@ -1,94 +0,0 @@ - -#include -#include -#include -#include - -// header file for the GPU API -#include "hip/hip_runtime.h" -#include - -#define N (1024 * 500) - -#define CHECK(cmd) \ 
-{\ - hipError_t error = cmd; \ - if (error != hipSuccess) { \ - fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \ - exit(EXIT_FAILURE);\ - }\ -} - -#define CHECK_BLAS(cmd) \ -{\ - hipblasStatus_t error = cmd;\ - if (error != HIPBLAS_STATUS_SUCCESS) { \ - fprintf(stderr, "error: (%d) at %s:%d\n", error,__FILE__, __LINE__); \ - exit(EXIT_FAILURE);\ - }\ -} - -int main() { - - const float a = 100.0f; - float x[N]; - float y[N], y_cpu_res[N], y_gpu_res[N]; - - // initialize the input data - std::default_random_engine random_gen; - std::uniform_real_distribution distribution(-N, N); - std::generate_n(x, N, [&]() { return distribution(random_gen); }); - std::generate_n(y, N, [&]() { return distribution(random_gen); }); - std::copy_n(y, N, y_cpu_res); - - // Explicit GPU code: - - size_t Nbytes = N*sizeof(float); - float *x_gpu, *y_gpu; - - hipblasHandle_t handle; - - hipDeviceProp_t props; - CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/)); - printf ("info: running on device %s\n", props.name); - - printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); - printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); - CHECK(hipMalloc(&x_gpu, Nbytes)); - CHECK(hipMalloc(&y_gpu, Nbytes)); - - // Initialize the blas library - CHECK_BLAS ( hipblasCreate(&handle)); - - // copy n elements from a vector in host memory space to a vector in GPU memory space - printf ("info: copy Host2Device\n"); - CHECK_BLAS ( hipblasSetVector(N, sizeof(*x), x, 1, x_gpu, 1)); - CHECK_BLAS ( hipblasSetVector(N, sizeof(*y), y, 1, y_gpu, 1)); - - printf ("info: launch 'saxpy' kernel\n"); - CHECK_BLAS ( hipblasSaxpy(handle, N, &a, x_gpu, 1, y_gpu, 1)); - - hipDeviceSynchronize(); - - printf ("info: copy Device2Host\n"); - CHECK_BLAS ( hipblasGetVector(N, sizeof(*y_gpu_res), y_gpu, 1, y_gpu_res, 1)); - - // CPU implementation of saxpy - for (int i = 0; i < N; i++) { - y_cpu_res[i] = a * x[i] + y[i]; - } - - // verify 
the results - int errors = 0; - for (int i = 0; i < N; i++) { - if (fabs(y_cpu_res[i] - y_gpu_res[i]) > fabs(y_cpu_res[i] * 0.0001f)) - errors++; - } - std::cout << errors << " errors" << std::endl; - - CHECK( hipFree(x_gpu)); - CHECK( hipFree(y_gpu)); - CHECK_BLAS( hipblasDestroy(handle)); - - return errors; -} From af22699ec601e3942849b016cf3a088228d4d6c7 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 7 Mar 2017 11:24:32 -0600 Subject: [PATCH 099/700] added new field to hipDeviceProp_t structure gcnArch. 1. It is an integer containing gfx values 701, 801, 802, 803 2. On NV path, it is zero Change-Id: I2b4c7f48981d0214d8c6b1905d2cc85b16203419 --- samples/0_Intro/square/square.hipref.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp index 0073c1399..e694bfb8a 100644 --- a/samples/0_Intro/square/square.hipref.cpp +++ b/samples/0_Intro/square/square.hipref.cpp @@ -32,7 +32,7 @@ THE SOFTWARE. }\ } -/* +/* * Square each element in the array A and write to array C. */ template @@ -58,16 +58,18 @@ int main(int argc, char *argv[]) hipDeviceProp_t props; CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/)); printf ("info: running on device %s\n", props.name); - + #ifdef __HIP_PLATFORM_HCC__ + printf ("info: architecture on AMD GPU device is: %d\n",props.gcnArch); + #endif printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); A_h = (float*)malloc(Nbytes); CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); C_h = (float*)malloc(Nbytes); CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); // Fill with Phi + i - for (size_t i=0; i Date: Thu, 9 Mar 2017 08:52:50 -0600 Subject: [PATCH 100/700] make 4_shfl cookbook sample only for fiji 1. 
__shfl is not supported on hawaii gfx701 Change-Id: Iac09f5d30ee0674b8f58a6e74ec5c49b02be32ad --- samples/2_Cookbook/4_shfl/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/samples/2_Cookbook/4_shfl/Makefile b/samples/2_Cookbook/4_shfl/Makefile index 3383cf2bf..21c0e9395 100644 --- a/samples/2_Cookbook/4_shfl/Makefile +++ b/samples/2_Cookbook/4_shfl/Makefile @@ -22,7 +22,7 @@ CXX=$(HIPCC) $(EXECUTABLE): $(OBJECTS) - $(HIPCC) $(OBJECTS) -o $@ + $(HIPCC) --amdgpu-target=gfx803 $(OBJECTS) -o $@ test: $(EXECUTABLE) @@ -33,4 +33,3 @@ clean: rm -f $(EXECUTABLE) rm -f $(OBJECTS) rm -f $(HIP_PATH)/src/*.o - From 5eb39f1c6b26bc8c92bed2ae27df8cf3828647c9 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Fri, 10 Mar 2017 10:29:52 +0530 Subject: [PATCH 101/700] Fix for HCSWAP-128, make 5_2dshfl cookbook sample only for fiji Change-Id: I8869c28151bca1bd47a053a2808e93a801d16d00 --- samples/2_Cookbook/5_2dshfl/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/2_Cookbook/5_2dshfl/Makefile b/samples/2_Cookbook/5_2dshfl/Makefile index b742bbf80..6abaf658b 100644 --- a/samples/2_Cookbook/5_2dshfl/Makefile +++ b/samples/2_Cookbook/5_2dshfl/Makefile @@ -22,7 +22,7 @@ CXX=$(HIPCC) $(EXECUTABLE): $(OBJECTS) - $(HIPCC) $(OBJECTS) -o $@ + $(HIPCC) --amdgpu-target=gfx803 $(OBJECTS) -o $@ test: $(EXECUTABLE) From b25691cb87f8bb1010cb37a0973cbe9e5b572986 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 17 Feb 2017 17:14:55 -0600 Subject: [PATCH 102/700] Add first step to a "registerd" mode in hipBusBandwidth. 
--- .../hipBusBandwidth/hipBusBandwidth.cpp | 165 +++++++++++------- 1 file changed, 105 insertions(+), 60 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 7cb3e7908..a1b2fd170 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -6,9 +6,12 @@ #include "ResultDatabase.h" +enum MallocMode {MallocPinned, MallocUnpinned, MallocRegistered}; + // Cmdline parms: bool p_verbose = false; -bool p_pinned = true; +MallocMode p_malloc_mode = MallocPinned; +int p_numa_ctl = -1; int p_iterations = 10; int p_beatsperiteration=1; int p_device = 0; @@ -21,7 +24,7 @@ bool p_h2d = true; bool p_d2h = true; bool p_bidir = true; - +#define NO_CHECK #define CHECK_HIP_ERROR() \ @@ -36,6 +39,14 @@ bool p_bidir = true; } +std::string mallocModeString(int mallocMode) { + switch (mallocMode) { + case MallocPinned : return "pinned"; + case MallocUnpinned: return "unpinned"; + case MallocRegistered: return "registered"; + default: return "mallocmode-UNKNOWN"; + }; +}; // **************************************************************************** int sizeToBytes(int size) { @@ -106,7 +117,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) // Create some host memory pattern float *hostMem = NULL; - if (p_pinned) + if (p_malloc_mode == MallocPinned) { hipHostMalloc((void**)&hostMem, sizeof(float) * numMaxFloats); while (hipGetLastError() != hipSuccess) @@ -116,20 +127,29 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) --nSizes; if (nSizes < 1) { - std::cerr << "Error: Couldn't allocated any pinned buffer\n"; + std::cerr << "Error: Couldn't allocate any pinned buffer\n"; return; } numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; hipHostMalloc((void**)&hostMem, sizeof(float) * numMaxFloats); } } - else + else if (p_malloc_mode == MallocUnpinned) { if (p_alignedhost) { hostMem = (float*)aligned_alloc(p_alignedhost, 
numMaxFloats*sizeof(float)); } else { hostMem = new float[numMaxFloats]; } + } + else if (p_malloc_mode == MallocRegistered) + { + if (p_numa_ctl == -1) { + hostMem = (float*)malloc(numMaxFloats*sizeof(float)); + } + + hipHostRegister(hostMem, numMaxFloats * sizeof(float), 0); + CHECK_HIP_ERROR(); } for (int i = 0; i < numMaxFloats; i++) @@ -146,7 +166,7 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) --nSizes; if (nSizes < 1) { - std::cerr << "Error: Couldn't allocated any device buffer\n"; + std::cerr << "Error: Couldn't allocate any device buffer\n"; return; } numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; @@ -199,8 +219,8 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) } else { sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); } - resultDB.AddResult(std::string("H2D_Bandwidth") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "GB/sec", speed); - resultDB.AddResult(std::string("H2D_Time") + (p_pinned ? "_Pinned" : "_Unpinned"), sizeStr, "ms", t); + resultDB.AddResult(std::string("H2D_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("H2D_Time") + mallocModeString(p_malloc_mode), sizeStr, "ms", t); if (p_onesize) { break; @@ -212,6 +232,8 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) numMaxFloats = sizeToBytes(p_onesize) / sizeof(float); } +#ifndef NO_CHECK + // Check. First reset the host memory, then copy-back result. Then compare against original ref value. for (int i = 0; i < numMaxFloats; i++) { @@ -225,24 +247,36 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) printf ("error: H2D. 
i=%d reference:%6.f != copyback:%6.2f\n", i, ref, hostMem[i]); } } +#endif // Cleanup hipFree((void*)device); CHECK_HIP_ERROR(); - if (p_pinned) - { + switch (p_malloc_mode) { + case MallocPinned: hipHostFree((void*)hostMem); CHECK_HIP_ERROR(); - } - else - { + break; + + case MallocUnpinned: if (p_alignedhost) { delete[] hostMem; } else { free(hostMem); } + break; + + case MallocRegistered: + hipHostUnregister(hostMem); + CHECK_HIP_ERROR(); + free(hostMem); + break; + default: + assert(0); } + + hipEventDestroy(start); hipEventDestroy(stop); } @@ -257,38 +291,40 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) // Create some host memory pattern float *hostMem1; float *hostMem2; - if (p_pinned) + if (p_malloc_mode == MallocPinned) { hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats); hipError_t err1 = hipGetLastError(); hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats); hipError_t err2 = hipGetLastError(); - while (err1 != hipSuccess || err2 != hipSuccess) - { - // free the first buffer if only the second failed - if (err1 == hipSuccess) - hipHostFree((void*)hostMem1); - - // drop the size and try again - if (p_verbose) std::cout << " - dropping size allocating pinned mem\n"; - --nSizes; - if (nSizes < 1) - { - std::cerr << "Error: Couldn't allocated any pinned buffer\n"; - return; - } - numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; - hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats); - err1 = hipGetLastError(); - hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats); - err2 = hipGetLastError(); - } - } - else + while (err1 != hipSuccess || err2 != hipSuccess) + { + // free the first buffer if only the second failed + if (err1 == hipSuccess) + hipHostFree((void*)hostMem1); + + // drop the size and try again + if (p_verbose) std::cout << " - dropping size allocating pinned mem\n"; + --nSizes; + if (nSizes < 1) + { + std::cerr << "Error: Couldn't allocate any pinned buffer\n"; + return; + } + numMaxFloats = 1024 * 
(sizes[nSizes-1]) / 4; + hipHostMalloc((void**)&hostMem1, sizeof(float)*numMaxFloats); + err1 = hipGetLastError(); + hipHostMalloc((void**)&hostMem2, sizeof(float)*numMaxFloats); + err2 = hipGetLastError(); + } + } + else if (p_malloc_mode == MallocUnpinned) { hostMem1 = new float[numMaxFloats]; hostMem2 = new float[numMaxFloats]; } + + for (int i=0; i Date: Fri, 10 Mar 2017 15:04:46 -0600 Subject: [PATCH 103/700] Refactor registered memory calls. --- .../hipBusBandwidth/hipBusBandwidth.cpp | 52 ++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index a1b2fd170..09f78543c 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -24,7 +24,7 @@ bool p_h2d = true; bool p_d2h = true; bool p_bidir = true; -#define NO_CHECK +//#define NO_CHECK #define CHECK_HIP_ERROR() \ @@ -151,6 +151,10 @@ void RunBenchmark_H2D(ResultDatabase &resultDB) hipHostRegister(hostMem, numMaxFloats * sizeof(float), 0); CHECK_HIP_ERROR(); } + else + { + assert(0); + } for (int i = 0; i < numMaxFloats; i++) { @@ -323,6 +327,22 @@ void RunBenchmark_D2H(ResultDatabase &resultDB) hostMem1 = new float[numMaxFloats]; hostMem2 = new float[numMaxFloats]; } + else if (p_malloc_mode == MallocRegistered) + { + if (p_numa_ctl == -1) { + hostMem1 = (float*)malloc(numMaxFloats*sizeof(float)); + hostMem2 = (float*)malloc(numMaxFloats*sizeof(float)); + } + + hipHostRegister(hostMem1, numMaxFloats * sizeof(float), 0); + CHECK_HIP_ERROR(); + hipHostRegister(hostMem2, numMaxFloats * sizeof(float), 0); + CHECK_HIP_ERROR(); + } + else + { + assert(0); + } for (int i=0; i Date: Tue, 14 Mar 2017 15:56:18 +0530 Subject: [PATCH 104/700] 4_shfl and 5_2dshfl samples are unsupported on gfx701 Change-Id: I81eb880350f25e89573ba14c62b549c6c43f8c91 --- samples/2_Cookbook/4_shfl/Makefile | 6 +++++- 
samples/2_Cookbook/5_2dshfl/Makefile | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/samples/2_Cookbook/4_shfl/Makefile b/samples/2_Cookbook/4_shfl/Makefile index 21c0e9395..56f54d951 100644 --- a/samples/2_Cookbook/4_shfl/Makefile +++ b/samples/2_Cookbook/4_shfl/Makefile @@ -3,6 +3,10 @@ ifeq (,$(HIP_PATH)) HIP_PATH=../../.. endif +ifeq (gfx701, $(findstring gfx701,$(HCC_AMDGPU_TARGET))) + $(error gfx701 is not a supported device for this sample) +endif + HIPCC=$(HIP_PATH)/bin/hipcc TARGET=hcc @@ -22,7 +26,7 @@ CXX=$(HIPCC) $(EXECUTABLE): $(OBJECTS) - $(HIPCC) --amdgpu-target=gfx803 $(OBJECTS) -o $@ + $(HIPCC) $(OBJECTS) -o $@ test: $(EXECUTABLE) diff --git a/samples/2_Cookbook/5_2dshfl/Makefile b/samples/2_Cookbook/5_2dshfl/Makefile index 6abaf658b..cfadb1a31 100644 --- a/samples/2_Cookbook/5_2dshfl/Makefile +++ b/samples/2_Cookbook/5_2dshfl/Makefile @@ -3,6 +3,10 @@ ifeq (,$(HIP_PATH)) HIP_PATH=../../.. endif +ifeq (gfx701, $(findstring gfx701,$(HCC_AMDGPU_TARGET))) + $(error gfx701 is not a supported device for this sample) +endif + HIPCC=$(HIP_PATH)/bin/hipcc TARGET=hcc @@ -22,7 +26,7 @@ CXX=$(HIPCC) $(EXECUTABLE): $(OBJECTS) - $(HIPCC) --amdgpu-target=gfx803 $(OBJECTS) -o $@ + $(HIPCC) $(OBJECTS) -o $@ test: $(EXECUTABLE) From 219343027fdf45bcb86a0070665cb58f54502c7f Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 17 Mar 2017 13:11:34 -0500 Subject: [PATCH 105/700] Added default module launch api functionality 1. As in hipModuleLaunchKernel(..., kernelParams, nullptr); works with this commit 2. Added headers AMDGPUPTNote.h, AMDGPURuntimeMetadata.h to do code object meta data parsing 3. Changed CMake to look at llvm link libraries 4. HIP developer should set env variable LLVM_HOME to remove link errors 5. HIP depends on installed LLVM (not source, not build) 6. Added sample to test out the feature 7. Right now HCC does not support embedding metadata in code object. Use clang opencl 8. Changed HIPCC to read LLVM_HOME env var 9. 
New argument to CMake should be given -DLLVM_HOME= Change-Id: Iba38194aa872d97cc2c90a8e5ff746c48055c868 --- samples/0_Intro/module_api/Makefile | 8 +- samples/0_Intro/module_api/defaultDriver.cpp | 89 +++++++++++++++++++ samples/0_Intro/module_api/runKernel.cpp | 4 +- samples/0_Intro/module_api/test.cl | 12 +++ samples/0_Intro/module_api/test.co | Bin 0 -> 9824 bytes 5 files changed, 108 insertions(+), 5 deletions(-) create mode 100644 samples/0_Intro/module_api/defaultDriver.cpp create mode 100644 samples/0_Intro/module_api/test.cl create mode 100755 samples/0_Intro/module_api/test.co diff --git a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile index f2c0ce555..632a8d3e7 100644 --- a/samples/0_Intro/module_api/Makefile +++ b/samples/0_Intro/module_api/Makefile @@ -5,14 +5,16 @@ endif HIPCC=$(HIP_PATH)/bin/hipcc HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler) -all: vcpy_kernel.code runKernel.hip.out +all: vcpy_kernel.code runKernel.hip.out defaultDriver.hip.out runKernel.hip.out: runKernel.cpp $(HIPCC) $(HIPCC_FLAGS) $< -o $@ +defaultDriver.hip.out: defaultDriver.cpp + $(HIPCC) $(HIPCC_FLAGS) $< -o $@ + vcpy_kernel.code: vcpy_kernel.cpp - $(HIPCC) --genco $(GENCO_FLAGS) $^ -o $@ + $(HIPCC) --genco $(GENCO_FLAGS) $^ -o $@ clean: rm -f *.code *.out - diff --git a/samples/0_Intro/module_api/defaultDriver.cpp b/samples/0_Intro/module_api/defaultDriver.cpp new file mode 100644 index 000000000..a29271a0d --- /dev/null +++ b/samples/0_Intro/module_api/defaultDriver.cpp @@ -0,0 +1,89 @@ +/* +Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" +#include +#include +#include + +#define LEN 64 +#define SIZE LEN<<2 + +#define fileName "test.co" +#define kernel_name "vadd" + +int main(){ + float *A, *B, *C; + hipDeviceptr_t Ad, Bd, Cd; + A = new float[LEN]; + B = new float[LEN]; + C = new float[LEN]; + + for(uint32_t i=0;iH@ikcag#Q!>Xh`B3Zc@Bch`yO`hvZV z8c^LpLG*zK-~;#qzJcbkyp=xZp)W{%;2VH*c4oX@lU6MRTGTy~_spEjoHOT~nZ1l} z950_xGMPCdV1(Q!8En7E33D8g)kC5IXpY!q9IpGw1j#`^8__s_jw9f5*p3DxMB;Hl zM>6U?3@%S_j>OU*)Eh;X@NXF38R2>oOFrtMuY}?SfJWCoUKL%5rJe%xC()DhJNIko zMf0O#;32#!$kGn_hj143B7401*jKa_*+YjBhVX+iPL^249x?ii_Df3(ks7v<{wDk| zoJh}-u}BWOeHzdgmoIYJ?_qo#2W6+Hrzbw6tnTHP7FPUrr`Bu~>4#LGq8hzC6){eA z*r`UfS)^P}bT6?DzuIm#!t=FH{UWuDgmAg(vS7)tH`|+nZ<+-wA#(g;XW4JhUo6t3 z>Jqz#V6$>3okX>R~OsOZi|nutA?ErJHNPmt`r1Kcd^YH-2fcalGZOY zLl&Ir)={uiq_&zgQ}VpF-|4VQ;KxFm3Sq%ei4d&(#Yl1{(gwMRl3lQhSY$ zPXVM2Ao8e!B+eH4rJ%M3brBOz*BV}tikN}G3Jbdah0PWQy03+O?y9@a+I@1#bM>4vFQ~;}Q16q1nEP0f(G_FvEDp zdo-Z)5Dz0CW%36LY=e0o*ZEhoA|si)SgkcMk*J>)*g9D)BtEB_2fZULMh3;u?1Z9iIpA zhh6!I!6n3%-$4kj^8`KZq!CCXkVYVlz%vyA+{gJ3cYN5{#XkQqCrjiLfEcVMCSZnf+KJ%R4w0R2tOC@(z5D$wWS6@Ug44gVlun9{K%iLY{sdMXsioGy-V^ z(g>sxNF$I&AdNs8fiwbX1fJ;#$g}1LFK{;}g}3<5gAkpy16~%d`|_=Sf5a7M&d^pR zL(u!^Tf5VcuNGp1clS53#5wk_3hJ#SjG zOx18bdX(%o%#rmlY;}q=Gi$YQy<5q<&HBta=K0M=bZFVEx4NM}6KUo_glcZZa~$2Q zT2|Gr*sf{m%=dJM`F_PTD_*6l8LDBaN9p5@90VKnXkbG(w1VbY1&>twb`of+XcJoR8-Y44cjz*%hoJI_x&fHhV8hj zrMX^3ovr!>rn_pjU>H`V;5eq~!FhY7QgwYtJxa>8##O&vq~-DgPia5pyEX7T16EV{ zUlPV(!lv(9^Ao8&|P zS{U_Mz2=fU|3gFG?RYF?1Z%18H$VV1LskL2-U-{h#W6W{OuM2*SDg%=d|X-h;7n=h z)O^zUztO|EVT_`$N2o6w0?Yq%APWKiw=oE)i)&h{$#pLANg<4SXbZn>A;|TA0NS_* zNPW4^C8oebkbG%h;#Yx(=S7k)_YsNZK7wN)@H*jbydD%ZxlhS+DvA65^N}h;0~`W6 z>Y@)3&?H_JUFol!zxLu>A%ch8EE~v3t2?>h5knaG;9vrUjf=~`tM<20mLx81J_~wKLJkil%fCt literal 0 HcmV?d00001 From 323807d02beb026bcb282059d8579f339af66127 Mon Sep 17 00:00:00 2001 From: pensun Date: Fri, 17 Mar 2017 17:17:12 +0000 Subject: [PATCH 106/700] Initial integration with Alex' 
Generic Grid Launch Change-Id: I559afb80e9e39ec0d119bb3bf3b85ef9e448caf6 --- samples/0_Intro/square/square.hipref.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp index e694bfb8a..118f8acf1 100644 --- a/samples/0_Intro/square/square.hipref.cpp +++ b/samples/0_Intro/square/square.hipref.cpp @@ -83,7 +83,7 @@ int main(int argc, char *argv[]) const unsigned threadsPerBlock = 256; printf ("info: launch 'vector_square' kernel\n"); - hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N); + hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, nullptr, C_d, A_d, N); printf ("info: copy Device2Host\n"); CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); From c4c4d95db66100423477b8109601159642d15713 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Mon, 20 Mar 2017 17:03:21 -0500 Subject: [PATCH 107/700] revert workaround for square sample and update doc on GGL Change-Id: I731c68ca4111e7dc2e45bef51c4cad2c23fc81f8 --- samples/0_Intro/square/square.hipref.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp index 118f8acf1..963ab6326 100644 --- a/samples/0_Intro/square/square.hipref.cpp +++ b/samples/0_Intro/square/square.hipref.cpp @@ -83,7 +83,7 @@ int main(int argc, char *argv[]) const unsigned threadsPerBlock = 256; printf ("info: launch 'vector_square' kernel\n"); - hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, nullptr, C_d, A_d, N); + hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, nullptr, C_d, A_d, N); printf ("info: copy Device2Host\n"); CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); From 7735b454a133547a23b189f617ba2062666f4ade Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 31 Mar 2017 12:11:34 -0500 Subject: [PATCH 108/700] added new 
api hipHccModuleLaunchKernel 1. hipHccModuleLaunchKernel is same as hipModuleLaunchKernel with OpenCL workitem model 2. Added copy right 3. Fixed header naming Change-Id: I6a7c35a3566e2f8d3f5056613e34193775d4b236 --- samples/0_Intro/module_api/runKernel.cpp | 18 ++++++++++-------- samples/0_Intro/module_api/vcpy_kernel.cpp | 3 +-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/samples/0_Intro/module_api/runKernel.cpp b/samples/0_Intro/module_api/runKernel.cpp index 201892a4a..e7d54beb5 100644 --- a/samples/0_Intro/module_api/runKernel.cpp +++ b/samples/0_Intro/module_api/runKernel.cpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -22,9 +22,10 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" -#include -#include -#include +#include +#include +#include +#include #define LEN 64 #define SIZE LEN<<2 @@ -43,10 +44,10 @@ int main(){ B[i] = 0.0f; } - hipInit(0); - hipDevice_t device; - hipCtx_t context; - hipDeviceGet(&device, 0); + hipInit(0); + hipDevice_t device; + hipCtx_t context; + hipDeviceGet(&device, 0); hipCtxCreate(&context, 0, device); hipMalloc((void**)&Ad, SIZE); @@ -101,6 +102,7 @@ int main(){ hipModuleLaunchKernel(Function, 1, 1, 1, LEN, 1, 1, 0, 0, NULL, (void**)&config); hipMemcpyDtoH(B, Bd, SIZE); + int mismatchCount = 0; for(uint32_t i=0;i Date: Fri, 31 Mar 2017 13:35:26 -0500 Subject: [PATCH 109/700] Fixed bit_extract Change-Id: I92d7b7a302e3fa0db84889fb5dc6b612e6a53c73 --- samples/0_Intro/bit_extract/Makefile | 1 - samples/0_Intro/bit_extract/bit_extract.cpp | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/samples/0_Intro/bit_extract/Makefile b/samples/0_Intro/bit_extract/Makefile index 
0965ae729..78f6a2faa 100644 --- a/samples/0_Intro/bit_extract/Makefile +++ b/samples/0_Intro/bit_extract/Makefile @@ -24,4 +24,3 @@ $(EXE): bit_extract.cpp clean: rm -f *.o $(EXE) - diff --git a/samples/0_Intro/bit_extract/bit_extract.cpp b/samples/0_Intro/bit_extract/bit_extract.cpp index 1535d2bd9..a30f2d052 100644 --- a/samples/0_Intro/bit_extract/bit_extract.cpp +++ b/samples/0_Intro/bit_extract/bit_extract.cpp @@ -37,7 +37,7 @@ THE SOFTWARE. }\ } -void __global__ +__global__ void bit_extract_kernel(hipLaunchParm lp, uint32_t *C_d, const uint32_t *A_d, size_t N) { size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); @@ -45,7 +45,7 @@ bit_extract_kernel(hipLaunchParm lp, uint32_t *C_d, const uint32_t *A_d, size_t for (size_t i=offset; i> 8); #endif @@ -73,7 +73,7 @@ int main(int argc, char *argv[]) C_h = (uint32_t*)malloc(Nbytes); CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); - for (size_t i=0; i Date: Fri, 31 Mar 2017 14:13:46 -0500 Subject: [PATCH 110/700] added debug support for HIP sample Change-Id: Ia7265234082039b68114f7421f4dbcb7149d4d2b --- samples/0_Intro/module_api/runKernel.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/samples/0_Intro/module_api/runKernel.cpp b/samples/0_Intro/module_api/runKernel.cpp index e7d54beb5..355f0bf5d 100644 --- a/samples/0_Intro/module_api/runKernel.cpp +++ b/samples/0_Intro/module_api/runKernel.cpp @@ -33,6 +33,9 @@ THE SOFTWARE. 
#define fileName "vcpy_kernel.code" #define kernel_name "hello_world" +#define HIP_CHECK(status) \ +if(status != hipSuccess) {std::cout<<"Got Status: "< Date: Fri, 31 Mar 2017 14:30:40 -0500 Subject: [PATCH 111/700] added module api sample which uses hipHccModuleLaunchKernel Change-Id: I7bce60b4480a3b5ff7ed69c3256078ded65a0945 --- samples/0_Intro/module_api/Makefile | 3 + .../0_Intro/module_api/launchKernelHcc.cpp | 112 ++++++++++++++++++ samples/0_Intro/module_api/runKernel.cpp | 4 - 3 files changed, 115 insertions(+), 4 deletions(-) create mode 100644 samples/0_Intro/module_api/launchKernelHcc.cpp diff --git a/samples/0_Intro/module_api/Makefile b/samples/0_Intro/module_api/Makefile index 632a8d3e7..38bd00a6a 100644 --- a/samples/0_Intro/module_api/Makefile +++ b/samples/0_Intro/module_api/Makefile @@ -10,6 +10,9 @@ all: vcpy_kernel.code runKernel.hip.out defaultDriver.hip.out runKernel.hip.out: runKernel.cpp $(HIPCC) $(HIPCC_FLAGS) $< -o $@ +launchKernelHcc.hip.out: launchKernelHcc.cpp + $(HIPCC) $(HIPCC_FLAGS) $< -o $@ + defaultDriver.hip.out: defaultDriver.cpp $(HIPCC) $(HIPCC_FLAGS) $< -o $@ diff --git a/samples/0_Intro/module_api/launchKernelHcc.cpp b/samples/0_Intro/module_api/launchKernelHcc.cpp new file mode 100644 index 000000000..e86e44cb2 --- /dev/null +++ b/samples/0_Intro/module_api/launchKernelHcc.cpp @@ -0,0 +1,112 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + + +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" +#include +#include +#include + +#ifdef __HIP_PLATFORM_HCC__ +#include +#endif + +#define LEN 64 +#define SIZE LEN<<2 + +#define fileName "vcpy_kernel.code" +#define kernel_name "hello_world" + +#define HIP_CHECK(status) \ +if(status != hipSuccess) {std::cout<<"Got Status: "< Date: Fri, 7 Apr 2017 14:51:54 +0530 Subject: [PATCH 112/700] Fix build issues in hipCommander sample - Remove -stdlib=libstdc++ from Makefile - Removed deleted HIP header file fom includes Change-Id: Ia189396bee19fc52b679259df56c6c6e2bafb6fe --- samples/1_Utils/hipCommander/Makefile | 3 --- samples/1_Utils/hipCommander/hipCommander.cpp | 1 - 2 files changed, 4 deletions(-) diff --git a/samples/1_Utils/hipCommander/Makefile b/samples/1_Utils/hipCommander/Makefile index e770c636a..a411763b7 100644 --- a/samples/1_Utils/hipCommander/Makefile +++ b/samples/1_Utils/hipCommander/Makefile @@ -10,9 +10,6 @@ OPT=-O3 CXXFLAGS = $(OPT) --std=c++11 HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) -ifeq (${HIP_PLATFORM}, hcc) - CXXFLAGS += " -stdlib=libc++" -endif CODE_OBJECTS=nullkernel.hsaco diff --git a/samples/1_Utils/hipCommander/hipCommander.cpp b/samples/1_Utils/hipCommander/hipCommander.cpp index 0add1ce3e..4b93180b1 100644 --- a/samples/1_Utils/hipCommander/hipCommander.cpp +++ b/samples/1_Utils/hipCommander/hipCommander.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #endif #include From ad280696c67430a735203b0d79a5ed3100603239 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 7 Apr 2017 15:24:10 +0530 Subject: [PATCH 113/700] Fix build issues with bit_extract sample Change-Id: I628b3c83a16f7adf0ab8ca60aecde8c073c34fd9 --- samples/0_Intro/bit_extract/Makefile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/samples/0_Intro/bit_extract/Makefile b/samples/0_Intro/bit_extract/Makefile index 78f6a2faa..08bca6e64 100644 --- a/samples/0_Intro/bit_extract/Makefile +++ 
b/samples/0_Intro/bit_extract/Makefile @@ -11,10 +11,6 @@ HIPCC=$(HIP_PATH)/bin/hipcc ifeq (${HIP_PLATFORM}, nvcc) HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 endif -ifeq (${HIP_PLATFORM}, hcc) - HIPCC_FLAGS = -stdlib=libc++ -endif - EXE=bit_extract From 9fcc03e2b64dde04a82935ce2fd559a262498108 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 7 Apr 2017 15:40:09 +0530 Subject: [PATCH 114/700] Merge branch 'amd-develop' into amd-master Change-Id: I53d5a8916d769c4f0fe60d2ee3b240551da80b4f (cherry picked from commit 01c523f6c9fc5a66d5f6e60d5efe78c29c9fe317) --- samples/0_Intro/bit_extract/Makefile | 4 ---- samples/1_Utils/hipCommander/Makefile | 3 --- samples/1_Utils/hipCommander/hipCommander.cpp | 1 - 3 files changed, 8 deletions(-) diff --git a/samples/0_Intro/bit_extract/Makefile b/samples/0_Intro/bit_extract/Makefile index 78f6a2faa..08bca6e64 100644 --- a/samples/0_Intro/bit_extract/Makefile +++ b/samples/0_Intro/bit_extract/Makefile @@ -11,10 +11,6 @@ HIPCC=$(HIP_PATH)/bin/hipcc ifeq (${HIP_PLATFORM}, nvcc) HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 endif -ifeq (${HIP_PLATFORM}, hcc) - HIPCC_FLAGS = -stdlib=libc++ -endif - EXE=bit_extract diff --git a/samples/1_Utils/hipCommander/Makefile b/samples/1_Utils/hipCommander/Makefile index e770c636a..a411763b7 100644 --- a/samples/1_Utils/hipCommander/Makefile +++ b/samples/1_Utils/hipCommander/Makefile @@ -10,9 +10,6 @@ OPT=-O3 CXXFLAGS = $(OPT) --std=c++11 HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) -ifeq (${HIP_PLATFORM}, hcc) - CXXFLAGS += " -stdlib=libc++" -endif CODE_OBJECTS=nullkernel.hsaco diff --git a/samples/1_Utils/hipCommander/hipCommander.cpp b/samples/1_Utils/hipCommander/hipCommander.cpp index 0add1ce3e..4b93180b1 100644 --- a/samples/1_Utils/hipCommander/hipCommander.cpp +++ b/samples/1_Utils/hipCommander/hipCommander.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #endif #include From 2335bcdd0302afda3d2e5360d542719a0a55da89 Mon Sep 17 00:00:00 2001 From: 
Ben Sander Date: Fri, 21 Apr 2017 09:01:34 -0500 Subject: [PATCH 115/700] Fix compilation error with nvcc (c++ nullptr) --- samples/0_Intro/square/Makefile | 1 + samples/0_Intro/square/square.hipref.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/0_Intro/square/Makefile b/samples/0_Intro/square/Makefile index 1e8cdba08..aa48cc586 100644 --- a/samples/0_Intro/square/Makefile +++ b/samples/0_Intro/square/Makefile @@ -15,5 +15,6 @@ square.hip.out: square.hipref.cpp + clean: rm -f *.o *.out diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp index 963ab6326..e694bfb8a 100644 --- a/samples/0_Intro/square/square.hipref.cpp +++ b/samples/0_Intro/square/square.hipref.cpp @@ -83,7 +83,7 @@ int main(int argc, char *argv[]) const unsigned threadsPerBlock = 256; printf ("info: launch 'vector_square' kernel\n"); - hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, nullptr, C_d, A_d, N); + hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N); printf ("info: copy Device2Host\n"); CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); From b8fd2f159a4003d204f60f24186e3627afef6c3a Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 24 Apr 2017 08:50:43 +0530 Subject: [PATCH 116/700] Merge branch 'amd-develop' into amd-master Change-Id: I312fb9d1181733ef5160d1e993e2ae57ced0f6b3 (cherry picked from commit 88fb807af081f31314031d4549e98d9b621cfc41) --- samples/0_Intro/square/Makefile | 1 + samples/0_Intro/square/square.hipref.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/0_Intro/square/Makefile b/samples/0_Intro/square/Makefile index 1e8cdba08..aa48cc586 100644 --- a/samples/0_Intro/square/Makefile +++ b/samples/0_Intro/square/Makefile @@ -15,5 +15,6 @@ square.hip.out: square.hipref.cpp + clean: rm -f *.o *.out diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp index 
963ab6326..e694bfb8a 100644 --- a/samples/0_Intro/square/square.hipref.cpp +++ b/samples/0_Intro/square/square.hipref.cpp @@ -83,7 +83,7 @@ int main(int argc, char *argv[]) const unsigned threadsPerBlock = 256; printf ("info: launch 'vector_square' kernel\n"); - hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, nullptr, C_d, A_d, N); + hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N); printf ("info: copy Device2Host\n"); CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); From 3bc6df20444a1eef0b3fd8d39dab6ee6ea793cbc Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Thu, 4 May 2017 13:57:01 +0530 Subject: [PATCH 117/700] Print msg for single gpu Change-Id: I2d23c73542add8973990ba96592016726994422e --- samples/2_Cookbook/8_peer2peer/peer2peer.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/samples/2_Cookbook/8_peer2peer/peer2peer.cpp b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp index 990599e1c..0f532a2f0 100644 --- a/samples/2_Cookbook/8_peer2peer/peer2peer.cpp +++ b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp @@ -55,13 +55,9 @@ void checkPeer2PeerSupport() { int gpuCount; int canAccessPeer; - int p2pCapableDeviceCount=0; HIPCHECK(hipGetDeviceCount(&gpuCount)); - if (gpuCount < 2) - printf("Peer2Peer application requires atleast 2 gpu devices"); - for (int currentGpu=0; currentGpu Date: Thu, 11 May 2017 11:30:49 +0530 Subject: [PATCH 118/700] Add unroll and inline asm cookbook samples Change-Id: Ie5a0fbb01b7fca82959090d89299533d49e092f1 --- samples/2_Cookbook/10_inline_asm/Makefile | 35 ++++ .../2_Cookbook/10_inline_asm/inline_asm.cpp | 174 ++++++++++++++++++ samples/2_Cookbook/9_unroll/Makefile | 39 ++++ samples/2_Cookbook/9_unroll/unroll.cpp | 141 ++++++++++++++ 4 files changed, 389 insertions(+) create mode 100644 samples/2_Cookbook/10_inline_asm/Makefile create mode 100644 samples/2_Cookbook/10_inline_asm/inline_asm.cpp create mode 100644 
samples/2_Cookbook/9_unroll/Makefile create mode 100644 samples/2_Cookbook/9_unroll/unroll.cpp diff --git a/samples/2_Cookbook/10_inline_asm/Makefile b/samples/2_Cookbook/10_inline_asm/Makefile new file mode 100644 index 000000000..77a769963 --- /dev/null +++ b/samples/2_Cookbook/10_inline_asm/Makefile @@ -0,0 +1,35 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = inline_asm.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./inline_asm + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o diff --git a/samples/2_Cookbook/10_inline_asm/inline_asm.cpp b/samples/2_Cookbook/10_inline_asm/inline_asm.cpp new file mode 100644 index 000000000..2b4fc3de9 --- /dev/null +++ b/samples/2_Cookbook/10_inline_asm/inline_asm.cpp @@ -0,0 +1,174 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + +// hip header file +#include "hip/hip_runtime.h" + +#define WIDTH 1024 + +#define NUM (WIDTH*WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width) +{ + + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x])); +} + +// CPU implementation of matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width) +{ + for(unsigned int j=0; j < width; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*width + j] = input[j*width + i]; + } + } +} + +int main() { + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + float eventMs = 1.0f; + + int i; + int errors; + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * 
sizeof(float)); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyHostToDevice time taken = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Lauching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("kernel Execution time = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); + + // verify the results + errors = 0; + double eps = 1.0E-6; + for (i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { + printf("gpu%f cpu %f \n",TransposeMatrix[i],cpuTransposeMatrix[i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git 
a/samples/2_Cookbook/9_unroll/Makefile b/samples/2_Cookbook/9_unroll/Makefile new file mode 100644 index 000000000..b71f3d835 --- /dev/null +++ b/samples/2_Cookbook/9_unroll/Makefile @@ -0,0 +1,39 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +ifeq (gfx701, $(findstring gfx701,$(HCC_AMDGPU_TARGET))) + $(error gfx701 is not a supported device for this sample) +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = unroll.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./unroll + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o diff --git a/samples/2_Cookbook/9_unroll/unroll.cpp b/samples/2_Cookbook/9_unroll/unroll.cpp new file mode 100644 index 000000000..22f1c75e6 --- /dev/null +++ b/samples/2_Cookbook/9_unroll/unroll.cpp @@ -0,0 +1,141 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + +// hip header file +#include "hip/hip_runtime.h" + + +#define WIDTH 4 + +#define NUM (WIDTH*WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width) +{ + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + float val = in[x]; + +#pragma unroll + for(int i=0;i eps ) { + printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} From 7c6b0384bbbf3fea339c53832349cedff22c0b41 Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Thu, 11 May 2017 18:43:24 +0530 Subject: [PATCH 119/700] Add readme for inline asm and unroll cookbook samples Change-Id: I71b7a5652c3dad181c5df60ab0dd1b81d79f1bfb --- samples/2_Cookbook/10_inline_asm/Readme.md | 47 +++++++++++++++++++++ samples/2_Cookbook/9_unroll/Readme.md | 48 ++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 samples/2_Cookbook/10_inline_asm/Readme.md create mode 100644 samples/2_Cookbook/9_unroll/Readme.md diff --git a/samples/2_Cookbook/10_inline_asm/Readme.md b/samples/2_Cookbook/10_inline_asm/Readme.md new file mode 100644 index 000000000..8c9854722 --- /dev/null +++ b/samples/2_Cookbook/10_inline_asm/Readme.md @@ -0,0 +1,47 @@ 
+## inline asm ### + +This tutorial is about how to use inline GCN asm in a kernel. In this tutorial, we'll explain how to do so by using the simple Matrix Transpose. + +## Introduction: + +If you want to take advantage of the extra performance benefits of writing in assembly as well as take advantage of special GPU hardware features that were only available through assembly, then this tutorial is for you. In this tutorial we'll be explaining how to start writing inline asm in a kernel. + +For more insight, please read the following blogs by Ben Sander +[The Art of AMDGCN Assembly: How to Bend the Machine to Your Will](http://gpuopen.com/amdgcn-assembly) +[AMD GCN Assembly: Cross-Lane Operations](http://gpuopen.com/amd-gcn-assembly-cross-lane-operations/) + +For more information: +[AMD GCN3 ISA Architecture Manual](http://gpuopen.com/compute-product/amd-gcn3-isa-architecture-manual/) +[User Guide for AMDGPU Back-end](llvm.org/docs/AMDGPUUsage.html) + +## Requirement: +For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## prerequiste knowledge: + +Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming. + +## Simple Matrix Transpose + +We will be using the Simple Matrix Transpose application from our very first tutorial. + +## asm() Assembler statement + +We insert GCN ISA instructions into the kernel using the asm() assembler statement. To the same source code we used for MatrixTranspose, we'll add the following: + +` asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x])); ` + +## How to build and run: +Use the make command and execute it using ./exe +Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia.
+ + +## More Info: +- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) +- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/9_unroll/Readme.md b/samples/2_Cookbook/9_unroll/Readme.md new file mode 100644 index 000000000..3c2635c0e --- /dev/null +++ b/samples/2_Cookbook/9_unroll/Readme.md @@ -0,0 +1,48 @@ +## Using Pragma unroll ### + +In this tutorial, we'll explain how to use #pragma unroll to improve the performance. + +## Introduction: + +Loop unrolling optimization hints can be specified with #pragma unroll and #pragma nounroll. The pragma is placed immediately before a for loop. +Specifying #pragma unroll without a parameter directs the loop unroller to attempt to fully unroll the loop if the trip count is known at compile time and attempt to partially unroll the loop if the trip count is not known at compile time. + +## Requirement: +For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## prerequiste knowledge: + +Programmers familiar with CUDA, OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. 
You choose to start with the best one. We'll be explaining everything assuming you are completely new to gpgpu programming. + +## Simple Matrix Transpose + +For this tutorial we will be using MatrixTranspose with shfl operation i.e., our 4_shfl tutorial since it is the only examples where we used loops inside the kernel. + +In this tutorial, we'll use `#pragma unroll`. In the same sourcecode, we used for MatrixTranspose. We'll add it just before the for loop as following: + +`#pragma unroll ` +` for(int i=0;i Date: Mon, 12 Jun 2017 17:14:12 +0530 Subject: [PATCH 120/700] Add peer2peer bandwidth and latency test Change-Id: I6d88e4aa9f6e64096af16579eebef4740734203e --- .../hipBusBandwidth/hipBusBandwidth.cpp | 395 +++++++++++++++++- 1 file changed, 372 insertions(+), 23 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 09f78543c..b3b0b3e4a 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -16,13 +16,15 @@ int p_iterations = 10; int p_beatsperiteration=1; int p_device = 0; int p_detailed = 0; -bool p_async = 0; +bool p_async = 0; int p_alignedhost = 0; // align host allocs to this granularity, in bytes. 64 or 4096 are good values to try. -int p_onesize = 0; +int p_onesize = 0; bool p_h2d = true; bool p_d2h = true; bool p_bidir = true; +bool p_p2p = false; + //#define NO_CHECK @@ -70,7 +72,7 @@ std::string sizeToString(int size) // **************************************************************************** -hipError_t memcopy(void * dst, const void *src, size_t sizeBytes, enum hipMemcpyKind kind) +hipError_t memcopy(void * dst, const void *src, size_t sizeBytes, enum hipMemcpyKind kind ) { if (p_async) { return hipMemcpyAsync(dst, src, sizeBytes, kind, NULL); @@ -632,6 +634,9 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB) } + + + #define failed(...) 
\ printf ("error: ");\ printf (__VA_ARGS__);\ @@ -646,6 +651,326 @@ int parseInt(const char *str, int *output) } +void checkPeer2PeerSupport() +{ + int deviceCnt; + hipGetDeviceCount(&deviceCnt); + std::cout << "Total no. of available gpu #" << deviceCnt << "\n" << std::endl; + + for(int deviceId=0; deviceIdhost then host-->GPU2)\n\n" << std::endl; +} + +void enablePeer2Peer(int currentGpu, int peerGpu) +{ + int canAccessPeer; + + hipSetDevice(currentGpu); + hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu); + + if(canAccessPeer==1){ + hipDeviceEnablePeerAccess(peerGpu, 0); + } +} + +void disablePeer2Peer(int currentGpu, int peerGpu) +{ + int canAccessPeer; + + hipSetDevice(currentGpu); + hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu); + + if(canAccessPeer==1){ + hipDeviceDisablePeerAccess(peerGpu); + } +} + +std::string gpuIDToString(int gpuID) +{ + using namespace std; + stringstream ss; + ss << gpuID; + return ss.str(); +} + +void RunBenchmark_P2P_Unidir(ResultDatabase &resultDB) +{ + int gpuCount; + hipGetDeviceCount(&gpuCount); + + int currentGpu, peerGpu; + + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + + for (currentGpu=0; currentGpu1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } + + string cGpu, pGpu; + cGpu = gpuIDToString(currentGpu); + pGpu = gpuIDToString(peerGpu); + + resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "ms", t); + + if (p_onesize) { + break; + } + } + + } + + if (p_onesize) { + numMaxFloats = sizeToBytes(p_onesize) / sizeof(float); + } + + disablePeer2Peer(currentGpu, peerGpu); + + hipEventDestroy(start); + hipEventDestroy(stop); + + // Cleanup + hipFree((void*)currentGpuMem); + hipFree((void*)peerGpuMem); 
+ CHECK_HIP_ERROR(); + + hipSetDevice(peerGpu); + hipDeviceReset(); + + hipSetDevice(currentGpu); + hipDeviceReset(); + } + + } + +} + +void RunBenchmark_P2P_Bidir(ResultDatabase &resultDB) { + + int gpuCount; + hipGetDeviceCount(&gpuCount); + + hipStream_t stream[2]; + + int currentGpu, peerGpu; + + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + + for (currentGpu=0; currentGpu1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } + + string cGpu, pGpu; + cGpu = gpuIDToString(currentGpu); + pGpu = gpuIDToString(peerGpu); + + resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "ms", t); + + if (p_onesize) { + break; + } + } + + } + + if (p_onesize) { + numMaxFloats = sizeToBytes(p_onesize) / sizeof(float); + } + + disablePeer2Peer(currentGpu, peerGpu); + + hipEventDestroy(start); + hipEventDestroy(stop); + + for (int i=0; i<2; i++) { + hipStreamDestroy(stream[i]); + + hipFree((void*)currentGpuMem[i]); + hipFree((void*)peerGpuMem[i]); + CHECK_HIP_ERROR(); + } + + hipSetDevice(peerGpu); + hipDeviceReset(); + + hipSetDevice(currentGpu); + hipDeviceReset(); + } + } +} + + void printConfig() { hipDeviceProp_t props; hipGetDeviceProperties(&props, p_device); @@ -662,9 +987,9 @@ void help() { printf (" --d2h : Run only device-to-host test.\n"); printf (" --h2d : Run only host-to-device test.\n"); printf (" --bidir : Run only bidir copy test.\n"); + printf (" --p2p : Run only peer2peer unidir and bidir copy tests.\n"); printf (" --verbose : Print verbose status messages as test is run.\n"); printf (" --detailed : Print detailed report (including all trials).\n"); - printf (" --async : Use hipMemcpyAsync(with NULL stream) for H2D/D2H. 
Default uses hipMemcpy.\n"); printf (" --onesize, -o : Only run one measurement, at specified size (in KB, or if negative in bytes)\n"); @@ -712,6 +1037,12 @@ int parseStandardArguments(int argc, char *argv[]) p_d2h = false; p_bidir = true; + } else if (!strcmp(arg, "--p2p")) { + p_h2d = false; + p_d2h = false; + p_bidir = false; + p_p2p = true; + } else if (!strcmp(arg, "--help") || (!strcmp(arg, "-h"))) { help(); exit(EXIT_SUCCESS); @@ -737,39 +1068,57 @@ int main(int argc, char *argv[]) { parseStandardArguments(argc, argv); - printConfig(); + if (p_p2p) { + checkPeer2PeerSupport(); - if (p_h2d) { - ResultDatabase resultDB; - RunBenchmark_H2D(resultDB); + ResultDatabase resultDB_Unidir, resultDB_Bidir; - resultDB.DumpSummary(std::cout); + RunBenchmark_P2P_Unidir(resultDB_Unidir); + RunBenchmark_P2P_Bidir(resultDB_Bidir); + + resultDB_Unidir.DumpSummary(std::cout); + resultDB_Bidir.DumpSummary(std::cout); if (p_detailed) { - resultDB.DumpDetailed(std::cout); + resultDB_Unidir.DumpDetailed(std::cout); + resultDB_Bidir.DumpDetailed(std::cout); } } + else { + printConfig(); - if (p_d2h) { - ResultDatabase resultDB; - RunBenchmark_D2H(resultDB); + if (p_h2d) { + ResultDatabase resultDB; + RunBenchmark_H2D(resultDB); - resultDB.DumpSummary(std::cout); + resultDB.DumpSummary(std::cout); - if (p_detailed) { - resultDB.DumpDetailed(std::cout); + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } } - } + if (p_d2h) { + ResultDatabase resultDB; + RunBenchmark_D2H(resultDB); - if (p_bidir) { - ResultDatabase resultDB; - RunBenchmark_Bidir(resultDB); + resultDB.DumpSummary(std::cout); - resultDB.DumpSummary(std::cout); + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } + } - if (p_detailed) { - resultDB.DumpDetailed(std::cout); + + if (p_bidir) { + ResultDatabase resultDB; + RunBenchmark_Bidir(resultDB); + + resultDB.DumpSummary(std::cout); + + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } } } } From a491a49f982a7d8378e16988af16c71bb03756d0 Mon Sep 
17 00:00:00 2001 From: Aditya Atluri Date: Tue, 20 Jun 2017 11:35:52 -0500 Subject: [PATCH 121/700] removed rm for /opt/rocm/hip/src in inline asm sample Change-Id: I0c02bccd4cd35e01a8e889ea1e586ea8baf0ab90 --- samples/2_Cookbook/10_inline_asm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/2_Cookbook/10_inline_asm/Makefile b/samples/2_Cookbook/10_inline_asm/Makefile index 77a769963..6ad3c201b 100644 --- a/samples/2_Cookbook/10_inline_asm/Makefile +++ b/samples/2_Cookbook/10_inline_asm/Makefile @@ -32,4 +32,4 @@ test: $(EXECUTABLE) clean: rm -f $(EXECUTABLE) rm -f $(OBJECTS) - rm -f $(HIP_PATH)/src/*.o + From 98905a7272b9fbc6b0c9712cd47577962b8a17c1 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 29 Jun 2017 12:01:40 -0500 Subject: [PATCH 122/700] automate gcnarch detection Change-Id: Ibbad22db136f7f5e2be84c82e9169298a144cc77 --- samples/1_Utils/hipInfo/hipInfo.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index cf4660eae..0401745ef 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -129,6 +129,7 @@ void printDeviceProp (int deviceId) cout << setw(w1) << "arch.hasSurfaceFuncs: " << props.arch.hasSurfaceFuncs << endl; cout << setw(w1) << "arch.has3dGrid: " << props.arch.has3dGrid << endl; cout << setw(w1) << "arch.hasDynamicParallelism: " << props.arch.hasDynamicParallelism << endl; + cout << setw(w1) << "gcnArch: " << props.gcnArch << endl; int deviceCnt; hipGetDeviceCount(&deviceCnt); From 8252ae785b3772476b0a32c748cd6671d6140e5e Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 5 Jul 2017 11:44:44 +0530 Subject: [PATCH 123/700] GPUOpen-ProfessionalCompute-Tools -> ROCm-Developer-Tools Change-Id: I9f5b29dd1097385acecb0c672770d8adca2fdcf7 --- samples/2_Cookbook/0_MatrixTranspose/Readme.md | 18 +++++++++--------- samples/2_Cookbook/10_inline_asm/Readme.md | 18 +++++++++--------- 
samples/2_Cookbook/1_hipEvent/Readme.md | 18 +++++++++--------- samples/2_Cookbook/2_Profiler/Readme.md | 16 ++++++++-------- samples/2_Cookbook/3_shared_memory/Readme.md | 18 +++++++++--------- samples/2_Cookbook/4_shfl/Readme.md | 18 +++++++++--------- samples/2_Cookbook/5_2dshfl/Readme.md | 18 +++++++++--------- samples/2_Cookbook/6_dynamic_shared/Readme.md | 18 +++++++++--------- samples/2_Cookbook/7_streams/Readme.md | 18 +++++++++--------- samples/2_Cookbook/9_unroll/Readme.md | 18 +++++++++--------- 10 files changed, 89 insertions(+), 89 deletions(-) diff --git a/samples/2_Cookbook/0_MatrixTranspose/Readme.md b/samples/2_Cookbook/0_MatrixTranspose/Readme.md index 5e9483b59..ab5dbdc95 100644 --- a/samples/2_Cookbook/0_MatrixTranspose/Readme.md +++ b/samples/2_Cookbook/0_MatrixTranspose/Readme.md @@ -7,7 +7,7 @@ This tutorial shows how to get write simple HIP application. We will write the s HIP is a C++ runtime API and kernel language that allows developers to create portable applications that can run on AMD and other GPU’s. Our goal was to rise above the lowest-common-denominator paths and deliver a solution that allows you, the developer, to use essential hardware features and maximize your application’s performance on GPU hardware. ## Requirement: -For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) +For hardware requirement and software installation [Installation](https://github.com/ROCm-Developer-Tools/HIP/INSTALL.md) ## prerequiste knowledge: @@ -90,11 +90,11 @@ Use the make command and execute it using ./exe Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia. 
## More Info: -- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) -- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) -- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) -- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) -- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) -- [hipify-clang](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/hipify-clang/README.md) -- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) -- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) +- [HIP FAQ](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://rocm-developer-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) +- [hipify-clang](https://github.com/ROCm-Developer-Tools/HIP/hipify-clang/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/ROCm-Developer-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/ROCm-Developer-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/10_inline_asm/Readme.md b/samples/2_Cookbook/10_inline_asm/Readme.md index 8c9854722..0e64fe9c6 100644 --- a/samples/2_Cookbook/10_inline_asm/Readme.md +++ b/samples/2_Cookbook/10_inline_asm/Readme.md @@ -15,7 +15,7 @@ For more information: [User Guide for AMDGPU 
Back-end](llvm.org/docs/AMDGPUUsage.html) ## Requirement: -For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) +For hardware requirement and software installation [Installation](https://github.com/ROCm-Developer-Tools/HIP/INSTALL.md) ## prerequiste knowledge: @@ -37,11 +37,11 @@ Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia ## More Info: -- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) -- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) -- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) -- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) -- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) -- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) -- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) -- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) +- [HIP FAQ](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://rocm-developer-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) +- [clang-hipify](https://github.com/ROCm-Developer-Tools/HIP/clang-hipify/README.md) +- 
[Developer/CONTRIBUTING Info](https://github.com/ROCm-Developer-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/ROCm-Developer-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/1_hipEvent/Readme.md b/samples/2_Cookbook/1_hipEvent/Readme.md index e3ec8ad78..ea4f3a67e 100644 --- a/samples/2_Cookbook/1_hipEvent/Readme.md +++ b/samples/2_Cookbook/1_hipEvent/Readme.md @@ -7,7 +7,7 @@ This tutorial is follow-up of the previous one where we learn how to write our f Memory transfer and kernel execution are the most important parameter in parallel computing (specially HPC and machine learning). Memory bottlenecks is the main problem why we are not able to get the highest performance, therefore obtaining the memory transfer timing and kernel execution timing plays key role in application optimization. ## Requirement: -For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) +For hardware requirement and software installation [Installation](https://github.com/ROCm-Developer-Tools/HIP/INSTALL.md) ## prerequiste knowledge: @@ -64,11 +64,11 @@ Use the make command and execute it using ./exe Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia. 
## More Info: -- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) -- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) -- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) -- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) -- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) -- [hipify-clang](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/hipify-clang/README.md) -- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) -- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) +- [HIP FAQ](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://rocm-developer-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) +- [hipify-clang](https://github.com/ROCm-Developer-Tools/HIP/hipify-clang/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/ROCm-Developer-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/ROCm-Developer-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/2_Profiler/Readme.md b/samples/2_Cookbook/2_Profiler/Readme.md index 92a8be228..4059e4219 100644 --- a/samples/2_Cookbook/2_Profiler/Readme.md +++ b/samples/2_Cookbook/2_Profiler/Readme.md @@ -37,11 +37,11 @@ You can also print the HIP function strings to stderr 
using HIP_TRACE_API enviro Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors. ## More Info: -- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) -- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) -- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) -- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) -- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) -- [hipify-clang](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/hipify-clang/README.md) -- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) -- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) +- [HIP FAQ](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://rocm-developer-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) +- [hipify-clang](https://github.com/ROCm-Developer-Tools/HIP/hipify-clang/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/ROCm-Developer-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/ROCm-Developer-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/3_shared_memory/Readme.md b/samples/2_Cookbook/3_shared_memory/Readme.md index 6b9393397..8b9e102ec 100644 
--- a/samples/2_Cookbook/3_shared_memory/Readme.md +++ b/samples/2_Cookbook/3_shared_memory/Readme.md @@ -7,7 +7,7 @@ Earlier we learned how to write our first hip program, in which we compute Matri As we mentioned earlier that Memory bottlenecks is the main problem why we are not able to get the highest performance, therefore minimizing the latency for memory access plays prominent role in application optimization. In this tutorial, we'll learn how to use static shared memory and will explain the dynamic one latter. ## Requirement: -For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) +For hardware requirement and software installation [Installation](https://github.com/ROCm-Developer-Tools/HIP/INSTALL.md) ## prerequiste knowledge: @@ -32,11 +32,11 @@ Use the make command and execute it using ./exe Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia. ## More Info: -- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) -- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) -- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) -- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) -- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) -- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) -- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) -- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) +- [HIP FAQ](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel 
Language](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://rocm-developer-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) +- [clang-hipify](https://github.com/ROCm-Developer-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/ROCm-Developer-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/ROCm-Developer-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/4_shfl/Readme.md b/samples/2_Cookbook/4_shfl/Readme.md index da6290185..923d2f383 100644 --- a/samples/2_Cookbook/4_shfl/Readme.md +++ b/samples/2_Cookbook/4_shfl/Readme.md @@ -15,7 +15,7 @@ Let's talk about Warp first. The kernel code is executed in groups of fixed numb ` float __shfl_xor (float var, int laneMask, int width=warpSize); ` ## Requirement: -For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) +For hardware requirement and software installation [Installation](https://github.com/ROCm-Developer-Tools/HIP/INSTALL.md) ## prerequiste knowledge: @@ -41,11 +41,11 @@ Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia please make sure you have a 3.0 or higher compute capable device in order to use warp shfl operations and add `-gencode arch=compute=30, code=sm_30` nvcc flag in the Makefile while using this application. 
## More Info: -- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) -- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) -- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) -- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) -- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) -- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) -- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) -- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) +- [HIP FAQ](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://rocm-developer-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) +- [clang-hipify](https://github.com/ROCm-Developer-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/ROCm-Developer-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/ROCm-Developer-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/5_2dshfl/Readme.md b/samples/2_Cookbook/5_2dshfl/Readme.md index fba114152..8efff49d8 100644 --- a/samples/2_Cookbook/5_2dshfl/Readme.md +++ b/samples/2_Cookbook/5_2dshfl/Readme.md @@ -15,7 +15,7 @@ Let's talk about Warp first. 
The kernel code is executed in groups of fixed numb ` float __shfl_xor (float var, int laneMask, int width=warpSize); ` ## Requirement: -For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) +For hardware requirement and software installation [Installation](https://github.com/ROCm-Developer-Tools/HIP/INSTALL.md) ## prerequiste knowledge: @@ -41,11 +41,11 @@ Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia please make sure you have a 3.0 or higher compute capable device in order to use warp shfl operations and add `-gencode arch=compute=30, code=sm_30` nvcc flag in the Makefile while using this application. ## More Info: -- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) -- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) -- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) -- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) -- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) -- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) -- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) -- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) +- [HIP FAQ](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://rocm-developer-tools.github.io/HIP) +- [HIP Porting 
Guide](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) +- [clang-hipify](https://github.com/ROCm-Developer-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/ROCm-Developer-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/ROCm-Developer-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/6_dynamic_shared/Readme.md b/samples/2_Cookbook/6_dynamic_shared/Readme.md index a10fd56a9..15ea299a9 100644 --- a/samples/2_Cookbook/6_dynamic_shared/Readme.md +++ b/samples/2_Cookbook/6_dynamic_shared/Readme.md @@ -7,7 +7,7 @@ Earlier we learned how to use static shared memory. In this tutorial, we'll expl As we mentioned earlier that Memory bottlenecks is the main problem why we are not able to get the highest performance, therefore minimizing the latency for memory access plays prominent role in application optimization. In this tutorial, we'll learn how to use dynamic shared memory. ## Requirement: -For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) +For hardware requirement and software installation [Installation](https://github.com/ROCm-Developer-Tools/HIP/INSTALL.md) ## prerequiste knowledge: @@ -37,11 +37,11 @@ Use the make command and execute it using ./exe Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia. 
## More Info: -- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) -- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) -- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) -- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) -- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) -- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) -- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) -- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) +- [HIP FAQ](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://rocm-developer-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) +- [clang-hipify](https://github.com/ROCm-Developer-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/ROCm-Developer-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/ROCm-Developer-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/7_streams/Readme.md b/samples/2_Cookbook/7_streams/Readme.md index a75149925..ca295d3f4 100644 --- a/samples/2_Cookbook/7_streams/Readme.md +++ b/samples/2_Cookbook/7_streams/Readme.md @@ -7,7 +7,7 @@ In all Earlier tutorial we used single stream, In this 
tutorial, we'll explain h The various instances of kernel to be executed on device in exact launch order defined by Host are called streams. We can launch multiple streams on a single device. We will learn how to learn two streams which can we scaled with ease. ## Requirement: -For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) +For hardware requirement and software installation [Installation](https://github.com/ROCm-Developer-Tools/HIP/INSTALL.md) ## prerequiste knowledge: @@ -47,11 +47,11 @@ Use the make command and execute it using ./exe Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia. ## More Info: -- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) -- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) -- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) -- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) -- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) -- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) -- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) -- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) +- [HIP FAQ](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://rocm-developer-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP 
Terminology](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) +- [clang-hipify](https://github.com/ROCm-Developer-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/ROCm-Developer-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/ROCm-Developer-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/9_unroll/Readme.md b/samples/2_Cookbook/9_unroll/Readme.md index 3c2635c0e..194eb9c7f 100644 --- a/samples/2_Cookbook/9_unroll/Readme.md +++ b/samples/2_Cookbook/9_unroll/Readme.md @@ -8,7 +8,7 @@ Loop unrolling optimization hints can be specified with #pragma unroll and #prag Specifying #pragma unroll without a parameter directs the loop unroller to attempt to fully unroll the loop if the trip count is known at compile time and attempt to partially unroll the loop if the trip count is not known at compile time. ## Requirement: -For hardware requirement and software installation [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) +For hardware requirement and software installation [Installation](https://github.com/ROCm-Developer-Tools/HIP/INSTALL.md) ## prerequiste knowledge: @@ -38,11 +38,11 @@ Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia please make sure you have a 3.0 or higher compute capable device in order to use warp shfl operations and add `-gencode arch=compute=30, code=sm_30` nvcc flag in the Makefile while using this application. 
## More Info: -- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) -- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) -- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) -- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) -- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) -- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) -- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) -- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) +- [HIP FAQ](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://rocm-developer-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/ROCm-Developer-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) +- [clang-hipify](https://github.com/ROCm-Developer-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/ROCm-Developer-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/ROCm-Developer-Tools/HIP/RELEASE.md) From 8e3e104313314ffb18a53fca1c4faadbcb4dd8ab Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 21 Jul 2017 15:50:12 -0500 Subject: [PATCH 124/700] fixed device selection during compilation to use rocm_agent_enumerator 1. Changed hipcc to use rocm_agent_enumerator 2. 
Changed square sample test to use device variable --- samples/0_Intro/square/square.hipref.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp index e694bfb8a..167cb135f 100644 --- a/samples/0_Intro/square/square.hipref.cpp +++ b/samples/0_Intro/square/square.hipref.cpp @@ -54,9 +54,10 @@ int main(int argc, char *argv[]) float *A_h, *C_h; size_t N = 1000000; size_t Nbytes = N * sizeof(float); - + static int device = 0; + CHECK(hipSetDevice(device)); hipDeviceProp_t props; - CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/)); + CHECK(hipGetDeviceProperties(&props, device/*deviceID*/)); printf ("info: running on device %s\n", props.name); #ifdef __HIP_PLATFORM_HCC__ printf ("info: architecture on AMD GPU device is: %d\n",props.gcnArch); From cd48b06719bad5705da20de9d8c09a4e8b16314f Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 21 Jul 2017 15:50:12 -0500 Subject: [PATCH 125/700] fixed device selection during compilation to use rocm_agent_enumerator 1. Changed hipcc to use rocm_agent_enumerator 2. 
Changed square sample test to use device variable --- samples/0_Intro/square/square.hipref.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp index e694bfb8a..167cb135f 100644 --- a/samples/0_Intro/square/square.hipref.cpp +++ b/samples/0_Intro/square/square.hipref.cpp @@ -54,9 +54,10 @@ int main(int argc, char *argv[]) float *A_h, *C_h; size_t N = 1000000; size_t Nbytes = N * sizeof(float); - + static int device = 0; + CHECK(hipSetDevice(device)); hipDeviceProp_t props; - CHECK(hipGetDeviceProperties(&props, 0/*deviceID*/)); + CHECK(hipGetDeviceProperties(&props, device/*deviceID*/)); printf ("info: running on device %s\n", props.name); #ifdef __HIP_PLATFORM_HCC__ printf ("info: architecture on AMD GPU device is: %d\n",props.gcnArch); From 6ac55d2b34c1fcebbbbb5270192071ab5cf1ebeb Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 26 Jul 2017 18:52:53 -0500 Subject: [PATCH 126/700] Refactor dispatch latency test and fix several bugs. 
--- .../hipDispatchLatency/ResultDatabase.cpp | 29 ++- .../hipDispatchLatency/hipDispatchLatency.cpp | 227 +++++++++--------- 2 files changed, 140 insertions(+), 116 deletions(-) diff --git a/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp b/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp index d207154e3..f6f2fab70 100644 --- a/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp +++ b/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp @@ -7,16 +7,23 @@ using namespace std; +#define SORT_BY_NAME 0 +#define SORT_RETAIN_ATTS_ORDER 1 + + bool ResultDatabase::Result::operator<(const Result &rhs) const { if (test < rhs.test) return true; if (test > rhs.test) return false; +#if (SORT_RETAIN_ATTS_ORDER == 0) + // For ties, sort by the value of the attribute: if (atts < rhs.atts) return true; if (atts > rhs.atts) return false; +#endif return false; // less-operator returns false on equal } @@ -189,7 +196,10 @@ void ResultDatabase::AddResult(const string &test_orig, void ResultDatabase::DumpDetailed(ostream &out) { vector sorted(results); - sort(sorted.begin(), sorted.end()); + +#if SORT_BY_NAME + stable_sort(sorted.begin(), sorted.end()); +#endif const int testNameW = 24 ; const int attW = 12; @@ -283,12 +293,15 @@ void ResultDatabase::DumpDetailed(ostream &out) void ResultDatabase::DumpSummary(ostream &out) { vector sorted(results); - sort(sorted.begin(), sorted.end()); - const int testNameW = 24 ; +#if SORT_BY_NAME + stable_sort(sorted.begin(), sorted.end()); +#endif + + const int testNameW = 32 ; const int attW = 12; const int fieldW = 9; - out << std::fixed << right << std::setprecision(4); + out << std::fixed << right << std::setprecision(2); // TODO: in big parallel runs, the "trials" are the procs // and we really don't want to print them all out.... 
@@ -334,8 +347,8 @@ void ResultDatabase::DumpSummary(ostream &out) } if (0) { out << endl - << "Note: results marked with (*) had missing values such as" << endl - << "might occur with a mixture of architectural capabilities." << endl; + << "Note: results marked with (*) had missing values such as" << endl + << "might occur with a mixture of architectural capabilities." << endl; } } @@ -381,7 +394,9 @@ void ResultDatabase::DumpCsv(string fileName) bool emptyFile; vector sorted(results); - sort(sorted.begin(), sorted.end()); +#if SORT_BY_NAME + stable_sort(sorted.begin(), sorted.end()); +#endif //Check to see if the file is empty - if so, add the headers emptyFile = this->IsFileEmpty(fileName); diff --git a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp index b343386b5..2a4f6ff64 100644 --- a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp +++ b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp @@ -25,15 +25,27 @@ THE SOFTWARE. 
#include #include"ResultDatabase.h" -#define check(msg, status) \ -if(status != hipSuccess){ \ - printf("%s failed.\n",#msg); \ - exit(1); \ +#define PRINT_PROGRESS 0 + +#define check(cmd) \ +{\ + hipError_t status = cmd;\ + if(status != hipSuccess){ \ + printf("error: '%s'(%d) from %s at %s:%d\n", \ + hipGetErrorString(status), status, #cmd,\ + __FILE__, __LINE__); \ + abort(); \ + }\ } #define LEN 1024*1024 -#define SIZE LEN * sizeof(float) -#define ITER 10120 + +#define NUM_GROUPS 1 +#define GROUP_SIZE 64 +#define TEST_ITERS 20 +#define DISPATCHES_PER_TEST 100 + +const unsigned p_tests = 0xfffffff; // HCC optimizes away fully NULL kernel calls, so run one that is nearly null: @@ -44,115 +56,112 @@ __global__ void NearlyNull(hipLaunchParm lp, float* Ad){ } +ResultDatabase resultDB; + + +void stopTest(hipEvent_t start, hipEvent_t stop, const char *msg, int iters) +{ + float mS = 0; + check(hipEventRecord(stop)); + check(hipDeviceSynchronize()); + check(hipEventElapsedTime(&mS, start, stop)); + resultDB.AddResult(std::string(msg), "", "uS", mS*1000/iters); + if (PRINT_PROGRESS & 0x1 ) { + std::cout<< msg <<"\t\t"< Date: Wed, 13 Sep 2017 12:57:37 +0530 Subject: [PATCH 127/700] Add more info for inline asm in hip kernel guide and cookbook readme --- samples/2_Cookbook/10_inline_asm/Readme.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/samples/2_Cookbook/10_inline_asm/Readme.md b/samples/2_Cookbook/10_inline_asm/Readme.md index 0e64fe9c6..7d0301bc7 100644 --- a/samples/2_Cookbook/10_inline_asm/Readme.md +++ b/samples/2_Cookbook/10_inline_asm/Readme.md @@ -27,10 +27,23 @@ We will be using the Simple Matrix Transpose application from the our very first ## asm() Assembler statement -We insert the GCN isa into the kernel using asm() Assembler statement. In the same sourcecode, we used for MatrixTranspose. We'll add the following: +In the same sourcecode, we used for MatrixTranspose. 
We'll add the following: ` asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x])); ` +GCN ISA inline assembly is supported. For example: + +``` +asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i])); +``` + +We insert the GCN ISA into the kernel using the `asm()` assembler statement. +The `volatile` keyword is used so that the optimizers do not change the number of volatile operations or change their order of execution relative to other volatile operations. +`v_mac_f32_e32` is the GCN instruction; for more information please refer to the [AMD GCN3 ISA architecture manual](http://gpuopen.com/compute-product/amd-gcn3-isa-architecture-manual/). +The index of each operand is given by `%` followed by its position in the ordered list of operands. +`"v"` is the (AMDGPU target-specific) constraint code for a 32-bit VGPR register; for more information please refer to the [Supported Constraint Code List for AMDGPU](https://llvm.org/docs/LangRef.html#supported-constraint-code-list). +Output constraints are specified by an `"="` prefix as shown above ("=v"). This indicates that the assembly will write to this operand, and the operand will then be made available as a return value of the asm expression. Input constraints do not have a prefix - just the constraint code. The constraint string of `"0"` says to use the assigned register for output as an input as well (it being the 0'th constraint). + ## How to build and run: Use the make command and execute it using ./exe Use hipcc to build the application, which is using hcc on AMD and nvcc on nvidia. 
From 9fef6f860c56bb138de0b262e33a543259f6cbf3 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 21 Oct 2017 07:47:32 -0500 Subject: [PATCH 128/700] Use 2X for bidir memory bandwidth calc --- samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index b3b0b3e4a..2d67b0444 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -594,7 +594,7 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB) " ms\n"; } - double speed = (double(sizeToBytes(thisSize)) / (1000*1000)) / t; + double speed = (double(sizeToBytes(2*thisSize)) / (1000*1000)) / t; char sizeStr[256]; sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); resultDB.AddResult(std::string("Bidir_Bandwidth") + "_" + mallocModeString(p_malloc_mode), sizeStr, "GB/sec", speed); From f19c685f88c489b1b0a4c1454e0d4ad9d6428d2b Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Mon, 23 Oct 2017 21:57:20 +0530 Subject: [PATCH 129/700] Use 2X for bidir p2p memory bandwidth calc --- samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 2d67b0444..4bd1bc016 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -922,7 +922,7 @@ void RunBenchmark_P2P_Bidir(ResultDatabase &resultDB) { std::cerr << "size " << sizeToString(thisSize) << " took " << t << " ms\n"; } - double speed = (double(sizeToBytes(thisSize) * p_beatsperiteration) / (1000*1000)) / t; + double speed = (double(sizeToBytes(2*thisSize) * p_beatsperiteration) / (1000*1000)) / t; char sizeStr[256]; if (p_beatsperiteration>1) { sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), 
p_beatsperiteration); From 626521007d054bbb82c811fa15c343dad10fcc67 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Tue, 24 Oct 2017 18:12:25 +0530 Subject: [PATCH 130/700] Example showing globals use with module APIs --- samples/0_Intro/module_api_global/Makefile | 17 ++ .../0_Intro/module_api_global/runKernel.cpp | 167 ++++++++++++++++++ .../0_Intro/module_api_global/vcpy_kernel.cpp | 41 +++++ 3 files changed, 225 insertions(+) create mode 100644 samples/0_Intro/module_api_global/Makefile create mode 100644 samples/0_Intro/module_api_global/runKernel.cpp create mode 100644 samples/0_Intro/module_api_global/vcpy_kernel.cpp diff --git a/samples/0_Intro/module_api_global/Makefile b/samples/0_Intro/module_api_global/Makefile new file mode 100644 index 000000000..97605020b --- /dev/null +++ b/samples/0_Intro/module_api_global/Makefile @@ -0,0 +1,17 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif +HIPCC=$(HIP_PATH)/bin/hipcc +HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler) + +all: vcpy_kernel.code runKernel.hip.out + +runKernel.hip.out: runKernel.cpp + $(HIPCC) $(HIPCC_FLAGS) $< -o $@ + +vcpy_kernel.code: vcpy_kernel.cpp + $(HIPCC) --genco $(GENCO_FLAGS) $^ -o $@ + +clean: + rm -f *.code *.out diff --git a/samples/0_Intro/module_api_global/runKernel.cpp b/samples/0_Intro/module_api_global/runKernel.cpp new file mode 100644 index 000000000..3f84720ed --- /dev/null +++ b/samples/0_Intro/module_api_global/runKernel.cpp @@ -0,0 +1,167 @@ +/* +Copyright (c) 2017 - present Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" +#include +#include +#include +#include + +#define LEN 64 +#define SIZE LEN*sizeof(float) + +#define fileName "vcpy_kernel.code" +float myDeviceGlobal; +float myDeviceGlobalArray[16]; +#define HIP_CHECK(cmd) \ +{\ + hipError_t status = cmd;\ + if(status != hipSuccess) {std::cout<<"error: #"<