-
Notifications
You must be signed in to change notification settings - Fork 31
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
20,721 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
|
||
# Refuse to configure with any CMake older than 3.8. | ||
cmake_minimum_required(VERSION 3.8 FATAL_ERROR) | ||
|
||
# Declare the project and enable both the C++ and the CUDA language. | ||
project(sgemm LANGUAGES CXX CUDA) | ||
|
||
|
||
# Locate the CUDA toolkit; configuration stops if it cannot be found. | ||
# The CUDA_LIBRARIES and CUDA_INCLUDE_DIRS variables used further down are | ||
# presumably populated by this module. | ||
find_package(CUDA REQUIRED) | ||
|
||
# Write compile_commands.json for tooling and colorize Makefile output. | ||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) | ||
set(CMAKE_COLOR_MAKEFILE ON) | ||
# NOTE(review): VERBOSE_BUILD is not read anywhere in this file. If verbose | ||
# build output was intended, CMAKE_VERBOSE_MAKEFILE is the built-in switch. | ||
set(VERBOSE_BUILD ON) | ||
# Project-wide default C++ standard for targets created after this point. | ||
set(CMAKE_CXX_STANDARD 11) | ||
# Apply the default build type only when the user did not choose one and the | ||
# generator is single-config; previously DEFAULT_BUILD_TYPE was set but never | ||
# used, so the build type stayed empty. | ||
set(DEFAULT_BUILD_TYPE "Release") | ||
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) | ||
  set(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE STRING "Choose the type of build." FORCE) | ||
endif() | ||
|
||
|
||
# Let IDE generators organize targets into folders. | ||
set_property(GLOBAL PROPERTY USE_FOLDERS ON) | ||
|
||
||
# Load the CTest module so the project can register and run tests. | ||
include(CTest) | ||
|
||
# The project's only target: an executable built from the two CUDA sources. | ||
# The project and bundled headers are listed alongside them. | ||
add_executable(sgemm | ||
eval.cu | ||
helper.hpp | ||
template.cu | ||
template.hu | ||
common/catch.hpp | ||
common/fmt.hpp | ||
common/clara.hpp | ||
common/utils.hpp | ||
) | ||
|
||
|
||
# CUDA_SELECT_NVCC_ARCH_FLAGS(CUDA_ARCH_FLAGS Auto) | ||
|
||
# Require at least C++11 for sgemm; PUBLIC also passes the requirement on to | ||
# anything that links against this target. | ||
target_compile_features(sgemm PUBLIC cxx_std_11) | ||
|
||
# Turn on CUDA device-symbol resolution for the sgemm executable. | ||
# Separable compilation (-dc) is NOT requested here: that property is | ||
# commented out below, so whatever default is in effect applies. | ||
# NOTE(review): the earlier wording mentioned a "particle library", which | ||
# this project does not define; presumably copied from a sample. | ||
set_target_properties( sgemm PROPERTIES | ||
# CUDA_SEPARABLE_COMPILATION ON | ||
CUDA_RESOLVE_DEVICE_SYMBOLS ON | ||
) | ||
|
||
# Link the CUDA runtime libraries into sgemm. PRIVATE is used because sgemm | ||
# is an executable and nothing links against it; the explicit keyword also | ||
# avoids the legacy keyword-less signature. | ||
target_link_libraries(sgemm PRIVATE ${CUDA_LIBRARIES}) | ||
|
||
|
||
# Header search paths for the sgemm target only. The previous directory-scoped | ||
# include_directories() call treated "sgemm" as a path rather than a target and | ||
# applied the remaining paths to every target in this directory. | ||
target_include_directories(sgemm PRIVATE | ||
"${PROJECT_SOURCE_DIR}/src" | ||
${CUDA_INCLUDE_DIRS} | ||
) | ||
|
||
# macOS only: extend the run-time search path of the sgemm binary. | ||
if(APPLE) | ||
# We need to add the default path to the driver (libcuda.dylib) as an rpath, | ||
# so that the static cuda runtime can find it at runtime. | ||
# The directories come from the CUDA compiler's implicit link directories; | ||
# BUILD_RPATH applies to the binary in the build tree. | ||
set_property(TARGET sgemm PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) | ||
endif() | ||
|
||
|
||
|
||
# Enable test registration for this directory and below. | ||
# NOTE(review): include(CTest) earlier in this file normally enables testing | ||
# already; confirm whether this explicit call is still needed. | ||
enable_testing() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# 7-point Stencil with Thread-coarsening and Register Tiling | ||
|
||
## Objective | ||
The purpose of this lab is to practice the thread coarsening and register tiling optimization techniques, using a 7-point stencil as an example. | ||
|
||
## Procedure | ||
1. Edit the `kernel` function in `template.cu` to implement a 7-point stencil (refer to the [lecture slides](https://bw-course.ncsa.illinois.edu/mod/resource/view.php?id=574)) with combined register tiling and x-y shared memory tiling, and thread coarsening along the z-dimension. | ||
|
||
``` | ||
out(i, j, k) = C0 * in(i, j, k) | ||
+ C1 * ( in(i-1, j, k) | ||
+ in(i, j-1, k) | ||
+ in(i, j, k-1) | ||
+ in(i+1, j, k) | ||
+ in(i, j+1, k) | ||
+ in(i, j, k+1) ) | ||
``` | ||
2. Edit the `launchStencil` function in `template.cu` to launch the kernel you implemented. The function should launch a 2D CUDA grid of 2D blocks, where each thread is responsible for computing an entire column in the z-dimension. | ||
`A0` and `Anext` in the code template correspond to `in` and `out`, respectively. The output of the 7-point stencil computation is one element smaller than the input on each side along every dimension (e.g., the output is 6x6x6 for an input of 8x8x8). Only these "internal" elements need to be calculated. | ||
3. Test your code using rai | ||
`rai -p <path to your stencil folder>` | ||
Be sure to add any additional flags that are required by your course (`--queue` or others). | ||
4. Submit your code on rai | ||
## Other notes | ||
To simplify the kernel code, you do not need to support input data with z-extent less than 2. | ||
The data is stored in column-major order. For example, you might consider using a macro to simplify your data access indexing: | ||
```c++ | ||
__global__ void kernel(...) { | ||
#define A0(i, j, k) A0[((k)*ny + (j))*nx + (i)] | ||
// your kernel code | ||
#undef A0 | ||
} | ||
``` |
Oops, something went wrong.