add sgemm-regtiled-coarsened

illinois-impact · Feb 1, 2019 · 43d08f1 · 43d08f1
1 parent f878bff
commit 43d08f1
Show file tree

Hide file tree

Showing 11 changed files with 20,721 additions and 0 deletions.
diff --git a/labs/sgemm-regtiled-coarsened/CMakeLists.txt b/labs/sgemm-regtiled-coarsened/CMakeLists.txt
@@ -0,0 +1,60 @@
+
+# Build configuration for the `sgemm` lab executable (C++ host code plus CUDA
+# kernels).
+cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
+
+project(sgemm LANGUAGES CXX CUDA)
+
+# FindCUDA supplies the CUDA_LIBRARIES / CUDA_INCLUDE_DIRS variables that are
+# consumed further down in this file.
+find_package(CUDA REQUIRED)
+
+# Emit compile_commands.json for editors and static-analysis tools.
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_COLOR_MAKEFILE ON)
+# NOTE(review): VERBOSE_BUILD is not a variable CMake itself reads and nothing
+# in this file uses it; CMAKE_VERBOSE_MAKEFILE may have been intended -- confirm.
+set(VERBOSE_BUILD ON)
+set(CMAKE_CXX_STANDARD 11)
+
+# Fall back to an optimized build when the user did not choose a build type.
+# Multi-config generators (which define CMAKE_CONFIGURATION_TYPES) select the
+# configuration at build time, so they are left untouched.
+set(DEFAULT_BUILD_TYPE "Release")
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message(STATUS "No build type specified; defaulting to ${DEFAULT_BUILD_TYPE}")
+  set(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE STRING "Build type" FORCE)
+endif()
+
+# Let IDE generators group targets into folders.
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+# Defines the BUILD_TESTING option and sets up CTest support.
+include(CTest)
+
+# The lab executable. The header files are listed so that IDE generators show
+# them alongside the target; they are not compiled on their own.
+add_executable(sgemm
+  # CUDA translation units
+  eval.cu
+  template.cu
+  # headers
+  helper.hpp
+  template.hu
+  common/catch.hpp
+  common/clara.hpp
+  common/fmt.hpp
+  common/utils.hpp
+)
+
+
+# No GPU architecture flags are selected in this file; the FindCUDA helper
+# CUDA_SELECT_NVCC_ARCH_FLAGS is not invoked.
+
+# Require at least C++11 when compiling this target.
+target_compile_features(sgemm PUBLIC cxx_std_11)
+
+# Ask CMake to perform the CUDA device-link step on this target. Separable
+# compilation (CUDA_SEPARABLE_COMPILATION) is not enabled.
+set_property(TARGET sgemm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+
+# Link the CUDA runtime libraries located by find_package(CUDA). The dependency
+# is PRIVATE because nothing links against this executable.
+target_link_libraries(sgemm PRIVATE ${CUDA_LIBRARIES})
+
+# Header search paths for the sgemm target only. The directory-scoped
+# include_directories() command treats every argument as a path, so a target
+# name passed to it ends up as a bogus include directory; the target-scoped
+# command is used instead.
+target_include_directories(sgemm PRIVATE
+  "${PROJECT_SOURCE_DIR}/src"
+  ${CUDA_INCLUDE_DIRS}
+)
+
+if(APPLE)
+  # The static CUDA runtime loads the driver library (libcuda.dylib) at run
+  # time; put the CUDA toolchain's implicit link directories on the build-tree
+  # rpath so that it can be located.
+  set_target_properties(sgemm PROPERTIES
+    BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}"
+  )
+endif()
+
+# Turn on CTest support for this directory.
+enable_testing()
diff --git a/labs/sgemm-regtiled-coarsened/README.md b/labs/sgemm-regtiled-coarsened/README.md
@@ -0,0 +1,43 @@
+# 7-point Stencil with Thread-coarsening and Register Tiling
+
+## Objective
+The purpose of this lab is to practice the thread coarsening and register tiling optimization techniques using 7-point stencil as an example.
+
+## Procedure
+1. Edit the `kernel` function in `template.cu` to implement a 7-point stencil (refer to the [lecture slides](https://bw-course.ncsa.illinois.edu/mod/resource/view.php?id=574)) with combined register tiling and x-y shared memory tiling, and thread coarsening along the z-dimension.
+
+    ```
+    out(i, j, k) =  C0 *in(i, j, k)
+                  + C1 * (  in(i-1, j, k)
+                          + in(i, j-1, k)
+                          + in(i, j, k-1)
+                          + in(i+1, j, k)
+                          + in(i, j+1, k)
+                          + in(i, j, k+1) )
+    ```
+
+2. Edit the `launchStencil` function in `template.cu` to launch the kernel you implemented. The function should launch a 2D CUDA grid with 2D blocks, where each thread is responsible for computing an entire column in the z-dimension.
+
+    `A0` and `Anext` in the code template correspond to `in` and `out`, respectively. The output dimension of the 7-point stencil computation is one smaller than the input dimension on both sides for all boundaries (e.g., output dimension is 6x6x6 for an input of 8x8x8). Only those "internal" elements need to be calculated.
+
+3. Test your code using rai
+
+    `rai -p <path to your stencil folder>`
+
+    Be sure to add any additional flags that are required by your course (`--queue` or others).
+
+4. Submit your code on rai
+
+## Other notes
+
+To simplify the kernel code, you do not need to support input data with z-extent less than 2.
+
+The data is stored in column-major order. For example, you might consider using a macro to simplify your data access indexing:
+
+```c++
+__global__ void kernel(...) {
+    #define A0(i, j, k) A0[((k)*ny + (j))*nx + (i)]
+    // your kernel code
+    #undef A0
+}
+```