-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #24 from haruhi55/data_transfer_s2f
feat(unittest): Implement basic unittest for transferring 2D data tiles between global and shared memory
- Loading branch information
Showing
33 changed files
with
485 additions
and
100 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,4 +4,3 @@ | |
[submodule "3rd-party/googletest"] | ||
path = 3rd-party/googletest | ||
url = git@github.com:google/googletest.git | ||
branch = main |
Submodule googletest
updated
73 files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
[TBD] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
#pragma once | ||
|
||
#include "cell/traits/base.hpp" | ||
#include "layout.hpp" | ||
|
||
namespace tiledcuda::cell::traits { | ||
|
||
/// @brief Configurations for transfering a single 2D data tile from global | ||
/// memory to shared memory, which include configurating the layout of data tile | ||
/// and thread tile. | ||
/// @tparam Element_: the element type | ||
/// @tparam kThreads: number of threads in a thread block | ||
template <typename Element_, const int kRows_, const int kCols_, | ||
const int kShmRows_, const int kShmCols_, const int kThreads, | ||
typename Base = TraitsBase<Element_>> | ||
struct G2S2DCopyTraits : public Base { | ||
using Element = Element_; | ||
|
||
static constexpr int kRows = kRows_; | ||
static constexpr int kCols = kCols_; | ||
|
||
static constexpr int kShmRows = kShmRows_; | ||
static constexpr int kShmCols = kShmCols_; | ||
|
||
using SrcLayout = RowMajor<kRows, kCols, kCols>; | ||
|
||
// To avoid bank conflicts, the shared memory requires a swizzled layout | ||
static constexpr int kSwizzleMode = kShmCols % 32 ? 1 : 0; | ||
using Swizzled = | ||
SwizzledRowMajor<Element, kShmRows, kShmCols, kSwizzleMode>; | ||
using DstLayout = typename Swizzled::SmemLayout; | ||
|
||
// threads in a thread block are laid out as a 2D tile | ||
// that has a shape of kThreadsRows x kThreadsCols. | ||
static constexpr int kThreadsCols = kShmCols / Base::kNumPerAccess; | ||
static constexpr int kThreadsRows = kThreads / kThreadsCols; | ||
using ThreadLayout = RowMajor<kThreadsRows, kThreadsCols, kThreadsCols>; | ||
|
||
using ValueLayout = Layout<Shape<_1, Int<Base::kNumPerAccess>>>; | ||
|
||
#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED) | ||
using CopyInst = | ||
Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>, Element>; | ||
#else | ||
using CopyInst = Copy_Atom<DefaultCopy, Element>; | ||
#endif | ||
|
||
using TiledCopy = | ||
decltype(make_tiled_copy(CopyInst{}, ThreadLayout{}, ValueLayout{})); | ||
}; | ||
|
||
/// @brief Configurations for transfering a single 2D data tile from shared | ||
/// memory to global memory, which include configurating the layout of data tile | ||
/// and thread tile. | ||
/// @tparam Element_: the element type | ||
/// @tparam kThreads: number of threads in a thread block | ||
template <typename Element_, const int kRows_, const int kCols_, | ||
const int kShmRows_, const int kShmCols_, const int kThreads, | ||
typename Base = TraitsBase<Element_>> | ||
struct S2G2DCopyTraits : public Base { | ||
using Element = Element_; | ||
|
||
static constexpr int kRows = kRows_; | ||
static constexpr int kCols = kCols_; | ||
|
||
static constexpr int kShmRows = kShmRows_; | ||
static constexpr int kShmCols = kShmCols_; | ||
|
||
static constexpr int kSwizzleMode = kShmCols % 32 ? 1 : 0; | ||
using Swizzled = | ||
SwizzledRowMajor<Element, kShmRows, kShmCols, kSwizzleMode>; | ||
using SrcLayout = typename Swizzled::SmemLayout; | ||
|
||
// To avoid bank conflicts, the shared memory requires a swizzled layout | ||
using DstLayout = RowMajor<kRows, kCols, kCols>; | ||
|
||
// threads in a thread block are laid out as a 2D tile | ||
// that has a shape of kThreadsRows x kThreadsCols. | ||
static constexpr int kThreadsCols = kShmCols / Base::kNumPerAccess; | ||
static constexpr int kThreadsRows = kThreads / kThreadsCols; | ||
using ThreadLayout = RowMajor<kThreadsRows, kThreadsCols, kThreadsCols>; | ||
|
||
using ValueLayout = Layout<Shape<_1, Int<Base::kNumPerAccess>>>; | ||
|
||
// transfer data from global memory to shared memory has cp.async, | ||
// while transfer data from shared memory to global memory does not have. | ||
// for the latter case, the copy instruction should be the default one. | ||
using TiledCopy = decltype(make_tiled_copy( | ||
Copy_Atom<DefaultCopy, Element>{}, ThreadLayout{}, ValueLayout{})); | ||
}; | ||
|
||
} // namespace tiledcuda::cell::traits |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
#pragma once | ||
|
||
// Function qualifier macros.
//
// They are keyed on __CUDACC__ (defined whenever nvcc compiles a CUDA source
// file, in both the host and the device pass) rather than __CUDA_ARCH__
// (defined only during the device pass). Keying on __CUDA_ARCH__ would give
// the same function different qualifiers in the two passes.
#if defined(__CUDACC__)
    #define HOST_DEVICE __forceinline__ __host__ __device__
    #define DEVICE __forceinline__ __device__
    #define HOST __forceinline__ __host__
#else
    // Not compiled as CUDA: fall back to plain inline functions.
    #define HOST_DEVICE inline
    #define DEVICE inline
    #define HOST inline
#endif

// Defined only when compiling device code for compute capability >= 8.0.
// Code that checks this macro can select cp.async-based copy instructions.
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
    #define CUTE_ARCH_CP_ASYNC_SM80_ENABLED
#endif
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,7 @@ | ||
#pragma once | ||
|
||
#include "config.hpp" | ||
|
||
#include <cublas_v2.h> | ||
#include <cuda.h> | ||
#include <cuda_runtime.h> | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.