Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EXSWHTEC-273 - Implement tests for warp shfl_xor and shfl functions #194

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
cea96af
SWDEV-355313 - Move catch tests and samples
gargrahul Oct 26, 2022
909e7e4
SWDEV-355313 - Add README
gargrahul Nov 7, 2022
094b9af
SWDEV-355313 - Update amd-staging branch
gargrahul Nov 28, 2022
9daa6d0
SWDEV-355313 - Update README
gargrahul Dec 2, 2022
c49043e
SWDEV-355313 - Update latest code
gargrahul Dec 6, 2022
edf514a
Migrate basic Cooperative Groups tests and integrate to catch
nives-vukovic Jan 4, 2023
5610a48
Refactor basic Cooperative Groups tests
nives-vukovic Jan 10, 2023
c455740
Rename tiled partition related files and fix minor bug
nives-vukovic Jan 11, 2023
82fc666
Add LaunchCooperativeKernal and LaunchCooperativeKernelMultiDevice tests
nives-vukovic Jan 11, 2023
ef4fa46
Refactor hipCGThreadBlockTileType to use common function
nives-vukovic Jan 12, 2023
9c0f995
Merge remote-tracking branch 'origin/develop' into hipCoopGroups_wip
nives-vukovic Jan 12, 2023
b28aa60
Fix updated file not added during merge
nives-vukovic Jan 12, 2023
cc32117
Add coalesced_group type tests
nives-vukovic Jan 12, 2023
a177e26
Add coalesced_group shuffle_up and shuffle_down tests
nives-vukovic Jan 12, 2023
cdeadcf
Add coalesced_group shuffle tests - test fails
nives-vukovic Jan 13, 2023
7b84ac1
Merge remote-tracking branch 'upstream/develop' into cg_base_dino
music-dino Feb 1, 2023
d414bce
Implement common code for cooperative group tests
music-dino Feb 1, 2023
609fae5
Fixed compilation errror in cooperative_groups_common.hh
music-dino Feb 1, 2023
8cfb58b
Implement busy wait device function
music-dino Feb 1, 2023
5cf02ca
Add thread and block dimensions generators
music-dino Feb 2, 2023
fc11bf9
Move cpu_grid.h and supporting functions to catch/include
nives-vukovic Mar 1, 2023
18f2450
Use warp_size from properties in grid/block dims generators
nives-vukovic Mar 1, 2023
65a1e57
Fix condition for warp size 32 on AMD
nives-vukovic Mar 1, 2023
c01665f
Fix cpu_grid.h for warp function tests
nives-vukovic Mar 2, 2023
e41e642
Add missing include into cpu_grid.h
nives-vukovic Mar 2, 2023
e0e35e9
Merge remote-tracking branch 'origin/develop' into warp_common
nives-vukovic Mar 2, 2023
d4291ae
Add common functions and definitions for warp functions
nives-vukovic Mar 2, 2023
1c154be
Remove unnecessary memset
nives-vukovic Mar 2, 2023
2aed190
Cleanup leftover cooperative groups files
nives-vukovic Mar 2, 2023
8d2fd57
EXSWHTEC-273 - Implement tests for warp shfl_xor and shfl functions
nives-vukovic Mar 3, 2023
50f71f6
EXSWHTEC-273 - Fix doxygen comments
nives-vukovic Mar 3, 2023
583e30a
Add memory reset after allocation
nives-vukovic Mar 3, 2023
5a1b649
Merge branch 'warp_common' into warp_shfl_xor_shfl_tests
nives-vukovic Mar 3, 2023
d409da1
EXSWHTEC-273 - Disable test that fails on AMD
nives-vukovic Mar 6, 2023
269fa2a
Merge branch 'develop' into warp_shfl_xor_shfl_tests
mangupta Mar 10, 2023
729962c
EXSWHTEC-273 - Fix doxygen comments
milos-mozetic Mar 23, 2023
792358c
Expand Warp Test to include random and predefined test version
nives-vukovic May 3, 2023
fb1615d
Add comments for block and grid dimensions generate functions
nives-vukovic May 3, 2023
a9a3156
Merge branch 'warp_common' into warp_shfl_xor_shfl_tests
nives-vukovic May 3, 2023
7ff05a6
EXSWHTEC-273 - Modify warp shfl xor and shfl tests according to commo…
nives-vukovic May 4, 2023
d278e59
Merge remote-tracking branch 'origin/develop' into warp_shfl_xor_shfl…
nives-vukovic Jun 26, 2023
daa891e
Merge branch 'develop' into warp_shfl_xor_shfl_tests
rakesroy Jul 11, 2023
22eb41a
Reduce common code for warp tests
nives-vukovic Jul 13, 2023
f604016
Merge branch 'warp_common' into warp_shfl_xor_shfl_tests
nives-vukovic Jul 13, 2023
837879d
EXSWHTEC-273 - Create separate warp shfl common code
nives-vukovic Jul 13, 2023
68fe614
Merge remote-tracking branch 'upstream/develop' into warp_shfl_xor_sh…
mirza-halilcevic Sep 29, 2023
544693a
Merge remote-tracking branch 'origin/develop' into warp_shfl_xor_shfl…
nives-vukovic Nov 17, 2023
6faa880
Merge branch 'develop' into warp_shfl_xor_shfl_tests
rakesroy Nov 17, 2023
584b13c
Merge branch 'develop' into warp_shfl_xor_shfl_tests
rakesroy Nov 18, 2023
2ec14e6
Merge remote-tracking branch 'origin/develop' into warp_shfl_xor_shfl…
nives-vukovic Dec 8, 2023
24cd322
EXSWHTEC-273 - Set correct test_defgroups
nives-vukovic Dec 8, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions catch/include/cpu_grid.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ struct CPUGrid {
unsigned int thread_count_;
};

/* Generate dimensions for 1D, 2D and 3D blocks of threads */
inline dim3 GenerateThreadDimensions() {
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, 0));
Expand All @@ -99,6 +100,7 @@ inline dim3 GenerateThreadDimensions() {
dim3(props.warpSize + 1, 3, 3));
}

/* Generate dimensions for 1D, 2D and 3D grids of blocks */
inline dim3 GenerateBlockDimensions() {
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, 0));
Expand All @@ -116,6 +118,7 @@ inline dim3 GenerateBlockDimensions() {
dim3(5, 5, 5));
}

/* Generate dimensions for 1D, 2D and 3D blocks of threads - reduced set */
inline dim3 GenerateThreadDimensionsForShuffle() {
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, 0));
Expand All @@ -136,6 +139,7 @@ inline dim3 GenerateThreadDimensionsForShuffle() {
dim3(props.warpSize + 1, 3, 3));
}

/* Generate dimensions for 1D, 2D and 3D grids of blocks - reduced set */
inline dim3 GenerateBlockDimensionsForShuffle() {
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, 0));
Expand Down
1 change: 1 addition & 0 deletions catch/unit/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ add_subdirectory(compiler)
add_subdirectory(errorHandling)
add_subdirectory(cooperativeGrps)
add_subdirectory(context)
add_subdirectory(warp)
add_subdirectory(dynamicLoading)
add_subdirectory(g++)
add_subdirectory(module)
Expand Down
9 changes: 9 additions & 0 deletions catch/unit/warp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Common Tests - Test independent of all platforms
set(TEST_SRC
warp_shfl_xor.cc
warp_shfl.cc
)

hip_add_exe_to_target(NAME WarpTest
TEST_SRC ${TEST_SRC}
TEST_TARGET_NAME build_tests)
84 changes: 84 additions & 0 deletions catch/unit/warp/warp_common.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#pragma once

#include <hip_test_common.hh>
#include <hip/hip_cooperative_groups.h>

static __device__ bool deactivate_thread(const uint64_t* const active_masks) {
const auto warp =
cooperative_groups::tiled_partition(cooperative_groups::this_thread_block(), warpSize);
const auto block = cooperative_groups::this_thread_block();
const auto warps_per_block = (block.size() + warpSize - 1) / warpSize;
const auto block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
const auto idx = block_rank * warps_per_block + block.thread_rank() / warpSize;

return !(active_masks[idx] & (static_cast<uint64_t>(1) << warp.thread_rank()));
}

static inline std::mt19937& GetRandomGenerator() {
static std::mt19937 mt(std::random_device{}());
return mt;
}

template <typename T> static inline T GenerateRandomInteger(const T min, const T max) {
std::uniform_int_distribution<T> dist(min, max);
return dist(GetRandomGenerator());
}

template <typename T> static inline T GenerateRandomReal(const T min, const T max) {
std::uniform_real_distribution<T> dist(min, max);
return dist(GetRandomGenerator());
}

inline int generate_width(int warp_size) {
int exponent = 0;
while (warp_size >>= 1) {
++exponent;
}

return GENERATE_COPY(map([](int e) { return 1 << e; }, range(1, exponent + 1)));
}

inline uint64_t get_active_mask(unsigned int warp_id, unsigned int warp_size) {
uint64_t active_mask = 0;
switch (warp_id % 5) {
case 0: // even threads in the warp
active_mask = 0xAAAAAAAAAAAAAAAA;
break;
case 1: // odd threads in the warp
active_mask = 0x5555555555555555;
break;
case 2: // first half of the warp
for (int i = 0; i < warp_size / 2; i++) {
active_mask = active_mask | (static_cast<uint64_t>(1) << i);
}
break;
case 3: // second half of the warp
for (int i = warp_size / 2; i < warp_size; i++) {
active_mask = active_mask | (static_cast<uint64_t>(1) << i);
}
break;
case 4: // all threads
active_mask = 0xFFFFFFFFFFFFFFFF;
break;
}
return active_mask;
}
121 changes: 121 additions & 0 deletions catch/unit/warp/warp_shfl.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#include "warp_shfl_common.hh"

#include <bitset>

/**
* @addtogroup shfl shfl
* @{
* @ingroup DeviceLanguageTest
* `T __shfl(T var, int src_lane, int width = warpSize)` -
* Contains unit test for warp shfl function
*/

namespace cg = cooperative_groups;

template <typename T>
__global__ void shfl(T* const out, const T* const in, const uint64_t* const active_masks,
const uint8_t* const src_lanes, const int width) {
if (deactivate_thread(active_masks)) {
return;
}
const auto grid = cg::this_grid();
const auto block = cg::this_thread_block();
T var = in[grid.thread_rank()];
out[grid.thread_rank()] = __shfl(var, src_lanes[block.thread_rank() % width], width);
}

template <typename T> class WarpShfl : public WarpShflTest<WarpShfl<T>, T> {
public:
void launch_kernel(T* const arr_dev, T* const input_dev, const uint64_t* const active_masks) {
width_ = generate_width(this->warp_size_);
INFO("Width: " << width_);
const auto alloc_size = width_ * sizeof(uint8_t);
LinearAllocGuard<uint8_t> src_lanes_dev(LinearAllocs::hipMalloc, alloc_size);
src_lanes_.resize(width_);
std::generate(src_lanes_.begin(), src_lanes_.end(),
[this] { return GenerateRandomInteger(0, static_cast<int>(2 * width_)); });

HIP_CHECK(hipMemcpy(src_lanes_dev.ptr(), src_lanes_.data(), alloc_size, hipMemcpyHostToDevice));
shfl<<<this->grid_.grid_dim_, this->grid_.block_dim_>>>(arr_dev, input_dev, active_masks,
src_lanes_dev.ptr(), width_);
}

void validate(const T* const arr, const T* const input) {
ArrayAllOf(arr, this->grid_.thread_count_, [this, &input](unsigned int i) -> std::optional<T> {
const auto rank_in_block = this->grid_.thread_rank_in_block(i).value();
const auto rank_in_warp = rank_in_block % this->warp_size_;
const auto rank_in_partition = rank_in_block % width_;
const int src_lane = src_lanes_[rank_in_partition] % width_;
const int src_offset = src_lane - rank_in_partition;

const auto mask_idx = this->warps_in_block_ * (i / this->grid_.threads_in_block_count_) +
rank_in_block / this->warp_size_;
const std::bitset<sizeof(uint64_t) * 8> active_mask(this->active_masks_[mask_idx]);

if (!active_mask.test(rank_in_warp) || (!active_mask.test((rank_in_warp + src_offset))) ||
(rank_in_block + src_offset >= this->grid_.threads_in_block_count_)) {
return std::nullopt;
}

return input[i + src_offset];
});
};

private:
std::vector<uint8_t> src_lanes_;
int width_;
};

/**
* Test Description
* ------------------------
* - Validates the warp shuffle behavior for all valid width sizes {2, 4, 8, 16, 32,
* 64(if supported)} for generated shuffle target lanes. The threads are deactivated based on the
* passed active mask. The test is run for all overloads of shfl.
* Test source
* ------------------------
* - unit/warp/warp_shfl.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.2
* - Device supports warp shuffle
*/
TEMPLATE_TEST_CASE("Unit_Warp_Shfl_Positive_Basic", "", int, unsigned int, long, unsigned long,
long long, unsigned long long, float, double) {
int device;
hipDeviceProp_t device_properties;
HIP_CHECK(hipGetDevice(&device));
HIP_CHECK(hipGetDeviceProperties(&device_properties, device));

if (!device_properties.arch.hasWarpShuffle) {
HipTest::HIP_SKIP_TEST("Device doesn't support Warp Shuffle!");
return;
}

SECTION("Shfl with specified active mask and input values") {
WarpShfl<TestType>().run(false);
}

SECTION("Shfl with random active mask and input values") {
WarpShfl<TestType>().run(true);
}
}
114 changes: 114 additions & 0 deletions catch/unit/warp/warp_shfl_common.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#pragma once

#include "warp_common.hh"

#include <cpu_grid.h>
#include <resource_guards.hh>
#include <utils.hh>

template <typename Derived, typename T> class WarpShflTest {
public:
WarpShflTest() : warp_size_{get_warp_size()} {}

void run(bool random = false) {
const auto blocks = GenerateBlockDimensionsForShuffle();
INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
const auto threads = GenerateThreadDimensionsForShuffle();
INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
grid_ = CPUGrid(blocks, threads);

const auto alloc_size = grid_.thread_count_ * sizeof(T);
LinearAllocGuard<T> input_dev(LinearAllocs::hipMalloc, alloc_size);
LinearAllocGuard<T> input(LinearAllocs::hipHostMalloc, alloc_size);
LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
HIP_CHECK(hipMemset(arr_dev.ptr(), 0, alloc_size));

warps_in_block_ = (grid_.threads_in_block_count_ + warp_size_ - 1) / warp_size_;
const auto warps_in_grid = warps_in_block_ * grid_.block_count_;
LinearAllocGuard<uint64_t> active_masks_dev(LinearAllocs::hipMalloc,
warps_in_grid * sizeof(uint64_t));
active_masks_.resize(warps_in_grid);

generate_input(input.ptr(), random);

HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks_.data(),
warps_in_grid * sizeof(uint64_t), hipMemcpyHostToDevice));
HIP_CHECK(hipMemcpy(input_dev.ptr(), input.ptr(), alloc_size, hipMemcpyHostToDevice));
cast_to_derived().launch_kernel(arr_dev.ptr(), input_dev.ptr(), active_masks_dev.ptr());
HIP_CHECK(hipGetLastError());
HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
HIP_CHECK(hipDeviceSynchronize());

cast_to_derived().validate(arr.ptr(), input.ptr());
}

private:
int get_warp_size() const {
int current_dev = -1;
HIP_CHECK(hipGetDevice(&current_dev));
int warp_size = 0u;
HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
return warp_size;
}

void generate_input(T* input, bool random) {
if (random) {
std::generate(active_masks_.begin(), active_masks_.end(), [] {
return GenerateRandomInteger(0ul, std::numeric_limits<uint64_t>().max());
});

if constexpr (std::is_same_v<float, T> || std::is_same_v<double, T>) {
std::generate_n(input, grid_.thread_count_, [] {
return static_cast<T>(
GenerateRandomReal(std::numeric_limits<T>().min(), std::numeric_limits<T>().max()));
});
} else {
std::generate_n(input, grid_.thread_count_, [] {
return static_cast<T>(GenerateRandomInteger(std::numeric_limits<T>().min(),
std::numeric_limits<T>().max()));
});
}
} else {
unsigned long long int i = 0;
std::generate(active_masks_.begin(), active_masks_.end(),
[this, &i]() { return get_active_mask(i++, warp_size_); });

i = 0;
std::generate_n(input, grid_.thread_count_, [&i]() {
if (static_cast<T>(i) > std::numeric_limits<T>().max())
i = 0;
else
i++;
return static_cast<T>(i);
});
}
}

Derived& cast_to_derived() { return reinterpret_cast<Derived&>(*this); }

protected:
const int warp_size_;
CPUGrid grid_;
unsigned int warps_in_block_;
std::vector<uint64_t> active_masks_;
};
Loading