ROCm · nives-vukovic · Oct 26, 2022 · Nov 7, 2022 · Nov 28, 2022 · Dec 2, 2022
@@ -78,6 +78,7 @@ struct CPUGrid {
   unsigned int thread_count_;
 };
 
+/* Generate dimensions for 1D, 2D and 3D blocks of threads */
 inline dim3 GenerateThreadDimensions() {
   hipDeviceProp_t props;
   HIP_CHECK(hipGetDeviceProperties(&props, 0));
@@ -99,6 +100,7 @@ inline dim3 GenerateThreadDimensions() {
       dim3(props.warpSize + 1, 3, 3));
 }
 
+/* Generate dimensions for 1D, 2D and 3D grids of blocks */
 inline dim3 GenerateBlockDimensions() {
   hipDeviceProp_t props;
   HIP_CHECK(hipGetDeviceProperties(&props, 0));
@@ -116,6 +118,7 @@ inline dim3 GenerateBlockDimensions() {
                        dim3(5, 5, 5));
 }
 
+/* Generate dimensions for 1D, 2D and 3D blocks of threads - reduced set */
 inline dim3 GenerateThreadDimensionsForShuffle() {
   hipDeviceProp_t props;
   HIP_CHECK(hipGetDeviceProperties(&props, 0));
@@ -136,6 +139,7 @@ inline dim3 GenerateThreadDimensionsForShuffle() {
       dim3(props.warpSize + 1, 3, 3));
 }
 
+/* Generate dimensions for 1D, 2D and 3D grids of blocks - reduced set */
 inline dim3 GenerateBlockDimensionsForShuffle() {
   hipDeviceProp_t props;
   HIP_CHECK(hipGetDeviceProperties(&props, 0));

@@ -36,6 +36,7 @@ add_subdirectory(compiler)
 add_subdirectory(errorHandling)
 add_subdirectory(cooperativeGrps)
 add_subdirectory(context)
+add_subdirectory(warp)
 add_subdirectory(dynamicLoading)
 add_subdirectory(g++)
 add_subdirectory(module)

@@ -0,0 +1,9 @@
+# Common Tests - Test independent of all platforms
+set(TEST_SRC
+  warp_shfl_xor.cc
+  warp_shfl.cc
+)
+
+hip_add_exe_to_target(NAME WarpTest
+                      TEST_SRC ${TEST_SRC}
+                      TEST_TARGET_NAME build_tests)
@@ -0,0 +1,84 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip_test_common.hh>
+#include <hip/hip_cooperative_groups.h>
+
+static __device__ bool deactivate_thread(const uint64_t* const active_masks) {
+  const auto warp =
+      cooperative_groups::tiled_partition(cooperative_groups::this_thread_block(), warpSize);
+  const auto block = cooperative_groups::this_thread_block();
+  const auto warps_per_block = (block.size() + warpSize - 1) / warpSize;
+  const auto block_rank = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
+  const auto idx = block_rank * warps_per_block + block.thread_rank() / warpSize;
+
+  return !(active_masks[idx] & (static_cast<uint64_t>(1) << warp.thread_rank()));
+}
+
+static inline std::mt19937& GetRandomGenerator() {
+  static std::mt19937 mt(std::random_device{}());
+  return mt;
+}
+
+template <typename T> static inline T GenerateRandomInteger(const T min, const T max) {
+  std::uniform_int_distribution<T> dist(min, max);
+  return dist(GetRandomGenerator());
+}
+
+template <typename T> static inline T GenerateRandomReal(const T min, const T max) {
+  std::uniform_real_distribution<T> dist(min, max);
+  return dist(GetRandomGenerator());
+}
+
+inline int generate_width(int warp_size) {
+  int exponent = 0;
+  while (warp_size >>= 1) {
+    ++exponent;
+  }
+
+  return GENERATE_COPY(map([](int e) { return 1 << e; }, range(1, exponent + 1)));
+}
+
+inline uint64_t get_active_mask(unsigned int warp_id, unsigned int warp_size) {
+  uint64_t active_mask = 0;
+  switch (warp_id % 5) {
+    case 0:  // even threads in the warp
+      active_mask = 0xAAAAAAAAAAAAAAAA;
+      break;
+    case 1:  // odd threads in the warp
+      active_mask = 0x5555555555555555;
+      break;
+    case 2:  // first half of the warp
+      for (int i = 0; i < warp_size / 2; i++) {
+        active_mask = active_mask | (static_cast<uint64_t>(1) << i);
+      }
+      break;
+    case 3:  // second half of the warp
+      for (int i = warp_size / 2; i < warp_size; i++) {
+        active_mask = active_mask | (static_cast<uint64_t>(1) << i);
+      }
+      break;
+    case 4:  // all threads
+      active_mask = 0xFFFFFFFFFFFFFFFF;
+      break;
+  }
+  return active_mask;
+}
@@ -0,0 +1,121 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "warp_shfl_common.hh"
+
+#include <bitset>
+
+/**
+ * @addtogroup shfl shfl
+ * @{
+ * @ingroup DeviceLanguageTest
+ * `T __shfl(T var, int src_lane, int width = warpSize)` -
+ * Contains unit test for warp shfl function
+ */
+
+namespace cg = cooperative_groups;
+
+template <typename T>
+__global__ void shfl(T* const out, const T* const in, const uint64_t* const active_masks,
+                     const uint8_t* const src_lanes, const int width) {
+  if (deactivate_thread(active_masks)) {
+    return;
+  }
+  const auto grid = cg::this_grid();
+  const auto block = cg::this_thread_block();
+  T var = in[grid.thread_rank()];
+  out[grid.thread_rank()] = __shfl(var, src_lanes[block.thread_rank() % width], width);
+}
+
+template <typename T> class WarpShfl : public WarpShflTest<WarpShfl<T>, T> {
+ public:
+  void launch_kernel(T* const arr_dev, T* const input_dev, const uint64_t* const active_masks) {
+    width_ = generate_width(this->warp_size_);
+    INFO("Width: " << width_);
+    const auto alloc_size = width_ * sizeof(uint8_t);
+    LinearAllocGuard<uint8_t> src_lanes_dev(LinearAllocs::hipMalloc, alloc_size);
+    src_lanes_.resize(width_);
+    std::generate(src_lanes_.begin(), src_lanes_.end(),
+                  [this] { return GenerateRandomInteger(0, static_cast<int>(2 * width_)); });
+
+    HIP_CHECK(hipMemcpy(src_lanes_dev.ptr(), src_lanes_.data(), alloc_size, hipMemcpyHostToDevice));
+    shfl<<<this->grid_.grid_dim_, this->grid_.block_dim_>>>(arr_dev, input_dev, active_masks,
+                                                            src_lanes_dev.ptr(), width_);
+  }
+
+  void validate(const T* const arr, const T* const input) {
+    ArrayAllOf(arr, this->grid_.thread_count_, [this, &input](unsigned int i) -> std::optional<T> {
+      const auto rank_in_block = this->grid_.thread_rank_in_block(i).value();
+      const auto rank_in_warp = rank_in_block % this->warp_size_;
+      const auto rank_in_partition = rank_in_block % width_;
+      const int src_lane = src_lanes_[rank_in_partition] % width_;
+      const int src_offset = src_lane - rank_in_partition;
+
+      const auto mask_idx = this->warps_in_block_ * (i / this->grid_.threads_in_block_count_) +
+          rank_in_block / this->warp_size_;
+      const std::bitset<sizeof(uint64_t) * 8> active_mask(this->active_masks_[mask_idx]);
+
+      if (!active_mask.test(rank_in_warp) || (!active_mask.test((rank_in_warp + src_offset))) ||
+          (rank_in_block + src_offset >= this->grid_.threads_in_block_count_)) {
+        return std::nullopt;
+      }
+
+      return input[i + src_offset];
+    });
+  };
+
+ private:
+  std::vector<uint8_t> src_lanes_;
+  int width_;
+};
+
+/**
+ * Test Description
+ * ------------------------
+ *  - Validates the warp shuffle behavior for all valid width sizes {2, 4, 8, 16, 32,
+ * 64(if supported)} for generated shuffle target lanes. The threads are deactivated based on the
+ * passed active mask. The test is run for all overloads of shfl.
+ * Test source
+ * ------------------------
+ *  - unit/warp/warp_shfl.cc
+ * Test requirements
+ * ------------------------
+ *  - HIP_VERSION >= 5.2
+ *  - Device supports warp shuffle
+ */
+TEMPLATE_TEST_CASE("Unit_Warp_Shfl_Positive_Basic", "", int, unsigned int, long, unsigned long,
+                   long long, unsigned long long, float, double) {
+  int device;
+  hipDeviceProp_t device_properties;
+  HIP_CHECK(hipGetDevice(&device));
+  HIP_CHECK(hipGetDeviceProperties(&device_properties, device));
+
+  if (!device_properties.arch.hasWarpShuffle) {
+    HipTest::HIP_SKIP_TEST("Device doesn't support Warp Shuffle!");
+    return;
+  }
+
+  SECTION("Shfl with specified active mask and input values") {
+    WarpShfl<TestType>().run(false);
+  }
+
+  SECTION("Shfl with random active mask and input values") {
+    WarpShfl<TestType>().run(true);
+  }
+}
@@ -0,0 +1,114 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "warp_common.hh"
+
+#include <cpu_grid.h>
+#include <resource_guards.hh>
+#include <utils.hh>
+
+template <typename Derived, typename T> class WarpShflTest {
+ public:
+  WarpShflTest() : warp_size_{get_warp_size()} {}
+
+  void run(bool random = false) {
+    const auto blocks = GenerateBlockDimensionsForShuffle();
+    INFO("Grid dimensions: x " << blocks.x << ", y " << blocks.y << ", z " << blocks.z);
+    const auto threads = GenerateThreadDimensionsForShuffle();
+    INFO("Block dimensions: x " << threads.x << ", y " << threads.y << ", z " << threads.z);
+    grid_ = CPUGrid(blocks, threads);
+
+    const auto alloc_size = grid_.thread_count_ * sizeof(T);
+    LinearAllocGuard<T> input_dev(LinearAllocs::hipMalloc, alloc_size);
+    LinearAllocGuard<T> input(LinearAllocs::hipHostMalloc, alloc_size);
+    LinearAllocGuard<T> arr_dev(LinearAllocs::hipMalloc, alloc_size);
+    LinearAllocGuard<T> arr(LinearAllocs::hipHostMalloc, alloc_size);
+    HIP_CHECK(hipMemset(arr_dev.ptr(), 0, alloc_size));
+
+    warps_in_block_ = (grid_.threads_in_block_count_ + warp_size_ - 1) / warp_size_;
+    const auto warps_in_grid = warps_in_block_ * grid_.block_count_;
+    LinearAllocGuard<uint64_t> active_masks_dev(LinearAllocs::hipMalloc,
+                                                warps_in_grid * sizeof(uint64_t));
+    active_masks_.resize(warps_in_grid);
+
+    generate_input(input.ptr(), random);
+
+    HIP_CHECK(hipMemcpy(active_masks_dev.ptr(), active_masks_.data(),
+                        warps_in_grid * sizeof(uint64_t), hipMemcpyHostToDevice));
+    HIP_CHECK(hipMemcpy(input_dev.ptr(), input.ptr(), alloc_size, hipMemcpyHostToDevice));
+    cast_to_derived().launch_kernel(arr_dev.ptr(), input_dev.ptr(), active_masks_dev.ptr());
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipMemcpy(arr.ptr(), arr_dev.ptr(), alloc_size, hipMemcpyDeviceToHost));
+    HIP_CHECK(hipDeviceSynchronize());
+
+    cast_to_derived().validate(arr.ptr(), input.ptr());
+  }
+
+ private:
+  int get_warp_size() const {
+    int current_dev = -1;
+    HIP_CHECK(hipGetDevice(&current_dev));
+    int warp_size = 0u;
+    HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0));
+    return warp_size;
+  }
+
+  void generate_input(T* input, bool random) {
+    if (random) {
+      std::generate(active_masks_.begin(), active_masks_.end(), [] {
+        return GenerateRandomInteger(0ul, std::numeric_limits<uint64_t>().max());
+      });
+
+      if constexpr (std::is_same_v<float, T> || std::is_same_v<double, T>) {
+        std::generate_n(input, grid_.thread_count_, [] {
+          return static_cast<T>(
+              GenerateRandomReal(std::numeric_limits<T>().min(), std::numeric_limits<T>().max()));
+        });
+      } else {
+        std::generate_n(input, grid_.thread_count_, [] {
+          return static_cast<T>(GenerateRandomInteger(std::numeric_limits<T>().min(),
+                                                      std::numeric_limits<T>().max()));
+        });
+      }
+    } else {
+      unsigned long long int i = 0;
+      std::generate(active_masks_.begin(), active_masks_.end(),
+                    [this, &i]() { return get_active_mask(i++, warp_size_); });
+
+      i = 0;
+      std::generate_n(input, grid_.thread_count_, [&i]() {
+        if (static_cast<T>(i) > std::numeric_limits<T>().max())
+          i = 0;
+        else
+          i++;
+        return static_cast<T>(i);
+      });
+    }
+  }
+
+  Derived& cast_to_derived() { return reinterpret_cast<Derived&>(*this); }
+
+ protected:
+  const int warp_size_;
+  CPUGrid grid_;
+  unsigned int warps_in_block_;
+  std::vector<uint64_t> active_masks_;
+};