Skip to content

Commit

Permalink
Squash commits together
Browse files Browse the repository at this point in the history
replace constant with appropriate block dimension

WIP: add strides as a kernel parameter for non-packed tensors

WIP: created non-packed variants for remaining 3D conv kernels

fix: treat input and output as 6D NDHWGC tensor

use std::call_once for initializing random seed

revamp naive kernels to use strides in the non-packed tensors case (controlled by a bool flag)

WIP: fixed kernel compilation issues but unable to load kernel code object

WIP: fixed issue with hip rtc

split channel strides into group, channels_per_group in solver

fix indexing to left-to-right order

fix bug with too much padding between kernel args

num channels should be a multiple of num groups

re-enable naive ref kernels with strides array

2D forward tests are all working now

WIP: debugging bwd tests

WIP: tests up to 3D bwd conv passing

fix bug in bwd ndhwc kernel

fix formatting

disable prints

fix readability-inconsistent-declaration-parameter-name

fix clang-format

fix hip tidy issue

reverting the change to static init of random seed

address comments and tidy issues. Remove extra print

removed blank line change

remove unneeded include

env var for choosing packed vs non-packed reference kernel

fix warnings from hip-tidy

address comment about array initialization

clear a tiny hip tidy issue
  • Loading branch information
amberhassaan committed Sep 17, 2023
1 parent d176b8f commit 6e37c43
Show file tree
Hide file tree
Showing 8 changed files with 1,681 additions and 580 deletions.
15 changes: 11 additions & 4 deletions src/include/miopen/hipoc_kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ inline HipEventPtr make_hip_event()

#if 1 // Keep around other storage techniques -- @pfultz2 27.03.2017

#if 1 // Keep around other storage techniques -- @pfultz2 27.03.2017
#if 0 // Keep around other storage techniques -- @pfultz2 27.03.2017
template <class T, class U>
struct KernelArgsPair
{
Expand All @@ -65,9 +65,16 @@ struct KernelArgsPair
template <class T, class U>
struct KernelArgsPair
{
    // Packs two kernel arguments into one contiguous byte buffer laid out the
    // way the HIP runtime expects consecutive kernel parameters: T at offset 0,
    // then padding so U starts on an alignof(U) boundary, then U.
    static const int alignment = alignof(U);
    // Bytes inserted after T so that U lands on a multiple of alignof(U).
    static const int padding = (alignment - (sizeof(T) % alignment)) % alignment;
    static_assert(padding >= 0, "padding cannot be negative");
    // Byte offset of the second argument inside `buffer`.
    static const int second_index = sizeof(T) + padding;
    KernelArgsPair(T x, U y)
    {
        new(buffer) T(x); // NOLINT (clang-analyzer-cplusplus.PlacementNew)
        new(buffer + second_index) U(y);
    }
    // Raw storage for both arguments; zero-initialized so the padding bytes
    // are deterministic when the buffer is handed to the runtime.
    alignas(U) char buffer[second_index + sizeof(U)] = {};
};
#endif

Expand Down
159 changes: 158 additions & 1 deletion src/include/miopen/solver/conv_direct_naive_conv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,14 @@
*******************************************************************************/
#pragma once

#include <string>
#include <miopen/conv/context.hpp>

#include <string>
#include <array>
#include <algorithm>
#include <vector>
#include <cassert>

namespace miopen {

namespace solver {
Expand All @@ -51,5 +56,157 @@ bool IsOutputBfp16(const ProblemDescription&);
bool IsOutputInt8(const ProblemDescription&);
bool IsOutputInt32(const ProblemDescription&);

int GetGroupStrideIndex(const ProblemDescription& problem);

void printTensorStrides(const TensorDescriptor& inDesc,
const TensorDescriptor& wDesc,
const TensorDescriptor& outDesc);

// TODO(Amber): Uncomment when hip RTC accepts std::array
// using StrideIndexType = int;
// using Strides3D = std::array<StrideIndexType, 3>;
// using Strides4D = std::array<StrideIndexType, 4>;
// using Strides5D = std::array<StrideIndexType, 5>;
// using Strides6D = std::array<StrideIndexType, 6>;
#if 1
// Fixed-size array usable from both host and HIP device code. Exists because
// hip RTC does not currently accept std::array (see TODO above); remove once
// std::array works under RTC.
template <typename T, unsigned N>
class MyArray
{
    T data_[N] = {}; // zero-initialized storage for the N elements

public:
    // Element count, available as a compile-time constant for loop bounds.
    constexpr static const unsigned SIZE = N;
    __host__ __device__ constexpr unsigned size() const { return N; }

    __host__ __device__ const T& operator[](unsigned i) const { return data_[i]; }

    // NOTE(review): mutable access is host-only — presumably device code only
    // ever reads the strides; confirm before adding __device__ here.
    __host__ T& operator[](unsigned i) { return data_[i]; }

    __host__ __device__ MyArray() = default;
    __host__ __device__ MyArray(const MyArray&) = default;
    __host__ __device__ MyArray(MyArray&&) noexcept = default;
    __host__ __device__ MyArray& operator=(const MyArray&) = default;
    __host__ __device__ MyArray& operator=(MyArray&&) noexcept = default;
    __host__ __device__ ~MyArray() = default;
};

using StrideIndexType = int;
using Strides5D = MyArray<StrideIndexType, 5u>;
using Strides6D = MyArray<StrideIndexType, 6u>;

#else
extern "C" typedef int StrideIndexType;
extern "C" typedef struct
{
StrideIndexType v[5];
} Strides5D;
extern "C" typedef struct
{
StrideIndexType v[6];
} Strides6D;
#endif

namespace internal {
// Maps a compile-time rank N to the matching strides type. Only ranks 5 and 6
// are provided; any other N leaves the primary template without a `type`
// member, so misuse fails at compile time in MakeStrideArray.
template <unsigned N>
struct ChooseStride
{
};

template <>
struct ChooseStride<5u>
{
    using type = Strides5D;
};

template <>
struct ChooseStride<6u>
{
    using type = Strides6D;
};

} // end namespace internal

/// Convert a stride vector (as stored in a TensorDescriptor) into the
/// fixed-size array type the naive convolution kernels consume.
/// \tparam N   tensor rank (5 or 6 — enforced via internal::ChooseStride)
/// \param vec  strides taken by value; sorted locally, caller's copy untouched
template <unsigned N, typename V>
auto MakeStrideArray(V vec)
{
    assert(vec.size() == N);

    // MIOpen stores strides for NHWC in NCHW order, i.e. C stride in 2nd from left.
    // Sorting ascending puts the smallest stride at index 0 — the (little-endian)
    // order the naive convolution kernel expects.
    std::sort(vec.begin(), vec.end());

    typename internal::ChooseStride<N>::type ret;
    for(unsigned idx = 0u; idx < N; ++idx)
        ret[idx] = static_cast<StrideIndexType>(vec[idx]);

    return ret;
}

/**
* split the strides for C dimension in a tensor descriptor into (G, C_per_group).
* Normally, (in packed case) num channels is a multiplying factor in the stride of
* whatever lies to the left of C, e.g., in NCHW, N's stride contains C as a
* factor. We output NGCHW for NCHW (and NHWGC for NHWC)
* where the stride[G] = stride[N] / num_groups
*/
/**
 * split the strides for C dimension in a tensor descriptor into (G, C_per_group).
 * Normally, (in packed case) num channels is a multiplying factor in the stride of
 * whatever lies to the left of C, e.g., in NCHW, N's stride contains C as a
 * factor. We output NGCHW for NCHW (and NHWGC for NHWC)
 * where the stride[G] = stride[N] / num_groups
 */
template <typename V>
V SplitStrideCtoGC(int num_groups, const V& orig_strides, int G_stride_idx)
{
    assert(G_stride_idx > 0 && G_stride_idx <= orig_strides.size());
    // (G_stride_idx - 1) indexes the stride that carries the full channel
    // count as a factor; dividing it by num_groups gives G's stride.
    assert(orig_strides[G_stride_idx - 1] % num_groups == 0);

    const auto g_stride_val = orig_strides[G_stride_idx - 1] / num_groups;

    V with_group(orig_strides);
    with_group.insert(with_group.begin() + G_stride_idx, g_stride_val);
    return with_group;
}

/**
* Weight tensor has original dims: [K, C_per_group, Y, X] (2D case)
* We return a new stride vector with strides for [G, K_per_group, C_per_group, Y, X]
* Stride for G is computed as stride[C_per_group] * K_per_group and inserted at
* left most position
*/
/**
 * Weight tensor has original dims: [K, C_per_group, Y, X] (2D case)
 * We return a new stride vector with strides for [G, K_per_group, C_per_group, Y, X].
 * The G stride equals stride[K] * K_per_group and is prepended at the
 * left-most position.
 */
template <typename V>
V SplitWeiStrideKtoGK(int k_per_group, const V& wei_strides)
{
    V with_group(wei_strides);
    with_group.insert(with_group.begin(), wei_strides[0] * k_per_group);
    return with_group;
}

/// Debug helper: prints a stride array as `name = [v0,v1,...,]` to stdout.
/// StrideArray must expose a static SIZE constant and operator[].
template <typename StrideArray>
void printStrideArray(const char* name, const StrideArray& sarr)
{
    printf("%s = [", name);
    unsigned idx = 0;
    while(idx < StrideArray::SIZE)
    {
        printf("%d,", sarr[idx]);
        ++idx;
    }
    printf("]\n");
}

/// Debug helper: dumps the input, weight, and output stride arrays together.
template <typename StrideArray>
void printStrideArrays(const StrideArray& in_strides,
                       const StrideArray& wei_strides,
                       const StrideArray& out_strides)
{
    const char* labels[]          = {"in_strides", "wei_strides", "out_strides"};
    const StrideArray* arrays[]   = {&in_strides, &wei_strides, &out_strides};

    for(int i = 0; i < 3; ++i)
        printStrideArray(labels[i], *arrays[i]);
}

} // namespace solver
} // namespace miopen
Loading

0 comments on commit 6e37c43

Please sign in to comment.