Skip to content

Commit

Permalink
Squash commits together
Browse files Browse the repository at this point in the history
replace constant with appropriate block dimension

WIP: add strides as a kernel parameter for non-packed tensors

WIP: created non-packed variants for remaining 3D conv kernels

fix: treat input and output as 6D NDHWGC tensor

use std::call_once for initializing random seed

revamp naive kernels to use strides in the non-packed tensors case (controlled by a bool flag)

WIP: fixed kernel compilation issues but unable to load kernel code object

WIP: fixed issue with hip rtc

split channel strides into group, channels_per_group in solver

fix indexing to left-to-right order

fix bug with too much padding between kernel args

num channels should be a multiple of num groups

re-enable naive ref kernels with strides array

2D forward tests are all working now

WIP: debugging bwd tests

WIP: tests up to 3D bwd conv passing

fix bug in bwd ndhwc kernel

fix formatting

disable prints

fix readability-inconsistent-declaration-parameter-name

fix clang-format

fix hip tidy issue

reverting the change to static init of random seed

address comments and tidy issues. Remove extra print

removed blank line change

remove unneeded include

env var for choosing packed vs non-packed reference kernel

fix warnings from hip-tidy

address comment about array initialization

clear a tiny hip tidy issue
  • Loading branch information
amberhassaan committed Sep 17, 2023
1 parent d176b8f commit 6e37c43
Show file tree
Hide file tree
Showing 8 changed files with 1,681 additions and 580 deletions.
15 changes: 11 additions & 4 deletions src/include/miopen/hipoc_kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ inline HipEventPtr make_hip_event()

#if 1 // Keep around other storage techniques -- @pfultz2 27.03.2017

#if 1 // Keep around other storage techniques -- @pfultz2 27.03.2017
#if 0 // Keep around other storage techniques -- @pfultz2 27.03.2017
template <class T, class U>
struct KernelArgsPair
{
Expand All @@ -65,9 +65,16 @@ struct KernelArgsPair
template <class T, class U>
struct KernelArgsPair
{
    // Packs two kernel arguments into one contiguous byte buffer laid out the
    // way the HIP runtime expects consecutive kernel parameters: T at offset 0,
    // then padding so U starts on an alignof(U) boundary, then U.
    static const int alignment = alignof(U);
    // Bytes inserted after T so that U lands on a multiple of alignof(U).
    static const int padding = (alignment - (sizeof(T) % alignment)) % alignment;
    static_assert(padding >= 0, "padding cannot be negative");
    // Byte offset of the second argument inside `buffer`.
    static const int second_index = sizeof(T) + padding;
    KernelArgsPair(T x, U y)
    {
        new(buffer) T(x); // NOLINT (clang-analyzer-cplusplus.PlacementNew)
        new(buffer + second_index) U(y);
    }
    // Raw storage for both arguments; zero-initialized so the padding bytes
    // are deterministic when the buffer is handed to the runtime.
    alignas(U) char buffer[second_index + sizeof(U)] = {};
};
#endif

Expand Down
159 changes: 158 additions & 1 deletion src/include/miopen/solver/conv_direct_naive_conv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,14 @@
*******************************************************************************/
#pragma once

#include <string>
#include <miopen/conv/context.hpp>

#include <string>
#include <array>
#include <algorithm>
#include <vector>
#include <cassert>

namespace miopen {

namespace solver {
Expand All @@ -51,5 +56,157 @@ bool IsOutputBfp16(const ProblemDescription&);
bool IsOutputInt8(const ProblemDescription&);
bool IsOutputInt32(const ProblemDescription&);

int GetGroupStrideIndex(const ProblemDescription& problem);

void printTensorStrides(const TensorDescriptor& inDesc,
const TensorDescriptor& wDesc,
const TensorDescriptor& outDesc);

// TODO(Amber): Uncomment when hip RTC accepts std::array
// using StrideIndexType = int;
// using Strides3D = std::array<StrideIndexType, 3>;
// using Strides4D = std::array<StrideIndexType, 4>;
// using Strides5D = std::array<StrideIndexType, 5>;
// using Strides6D = std::array<StrideIndexType, 6>;
#if 1
// Fixed-size array usable from both host and HIP device code. Exists because
// hip RTC does not currently accept std::array (see TODO above); remove once
// std::array works under RTC.
template <typename T, unsigned N>
class MyArray
{
    T data_[N] = {}; // zero-initialized storage for the N elements

public:
    // Element count, available as a compile-time constant for loop bounds.
    constexpr static const unsigned SIZE = N;
    __host__ __device__ constexpr unsigned size() const { return N; }

    __host__ __device__ const T& operator[](unsigned i) const { return data_[i]; }

    // NOTE(review): mutable access is host-only — presumably device code only
    // ever reads the strides; confirm before adding __device__ here.
    __host__ T& operator[](unsigned i) { return data_[i]; }

    __host__ __device__ MyArray() = default;
    __host__ __device__ MyArray(const MyArray&) = default;
    __host__ __device__ MyArray(MyArray&&) noexcept = default;
    __host__ __device__ MyArray& operator=(const MyArray&) = default;
    __host__ __device__ MyArray& operator=(MyArray&&) noexcept = default;
    __host__ __device__ ~MyArray() = default;
};

using StrideIndexType = int;
using Strides5D = MyArray<StrideIndexType, 5u>;
using Strides6D = MyArray<StrideIndexType, 6u>;

#else
extern "C" typedef int StrideIndexType;
extern "C" typedef struct
{
StrideIndexType v[5];
} Strides5D;
extern "C" typedef struct
{
StrideIndexType v[6];
} Strides6D;
#endif

namespace internal {
// Maps a compile-time rank N to the matching strides type. Only ranks 5 and 6
// are provided; any other N leaves the primary template without a `type`
// member, so misuse fails at compile time in MakeStrideArray.
template <unsigned N>
struct ChooseStride
{
};

template <>
struct ChooseStride<5u>
{
    using type = Strides5D;
};

template <>
struct ChooseStride<6u>
{
    using type = Strides6D;
};

} // end namespace internal

/// Convert a stride vector (as stored in a TensorDescriptor) into the
/// fixed-size array type the naive convolution kernels consume.
/// \tparam N   tensor rank (5 or 6 — enforced via internal::ChooseStride)
/// \param vec  strides taken by value; sorted locally, caller's copy untouched
template <unsigned N, typename V>
auto MakeStrideArray(V vec)
{
    assert(vec.size() == N);

    // MIOpen stores strides for NHWC in NCHW order, i.e. C stride in 2nd from left.
    // Sorting ascending puts the smallest stride at index 0 — the (little-endian)
    // order the naive convolution kernel expects.
    std::sort(vec.begin(), vec.end());

    typename internal::ChooseStride<N>::type ret;
    for(unsigned idx = 0u; idx < N; ++idx)
        ret[idx] = static_cast<StrideIndexType>(vec[idx]);

    return ret;
}

/**
* split the strides for C dimension in a tensor descriptor into (G, C_per_group).
* Normally, (in packed case) num channels is a multiplying factor in the stride of
* whatever lies to the left of C, e.g., in NCHW, N's stride contains C as a
* factor. We output NGCHW for NCHW (and NHWGC for NHWC)
* where the stride[G] = stride[N] / num_groups
*/
/**
 * split the strides for C dimension in a tensor descriptor into (G, C_per_group).
 * Normally, (in packed case) num channels is a multiplying factor in the stride of
 * whatever lies to the left of C, e.g., in NCHW, N's stride contains C as a
 * factor. We output NGCHW for NCHW (and NHWGC for NHWC)
 * where the stride[G] = stride[N] / num_groups
 */
template <typename V>
V SplitStrideCtoGC(int num_groups, const V& orig_strides, int G_stride_idx)
{
    assert(G_stride_idx > 0 && G_stride_idx <= orig_strides.size());
    // (G_stride_idx - 1) indexes the stride that carries the full channel
    // count as a factor; dividing it by num_groups gives G's stride.
    assert(orig_strides[G_stride_idx - 1] % num_groups == 0);

    const auto g_stride_val = orig_strides[G_stride_idx - 1] / num_groups;

    V with_group(orig_strides);
    with_group.insert(with_group.begin() + G_stride_idx, g_stride_val);
    return with_group;
}

/**
* Weight tensor has original dims: [K, C_per_group, Y, X] (2D case)
* We return a new stride vector with strides for [G, K_per_group, C_per_group, Y, X]
* Stride for G is computed as stride[C_per_group] * K_per_group and inserted at
* left most position
*/
/**
 * Weight tensor has original dims: [K, C_per_group, Y, X] (2D case)
 * We return a new stride vector with strides for [G, K_per_group, C_per_group, Y, X].
 * The G stride equals stride[K] * K_per_group and is prepended at the
 * left-most position.
 */
template <typename V>
V SplitWeiStrideKtoGK(int k_per_group, const V& wei_strides)
{
    V with_group(wei_strides);
    with_group.insert(with_group.begin(), wei_strides[0] * k_per_group);
    return with_group;
}

/// Debug helper: prints a stride array as `name = [v0,v1,...,]` to stdout.
/// StrideArray must expose a static SIZE constant and operator[].
template <typename StrideArray>
void printStrideArray(const char* name, const StrideArray& sarr)
{
    printf("%s = [", name);
    unsigned idx = 0;
    while(idx < StrideArray::SIZE)
    {
        printf("%d,", sarr[idx]);
        ++idx;
    }
    printf("]\n");
}

/// Debug helper: dumps the input, weight, and output stride arrays together.
template <typename StrideArray>
void printStrideArrays(const StrideArray& in_strides,
                       const StrideArray& wei_strides,
                       const StrideArray& out_strides)
{
    const char* labels[]          = {"in_strides", "wei_strides", "out_strides"};
    const StrideArray* arrays[]   = {&in_strides, &wei_strides, &out_strides};

    for(int i = 0; i < 3; ++i)
        printStrideArray(labels[i], *arrays[i]);
}

} // namespace solver
} // namespace miopen
Loading

0 comments on commit 6e37c43

Please sign in to comment.