
Commit

Merge remote-tracking branch 'origin/develop' into C7/remove_random_device
CAHEK7 committed Oct 6, 2023
2 parents 25fa66d + 14118a4 commit 4e2a841
Showing 35 changed files with 2,797 additions and 1,640 deletions.
3 changes: 3 additions & 0 deletions src/CMakeLists.txt
@@ -152,6 +152,7 @@ set( MIOpen_Source
solver/activ/bwd_1.cpp
solver/activ/fwd_0.cpp
solver/activ/fwd_1.cpp
solver/batchnorm/backward_ck.cpp
solver/batchnorm/backward_per_activation.cpp
solver/batchnorm/backward_per_activation_fused.cpp
solver/batchnorm/backward_spatial_multiple.cpp
@@ -163,6 +164,7 @@ set( MIOpen_Source
solver/batchnorm/forward_per_activation_fused.cpp
solver/batchnorm/forward_spatial_multiple.cpp
solver/batchnorm/forward_spatial_single.cpp
solver/batchnorm/forward_training_ck.cpp
solver/conv_asm_1x1u.cpp
solver/conv_asm_1x1u_bias_activ_fused.cpp
solver/conv_asm_1x1u_stride2.cpp
@@ -388,6 +390,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
kernels/workaround_issue_1431.hpp
kernels/hip_f8_impl.hpp
kernels/hip_float8.hpp
kernels/stride_array.hpp
)

set(MIOPEN_KERNELS
7 changes: 0 additions & 7 deletions src/batch_norm_api.cpp
@@ -243,13 +243,6 @@ miopenBatchNormalizationBackward(miopenHandle_t handle,
const void* savedMean,
const void* savedInvVariance)
{
// bfloat16 not supported for batchnorm operation
if(miopen::deref(xDesc).GetType() == miopenBFloat16 ||
miopen::deref(dyDesc).GetType() == miopenBFloat16 ||
miopen::deref(dxDesc).GetType() == miopenBFloat16)
{
return miopenStatusNotImplemented;
}

MIOPEN_LOG_FUNCTION(handle,
bn_mode,
2 changes: 1 addition & 1 deletion src/hip/hip_build_utils.cpp
@@ -73,7 +73,7 @@ static boost::filesystem::path HipBuildImpl(boost::optional<TmpDir>& tmp_dir,
auto env = std::string("");

if(params.find("-std=") == std::string::npos)
params += " --std=c++11";
params += " --std=c++17";

#if HIP_PACKAGE_VERSION_FLAT < 4001000000ULL
params += " --cuda-gpu-arch=" + lots.device;
20 changes: 20 additions & 0 deletions src/include/miopen/batchnorm/solvers.hpp
@@ -142,6 +142,26 @@ struct BnCKFwdInference final : BatchnormSolver
const miopen::batchnorm::ProblemDescription& problem) const override;
};

struct BnCKBwdBackward final : BatchnormSolver
{
const std::string& SolverDbId() const override { return GetSolverDbId<BnCKBwdBackward>(); }

bool IsApplicable(const ExecutionContext& context,
const miopen::batchnorm::ProblemDescription& problem) const override;
ConvSolution GetSolution(const ExecutionContext& context,
const miopen::batchnorm::ProblemDescription& problem) const override;
};

struct BnCKFwdTraining final : BatchnormSolver
{
const std::string& SolverDbId() const override { return GetSolverDbId<BnCKFwdTraining>(); }

bool IsApplicable(const ExecutionContext& context,
const miopen::batchnorm::ProblemDescription& problem) const override;
ConvSolution GetSolution(const ExecutionContext& context,
const miopen::batchnorm::ProblemDescription& problem) const override;
};

} // namespace batchnorm

} // namespace solver
24 changes: 8 additions & 16 deletions src/include/miopen/hipoc_kernel.hpp
@@ -26,14 +26,15 @@
#ifndef GUARD_MIOPEN_HIPOC_KERNEL_HPP
#define GUARD_MIOPEN_HIPOC_KERNEL_HPP

#include <array>
#include <cassert>
#include <miopen/errors.hpp>
#include <miopen/hipoc_program.hpp>
#include <miopen/stringutils.hpp>
#include <miopen/op_kernel_args.hpp>

#include <array>
#include <cassert>
#include <cstring>
#include <vector>
#include <memory.h>

namespace miopen {

@@ -47,29 +48,20 @@ inline HipEventPtr make_hip_event()

#if 1 // Keep around other storage techinques -- @pfultz2 27.03.2017

#if 1 // Keep around other storage techinques -- @pfultz2 27.03.2017
template <class T, class U>
struct KernelArgsPair
{
static const int alignment = sizeof(U);
static const int padding = (alignment - sizeof(T) % alignment) % alignment;
static const int second_index = sizeof(T) + padding;
constexpr static auto alignU = alignof(U);
constexpr static auto padding = (alignU - (sizeof(T) % alignU)) % alignU;
constexpr static auto second_index = sizeof(T) + padding;
KernelArgsPair(T x, U y)
{
new(buffer) T(x); // NOLINT (clang-analyzer-cplusplus.PlacementNew)
new(buffer + second_index) U(y);
}

alignas(U) char buffer[second_index + sizeof(U)] = {};
};
#else
template <class T, class U>
struct KernelArgsPair
{
KernelArgsPair(T x, U y) : first(x), second(y) {}
T first;
U second;
};
#endif
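// Editor's note (illustrative sketch, not part of this commit): a worked
// instance of the new padding arithmetic above, assuming a typical LP64
// target where sizeof(int) == 4 and alignof(void*) == 8:
//
//   using Pair = KernelArgsPair<int, void*>;
//   static_assert(Pair::padding == 4);      // (8 - 4 % 8) % 8
//   static_assert(Pair::second_index == 8); // sizeof(int) + padding
//   static_assert(sizeof(Pair) == 16);      // second_index + sizeof(void*)
//
// i.e., the U argument lands at the next alignof(U) boundary after T, which
// is what the placement-new calls in the constructor rely on.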

template <class... Ts>
struct KernelArgsPack;
95 changes: 94 additions & 1 deletion src/include/miopen/solver/conv_direct_naive_conv.hpp
@@ -25,9 +25,15 @@
*******************************************************************************/
#pragma once

#include <string>
#include <miopen/execution_context.hpp>
#include <miopen/problem_description.hpp>
#include "miopen/../../kernels/stride_array.hpp"

#include <array>
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

namespace miopen {

@@ -54,5 +60,92 @@ bool IsOutputBfp16(const ProblemDescription&);
bool IsOutputInt8(const ProblemDescription&);
bool IsOutputInt32(const ProblemDescription&);

namespace conv_internal {

void DebugPrintTensorStrides(const TensorDescriptor& inDesc,
const TensorDescriptor& wDesc,
const TensorDescriptor& outDesc);

/**
Get the index where the group (G) stride should go. For NCHW, we want to convert
its strides to NGCHW, and for NHWC, we want to convert its strides to NHWGC.
The same applies in the 3D case.
*/
int GetGroupStrideIndex(const ProblemDescription& problem);
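// Editor's note (illustrative, not part of this commit): for a 4D NCHW
// problem this yields index 1 (N G C H W), and for NHWC index 3 (N H W G C);
// the 3D layouts NCDHW and NDHWC yield 1 and 4 respectively.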

/**
Split the strides for the C dimension in a tensor descriptor into (G, C_per_group).
Normally (in the packed case), the number of channels is a multiplying factor in the
stride of whatever lies to the left of C; e.g., in NCHW, N's stride contains C as a
factor. We output NGCHW for NCHW (and NHWGC for NHWC),
where stride[G] = stride[N] / num_groups.
*/
template <typename V>
V SplitStrideCtoGC(int num_groups, const V& orig_strides, int G_stride_idx)
{
assert(G_stride_idx > 0 && G_stride_idx <= orig_strides.size());
// (G_stride_idx - 1) is the stride index of whatever lies to the left and
// contains C or K as a multiplying factor. We divide this value by num_groups
// to get G_stride_val
assert(orig_strides[G_stride_idx - 1] % num_groups == 0);

V ret{orig_strides};
auto G_stride_val = orig_strides[G_stride_idx - 1] / num_groups;

ret.insert(ret.begin() + G_stride_idx, G_stride_val);

return ret;
}
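// Editor's sketch (illustrative, not part of this commit): splitting a packed
// NCHW tensor with N=2, C=8, H=4, W=4 and num_groups=2. Packed NCHW strides
// are {C*H*W, H*W, W, 1} = {128, 16, 4, 1}, and G goes at index 1 for NCHW:
//
//   std::vector<std::size_t> nchw{128, 16, 4, 1};
//   auto ngchw = SplitStrideCtoGC(/*num_groups=*/2, nchw, /*G_stride_idx=*/1);
//   // ngchw == {128, 64, 16, 4, 1}; stride[G] = stride[N] / num_groups = 64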

/**
* Weight tensor has original dims: [K, C_per_group, Y, X] (2D case)
* We return a new stride vector with strides for [G, K_per_group, C_per_group, Y, X]
Stride for G is computed as stride[K] * k_per_group (matching the code below,
which uses wei_strides[0]) and inserted at the leftmost position.
*/
template <typename V>
V SplitWeiStrideKtoGK(int k_per_group, const V& wei_strides)
{
V ret{wei_strides};
ret.insert(ret.begin(), wei_strides[0] * k_per_group);
return ret;
}
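// Editor's sketch (illustrative, not part of this commit): a packed 2D weight
// tensor [K=8, C_per_group=4, Y=3, X=3] with two groups (k_per_group = 4) has
// strides {36, 9, 3, 1}; the G stride becomes stride[K] * k_per_group:
//
//   std::vector<std::size_t> wei{36, 9, 3, 1};
//   auto gkcyx = SplitWeiStrideKtoGK(/*k_per_group=*/4, wei);
//   // gkcyx == {144, 36, 9, 3, 1} for [G, K_per_group, C_per_group, Y, X]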

template <unsigned N>
struct ChooseStride
{
};

template <>
struct ChooseStride<5u>
{
using type = Strides5D;
};

template <>
struct ChooseStride<6u>
{
using type = Strides6D;
};

template <unsigned N, typename V>
auto MakeStrideArray(V vec)
{
typename ChooseStride<N>::type ret;
assert(vec.size() == N);

// MIOpen stores strides for NHWC in NCHW order, i.e., the C stride is 2nd from the left.
// We sort the input stride vector so that the smallest stride is at index 0. This
// (little-endian) order is what the naive convolution kernel expects for strides.
std::sort(vec.begin(), vec.end());

for(unsigned i = 0; i < N; ++i)
{
ret[i] = static_cast<StrideIndexType>(vec[i]);
}
return ret;
}
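// Editor's sketch (illustrative, not part of this commit): packing the NGCHW
// strides from the example above into the fixed-size array the kernel takes;
// the sort puts the smallest stride first (little-endian order):
//
//   auto arr = MakeStrideArray<5>(std::vector<std::size_t>{128, 64, 16, 4, 1});
//   // arr == {1, 4, 16, 64, 128} as a Strides5D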
} // end namespace conv_internal

} // namespace solver
} // namespace miopen
65 changes: 55 additions & 10 deletions src/include/miopen/solver/implicitgemm_ck_util.hpp
@@ -41,8 +41,10 @@ typename ConvPtrsType::iterator FindConvPtrByID(ConvPtrsType& conv_ptrs,
});
}

template <typename DeviceOpType, typename CKArgsType>
std::vector<std::string> FillValidKernelsIDs(const ProblemDescription& problem)
template <typename DeviceOpType,
typename CKArgsType,
typename ProblemDescriptionType = ProblemDescription>
std::vector<std::string> FillValidKernelsIDs(const ProblemDescriptionType& problem)
{
const auto args = CKArgsType{problem};
const auto conv_ptrs = DeviceOpType::GetInstances();
@@ -59,29 +61,36 @@ std::vector<std::string> FillValidKernelsIDs(const ProblemDescription& problem)
return valid_kernels;
}

template <typename DeviceOpType, typename CKArgsType>
bool IsCKArgsSupported(const ProblemDescription& problem, const std::string& kernel_id)
template <typename DeviceOpType,
typename CKArgsType,
typename ProblemDescriptionType = ProblemDescription>
bool IsCKArgsSupported(const ProblemDescriptionType& problem, const std::string& kernel_id)
{
auto conv_ptrs = DeviceOpType::GetInstances();
auto ptr_iter = FindConvPtrByID(conv_ptrs, kernel_id);

return (ptr_iter != conv_ptrs.end()) && CKArgsType{problem}.IsSupportedBy(*ptr_iter);
}

template <typename DeviceOpType, typename CKArgsType>
bool IsCKApplicable(const ProblemDescription& problem)
template <typename DeviceOpType,
typename CKArgsType,
typename ProblemDescriptionType = ProblemDescription>
bool IsCKApplicable(const ProblemDescriptionType& problem)
{
const auto args = CKArgsType{problem};
if(!std::all_of(args.strides.begin(), args.strides.end(), [](auto x) { return x == 1; }))
return false;
// if(!std::all_of(args.strides.begin(), args.strides.end(), [](auto x) { return x == 1; }))
// return false;

const auto ptrs = DeviceOpType::GetInstances();
return std::any_of(
ptrs.begin(), ptrs.end(), [&args](auto& ptr) { return args.IsSupportedBy(ptr); });
}

template <typename DeviceOpType, typename CKArgsType, typename CastType>
ConvSolution InitInvokerFactory(const ProblemDescription& problem, const std::string& kernel_id)
template <typename DeviceOpType,
typename CKArgsType,
typename CastType,
typename ProblemDescriptionType = ProblemDescription>
ConvSolution InitInvokerFactory(const ProblemDescriptionType& problem, const std::string& kernel_id)
{
auto conv_ptrs = DeviceOpType::GetInstances();
auto ptr_iter = FindConvPtrByID(conv_ptrs, kernel_id);
@@ -112,5 +121,41 @@ ConvSolution InitInvokerFactory(const ProblemDescription& problem, const std::st
return result;
}

template <typename DeviceOpType,
typename CKArgsType,
typename CastType,
typename ProblemDescriptionType = ProblemDescription>
ConvSolution InitAnyInvokerFactory(const ProblemDescriptionType& problem,
const std::string& kernel_id)
{
auto conv_ptrs = DeviceOpType::GetInstances();
auto ptr_iter = FindConvPtrByID(conv_ptrs, kernel_id);

if(ptr_iter == conv_ptrs.end())
return {miopenStatusInvalidValue};

ConvSolution result;
result.invoker_factory =
[ck_args = CKArgsType{problem},
sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}](const std::vector<Kernel>&) mutable {
return [ck_args = std::move(ck_args), sh_conv_ptr = std::move(sh_conv_ptr)](
const Handle& handle, const AnyInvokeParams& primitive_parameters) {
const auto& data_ctx = primitive_parameters.CastTo<CastType>();
auto argument_ptr = ck_args.MakeArgPtr(sh_conv_ptr, data_ctx);
auto invoker_ptr = sh_conv_ptr->MakeInvokerPointer();

const auto enable_profiling = handle.IsProfilingEnabled();
float elapsed_time =
invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), enable_profiling});
if(enable_profiling)
{
handle.ResetKernelTime();
handle.AccumKernelTime(elapsed_time);
}
};
};
return result;
}
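// Editor's note (illustrative, not part of this commit): the CK instance is
// moved into a std::shared_ptr because the factory and the invoker it returns
// are stored in copyable std::function objects; a move-only unique_ptr could
// not be captured and then copied here.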

} // namespace solver
} // namespace miopen
6 changes: 3 additions & 3 deletions src/kernels/gpu_reference_kernel/fp8_kern_types.h
@@ -58,6 +58,6 @@

#define KERNEL_NAME_SUFFIX CAT(CAT(INPUT_TYPE, _), CAT(CAT(WEIGHTS_TYPE, _), OUTPUT_TYPE))

#define FWD_KERNEL_NAME CAT(naive_conv_fwd_nchw_, KERNEL_NAME_SUFFIX)
#define BWD_KERNEL_NAME CAT(naive_conv_bwd_nchw_, KERNEL_NAME_SUFFIX)
#define WRW_KERNEL_NAME CAT(naive_conv_wrw_nchw_, KERNEL_NAME_SUFFIX)
#define FWD_KERNEL_NAME CAT(naive_conv_packed_fwd_nchw_, KERNEL_NAME_SUFFIX)
#define BWD_KERNEL_NAME CAT(naive_conv_packed_bwd_nchw_, KERNEL_NAME_SUFFIX)
#define WRW_KERNEL_NAME CAT(naive_conv_packed_wrw_nchw_, KERNEL_NAME_SUFFIX)
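// Editor's note (illustrative, not part of this commit): with, e.g.,
// INPUT_TYPE == fp8, WEIGHTS_TYPE == fp8 and OUTPUT_TYPE == half, the renamed
// macros would expand to names like naive_conv_packed_fwd_nchw_fp8_fp8_half,
// matching the reference kernels' new packed-tensor naming scheme.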