From 382bd23b6d96ced43a297c7ae3ed835cfc8b5345 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 30 Oct 2024 18:07:09 +0000 Subject: [PATCH 01/13] fix driver for old ocl kernel --- driver/bn_driver.hpp | 110 +++++++++++++++++++++++-------------------- driver/dm_bnorm.cpp | 7 +-- 2 files changed, 63 insertions(+), 54 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index c89c3f166d..3c93b2960f 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -66,7 +66,11 @@ //#define BN_RUNFOR_PROFILER -template +template class BatchNormDriver : public Driver { public: @@ -133,8 +137,8 @@ class BatchNormDriver : public Driver tensor out_ref; // forward - GpumemTensor scale; - GpumemTensor bias; + GpumemTensor scale; + GpumemTensor bias; // forward inference GpumemTensor estMean; @@ -153,14 +157,14 @@ class BatchNormDriver : public Driver tensor runVariance_ref; // backward needed different type for bwd. - GpumemTensor out_bwd; + GpumemTensor out_bwd; - GpumemTensor bnScale; + GpumemTensor bnScale; GpumemTensor dScale; GpumemTensor dBias; // savedMean declared above as Tmix as well GpumemTensor savedInvVar; - GpumemTensor dy; + GpumemTensor dy; tensor dBias_ref; tensor dScale_ref; @@ -170,8 +174,8 @@ class BatchNormDriver : public Driver miopenTensorLayout_t bn_layout; }; -template -int BatchNormDriver::ParseCmdLineArgs(int argc, char* argv[]) +template +int BatchNormDriver::ParseCmdLineArgs(int argc, char* argv[]) { inflags.Parse(argc, argv); @@ -183,8 +187,8 @@ int BatchNormDriver::ParseCmdLineArgs(int argc, char* argv[]) return miopenStatusSuccess; } -template -int BatchNormDriver::GetandSetData() +template +int BatchNormDriver::GetandSetData() { std::vector in_len = GetInputTensorLengthsFromCmdLine(); @@ -201,8 +205,8 @@ int BatchNormDriver::GetandSetData() if(isFwdInfer || isFwdTrain) { out.AllocOnHost(tensor{bn_layout, in_len}); - scale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - bias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + scale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + bias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); auto gen_value_scale_bias = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); @@ -236,13 +240,13 @@ int BatchNormDriver::GetandSetData() } else if(isBwd) { - out_bwd.AllocOnHost(tensor{bn_layout, in_len}); + out_bwd.AllocOnHost(tensor{bn_layout, in_len}); - bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - dy.AllocOnHost(tensor{bn_layout, in_len}); + bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dy.AllocOnHost(tensor{bn_layout, in_len}); auto gen_var_bwd = [](auto...) 
{ - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); }; dy.InitHostData(dy.GetTensor().desc.GetElementSize(), true, gen_var_bwd); @@ -269,8 +273,8 @@ int BatchNormDriver::GetandSetData() return miopenStatusSuccess; } -template -int BatchNormDriver::AddCmdLineArgs() +template +int BatchNormDriver::AddCmdLineArgs() { inflags.AddInputFlag( "forw", @@ -321,8 +325,9 @@ int BatchNormDriver::AddCmdLineArgs() return miopenStatusSuccess; } -template -std::vector BatchNormDriver::GetInputTensorLengthsFromCmdLine() +template +std::vector +BatchNormDriver::GetInputTensorLengthsFromCmdLine() { int in_n = inflags.GetValueInt("batchsize"); int in_c = inflags.GetValueInt("in_channels"); @@ -344,8 +349,8 @@ std::vector BatchNormDriver::GetInputTensorLengthsFromCmd } } -template -bool BatchNormDriver::ChkLayout_ShortName() +template +bool BatchNormDriver::ChkLayout_ShortName() { // check for short name of layout type if(inflags.FindShortName("layout") == 'L') @@ -361,8 +366,9 @@ bool BatchNormDriver::ChkLayout_ShortName() } } -template -void BatchNormDriver::ValidateLayoutInputParameters(std::string layout_value) +template +void BatchNormDriver::ValidateLayoutInputParameters( + std::string layout_value) { if(!ChkLayout_ShortName()) { @@ -377,8 +383,8 @@ void BatchNormDriver::ValidateLayoutInputParameters(std::strin } } -template -int BatchNormDriver::SetBNParametersFromCmdLineArgs() +template +int BatchNormDriver::SetBNParametersFromCmdLineArgs() { // double bnAlpha = inflags.GetValueDouble("alpha"); @@ -510,8 +516,8 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs() return miopenStatusSuccess; } -template -int BatchNormDriver::AllocateBuffersAndCopy() +template +int BatchNormDriver::AllocateBuffersAndCopy() { status_t status = STATUS_SUCCESS; DEFINE_CONTEXT(ctx); @@ -596,8 +602,10 @@ int BatchNormDriver::AllocateBuffersAndCopy() return miopenStatusSuccess; } -template -void BatchNormDriver::runGPUFwdInference(Tref epsilon, float alpha, float beta) +template +void BatchNormDriver::runGPUFwdInference(Tref epsilon, + float alpha, + float beta) { if(keepRunningMeanVar) @@ -644,11 +652,11 @@ void BatchNormDriver::runGPUFwdInference(Tref epsilon, float a return; } -template -void BatchNormDriver::runGPUFwdTrain(Tref epsilon, - Tref eAF, - float alpha, - float beta) +template +void BatchNormDriver::runGPUFwdTrain(Tref epsilon, + Tref eAF, + float alpha, + float beta) { if(saveMeanVar && keepRunningMeanVar) { @@ -767,8 +775,8 @@ void BatchNormDriver::runGPUFwdTrain(Tref epsilon, #endif } -template -int BatchNormDriver::RunForwardGPU() +template +int BatchNormDriver::RunForwardGPU() { float alpha = static_cast(1), beta = static_cast(0); @@ -867,8 +875,8 @@ int BatchNormDriver::RunForwardGPU() return miopenStatusSuccess; } -template -void BatchNormDriver::runCPUFwdInference(Tref epsilon) +template +void BatchNormDriver::runCPUFwdInference(Tref epsilon) { int size{0}; miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); @@ -916,8 +924,8 @@ void BatchNormDriver::runCPUFwdInference(Tref epsilon) return; } -template -void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) +template +void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) { int size{0}; miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); @@ -985,8 +993,8 @@ void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) } } -template -int BatchNormDriver::RunForwardCPU() +template +int BatchNormDriver::RunForwardCPU() { // T alpha = 0., beta = 0.; Tref epsilon = 
static_cast(EPSILON); @@ -1014,8 +1022,8 @@ int BatchNormDriver::RunForwardCPU() return miopenStatusSuccess; } -template -int BatchNormDriver::RunBackwardGPU() +template +int BatchNormDriver::RunBackwardGPU() { if(!back) return miopenStatusSuccess; @@ -1134,8 +1142,8 @@ int BatchNormDriver::RunBackwardGPU() return miopenStatusSuccess; } -template -int BatchNormDriver::VerifyForward() +template +int BatchNormDriver::VerifyForward() { // jump out since we are forcing forward off when doing backwards. @@ -1350,8 +1358,8 @@ int BatchNormDriver::VerifyForward() return miopenStatusSuccess; } -template -int BatchNormDriver::RunBackwardCPU() +template +int BatchNormDriver::RunBackwardCPU() { if(!back) @@ -1439,8 +1447,8 @@ int BatchNormDriver::RunBackwardCPU() return miopenStatusSuccess; } -template -int BatchNormDriver::VerifyBackward() +template +int BatchNormDriver::VerifyBackward() { if(!back) diff --git a/driver/dm_bnorm.cpp b/driver/dm_bnorm.cpp index 24e986fa1d..6381a8a83c 100644 --- a/driver/dm_bnorm.cpp +++ b/driver/dm_bnorm.cpp @@ -30,12 +30,13 @@ static Driver* makeDriver(const std::string& base_arg) { if(base_arg == "bnorm") return new BatchNormDriver(); + // if(base_arg == "bnormfp16") - return new BatchNormDriver(); + return new BatchNormDriver(); if(base_arg == "bnormfp16fp32") - return new BatchNormDriver(); + return new BatchNormDriver(); if(base_arg == "bnormbfp16fp32") - return new BatchNormDriver(); + return new BatchNormDriver(); return nullptr; } From b4e1a00fc056c9230ab18f7f4088ae9e2f4f9aaa Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 30 Oct 2024 21:18:56 +0000 Subject: [PATCH 02/13] fix template name --- driver/bn_driver.hpp | 207 +++++++++++++++++++++--------------------- test/gtest/bn_bwd.cpp | 6 -- 2 files changed, 105 insertions(+), 108 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 3c93b2960f..1c9806b629 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -66,15 +66,15 @@ //#define BN_RUNFOR_PROFILER -template + typename TAcc = TInput, + typename TScaleBias = TInput, + typename TOut = TInput> class BatchNormDriver : public Driver { public: - BatchNormDriver() : Driver() { data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf; } + BatchNormDriver() : Driver() { data_type = (sizeof(TInput) == 4) ? miopenFloat : miopenHalf; } int AddCmdLineArgs() override; int ParseCmdLineArgs(int argc, char* argv[]) override; @@ -132,39 +132,39 @@ class BatchNormDriver : public Driver InputFlags inflags; bool isDepthSpecified = false; - GpumemTensor in; - GpumemTensor out; + GpumemTensor in; + GpumemTensor out; // for forward the output maches input type. tensor out_ref; // forward - GpumemTensor scale; - GpumemTensor bias; + GpumemTensor scale; + GpumemTensor bias; // forward inference - GpumemTensor estMean; - GpumemTensor estVariance; + GpumemTensor estMean; + GpumemTensor estVariance; - GpumemTensor savedMean; + GpumemTensor savedMean; tensor savedMean_ref; // forward training - GpumemTensor savedVariance; - GpumemTensor runMean; - GpumemTensor runVariance; + GpumemTensor savedVariance; + GpumemTensor runMean; + GpumemTensor runVariance; // ref tensor savedVariance_ref; tensor runMean_ref; tensor runVariance_ref; // backward needed different type for bwd. 
- GpumemTensor out_bwd; + GpumemTensor out_bwd; - GpumemTensor bnScale; - GpumemTensor dScale; - GpumemTensor dBias; - // savedMean declared above as Tmix as well - GpumemTensor savedInvVar; - GpumemTensor dy; + GpumemTensor bnScale; + GpumemTensor dScale; + GpumemTensor dBias; + // savedMean declared above as TAcc as well + GpumemTensor savedInvVar; + GpumemTensor dy; tensor dBias_ref; tensor dScale_ref; @@ -174,8 +174,9 @@ class BatchNormDriver : public Driver miopenTensorLayout_t bn_layout; }; -template -int BatchNormDriver::ParseCmdLineArgs(int argc, char* argv[]) +template +int BatchNormDriver::ParseCmdLineArgs(int argc, + char* argv[]) { inflags.Parse(argc, argv); @@ -187,16 +188,16 @@ int BatchNormDriver::ParseCmdLineArgs(int return miopenStatusSuccess; } -template -int BatchNormDriver::GetandSetData() +template +int BatchNormDriver::GetandSetData() { std::vector in_len = GetInputTensorLengthsFromCmdLine(); SetBNParametersFromCmdLineArgs(); - auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; + auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - in.AllocOnHost(tensor{bn_layout, in_len}); + in.AllocOnHost(tensor{bn_layout, in_len}); in.InitHostData(in.GetTensor().desc.GetElementSize(), true, gen_value); auto derivedBnDesc = miopen::TensorDescriptor{}; @@ -204,12 +205,12 @@ int BatchNormDriver::GetandSetData() if(isFwdInfer || isFwdTrain) { - out.AllocOnHost(tensor{bn_layout, in_len}); - scale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - bias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + out.AllocOnHost(tensor{bn_layout, in_len}); + scale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + bias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); auto gen_value_scale_bias = [](auto...) { - return prng::gen_descreet_uniform_sign(1e-2, 100); + return prng::gen_descreet_uniform_sign(1e-2, 100); }; scale.InitHostData(scale.GetTensor().desc.GetElementSize(), true, gen_value_scale_bias); @@ -217,23 +218,23 @@ int BatchNormDriver::GetandSetData() } if(isFwdInfer) { - estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); auto gen_value_emean = [](auto...) { - return prng::gen_descreet_uniform_sign(1e-2, 100); + return prng::gen_descreet_uniform_sign(1e-2, 100); }; estMean.InitHostData(estMean.GetTensor().desc.GetElementSize(), true, gen_value_emean); } else if(isFwdTrain) { - savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); auto gen_var = [](auto...) 
{ - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); }; runMean.InitHostData(runMean.GetTensor().desc.GetElementSize(), true, gen_var); runVariance.InitHostData(runVariance.GetTensor().desc.GetElementSize(), true, gen_var); @@ -242,7 +243,7 @@ int BatchNormDriver::GetandSetData() { out_bwd.AllocOnHost(tensor{bn_layout, in_len}); - bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); dy.AllocOnHost(tensor{bn_layout, in_len}); auto gen_var_bwd = [](auto...) { @@ -250,17 +251,17 @@ int BatchNormDriver::GetandSetData() }; dy.InitHostData(dy.GetTensor().desc.GetElementSize(), true, gen_var_bwd); - dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value); savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_var_bwd); auto gen_in_var = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); }; savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), true, gen_in_var); } @@ -273,8 +274,8 @@ int BatchNormDriver::GetandSetData() return miopenStatusSuccess; } -template -int BatchNormDriver::AddCmdLineArgs() +template +int BatchNormDriver::AddCmdLineArgs() { inflags.AddInputFlag( "forw", @@ -325,9 +326,9 @@ int BatchNormDriver::AddCmdLineArgs() return miopenStatusSuccess; } -template +template std::vector -BatchNormDriver::GetInputTensorLengthsFromCmdLine() +BatchNormDriver::GetInputTensorLengthsFromCmdLine() { int in_n = inflags.GetValueInt("batchsize"); int in_c = inflags.GetValueInt("in_channels"); @@ -349,8 +350,8 @@ BatchNormDriver::GetInputTensorLengthsFrom } } -template -bool BatchNormDriver::ChkLayout_ShortName() +template +bool BatchNormDriver::ChkLayout_ShortName() { // check for short name of layout type if(inflags.FindShortName("layout") == 'L') @@ -366,8 +367,8 @@ bool BatchNormDriver::ChkLayout_ShortName( } } -template -void BatchNormDriver::ValidateLayoutInputParameters( +template +void BatchNormDriver::ValidateLayoutInputParameters( std::string layout_value) { if(!ChkLayout_ShortName()) @@ -383,8 +384,8 @@ void BatchNormDriver::ValidateLayoutInputP } } -template -int BatchNormDriver::SetBNParametersFromCmdLineArgs() +template +int BatchNormDriver::SetBNParametersFromCmdLineArgs() { // double bnAlpha = inflags.GetValueDouble("alpha"); @@ -516,8 +517,8 @@ int BatchNormDriver::SetBNParametersFromCm return miopenStatusSuccess; } -template -int BatchNormDriver::AllocateBuffersAndCopy() +template +int BatchNormDriver::AllocateBuffersAndCopy() { status_t status = STATUS_SUCCESS; DEFINE_CONTEXT(ctx); @@ -602,10 +603,10 @@ int BatchNormDriver::AllocateBuffersAndCop return miopenStatusSuccess; } -template -void BatchNormDriver::runGPUFwdInference(Tref epsilon, - float alpha, - float beta) +template +void BatchNormDriver::runGPUFwdInference(Tref epsilon, + 
float alpha, + float beta) { if(keepRunningMeanVar) @@ -652,11 +653,11 @@ void BatchNormDriver::runGPUFwdInference(T return; } -template -void BatchNormDriver::runGPUFwdTrain(Tref epsilon, - Tref eAF, - float alpha, - float beta) +template +void BatchNormDriver::runGPUFwdTrain(Tref epsilon, + Tref eAF, + float alpha, + float beta) { if(saveMeanVar && keepRunningMeanVar) { @@ -775,8 +776,8 @@ void BatchNormDriver::runGPUFwdTrain(Tref #endif } -template -int BatchNormDriver::RunForwardGPU() +template +int BatchNormDriver::RunForwardGPU() { float alpha = static_cast(1), beta = static_cast(0); @@ -875,8 +876,8 @@ int BatchNormDriver::RunForwardGPU() return miopenStatusSuccess; } -template -void BatchNormDriver::runCPUFwdInference(Tref epsilon) +template +void BatchNormDriver::runCPUFwdInference(Tref epsilon) { int size{0}; miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); @@ -924,8 +925,9 @@ void BatchNormDriver::runCPUFwdInference(T return; } -template -void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) +template +void BatchNormDriver::runCPUFwdTrain(Tref epsilon, + Tref eAF) { int size{0}; miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); @@ -993,8 +995,8 @@ void BatchNormDriver::runCPUFwdTrain(Tref } } -template -int BatchNormDriver::RunForwardCPU() +template +int BatchNormDriver::RunForwardCPU() { // T alpha = 0., beta = 0.; Tref epsilon = static_cast(EPSILON); @@ -1022,8 +1024,8 @@ int BatchNormDriver::RunForwardCPU() return miopenStatusSuccess; } -template -int BatchNormDriver::RunBackwardGPU() +template +int BatchNormDriver::RunBackwardGPU() { if(!back) return miopenStatusSuccess; @@ -1142,18 +1144,18 @@ int BatchNormDriver::RunBackwardGPU() return miopenStatusSuccess; } -template -int BatchNormDriver::VerifyForward() +template +int BatchNormDriver::VerifyForward() { // jump out since we are forcing forward off when doing backwards. if(!forw) return miopenStatusSuccess; - const Tref maxrms = static_cast((sizeof(Tgpu) == 4) ? RMSTOL_FP32 : RMSTOL_FP16); + const Tref maxrms = static_cast((sizeof(TInput) == 4) ? RMSTOL_FP32 : RMSTOL_FP16); #if(MIO_BN_DEBUG == 1) - const Tref tolerance = static_cast((sizeof(Tgpu) == 4) ? ERRTOL_FP32 : ERRTOL_FP16); + const Tref tolerance = static_cast((sizeof(TInput) == 4) ? 
ERRTOL_FP32 : ERRTOL_FP16); Tref diff = static_cast(0.); #endif @@ -1181,13 +1183,13 @@ int BatchNormDriver::VerifyForward() i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(runMean.GetVector()[i]) - fabs(runMean_ref.data[i]))); + diff = fabs(TAcc(fabs(runMean.GetVector()[i]) - fabs(runMean_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { std::cout << "rm[" << i << "]: " << runMean.GetVector()[i]; std::cout << ", rm_host[" << i << "]: " << runMean_ref.data[i]; std::cout << ", diff[" << i << "]: " - << Tmix(fabs(runMean.GetVector()[i]) - fabs(runMean_ref.data[i])) + << TAcc(fabs(runMean.GetVector()[i]) - fabs(runMean_ref.data[i])) << std::endl; } } @@ -1211,13 +1213,13 @@ int BatchNormDriver::VerifyForward() i++) { diff = fabs( - Tmix(fabs(runVariance.GetVector()[i]) - fabs(runVariance_ref.data[i]))); + TAcc(fabs(runVariance.GetVector()[i]) - fabs(runVariance_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { std::cout << "rv[" << i << "]: " << runVariance.GetVector()[i]; std::cout << ", rv_host[" << i << "]: " << runVariance_ref.data[i]; std::cout << ", diff[" << i << "]: " - << Tmix(fabs(runVariance.GetVector()[i]) - + << TAcc(fabs(runVariance.GetVector()[i]) - fabs(runVariance_ref.data[i])) << std::endl; } @@ -1247,14 +1249,14 @@ int BatchNormDriver::VerifyForward() i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedMean_ref.data[i]))); + diff = fabs(TAcc(fabs(savedMean.GetVector()[i]) - fabs(savedMean_ref.data[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { std::cout << "sm[" << i << "]: " << savedMean.GetVector()[i]; std::cout << ", sm_host[" << i << "]: " << savedMean_ref.data[i]; std::cout << ", diff[" << i << "]: " - << Tmix(fabs(savedMean.GetVector()[i]) - + << TAcc(fabs(savedMean.GetVector()[i]) - fabs(savedMean_ref.data[i])) << std::endl; } @@ -1282,13 +1284,13 @@ int BatchNormDriver::VerifyForward() i++) { diff = fabs( - Tmix(fabs(savedVariance.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); + TAcc(fabs(savedVariance.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { std::cout << "sv[" << i << "]: " << savedVariance.GetVector()[i]; std::cout << ", sv_host[" << i << "]: " << savedVariance_ref.data[i]; std::cout << ", diff[" << i << "]: " - << Tmix(fabs(savedVariance.GetVector()[i]) - + << TAcc(fabs(savedVariance.GetVector()[i]) - fabs(savedVariance_ref.data[i])) << std::endl; } @@ -1358,8 +1360,8 @@ int BatchNormDriver::VerifyForward() return miopenStatusSuccess; } -template -int BatchNormDriver::RunBackwardCPU() +template +int BatchNormDriver::RunBackwardCPU() { if(!back) @@ -1447,15 +1449,16 @@ int BatchNormDriver::RunBackwardCPU() return miopenStatusSuccess; } -template -int BatchNormDriver::VerifyBackward() +template +int BatchNormDriver::VerifyBackward() { if(!back) return miopenStatusSuccess; - const Tref maxrms = static_cast(((sizeof(Tgpu) == 4) ? RMSTOL_FP32 : RMSTOL_FP16) * 1000); - bool anError = false; + const Tref maxrms = + static_cast(((sizeof(TInput) == 4) ? RMSTOL_FP32 : RMSTOL_FP16) * 1000); + bool anError = false; RunBackwardCPU(); @@ -1465,7 +1468,7 @@ int BatchNormDriver::VerifyBackward() #if(MIO_BN_DEBUG == 1) const Tref tolerance = - static_cast(1000 * (sizeof(Tgpu) == 4) ? ERRTOL_FP32 : ERRTOL_FP16); + static_cast(1000 * (sizeof(TInput) == 4) ? 
ERRTOL_FP32 : ERRTOL_FP16); Tref diff = static_cast(0.0); #endif maxval = static_cast(0.0); @@ -1478,14 +1481,14 @@ int BatchNormDriver::VerifyBackward() #if(MIO_BN_DEBUG == 1) for(int i = 0; i < out_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tgpu(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i]))); + diff = fabs(TInput(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { std::cout << "out_ref[" << i << "]: " << out_ref.data[i]; std::cout << "\tout_bwd.GetVector()[" << i << "]: " << out_bwd.GetVector()[i]; std::cout << "\tdiff[" << i - << "]: " << Tgpu(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i])); + << "]: " << TInput(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i])); std::cout << "\tratioH: " << fabs(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i])) / fabs(out_bwd.GetVector()[i]) @@ -1511,14 +1514,14 @@ int BatchNormDriver::VerifyBackward() #if(MIO_BN_DEBUG == 1) for(int i = 0; i < dScale.GetVector().size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - auto diff = fabs(Tmix(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i]))); + auto diff = fabs(TAcc(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { std::cout << "dscale[" << i << "]: " << dScale.GetVector()[i]; std::cout << "\tdscale_host[" << i << "]: " << dScale_ref.data[i]; std::cout << "\tdiff[" << i - << "]: " << Tmix(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])); + << "]: " << TAcc(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])); std::cout << "\tratioH: " << fabs(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])) / fabs(dScale_ref.data[i]) @@ -1543,13 +1546,13 @@ int BatchNormDriver::VerifyBackward() #if(MIO_BN_DEBUG == 1) for(int i = 0; i < dBias.GetVector().size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i]))); + diff = fabs(TAcc(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { std::cout << "dbias[" << i << "]: " << dBias.GetVector()[i]; std::cout << "\tdbias_host[" << i << "]: " << dBias_ref.data[i]; std::cout << "\tdiff[" << i - << "]: " << Tmix(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i])); + << "]: " << TAcc(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i])); std::cout << "\tratioH: " << fabs(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i])) / fabs(dBias_ref.data[i]) diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp index 164c36bb18..1a77050d81 100644 --- a/test/gtest/bn_bwd.cpp +++ b/test/gtest/bn_bwd.cpp @@ -36,12 +36,6 @@ struct GPU_BN_BWD_Large_FP16 { }; -// bf16 NHWC solver accepts is only on CK solver -// XDataType : bfloat16 -// YDataYype : bfloat16 -// ScaleDataType : bfloat16 -// BiasDataType : bfloat16 -// MeanVarDataType : float struct GPU_BN_BWD_Small_BFP16 : BNBwdTest { }; From 14f1a1f4f0670456d1a9722ce73abdf85a32ff2e Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 31 Oct 2024 03:25:01 +0000 Subject: [PATCH 03/13] fix review comments --- driver/bn_driver.hpp | 97 +++++++++++++++++++++----------------------- driver/dm_bnorm.cpp | 2 +- 2 files changed, 48 insertions(+), 51 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 1c9806b629..61197129ab 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -174,9 +174,8 @@ class BatchNormDriver : public Driver miopenTensorLayout_t bn_layout; }; 
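An illustrative note, not part of the patch: across patches 01..03 the driver's single Tgpu/Tmix pair becomes five template parameters, and the hunks above confirm the defaults TAcc = TInput, TScaleBias = TInput and TOut = TInput. The sketch below shows the intended role of each parameter; the parameter order and the sample instantiation are assumptions, since the original template argument lists were lost in this copy of the patch.

    // Assumed parameter order: BatchNormDriver<TInput, Tref, TAcc, TScaleBias, TOut>
    //   TInput     : x (and dx in backward)
    //   Tref       : host-side reference type used by the CPU verification
    //   TAcc       : accumulator for mean/variance (defaults to TInput)
    //   TScaleBias : scale and bias tensors (defaults to TInput)
    //   TOut       : y (and dy in backward, defaults to TInput)
    //
    // Hypothetical instantiation for the "bnormfp16" variant (half data,
    // float statistics), matching the dtype table in driver_arguments.cpp:
    // new BatchNormDriver<float16, double, float, float, float16>();
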
-template -int BatchNormDriver::ParseCmdLineArgs(int argc, - char* argv[]) +template +int BatchNormDriver::ParseCmdLineArgs(int argc, char* argv[]) { inflags.Parse(argc, argv); @@ -188,8 +187,8 @@ int BatchNormDriver::ParseCmdLineArgs return miopenStatusSuccess; } -template -int BatchNormDriver::GetandSetData() +template +int BatchNormDriver::GetandSetData() { std::vector in_len = GetInputTensorLengthsFromCmdLine(); @@ -241,13 +240,13 @@ int BatchNormDriver::GetandSetData() } else if(isBwd) { - out_bwd.AllocOnHost(tensor{bn_layout, in_len}); + out_bwd.AllocOnHost(tensor{bn_layout, in_len}); bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - dy.AllocOnHost(tensor{bn_layout, in_len}); + dy.AllocOnHost(tensor{bn_layout, in_len}); auto gen_var_bwd = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); }; dy.InitHostData(dy.GetTensor().desc.GetElementSize(), true, gen_var_bwd); @@ -258,11 +257,10 @@ int BatchNormDriver::GetandSetData() bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value); - savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_var_bwd); - auto gen_in_var = [](auto...) { return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); }; + savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_in_var); savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), true, gen_in_var); } else @@ -274,8 +272,8 @@ int BatchNormDriver::GetandSetData() return miopenStatusSuccess; } -template -int BatchNormDriver::AddCmdLineArgs() +template +int BatchNormDriver::AddCmdLineArgs() { inflags.AddInputFlag( "forw", @@ -326,9 +324,9 @@ int BatchNormDriver::AddCmdLineArgs() return miopenStatusSuccess; } -template +template std::vector -BatchNormDriver::GetInputTensorLengthsFromCmdLine() +BatchNormDriver::GetInputTensorLengthsFromCmdLine() { int in_n = inflags.GetValueInt("batchsize"); int in_c = inflags.GetValueInt("in_channels"); @@ -350,8 +348,8 @@ BatchNormDriver::GetInputTensorLength } } -template -bool BatchNormDriver::ChkLayout_ShortName() +template +bool BatchNormDriver::ChkLayout_ShortName() { // check for short name of layout type if(inflags.FindShortName("layout") == 'L') @@ -367,8 +365,8 @@ bool BatchNormDriver::ChkLayout_Short } } -template -void BatchNormDriver::ValidateLayoutInputParameters( +template +void BatchNormDriver::ValidateLayoutInputParameters( std::string layout_value) { if(!ChkLayout_ShortName()) @@ -384,8 +382,8 @@ void BatchNormDriver::ValidateLayoutI } } -template -int BatchNormDriver::SetBNParametersFromCmdLineArgs() +template +int BatchNormDriver::SetBNParametersFromCmdLineArgs() { // double bnAlpha = inflags.GetValueDouble("alpha"); @@ -517,8 +515,8 @@ int BatchNormDriver::SetBNParametersF return miopenStatusSuccess; } -template -int BatchNormDriver::AllocateBuffersAndCopy() +template +int BatchNormDriver::AllocateBuffersAndCopy() { status_t status = STATUS_SUCCESS; DEFINE_CONTEXT(ctx); @@ -603,10 +601,10 @@ int BatchNormDriver::AllocateBuffersA return miopenStatusSuccess; } -template -void BatchNormDriver::runGPUFwdInference(Tref epsilon, - float alpha, - float beta) +template +void BatchNormDriver::runGPUFwdInference(Tref epsilon, + float alpha, + float beta) { if(keepRunningMeanVar) @@ -653,11 +651,11 @@ void BatchNormDriver::runGPUFwdInfere return; } -template -void BatchNormDriver::runGPUFwdTrain(Tref epsilon, - Tref eAF, - float alpha, - float beta) +template +void 
BatchNormDriver::runGPUFwdTrain(Tref epsilon, + Tref eAF, + float alpha, + float beta) { if(saveMeanVar && keepRunningMeanVar) { @@ -776,8 +774,8 @@ void BatchNormDriver::runGPUFwdTrain( #endif } -template -int BatchNormDriver::RunForwardGPU() +template +int BatchNormDriver::RunForwardGPU() { float alpha = static_cast(1), beta = static_cast(0); @@ -876,8 +874,8 @@ int BatchNormDriver::RunForwardGPU() return miopenStatusSuccess; } -template -void BatchNormDriver::runCPUFwdInference(Tref epsilon) +template +void BatchNormDriver::runCPUFwdInference(Tref epsilon) { int size{0}; miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); @@ -925,9 +923,8 @@ void BatchNormDriver::runCPUFwdInfere return; } -template -void BatchNormDriver::runCPUFwdTrain(Tref epsilon, - Tref eAF) +template +void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) { int size{0}; miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); @@ -995,8 +992,8 @@ void BatchNormDriver::runCPUFwdTrain( } } -template -int BatchNormDriver::RunForwardCPU() +template +int BatchNormDriver::RunForwardCPU() { // T alpha = 0., beta = 0.; Tref epsilon = static_cast(EPSILON); @@ -1024,8 +1021,8 @@ int BatchNormDriver::RunForwardCPU() return miopenStatusSuccess; } -template -int BatchNormDriver::RunBackwardGPU() +template +int BatchNormDriver::RunBackwardGPU() { if(!back) return miopenStatusSuccess; @@ -1144,8 +1141,8 @@ int BatchNormDriver::RunBackwardGPU() return miopenStatusSuccess; } -template -int BatchNormDriver::VerifyForward() +template +int BatchNormDriver::VerifyForward() { // jump out since we are forcing forward off when doing backwards. @@ -1360,8 +1357,8 @@ int BatchNormDriver::VerifyForward() return miopenStatusSuccess; } -template -int BatchNormDriver::RunBackwardCPU() +template +int BatchNormDriver::RunBackwardCPU() { if(!back) @@ -1449,8 +1446,8 @@ int BatchNormDriver::RunBackwardCPU() return miopenStatusSuccess; } -template -int BatchNormDriver::VerifyBackward() +template +int BatchNormDriver::VerifyBackward() { if(!back) @@ -1481,14 +1478,14 @@ int BatchNormDriver::VerifyBackward() #if(MIO_BN_DEBUG == 1) for(int i = 0; i < out_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(TInput(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i]))); + diff = fabs(TOut(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i]))); maxval = maxval < diff ? 
diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { std::cout << "out_ref[" << i << "]: " << out_ref.data[i]; std::cout << "\tout_bwd.GetVector()[" << i << "]: " << out_bwd.GetVector()[i]; std::cout << "\tdiff[" << i - << "]: " << TInput(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i])); + << "]: " << TOut(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i])); std::cout << "\tratioH: " << fabs(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i])) / fabs(out_bwd.GetVector()[i]) diff --git a/driver/dm_bnorm.cpp b/driver/dm_bnorm.cpp index 6381a8a83c..0ee3da5d0a 100644 --- a/driver/dm_bnorm.cpp +++ b/driver/dm_bnorm.cpp @@ -30,7 +30,7 @@ static Driver* makeDriver(const std::string& base_arg) { if(base_arg == "bnorm") return new BatchNormDriver(); - // + // if(base_arg == "bnormfp16") return new BatchNormDriver(); if(base_arg == "bnormfp16fp32") From 6a5cc924adeef3e175f25a9e7dc743c3e06a8ea9 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 1 Nov 2024 01:32:47 +0000 Subject: [PATCH 04/13] fix isApplicable for CK and OCL bn spatial --- src/batchnorm/problem_description.cpp | 16 ++++---- .../miopen/batchnorm/problem_description.hpp | 29 ++++++++------ src/solver/batchnorm/backward_ck.cpp | 17 +++++++- .../batchnorm/backward_per_activation.cpp | 5 +-- .../batchnorm/backward_spatial_multiple.cpp | 10 ++--- .../batchnorm/backward_spatial_single.cpp | 39 +++++++++++++------ src/solver/batchnorm/forward_inference.cpp | 28 +++++++++---- src/solver/batchnorm/forward_inference_ck.cpp | 16 +++++++- .../batchnorm/forward_per_activation.cpp | 2 +- .../batchnorm/forward_spatial_multiple.cpp | 2 +- .../batchnorm/forward_spatial_single.cpp | 37 +++++++++++------- src/solver/batchnorm/forward_training_ck.cpp | 15 ++++++- 12 files changed, 149 insertions(+), 67 deletions(-) diff --git a/src/batchnorm/problem_description.cpp b/src/batchnorm/problem_description.cpp index ac63fdf73e..7eb2592c46 100644 --- a/src/batchnorm/problem_description.cpp +++ b/src/batchnorm/problem_description.cpp @@ -67,7 +67,7 @@ NetworkConfig ProblemDescription::MakeForwardTrainingNetworkConfig() const size_t ygridsize = 1; bool bfpmixparm = false; - if(xDesc.GetType() == miopenHalf && GetBnScaleBiasMeanVarDesc().GetType() == miopenFloat) + if(IsMix()) { bfpmixparm = true; } @@ -137,7 +137,7 @@ NetworkConfig ProblemDescription::MakeForwardTrainingNetworkConfig() const ss << "fp16" << static_cast(IsFp16()); ss << "fp32" << static_cast(IsFp32()); ss << "fp64" << static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBfp16()); + ss << "fbf16" << static_cast(IsBFp16()); ss << "fmix" << static_cast(IsMix()); ss << "c" << c; } @@ -154,7 +154,7 @@ NetworkConfig ProblemDescription::MakeForwardTrainingNetworkConfig() const ss << "fp16" << static_cast(IsFp16()); ss << "fp32" << static_cast(IsFp32()); ss << "fp64" << static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBfp16()); + ss << "fbf16" << static_cast(IsBFp16()); ss << "fmix" << static_cast(IsMix()); ss << "single" << static_cast(single); ss << "n" << n; @@ -173,7 +173,7 @@ NetworkConfig ProblemDescription::MakeForwardTrainingNetworkConfig() const ss << "fp16" << static_cast(IsFp16()); ss << "fp32" << static_cast(IsFp32()); ss << "fp64" << static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBfp16()); + ss << "fbf16" << static_cast(IsBFp16()); ss << "fmix" << static_cast(IsMix()); ss << "gx" << xgridsize; ss << "gy" << ygridsize; @@ -203,7 +203,7 @@ NetworkConfig ProblemDescription::MakeForwardInferenceNetworkConfig() const ss << "fp16" << static_cast(IsFp16()); ss 
<< "fp32" << static_cast(IsFp32()); ss << "fp64" << static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBfp16()); + ss << "fbf16" << static_cast(IsBFp16()); ss << "fmix" << static_cast(IsMix()); ss << "mode" << bn_mode; ss << "HWdims" << in_cstride; @@ -218,7 +218,7 @@ NetworkConfig ProblemDescription::MakeBackwardNetworkConfig() const std::ostringstream ss; bool bfpmixparm = false; - if(xDesc.GetType() == miopenHalf && GetScaleBiasDiffDesc().GetType() == miopenFloat) + if(xDesc.GetType() == miopenHalf && GetBnScale().GetType() == miopenFloat) { bfpmixparm = true; } @@ -311,7 +311,7 @@ NetworkConfig ProblemDescription::MakeBackwardNetworkConfig() const ss << "fp16" << static_cast(IsFp16()); ss << "fp32" << static_cast(IsFp32()); ss << "fp64" << static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBfp16()); + ss << "fbf16" << static_cast(IsBFp16()); ss << "fmix" << static_cast(IsMix()); ss << "single" << static_cast(single); ss << "gcn" << ldsgcn; @@ -334,7 +334,7 @@ NetworkConfig ProblemDescription::MakeBackwardNetworkConfig() const ss << "fp16" << static_cast(IsFp16()); ss << "fp32" << static_cast(IsFp32()); ss << "fp64" << static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBfp16()); + ss << "fbf16" << static_cast(IsBFp16()); ss << "fmix" << static_cast(IsMix()); ss << "nhw" << in_nhw; } diff --git a/src/include/miopen/batchnorm/problem_description.hpp b/src/include/miopen/batchnorm/problem_description.hpp index b0ecb64b7e..a1ed9d2594 100644 --- a/src/include/miopen/batchnorm/problem_description.hpp +++ b/src/include/miopen/batchnorm/problem_description.hpp @@ -52,6 +52,16 @@ struct ProblemDescriptionTag { }; +inline bool is_fp16_or_bfp16(miopenDataType_t type) +{ + return ((type == miopenHalf) || (type == miopenBFloat16)); +} + +inline bool is_fp32_or_fp64(miopenDataType_t type) +{ + return ((type == miopenFloat) || (type == miopenDouble)); +} + struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, ProblemDescriptionTag #if MIOPEN_ENABLE_SQLITE @@ -173,13 +183,13 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, return dxDesc; } - const TensorDescriptor& GetBnScaleBiasMeanVarDesc() const - { - assert(direction == Direction::ForwardTraining || direction == Direction::ForwardInference); - return scaleDesc; - } + const TensorDescriptor& GetBnScale() const { return scaleDesc; } + + const TensorDescriptor& GetBnBias() const { return biasDesc; } + + const TensorDescriptor& GetBnSMean() const { return sMeanDesc; } - const TensorDescriptor& GetScaleBiasDiffDesc() const { return scaleDesc; } + const TensorDescriptor& GetBnSVar() const { return sVarianceDesc; } bool GetResultSave() const { @@ -233,11 +243,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, bool IsFp64() const { return xDesc.GetType() == miopenDouble; } bool IsFp32() const { return xDesc.GetType() == miopenFloat; } bool IsFp16() const { return xDesc.GetType() == miopenHalf; } - bool IsMix() const - { - return xDesc.GetType() == miopenHalf && sMeanDesc.GetType() == miopenFloat; - } - bool IsBfp16() const { return xDesc.GetType() == miopenBFloat16; } + bool IsBFp16() const { return xDesc.GetType() == miopenBFloat16; } + bool IsMix() const { return (IsFp16() || IsBFp16()) && sMeanDesc.GetType() == miopenFloat; } void Serialize(std::ostream& stream) const { stream << MakeNetworkConfig().ToString(); } diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp index 198b046eff..fd2613f807 100644 --- 
a/src/solver/batchnorm/backward_ck.cpp +++ b/src/solver/batchnorm/backward_ck.cpp @@ -367,12 +367,25 @@ bool BnCKBwdBackward::IsApplicable( return false; if(bn_problem.GetDirection() != miopen::batchnorm::Direction::Backward) return false; - if(bn_problem.GetXDesc().GetType() != bn_problem.GetScaleBiasDiffDesc().GetType()) - return false; if(bn_problem.GetMode() != miopenBNSpatial) return false; if(!bn_problem.Is2D()) return false; + // case 1 : fp16 or bfp16 + if(!((::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && + bn_problem.GetDXDesc().GetType() == miopenFloat && + ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetBnScale().GetType()) && + bn_problem.GetDYDesc().GetType() == miopenFloat && + bn_problem.GetBnSMean().GetType() == miopenFloat && + bn_problem.GetBnSVar().GetType() == miopenFloat) || + // case 1 : fp32 or fp64 + (::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetXDesc().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetYDesc().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnScale().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnBias().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnSMean().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnSVar().GetType())))) + return false; switch(bn_problem.GetXDesc().GetType()) { diff --git a/src/solver/batchnorm/backward_per_activation.cpp b/src/solver/batchnorm/backward_per_activation.cpp index af52fbc339..9a02ad9f25 100644 --- a/src/solver/batchnorm/backward_per_activation.cpp +++ b/src/solver/batchnorm/backward_per_activation.cpp @@ -57,14 +57,13 @@ BnBwdTrainingPerActivation::GetSolution(const ExecutionContext& context, bool bfp16parm = false; bool bfp32parm = true; - if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetScaleBiasDiffDesc().GetType() == miopenHalf) + if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) { bfp16parm = true; bfp32parm = false; } else if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetScaleBiasDiffDesc().GetType() == miopenFloat) + problem.GetBnScale().GetType() == miopenFloat) { bfpmixparm = true; bfp32parm = false; diff --git a/src/solver/batchnorm/backward_spatial_multiple.cpp b/src/solver/batchnorm/backward_spatial_multiple.cpp index 7fa9c0f89a..d9c5596ce2 100644 --- a/src/solver/batchnorm/backward_spatial_multiple.cpp +++ b/src/solver/batchnorm/backward_spatial_multiple.cpp @@ -50,8 +50,7 @@ bool BnBwdTrainingSpatialMultiple::IsApplicable( } #if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR - if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetScaleBiasDiffDesc().GetType() == miopenHalf) + if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) { // bfp16parm = true; // Unsupported kernel mode, error in kernel code @@ -72,14 +71,13 @@ ConvSolution BnBwdTrainingSpatialMultiple::GetSolution( bool bfp16parm = false; bool bfp32parm = true; - if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetScaleBiasDiffDesc().GetType() == miopenHalf) + if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) { bfp16parm = true; bfp32parm = false; } else if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetScaleBiasDiffDesc().GetType() == miopenFloat) + problem.GetBnScale().GetType() == miopenFloat) { bfpmixparm = true; bfp32parm = false; @@ -274,7 +272,7 @@ ConvSolution BnBwdTrainingSpatialMultiple::GetSolution( } } - const auto 
dtype = problem.GetScaleBiasDiffDesc().GetType(); + const auto dtype = problem.GetBnScale().GetType(); const auto useSaved = problem.UseSaved(); result.invoker_factory = [=](const std::vector& kernels) { diff --git a/src/solver/batchnorm/backward_spatial_single.cpp b/src/solver/batchnorm/backward_spatial_single.cpp index 86fa5a68c7..c8d8681488 100644 --- a/src/solver/batchnorm/backward_spatial_single.cpp +++ b/src/solver/batchnorm/backward_spatial_single.cpp @@ -40,17 +40,17 @@ namespace solver { namespace batchnorm { bool BnBwdTrainingSpatialSingle::IsApplicable( - const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const + const ExecutionContext&, const miopen::batchnorm::ProblemDescription& bn_problem) const { - if(problem.GetDirection() != miopen::batchnorm::Direction::Backward || - problem.GetMode() != miopenBNSpatial) + if(bn_problem.GetDirection() != miopen::batchnorm::Direction::Backward || + bn_problem.GetMode() != miopenBNSpatial) return false; - if(!problem.Is2D()) + if(!bn_problem.Is2D()) return false; #if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR - if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetScaleBiasDiffDesc().GetType() == miopenHalf) + if(bn_problem.GetXDesc().GetType() == miopenHalf && + bn_problem.GetBnScale().GetType() == miopenHalf) { // bfp16parm = true; // Unsupported kernel mode, error in kernel code @@ -59,11 +59,27 @@ bool BnBwdTrainingSpatialSingle::IsApplicable( } #endif - if(problem.IsLayoutNHWC()) + if(bn_problem.IsLayoutNHWC()) return true; + // case 1 : fp16 or bfp16 + if(!((::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && + ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetDXDesc().GetType()) && + ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetDYDesc().GetType()) && + bn_problem.GetBnScale().GetType() == miopenFloat && + bn_problem.GetBnSMean().GetType() == miopenFloat && + bn_problem.GetBnSVar().GetType() == miopenFloat) || + // case 1 : fp32 or fp64 + (::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetXDesc().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetYDesc().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnScale().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnBias().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnSMean().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnSVar().GetType())))) + return false; + int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + std::tie(n, c, h, w) = tien<4>(bn_problem.GetXDesc().GetLengths()); unsigned int in_cstride = h * w; unsigned int in_nhw = n * in_cstride; @@ -83,14 +99,13 @@ BnBwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, bool bfp16parm = false; bool bfp32parm = true; - if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetScaleBiasDiffDesc().GetType() == miopenHalf) + if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) { bfp16parm = true; bfp32parm = false; } else if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetScaleBiasDiffDesc().GetType() == miopenFloat) + problem.GetBnScale().GetType() == miopenFloat) { bfpmixparm = true; bfp32parm = false; @@ -278,7 +293,7 @@ BnBwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, result.construction_params.push_back(kernel); } - const auto dtype = problem.GetScaleBiasDiffDesc().GetType(); + const auto dtype = problem.GetBnScale().GetType(); const auto useSaved 
= problem.UseSaved(); result.invoker_factory = [=](const std::vector& kernels) { diff --git a/src/solver/batchnorm/forward_inference.cpp b/src/solver/batchnorm/forward_inference.cpp index 2f945c46a9..f62c161092 100644 --- a/src/solver/batchnorm/forward_inference.cpp +++ b/src/solver/batchnorm/forward_inference.cpp @@ -39,16 +39,29 @@ namespace solver { namespace batchnorm { bool BnFwdInference::IsApplicable(const ExecutionContext&, - const miopen::batchnorm::ProblemDescription& problem) const + const miopen::batchnorm::ProblemDescription& bn_problem) const { - if(problem.IsLayoutNHWC()) + if(bn_problem.IsLayoutNHWC()) return false; - if(problem.GetDirection() != miopen::batchnorm::Direction::ForwardInference) + if(bn_problem.GetDirection() != miopen::batchnorm::Direction::ForwardInference) return false; - if(!(problem.IsFp32() or problem.IsFp16())) + if(!(bn_problem.IsFp32() or bn_problem.IsFp16())) return false; - if(!problem.Is2D()) + if(!bn_problem.Is2D()) return false; + + // case 1 : mix type + if(!((bn_problem.GetXDesc().GetType() == miopenHalf && + bn_problem.GetYDesc().GetType() == miopenHalf && + bn_problem.GetBnScale().GetType() == miopenFloat && + bn_problem.GetBnBias().GetType() == miopenFloat) || + // case 2 : float type + (bn_problem.GetXDesc().GetType() == miopenFloat && + bn_problem.GetYDesc().GetType() == miopenFloat && + bn_problem.GetBnScale().GetType() == miopenFloat && + bn_problem.GetBnBias().GetType() == miopenFloat))) + return false; + return true; } @@ -60,14 +73,13 @@ ConvSolution BnFwdInference::GetSolution(const ExecutionContext& context, bool bfpmixparm = false; bool bfp16parm = false; bool bfp32parm = true; - if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetBnScaleBiasMeanVarDesc().GetType() == miopenHalf) + if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) { bfp16parm = true; bfp32parm = false; } else if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetBnScaleBiasMeanVarDesc().GetType() == miopenFloat) + problem.GetBnScale().GetType() == miopenFloat) { bfpmixparm = true; bfp32parm = false; diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index 529065d2ae..c4fc3b00ef 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -334,7 +334,21 @@ bool BnCKFwdInference::IsApplicable( return false; if(bn_problem.GetMode() != miopenBNSpatial) return false; - if(bn_problem.GetXDesc().GetType() != bn_problem.GetScaleBiasDiffDesc().GetType()) + + // case 1 : fp16 or bfp16 + if(!((::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && + ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) && + ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetBnScale().GetType()) && + ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetBnBias().GetType()) && + bn_problem.GetBnSMean().GetType() == miopenFloat && + bn_problem.GetBnSVar().GetType() == miopenFloat) || + // case 1 : fp32 or fp64 + (::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetXDesc().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetYDesc().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnScale().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnBias().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnSMean().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnSVar().GetType())))) return false; 
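A minimal sketch, not part of the patch, of the datatype gate that these IsApplicable changes share; it assumes only the is_fp16_or_bfp16 / is_fp32_or_fp64 helpers added to problem_description.hpp in this series. The real per-solver checks additionally constrain the scale, bias and output descriptors:

    // Accept low-precision tensors with float statistics, or a uniformly
    // fp32/fp64 problem; anything else makes the solver not applicable.
    static bool HasSupportedBnTypes(const miopen::batchnorm::ProblemDescription& p)
    {
        using miopen::batchnorm::is_fp16_or_bfp16;
        using miopen::batchnorm::is_fp32_or_fp64;
        const bool low_precision = is_fp16_or_bfp16(p.GetXDesc().GetType()) &&
                                   p.GetBnSMean().GetType() == miopenFloat &&
                                   p.GetBnSVar().GetType() == miopenFloat;
        const bool full_precision = is_fp32_or_fp64(p.GetXDesc().GetType()) &&
                                    is_fp32_or_fp64(p.GetBnSMean().GetType()) &&
                                    is_fp32_or_fp64(p.GetBnSVar().GetType());
        return low_precision || full_precision;
    }
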
switch(bn_problem.GetXDesc().GetType()) diff --git a/src/solver/batchnorm/forward_per_activation.cpp b/src/solver/batchnorm/forward_per_activation.cpp index 0e4d6da799..1f653e9b7d 100644 --- a/src/solver/batchnorm/forward_per_activation.cpp +++ b/src/solver/batchnorm/forward_per_activation.cpp @@ -61,7 +61,7 @@ BnFwdTrainingPerActivation::GetSolution(const ExecutionContext& context, { decltype(auto) handle = context.GetStream(); - decltype(auto) bnScaleBiasMeanVarDesc = problem.GetBnScaleBiasMeanVarDesc(); + decltype(auto) bnScaleBiasMeanVarDesc = problem.GetBnScale(); unsigned int in_nhw = n * in_cstride; unsigned int in_nchw = n * in_nstride; diff --git a/src/solver/batchnorm/forward_spatial_multiple.cpp b/src/solver/batchnorm/forward_spatial_multiple.cpp index 9f85c8d7ab..f8bde6be6a 100644 --- a/src/solver/batchnorm/forward_spatial_multiple.cpp +++ b/src/solver/batchnorm/forward_spatial_multiple.cpp @@ -55,7 +55,7 @@ ConvSolution BnFwdTrainingSpatialMultiple::GetSolution( { const auto& handle = context.GetStream(); const auto& xDesc = problem.GetXDesc(); - const auto& bnScaleBiasMeanVarDesc = problem.GetBnScaleBiasMeanVarDesc(); + const auto& bnScaleBiasMeanVarDesc = problem.GetBnScale(); int n, c, h, w; std::tie(n, c, h, w) = tien<4>(xDesc.GetLengths()); diff --git a/src/solver/batchnorm/forward_spatial_single.cpp b/src/solver/batchnorm/forward_spatial_single.cpp index 732181073a..fd30c8ffba 100644 --- a/src/solver/batchnorm/forward_spatial_single.cpp +++ b/src/solver/batchnorm/forward_spatial_single.cpp @@ -40,17 +40,29 @@ namespace solver { namespace batchnorm { bool BnFwdTrainingSpatialSingle::IsApplicable( - const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const + const ExecutionContext&, const miopen::batchnorm::ProblemDescription& bn_problem) const { - if(problem.GetDirection() != miopen::batchnorm::Direction::ForwardTraining || - problem.GetMode() != miopenBNSpatial) + if(bn_problem.GetDirection() != miopen::batchnorm::Direction::ForwardTraining || + bn_problem.GetMode() != miopenBNSpatial) return false; - if(problem.IsLayoutNHWC()) + if(bn_problem.IsLayoutNHWC()) return true; + // case 1 : mix type + if(!((bn_problem.GetXDesc().GetType() == miopenHalf && + bn_problem.GetYDesc().GetType() == miopenHalf && + bn_problem.GetBnScale().GetType() == miopenFloat && + bn_problem.GetBnBias().GetType() == miopenFloat) || + // case 2 : float type + (bn_problem.GetXDesc().GetType() == miopenFloat && + bn_problem.GetYDesc().GetType() == miopenFloat && + bn_problem.GetBnScale().GetType() == miopenFloat && + bn_problem.GetBnBias().GetType() == miopenFloat))) + return false; + int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + std::tie(n, c, h, w) = tien<4>(bn_problem.GetXDesc().GetLengths()); unsigned int in_cstride = h * w; unsigned int in_nhw = n * in_cstride; @@ -58,13 +70,13 @@ bool BnFwdTrainingSpatialSingle::IsApplicable( bool bfpmixparm = false; bool bfp32parm = true; - if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetBnScaleBiasMeanVarDesc().GetType() == miopenHalf) + if(bn_problem.GetXDesc().GetType() == miopenHalf && + bn_problem.GetBnScale().GetType() == miopenHalf) { bfp32parm = false; } - else if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetBnScaleBiasMeanVarDesc().GetType() == miopenFloat) + else if(bn_problem.GetXDesc().GetType() == miopenHalf && + bn_problem.GetBnScale().GetType() == miopenFloat) { bfpmixparm = true; bfp32parm = false; @@ -97,14 +109,13 @@ 
BnFwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, bool bfp16parm = false; bool bfp32parm = true; - if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetBnScaleBiasMeanVarDesc().GetType() == miopenHalf) + if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) { bfp16parm = true; bfp32parm = false; } else if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetBnScaleBiasMeanVarDesc().GetType() == miopenFloat) + problem.GetBnScale().GetType() == miopenFloat) { bfpmixparm = true; bfp32parm = false; @@ -238,7 +249,7 @@ BnFwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, result.construction_params.push_back(kernel); } - const auto dtype = problem.GetBnScaleBiasMeanVarDesc().GetType(); + const auto dtype = problem.GetBnScale().GetType(); const auto vn4 = (variant != 4); result.invoker_factory = [=](const std::vector& kernels) { diff --git a/src/solver/batchnorm/forward_training_ck.cpp b/src/solver/batchnorm/forward_training_ck.cpp index fec919cf79..62c6a7932f 100644 --- a/src/solver/batchnorm/forward_training_ck.cpp +++ b/src/solver/batchnorm/forward_training_ck.cpp @@ -361,7 +361,20 @@ bool BnCKFwdTraining::IsApplicable( return false; if(bn_problem.GetMode() != miopenBNSpatial) return false; - if(bn_problem.GetXDesc().GetType() != bn_problem.GetScaleBiasDiffDesc().GetType()) + // case 1 : fp16 or bfp16 + if(!((::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && + ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) && + ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetBnScale().GetType()) && + ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetBnBias().GetType()) && + bn_problem.GetBnSMean().GetType() == miopenFloat && + bn_problem.GetBnSVar().GetType() == miopenFloat) || + // case 1 : fp32 or fp64 + (::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetXDesc().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetYDesc().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnScale().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnBias().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnSMean().GetType()) && + ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnSVar().GetType())))) return false; switch(bn_problem.GetXDesc().GetType()) From 0242120b60af764fd6acfea1f6814d1d34dedcdc Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Sat, 2 Nov 2024 03:30:28 +0000 Subject: [PATCH 05/13] fix gtest issue in bn for ck and ocl --- src/batch_norm_api.cpp | 41 ++++-- src/driver_arguments.cpp | 119 +++++++++++++++--- src/fusion.cpp | 21 ++-- src/include/miopen/driver_arguments.hpp | 5 +- .../batchnorm/backward_spatial_single.cpp | 3 - src/solver/batchnorm/forward_inference.cpp | 7 +- .../batchnorm/forward_spatial_single.cpp | 7 +- test/fusionHost.hpp | 3 +- test/gtest/bn.hpp | 25 ++-- test/gtest/bn_bwd.cpp | 52 ++++---- test/gtest/bn_fwd_train.cpp | 62 ++++----- test/gtest/bn_infer.cpp | 58 ++++----- test/gtest/test_operations.hpp | 9 +- 13 files changed, 259 insertions(+), 153 deletions(-) diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp index d3b824cee0..59828ebdb5 100644 --- a/src/batch_norm_api.cpp +++ b/src/batch_norm_api.cpp @@ -50,7 +50,10 @@ namespace miopen { namespace debug { void LogCmdBNorm(const miopenTensorDescriptor_t xDesc, - const miopenTensorDescriptor_t sMeanDesc, + const miopenTensorDescriptor_t yDesc, + const miopenTensorDescriptor_t scaleDesc, + const 
miopenTensorDescriptor_t biasDesc, + const miopenTensorDescriptor_t saveMeanDesc, miopenBatchNormMode_t bn_mode, const void* resultRunningMean, const void* resultRunningVariance, @@ -61,7 +64,10 @@ void LogCmdBNorm(const miopenTensorDescriptor_t xDesc, if(miopen::IsLoggingCmd()) { const std::string& str = BnormArgsForMIOpenDriver(xDesc, - sMeanDesc, + yDesc, + scaleDesc, + biasDesc, + saveMeanDesc, bn_mode, resultRunningMean, resultRunningVariance, @@ -206,7 +212,7 @@ miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, const miopenTensorDescriptor_t yDesc, void* y, const miopenTensorDescriptor_t scaleDesc, - const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t biasDesc, const miopenTensorDescriptor_t estMeanDesc, const miopenTensorDescriptor_t estVarianceDesc, void* bnScale, @@ -222,7 +228,7 @@ miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, yDesc, y, scaleDesc, - BiasDesc, + biasDesc, estMeanDesc, estVarianceDesc, bnScale, @@ -232,12 +238,15 @@ miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, epsilon); miopen::debug::LogCmdBNorm(xDesc, + yDesc, + scaleDesc, + biasDesc, estMeanDesc, bn_mode, - estimatedMean, - estimatedVariance, nullptr, nullptr, + estMeanDesc, + estimatedVariance, miopen::debug::BatchNormDirection_t::ForwardInference); // In case of NxCxDxHxW @@ -256,7 +265,7 @@ miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, : miopen::deref(yDesc), DataCast(y), miopen::deref(scaleDesc), - miopen::deref(BiasDesc), + miopen::deref(biasDesc), miopen::deref(estMeanDesc), miopen::deref(estVarianceDesc), DataCast(bnScale), @@ -277,7 +286,7 @@ miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, const miopenTensorDescriptor_t yDesc, void* y, const miopenTensorDescriptor_t scaleDesc, - const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t biasDesc, const miopenTensorDescriptor_t savedMeanDesc, const miopenTensorDescriptor_t savedVarianceDesc, void* bnScale, @@ -296,7 +305,7 @@ miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, yDesc, y, scaleDesc, - BiasDesc, + biasDesc, savedMeanDesc, savedVarianceDesc, bnScale, @@ -309,6 +318,9 @@ miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, resultSaveInvVariance); miopen::debug::LogCmdBNorm(xDesc, + yDesc, + scaleDesc, + biasDesc, savedMeanDesc, bn_mode, resultRunningMean, @@ -332,7 +344,7 @@ miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, : miopen::deref(yDesc), DataCast(y), miopen::deref(scaleDesc), - miopen::deref(BiasDesc), + miopen::deref(biasDesc), miopen::deref(savedMeanDesc), miopen::deref(savedVarianceDesc), DataCast(bnScale), @@ -360,7 +372,7 @@ miopenBatchNormalizationBackward_V2(miopenHandle_t handle, const miopenTensorDescriptor_t dxDesc, void* dx, const miopenTensorDescriptor_t scaleDesc, - const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t biasDesc, const miopenTensorDescriptor_t savedMeanDesc, const miopenTensorDescriptor_t savedVarianceDesc, const void* bnScale, @@ -379,7 +391,7 @@ miopenBatchNormalizationBackward_V2(miopenHandle_t handle, dxDesc, dx, scaleDesc, - BiasDesc, + biasDesc, savedMeanDesc, savedVarianceDesc, bnScale, @@ -389,6 +401,9 @@ miopenBatchNormalizationBackward_V2(miopenHandle_t handle, savedMean, savedInvVariance); miopen::debug::LogCmdBNorm(xDesc, + dyDesc, + scaleDesc, + biasDesc, savedMeanDesc, bn_mode, nullptr, @@ -417,7 +432,7 @@ miopenBatchNormalizationBackward_V2(miopenHandle_t handle, : miopen::deref(dxDesc), 
DataCast(dx), miopen::deref(scaleDesc), - miopen::deref(BiasDesc), + miopen::deref(biasDesc), miopen::deref(savedMeanDesc), miopen::deref(savedVarianceDesc), DataCast(bnScale), diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp index e75ec31902..b7970d9833 100644 --- a/src/driver_arguments.cpp +++ b/src/driver_arguments.cpp @@ -64,27 +64,105 @@ void ConvDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc) // We choose scaleMean because its a accumulator type. void BnDataType(std::stringstream& ss, const miopen::TensorDescriptor& xDesc, - const miopen::TensorDescriptor& sMeanDesc) + const miopen::TensorDescriptor& yDesc, + const miopen::TensorDescriptor& scaleDesc, + const miopen::TensorDescriptor& biasDesc, + const miopen::TensorDescriptor& sMeanDesc, + const BatchNormDirection_t bn_mode) { - if(xDesc.GetType() == miopenHalf && sMeanDesc.GetType() == miopenHalf) + if(bn_mode == BatchNormDirection_t::ForwardInference || + bn_mode == BatchNormDirection_t::ForwardTraining) { - ss << "bnormfp16"; - } - else if(xDesc.GetType() == miopenBFloat16 && sMeanDesc.GetType() == miopenBFloat16) - { - ss << "bnormbfp16"; - } - else if(xDesc.GetType() == miopenHalf && sMeanDesc.GetType() == miopenFloat) - { - ss << "bnormfp16fp32"; + if(xDesc.GetType() == miopenHalf && yDesc.GetType() == miopenHalf && + scaleDesc.GetType() == miopenFloat && biasDesc.GetType() == miopenFloat && + sMeanDesc.GetType() == miopenFloat) + { + ss << "bnormfp16"; + } + else if(xDesc.GetType() == miopenBFloat16 && yDesc.GetType() == miopenBFloat16 && + scaleDesc.GetType() == miopenFloat && biasDesc.GetType() == miopenFloat && + sMeanDesc.GetType() == miopenFloat) + { + ss << "bnormbfp16"; + } + else if(xDesc.GetType() == miopenHalf && yDesc.GetType() == miopenHalf && + scaleDesc.GetType() == miopenHalf && biasDesc.GetType() == miopenHalf && + sMeanDesc.GetType() == miopenFloat) + { + ss << "bnormfp16fp32"; + } + else if(xDesc.GetType() == miopenBFloat16 && yDesc.GetType() == miopenBFloat16 && + scaleDesc.GetType() == miopenBFloat16 && biasDesc.GetType() == miopenBFloat16 && + sMeanDesc.GetType() == miopenFloat) + { + ss << "bnormbfp16fp32"; + } + else + { + ss << "bnorm"; + } } - else if(xDesc.GetType() == miopenBFloat16 && sMeanDesc.GetType() == miopenFloat) + else if(bn_mode == BatchNormDirection_t::ForwardTraining) { - ss << "bnormbfp16fp32"; + if(xDesc.GetType() == miopenHalf && yDesc.GetType() == miopenHalf && + scaleDesc.GetType() == miopenFloat && biasDesc.GetType() == miopenFloat && + sMeanDesc.GetType() == miopenFloat) + { + ss << "bnormfp16"; + } + else if(xDesc.GetType() == miopenBFloat16 && yDesc.GetType() == miopenBFloat16 && + scaleDesc.GetType() == miopenFloat && biasDesc.GetType() == miopenFloat && + sMeanDesc.GetType() == miopenFloat) + { + ss << "bnormbfp16"; + } + else if(xDesc.GetType() == miopenHalf && yDesc.GetType() == miopenHalf && + scaleDesc.GetType() == miopenHalf && biasDesc.GetType() == miopenHalf && + sMeanDesc.GetType() == miopenFloat) + { + ss << "bnormfp16fp32"; + } + else if(xDesc.GetType() == miopenBFloat16 && yDesc.GetType() == miopenBFloat16 && + scaleDesc.GetType() == miopenBFloat16 && biasDesc.GetType() == miopenBFloat16 && + sMeanDesc.GetType() == miopenFloat) + { + ss << "bnormbfp16fp32"; + } + else + { + ss << "bnorm"; + } } - else + else if(bn_mode == BatchNormDirection_t::Backward) { - ss << "bnorm"; + if(xDesc.GetType() == miopenHalf && yDesc.GetType() == miopenHalf && + scaleDesc.GetType() == miopenFloat && biasDesc.GetType() == miopenFloat && + 
sMeanDesc.GetType() == miopenFloat) + { + ss << "bnormfp16"; + } + else if(xDesc.GetType() == miopenBFloat16 && yDesc.GetType() == miopenBFloat16 && + scaleDesc.GetType() == miopenFloat && biasDesc.GetType() == miopenFloat && + sMeanDesc.GetType() == miopenFloat) + { + ss << "bnormbfp16"; + } + else if(xDesc.GetType() == miopenHalf && yDesc.GetType() == miopenFloat && + scaleDesc.GetType() == miopenHalf && biasDesc.GetType() == miopenFloat && + sMeanDesc.GetType() == miopenFloat) + { + ss << "bnormfp16fp32"; + } + else if(xDesc.GetType() == miopenBFloat16 && yDesc.GetType() == miopenFloat && + scaleDesc.GetType() == miopenBFloat16 && biasDesc.GetType() == miopenFloat && + sMeanDesc.GetType() == miopenFloat) + { + ss << "bnormbfp16fp32"; + } + else + { + ss << "bnorm"; + } } } @@ -228,6 +306,9 @@ std::string ConvArgsForMIOpenDriver(const miopen::TensorDescriptor& xDesc, } std::string BnormArgsForMIOpenDriver(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t yDesc, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t biasDesc, const miopenTensorDescriptor_t sMeanDesc, miopenBatchNormMode_t bn_mode, const void* resultRunningMean, @@ -241,7 +322,13 @@ std::string BnormArgsForMIOpenDriver(const miopenTensorDescriptor_t xDesc, miopenGetTensorDescriptorSize(xDesc, &size); std::stringstream ss; if(print_for_bn_driver) - BnDataType(ss, miopen::deref(xDesc), miopen::deref(sMeanDesc)); + BnDataType(ss, + miopen::deref(xDesc), + miopen::deref(yDesc), + miopen::deref(scaleDesc), + miopen::deref(biasDesc), + miopen::deref(sMeanDesc), + dir); ss << " -n " << miopen::deref(xDesc).GetLengths()[0] // clang-format off << " -c " << miopen::deref(xDesc).GetLengths()[1]; diff --git a/src/fusion.cpp b/src/fusion.cpp index e536f6a1a1..3c52f6c2a0 100644 --- a/src/fusion.cpp +++ b/src/fusion.cpp @@ -389,15 +389,18 @@ std::string LogCmdBnormFusion(const miopenFusionPlanDescriptor_t fusePlanDesc, i if(bn_op != nullptr) { - str += BnormArgsForMIOpenDriver(&bn_op->input_desc, - &bn_op->base_desc, - bn_op->mode, - nullptr, - nullptr, - nullptr, - nullptr, - miopen::debug::BatchNormDirection_t::ForwardInference, - false); + // str += BnormArgsForMIOpenDriver(&bn_op->input_desc, + // &bn_op->base_desc, + // nullptr, + // nullptr, + // nullptr, + // bn_op->mode, + // nullptr, + // nullptr, + // nullptr, + // nullptr, + // miopen::debug::BatchNormDirection_t::ForwardInference, + // false); } else { diff --git a/src/include/miopen/driver_arguments.hpp b/src/include/miopen/driver_arguments.hpp index a964e7fe27..f28a772e3c 100644 --- a/src/include/miopen/driver_arguments.hpp +++ b/src/include/miopen/driver_arguments.hpp @@ -67,7 +67,10 @@ std::string ConvArgsForMIOpenDriver(const miopen::TensorDescriptor& xDesc, bool print_for_conv_driver = true); std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc, - miopenTensorDescriptor_t sMeanDesc, + miopenTensorDescriptor_t yDesc, + miopenTensorDescriptor_t scaleDesc, + miopenTensorDescriptor_t biasDesc, + miopenTensorDescriptor_t saveMeanDesc, miopenBatchNormMode_t bn_mode, const void* resultRunningMean, const void* resultRunningVariance, diff --git a/src/solver/batchnorm/backward_spatial_single.cpp b/src/solver/batchnorm/backward_spatial_single.cpp index ef569e18a7..3e4bb8bf78 100644 --- a/src/solver/batchnorm/backward_spatial_single.cpp +++ b/src/solver/batchnorm/backward_spatial_single.cpp @@ -59,9 +59,6 @@ bool BnBwdTrainingSpatialSingle::IsApplicable( } #endif - if(bn_problem.IsLayoutNHWC()) - return true; - // case 1 : 
fp16 or bfp16 if(!((::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetDXDesc().GetType()) && diff --git a/src/solver/batchnorm/forward_inference.cpp b/src/solver/batchnorm/forward_inference.cpp index 1fc2139285..8d08ece535 100644 --- a/src/solver/batchnorm/forward_inference.cpp +++ b/src/solver/batchnorm/forward_inference.cpp @@ -51,8 +51,8 @@ bool BnFwdInference::IsApplicable(const ExecutionContext&, return false; // case 1 : mix type - if(!((bn_problem.GetXDesc().GetType() == miopenHalf && - bn_problem.GetYDesc().GetType() == miopenHalf && + if(!((::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && + ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) && bn_problem.GetBnScale().GetType() == miopenFloat && bn_problem.GetBnBias().GetType() == miopenFloat) || // case 2 : float type @@ -74,8 +74,7 @@ ConvSolution BnFwdInference::GetSolution(const ExecutionContext& context, bool bbfpmixparam = false; bool bfp16parm = false; bool bfp32parm = true; - if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetBnScale().GetType() == miopenHalf) + if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) { bfp16parm = true; bfp32parm = false; diff --git a/src/solver/batchnorm/forward_spatial_single.cpp b/src/solver/batchnorm/forward_spatial_single.cpp index 3dc1ae7707..91f5c66a03 100644 --- a/src/solver/batchnorm/forward_spatial_single.cpp +++ b/src/solver/batchnorm/forward_spatial_single.cpp @@ -46,12 +46,9 @@ bool BnFwdTrainingSpatialSingle::IsApplicable( bn_problem.GetMode() != miopenBNSpatial) return false; - if(bn_problem.IsLayoutNHWC()) - return true; - // case 1 : mix type - if(!((bn_problem.GetXDesc().GetType() == miopenHalf && - bn_problem.GetYDesc().GetType() == miopenHalf && + if(!((::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && + ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) && bn_problem.GetBnScale().GetType() == miopenFloat && bn_problem.GetBnBias().GetType() == miopenFloat) || // case 2 : float type diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index d525b79cf6..6fbc428b1a 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -285,13 +285,14 @@ void batchNormSpatialHostFwdTrain(const tensor& input, } template void batchNormSpatialHostBwdTrain(const tensor& x_input, const tensor& dy_input, - tensor& dx_out, + tensor& dx_out, const tensor& bnScale, tensor& dscale, tensor& dbias, diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index a195b6b81e..1021952b54 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -263,22 +263,18 @@ struct BNBwdTest auto&& handle = get_handle(); bn_bwd_test_data.output.data = handle.Read(bn_bwd_test_data.out_dev, bn_bwd_test_data.output.data.size()); - bn_bwd_test_data.dScale.data = handle.Read(bn_bwd_test_data.dScale_dev, - bn_bwd_test_data.dScale.data.size()); - bn_bwd_test_data.dBias.data = - handle.Read(bn_bwd_test_data.dBias_dev, bn_bwd_test_data.dBias.data.size()); + bn_bwd_test_data.dScale.data = handle.Read( + bn_bwd_test_data.dScale_dev, bn_bwd_test_data.dScale.data.size()); + bn_bwd_test_data.dBias.data = handle.Read( + bn_bwd_test_data.dBias_dev, bn_bwd_test_data.dBias.data.size()); - test::ComputeCPUBNBwd(bn_bwd_test_data); + test::ComputeCPUBNBwd(bn_bwd_test_data); - test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 5e-4); - test::CompareTensor(bn_bwd_test_data.dScale, 
bn_bwd_test_data.dScale_ref, 5e-4); - test::CompareTensor(bn_bwd_test_data.dBias, bn_bwd_test_data.dBias_ref, 5e-4); + test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, bwd_tol); + test::CompareTensor( + bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, bwd_tol); + test::CompareTensor( + bn_bwd_test_data.dBias, bn_bwd_test_data.dBias_ref, bwd_tol); } BNTestCase bn_config; @@ -294,6 +290,7 @@ struct BNBwdTest bn_bwd_test_data; miopenTensorLayout_t tensor_layout; BNApiType api_type; + double bwd_tol = 4e-3; }; template */ + +struct GPU_BN_CK_BWD_Large_FP16 : BNBwdTest { }; -struct GPU_BN_BWD_Large_FP16 - : BNBwdTest +struct GPU_BN_OCL_BWD_Large_FP16 + : BNBwdTest { }; -struct GPU_BN_BWD_Small_BFP16 : BNBwdTest +struct GPU_BN_CK_BWD_Large_BFP16 : BNBwdTest { }; -struct GPU_BN_BWD_Large_BFP16 : BNBwdTest +struct GPU_BN_OCL_BWD_Large_BFP16 + : BNBwdTest { }; @@ -61,12 +69,12 @@ struct GPU_BN_BWD_Large_FP64 : BNBwdTest()), + GPU_BN_CK_BWD_Large_FP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), - testing::ValuesIn({testBNAPIV1})), + testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BN_BWD_Large_FP16, + GPU_BN_OCL_BWD_Large_FP16, testing::Combine(testing::ValuesIn(NetworkLarge()), - testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenTensorNCHW}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); // bfp16 INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BN_BWD_Small_BFP16, - testing::Combine(testing::ValuesIn(NetworkSmall()), + GPU_BN_CK_BWD_Large_BFP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), - testing::ValuesIn({testBNAPIV1})), + testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BN_BWD_Large_BFP16, + GPU_BN_OCL_BWD_Large_BFP16, testing::Combine(testing::ValuesIn(NetworkLarge()), - testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenTensorNCHW}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -110,7 +118,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_BWD_Small_FP32, testing::Combine(testing::ValuesIn(NetworkLarge()), - testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenTensorNCHW}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -120,7 +128,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); -// fp64 +// // fp64 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_BWD_Small_FP64, testing::Combine(testing::ValuesIn(NetworkLarge()), diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp index 8cc60e2d55..2f4ec26204 100644 --- a/test/gtest/bn_fwd_train.cpp +++ b/test/gtest/bn_fwd_train.cpp @@ -31,13 +31,14 @@ // ScaleDataType : half // BiasDataType : half // MeanVarDataType : float -struct GPU_BN_FWD_Train_Small_FP16 + +struct GPU_BN_CK_FWD_Train_Large_FP16 : BNFwdTrainTest { }; -struct GPU_BN_FWD_Train_Large_FP16 - : BNFwdTrainTest +struct GPU_BN_OCL_FWD_Train_Large_FP16 + : BNFwdTrainTest { }; @@ -46,11 +47,13 @@ struct GPU_BN_FWD_Train_Large_FP16 // ScaleDataType : bfloat16 // BiasDataType : bfloat16 // MeanVarDataType : float -struct GPU_BN_FWD_Train_Small_BFP16 : BNFwdTrainTest + +struct GPU_BN_CK_FWD_Train_Large_BFP16 + : BNFwdTrainTest { }; -struct GPU_BN_FWD_Train_Large_BFP16 : BNFwdTrainTest 
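
The backward comparisons above now read their tolerance from the fixture (bwd_tol = 4e-3) instead of hard-coding 5e-4 at every CompareTensor call, which the fp16/bfp16 backward outputs need. As a rough illustration of the kind of check such a comparator performs, here is a minimal, self-contained sketch; CompareWithTol and its max-relative-error criterion are illustrative assumptions, not MIOpen's actual CompareTensor implementation:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

// Compare two host buffers against a caller-supplied tolerance. Reduced
// precision (fp16/bf16) accumulates more rounding error than fp32, so the
// threshold is a per-test parameter rather than a fixed constant.
template <typename T>
bool CompareWithTol(const std::vector<T>& out, const std::vector<T>& ref, double tol)
{
    assert(out.size() == ref.size());
    for(std::size_t i = 0; i < out.size(); ++i)
    {
        const double o    = static_cast<double>(out[i]);
        const double r    = static_cast<double>(ref[i]);
        const double err  = std::abs(o - r);
        const double norm = std::max({std::abs(o), std::abs(r), 1.0});
        if(err / norm > tol) // relative error, clamped to absolute near zero
            return false;
    }
    return true;
}
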
+struct GPU_BN_OCL_FWD_Train_Large_BFP16 : BNFwdTrainTest { }; @@ -71,55 +74,56 @@ struct GPU_BN_FWD_Train_Large_FP64 : BNFwdTrainTest()), - testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), - testing::ValuesIn({testBNAPIV1})), - TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BN_FWD_Train_Large_FP16, + GPU_BN_CK_FWD_Train_Large_FP16, testing::Combine(testing::ValuesIn(NetworkLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); -// bfp16 INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BN_FWD_Train_Small_BFP16, - testing::Combine(testing::ValuesIn(NetworkSmall()), - testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), - testing::ValuesIn({testBNAPIV1})), + GPU_BN_OCL_FWD_Train_Large_FP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); +// // bfp16 INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BN_FWD_Train_Large_BFP16, + GPU_BN_CK_FWD_Train_Large_BFP16, testing::Combine(testing::ValuesIn(NetworkLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); -// fp32 INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BN_FWD_Train_Small_FP32, + GPU_BN_OCL_FWD_Train_Large_BFP16, testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), + TestNameGenerator()); + +// // fp32 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_FWD_Train_Small_FP32, + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -130,10 +134,10 @@ INSTANTIATE_TEST_SUITE_P(Smoke, testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); -// fp64 +// // fp64 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_FWD_Train_Small_FP64, - testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp index 4f32a2b3bd..28a2c24bff 100644 --- a/test/gtest/bn_infer.cpp +++ b/test/gtest/bn_infer.cpp @@ -26,33 +26,36 @@ #include "bn.hpp" -// NHWC solver accepts // XDataType : half_float::half // YDataYype : half_float::half // ScaleDataType : half_float::half // BiasDataType : half_float::half // MeanVarDataType : float -struct GPU_BN_Infer_Small_FP16 +struct GPU_BN_CK_Infer_Large_FP16 : BNInferTest { }; -struct GPU_BN_Infer_Large_FP16 - : BNInferTest +struct GPU_BN_OCL_Infer_Large_FP16 + : BNInferTest { }; -// bf16 NHWC solver accepts is only on CK solver // XDataType : bfloat16 // YDataYype : bfloat16 // ScaleDataType : bfloat16 // BiasDataType : bfloat16 // MeanVarDataType : float -struct GPU_BN_Infer_Small_BFP16 : BNInferTest +struct GPU_BN_CK_Infer_Large_BFP16 : BNInferTest { }; -struct GPU_BN_Infer_Large_BFP16 : BNInferTest +// XDataType : bfloat16 +// YDataYype : bfloat16 +// ScaleDataType : float +// BiasDataType : float +// MeanVarDataType : float +struct GPU_BN_OCL_Infer_Large_BFP16 : BNInferTest { }; @@ -73,14 +76,14 @@ struct GPU_BN_Infer_Large_FP64 : BNInferTest()), + GPU_BN_CK_Infer_Large_FP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), - testing::ValuesIn({testBNAPIV1})), 
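
The suites above are split so that CK-capable configurations (both NCHW and NHWC layouts) and the OCL path (NCHW only) get their own instantiations. The pattern is plain GoogleTest value parameterization; a minimal, self-contained sketch, with hypothetical Layout/ApiVer parameters standing in for the real BNTestCase machinery:

#include <gtest/gtest.h>
#include <tuple>

enum class Layout { NCHW, NHWC };
enum class ApiVer { V1, V2 };

// Each test body receives one (layout, api) combination from the product
// built by testing::Combine below.
class BnSuite : public ::testing::TestWithParam<std::tuple<Layout, ApiVer>>
{
};

TEST_P(BnSuite, Runs)
{
    const auto [layout, api] = GetParam();
    (void)layout;
    (void)api;
    // A real fixture would construct and run the BN problem here.
}

// A CK-style suite may combine both layouts; an OCL-style suite would pass
// only Layout::NCHW, mirroring the instantiations in this patch.
INSTANTIATE_TEST_SUITE_P(Smoke,
                         BnSuite,
                         ::testing::Combine(::testing::Values(Layout::NCHW, Layout::NHWC),
                                            ::testing::Values(ApiVer::V1, ApiVer::V2)));
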
+ testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BN_Infer_Large_FP16, + GPU_BN_OCL_Infer_Large_FP16, testing::Combine(testing::ValuesIn(NetworkLarge()), - testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), - testing::ValuesIn({testBNAPIV2})), + testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); - // bfp16 INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BN_Infer_Small_BFP16, + GPU_BN_CK_Infer_Large_BFP16, testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), - testing::ValuesIn({testBNAPIV1})), + testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BN_Infer_Large_BFP16, + GPU_BN_OCL_Infer_Large_BFP16, testing::Combine(testing::ValuesIn(NetworkLarge()), - testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), - testing::ValuesIn({testBNAPIV2})), + testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); // fp32 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_Infer_Small_FP32, - testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), - testing::ValuesIn({testBNAPIV2})), + testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, @@ -135,9 +137,9 @@ INSTANTIATE_TEST_SUITE_P(Smoke, // fp64 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_Infer_Small_FP64, - testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), - testing::ValuesIn({testBNAPIV2})), + testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index 298ac55e3e..2e07e4f02a 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -38,14 +38,7 @@ void ComputeCPUBNInference(DLModule& dl_module) dl_module.estVariance); } -template +template void ComputeCPUBNBwd(DLModule& dl_module) { batchNormSpatialHostBwdTrain(dl_module.input, From 185b7ec83ce7e4dddfabf5c2ef25648de4ee3f60 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Sat, 2 Nov 2024 03:47:32 +0000 Subject: [PATCH 06/13] fix minor issue --- src/driver_arguments.cpp | 35 ++++------------------------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp index b7970d9833..8641a37ce6 100644 --- a/src/driver_arguments.cpp +++ b/src/driver_arguments.cpp @@ -102,37 +102,6 @@ void BnDataType(std::stringstream& ss, ss << "bnorm"; } } - else if(bn_mode == BatchNormDirection_t::ForwardTraining) - { - if(xDesc.GetType() == miopenHalf && yDesc.GetType() == miopenHalf && - scaleDesc.GetType() == miopenFloat && biasDesc.GetType() == miopenFloat && - sMeanDesc.GetType() == miopenFloat) - { - ss << "bnormfp16"; - } - else if(xDesc.GetType() == miopenBFloat16 && yDesc.GetType() == miopenBFloat16 && - scaleDesc.GetType() == miopenFloat && biasDesc.GetType() == miopenFloat && - sMeanDesc.GetType() == miopenFloat) - { - ss << "bnormbfp16"; - } - else if(xDesc.GetType() == miopenHalf && yDesc.GetType() == miopenHalf && - scaleDesc.GetType() == miopenHalf && biasDesc.GetType() == miopenHalf && - sMeanDesc.GetType() == miopenFloat) - { - ss << "bnormfp16fp32"; - } - else if(xDesc.GetType() == miopenBFloat16 && yDesc.GetType() == 
miopenBFloat16 && - scaleDesc.GetType() == miopenBFloat16 && biasDesc.GetType() == miopenBFloat16 && - sMeanDesc.GetType() == miopenFloat) - { - ss << "bnormbfp16fp32"; - } - else - { - ss << "bnorm"; - } - } else if(bn_mode == BatchNormDirection_t::Backward) { if(xDesc.GetType() == miopenHalf && yDesc.GetType() == miopenHalf && @@ -164,6 +133,10 @@ void BnDataType(std::stringstream& ss, ss << "bnorm"; } } + else + { + MIOPEN_THROW("Bad Op direction"); + } } void BnDriverInfo(std::stringstream& ss, From e34c18f62932f5a7e55994d83818309a669f4ed8 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Sat, 2 Nov 2024 05:21:14 +0000 Subject: [PATCH 07/13] fix clang format --- src/ocl/batchnormocl.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 84c587b4f5..78d2dbce66 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -154,8 +154,7 @@ void BatchNormForwardTraining(Handle& handle, const auto solvers = solver::SolverContainer{}; + solver::batchnorm::BnFwdTrainingPerActivation>{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); From e9f7092f77eab84f65bd5c77f0de4523fd3842c3 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Sat, 2 Nov 2024 07:09:06 +0000 Subject: [PATCH 08/13] hip tidy --- src/driver_arguments.cpp | 6 ++++-- src/include/miopen/driver_arguments.hpp | 12 ++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp index 8641a37ce6..b730325add 100644 --- a/src/driver_arguments.cpp +++ b/src/driver_arguments.cpp @@ -282,7 +282,7 @@ std::string BnormArgsForMIOpenDriver(const miopenTensorDescriptor_t xDesc, const miopenTensorDescriptor_t yDesc, const miopenTensorDescriptor_t scaleDesc, const miopenTensorDescriptor_t biasDesc, - const miopenTensorDescriptor_t sMeanDesc, + const miopenTensorDescriptor_t saveMeanDesc, miopenBatchNormMode_t bn_mode, const void* resultRunningMean, const void* resultRunningVariance, @@ -295,13 +295,15 @@ std::string BnormArgsForMIOpenDriver(const miopenTensorDescriptor_t xDesc, miopenGetTensorDescriptorSize(xDesc, &size); std::stringstream ss; if(print_for_bn_driver) + { BnDataType(ss, miopen::deref(xDesc), miopen::deref(yDesc), miopen::deref(scaleDesc), miopen::deref(biasDesc), - miopen::deref(sMeanDesc), + miopen::deref(saveMeanDesc), dir); + } ss << " -n " << miopen::deref(xDesc).GetLengths()[0] // clang-format off << " -c " << miopen::deref(xDesc).GetLengths()[1]; diff --git a/src/include/miopen/driver_arguments.hpp b/src/include/miopen/driver_arguments.hpp index f28a772e3c..aa59d86036 100644 --- a/src/include/miopen/driver_arguments.hpp +++ b/src/include/miopen/driver_arguments.hpp @@ -66,12 +66,12 @@ std::string ConvArgsForMIOpenDriver(const miopen::TensorDescriptor& xDesc, std::optional immediate_mode_solver_id, bool print_for_conv_driver = true); -std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc, - miopenTensorDescriptor_t yDesc, - miopenTensorDescriptor_t scaleDesc, - miopenTensorDescriptor_t biasDesc, - miopenTensorDescriptor_t saveMeanDesc, - miopenBatchNormMode_t bn_mode, +std::string BnormArgsForMIOpenDriver(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t yDesc, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t biasDesc, + const miopenTensorDescriptor_t saveMeanDesc, + const miopenBatchNormMode_t bn_mode, const void* resultRunningMean, const void* resultRunningVariance, const void* resultSaveMean, From 
45741cccb913b3768523f0076a2f6bbf42d84103 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Sat, 2 Nov 2024 08:51:17 +0000 Subject: [PATCH 09/13] fix gtest sample and fix hiptidy --- src/include/miopen/driver_arguments.hpp | 12 ++++++------ test/gtest/bn_bwd.cpp | 12 ++++++------ test/gtest/bn_fwd_train.cpp | 8 ++++---- test/gtest/bn_infer.cpp | 24 ++++++++++++------------ test/gtest/bn_test_data.hpp | 4 +--- 5 files changed, 29 insertions(+), 31 deletions(-) diff --git a/src/include/miopen/driver_arguments.hpp b/src/include/miopen/driver_arguments.hpp index aa59d86036..f28a772e3c 100644 --- a/src/include/miopen/driver_arguments.hpp +++ b/src/include/miopen/driver_arguments.hpp @@ -66,12 +66,12 @@ std::string ConvArgsForMIOpenDriver(const miopen::TensorDescriptor& xDesc, std::optional immediate_mode_solver_id, bool print_for_conv_driver = true); -std::string BnormArgsForMIOpenDriver(const miopenTensorDescriptor_t xDesc, - const miopenTensorDescriptor_t yDesc, - const miopenTensorDescriptor_t scaleDesc, - const miopenTensorDescriptor_t biasDesc, - const miopenTensorDescriptor_t saveMeanDesc, - const miopenBatchNormMode_t bn_mode, +std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc, + miopenTensorDescriptor_t yDesc, + miopenTensorDescriptor_t scaleDesc, + miopenTensorDescriptor_t biasDesc, + miopenTensorDescriptor_t saveMeanDesc, + miopenBatchNormMode_t bn_mode, const void* resultRunningMean, const void* resultRunningVariance, const void* resultSaveMean, diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp index 94add304a5..ba4c136469 100644 --- a/test/gtest/bn_bwd.cpp +++ b/test/gtest/bn_bwd.cpp @@ -87,7 +87,7 @@ TEST_P(GPU_BN_BWD_Large_FP64, BnV2LargeBWDCKfp64) {} // fp16 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_CK_BWD_Large_FP16, - testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -102,7 +102,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, // bfp16 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_CK_BWD_Large_BFP16, - testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -117,28 +117,28 @@ INSTANTIATE_TEST_SUITE_P(Smoke, // fp32 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_BWD_Small_FP32, - testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_BWD_Large_FP32, - testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); // // fp64 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_BWD_Small_FP64, - testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_BWD_Large_FP64, - testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); diff --git a/test/gtest/bn_fwd_train.cpp 
b/test/gtest/bn_fwd_train.cpp index 2f4ec26204..d65b184c1c 100644 --- a/test/gtest/bn_fwd_train.cpp +++ b/test/gtest/bn_fwd_train.cpp @@ -93,7 +93,7 @@ TEST_P(GPU_BN_FWD_Train_Large_FP64, BnV2LargeFWD_TrainCKfp64) {} INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_CK_FWD_Train_Large_FP16, - testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -108,7 +108,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, // // bfp16 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_CK_FWD_Train_Large_BFP16, - testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -130,7 +130,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_FWD_Train_Large_FP32, - testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -144,7 +144,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_FWD_Train_Large_FP64, - testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp index 28a2c24bff..05a154f10b 100644 --- a/test/gtest/bn_infer.cpp +++ b/test/gtest/bn_infer.cpp @@ -83,18 +83,18 @@ TEST_P(GPU_BN_OCL_Infer_Large_FP16, BnV2LargeInferOCLfp16) {} TEST_P(GPU_BN_CK_Infer_Large_BFP16, BnV2LargeInferCKbfp16) {} TEST_P(GPU_BN_OCL_Infer_Large_BFP16, BnV2LargeInferOCLbfp16) {} -// // fp32 (float) -TEST_P(GPU_BN_Infer_Small_FP32, BnV1SmallInferCKfp32) {} -TEST_P(GPU_BN_Infer_Large_FP32, BnV2LargeInferCKfp32) {} +// // // fp32 (float) +TEST_P(GPU_BN_Infer_Small_FP32, BnV1SmallInferfp32) {} +TEST_P(GPU_BN_Infer_Large_FP32, BnV2LargeInferfp32) {} -// fp64 -TEST_P(GPU_BN_Infer_Small_FP64, BnV1SmallInferCKfp64) {} -TEST_P(GPU_BN_Infer_Large_FP64, BnV2LargeInferCKfp64) {} +// // // fp64 +TEST_P(GPU_BN_Infer_Small_FP64, BnV1SmallInferfp64) {} +TEST_P(GPU_BN_Infer_Large_FP64, BnV2LargeInferfp64) {} -// fp16 +// // fp16 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_CK_Infer_Large_FP16, - testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -105,7 +105,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, testing::ValuesIn({miopenTensorNCHW}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); -// bfp16 +// // // bfp16 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_CK_Infer_Large_BFP16, testing::Combine(testing::ValuesIn(NetworkSmall()), @@ -130,11 +130,11 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_Infer_Large_FP32, - testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); -// fp64 +// // fp64 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_Infer_Small_FP64, testing::Combine(testing::ValuesIn(NetworkSmall()), @@ -144,7 +144,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, 
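
The hip-tidy churn above toggles top-level const on the by-value descriptor parameters between the header and the source file. That is safe because top-level const on a value parameter is not part of the function type, so the declaration and the definition may disagree on it; tidy tools (likely readability-avoid-const-params-in-decls, an assumption on my part) only want it off declarations. A minimal sketch of the rule:

// Both lines declare the same function: top-level const on a by-value
// parameter affects only the definition's body, not the signature.
void LogShape(int n);
void LogShape(const int n) // const here is purely local
{
    // n = 0; // would not compile: n is immutable inside this body
    (void)n;
}

int main() { LogShape(3); }
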
INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_Infer_Large_FP64, - testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index 9afa8ea4ed..d3b1c6b073 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -67,8 +67,6 @@ inline std::vector NetworkLarge() // pyt_mlperf_resnet50v1.5 return { {192, 1, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, - {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, - {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, @@ -106,7 +104,7 @@ inline std::vector NetworkSmall() {192, 2, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, {16, 8, 132, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, - {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 2048, 17, 17, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, }; } From a648b717ac2cf6e93fe656209d74172a514bf2c3 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Sat, 2 Nov 2024 09:19:22 +0000 Subject: [PATCH 10/13] * move type check logic to problem description of batch norm * add type checks to other ocl solvers * fix other minor issues --- src/batchnorm/problem_description.cpp | 108 ++++++++++++++++++ src/fusion.cpp | 24 ++-- .../miopen/batchnorm/problem_description.hpp | 20 ++-- src/solver/batchnorm/backward_ck.cpp | 15 +-- .../batchnorm/backward_per_activation.cpp | 8 +- .../batchnorm/backward_spatial_multiple.cpp | 2 + .../batchnorm/backward_spatial_single.cpp | 16 +-- src/solver/batchnorm/forward_inference.cpp | 12 +- src/solver/batchnorm/forward_inference_ck.cpp | 16 +-- .../batchnorm/forward_inference_fused.cpp | 1 - .../batchnorm/forward_per_activation.cpp | 8 +- .../batchnorm/forward_spatial_multiple.cpp | 3 + .../batchnorm/forward_spatial_single.cpp | 11 +- src/solver/batchnorm/forward_training_ck.cpp | 15 +-- 14 files changed, 154 insertions(+), 105 deletions(-) diff --git a/src/batchnorm/problem_description.cpp b/src/batchnorm/problem_description.cpp index 7eb2592c46..ec082ae7b1 100644 --- a/src/batchnorm/problem_description.cpp +++ b/src/batchnorm/problem_description.cpp @@ -36,6 +36,114 @@ namespace miopen { namespace batchnorm { +bool is_fp16_or_bfp16(miopenDataType_t type) +{ + return ((type == miopenHalf) || (type == miopenBFloat16)); +} + +bool is_fp32_or_fp64(miopenDataType_t type) +{ + return ((type == miopenFloat) || (type == miopenDouble)); +} + +bool IsOCLInferTypeValid(const ProblemDescription& bn_problem) +{ + // case 1 : mix type + return ((is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && + is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) && + bn_problem.GetBnScale().GetType() == miopenFloat && + bn_problem.GetBnBias().GetType() == miopenFloat) || + // case 2 : float type + (bn_problem.GetXDesc().GetType() == miopenFloat && + bn_problem.GetYDesc().GetType() == miopenFloat && + bn_problem.GetBnScale().GetType() == 
miopenFloat && + bn_problem.GetBnBias().GetType() == miopenFloat)); +} + +bool IsCKInferTypeValid(const ProblemDescription& bn_problem) +{ + // case 1 : mix type + return ((is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && + is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) && + is_fp16_or_bfp16(bn_problem.GetBnScale().GetType()) && + is_fp16_or_bfp16(bn_problem.GetBnBias().GetType()) && + bn_problem.GetBnSMean().GetType() == miopenFloat && + bn_problem.GetBnSVar().GetType() == miopenFloat) || + // case 2 : fp32 or fp64 + (is_fp32_or_fp64(bn_problem.GetXDesc().GetType()) && + is_fp32_or_fp64(bn_problem.GetYDesc().GetType()) && + is_fp32_or_fp64(bn_problem.GetBnScale().GetType()) && + is_fp32_or_fp64(bn_problem.GetBnBias().GetType()) && + is_fp32_or_fp64(bn_problem.GetBnSMean().GetType()) && + is_fp32_or_fp64(bn_problem.GetBnSVar().GetType()))); +} + +bool IsOCLFwdTrainTypeValid(const ProblemDescription& bn_problem) +{ + // case 1 : mix type + return ((is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && + is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) && + bn_problem.GetBnScale().GetType() == miopenFloat && + bn_problem.GetBnBias().GetType() == miopenFloat) || + // case 2 : float type + (bn_problem.GetXDesc().GetType() == miopenFloat && + bn_problem.GetYDesc().GetType() == miopenFloat && + bn_problem.GetBnScale().GetType() == miopenFloat && + bn_problem.GetBnBias().GetType() == miopenFloat)); +} + +bool IsCKFwdTrainTypeValid(const ProblemDescription& bn_problem) +{ + // case 1 : mix type + return ((is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && + is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) && + is_fp16_or_bfp16(bn_problem.GetBnScale().GetType()) && + is_fp16_or_bfp16(bn_problem.GetBnBias().GetType()) && + bn_problem.GetBnSMean().GetType() == miopenFloat && + bn_problem.GetBnSVar().GetType() == miopenFloat) || + // case 2 : fp32 or fp64 + (is_fp32_or_fp64(bn_problem.GetXDesc().GetType()) && + is_fp32_or_fp64(bn_problem.GetYDesc().GetType()) && + is_fp32_or_fp64(bn_problem.GetBnScale().GetType()) && + is_fp32_or_fp64(bn_problem.GetBnBias().GetType()) && + is_fp32_or_fp64(bn_problem.GetBnSMean().GetType()) && + is_fp32_or_fp64(bn_problem.GetBnSVar().GetType()))); +} + +bool IsOCLBwdTypeValid(const ProblemDescription& bn_problem) +{ + return ((is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && + is_fp16_or_bfp16(bn_problem.GetDXDesc().GetType()) && + is_fp16_or_bfp16(bn_problem.GetDYDesc().GetType()) && + bn_problem.GetBnScale().GetType() == miopenFloat && + bn_problem.GetBnSMean().GetType() == miopenFloat && + bn_problem.GetBnSVar().GetType() == miopenFloat) || + // case 1 : fp32 or fp64 + (is_fp32_or_fp64(bn_problem.GetXDesc().GetType()) && + is_fp32_or_fp64(bn_problem.GetYDesc().GetType()) && + is_fp32_or_fp64(bn_problem.GetBnScale().GetType()) && + is_fp32_or_fp64(bn_problem.GetBnBias().GetType()) && + is_fp32_or_fp64(bn_problem.GetBnSMean().GetType()) && + is_fp32_or_fp64(bn_problem.GetBnSVar().GetType()))); +} + +bool IsCKBwdTypeValid(const ProblemDescription& bn_problem) +{ + return ((is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && + bn_problem.GetDXDesc().GetType() == miopenFloat && + is_fp16_or_bfp16(bn_problem.GetBnScale().GetType()) && + bn_problem.GetDYDesc().GetType() == miopenFloat && + bn_problem.GetBnSMean().GetType() == miopenFloat && + bn_problem.GetBnSVar().GetType() == miopenFloat) || + // case 1 : fp32 or fp64 + (is_fp32_or_fp64(bn_problem.GetXDesc().GetType()) && + is_fp32_or_fp64(bn_problem.GetYDesc().GetType()) && + 
is_fp32_or_fp64(bn_problem.GetBnScale().GetType()) && + is_fp32_or_fp64(bn_problem.GetBnBias().GetType()) && + is_fp32_or_fp64(bn_problem.GetBnSMean().GetType()) && + is_fp32_or_fp64(bn_problem.GetBnSVar().GetType()))); +} + NetworkConfig ProblemDescription::MakeNetworkConfig() const { switch(direction) diff --git a/src/fusion.cpp b/src/fusion.cpp index 3c52f6c2a0..5541409a67 100644 --- a/src/fusion.cpp +++ b/src/fusion.cpp @@ -389,18 +389,18 @@ std::string LogCmdBnormFusion(const miopenFusionPlanDescriptor_t fusePlanDesc, i if(bn_op != nullptr) { - // str += BnormArgsForMIOpenDriver(&bn_op->input_desc, - // &bn_op->base_desc, - // nullptr, - // nullptr, - // nullptr, - // bn_op->mode, - // nullptr, - // nullptr, - // nullptr, - // nullptr, - // miopen::debug::BatchNormDirection_t::ForwardInference, - // false); + str += BnormArgsForMIOpenDriver(&bn_op->input_desc, + nullptr, + nullptr, + nullptr, + nullptr, + bn_op->mode, + nullptr, + nullptr, + nullptr, + nullptr, + miopen::debug::BatchNormDirection_t::ForwardInference, + false); // having false allows safe handling of nullptrs } else { diff --git a/src/include/miopen/batchnorm/problem_description.hpp b/src/include/miopen/batchnorm/problem_description.hpp index a1ed9d2594..72d0365426 100644 --- a/src/include/miopen/batchnorm/problem_description.hpp +++ b/src/include/miopen/batchnorm/problem_description.hpp @@ -52,15 +52,8 @@ struct ProblemDescriptionTag { }; -inline bool is_fp16_or_bfp16(miopenDataType_t type) -{ - return ((type == miopenHalf) || (type == miopenBFloat16)); -} - -inline bool is_fp32_or_fp64(miopenDataType_t type) -{ - return ((type == miopenFloat) || (type == miopenDouble)); -} +bool is_fp16_or_bfp16(miopenDataType_t type); +bool is_fp32_or_fp64(miopenDataType_t type); struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, ProblemDescriptionTag @@ -363,6 +356,15 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, } }; +bool IsOCLInferTypeValid(const ProblemDescription& bn_problem); +bool IsCKInferTypeValid(const ProblemDescription& bn_problem); + +bool IsOCLFwdTrainTypeValid(const ProblemDescription& bn_problem); +bool IsCKFwdTrainTypeValid(const ProblemDescription& bn_problem); + +bool IsOCLBwdTypeValid(const ProblemDescription& bn_problem); +bool IsCKBwdTypeValid(const ProblemDescription& bn_problem); + } // namespace batchnorm } // namespace miopen diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp index fd2613f807..451d73d28a 100644 --- a/src/solver/batchnorm/backward_ck.cpp +++ b/src/solver/batchnorm/backward_ck.cpp @@ -371,20 +371,7 @@ bool BnCKBwdBackward::IsApplicable( return false; if(!bn_problem.Is2D()) return false; - // case 1 : fp16 or bfp16 - if(!((::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && - bn_problem.GetDXDesc().GetType() == miopenFloat && - ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetBnScale().GetType()) && - bn_problem.GetDYDesc().GetType() == miopenFloat && - bn_problem.GetBnSMean().GetType() == miopenFloat && - bn_problem.GetBnSVar().GetType() == miopenFloat) || - // case 1 : fp32 or fp64 - (::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetXDesc().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetYDesc().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnScale().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnBias().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnSMean().GetType()) && - 
::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnSVar().GetType())))) + if(!IsCKBwdTypeValid(bn_problem)) return false; switch(bn_problem.GetXDesc().GetType()) diff --git a/src/solver/batchnorm/backward_per_activation.cpp b/src/solver/batchnorm/backward_per_activation.cpp index adf72045a3..24e785f8de 100644 --- a/src/solver/batchnorm/backward_per_activation.cpp +++ b/src/solver/batchnorm/backward_per_activation.cpp @@ -43,8 +43,12 @@ bool BnBwdTrainingPerActivation::IsApplicable( { if(!problem.Is2D()) return false; - return problem.GetDirection() == miopen::batchnorm::Direction::Backward && - problem.GetMode() == miopenBNPerActivation; + if(problem.GetDirection() != miopen::batchnorm::Direction::Backward && + problem.GetMode() != miopenBNPerActivation) + return false; + if(!::miopen::batchnorm::IsOCLBwdTypeValid(problem)) + return false; + return true; } ConvSolution diff --git a/src/solver/batchnorm/backward_spatial_multiple.cpp b/src/solver/batchnorm/backward_spatial_multiple.cpp index 846f8c41c8..e26922f478 100644 --- a/src/solver/batchnorm/backward_spatial_multiple.cpp +++ b/src/solver/batchnorm/backward_spatial_multiple.cpp @@ -48,6 +48,8 @@ bool BnBwdTrainingSpatialMultiple::IsApplicable( { return false; } + if(!IsOCLBwdTypeValid(problem)) + return false; #if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) diff --git a/src/solver/batchnorm/backward_spatial_single.cpp b/src/solver/batchnorm/backward_spatial_single.cpp index 3e4bb8bf78..9b375f517c 100644 --- a/src/solver/batchnorm/backward_spatial_single.cpp +++ b/src/solver/batchnorm/backward_spatial_single.cpp @@ -58,21 +58,7 @@ bool BnBwdTrainingSpatialSingle::IsApplicable( return false; } #endif - - // case 1 : fp16 or bfp16 - if(!((::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && - ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetDXDesc().GetType()) && - ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetDYDesc().GetType()) && - bn_problem.GetBnScale().GetType() == miopenFloat && - bn_problem.GetBnSMean().GetType() == miopenFloat && - bn_problem.GetBnSVar().GetType() == miopenFloat) || - // case 1 : fp32 or fp64 - (::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetXDesc().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetYDesc().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnScale().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnBias().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnSMean().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnSVar().GetType())))) + if(!IsOCLBwdTypeValid(bn_problem)) return false; int n, c, h, w; diff --git a/src/solver/batchnorm/forward_inference.cpp b/src/solver/batchnorm/forward_inference.cpp index 8d08ece535..d22bb01797 100644 --- a/src/solver/batchnorm/forward_inference.cpp +++ b/src/solver/batchnorm/forward_inference.cpp @@ -49,17 +49,7 @@ bool BnFwdInference::IsApplicable(const ExecutionContext&, return false; if(!bn_problem.Is2D()) return false; - - // case 1 : mix type - if(!((::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && - ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) && - bn_problem.GetBnScale().GetType() == miopenFloat && - bn_problem.GetBnBias().GetType() == miopenFloat) || - // case 2 : float type - (bn_problem.GetXDesc().GetType() == miopenFloat && - bn_problem.GetYDesc().GetType() == miopenFloat && - 
bn_problem.GetBnScale().GetType() == miopenFloat && - bn_problem.GetBnBias().GetType() == miopenFloat))) + if(!IsOCLInferTypeValid(bn_problem)) return false; return true; diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index c4fc3b00ef..b89e2f3df1 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -334,21 +334,7 @@ bool BnCKFwdInference::IsApplicable( return false; if(bn_problem.GetMode() != miopenBNSpatial) return false; - - // case 1 : fp16 or bfp16 - if(!((::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && - ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) && - ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetBnScale().GetType()) && - ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetBnBias().GetType()) && - bn_problem.GetBnSMean().GetType() == miopenFloat && - bn_problem.GetBnSVar().GetType() == miopenFloat) || - // case 1 : fp32 or fp64 - (::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetXDesc().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetYDesc().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnScale().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnBias().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnSMean().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnSVar().GetType())))) + if(!IsCKInferTypeValid(bn_problem)) return false; switch(bn_problem.GetXDesc().GetType()) diff --git a/src/solver/batchnorm/forward_inference_fused.cpp b/src/solver/batchnorm/forward_inference_fused.cpp index b527b1bd61..19b9e01934 100644 --- a/src/solver/batchnorm/forward_inference_fused.cpp +++ b/src/solver/batchnorm/forward_inference_fused.cpp @@ -55,7 +55,6 @@ bool BnFwdInferActivationFused::IsApplicable(const FusionContext& /*context*/, return false; if(desc.op_map.at(1)->kind() != miopenFusionOpActivForward) return false; - return true; } diff --git a/src/solver/batchnorm/forward_per_activation.cpp b/src/solver/batchnorm/forward_per_activation.cpp index 54c03a1835..de83a8ecc1 100644 --- a/src/solver/batchnorm/forward_per_activation.cpp +++ b/src/solver/batchnorm/forward_per_activation.cpp @@ -41,8 +41,12 @@ namespace batchnorm { bool BnFwdTrainingPerActivation::IsApplicable( const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const { - return problem.GetDirection() == miopen::batchnorm::Direction::ForwardTraining || - problem.GetMode() == miopenBNPerActivation; + if(problem.GetDirection() != miopen::batchnorm::Direction::ForwardTraining || + problem.GetMode() != miopenBNPerActivation) + return false; + if(!IsOCLFwdTrainTypeValid(problem)) + return false; + return true; } ConvSolution diff --git a/src/solver/batchnorm/forward_spatial_multiple.cpp b/src/solver/batchnorm/forward_spatial_multiple.cpp index dba21d3550..6a2c42743b 100644 --- a/src/solver/batchnorm/forward_spatial_multiple.cpp +++ b/src/solver/batchnorm/forward_spatial_multiple.cpp @@ -47,6 +47,9 @@ bool BnFwdTrainingSpatialMultiple::IsApplicable( problem.GetMode() != miopenBNSpatial) return false; + if(!IsOCLFwdTrainTypeValid(problem)) + return false; + return !BnFwdTrainingSpatialSingle{}.IsApplicable(context, problem); } diff --git a/src/solver/batchnorm/forward_spatial_single.cpp b/src/solver/batchnorm/forward_spatial_single.cpp index 91f5c66a03..ccfebce987 100644 --- a/src/solver/batchnorm/forward_spatial_single.cpp +++ 
b/src/solver/batchnorm/forward_spatial_single.cpp @@ -46,16 +46,7 @@ bool BnFwdTrainingSpatialSingle::IsApplicable( bn_problem.GetMode() != miopenBNSpatial) return false; - // case 1 : mix type - if(!((::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && - ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) && - bn_problem.GetBnScale().GetType() == miopenFloat && - bn_problem.GetBnBias().GetType() == miopenFloat) || - // case 2 : float type - (bn_problem.GetXDesc().GetType() == miopenFloat && - bn_problem.GetYDesc().GetType() == miopenFloat && - bn_problem.GetBnScale().GetType() == miopenFloat && - bn_problem.GetBnBias().GetType() == miopenFloat))) + if(!IsOCLFwdTrainTypeValid(bn_problem)) return false; int n, c, h, w; diff --git a/src/solver/batchnorm/forward_training_ck.cpp b/src/solver/batchnorm/forward_training_ck.cpp index 62c6a7932f..27640eb4d9 100644 --- a/src/solver/batchnorm/forward_training_ck.cpp +++ b/src/solver/batchnorm/forward_training_ck.cpp @@ -361,20 +361,7 @@ bool BnCKFwdTraining::IsApplicable( return false; if(bn_problem.GetMode() != miopenBNSpatial) return false; - // case 1 : fp16 or bfp16 - if(!((::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) && - ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) && - ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetBnScale().GetType()) && - ::miopen::batchnorm::is_fp16_or_bfp16(bn_problem.GetBnBias().GetType()) && - bn_problem.GetBnSMean().GetType() == miopenFloat && - bn_problem.GetBnSVar().GetType() == miopenFloat) || - // case 1 : fp32 or fp64 - (::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetXDesc().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetYDesc().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnScale().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnBias().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnSMean().GetType()) && - ::miopen::batchnorm::is_fp32_or_fp64(bn_problem.GetBnSVar().GetType())))) + if(!IsCKFwdTrainTypeValid(bn_problem)) return false; switch(bn_problem.GetXDesc().GetType()) From bed3d9aa4dc06481026f06046ef7a8556c33a498 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Sat, 2 Nov 2024 10:01:21 +0000 Subject: [PATCH 11/13] fix network cache --- src/batchnorm/problem_description.cpp | 6 ++++++ .../miopen/batchnorm/problem_description.hpp | 2 ++ test/gtest/bn_bwd.cpp | 12 ++++++------ test/gtest/bn_fwd_train.cpp | 10 +++++----- test/gtest/bn_infer.cpp | 18 +++++++++--------- 5 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/batchnorm/problem_description.cpp b/src/batchnorm/problem_description.cpp index ec082ae7b1..8a10bd6f27 100644 --- a/src/batchnorm/problem_description.cpp +++ b/src/batchnorm/problem_description.cpp @@ -295,6 +295,8 @@ NetworkConfig ProblemDescription::MakeForwardTrainingNetworkConfig() const ss << "hw" << in_cstride; } ss << "layout" << in_layout; + ss << "scaleType" << static_cast(IsScaleFp16()); + ss << "scaleType" << static_cast(IsScaleFp32()); return NetworkConfig{ss.str()}; } @@ -317,6 +319,8 @@ NetworkConfig ProblemDescription::MakeForwardInferenceNetworkConfig() const ss << "HWdims" << in_cstride; ss << "C" << c; ss << "layout" << in_layout; + ss << "scaleType" << static_cast(IsScaleFp16()); + ss << "scaleType" << static_cast(IsScaleFp32()); return NetworkConfig{ss.str()}; } @@ -447,6 +451,8 @@ NetworkConfig ProblemDescription::MakeBackwardNetworkConfig() const ss << "nhw" << in_nhw; } ss << 
"layout" << in_layout; + ss << "scaleType" << static_cast(IsScaleFp16()); + ss << "scaleType" << static_cast(IsScaleFp32()); return NetworkConfig{ss.str()}; } diff --git a/src/include/miopen/batchnorm/problem_description.hpp b/src/include/miopen/batchnorm/problem_description.hpp index 72d0365426..4fed9033a6 100644 --- a/src/include/miopen/batchnorm/problem_description.hpp +++ b/src/include/miopen/batchnorm/problem_description.hpp @@ -238,6 +238,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, bool IsFp16() const { return xDesc.GetType() == miopenHalf; } bool IsBFp16() const { return xDesc.GetType() == miopenBFloat16; } bool IsMix() const { return (IsFp16() || IsBFp16()) && sMeanDesc.GetType() == miopenFloat; } + bool IsScaleFp16() const { return scaleDesc.GetType() == miopenHalf; } + bool IsScaleFp32() const { return scaleDesc.GetType() == miopenFloat; } void Serialize(std::ostream& stream) const { stream << MakeNetworkConfig().ToString(); } diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp index ba4c136469..9b93df6c81 100644 --- a/test/gtest/bn_bwd.cpp +++ b/test/gtest/bn_bwd.cpp @@ -87,7 +87,7 @@ TEST_P(GPU_BN_BWD_Large_FP64, BnV2LargeBWDCKfp64) {} // fp16 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_CK_BWD_Large_FP16, - testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Combine(testing::ValuesIn(NetworkLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -102,9 +102,9 @@ INSTANTIATE_TEST_SUITE_P(Smoke, // bfp16 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_CK_BWD_Large_BFP16, - testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Combine(testing::ValuesIn(NetworkLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), - testing::ValuesIn({testBNAPIV2})), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, @@ -119,12 +119,12 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_BWD_Small_FP32, testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW}), - testing::ValuesIn({testBNAPIV2})), + testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_BWD_Large_FP32, - testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Combine(testing::ValuesIn(NetworkLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -133,7 +133,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_BWD_Small_FP64, testing::Combine(testing::ValuesIn(NetworkSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), - testing::ValuesIn({testBNAPIV2})), + testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp index d65b184c1c..1953accee7 100644 --- a/test/gtest/bn_fwd_train.cpp +++ b/test/gtest/bn_fwd_train.cpp @@ -93,7 +93,7 @@ TEST_P(GPU_BN_FWD_Train_Large_FP64, BnV2LargeFWD_TrainCKfp64) {} INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_CK_FWD_Train_Large_FP16, - testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Combine(testing::ValuesIn(NetworkLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -108,9 +108,9 @@ INSTANTIATE_TEST_SUITE_P(Smoke, // // bfp16 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_CK_FWD_Train_Large_BFP16, - testing::Combine(testing::ValuesIn(NetworkSmall()), + 
testing::Combine(testing::ValuesIn(NetworkLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), - testing::ValuesIn({testBNAPIV2})), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, @@ -130,7 +130,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_FWD_Train_Large_FP32, - testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Combine(testing::ValuesIn(NetworkLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -144,7 +144,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_FWD_Train_Large_FP64, - testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Combine(testing::ValuesIn(NetworkLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp index 05a154f10b..e717e149b0 100644 --- a/test/gtest/bn_infer.cpp +++ b/test/gtest/bn_infer.cpp @@ -83,18 +83,18 @@ TEST_P(GPU_BN_OCL_Infer_Large_FP16, BnV2LargeInferOCLfp16) {} TEST_P(GPU_BN_CK_Infer_Large_BFP16, BnV2LargeInferCKbfp16) {} TEST_P(GPU_BN_OCL_Infer_Large_BFP16, BnV2LargeInferOCLbfp16) {} -// // // fp32 (float) +// fp32 (float) TEST_P(GPU_BN_Infer_Small_FP32, BnV1SmallInferfp32) {} TEST_P(GPU_BN_Infer_Large_FP32, BnV2LargeInferfp32) {} -// // // fp64 +// fp64 TEST_P(GPU_BN_Infer_Small_FP64, BnV1SmallInferfp64) {} TEST_P(GPU_BN_Infer_Large_FP64, BnV2LargeInferfp64) {} -// // fp16 +// fp16 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_CK_Infer_Large_FP16, - testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Combine(testing::ValuesIn(NetworkLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -105,10 +105,10 @@ INSTANTIATE_TEST_SUITE_P(Smoke, testing::ValuesIn({miopenTensorNCHW}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); -// // // bfp16 +// bfp16 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_CK_Infer_Large_BFP16, - testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Combine(testing::ValuesIn(NetworkLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -130,11 +130,11 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_Infer_Large_FP32, - testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Combine(testing::ValuesIn(NetworkLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); -// // fp64 +// fp64 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_Infer_Small_FP64, testing::Combine(testing::ValuesIn(NetworkSmall()), @@ -144,7 +144,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BN_Infer_Large_FP64, - testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Combine(testing::ValuesIn(NetworkLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); From 743cb9e879e51f114605917781211b0c16ba9b82 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Sat, 2 Nov 2024 13:12:22 +0000 Subject: [PATCH 12/13] fix minor testing issue --- src/batchnorm/problem_description.cpp | 67 +++++++++++++-------------- test/gtest/bn_bwd.cpp | 8 ++-- test/gtest/bn_fwd_train.cpp | 14 +++--- test/gtest/bn_infer.cpp | 4 +- 4 files changed, 44 insertions(+), 49 deletions(-) diff --git 
From 743cb9e879e51f114605917781211b0c16ba9b82 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire
Date: Sat, 2 Nov 2024 13:12:22 +0000
Subject: [PATCH 12/13] fix minor testing issue

---
 src/batchnorm/problem_description.cpp | 67 +++++++++++++--------------
 test/gtest/bn_bwd.cpp                 |  8 ++--
 test/gtest/bn_fwd_train.cpp           | 14 +++---
 test/gtest/bn_infer.cpp               |  4 +-
 4 files changed, 44 insertions(+), 49 deletions(-)

diff --git a/src/batchnorm/problem_description.cpp b/src/batchnorm/problem_description.cpp
index 8a10bd6f27..daebf49208 100644
--- a/src/batchnorm/problem_description.cpp
+++ b/src/batchnorm/problem_description.cpp
@@ -46,18 +46,18 @@ bool is_fp32_or_fp64(miopenDataType_t type)
     return ((type == miopenFloat) || (type == miopenDouble));
 }

+bool is_fp32(miopenDataType_t type) { return (type == miopenFloat); }
+
 bool IsOCLInferTypeValid(const ProblemDescription& bn_problem)
 {
     // case 1 : mix type
-    return ((is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) &&
-             is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) &&
-             bn_problem.GetBnScale().GetType() == miopenFloat &&
-             bn_problem.GetBnBias().GetType() == miopenFloat) ||
-            // case 2 : float type
-            (bn_problem.GetXDesc().GetType() == miopenFloat &&
-             bn_problem.GetYDesc().GetType() == miopenFloat &&
-             bn_problem.GetBnScale().GetType() == miopenFloat &&
-             bn_problem.GetBnBias().GetType() == miopenFloat));
+    return (
+        (is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) &&
+         is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) &&
+         is_fp32(bn_problem.GetBnScale().GetType()) && is_fp32(bn_problem.GetBnBias().GetType())) ||
+        // case 2 : float type
+        (is_fp32(bn_problem.GetXDesc().GetType()) && is_fp32(bn_problem.GetYDesc().GetType()) &&
+         is_fp32(bn_problem.GetBnScale().GetType()) && is_fp32(bn_problem.GetBnBias().GetType())));
 }

 bool IsCKInferTypeValid(const ProblemDescription& bn_problem)
@@ -67,8 +67,8 @@ bool IsCKInferTypeValid(const ProblemDescription& bn_problem)
             is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) &&
             is_fp16_or_bfp16(bn_problem.GetBnScale().GetType()) &&
             is_fp16_or_bfp16(bn_problem.GetBnBias().GetType()) &&
-            bn_problem.GetBnSMean().GetType() == miopenFloat &&
-            bn_problem.GetBnSVar().GetType() == miopenFloat) ||
+            is_fp32(bn_problem.GetBnSMean().GetType()) &&
+            is_fp32(bn_problem.GetBnSVar().GetType())) ||
            // case 2 : fp32 or fp64
            (is_fp32_or_fp64(bn_problem.GetXDesc().GetType()) &&
             is_fp32_or_fp64(bn_problem.GetYDesc().GetType()) &&
@@ -81,15 +81,13 @@ bool IsOCLFwdTrainTypeValid(const ProblemDescription& bn_problem)
 {
     // case 1 : mix type
-    return ((is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) &&
-             is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) &&
-             bn_problem.GetBnScale().GetType() == miopenFloat &&
-             bn_problem.GetBnBias().GetType() == miopenFloat) ||
-            // case 2 : float type
-            (bn_problem.GetXDesc().GetType() == miopenFloat &&
-             bn_problem.GetYDesc().GetType() == miopenFloat &&
-             bn_problem.GetBnScale().GetType() == miopenFloat &&
-             bn_problem.GetBnBias().GetType() == miopenFloat));
+    return (
+        (is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) &&
+         is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) &&
+         is_fp32(bn_problem.GetBnScale().GetType()) && is_fp32(bn_problem.GetBnBias().GetType())) ||
+        // case 2 : float type
+        (is_fp32(bn_problem.GetXDesc().GetType()) && is_fp32(bn_problem.GetYDesc().GetType()) &&
+         is_fp32(bn_problem.GetBnScale().GetType()) && is_fp32(bn_problem.GetBnBias().GetType())));
 }

 bool IsCKFwdTrainTypeValid(const ProblemDescription& bn_problem)
@@ -99,8 +97,8 @@
             is_fp16_or_bfp16(bn_problem.GetYDesc().GetType()) &&
             is_fp16_or_bfp16(bn_problem.GetBnScale().GetType()) &&
             is_fp16_or_bfp16(bn_problem.GetBnBias().GetType()) &&
-            bn_problem.GetBnSMean().GetType() == miopenFloat &&
-            bn_problem.GetBnSVar().GetType() == miopenFloat) ||
+            is_fp32(bn_problem.GetBnSMean().GetType()) &&
+            is_fp32(bn_problem.GetBnSVar().GetType())) ||
            // case 2 : fp32 or fp64
            (is_fp32_or_fp64(bn_problem.GetXDesc().GetType()) &&
             is_fp32_or_fp64(bn_problem.GetYDesc().GetType()) &&
@@ -112,19 +110,16 @@
 bool IsOCLBwdTypeValid(const ProblemDescription& bn_problem)
 {
-    return ((is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) &&
-             is_fp16_or_bfp16(bn_problem.GetDXDesc().GetType()) &&
-             is_fp16_or_bfp16(bn_problem.GetDYDesc().GetType()) &&
-             bn_problem.GetBnScale().GetType() == miopenFloat &&
-             bn_problem.GetBnSMean().GetType() == miopenFloat &&
-             bn_problem.GetBnSVar().GetType() == miopenFloat) ||
-            // case 1 : fp32 or fp64
-            (is_fp32_or_fp64(bn_problem.GetXDesc().GetType()) &&
-             is_fp32_or_fp64(bn_problem.GetYDesc().GetType()) &&
-             is_fp32_or_fp64(bn_problem.GetBnScale().GetType()) &&
-             is_fp32_or_fp64(bn_problem.GetBnBias().GetType()) &&
-             is_fp32_or_fp64(bn_problem.GetBnSMean().GetType()) &&
-             is_fp32_or_fp64(bn_problem.GetBnSVar().GetType())));
+    return (
+        (is_fp16_or_bfp16(bn_problem.GetXDesc().GetType()) &&
+         is_fp16_or_bfp16(bn_problem.GetDXDesc().GetType()) &&
+         is_fp16_or_bfp16(bn_problem.GetDYDesc().GetType()) &&
+         is_fp32(bn_problem.GetBnScale().GetType()) && is_fp32(bn_problem.GetBnSMean().GetType()) &&
+         is_fp32(bn_problem.GetBnSVar().GetType())) ||
+        // case 1 : fp32
+        (is_fp32(bn_problem.GetXDesc().GetType()) && is_fp32(bn_problem.GetDXDesc().GetType()) &&
+         is_fp32(bn_problem.GetBnScale().GetType()) && is_fp32(bn_problem.GetBnBias().GetType()) &&
+         is_fp32(bn_problem.GetBnSMean().GetType()) && is_fp32(bn_problem.GetBnSVar().GetType())));
 }

 bool IsCKBwdTypeValid(const ProblemDescription& bn_problem)
@@ -137,7 +132,7 @@
             bn_problem.GetBnSVar().GetType() == miopenFloat) ||
            // case 1 : fp32 or fp64
            (is_fp32_or_fp64(bn_problem.GetXDesc().GetType()) &&
-            is_fp32_or_fp64(bn_problem.GetYDesc().GetType()) &&
+            is_fp32_or_fp64(bn_problem.GetDXDesc().GetType()) &&
             is_fp32_or_fp64(bn_problem.GetBnScale().GetType()) &&
             is_fp32_or_fp64(bn_problem.GetBnBias().GetType()) &&
             is_fp32_or_fp64(bn_problem.GetBnSMean().GetType()) &&
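Note: the problem_description.cpp refactor above is mechanical: every repeated `GetType() == miopenFloat` comparison becomes a call to the new is_fp32() helper, each validator keeps exactly two accepted combinations (mixed precision, i.e. fp16/bf16 tensors with fp32 scale/bias/statistics, and uniform fp32), and the backward checks now use GetDXDesc(), the tensor backward actually writes, instead of GetYDesc(). A minimal sketch of that predicate pattern outside MIOpen (the enum and the Problem struct are invented stand-ins):

    #include <cstdio>

    enum SketchDataType { Half, BFloat16, Float, Double };

    bool is_fp16_or_bfp16(SketchDataType t) { return t == Half || t == BFloat16; }
    bool is_fp32(SketchDataType t) { return t == Float; }

    // Hypothetical stand-in for the x/y/scale/bias descriptor types of a problem.
    struct SketchProblem
    {
        SketchDataType x, y, scale, bias;
    };

    bool IsInferTypeValidSketch(const SketchProblem& p)
    {
        // case 1 : mix type -- low-precision tensors with fp32 scale/bias
        return ((is_fp16_or_bfp16(p.x) && is_fp16_or_bfp16(p.y) &&
                 is_fp32(p.scale) && is_fp32(p.bias)) ||
                // case 2 : everything fp32
                (is_fp32(p.x) && is_fp32(p.y) && is_fp32(p.scale) && is_fp32(p.bias)));
    }

    int main()
    {
        std::printf("mixed    : %d\n", IsInferTypeValidSketch({Half, Half, Float, Float}));   // 1
        std::printf("pure fp32: %d\n", IsInferTypeValidSketch({Float, Float, Float, Float})); // 1
        std::printf("bad mix  : %d\n", IsInferTypeValidSketch({Half, Half, Half, Float}));    // 0
    }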
diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp
index 9b93df6c81..2dbe462b99 100644
--- a/test/gtest/bn_bwd.cpp
+++ b/test/gtest/bn_bwd.cpp
@@ -87,7 +87,7 @@ TEST_P(GPU_BN_BWD_Large_FP64, BnV2LargeBWDCKfp64) {}
 // fp16
 INSTANTIATE_TEST_SUITE_P(Smoke,
                          GPU_BN_CK_BWD_Large_FP16,
-                         testing::Combine(testing::ValuesIn(NetworkLarge()),
+                         testing::Combine(testing::ValuesIn(NetworkSmall()),
                                           testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
                                           testing::ValuesIn({testBNAPIV2})),
                          TestNameGenerator());
@@ -102,9 +102,9 @@ INSTANTIATE_TEST_SUITE_P(Smoke,
 // bfp16
 INSTANTIATE_TEST_SUITE_P(Smoke,
                          GPU_BN_CK_BWD_Large_BFP16,
-                         testing::Combine(testing::ValuesIn(NetworkLarge()),
+                         testing::Combine(testing::ValuesIn(NetworkSmall()),
                                           testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
-                                          testing::ValuesIn({testBNAPIV1, testBNAPIV2})),
+                                          testing::ValuesIn({testBNAPIV2})),
                          TestNameGenerator());

 INSTANTIATE_TEST_SUITE_P(Smoke,
@@ -138,7 +138,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke,

 INSTANTIATE_TEST_SUITE_P(Smoke,
                          GPU_BN_BWD_Large_FP64,
-                         testing::Combine(testing::ValuesIn(NetworkSmall()),
+                         testing::Combine(testing::ValuesIn(NetworkLarge()),
                                           testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
                                           testing::ValuesIn({testBNAPIV2})),
                          TestNameGenerator());
diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp
index 1953accee7..e6419400b7 100644
--- a/test/gtest/bn_fwd_train.cpp
+++ b/test/gtest/bn_fwd_train.cpp
@@ -93,7 +93,7 @@ TEST_P(GPU_BN_FWD_Train_Large_FP64, BnV2LargeFWD_TrainCKfp64) {}

 INSTANTIATE_TEST_SUITE_P(Smoke,
                          GPU_BN_CK_FWD_Train_Large_FP16,
-                         testing::Combine(testing::ValuesIn(NetworkLarge()),
+                         testing::Combine(testing::ValuesIn(NetworkSmall()),
                                           testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
                                           testing::ValuesIn({testBNAPIV2})),
                          TestNameGenerator());
@@ -108,9 +108,9 @@ INSTANTIATE_TEST_SUITE_P(Smoke,
 // // bfp16
 INSTANTIATE_TEST_SUITE_P(Smoke,
                          GPU_BN_CK_FWD_Train_Large_BFP16,
-                         testing::Combine(testing::ValuesIn(NetworkLarge()),
+                         testing::Combine(testing::ValuesIn(NetworkSmall()),
                                           testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
-                                          testing::ValuesIn({testBNAPIV1, testBNAPIV2})),
+                                          testing::ValuesIn({testBNAPIV2})),
                          TestNameGenerator());

 INSTANTIATE_TEST_SUITE_P(Smoke,
@@ -123,14 +123,14 @@ INSTANTIATE_TEST_SUITE_P(Smoke,
 // // fp32
 INSTANTIATE_TEST_SUITE_P(Smoke,
                          GPU_BN_FWD_Train_Small_FP32,
-                         testing::Combine(testing::ValuesIn(NetworkSmall()),
+                         testing::Combine(testing::ValuesIn(NetworkLarge()),
                                           testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
                                           testing::ValuesIn({testBNAPIV2})),
                          TestNameGenerator());

 INSTANTIATE_TEST_SUITE_P(Smoke,
                          GPU_BN_FWD_Train_Large_FP32,
-                         testing::Combine(testing::ValuesIn(NetworkLarge()),
+                         testing::Combine(testing::ValuesIn(NetworkSmall()),
                                           testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
                                           testing::ValuesIn({testBNAPIV2})),
                          TestNameGenerator());
@@ -139,12 +139,12 @@ INSTANTIATE_TEST_SUITE_P(Smoke,
                          GPU_BN_FWD_Train_Small_FP64,
                          testing::Combine(testing::ValuesIn(NetworkSmall()),
                                           testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
-                                          testing::ValuesIn({testBNAPIV2})),
+                                          testing::ValuesIn({testBNAPIV1})),
                          TestNameGenerator());

 INSTANTIATE_TEST_SUITE_P(Smoke,
                          GPU_BN_FWD_Train_Large_FP64,
-                         testing::Combine(testing::ValuesIn(NetworkLarge()),
+                         testing::Combine(testing::ValuesIn(NetworkSmall()),
                                           testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
                                           testing::ValuesIn({testBNAPIV2})),
                          TestNameGenerator());
diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp
index e717e149b0..75c23c44f2 100644
--- a/test/gtest/bn_infer.cpp
+++ b/test/gtest/bn_infer.cpp
@@ -94,7 +94,7 @@
 // fp16
 INSTANTIATE_TEST_SUITE_P(Smoke,
                          GPU_BN_CK_Infer_Large_FP16,
-                         testing::Combine(testing::ValuesIn(NetworkLarge()),
+                         testing::Combine(testing::ValuesIn(NetworkSmall()),
                                           testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
                                           testing::ValuesIn({testBNAPIV2})),
                          TestNameGenerator());
@@ -108,7 +108,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke,
 // bfp16
 INSTANTIATE_TEST_SUITE_P(Smoke,
                          GPU_BN_CK_Infer_Large_BFP16,
-                         testing::Combine(testing::ValuesIn(NetworkLarge()),
+                         testing::Combine(testing::ValuesIn(NetworkSmall()),
                                           testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
                                           testing::ValuesIn({testBNAPIV2})),
                          TestNameGenerator());
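Note: the next patch ("disable ck bn for now") takes effect through its batchnormocl.cpp hunks, which swap out the solver type lists passed to solver::SolverContainer (the angle-bracketed type lists themselves are not visible in the text). The sketch below, with invented types rather than the MIOpen implementation, shows the general pattern such a change relies on: solvers are enumerated in a compile-time container, the first one whose IsApplicable() accepts the problem is run, and removing a solver type from the list disables it without touching its code:

    #include <cstdio>

    struct SketchProblem
    {
        bool is_fp32;
    };

    struct SketchOclSolver // generic fallback, always applicable
    {
        static bool IsApplicable(const SketchProblem&) { return true; }
        static void Run() { std::printf("running OpenCL solver\n"); }
    };

    struct SketchCkSolver // only handles non-fp32 problems in this sketch
    {
        static bool IsApplicable(const SketchProblem& p) { return !p.is_fp32; }
        static void Run() { std::printf("running CK solver\n"); }
    };

    template <class... Solvers>
    struct SketchSolverContainer
    {
        static void ExecutePrimitive(const SketchProblem& p)
        {
            // Try each solver in declaration order; stop at the first applicable one.
            bool done = false;
            auto try_one = [&](auto solver) {
                using S = decltype(solver);
                if(!done && S::IsApplicable(p))
                {
                    S::Run();
                    done = true;
                }
            };
            (try_one(Solvers{}), ...);
        }
    };

    int main()
    {
        // With the CK solver listed, it picks up non-fp32 problems...
        SketchSolverContainer<SketchCkSolver, SketchOclSolver>::ExecutePrimitive({false});
        // ...and with it removed from the list -- the "disable" in the patch
        // below -- everything routes to the remaining solver.
        SketchSolverContainer<SketchOclSolver>::ExecutePrimitive({false});
    }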
From e1bb0c1b989bce863016d8241f331ba1d7c84fce Mon Sep 17 00:00:00 2001
From: Bibek Ghimire
Date: Sat, 2 Nov 2024 19:24:53 +0000
Subject: [PATCH 13/13] disable ck bn for now

---
 src/ocl/batchnormocl.cpp                   |  9 +++++----
 src/solver/batchnorm/forward_inference.cpp |  2 +-
 test/gtest/bn_bwd.cpp                      | 12 ++++++------
 test/gtest/bn_fwd_train.cpp                | 16 ++++++++--------
 test/gtest/bn_infer.cpp                    | 18 +++++++++---------
 5 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp
index 78d2dbce66..77442e030a 100644
--- a/src/ocl/batchnormocl.cpp
+++ b/src/ocl/batchnormocl.cpp
@@ -152,7 +152,7 @@ void BatchNormForwardTraining(Handle& handle,
         }();

     const auto solvers = solver::SolverContainer{};
@@ -250,8 +250,9 @@ void BatchNormForwardInference(Handle& handle,
         }();

     const auto algo = AlgorithmName{"miopenBatchNormalizationForwardInference"};
-    const auto solvers = solver::SolverContainer{};
+    const auto solvers = solver::SolverContainer{};

     solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
 }
@@ -394,7 +395,7 @@ void BatchNormBackward(Handle& handle,
         }();

     const auto solvers = solver::SolverContainer{};
diff --git a/src/solver/batchnorm/forward_inference.cpp b/src/solver/batchnorm/forward_inference.cpp
index d22bb01797..a05fce5105 100644
--- a/src/solver/batchnorm/forward_inference.cpp
+++ b/src/solver/batchnorm/forward_inference.cpp
@@ -45,7 +45,7 @@ bool BnFwdInference::IsApplicable(const ExecutionContext&,
         return false;
     if(bn_problem.GetDirection() != miopen::batchnorm::Direction::ForwardInference)
         return false;
-    if(!(bn_problem.IsFp32() or bn_problem.IsFp16() || bn_problem.IsBFp16()))
+    if(!(bn_problem.IsFp32() or bn_problem.IsFp16() or bn_problem.IsBFp16()))
         return false;
     if(!bn_problem.Is2D())
         return false;
diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp
index 2dbe462b99..a84a8a8feb 100644
--- a/test/gtest/bn_bwd.cpp
+++ b/test/gtest/bn_bwd.cpp
@@ -69,11 +69,11 @@ struct GPU_BN_BWD_Large_FP64 : BNBwdTest()),
+                         testing::Combine(testing::ValuesIn(NetworkLarge()),
                                           testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
                                           testing::ValuesIn({testBNAPIV2})),
                          TestNameGenerator());
@@ -125,7 +125,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke,
 INSTANTIATE_TEST_SUITE_P(Smoke,
                          GPU_BN_BWD_Large_FP32,
                          testing::Combine(testing::ValuesIn(NetworkLarge()),
-                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({miopenTensorNCHW}),
                                           testing::ValuesIn({testBNAPIV2})),
                          TestNameGenerator());
 // // fp64
diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp
index e6419400b7..c46da84245 100644
--- a/test/gtest/bn_fwd_train.cpp
+++ b/test/gtest/bn_fwd_train.cpp
@@ -74,11 +74,11 @@ struct GPU_BN_FWD_Train_Large_FP64 : BNFwdTrainTest()),
-                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
-                                          testing::ValuesIn({testBNAPIV2})),
+                         testing::Combine(testing::ValuesIn(NetworkSmall()),
+                                          testing::ValuesIn({miopenTensorNCHW}),
+                                          testing::ValuesIn({testBNAPIV1})),
                          TestNameGenerator());

 INSTANTIATE_TEST_SUITE_P(Smoke,
                          GPU_BN_FWD_Train_Large_FP32,
-                         testing::Combine(testing::ValuesIn(NetworkSmall()),
+                         testing::Combine(testing::ValuesIn(NetworkLarge()),
                                           testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
                                           testing::ValuesIn({testBNAPIV2})),
                          TestNameGenerator());
diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp
index 75c23c44f2..591cbd0b1a 100644
--- a/test/gtest/bn_infer.cpp
+++ b/test/gtest/bn_infer.cpp
@@ -76,11 +76,11 @@ struct GPU_BN_Infer_Large_FP64 : BNInferTest()),
-                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                         testing::Combine(testing::ValuesIn(NetworkLarge()),
+                                          testing::ValuesIn({miopenTensorNCHW}),
                                           testing::ValuesIn({testBNAPIV1})),
                          TestNameGenerator());

 INSTANTIATE_TEST_SUITE_P(Smoke,
                          GPU_BN_Infer_Large_FP32,
-                         testing::Combine(testing::ValuesIn(NetworkLarge()),
-                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                         testing::Combine(testing::ValuesIn(NetworkSmall()),
+                                          testing::ValuesIn({miopenTensorNCHW}),
                                           testing::ValuesIn({testBNAPIV2})),
                          TestNameGenerator());
 // fp64
@@ -144,7 +144,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke,

 INSTANTIATE_TEST_SUITE_P(Smoke,
                          GPU_BN_Infer_Large_FP64,
-                         testing::Combine(testing::ValuesIn(NetworkLarge()),
+                         testing::Combine(testing::ValuesIn(NetworkSmall()),
                                           testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
                                           testing::ValuesIn({testBNAPIV2})),
                          TestNameGenerator());
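Note: the forward_inference.cpp hunk in the last patch is purely cosmetic. `or` is a standard C++ alternative token for `||` with identical precedence and short-circuit behavior, so the mixed spelling it replaces was already correct; the change only makes the spelling uniform. A standalone demonstration (not MIOpen code):

    #include <cassert>

    int main()
    {
        bool fp32 = false, fp16 = true, bf16 = false;
        // "or" and "||" are the same operator, so mixing them changes nothing.
        assert((fp32 or fp16 || bf16) == (fp32 or fp16 or bf16));
        return 0;
    }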